
(Pipeline)[Breaking] Move PostProcessor.get_dict() from Dict to List[Dict]
PonteIneptique committed Apr 23, 2020
1 parent 6143b6e commit c8be021
Showing 5 changed files with 34 additions and 35 deletions.
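Because the return type changes from a single Dict to a List[Dict], this is breaking for any caller that used the result of get_dict() directly. A minimal, hypothetical sketch of the caller-side impact (DummyProcessor below is a stand-in, not a pie-extended class):

```python
from typing import Dict, List


class DummyProcessor:
    """Stand-in following the new contract: a list of rows per input token."""

    def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
        # Always a list, even when there is only one annotation row.
        return [{"form": token, "lemma": tags[0]}]


processor = DummyProcessor()
# Before this commit callers did: row = processor.get_dict("amor", ["amor"])
# Now the result is iterated (or flattened with `yield from` in generators):
for row in processor.get_dict("amor", ["amor"]):
    print(row["form"], row["lemma"])
```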
13 changes: 6 additions & 7 deletions pie_extended/pipeline/postprocessor/glue.py
@@ -14,13 +14,13 @@ class GlueProcessor(ChainedProcessor):
>>> x.set_tasks(["lemma", "1", "2"]) # You can see things are remaped
['lemma', 'task3']
>>> # Merges b and c values from task 1 and 2 into a new task
- >>> x.get_dict("a", ["a", "b", "c"]) == {"form": "a", "lemma": "a", "task3": "1=b|2=c"}
+ >>> x.get_dict("a", ["a", "b", "c"]) == [{"form": "a", "lemma": "a", "task3": "1=b|2=c"}]
True
>>> # Keeps only one task because 2 is empty
- >>> x.get_dict("a", ["a", "b", "_"]) == {"form": "a", "lemma": "a", "task3": "1=b"}
+ >>> x.get_dict("a", ["a", "b", "_"]) == [{"form": "a", "lemma": "a", "task3": "1=b"}]
True
>>> # Fills with the default empty tag because both task 1 and 2 were empty
- >>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"}
+ >>> x.get_dict("a", ["a", "_", "_"]) == [{"form": "a", "lemma": "a", "task3": "NO-DATA"}]
True
You can also use remaped tasks:
@@ -35,7 +35,7 @@ class GlueProcessor(ChainedProcessor):
['lemma', 'POS', 'task3']
>>> # Merges b and c values from task 1 and 2 into a new task
>>> x.get_dict("a", ["a", "p", "b", "c"])
- {'form': 'a', 'lemma': 'a', 'POS': 'p', 'task3': '1=b|2=c'}
+ [{'form': 'a', 'lemma': 'a', 'POS': 'p', 'task3': '1=b|2=c'}]
"""

@@ -83,9 +83,8 @@ def _yield_annotation(
def reinsert(self, form: str) -> Dict[str, str]:
return dict(form=form, **{key: self.empty_value for key in self._out if key != "form"})

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
-     as_dict = super(GlueProcessor, self).get_dict(token, tags)
-     return dict(self._yield_annotation(as_dict))
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
+     return [dict(self._yield_annotation(as_dict)) for as_dict in super(GlueProcessor, self).get_dict(token, tags)]

@property
def tasks(self) -> List[str]:
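For context, a rough standalone sketch of the glued output that the doctests above describe, now wrapped in a one-element list. The glue_row helper and its default arguments are illustrative only, not the real internals of GlueProcessor:

```python
from typing import Dict, List


def glue_row(form: str, lemma: str, subtasks: Dict[str, str],
             glued_name: str = "task3", empty_mark: str = "_",
             empty_value: str = "NO-DATA") -> List[Dict[str, str]]:
    # Keep only sub-tasks that carry a value, joined as "task=value" pairs.
    kept = ["%s=%s" % (name, value) for name, value in subtasks.items()
            if value != empty_mark]
    glued = "|".join(kept) if kept else empty_value
    # New contract: the annotation row comes back inside a list.
    return [{"form": form, "lemma": lemma, glued_name: glued}]


assert glue_row("a", "a", {"1": "b", "2": "c"}) == [{"form": "a", "lemma": "a", "task3": "1=b|2=c"}]
assert glue_row("a", "a", {"1": "b", "2": "_"}) == [{"form": "a", "lemma": "a", "task3": "1=b"}]
assert glue_row("a", "a", {"1": "_", "2": "_"}) == [{"form": "a", "lemma": "a", "task3": "NO-DATA"}]
```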
24 changes: 13 additions & 11 deletions pie_extended/pipeline/postprocessor/memory.py
@@ -21,13 +21,13 @@ class MemoryzingProcessor(ChainedProcessor):
['lem', 'treated']
>>> # Lowercase a was taken in the input but uppercase a is returned in form. For transparency, input seen
>>> # By the tagger is returned in a new column, treated (cf. MemorizingProcessor.KEY)
>>> processor.get_dict("a", ["lemma"]) == {"form": "A", "treated": "a", "lem": "lemma"}
>>> processor.get_dict("a", ["lemma"]) == [{"form": "A", "treated": "a", "lem": "lemma"}]
True
>>> # Some would have the same treated and input
>>> processor.get_dict("b", ["lemma"]) == {"form": "b", "treated": "b", "lem": "lemma"}
>>> processor.get_dict("b", ["lemma"]) == [{"form": "b", "treated": "b", "lem": "lemma"}]
True
>>> # Some differ with more characters
>>> processor.get_dict("q", ["lemma"]) == {"form": "q'", "treated": "q", "lem": "lemma"}
>>> processor.get_dict("q", ["lemma"]) == [{"form": "q'", "treated": "q", "lem": "lemma"}]
True
This allows for easier output alignment as well as removing unknown characters to the model. If your lemmatizer
@@ -43,16 +43,18 @@ def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: Proc
self.memory: "MemorizingTokenizer" = tokenizer_memory
self._key: str = key or type(self).KEY

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
      # First we get the dictionary
-     token_dict = self.head_processor.get_dict(token, tags)
-     index, input_token, out_token = self.memory.tokens.pop(0)
-     if token != out_token:
-         raise Exception("The output token does not match our inputs %s : %s" % (token, out_token))
+     list_token_dict = []
+     for token_dict in self.head_processor.get_dict(token, tags):
+         index, input_token, out_token = self.memory.tokens.pop(0)
+         if token != out_token:
+             raise Exception("The output token does not match our inputs %s : %s" % (token, out_token))

-     token_dict[self._key] = out_token
-     token_dict["form"] = input_token
-     return token_dict
+         token_dict[self._key] = out_token
+         token_dict["form"] = input_token
+         list_token_dict.append(token_dict)
+     return list_token_dict

@property
def tasks(self) -> List[str]:
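The new loop pairs every row returned by the head processor with one entry popped from the tokenizer memory, restoring the user's original form while keeping the token the tagger actually saw. A simplified sketch under stated assumptions: a plain list of (index, input_token, out_token) tuples stands in for MemorizingTokenizer.tokens, and restore_forms is an illustrative helper rather than the class method:

```python
from typing import Dict, List, Tuple


def restore_forms(token: str, head_rows: List[Dict[str, str]],
                  memory: List[Tuple[int, str, str]],
                  key: str = "treated") -> List[Dict[str, str]]:
    rows = []
    for row in head_rows:
        index, input_token, out_token = memory.pop(0)
        if token != out_token:
            raise Exception("The output token does not match our inputs %s : %s" % (token, out_token))
        row[key] = out_token        # what the tagger was fed
        row["form"] = input_token   # what the user originally wrote
        rows.append(row)
    return rows


memory = [(0, "A", "a")]
assert restore_forms("a", [{"form": "a", "lem": "lemma"}], memory) == \
    [{"form": "A", "treated": "a", "lem": "lemma"}]
```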
22 changes: 10 additions & 12 deletions pie_extended/pipeline/postprocessor/proto.py
@@ -17,7 +17,7 @@ def __init__(self, empty_value: Optional[str] = None):
['a', 'b']
>>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"}
True
>>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"}
>>> x.get_dict("y", ["1", "2"]) == [{"form": "y", "a": "1", "b": "2"}]
True
"""
self._tasks = []
@@ -51,7 +51,7 @@ def reinsert(self, form: str) -> Dict[str, str]:
"""
return dict(form=form, **{task: self.empty_value for task in self._tasks})

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
""" Get the dictionary representation of a token annotation
:param token: Token used as input for pie
@@ -61,10 +61,10 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
>>> x = ProcessorPrototype(empty_value="%")
>>> x.set_tasks(["a", "b"])
['a', 'b']
>>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"}
>>> x.get_dict("y", ["1", "2"]) == [{"form": "y", "a": "1", "b": "2"}]
True
"""
return {"form": token, **{k: val for k, val in zip(self._tasks, tags)}}
return [{"form": token, **{k: val for k, val in zip(self._tasks, tags)}}]

def reset(self):
""" Functions that should be run in between documents
@@ -84,7 +84,7 @@ def __init__(self, task_map: Dict[str, str], **kwargs):
>>> x = RenamedTaskProcessor({"task_name_1": "renamed"})
>>> x.set_tasks(["task_name_1", "y"])
['renamed', 'y']
>>> x.get_dict("token", ["a", "b"]) == {"form": "token", "renamed": "a", "y": "b"}
>>> x.get_dict("token", ["a", "b"]) == [{"form": "token", "renamed": "a", "y": "b"}]
True
"""
super(RenamedTaskProcessor, self).__init__(**kwargs)
@@ -122,17 +122,15 @@ class ChainedProcessor(ProcessorPrototype):
... annotation["col3"] = "x"
... return annotation
...
- ...     def get_dict(self, form: str, tags: List[str]) -> Dict[str, str]:
- ...         annotation = self.head_processor.get_dict(form, tags)
- ...         annotation["col3"] = "x"
- ...         return annotation
- ...
+ ...     def get_dict(self, form: str, tags: List[str]) -> List[Dict[str, str]]:
+ ...         return [{"col3": "x", **{x:y for x, y in anno.items() if x != "col3"}}
+ ...                 for anno in self.head_processor.get_dict(form, tags)]
>>> x = ExampleChained(ProcessorPrototype(empty_value="EMPTY"))
>>> x.set_tasks(["a", "b"])
['a', 'b']
>>> x.reinsert("x") == {"form": "x", "a": "EMPTY", "b": "EMPTY", "col3": "x"}
True
>>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2", "col3": "x"}
>>> x.get_dict("y", ["1", "2"]) == [{"form": "y", "a": "1", "b": "2", "col3": "x"}]
True
"""
@@ -152,7 +150,7 @@ def set_tasks(self, tasks):
def reinsert(self, form: str) -> Dict[str, str]:
return self.head_processor.reinsert(form)

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
return self.head_processor.get_dict(token, tags)

def reset(self):
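Restated outside the class hierarchy, the base contract after this commit is simply "one token in, a list of annotation rows out". The free function below mirrors the ProcessorPrototype doctest; it is a sketch, not the library API:

```python
from typing import Dict, List


def base_get_dict(tasks: List[str], token: str, tags: List[str]) -> List[Dict[str, str]]:
    # Zip the configured task names with the predicted tags and wrap in a list.
    return [{"form": token, **{task: tag for task, tag in zip(tasks, tags)}}]


assert base_get_dict(["a", "b"], "y", ["1", "2"]) == [{"form": "y", "a": "1", "b": "2"}]
```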
8 changes: 4 additions & 4 deletions pie_extended/pipeline/postprocessor/rulebased.py
@@ -23,9 +23,9 @@ def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[Pro
>>> processor = ExampleRule()
>>> processor.set_tasks(["1", "2"])
['1', '2']
>>> processor.get_dict("token", ["a", "b"]) == {"form": "token", "1": "a", "2": "b"}
>>> processor.get_dict("token", ["a", "b"]) == [{"form": "token", "1": "a", "2": "b"}]
True
>>> processor.get_dict("need", ["a", "b"]) == {"form": "need", "1": "REPLACED", "2": "b"}
>>> processor.get_dict("need", ["a", "b"]) == [{"form": "need", "1": "REPLACED", "2": "b"}]
True
"""
super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs)
@@ -40,5 +40,5 @@ def reinsert(self, form: str) -> Dict[str, str]:
return self.rules(anno)
return anno

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
-     return self.rules(self.head_processor.get_dict(token, tags))
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
+     return [self.rules(anno) for anno in self.head_processor.get_dict(token, tags)]
2 changes: 1 addition & 1 deletion pie_extended/tagger.py
@@ -86,7 +86,7 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor
del sent_reinsertion[reinsertion_index + index]
reinsertion_index += 1

- yield processor.get_dict(token, tags)
+ yield from processor.get_dict(token, tags)

for reinsertion in sorted(list(sent_reinsertion.keys())):
yield processor.reinsert(sent_reinsertion[reinsertion])
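The single tagger change follows directly from the new return type: get_dict() now produces a list of rows per token, so the generator flattens it with yield from instead of yielding the list object itself. A toy illustration with stand-in functions (not the real Tagger.iter_tag_token):

```python
from typing import Dict, Iterator, List, Tuple


def fake_get_dict(token: str, tags: List[str]) -> List[Dict[str, str]]:
    # Stand-in for processor.get_dict() under the new List[Dict] contract.
    return [{"form": token, "lemma": tags[0]}]


def iter_rows(tokens_and_tags: List[Tuple[str, List[str]]]) -> Iterator[Dict[str, str]]:
    for token, tags in tokens_and_tags:
        yield from fake_get_dict(token, tags)  # flattens each per-token list


assert list(iter_rows([("a", ["x"]), ("b", ["y"])])) == \
    [{"form": "a", "lemma": "x"}, {"form": "b", "lemma": "y"}]
```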
