
(Pipeline)[Breaking] Move PostProcessor.get_dict() from Dict to List[Dict]
PonteIneptique committed Apr 23, 2020
1 parent 6143b6e commit c8be021
Showing 5 changed files with 34 additions and 35 deletions.
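Because the return type changes from a single Dict to a List[Dict], this is breaking for any caller that used the result of get_dict() directly. A minimal, hypothetical sketch of the caller-side impact (DummyProcessor below is a stand-in, not a pie-extended class):

```python
from typing import Dict, List


class DummyProcessor:
    """Stand-in following the new contract: a list of rows per input token."""

    def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
        # Always a list, even when there is only one annotation row.
        return [{"form": token, "lemma": tags[0]}]


processor = DummyProcessor()
# Before this commit callers did: row = processor.get_dict("amor", ["amor"])
# Now the result is iterated (or flattened with `yield from` in generators):
for row in processor.get_dict("amor", ["amor"]):
    print(row["form"], row["lemma"])
```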
13 changes: 6 additions & 7 deletions pie_extended/pipeline/postprocessor/glue.py
@@ -14,13 +14,13 @@ class GlueProcessor(ChainedProcessor):
>>> x.set_tasks(["lemma", "1", "2"]) # You can see things are remaped
['lemma', 'task3']
>>> # Merges b and c values from task 1 and 2 into a new task
- >>> x.get_dict("a", ["a", "b", "c"]) == {"form": "a", "lemma": "a", "task3": "1=b|2=c"}
+ >>> x.get_dict("a", ["a", "b", "c"]) == [{"form": "a", "lemma": "a", "task3": "1=b|2=c"}]
True
>>> # Keeps only one task because 2 is empty
- >>> x.get_dict("a", ["a", "b", "_"]) == {"form": "a", "lemma": "a", "task3": "1=b"}
+ >>> x.get_dict("a", ["a", "b", "_"]) == [{"form": "a", "lemma": "a", "task3": "1=b"}]
True
>>> # Fills with the default empty tag because both task 1 and 2 were empty
- >>> x.get_dict("a", ["a", "_", "_"]) == {"form": "a", "lemma": "a", "task3": "NO-DATA"}
+ >>> x.get_dict("a", ["a", "_", "_"]) == [{"form": "a", "lemma": "a", "task3": "NO-DATA"}]
True
You can also use remaped tasks:
@@ -35,7 +35,7 @@ class GlueProcessor(ChainedProcessor):
['lemma', 'POS', 'task3']
>>> # Merges b and c values from task 1 and 2 into a new task
>>> x.get_dict("a", ["a", "p", "b", "c"])
- {'form': 'a', 'lemma': 'a', 'POS': 'p', 'task3': '1=b|2=c'}
+ [{'form': 'a', 'lemma': 'a', 'POS': 'p', 'task3': '1=b|2=c'}]
"""

@@ -83,9 +83,8 @@ def _yield_annotation(
def reinsert(self, form: str) -> Dict[str, str]:
return dict(form=form, **{key: self.empty_value for key in self._out if key != "form"})

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
-     as_dict = super(GlueProcessor, self).get_dict(token, tags)
-     return dict(self._yield_annotation(as_dict))
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
+     return [dict(self._yield_annotation(as_dict)) for as_dict in super(GlueProcessor, self).get_dict(token, tags)]

@property
def tasks(self) -> List[str]:
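For context, a rough standalone sketch of the glued output that the doctests above describe, now wrapped in a one-element list. The glue_row helper and its default arguments are illustrative only, not the real internals of GlueProcessor:

```python
from typing import Dict, List


def glue_row(form: str, lemma: str, subtasks: Dict[str, str],
             glued_name: str = "task3", empty_mark: str = "_",
             empty_value: str = "NO-DATA") -> List[Dict[str, str]]:
    # Keep only sub-tasks that carry a value, joined as "task=value" pairs.
    kept = ["%s=%s" % (name, value) for name, value in subtasks.items()
            if value != empty_mark]
    glued = "|".join(kept) if kept else empty_value
    # New contract: the annotation row comes back inside a list.
    return [{"form": form, "lemma": lemma, glued_name: glued}]


assert glue_row("a", "a", {"1": "b", "2": "c"}) == [{"form": "a", "lemma": "a", "task3": "1=b|2=c"}]
assert glue_row("a", "a", {"1": "b", "2": "_"}) == [{"form": "a", "lemma": "a", "task3": "1=b"}]
assert glue_row("a", "a", {"1": "_", "2": "_"}) == [{"form": "a", "lemma": "a", "task3": "NO-DATA"}]
```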
24 changes: 13 additions & 11 deletions pie_extended/pipeline/postprocessor/memory.py
@@ -21,13 +21,13 @@ class MemoryzingProcessor(ChainedProcessor):
['lem', 'treated']
>>> # Lowercase a was taken in the input but uppercase a is returned in form. For transparency, input seen
>>> # By the tagger is returned in a new column, treated (cf. MemorizingProcessor.KEY)
>>> processor.get_dict("a", ["lemma"]) == {"form": "A", "treated": "a", "lem": "lemma"}
>>> processor.get_dict("a", ["lemma"]) == [{"form": "A", "treated": "a", "lem": "lemma"}]
True
>>> # Some would have the same treated and input
>>> processor.get_dict("b", ["lemma"]) == {"form": "b", "treated": "b", "lem": "lemma"}
>>> processor.get_dict("b", ["lemma"]) == [{"form": "b", "treated": "b", "lem": "lemma"}]
True
>>> # Some differ with more characters
>>> processor.get_dict("q", ["lemma"]) == {"form": "q'", "treated": "q", "lem": "lemma"}
>>> processor.get_dict("q", ["lemma"]) == [{"form": "q'", "treated": "q", "lem": "lemma"}]
True
This allows for easier output alignment as well as removing unknown characters to the model. If your lemmatizer
@@ -43,16 +43,18 @@ def __init__(self, tokenizer_memory: "MemorizingTokenizer", head_processor: Proc
self.memory: "MemorizingTokenizer" = tokenizer_memory
self._key: str = key or type(self).KEY

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
      # First we get the dictionary
-     token_dict = self.head_processor.get_dict(token, tags)
-     index, input_token, out_token = self.memory.tokens.pop(0)
-     if token != out_token:
-         raise Exception("The output token does not match our inputs %s : %s" % (token, out_token))
+     list_token_dict = []
+     for token_dict in self.head_processor.get_dict(token, tags):
+         index, input_token, out_token = self.memory.tokens.pop(0)
+         if token != out_token:
+             raise Exception("The output token does not match our inputs %s : %s" % (token, out_token))

-     token_dict[self._key] = out_token
-     token_dict["form"] = input_token
-     return token_dict
+         token_dict[self._key] = out_token
+         token_dict["form"] = input_token
+         list_token_dict.append(token_dict)
+     return list_token_dict

@property
def tasks(self) -> List[str]:
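The new loop pairs every row returned by the head processor with one entry popped from the tokenizer memory, restoring the user's original form while keeping the token the tagger actually saw. A simplified sketch under stated assumptions: a plain list of (index, input_token, out_token) tuples stands in for MemorizingTokenizer.tokens, and restore_forms is an illustrative helper rather than the class method:

```python
from typing import Dict, List, Tuple


def restore_forms(token: str, head_rows: List[Dict[str, str]],
                  memory: List[Tuple[int, str, str]],
                  key: str = "treated") -> List[Dict[str, str]]:
    rows = []
    for row in head_rows:
        index, input_token, out_token = memory.pop(0)
        if token != out_token:
            raise Exception("The output token does not match our inputs %s : %s" % (token, out_token))
        row[key] = out_token        # what the tagger was fed
        row["form"] = input_token   # what the user originally wrote
        rows.append(row)
    return rows


memory = [(0, "A", "a")]
assert restore_forms("a", [{"form": "a", "lem": "lemma"}], memory) == \
    [{"form": "A", "treated": "a", "lem": "lemma"}]
```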
22 changes: 10 additions & 12 deletions pie_extended/pipeline/postprocessor/proto.py
@@ -17,7 +17,7 @@ def __init__(self, empty_value: Optional[str] = None):
['a', 'b']
>>> x.reinsert("x") == {"form": "x", "a": "%", "b": "%"}
True
>>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"}
>>> x.get_dict("y", ["1", "2"]) == [{"form": "y", "a": "1", "b": "2"}]
True
"""
self._tasks = []
@@ -51,7 +51,7 @@ def reinsert(self, form: str) -> Dict[str, str]:
"""
return dict(form=form, **{task: self.empty_value for task in self._tasks})

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
""" Get the dictionary representation of a token annotation
:param token: Token used as input for pie
@@ -61,10 +61,10 @@ def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
>>> x = ProcessorPrototype(empty_value="%")
>>> x.set_tasks(["a", "b"])
['a', 'b']
>>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2"}
>>> x.get_dict("y", ["1", "2"]) == [{"form": "y", "a": "1", "b": "2"}]
True
"""
return {"form": token, **{k: val for k, val in zip(self._tasks, tags)}}
return [{"form": token, **{k: val for k, val in zip(self._tasks, tags)}}]

def reset(self):
""" Functions that should be run in between documents
@@ -84,7 +84,7 @@ def __init__(self, task_map: Dict[str, str], **kwargs):
>>> x = RenamedTaskProcessor({"task_name_1": "renamed"})
>>> x.set_tasks(["task_name_1", "y"])
['renamed', 'y']
>>> x.get_dict("token", ["a", "b"]) == {"form": "token", "renamed": "a", "y": "b"}
>>> x.get_dict("token", ["a", "b"]) == [{"form": "token", "renamed": "a", "y": "b"}]
True
"""
super(RenamedTaskProcessor, self).__init__(**kwargs)
@@ -122,17 +122,15 @@ class ChainedProcessor(ProcessorPrototype):
... annotation["col3"] = "x"
... return annotation
...
- ...     def get_dict(self, form: str, tags: List[str]) -> Dict[str, str]:
- ...         annotation = self.head_processor.get_dict(form, tags)
- ...         annotation["col3"] = "x"
- ...         return annotation
- ...
+ ...     def get_dict(self, form: str, tags: List[str]) -> List[Dict[str, str]]:
+ ...         return [{"col3": "x", **{x:y for x, y in anno.items() if x != "col3"}}
+ ...                 for anno in self.head_processor.get_dict(form, tags)]
>>> x = ExampleChained(ProcessorPrototype(empty_value="EMPTY"))
>>> x.set_tasks(["a", "b"])
['a', 'b']
>>> x.reinsert("x") == {"form": "x", "a": "EMPTY", "b": "EMPTY", "col3": "x"}
True
>>> x.get_dict("y", ["1", "2"]) == {"form": "y", "a": "1", "b": "2", "col3": "x"}
>>> x.get_dict("y", ["1", "2"]) == [{"form": "y", "a": "1", "b": "2", "col3": "x"}]
True
"""
@@ -152,7 +150,7 @@ def set_tasks(self, tasks):
def reinsert(self, form: str) -> Dict[str, str]:
return self.head_processor.reinsert(form)

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
return self.head_processor.get_dict(token, tags)

def reset(self):
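Restated outside the class hierarchy, the base contract after this commit is simply "one token in, a list of annotation rows out". The free function below mirrors the ProcessorPrototype doctest; it is a sketch, not the library API:

```python
from typing import Dict, List


def base_get_dict(tasks: List[str], token: str, tags: List[str]) -> List[Dict[str, str]]:
    # Zip the configured task names with the predicted tags and wrap in a list.
    return [{"form": token, **{task: tag for task, tag in zip(tasks, tags)}}]


assert base_get_dict(["a", "b"], "y", ["1", "2"]) == [{"form": "y", "a": "1", "b": "2"}]
```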
8 changes: 4 additions & 4 deletions pie_extended/pipeline/postprocessor/rulebased.py
@@ -23,9 +23,9 @@ def __init__(self, apply_on_reinsert: bool = False, head_processor: Optional[Pro
>>> processor = ExampleRule()
>>> processor.set_tasks(["1", "2"])
['1', '2']
>>> processor.get_dict("token", ["a", "b"]) == {"form": "token", "1": "a", "2": "b"}
>>> processor.get_dict("token", ["a", "b"]) == [{"form": "token", "1": "a", "2": "b"}]
True
>>> processor.get_dict("need", ["a", "b"]) == {"form": "need", "1": "REPLACED", "2": "b"}
>>> processor.get_dict("need", ["a", "b"]) == [{"form": "need", "1": "REPLACED", "2": "b"}]
True
"""
super(RuleBasedProcessor, self).__init__(head_processor=head_processor, **kwargs)
@@ -40,5 +40,5 @@ def reinsert(self, form: str) -> Dict[str, str]:
return self.rules(anno)
return anno

- def get_dict(self, token: str, tags: List[str]) -> Dict[str, str]:
-     return self.rules(self.head_processor.get_dict(token, tags))
+ def get_dict(self, token: str, tags: List[str]) -> List[Dict[str, str]]:
+     return [self.rules(anno) for anno in self.head_processor.get_dict(token, tags)]
2 changes: 1 addition & 1 deletion pie_extended/tagger.py
@@ -86,7 +86,7 @@ def iter_tag_token(self, data: str, iterator: DataIterator, processor: Processor
del sent_reinsertion[reinsertion_index + index]
reinsertion_index += 1

- yield processor.get_dict(token, tags)
+ yield from processor.get_dict(token, tags)

for reinsertion in sorted(list(sent_reinsertion.keys())):
yield processor.reinsert(sent_reinsertion[reinsertion])
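The single tagger change follows directly from the new return type: get_dict() now produces a list of rows per token, so the generator flattens it with yield from instead of yielding the list object itself. A toy illustration with stand-in functions (not the real Tagger.iter_tag_token):

```python
from typing import Dict, Iterator, List, Tuple


def fake_get_dict(token: str, tags: List[str]) -> List[Dict[str, str]]:
    # Stand-in for processor.get_dict() under the new List[Dict] contract.
    return [{"form": token, "lemma": tags[0]}]


def iter_rows(tokens_and_tags: List[Tuple[str, List[str]]]) -> Iterator[Dict[str, str]]:
    for token, tags in tokens_and_tags:
        yield from fake_get_dict(token, tags)  # flattens each per-token list


assert list(iter_rows([("a", ["x"]), ("b", ["y"])])) == \
    [{"form": "a", "lemma": "x"}, {"form": "b", "lemma": "y"}]
```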
