Skip to content

Commit

Permalink
Merge pull request #2 from flairNLP/master
Browse files Browse the repository at this point in the history
update
  • Loading branch information
megantosh authored Sep 8, 2020
2 parents 8df8fd2 + ff94fb5 commit 3ebfa73
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 3 deletions.
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
from .sequence_labeling import CONLL_03
from .sequence_labeling import CONLL_03_GERMAN
from .sequence_labeling import CONLL_03_DUTCH
from .sequence_labeling import TWITTER_NER
from .sequence_labeling import CONLL_03_SPANISH
from .sequence_labeling import CONLL_2000
from .sequence_labeling import DANE
from .sequence_labeling import EUROPARL_NER_GERMAN
from .sequence_labeling import GERMEVAL_14
from .sequence_labeling import INSPEC
from .sequence_labeling import LER_GERMAN
Expand Down
161 changes: 159 additions & 2 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,53 @@ def __init__(
in_memory=in_memory,
document_separator_token=None if not document_as_sequence else "-DOCSTART-",
)
class TWITTER_NER(ColumnCorpus):
    """Twitter NER corpus (Ritter et al.), auto-downloaded on first use."""

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
    ):
        """
        Initialize a dataset called twitter_ner which can be found on the following page:
        https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt.
        The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        # idiomatic type check (was: type(base_path) == str)
        if isinstance(base_path, str):
            base_path: Path = Path(base_path)

        # column format: token in column 0, NER tag in column 1
        columns = {0: 'text', 1: 'ner'}

        # this dataset name (cache sub-folder is derived from the class name)
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/"
        cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name)

        super(TWITTER_NER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            train_file="ner.txt",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
        )

class MIT_RESTAURANTS(ColumnCorpus):
def __init__(
Expand Down Expand Up @@ -494,12 +541,72 @@ def __init__(
)


def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1):
    """
    Add IOB tags in place if only chunk names are provided (e.g. words tagged PER
    instead of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since
    ColumnCorpus expects the letter 'O'. Additionally it removes lines with no tags
    in the data file and can also be used if the data is only partially IOB tagged.

    Parameters
    ----------
    data_file : Union[str, Path]
        Path to the data file (rewritten in place).
    encoding : str, optional
        Encoding used in open function. The default is "utf8".
    ner_column : int, optional
        Specifies the ner-tagged column. The default is 1 (the second column).
    """
    with open(file=data_file, mode='r', encoding=encoding) as f:
        lines = f.readlines()
    with open(file=data_file, mode='w', encoding=encoding) as f:
        for line in lines:
            tokens = line.split()
            if len(tokens) > 2:  # word with tags
                ner_tag = tokens[ner_column]
                if ner_tag in ('0', 'O'):  # no chunk: normalize '0' to 'O'
                    tokens[ner_column] = 'O'
                    f.write(' '.join(tokens) + '\n')
                elif '-' not in ner_tag:  # bare chunk name: prefix with 'I-'
                    # NOTE: the previous implementation tracked the preceding
                    # line's tag here, but both branches wrote an identical
                    # 'I-' prefix, so the state was dead and has been removed.
                    # (It also read the outer loop variable inside its helper
                    # instead of the helper's own parameter — fixed by merging.)
                    tokens[ner_column] = 'I-' + ner_tag
                    f.write(' '.join(tokens) + '\n')
                else:  # line already has an IOB tag (tag contains '-')
                    f.write(line)
            elif not tokens:  # empty sentence separator line
                f.write('\n')
            # lines with only 1 or 2 tokens carry no usable tag and are dropped


def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"):
"""
Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead
of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects
the letter 'O'. Additionaly it removes lines with no tags in the data file and can also
be used if the data is only partialy IOB tagged.
the letter 'O'. Additionally it removes lines with no tags in the data file and can also
be used if the data is only partially IOB tagged.
Parameters
----------
data_file : Union[str, Path]
Expand Down Expand Up @@ -702,6 +809,56 @@ def __init__(
)


class EUROPARL_NER_GERMAN(ColumnCorpus):
    """German Europarl NER corpus, auto-downloaded and IOB-normalized on first use."""

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = False,
    ):
        """
        Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: 'ner' by default, should not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
        """
        # idiomatic type check (was: type(base_path) == str)
        if isinstance(base_path, str):
            base_path: Path = Path(base_path)

        # column format: token, lemma, POS tag, NP chunk, NER tag
        columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'}

        # this dataset name (cache sub-folder is derived from the class name)
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/"
        cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name)
        cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name)

        # normalize the raw chunk tags to IOB in place; data_folder is already a
        # Path, so no extra Path(...) wrapping is needed
        add_IOB_tags(data_file=data_folder / "ep-96-04-15.conll", encoding="latin-1", ner_column=4)
        add_IOB_tags(data_file=data_folder / "ep-96-04-16.conll", encoding="latin-1", ner_column=4)

        super(EUROPARL_NER_GERMAN, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            train_file='ep-96-04-16.conll',
            test_file='ep-96-04-15.conll'
        )


class GERMEVAL_14(ColumnCorpus):
def __init__(
self,
Expand Down
2 changes: 1 addition & 1 deletion resources/docs/TUTORIAL_6_CORPUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ data the first time you call the corresponding constructor ID. The following dat
| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents |
| 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) |
| 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) |
| 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER] (https://github.com/klintan/swedish-ner-corpus/) 4-class NER |
| 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER |
| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection |
| 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
| 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
Expand Down

0 comments on commit 3ebfa73

Please sign in to comment.