Skip to content

Commit

Permalink
Merge pull request #2 from flairNLP/master
Browse files Browse the repository at this point in the history
update
  • Loading branch information
megantosh authored Sep 8, 2020
2 parents 8df8fd2 + ff94fb5 commit 3ebfa73
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 3 deletions.
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
from .sequence_labeling import CONLL_03
from .sequence_labeling import CONLL_03_GERMAN
from .sequence_labeling import CONLL_03_DUTCH
from .sequence_labeling import TWITTER_NER
from .sequence_labeling import CONLL_03_SPANISH
from .sequence_labeling import CONLL_2000
from .sequence_labeling import DANE
from .sequence_labeling import EUROPARL_NER_GERMAN
from .sequence_labeling import GERMEVAL_14
from .sequence_labeling import INSPEC
from .sequence_labeling import LER_GERMAN
Expand Down
161 changes: 159 additions & 2 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,53 @@ def __init__(
in_memory=in_memory,
document_separator_token=None if not document_as_sequence else "-DOCSTART-",
)
class TWITTER_NER(ColumnCorpus):
    """Twitter NER corpus (Ritter et al.), auto-downloaded on first use."""

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
    ):
        """
        Initialize a dataset called twitter_ner which can be found on the following page:
        https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt.
        The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: NER by default, need not be changed
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
        """
        # idiomatic type check (was: type(base_path) == str)
        if isinstance(base_path, str):
            base_path: Path = Path(base_path)

        # column format: token in column 0, NER tag in column 1
        columns = {0: 'text', 1: 'ner'}

        # this dataset name (cache sub-folder is derived from the class name)
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/"
        cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name)

        super(TWITTER_NER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            train_file="ner.txt",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
        )

class MIT_RESTAURANTS(ColumnCorpus):
def __init__(
Expand Down Expand Up @@ -494,12 +541,72 @@ def __init__(
)


def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1):
    """
    Add IOB tags in place if only chunk names are provided (e.g. words tagged PER
    instead of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since
    ColumnCorpus expects the letter 'O'. Additionally it removes lines with no tags
    in the data file and can also be used if the data is only partially IOB tagged.

    Parameters
    ----------
    data_file : Union[str, Path]
        Path to the data file (rewritten in place).
    encoding : str, optional
        Encoding used in open function. The default is "utf8".
    ner_column : int, optional
        Specifies the ner-tagged column. The default is 1 (the second column).
    """
    with open(file=data_file, mode='r', encoding=encoding) as f:
        lines = f.readlines()
    with open(file=data_file, mode='w', encoding=encoding) as f:
        for line in lines:
            tokens = line.split()
            if len(tokens) > 2:  # word with tags
                ner_tag = tokens[ner_column]
                if ner_tag in ('0', 'O'):  # no chunk: normalize '0' to 'O'
                    tokens[ner_column] = 'O'
                    f.write(' '.join(tokens) + '\n')
                elif '-' not in ner_tag:  # bare chunk name: prefix with 'I-'
                    # NOTE: the previous implementation tracked the preceding
                    # line's tag here, but both branches wrote an identical
                    # 'I-' prefix, so the state was dead and has been removed.
                    # (It also read the outer loop variable inside its helper
                    # instead of the helper's own parameter — fixed by merging.)
                    tokens[ner_column] = 'I-' + ner_tag
                    f.write(' '.join(tokens) + '\n')
                else:  # line already has an IOB tag (tag contains '-')
                    f.write(line)
            elif not tokens:  # empty sentence separator line
                f.write('\n')
            # lines with only 1 or 2 tokens carry no usable tag and are dropped


def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"):
"""
Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead
of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects
the letter 'O'. Additionaly it removes lines with no tags in the data file and can also
be used if the data is only partialy IOB tagged.
the letter 'O'. Additionally it removes lines with no tags in the data file and can also
be used if the data is only partially IOB tagged.
Parameters
----------
data_file : Union[str, Path]
Expand Down Expand Up @@ -702,6 +809,56 @@ def __init__(
)


class EUROPARL_NER_GERMAN(ColumnCorpus):
    """German Europarl NER corpus, auto-downloaded and IOB-normalized on first use."""

    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = False,
    ):
        """
        Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically
        download the dataset.
        :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder but typically this should not be necessary.
        :param tag_to_bioes: 'ner' by default, should not be changed.
        :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage.
        """
        # idiomatic type check (was: type(base_path) == str)
        if isinstance(base_path, str):
            base_path: Path = Path(base_path)

        # column format: token, lemma, POS tag, NP chunk, NER tag
        columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'}

        # this dataset name (cache sub-folder is derived from the class name)
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/"
        cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name)
        cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name)

        # normalize the raw chunk tags to IOB in place; data_folder is already a
        # Path, so no extra Path(...) wrapping is needed
        add_IOB_tags(data_file=data_folder / "ep-96-04-15.conll", encoding="latin-1", ner_column=4)
        add_IOB_tags(data_file=data_folder / "ep-96-04-16.conll", encoding="latin-1", ner_column=4)

        super(EUROPARL_NER_GERMAN, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
            train_file='ep-96-04-16.conll',
            test_file='ep-96-04-15.conll'
        )


class GERMEVAL_14(ColumnCorpus):
def __init__(
self,
Expand Down
2 changes: 1 addition & 1 deletion resources/docs/TUTORIAL_6_CORPUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ data the first time you call the corresponding constructor ID. The following dat
| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents |
| 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) |
| 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) |
| 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER] (https://github.com/klintan/swedish-ner-corpus/) 4-class NER |
| 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER |
| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection |
| 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
| 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
Expand Down

0 comments on commit 3ebfa73

Please sign in to comment.