diff --git a/flair/datasets.py b/flair/datasets.py
index 7f3c37b704..d567b54851 100644
--- a/flair/datasets.py
+++ b/flair/datasets.py
@@ -20,6 +20,7 @@ def __init__(
         test_file=None,
         dev_file=None,
         tag_to_biloes=None,
+        in_memory: bool = True,
     ):
         """
         Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
@@ -73,11 +74,15 @@ def __init__(
             log.info("Test: {}".format(test_file))

         # get train data
-        train = ColumnDataset(train_file, column_format, tag_to_biloes)
+        train = ColumnDataset(
+            train_file, column_format, tag_to_biloes, in_memory=in_memory
+        )

         # read in test file if exists, otherwise sample 10% of train data as test dataset
         if test_file is not None:
-            test = ColumnDataset(test_file, column_format, tag_to_biloes)
+            test = ColumnDataset(
+                test_file, column_format, tag_to_biloes, in_memory=in_memory
+            )
         else:
             train_length = len(train)
             test_size: int = round(train_length / 10)
@@ -87,7 +92,9 @@ def __init__(

         # read in dev file if exists, otherwise sample 10% of train data as dev dataset
         if dev_file is not None:
-            dev = ColumnDataset(dev_file, column_format, tag_to_biloes)
+            dev = ColumnDataset(
+                dev_file, column_format, tag_to_biloes, in_memory=in_memory
+            )
         else:
             train_length = len(train)
             dev_size: int = round(train_length / 10)
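The in_memory flag threaded through above can also be passed when building a ColumnCorpus on your own CoNLL-style files. A minimal usage sketch, assuming the flair 0.4-era constructor shown in this hunk; the folder, file names, and column map are illustrative, not part of this diff:

    from pathlib import Path
    from flair.datasets import ColumnCorpus

    # hypothetical task folder containing the three column-formatted files
    columns = {0: "text", 1: "ner"}
    corpus = ColumnCorpus(
        Path("resources/tasks/my_ner"),
        columns,
        train_file="train.txt",
        dev_file="dev.txt",
        test_file="test.txt",
        in_memory=False,  # keep only file offsets instead of Sentence objects
    )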
using "latin-1" instead.'.format( path_to_column_file ) ) - lines: List[str] = open( - str(path_to_column_file), encoding="latin1" - ).read().strip().split("\n") - - # most data sets have the token text in the first column, if not, pass 'text' as column - text_column: int = 0 - for column in column_name_map: - if column_name_map[column] == "text": - text_column = column + encoding = "latin1" sentence: Sentence = Sentence() - for line in lines: + with open(str(self.path_to_column_file), encoding=encoding) as f: - if line.startswith("#"): - continue + line = f.readline() + position = 0 - if line.strip().replace("", "") == "": - if len(sentence) > 0: - sentence.infer_space_after() - self.sentences.append(sentence) - sentence: Sentence = Sentence() + while line: - else: - fields: List[str] = re.split("\s+", line) - token = Token(fields[text_column]) - for column in column_name_map: - if len(fields) > column: - if column != text_column: - token.add_tag(column_name_map[column], fields[column]) + if line.startswith("#"): + line = f.readline() + continue - sentence.add_token(token) + if line.strip().replace("", "") == "": + if len(sentence) > 0: + sentence.infer_space_after() + if self.in_memory: + self.sentences.append(sentence) + else: + self.indices.append(position) + position = f.tell() + self.total_sentence_count += 1 + sentence: Sentence = Sentence() + + else: + fields: List[str] = re.split("\s+", line) + token = Token(fields[self.text_column]) + for column in column_name_map: + if len(fields) > column: + if column != self.text_column: + token.add_tag( + self.column_name_map[column], fields[column] + ) + + sentence.add_token(token) + + line = f.readline() if len(sentence.tokens) > 0: sentence.infer_space_after() - self.sentences.append(sentence) - - if tag_to_biloes is not None: - # convert tag scheme to iobes - for sentence in self.sentences: - sentence.convert_tag_scheme( - tag_type=tag_to_biloes, target_scheme="iobes" - ) + if self.in_memory: + self.sentences.append(sentence) + else: + self.indices.append(position) + self.total_sentence_count += 1 def __len__(self): - return len(self.sentences) + return self.total_sentence_count def __getitem__(self, index: int = 0) -> Sentence: - return self.sentences[index] + + if self.in_memory: + sentence = self.sentences[index] + else: + with open(str(self.path_to_column_file), encoding="utf-8") as file: + file.seek(self.indices[index]) + line = file.readline() + sentence: Sentence = Sentence() + while line: + if line.startswith("#"): + line = file.readline() + continue + + if line.strip().replace("", "") == "": + if len(sentence) > 0: + sentence.infer_space_after() + break + else: + fields: List[str] = re.split("\s+", line) + token = Token(fields[self.text_column]) + for column in self.column_name_map: + if len(fields) > column: + if column != self.text_column: + token.add_tag( + self.column_name_map[column], fields[column] + ) + + sentence.add_token(token) + line = file.readline() + + if self.tag_to_bioes is not None: + sentence.convert_tag_scheme( + tag_type=self.tag_to_bioes, target_scheme="iobes" + ) + + return sentence class UniversalDependenciesDataset(Dataset): @@ -459,7 +524,9 @@ def __getitem__(self, index: int = 0) -> Sentence: class CONLL_03(ColumnCorpus): - def __init__(self, base_path=None, tag_to_biloes: str = "ner"): + def __init__( + self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = True + ): # column format columns = {0: "text", 1: "pos", 2: "np", 3: "ner"} @@ -472,14 +539,54 @@ def __init__(self, base_path=None, 
tag_to_biloes: str = "ner"): base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'ACHTUNG: CoNLL-03 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/"' + ) + log.warning("-" * 100) + super(CONLL_03, self).__init__( - data_folder, columns, tag_to_biloes=tag_to_biloes + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory ) -class CONLL_03_DUTCH(ColumnCorpus): - def __init__(self, base_path=None, tag_to_biloes: str = "ner"): +class CONLL_03_GERMAN(ColumnCorpus): + def __init__( + self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = True + ): + + # column format + columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'ACHTUNG: CoNLL-03 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://www.clips.uantwerpen.be/conll2003/ner/"' + ) + log.warning("-" * 100) + + super(CONLL_03_GERMAN, self).__init__( + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory + ) + + +class CONLL_03_DUTCH(ColumnCorpus): + def __init__( + self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = True + ): # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -498,13 +605,14 @@ def __init__(self, base_path=None, tag_to_biloes: str = "ner"): cached_path(f"{conll_02_path}ned.train", Path("datasets") / dataset_name) super(CONLL_03_DUTCH, self).__init__( - data_folder, columns, tag_to_biloes=tag_to_biloes + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory ) class CONLL_03_SPANISH(ColumnCorpus): - def __init__(self, base_path=None, tag_to_biloes: str = "ner"): - + def __init__( + self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = True + ): # column format columns = {0: "text", 1: "ner"} @@ -523,12 +631,342 @@ def __init__(self, base_path=None, tag_to_biloes: str = "ner"): cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) super(CONLL_03_SPANISH, self).__init__( - data_folder, columns, tag_to_biloes=tag_to_biloes + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory + ) + + +class WNUT_17(ColumnCorpus): + def __init__( + self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = True + ): + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) + + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory + ) + + +class 
+
+
+class CONLL_2000(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "np", in_memory: bool = True
+    ):
+
+        # column format
+        columns = {0: "text", 1: "pos", 2: "np"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/"
+        data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt"
+        if not data_file.is_file():
+            cached_path(
+                f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name
+            )
+            cached_path(
+                f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name
+            )
+            import gzip, shutil
+
+            with gzip.open(
+                Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz",
+                "rb",
+            ) as f_in:
+                with open(
+                    Path(flair.cache_root) / "datasets" / dataset_name / "train.txt",
+                    "wb",
+                ) as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            with gzip.open(
+                Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb"
+            ) as f_in:
+                with open(
+                    Path(flair.cache_root) / "datasets" / dataset_name / "test.txt",
+                    "wb",
+                ) as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+        super(CONLL_2000, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory
+        )
+
+
+def _download_wikiner(language_code: str, dataset_name: str):
+    # download data if necessary
+    wikiner_path = (
+        "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/"
+    )
+    lc = language_code
+
+    data_file = (
+        Path(flair.cache_root)
+        / "datasets"
+        / dataset_name
+        / f"aij-wikiner-{lc}-wp3.train"
+    )
+    if not data_file.is_file():
+
+        cached_path(
+            f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name
+        )
+        import bz2, shutil
+
+        # unpack and write out in CoNLL column-like format
+        bz_file = bz2.BZ2File(
+            Path(flair.cache_root)
+            / "datasets"
+            / dataset_name
+            / f"aij-wikiner-{lc}-wp3.bz2",
+            "rb",
+        )
+        with bz_file as f, open(
+            Path(flair.cache_root)
+            / "datasets"
+            / dataset_name
+            / f"aij-wikiner-{lc}-wp3.train",
+            "w",
+        ) as out:
+            for line in f:
+                line = line.decode("utf-8")
+                words = line.split(" ")
+                for word in words:
+                    out.write("\t".join(word.split("|")) + "\n")
+
+
+class WIKINER_ENGLISH(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False
+    ):
+        # column format
+        columns = {0: "text", 1: "pos", 2: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        _download_wikiner("en", dataset_name)
+
+        super(WIKINER_ENGLISH, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory
+        )
+
+
+class WIKINER_GERMAN(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False
+    ):
+        # column format
+        columns = {0: "text", 1: "pos", 2: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        _download_wikiner("de", dataset_name)
+
+        super(WIKINER_GERMAN, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory
+        )
+
+
+class WIKINER_DUTCH(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False
+    ):
+        # column format
+        columns = {0: "text", 1: "pos", 2: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        _download_wikiner("nl", dataset_name)
+
+        super(WIKINER_DUTCH, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory
+        )
+
+
+class WIKINER_FRENCH(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False
+    ):
+        # column format
+        columns = {0: "text", 1: "pos", 2: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        _download_wikiner("fr", dataset_name)
+
+        super(WIKINER_FRENCH, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory
+        )
+
+
+class WIKINER_ITALIAN(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False
+    ):
+        # column format
+        columns = {0: "text", 1: "pos", 2: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        _download_wikiner("it", dataset_name)
+
+        super(WIKINER_ITALIAN, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory
+        )
+
+
+class WIKINER_SPANISH(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False
+    ):
+        # column format
+        columns = {0: "text", 1: "pos", 2: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        _download_wikiner("es", dataset_name)
+
+        super(WIKINER_SPANISH, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory
+        )
+
+
+class WIKINER_PORTUGUESE(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False
+    ):
+        # column format
+        columns = {0: "text", 1: "pos", 2: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        _download_wikiner("pt", dataset_name)
+
+        super(WIKINER_PORTUGUESE, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory
+        )
+
+
+class WIKINER_POLISH(ColumnCorpus):
+    def __init__(
+        self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False
+    ):
+        # column format
+        columns = {0: "text", 1: "pos", 2: "ner"}
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + _download_wikiner("pl", dataset_name) + + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory + ) + + +class WIKINER_RUSSIAN(ColumnCorpus): + def __init__( + self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = False + ): + # column format + columns = {0: "text", 1: "pos", 2: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + _download_wikiner("ru", dataset_name) + + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory ) class GERMEVAL(ColumnCorpus): - def __init__(self, base_path=None, tag_to_biloes: str = "ner"): + def __init__( + self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = True + ): # column format columns = {1: "text", 2: "ner"} @@ -541,16 +979,26 @@ def __init__(self, base_path=None, tag_to_biloes: str = "ner"): base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'ACHTUNG: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/home/"' + ) + log.warning("-" * 100) super(GERMEVAL, self).__init__( - data_folder, columns, tag_to_biloes=tag_to_biloes + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory ) -class CONLL_2000(ColumnCorpus): - def __init__(self, base_path=None, tag_to_biloes: str = "np"): +class NER_BASQUE(ColumnCorpus): + def __init__( + self, base_path=None, tag_to_biloes: str = "ner", in_memory: bool = True + ): # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -560,14 +1008,35 @@ def __init__(self, base_path=None, tag_to_biloes: str = "np"): base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - super(CONLL_2000, self).__init__( - data_folder, columns, tag_to_biloes=tag_to_biloes + # download data if necessary + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil + + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) + + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_biloes=tag_to_biloes, in_memory=in_memory ) class UD_ENGLISH(UniversalDependenciesCorpus): def __init__(self, base_path=None): - # this dataset name dataset_name = self.__class__.__name__.lower() @@ -591,7 +1060,6 @@ def __init__(self, base_path=None): class UD_GERMAN(UniversalDependenciesCorpus): def __init__(self, base_path=None): - # this dataset name dataset_name = 
self.__class__.__name__.lower()
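Because ColumnDataset implements __len__ and __getitem__ over either storage backend, the lazy variant also composes with a standard PyTorch DataLoader. A sketch under the same API assumptions as above, not code from this diff; note that the WikiNER corpora deliberately default to in_memory=False given their size:

    from torch.utils.data import DataLoader
    from flair.datasets import WIKINER_ENGLISH

    corpus = WIKINER_ENGLISH()  # defaults to in_memory=False in this diff
    # Sentence objects are not tensors, so collate batches as plain lists
    loader = DataLoader(corpus.train, batch_size=32, collate_fn=list)
    for batch in loader:
        pass  # a training step would consume the list of Sentences here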