From 4e0067c4f6249c72394a5700e5195aec7c747570 Mon Sep 17 00:00:00 2001
From: Benedikt Fuchs
Date: Mon, 27 Mar 2023 14:59:39 +0200
Subject: [PATCH 01/12] configure docstring tests for google docstrings

---
 .flake8              | 3 +++
 pyproject.toml       | 2 +-
 requirements-dev.txt | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..be1e976a3a
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+
+docstring-convention=google
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 2d83636b46..02c1a71f07 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ exclude = '''
 '''
 [tool.pytest.ini_options]
 flake8-max-line-length = 210
-flake8-ignore = ["E203", "W503"]  # See https://github.com/PyCQA/pycodestyle/issues/373
+flake8-ignore = ["E203", "W503", "D100", "D101", "D102", "D103", "D104", "D105", "D107"]  # See https://github.com/PyCQA/pycodestyle/issues/373
 addopts = "--flake8 --mypy --isort"
 filterwarnings = [
     "error",  # Convert all warnings to errors
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b846f86b98..b853e91b38 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -12,3 +12,4 @@ types-requests
 pytest-github-actions-annotate-failures
 konoha<5.0.0,>=4.0.0
 pre-commit
+flake8-docstrings
\ No newline at end of file

From 7e321f830eb7a56a3f2f5e1c3c6503b781ff2bdd Mon Sep 17 00:00:00 2001
From: Benedikt Fuchs
Date: Mon, 27 Mar 2023 16:14:22 +0200
Subject: [PATCH 02/12] fix documentation format for non model/embedding/dataset modules

---
 flair/data.py                                 | 104 +++++++++---------
 flair/file_utils.py                           |  39 +++----
 flair/inference_utils.py                      |  24 ++--
 flair/models/sequence_tagger_utils/crf.py     |  10 +-
 flair/models/sequence_tagger_utils/viterbi.py |  40 +++----
 flair/models/tars_model.py                    |  65 +++++------
 flair/models/text_classification_model.py     |   9 +-
 flair/models/word_tagger_model.py             |   8 +-
 flair/nn/distance/cosine.py                   |   4 +-
 flair/nn/distance/euclidean.py                |   6 +-
 flair/nn/distance/hyperbolic.py               |   8 +-
 flair/nn/dropout.py                           |  10 +-
 flair/nn/model.py                             |  76 ++++++++-----
 flair/optim.py                                |  18 +--
 flair/samplers.py                             |  23 ++--
 flair/splitter.py                             |  18 ++-
 flair/tokenization.py                         |  26 ++---
 flair/trainers/trainer.py                     |   4 +-
 flair/training_utils.py                       |  13 ++-
 flair/visual/ner_html.py                      |   3 +-
 flair/visual/training_curves.py               |   5 +-
 21 files changed, 260 insertions(+), 253 deletions(-)

diff --git a/flair/data.py b/flair/data.py
index 5ccd47405f..271f7271b4 100644
--- a/flair/data.py
+++ b/flair/data.py
@@ -43,9 +43,7 @@ def _len_dataset(dataset: Optional[Dataset]) -> int:


 class Dictionary:
-    """
-    This class holds a dictionary that maps strings to IDs, used to generate one-hot encodings of strings.
-    """
+    """This class holds a dictionary that maps strings to IDs, used to generate one-hot encodings of strings."""

     def __init__(self, add_unk=True):
         # init dictionaries
@@ -65,8 +63,8 @@ def remove_item(self, item: str):
         del self.item2idx[bytes_item]

     def add_item(self, item: str) -> int:
-        """
-        add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID.
+        """Add string - if already in dictionary returns its ID. If not in dictionary, it will get a new ID.
+
         :param item: a string for which to assign an ID.
        :return: ID of string
        """
@@ -77,8 +75,8 @@ def add_item(self, item: str) -> int:
         return self.item2idx[bytes_item]

     def get_idx_for_item(self, item: str) -> int:
-        """
-        returns the ID of the string, otherwise 0
+        """Returns the ID of the string, otherwise 0.
+
        :param item: string for which ID is requested
        :return: ID of string, otherwise 0
        """
@@ -95,8 +93,8 @@ def get_idx_for_item(self, item: str) -> int:
             raise IndexError

     def get_idx_for_items(self, items: List[str]) -> List[int]:
-        """
-        returns the IDs for each item of the list of string, otherwise 0 if not found
+        """Returns the IDs for each item of the list of strings, otherwise 0 if not found.
+
        :param items: List of strings for which IDs are requested
        :return: List of IDs of strings
        """
@@ -198,9 +196,10 @@ def __str__(self):


 class Label:
-    """
-    This class represents a label. Each label has a value and optionally a confidence score. The
-    score needs to be between 0.0 and 1.0. Default value for the score is 1.0.
+    """This class represents a label.
+
+    Each label has a value and optionally a confidence score. The score needs to be between 0.0 and 1.0.
+    Default value for the score is 1.0.
     """

     def __init__(self, data_point: "DataPoint", value: str, score: float = 1.0):
@@ -253,10 +252,11 @@ def unlabeled_identifier(self):


 class DataPoint:
-    """
-    This is the parent class of all data points in Flair (including Token, Sentence, Image, etc.). Each DataPoint
-    must be embeddable (hence the abstract property embedding() and methods to() and clear_embeddings()). Also,
-    each DataPoint may have Labels in several layers of annotation (hence the functions add_label(), get_labels()
+    """This is the parent class of all data points in Flair.
+
+    Examples of data points are Token, Sentence, Image, etc.
+    Each DataPoint must be embeddable (hence the abstract property embedding() and methods to() and clear_embeddings()).
+    Also, each DataPoint may have Labels in several layers of annotation (hence the functions add_label(), get_labels()
     and the property 'label')
     """

@@ -456,9 +456,9 @@ def remove_labels(self, typename: str):


 class Token(_PartOfSentence):
-    """
-    This class represents one word in a tokenized sentence. Each token may have any number of tags. It may also point
-    to its head in a dependency tree.
+    """This class represents one word in a tokenized sentence.
+
+    Each token may have any number of tags. It may also point to its head in a dependency tree.
     """

     def __init__(
@@ -530,20 +530,16 @@ def __repr__(self):
         return self.__str__()

     def add_label(self, typename: str, value: str, score: float = 1.0):
-        """
-        The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
-        Therefore, labels get added only to the Sentence if it exists
-        """
+        # The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
+        # Therefore, labels get added only to the Sentence if it exists.
         if self.sentence:
             super().add_label(typename=typename, value=value, score=score)
         else:
             DataPoint.add_label(self, typename=typename, value=value, score=score)

     def set_label(self, typename: str, value: str, score: float = 1.0):
-        """
-        The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
-        Therefore, labels get set only to the Sentence if it exists
-        """
+        # The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
+        # Therefore, labels get set only to the Sentence if it exists.
         if self.sentence:
             super().set_label(typename=typename, value=value, score=score)
         else:
@@ -551,9 +547,7 @@ def set_label(self, typename: str, value: str, score: float = 1.0):


 class Span(_PartOfSentence):
-    """
-    This class represents one textual span consisting of Tokens.
-    """
+    """This class represents one textual span consisting of Tokens."""

     def __new__(self, tokens: List[Token]):
         # check if the span already exists. If so, return it
@@ -674,9 +668,7 @@ def embedding(self):


 class Sentence(DataPoint):
-    """
-    A Sentence is a list of tokens and is used to represent a sentence or text fragment.
-    """
+    """A Sentence is a list of tokens and is used to represent a sentence or text fragment."""

     def __init__(
         self,
@@ -685,8 +677,9 @@ def __init__(
         language_code: str = None,
         start_position: int = 0,
     ):
-        """
-        Class to hold all meta related to a text (tokens, predictions, language code, ...)
+        """Class to hold all metadata related to a text.
+
+        Metadata can be tokens, predictions, language code, ...
        :param text: original string (sentence), or a list of string tokens (words)
        :param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`)
            more advanced options are :class:`SegTokTokenizer` to use segtok or :class:`SpacyTokenizer`
@@ -905,9 +898,10 @@ def to_plain_string(self):
         return plain.rstrip()

     def infer_space_after(self):
-        """
-        Heuristics in case you wish to infer whitespace_after values for tokenized text. This is useful for some old NLP
-        tasks (such as CoNLL-03 and CoNLL-2000) that provide only tokenized data with no info of original whitespacing.
+        """Heuristics in case you wish to infer whitespace_after values for tokenized text.
+
+        This is useful for some old NLP tasks (such as CoNLL-03 and CoNLL-2000) that provide only tokenized data with
+        no information about the original whitespace.
        :return:
        """
         last_token = None
@@ -1034,8 +1028,9 @@ def to_windows_1252(match):
         return re.sub(r"[\u0080-\u0099]", to_windows_1252, text)

     def next_sentence(self):
-        """
-        Get the next sentence in the document (works only if context is set through dataloader or elsewhere)
+        """Get the next sentence in the document.
+
+        This only works if context is set through a dataloader or elsewhere.
        :return: next Sentence in document if set, otherwise None
        """
         if self._next_sentence is not None:
@@ -1050,8 +1045,9 @@ def next_sentence(self):
         return None

     def previous_sentence(self):
-        """
-        Get the previous sentence in the document (works only if context is set through dataloader or elsewhere)
+        """Get the previous sentence in the document.
+
+        Works only if context is set through a dataloader or elsewhere.
        :return: previous Sentence in document if set, otherwise None
        """
         if self._previous_sentence is not None:
@@ -1066,7 +1062,8 @@ def previous_sentence(self):
         return None

     def is_context_set(self) -> bool:
-        """
+        """Determines if this sentence has a context of sentences before or after set.
+
         Return True or False depending on whether context is set (for instance in dataloader or elsewhere)
        :return: True if context is set, else False
        """
@@ -1317,8 +1314,8 @@ def _filter_empty_sentences(dataset) -> Dataset:
         return subset

     def make_vocab_dictionary(self, max_tokens=-1, min_freq=1) -> Dictionary:
-        """
-        Creates a dictionary of all tokens contained in the corpus.
+        """Creates a dictionary of all tokens contained in the corpus.
+
        By defining `max_tokens` you can set the maximum number of tokens that should be contained in the dictionary.
        If there are more than `max_tokens` tokens in the corpus, the most frequent tokens are added first.
        If `min_freq` is set to a value greater than 1, only tokens occurring more than `min_freq` times are considered
@@ -1358,9 +1355,9 @@ def _downsample_to_proportion(dataset: Dataset, proportion: float):
         return splits[0]

     def obtain_statistics(self, label_type: str = None, pretty_print: bool = True) -> Union[dict, str]:
-        """
-        Print statistics about the class distribution (only labels of sentences are taken into account) and sentence
-        sizes.
+        """Print statistics about the class distribution and sentence sizes.
+
+        Only labels of sentences are taken into account.
         """
         json_data = {
             "TRAIN": self._obtain_statistics_for(self.train, "TRAIN", label_type),
@@ -1435,8 +1432,8 @@ def __str__(self) -> str:
     def make_label_dictionary(
         self, label_type: str, min_count: int = -1, add_unk: bool = True, add_dev_test: bool = False
     ) -> Dictionary:
-        """
-        Creates a dictionary of all labels assigned to the sentences in the corpus.
+        """Creates a dictionary of all labels assigned to the sentences in the corpus.
+
        :return: dictionary of labels
        """
         if min_count > 0 and not add_unk:
@@ -1517,8 +1514,8 @@ def add_label_noise(
         split: str = "train",
         noise_transition_matrix: Optional[Dict[str, List[float]]] = None,
     ):
-        """
-        Generates uniform label noise distribution in the chosen dataset split.
+        """Generates uniform label noise distribution in the chosen dataset split.
+
        :param label_type: the type of labels for which the noise should be simulated.
        :param labels: an array with unique labels of said type (retrievable from label dictionary).
        :param noise_share: the desired share of noise in the train split.
@@ -1728,7 +1725,8 @@ def cummulative_sizes(self):


 def iob2(tags):
-    """
+    """Converts the tags to the IOB2 format.
+
     Check that tags have a valid IOB format.
     Tags in IOB1 format are converted to IOB2.
     """
diff --git a/flair/file_utils.py b/flair/file_utils.py
index fbf5e1e943..bea3927f19 100644
--- a/flair/file_utils.py
+++ b/flair/file_utils.py
@@ -1,6 +1,4 @@
-"""
-Utilities for working with the local dataset cache. Copied from AllenNLP
-"""
+"""Utilities for working with the local dataset cache. Copied from AllenNLP."""
 import base64
 import functools
 import io
@@ -32,8 +30,9 @@


 def set_proxies(proxies: typing.Dict[str, str]) -> None:
-    """
-    Allows for data downloaded from urls to be forwarded to a proxy, see https://requests.readthedocs.io/en/latest/user/advanced/#proxies
+    """Allows for data downloaded from URLs to be forwarded to a proxy.
+
+    See https://requests.readthedocs.io/en/latest/user/advanced/#proxies
    :param proxies: A dictionary of proxies according to the requests documentation.
    :return: None
    """
@@ -42,8 +41,9 @@


 def load_big_file(f: str):
-    """
-    Workaround for loading a big pickle file. Files over 2GB cause pickle errors on certain Mac and Windows distributions.
+    """Workaround for loading a big pickle file.
+
+    Files over 2GB cause pickle errors on certain Mac and Windows distributions.
    :param f:
    :return:
    """
@@ -55,8 +55,8 @@


 def url_to_filename(url: str, etag: str = None) -> str:
-    """
-    Converts a url into a filename in a reversible way.
+    """Converts a URL into a filename in a reversible way.
+
     If `etag` is specified, add it on the end, separated by a period (which necessarily won't appear in the
     base64-encoded filename). Get rid of the quotes in the etag, since Windows doesn't like them.
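As an editorial aside to the `url_to_filename` docstring above: a minimal usage sketch may make the "reversible" claim concrete. This example is illustrative only and not part of the patch; it assumes the documented signatures (`url_to_filename(url, etag=None)` and its inverse `filename_to_url`) are importable from `flair.file_utils`.

```python
# Illustrative sketch, not part of the patch: round-tripping the reversible
# filename encoding documented above.
from flair.file_utils import filename_to_url, url_to_filename

filename = url_to_filename("https://example.com/model.bin", etag='"abc123"')

# The base64 encoding is reversible, so the original URL can be recovered.
url, etag = filename_to_url(filename)
assert url == "https://example.com/model.bin"
```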
@@ -74,9 +74,9 @@


 def filename_to_url(filename: str) -> Tuple[str, Optional[str]]:
-    """
-    Recovers the the url from the encoded filename. Returns it and the ETag
-    (which may be ``None``)
+    """Recovers the URL from the encoded filename.
+
+    Returns it and the ETag (which may be ``None``).
     """
     etag: Optional[str]
     try:
@@ -92,7 +92,8 @@ def filename_to_url(filename: str) -> Tuple[str, Optional[str]]:


 def cached_path(url_or_filename: str, cache_dir: Union[str, Path]) -> Path:
-    """
+    """Resolve the given path or URL and return the local path from the cache.
+
     Given something that might be a URL (or might be a local path),
     determine which. If it's a URL, download the file and cache it, and
     return the path to the cached file. If it's already a local path,
@@ -147,8 +148,7 @@ def unzip_file(file: Union[str, Path], unzip_to: Union[str, Path]):


 def unpack_file(file: Path, unpack_to: Path, mode: str = None, keep: bool = True):
-    """
-    Unpacks a file to the given location.
+    """Unpacks an archive file to the given location.

    :param file: Archive file to unpack
    :param unpack_to: Destination where to store the output
@@ -198,9 +198,9 @@ def unpack_file(file: Path, unpack_to: Path, mode: str = None, keep: bool = True

 # TODO(joelgrus): do we want to do checksums or anything like that?
 def get_from_cache(url: str, cache_dir: Path) -> Path:
-    """
-    Given a URL, look for the corresponding dataset in the local cache.
-    If it's not there, download it. Then return the path to the cached file.
+    """Given a URL, look for the corresponding file in the local cache or download it.
+
+    :return: the path to the cached file.
     """
     cache_dir.mkdir(parents=True, exist_ok=True)
@@ -317,7 +317,8 @@ def set_default_mininterval(value: float) -> None:

     @staticmethod
     def set_slower_interval(use_slower_interval: bool) -> None:
-        """
+        """Slows down the tqdm update interval.
+
         If ``use_slower_interval`` is ``True``, we will dramatically slow down
         ``tqdm's`` default output rate. ``tqdm's`` default output rate is great for
         interactively watching progress, but it is not great for log files. You might
         want to set this if you are primarily going
diff --git a/flair/inference_utils.py b/flair/inference_utils.py
index b513b3bd92..ed4240b581 100644
--- a/flair/inference_utils.py
+++ b/flair/inference_utils.py
@@ -20,8 +20,7 @@


 class WordEmbeddingsStore:
-    """
-    class to simulate a WordEmbeddings class from flair.
+    """Class to simulate a WordEmbeddings class from flair.

     Run this to generate a headless (without word embeddings) model as well as stored word embeddings:
@@ -63,7 +62,8 @@ class WordEmbeddingsStore:
     """

     def __init__(self, embedding: WordEmbeddings, backend="sqlite", verbose=True):
-        """
+        """Instantiates the WordEmbeddingsStore.
+
        :param embedding: Flair WordEmbeddings instance.
        :param backend: cache database backend name e.g. ``'sqlite'``, ``'lmdb'``.
                        Default value is ``'sqlite'``.
@@ -101,9 +101,7 @@ def get_names(self):

     @staticmethod
     def _get_store_path(embedding, backend="sqlite"):
-        """
-        get the filename of the store
-        """
+        """Get the filename of the store."""
         cache_dir = flair.cache_root
         embedding_filename = re.findall("/(embeddings/.*)", embedding.name)[0]
         store_path = cache_dir / (embedding_filename + "." + backend)
@@ -123,9 +121,9 @@ def _word_embeddings(model):

     @staticmethod
     def create_stores(model, backend="sqlite"):
-        """
-        creates database versions of all word embeddings in the model and
-        deletes the original vectors to save memory
+        """Creates database versions of all word embeddings in the model.
+
+        Also deletes the original vectors to save memory.
         """
         for embedding in WordEmbeddingsStore._word_embeddings(model):
             if type(embedding) == WordEmbeddings:
@@ -134,9 +132,7 @@ def create_stores(model, backend="sqlite"):

     @staticmethod
     def load_stores(model, backend="sqlite"):
-        """
-        loads the db versions of all word embeddings in the model
-        """
+        """Loads the db versions of all word embeddings in the model."""
         embeds = WordEmbeddingsStore._word_embeddings(model)
         for i, embedding in enumerate(embeds):
             if type(embedding) == WordEmbeddings:
@@ -144,9 +140,7 @@ def load_stores(model, backend="sqlite"):

     @staticmethod
     def delete_stores(model, backend="sqlite"):
-        """
-        deletes the db versions of all word embeddings
-        """
+        """Deletes the db versions of all word embeddings."""
         for embedding in WordEmbeddingsStore._word_embeddings(model):
             store_path: Path = WordEmbeddingsStore._get_store_path(embedding)
             logger.info(f"delete store: {str(store_path)}")
diff --git a/flair/models/sequence_tagger_utils/crf.py b/flair/models/sequence_tagger_utils/crf.py
index 3b37d1aeec..a6fbb6609d 100644
--- a/flair/models/sequence_tagger_utils/crf.py
+++ b/flair/models/sequence_tagger_utils/crf.py
@@ -7,14 +7,16 @@


 class CRF(torch.nn.Module):
-    """
+    """Conditional Random Field.
+
     Conditional Random Field Implementation according to sgrvinod (https://github.com/sgrvinod).
     Classifier which predicts a single tag / class / label for a given word
     based on not just the word, but also on previously seen annotations.
     """

     def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool):
-        """
+        """Initialize the Conditional Random Field.
+
        :param tag_dictionary: tag dictionary in order to find ID for start and stop tags
        :param tagset_size: number of tags in the tag dictionary
        :param init_from_state_dict: whether we load pretrained model from state dict
@@ -33,8 +35,8 @@ def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool)
         self.to(flair.device)

     def forward(self, features: torch.Tensor) -> torch.Tensor:
-        """
-        Forward propagation of Conditional Random Field.
+        """Forward propagation of Conditional Random Field.
+
        :param features: output from RNN / Linear layer in shape (batch size, seq len, hidden size)
        :return: CRF scores (emission scores for each token + transition probabilities from the previous state) in
            shape (batch_size, seq len, tagset size, tagset size)
diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py
index 607a39cf66..3d2d173b24 100644
--- a/flair/models/sequence_tagger_utils/viterbi.py
+++ b/flair/models/sequence_tagger_utils/viterbi.py
@@ -14,12 +14,11 @@


 class ViterbiLoss(torch.nn.Module):
-    """
-    Calculates the loss for each sequence up to its length t.
-    """
+    """Calculates the loss for each sequence up to its length t."""

     def __init__(self, tag_dictionary: Dictionary):
-        """
+        """Create an instance of the Viterbi loss.
+
+        :param tag_dictionary: tag dictionary of the task
         """
         super(ViterbiLoss, self).__init__()
@@ -29,8 +28,7 @@ def __init__(self, tag_dictionary: Dictionary):
         self.stop_tag = tag_dictionary.get_idx_for_item(STOP_TAG)

     def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor:
-        """
-        Forward propagation of Viterbi Loss
+        """Forward propagation of Viterbi Loss.

        :param features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size),
            lengths of sentences in batch, transitions from CRF
@@ -82,8 +80,7 @@ def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor:

     @staticmethod
     def _log_sum_exp(tensor, dim):
-        """
-        Calculates the log-sum-exponent of a tensor's dimension in a numerically stable way.
+        """Calculates the log-sum-exponent of a tensor's dimension in a numerically stable way.

        :param tensor: tensor
        :param dim: dimension to calculate log-sum-exp of
@@ -94,13 +91,13 @@ def _log_sum_exp(tensor, dim):
         return m + torch.log(torch.sum(torch.exp(tensor - m_expanded), dim))

     def _format_targets(self, targets: torch.Tensor, lengths: torch.IntTensor):
-        """
-        Formats targets into matrix indices.
+        """Formats targets into matrix indices.
+
         CRF scores contain per sentence, per token a (tagset_size x tagset_size) matrix, containing emission score for
-        token j + transition prob from previous token i. Means, if we think of our rows as "to tag" and our columns
-        as "from tag", the matrix in cell [10,5] would contain the emission score for tag 10 + transition score
-        from previous tag 5 and could directly be addressed through the 1-dim indices (10 + tagset_size * 5) = 70,
-        if our tagset consists of 12 tags.
+        token j + transition prob from previous token i. This means: if we think of our rows as "to tag" and our
+        columns as "from tag", the matrix in cell [10,5] would contain the emission score for tag 10 + transition
+        score from previous tag 5 and could directly be addressed through the 1-dim index (10 + tagset_size * 5) = 70,
+        if our tagset consists of 12 tags.

        :param targets: targets as in tag dictionary
        :param lengths: lengths of sentences in batch
@@ -127,12 +124,11 @@


 class ViterbiDecoder:
-    """
-    Decodes a given sequence using the Viterbi algorithm.
-    """
+    """Decodes a given sequence using the Viterbi algorithm."""

     def __init__(self, tag_dictionary: Dictionary):
-        """
+        """Initialize the Viterbi Decoder.
+
        :param tag_dictionary: Dictionary of tags for sequence labeling task
        """
         self.tag_dictionary = tag_dictionary
@@ -143,8 +139,8 @@ def __init__(self, tag_dictionary: Dictionary):
     def decode(
         self, features_tuple: tuple, probabilities_for_all_classes: bool, sentences: List[Sentence]
     ) -> Tuple[List, List]:
-        """
-        Decoding function returning the most likely sequence of tags.
+        """Decoding function returning the most likely sequence of tags.
+
        :param features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size),
            lengths of sentence in batch, transitions of CRF
        :param probabilities_for_all_classes: whether to return probabilities for all tags
@@ -224,8 +220,8 @@ def decode(

     def _all_scores_for_token(self, scores: torch.Tensor, lengths: torch.IntTensor, sentences: List[Sentence]):
-        """
-        Returns all scores for each tag in tag dictionary.
+        """Returns all scores for each tag in tag dictionary.
+
        :param scores: Scores for current sentence.
""" scores = scores.numpy() diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index 8631b5820c..3dfc871e2e 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -118,7 +118,7 @@ def _get_nearest_labels_for(self, labels): return already_sampled_negative_labels def train(self, mode=True): - """Populate label similarity map based on cosine similarity before running epoch + """Populate label similarity map based on cosine similarity before running epoch. If the `num_negative_labels_to_sample` is set to an integer value then before starting each epoch the model would create a similarity measure between the label names based @@ -126,15 +126,11 @@ def train(self, mode=True): """ if mode and self.num_negative_labels_to_sample is not None: self._compute_label_similarity_for_current_epoch() - super().train(mode) super().train(mode) def _compute_label_similarity_for_current_epoch(self): - """ - Compute the similarity between all labels for better sampling of negatives - """ - + """Compute the similarity between all labels for better sampling of negatives.""" # get and embed all labels by making a Sentence object that contains only the label text all_labels = [label.decode("utf-8") for label in self.get_current_label_dictionary().idx2item] label_sentences = [Sentence(label) for label in all_labels] @@ -182,10 +178,11 @@ def add_and_switch_to_new_task( multi_label: bool = True, force_switch: bool = False, ): - """ - Adds a new task to an existing TARS model. Sets necessary attributes and finally 'switches' - to the new task. Parameters are similar to the constructor except for model choice, batch - size and negative sampling. This method does not store the resultant model onto disk. + """Adds a new task to an existing TARS model. + + Sets necessary attributes and finally 'switches' to the new task. Parameters are similar to the constructor + except for model choice, batch size and negative sampling. This method does not store the resultant model onto + disk. :param task_name: a string depicting the name of the task :param label_dictionary: dictionary of the labels you want to predict :param label_type: string to identify the label type ('ner', 'sentiment', etc.) @@ -221,15 +218,11 @@ def add_and_switch_to_new_task( self.switch_to_task(task_name) def list_existing_tasks(self) -> Set[str]: - """ - Lists existing tasks in the loaded TARS model on the console. - """ + """Lists existing tasks in the loaded TARS model on the console.""" return set(self._task_specific_attributes.keys()) def switch_to_task(self, task_name): - """ - Switches to a task which was previously added. - """ + """Switches to a task which was previously added.""" if task_name not in self._task_specific_attributes: log.error( "Provided `%s` does not exist in the model. Consider calling " "`add_and_switch_to_new_task` first.", @@ -267,13 +260,12 @@ def predict_zero_shot( candidate_label_set: Union[List[str], Set[str], str], multi_label: bool = True, ): - """ - Method to make zero shot predictions from the TARS model + """Make zero shot predictions from the TARS model. + :param sentences: input sentence objects to classify :param candidate_label_set: set of candidate labels :param multi_label: indicates whether multi-label or single class prediction. Defaults to True. 
""" - # check if candidate_label_set is empty if candidate_label_set is None or len(candidate_label_set) == 0: log.warning("Provided candidate_label_set is empty") @@ -324,11 +316,11 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "FewshotClassifie class TARSTagger(FewshotClassifier): - """ - TARS model for sequence tagging. In the backend, the model uses a BERT based 5-class - sequence labeler which given a pair predicts the probability for each word - to belong to one of the BIOES classes. The input data is a usual Sentence object which is inflated - by the model internally before pushing it through the transformer stack of BERT. + """TARS model for sequence tagging. + + In the backend, the model uses a BERT based 5-class sequence labeler which given a pair predicts the + probability for each word to belong to one of the BIOES classes. The input data is a usual Sentence object which + is inflated by the model internally before pushing it through the transformer stack of BERT. """ static_label_type = "tars_label" @@ -343,8 +335,8 @@ def __init__( prefix: bool = True, **tagger_args, ): - """ - Initializes a TextClassifier + """Initializes a TarsTagger. + :param task_name: a string depicting the name of the task :param label_dictionary: dictionary of labels you want to predict :param embeddings: name of the pre-trained transformer model e.g., @@ -476,9 +468,8 @@ def predict( embedding_storage_mode="none", most_probable_first: bool = True, ): - # return - """ - Predict sequence tags for Named Entity Recognition task + """Predict sequence tags for Named Entity Recognition task. + :param sentences: a Sentence or a List of Sentence :param mini_batch_size: size of the minibatch, usually bigger is more rapid but consume more memory, up to a point when it has no more effect. @@ -634,10 +625,10 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "TARSTagger": class TARSClassifier(FewshotClassifier): - """ - TARS model for text classification. In the backend, the model uses a BERT based binary - text classifier which given a pair predicts the probability of two classes - "True", and "False". The input data is a usual Sentence object which is inflated + """TARS model for text classification. + + In the backend, the model uses a BERT based binary text classifier which given a pair predicts the + probability of two classes "True", and "False". The input data is a usual Sentence object which is inflated by the model internally before pushing it through the transformer stack of BERT. """ @@ -655,8 +646,8 @@ def __init__( prefix: bool = True, **tagger_args, ): - """ - Initializes a TextClassifier + """Initializes a TarsClassifier. + :param task_name: a string depicting the name of the task :param label_dictionary: dictionary of labels you want to predict :param embeddings: name of the pre-trained transformer model e.g., @@ -810,8 +801,8 @@ def predict( label_threshold: float = 0.5, multi_label: Optional[bool] = None, ): - """ - Predict sequence tags for Named Entity Recognition task + """Predict sentences on the Text Classification task. + :param sentences: a Sentence or a List of Sentence :param mini_batch_size: size of the minibatch, usually bigger is more rapid but consume more memory, up to a point when it has no more effect. 
diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py
index 5757343aa5..0a6b5b4596 100644
--- a/flair/models/text_classification_model.py
+++ b/flair/models/text_classification_model.py
@@ -13,8 +13,8 @@


 class TextClassifier(flair.nn.DefaultClassifier[Sentence, Sentence]):
-    """
-    Text Classification Model
+    """Text Classification Model.
+
     The model takes word embeddings, puts them into an RNN to obtain a text
     representation, and puts the text representation in the end into a linear
     layer to get the actual class label. The model can handle single and multi
@@ -27,8 +27,8 @@ def __init__(
         label_type: str,
         **classifierargs,
     ):
-        """
-        Initializes a TextClassifier
+        """Initializes a TextClassifier.
+
        :param embeddings: embeddings used to embed each data point
        :param label_dictionary: dictionary of labels you want to predict
        :param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction
        :param multi_label_threshold: If multi-label you can set the threshold to make predictions
        :param beta: Parameter for F-beta score for evaluation and training annealing
        :param loss_weights: Dictionary of weights for labels for the loss function
            (if any label's weight is unspecified it will default to 1.0)
        """
-
         super(TextClassifier, self).__init__(
             **classifierargs,
             embeddings=embeddings,
diff --git a/flair/models/word_tagger_model.py b/flair/models/word_tagger_model.py
index b24ce0c522..250404fcf4 100644
--- a/flair/models/word_tagger_model.py
+++ b/flair/models/word_tagger_model.py
@@ -12,9 +12,7 @@


 class WordTagger(flair.nn.DefaultClassifier[Sentence, Token]):
-    """
-    This is a simple class of models that tags individual words in text.
-    """
+    """This is a simple class of models that tags individual words in text."""

     def __init__(
         self,
@@ -23,8 +21,8 @@ def __init__(
         tag_type: str,
         **classifierargs,
     ):
-        """
-        Initializes a WordTagger
+        """Initializes a WordTagger.
+
        :param embeddings: word embeddings used in tagger
        :param tag_dictionary: dictionary of tags you want to predict
        :param tag_type: string identifier for tag type
diff --git a/flair/nn/distance/cosine.py b/flair/nn/distance/cosine.py
index d7da1ccc8c..a19cc8452d 100644
--- a/flair/nn/distance/cosine.py
+++ b/flair/nn/distance/cosine.py
@@ -4,8 +4,8 @@


 def dot_product(a: torch.Tensor, b: torch.Tensor, normalize=False):
-    """
-    Computes dot product for pairs of vectors.
+    """Computes dot product for pairs of vectors.
+
    :param normalize: Vectors are normalized (leads to cosine similarity)
    :return: Matrix with res[i][j] = dot_product(a[i], b[j])
    """
diff --git a/flair/nn/distance/euclidean.py b/flair/nn/distance/euclidean.py
index 05d842d1eb..fc464adaf4 100644
--- a/flair/nn/distance/euclidean.py
+++ b/flair/nn/distance/euclidean.py
@@ -1,4 +1,5 @@
-"""
+"""Euclidean distances implemented in PyTorch.
+
 This module was copied from the following repository:
 https://github.com/asappresearch/dynamic-classification

@@ -26,8 +27,7 @@ class EuclideanDistance(nn.Module):
     """Implements an EuclideanDistance object."""

     def forward(self, mat_1: Tensor, mat_2: Tensor) -> Tensor:  # type: ignore
-        """Returns the squared euclidean distance between each
-        element in mat_1 and each element in mat_2.
+        """Returns the squared euclidean distance between each element in mat_1 and each element in mat_2.

         Parameters
         ----------
diff --git a/flair/nn/distance/hyperbolic.py b/flair/nn/distance/hyperbolic.py
index 4f6ee351b7..da00b2d532 100644
--- a/flair/nn/distance/hyperbolic.py
+++ b/flair/nn/distance/hyperbolic.py
@@ -1,4 +1,5 @@
-"""
+"""Hyperbolic distances implemented in PyTorch.
+
 This module was copied from the following repository:
 https://github.com/asappresearch/dynamic-classification

@@ -55,7 +56,7 @@ def log_map(x, y):


 def norm(x):
-    """Compute the norm"""
+    """Compute the norm."""
     n = torch.sqrt(torch.abs(mdot(x, x)))
     return n

@@ -75,8 +76,7 @@ class HyperbolicDistance(nn.Module):
     """Implements a HyperbolicDistance object."""

     def forward(self, mat_1: Tensor, mat_2: Tensor) -> Tensor:  # type: ignore
-        """Returns the squared euclidean distance between each
-        element in mat_1 and each element in mat_2.
+        """Returns the hyperbolic distance between each element in mat_1 and each element in mat_2.

         Parameters
         ----------
diff --git a/flair/nn/dropout.py b/flair/nn/dropout.py
index dfb4f7e801..43337926ee 100644
--- a/flair/nn/dropout.py
+++ b/flair/nn/dropout.py
@@ -2,8 +2,9 @@


 class LockedDropout(torch.nn.Module):
-    """
-    Implementation of locked (or variational) dropout. Randomly drops out entire parameters in embedding space.
+    """Implementation of locked (or variational) dropout.
+
+    Randomly drops out entire parameters in embedding space.
     """

     def __init__(self, dropout_rate=0.5, batch_first=True, inplace=False):
@@ -31,8 +32,9 @@ def extra_repr(self):


 class WordDropout(torch.nn.Module):
-    """
-    Implementation of word dropout. Randomly drops out entire words (or characters) in embedding space.
+    """Implementation of word dropout.
+
+    Randomly drops out entire words (or characters) in embedding space.
     """

     def __init__(self, dropout_rate=0.05, inplace=False):
diff --git a/flair/nn/model.py b/flair/nn/model.py
index fdb8652423..ed705ae5af 100644
--- a/flair/nn/model.py
+++ b/flair/nn/model.py
@@ -24,23 +24,26 @@


 class Model(torch.nn.Module, typing.Generic[DT], ABC):
-    """Abstract base class for all downstream task models in Flair,
-    such as SequenceTagger and TextClassifier.
-    Every new type of model must implement these methods."""
+    """Abstract base class for all downstream task models in Flair, such as SequenceTagger and TextClassifier.
+
+    Every new type of model must implement these methods.
+    """

     model_card: Optional[Dict[str, Any]] = None

     @property
     @abstractmethod
     def label_type(self):
-        """Each model predicts labels of a certain type.
-        TODO: can we find a better name for this?"""
+        """Each model predicts labels of a certain type."""
+        # TODO: can we find a better name for this?
         raise NotImplementedError

     @abstractmethod
     def forward_loss(self, data_points: List[DT]) -> Tuple[torch.Tensor, int]:
         """Performs a forward pass and returns a loss tensor for backpropagation.
+
-        Implement this to enable training."""
+        Implement this to enable training.
+        """
         raise NotImplementedError

     @abstractmethod
@@ -57,8 +60,9 @@ def evaluate(
         return_loss: bool = True,
         **kwargs,
     ) -> Result:
-        """Evaluates the model. Returns a Result object containing evaluation
-        results and a loss value. Implement this to enable evaluation.
+        """Evaluates the model. Returns a Result object containing evaluation results and a loss value.
+
+        Implement this to enable evaluation.
        :param data_loader: DataLoader that iterates over dataset to be evaluated
        :param out_path: Optional output path to store predictions
        :param embedding_storage_mode: One of 'none', 'cpu' or 'gpu'.
            'none' means all embeddings are deleted and freshly recomputed, 'cpu' means all embeddings are stored on CPU, or 'gpu' means all embeddings are stored on GPU  # noqa: E501
@@ -95,8 +99,8 @@ def _fetch_model(model_name) -> str:
         return model_name

     def save(self, model_file: Union[str, Path], checkpoint: bool = False):
-        """
-        Saves the current model to the provided file.
+        """Saves the current model to the provided file.
+
        :param model_file: the model file
        """
         model_state = self._get_state_dict()
@@ -110,8 +114,8 @@ def save(self, model_file: Union[str, Path], checkpoint: bool = False):

     @classmethod
     def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "Model":
-        """
-        Loads the model from the given file.
+        """Loads the model from the given file.
+
        :param model_path: the model file or the already loaded state dict
        :return: the loaded text classifier model
        """
@@ -210,10 +214,11 @@ def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]:


 class Classifier(Model[DT], typing.Generic[DT], ReduceTransformerVocabMixin, ABC):
-    """Abstract base class for all Flair models that do classification,
-    both single- and multi-label. It inherits from flair.nn.Model and adds an
-    unified evaluate() function so that all classification models use the same
-    evaluation routines and compute the same numbers."""
+    """Abstract base class for all Flair models that do classification.
+
+    The classifier inherits from flair.nn.Model and adds unified functionality for both single- and multi-label
+    classification and evaluation. This ensures a fair comparison between multiple classifiers.
+    """

     def evaluate(
         self,
@@ -491,8 +496,9 @@ def predict(
         return_loss=False,
         embedding_storage_mode="none",
     ):
-        """
-        Predicts the class labels for the given sentences. The labels are directly added to the sentences. # noqa: E501
+        """Predicts the class labels for the given sentences.
+
+        The labels are directly added to the sentences.
        :param sentences: list of sentences
        :param mini_batch_size: mini batch size to use
        :param return_probabilities_for_all_classes: return probabilities for all classes instead of only best predicted  # noqa: E501
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted  # noqa: E501
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.  # noqa: E501
            'gpu' to store embeddings in GPU memory.
        """
         raise NotImplementedError
@@ -533,12 +539,12 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "Classifier":


 class DefaultClassifier(Classifier[DT], typing.Generic[DT, DT2], ABC):
-    """Default base class for all Flair models that do classification, both
-    single- and multi-label. It inherits from flair.nn.Classifier and thus from
-    flair.nn.Model. All features shared by all classifiers are implemented here,
-    including the loss calculation and the predict() method. Currently, the
-    TextClassifier, RelationExtractor, TextPairClassifier and
-    SimpleSequenceTagger implement this class.
+    """Default base class for all Flair models that do classification.
+
+    It inherits from flair.nn.Classifier and thus from flair.nn.Model. All features shared by all classifiers are
+    implemented here, including the loss calculation, prediction heads for both single- and multi-label classification
+    and the `predict()` method. Example implementations of this class are the TextClassifier, RelationExtractor,
+    TextPairClassifier and SimpleSequenceTagger.
     """

     def __init__(
         self,
@@ -612,7 +618,10 @@ def __init__(
         self.train_on_gold_pairs_only = train_on_gold_pairs_only

     def _filter_data_point(self, data_point: DT) -> bool:
-        """Specify if a data point should be kept. That way you can remove for example empty texts.
+        """Specify if a data point should be kept.
+
+        That way you can remove for example empty texts. By default, all data points that have length zero
+        will be removed.
         Return true if the data point should be kept and false if it should be removed.
         """
         return True if len(data_point) > 0 else False
@@ -623,15 +632,22 @@ def _get_embedding_for_data_point(self, prediction_data_point: DT2) -> torch.Ten

     @abstractmethod
     def _get_data_points_from_sentence(self, sentence: DT) -> List[DT2]:
-        """Returns the data_points to which labels are added (Sentence, Span, Token, ... objects)"""
+        """Returns the data_points to which labels are added.
+
+        The results should be of any type that inherits from DataPoint (Sentence, Span, Token, ... objects).
+        """
         raise NotImplementedError

     def _get_data_points_for_batch(self, sentences: List[DT]) -> List[DT2]:
-        """Returns the data_points to which labels are added (Sentence, Span, Token, ... objects)"""
+        """Returns the data_points to which labels are added.
+
+        The results should be of any type that inherits from DataPoint (Sentence, Span, Token, ... objects).
+        """
         return [data_point for sentence in sentences for data_point in self._get_data_points_from_sentence(sentence)]

     def _get_label_of_datapoint(self, data_point: DT2) -> List[str]:
         """Extracts the labels from the data points.
+
         Each data point might return a list of strings, representing multiple labels.
         """
         if self.multi_label:
@@ -750,14 +766,14 @@ def predict(
         return_loss=False,
         embedding_storage_mode="none",
     ):
-        """
-        Predicts the class labels for the given sentences. The labels are directly added to the sentences. # noqa: E501
+        """Predicts the class labels for the given sentences. The labels are directly added to the sentences.
+
        :param sentences: list of sentences
        :param mini_batch_size: mini batch size to use
        :param return_probabilities_for_all_classes: return probabilities for all classes instead of only best predicted  # noqa: E501
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
-        :param label_name: set this to change the name of the label type that is predicted  # noqa: E501
+        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.  # noqa: E501
            'gpu' to store embeddings in GPU memory.
        """
diff --git a/flair/optim.py b/flair/optim.py
index e116bc9159..cde885e6cf 100644
--- a/flair/optim.py
+++ b/flair/optim.py
@@ -9,9 +9,9 @@


 class SGDW(Optimizer):
-    r"""Implements stochastic gradient descent (optionally with momentum) with
-    weight decay from the paper `Fixing Weight Decay Regularization in Adam`_.
+    r"""Implements stochastic gradient descent (optionally with momentum) with weight decay.

+    Implementation from the paper `Fixing Weight Decay Regularization in Adam`_.
     Nesterov momentum is based on the formula from
     `On the importance of initialization and momentum in deep learning`__.
@@ -135,8 +135,7 @@ def step(self, closure=None):


 class ExpAnnealLR(_LRScheduler):
-    """Exponentially anneal the learning rate of each parameter group
-    from the initial lr to end_lr over a number of iterations.
+    """Exponentially anneal the lr of each parameter group from the initial lr to end_lr over a number of iterations.

     Args:
         optimizer (Optimizer): Wrapped optimizer.
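As a side note to the `ExpAnnealLR` docstring above: the scheduler interpolates geometrically between the initial learning rate and `end_lr`. The following standalone sketch (illustrative only, not part of the patch, and with made-up variable names) writes out the annealing curve in plain Python.

```python
# Illustrative sketch, not part of the patch: the exponential annealing curve
# that ExpAnnealLR's docstring describes.
initial_lr, end_lr, iterations = 0.1, 0.001, 100

for step in range(1, iterations + 1):
    # geometric interpolation: equals initial_lr at step 0 and end_lr at the last step
    lr = initial_lr * (end_lr / initial_lr) ** (step / iterations)

print(f"{lr:.6f}")  # 0.001000 after the final step
```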
@@ -158,9 +157,9 @@ def get_lr(self):


 class LinearSchedulerWithWarmup(LambdaLR):
-    """Linearly increase the learning from 0 to initial learning rate during warmup
-    and decrease the learning rate to 0 after the warmup. Uses LambaLR scheduler
-    where the learning rate is multiplied by a lambda factor after calling scheduler.step().
+    """Linearly increase the lr from 0 to initial lr during warmup and decrease the lr to 0 after the warmup.
+
+    Uses the LambdaLR scheduler, where the learning rate is multiplied by a lambda factor after calling scheduler.step().

     Args:
         optimizer (Optimizer): Wrapped optimizer.
@@ -185,8 +184,9 @@ def linear_lr_lambda(current_step: int):


 class ReduceLRWDOnPlateau(ReduceLROnPlateau):
-    """Reduce learning rate and weight decay when a metric has stopped
-    improving. Models often benefit from reducing the learning rate by
+    """Reduce learning rate and weight decay when a metric has stopped improving.
+
+    Models often benefit from reducing the learning rate by
     a factor of 2-10 once learning stagnates. This scheduler reads a metric
     quantity and if no improvement is seen for a 'patience' number
     of epochs, the learning rate and weight decay factor are reduced for
diff --git a/flair/samplers.py b/flair/samplers.py
index 2a08d17c0b..8ee6958d97 100644
--- a/flair/samplers.py
+++ b/flair/samplers.py
@@ -11,8 +11,9 @@

 class FlairSampler(Sampler):
     def set_dataset(self, data_source):
-        """Initialize by passing a block_size and a plus_window parameter.
-        :param data_source: dataset to sample from
+        """Initialize the data source for the FlairSampler.
+
+        :param data_source: dataset to sample from.
         """
         self.data_source = data_source
         self.num_samples = len(self.data_source)
@@ -28,8 +29,8 @@ def __init__(self):
         super(ImbalancedClassificationDatasetSampler, self).__init__(None)

     def set_dataset(self, data_source):
-        """
-        Initialize by passing a classification dataset with labels, i.e. either TextClassificationDataSet or
+        """Initialize the dataset used for sampling.
+
        :param data_source:
        """
         self.data_source = data_source
@@ -53,8 +54,8 @@ def __iter__(self):


 class ChunkSampler(FlairSampler):
-    """Splits data into blocks and randomizes them before sampling. This causes some order of the data to be preserved,
-    while still shuffling the data.
+    """Splits data into blocks and randomizes them before sampling.
+
+    This causes some order of the data to be preserved, while still shuffling the data.
     """

     def __init__(self, block_size=5, plus_window=5):
@@ -80,13 +82,16 @@ def __iter__(self):


 class ExpandingChunkSampler(FlairSampler):
-    """Splits data into blocks and randomizes them before sampling. Block size grows with each epoch.
+    """Splits data into blocks and randomizes them before sampling.
+
+    Block size grows with each epoch.
     This causes some order of the data to be preserved, while still shuffling the data.
     """

     def __init__(self, step=3):
-        """Initialize by passing a block_size and a plus_window parameter.
-        :param data_source: dataset to sample from
+        """Initialize the ExpandingChunkSampler.
+
+        :param step: every *step* epochs, the block size increases by one.
         """
         super(ExpandingChunkSampler, self).__init__(None)
         self.block_size = 1
diff --git a/flair/splitter.py b/flair/splitter.py
index f75ece3112..1590197947 100644
--- a/flair/splitter.py
+++ b/flair/splitter.py
@@ -43,7 +43,8 @@ def tokenizer(self, value: Tokenizer):


 class SegtokSentenceSplitter(SentenceSplitter):
-    """
+    """Sentence Splitter using SegTok.
+
     Implementation of :class:`SentenceSplitter` using the SegTok library.
     For further details see: https://github.com/fnl/segtok
@@ -92,7 +93,8 @@ def tokenizer(self, value: Tokenizer):


 class SpacySentenceSplitter(SentenceSplitter):
-    """
+    """Sentence Splitter using Spacy.
+
     Implementation of :class:`SentenceSplitter`, using models from Spacy.

     :param model: Spacy V2 model or the name of the model to load.
@@ -159,7 +161,8 @@ def name(self) -> str:


 class SciSpacySentenceSplitter(SpacySentenceSplitter):
-    """
+    """Sentence splitter using the spacy model `en_core_sci_sm`.
+
     Convenience class to instantiate :class:`SpacySentenceSplitter` with Spacy model `en_core_sci_sm`
     for sentence splitting and :class:`SciSpacyTokenizer` as tokenizer.
     """
@@ -169,7 +172,8 @@ def __init__(self):


 class TagSentenceSplitter(SentenceSplitter):
-    """
+    """SentenceSplitter which assumes that there is a tag within the text that is used to mark sentence boundaries.
+
     Implementation of :class:`SentenceSplitter` which assumes that there is a special tag within
     the text that is used to mark sentence boundaries.
     """
@@ -215,7 +219,8 @@ def name(self) -> str:


 class NewlineSentenceSplitter(TagSentenceSplitter):
-    """
+    r"""Sentence Splitter using newline as boundary marker.
+
     Convenience class to instantiate :class:`TagSentenceSplitter` with newline ("\n") as
     sentence boundary marker.
     """
@@ -229,7 +234,8 @@ def name(self) -> str:


 class NoSentenceSplitter(SentenceSplitter):
-    """
+    """Sentence Splitter which treats the full text as a single Sentence.
+
     Implementation of :class:`SentenceSplitter` which treats the complete text as one sentence.
     """
diff --git a/flair/tokenization.py b/flair/tokenization.py
index df0c3b3dd6..8efbf86e45 100644
--- a/flair/tokenization.py
+++ b/flair/tokenization.py
@@ -28,7 +28,8 @@ def name(self) -> str:


 class SpacyTokenizer(Tokenizer):
-    """
+    """Tokenizer using Spacy under the hood.
+
     Implementation of :class:`Tokenizer`, using models from Spacy.

     :param model: a Spacy V2 model or the name of the model to load.
@@ -73,8 +74,8 @@ def name(self) -> str:


 class SegtokTokenizer(Tokenizer):
-    """
-    Tokenizer using segtok, a third party library dedicated to rules-based Indo-European languages.
+    """Tokenizer using segtok, a third party library dedicated to rules-based Indo-European languages.

     For further details see: https://github.com/fnl/segtok
     """
@@ -100,9 +100,7 @@ def run_tokenize(text: str) -> List[str]:


 class SpaceTokenizer(Tokenizer):
-    """
-    Tokenizer based on space character only.
-    """
+    """Tokenizer based on space character only."""

     def __init__(self):
         super(SpaceTokenizer, self).__init__()
@@ -132,7 +130,8 @@ def run_tokenize(text: str) -> List[str]:


 class JapaneseTokenizer(Tokenizer):
-    """
+    """Tokenizer using konoha to support popular Japanese tokenizers.
+
     Tokenizer using konoha, a third party library which supports
     multiple Japanese tokenizers such as MeCab, Janome and SudachiPy.

     For further details see:
         https://github.com/himkt/konoha
@@ -184,9 +183,7 @@ def name(self) -> str:


 class TokenizerWrapper(Tokenizer):
-    """
-    Helper class to wrap tokenizer functions to the class-based tokenizer interface.
-    """
+    """Helper class to wrap tokenizer functions to the class-based tokenizer interface."""

     def __init__(self, tokenizer_func: Callable[[str], List[str]]):
         super(TokenizerWrapper, self).__init__()
@@ -201,10 +198,11 @@ def name(self) -> str:


 class SciSpacyTokenizer(Tokenizer):
-    """
+    """Tokenizer that uses the en_core_sci_sm Spacy model and some special heuristics.
+
     Implementation of :class:`Tokenizer` which uses the en_core_sci_sm Spacy model
     extended by special heuristics to consider characters such as "(", ")" "-" as
     additional token separators. The latter distinguishes this implementation from
     :class:`SpacyTokenizer`.

     Note: if you want to use the "normal" SciSpacy tokenization just use
     :class:`SpacyTokenizer`.
     """

     def __init__(self):
@@ -230,8 +228,8 @@ def __init__(self):

     def combined_rule_prefixes() -> List[str]:
         """Helper function that returns the prefix pattern for the tokenizer.
-
-        It is a helper function to accommodate spacy tests that only test
-        prefixes.
+
+        It is a helper function to accommodate spacy tests that only test prefixes.
         """
         prefix_punct = char_classes.PUNCT.replace("|", " ")
diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py
index 71658b4a47..9a0994cce9 100644
--- a/flair/trainers/trainer.py
+++ b/flair/trainers/trainer.py
@@ -52,8 +52,8 @@ class ModelTrainer(Pluggable):
     }

     def __init__(self, model: flair.nn.Model, corpus: Corpus):
-        """
-        Initialize a model trainer
+        """Initialize a model trainer.
+
        :param model: The model that you want to train. The model should inherit from flair.nn.Model  # noqa: E501
        :param corpus: The dataset used to train the model, should be of type Corpus
        """
diff --git a/flair/training_utils.py b/flair/training_utils.py
index 1efc291dfe..09adf8d3ea 100644
--- a/flair/training_utils.py
+++ b/flair/training_utils.py
@@ -155,10 +155,11 @@ def _init_weights_index(self, key, state_dict, weights_to_watch):


 class AnnealOnPlateau(object):
-    """This class is a modification of
+    """A learning rate scheduler for annealing on plateau.
+
+    This class is a modification of
     torch.optim.lr_scheduler.ReduceLROnPlateau that enables
     setting an "auxiliary metric" to break ties.
-
     Reduce learning rate when a metric has stopped
     improving. Models often benefit from reducing the learning rate by
     a factor of 2-10 once learning stagnates. This scheduler reads a metric
     quantity and if no improvement is seen for a 'patience' number
     of epochs, the learning rate is reduced.
@@ -333,8 +334,8 @@ def load_state_dict(self, state_dict):


 def init_output_file(base_path: Union[str, Path], file_name: str) -> Path:
-    """
-    Creates a local file.
+    """Creates a local file which can be appended to.
+
    :param base_path: the path to the directory
    :param file_name: the file name
    :return: the created file
    """
@@ -348,8 +349,8 @@ def init_output_file(base_path: Union[str, Path], file_name: str) -> Path:


 def convert_labels_to_one_hot(label_list: List[List[str]], label_dict: Dictionary) -> List[List[int]]:
-    """
-    Convert list of labels (strings) to a one hot list.
+    """Convert list of labels to a one-hot list.
+
    :param label_list: list of labels
    :param label_dict: label dictionary
    :return: converted label list
    """
diff --git a/flair/visual/ner_html.py b/flair/visual/ner_html.py
index a254623601..01715119f9 100644
--- a/flair/visual/ner_html.py
+++ b/flair/visual/ner_html.py
@@ -54,7 +54,8 @@ def render_ner_html(
     wrap_page=True,
     label_name="ner",
 ) -> str:
-    """
+    """Create the HTML code to visualize some sentences.
+ :param sentences: single sentence or list of sentences to convert to HTML :param title: title of the HTML page :param colors: dict where keys are tags and values are color HTML codes diff --git a/flair/visual/training_curves.py b/flair/visual/training_curves.py index 443f660d3c..b6b26b7014 100644 --- a/flair/visual/training_curves.py +++ b/flair/visual/training_curves.py @@ -17,9 +17,8 @@ class Plotter(object): - """ - Plots training parameters (loss, f-score, and accuracy) and training - weights over time. + """Plots training parameters (loss, f-score, and accuracy) and training weights over time. + Input files are the output files 'loss.tsv' and 'weights.txt' from training either a sequence tagger or text classification model. """ From fd710545e82cd693be1e6bf0a8c42c315fd93b0f Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 27 Mar 2023 17:36:50 +0200 Subject: [PATCH 03/12] fix documentation for models and embeddings --- flair/embeddings/base.py | 22 +++-- flair/embeddings/document.py | 33 ++++--- flair/embeddings/token.py | 51 ++++++----- flair/embeddings/transformer.py | 9 +- flair/models/clustering.py | 31 +++---- flair/models/entity_linker_model.py | 15 ++-- flair/models/lemmatizer_model.py | 22 ++--- flair/models/multitask_model.py | 31 ++++--- flair/models/pairwise_classification_model.py | 8 +- flair/models/regexp_tagger.py | 26 +++--- flair/models/relation_classifier_model.py | 90 +++++++++---------- flair/models/relation_extractor_model.py | 9 +- flair/models/sequence_tagger_model.py | 47 +++++----- pyproject.toml | 2 +- 14 files changed, 195 insertions(+), 201 deletions(-) diff --git a/flair/embeddings/base.py b/flair/embeddings/base.py index 2a15b09a44..2400148c28 100644 --- a/flair/embeddings/base.py +++ b/flair/embeddings/base.py @@ -38,9 +38,10 @@ def embedding_type(self) -> str: raise NotImplementedError def embed(self, data_points: Union[DT, List[DT]]) -> List[DT]: - """Add embeddings to all words in a list of sentences. If embeddings are already added, updates only if embeddings - are non-static.""" + """Add embeddings to all words in a list of sentences. + If embeddings are already added, updates only if embeddings are non-static. + """ # if only one sentence is passed, convert to list of sentence if not isinstance(data_points, list): data_points = [data_points] @@ -62,9 +63,12 @@ def _add_embeddings_internal(self, sentences: List[DT]): pass def get_names(self) -> List[str]: - """Returns a list of embedding names. In most cases, it is just a list with one item, namely the name of + """Returns a list of embedding names. + + In most cases, it is just a list with one item, namely the name of this embedding. But in some cases, the embedding is made up by different embeddings (StackedEmbedding). - Then, the list contains the names of all embeddings in the stack.""" + Then, the list contains the names of all embeddings in the stack. + """ return [self.name] def get_named_embeddings_dict(self) -> Dict: @@ -108,7 +112,8 @@ def save_embeddings(self, use_state_dict: bool = True): class ScalarMix(torch.nn.Module): - """ + """Mixes several tensors by a learned weighting. + Computes a parameterised scalar mixture of N tensors. This method was proposed by Liu et al. (2019) in the paper: "Linguistic Knowledge and Transferability of Contextual Representations" (https://arxiv.org/abs/1903.08855) @@ -119,8 +124,8 @@ class ScalarMix(torch.nn.Module): """ def __init__(self, mixture_size: int, trainable: bool = False) -> None: - """ - Inits scalar mix implementation. 
+ """Inits scalar mix implementation. + ``mixture = gamma * sum(s_k * tensor_k)`` where ``s = softmax(w)``, with ``w`` and ``gamma`` scalar parameters. :param mixture_size: size of mixtures (usually the number of layers) """ @@ -152,7 +157,8 @@ def __init__(self, mixture_size: int, trainable: bool = False) -> None: ) def forward(self, tensors: List[torch.Tensor]) -> torch.Tensor: - """ + """Forward pass of scalar mix. + Computes a weighted average of the ``tensors``. The input tensors an be any shape with at least two dimensions, but must all be the same shape. :param tensors: list of input tensors diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index 31711ebc80..74e5d2e5d2 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -35,8 +35,8 @@ def __init__( is_token_embedding: bool = False, **kwargs, ): - """ - Bidirectional transformer embeddings of words from various transformer architectures. + """Bidirectional transformer embeddings of words from various transformer architectures. + :param model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for options) :param layers: string indicating which layers to take for embedding (-1 is topmost layer) @@ -70,6 +70,7 @@ def __init__( pooling: str = "mean", ): """The constructor takes a list of embeddings to be combined. + :param embeddings: a list of token embeddings :param fine_tune_mode: if set to "linear" a trainable layer is added, if set to "nonlinear", a nonlinearity is added as well. Set this to make the pooling trainable. @@ -112,9 +113,10 @@ def embedding_length(self) -> int: return self.__embedding_length def embed(self, sentences: Union[List[Sentence], Sentence]): - """Add embeddings to every sentence in the given list of sentences. If embeddings are already added, updates - only if embeddings are non-static.""" + """Add embeddings to every sentence in the given list of sentences. + If embeddings are already added, updates only if embeddings are non-static. + """ # if only one sentence is passed, convert to list of sentence if isinstance(sentences, Sentence): sentences = [sentences] @@ -170,7 +172,8 @@ def __init__( **vectorizer_params, ): """The constructor for DocumentTFIDFEmbeddings. - :param train_dataset: the train dataset which will be used to construct vectorizer + + :param train_dataset: the train dataset which will be used to construct a vectorizer :param vectorizer_params: parameters given to Scikit-learn's TfidfVectorizer constructor """ super().__init__() @@ -198,7 +201,6 @@ def embedding_length(self) -> int: def embed(self, sentences: Union[List[Sentence], Sentence]): """Add embeddings to every sentence in the given list of sentences.""" - # if only one sentence is passed, convert to list of sentence if isinstance(sentences, Sentence): sentences = [sentences] @@ -238,7 +240,8 @@ def __init__( rnn_type="GRU", fine_tune: bool = True, ): - """The constructor takes a list of embeddings to be combined. + """Instantiates an RNN that works upon some token embeddings. + :param embeddings: a list of token embeddings :param hidden_size: the number of hidden states in the rnn :param rnn_layers: the number of layers for the rnn @@ -311,9 +314,10 @@ def embedding_length(self) -> int: return self.__embedding_length def _add_embeddings_internal(self, sentences: List[Sentence]): - """Add embeddings to all sentences in the given list of sentences. 
If embeddings are already added, update
-        only if embeddings are non-static."""
+        """Add embeddings to all sentences in the given list of sentences.
+
+        If embeddings are already added, update only if embeddings are non-static.
+        """
         # TODO: remove in future versions
         if not hasattr(self, "locked_dropout"):
             self.locked_dropout = None
@@ -536,7 +540,8 @@ def __init__(
         model: str = "bert-base-nli-mean-tokens",
         batch_size: int = 1,
     ):
-        """
+        """Instantiates a document embedding using the SentenceTransformer Embeddings.
+
        :param model: string name of models from SentencesTransformer Class
        :param name: string name of embedding type which will be set to Sentence object
        :param batch_size: int number of sentences to processed in one batch
@@ -609,7 +614,8 @@ def __init__(
         locked_dropout: float = 0.0,
         fine_tune: bool = True,
     ):
-        """The constructor takes a list of embeddings to be combined.
+        """Instantiates a CNN that works upon some token embeddings.
+
        :param embeddings: a list of token embeddings
        :param kernels: list of (number of kernels, kernel size)
        :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear
@@ -668,9 +674,10 @@ def embedding_length(self) -> int:
         return self.__embedding_length

     def _add_embeddings_internal(self, sentences: List[Sentence]):
-        """Add embeddings to all sentences in the given list of sentences. If embeddings are already added, update
-        only if embeddings are non-static."""
+        """Add embeddings to all sentences in the given list of sentences.
+
+        If embeddings are already added, update only if embeddings are non-static.
+        """
         # TODO: remove in future versions
         if not hasattr(self, "locked_dropout"):
             self.locked_dropout = None
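For orientation, the document-level embeddings reworked above all share the same embed() contract; a minimal usage sketch follows (the GloVe pooling setup is an illustrative choice, not part of this diff):

    from flair.data import Sentence
    from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings

    # Pool token embeddings into one fixed-size vector for the whole sentence.
    embeddings = DocumentPoolEmbeddings([WordEmbeddings("glove")], pooling="mean")
    sentence = Sentence("Berlin is a city in Germany .")
    embeddings.embed(sentence)
    print(sentence.embedding.shape)  # one vector per sentence, e.g. torch.Size([100])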
diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py
index fb07b53662..974b82b50c 100644
--- a/flair/embeddings/token.py
+++ b/flair/embeddings/token.py
@@ -38,8 +38,8 @@ def __init__(
         allow_long_sentences: bool = True,
         **kwargs,
     ):
-        """
-        Bidirectional transformer embeddings of words from various transformer architectures.
+        """Bidirectional transformer embeddings of words from various transformer architectures.
+
        :param model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for
        options)
        :param layers: string indicating which layers to take for embedding (-1 is topmost layer)
@@ -118,9 +118,12 @@ def __str__(self):
         return f'StackedEmbeddings [{",".join([str(e) for e in self.embeddings])}]'

     def get_names(self) -> List[str]:
-        """Returns a list of embedding names. In most cases, it is just a list with one item, namely the name of
-        this embedding. But in some cases, the embedding is made up by different embeddings (StackedEmbedding).
-        Then, the list contains the names of all embeddings in the stack."""
+        """Returns a list of embedding names.
+
+        In most cases, it is just a list with one item, namely the name of this embedding. But in some cases, the
+        embedding is made up by different embeddings (StackedEmbedding).
+        Then, the list contains the names of all embeddings in the stack.
+        """
         # make compatible with serialized models
         if "__names" not in self.__dict__:
             self.__names = [name for embedding in self.embeddings for name in embedding.get_names()]
@@ -158,13 +161,13 @@ def __init__(
         embedding_length: Optional[int] = None,
         name: Optional[str] = None,
     ):
-        """
-        Initializes classic word embeddings. Constructor downloads required files if not there.
+        """Initializes classic word embeddings.
+
+        Constructor downloads required files if not there.
        :param embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or custom
        If you want to use a custom embedding file, just pass the path to the embeddings as embeddings variable.
        set stable=True to use the stable embeddings as described in https://arxiv.org/abs/2110.02861
        """
-
        self.instance_parameters = self.get_instance_parameters(locals=locals())

        if fine_tune and force_cpu and flair.device.type != "cpu":
@@ -444,8 +447,10 @@ def __init__(
         char_embedding_dim: int = 25,
         hidden_size_char: int = 25,
     ):
-        """Uses the default character dictionary if none provided."""
+        """Instantiates a bidirectional LSTM layer to encode words by their character representation.

+        Uses the default character dictionary if none provided.
+        """
         super().__init__()
         self.name = "Char"
         self.static_embeddings = False
@@ -561,8 +566,8 @@ def __init__(
         name: Optional[str] = None,
         has_decoder: bool = False,
     ):
-        """
-        initializes contextual string embeddings using a character-level language model.
+        """Initializes contextual string embeddings using a character-level language model.
+
        :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast',
                'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward',
                etc (see https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md)
@@ -1007,12 +1012,12 @@ def to_params(self):

 @register_embeddings
 class FastTextEmbeddings(TokenEmbeddings):
-    """FastText Embeddings with oov functionality"""
+    """FastText Embeddings with oov functionality."""

     def __init__(self, embeddings: str, use_local: bool = True, field: str = None, name: Optional[str] = None):
-        """
-        Initializes fasttext word embeddings. Constructor downloads required embedding file and stores in cache
-        if use_local is False.
+        """Initializes fasttext word embeddings.
+
+        Constructor downloads required embedding file and stores in cache if use_local is False.

        :param embeddings: path to your embeddings '.bin' file
        :param use_local: set this to False if you are using embeddings from a remote source
@@ -1103,8 +1108,8 @@ def __init__(
         embedding_length: int = 300,
         stable: bool = False,
     ):
-        """
-        Initializes one-hot encoded word embeddings and a trainable embedding layer
+        """Initializes one-hot encoded word embeddings and a trainable embedding layer.
+
        :param vocab_dictionary: the vocabulary that will be encoded
        :param field: by default, the 'text' of tokens is embedded, but you can also embed tags such as 'pos'
        :param embedding_length: dimensionality of the trainable embedding layer
@@ -1398,8 +1403,9 @@ def __init__(
         name: Optional[str] = None,
         **kwargs,
     ):
-        """
-        Initializes BP embeddings. Constructor downloads required files if not there.
+        """Initializes BP embeddings.
+
+        Constructor downloads required files if not there.
         """
         self.instance_parameters = self.get_instance_parameters(locals=locals())

@@ -1484,14 +1490,15 @@ def to_params(self):

 @register_embeddings
 class NILCEmbeddings(WordEmbeddings):
     def __init__(self, embeddings: str, model: str = "skip", size: int = 100):
-        """
-        Initializes portuguese classic word embeddings trained by NILC Lab (http://www.nilc.icmc.usp.br/embeddings).
+        """Initializes Portuguese classic word embeddings trained by NILC Lab.
+
+        See: http://www.nilc.icmc.usp.br/embeddings
         Constructor downloads required files if not there. 
+ :param embeddings: one of: 'fasttext', 'glove', 'wang2vec' or 'word2vec' :param model: one of: 'skip' or 'cbow'. This is not applicable to glove. :param size: one of: 50, 100, 300, 600 or 1000. """ - self.instance_parameters = self.get_instance_parameters(locals=locals()) base_path = "http://143.107.183.175:22980/download.php?file=embeddings/" diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index 91909acf83..23c37b7fc9 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -275,7 +275,8 @@ def _reconstruct_word_ids_from_subtokens(embedding, tokens: List[str], subtokens class TransformerBaseEmbeddings(Embeddings[Sentence]): - """Base class for all TransformerEmbeddings + """Base class for all TransformerEmbeddings. + This base class handles the tokenizer and the input preparation, however it won't implement the actual model. This can be further extended to implement the model in either a pytorch, jit or onnx way of working. """ @@ -717,7 +718,7 @@ def remove_session(self): self.session = None def optimize_model(self, optimize_model_path, use_external_data_format: bool = False, **kwargs): - """Wrapper for onnxruntime.transformers.optimizer.optimize_model""" + """Wrapper for `onnxruntime.transformers.optimizer.optimize_model`.""" from onnxruntime.transformers.optimizer import optimize_model self.remove_session() @@ -1334,8 +1335,8 @@ def _forward_tensors(self, tensors) -> Dict[str, torch.Tensor]: def export_onnx( self, path: Union[str, Path], example_sentences: List[Sentence], **kwargs ) -> TransformerOnnxEmbeddings: - """ - Export TransformerEmbeddings to OnnxFormat. + """Export TransformerEmbeddings to OnnxFormat. + :param example_sentences: a list of sentences that will be used for tracing. It is recommended to take 2-4 sentences with some variation. """ diff --git a/flair/models/clustering.py b/flair/models/clustering.py index d310408771..39739be9a3 100644 --- a/flair/models/clustering.py +++ b/flair/models/clustering.py @@ -17,21 +17,20 @@ class ClusteringModel: - """ - A wrapper class for the sklearn clustering models. With this class clustering with the library 'flair' can be done. - """ + """A wrapper class to apply sklearn clustering models on DocumentEmbeddings.""" def __init__(self, model: Union[ClusterMixin, BaseEstimator], embeddings: DocumentEmbeddings): - """ - :param model: the clustering algortihm from sklearn this wrapper will use. + """Instantiate the ClusteringModel. + + :param model: the clustering algorithm from sklearn this wrapper will use. :param embeddings: the flair DocumentEmbedding this wrapper uses to calculate a vector for each sentence. """ self.model = model self.embeddings = embeddings def fit(self, corpus: Corpus, **kwargs): - """ - Trains the model. + """Trains the model. + :param corpus: the flair corpus this wrapper will use for fitting the model. """ X = self._convert_dataset(corpus) @@ -41,12 +40,10 @@ def fit(self, corpus: Corpus, **kwargs): log.info("Finished clustering.") def predict(self, corpus: Corpus): - """ - Predict labels given a list of sentences and returns the respective class indices. + """Predict labels given a list of sentences and returns the respective class indices. :param corpus: the flair corpus this wrapper will use for predicting the labels. 
""" - X = self._convert_dataset(corpus) log.info("Start the prediction " + str(self.model) + " with " + str(len(X)) + " Datapoints.") predict = self.model.predict(X) @@ -58,8 +55,7 @@ def predict(self, corpus: Corpus): return predict def save(self, model_file: Union[str, Path]): - """ - Saves current model. + """Saves current model. :param model_file: path where to save the model. """ @@ -69,8 +65,7 @@ def save(self, model_file: Union[str, Path]): @staticmethod def load(model_file: Union[str, Path]): - """ - Loads a model from a given path. + """Loads a model from a given path. :param model_file: path to the file where the model is saved. """ @@ -78,13 +73,13 @@ def load(model_file: Union[str, Path]): return pickle.loads(joblib.load(str(model_file))) def _convert_dataset(self, corpus, label_type: str = None, batch_size: int = 32, return_label_dict: bool = False): - """ + """Makes a flair-corpus sklearn compatible. + Turns the corpora into X, y datasets as required for most sklearn clustering models. Ref.: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster :param label_type: the label from sentences will be extracted. If the value is none this will be skipped. """ - log.info("Embed sentences...") sentences = [] for batch in tqdm(DataLoader(corpus.get_all_sentences(), batch_size=batch_size)): @@ -106,8 +101,8 @@ def _convert_dataset(self, corpus, label_type: str = None, batch_size: int = 32, return X, y def evaluate(self, corpus: Corpus, label_type: str): - """ - This method calculates some evaluation metrics for the clustering. + """This method calculates some evaluation metrics for the clustering. + Also, the result of the evaluation is logged. :param corpus: the flair corpus this wrapper will use for evaluation. diff --git a/flair/models/entity_linker_model.py b/flair/models/entity_linker_model.py index c317e5b56a..cda7d00cc5 100644 --- a/flair/models/entity_linker_model.py +++ b/flair/models/entity_linker_model.py @@ -16,9 +16,7 @@ class CandidateGenerator: - """ - Given a string, the CandidateGenerator returns possible target classes as candidates. - """ + """Given a string, the CandidateGenerator returns possible target classes as candidates.""" def __init__(self, candidates: Union[str, Dict], backoff: bool = True): # internal candidate lists of generator @@ -70,7 +68,7 @@ def _make_backoff_string(self, mention: str) -> str: return backoff_mention def get_candidates(self, mention: str) -> Set[str]: - """Given a mention, this method returns a set of candidate classes""" + """Given a mention, this method returns a set of candidate classes.""" if self.backoff: mention = self._make_backoff_string(mention) @@ -78,8 +76,8 @@ def get_candidates(self, mention: str) -> Set[str]: class EntityLinker(flair.nn.DefaultClassifier[Sentence, Span]): - """ - Entity Linking Model + """Entity Linking Model. + The model expects text/sentences with annotated entity mentions and predicts entities to these mentions. To this end a word embedding is used to embed the sentences and the embedding of the entity mention goes through a linear layer to get the actual class label. The model is able to predict '' for entity mentions that the model can not confidently match to any of the known labels. @@ -94,8 +92,8 @@ def __init__( candidates: Optional[CandidateGenerator] = None, **classifierargs, ): - """ - Initializes an EntityLinker + """Initializes an EntityLinker. 
+ :param embeddings: embeddings used to embed the words/sentences :param label_dictionary: dictionary that gives ids to all classes. Should contain :param pooling_operation: either 'average', 'first', 'last' or 'first&last'. Specifies the way of how text representations of entity mentions (with more than one word) are handled. @@ -103,7 +101,6 @@ def __init__( the embedding of the first and the embedding of the last word. :param label_type: name of the label you use. """ - super(EntityLinker, self).__init__( embeddings=embeddings, label_dictionary=label_dictionary, diff --git a/flair/models/lemmatizer_model.py b/flair/models/lemmatizer_model.py index 42d0adbdef..bab4018b48 100644 --- a/flair/models/lemmatizer_model.py +++ b/flair/models/lemmatizer_model.py @@ -32,8 +32,8 @@ def __init__( end_symbol_for_encoding: bool = True, bidirectional_encoding: bool = True, ): - """ - Initializes a Lemmatizer model + """Initializes a Lemmatizer model. + The model consists of a decoder and an encoder. The encoder is either a RNN-cell (torch.nn.GRU) or a Token-Embedding from flair if an embedding is handed to the constructor (token_embedding). The output of the encoder is used as the initial hidden state to the decoder, which is an RNN-cell (GRU) @@ -55,9 +55,8 @@ def __init__( computed as the length of the longest token in the sentences plus one. :param max_sequence_length: If set to True and max_sequence_length_dependend_on_input is False a fixed maximum length for the decoding will be used for all sentences. - :param use_attention: whether or not to use attention. Only sensible if encoding via RNN + :param use_attention: whether to use attention. Only sensible if encoding via RNN """ - super().__init__() self._label_type = label_type @@ -160,8 +159,8 @@ def words_to_char_indices( padding_in_front=False, seq_length=None, ): - """ - For a given list of strings this function creates index vectors that represent the characters of the strings. + """For a given list of strings this function creates index vectors that represent the characters of the strings. + Each string is represented by sequence_length (maximum string length + entries for special symbold) many indices representing characters in self.char_dict. One can manually set the vector length with the parameter seq_length, though the vector length is always @@ -407,8 +406,8 @@ def predict( return_loss=False, embedding_storage_mode="none", ): - """ - Predict lemmas of words for a given (list of) sentence(s). + """Predict lemmas of words for a given (list of) sentence(s). 
+ :param sentences: sentences to predict :param label_name: label name used for predicted lemmas :param mini_batch_size: number of tokens that are send through the RNN simultaneously, assuming batching_in_rnn @@ -707,12 +706,7 @@ def _print_predictions(self, batch, gold_label_type): return lines def evaluate(self, *args, **kwargs) -> Result: - """ - Overwrites evaluate of parent class to remove the "by class" printout - :param args: - :param kwargs: - :return: - """ + # Overwrites evaluate of parent class to remove the "by class" printout result = super().evaluate(*args, **kwargs) result.detailed_results = result.detailed_results.split("\n\n")[0] return result diff --git a/flair/models/multitask_model.py b/flair/models/multitask_model.py index a59bac606c..f192166081 100644 --- a/flair/models/multitask_model.py +++ b/flair/models/multitask_model.py @@ -15,8 +15,8 @@ class MultitaskModel(flair.nn.Classifier): - """ - Multitask Model class which acts as wrapper for creating custom multitask models. + """Multitask Model class which acts as wrapper for creating custom multitask models. + Takes different tasks as input, parameter sharing is done by objects in flair, i.e. creating a Embedding Layer and passing it to two different Models, will result in a hard parameter-shared embedding layer. The abstract class takes care @@ -31,7 +31,8 @@ def __init__( loss_factors: Optional[List[float]] = None, use_all_tasks: bool = False, ): - """ + """Instantiates the MultiTaskModel. + :param models: Key (Task ID) - Value (flair.nn.Model) Pairs to stack model """ super(MultitaskModel, self).__init__() @@ -61,9 +62,8 @@ def _prepare_tensors(self, data_points: List[DT]) -> Tuple[torch.Tensor, ...]: raise NotImplementedError("`_prepare_tensors` is not used for multitask learning") def forward_loss(self, sentences: Union[List[Sentence], Sentence]) -> Tuple[torch.Tensor, int]: - """ - Abstract forward loss implementation of flair.nn.Model's interface. - Calls the respective forward loss of each model. + """Calls the respective forward loss of each model and sums them weighted by their loss factors. + :param sentences: batch of sentences :return: loss """ @@ -86,9 +86,10 @@ def predict( @staticmethod def split_batch_to_task_ids(sentences: Union[List[Sentence], Sentence], all_tasks: bool = False) -> Dict: - """ - Splits a batch of sentences to its respective model. If single sentence is assigned to several tasks - (i.e. same corpus but different tasks), then the model assignment for this batch is randomly choosen. + """Splits a batch of sentences to its respective model. + + If single sentence is assigned to several tasks (i.e. same corpus but different tasks), then the model + assignment for this batch is randomly chosen. :param sentences: batch of sentences :param all_tasks: use all tasks of each sentence. If deactivated, a random task will be sampled :return: Key-value pairs as (task_id, list of sentences ids in batch) @@ -120,7 +121,8 @@ def evaluate( evaluate_all: bool = True, **evalargs, ) -> Result: - """ + """Evaluates the model. Returns a Result object containing evaluation results and a loss value. 
+
        :param sentences: batch of sentences
        :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed),
        'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU)
        :param mini_batch_size: size of batches
        :param evaluate_all: choose if all tasks should be evaluated, or a single one, depending on gold_label_type
        :return: Tuple of Result object and loss value (float)
        """
-
        if not evaluate_all:
            if gold_label_type not in self.tasks:
                raise ValueError(
@@ -203,8 +204,8 @@ def evaluate(
         )

     def _get_state_dict(self):
-        """
-        Returns the state dict of the multitask model which has multiple models underneath.
+        """Returns the state dict of the multitask model which has multiple models underneath.
+
        :return model_state: model state for the multitask model
        """
        initial_model_state = super()._get_state_dict()
@@ -220,9 +221,7 @@ def _get_state_dict(self):

     @classmethod
     def _init_model_with_state_dict(cls, state, **kwargs):
-        """
-        Initializes the model based on given state dict.
-        """
+        """Initializes the model based on given state dict."""
         models = []
         tasks = []
         loss_factors = state["loss_factors"]
diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py
index 8d28d82cef..8286b0b6d6 100644
--- a/flair/models/pairwise_classification_model.py
+++ b/flair/models/pairwise_classification_model.py
@@ -9,8 +9,8 @@

 class TextPairClassifier(flair.nn.DefaultClassifier[TextPair, TextPair]):
-    """
-    Text Pair Classification Model for tasks such as Recognizing Textual Entailment, build upon TextClassifier.
+    """Text Pair Classification Model for tasks such as Recognizing Textual Entailment, built upon TextClassifier.
+
     The model takes document embeddings and puts resulting text representation(s) into a linear layer to get the
     actual class label. We provide two ways to embed the DataPairs: Either by embedding both DataPoints
     and concatenating the resulting vectors ("embed_separately=True") or by concatenating the DataPoints and embedding
@@ -24,8 +24,8 @@ def __init__(
         embed_separately: bool = False,
         **classifierargs,
     ):
-        """
-        Initializes a TextClassifier
+        """Initializes a TextPairClassifier.
+
        :param embeddings: embeddings used to embed each data point
        :param label_dictionary: dictionary of labels you want to predict
        :param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction
diff --git a/flair/models/regexp_tagger.py b/flair/models/regexp_tagger.py
index af33f03fa0..37f77e985a 100644
--- a/flair/models/regexp_tagger.py
+++ b/flair/models/regexp_tagger.py
@@ -8,8 +8,8 @@

 @dataclass
 class TokenCollection:
-    """
-    A utility class for RegexpTagger to hold all tokens for a given Sentence and define some functionality
+    """A utility class for RegexpTagger to hold all tokens for a given Sentence and define some functionality.
+
    :param sentence: A Sentence object
    """

@@ -27,7 +27,8 @@ def tokens(self) -> List[Token]:
         return list(self.sentence)

     def get_token_span(self, span: Tuple[int, int]) -> Span:
-        """
+        """Find a span by the token character positions.
+
         Given an interval specified with start and end pos as tuple, this function returns a Span object
         spanning the tokens included in the interval. 
If the interval is overlapping with a token span, a ValueError is raised @@ -42,8 +43,7 @@ def get_token_span(self, span: Tuple[int, int]) -> Span: class RegexpTagger: def __init__(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]]): - """ - This tagger is capable of tagging sentence objects with given regexp -> label mappings. + r"""This tagger is capable of tagging sentence objects with given regexp -> label mappings. I.e: The tuple (r'(["\'])(?:(?=(\\?))\2.)*?\1', 'QUOTE') maps every match of the regexp to a labeled span and therefore labels the given sentence object with RegexpTagger.predict(). @@ -62,8 +62,8 @@ def registered_labels(self): return self._regexp_mapping def register_labels(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]]): - """ - Register a regexp -> label mapping. + """Register a regexp -> label mapping. + :param mapping: A list of tuples or a single tuple representing a mapping as regexp -> label """ mapping = self._listify(mapping) @@ -77,8 +77,8 @@ def register_labels(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]] ) def remove_labels(self, labels: Union[List[str], str]): - """ - Remove a registered regexp -> label mapping given by label. + """Remove a registered regexp -> label mapping given by label. + :param labels: A list of labels or a single label as strings. """ labels = self._listify(labels) @@ -96,9 +96,7 @@ def _listify(element: object) -> list: return element def predict(self, sentences: Union[List[Sentence], Sentence]) -> List[Sentence]: - """ - Predict the given sentences according to the registered mappings. - """ + """Predict the given sentences according to the registered mappings.""" if not isinstance(sentences, list): sentences = [sentences] if not sentences: @@ -110,8 +108,8 @@ def predict(self, sentences: Union[List[Sentence], Sentence]) -> List[Sentence]: return sentences def _label(self, sentence: Sentence): - """ - This will add a complex_label to the given sentence for every match.span() for every registered_mapping. + """This will add a complex_label to the given sentence for every match.span() for every registered_mapping. + If a match span overlaps with a token span an exception is raised. """ collection = TokenCollection(sentence) diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index f64d3cdc6e..e02b762e71 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -39,9 +39,8 @@ class EncodedSentence(Sentence): - """ - This class is a wrapper of the regular `Sentence` object - that expresses that a sentence is encoded and compatible with the relation classifier. + """A Sentence that expresses that a sentence is encoded and compatible with the relation classifier. + For inference, i.e. `predict` and `evaluate`, the relation classifier internally encodes the sentences. Therefore, these functions work with the regular flair sentence objects. """ @@ -50,10 +49,7 @@ class EncodedSentence(Sentence): class EncodingStrategy(ABC): - """ - The :class:`EncodingStrategy` protocol defines - the encoding of the head and tail entities in a sentence with a relation annotation. 
-    """
+    """The encoding of the head and tail entities in a sentence with a relation annotation."""

     special_tokens: Set[str] = set()

     def __init__(self, add_special_tokens: bool = False) -> None:
@@ -62,24 +58,23 @@ def __init__(self, add_special_tokens: bool = False) -> None:

     @abstractmethod
     def encode_head(self, head_span: Span, label: Label) -> str:
-        """
-        Returns the encoded string representation of the head span.
+        """Returns the encoded string representation of the head span.
+
         Multi-token head encodings tokens are separated by a space.
         """
         ...

     @abstractmethod
     def encode_tail(self, tail_span: Span, label: Label) -> str:
-        """
-        Returns the encoded string representation of the tail span.
+        """Returns the encoded string representation of the tail span.
+
         Multi-token tail encodings tokens are separated by a space.
         """
         ...

 class EntityMask(EncodingStrategy):
-    """
-    An `class`:EncodingStrategy: that masks the head and tail relation entities.
+    """An :class:`EncodingStrategy` that masks the head and tail relation entities.

     Example:
         For the `founded_by` relation from `ORG` to `PER` and
@@ -99,8 +94,7 @@ def encode_tail(self, tail_span: Span, label: Label) -> str:

 class TypedEntityMask(EncodingStrategy):
-    """
-    An `class`:EncodingStrategy: that masks the head and tail relation entities with their label.
+    """An :class:`EncodingStrategy` that masks the head and tail relation entities with their label.

     Example:
         For the `founded_by` relation from `ORG` to `PER` and
@@ -118,8 +112,7 @@ def encode_tail(self, tail: Span, label: Label) -> str:

 class EntityMarker(EncodingStrategy):
-    """
-    An `class`:EncodingStrategy: that marks the head and tail relation entities.
+    """An :class:`EncodingStrategy` that marks the head and tail relation entities.

     Example:
         For the `founded_by` relation from `ORG` to `PER` and
@@ -143,8 +136,7 @@ def encode_tail(self, tail: Span, label: Label) -> str:

 class TypedEntityMarker(EncodingStrategy):
-    """
-    An `class`:EncodingStrategy: that marks the head and tail relation entities with their label.
+    """An :class:`EncodingStrategy` that marks the head and tail relation entities with their label.

     Example:
         For the `founded_by` relation from `ORG` to `PER` and
@@ -166,8 +158,7 @@ def encode_tail(self, tail: Span, label: Label) -> str:

 class EntityMarkerPunct(EncodingStrategy):
-    """
-    An alternate version of `class`:EntityMarker: with punctuations as control tokens.
+    """An alternate version of :class:`EntityMarker` with punctuations as control tokens.

     Example:
         For the `founded_by` relation from `ORG` to `PER` and
@@ -187,8 +178,7 @@ def encode_tail(self, tail: Span, label: Label) -> str:

 class TypedEntityMarkerPunct(EncodingStrategy):
-    """
-    An alternate version of `class`:TypedEntityMarker: with punctuations as control tokens.
+    """An alternate version of :class:`TypedEntityMarker` with punctuations as control tokens.

     Example:
         For the `founded_by` relation from `ORG` to `PER` and
@@ -210,8 +200,8 @@ def encode_tail(self, tail: Span, label: Label) -> str:

 class _Entity(NamedTuple):
-    """
-    A `_Entity` encapsulates either a relation's head or a tail span, including its label.
+    """An `_Entity` encapsulates either a relation's head or a tail span, including its label.
+
     This class servers as an internal helper class.
     """

@@ -223,13 +213,13 @@ class _Entity(NamedTuple):

 # - MaskedRelationClassifier ?
 # This depends if this relation classification architecture should replace or offer as an alternative. 
class RelationClassifier(flair.nn.DefaultClassifier[EncodedSentence, EncodedSentence]): - """ + """Relation Classifier to predict the relation between two entities. + ---- Task ---- Relation Classification (RC) is the task of identifying the semantic relation between two entities in a text. In contrast to (end-to-end) Relation Extraction (RE), RC requires pre-labelled entities. Example: - For the `founded_by` relation from `ORG` (head) to `PER` (tail) and the sentence "Larry Page and Sergey Brin founded Google .", we extract the relations - founded_by(head='Google', tail='Larry Page') and @@ -263,8 +253,7 @@ def __init__( allow_unk_tag: bool = True, **classifierargs, ) -> None: - """ - Initializes a `RelationClassifier`. + """Initializes a `RelationClassifier`. :param embeddings: The document embeddings used to embed each sentence :param label_dictionary: A Dictionary containing all predictable labels from the corpus @@ -347,8 +336,7 @@ def __init__( self.to(flair.device) def _valid_entities(self, sentence: Sentence) -> Iterator[_Entity]: - """ - Yields all valid entities, filtered under the specification of `self.entity_label_types`. + """Yields all valid entities, filtered under the specification of `self.entity_label_types`. :param sentence: A flair `Sentence` object with entity annotations :return: Valid entities as `_Entity` @@ -371,15 +359,15 @@ def _entity_pair_permutations( self, sentence: Sentence, ) -> Iterator[Tuple[_Entity, _Entity, Optional[str]]]: - """ - Yields all valid entity pair permutations (relation candidates). + """Yields all valid entity pair permutations (relation candidates). + If the passed sentence contains relation annotations, the relation gold label will be yielded along with the participating entities. The permutations are constructed by a filtered cross-product under the specification of `self.entity_label_types` and `self.entity_pair_labels`. :param sentence: A flair `Sentence` object with entity annotations - :return: Tuples of (HEAD, TAIL, gold_label). + :yields: Tuples of (HEAD, TAIL, gold_label). The head and tail `_Entity`s have span references to the passed sentence. """ valid_entities: List[_Entity] = list(self._valid_entities(sentence)) @@ -416,8 +404,8 @@ def _encode_sentence( tail: _Entity, gold_label: Optional[str] = None, ) -> EncodedSentence: - """ - Returns a new `Sentence` object with masked/marked head and tail spans according to the encoding strategy. + """Returns a new `Sentence` object with masked/marked head and tail spans according to the encoding strategy. + If provided, the encoded sentence also has the corresponding gold label annotation from `self.label_type`. :param head: The head `_Entity` @@ -467,7 +455,8 @@ def _encode_sentence_for_inference( self, sentence: Sentence, ) -> Iterator[Tuple[EncodedSentence, Relation]]: - """ + """Create Encoded Sentences and Relation pairs for Inference. + Yields encoded sentences annotated with their gold relation and the corresponding relation object in the original sentence for all valid entity pair permutations. The created encoded sentences are newly created sentences with no reference to the passed sentence. @@ -491,8 +480,10 @@ def _encode_sentence_for_inference( yield masked_sentence, original_relation def _encode_sentence_for_training(self, sentence: Sentence) -> Iterator[EncodedSentence]: - """ - Same as `self._encode_sentence_for_inference`, + """Create Encoded Sentences and Relation pairs for Training. + + Same as `self._encode_sentence_for_inference`. 
+ with the option of disabling cross augmentation via `self.cross_augmentation` (and that the relation with reference to the original sentence is not returned). """ @@ -512,8 +503,8 @@ def _encode_sentence_for_training(self, sentence: Sentence) -> Iterator[EncodedS yield masked_sentence def transform_sentence(self, sentences: Union[Sentence, List[Sentence]]) -> List[EncodedSentence]: - """ - Transforms sentences into encoded sentences specific to the `RelationClassifier`. + """Transforms sentences into encoded sentences specific to the `RelationClassifier`. + For more information on the internal sentence transformation procedure, see the :class:`RelationClassifier` architecture and the different :class:`EncodingStrategy` variants docstrings. @@ -531,8 +522,8 @@ def transform_sentence(self, sentences: Union[Sentence, List[Sentence]]) -> List ] def transform_dataset(self, dataset: Dataset[Sentence]) -> FlairDatapointDataset[EncodedSentence]: - """ - Transforms a dataset into a dataset containing encoded sentences specific to the `RelationClassifier`. + """Transforms a dataset into a dataset containing encoded sentences specific to the `RelationClassifier`. + The returned dataset is stored in memory. For more information on the internal sentence transformation procedure, see the :class:`RelationClassifier` architecture and @@ -546,8 +537,8 @@ def transform_dataset(self, dataset: Dataset[Sentence]) -> FlairDatapointDataset return FlairDatapointDataset(self.transform_sentence(original_sentences)) def transform_corpus(self, corpus: Corpus[Sentence]) -> Corpus[EncodedSentence]: - """ - Transforms a corpus into a corpus containing encoded sentences specific to the `RelationClassifier`. + """Transforms a corpus into a corpus containing encoded sentences specific to the `RelationClassifier`. + The splits of the returned corpus are stored in memory. For more information on the internal sentence transformation procedure, see the :class:`RelationClassifier` architecture and @@ -571,11 +562,10 @@ def _get_embedding_for_data_point(self, prediction_data_point: EncodedSentence) return prediction_data_point.get_embedding(embedding_names) def _get_data_points_from_sentence(self, sentence: EncodedSentence) -> List[EncodedSentence]: - """ - Returns the encoded sentences to which labels are added. + """Returns the encoded sentences to which labels are added. + To encode sentences, use the `transform` function of the `RelationClassifier`. """ - # Ensure that all sentences are encoded properly if not isinstance(sentence, EncodedSentence): raise ValueError( @@ -605,8 +595,8 @@ def predict( return_loss: bool = False, embedding_storage_mode: str = "none", ) -> Optional[Tuple[torch.Tensor, int]]: - """ - Predicts the class labels for the given sentence(s). + """Predicts the class labels for the given sentence(s). + Standard `Sentence` objects and `EncodedSentences` specific to the `RelationClassifier` are allowed as input. The (relation) labels are directly added to the sentences. diff --git a/flair/models/relation_extractor_model.py b/flair/models/relation_extractor_model.py index ab762bb91c..62fe7470ad 100644 --- a/flair/models/relation_extractor_model.py +++ b/flair/models/relation_extractor_model.py @@ -23,16 +23,15 @@ def __init__( train_on_gold_pairs_only: bool = False, **classifierargs, ): - """ - Initializes a RelationClassifier + """Initializes a RelationClassifier. 
+ :param document_embeddings: embeddings used to embed each data point :param label_dictionary: dictionary of labels you want to predict :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_weights: Dictionary of weights for labels for the loss function :param train_on_gold_pairs_only: Set true to not train to predict no relation. - (if any label's weight is unspecified it will default to 1.0) + :param loss_weights: Dictionary of weights for labels for the loss function + (if any label's weight is unspecified it will default to 1.0) """ - # pooling operation to get embeddings for entites self.pooling_operation = pooling_operation relation_representation_length = 2 * embeddings.embedding_length diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 7f0609e66a..a5830a0fc6 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -45,8 +45,8 @@ def __init__( init_from_state_dict: bool = False, allow_unk_predictions: bool = False, ): - """ - Sequence Tagger class for predicting labels for single tokens. Can be parameterized by several attributes. + """Sequence Tagger class for predicting labels for single tokens. Can be parameterized by several attributes. + In case of multitask learning, pass shared embeddings or shared rnn into respective attributes. :param embeddings: Embeddings to use during training and prediction :param tag_dictionary: Dictionary containing all tags from corpus which can be predicted @@ -206,8 +206,8 @@ def label_type(self): return self.tag_type def _init_loss_weights(self, loss_weights: Dict[str, float]) -> torch.Tensor: - """ - Intializes the loss weights based on given dictionary: + """Initializes the loss weights based on given dictionary. + :param loss_weights: dictionary - contains loss weights """ n_classes = len(self.label_dictionary) @@ -219,8 +219,8 @@ def _init_loss_weights(self, loss_weights: Dict[str, float]) -> torch.Tensor: return torch.tensor(weight_list).to(flair.device) def _init_initial_hidden_state(self, num_directions: int): - """ - Intializes hidden states given the number of directions in RNN. + """Initializes hidden states given the number of directions in RNN. + :param num_directions: Number of directions in RNN. """ hs_initializer = torch.nn.init.xavier_normal_ @@ -243,8 +243,8 @@ def RNN( bidirectional: bool, rnn_input_dim: int, ) -> torch.nn.RNN: - """ - Static wrapper function returning an RNN instance from PyTorch + """Static wrapper function returning an RNN instance from PyTorch. + :param rnn_type: Type of RNN from torch.nn :param rnn_layers: number of layers to include :param hidden_size: hidden size of RNN cell @@ -292,8 +292,8 @@ def _prepare_tensors(self, data_points: Union[List[Sentence], Sentence]) -> Tupl return sentence_tensor, lengths def forward(self, sentence_tensor: torch.Tensor, lengths: torch.LongTensor): # type: ignore[override] - """ - Forward propagation through network. + """Forward propagation through network. + :param sentence_tensor: A tensor representing the batch of sentences. :param lengths: A IntTensor representing the lengths of the respective sentences. """ @@ -366,9 +366,10 @@ def _make_padded_tensor_for_batch(self, sentences: List[Sentence]) -> Tuple[torc @staticmethod def _get_scores_from_features(features: torch.Tensor, lengths: torch.Tensor): - """ - Trims current batch tensor in shape (batch size, sequence length, tagset size) in such a way that all - pads are going to be removed. 
+ """Remove paddings to get a smaller tensor. + + Trims current batch tensor in shape (batch size, sequence length, tagset size) + in such a way that all pads are going to be removed. :param features: torch.tensor containing all features from forward propagation :param lengths: length from each sentence in batch in order to trim padding tokens """ @@ -380,8 +381,8 @@ def _get_scores_from_features(features: torch.Tensor, lengths: torch.Tensor): return scores def _get_gold_labels(self, sentences: List[Sentence]) -> List[str]: - """ - Extracts gold labels from each sentence. + """Extracts gold labels from each sentence. + :param sentences: List of sentences in batch """ # spans need to be encoded as token-level predictions @@ -432,8 +433,8 @@ def predict( embedding_storage_mode="none", force_token_predictions: bool = False, ): # type: ignore - """ - Predicts labels for current batch with CRF or Softmax. + """Predicts labels for current batch with CRF or Softmax. + :param sentences: List of sentences in batch :param mini_batch_size: batch size for test data :param return_probabilities_for_all_classes: Whether to return probabilities for all classes @@ -533,8 +534,8 @@ def predict( return overall_loss, label_count def _standard_inference(self, features: torch.Tensor, batch: List[Sentence], probabilities_for_all_classes: bool): - """ - Softmax over emission scores from forward propagation. + """Softmax over emission scores from forward propagation. + :param features: sentence tensor from forward propagation :param batch: list of sentence :param probabilities_for_all_classes: whether to return score for each tag in tag dictionary @@ -563,8 +564,8 @@ def _standard_inference(self, features: torch.Tensor, batch: List[Sentence], pro return predictions, all_tags def _all_scores_for_token(self, sentences: List[Sentence], scores: torch.Tensor, lengths: List[int]): - """ - Returns all scores for each tag in tag dictionary. + """Returns all scores for each tag in tag dictionary. + :param scores: Scores for current sentence. """ scores = scores.numpy() @@ -921,8 +922,8 @@ def push_to_hub( private: bool = None, commit_message: str = "Add new SequenceTagger model.", ): - """ - Uploads the Sequence Tagger model to a Hugging Face Hub repository. + """Uploads the Sequence Tagger model to a Hugging Face Hub repository. + :param repo_id: A namespace (user or an organization) and a repo name separated by a `/`. :param token: An authentication token (See https://huggingface.co/settings/token). :param private: Whether the repository is private. 
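The predict() and push_to_hub() docstrings above correspond to the usual inference workflow; a minimal sketch (the model identifier is illustrative, not part of this diff):

    from flair.data import Sentence
    from flair.models import SequenceTagger

    # Load a pretrained tagger and annotate a sentence in place.
    tagger = SequenceTagger.load("flair/ner-english-fast")
    sentence = Sentence("George Washington went to Washington .")
    tagger.predict(sentence)
    for label in sentence.get_labels("ner"):
        print(label)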
diff --git a/pyproject.toml b/pyproject.toml index 02c1a71f07..e085f4af8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ exclude = ''' ''' [tool.pytest.ini_options] flake8-max-line-length = 210 -flake8-ignore = ["E203", "W503", "D100", "D101", "D102", "D103", "D104", "D105", "D107"] # See https://github.com/PyCQA/pycodestyle/issues/373 +flake8-ignore = ["E203", "W503", "D100", "D101", "D102", "D103", "D104", "D105", "D107", "legacy.py ALL"] # See https://github.com/PyCQA/pycodestyle/issues/373 addopts = "--flake8 --mypy --isort" filterwarnings = [ "error", # Convert all warnings to errors From a52c77b9f183a1c2bea1fb547dbea1d05b8deb86 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 27 Mar 2023 17:59:24 +0200 Subject: [PATCH 04/12] fix docstrings for some dataset files --- flair/datasets/base.py | 22 ++++---- flair/datasets/ocr.py | 11 ++-- flair/datasets/relation_extraction.py | 20 +++---- flair/datasets/text_text.py | 76 +++++++++++++-------------- flair/datasets/treebanks.py | 14 ++--- 5 files changed, 68 insertions(+), 75 deletions(-) diff --git a/flair/datasets/base.py b/flair/datasets/base.py index 04b3ab881f..43e8c8e7e3 100644 --- a/flair/datasets/base.py +++ b/flair/datasets/base.py @@ -39,13 +39,11 @@ def __init__( class FlairDatapointDataset(FlairDataset, Generic[DT]): - """ - A simple Dataset object to wrap a List of Datapoints, for example Sentences - """ + """A simple Dataset object to wrap a List of Datapoints, for example Sentences.""" def __init__(self, datapoints: Union[DT, List[DT]]): - """ - Instantiate FlairDatapointDataset + """Instantiate FlairDatapointDataset. + :param sentences: DT or List of DT that make up FlairDatapointDataset """ # cast to list if necessary @@ -70,17 +68,15 @@ def __init__(self, sentences: Union[Sentence, List[Sentence]]): class StringDataset(FlairDataset): - """ - A Dataset taking string as input and returning Sentence during iteration - """ + """A Dataset taking string as input and returning Sentence during iteration.""" def __init__( self, texts: Union[str, List[str]], use_tokenizer: Union[bool, Tokenizer] = SpaceTokenizer(), ): - """ - Instantiate StringDataset + """Instantiate StringDataset. + :param texts: a string or List of string that make up StringDataset :param use_tokenizer: Custom tokenizer to use (default is SpaceTokenizer, more advanced options are SegTokTokenizer to use segtok or SpacyTokenizer to use Spacy library models @@ -121,8 +117,9 @@ def __init__( in_memory: bool = True, tag_type: str = "class", ): - """ - Reads Mongo collections. Each collection should contain one document/text per item. + """Reads Mongo collections. + + Each collection should contain one document/text per item. Each item should have the following format: { @@ -147,7 +144,6 @@ def __init__( :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings :return: list of sentences """ - # first, check if pymongo is installed try: import pymongo diff --git a/flair/datasets/ocr.py b/flair/datasets/ocr.py index 2ad960ff5d..c9f31c4023 100644 --- a/flair/datasets/ocr.py +++ b/flair/datasets/ocr.py @@ -22,8 +22,8 @@ def __init__( normalize_coords_to_thousands: bool = True, label_name_map: Dict[str, str] = None, ): - """ - Instantiates a Dataset from a OCR-Json format. + """Instantiates a Dataset from a OCR-Json format. + The folder is structured with a "images" folder and a "tagged" folder. Those folders contain respectively .jpg and .json files with matching file name. 
The json contains 3 fields "words", "bbox", "labels" which are lists of equal length @@ -136,8 +136,7 @@ def __init__( label_name_map: Dict[str, str] = None, **corpusargs, ): - """ - Instantiates a Corpus from a OCR-Json format + """Instantiates a Corpus from a OCR-Json format. :param train_path: the folder for the training data :param dev_path: the folder for the dev data @@ -210,8 +209,8 @@ def __init__( label_name_map: Dict[str, str] = None, **corpusargs, ): - """ - Instantiates the SROIE corpus with perfect ocr boxes. + """Instantiates the SROIE corpus with perfect ocr boxes. + :param base_path: the path to store the dataset or load it from :param label_type: the label_type to add the ocr labels to :param encoding: the encoding to load the .json files with diff --git a/flair/datasets/relation_extraction.py b/flair/datasets/relation_extraction.py index 3730879a8f..b852651c7f 100644 --- a/flair/datasets/relation_extraction.py +++ b/flair/datasets/relation_extraction.py @@ -22,7 +22,7 @@ def convert_ptb_token(token: str) -> str: - """Convert PTB tokens to normal tokens""" + """Convert PTB tokens to normal tokens.""" return { "-lrb-": "(", "-rrb-": ")", @@ -41,9 +41,9 @@ def __init__( augment_train: bool = False, **corpusargs, ): - """ - SemEval-2010 Task 8 on Multi-Way Classification of Semantic Relations Between Pairs of - Nominals: https://aclanthology.org/S10-1006.pdf + """SemEval-2010 Task 8 on Multi-Way Classification of Semantic Relations Between Pairs of Nominals. + + see https://aclanthology.org/S10-1006.pdf :param base_path: :param in_memory: :param augment_train: @@ -228,8 +228,9 @@ def _semeval_lines_to_token_list(self, raw_lines, augment_relations): class RE_ENGLISH_TACRED(ColumnCorpus): def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, **corpusargs): - """ - TAC Relation Extraction Dataset with 41 relations from https://nlp.stanford.edu/projects/tacred/. + """TAC Relation Extraction Dataset. + + with 41 relations from https://nlp.stanford.edu/projects/tacred/. Manual download is required for this dataset. :param base_path: :param in_memory: @@ -537,9 +538,10 @@ def __init__( sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(), **corpusargs, ): - """ - DrugProt corpus: Biocreative VII Track 1 from https://zenodo.org/record/5119892#.YSdSaVuxU5k/ on - drug and chemical-protein interactions. + """Initialize the DrugProt corpus. + + Biocreative VII Track 1 from https://zenodo.org/record/5119892#.YSdSaVuxU5k/ on drug and chemical-protein + interactions. """ if not base_path: base_path = flair.cache_root / "datasets" diff --git a/flair/datasets/text_text.py b/flair/datasets/text_text.py index 3bb5471562..96fa1459f5 100644 --- a/flair/datasets/text_text.py +++ b/flair/datasets/text_text.py @@ -23,8 +23,7 @@ def __init__( in_memory: bool = True, **corpusargs, ): - """ - Instantiates a Corpus for text classification from CSV column formatted data + """Instantiates a Corpus for text classification from CSV column formatted data. :param data_folder: base folder with the task data :param train_file: the name of the train file @@ -61,8 +60,9 @@ def __init__( in_memory: bool = True, **corpusargs, ): - """ - Instantiates a Parallel Corpus from OPUS (http://opus.nlpl.eu/) + """Instantiates a Parallel Corpus from OPUS. + + see http://opus.nlpl.eu/ :param dataset: Name of the dataset (one of "tatoeba") :param l1: Language code of first language in pair ("en", "de", etc.) :param l2: Language code of second language in pair ("en", "de", etc.) 
@@ -71,7 +71,6 @@ def __init__( :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters :param in_memory: If True, keeps dataset fully in memory """ - if l1 > l2: l1, l2 = l2, l1 @@ -211,8 +210,9 @@ def __init__( separator: str = "\t", encoding: str = "utf-8", ): - """ - Corpus for tasks involving pairs of sentences or paragraphs. The data files are expected to be in column format where each line has a colmun + r"""Corpus for tasks involving pairs of sentences or paragraphs. + + The data files are expected to be in column format where each line has a column for the first sentence/paragraph, the second sentence/paragraph and the labels, respectively. The columns must be separated by a given separator (default: '\t'). :param data_folder: base folder with the task data @@ -234,7 +234,6 @@ def __init__( :return: a Corpus with annotated train, dev and test data """ - # find train, dev and test files if not specified dev_file, test_file, train_file = find_train_dev_test_files( data_folder, @@ -321,8 +320,9 @@ def __init__( encoding: str = "utf-8", label: bool = True, ): - """ - Creates a Dataset for pairs of sentences/paragraphs. The file needs to be in a column format, + r"""Creates a Dataset for pairs of sentences/paragraphs. + + The file needs to be in a column format, where each line has a column for the first sentence/paragraph, the second sentence/paragraph and the label seperated by e.g. '\t' (just like in the glue RTE-dataset https://gluebenchmark.com/tasks) . For each data pair we create a flair.data.DataPair object. @@ -340,7 +340,6 @@ def __init__( :param encoding: Encoding of the data file :param label: If False, the dataset expects unlabeled data """ - path_to_data = Path(path_to_data) # stop if file does not exist @@ -447,12 +446,12 @@ def __init__( in_memory: bool = True, sample_missing_splits: bool = True, ): - """ - Creates a DataPairCorpus for the Glue Recognizing Textual Entailment (RTE) data (https://gluebenchmark.com/tasks). - Additionaly to the Corpus we have a eval_dataset containing the test file of the Glue data. + """Creates a DataPairCorpus for the Glue Recognizing Textual Entailment (RTE) data. + + See https://gluebenchmark.com/tasks + Additionally to the Corpus we have a eval_dataset containing the test file of the Glue data. This file contains unlabeled test data to evaluate models on the Glue RTE task. """ - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -532,13 +531,13 @@ def __init__( in_memory: bool = True, sample_missing_splits: bool = True, ): - """ - Creates a DataPairCorpus for the Multi-Genre Natural Language Inference Corpus (MNLI) - from GLUE benchmark (https://gluebenchmark.com/tasks). Entailment annotations are: - entailment, contradiction, neutral. This corpus includes two dev sets mathced/mismatched - and two unlabeled test sets: eval_dataset_matched, eval_dataset_mismatched. - """ + """Creates a DataPairCorpus for the Multi-Genre Natural Language Inference Corpus (MNLI) from GLUE benchmark. + see https://gluebenchmark.com/tasks + Entailment annotations are: entailment, contradiction, neutral. + This corpus includes two dev sets mathced/mismatched and two unlabeled test sets: eval_dataset_matched, + eval_dataset_mismatched. 
+ """ if not base_path: base_path = flair.cache_root / "datasets" else: @@ -652,12 +651,11 @@ def __init__( in_memory: bool = True, sample_missing_splits: bool = True, ): - """ - Creates a DataPairCorpus for the Microsoft Research Paraphrase Corpus (MRPC) - from Glue benchmark (https://gluebenchmark.com/tasks). MRPC includes annotated - train and test sets. Dev set is sampled each time when creating this corpus. - """ + """Creates a DataPairCorpus for the Microsoft Research Paraphrase Corpus (MRPC) from Glue benchmark. + See https://gluebenchmark.com/tasks + MRPC includes annotated train and test sets. Dev set is sampled each time when creating this corpus. + """ if not base_path: base_path = flair.cache_root / "datasets" else: @@ -733,13 +731,12 @@ def __init__( in_memory: bool = True, sample_missing_splits: bool = True, ): - """ - Creates a DataPairCorpus for the Question-answering Natural Language Inference dataset - (QNLI) from GLUE benchmark (https://gluebenchmark.com/tasks). - Additionaly to the Corpus we have a eval_dataset containing the test file of the Glue data. + """Creates a DataPairCorpus for the Question-answering Natural Language Inference dataset (QNLI) from GLUE. + + see https://gluebenchmark.com/tasks + Additionally, to the Corpus we have an eval_dataset containing the test file of the Glue data. This file contains unlabeled test data to evaluate models on the Glue QNLI task. """ - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -820,13 +817,13 @@ def __init__( in_memory: bool = True, sample_missing_splits: bool = True, ): - """ - Creates a Quora Question Pairs (QQP) Corpus from the Glue benchmark (https://gluebenchmark.com/tasks). + """Creates a Quora Question Pairs (QQP) Corpus from the Glue benchmark. + + See https://gluebenchmark.com/tasks The task is to determine whether a pair of questions are semantically equivalent. Additionaly to the Corpus we have a eval_dataset containing the test file of the Glue data. This file contains unlabeled test data to evaluate models on the Glue QQP task. """ - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -907,13 +904,12 @@ def __init__( in_memory: bool = True, sample_missing_splits: bool = True, ): - """ - Creates a Winograd Schema Challenge Corpus formated as Natural Language Inference task (WNLI). + """Creates a Winograd Schema Challenge Corpus formated as Natural Language Inference task (WNLI). + The task is to predict if the sentence with the pronoun substituted is entailed by the original sentence. Additionaly to the Corpus we have a eval_dataset containing the test file of the Glue data. This file contains unlabeled test data to evaluate models on the Glue WNLI task. """ - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -992,12 +988,12 @@ def __init__( in_memory: bool = True, sample_missing_splits: bool = True, ): - """ - Creates a DataPairCorpus for the SuperGlue Recognizing Textual Entailment (RTE) data (https://super.gluebenchmark.com/tasks). + """Creates a DataPairCorpus for the SuperGlue Recognizing Textual Entailment (RTE) data. + + See https://super.gluebenchmark.com/tasks Additionaly to the Corpus we have a eval_dataset containing the test file of the SuperGlue data. This file contains unlabeled test data to evaluate models on the SuperGlue RTE task. 
""" - if not base_path: base_path = flair.cache_root / "datasets" else: diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py index 6e13626385..d6dd777644 100755 --- a/flair/datasets/treebanks.py +++ b/flair/datasets/treebanks.py @@ -21,8 +21,7 @@ def __init__( in_memory: bool = True, split_multiwords: bool = True, ): - """ - Instantiates a Corpus from CoNLL-U column-formatted task data such as the UD corpora + """Instantiates a Corpus from CoNLL-U column-formatted task data such as the UD corpora. :param data_folder: base folder with the task data :param train_file: the name of the train file @@ -32,7 +31,6 @@ def __init__( :param split_multiwords: If set to True, multiwords are split (default), otherwise kept as single tokens :return: a Corpus with annotated train, dev and test data """ - # find train, dev and test files if not specified dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file) @@ -63,8 +61,7 @@ def __init__( in_memory: bool = True, split_multiwords: bool = True, ): - """ - Instantiates a column dataset in CoNLL-U format. + """Instantiates a column dataset in CoNLL-U format. :param path_to_conll_file: Path to the CoNLL-U formatted file :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads @@ -505,10 +502,13 @@ def __init__( class UD_FAROESE(UniversalDependenciesCorpus): - """This treebank includes the Faroese treebank dataset from the following link: + """This treebank includes the Faroese treebank dataset. + + The data is obtained from the following link: https://github.com/UniversalDependencies/UD_Faroese-FarPaHC/tree/master - Faronese is a small Western Scandinavian language with 60.000-100.000, related to Icelandic and Old Norse""" + Faronese is a small Western Scandinavian language with 60.000-100.000, related to Icelandic and Old Norse. + """ def __init__( self, From c92424b95187bc4465d2db7cec59d224ed2f90bb Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Apr 2023 16:50:39 +0200 Subject: [PATCH 05/12] fix more documentation errors --- flair/datasets/biomedical.py | 630 +++++++++------------- flair/datasets/document_classification.py | 270 +++++----- flair/datasets/entity_linking.py | 123 +++-- flair/datasets/sequence_labeling.py | 386 +++++++------ 4 files changed, 626 insertions(+), 783 deletions(-) diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index aa0f0c9335..af88148460 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -50,10 +50,11 @@ class Entity: - """ - Internal class to represent entities while converting biomedical NER corpora to a standardized format - (only used for pre-processing purposes!). Each entity consists of the char span it addresses in - the original text as well as the type of entity (e.g. Chemical, Gene, and so on). + """Internal class to represent entities while converting biomedical NER + corpora to a standardized format (only used for pre-processing purposes!). + + Each entity consists of the char span it addresses in the original + text as well as the type of entity (e.g. Chemical, Gene, and so on). """ def __init__(self, char_span: Tuple[int, int], entity_type: str): @@ -68,16 +69,14 @@ def __repr__(self): return str(self) def is_before(self, other_entity) -> bool: - """ - Checks whether this entity is located before the given one + """Checks whether this entity is located before the given one. 
:param other_entity: Entity to check """ return self.char_span.stop <= other_entity.char_span.start def contains(self, other_entity) -> bool: - """ - Checks whether the given entity is fully contained in this entity + """Checks whether the given entity is fully contained in this entity. :param other_entity: Entity to check """ @@ -86,8 +85,7 @@ def contains(self, other_entity) -> bool: ) def overlaps(self, other_entity) -> bool: - """ - Checks whether this and the given entity overlap + """Checks whether this and the given entity overlap. :param other_entity: Entity to check """ @@ -97,9 +95,7 @@ def overlaps(self, other_entity) -> bool: class InternalBioNerDataset: - """ - Internal class to represent a corpus and it's entities. - """ + """Internal class to represent a corpus and it's entities.""" def __init__(self, documents: Dict[str, str], entities_per_document: Dict[str, List[Entity]]): self.documents = documents @@ -194,10 +190,9 @@ def filter_nested_entities(dataset: InternalBioNerDataset) -> None: def bioc_to_internal(bioc_file: Path): - """ - Helper function to parse corpora that are given in BIOC format. See + """Helper function to parse corpora that are given in BIOC format. See. - http://bioc.sourceforge.net/ + http://bioc.sourceforge.net/ for details. """ @@ -281,13 +276,11 @@ def bioc_to_internal(bioc_file: Path): def brat_to_internal(corpus_dir: Path, ann_file_suffixes=None) -> InternalBioNerDataset: - """ - Helper function to parse corpora that are annotated using BRAT. See + """Helper function to parse corpora that are annotated using BRAT. See. - https://brat.nlplab.org/ + https://brat.nlplab.org/ for details. - """ if ann_file_suffixes is None: ann_file_suffixes = [".ann"] @@ -418,8 +411,7 @@ def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path): class HunerDataset(ColumnCorpus, ABC): - """ - Base class for HUNER datasets. + """Base class for HUNER datasets. Every subclass has to implement the following methods: - `to_internal', which reads the complete data set (incl. train, dev, test) and returns the corpus @@ -443,10 +435,12 @@ def split_url() -> str: raise NotImplementedError() def get_corpus_sentence_splitter(self) -> Optional[SentenceSplitter]: - """ - If the corpus has a pre-defined sentence splitting, then this method returns - the sentence splitter to be used to reconstruct the original splitting. - If the corpus has no pre-defined sentence splitting None will be returned. + """If the corpus has a pre-defined sentence splitting, then this method + returns the sentence splitter to be used to reconstruct the original + splitting. + + If the corpus has no pre-defined sentence splitting None will be + returned. """ return None @@ -462,7 +456,6 @@ def __init__( :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which segments the text into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -532,8 +525,7 @@ def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path class BIO_INFER(ColumnCorpus): - """ - Original BioInfer corpus + """Original BioInfer corpus. For further information see Pyysalo et al.: BioInfer: a corpus for information extraction in the biomedical domain @@ -549,7 +541,6 @@ def __init__( :param base_path: Path to the corpus on your machine :param in_memory: If True, keeps dataset in memory giving speedups in training. 
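# Usage sketch for the corpus classes in this module (assumes the BIO_INFER
# class defined above and network access for the first download):
from flair.datasets.biomedical import BIO_INFER

corpus = BIO_INFER()   # downloads and converts the data on first call
print(corpus)          # train/dev/test sizes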
""" - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -614,9 +605,8 @@ def parse_dataset(cls, original_file: Path): class HUNER_GENE_BIO_INFER(HunerDataset): - """ - HUNER version of the BioInfer corpus containing only gene/protein annotations - """ + """HUNER version of the BioInfer corpus containing only gene/protein + annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -646,12 +636,11 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class JNLPBA(ColumnCorpus): - """ - Original corpus of the JNLPBA shared task. + """Original corpus of the JNLPBA shared task. - For further information see Kim et al.: - Introduction to the Bio-Entity Recognition Task at JNLPBA - https://www.aclweb.org/anthology/W04-1213.pdf + For further information see Kim et al.: Introduction to the Bio- + Entity Recognition Task at JNLPBA + https://www.aclweb.org/anthology/W04-1213.pdf """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -659,7 +648,6 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): :param base_path: Path to the corpus on your machine :param in_memory: If True, keeps dataset in memory giving speedups in training. """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -786,9 +774,7 @@ def read_file(cls, input_iob_file: Path, sentence_tag: str) -> InternalBioNerDat class HUNER_GENE_JNLPBA(HunerDataset): - """ - HUNER version of the JNLPBA corpus containing gene annotations. - """ + """HUNER version of the JNLPBA corpus containing gene annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -818,9 +804,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_CELL_LINE_JNLPBA(HunerDataset): - """ - HUNER version of the JNLPBA corpus containing cell line annotations. - """ + """HUNER version of the JNLPBA corpus containing cell line annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -850,12 +834,12 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class CELL_FINDER(ColumnCorpus): - """ - Original CellFinder corpus containing cell line, species and gene annotations. + """Original CellFinder corpus containing cell line, species and gene + annotations. - For futher information see Neves et al.: - Annotating and evaluating text for stem cell research - https://pdfs.semanticscholar.org/38e3/75aeeeb1937d03c3c80128a70d8e7a74441f.pdf + For futher information see Neves et al.: Annotating and + evaluating text for stem cell research + https://pdfs.semanticscholar.org/38e3/75aeeeb1937d03c3c80128a70d8e7a74441f.pdf """ def __init__( @@ -935,9 +919,8 @@ def read_folder(cls, data_folder: Path) -> InternalBioNerDataset: class HUNER_CELL_LINE_CELL_FINDER(HunerDataset): - """ - HUNER version of the CellFinder corpus containing only cell line annotations. - """ + """HUNER version of the CellFinder corpus containing only cell line + annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -954,9 +937,8 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_SPECIES_CELL_FINDER(HunerDataset): - """ - HUNER version of the CellFinder corpus containing only species annotations. 
- """ + """HUNER version of the CellFinder corpus containing only species + annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -973,9 +955,8 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_GENE_CELL_FINDER(HunerDataset): - """ - HUNER version of the CellFinder corpus containing only gene annotations. - """ + """HUNER version of the CellFinder corpus containing only gene + annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -993,12 +974,11 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class MIRNA(ColumnCorpus): - """ - Original miRNA corpus. + """Original miRNA corpus. - For further information see Bagewadi et al.: - Detecting miRNA Mentions and Relations in Biomedical Literature - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4602280/ + For further information see Bagewadi et al.: Detecting miRNA + Mentions and Relations in Biomedical Literature + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4602280/ """ def __init__( @@ -1122,9 +1102,8 @@ def get_mirna_subset(dataset: InternalBioNerDataset, split_url: str, split_dir: class HUNER_GENE_MIRNA(HunerDataset): - """ - HUNER version of the miRNA corpus containing protein / gene annotations. - """ + """HUNER version of the miRNA corpus containing protein / gene + annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1160,9 +1139,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_SPECIES_MIRNA(HunerDataset): - """ - HUNER version of the miRNA corpus containing species annotations. - """ + """HUNER version of the miRNA corpus containing species annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1198,9 +1175,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_DISEASE_MIRNA(HunerDataset): - """ - HUNER version of the miRNA corpus containing disease annotations. - """ + """HUNER version of the miRNA corpus containing disease annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1236,7 +1211,8 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class KaewphanCorpusHelper: - """Helper class for the corpora from Kaewphan et al., i.e. CLL and Gellus""" + """Helper class for the corpora from Kaewphan et al., i.e. CLL and + Gellus.""" @staticmethod def download_cll_dataset(data_folder: Path): @@ -1345,12 +1321,12 @@ def read_dataset(nersuite_folder: Path, sentence_separator: str) -> InternalBioN class CLL(ColumnCorpus): - """ - Original CLL corpus containing cell line annotations. + """Original CLL corpus containing cell line annotations. 
- For further information, see Kaewphan et al.: - Cell line name recognition in support of the identification of synthetic lethality in cancer from text - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/ + For further information, see Kaewphan et al.: Cell line name + recognition in support of the identification of synthetic lethality + in cancer from text + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/ """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -1383,9 +1359,7 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class HUNER_CELL_LINE_CLL(HunerDataset): - """ - HUNER version of the CLL corpus containing cell line annotations. - """ + """HUNER version of the CLL corpus containing cell line annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1411,12 +1385,12 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class GELLUS(ColumnCorpus): - """ - Original Gellus corpus containing cell line annotations. + """Original Gellus corpus containing cell line annotations. - For further information, see Kaewphan et al.: - Cell line name recognition in support of the identification of synthetic lethality in cancer from text - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/ + For further information, see Kaewphan et al.: Cell line name + recognition in support of the identification of synthetic lethality + in cancer from text + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4708107/ """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -1457,9 +1431,7 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class HUNER_CELL_LINE_GELLUS(HunerDataset): - """ - HUNER version of the Gellus corpus containing cell line annotations. - """ + """HUNER version of the Gellus corpus containing cell line annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1488,8 +1460,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class LOCTEXT(ColumnCorpus): - """ - Original LOCTEXT corpus containing species annotations. + """Original LOCTEXT corpus containing species annotations. For further information see Cejuela et al.: LocText: relation extraction of protein localizations to assist database curation @@ -1582,9 +1553,7 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset: class HUNER_SPECIES_LOCTEXT(HunerDataset): - """ - HUNER version of the Loctext corpus containing species annotations. - """ + """HUNER version of the Loctext corpus containing species annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1601,9 +1570,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_GENE_LOCTEXT(HunerDataset): - """ - HUNER version of the Loctext corpus containing protein annotations. - """ + """HUNER version of the Loctext corpus containing protein annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1621,12 +1588,11 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CHEMDNER(ColumnCorpus): - """ - Original corpus of the CHEMDNER shared task. + """Original corpus of the CHEMDNER shared task. 
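# Corpora like CHEMDNER below accept a custom SentenceSplitter. A hedged
# sketch that swaps in the lighter segtok-based splitter (assuming
# SegtokSentenceSplitter is importable from flair.splitter; sentence and
# token boundaries will differ from the scispacy default accordingly):
from flair.splitter import SegtokSentenceSplitter
from flair.datasets.biomedical import CHEMDNER

corpus = CHEMDNER(sentence_splitter=SegtokSentenceSplitter())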
- For further information see Krallinger et al.: - The CHEMDNER corpus of chemicals and drugs and its annotation principles - https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2 + For further information see Krallinger et al.: The CHEMDNER corpus + of chemicals and drugs and its annotation principles + https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2 """ def __init__( @@ -1641,7 +1607,6 @@ def __init__( :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which segements documents into sentences and tokens """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -1688,9 +1653,7 @@ def download_dataset(data_dir: Path): class HUNER_CHEMICAL_CHEMDNER(HunerDataset): - """ - HUNER version of the CHEMDNER corpus containing chemical annotations. - """ + """HUNER version of the CHEMDNER corpus containing chemical annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1725,13 +1688,12 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class IEPA(ColumnCorpus): - """ - IEPA corpus as provided by http://corpora.informatik.hu-berlin.de/ + """IEPA corpus as provided by http://corpora.informatik.hu-berlin.de/ (Original corpus is 404) For further information see Ding, Berleant, Nettleton, Wurtele: - Mining MEDLINE: abstracts, sentences, or phrases? - https://www.ncbi.nlm.nih.gov/pubmed/11928487 + Mining MEDLINE: abstracts, sentences, or phrases? + https://www.ncbi.nlm.nih.gov/pubmed/11928487 """ def __init__( @@ -1743,7 +1705,6 @@ def __init__( :param base_path: Path to the corpus on your machine :param in_memory: If True, keeps dataset in memory giving speedups in training. """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -1815,9 +1776,7 @@ def parse_dataset(cls, original_file: Path): class HUNER_GENE_IEPA(HunerDataset): - """ - HUNER version of the IEPA corpus containing gene annotations. - """ + """HUNER version of the IEPA corpus containing gene annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1841,8 +1800,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class LINNEAUS(ColumnCorpus): - """ - Original LINNEAUS corpus containing species annotations. + """Original LINNEAUS corpus containing species annotations. For further information see Gerner et al.: LINNAEUS: a species name identification system for biomedical literature @@ -1861,7 +1819,6 @@ def __init__( :param tokenizer: Custom implementation of :class:`Tokenizer` which segments sentence into tokens (default :class:`SciSpacyTokenizer`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -1929,9 +1886,7 @@ def download_and_parse_dataset(data_dir: Path): class HUNER_SPECIES_LINNEAUS(HunerDataset): - """ - HUNER version of the LINNEAUS corpus containing species annotations. 
- """ + """HUNER version of the LINNEAUS corpus containing species annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1946,12 +1901,12 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CDR(ColumnCorpus): - """ - CDR corpus as provided by https://github.com/JHnlp/BioCreative-V-CDR-Corpus + """CDR corpus as provided by https://github.com/JHnlp/BioCreative-V-CDR- + Corpus. - For further information see Li et al.: - BioCreative V CDR task corpus: a resource for chemical disease relation extraction - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/ + For further information see Li et al.: BioCreative V CDR task + corpus: a resource for chemical disease relation extraction + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/ """ def __init__( @@ -1966,7 +1921,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -2013,9 +1967,7 @@ def download_dataset(data_dir: Path): class HUNER_DISEASE_CDR(HunerDataset): - """ - HUNER version of the IEPA corpus containing disease annotations. - """ + """HUNER version of the IEPA corpus containing disease annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2037,9 +1989,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_CHEMICAL_CDR(HunerDataset): - """ - HUNER version of the IEPA corpus containing chemical annotations. - """ + """HUNER version of the IEPA corpus containing chemical annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2062,12 +2012,12 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class VARIOME(ColumnCorpus): - """ - Variome corpus as provided by http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip + """Variome corpus as provided by http://corpora.informatik.hu- + berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip. - For further information see Verspoor et al.: - Annotating the biomedical literature for the human variome - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3676157/ + For further information see Verspoor et al.: Annotating the + biomedical literature for the human variome + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3676157/ """ def __init__( @@ -2082,7 +2032,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -2158,9 +2107,7 @@ def parse_corpus(corpus_xml: Path) -> InternalBioNerDataset: class HUNER_GENE_VARIOME(HunerDataset): - """ - HUNER version of the Variome corpus containing gene annotations. - """ + """HUNER version of the Variome corpus containing gene annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2179,9 +2126,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_DISEASE_VARIOME(HunerDataset): - """ - HUNER version of the Variome corpus containing disease annotations. 
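# All to_internal() conversions in this file funnel raw corpora into the
# standardized shape below (toy data; Entity and InternalBioNerDataset as
# defined earlier in this module):
from flair.datasets.biomedical import Entity, InternalBioNerDataset

dataset = InternalBioNerDataset(
    documents={"doc1": "Aspirin relieves headache."},
    entities_per_document={
        "doc1": [Entity((0, 7), "Chemical"), Entity((17, 25), "Disease")],
    },
)
print(dataset.documents["doc1"][0:7])  # -> Aspirin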
- """ + """HUNER version of the Variome corpus containing disease annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2200,9 +2145,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_SPECIES_VARIOME(HunerDataset): - """ - HUNER version of the Variome corpus containing species annotations. - """ + """HUNER version of the Variome corpus containing species annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2222,12 +2165,11 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class NCBI_DISEASE(ColumnCorpus): - """ - Original NCBI disease corpus containing disease annotations. + """Original NCBI disease corpus containing disease annotations. - For further information see Dogan et al.: - NCBI disease corpus: a resource for disease name recognition and concept normalization - https://www.ncbi.nlm.nih.gov/pubmed/24393765 + For further information see Dogan et al.: NCBI disease corpus: a + resource for disease name recognition and concept normalization + https://www.ncbi.nlm.nih.gov/pubmed/24393765 """ def __init__( @@ -2242,7 +2184,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -2359,9 +2300,7 @@ def parse_input_file(input_file: Path): class HUNER_DISEASE_NCBI(HunerDataset): - """ - HUNER version of the NCBI corpus containing disease annotations. - """ + """HUNER version of the NCBI corpus containing disease annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2381,7 +2320,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class ScaiCorpus(ColumnCorpus): - """Base class to support the SCAI chemicals and disease corpora""" + """Base class to support the SCAI chemicals and disease corpora.""" def __init__( self, @@ -2395,7 +2334,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -2478,12 +2416,11 @@ def parse_input_file(input_file: Path): @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class SCAI_CHEMICALS(ScaiCorpus): - """ - Original SCAI chemicals corpus containing chemical annotations. + """Original SCAI chemicals corpus containing chemical annotations. - For further information see Kolářik et al.: - Chemical Names: Terminological Resources and Corpora Annotation - https://pub.uni-bielefeld.de/record/2603498 + For further information see Kolářik et al.: Chemical Names: + Terminological Resources and Corpora Annotation + https://pub.uni-bielefeld.de/record/2603498 """ def __init__(self, *args, **kwargs): @@ -2507,12 +2444,12 @@ def perform_corpus_download(data_dir: Path) -> Path: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class SCAI_DISEASE(ScaiCorpus): - """ - Original SCAI disease corpus containing disease annotations. + """Original SCAI disease corpus containing disease annotations. 
- For further information see Gurulingappa et al.: - An Empirical Evaluation of Resources for the Identification of Diseases and Adverse Effects in Biomedical Literature - https://pub.uni-bielefeld.de/record/2603398 + For further information see Gurulingappa et al.: An Empirical + Evaluation of Resources for the Identification of Diseases and + Adverse Effects in Biomedical Literature + https://pub.uni-bielefeld.de/record/2603398 """ def __init__(self, *args, **kwargs): @@ -2533,9 +2470,8 @@ def perform_corpus_download(data_dir: Path) -> Path: class HUNER_CHEMICAL_SCAI(HunerDataset): - """ - HUNER version of the SCAI chemicals corpus containing chemical annotations. - """ + """HUNER version of the SCAI chemicals corpus containing chemical + annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2564,9 +2500,8 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_DISEASE_SCAI(HunerDataset): - """ - HUNER version of the SCAI chemicals corpus containing chemical annotations. - """ + """HUNER version of the SCAI chemicals corpus containing chemical + annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2587,12 +2522,12 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class OSIRIS(ColumnCorpus): - """ - Original OSIRIS corpus containing variation and gene annotations. + """Original OSIRIS corpus containing variation and gene annotations. - For further information see Furlong et al.: - Osiris v1.2: a named entity recognition system for sequence variants of genes in biomedical literature - https://www.ncbi.nlm.nih.gov/pubmed/18251998 + For further information see Furlong et al.: Osiris v1.2: a named + entity recognition system for sequence variants of genes in + biomedical literature + https://www.ncbi.nlm.nih.gov/pubmed/18251998 """ def __init__( @@ -2611,7 +2546,6 @@ def __init__( erroneously annotates two sentences as a protein. Set to True if you don't want the fixed version. """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -2690,10 +2624,8 @@ def parse_dataset(cls, corpus_folder: Path, fix_annotation=True): class HUNER_GENE_OSIRIS(HunerDataset): - """ - HUNER version of the OSIRIS corpus containing (only) gene annotations. 
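# Schematic of the pattern the HUNER_* wrappers follow: the HunerDataset base
# class requires exactly two hooks, split_url() and to_internal() (names below
# are hypothetical; real subclasses download and parse an actual corpus):
from pathlib import Path

from flair.datasets.biomedical import HunerDataset, InternalBioNerDataset


class HUNER_DISEASE_EXAMPLE(HunerDataset):
    """Hypothetical HUNER wrapper showing the two required hooks."""

    @staticmethod
    def split_url() -> str:
        # base URL of the predefined HUNER train/dev/test split files
        return "https://example.org/huner/splits/example_corpus"

    def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
        # real implementations parse the raw corpus into the internal format
        return InternalBioNerDataset(documents={}, entities_per_document={})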
- - """ + """HUNER version of the OSIRIS corpus containing (only) gene + annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2711,12 +2643,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class S800(ColumnCorpus): - """ - S800 corpus - For further information see Pafilis et al.: - The SPECIES and ORGANISMS Resources for Fast and Accurate Identification of Taxonomic Names in Text - http://www.plosone.org/article/info:doi%2F10.1371%2Fjournal.pone.0065390 - """ + """S800 corpus For further information see Pafilis et al.: The SPECIES and + ORGANISMS Resources for Fast and Accurate Identification of Taxonomic Names + in Text http://www.plosone.org/article/info:doi%2F10.1371%2Fjournal.pone.00 + 65390.""" def __init__( self, @@ -2730,7 +2660,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -2792,9 +2721,7 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset: class HUNER_SPECIES_S800(HunerDataset): - """ - HUNER version of the S800 corpus containing species annotations. - """ + """HUNER version of the S800 corpus containing species annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2812,11 +2739,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class GPRO(ColumnCorpus): - """ - Original GPRO corpus containing gene annotations. + """Original GPRO corpus containing gene annotations. For further information see: - https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/gpro-detailed-task-description/ + https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/gpro-detailed-task-description/ """ def __init__( @@ -2831,7 +2757,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -2930,9 +2855,7 @@ def parse_input_file(text_file: Path, ann_file: Path) -> InternalBioNerDataset: class HUNER_GENE_GPRO(HunerDataset): - """ - HUNER version of the GPRO corpus containing gene annotations. - """ + """HUNER version of the GPRO corpus containing gene annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2956,12 +2879,11 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class DECA(ColumnCorpus): - """ - Original DECA corpus containing gene annotations. + """Original DECA corpus containing gene annotations. 
- For further information see Wang et al.: - Disambiguating the species of biomedical named entities using natural language parsers - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2828111/ + For further information see Wang et al.: Disambiguating the + species of biomedical named entities using natural language parsers + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2828111/ """ def __init__( @@ -2976,7 +2898,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default BioSpacySentenceSpliiter) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -3045,9 +2966,7 @@ def parse_corpus(text_dir: Path, gold_file: Path) -> InternalBioNerDataset: class HUNER_GENE_DECA(HunerDataset): - """ - HUNER version of the DECA corpus containing gene annotations. - """ + """HUNER version of the DECA corpus containing gene annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -3065,12 +2984,11 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class FSU(ColumnCorpus): - """ - Original FSU corpus containing protein and derived annotations. + """Original FSU corpus containing protein and derived annotations. - For further information see Hahn et al.: - A proposal for a configurable silver standard - https://www.aclweb.org/anthology/W10-1838/ + For further information see Hahn et al.: A proposal for a + configurable silver standard + https://www.aclweb.org/anthology/W10-1838/ """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -3078,7 +2996,6 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): :param base_path: Path to the corpus on your machine :param in_memory: If True, keeps dataset in memory giving speedups in training. """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -3197,9 +3114,7 @@ def parse_corpus(corpus_dir: Path, sentence_separator: str) -> InternalBioNerDat class HUNER_GENE_FSU(HunerDataset): - """ - HUNER version of the FSU corpus containing (only) gene annotations. - """ + """HUNER version of the FSU corpus containing (only) gene annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -3231,12 +3146,12 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class CRAFT(ColumnCorpus): - """ - Original CRAFT corpus (version 2.0) containing all but the coreference and sections/typography annotations. + """Original CRAFT corpus (version 2.0) containing all but the coreference + and sections/typography annotations. - For further information see Bada et al.: - Concept annotation in the craft corpus - https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-161 + For further information see Bada et al.: Concept annotation in the + craft corpus + https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-161 """ def __init__( @@ -3251,7 +3166,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -3327,12 +3241,11 @@ def parse_corpus(corpus_dir: Path) -> InternalBioNerDataset: class BIOSEMANTICS(ColumnCorpus): - """ - Original Biosemantics corpus. + """Original Biosemantics corpus. 
- For further information see Akhondi et al.: - Annotated chemical patent corpus: a gold standard for text mining - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4182036/ + For further information see Akhondi et al.: Annotated chemical + patent corpus: a gold standard for text mining + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4182036/ """ def __init__( @@ -3461,12 +3374,11 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class BC2GM(ColumnCorpus): - """ - Original BioCreative-II-GM corpus containing gene annotations. + """Original BioCreative-II-GM corpus containing gene annotations. - For further information see Smith et al.: - Overview of BioCreative II gene mention recognition - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/ + For further information see Smith et al.: Overview of + BioCreative II gene mention recognition + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/ """ def __init__( @@ -3585,9 +3497,8 @@ def parse_dataset(text_file: Path, ann_file: Path) -> InternalBioNerDataset: class HUNER_GENE_BC2GM(HunerDataset): - """ - HUNER version of the BioCreative-II-GM corpus containing gene annotations. - """ + """HUNER version of the BioCreative-II-GM corpus containing gene + annotations.""" def __init__(self, *args, **kwargs): super().__init__( @@ -3608,11 +3519,10 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class CEMP(ColumnCorpus): - """ - Original CEMP corpus containing chemical annotations. + """Original CEMP corpus containing chemical annotations. For further information see: - https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/cemp-detailed-task-description/ + https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/cemp-detailed-task-description/ """ def __init__( @@ -3627,7 +3537,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -3727,9 +3636,7 @@ def parse_input_file(text_file: Path, ann_file: Path) -> InternalBioNerDataset: class HUNER_CHEMICAL_CEMP(HunerDataset): - """ - HUNER version of the CEMP corpus containing chemical annotations. - """ + """HUNER version of the CEMP corpus containing chemical annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -3767,12 +3674,12 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CHEBI(ColumnCorpus): - """ - Original CHEBI corpus containing all annotations. + """Original CHEBI corpus containing all annotations. - For further information see Shardlow et al.: - A New Corpus to Support Text Mining for the Curation of Metabolites in the ChEBI Database - http://www.lrec-conf.org/proceedings/lrec2018/pdf/229.pdf + For further information see Shardlow et al.: A New Corpus to + Support Text Mining for the Curation of Metabolites in the ChEBI + Database + http://www.lrec-conf.org/proceedings/lrec2018/pdf/229.pdf """ def __init__( @@ -3889,9 +3796,7 @@ def get_entities(f): class HUNER_CHEMICAL_CHEBI(HunerDataset): - """ - HUNER version of the CHEBI corpus containing chemical annotations. 
- """ + """HUNER version of the CHEBI corpus containing chemical annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -3908,9 +3813,7 @@ def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDatas class HUNER_GENE_CHEBI(HunerDataset): - """ - HUNER version of the CHEBI corpus containing gene annotations. - """ + """HUNER version of the CHEBI corpus containing gene annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -3927,9 +3830,7 @@ def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDatas class HUNER_SPECIES_CHEBI(HunerDataset): - """ - HUNER version of the CHEBI corpus containing species annotations. - """ + """HUNER version of the CHEBI corpus containing species annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -3946,11 +3847,10 @@ def to_internal(self, data_dir: Path, annotator: int = 0) -> InternalBioNerDatas class BioNLPCorpus(ColumnCorpus): - """ - Base class for corpora from BioNLP event extraction shared tasks + """Base class for corpora from BioNLP event extraction shared tasks. For further information see: - http://2013.bionlp-st.org/Intro + http://2013.bionlp-st.org/Intro """ def __init__( @@ -3965,7 +3865,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -4032,12 +3931,11 @@ def parse_input_files(input_folder: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class BIONLP2013_PC(BioNLPCorpus): - """ - Corpus of the BioNLP'2013 Pathway Curation shared task + """Corpus of the BioNLP'2013 Pathway Curation shared task. - For further information see Ohta et al. - Overview of the pathway curation (PC) task of bioNLP shared task 2013. - https://www.aclweb.org/anthology/W13-2009/ + For further information see Ohta et al. Overview of the pathway + curation (PC) task of bioNLP shared task 2013. + https://www.aclweb.org/anthology/W13-2009/ """ @staticmethod @@ -4075,12 +3973,11 @@ def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class BIONLP2013_CG(BioNLPCorpus): - """ - Corpus of the BioNLP'2013 Cancer Genetics shared task + """Corpus of the BioNLP'2013 Cancer Genetics shared task. For further information see Pyysalo, Ohta & Ananiadou 2013 - Overview of the Cancer Genetics (CG) task of BioNLP Shared Task 2013 - https://www.aclweb.org/anthology/W13-2008/ + Overview of the Cancer Genetics (CG) task of BioNLP Shared Task 2013 + https://www.aclweb.org/anthology/W13-2008/ """ @staticmethod @@ -4104,13 +4001,12 @@ def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class ANAT_EM(ColumnCorpus): - """ - Corpus for anatomical named entity mention recognition. + """Corpus for anatomical named entity mention recognition. 
- For further information see Pyysalo and Ananiadou: - Anatomical entity mention recognition at literature scale - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957068/ - http://nactem.ac.uk/anatomytagger/#AnatEM + For further information see Pyysalo and Ananiadou: Anatomical + entity mention recognition at literature scale + https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957068/ + http://nactem.ac.uk/anatomytagger/#AnatEM """ def __init__( @@ -4234,13 +4130,12 @@ def parse_input_files(input_dir: Path, sentence_separator: str) -> InternalBioNe class BioBertHelper(ColumnCorpus): - """ - Helper class to convert corpora and the respective train, dev and test split - used by BioBERT. + """Helper class to convert corpora and the respective train, dev and test + split used by BioBERT. For further details see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ @staticmethod @@ -4295,13 +4190,13 @@ def convert_and_write(download_folder, data_folder, tag_type): class BIOBERT_CHEMICAL_BC4CHEMD(ColumnCorpus): - """ - BC4CHEMD corpus with chemical annotations as used in the evaluation - of BioBERT. + """BC4CHEMD corpus with chemical annotations as used in the evaluation of + BioBERT. - For further details regarding BioBERT and it's evaluation, see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + For further details regarding BioBERT and it's evaluation, see Lee + et al.: + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -4330,13 +4225,13 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class BIOBERT_GENE_BC2GM(ColumnCorpus): - """ - BC4CHEMD corpus with gene annotations as used in the evaluation - of BioBERT. + """BC4CHEMD corpus with gene annotations as used in the evaluation of + BioBERT. - For further details regarding BioBERT and it's evaluation, see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + For further details regarding BioBERT and it's evaluation, see Lee + et al.: + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -4364,13 +4259,13 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class BIOBERT_GENE_JNLPBA(ColumnCorpus): - """ - JNLPBA corpus with gene annotations as used in the evaluation - of BioBERT. + """JNLPBA corpus with gene annotations as used in the evaluation of + BioBERT. 
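# The BIOBERT_* corpora are plain ColumnCorpus objects over the pre-tokenized
# BioBERT splits, so the usual downstream steps apply (sketch, not part of
# the patch; make_label_dictionary as in flair's Corpus API):
from flair.datasets import BIOBERT_CHEMICAL_BC4CHEMD

corpus = BIOBERT_CHEMICAL_BC4CHEMD()
label_dict = corpus.make_label_dictionary(label_type="ner")
print(label_dict)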
- For further details regarding BioBERT and it's evaluation, see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + For further details regarding BioBERT and it's evaluation, see Lee + et al.: + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -4398,13 +4293,13 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class BIOBERT_CHEMICAL_BC5CDR(ColumnCorpus): - """ - BC5CDR corpus with chemical annotations as used in the evaluation - of BioBERT. + """BC5CDR corpus with chemical annotations as used in the evaluation of + BioBERT. - For further details regarding BioBERT and it's evaluation, see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + For further details regarding BioBERT and it's evaluation, see Lee + et al.: + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -4432,13 +4327,13 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class BIOBERT_DISEASE_BC5CDR(ColumnCorpus): - """ - BC5CDR corpus with disease annotations as used in the evaluation - of BioBERT. + """BC5CDR corpus with disease annotations as used in the evaluation of + BioBERT. - For further details regarding BioBERT and it's evaluation, see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + For further details regarding BioBERT and it's evaluation, see Lee + et al.: + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -4466,12 +4361,12 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class BIOBERT_DISEASE_NCBI(ColumnCorpus): - """ - NCBI disease corpus as used in the evaluation of BioBERT. + """NCBI disease corpus as used in the evaluation of BioBERT. - For further details regarding BioBERT and it's evaluation, see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + For further details regarding BioBERT and it's evaluation, see Lee + et al.: + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -4499,13 +4394,13 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class BIOBERT_SPECIES_LINNAEUS(ColumnCorpus): - """ - Linneaeus corpus with species annotations as used in the evaluation - of BioBERT. + """Linneaeus corpus with species annotations as used in the evaluation of + BioBERT. 
- For further details regarding BioBERT and it's evaluation, see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + For further details regarding BioBERT and it's evaluation, see Lee + et al.: + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -4533,13 +4428,13 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class BIOBERT_SPECIES_S800(ColumnCorpus): - """ - S800 corpus with species annotations as used in the evaluation - of BioBERT. + """S800 corpus with species annotations as used in the evaluation of + BioBERT. - For further details regarding BioBERT and it's evaluation, see Lee et al.: - https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 - https://github.com/dmis-lab/biobert + For further details regarding BioBERT and it's evaluation, see Lee + et al.: + https://academic.oup.com/bioinformatics/article/36/4/1234/5566506 + https://github.com/dmis-lab/biobert """ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): @@ -4567,11 +4462,11 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): class CRAFT_V4(ColumnCorpus): - """ - Version 4.0.1 of the CRAFT corpus containing all but the co-reference and structural annotations. + """Version 4.0.1 of the CRAFT corpus containing all but the co-reference + and structural annotations. For further information see: - https://github.com/UCDenver-ccp/CRAFT + https://github.com/UCDenver-ccp/CRAFT """ def __init__( @@ -4586,7 +4481,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -4718,9 +4612,8 @@ def parse_corpus(corpus_dir: Path) -> InternalBioNerDataset: class HUNER_CHEMICAL_CRAFT_V4(HunerDataset): - """ - HUNER version of the CRAFT corpus containing (only) chemical annotations. - """ + """HUNER version of the CRAFT corpus containing (only) chemical + annotations.""" def __init__(self, *args, **kwargs): super().__init__( @@ -4741,9 +4634,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_GENE_CRAFT_V4(HunerDataset): - """ - HUNER version of the CRAFT corpus containing (only) gene annotations. - """ + """HUNER version of the CRAFT corpus containing (only) gene annotations.""" def __init__(self, *args, **kwargs): super().__init__( @@ -4764,9 +4655,8 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HUNER_SPECIES_CRAFT_V4(HunerDataset): - """ - HUNER version of the CRAFT corpus containing (only) species annotations. - """ + """HUNER version of the CRAFT corpus containing (only) species + annotations.""" def __init__(self, *args, **kwargs): super().__init__( @@ -4875,11 +4765,11 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class AZDZ(ColumnCorpus): - """ - Arizona Disease Corpus from the Biomedical Informatics Lab at Arizona State University. + """Arizona Disease Corpus from the Biomedical Informatics Lab at Arizona + State University. 
- For further information see: - http://diego.asu.edu/index.php + For further information see: + http://diego.asu.edu/index.php """ def __init__( @@ -4894,7 +4784,6 @@ def __init__( :param tokenizer: Implementation of :class:`Tokenizer` which segments sentences into tokens (default :class:`SciSpacyTokenizer`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -4984,14 +4873,13 @@ def parse_corpus(input_file: Path) -> InternalBioNerDataset: @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class PDR(ColumnCorpus): - """ - Corpus of plant-disease relations from Kim et al., consisting of named entity annotations - for plants and disease. + """Corpus of plant-disease relations from Kim et al., consisting of named + entity annotations for plants and disease. - For further information see Kim et al.: - A corpus of plant-disease relations in the biomedical domain - https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221582 - http://gcancer.org/pdr/ + For further information see Kim et al.: A corpus of plant-disease + relations in the biomedical domain + https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221582 + http://gcancer.org/pdr/ """ def __init__( @@ -5006,7 +4894,6 @@ def __init__( :param sentence_splitter: Implementation of :class:`SentenceSplitter` which segments documents into sentences and tokens (default :class:`SciSpacySentenceSplitter`) """ - if base_path is None: base_path = flair.cache_root / "datasets" else: @@ -5046,9 +4933,7 @@ def download_corpus(cls, data_dir: Path) -> Path: class HUNER_DISEASE_PDR(HunerDataset): - """ - PDR Dataset with only Disease annotations - """ + """PDR Dataset with only Disease annotations.""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -5066,9 +4951,8 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: class HunerMultiCorpus(MultiCorpus): - """ - Base class to build the union of all HUNER data sets considering a particular entity type. - """ + """Base class to build the union of all HUNER data sets considering a + particular entity type.""" def __init__(self, entity_type: str, sentence_splitter: SentenceSplitter = None): self.entity_type = entity_type @@ -5117,45 +5001,35 @@ def entity_type_predicate(member): class HUNER_CELL_LINE(HunerMultiCorpus): - """ - Union of all HUNER cell line data sets. - """ + """Union of all HUNER cell line data sets.""" def __init__(self, sentence_splitter: SentenceSplitter = None): super(HUNER_CELL_LINE, self).__init__(entity_type="CELL_LINE", sentence_splitter=sentence_splitter) class HUNER_CHEMICAL(HunerMultiCorpus): - """ - Union of all HUNER chemical data sets. - """ + """Union of all HUNER chemical data sets.""" def __init__(self, sentence_splitter: SentenceSplitter = None): super(HUNER_CHEMICAL, self).__init__(entity_type="CHEMICAL", sentence_splitter=sentence_splitter) class HUNER_DISEASE(HunerMultiCorpus): - """ - Union of all HUNER disease data sets. - """ + """Union of all HUNER disease data sets.""" def __init__(self, sentence_splitter: SentenceSplitter = None): super(HUNER_DISEASE, self).__init__(entity_type="DISEASE", sentence_splitter=sentence_splitter) class HUNER_GENE(HunerMultiCorpus): - """ - Union of all HUNER gene data sets. 
- """ + """Union of all HUNER gene data sets.""" def __init__(self, sentence_splitter: SentenceSplitter = None): super(HUNER_GENE, self).__init__(entity_type="GENE", sentence_splitter=sentence_splitter) class HUNER_SPECIES(HunerMultiCorpus): - """ - Union of all HUNER species data sets. - """ + """Union of all HUNER species data sets.""" def __init__(self, sentence_splitter: SentenceSplitter = None): super(HUNER_SPECIES, self).__init__(entity_type="SPECIES", sentence_splitter=sentence_splitter) diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index 7a0663197d..54b63f7ceb 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -22,9 +22,7 @@ class ClassificationCorpus(Corpus): - """ - A classification corpus from FastText-formatted text files. - """ + """A classification corpus from FastText-formatted text files.""" def __init__( self, @@ -44,8 +42,7 @@ def __init__( sample_missing_splits: bool = True, encoding: str = "utf-8", ): - """ - Instantiates a Corpus from text classification-formatted task data + """Instantiates a Corpus from text classification-formatted task data. :param data_folder: base folder with the task data :param label_type: name of the label @@ -64,7 +61,6 @@ def __init__( :param encoding: Default is 'utf-8' but some datasets are in 'latin-1 :return: a Corpus with annotated train, dev and test data """ - # find train, dev and test files if not specified dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file) @@ -128,9 +124,7 @@ def __init__( class ClassificationDataset(FlairDataset): - """ - Dataset for classification instantiated from a single FastText-formatted file. - """ + """Dataset for classification instantiated from a single FastText-formatted file.""" def __init__( self, @@ -146,8 +140,9 @@ def __init__( allow_examples_without_labels=False, encoding: str = "utf-8", ): - """ - Reads a data file for text classification. The file should contain one document/text per line. + """Reads a data file for text classification. + + The file should contain one document/text per line. The line should have the following format: __label__ If you have a multi class task, you can have as many labels as you want at the beginning of the line, e.g., @@ -313,9 +308,7 @@ def __getitem__(self, index: int = 0) -> Sentence: class CSVClassificationCorpus(Corpus): - """ - Classification corpus instantiated from CSV data files. - """ + """Classification corpus instantiated from CSV data files.""" def __init__( self, @@ -335,8 +328,7 @@ def __init__( no_class_label=None, **fmtparams, ): - """ - Instantiates a Corpus for text classification from CSV column formatted data + """Instantiates a Corpus for text classification from CSV column formatted data. :param data_folder: base folder with the task data :param column_name_map: a column name map that indicates which column is text and which the label(s) @@ -353,7 +345,6 @@ def __init__( :param fmtparams: additional parameters for the CSV file reader :return: a Corpus with annotated train, dev and test data """ - # find train, dev and test files if not specified dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file) @@ -411,9 +402,7 @@ def __init__( class CSVClassificationDataset(FlairDataset): - """ - Dataset for text classification from CSV column formatted data. 
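# Concrete sketch of the FastText-style format documented above: one document
# per line, each prefixed by one or more __label__<value> markers, e.g.
#
#   __label__POSITIVE a truly wonderful movie
#   __label__NEGATIVE utterly boring from start to finish
#
# (the folder path is hypothetical; it should contain train/dev/test files):
from flair.datasets import ClassificationCorpus

corpus = ClassificationCorpus("resources/tasks/my_task", label_type="sentiment")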
- """ + """Dataset for text classification from CSV column formatted data.""" def __init__( self, @@ -429,8 +418,7 @@ def __init__( no_class_label=None, **fmtparams, ): - """ - Instantiates a Dataset for text classification from CSV column formatted data + """Instantiates a Dataset for text classification from CSV column formatted data. :param path_to_file: path to the file with the CSV data :param column_name_map: a column name map that indicates which column is text and which the label(s) @@ -444,7 +432,6 @@ def __init__( :param fmtparams: additional parameters for the CSV file reader :return: a Corpus with annotated train, dev and test data """ - path_to_file = Path(path_to_file) assert path_to_file.exists() @@ -568,9 +555,11 @@ def __getitem__(self, index: int = 0) -> Sentence: class AMAZON_REVIEWS(ClassificationCorpus): - """ - A very large corpus of Amazon reviews with positivity ratings. Corpus is downloaded from and documented at - https://nijianmo.github.io/amazon/index.html. We download the 5-core subset which is still tens of millions of + """A very large corpus of Amazon reviews with positivity ratings. + + Corpus is downloaded from and documented at + https://nijianmo.github.io/amazon/index.html. + We download the 5-core subset which is still tens of millions of reviews. """ @@ -591,8 +580,9 @@ def __init__( memory_mode="partial", **corpusargs, ): - """ - Constructs corpus object. Split_max indicates how many data points from each of the 28 splits are used, so + """Constructs corpus object. + + Split_max indicates how many data points from each of the 28 splits are used, so set this higher or lower to increase/decrease corpus size. :param label_name_map: Map label names to different schema. By default, the 5-star rating is mapped onto 3 classes (POSITIVE, NEGATIVE, NEUTRAL) @@ -604,7 +594,6 @@ def __init__( :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) :param corpusargs: Arguments for ClassificationCorpus """ - # dataset name includes the split size dataset_name = self.__class__.__name__.lower() + "_" + str(split_max) + "_" + str(fraction_of_5_star_reviews) @@ -753,9 +742,9 @@ def download_and_prepare_amazon_product_file( class IMDB(ClassificationCorpus): - """ - Corpus of IMDB movie reviews labeled by sentiment (POSITIVE, NEGATIVE). Downloaded from and documented at - http://ai.stanford.edu/~amaas/data/sentiment/. + """Corpus of IMDB movie reviews labeled by sentiment (POSITIVE, NEGATIVE). + + Downloaded from and documented at http://ai.stanford.edu/~amaas/data/sentiment/. """ def __init__( @@ -777,7 +766,6 @@ def __init__( processing or 'none' for less memory. :param corpusargs: Other args for ClassificationCorpus. """ - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -837,9 +825,13 @@ def __init__( class NEWSGROUPS(ClassificationCorpus): - """ - 20 newsgroups corpus available at "http://qwone.com/~jason/20Newsgroups", classifying - news items into one of 20 categories. Each data point is a full news article so documents may be very long. + """20 newsgroups corpus, classifying news items into one of 20 categories. + + Downloaded from http://qwone.com/~jason/20Newsgroups + + + Each data point is a full news article so documents may be very + long. """ def __init__( @@ -849,15 +841,14 @@ def __init__( memory_mode: str = "partial", **corpusargs, ): - """ - Instantiates 20 newsgroups corpus. + """Instantiates 20 newsgroups corpus. 
+
         :param base_path: Provide this only if you store the IMDB corpus in a specific folder, otherwise use default.
         :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
         :param memory_mode: Set to 'partial' because this is a big corpus, but you can also set to 'full' for faster
         processing or 'none' for less memory.
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
         if not base_path:
             base_path = flair.cache_root / "datasets"
         else:
@@ -924,9 +915,11 @@ def __init__(


 class STACKOVERFLOW(ClassificationCorpus):
-    """
-    Stackoverflow corpus available at "https://github.com/jacoxu/StackOverflow", classifying
-    news items into one of 20 labels. Each data point is a question.
+    """Stackoverflow corpus classifying questions into one of 20 labels.
+
+    The data will be downloaded from "https://github.com/jacoxu/StackOverflow".
+
+    Each data point is a question.
     """

     def __init__(
         self,
@@ -936,15 +929,14 @@ def __init__(
         memory_mode: str = "partial",
         **corpusargs,
     ):
-        """
-        Instantiates Stackoverflow corpus.
+        """Instantiates Stackoverflow corpus.
+
         :param base_path: Provide this only if you store the IMDB corpus in a specific folder, otherwise use default.
         :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
         :param memory_mode: Set to 'partial' because this is a big corpus, but you can also set to 'full' for faster
         processing or 'none' for less memory.
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
         if not base_path:
             base_path = flair.cache_root / "datasets"
         else:
@@ -1013,24 +1005,26 @@ def __init__(


 class SENTIMENT_140(ClassificationCorpus):
-    """
-    Twitter sentiment corpus downloaded from and documented at http://help.sentiment140.com/for-students. Two sentiments
-    in train data (POSITIVE, NEGATIVE) and three sentiments in test data (POSITIVE, NEGATIVE, NEUTRAL).
+    """Twitter sentiment corpus.
+
+    See http://help.sentiment140.com/for-students
+
+    Two sentiments in train data (POSITIVE, NEGATIVE) and three
+    sentiments in test data (POSITIVE, NEGATIVE, NEUTRAL).
     """

     def __init__(
         self, label_name_map=None, tokenizer: Tokenizer = SegtokTokenizer(), memory_mode: str = "partial", **corpusargs
     ):
-        """
-        Instantiates twitter sentiment corpus.
+        """Instantiates twitter sentiment corpus.
+
         :param label_name_map: By default, the numeric values are mapped to ('NEGATIVE', 'POSITIVE' and 'NEUTRAL')
         :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
         :param memory_mode: Set to 'partial' because this is a big corpus, but you can also set to 'full' for faster
         processing or 'none' for less memory.
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
-        # by defaut, map point score to POSITIVE / NEGATIVE values
+        # by default, map point score to POSITIVE / NEGATIVE values
         if label_name_map is None:
             label_name_map = {"0": "NEGATIVE", "2": "NEUTRAL", "4": "POSITIVE"}

@@ -1085,9 +1079,9 @@ def __init__(


 class SENTEVAL_CR(ClassificationCorpus):
-    """
-    The customer reviews dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into
-    NEGATIVE or POSITIVE sentiment.
+    """The customer reviews dataset of SentEval, classified into NEGATIVE or POSITIVE sentiment.
+
+    see https://github.com/facebookresearch/SentEval
     """

     def __init__(
         self,
         memory_mode: str = "full",
         **corpusargs,
     ):
-        """
-        Instantiates SentEval customer reviews dataset.
+        """Instantiates SentEval customer reviews dataset.
+
         :param corpusargs: Other args for ClassificationCorpus.
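As a usage note for the SENTIMENT_140 mapping described above, a sketch that spells out the default label_name_map (so this call is equivalent to SENTIMENT_140() with no arguments):

from flair.datasets import SENTIMENT_140

# Raw point scores are mapped onto class names; the dict below mirrors the
# documented default mapping.
corpus = SENTIMENT_140(label_name_map={"0": "NEGATIVE", "2": "NEUTRAL", "4": "POSITIVE"})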
:param tokenizer: Custom tokenizer to use (default is SpaceTokenizer()) :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'. """ - # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1137,9 +1130,9 @@ def __init__( class SENTEVAL_MR(ClassificationCorpus): - """ - The movie reviews dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into - NEGATIVE or POSITIVE sentiment. + """The movie reviews dataset of SentEval, classified into NEGATIVE or POSITIVE sentiment. + + see https://github.com/facebookresearch/SentEval """ def __init__( @@ -1148,13 +1141,12 @@ def __init__( memory_mode: str = "full", **corpusargs, ): - """ - Instantiates SentEval movie reviews dataset. + """Instantiates SentEval movie reviews dataset. + :param corpusargs: Other args for ClassificationCorpus. :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'. """ - # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1189,9 +1181,9 @@ def __init__( class SENTEVAL_SUBJ(ClassificationCorpus): - """ - The subjectivity dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into - SUBJECTIVE or OBJECTIVE sentiment. + """The subjectivity dataset of SentEval, classified into SUBJECTIVE or OBJECTIVE sentiment. + + see https://github.com/facebookresearch/SentEval """ def __init__( @@ -1200,13 +1192,12 @@ def __init__( memory_mode: str = "full", **corpusargs, ): - """ - Instantiates SentEval subjectivity dataset. + """Instantiates SentEval subjectivity dataset. + :param corpusargs: Other args for ClassificationCorpus. :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'. """ - # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1241,9 +1232,9 @@ def __init__( class SENTEVAL_MPQA(ClassificationCorpus): - """ - The opinion-polarity dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified into - NEGATIVE or POSITIVE polarity. + """The opinion-polarity dataset of SentEval, classified into NEGATIVE or POSITIVE polarity. + + see https://github.com/facebookresearch/SentEval """ def __init__( @@ -1252,13 +1243,12 @@ def __init__( memory_mode: str = "full", **corpusargs, ): - """ - Instantiates SentEval opinion polarity dataset. + """Instantiates SentEval opinion polarity dataset. + :param corpusargs: Other args for ClassificationCorpus. :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'. """ - # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1293,9 +1283,9 @@ def __init__( class SENTEVAL_SST_BINARY(ClassificationCorpus): - """ - The Stanford sentiment treebank dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified - into NEGATIVE or POSITIVE sentiment. + """The Stanford sentiment treebank dataset of SentEval, classified into NEGATIVE or POSITIVE sentiment. + + see https://github.com/facebookresearch/SentEval """ def __init__( @@ -1304,13 +1294,12 @@ def __init__( memory_mode: str = "full", **corpusargs, ): - """ - Instantiates SentEval Stanford sentiment treebank dataset. + """Instantiates SentEval Stanford sentiment treebank dataset. 
+
         :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'.
         :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
         # this dataset name
         dataset_name = self.__class__.__name__.lower() + "_v2"

@@ -1352,9 +1341,9 @@ def __init__(


 class SENTEVAL_SST_GRANULAR(ClassificationCorpus):
-    """
-    The Stanford sentiment treebank dataset of SentEval, see https://github.com/facebookresearch/SentEval, classified
-    into 5 sentiment classes.
+    """The Stanford sentiment treebank dataset of SentEval, classified into 5 sentiment classes.
+
+    see https://github.com/facebookresearch/SentEval
     """

     def __init__(
         self,
         memory_mode: str = "full",
         **corpusargs,
     ):
-        """
-        Instantiates SentEval Stanford sentiment treebank dataset.
+        """Instantiates SentEval Stanford sentiment treebank dataset.
+
         :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'.
         :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
         # this dataset name
         dataset_name = self.__class__.__name__.lower()

@@ -1405,10 +1393,12 @@ def __init__(


 class GLUE_COLA(ClassificationCorpus):
-    """
-    Corpus of Linguistic Acceptability from GLUE benchmark (https://gluebenchmark.com/tasks).
-    The task is to predict whether an English sentence is grammatically correct.
-    Additionaly to the Corpus we have eval_dataset containing the unlabeled test data for Glue evaluation.
+    """Corpus of Linguistic Acceptability from GLUE benchmark
+    (https://gluebenchmark.com/tasks).
+
+    The task is to predict whether an English sentence is grammatically
+    correct. Additionally to the Corpus we have eval_dataset containing
+    the unlabeled test data for Glue evaluation.
     """

     def __init__(
@@ -1424,7 +1414,6 @@ def __init__(
         :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
         if not base_path:
             base_path = flair.cache_root / "datasets"
         else:
@@ -1502,8 +1491,10 @@ def tsv_from_eval_dataset(self, folder_path: Union[str, Path]):


 class GO_EMOTIONS(ClassificationCorpus):
-    """
-    GoEmotions dataset containing 58k Reddit comments labeled with 27 emotion categories, see. https://github.com/google-research/google-research/tree/master/goemotions
+    """GoEmotions dataset containing 58k Reddit comments labeled with 27
+    emotion categories, see:
+
+    https://github.com/google-research/google-research/tree/master/goemotions
     """

     def __init__(
@@ -1524,7 +1515,6 @@ def __init__(
         **corpusargs : Other args for ClassificationCorpus.
         """
-
         label_name_map = {
             "0": "ADMIRATION",
             "1": "AMUSEMENT",
@@ -1607,9 +1597,8 @@ def __init__(


 class TREC_50(ClassificationCorpus):
-    """
-    The TREC Question Classification Corpus, classifying questions into 50 fine-grained answer types.
-    """
+    """The TREC Question Classification Corpus, classifying questions into 50
+    fine-grained answer types."""

     def __init__(
         self,
         base_path: Union[str, Path] = None,
         tokenizer: Tokenizer = SpaceTokenizer(),
         memory_mode="full",
         **corpusargs,
     ):
-        """
-        Instantiates TREC Question Classification Corpus with 6 classes.
+        """Instantiates TREC Question Classification Corpus with 50 classes.
+
         :param base_path: Provide this only if you store the TREC corpus in a specific folder, otherwise use default.
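Stepping back to GLUE_COLA above: a sketch of the evaluation-export flow, under the assumption that tsv_from_eval_dataset writes out the labels predicted on corpus.eval_dataset (the model path is hypothetical):

from flair.datasets import GLUE_COLA
from flair.models import TextClassifier

corpus = GLUE_COLA()

# Predict on the unlabeled test split kept in corpus.eval_dataset, then write
# the GLUE submission file to the given folder.
classifier = TextClassifier.load("resources/cola/final-model.pt")  # hypothetical model
sentences = [corpus.eval_dataset[i] for i in range(len(corpus.eval_dataset))]
classifier.predict(sentences)
corpus.tsv_from_eval_dataset("glue_submission")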
         :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
         :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'.
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
         if not base_path:
             base_path = flair.cache_root / "datasets"
         else:
@@ -1668,10 +1656,8 @@ def __init__(


 class TREC_6(ClassificationCorpus):
-    """
-    The TREC Question Classification Corpus, classifying questions into 6 coarse-grained answer types
-    (DESC, HUM, LOC, ENTY, NUM, ABBR).
-    """
+    """The TREC Question Classification Corpus, classifying questions into 6
+    coarse-grained answer types (DESC, HUM, LOC, ENTY, NUM, ABBR)."""

     def __init__(
         self,
         base_path: Union[str, Path] = None,
         tokenizer: Tokenizer = SpaceTokenizer(),
         memory_mode="full",
         **corpusargs,
     ):
-        """
-        Instantiates TREC Question Classification Corpus with 6 classes.
+        """Instantiates TREC Question Classification Corpus with 6 classes.
+
         :param base_path: Provide this only if you store the TREC corpus in a specific folder, otherwise use default.
         :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
         :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'.
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
         if not base_path:
             base_path = flair.cache_root / "datasets"
         else:
@@ -1732,9 +1717,8 @@ def __init__(


 class YAHOO_ANSWERS(ClassificationCorpus):
-    """
-    The YAHOO Question Classification Corpus, classifying questions into 10 coarse-grained answer types
-    """
+    """The YAHOO Question Classification Corpus, classifying questions into 10
+    coarse-grained answer types."""

     def __init__(
         self,
         base_path: Union[str, Path] = None,
         tokenizer: Tokenizer = SpaceTokenizer(),
         memory_mode="partial",
         **corpusargs,
     ):
-        """
-        Instantiates YAHOO Question Classification Corpus with 10 classes.
+        """Instantiates YAHOO Question Classification Corpus with 10 classes.
+
         :param base_path: Provide this only if you store the YAHOO corpus in a specific folder, otherwise use default.
         :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
         :param memory_mode: Set to 'partial' by default since this is a rather big corpus. Can also be 'full' or 'none'.
         :param corpusargs: Other args for ClassificationCorpus.
         """
-
         if not base_path:
             base_path = flair.cache_root / "datasets"
         else:
@@ -1809,10 +1792,11 @@ def __init__(


 class GERMEVAL_2018_OFFENSIVE_LANGUAGE(ClassificationCorpus):
-    """
-    GermEval 2018 corpus for identification of offensive language.
-    Classifying German tweets into 2 coarse-grained categories OFFENSIVE and OTHER
-    or 4 fine-grained categories ABUSE, INSULT, PROFATINTY and OTHER.
+    """GermEval 2018 corpus for identification of offensive language.
+
+    Classifying German tweets into 2 coarse-grained categories OFFENSIVE
+    and OTHER or 4 fine-grained categories ABUSE, INSULT, PROFANITY and
+    OTHER.
     """

     def __init__(
         self,
         base_path: Union[str, Path] = None,
         tokenizer: Tokenizer = SegtokTokenizer(),
         memory_mode: str = "full",
         fine_grained_classes: bool = False,
         **corpusargs,
     ):
-        """
-        Instantiates GermEval 2018 Offensive Language Classification Corpus.
+        """Instantiates GermEval 2018 Offensive Language Classification Corpus.
+
         :param base_path: Provide this only if you store the Offensive Language corpus in a specific folder, otherwise use default.
         :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
         :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'.
         :param fine_grained_classes: Set to True to load the dataset with 4 fine-grained classes
         :param corpusargs: Other args for ClassificationCorpus.
""" - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -1882,9 +1865,10 @@ def __init__( class COMMUNICATIVE_FUNCTIONS(ClassificationCorpus): - """ - The Communicative Functions Classification Corpus. - Classifying sentences from scientific papers into 39 communicative functions. + """The Communicative Functions Classification Corpus. + + Classifying sentences from scientific papers into 39 communicative + functions. """ def __init__( @@ -1894,14 +1878,14 @@ def __init__( tokenizer: Union[bool, Tokenizer] = SpaceTokenizer(), **corpusargs, ): - """ - Instantiates Communicative Functions Classification Corpus with 39 classes. + """Instantiates Communicative Functions Classification Corpus with 39 + classes. + :param base_path: Provide this only if you store the Communicative Functions date in a specific folder, otherwise use default. :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) :param memory_mode: Set to 'full' by default since this is a small corpus. Can also be 'partial' or 'none'. :param corpusargs: Other args for ClassificationCorpus. """ - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -1969,10 +1953,9 @@ def _download_wassa_if_not_there(emotion, data_folder, dataset_name): class WASSA_ANGER(ClassificationCorpus): - """ - WASSA-2017 anger emotion-intensity dataset downloaded from and documented at - https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html - """ + """WASSA-2017 anger emotion-intensity dataset downloaded from and + documented at https://saifmohammad.com/WebPages/EmotionIntensity- + SharedTask.html.""" def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = SegtokTokenizer(), **corpusargs): """ @@ -1981,7 +1964,6 @@ def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = Se :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) :param corpusargs: Other args for ClassificationCorpus. """ - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -1999,10 +1981,8 @@ def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = Se class WASSA_FEAR(ClassificationCorpus): - """ - WASSA-2017 fear emotion-intensity dataset downloaded from and documented at - https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html - """ + """WASSA-2017 fear emotion-intensity dataset downloaded from and documented + at https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html.""" def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = SegtokTokenizer(), **corpusargs): """ @@ -2011,7 +1991,6 @@ def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = Se :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) :param corpusargs: Other args for ClassificationCorpus. 
""" - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -2029,10 +2008,8 @@ def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = Se class WASSA_JOY(ClassificationCorpus): - """ - WASSA-2017 joy emotion-intensity dataset downloaded from and documented at - https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html - """ + """WASSA-2017 joy emotion-intensity dataset downloaded from and documented + at https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html.""" def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = SegtokTokenizer(), **corpusargs): """ @@ -2041,7 +2018,6 @@ def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = Se :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) :param corpusargs: Other args for ClassificationCorpus. """ - if not base_path: base_path = flair.cache_root / "datasets" else: @@ -2059,10 +2035,9 @@ def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = Se class WASSA_SADNESS(ClassificationCorpus): - """ - WASSA-2017 sadness emotion-intensity dataset downloaded from and documented at - https://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html - """ + """WASSA-2017 sadness emotion-intensity dataset downloaded from and + documented at https://saifmohammad.com/WebPages/EmotionIntensity- + SharedTask.html.""" def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = SegtokTokenizer(), **corpusargs): """ @@ -2071,7 +2046,6 @@ def __init__(self, base_path: Union[str, Path] = None, tokenizer: Tokenizer = Se :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer) :param corpusargs: Other args for ClassificationCorpus. """ - if not base_path: base_path = flair.cache_root / "datasets" else: diff --git a/flair/datasets/entity_linking.py b/flair/datasets/entity_linking.py index 89fe7c0d49..40ead91138 100644 --- a/flair/datasets/entity_linking.py +++ b/flair/datasets/entity_linking.py @@ -23,9 +23,9 @@ def __init__( column_format={0: "text", 2: "nel"}, **corpusargs, ): - """ - Initialize ZELDA Entity Linking corpus introduced in "ZELDA: A Comprehensive Benchmark for Supervised - Entity Disambiguation" (Milich and Akbik, 2023). + """Initialize ZELDA Entity Linking corpus. + + introduced in "ZELDA: A Comprehensive Benchmark for Supervised Entity Disambiguation" (Milich and Akbik, 2023). When calling the constructor for the first time, the dataset gets automatically downloaded. Parameters @@ -89,12 +89,11 @@ def __init__( sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(), **corpusargs, ): - """ - Initialize Aquaint Entity Linking corpus introduced in: D. Milne and I. H. Witten. - Learning to link with wikipedia - (https://www.cms.waikato.ac.nz/~ihw/papers/08-DNM-IHW-LearningToLinkWithWikipedia.pdf). - If you call the constructor the first time the dataset gets automatically downloaded and transformed in - tab-separated column format (aquaint.txt). + """Initialize Aquaint Entity Linking corpus. + + introduced in: D. Milne and I. H. Witten. Learning to link with wikipedia + https://www.cms.waikato.ac.nz/~ihw/papers/08-DNM-IHW-LearningToLinkWithWikipedia.pdf . If you call the constructor the first + time the dataset gets automatically downloaded and transformed in tab-separated column format (aquaint.txt). 
         Parameters
         ----------
@@ -265,12 +264,12 @@ def __init__(
         wiki_language: str = "dewiki",
         **corpusargs,
     ):
-        """
-        Initialize a sentence-segmented version of the HIPE entity linking corpus for historical German (see description
-        of HIPE at https://impresso.github.io/CLEF-HIPE-2020/). This version was segmented by @stefan-it and is hosted
-        at https://github.com/stefan-it/clef-hipe.
-        If you call the constructor the first time the dataset gets automatically downloaded and transformed in
-        tab-separated column format.
+        """Initialize a sentence-segmented version of the HIPE entity linking corpus for historical German.
+
+        see description of HIPE at https://impresso.github.io/CLEF-HIPE-2020/.
+
+        This version was segmented by @stefan-it and is hosted at https://github.com/stefan-it/clef-hipe.
+        If you call the constructor the first time the dataset gets automatically downloaded and transformed in tab-separated column format.

         Parameters
         ----------
@@ -593,9 +592,11 @@ def __init__(
         sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
         **corpusargs,
     ):
-        """
-        Initialize ITTB Entity Linking corpus introduced in "Collective Annotation of Wikipedia Entities in Web Text" Sayali Kulkarni, Amit Singh, Ganesh Ramakrishnan, and Soumen Chakrabarti.
-        If you call the constructor the first time the dataset gets automatically downloaded and transformed in tab-separated column format.
+        """Initialize ITTB Entity Linking corpus introduced in "Collective
+        Annotation of Wikipedia Entities in Web Text" Sayali Kulkarni, Amit
+        Singh, Ganesh Ramakrishnan, and Soumen Chakrabarti. If you call the
+        constructor the first time the dataset gets automatically downloaded
+        and transformed in tab-separated column format.

         Parameters
         ----------
@@ -732,10 +733,12 @@ def __init__(
         in_memory: bool = True,
         **corpusargs,
     ):
-        """
-        Initialize Tweeki Entity Linking corpus introduced in "Tweeki: Linking Named Entities on Twitter to a Knowledge Graph" Harandizadeh, Singh.
-        The data consits of tweets with manually annotated wikipedia links.
-        If you call the constructor the first time the dataset gets automatically downloaded and transformed in tab-separated column format.
+        """Initialize Tweeki Entity Linking corpus introduced in "Tweeki:
+        Linking Named Entities on Twitter to a Knowledge Graph" Harandizadeh,
+        Singh. The data consists of tweets with manually annotated Wikipedia
+        links. If you call the constructor the first time the dataset gets
+        automatically downloaded and transformed in tab-separated column
+        format.

         Parameters
         ----------
@@ -799,8 +802,10 @@ def __init__(
         in_memory: bool = True,
         **corpusargs,
     ):
-        """
-        Initialize the Reddit Entity Linking corpus containing gold annotations only (https://arxiv.org/abs/2101.01228v2) in the NER-like column format.
+        """Initialize the Reddit Entity Linking corpus containing gold
+        annotations only (https://arxiv.org/abs/2101.01228v2) in the NER-like
+        column format.
+
         The first time you call this constructor it will automatically download the dataset.

         :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
         to point to a different folder but typically this should not be necessary.
@@ -1031,9 +1036,9 @@ def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> lis
         return annot_array

     def _fill_curr_comment(self, fix_flag: bool):
-        """
-        Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the
-        comments are parsed.
+ """Extends the string containing the current comment thread, which is + passed to _text_to_cols method, when the comments are parsed. + :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True) or regular rows (=False) """ @@ -1061,8 +1066,9 @@ def from_ufsac_to_tsv( encoding: str = "utf8", cut_multisense: bool = True, ): - """ - Function that converts the UFSAC format into tab separated column format in a new file. + """Function that converts the UFSAC format into tab separated column format + in a new file. + Parameters ---------- xml_file : Union[str, Path] @@ -1077,12 +1083,11 @@ def from_ufsac_to_tsv( Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses. If True only the first listed sense will be used. Otherwise the whole list of senses will be detected as one new sense. The default is True. - """ def make_line(word, begin_or_inside, attributes): - """ - Function that creates an output line from a word. + """Function that creates an output line from a word. + Parameters ---------- word : @@ -1106,8 +1111,9 @@ def make_line(word, begin_or_inside, attributes): return line def split_span(word_fields: List[str], datasetname: str): - """ - Function that splits a word if necessary, i.e. if it is a multiple-word-span. + """Function that splits a word if necessary, i.e. if it is a multiple- + word-span. + Parameters ---------- word_fields : @@ -1115,7 +1121,6 @@ def split_span(word_fields: List[str], datasetname: str): datasetname: name of corresponding dataset """ - span = word_fields[0] if datasetname in [ @@ -1181,7 +1186,6 @@ def determine_tsv_file(filename: str, data_folder: Path, cut_multisense: bool = If True only the first listed sense will be used. Otherwise the whole list of senses will be detected as one new sense. The default is True. """ - if cut_multisense is True and filename not in [ "semeval2007task17", "trainomatic", @@ -1221,8 +1225,9 @@ def __init__( use_raganato_ALL_as_test_data: bool = False, name: str = "multicorpus", ): - """ - Initialize a custom corpus with any Word Sense Disambiguation (WSD) datasets in the UFSAC format from https://github.com/getalp/UFSAC. + """Initialize a custom corpus with any Word Sense Disambiguation (WSD) + datasets in the UFSAC format from https://github.com/getalp/UFSAC. + If the constructor is called for the first time the data is automatically downloaded and transformed from xml to a tab separated column format. Since only the WordNet 3.0 version for senses is consistently available for all provided datasets we will only consider this version. Also we ignore the id annotation used in datasets that were originally created for evaluation tasks @@ -1363,10 +1368,9 @@ def __init__( sample_missing_splits: bool = True, cut_multisense: bool = True, ): - """ - Initialize ragnato_ALL (concatenation of all SensEval and SemEval all-words tasks) provided in UFSAC https://github.com/getalp/UFSAC - When first initializing the corpus the whole UFSAC data is downloaded. 
- """ + """Initialize ragnato_ALL (concatenation of all SensEval and SemEval + all-words tasks) provided in UFSAC https://github.com/getalp/UFSAC When + first initializing the corpus the whole UFSAC data is downloaded.""" if not base_path: base_path = flair.cache_root / "datasets" else: @@ -1429,10 +1433,9 @@ def __init__( cut_multisense: bool = True, use_raganato_ALL_as_test_data: bool = False, ): - """ - Initialize SemCor provided in UFSAC https://github.com/getalp/UFSAC - When first initializing the corpus the whole UFSAC data is downloaded. - """ + """Initialize SemCor provided in UFSAC https://github.com/getalp/UFSAC + When first initializing the corpus the whole UFSAC data is + downloaded.""" if not base_path: base_path = flair.cache_root / "datasets" else: @@ -1505,10 +1508,9 @@ def __init__( sample_missing_splits: Union[bool, str] = True, use_raganato_ALL_as_test_data: bool = False, ): - """ - Initialize Princeton WordNet Gloss Corpus provided in UFSAC https://github.com/getalp/UFSAC - When first initializing the corpus the whole UFSAC data is downloaded. - """ + """Initialize Princeton WordNet Gloss Corpus provided in UFSAC + https://github.com/getalp/UFSAC When first initializing the corpus the + whole UFSAC data is downloaded.""" if not base_path: base_path = flair.cache_root / "datasets" else: @@ -1580,10 +1582,9 @@ def __init__( cut_multisense: bool = True, use_raganato_ALL_as_test_data: bool = False, ): - """ - Initialize MASC (Manually Annotated Sub-Corpus) provided in UFSAC https://github.com/getalp/UFSAC - When first initializing the corpus the whole UFSAC data is downloaded. - """ + """Initialize MASC (Manually Annotated Sub-Corpus) provided in UFSAC + https://github.com/getalp/UFSAC When first initializing the corpus the + whole UFSAC data is downloaded.""" if not base_path: base_path = flair.cache_root / "datasets" else: @@ -1658,10 +1659,9 @@ def __init__( cut_multisense: bool = True, use_raganato_ALL_as_test_data: bool = False, ): - """ - Initialize OMSTI (One Million Sense-Tagged Instances) provided in UFSAC https://github.com/getalp/UFSAC - When first initializing the corpus the whole UFSAC data is downloaded. - """ + """Initialize OMSTI (One Million Sense-Tagged Instances) provided in + UFSAC https://github.com/getalp/UFSAC When first initializing the + corpus the whole UFSAC data is downloaded.""" if not base_path: base_path = flair.cache_root / "datasets" else: @@ -1736,10 +1736,9 @@ def __init__( sample_missing_splits: Union[bool, str] = True, use_raganato_ALL_as_test_data: bool = False, ): - """ - Initialize Train-O-Matic provided in UFSAC https://github.com/getalp/UFSAC - When first initializing the corpus the whole UFSAC data is downloaded. - """ + """Initialize Train-O-Matic provided in UFSAC + https://github.com/getalp/UFSAC When first initializing the corpus the + whole UFSAC data is downloaded.""" if not base_path: base_path = flair.cache_root / "datasets" else: diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 7fb6623602..daaa63e5bf 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -38,9 +38,7 @@ class MultiFileJsonlCorpus(Corpus): - """ - This class represents a generic Jsonl corpus with multiple train, dev, and test files. 
- """ + """This class represents a generic Jsonl corpus with multiple train, dev, and test files.""" def __init__( self, @@ -53,8 +51,8 @@ def __init__( label_type: str = "ner", **corpusargs, ): - """ - Instantiates a MuliFileJsonlCorpus as, e.g., created with doccanos JSONL export. + """Instantiates a MuliFileJsonlCorpus as, e.g., created with doccanos JSONL export. + Note that at least one of train_files, test_files, and dev_files must contain one path. Otherwise, the initialization will fail. @@ -135,8 +133,7 @@ def __init__( name: Optional[str] = None, **corpusargs, ): - """ - Instantiates a JsonlCorpus with one file per Dataset (train, dev, and test). + """Instantiates a JsonlCorpus with one file per Dataset (train, dev, and test). :param data_folder: Path to the folder containing the JSONL corpus :param train_file: the name of the train file @@ -173,8 +170,7 @@ def __init__( label_column_name: str = "label", label_type: str = "ner", ): - """ - Instantiates a JsonlDataset and converts all annotated char spans to token tags using the IOB scheme. + """Instantiates a JsonlDataset and converts all annotated char spans to token tags using the IOB scheme. The expected file format is: { "": "", "label_column_name": [[, ,