From 0e5cd6b0c009e54e07f655f28c733ea3d69c4906 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 5 Aug 2022 16:42:26 +0200 Subject: [PATCH 01/35] Add foundation for find-threshold CLI functionality. --- spacy/cli/__init__.py | 2 + spacy/cli/find_threshold.py | 121 ++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 spacy/cli/find_threshold.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index ce76ef9a9cd..c39aa144d68 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -27,6 +27,8 @@ from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 from .project.document import project_document # noqa: F401 +from .find_threshold import find_threshold # noqa: F401 +from .find_threshold import find_threshold_cli # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py new file mode 100644 index 00000000000..3651dc66e71 --- /dev/null +++ b/spacy/cli/find_threshold.py @@ -0,0 +1,121 @@ +from pathlib import Path +import logging +from typing import Optional + +# import numpy + +import spacy +from ._util import app, Arg, Opt +from .. import util +from ..pipeline import MultiLabel_TextCategorizer + +_DEFAULTS = { + "aggregation": "weighted", + "pipe_name": None, + "n_trials": 10, + "beta": 1, + "reverse": False, +} + + +@app.command( + "find-threshold", + context_settings={"allow_extra_args": False, "ignore_unknown_options": True}, +) +def find_threshold_cli( + # fmt: off + model_path: Path = Arg(..., help="Path to model file", exists=True, allow_dash=True), + doc_path: Path = Arg(..., help="Path to doc bin file", exists=True, allow_dash=True), + aggregation: str = Arg(_DEFAULTS["aggregation"], help="How to aggregate F-scores over labels. One of ('micro', 'macro', 'weighted')", exists=True, allow_dash=True), + pipe_name: Optional[str] = Opt(_DEFAULTS["pipe_name"], "--pipe_name", "-p", help="Name of pipe to examine thresholds for"), + n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"), + beta: float = Opt(_DEFAULTS["beta"], "--beta", help="Beta for F1 calculation. Ignored if different metric is used"), + reverse: bool = Opt(_DEFAULTS["reverse"], "--reverse", "-r", help="Minimizes metric instead of maximizing it."), + verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + # fmt: on +): + """ + Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric from CLI. + model_path (Path): Path to file with trained model. + doc_path (Path): Path to file with DocBin with docs to use for threshold search. + aggregation (str): How to aggregate F-scores across labels. One of ('micro', 'macro', 'weighted'). + pipe_name (Optional[str]): Name of pipe to examine thresholds for. If None, pipe of type MultiLabel_TextCategorizer + is seleted. If there are multiple, an error is raised. + n_trials (int): Number of trials to determine optimal thresholds + beta (float): Beta for F1 calculation. Ignored if different metric is used. + reverse (bool): Whether to minimize metric instead of maximizing it. 
+ verbose (bool): Display more information for debugging purposes + """ + + util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + find_threshold( + model_path, + doc_path, + aggregation=aggregation, + pipe_name=pipe_name, + n_trials=n_trials, + beta=beta, + reverse=reverse, + ) + + +def find_threshold( + model_path: Path, + doc_path: Path, + *, + aggregation: str = _DEFAULTS["aggregation"], + pipe_name: Optional[str] = _DEFAULTS["pipe_name"], + n_trials: int = _DEFAULTS["n_trials"], + beta: float = _DEFAULTS["beta"], + reverse: bool = _DEFAULTS["reverse"] +) -> None: + """ + Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric. + model_path (Path): Path to file with trained model. + doc_path (Path): Path to file with DocBin with docs to use for threshold search. + aggregation (str): How to aggregate F-scores across labels. One of ('micro', 'macro', 'weighted'). + pipe_name (Optional[str]): Name of pipe to examine thresholds for. If None, pipe of type MultiLabel_TextCategorizer + is seleted. If there are multiple, an error is raised. + n_trials (int): Number of trials to determine optimal thresholds + beta (float): Beta for F1 calculation. Ignored if different metric is used. + reverse (bool): Whether to minimize metric instead of maximizing it. + """ + + nlp = spacy.load(model_path) + pipe: Optional[MultiLabel_TextCategorizer] = None + selected_pipe_name: Optional[str] = pipe_name + + for _pipe_name, _pipe in nlp.pipeline: + if pipe_name and _pipe_name == pipe_name: + if not isinstance(_pipe, MultiLabel_TextCategorizer): + # todo convert to error + assert "Specified name is not a MultiLabel_TextCategorizer." + pipe = _pipe + break + elif pipe_name is None: + if isinstance(_pipe, MultiLabel_TextCategorizer): + if pipe: + # todo convert to error + assert ( + "Multiple components of type MultiLabel_TextCategorizer in pipeline. Please specify " + "component name." + ) + pipe = _pipe + selected_pipe_name = _pipe_name + + # counts = {label: 0 for label in pipe.labels} + # true_positive_counts = counts.copy() + # false_positive_counts = counts.copy() + # f_scores = counts.copy() + # thresholds = numpy.linspace(0, 1, n_trials) + + # todo iterate over docs, assert categories are 1 or 0. + # todo run pipe for all docs in docbin. + # todo iterate over thresholds. for each: + # - iterate over all docs. for each: + # - iterate over all labels. for each: + # - mark as positive/negative based on current threshold + # - update count, f_score stats + # - compute f_scores for all labels + # - output best threshold + print(selected_pipe_name, pipe.labels, pipe.predict([nlp("aaa")])) From 4981700ced25143c52752976e488fff3b1a3ab5a Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 8 Aug 2022 13:49:42 +0200 Subject: [PATCH 02/35] Finish first draft for find-threshold. 
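
For context, the scoring logic in this first draft reduces to a per-label F-beta
computation over true/false positive counts collected at each candidate threshold.
A standalone sketch of that arithmetic (illustrative only, not the exact code in the
diff below; the function and argument names are made up for clarity):

    def f_beta(true_pos: int, false_pos: int, ref_pos: int, beta: float = 1.0) -> float:
        # Precision over predicted positives, recall over reference positives.
        n_pred_pos = true_pos + false_pos
        precision = true_pos / n_pred_pos if n_pred_pos else 0.0
        recall = true_pos / ref_pos if ref_pos else 0.0
        if precision == 0.0 or recall == 0.0:
            return 0.0
        return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)

Per-label scores are then aggregated across labels: "micro" weights each label's score
by its number of reference positives, while "macro" takes the unweighted mean.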
--- spacy/cli/__init__.py | 1 - spacy/cli/find_threshold.py | 160 +++++++++++++++++++++++++++--------- 2 files changed, 123 insertions(+), 38 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index c39aa144d68..aab2c8d12bf 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -28,7 +28,6 @@ from .project.pull import project_pull # noqa: F401 from .project.document import project_document # noqa: F401 from .find_threshold import find_threshold # noqa: F401 -from .find_threshold import find_threshold_cli # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 3651dc66e71..eb072817bac 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -2,19 +2,19 @@ import logging from typing import Optional -# import numpy +import numpy +import wasabi.tables -import spacy from ._util import app, Arg, Opt from .. import util from ..pipeline import MultiLabel_TextCategorizer +from ..tokens import DocBin _DEFAULTS = { - "aggregation": "weighted", + "average": "micro", "pipe_name": None, "n_trials": 10, "beta": 1, - "reverse": False, } @@ -26,11 +26,10 @@ def find_threshold_cli( # fmt: off model_path: Path = Arg(..., help="Path to model file", exists=True, allow_dash=True), doc_path: Path = Arg(..., help="Path to doc bin file", exists=True, allow_dash=True), - aggregation: str = Arg(_DEFAULTS["aggregation"], help="How to aggregate F-scores over labels. One of ('micro', 'macro', 'weighted')", exists=True, allow_dash=True), + average: str = Arg(_DEFAULTS["average"], help="How to aggregate F-scores over labels. One of ('micro', 'macro')", exists=True, allow_dash=True), pipe_name: Optional[str] = Opt(_DEFAULTS["pipe_name"], "--pipe_name", "-p", help="Name of pipe to examine thresholds for"), n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"), beta: float = Opt(_DEFAULTS["beta"], "--beta", help="Beta for F1 calculation. Ignored if different metric is used"), - reverse: bool = Opt(_DEFAULTS["reverse"], "--reverse", "-r", help="Minimizes metric instead of maximizing it."), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): @@ -38,12 +37,11 @@ def find_threshold_cli( Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric from CLI. model_path (Path): Path to file with trained model. doc_path (Path): Path to file with DocBin with docs to use for threshold search. - aggregation (str): How to aggregate F-scores across labels. One of ('micro', 'macro', 'weighted'). + average (str): How to average F-scores across labels. One of ('micro', 'macro'). pipe_name (Optional[str]): Name of pipe to examine thresholds for. If None, pipe of type MultiLabel_TextCategorizer is seleted. If there are multiple, an error is raised. n_trials (int): Number of trials to determine optimal thresholds beta (float): Beta for F1 calculation. Ignored if different metric is used. - reverse (bool): Whether to minimize metric instead of maximizing it. 
verbose (bool): Display more information for debugging purposes """ @@ -51,11 +49,10 @@ def find_threshold_cli( find_threshold( model_path, doc_path, - aggregation=aggregation, + average=average, pipe_name=pipe_name, n_trials=n_trials, beta=beta, - reverse=reverse, ) @@ -63,59 +60,148 @@ def find_threshold( model_path: Path, doc_path: Path, *, - aggregation: str = _DEFAULTS["aggregation"], + average: str = _DEFAULTS["average"], pipe_name: Optional[str] = _DEFAULTS["pipe_name"], n_trials: int = _DEFAULTS["n_trials"], beta: float = _DEFAULTS["beta"], - reverse: bool = _DEFAULTS["reverse"] ) -> None: """ Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric. model_path (Path): Path to file with trained model. doc_path (Path): Path to file with DocBin with docs to use for threshold search. - aggregation (str): How to aggregate F-scores across labels. One of ('micro', 'macro', 'weighted'). + average (str): How to average F-scores across labels. One of ('micro', 'macro'). pipe_name (Optional[str]): Name of pipe to examine thresholds for. If None, pipe of type MultiLabel_TextCategorizer is seleted. If there are multiple, an error is raised. n_trials (int): Number of trials to determine optimal thresholds beta (float): Beta for F1 calculation. Ignored if different metric is used. - reverse (bool): Whether to minimize metric instead of maximizing it. """ - nlp = spacy.load(model_path) + nlp = util.load_model(model_path) pipe: Optional[MultiLabel_TextCategorizer] = None selected_pipe_name: Optional[str] = pipe_name + if average not in ("micro", "macro"): + wasabi.msg.fail( + "Expected 'micro' or 'macro' for F-score averaging method, received '{avg_method}'.", + exits=1, + ) + for _pipe_name, _pipe in nlp.pipeline: if pipe_name and _pipe_name == pipe_name: if not isinstance(_pipe, MultiLabel_TextCategorizer): - # todo convert to error - assert "Specified name is not a MultiLabel_TextCategorizer." + wasabi.msg.fail( + "Specified component {component} is not of type `MultiLabel_TextCategorizer`.", + exits=1, + ) pipe = _pipe break elif pipe_name is None: if isinstance(_pipe, MultiLabel_TextCategorizer): if pipe: - # todo convert to error - assert ( - "Multiple components of type MultiLabel_TextCategorizer in pipeline. Please specify " - "component name." + wasabi.msg.fail( + "Multiple components of type `MultiLabel_TextCategorizer` exist in pipeline. Specify name of " + "component to evaluate.", + exits=1, ) pipe = _pipe selected_pipe_name = _pipe_name - # counts = {label: 0 for label in pipe.labels} - # true_positive_counts = counts.copy() - # false_positive_counts = counts.copy() - # f_scores = counts.copy() - # thresholds = numpy.linspace(0, 1, n_trials) - - # todo iterate over docs, assert categories are 1 or 0. - # todo run pipe for all docs in docbin. - # todo iterate over thresholds. for each: - # - iterate over all docs. for each: - # - iterate over all labels. 
for each: - # - mark as positive/negative based on current threshold - # - update count, f_score stats - # - compute f_scores for all labels - # - output best threshold - print(selected_pipe_name, pipe.labels, pipe.predict([nlp("aaa")])) + if pipe is None: + if pipe_name: + wasabi.msg.fail( + f"No component with name {pipe_name} found in pipeline.", exits=1 + ) + wasabi.msg.fail( + "No component of type `MultiLabel_TextCategorizer` found in pipeline.", + exits=1, + ) + + print( + f"Searching threshold with the best {average} F-score for pipe '{selected_pipe_name}' with {n_trials} trials" + f" and beta = {beta}." + ) + + thresholds = numpy.linspace(0, 1, n_trials) + ref_pos_counts = {label: 0 for label in pipe.labels} + pred_pos_counts = { + t: {True: ref_pos_counts.copy(), False: ref_pos_counts.copy()} + for t in thresholds + } + f_scores_per_label = {t: ref_pos_counts.copy() for t in thresholds} + f_scores = {t: 0 for t in thresholds} + + # Count true/false positives for provided docs. + doc_bin = DocBin() + doc_bin.from_disk(doc_path) + for ref_doc in doc_bin.get_docs(nlp.vocab): + for label, score in ref_doc.cats.items(): + if score not in (0, 1): + wasabi.msg.fail( + f"Expected category scores in evaluation dataset to be 0 <= x <= 1, received {score}.", + exits=1, + ) + ref_pos_counts[label] += ref_doc.cats[label] == 1 + + pred_doc = nlp(ref_doc.text) + # Collect count stats per threshold value and label. + for threshold in thresholds: + for label, score in pred_doc.cats.items(): + label_value = int(score >= threshold) + if label_value == ref_doc.cats[label] == 1: + pred_pos_counts[threshold][True][label] += 1 + elif label_value == 1 and ref_doc.cats[label] == 0: + pred_pos_counts[threshold][False][label] += 1 + + # Compute f_scores. + for threshold in thresholds: + for label in ref_pos_counts: + n_pos_preds = ( + pred_pos_counts[threshold][True][label] + + pred_pos_counts[threshold][False][label] + ) + precision = ( + (pred_pos_counts[threshold][True][label] / n_pos_preds) + if n_pos_preds > 0 + else 0 + ) + recall = pred_pos_counts[threshold][True][label] / ref_pos_counts[label] + f_scores_per_label[threshold][label] = ( + ( + (1 + beta**2) + * (precision * recall / (precision * beta**2 + recall)) + ) + if precision + else 0 + ) + + # Aggregate F-scores. + if average == "micro": + f_scores[threshold] = sum( + [ + f_scores_per_label[threshold][label] * ref_pos_counts[label] + for label in ref_pos_counts + ] + ) / sum(ref_pos_counts.values()) + else: + f_scores[threshold] = sum( + [f_scores_per_label[threshold][label] for label in ref_pos_counts] + ) / len(ref_pos_counts) + + best_threshold = max(f_scores, key=f_scores.get) + print( + f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}." + ) + print( + wasabi.tables.table( + data=[ + (threshold, label, f_score) + for threshold, label_f_scores in f_scores_per_label.items() + for label, f_score in label_f_scores.items() + ], + header=["Threshold", "Label", "F-Score"], + ), + wasabi.tables.table( + data=[(threshold, f_score) for threshold, f_score in f_scores.items()], + header=["Threshold", f"F-Score ({average})"], + ), + ) From 1d0f5d35924917339476f76bd7a823588f887d2a Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 8 Aug 2022 16:39:22 +0200 Subject: [PATCH 03/35] Add tests. 
--- spacy/cli/find_threshold.py | 68 ++++++++++-------- spacy/tests/test_cli.py | 134 ++++++++++++++++++++++++++++++++++-- 2 files changed, 168 insertions(+), 34 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index eb072817bac..1ffc04bbd51 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -1,6 +1,6 @@ from pathlib import Path import logging -from typing import Optional +from typing import Optional, Tuple, Union import numpy import wasabi.tables @@ -57,23 +57,26 @@ def find_threshold_cli( def find_threshold( - model_path: Path, - doc_path: Path, + model_path: Union[str, Path], + doc_path: Union[str, Path], *, average: str = _DEFAULTS["average"], pipe_name: Optional[str] = _DEFAULTS["pipe_name"], n_trials: int = _DEFAULTS["n_trials"], beta: float = _DEFAULTS["beta"], -) -> None: + verbose: bool = True, +) -> Tuple[float, float]: """ Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric. - model_path (Path): Path to file with trained model. - doc_path (Path): Path to file with DocBin with docs to use for threshold search. + model_path (Union[str, Path]): Path to file with trained model. + doc_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search. average (str): How to average F-scores across labels. One of ('micro', 'macro'). pipe_name (Optional[str]): Name of pipe to examine thresholds for. If None, pipe of type MultiLabel_TextCategorizer is seleted. If there are multiple, an error is raised. n_trials (int): Number of trials to determine optimal thresholds beta (float): Beta for F1 calculation. Ignored if different metric is used. + verbose (bool): Whether to print non-error-related output to stdout. + RETURNS (Tuple[float, float]): Best found threshold with corresponding F-score. """ nlp = util.load_model(model_path) @@ -90,10 +93,13 @@ def find_threshold( if pipe_name and _pipe_name == pipe_name: if not isinstance(_pipe, MultiLabel_TextCategorizer): wasabi.msg.fail( - "Specified component {component} is not of type `MultiLabel_TextCategorizer`.", + "Specified component '{component}' is not of type `MultiLabel_TextCategorizer`.".format( + component=pipe_name + ), exits=1, ) pipe = _pipe + print(pipe_name, _pipe_name, pipe.labels) break elif pipe_name is None: if isinstance(_pipe, MultiLabel_TextCategorizer): @@ -116,10 +122,11 @@ def find_threshold( exits=1, ) - print( - f"Searching threshold with the best {average} F-score for pipe '{selected_pipe_name}' with {n_trials} trials" - f" and beta = {beta}." - ) + if verbose: + print( + f"Searching threshold with the best {average} F-score for component '{selected_pipe_name}' with {n_trials} " + f"trials and beta = {beta}." + ) thresholds = numpy.linspace(0, 1, n_trials) ref_pos_counts = {label: 0 for label in pipe.labels} @@ -146,13 +153,15 @@ def find_threshold( # Collect count stats per threshold value and label. for threshold in thresholds: for label, score in pred_doc.cats.items(): + if label not in pipe.labels: + continue label_value = int(score >= threshold) if label_value == ref_doc.cats[label] == 1: pred_pos_counts[threshold][True][label] += 1 elif label_value == 1 and ref_doc.cats[label] == 0: pred_pos_counts[threshold][False][label] += 1 - # Compute f_scores. + # Compute F-scores. 
for threshold in thresholds: for label in ref_pos_counts: n_pos_preds = ( @@ -188,20 +197,21 @@ def find_threshold( ) / len(ref_pos_counts) best_threshold = max(f_scores, key=f_scores.get) - print( - f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}." - ) - print( - wasabi.tables.table( - data=[ - (threshold, label, f_score) - for threshold, label_f_scores in f_scores_per_label.items() - for label, f_score in label_f_scores.items() - ], - header=["Threshold", "Label", "F-Score"], - ), - wasabi.tables.table( - data=[(threshold, f_score) for threshold, f_score in f_scores.items()], - header=["Threshold", f"F-Score ({average})"], - ), - ) + if verbose: + print( + f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}.", + wasabi.tables.table( + data=[ + (threshold, label, f_score) + for threshold, label_f_scores in f_scores_per_label.items() + for label, f_score in label_f_scores.items() + ], + header=["Threshold", "Label", "F-Score"], + ), + wasabi.tables.table( + data=[(threshold, f_score) for threshold, f_score in f_scores.items()], + header=["Threshold", f"F-Score ({average})"], + ), + ) + + return best_threshold, f_scores[best_threshold] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 838e003698f..264549a693c 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,8 +1,8 @@ import os import math -from random import sample -from typing import Counter +from typing import Counter, Iterable, Tuple, List +import numpy import pytest import srsly from click import NoSuchOption @@ -26,19 +26,23 @@ from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name from spacy.cli.validate import get_model_pkgs +from spacy.cli.find_threshold import find_threshold from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import Language from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate -from spacy.tokens import Doc +from spacy.tokens import Doc, DocBin from spacy.tokens.span import Span from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs +from spacy.pipeline import TextCategorizer, Pipe from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config -from ..cli.init_pipeline import _init_labels -from .util import make_tempdir +# from ..cli.init_pipeline import _init_labels +# from .util import make_tempdir +from spacy.cli.init_pipeline import _init_labels +from spacy.tests.util import make_tempdir @pytest.mark.issue(4665) @@ -855,3 +859,123 @@ def test_span_length_freq_dist_output_must_be_correct(): span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold) assert sum(span_freqs.values()) >= threshold assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] + + +def test_cli_find_threshold(capsys): + def make_get_examples_multi_label(_nlp: Language) -> List[Example]: + return [ + Example.from_dict(_nlp.make_doc(t[0]), t[1]) + for t in [ + ( + "I'm angry and confused", + {"cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}}, + ), + ( + "I'm confused but happy", + {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}, + ), + ] + ] + + def init_nlp( + component_factory_names: Tuple[str] = (), + ) -> Tuple[Language, List[Example]]: + _nlp = English() + + textcat: TextCategorizer = 
_nlp.add_pipe(factory_name="textcat_multilabel", name="tc_multi") # type: ignore + textcat.add_label("ANGRY") + textcat.add_label("CONFUSED") + textcat.add_label("HAPPY") + for cfn in component_factory_names: + comp = _nlp.add_pipe(cfn) + if isinstance(comp, TextCategorizer): + comp.add_label("dummy") + + _nlp.initialize() + + _examples = make_get_examples_multi_label(_nlp) + for i in range(5): + _nlp.update(_examples) + + return _nlp, _examples + + with make_tempdir() as docs_dir: + # Check whether find_threshold() identifies lowest threshold above 0 as (first) ideal threshold, as this matches + # the current model behavior with the examples above. This can break once the model behavior changes and serves + # mostly as a smoke test. + nlp, examples = init_nlp() + DocBin(docs=[example.reference for example in examples]).to_disk( + docs_dir / "docs" + ) + with make_tempdir() as nlp_dir: + nlp.to_disk(nlp_dir) + assert ( + find_threshold(nlp_dir, docs_dir / "docs", verbose=False)[0] + == numpy.linspace(0, 1, 10)[1] + ) + + # Specifying name of non-MultiLabel_TextCategorizer component should fail. + nlp, _ = init_nlp(("sentencizer",)) + with make_tempdir() as nlp_dir: + nlp.to_disk(nlp_dir) + with pytest.raises(SystemExit) as error: + find_threshold(nlp_dir, docs_dir / "docs", pipe_name="sentencizer") + assert error.value.code == 1 + + # Having multiple textcat_multilabel components without specifying the name should fail. + nlp, _ = init_nlp(("textcat_multilabel",)) + with make_tempdir() as nlp_dir: + nlp.to_disk(nlp_dir) + with pytest.raises(SystemExit) as error: + find_threshold(nlp_dir, docs_dir / "docs") + assert error.value.code == 1 + + # Having multiple textcat_multilabel components should work when specifying the name. + nlp, _ = init_nlp(("textcat_multilabel",)) + with make_tempdir() as nlp_dir: + nlp.to_disk(nlp_dir) + assert ( + find_threshold( + nlp_dir, docs_dir / "docs", pipe_name="tc_multi", verbose=False + )[0] + == numpy.linspace(0, 1, 10)[1] + ) + + # Specifying the name of an non-existing pipe should fail. + nlp, _ = init_nlp() + with make_tempdir() as nlp_dir: + nlp.to_disk(nlp_dir) + with pytest.raises(SystemExit) as error: + find_threshold(nlp_dir, docs_dir / "docs", pipe_name="_") + assert error.value.code == 1 + + # Using a pipe with no textcat components should fail. + nlp = English() + with make_tempdir() as nlp_dir: + nlp.to_disk(nlp_dir) + with pytest.raises(SystemExit) as error: + find_threshold(nlp_dir, docs_dir / "docs") + assert error.value.code == 1 + + # Specifying scores not in range 0 <= x <= 1 should fail. + nlp, _ = init_nlp() + DocBin( + docs=[ + Example.from_dict(nlp.make_doc(t[0]), t[1]).reference + for t in [ + ( + "I'm angry and confused", + {"cats": {"ANGRY": 1.0, "CONFUSED": 2.0, "HAPPY": 0.0}}, + ), + ( + "I'm confused but happy", + {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}, + ), + ] + ] + ).to_disk(docs_dir / "docs") + with make_tempdir() as nlp_dir: + nlp.to_disk(nlp_dir) + with pytest.raises(SystemExit) as error: + find_threshold(nlp_dir, docs_dir / "docs") + assert error.value.code == 1 From a7b56e82cf8847360cc8e23dbf9432a3caf9a814 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 8 Aug 2022 16:44:15 +0200 Subject: [PATCH 04/35] Revert adjusted import statements. 
--- spacy/tests/test_cli.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 264549a693c..8b0bab5b6f0 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -39,10 +39,8 @@ from spacy.pipeline import TextCategorizer, Pipe from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config -# from ..cli.init_pipeline import _init_labels -# from .util import make_tempdir -from spacy.cli.init_pipeline import _init_labels -from spacy.tests.util import make_tempdir +from ..cli.init_pipeline import _init_labels +from .util import make_tempdir @pytest.mark.issue(4665) From d689d97ab5f1ba3dc3f63a717dbb74b51be34d0a Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 9 Aug 2022 10:03:43 +0200 Subject: [PATCH 05/35] Fix mypy errors. --- spacy/cli/find_threshold.py | 22 ++++++++++++---------- spacy/tests/test_cli.py | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 1ffc04bbd51..6f8bb68b878 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -1,10 +1,11 @@ from pathlib import Path import logging -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, Dict, cast import numpy import wasabi.tables +from pipeline import Pipe from ._util import app, Arg, Opt from .. import util from ..pipeline import MultiLabel_TextCategorizer @@ -60,10 +61,10 @@ def find_threshold( model_path: Union[str, Path], doc_path: Union[str, Path], *, - average: str = _DEFAULTS["average"], - pipe_name: Optional[str] = _DEFAULTS["pipe_name"], - n_trials: int = _DEFAULTS["n_trials"], - beta: float = _DEFAULTS["beta"], + average: str = _DEFAULTS["average"], # type: ignore + pipe_name: Optional[str] = _DEFAULTS["pipe_name"], # type: ignore + n_trials: int = _DEFAULTS["n_trials"], # type: ignore + beta: float = _DEFAULTS["beta"], # type: ignore verbose: bool = True, ) -> Tuple[float, float]: """ @@ -80,7 +81,7 @@ def find_threshold( """ nlp = util.load_model(model_path) - pipe: Optional[MultiLabel_TextCategorizer] = None + pipe: Optional[Pipe] = None selected_pipe_name: Optional[str] = pipe_name if average not in ("micro", "macro"): @@ -99,7 +100,6 @@ def find_threshold( exits=1, ) pipe = _pipe - print(pipe_name, _pipe_name, pipe.labels) break elif pipe_name is None: if isinstance(_pipe, MultiLabel_TextCategorizer): @@ -121,6 +121,8 @@ def find_threshold( "No component of type `MultiLabel_TextCategorizer` found in pipeline.", exits=1, ) + # This is purely for MyPy. Type checking is done in loop above already. + assert isinstance(pipe, MultiLabel_TextCategorizer) if verbose: print( @@ -134,8 +136,8 @@ def find_threshold( t: {True: ref_pos_counts.copy(), False: ref_pos_counts.copy()} for t in thresholds } - f_scores_per_label = {t: ref_pos_counts.copy() for t in thresholds} - f_scores = {t: 0 for t in thresholds} + f_scores_per_label = {t: {label: 0.0 for label in pipe.labels} for t in thresholds} + f_scores = {t: 0.0 for t in thresholds} # Count true/false positives for provided docs. 
doc_bin = DocBin() @@ -196,7 +198,7 @@ def find_threshold( [f_scores_per_label[threshold][label] for label in ref_pos_counts] ) / len(ref_pos_counts) - best_threshold = max(f_scores, key=f_scores.get) + best_threshold = max(f_scores.keys(), key=(lambda key: f_scores[key])) if verbose: print( f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}.", diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8b0bab5b6f0..48cc364f00f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -876,7 +876,7 @@ def make_get_examples_multi_label(_nlp: Language) -> List[Example]: ] def init_nlp( - component_factory_names: Tuple[str] = (), + component_factory_names: Tuple[str, ...] = (), ) -> Tuple[Language, List[Example]]: _nlp = English() From 6c3ae8dfcc3e2279ae05c8e707c2aab08e924d06 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 9 Aug 2022 12:26:23 +0200 Subject: [PATCH 06/35] Fix imports. --- spacy/cli/find_threshold.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 6f8bb68b878..5c8bc679851 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -1,14 +1,13 @@ from pathlib import Path import logging -from typing import Optional, Tuple, Union, Dict, cast +from typing import Optional, Tuple, Union import numpy import wasabi.tables -from pipeline import Pipe from ._util import app, Arg, Opt from .. import util -from ..pipeline import MultiLabel_TextCategorizer +from ..pipeline import MultiLabel_TextCategorizer, Pipe from ..tokens import DocBin _DEFAULTS = { From 63c80288ef46a279065736020b9f1ede7300d161 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 30 Aug 2022 11:48:04 +0200 Subject: [PATCH 07/35] Harmonize arguments with spacy evaluate command. --- spacy/cli/find_threshold.py | 102 +++++++++++++++++------------------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 5c8bc679851..d5cd9e3dea5 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -1,21 +1,16 @@ from pathlib import Path import logging -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import numpy import wasabi.tables -from ._util import app, Arg, Opt +from ._util import app, Arg, Opt, import_code, setup_gpu from .. import util from ..pipeline import MultiLabel_TextCategorizer, Pipe from ..tokens import DocBin -_DEFAULTS = { - "average": "micro", - "pipe_name": None, - "n_trials": 10, - "beta": 1, -} +_DEFAULTS = {"average": "micro", "n_trials": 10, "beta": 1, "use_gpu": -1} @app.command( @@ -24,62 +19,73 @@ ) def find_threshold_cli( # fmt: off - model_path: Path = Arg(..., help="Path to model file", exists=True, allow_dash=True), - doc_path: Path = Arg(..., help="Path to doc bin file", exists=True, allow_dash=True), + model: str = Arg(..., help="Model name or path"), + data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), + pipe_name: str = Opt(..., "--pipe_name", "-p", help="Name of pipe to examine thresholds for"), average: str = Arg(_DEFAULTS["average"], help="How to aggregate F-scores over labels. 
One of ('micro', 'macro')", exists=True, allow_dash=True), - pipe_name: Optional[str] = Opt(_DEFAULTS["pipe_name"], "--pipe_name", "-p", help="Name of pipe to examine thresholds for"), n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"), beta: float = Opt(_DEFAULTS["beta"], "--beta", help="Beta for F1 calculation. Ignored if different metric is used"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), + verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): """ Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric from CLI. - model_path (Path): Path to file with trained model. - doc_path (Path): Path to file with DocBin with docs to use for threshold search. + model (Path): Path to file with trained model. + data_path (Path): Path to file with DocBin with docs to use for threshold search. + pipe_name (str): Name of pipe to examine thresholds for. average (str): How to average F-scores across labels. One of ('micro', 'macro'). - pipe_name (Optional[str]): Name of pipe to examine thresholds for. If None, pipe of type MultiLabel_TextCategorizer - is seleted. If there are multiple, an error is raised. n_trials (int): Number of trials to determine optimal thresholds - beta (float): Beta for F1 calculation. Ignored if different metric is used. - verbose (bool): Display more information for debugging purposes + beta (float): Beta for F1 calculation. + code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported. + use_gpu (int): GPU ID or -1 for CPU. + silent (bool): Display more information for debugging purposes """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + import_code(code_path) find_threshold( - model_path, - doc_path, - average=average, + model, + data_path, pipe_name=pipe_name, + average=average, n_trials=n_trials, beta=beta, + use_gpu=use_gpu, + silent=False, ) def find_threshold( - model_path: Union[str, Path], - doc_path: Union[str, Path], + model: str, + data_path: Path, *, + pipe_name: str, # type: ignore average: str = _DEFAULTS["average"], # type: ignore - pipe_name: Optional[str] = _DEFAULTS["pipe_name"], # type: ignore n_trials: int = _DEFAULTS["n_trials"], # type: ignore - beta: float = _DEFAULTS["beta"], # type: ignore - verbose: bool = True, + beta: float = _DEFAULTS["beta"], # type: ignore, + use_gpu: int = _DEFAULTS["use_gpu"], + silent: bool = True, ) -> Tuple[float, float]: """ Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric. - model_path (Union[str, Path]): Path to file with trained model. - doc_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search. + model (Union[str, Path]): Path to file with trained model. + data_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search. + pipe_name (str): Name of pipe to examine thresholds for. average (str): How to average F-scores across labels. One of ('micro', 'macro'). - pipe_name (Optional[str]): Name of pipe to examine thresholds for. 
If None, pipe of type MultiLabel_TextCategorizer - is seleted. If there are multiple, an error is raised. - n_trials (int): Number of trials to determine optimal thresholds - beta (float): Beta for F1 calculation. Ignored if different metric is used. - verbose (bool): Whether to print non-error-related output to stdout. + n_trials (int): Number of trials to determine optimal thresholds. + beta (float): Beta for F1 calculation. + use_gpu (int): GPU ID or -1 for CPU. + silent (bool): Whether to print non-error-related output to stdout. RETURNS (Tuple[float, float]): Best found threshold with corresponding F-score. """ - nlp = util.load_model(model_path) + setup_gpu(use_gpu, silent=silent) + data_path = util.ensure_path(data_path) + if not data_path.exists(): + wasabi.msg.fail("Evaluation data not found", data_path, exits=1) + nlp = util.load_model(model) pipe: Optional[Pipe] = None selected_pipe_name: Optional[str] = pipe_name @@ -90,7 +96,9 @@ def find_threshold( ) for _pipe_name, _pipe in nlp.pipeline: - if pipe_name and _pipe_name == pipe_name: + # todo instead of instance check, assert _pipe has a .threshold arg + # won't work, actually. e.g. spancat doesn't .threshold. + if _pipe_name == pipe_name: if not isinstance(_pipe, MultiLabel_TextCategorizer): wasabi.msg.fail( "Specified component '{component}' is not of type `MultiLabel_TextCategorizer`.".format( @@ -100,36 +108,22 @@ def find_threshold( ) pipe = _pipe break - elif pipe_name is None: - if isinstance(_pipe, MultiLabel_TextCategorizer): - if pipe: - wasabi.msg.fail( - "Multiple components of type `MultiLabel_TextCategorizer` exist in pipeline. Specify name of " - "component to evaluate.", - exits=1, - ) - pipe = _pipe - selected_pipe_name = _pipe_name if pipe is None: - if pipe_name: - wasabi.msg.fail( - f"No component with name {pipe_name} found in pipeline.", exits=1 - ) wasabi.msg.fail( - "No component of type `MultiLabel_TextCategorizer` found in pipeline.", - exits=1, + f"No component with name {pipe_name} found in pipeline.", exits=1 ) # This is purely for MyPy. Type checking is done in loop above already. assert isinstance(pipe, MultiLabel_TextCategorizer) - if verbose: + if silent: print( f"Searching threshold with the best {average} F-score for component '{selected_pipe_name}' with {n_trials} " f"trials and beta = {beta}." ) thresholds = numpy.linspace(0, 1, n_trials) + # todo use Scorer.score_cats. possibly to be extended? ref_pos_counts = {label: 0 for label in pipe.labels} pred_pos_counts = { t: {True: ref_pos_counts.copy(), False: ref_pos_counts.copy()} @@ -140,7 +134,7 @@ def find_threshold( # Count true/false positives for provided docs. doc_bin = DocBin() - doc_bin.from_disk(doc_path) + doc_bin.from_disk(data_path) for ref_doc in doc_bin.get_docs(nlp.vocab): for label, score in ref_doc.cats.items(): if score not in (0, 1): @@ -198,7 +192,7 @@ def find_threshold( ) / len(ref_pos_counts) best_threshold = max(f_scores.keys(), key=(lambda key: f_scores[key])) - if verbose: + if silent: print( f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}.", wasabi.tables.table( From 3a0a3854f72dc726261658bc3701931f23bb9ef1 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 1 Sep 2022 14:00:05 +0200 Subject: [PATCH 08/35] Generalize component and threshold handling. Harmonize arguments with 'spacy evaluate' CLI. 
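
The generalized search no longer counts true/false positives by hand. Instead it writes
each candidate threshold into the component's config under a (possibly nested) key,
re-runs `nlp.evaluate()` on the dev corpus, and keeps the threshold with the best value
for the requested score key. A condensed, illustrative sketch of that loop (helper names
here are made up; the actual implementation is in the diff below):

    import functools
    import operator
    import numpy

    def _set_nested_item(config: dict, keys: list, value: float) -> dict:
        # Set config[keys[0]]...[keys[-1]] = value, e.g. keys = ["model", "threshold"].
        functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value
        return config

    def _search_threshold(nlp, pipe, dev_dataset, threshold_key, scores_key, n_trials=10):
        scores = {}
        for threshold in numpy.linspace(0, 1, n_trials):
            pipe.cfg = _set_nested_item(pipe.cfg, threshold_key.split("."), threshold)
            scores[threshold] = nlp.evaluate(dev_dataset)[scores_key]
        best = max(scores, key=scores.get)
        return best, scores[best]

This keeps the command agnostic to the component type, as long as the targeted config
entry actually influences the component's predictions or its scorer.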
--- spacy/cli/find_threshold.py | 205 ++++++++++++++---------------------- spacy/tests/test_cli.py | 155 ++++++++++++++------------- 2 files changed, 157 insertions(+), 203 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index d5cd9e3dea5..fe3bdedae52 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -1,16 +1,23 @@ +import functools +import operator from pathlib import Path import logging -from typing import Optional, Tuple +from typing import Optional, Tuple, Any, Dict, List import numpy import wasabi.tables +from ..training import Corpus from ._util import app, Arg, Opt, import_code, setup_gpu from .. import util -from ..pipeline import MultiLabel_TextCategorizer, Pipe -from ..tokens import DocBin -_DEFAULTS = {"average": "micro", "n_trials": 10, "beta": 1, "use_gpu": -1} +_DEFAULTS = { + "average": "micro", + "n_trials": 10, + "beta": 1, + "use_gpu": -1, + "gold_preproc": False, +} @app.command( @@ -21,12 +28,14 @@ def find_threshold_cli( # fmt: off model: str = Arg(..., help="Model name or path"), data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), - pipe_name: str = Opt(..., "--pipe_name", "-p", help="Name of pipe to examine thresholds for"), - average: str = Arg(_DEFAULTS["average"], help="How to aggregate F-scores over labels. One of ('micro', 'macro')", exists=True, allow_dash=True), + pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"), + threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"), + scores_key: str = Arg(..., help="Name of score to metric to optimize"), n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"), beta: float = Opt(_DEFAULTS["beta"], "--beta", help="Beta for F1 calculation. Ignored if different metric is used"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), + gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"), # fmt: on ): @@ -35,24 +44,30 @@ def find_threshold_cli( model (Path): Path to file with trained model. data_path (Path): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. - average (str): How to average F-scores across labels. One of ('micro', 'macro'). + threshold_key (str): Key of threshold attribute in component's configuration. + scores_key (str): Name of score to metric to optimize. n_trials (int): Number of trials to determine optimal thresholds beta (float): Beta for F1 calculation. code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported. use_gpu (int): GPU ID or -1 for CPU. + gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the + tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due + to train/test skew. 
silent (bool): Display more information for debugging purposes """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) import_code(code_path) find_threshold( - model, - data_path, + model=model, + data_path=data_path, pipe_name=pipe_name, - average=average, + threshold_key=threshold_key, + scores_key=scores_key, n_trials=n_trials, beta=beta, use_gpu=use_gpu, + gold_preproc=gold_preproc, silent=False, ) @@ -60,12 +75,14 @@ def find_threshold_cli( def find_threshold( model: str, data_path: Path, + pipe_name: str, + threshold_key: str, + scores_key: str, *, - pipe_name: str, # type: ignore - average: str = _DEFAULTS["average"], # type: ignore - n_trials: int = _DEFAULTS["n_trials"], # type: ignore - beta: float = _DEFAULTS["beta"], # type: ignore, + n_trials: int = _DEFAULTS["n_trials"], + beta: float = _DEFAULTS["beta"], use_gpu: int = _DEFAULTS["use_gpu"], + gold_preproc: bool = _DEFAULTS["gold_preproc"], silent: bool = True, ) -> Tuple[float, float]: """ @@ -73,10 +90,14 @@ def find_threshold( model (Union[str, Path]): Path to file with trained model. data_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. - average (str): How to average F-scores across labels. One of ('micro', 'macro'). + threshold_key (str): Key of threshold attribute in component's configuration. + scores_key (str): Name of score to metric to optimize. n_trials (int): Number of trials to determine optimal thresholds. beta (float): Beta for F1 calculation. use_gpu (int): GPU ID or -1 for CPU. + gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the + tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due + to train/test skew. silent (bool): Whether to print non-error-related output to stdout. RETURNS (Tuple[float, float]): Best found threshold with corresponding F-score. """ @@ -86,127 +107,57 @@ def find_threshold( if not data_path.exists(): wasabi.msg.fail("Evaluation data not found", data_path, exits=1) nlp = util.load_model(model) - pipe: Optional[Pipe] = None - selected_pipe_name: Optional[str] = pipe_name - if average not in ("micro", "macro"): - wasabi.msg.fail( - "Expected 'micro' or 'macro' for F-score averaging method, received '{avg_method}'.", - exits=1, - ) - - for _pipe_name, _pipe in nlp.pipeline: - # todo instead of instance check, assert _pipe has a .threshold arg - # won't work, actually. e.g. spancat doesn't .threshold. - if _pipe_name == pipe_name: - if not isinstance(_pipe, MultiLabel_TextCategorizer): - wasabi.msg.fail( - "Specified component '{component}' is not of type `MultiLabel_TextCategorizer`.".format( - component=pipe_name - ), - exits=1, - ) - pipe = _pipe - break - - if pipe is None: - wasabi.msg.fail( - f"No component with name {pipe_name} found in pipeline.", exits=1 - ) - # This is purely for MyPy. Type checking is done in loop above already. - assert isinstance(pipe, MultiLabel_TextCategorizer) + try: + pipe = nlp.get_pipe(pipe_name) + except KeyError as err: + wasabi.msg.fail(title=str(err), exits=1) - if silent: - print( - f"Searching threshold with the best {average} F-score for component '{selected_pipe_name}' with {n_trials} " + if not silent: + wasabi.msg.info( + title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} " f"trials and beta = {beta}." ) - thresholds = numpy.linspace(0, 1, n_trials) - # todo use Scorer.score_cats. 
possibly to be extended? - ref_pos_counts = {label: 0 for label in pipe.labels} - pred_pos_counts = { - t: {True: ref_pos_counts.copy(), False: ref_pos_counts.copy()} - for t in thresholds - } - f_scores_per_label = {t: {label: 0.0 for label in pipe.labels} for t in thresholds} - f_scores = {t: 0.0 for t in thresholds} - - # Count true/false positives for provided docs. - doc_bin = DocBin() - doc_bin.from_disk(data_path) - for ref_doc in doc_bin.get_docs(nlp.vocab): - for label, score in ref_doc.cats.items(): - if score not in (0, 1): - wasabi.msg.fail( - f"Expected category scores in evaluation dataset to be 0 <= x <= 1, received {score}.", - exits=1, - ) - ref_pos_counts[label] += ref_doc.cats[label] == 1 - - pred_doc = nlp(ref_doc.text) - # Collect count stats per threshold value and label. - for threshold in thresholds: - for label, score in pred_doc.cats.items(): - if label not in pipe.labels: - continue - label_value = int(score >= threshold) - if label_value == ref_doc.cats[label] == 1: - pred_pos_counts[threshold][True][label] += 1 - elif label_value == 1 and ref_doc.cats[label] == 0: - pred_pos_counts[threshold][False][label] += 1 - - # Compute F-scores. - for threshold in thresholds: - for label in ref_pos_counts: - n_pos_preds = ( - pred_pos_counts[threshold][True][label] - + pred_pos_counts[threshold][False][label] - ) - precision = ( - (pred_pos_counts[threshold][True][label] / n_pos_preds) - if n_pos_preds > 0 - else 0 - ) - recall = pred_pos_counts[threshold][True][label] / ref_pos_counts[label] - f_scores_per_label[threshold][label] = ( - ( - (1 + beta**2) - * (precision * recall / (precision * beta**2 + recall)) - ) - if precision - else 0 + # Load evaluation corpus. + corpus = Corpus(data_path, gold_preproc=gold_preproc) + dev_dataset = list(corpus(nlp)) + config_keys = threshold_key.split(".") + + def set_nested_item( + config: Dict[str, Any], keys: List[str], value: float + ) -> Dict[str, Any]: + """Set item in nested dictionary. Adapated from https://stackoverflow.com/a/54138200. + config (Dict[str, Any]): Configuration dictionary. + keys (List[Any]): + value (float): Value to set. + RETURNS (Dict[str, Any]): Updated dictionary. + """ + functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value + return config + + # Evaluate with varying threshold values. + scores: Dict[float, float] = {} + for threshold in numpy.linspace(0, 1, n_trials): + pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold) + scores[threshold] = nlp.evaluate(dev_dataset)[scores_key] + if not ( + isinstance(scores[threshold], float) or isinstance(scores[threshold], int) + ): + wasabi.msg.fail( + f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric " + f"scores.", + exits=1, ) - # Aggregate F-scores. 
- if average == "micro": - f_scores[threshold] = sum( - [ - f_scores_per_label[threshold][label] * ref_pos_counts[label] - for label in ref_pos_counts - ] - ) / sum(ref_pos_counts.values()) - else: - f_scores[threshold] = sum( - [f_scores_per_label[threshold][label] for label in ref_pos_counts] - ) / len(ref_pos_counts) - - best_threshold = max(f_scores.keys(), key=(lambda key: f_scores[key])) - if silent: + best_threshold = max(scores.keys(), key=(lambda key: scores[key])) + if not silent: print( - f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}.", - wasabi.tables.table( - data=[ - (threshold, label, f_score) - for threshold, label_f_scores in f_scores_per_label.items() - for label, f_score in label_f_scores.items() - ], - header=["Threshold", "Label", "F-Score"], - ), + f"Best threshold: {round(best_threshold, ndigits=4)} with value of {scores[best_threshold]}.", wasabi.tables.table( - data=[(threshold, f_score) for threshold, f_score in f_scores.items()], - header=["Threshold", f"F-Score ({average})"], + data=[(threshold, score) for threshold, score in scores.items()], + header=["Threshold", f"{scores_key}"], ), ) - return best_threshold, f_scores[best_threshold] + return best_threshold, scores[best_threshold] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 48cc364f00f..b0d173fdf7c 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,6 +1,6 @@ import os import math -from typing import Counter, Iterable, Tuple, List +from typing import Counter, Tuple, List, Dict, Any import numpy import pytest @@ -36,7 +36,7 @@ from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs -from spacy.pipeline import TextCategorizer, Pipe +from spacy.pipeline import TextCategorizer, Pipe, SpanCategorizer from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config from ..cli.init_pipeline import _init_labels @@ -860,38 +860,55 @@ def test_span_length_freq_dist_output_must_be_correct(): def test_cli_find_threshold(capsys): - def make_get_examples_multi_label(_nlp: Language) -> List[Example]: - return [ - Example.from_dict(_nlp.make_doc(t[0]), t[1]) - for t in [ - ( - "I'm angry and confused", - {"cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}}, - ), - ( - "I'm confused but happy", - {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}, - ), - ] - ] + def make_examples(_nlp: Language) -> List[Example]: + docs: List[Example] = [] + + for t in [ + ( + "I'm angry and confused in the Bank of America.", + { + "cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}, + "spans": {"sc": [(7, 10, "ORG")]}, + }, + ), + ( + "I'm confused but happy in New York.", + { + "cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}, + "spans": {"sc": [(6, 7, "GPE")]}, + }, + ), + ]: + doc = _nlp.make_doc(t[0]) + docs.append(Example.from_dict(doc, t[1])) + + return docs def init_nlp( - component_factory_names: Tuple[str, ...] = (), + components: Tuple[Tuple[str, Dict[str, Any]], ...] 
= () ) -> Tuple[Language, List[Example]]: _nlp = English() + textcat: TextCategorizer = _nlp.add_pipe( # type: ignore + factory_name="textcat_multilabel", + name="tc_multi", + config={"threshold": 0.9}, + ) + textcat_labels = ("ANGRY", "CONFUSED", "HAPPY") + for label in textcat_labels: + textcat.add_label(label) - textcat: TextCategorizer = _nlp.add_pipe(factory_name="textcat_multilabel", name="tc_multi") # type: ignore - textcat.add_label("ANGRY") - textcat.add_label("CONFUSED") - textcat.add_label("HAPPY") - for cfn in component_factory_names: - comp = _nlp.add_pipe(cfn) + # Append additional components to pipeline. + for cfn, comp_config in components: + comp = _nlp.add_pipe(cfn, config=comp_config) if isinstance(comp, TextCategorizer): - comp.add_label("dummy") + for label in textcat_labels: + comp.add_label(label) + if isinstance(comp, SpanCategorizer): + comp.add_label("GPE") + comp.add_label("ORG") _nlp.initialize() - - _examples = make_get_examples_multi_label(_nlp) + _examples = make_examples(_nlp) for i in range(5): _nlp.update(_examples) @@ -903,77 +920,63 @@ def init_nlp( # mostly as a smoke test. nlp, examples = init_nlp() DocBin(docs=[example.reference for example in examples]).to_disk( - docs_dir / "docs" + docs_dir / "docs.spacy" ) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) assert ( - find_threshold(nlp_dir, docs_dir / "docs", verbose=False)[0] + find_threshold( + model=nlp_dir, + data_path=docs_dir / "docs.spacy", + pipe_name="tc_multi", + threshold_key="threshold", + scores_key="cats_macro_f", + silent=True, + )[0] == numpy.linspace(0, 1, 10)[1] ) + # todo fix spancat test # Specifying name of non-MultiLabel_TextCategorizer component should fail. - nlp, _ = init_nlp(("sentencizer",)) - with make_tempdir() as nlp_dir: - nlp.to_disk(nlp_dir) - with pytest.raises(SystemExit) as error: - find_threshold(nlp_dir, docs_dir / "docs", pipe_name="sentencizer") - assert error.value.code == 1 - - # Having multiple textcat_multilabel components without specifying the name should fail. - nlp, _ = init_nlp(("textcat_multilabel",)) - with make_tempdir() as nlp_dir: - nlp.to_disk(nlp_dir) - with pytest.raises(SystemExit) as error: - find_threshold(nlp_dir, docs_dir / "docs") - assert error.value.code == 1 - - # Having multiple textcat_multilabel components should work when specifying the name. - nlp, _ = init_nlp(("textcat_multilabel",)) + nlp, _ = init_nlp((("spancat", {"spans_key": "sc", "threshold": 0.5}),)) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) assert ( find_threshold( - nlp_dir, docs_dir / "docs", pipe_name="tc_multi", verbose=False + model=nlp_dir, + data_path=docs_dir / "docs.spacy", + pipe_name="spancat", + threshold_key="threshold", + scores_key="spans_sc_f", + silent=True, )[0] == numpy.linspace(0, 1, 10)[1] ) - # Specifying the name of an non-existing pipe should fail. - nlp, _ = init_nlp() - with make_tempdir() as nlp_dir: - nlp.to_disk(nlp_dir) - with pytest.raises(SystemExit) as error: - find_threshold(nlp_dir, docs_dir / "docs", pipe_name="_") - assert error.value.code == 1 - - # Using a pipe with no textcat components should fail. - nlp = English() + # Having multiple textcat_multilabel components should work, since the name has to be specified. 
+ nlp, _ = init_nlp((("textcat_multilabel", {}),)) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) - with pytest.raises(SystemExit) as error: - find_threshold(nlp_dir, docs_dir / "docs") - assert error.value.code == 1 + assert find_threshold( + model=nlp_dir, + data_path=docs_dir / "docs.spacy", + pipe_name="tc_multi", + threshold_key="threshold", + scores_key="cats_macro_f", + silent=True, + ) - # Specifying scores not in range 0 <= x <= 1 should fail. + # Specifying the name of an non-existing pipe should fail. nlp, _ = init_nlp() - DocBin( - docs=[ - Example.from_dict(nlp.make_doc(t[0]), t[1]).reference - for t in [ - ( - "I'm angry and confused", - {"cats": {"ANGRY": 1.0, "CONFUSED": 2.0, "HAPPY": 0.0}}, - ), - ( - "I'm confused but happy", - {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}, - ), - ] - ] - ).to_disk(docs_dir / "docs") with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) with pytest.raises(SystemExit) as error: - find_threshold(nlp_dir, docs_dir / "docs") + find_threshold( + model=nlp_dir, + data_path=docs_dir / "docs.spacy", + pipe_name="_", + threshold_key="threshold", + scores_key="cats_macro_f", + silent=True, + ) assert error.value.code == 1 From 51863cd267aed749bb192f9badf001f4e01b711e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 1 Sep 2022 16:01:53 +0200 Subject: [PATCH 09/35] Fix Spancat test. --- spacy/tests/test_cli.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index b0d173fdf7c..ed16ea37b18 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -865,17 +865,17 @@ def make_examples(_nlp: Language) -> List[Example]: for t in [ ( - "I'm angry and confused in the Bank of America.", + "I am angry and confused in the Bank of America.", { "cats": {"ANGRY": 1.0, "CONFUSED": 1.0, "HAPPY": 0.0}, - "spans": {"sc": [(7, 10, "ORG")]}, + "spans": {"sc": [(31, 46, "ORG")]}, }, ), ( - "I'm confused but happy in New York.", + "I am confused but happy in New York.", { "cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}, - "spans": {"sc": [(6, 7, "GPE")]}, + "spans": {"sc": [(27, 35, "GPE")]}, }, ), ]: @@ -903,12 +903,9 @@ def init_nlp( if isinstance(comp, TextCategorizer): for label in textcat_labels: comp.add_label(label) - if isinstance(comp, SpanCategorizer): - comp.add_label("GPE") - comp.add_label("ORG") - _nlp.initialize() _examples = make_examples(_nlp) + _nlp.initialize(get_examples=lambda: _examples) for i in range(5): _nlp.update(_examples) @@ -936,9 +933,8 @@ def init_nlp( == numpy.linspace(0, 1, 10)[1] ) - # todo fix spancat test # Specifying name of non-MultiLabel_TextCategorizer component should fail. - nlp, _ = init_nlp((("spancat", {"spans_key": "sc", "threshold": 0.5}),)) + nlp, _ = init_nlp((("spancat", {}),)) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) assert ( From ea9737a664bfeda6003bd904e914afc26234ecba Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 1 Sep 2022 16:52:50 +0200 Subject: [PATCH 10/35] Add beta parameter to Scorer and PRFScore. 
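
With `beta` exposed, `PRFScore.fscore` becomes the general F-beta score,
(1 + beta**2) * P * R / (beta**2 * P + R), which falls back to the usual F1 for
beta == 1. As an illustrative example with made-up counts: tp=6, fp=2, fn=4 gives
P = 0.75 and R = 0.6, so F1 is roughly 0.667 while F2 = 0.625; a larger beta weights
recall more heavily, and recall is the weaker of the two here.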
--- spacy/cli/find_threshold.py | 21 ++++++++----- spacy/errors.py | 1 + spacy/scorer.py | 60 +++++++++++++++++++++---------------- 3 files changed, 50 insertions(+), 32 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index fe3bdedae52..1641c2d0458 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -7,6 +7,8 @@ import numpy import wasabi.tables +from ..pipeline import TrainablePipe, Pipe +from ..errors import Errors from ..training import Corpus from ._util import app, Arg, Opt, import_code, setup_gpu from .. import util @@ -47,7 +49,7 @@ def find_threshold_cli( threshold_key (str): Key of threshold attribute in component's configuration. scores_key (str): Name of score to metric to optimize. n_trials (int): Number of trials to determine optimal thresholds - beta (float): Beta for F1 calculation. + beta (float): Beta for F-score calculation. code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported. use_gpu (int): GPU ID or -1 for CPU. gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the @@ -79,10 +81,10 @@ def find_threshold( threshold_key: str, scores_key: str, *, - n_trials: int = _DEFAULTS["n_trials"], - beta: float = _DEFAULTS["beta"], - use_gpu: int = _DEFAULTS["use_gpu"], - gold_preproc: bool = _DEFAULTS["gold_preproc"], + n_trials: int = _DEFAULTS["n_trials"], # type: ignore + beta: float = _DEFAULTS["beta"], # type: ignore + use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore + gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore silent: bool = True, ) -> Tuple[float, float]: """ @@ -93,7 +95,7 @@ def find_threshold( threshold_key (str): Key of threshold attribute in component's configuration. scores_key (str): Name of score to metric to optimize. n_trials (int): Number of trials to determine optimal thresholds. - beta (float): Beta for F1 calculation. + beta (float): Beta for F-score calculation. use_gpu (int): GPU ID or -1 for CPU. gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the tokenization, and may result in sequences of more consistent length. 
However, it may reduce runtime accuracy due @@ -108,10 +110,13 @@ def find_threshold( wasabi.msg.fail("Evaluation data not found", data_path, exits=1) nlp = util.load_model(model) + pipe: Optional[Pipe] = None try: pipe = nlp.get_pipe(pipe_name) except KeyError as err: wasabi.msg.fail(title=str(err), exits=1) + if not isinstance(pipe, TrainablePipe): + raise TypeError(Errors.E1044) if not silent: wasabi.msg.info( @@ -140,7 +145,9 @@ def set_nested_item( scores: Dict[float, float] = {} for threshold in numpy.linspace(0, 1, n_trials): pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold) - scores[threshold] = nlp.evaluate(dev_dataset)[scores_key] + scores[threshold] = nlp.evaluate(dev_dataset, scorer_cfg={"beta": beta})[ + scores_key + ] if not ( isinstance(scores[threshold], float) or isinstance(scores[threshold], int) ): diff --git a/spacy/errors.py b/spacy/errors.py index fd412a4da8a..18d3cd5f250 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -939,6 +939,7 @@ class Errors(metaclass=ErrorsWithCodes): "`{arg2}`={arg2_values} but these arguments are conflicting.") E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got " "{value}.") + E1044 = ("Only components of type `TrainablePipe` are supported by `find_threshold()`.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/scorer.py b/spacy/scorer.py index 8cd755ac40c..3bb3c5cab6c 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -22,15 +22,12 @@ class PRFScore: """A precision / recall / F score.""" def __init__( - self, - *, - tp: int = 0, - fp: int = 0, - fn: int = 0, + self, *, tp: int = 0, fp: int = 0, fn: int = 0, beta: float = 1 ) -> None: self.tp = tp self.fp = fp self.fn = fn + self.beta = beta def __len__(self) -> int: return self.tp + self.fp + self.fn @@ -42,8 +39,12 @@ def __iadd__(self, other): return self def __add__(self, other): + assert self.beta == other.beta return PRFScore( - tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn + tp=self.tp + other.tp, + fp=self.fp + other.fp, + fn=self.fn + other.fn, + beta=self.beta, ) def score_set(self, cand: set, gold: set) -> None: @@ -63,7 +64,7 @@ def recall(self) -> float: def fscore(self) -> float: p = self.precision r = self.recall - return 2 * ((p * r) / (p + r + 1e-100)) + return (1 + self.beta**2) * ((p * r) / ((self.beta**2 * p) + r + 1e-100)) def to_dict(self) -> Dict[str, float]: return {"p": self.precision, "r": self.recall, "f": self.fscore} @@ -101,6 +102,8 @@ def score(self): class Scorer: """Compute evaluation scores.""" + BETA = 1 + def __init__( self, nlp: Optional["Language"] = None, @@ -149,8 +152,9 @@ def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]: DOCS: https://spacy.io/api/scorer#score_tokenization """ - acc_score = PRFScore() - prf_score = PRFScore() + beta = cfg.get("beta", Scorer.BETA) + acc_score = PRFScore(beta=beta) + prf_score = PRFScore(beta=beta) for example in examples: gold_doc = example.reference pred_doc = example.predicted @@ -210,7 +214,7 @@ def score_token_attr( DOCS: https://spacy.io/api/scorer#score_token_attr """ - tag_score = PRFScore() + tag_score = PRFScore(beta=cfg.get("beta", Scorer.BETA)) for example in examples: gold_doc = example.reference pred_doc = example.predicted @@ -261,7 +265,8 @@ def score_token_attr_per_feat( key attr_micro_p/r/f and the per-feat PRF scores under attr_per_feat. 
""" - micro_score = PRFScore() + beta = cfg.get("beta", Scorer.BETA) + micro_score = PRFScore(beta=beta) per_feat = {} for example in examples: pred_doc = example.predicted @@ -276,7 +281,7 @@ def score_token_attr_per_feat( for feat in morph.split(Morphology.FEATURE_SEP): field, values = feat.split(Morphology.FIELD_SEP) if field not in per_feat: - per_feat[field] = PRFScore() + per_feat[field] = PRFScore(beta=beta) if field not in gold_per_feat: gold_per_feat[field] = set() gold_per_feat[field].add((gold_i, feat)) @@ -298,7 +303,7 @@ def score_token_attr_per_feat( for feat in morph.split(Morphology.FEATURE_SEP): field, values = feat.split(Morphology.FIELD_SEP) if field not in per_feat: - per_feat[field] = PRFScore() + per_feat[field] = PRFScore(beta=beta) if field not in pred_per_feat: pred_per_feat[field] = set() pred_per_feat[field].add((gold_i, feat)) @@ -353,7 +358,8 @@ def score_spans( DOCS: https://spacy.io/api/scorer#score_spans """ - score = PRFScore() + beta = cfg.get("beta", Scorer.BETA) + score = PRFScore(beta=beta) score_per_type = dict() for example in examples: pred_doc = example.predicted @@ -372,7 +378,7 @@ def score_spans( gold_per_type: Dict[str, Set] = {label: set() for label in labels} for label in labels: if label not in score_per_type: - score_per_type[label] = PRFScore() + score_per_type[label] = PRFScore(beta=beta) # Find all predidate labels, for all and per type gold_spans = set() pred_spans = set() @@ -469,9 +475,10 @@ def score_cats( DOCS: https://spacy.io/api/scorer#score_cats """ + beta = cfg.get("beta", Scorer.BETA) if threshold is None: threshold = 0.5 if multi_label else 0.0 - f_per_type = {label: PRFScore() for label in labels} + f_per_type = {label: PRFScore(beta=beta) for label in labels} auc_per_type = {label: ROCAUCScore() for label in labels} labels = set(labels) if labels: @@ -519,7 +526,7 @@ def score_cats( pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) if pred_score >= threshold: f_per_type[pred_label].fp += 1 - micro_prf = PRFScore() + micro_prf = PRFScore(beta=beta) for label_prf in f_per_type.values(): micro_prf.tp += label_prf.tp micro_prf.fn += label_prf.fn @@ -576,6 +583,7 @@ def score_links( DOCS: https://spacy.io/api/scorer#score_links """ + beta = cfg.get("beta", Scorer.BETA) f_per_type = {} for example in examples: gold_ent_by_offset = {} @@ -589,7 +597,7 @@ def score_links( if gold_span is not None: label = gold_span.label_ if label not in f_per_type: - f_per_type[label] = PRFScore() + f_per_type[label] = PRFScore(beta=beta) gold = gold_span.kb_id_ # only evaluating entities that overlap between gold and pred, # to disentangle the performance of the NEL from the NER @@ -608,7 +616,7 @@ def score_links( # a wrong prediction (e.g. 
Q42 != Q3) counts as both a FP as well as a FN f_per_type[label].fp += 1 f_per_type[label].fn += 1 - micro_prf = PRFScore() + micro_prf = PRFScore(beta=beta) for label_prf in f_per_type.values(): micro_prf.tp += label_prf.tp micro_prf.fn += label_prf.fn @@ -663,8 +671,9 @@ def score_deps( DOCS: https://spacy.io/api/scorer#score_deps """ - unlabelled = PRFScore() - labelled = PRFScore() + beta = cfg.get("beta", Scorer.BETA) + unlabelled = PRFScore(beta=beta) + labelled = PRFScore(beta=beta) labelled_per_dep = dict() missing_indices = set() for example in examples: @@ -680,7 +689,7 @@ def score_deps( if dep not in ignore_labels: gold_deps.add((gold_i, head.i, dep)) if dep not in labelled_per_dep: - labelled_per_dep[dep] = PRFScore() + labelled_per_dep[dep] = PRFScore(beta=beta) if dep not in gold_deps_per_dep: gold_deps_per_dep[dep] = set() gold_deps_per_dep[dep].add((gold_i, head.i, dep)) @@ -711,7 +720,7 @@ def score_deps( else: pred_deps.add((gold_i, gold_head, dep)) if dep not in labelled_per_dep: - labelled_per_dep[dep] = PRFScore() + labelled_per_dep[dep] = PRFScore(beta=beta) if dep not in pred_deps_per_dep: pred_deps_per_dep[dep] = set() pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) @@ -742,6 +751,7 @@ def score_deps( def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Compute micro-PRF and per-entity PRF scores for a sequence of examples.""" score_per_type = defaultdict(PRFScore) + beta = kwargs.get("beta", Scorer.BETA) for eg in examples: if not eg.y.has_annotation("ENT_IOB"): continue @@ -749,7 +759,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: align_x2y = eg.alignment.x2y for pred_ent in eg.x.ents: if pred_ent.label_ not in score_per_type: - score_per_type[pred_ent.label_] = PRFScore() + score_per_type[pred_ent.label_] = PRFScore(beta=beta) indices = align_x2y[pred_ent.start : pred_ent.end] if len(indices): g_span = eg.y[indices[0] : indices[-1] + 1] @@ -765,7 +775,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: score_per_type[pred_ent.label_].fp += 1 for label, start, end in golds: score_per_type[label].fn += 1 - totals = PRFScore() + totals = PRFScore(beta=beta) for prf in score_per_type.values(): totals += prf if len(totals) > 0: From 110850f095c9c328be3b4559b78733198b8e40e6 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 2 Sep 2022 12:35:46 +0200 Subject: [PATCH 11/35] Make beta a component scorer setting. 
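Instead of threading `beta` through `scorer_cfg`, expose it on the registered scorer factories (`spacy.textcat_multilabel_scorer.v1`, `spacy.spancat_scorer.v1`), which now return a `functools.partial` with the chosen beta; `find_threshold` rebinds the component's scorer with the requested value. A sketch of how a component could pick a non-default beta once this lands; the config layout shown is illustrative, only the `beta` argument itself is part of this patch:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe(
        "textcat_multilabel",
        config={
            # Illustrative override of the component's scorer; `beta` is the new setting.
            "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1", "beta": 2.0},
        },
    )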
--- spacy/cli/find_threshold.py | 9 ++++++--- spacy/errors.py | 3 ++- spacy/pipeline/spancat.py | 5 +++-- spacy/pipeline/textcat_multilabel.py | 5 +++-- spacy/scorer.py | 8 +++++--- spacy/tests/test_cli.py | 4 ++-- 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 1641c2d0458..6ce4f732133 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -1,4 +1,5 @@ import functools +from functools import partial import operator from pathlib import Path import logging @@ -115,8 +116,12 @@ def find_threshold( pipe = nlp.get_pipe(pipe_name) except KeyError as err: wasabi.msg.fail(title=str(err), exits=1) + if not isinstance(pipe, TrainablePipe): raise TypeError(Errors.E1044) + if not hasattr(pipe, "scorer"): + raise AttributeError(Errors.E1045) + setattr(pipe, "scorer", partial(pipe.scorer.func, beta=beta)) if not silent: wasabi.msg.info( @@ -145,9 +150,7 @@ def set_nested_item( scores: Dict[float, float] = {} for threshold in numpy.linspace(0, 1, n_trials): pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold) - scores[threshold] = nlp.evaluate(dev_dataset, scorer_cfg={"beta": beta})[ - scores_key - ] + scores[threshold] = nlp.evaluate(dev_dataset)[scores_key] if not ( isinstance(scores[threshold], float) or isinstance(scores[threshold], int) ): diff --git a/spacy/errors.py b/spacy/errors.py index 18d3cd5f250..08ab40987c4 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -939,7 +939,8 @@ class Errors(metaclass=ErrorsWithCodes): "`{arg2}`={arg2_values} but these arguments are conflicting.") E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got " "{value}.") - E1044 = ("Only components of type `TrainablePipe` are supported by `find_threshold()`.") + E1044 = ("`find_threshold()` only supports components of type `TrainablePipe`.") + E1045 = ("`find_threshold()` only supports components with a `scorer` attribute.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 1b7a9eecb9b..ce34a20d0d6 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,3 +1,4 @@ +from functools import partial from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer @@ -165,8 +166,8 @@ def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: @registry.scorers("spacy.spancat_scorer.v1") -def make_spancat_scorer(): - return spancat_score +def make_spancat_scorer(beta: float = 1.0): + return partial(spancat_score, beta=beta) class SpanCategorizer(TrainablePipe): diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index e33a885f833..cb9ae77199e 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,3 +1,4 @@ +from functools import partial from typing import Iterable, Optional, Dict, List, Callable, Any from thinc.types import Floats2d from thinc.api import Model, Config @@ -121,8 +122,8 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, @registry.scorers("spacy.textcat_multilabel_scorer.v1") -def make_textcat_multilabel_scorer(): - return textcat_multilabel_score +def make_textcat_multilabel_scorer(beta: float = 1.0): + return partial(textcat_multilabel_score, beta=beta) class 
MultiLabel_TextCategorizer(TextCategorizer): diff --git a/spacy/scorer.py b/spacy/scorer.py index 3bb3c5cab6c..0a893fcce5b 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -102,7 +102,7 @@ def score(self): class Scorer: """Compute evaluation scores.""" - BETA = 1 + BETA = 1.0 def __init__( self, @@ -336,6 +336,7 @@ def score_spans( has_annotation: Optional[Callable[[Doc], bool]] = None, labeled: bool = True, allow_overlap: bool = False, + beta: float = 1.0, **cfg, ) -> Dict[str, Any]: """Returns PRF scores for labeled spans. @@ -353,12 +354,12 @@ def score_spans( equal if their start and end match, irrespective of their label. allow_overlap (bool): Whether or not to allow overlapping spans. If set to 'False', the alignment will automatically resolve conflicts. + beta (float): Beta coefficient for F-score calculation. Defaults to 1.0. RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under the keys attr_p/r/f and the per-type PRF scores under attr_per_type. DOCS: https://spacy.io/api/scorer#score_spans """ - beta = cfg.get("beta", Scorer.BETA) score = PRFScore(beta=beta) score_per_type = dict() for example in examples: @@ -439,6 +440,7 @@ def score_cats( multi_label: bool = True, positive_label: Optional[str] = None, threshold: Optional[float] = None, + beta: float = 1.0, **cfg, ) -> Dict[str, Any]: """Returns PRF and ROC AUC scores for a doc-level attribute with a @@ -458,6 +460,7 @@ def score_cats( threshold (float): Cutoff to consider a prediction "positive". Defaults to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring) otherwise. + beta (float): Beta coefficient for F-score calculation. RETURNS (Dict[str, Any]): A dictionary containing the scores, with inapplicable scores as None: for all: @@ -475,7 +478,6 @@ def score_cats( DOCS: https://spacy.io/api/scorer#score_cats """ - beta = cfg.get("beta", Scorer.BETA) if threshold is None: threshold = 0.5 if multi_label else 0.0 f_per_type = {label: PRFScore(beta=beta) for label in labels} diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index ed16ea37b18..733c7c87653 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -928,12 +928,12 @@ def init_nlp( pipe_name="tc_multi", threshold_key="threshold", scores_key="cats_macro_f", - silent=True, + silent=False, )[0] == numpy.linspace(0, 1, 10)[1] ) - # Specifying name of non-MultiLabel_TextCategorizer component should fail. + # Test with spancat. nlp, _ = init_nlp((("spancat", {}),)) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) From 24b69a1be81c53519f65535ffe8f355cde0232a7 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 2 Sep 2022 14:21:06 +0200 Subject: [PATCH 12/35] Remove beta. 
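Back out the `beta` argument from the scorer factories and the `Scorer.score_*` helpers; `PRFScore` keeps its `beta` parameter with a default of 1, and with that default the computed F-score is ordinary F1. Quick check with illustrative numbers:

    # With beta=1 the F-beta formula reduces to plain F1 (values are illustrative).
    p, r = 0.5, 1.0
    f1 = 2 * p * r / (p + r)
    fbeta = (1 + 1**2) * (p * r) / ((1**2 * p) + r)
    assert abs(f1 - fbeta) < 1e-12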
--- spacy/pipeline/spancat.py | 9 +++--- spacy/pipeline/textcat_multilabel.py | 5 ++- spacy/scorer.py | 47 +++++++++++----------------- 3 files changed, 24 insertions(+), 37 deletions(-) diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index ce34a20d0d6..4ddd6c9fff1 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,8 +1,7 @@ -from functools import partial -from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer -from thinc.types import Ragged, Ints2d, Floats2d, Ints1d +from thinc.types import Ragged, Ints2d, Floats2d import numpy @@ -166,8 +165,8 @@ def spancat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: @registry.scorers("spacy.spancat_scorer.v1") -def make_spancat_scorer(beta: float = 1.0): - return partial(spancat_score, beta=beta) +def make_spancat_scorer(): + return spancat_score class SpanCategorizer(TrainablePipe): diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index cb9ae77199e..e33a885f833 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,4 +1,3 @@ -from functools import partial from typing import Iterable, Optional, Dict, List, Callable, Any from thinc.types import Floats2d from thinc.api import Model, Config @@ -122,8 +121,8 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str, @registry.scorers("spacy.textcat_multilabel_scorer.v1") -def make_textcat_multilabel_scorer(beta: float = 1.0): - return partial(textcat_multilabel_score, beta=beta) +def make_textcat_multilabel_scorer(): + return textcat_multilabel_score class MultiLabel_TextCategorizer(TextCategorizer): diff --git a/spacy/scorer.py b/spacy/scorer.py index 0a893fcce5b..95258af9e17 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -102,8 +102,6 @@ def score(self): class Scorer: """Compute evaluation scores.""" - BETA = 1.0 - def __init__( self, nlp: Optional["Language"] = None, @@ -152,9 +150,8 @@ def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]: DOCS: https://spacy.io/api/scorer#score_tokenization """ - beta = cfg.get("beta", Scorer.BETA) - acc_score = PRFScore(beta=beta) - prf_score = PRFScore(beta=beta) + acc_score = PRFScore() + prf_score = PRFScore() for example in examples: gold_doc = example.reference pred_doc = example.predicted @@ -214,7 +211,7 @@ def score_token_attr( DOCS: https://spacy.io/api/scorer#score_token_attr """ - tag_score = PRFScore(beta=cfg.get("beta", Scorer.BETA)) + tag_score = PRFScore() for example in examples: gold_doc = example.reference pred_doc = example.predicted @@ -265,8 +262,7 @@ def score_token_attr_per_feat( key attr_micro_p/r/f and the per-feat PRF scores under attr_per_feat. 
""" - beta = cfg.get("beta", Scorer.BETA) - micro_score = PRFScore(beta=beta) + micro_score = PRFScore() per_feat = {} for example in examples: pred_doc = example.predicted @@ -281,7 +277,7 @@ def score_token_attr_per_feat( for feat in morph.split(Morphology.FEATURE_SEP): field, values = feat.split(Morphology.FIELD_SEP) if field not in per_feat: - per_feat[field] = PRFScore(beta=beta) + per_feat[field] = PRFScore() if field not in gold_per_feat: gold_per_feat[field] = set() gold_per_feat[field].add((gold_i, feat)) @@ -303,7 +299,7 @@ def score_token_attr_per_feat( for feat in morph.split(Morphology.FEATURE_SEP): field, values = feat.split(Morphology.FIELD_SEP) if field not in per_feat: - per_feat[field] = PRFScore(beta=beta) + per_feat[field] = PRFScore() if field not in pred_per_feat: pred_per_feat[field] = set() pred_per_feat[field].add((gold_i, feat)) @@ -336,7 +332,6 @@ def score_spans( has_annotation: Optional[Callable[[Doc], bool]] = None, labeled: bool = True, allow_overlap: bool = False, - beta: float = 1.0, **cfg, ) -> Dict[str, Any]: """Returns PRF scores for labeled spans. @@ -354,13 +349,12 @@ def score_spans( equal if their start and end match, irrespective of their label. allow_overlap (bool): Whether or not to allow overlapping spans. If set to 'False', the alignment will automatically resolve conflicts. - beta (float): Beta coefficient for F-score calculation. Defaults to 1.0. RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under the keys attr_p/r/f and the per-type PRF scores under attr_per_type. DOCS: https://spacy.io/api/scorer#score_spans """ - score = PRFScore(beta=beta) + score = PRFScore() score_per_type = dict() for example in examples: pred_doc = example.predicted @@ -379,7 +373,7 @@ def score_spans( gold_per_type: Dict[str, Set] = {label: set() for label in labels} for label in labels: if label not in score_per_type: - score_per_type[label] = PRFScore(beta=beta) + score_per_type[label] = PRFScore() # Find all predidate labels, for all and per type gold_spans = set() pred_spans = set() @@ -440,7 +434,6 @@ def score_cats( multi_label: bool = True, positive_label: Optional[str] = None, threshold: Optional[float] = None, - beta: float = 1.0, **cfg, ) -> Dict[str, Any]: """Returns PRF and ROC AUC scores for a doc-level attribute with a @@ -460,7 +453,6 @@ def score_cats( threshold (float): Cutoff to consider a prediction "positive". Defaults to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring) otherwise. - beta (float): Beta coefficient for F-score calculation. 
RETURNS (Dict[str, Any]): A dictionary containing the scores, with inapplicable scores as None: for all: @@ -480,7 +472,7 @@ def score_cats( """ if threshold is None: threshold = 0.5 if multi_label else 0.0 - f_per_type = {label: PRFScore(beta=beta) for label in labels} + f_per_type = {label: PRFScore() for label in labels} auc_per_type = {label: ROCAUCScore() for label in labels} labels = set(labels) if labels: @@ -528,7 +520,7 @@ def score_cats( pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1]) if pred_score >= threshold: f_per_type[pred_label].fp += 1 - micro_prf = PRFScore(beta=beta) + micro_prf = PRFScore() for label_prf in f_per_type.values(): micro_prf.tp += label_prf.tp micro_prf.fn += label_prf.fn @@ -585,7 +577,6 @@ def score_links( DOCS: https://spacy.io/api/scorer#score_links """ - beta = cfg.get("beta", Scorer.BETA) f_per_type = {} for example in examples: gold_ent_by_offset = {} @@ -599,7 +590,7 @@ def score_links( if gold_span is not None: label = gold_span.label_ if label not in f_per_type: - f_per_type[label] = PRFScore(beta=beta) + f_per_type[label] = PRFScore() gold = gold_span.kb_id_ # only evaluating entities that overlap between gold and pred, # to disentangle the performance of the NEL from the NER @@ -618,7 +609,7 @@ def score_links( # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN f_per_type[label].fp += 1 f_per_type[label].fn += 1 - micro_prf = PRFScore(beta=beta) + micro_prf = PRFScore() for label_prf in f_per_type.values(): micro_prf.tp += label_prf.tp micro_prf.fn += label_prf.fn @@ -673,9 +664,8 @@ def score_deps( DOCS: https://spacy.io/api/scorer#score_deps """ - beta = cfg.get("beta", Scorer.BETA) - unlabelled = PRFScore(beta=beta) - labelled = PRFScore(beta=beta) + unlabelled = PRFScore() + labelled = PRFScore() labelled_per_dep = dict() missing_indices = set() for example in examples: @@ -691,7 +681,7 @@ def score_deps( if dep not in ignore_labels: gold_deps.add((gold_i, head.i, dep)) if dep not in labelled_per_dep: - labelled_per_dep[dep] = PRFScore(beta=beta) + labelled_per_dep[dep] = PRFScore() if dep not in gold_deps_per_dep: gold_deps_per_dep[dep] = set() gold_deps_per_dep[dep].add((gold_i, head.i, dep)) @@ -722,7 +712,7 @@ def score_deps( else: pred_deps.add((gold_i, gold_head, dep)) if dep not in labelled_per_dep: - labelled_per_dep[dep] = PRFScore(beta=beta) + labelled_per_dep[dep] = PRFScore() if dep not in pred_deps_per_dep: pred_deps_per_dep[dep] = set() pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) @@ -753,7 +743,6 @@ def score_deps( def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: """Compute micro-PRF and per-entity PRF scores for a sequence of examples.""" score_per_type = defaultdict(PRFScore) - beta = kwargs.get("beta", Scorer.BETA) for eg in examples: if not eg.y.has_annotation("ENT_IOB"): continue @@ -761,7 +750,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: align_x2y = eg.alignment.x2y for pred_ent in eg.x.ents: if pred_ent.label_ not in score_per_type: - score_per_type[pred_ent.label_] = PRFScore(beta=beta) + score_per_type[pred_ent.label_] = PRFScore() indices = align_x2y[pred_ent.start : pred_ent.end] if len(indices): g_span = eg.y[indices[0] : indices[-1] + 1] @@ -777,7 +766,7 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: score_per_type[pred_ent.label_].fp += 1 for label, start, end in golds: score_per_type[label].fn += 1 - totals = PRFScore(beta=beta) + totals = PRFScore() for prf in 
score_per_type.values(): totals += prf if len(totals) > 0: From 73432c6bfb53fa3b79b2f6e2a1595009a4b60e6a Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 2 Sep 2022 16:25:57 +0200 Subject: [PATCH 13/35] Update nlp.config (workaround). --- spacy/cli/find_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 6ce4f732133..a451b7d4a36 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -1,5 +1,4 @@ import functools -from functools import partial import operator from pathlib import Path import logging @@ -7,6 +6,7 @@ import numpy import wasabi.tables +from confection import Config from ..pipeline import TrainablePipe, Pipe from ..errors import Errors @@ -121,7 +121,6 @@ def find_threshold( raise TypeError(Errors.E1044) if not hasattr(pipe, "scorer"): raise AttributeError(Errors.E1045) - setattr(pipe, "scorer", partial(pipe.scorer.func, beta=beta)) if not silent: wasabi.msg.info( @@ -150,6 +149,7 @@ def set_nested_item( scores: Dict[float, float] = {} for threshold in numpy.linspace(0, 1, n_trials): pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold) + nlp._pipe_configs[pipe_name] = Config(pipe.cfg) scores[threshold] = nlp.evaluate(dev_dataset)[scores_key] if not ( isinstance(scores[threshold], float) or isinstance(scores[threshold], int) From 20c4a0d6137d1c338e157872274b634cf99f2c2d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 5 Sep 2022 11:39:29 +0200 Subject: [PATCH 14/35] Reload pipeline on threshold change. Adjust tests. Remove confection reference. --- spacy/cli/find_threshold.py | 35 ++++++++++++++++++++++------ spacy/tests/test_cli.py | 46 ++++++++++++++++++++----------------- 2 files changed, 53 insertions(+), 28 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index a451b7d4a36..082fa3b8210 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -6,7 +6,6 @@ import numpy import wasabi.tables -from confection import Config from ..pipeline import TrainablePipe, Pipe from ..errors import Errors @@ -87,7 +86,7 @@ def find_threshold( use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore silent: bool = True, -) -> Tuple[float, float]: +) -> Tuple[float, float, Dict[float, float]]: """ Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric. model (Union[str, Path]): Path to file with trained model. @@ -102,7 +101,8 @@ def find_threshold( tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due to train/test skew. silent (bool): Whether to print non-error-related output to stdout. - RETURNS (Tuple[float, float]): Best found threshold with corresponding F-score. + RETURNS (Tuple[float, float, Dict[float, float]]): Best found threshold, the corresponding score, scores for all + evaluated thresholds. """ setup_gpu(use_gpu, silent=silent) @@ -138,18 +138,39 @@ def set_nested_item( ) -> Dict[str, Any]: """Set item in nested dictionary. Adapated from https://stackoverflow.com/a/54138200. config (Dict[str, Any]): Configuration dictionary. - keys (List[Any]): + keys (List[Any]): Path to value to set. value (float): Value to set. RETURNS (Dict[str, Any]): Updated dictionary. 
""" functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value return config + def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: + """Filters provided config dictionary so that only the specified keys path remains. + config (Dict[str, Any]): Configuration dictionary. + keys (List[Any]): Path to value to set. + RETURNS (Dict[str, Any]): Filtered dictionary. + """ + return { + keys[0]: filter_config(config[keys[0]], keys[1:]) + if len(keys) > 1 + else config[keys[0]] + } + # Evaluate with varying threshold values. scores: Dict[float, float] = {} + config_keys_full = ["components", pipe_name, *config_keys] for threshold in numpy.linspace(0, 1, n_trials): - pipe.cfg = set_nested_item(pipe.cfg, config_keys, threshold) - nlp._pipe_configs[pipe_name] = Config(pipe.cfg) + # Reload pipeline with overrides specifying the new threshold. + nlp = util.load_model( + model, + config=set_nested_item( + filter_config(nlp.config, config_keys_full).copy(), + config_keys_full, + threshold, + ), + ) + nlp.get_pipe(pipe_name).cfg = set_nested_item(pipe.cfg, config_keys, threshold) scores[threshold] = nlp.evaluate(dev_dataset)[scores_key] if not ( isinstance(scores[threshold], float) or isinstance(scores[threshold], int) @@ -170,4 +191,4 @@ def set_nested_item( ), ) - return best_threshold, scores[best_threshold] + return best_threshold, scores[best_threshold], scores diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 733c7c87653..5aa4fe43b76 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -36,7 +36,7 @@ from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs -from spacy.pipeline import TextCategorizer, Pipe, SpanCategorizer +from spacy.pipeline import TextCategorizer from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config from ..cli.init_pipeline import _init_labels @@ -860,6 +860,8 @@ def test_span_length_freq_dist_output_must_be_correct(): def test_cli_find_threshold(capsys): + thresholds = numpy.linspace(0, 1, 10) + def make_examples(_nlp: Language) -> List[Example]: docs: List[Example] = [] @@ -921,33 +923,35 @@ def init_nlp( ) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) - assert ( - find_threshold( - model=nlp_dir, - data_path=docs_dir / "docs.spacy", - pipe_name="tc_multi", - threshold_key="threshold", - scores_key="cats_macro_f", - silent=False, - )[0] - == numpy.linspace(0, 1, 10)[1] + res = find_threshold( + model=nlp_dir, + data_path=docs_dir / "docs.spacy", + pipe_name="tc_multi", + threshold_key="threshold", + scores_key="cats_macro_f", + silent=False, ) + assert res[0] != thresholds[0] + assert thresholds[0] < res[0] < thresholds[9] + assert res[1] == 1.0 + assert res[2][1.0] == 0.0 # Test with spancat. 
nlp, _ = init_nlp((("spancat", {}),)) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) - assert ( - find_threshold( - model=nlp_dir, - data_path=docs_dir / "docs.spacy", - pipe_name="spancat", - threshold_key="threshold", - scores_key="spans_sc_f", - silent=True, - )[0] - == numpy.linspace(0, 1, 10)[1] + res = find_threshold( + model=nlp_dir, + data_path=docs_dir / "docs.spacy", + pipe_name="spancat", + threshold_key="threshold", + scores_key="spans_sc_f", + silent=True, ) + assert res[0] != thresholds[0] + assert thresholds[0] < res[0] < thresholds[8] + assert res[1] == 1.0 + assert res[2][1.0] == 0.0 # Having multiple textcat_multilabel components should work, since the name has to be specified. nlp, _ = init_nlp((("textcat_multilabel", {}),)) From 03666f6e4ef66e22eaf1be9fb289c0f4a20422f7 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 5 Sep 2022 12:19:36 +0200 Subject: [PATCH 15/35] Remove assumption of component being a Pipe object or having a .cfg attribute. --- spacy/cli/find_threshold.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 082fa3b8210..0b8e6fbdbfc 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -7,7 +7,6 @@ import numpy import wasabi.tables -from ..pipeline import TrainablePipe, Pipe from ..errors import Errors from ..training import Corpus from ._util import app, Arg, Opt, import_code, setup_gpu @@ -111,14 +110,11 @@ def find_threshold( wasabi.msg.fail("Evaluation data not found", data_path, exits=1) nlp = util.load_model(model) - pipe: Optional[Pipe] = None + pipe: Optional[Any] = None try: pipe = nlp.get_pipe(pipe_name) except KeyError as err: wasabi.msg.fail(title=str(err), exits=1) - - if not isinstance(pipe, TrainablePipe): - raise TypeError(Errors.E1044) if not hasattr(pipe, "scorer"): raise AttributeError(Errors.E1045) @@ -170,10 +166,16 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: threshold, ), ) - nlp.get_pipe(pipe_name).cfg = set_nested_item(pipe.cfg, config_keys, threshold) + if hasattr(pipe, "cfg"): + setattr( + nlp.get_pipe(pipe_name), + "cfg", + set_nested_item(getattr(pipe, "cfg"), config_keys, threshold), + ) + scores[threshold] = nlp.evaluate(dev_dataset)[scores_key] - if not ( - isinstance(scores[threshold], float) or isinstance(scores[threshold], int) + if not isinstance(scores[threshold], float) and not isinstance( + scores[threshold], int ): wasabi.msg.fail( f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric " From b61cf873b3291c7efc975f1e36b34d8e85654566 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 5 Sep 2022 13:53:19 +0200 Subject: [PATCH 16/35] Adjust test output and reference values. 
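`find_threshold()` now returns the best threshold, the score achieved at that threshold, and the scores for every evaluated threshold, and the tests assert against all three parts. A usage sketch under those assumptions; the model/data paths and pipe name below are illustrative placeholders, not taken from this patch:

    from pathlib import Path
    from spacy.cli.find_threshold import find_threshold

    scores_key = "cats_macro_f"
    best_threshold, best_score, scores = find_threshold(
        model="training/model-best",        # illustrative path
        data_path=Path("corpus/dev.spacy"), # illustrative path
        pipe_name="textcat_multilabel",     # illustrative component name
        threshold_key="threshold",
        scores_key=scores_key,
        silent=True,
    )
    print(f"best threshold: {best_threshold:.2f} ({scores_key}={best_score:.3f})")
    for threshold, score in sorted(scores.items()):
        print(f"{threshold:.1f} -> {score:.3f}")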
--- spacy/tests/test_cli.py | 4 +- spacy/tests/universe/universe.json | 3831 ++++++++++++++++++++++++++++ 2 files changed, 3833 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/universe/universe.json diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 5aa4fe43b76..7b5bc88c3b0 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -929,7 +929,7 @@ def init_nlp( pipe_name="tc_multi", threshold_key="threshold", scores_key="cats_macro_f", - silent=False, + silent=True, ) assert res[0] != thresholds[0] assert thresholds[0] < res[0] < thresholds[9] @@ -950,7 +950,7 @@ def init_nlp( ) assert res[0] != thresholds[0] assert thresholds[0] < res[0] < thresholds[8] - assert res[1] == 1.0 + assert res[1] >= 0.6 assert res[2][1.0] == 0.0 # Having multiple textcat_multilabel components should work, since the name has to be specified. diff --git a/spacy/tests/universe/universe.json b/spacy/tests/universe/universe.json new file mode 100644 index 00000000000..b1a61598ecf --- /dev/null +++ b/spacy/tests/universe/universe.json @@ -0,0 +1,3831 @@ +{ + "resources": [ + { + "id": "spacypdfreader", + "title": "spadypdfreader", + "category": ["pipeline"], + "tags": ["PDF"], + "slogan": "Easy PDF to text to spaCy text extraction in Python.", + "description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built in parsers or bring your own parser. `Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.", + "github": "SamEdwardes/spacypdfreader", + "pip": "spacypdfreader", + "url": "https://samedwardes.github.io/spacypdfreader/", + "code_language": "python", + "author": "Sam Edwardes", + "author_links": { + "twitter": "TheReaLSamlam", + "github": "SamEdwardes", + "website": "https://samedwardes.com" + }, + "code_example": [ + "import spacy", + "from spacypdfreader import pdf_reader", + "", + "nlp = spacy.load('en_core_web_sm')", + "doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)", + "", + "# Get the page number of any token.", + "print(doc[0]._.page_number) # 1", + "print(doc[-1]._.page_number) # 4", + "", + "# Get page meta data about the PDF document.", + "print(doc._.pdf_file_name) # 'tests/data/test_pdf_01.pdf'", + "print(doc._.page_range) # (1, 4)", + "print(doc._.first_page) # 1", + "print(doc._.last_page) # 4", + "", + "# Get all of the text from a specific PDF page.", + "print(doc._.page(4)) # 'able to display the destination page (unless...'" + ] + }, + { + "id": "nlpcloud", + "title": "NLPCloud.io", + "slogan": "Production-ready API for spaCy models in production", + "description": "A highly-available hosted API to easily deploy and use spaCy models in production. 
Supports NER, POS tagging, dependency parsing, and tokenization.", + "github": "nlpcloud", + "pip": "nlpcloud", + "code_example": [ + "import nlpcloud", + "", + "client = nlpcloud.Client('en_core_web_lg', '4eC39HqLyjWDarjtT1zdp7dc')", + "client.entities('John Doe is a Go Developer at Google')", + "# [{'end': 8, 'start': 0, 'text': 'John Doe', 'type': 'PERSON'}, {'end': 25, 'start': 13, 'text': 'Go Developer', 'type': 'POSITION'}, {'end': 35,'start': 30, 'text': 'Google', 'type': 'ORG'}]" + ], + "thumb": "https://avatars.githubusercontent.com/u/77671902", + "image": "https://nlpcloud.io/assets/images/logo.svg", + "code_language": "python", + "author": "NLPCloud.io", + "author_links": { + "github": "nlpcloud", + "twitter": "cloud_nlp", + "website": "https://nlpcloud.io" + }, + "category": ["apis", "nonpython", "standalone"], + "tags": ["api", "deploy", "production"] + }, + { + "id": "eMFDscore", + "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python", + "slogan": "Extended Moral Foundation Dictionary Scoring for Python", + "description": "eMFDscore is a library for the fast and flexible extraction of various moral information metrics from textual input data. eMFDscore is built on spaCy for faster execution and performs minimal preprocessing consisting of tokenization, syntactic dependency parsing, lower-casing, and stopword/punctuation/whitespace removal. eMFDscore lets users score documents with multiple Moral Foundations Dictionaries, provides various metrics for analyzing moral information, and extracts moral patient, agent, and attribute words related to entities.", + "github": "medianeuroscience/emfdscore", + "code_example": [ + "from emfdscore.scoring import score_docs", + "import pandas as pd", + "template_input = pd.read_csv('emfdscore/template_input.csv', header=None)", + "DICT_TYPE = 'emfd'", + "PROB_MAP = 'single'", + "SCORE_METHOD = 'bow'", + "OUT_METRICS = 'vice-virtue'", + "OUT_CSV_PATH = 'single-vv.csv'", + "df = score_docs(template_input,DICT_TYPE,PROB_MAP,SCORE_METHOD,OUT_METRICS,num_docs)" + ], + "code_language": "python", + "author": "Media Neuroscience Lab", + "author_links": { + "github": "medianeuroscience", + "twitter": "medianeuro" + }, + "category": ["research", "teaching"], + "tags": ["morality", "dictionary", "sentiment"] + }, + { + "id": "skweak", + "title": "skweak", + "slogan": "Weak supervision for NLP", + "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. 
Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.", + "github": "NorskRegnesentral/skweak", + "pip": "skweak", + "code_example": [ + "import spacy, re", + "from skweak import heuristics, gazetteers, aggregation, utils", + "", + "# LF 1: heuristic to detect occurrences of MONEY entities", + "def money_detector(doc):", + " for tok in doc[1:]:", + " if tok.text[0].isdigit() and tok.nbor(-1).is_currency:", + " yield tok.i-1, tok.i+1, 'MONEY'", + "lf1 = heuristics.FunctionAnnotator('money', money_detector)", + "", + "# LF 2: detection of years with a regex", + "lf2= heuristics.TokenConstraintAnnotator ('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')", + "", + "# LF 3: a gazetteer with a few names", + "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]", + "trie = gazetteers.Trie(NAMES)", + "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})", + "", + "# We create a corpus (here with a single text)", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')", + "", + "# apply the labelling functions", + "doc = lf3(lf2(lf1(doc)))", + "", + "# and aggregate them", + "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])", + "hmm.fit_and_aggregate([doc])", + "", + "# we can then visualise the final result (in Jupyter)", + "utils.display_entities(doc, 'hmm')" + ], + "code_language": "python", + "url": "https://github.com/NorskRegnesentral/skweak", + "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg", + "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg", + "author": "Pierre Lison", + "author_links": { + "twitter": "plison2", + "github": "plison", + "website": "https://www.nr.no/~plison" + }, + "category": ["pipeline", "standalone", "research", "training"], + "tags": [], + "spacy_version": 3 + }, + { + "id": "numerizer", + "title": "numerizer", + "slogan": "Convert natural language numerics into ints and floats.", + "description": "A SpaCy extension for Docs, Spans and Tokens that converts numerical words and quantitative named entities into numeric strings.", + "github": "jaidevd/numerizer", + "pip": "numerizer", + "code_example": [ + "from spacy import load", + "import numerizer", + "nlp = load('en_core_web_sm') # or any other model", + "doc = nlp('The Hogwarts Express is at platform nine and three quarters')", + "doc._.numerize()", + "# {nine and three quarters: '9.75'}" + ], + "author": "Jaidev Deshpande", + "author_links": { + "github": "jaidevd", + "twitter": "jaidevd" + }, + "category": ["standalone"] + }, + { + "id": "spikex", + "title": "SpikeX - SpaCy Pipes for Knowledge Extraction", + "slogan": "Use SpikeX to build knowledge extraction tools with almost-zero effort", + "description": "SpikeX is a collection of pipes ready to be plugged in a spaCy pipeline. 
It aims to help in building knowledge extraction tools with almost-zero effort.", + "github": "erre-quadro/spikex", + "pip": "spikex", + "code_example": [ + "from spacy import load as spacy_load", + "from spikex.wikigraph import load as wg_load", + "from spikex.pipes import WikiPageX", + "", + "# load a spacy model and get a doc", + "nlp = spacy_load('en_core_web_sm')", + "doc = nlp('An apple a day keeps the doctor away')", + "# load a WikiGraph", + "wg = wg_load('simplewiki_core')", + "# get a WikiPageX and extract all pages", + "wikipagex = WikiPageX(wg)", + "doc = wikipagex(doc)", + "# see all pages extracted from the doc", + "for span in doc._.wiki_spans:", + " print(span._.wiki_pages)" + ], + "category": ["pipeline", "standalone"], + "author": "Erre Quadro", + "author_links": { + "github": "erre-quadro", + "website": "https://www.errequadrosrl.com" + } + }, + { + "id": "spacy-dbpedia-spotlight", + "title": "DBpedia Spotlight for SpaCy", + "slogan": "Use DBpedia Spotlight to link entities inside SpaCy", + "description": "This library links SpaCy with [DBpedia Spotlight](https://www.dbpedia-spotlight.org/). You can easily get the DBpedia entities from your documents, using the public web service or by using your own instance of DBpedia Spotlight. The `doc.ents` are populated with the entities and all their details (URI, type, ...).", + "github": "MartinoMensio/spacy-dbpedia-spotlight", + "pip": "spacy-dbpedia-spotlight", + "code_example": [ + "import spacy_dbpedia_spotlight", + "# load your model as usual", + "nlp = spacy.load('en_core_web_lg')", + "# add the pipeline stage", + "nlp.add_pipe('dbpedia_spotlight')", + "# get the document", + "doc = nlp('The president of USA is calling Boris Johnson to decide what to do about coronavirus')", + "# see the entities", + "print('Entities', [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])", + "# inspect the raw data from DBpedia spotlight", + "print(doc.ents[0]._.dbpedia_raw_result)" + ], + "category": ["models", "pipeline"], + "author": "Martino Mensio", + "author_links": { + "twitter": "MartinoMensio", + "github": "MartinoMensio", + "website": "https://martinomensio.github.io" + } + }, + { + "id": "spacy-textblob", + "title": "spaCyTextBlob", + "slogan": "Easy sentiment analysis for spaCy using TextBlob. Now supports spaCy 3.0!", + "thumb": "https://github.com/SamEdwardes/spaCyTextBlob/raw/main/website/static/img/logo-thumb-square-250x250.png", + "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extensions `._.polarity`, `._.subjectivity`, and `._.assessments` to `Doc`, `Span`, and `Token` objects. For spaCy 2 please use `pip install pip install spacytextblob==0.1.7`", + "github": "SamEdwardes/spaCyTextBlob", + "pip": "spacytextblob", + "code_example": [ + "import spacy", + "from spacytextblob.spacytextblob import SpacyTextBlob", + "", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe('spacytextblob')", + "text = 'I had a really horrible day. It was the worst day ever! 
But every now and then I have a really good day that makes me happy.'", + "doc = nlp(text)", + "doc._.polarity # Polarity: -0.125", + "doc._.subjectivity # Sujectivity: 0.9", + "doc._.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]" + ], + "code_language": "python", + "url": "https://spacytextblob.netlify.app/", + "author": "Sam Edwardes", + "author_links": { + "twitter": "TheReaLSamlam", + "github": "SamEdwardes", + "website": "https://samedwardes.com" + }, + "category": ["pipeline"], + "tags": ["sentiment", "textblob"] + }, + { + "id": "spacy-ray", + "title": "spacy-ray", + "slogan": "Parallel and distributed training with spaCy and Ray", + "description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.", + "github": "explosion/spacy-ray", + "pip": "spacy-ray", + "category": ["training"], + "author": "Explosion / Anyscale", + "thumb": "https://i.imgur.com/7so6ZpS.png" + }, + { + "id": "spacy-sentence-bert", + "title": "spaCy - sentence-transformers", + "slogan": "Pipelines for pretrained sentence-transformers (BERT, RoBERTa, XLM-RoBERTa & Co.) directly within spaCy", + "description": "This library lets you use the embeddings from [sentence-transformers](https://github.com/UKPLab/sentence-transformers) of Docs, Spans and Tokens directly from spaCy. Most models are for the english language but three of them are multilingual.", + "github": "MartinoMensio/spacy-sentence-bert", + "pip": "spacy-sentence-bert", + "code_example": [ + "import spacy_sentence_bert", + "# load one of the models listed at https://github.com/MartinoMensio/spacy-sentence-bert/", + "nlp = spacy_sentence_bert.load_model('en_roberta_large_nli_stsb_mean_tokens')", + "# get two documents", + "doc_1 = nlp('Hi there, how are you?')", + "doc_2 = nlp('Hello there, how are you doing today?')", + "# use the similarity method that is based on the vectors, on Doc, Span or Token", + "print(doc_1.similarity(doc_2[0:7]))" + ], + "category": ["models", "pipeline"], + "author": "Martino Mensio", + "author_links": { + "twitter": "MartinoMensio", + "github": "MartinoMensio", + "website": "https://martinomensio.github.io" + } + }, + { + "id": "spacy-streamlit", + "title": "spacy-streamlit", + "slogan": "spaCy building blocks for Streamlit apps", + "github": "explosion/spacy-streamlit", + "description": "This package contains utilities for visualizing spaCy models and building interactive spaCy-powered apps with [Streamlit](https://streamlit.io). 
It includes various building blocks you can use in your own Streamlit app, like visualizers for **syntactic dependencies**, **named entities**, **text classification**, **semantic similarity** via word vectors, token attributes, and more.", + "pip": "spacy-streamlit", + "category": ["visualizers"], + "thumb": "https://i.imgur.com/mhEjluE.jpg", + "image": "https://user-images.githubusercontent.com/13643239/85388081-f2da8700-b545-11ea-9bd4-e303d3c5763c.png", + "code_example": [ + "import spacy_streamlit", + "", + "models = [\"en_core_web_sm\", \"en_core_web_md\"]", + "default_text = \"Sundar Pichai is the CEO of Google.\"", + "spacy_streamlit.visualize(models, default_text)" + ], + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + } + }, + { + "id": "spaczz", + "title": "spaczz", + "slogan": "Fuzzy matching and more for spaCy.", + "description": "Spaczz provides fuzzy matching and multi-token regex matching functionality for spaCy. Spaczz's components have similar APIs to their spaCy counterparts and spaczz pipeline components can integrate into spaCy pipelines where they can be saved/loaded as models.", + "github": "gandersen101/spaczz", + "pip": "spaczz", + "code_example": [ + "import spacy", + "from spaczz.pipeline import SpaczzRuler", + "", + "nlp = spacy.blank('en')", + "ruler = SpaczzRuler(nlp)", + "ruler.add_patterns([{'label': 'PERSON', 'pattern': 'Bill Gates', 'type': 'fuzzy'}])", + "nlp.add_pipe(ruler)", + "", + "doc = nlp('Oops, I spelled Bill Gatez wrong.')", + "print([(ent.text, ent.start, ent.end, ent.label_) for ent in doc.ents])" + ], + "code_language": "python", + "url": "https://spaczz.readthedocs.io/en/latest/", + "author": "Grant Andersen", + "author_links": { + "twitter": "gandersen101", + "github": "gandersen101" + }, + "category": ["pipeline"], + "tags": ["fuzzy-matching", "regex"] + }, + { + "id": "spacy-universal-sentence-encoder", + "title": "spaCy - Universal Sentence Encoder", + "slogan": "Make use of Google's Universal Sentence Encoder directly within spaCy", + "description": "This library lets you use Universal Sentence Encoder embeddings of Docs, Spans and Tokens directly from TensorFlow Hub", + "github": "MartinoMensio/spacy-universal-sentence-encoder", + "pip": "spacy-universal-sentence-encoder", + "code_example": [ + "import spacy_universal_sentence_encoder", + "# load one of the models: ['en_use_md', 'en_use_lg', 'xx_use_md', 'xx_use_lg']", + "nlp = spacy_universal_sentence_encoder.load_model('en_use_lg')", + "# get two documents", + "doc_1 = nlp('Hi there, how are you?')", + "doc_2 = nlp('Hello there, how are you doing today?')", + "# use the similarity method that is based on the vectors, on Doc, Span or Token", + "print(doc_1.similarity(doc_2[0:7]))" + ], + "category": ["models", "pipeline"], + "author": "Martino Mensio", + "author_links": { + "twitter": "MartinoMensio", + "github": "MartinoMensio", + "website": "https://martinomensio.github.io" + } + }, + { + "id": "whatlies", + "title": "whatlies", + "slogan": "Make interactive visualisations to figure out 'what lies' in word embeddings.", + "description": "This small library offers tools to make visualisation easier of both word embeddings as well as operations on them. It has support for spaCy prebuilt models as a first class citizen but also offers support for sense2vec. 
There's a convenient API to perform linear algebra as well as support for popular transformations like PCA/UMAP/etc.", + "github": "rasahq/whatlies", + "pip": "whatlies", + "thumb": "https://i.imgur.com/rOkOiLv.png", + "image": "https://raw.githubusercontent.com/RasaHQ/whatlies/master/docs/gif-two.gif", + "code_example": [ + "from whatlies import EmbeddingSet", + "from whatlies.language import SpacyLanguage", + "", + "lang = SpacyLanguage('en_core_web_md')", + "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', 'king', 'queen', 'doctor', 'nurse']", + "", + "emb = lang[words]", + "emb.plot_interactive(x_axis='man', y_axis='woman')" + ], + "category": ["visualizers", "research"], + "author": "Vincent D. Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning", + "website": "https://koaning.io" + } + }, + { + "id": "tokenwiser", + "title": "tokenwiser", + "slogan": "Connect vowpal-wabbit & scikit-learn models to spaCy to run simple classification benchmarks. Comes with many utility functions for spaCy pipelines.", + "github": "koaning/tokenwiser", + "pip": "tokenwiser", + "thumb": "https://koaning.github.io/tokenwiser/token.png", + "image": "https://koaning.github.io/tokenwiser/logo-tokw.png", + "code_example": [ + "import spacy", + "", + "from sklearn.pipeline import make_pipeline", + "from sklearn.feature_extraction.text import CountVectorizer", + "from sklearn.linear_model import LogisticRegression", + "", + "from tokenwiser.component import attach_sklearn_categoriser", + "", + "X = [", + " 'i really like this post',", + " 'thanks for that comment',", + " 'i enjoy this friendly forum',", + " 'this is a bad post',", + " 'i dislike this article',", + " 'this is not well written'", + "]", + "", + "y = ['pos', 'pos', 'pos', 'neg', 'neg', 'neg']", + "", + "# Note that we're training a pipeline here via a single-batch `.fit()` method", + "pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y)", + "", + "nlp = spacy.load('en_core_web_sm')", + "# This is where we attach our pre-trained model as a pipeline step.", + "attach_sklearn_categoriser(nlp, pipe_name='silly_sentiment', estimator=pipe)" + ], + "category": ["pipeline", "training"], + "author": "Vincent D. Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning", + "website": "https://koaning.io" + } + }, + { + "id": "spacy-stanza", + "title": "spacy-stanza", + "slogan": "Use the latest Stanza (StanfordNLP) research models directly in spaCy", + "description": "This package wraps the Stanza (formerly StanfordNLP) library, so you can use Stanford's models as a spaCy pipeline. Using this wrapper, you'll be able to use the following annotations, computed by your pretrained `stanza` model:\n\n- Statistical tokenization (reflected in the `Doc` and its tokens)\n - Lemmatization (`token.lemma` and `token.lemma_`)\n - Part-of-speech tagging (`token.tag`, `token.tag_`, `token.pos`, `token.pos_`)\n - Dependency parsing (`token.dep`, `token.dep_`, `token.head`)\n - Named entity recognition (`doc.ents`, `token.ent_type`, `token.ent_type_`, `token.ent_iob`, `token.ent_iob_`)\n - Sentence segmentation (`doc.sents`)", + "github": "explosion/spacy-stanza", + "pip": "spacy-stanza", + "thumb": "https://i.imgur.com/myhLjMJ.png", + "code_example": [ + "import stanza", + "import spacy_stanza", + "", + "stanza.download(\"en\")", + "nlp = spacy_stanza.load_pipeline(\"en\")", + "", + "doc = nlp(\"Barack Obama was born in Hawaii. 
He was elected president in 2008.\")", + "for token in doc:", + " print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)", + "print(doc.ents)" + ], + "category": ["pipeline", "standalone", "models", "research"], + "author": "Explosion", + "author_links": { + "twitter": "explosion_ai", + "github": "explosion", + "website": "https://explosion.ai" + } + }, + { + "id": "spacy-udpipe", + "title": "spacy-udpipe", + "slogan": "Use the latest UDPipe models directly in spaCy", + "description": "This package wraps the fast and efficient UDPipe language-agnostic NLP pipeline (via its Python bindings), so you can use UDPipe pre-trained models as a spaCy pipeline for 50+ languages out-of-the-box. Inspired by spacy-stanza, this package offers slightly less accurate models that are in turn much faster.", + "github": "TakeLab/spacy-udpipe", + "pip": "spacy-udpipe", + "code_example": [ + "import spacy_udpipe", + "", + "spacy_udpipe.download(\"en\") # download English model", + "", + "text = \"Wikipedia is a free online encyclopedia, created and edited by volunteers around the world.\"", + "nlp = spacy_udpipe.load(\"en\")", + "", + "doc = nlp(text)", + "for token in doc:", + " print(token.text, token.lemma_, token.pos_, token.dep_)" + ], + "category": ["pipeline", "standalone", "models", "research"], + "author": "TakeLab", + "author_links": { + "github": "TakeLab", + "website": "https://takelab.fer.hr/" + } + }, + { + "id": "spacy-server", + "title": "spaCy Server", + "slogan": "\uD83E\uDD9C Containerized HTTP API for spaCy NLP", + "description": "For developers who need programming language agnostic NLP, spaCy Server is a containerized HTTP API that provides industrial-strength natural language processing. Unlike other servers, our server is fast, idiomatic, and well documented.", + "github": "neelkamath/spacy-server", + "code_example": [ + "docker run --rm -dp 8080:8080 neelkamath/spacy-server", + "curl http://localhost:8080/ner -H 'Content-Type: application/json' -d '{\"sections\": [\"My name is John Doe. 
I grew up in California.\"]}'" + ], + "code_language": "shell", + "url": "https://hub.docker.com/r/neelkamath/spacy-server", + "author": "Neel Kamath", + "author_links": { + "github": "neelkamath", + "website": "https://neelkamath.com" + }, + "category": ["apis"], + "tags": ["docker"] + }, + { + "id": "nlp-architect", + "title": "NLP Architect", + "slogan": "Python lib for exploring Deep NLP & NLU by Intel AI", + "github": "NervanaSystems/nlp-architect", + "pip": "nlp-architect", + "thumb": "https://i.imgur.com/vMideRx.png", + "category": ["standalone", "research"], + "tags": ["pytorch"] + }, + { + "id": "NeuroNER", + "title": "NeuroNER", + "slogan": "Named-entity recognition using neural networks", + "github": "Franck-Dernoncourt/NeuroNER", + "category": ["models"], + "pip": "pyneuroner[cpu]", + "code_example": [ + "from neuroner import neuromodel", + "nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)" + ], + "tags": ["standalone"] + }, + { + "id": "NLPre", + "title": "NLPre", + "slogan": "Natural Language Preprocessing Library for health data and more", + "github": "NIHOPA/NLPre", + "pip": "nlpre", + "code_example": [ + "from nlpre import titlecaps, dedash, identify_parenthetical_phrases", + "from nlpre import replace_acronyms, replace_from_dictionary", + "ABBR = identify_parenthetical_phrases()(text)", + "parsers = [dedash(), titlecaps(), replace_acronyms(ABBR),", + " replace_from_dictionary(prefix='MeSH_')]", + "for f in parsers:", + " text = f(text)", + "print(text)" + ], + "category": ["scientific", "biomedical"], + "author": "Travis Hoppe", + "author_links": { + "github": "thoppe", + "twitter": "metasemantic", + "website": "http://thoppe.github.io/" + } + }, + { + "id": "Chatterbot", + "title": "Chatterbot", + "slogan": "A machine-learning based conversational dialog engine for creating chat bots", + "github": "gunthercox/ChatterBot", + "pip": "chatterbot", + "thumb": "https://i.imgur.com/eyAhwXk.jpg", + "code_example": [ + "from chatterbot import ChatBot", + "from chatterbot.trainers import ListTrainer", + "# Create a new chat bot named Charlie", + "chatbot = ChatBot('Charlie')", + "trainer = ListTrainer(chatbot)", + "trainer.train([", + "'Hi, can I help you?',", + "'Sure, I would like to book a flight to Iceland.',", + "'Your flight has been booked.'", + "])", + "", + "response = chatbot.get_response('I would like to book a flight.')" + ], + "author": "Gunther Cox", + "author_links": { + "github": "gunthercox" + }, + "category": ["conversational", "standalone"], + "tags": ["chatbots"] + }, + { + "id": "saber", + "title": "saber", + "slogan": "Deep-learning based tool for information extraction in the biomedical domain", + "github": "BaderLab/saber", + "pip": "saber", + "thumb": "https://raw.githubusercontent.com/BaderLab/saber/master/docs/img/saber_logo.png", + "code_example": [ + "from saber.saber import Saber", + "saber = Saber()", + "saber.load('PRGE')", + "saber.annotate('The phosphorylation of Hdm2 by MK2 promotes the ubiquitination of p53.')" + ], + "author": "Bader Lab, University of Toronto", + "category": ["scientific"], + "tags": ["keras", "biomedical"] + }, + { + "id": "alibi", + "title": "alibi", + "slogan": "Algorithms for monitoring and explaining machine learning models ", + "github": "SeldonIO/alibi", + "pip": "alibi", + "thumb": "https://i.imgur.com/YkzQHRp.png", + "code_example": [ + "from alibi.explainers import AnchorTabular", + "explainer = AnchorTabular(predict_fn, feature_names)", + "explainer.fit(X_train)", + "explainer.explain(x)" + ], 
+ "author": "Seldon", + "category": ["standalone", "research"] + }, + { + "id": "spacymoji", + "slogan": "Emoji handling and meta data as a spaCy pipeline component", + "github": "ines/spacymoji", + "description": "spaCy extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.", + "pip": "spacymoji", + "category": ["pipeline"], + "tags": ["emoji", "unicode"], + "thumb": "https://i.imgur.com/XOTYIgn.jpg", + "code_example": [ + "import spacy", + "from spacymoji import Emoji", + "", + "nlp = spacy.load('en')", + "emoji = Emoji(nlp)", + "nlp.add_pipe(emoji, first=True)", + "", + "doc = nlp('This is a test 😻 👍🏿')", + "assert doc._.has_emoji == True", + "assert doc[2:5]._.has_emoji == True", + "assert doc[0]._.is_emoji == False", + "assert doc[4]._.is_emoji == True", + "assert doc[5]._.emoji_desc == 'thumbs up dark skin tone'", + "assert len(doc._.emoji) == 2", + "assert doc._.emoji[1] == ('👍🏿', 5, 'thumbs up dark skin tone')" + ], + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + } + }, + { + "id": "spacyopentapioca", + "title": "spaCyOpenTapioca", + "slogan": "Named entity linking on Wikidata in spaCy via OpenTapioca", + "description": "A spaCy wrapper of OpenTapioca for named entity linking on Wikidata", + "github": "UB-Mannheim/spacyopentapioca", + "pip": "spacyopentapioca", + "code_example": [ + "import spacy", + "nlp = spacy.blank('en')", + "nlp.add_pipe('opentapioca')", + "doc = nlp('Christian Drosten works in Germany.')", + "for span in doc.ents:", + " print((span.text, span.kb_id_, span.label_, span._.description, span._.score))", + "# ('Christian Drosten', 'Q1079331', 'PERSON', 'German virologist and university teacher', 3.6533377082098895)", + "# ('Germany', 'Q183', 'LOC', 'sovereign state in Central Europe', 2.1099332471902863)", + "## Check also span._.types, span._.aliases, span._.rank" + ], + "category": ["models", "pipeline"], + "tags": ["NER", "NEL"], + "author": "Renat Shigapov", + "author_links": { + "twitter": "_shigapov", + "github": "shigapov" + } + }, + { + "id": "spacy_hunspell", + "slogan": "Add spellchecking and spelling suggestions to your spaCy pipeline using Hunspell", + "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [Hunspell](http://hunspell.github.io) support for spellchecking.", + "github": "tokestermw/spacy_hunspell", + "pip": "spacy_hunspell", + "code_example": [ + "import spacy", + "from spacy_hunspell import spaCyHunSpell", + "", + "nlp = spacy.load('en_core_web_sm')", + "hunspell = spaCyHunSpell(nlp, 'mac')", + "nlp.add_pipe(hunspell)", + "doc = nlp('I can haz cheezeburger.')", + "haz = doc[2]", + "haz._.hunspell_spell # False", + "haz._.hunspell_suggest # ['ha', 'haze', 'hazy', 'has', 'hat', 'had', 'hag', 'ham', 'hap', 'hay', 'haw', 'ha z']" + ], + "author": "Motoki Wu", + "author_links": { + "github": "tokestermw", + "twitter": "plusepsilon" + }, + "category": ["pipeline"], + "tags": ["spellcheck"] + }, + { + "id": "spacy_grammar", + "slogan": 
"Language Tool style grammar handling with spaCy", + "description": "This packages leverages the [Matcher API](https://spacy.io/docs/usage/rule-based-matching) in spaCy to quickly match on spaCy tokens not dissimilar to regex. It reads a `grammar.yml` file to load up custom patterns and returns the results inside `Doc`, `Span`, and `Token`. It is extensible through adding rules to `grammar.yml` (though currently only the simple string matching is implemented).", + "github": "tokestermw/spacy_grammar", + "code_example": [ + "import spacy", + "from spacy_grammar.grammar import Grammar", + "", + "nlp = spacy.load('en')", + "grammar = Grammar(nlp)", + "nlp.add_pipe(grammar)", + "doc = nlp('I can haz cheeseburger.')", + "doc._.has_grammar_error # True" + ], + "author": "Motoki Wu", + "author_links": { + "github": "tokestermw", + "twitter": "plusepsilon" + }, + "category": ["pipeline"] + }, + { + "id": "spacy_kenlm", + "slogan": "KenLM extension for spaCy 2.0", + "github": "tokestermw/spacy_kenlm", + "pip": "spacy_kenlm", + "code_example": [ + "import spacy", + "from spacy_kenlm import spaCyKenLM", + "", + "nlp = spacy.load('en_core_web_sm')", + "spacy_kenlm = spaCyKenLM() # default model from test.arpa", + "nlp.add_pipe(spacy_kenlm)", + "doc = nlp('How are you?')", + "doc._.kenlm_score # doc score", + "doc[:2]._.kenlm_score # span score", + "doc[2]._.kenlm_score # token score" + ], + "author": "Motoki Wu", + "author_links": { + "github": "tokestermw", + "twitter": "plusepsilon" + }, + "category": ["pipeline"] + }, + { + "id": "spacy_readability", + "slogan": "Add text readability meta data to Doc objects", + "description": "spaCy v2.0 pipeline component for calculating readability scores of of text. Provides scores for Flesh-Kincaid grade level, Flesh-Kincaid reading ease, and Dale-Chall.", + "github": "mholtzscher/spacy_readability", + "pip": "spacy-readability", + "code_example": [ + "import spacy", + "from spacy_readability import Readability", + "", + "nlp = spacy.load('en')", + "read = Readability(nlp)", + "nlp.add_pipe(read, last=True)", + "doc = nlp(\"I am some really difficult text to read because I use obnoxiously large words.\")", + "doc._.flesch_kincaid_grade_level", + "doc._.flesch_kincaid_reading_ease", + "doc._.dale_chall" + ], + "author": "Michael Holtzscher", + "author_links": { + "github": "mholtzscher" + }, + "category": ["pipeline"] + }, + { + "id": "spacy-sentence-segmenter", + "title": "Sentence Segmenter", + "slogan": "Custom sentence segmentation for spaCy", + "code_example": [ + "from seg.newline.segmenter import NewLineSegmenter", + "import spacy", + "", + "nlseg = NewLineSegmenter()", + "nlp = spacy.load('en')", + "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')", + "doc = nlp(my_doc_text)" + ], + "author": "tc64", + "author_links": { + "github": "tc64" + }, + "category": ["pipeline"] + }, + { + "id": "spacy_cld", + "title": "spaCy-CLD", + "slogan": "Add language detection to your spaCy pipeline using CLD2", + "description": "spaCy-CLD operates on `Doc` and `Span` spaCy objects. 
When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1).\n\nspacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score for each language.", + "github": "nickdavidhaynes/spacy-cld", + "pip": "spacy_cld", + "code_example": [ + "import spacy", + "from spacy_cld import LanguageDetector", + "", + "nlp = spacy.load('en')", + "language_detector = LanguageDetector()", + "nlp.add_pipe(language_detector)", + "doc = nlp('This is some English text.')", + "", + "doc._.languages # ['en']", + "doc._.language_scores['en'] # 0.96" + ], + "author": "Nicholas D Haynes", + "author_links": { + "github": "nickdavidhaynes" + }, + "category": ["pipeline"] + }, + { + "id": "spacy-lookup", + "slogan": "A powerful entity matcher for very large dictionaries, using the FlashText module", + "description": "spaCy v2.0 extension and pipeline component for adding Named Entities metadata to `Doc` objects. Detects Named Entities using dictionaries. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_entity`, `._.entity_type`, `._.has_entities` and `._.entities`. Named Entities are matched using the Python module `flashtext`, and looked up in the data provided by different dictionaries.", + "github": "mpuig/spacy-lookup", + "pip": "spacy-lookup", + "code_example": [ + "import spacy", + "from spacy_lookup import Entity", + "", + "nlp = spacy.load('en')", + "entity = Entity(keywords_list=['python', 'product manager', 'java platform'])", + "nlp.add_pipe(entity, last=True)", + "", + "doc = nlp(\"I am a product manager for a java and python.\")", + "assert doc._.has_entities == True", + "assert doc[0]._.is_entity == False", + "assert doc[3]._.entity_desc == 'product manager'", + "assert doc[3]._.is_entity == True", + "", + "print([(token.text, token._.canonical) for token in doc if token._.is_entity])" + ], + "author": "Marc Puig", + "author_links": { + "github": "mpuig" + }, + "category": ["pipeline"] + }, + { + "id": "spacy-iwnlp", + "slogan": "German lemmatization with IWNLP", + "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [IWNLP-py](https://github.com/Liebeck/iwnlp-py) as a German lemmatizer directly into your spaCy pipeline.", + "github": "Liebeck/spacy-iwnlp", + "pip": "spacy-iwnlp", + "code_example": [ + "import spacy", + "from spacy_iwnlp import spaCyIWNLP", + "", + "nlp = spacy.load('de')", + "iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')", + "nlp.add_pipe(iwnlp)", + "doc = nlp('Wir mögen Fußballspiele mit ausgedehnten Verlängerungen.')", + "for token in doc:", + " print('POS: {}\tIWNLP:{}'.format(token.pos_, token._.iwnlp_lemmas))" + ], + "author": "Matthias Liebeck", + "author_links": { + "github": "Liebeck" + }, + "category": ["pipeline"], + "tags": ["lemmatizer", "german"] + }, + { + "id": "spacy-sentiws", + "slogan": "German sentiment scores with SentiWS", + "description": "This package uses the
[spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [SentiWS](http://wortschatz.uni-leipzig.de/en/download) as a German sentiment score directly into your spaCy pipeline.", + "github": "Liebeck/spacy-sentiws", + "pip": "spacy-sentiws", + "code_example": [ + "import spacy", + "from spacy_sentiws import spaCySentiWS", + "", + "nlp = spacy.load('de')", + "sentiws = spaCySentiWS(sentiws_path='data/sentiws/')", + "nlp.add_pipe(sentiws)", + "doc = nlp('Die Dummheit der Unterwerfung blüht in hübschen Farben.')", + "", + "for token in doc:", + " print('{}, {}, {}'.format(token.text, token._.sentiws, token.pos_))" + ], + "author": "Matthias Liebeck", + "author_links": { + "github": "Liebeck" + }, + "category": ["pipeline"], + "tags": ["sentiment", "german"] + }, + { + "id": "spacy-lefff", + "slogan": "POS and French lemmatization with Lefff", + "description": "spaCy v2.0 extension and pipeline component for adding a French POS and lemmatizer based on [Lefff](https://hal.inria.fr/inria-00521242/).", + "github": "sammous/spacy-lefff", + "pip": "spacy-lefff", + "code_example": [ + "import spacy", + "from spacy_lefff import LefffLemmatizer, POSTagger", + "", + "nlp = spacy.load('fr')", + "pos = POSTagger()", + "french_lemmatizer = LefffLemmatizer(after_melt=True)", + "nlp.add_pipe(pos, name='pos', after='parser')", + "nlp.add_pipe(french_lemmatizer, name='lefff', after='pos')", + "doc = nlp(u\"Paris est une ville très chère.\")", + "for d in doc:", + " print(d.text, d.pos_, d._.melt_tagger, d._.lefff_lemma, d.tag_, d.lemma_)" + ], + "author": "Sami Moustachir", + "author_links": { + "github": "sammous" + }, + "category": ["pipeline"], + "tags": ["pos", "lemmatizer", "french"] + }, + { + "id": "lemmy", + "title": "Lemmy", + "slogan": "A Danish lemmatizer", + "description": "Lemmy is a lemmatizer for Danish 🇩🇰 . It comes already trained on Dansk Sprognævns (DSN) word list (‘fuldformliste’) and the Danish Universal Dependencies and is ready for use. Lemmy also supports training on your own dataset. The model currently included in Lemmy was evaluated on the Danish Universal Dependencies dev dataset and scored an accuracy > 99%.\n\nYou can use Lemmy as a spaCy extension, more specifically a spaCy pipeline component. This is highly recommended and makes the lemmas easily accessible from the spaCy tokens. Lemmy makes use of POS tags to predict the lemmas. When wired up to the spaCy pipeline, Lemmy has the benefit of using spaCy’s builtin POS tagger.", + "github": "sorenlind/lemmy", + "pip": "lemmy", + "code_example": [ + "import da_custom_model as da # name of your spaCy model", + "import lemmy.pipe", + "nlp = da.load()", + "", + "# create an instance of Lemmy's pipeline component for spaCy", + "pipe = lemmy.pipe.load()", + "", + "# add the component to the spaCy pipeline.", + "nlp.add_pipe(pipe, after='tagger')", + "", + "# lemmas can now be accessed using the `._.lemma` attribute on the tokens", + "nlp(\"akvariernes\")[0]._.lemma" + ], + "thumb": "https://i.imgur.com/RJVFRWm.jpg", + "author": "Søren Lind Kristiansen", + "author_links": { + "github": "sorenlind" + }, + "category": ["pipeline"], + "tags": ["lemmatizer", "danish"] + }, + { + "id": "dacy", + "title": "DaCy", + "slogan": "An efficient Pipeline for Danish NLP", + "description": "DaCy is a Danish preprocessing pipeline trained in spaCy. It has achieved State-of-the-Art performance on Named entity recognition, part-of-speech tagging and dependency parsing for Danish.
This repository contains material for using DaCy, reproducing the results and guides on usage of the package. Furthermore, it also contains a series of behavioural tests for biases and robustness of Danish NLP pipelines.", + "github": "centre-for-humanities-computing/DaCy", + "pip": "dacy", + "code_example": [ + "import dacy", + "print(dacy.models()) # get a list of dacy models", + "nlp = dacy.load('medium') # load your spacy pipeline", + "", + "# DaCy also includes functionality for adding other Danish models to the pipeline", + "# For instance you can add the BertTone model for classification of sentiment polarity to the pipeline:", + "nlp = add_berttone_polarity(nlp)" + ], + "thumb": "https://github.com/centre-for-humanities-computing/DaCy/blob/main/img/icon_no_title.png?raw=true", + "author": "Centre for Humanities Computing Aarhus", + "author_links": { + "github": "centre-for-humanities-computing", + "website": "https://chcaa.io/#/" + }, + "category": ["pipeline"], + "tags": ["pipeline", "danish"] + }, + { + "id": "spacy-wrap", + "title": "spaCy-wrap", + "slogan": "For wrapping fine-tuned transformers in spaCy pipelines", + "description": "spaCy-wrap is a wrapper library for spaCy for including fine-tuned transformers from Huggingface in your spaCy pipeline, allowing inclusion of existing models within existing workflows.", + "github": "kennethenevoldsen/spacy-wrap", + "pip": "spacy_wrap", + "code_example": [ + "import spacy", + "import spacy_wrap", + "", + "nlp = spacy.blank('en')", + "config = {", + " 'doc_extension_trf_data': 'clf_trf_data', # document extension for the forward pass", + " 'doc_extension_prediction': 'sentiment', # document extension for the prediction", + " 'labels': ['negative', 'neutral', 'positive'],", + " 'model': {", + " 'name': 'cardiffnlp/twitter-roberta-base-sentiment', # the model name or path of huggingface model", + "},", + "}", + "", + "transformer = nlp.add_pipe('classification_transformer', config=config)", + "transformer.model.initialize()", + "", + "doc = nlp('spaCy is a wonderful tool')", + "", + "print(doc._.clf_trf_data)", + "# TransformerData(wordpieces=...", + "print(doc._.sentiment)", + "# 'positive'", + "print(doc._.sentiment_prob)", + "# {'prob': array([0.004, 0.028, 0.969], dtype=float32), 'labels': ['negative', 'neutral', 'positive']}" + ], + "thumb": "https://raw.githubusercontent.com/KennethEnevoldsen/spacy-wrap/main/docs/_static/icon.png", + "author": "Kenneth Enevoldsen", + "author_links": { + "github": "KennethEnevoldsen", + "website": "https://www.kennethenevoldsen.com" + }, + "category": ["pipeline", "models", "training"], + "tags": ["pipeline", "models", "transformers"] + }, + { + "id": "textdescriptives", + "title": "TextDescriptives", + "slogan": "Extraction of descriptive stats, readability, and syntactic complexity measures", + "description": "Pipeline component for spaCy v.3 that calculates descriptive statistics, readability metrics, and syntactic complexity (dependency distance).", + "github": "HLasse/TextDescriptives", + "pip": "textdescriptives", + "code_example": [ + "import spacy", + "import textdescriptives as td", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe('textdescriptives')", + "doc = nlp('This is a short test text')", + "doc._.readability # access some of the values", + "td.extract_df(doc) # extract all metrics to DataFrame" + ], + "author": "Lasse Hansen, Kenneth Enevoldsen, Ludvig Olsen", + "author_links": { + "github": "HLasse" + }, + "category": ["pipeline"], + "tags": ["pipeline", "readability",
"syntactic complexity", "descriptive statistics"] + }, + { + "id": "wmd-relax", + "slogan": "Calculates word mover's distance insanely fast", + "description": "Calculates Word Mover's Distance as described in [From Word Embeddings To Document Distances](http://www.cs.cornell.edu/~kilian/papers/wmd_metric.pdf) by Matt Kusner, Yu Sun, Nicholas Kolkin and Kilian Weinberger.\n\n⚠️ **This package is currently only compatible with spaCy v.1x.**", + "github": "src-d/wmd-relax", + "thumb": "https://i.imgur.com/f91C3Lf.jpg", + "code_example": [ + "import spacy", + "import wmd", + "", + "nlp = spacy.load('en', create_pipeline=wmd.WMD.create_spacy_pipeline)", + "doc1 = nlp(\"Politician speaks to the media in Illinois.\")", + "doc2 = nlp(\"The president greets the press in Chicago.\")", + "print(doc1.similarity(doc2))" + ], + "author": "source{d}", + "author_links": { + "github": "src-d", + "twitter": "sourcedtech", + "website": "https://sourced.tech" + }, + "category": ["pipeline"] + }, + { + "id": "neuralcoref", + "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy", + "description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source.", + "github": "huggingface/neuralcoref", + "thumb": "https://i.imgur.com/j6FO9O6.jpg", + "code_example": [ + "import spacy", + "import neuralcoref", + "", + "nlp = spacy.load('en')", + "neuralcoref.add_to_pipe(nlp)", + "doc1 = nlp('My sister has a dog. She loves him.')", + "print(doc1._.coref_clusters)", + "", + "doc2 = nlp('Angela lives in Boston. She is quite happy in that city.')", + "for ent in doc2.ents:", + " print(ent._.coref_cluster)" + ], + "author": "Hugging Face", + "author_links": { + "github": "huggingface" + }, + "category": ["standalone", "conversational", "models"], + "tags": ["coref"] + }, + { + "id": "neuralcoref-vizualizer", + "title": "Neuralcoref Visualizer", + "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy", + "description": "In short, coreference is the fact that two or more expressions in a text – like pronouns or nouns – link to the same person or thing. It is a classical Natural language processing task, that has seen a revival of interest in the past two years as several research groups applied cutting-edge deep-learning and reinforcement-learning techniques to it. It is also one of the key building blocks to building conversational Artificial intelligences.", + "url": "https://huggingface.co/coref/", + "image": "https://i.imgur.com/3yy4Qyf.png", + "thumb": "https://i.imgur.com/j6FO9O6.jpg", + "github": "huggingface/neuralcoref", + "category": ["visualizers", "conversational"], + "tags": ["coref", "chatbots"], + "author": "Hugging Face", + "author_links": { + "github": "huggingface" + } + }, + { + "id": "spacy-vis", + "slogan": "A visualisation tool for spaCy using Hierplane", + "description": "A visualiser for spaCy annotations. 
This visualisation uses the [Hierplane](https://allenai.github.io/hierplane/) Library to render the dependency parse from spaCy's models. It also includes visualisation of entities and POS tags within nodes.", + "github": "DeNeutoy/spacy-vis", + "url": "http://spacyvis.allennlp.org/spacy-parser", + "thumb": "https://i.imgur.com/DAG9QFd.jpg", + "image": "https://raw.githubusercontent.com/DeNeutoy/spacy-vis/master/img/example.gif", + "author": "Mark Neumann", + "author_links": { + "twitter": "MarkNeumannnn", + "github": "DeNeutoy" + }, + "category": ["visualizers"] + }, + { + "id": "matcher-explorer", + "title": "Rule-based Matcher Explorer", + "slogan": "Test spaCy's rule-based Matcher by creating token patterns interactively", + "description": "Test spaCy's rule-based `Matcher` by creating token patterns interactively and running them over your text. Each token can set multiple attributes like text value, part-of-speech tag or boolean flags. The token-based view lets you explore how spaCy processes your text – and why your pattern matches, or why it doesn't. For more details on rule-based matching, see the [documentation](https://spacy.io/usage/rule-based-matching).", + "image": "https://explosion.ai/assets/img/demos/matcher.png", + "thumb": "https://i.imgur.com/rPK4AGt.jpg", + "url": "https://explosion.ai/demos/matcher", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + }, + "category": ["visualizers"] + }, + { + "id": "displacy", + "title": "displaCy", + "slogan": "A modern syntactic dependency visualizer", + "description": "Visualize spaCy's guess at the syntactic structure of a sentence. Arrows point from children to heads, and are labelled by their relation type.", + "url": "https://explosion.ai/demos/displacy", + "thumb": "https://i.imgur.com/nxDcHaL.jpg", + "image": "https://explosion.ai/assets/img/demos/displacy.png", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + }, + "category": ["visualizers"] + }, + { + "id": "displacy-ent", + "title": "displaCy ENT", + "slogan": "A modern named entity visualizer", + "description": "Visualize spaCy's guess at the named entities in the document. 
You can filter the displayed types, to only show the annotations you're interested in.", + "url": "https://explosion.ai/demos/displacy-ent", + "thumb": "https://i.imgur.com/A77Ecbs.jpg", + "image": "https://explosion.ai/assets/img/demos/displacy-ent.png", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + }, + "category": ["visualizers"] + }, + { + "id": "explacy", + "slogan": "A small tool that explains spaCy parse results", + "github": "tylerneylon/explacy", + "thumb": "https://i.imgur.com/V1hCWmn.jpg", + "image": "https://raw.githubusercontent.com/tylerneylon/explacy/master/img/screenshot.png", + "code_example": [ + "import spacy", + "import explacy", + "", + "nlp = spacy.load('en')", + "explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')" + ], + "author": "Tyler Neylon", + "author_links": { + "github": "tylerneylon" + }, + "category": ["visualizers"] + }, + { + "id": "deplacy", + "slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis", + "description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.", + "github": "KoichiYasuoka/deplacy", + "image": "https://i.imgur.com/6uOI4Op.png", + "code_example": [ + "import spacy", + "import deplacy", + "", + "nlp=spacy.load('en_core_web_sm')", + "doc=nlp('I saw a horse yesterday which had no name.')", + "deplacy.render(doc)" + ], + "author": "Koichi Yasuoka", + "author_links": { + "github": "KoichiYasuoka" + }, + "category": ["visualizers"] + }, + { + "id": "scattertext", + "slogan": "Beautiful visualizations of how language differs among document types", + "description": "A tool for finding distinguishing terms in small-to-medium-sized corpora, and presenting them in a sexy, interactive scatter plot with non-overlapping term labels. Exploratory data analysis just got more fun.", + "github": "JasonKessler/scattertext", + "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png", + "code_example": [ + "import spacy", + "import scattertext as st", + "", + "nlp = spacy.load('en')", + "corpus = st.CorpusFromPandas(convention_df,", + " category_col='party',", + " text_col='text',", + " nlp=nlp).build()" + ], + "author": "Jason Kessler", + "author_links": { + "github": "JasonKessler", + "twitter": "jasonkessler" + }, + "category": ["visualizers"] + }, + { + "id": "rasa", + "title": "Rasa", + "slogan": "Turn natural language into structured data", + "description": "Machine learning tools for developers to build, improve, and deploy contextual chatbots and assistants. Powered by open source.", + "github": "RasaHQ/rasa", + "pip": "rasa", + "thumb": "https://i.imgur.com/TyZnpwL.png", + "url": "https://rasa.com/", + "author": "Rasa", + "author_links": { + "github": "RasaHQ" + }, + "category": ["conversational"], + "tags": ["chatbots"] + }, + { + "id": "mindmeld", + "title": "MindMeld - Conversational AI platform", + "slogan": "Conversational AI platform for deep-domain voice interfaces and chatbots", + "description": "The MindMeld Conversational AI platform is among the most advanced AI platforms for building production-quality conversational applications. It is a Python-based machine learning framework which encompasses all of the algorithms and utilities required for this purpose. 
(https://github.com/cisco/mindmeld)", + "github": "cisco/mindmeld", + "pip": "mindmeld", + "thumb": "https://www.mindmeld.com/img/mindmeld-logo.png", + "category": ["conversational", "ner"], + "tags": ["chatbots"], + "author": "Cisco", + "author_links": { + "github": "cisco/mindmeld", + "website": "https://www.mindmeld.com/" + } + }, + { + "id": "torchtext", + "title": "torchtext", + "slogan": "Data loaders and abstractions for text and NLP", + "github": "pytorch/text", + "pip": "torchtext", + "thumb": "https://i.imgur.com/WFkxuPo.png", + "code_example": [ + ">>> pos = data.TabularDataset(", + "... path='data/pos/pos_wsj_train.tsv', format='tsv',", + "... fields=[('text', data.Field()),", + "... ('labels', data.Field())])", + "...", + ">>> sentiment = data.TabularDataset(", + "... path='data/sentiment/train.json', format='json',", + "... fields={'sentence_tokenized': ('text', data.Field(sequential=True)),", + "... 'sentiment_gold': ('labels', data.Field(sequential=False))})" + ], + "category": ["standalone", "research"], + "tags": ["pytorch"] + }, + { + "id": "allennlp", + "title": "AllenNLP", + "slogan": "An open-source NLP research library, built on PyTorch and spaCy", + "description": "AllenNLP is a new library designed to accelerate NLP research, by providing a framework that supports modern deep learning workflows for cutting-edge language understanding problems. AllenNLP uses spaCy as a preprocessing component. You can also use Allen NLP to develop spaCy pipeline components, to add annotations to the `Doc` object.", + "github": "allenai/allennlp", + "pip": "allennlp", + "thumb": "https://i.imgur.com/U8opuDN.jpg", + "url": "http://allennlp.org", + "author": " Allen Institute for Artificial Intelligence", + "author_links": { + "github": "allenai", + "twitter": "allenai_org", + "website": "http://allenai.org" + }, + "category": ["standalone", "research"] + }, + { + "id": "scispacy", + "title": "scispaCy", + "slogan": "A full spaCy pipeline and models for scientific/biomedical documents", + "github": "allenai/scispacy", + "pip": "scispacy", + "thumb": "https://i.imgur.com/dJQSclW.png", + "url": "https://allenai.github.io/scispacy/", + "author": " Allen Institute for Artificial Intelligence", + "author_links": { + "github": "allenai", + "twitter": "allenai_org", + "website": "http://allenai.org" + }, + "category": ["scientific", "models", "research"] + }, + { + "id": "textacy", + "slogan": "NLP, before and after spaCy", + "description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. – delegated to another library, `textacy` focuses on the tasks that come before and follow after.", + "github": "chartbeat-labs/textacy", + "pip": "textacy", + "url": "https://github.com/chartbeat-labs/textacy", + "author": "Burton DeWilde", + "author_links": { + "github": "bdewilde", + "twitter": "bjdewilde" + }, + "category": ["standalone"] + }, + { + "id": "textpipe", + "slogan": "clean and extract metadata from text", + "description": "`textpipe` is a Python package for converting raw text into clean, readable text and extracting metadata from that text.
Its functionalities include transforming raw text into readable text by removing HTML tags and extracting metadata such as the number of words and named entities from the text.", + "github": "textpipe/textpipe", + "pip": "textpipe", + "author": "Textpipe Contributors", + "author_links": { + "github": "textpipe", + "website": "https://github.com/textpipe/textpipe/blob/master/CONTRIBUTORS.md" + }, + "category": ["standalone"], + "tags": ["text-processing", "named-entity-recognition"], + "thumb": "https://avatars0.githubusercontent.com/u/40492530", + "code_example": [ + "from textpipe import doc, pipeline", + "sample_text = 'Sample text! '", + "document = doc.Doc(sample_text)", + "print(document.clean)", + "'Sample text!'", + "print(document.language)", + "# 'en'", + "print(document.nwords)", + "# 2", + "", + "pipe = pipeline.Pipeline(['CleanText', 'NWords'])", + "print(pipe(sample_text))", + "# {'CleanText': 'Sample text!', 'NWords': 2}" + ] + }, + { + "id": "mordecai", + "slogan": "Full text geoparsing using spaCy, Geonames and Keras", + "description": "Extract the place names from a piece of text, resolve them to the correct place, and return their coordinates and structured geographic information.", + "github": "openeventdata/mordecai", + "pip": "mordecai", + "thumb": "https://i.imgur.com/gPJ9upa.jpg", + "code_example": [ + "from mordecai import Geoparser", + "geo = Geoparser()", + "geo.geoparse(\"I traveled from Oxford to Ottawa.\")" + ], + "author": "Andy Halterman", + "author_links": { + "github": "ahalterman", + "twitter": "ahalterman" + }, + "category": ["standalone", "scientific"] + }, + { + "id": "kindred", + "title": "Kindred", + "slogan": "Biomedical relation extraction using spaCy", + "description": "Kindred is a package for relation extraction in biomedical texts. Given some training data, it can build a model to identify relations between entities (e.g. drugs, genes, etc) in a sentence.", + "github": "jakelever/kindred", + "pip": "kindred", + "code_example": [ + "import kindred", + "", + "trainCorpus = kindred.bionlpst.load('2016-BB3-event-train')", + "devCorpus = kindred.bionlpst.load('2016-BB3-event-dev')", + "predictionCorpus = devCorpus.clone()", + "predictionCorpus.removeRelations()", + "classifier = kindred.RelationClassifier()", + "classifier.train(trainCorpus)", + "classifier.predict(predictionCorpus)", + "f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')" + ], + "author": "Jake Lever", + "author_links": { + "github": "jakelever" + }, + "category": ["standalone", "scientific"] + }, + { + "id": "sense2vec", + "slogan": "Use NLP to go beyond vanilla word2vec", + "description": "sense2vec ([Trask et al.](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn more interesting, detailed and context-sensitive word vectors.
For an interactive example of the technology, see our [sense2vec demo](https://explosion.ai/demos/sense2vec) that lets you explore semantic similarities across all Reddit comments of 2015.", + "github": "explosion/sense2vec", + "pip": "sense2vec==1.0.0a1", + "thumb": "https://i.imgur.com/awfdhX6.jpg", + "image": "https://explosion.ai/assets/img/demos/sense2vec.png", + "url": "https://explosion.ai/demos/sense2vec", + "code_example": [ + "import spacy", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "s2v = nlp.add_pipe(\"sense2vec\")", + "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")", + "", + "doc = nlp(\"A sentence about natural language processing.\")", + "assert doc[3:6].text == \"natural language processing\"", + "freq = doc[3:6]._.s2v_freq", + "vector = doc[3:6]._.s2v_vec", + "most_similar = doc[3:6]._.s2v_most_similar(3)", + "# [(('machine learning', 'NOUN'), 0.8986967),", + "# (('computer vision', 'NOUN'), 0.8636297),", + "# (('deep learning', 'NOUN'), 0.8573361)]" + ], + "category": ["pipeline", "standalone", "visualizers"], + "tags": ["vectors"], + "author": "Explosion", + "author_links": { + "twitter": "explosion_ai", + "github": "explosion", + "website": "https://explosion.ai" + } + }, + { + "id": "spacyr", + "slogan": "An R wrapper for spaCy", + "github": "quanteda/spacyr", + "cran": "spacyr", + "code_example": [ + "library(\"spacyr\")", + "spacy_initialize()", + "", + "txt <- c(d1 = \"spaCy excels at large-scale information extraction tasks.\",", + " d2 = \"Mr. Smith goes to North Carolina.\")", + "", + "# process documents and obtain a data.table", + "parsedtxt <- spacy_parse(txt)" + ], + "code_language": "r", + "author": "Kenneth Benoit & Aki Matsuo", + "category": ["nonpython"] + }, + { + "id": "cleannlp", + "title": "CleanNLP", + "slogan": "A tidy data model for NLP in R", + "description": "The cleanNLP package is designed to make it as painless as possible to turn raw text into feature-rich data frames. The package offers four backends that can be used for parsing text: `tokenizers`, `udpipe`, `spacy` and `corenlp`.", + "github": "statsmaths/cleanNLP", + "cran": "cleanNLP", + "author": "Taylor B. Arnold", + "author_links": { + "github": "statsmaths" + }, + "category": ["nonpython"] + }, + { + "id": "spacy-cpp", + "slogan": "C++ wrapper library for spaCy", + "description": "The goal of spacy-cpp is to expose the functionality of spaCy to C++ applications, and to provide an API that is similar to that of spaCy, enabling rapid development in Python and simple porting to C++.", + "github": "d99kris/spacy-cpp", + "code_example": [ + "Spacy::Spacy spacy;", + "auto nlp = spacy.load(\"en_core_web_sm\");", + "auto doc = nlp.parse(\"This is a sentence.\");", + "for (auto& token : doc.tokens())", + " std::cout << token.text() << \" [\" << token.pos_() << \"]\\n\";" + ], + "code_language": "cpp", + "author": "Kristofer Berggren", + "author_links": { + "github": "d99kris" + }, + "category": ["nonpython"] + }, + { + "id": "spaCy.jl", + "slogan": "Julia interface for spaCy (work in progress)", + "github": "jekbradbury/SpaCy.jl", + "author": "James Bradbury", + "author_links": { + "github": "jekbradbury", + "twitter": "jekbradbury" + }, + "category": ["nonpython"] + }, + { + "id": "ruby-spacy", + "title": "ruby-spacy", + "slogan": "Wrapper module for using spaCy from Ruby via PyCall", + "description": "ruby-spacy is a wrapper module for using spaCy from the Ruby programming language via PyCall.
This module aims to make it easy and natural for Ruby programmers to use spaCy.", + "github": "yohasebe/ruby-spacy", + "code_example": [ + "require \"ruby-spacy\"", + "require \"terminal-table\"", + "nlp = Spacy::Language.new(\"en_core_web_sm\")", + "doc = nlp.read(\"Apple is looking at buying U.K. startup for $1 billion\")", + "headings = [\"text\", \"lemma\", \"pos\", \"tag\", \"dep\"]", + "rows = []", + "doc.each do |token|", + " rows << [token.text, token.lemma, token.pos, token.tag, token.dep]", + "end", + "table = Terminal::Table.new rows: rows, headings: headings", + "puts table" + ], + "code_language": "ruby", + "url": "https://rubygems.org/gems/ruby-spacy", + "author": "Yoichiro Hasebe", + "author_links": { + "github": "yohasebe", + "twitter": "yohasebe" + }, + "category": ["nonpython"], + "tags": ["ruby"] + }, + { + "id": "spacy_api", + "slogan": "Server/client to load models in a separate, dedicated process", + "github": "kootenpv/spacy_api", + "pip": "spacy_api", + "code_example": [ + "from spacy_api import Client", + "", + "spacy_client = Client() # default args host/port", + "doc = spacy_client.single(\"How are you\")" + ], + "author": "Pascal van Kooten", + "author_links": { + "github": "kootenpv" + }, + "category": ["apis"] + }, + { + "id": "spacy-api-docker", + "slogan": "spaCy REST API, wrapped in a Docker container", + "github": "jgontrum/spacy-api-docker", + "url": "https://hub.docker.com/r/jgontrum/spacyapi/", + "thumb": "https://i.imgur.com/NRnDKyj.jpg", + "code_example": [ + "version: '2'", + "", + "services:", + " spacyapi:", + " image: jgontrum/spacyapi:en_v2", + " ports:", + " - \"127.0.0.1:8080:80\"", + " restart: always" + ], + "code_language": "docker", + "author": "Johannes Gontrum", + "author_links": { + "github": "jgontrum" + }, + "category": ["apis"] + }, + { + "id": "languagecrunch", + "slogan": "NLP server for spaCy, WordNet and NeuralCoref as a Docker image", + "github": "artpar/languagecrunch", + "code_example": [ + "docker run -it -p 8080:8080 artpar/languagecrunch", + "curl http://localhost:8080/nlp/parse?`echo -n \"The new twitter is so weird. Seriously. Why is there a new twitter? What was wrong with the old one? Fix it now.\" | python -c \"import urllib, sys; print(urllib.urlencode({'sentence': sys.stdin.read()}))\"`" + ], + "code_language": "bash", + "author": "Parth Mudgal", + "author_links": { + "github": "artpar" + }, + "category": ["apis"] + }, + { + "id": "spacy-nlp", + "slogan": " Expose spaCy NLP text parsing to Node.js (and other languages) via Socket.IO", + "github": "kengz/spacy-nlp", + "thumb": "https://i.imgur.com/w41VSr7.jpg", + "code_example": [ + "const spacyNLP = require(\"spacy-nlp\")", + "// default port 6466", + "// start the server with the python client that exposes spacyIO (or use an existing socketIO server at IOPORT)", + "var serverPromise = spacyNLP.server({ port: process.env.IOPORT });", + "// Loading spacy may take up to 15s" + ], + "code_language": "javascript", + "author": "Wah Loon Keng", + "author_links": { + "github": "kengz" + }, + "category": ["apis", "nonpython"] + }, + { + "id": "prodigy", + "title": "Prodigy", + "slogan": "Radically efficient machine teaching, powered by active learning", + "description": "Prodigy is an annotation tool so efficient that data scientists can do the annotation themselves, enabling a new level of rapid iteration. Whether you're working on entity recognition, intent detection or image classification, Prodigy can help you train and evaluate your models faster. 
Stream in your own examples or real-world data from live APIs, update your model in real-time and chain models together to build more complex systems.", + "thumb": "https://i.imgur.com/UVRtP6g.jpg", + "image": "https://i.imgur.com/Dt5vrY6.png", + "url": "https://prodi.gy", + "code_example": [ + "prodigy dataset ner_product \"Improve PRODUCT on Reddit data\"", + "✨ Created dataset 'ner_product'.", + "", + "prodigy ner.teach ner_product en_core_web_sm ~/data.jsonl --label PRODUCT", + "✨ Starting the web server on port 8080..." + ], + "code_language": "bash", + "category": ["standalone", "training"], + "author": "Explosion", + "author_links": { + "twitter": "explosion_ai", + "github": "explosion", + "website": "https://explosion.ai" + } + }, + { + "id": "dragonfire", + "title": "Dragonfire", + "slogan": "An open-source virtual assistant for Ubuntu based Linux distributions", + "github": "DragonComputer/Dragonfire", + "thumb": "https://i.imgur.com/5fqguKS.jpg", + "image": "https://raw.githubusercontent.com/DragonComputer/Dragonfire/master/docs/img/demo.gif", + "author": "Dragon Computer", + "author_links": { + "github": "DragonComputer", + "website": "http://dragon.computer" + }, + "category": ["standalone"] + }, + { + "id": "prefect", + "title": "Prefect", + "slogan": "Workflow management system designed for modern infrastructure", + "github": "PrefectHQ/prefect", + "pip": "prefect", + "thumb": "https://i.imgur.com/oLTwr0e.png", + "code_example": [ + "from prefect import Flow", + "from prefect.tasks.spacy.spacy_tasks import SpacyNLP", + "import spacy", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "", + "with Flow(\"Natural Language Processing\") as flow:", + " doc = SpacyNLP(text=\"This is some text\", nlp=nlp)", + "", + "flow.run()" + ], + "author": "Prefect", + "author_links": { + "website": "https://prefect.io" + }, + "category": ["standalone"] + }, + { + "id": "graphbrain", + "title": "Graphbrain", + "slogan": "Automated meaning extraction and text understanding", + "description": "Graphbrain is an Artificial Intelligence open-source software library and scientific research tool. Its aim is to facilitate automated meaning extraction and text understanding, as well as the exploration and inference of knowledge.", + "github": "graphbrain/graphbrain", + "pip": "graphbrain", + "thumb": "https://i.imgur.com/cct9W1E.png", + "author": "Graphbrain", + "category": ["standalone"] + }, + { + "type": "education", + "id": "nostarch-nlp-python", + "title": "Natural Language Processing Using Python", + "slogan": "No Starch Press, 2020", + "description": "Natural Language Processing Using Python is an introduction to natural language processing (NLP), the task of converting human language into data that a computer can process. The book uses spaCy, a leading Python library for NLP, to guide readers through common NLP tasks related to generating and understanding human language with code. 
It addresses problems like understanding a user's intent, continuing a conversation with a human, and maintaining the state of a conversation.", + "cover": "https://i.imgur.com/w0iycjl.jpg", + "url": "https://nostarch.com/NLPPython", + "author": "Yuli Vasiliev", + "category": ["books"] + }, + { + "type": "education", + "id": "oreilly-python-ds", + "title": "Introduction to Machine Learning with Python: A Guide for Data Scientists", + "slogan": "O'Reilly, 2016", + "description": "Machine learning has become an integral part of many commercial applications and research projects, but this field is not exclusive to large companies with extensive research teams. If you use Python, even as a beginner, this book will teach you practical ways to build your own machine learning solutions. With all the data available today, machine learning applications are limited only by your imagination.", + "cover": "https://covers.oreillystatic.com/images/0636920030515/lrg.jpg", + "url": "http://shop.oreilly.com/product/0636920030515.do", + "author": "Andreas Müller, Sarah Guido", + "category": ["books"] + }, + { + "type": "education", + "id": "text-analytics-python", + "title": "Text Analytics with Python", + "slogan": "Apress / Springer, 2016", + "description": "*Text Analytics with Python* teaches you the techniques related to natural language processing and text analytics, and you will gain the skills to know which technique is best suited to solve a particular problem. You will look at each technique and algorithm with both a bird's eye view to understand how it can be used as well as with a microscopic view to understand the mathematical concepts and to implement them to solve your own problems.", + "github": "dipanjanS/text-analytics-with-python", + "cover": "https://i.imgur.com/AOmzZu8.png", + "url": "https://www.amazon.com/Text-Analytics-Python-Real-World-Actionable/dp/148422387X", + "author": "Dipanjan Sarkar", + "category": ["books"] + }, + { + "type": "education", + "id": "practical-ml-python", + "title": "Practical Machine Learning with Python", + "slogan": "Apress, 2017", + "description": "Master the essential skills needed to recognize and solve complex problems with machine learning and deep learning. Using real-world examples that leverage the popular Python machine learning ecosystem, this book is your perfect companion for learning the art and science of machine learning to become a successful practitioner. The concepts, techniques, tools, frameworks, and methodologies used in this book will teach you how to think, design, build, and execute machine learning systems and projects successfully.", + "github": "dipanjanS/practical-machine-learning-with-python", + "cover": "https://i.imgur.com/5F4mkt7.jpg", + "url": "https://www.amazon.com/Practical-Machine-Learning-Python-Problem-Solvers/dp/1484232062", + "author": "Dipanjan Sarkar, Raghav Bali, Tushar Sharma", + "category": ["books"] + }, + { + "type": "education", + "id": "packt-nlp-computational-linguistics", + "title": "Natural Language Processing and Computational Linguistics", + "slogan": "Packt, 2018", + "description": "This book shows you how to use natural language processing, and computational linguistics algorithms, to make inferences and gain insights about data you have. These algorithms are based on statistical machine learning and artificial intelligence techniques. 
The tools to work with these algorithms are available to you right now - with Python, and tools like Gensim and spaCy.", + "cover": "https://i.imgur.com/aleMf1Y.jpg", + "url": "https://www.amazon.com/Natural-Language-Processing-Computational-Linguistics-ebook/dp/B07BWH779J", + "author": "Bhargav Srinivasa-Desikan", + "category": ["books"] + }, + { + "type": "education", + "id": "mastering-spacy", + "title": "Mastering spaCy", + "slogan": "Packt, 2021", + "description": "This is your ultimate spaCy book. Master the crucial skills to use spaCy components effectively to create real-world NLP applications with spaCy. Explaining linguistic concepts such as dependency parsing, POS-tagging and named entity extraction with many examples, this book will help you to conquer computational linguistics with spaCy. The book further focuses on ML topics with Keras and Tensorflow. You'll cover popular topics, including intent recognition, sentiment analysis and context resolution; and use them on popular datasets and interpret the results. A special hands-on section on chatbot design is included.", + "github": "PacktPublishing/Mastering-spaCy", + "cover": "https://tinyimg.io/i/aWEm0dh.jpeg", + "url": "https://www.amazon.com/Mastering-spaCy-end-end-implementing/dp/1800563353", + "author": "Duygu Altinok", + "author_links": { + "github": "DuyguA", + "website": "https://www.linkedin.com/in/duygu-altinok-4021389a" + }, + "category": ["books"] + }, + { + "type": "education", + "id": "applied-nlp-in-enterprise", + "title": "Applied Natural Language Processing in the Enterprise: Teaching Machines to Read, Write, and Understand", + "slogan": "O'Reilly, 2021", + "description": "Natural language processing (NLP) is one of the hottest topics in AI today. Having lagged behind other deep learning fields such as computer vision for years, NLP only recently gained mainstream popularity. Even though Google, Facebook, and OpenAI have open sourced large pretrained language models to make NLP easier, many organizations today still struggle with developing and productionizing NLP applications. This hands-on guide helps you learn the field quickly.", + "github": "nlpbook/nlpbook", + "cover": "https://i.imgur.com/6RxLBvf.jpg", + "url": "https://www.amazon.com/dp/149206257X", + "author": "Ankur A. Patel", + "author_links": { + "github": "aapatel09", + "website": "https://www.ankurapatel.io" + }, + "category": ["books"] + }, + { + "type": "education", + "id": "learning-path-spacy", + "title": "Learning Path: Mastering spaCy for Natural Language Processing", + "slogan": "O'Reilly, 2017", + "description": "spaCy, a fast, user-friendly library for teaching computers to understand text, simplifies NLP techniques, such as speech tagging and syntactic dependencies, so you can easily extract information, attributes, and objects from massive amounts of text to then document, measure, and analyze. This Learning Path is a hands-on introduction to using spaCy to discover insights through natural language processing. While end-to-end natural language processing solutions can be complex, you’ll learn the linguistics, algorithms, and machine learning skills to get the job done.", + "url": "https://www.safaribooksonline.com/library/view/learning-path-mastering/9781491986653/", + "thumb": "https://i.imgur.com/9MIgMAc.jpg", + "author": "Aaron Kramer", + "category": ["courses"] + }, + { + "type": "education", + "id": "introduction-into-spacy-3", + "title": "Introduction to spaCy 3", + "slogan": "A free course for beginners by Dr. W.J.B. 
Mattingly", + "url": "http://spacy.pythonhumanities.com/", + "thumb": "https://spacy.pythonhumanities.com/_static/freecodecamp_small.jpg", + "author": "Dr. W.J.B. Mattingly", + "category": ["courses"] + }, + { + "type": "education", + "id": "spacy-course", + "title": "Advanced NLP with spaCy", + "slogan": "A free online course", + "description": "In this free interactive course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", + "url": "https://course.spacy.io", + "image": "https://i.imgur.com/JC00pHW.jpg", + "thumb": "https://i.imgur.com/5RXLtrr.jpg", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + }, + "category": ["courses"] + }, + { + "type": "education", + "id": "applt-course", + "title": "Applied Language Technology", + "slogan": "NLP for newcomers using spaCy and Stanza", + "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.", + "url": "https://applied-language-technology.mooc.fi", + "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", + "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png", + "author": "Tuomo Hiippala", + "author_links": { + "twitter": "tuomo_h", + "github": "thiippal", + "website": "https://www.mv.helsinki.fi/home/thiippal/" + }, + "category": ["courses"] + }, + { + "type": "education", + "id": "video-spacys-ner-model", + "title": "spaCy's NER model", + "slogan": "Incremental parsing with bloom embeddings and residual CNNs", + "description": "spaCy v2.0's Named Entity Recognition system features a sophisticated word embedding strategy using subword features and \"Bloom\" embeddings, a deep convolutional neural network with residual connections, and a novel transition-based approach to named entity parsing. The system is designed to give a good balance of efficiency, accuracy and adaptability. In this talk, I sketch out the components of the system, explaining the intuition behind the various choices. I also give a brief introduction to the named entity recognition problem, with an overview of what else Explosion AI is working on, and why.", + "youtube": "sqDHBH9IjRU", + "author": "Matthew Honnibal", + "author_links": { + "twitter": "honnibal", + "github": "honnibal", + "website": "https://explosion.ai" + }, + "category": ["videos"] + }, + { + "type": "education", + "id": "video-new-nlp-solutions", + "title": "Building new NLP solutions with spaCy and Prodigy", + "slogan": "PyData Berlin 2018", + "description": "In this talk, I will discuss how to address some of the most likely causes of failure for new Natural Language Processing (NLP) projects. 
My main recommendation is to take an iterative approach: don't assume you know what your pipeline should look like, let alone your annotation schemes or model architectures.", + "author": "Matthew Honnibal", + "author_links": { + "twitter": "honnibal", + "github": "honnibal", + "website": "https://explosion.ai" + }, + "youtube": "jpWqz85F_4Y", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-modern-nlp-in-python", + "title": "Modern NLP in Python", + "slogan": "PyData DC 2016", + "description": "Academic and industry research in Natural Language Processing (NLP) has progressed at an accelerating pace over the last several years. Members of the Python community have been hard at work moving cutting-edge research out of papers and into open source, \"batteries included\" software libraries that can be applied to practical problems. We'll explore some of these tools for modern NLP in Python.", + "author": "Patrick Harrison", + "youtube": "6zm9NC9uRkk", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-course", + "title": "Advanced NLP with spaCy · A free online course", + "description": "spaCy is a modern Python library for industrial-strength Natural Language Processing. In this free and interactive online course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", + "url": "https://course.spacy.io/en", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines" + }, + "youtube": "THduWAnG97k", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-course-de", + "title": "Modernes NLP mit spaCy · Ein Gratis-Onlinekurs", + "description": "spaCy ist eine moderne Python-Bibliothek für industriestarkes Natural Language Processing. In diesem kostenlosen und interaktiven Onlinekurs lernst du, mithilfe von spaCy fortgeschrittene Systeme für die Analyse natürlicher Sprache zu entwickeln und dabei sowohl regelbasierte Verfahren, als auch moderne Machine-Learning-Technologie einzusetzen.", + "url": "https://course.spacy.io/de", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines" + }, + "youtube": "K1elwpgDdls", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-course-es", + "title": "NLP avanzado con spaCy · Un curso en línea gratis", + "description": "spaCy es un paquete moderno de Python para hacer Procesamiento de Lenguaje Natural de potencia industrial. En este curso en línea, interactivo y gratuito, aprenderás a usar spaCy para construir sistemas avanzados de comprensión de lenguaje natural usando enfoques basados en reglas y en machine learning.", + "url": "https://course.spacy.io/es", + "author": "Camila Gutiérrez", + "author_links": { + "twitter": "Mariacamilagl30" + }, + "youtube": "RNiLVCE5d4k", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-1", + "title": "Intro to NLP with spaCy (1)", + "slogan": "Episode 1: Data exploration", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "WnGPv6HnBok", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-2", + "title": "Intro to NLP with spaCy (2)", + "slogan": "Episode 2: Rule-based Matching", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "KL4-Mpgbahw", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-3", + "title": "Intro to NLP with spaCy (3)", + "slogan": "Episode 3: Evaluation", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "4V0JDdohxAk", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-4", + "title": "Intro to NLP with spaCy (4)", + "slogan": "Episode 4: Named Entity Recognition", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "IqOJU1-_Fi0", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-intro-to-nlp-episode-5", + "title": "Intro to NLP with spaCy (5)", + "slogan": "Episode 5: Rules vs. Machine Learning", + "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text.
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", + "author": "Vincent Warmerdam", + "author_links": { + "twitter": "fishnets88", + "github": "koaning" + }, + "youtube": "f4sqeLRzkPg", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-entity-linking", + "title": "Entity Linking functionality in spaCy", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Sofie Van Landeghem", + "author_links": { + "twitter": "OxyKodit", + "github": "svlandeg" + }, + "youtube": "PW3RJM8tDGo", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-lemmatization", + "title": "Rethinking rule-based lemmatization", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Guadalupe Romero", + "author_links": { + "twitter": "_guadiromero", + "github": "guadi1994" + }, + "youtube": "88zcQODyuko", + "category": ["videos"] + }, + { + "type": "education", + "id": "video-spacy-irl-scispacy", + "title": "ScispaCy: A spaCy pipeline & models for scientific & biomedical text", + "slogan": "spaCy IRL 2019", + "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", + "author": "Mark Neumann", + "author_links": { + "twitter": "MarkNeumannnn", + "github": "DeNeutoy" + }, + "youtube": "2_HSKDALwuw", + "category": ["videos"] + }, + { + "type": "education", + "id": "podcast-nlp-highlights", + "title": "NLP Highlights #78: Where do corpora come from?", + "slogan": "January 2019", + "description": "Most NLP projects rely crucially on the quality of annotations used for training and evaluating models. In this episode, Matt and Ines of Explosion AI tell us how Prodigy can improve data annotation and model development workflows. Prodigy is an annotation tool implemented as a Python library, and it comes with a web application and a command line interface. A developer can define input data streams and design simple annotation interfaces. Prodigy can help break down complex annotation decisions into a series of binary decisions, and it provides easy integration with spaCy models. Developers can specify how models should be modified as new annotations come in in an active learning framework.", + "soundcloud": "559200912", + "thumb": "https://i.imgur.com/hOBQEzc.jpg", + "url": "https://soundcloud.com/nlp-highlights/78-where-do-corpora-come-from-with-matt-honnibal-and-ines-montani", + "author": "Matt Gardner, Waleed Ammar (Allen AI)", + "author_links": { + "website": "https://soundcloud.com/nlp-highlights" + }, + "category": ["podcasts"] + }, + { + "type": "education", + "id": "podcast-init", + "title": "Podcast.__init__ #87: spaCy with Matthew Honnibal", + "slogan": "December 2017", + "description": "As the amount of text available on the internet and in businesses continues to increase, the need for fast and accurate language analysis becomes more prominent.
This week Matthew Honnibal, the creator of spaCy, talks about his experiences researching natural language processing and creating a library to make his findings accessible to industry.", + "iframe": "https://www.pythonpodcast.com/wp-content/plugins/podlove-podcasting-plugin-for-wordpress/lib/modules/podlove_web_player/player_v4/dist/share.html?episode=https://www.pythonpodcast.com/?podlove_player4=176", + "iframe_height": 200, + "thumb": "https://i.imgur.com/rpo6BuY.png", + "url": "https://www.podcastinit.com/episode-87-spacy-with-matthew-honnibal/", + "author": "Tobias Macey", + "author_links": { + "website": "https://www.podcastinit.com" + }, + "category": ["podcasts"] + }, + { + "type": "education", + "id": "podcast-init2", + "title": "Podcast.__init__ #256: An Open Source Toolchain For NLP From Explosion AI", + "slogan": "March 2020", + "description": "The state of the art in natural language processing is a constantly moving target. With the rise of deep learning, previously cutting edge techniques have given way to robust language models. Through it all the team at Explosion AI have built a strong presence with the trifecta of spaCy, Thinc, and Prodigy to support fast and flexible data labeling to feed deep learning models and performant and scalable text processing. In this episode founder and open source author Matthew Honnibal shares his experience growing a business around cutting edge open source libraries for the machine learning developent process.", + "iframe": "https://cdn.podlove.org/web-player/share.html?episode=https%3A%2F%2Fwww.pythonpodcast.com%2F%3Fpodlove_player4%3D614", + "iframe_height": 200, + "thumb": "https://i.imgur.com/rpo6BuY.png", + "url": "https://www.pythonpodcast.com/explosion-ai-natural-language-processing-episode-256/", + "author": "Tobias Macey", + "author_links": { + "website": "https://www.podcastinit.com" + }, + "category": ["podcasts"] + }, + { + "type": "education", + "id": "talk-python-podcast", + "title": "Talk Python #202: Building a software business", + "slogan": "March 2019", + "description": "One core question around open source is how do you fund it? Well, there is always that PayPal donate button. But that's been a tremendous failure for many projects. Often the go-to answer is consulting. But what if you don't want to trade time for money? You could take things up a notch and change the equation, exchanging value for money. That's what Ines Montani and her co-founder did when they started Explosion AI with spaCy as the foundation.", + "thumb": "https://i.imgur.com/q1twuK8.png", + "url": "https://talkpython.fm/episodes/show/202/building-a-software-business", + "soundcloud": "588364857", + "author": "Michael Kennedy", + "author_links": { + "website": "https://talkpython.fm/" + }, + "category": ["podcasts"] + }, + { + "type": "education", + "id": "twimlai-podcast", + "title": "TWiML & AI: Practical NLP with spaCy and Prodigy", + "slogan": "May 2019", + "description": "\"Ines and I caught up to discuss her various projects, including the aforementioned spaCy, an open-source NLP library built with a focus on industry and production use cases. In our conversation, Ines gives us an overview of the spaCy Library, a look at some of the use cases that excite her, and the Spacy community and contributors. 
We also discuss her work with Prodigy, an annotation service tool that uses continuous active learning to train models, and finally, what other exciting projects she is working on.\"", + "thumb": "https://i.imgur.com/ng2F5gK.png", + "url": "https://twimlai.com/twiml-talk-262-practical-natural-language-processing-with-spacy-and-prodigy-w-ines-montani", + "iframe": "https://html5-player.libsyn.com/embed/episode/id/9691514/height/90/theme/custom/thumbnail/no/preload/no/direction/backward/render-playlist/no/custom-color/3e85b1/", + "iframe_height": 90, + "author": "Sam Charrington", + "author_links": { + "website": "https://twimlai.com" + }, + "category": ["podcasts"] + }, + { + "type": "education", + "id": "analytics-vidhya", + "title": "DataHack Radio #23: The Brains behind spaCy", + "slogan": "June 2019", + "description": "\"What would you do if you had the chance to pick the brains behind one of the most popular Natural Language Processing (NLP) libraries of our era? A library that has helped usher in the current boom in NLP applications and nurtured tons of NLP scientists? Well – you invite the creators on our popular DataHack Radio podcast and let them do the talking! We are delighted to welcome Ines Montani and Matt Honnibal, the developers of spaCy – a powerful and advanced library for NLP.\"", + "thumb": "https://i.imgur.com/3zJKZ1P.jpg", + "url": "https://www.analyticsvidhya.com/blog/2019/06/datahack-radio-ines-montani-matthew-honnibal-brains-behind-spacy/", + "soundcloud": "630741825", + "author": "Analytics Vidhya", + "author_links": { + "website": "https://www.analyticsvidhya.com", + "twitter": "analyticsvidhya" + }, + "category": ["podcasts"] + }, + { + "type": "education", + "id": "practical-ai-podcast", + "title": "Practical AI: Modern NLP with spaCy", + "slogan": "December 2019", + "description": "\"spaCy is awesome for NLP! It’s easy to use, has widespread adoption, is open source, and integrates the latest language models. Ines Montani and Matthew Honnibal (core developers of spaCy and co-founders of Explosion) join us to discuss the history of the project, its capabilities, and the latest trends in NLP. We also dig into the practicalities of taking NLP workflows to production. 
You don’t want to miss this episode!\"", + "thumb": "https://i.imgur.com/jn8Bcdw.png", + "url": "https://changelog.com/practicalai/68", + "author": "Daniel Whitenack & Chris Benson", + "author_links": { + "website": "https://changelog.com/practicalai", + "twitter": "https://twitter.com/PracticalAIFM" + }, + "category": ["podcasts"] + }, + { + "type": "education", + "id": "video-entity-linking", + "title": "Training a custom entity linking model with spaCy", + "author": "Sofie Van Landeghem", + "author_links": { + "twitter": "OxyKodit", + "github": "svlandeg" + }, + "youtube": "8u57WSXVpmw", + "category": ["videos"] + }, + { + "id": "adam_qas", + "title": "ADAM: Question Answering System", + "slogan": "A question answering system that extracts answers from Wikipedia to questions posed in natural language.", + "github": "5hirish/adam_qas", + "pip": "qas", + "code_example": [ + "git clone https://github.com/5hirish/adam_qas.git", + "cd adam_qas", + "pip install -r requirements.txt", + "python -m qas.adam 'When was linux kernel version 4.0 released ?'" + ], + "code_language": "bash", + "thumb": "https://shirishkadam.files.wordpress.com/2018/04/mini_alleviate.png", + "author": "Shirish Kadam", + "author_links": { + "twitter": "5hirish", + "github": "5hirish", + "website": "https://shirishkadam.com/" + }, + "category": ["standalone"], + "tags": ["question-answering", "elasticsearch"] + }, + { + "id": "epitator", + "title": "EpiTator", + "thumb": "https://i.imgur.com/NYFY1Km.jpg", + "slogan": "Extracts case counts, resolved location/species/disease names, date ranges and more", + "description": "EcoHealth Alliance uses EpiTator to catalog the what, where and when of infectious disease case counts reported in online news. Each of these aspects is extracted using independent annotators that can be applied to other domains. EpiTator organizes annotations by creating \"AnnoTiers\" for each type. AnnoTiers have methods for manipulating, combining and searching annotations. For instance, the `with_following_spans_from()` method can be used to create a new tier that combines a tier of one type (such as numbers), with another (say, kitchenware). The resulting tier will contain all the phrases in the document that match that pattern, like \"5 plates\" or \"2 cups.\"\n\nAnother commonly used method is `group_spans_by_containing_span()` which can be used to do things like find all the spaCy tokens in all the GeoNames a document mentions. spaCy tokens, named entities, sentences and noun chunks are exposed through the spaCy annotator which will create an AnnoTier for each. These are the basis of many of the other annotators. EpiTator also includes an annotator for extracting tables embedded in free text articles.
Another neat feature is that the lexicons used for entity resolution are all stored in an embedded sqlite database so there is no need to run any external services in order to use EpiTator.", + "url": "https://github.com/ecohealthalliance/EpiTator", + "github": "ecohealthalliance/EpiTator", + "pip": "EpiTator", + "code_example": [ + "from epitator.annotator import AnnoDoc", + "from epitator.geoname_annotator import GeonameAnnotator", + "", + "doc = AnnoDoc('Where is Chiang Mai?')", + "geoname_annotier = doc.require_tiers('geonames', via=GeonameAnnotator)", + "geoname = geoname_annotier.spans[0].metadata['geoname']", + "geoname['name']", + "# = 'Chiang Mai'", + "geoname['geonameid']", + "# = '1153671'", + "geoname['latitude']", + "# = 18.79038", + "geoname['longitude']", + "# = 98.98468", + "", + "from epitator.spacy_annotator import SpacyAnnotator", + "spacy_token_tier = doc.require_tiers('spacy.tokens', via=SpacyAnnotator)", + "list(geoname_annotier.group_spans_by_containing_span(spacy_token_tier))", + "# = [(AnnoSpan(9-19, Chiang Mai), [AnnoSpan(9-15, Chiang), AnnoSpan(16-19, Mai)])]" + ], + "author": "EcoHealth Alliance", + "author_links": { + "github": "ecohealthalliance", + "website": " https://ecohealthalliance.org/" + }, + "category": ["scientific", "standalone"] + }, + { + "id": "self-attentive-parser", + "title": "Berkeley Neural Parser", + "slogan": "Constituency Parsing with a Self-Attentive Encoder (ACL 2018)", + "description": "A Python implementation of the parsers described in *\"Constituency Parsing with a Self-Attentive Encoder\"* from ACL 2018.", + "url": "https://arxiv.org/abs/1805.01052", + "github": "nikitakit/self-attentive-parser", + "pip": "benepar", + "code_example": [ + "import benepar, spacy", + "nlp = spacy.load('en_core_web_md')", + "nlp.add_pipe('benepar', config={'model': 'benepar_en3'})", + "doc = nlp('The time for action is now. It is never too late to do something.')", + "sent = list(doc.sents)[0]", + "print(sent._.parse_string)", + "# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))", + "print(sent._.labels)", + "# ('S',)", + "print(list(sent._.children)[0])", + "# The time for action" + ], + "author": "Nikita Kitaev", + "author_links": { + "github": "nikitakit", + "website": " http://kitaev.io" + }, + "category": ["research", "pipeline"] + }, + { + "id": "excelcy", + "title": "ExcelCy", + "slogan": "Excel Integration with spaCy. Training NER using XLSX from PDF, DOCX, PPT, PNG or JPG.", + "description": "ExcelCy is a toolkit to integrate Excel to spaCy NLP training experiences. Training NER using XLSX from PDF, DOCX, PPT, PNG or JPG. 
ExcelCy has pipeline to match Entity with PhraseMatcher or Matcher in regular expression.", + "url": "https://github.com/kororo/excelcy", + "github": "kororo/excelcy", + "pip": "excelcy", + "code_example": [ + "from excelcy import ExcelCy", + "# collect sentences, annotate Entities and train NER using spaCy", + "excelcy = ExcelCy.execute(file_path='https://github.com/kororo/excelcy/raw/master/tests/data/test_data_01.xlsx')", + "# use the nlp object as per spaCy API", + "doc = excelcy.nlp('Google rebrands its business apps')", + "# or save it for faster bootstrap for application", + "excelcy.nlp.to_disk('/model')" + ], + "author": "Robertus Johansyah", + "author_links": { + "github": "kororo" + }, + "category": ["training"], + "tags": ["excel"] + }, + { + "id": "spacy-graphql", + "title": "spacy-graphql", + "slogan": "Query spaCy's linguistic annotations using GraphQL", + "github": "ines/spacy-graphql", + "description": "A very simple and experimental app that lets you query spaCy's linguistic annotations using [GraphQL](https://graphql.org/). The API currently supports most token attributes, named entities, sentences and text categories (if available as `doc.cats`, i.e. if you added a text classifier to a model). The `meta` field will return the model meta data. Models are only loaded once and kept in memory.", + "url": "https://explosion.ai/demos/spacy-graphql", + "category": ["apis"], + "tags": ["graphql"], + "thumb": "https://i.imgur.com/xC7zpTO.png", + "code_example": [ + "{", + " nlp(text: \"Zuckerberg is the CEO of Facebook.\", model: \"en_core_web_sm\") {", + " meta {", + " lang", + " description", + " }", + " doc {", + " text", + " tokens {", + " text", + " pos_", + " }", + " ents {", + " text", + " label_", + " }", + " }", + " }", + "}" + ], + "code_language": "json", + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + } + }, + { + "id": "spacy-js", + "title": "spacy-js", + "slogan": "JavaScript API for spaCy with Python REST API", + "github": "ines/spacy-js", + "description": "JavaScript interface for accessing linguistic annotations provided by spaCy. This project is mostly experimental and was developed for fun to play around with different ways of mimicking spaCy's Python API.\n\nThe results will still be computed in Python and made available via a REST API. 
The JavaScript API resembles spaCy's Python API as closely as possible (with a few exceptions, as the values are all pre-computed and it's tricky to express complex recursive relationships).", + "code_language": "javascript", + "code_example": [ + "const spacy = require('spacy');", + "", + "(async function() {", + " const nlp = spacy.load('en_core_web_sm');", + " const doc = await nlp('This is a text about Facebook.');", + " for (let ent of doc.ents) {", + " console.log(ent.text, ent.label);", + " }", + " for (let token of doc) {", + " console.log(token.text, token.pos, token.head.text);", + " }", + "})();" + ], + "author": "Ines Montani", + "author_links": { + "twitter": "_inesmontani", + "github": "ines", + "website": "https://ines.io" + }, + "category": ["nonpython"], + "tags": ["javascript"] + }, + { + "id": "spacy-raspberry", + "title": "spacy-raspberry", + "slogan": "64bit Raspberry Pi image for spaCy and neuralcoref", + "github": "boehm-e/spacy-raspberry", + "thumb": "https://i.imgur.com/VCJMrE6.png", + "image": "https://raw.githubusercontent.com/boehm-e/spacy-raspberry/master/imgs/preview.png", + "author": "Erwan Boehm", + "author_links": { + "github": "boehm-e" + }, + "category": ["apis"], + "tags": ["raspberrypi"] + }, + { + "id": "spacy-wordnet", + "title": "spacy-wordnet", + "slogan": "WordNet meets spaCy", + "description": "`spacy-wordnet` creates annotations that easily allow the use of WordNet and [WordNet Domains](http://wndomains.fbk.eu/) by using the [NLTK WordNet interface](http://www.nltk.org/howto/wordnet.html)", + "github": "recognai/spacy-wordnet", + "tags": ["wordnet", "synsets"], + "thumb": "https://i.imgur.com/ud4C7cj.png", + "code_example": [ + "import spacy", + "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ", + "", + "# Load an spacy model (supported models are \"es\" and \"en\") ", + "nlp = spacy.load('en')", + "# Spacy 3.x", + "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})", + "# Spacy 2.x", + "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", + "token = nlp('prices')[0]", + "", + "# wordnet object link spacy token with nltk wordnet interface by giving acces to", + "# synsets and lemmas ", + "token._.wordnet.synsets()", + "token._.wordnet.lemmas()", + "", + "# And automatically tags with wordnet domains", + "token._.wordnet.wordnet_domains()" + ], + "author": "recognai", + "author_links": { + "github": "recognai", + "twitter": "recogn_ai", + "website": "https://recogn.ai" + }, + "category": ["pipeline"] + }, + { + "id": "spacy-conll", + "title": "spacy_conll", + "slogan": "Parsing to CoNLL with spaCy, spacy-stanza, and spacy-udpipe", + "description": "This module allows you to parse text into CoNLL-U format. You can use it as a command line tool, or embed it in your own scripts by adding it as a custom pipeline component to a spaCy, spacy-stanfordnlp, spacy-stanza, or spacy-udpipe pipeline. It also provides an easy-to-use function to quickly initialize a parser. CoNLL-related properties are added to Doc elements, sentence Spans, and Tokens.", + "code_example": [ + "from spacy_conll import init_parser", + "", + "", + "# Initialise English parser, already including the ConllFormatter as a pipeline component.", + "# Indicate that we want to get the CoNLL headers in the string output.", + "# `use_gpu` and `verbose` are specific to stanza (and stanfordnlp). 
These keyword arguments", + "# are passed on to their Pipeline() initialisation", + "nlp = init_parser(\"stanza\",", + " \"en\",", + " parser_opts={\"use_gpu\": True, \"verbose\": False},", + " include_headers=True)", + "# Parse a given string", + "doc = nlp(\"A cookie is a baked or cooked food that is typically small, flat and sweet. It usually contains flour, sugar and some type of oil or fat.\")", + "", + "# Get the CoNLL representation of the whole document, including headers", + "conll = doc._.conll_str", + "print(conll)" + ], + "code_language": "python", + "author": "Bram Vanroy", + "author_links": { + "github": "BramVanroy", + "twitter": "BramVanroy", + "website": "http://bramvanroy.be" + }, + "github": "BramVanroy/spacy_conll", + "category": ["standalone", "pipeline"], + "tags": ["linguistics", "computational linguistics", "conll"] + }, + { + "id": "spacy-langdetect", + "title": "spacy-langdetect", + "slogan": "A fully customizable language detection pipeline for spaCy", + "description": "This module allows you to add language detection capabilities to your spaCy pipeline. Also supports custom language detectors!", + "pip": "spacy-langdetect", + "code_example": [ + "import spacy", + "from spacy_langdetect import LanguageDetector", + "nlp = spacy.load('en')", + "nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)", + "text = 'This is an english text.'", + "doc = nlp(text)", + "# document level language detection. Think of it like average language of the document!", + "print(doc._.language)", + "# sentence level language detection", + "for sent in doc.sents:", + " print(sent, sent._.language)" + ], + "code_language": "python", + "author": "Abhijit Balaji", + "author_links": { + "github": "Abhijit-2592", + "website": "https://abhijit-2592.github.io/" + }, + "github": "Abhijit-2592/spacy-langdetect", + "category": ["pipeline"], + "tags": ["language-detection"] + }, + { + "id": "ludwig", + "title": "Ludwig", + "slogan": "A code-free deep learning toolbox", + "description": "Ludwig makes it easy to build deep learning models for many applications, including NLP ones. It uses spaCy for tokenizing text in different languages.", + "pip": "ludwig", + "github": "uber/ludwig", + "thumb": "https://i.imgur.com/j1sORgD.png", + "url": "http://ludwig.ai", + "author": "Piero Molino @ Uber AI", + "author_links": { + "github": "w4nderlust", + "twitter": "w4nderlus7", + "website": "http://w4nderlu.st" + }, + "category": ["standalone", "research"] + }, + { + "id": "pic2phrase_bot", + "title": "pic2phrase_bot: Photo Description Generator", + "slogan": "A bot that generates descriptions to submitted photos, in a human-like manner.", + "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy.", + "thumb": "https://i.imgur.com/ggVI02O.jpg", + "image": "https://i.imgur.com/z1yhWQR.jpg", + "url": "https://telegram.me/pic2phrase_bot", + "author": "Yuli Vasiliev", + "author_links": { + "twitter": "VasilievYuli" + }, + "category": ["standalone", "conversational"] + }, + { + "id": "gracyql", + "title": "gracyql", + "slogan": "A thin GraphQL wrapper around spacy", + "github": "oterrier/gracyql", + "description": "An example of a basic [Starlette](https://github.com/encode/starlette) app using [Spacy](https://github.com/explosion/spaCy) and [Graphene](https://github.com/graphql-python/graphene).
The main goal is to be able to use the amazing power of spaCy from other languages and retrieving only the information you need thanks to the GraphQL query definition. The GraphQL schema tries to mimic as much as possible the original Spacy API with classes Doc, Span and Token.", + "thumb": "https://i.imgur.com/xC7zpTO.png", + "category": ["apis"], + "tags": ["graphql"], + "code_example": [ + "query ParserDisabledQuery {", + " nlp(model: \"en\", disable: [\"parser\", \"ner\"]) {", + " doc(text: \"I live in Grenoble, France\") {", + " text", + " tokens {", + " id", + " pos", + " lemma", + " dep", + " }", + " ents {", + " start", + " end", + " label", + " }", + " }", + " }", + "}" + ], + "code_language": "json", + "author": "Olivier Terrier", + "author_links": { + "github": "oterrier" + } + }, + { + "id": "pyInflect", + "slogan": "A Python module for word inflections", + "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add word inflections to the system.", + "github": "bjascob/pyInflect", + "pip": "pyinflect", + "code_example": [ + "import spacy", + "import pyinflect", + "", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('This is an example.')", + "doc[3].tag_ # NN", + "doc[3]._.inflect('NNS') # examples" + ], + "author": "Brad Jascob", + "author_links": { + "github": "bjascob" + }, + "category": ["pipeline"], + "tags": ["inflection"] + }, + { + "id": "lemminflect", + "slogan": "A Python module for English lemmatization and inflection", + "description": "LemmInflect uses a dictionary approach to lemmatize English words and inflect them into forms specified by a user supplied [Universal Dependencies](https://universaldependencies.org/u/pos/) or [Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) tag. The library works with out-of-vocabulary (OOV) words by applying neural network techniques to classify word forms and choose the appropriate morphing rules. The system acts as a standalone module or as an extension to spaCy.", + "github": "bjascob/LemmInflect", + "pip": "lemminflect", + "thumb": "https://raw.githubusercontent.com/bjascob/LemmInflect/master/docs/img/icons8-citrus-80.png", + "code_example": [ + "import spacy", + "import lemminflect", + "", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('I am testing this example.')", + "doc[2]._.lemma() # 'test'", + "doc[4]._.inflect('NNS') # 'examples'" + ], + "author": "Brad Jascob", + "author_links": { + "github": "bjascob" + }, + "category": ["pipeline"], + "tags": ["inflection", "lemmatizer"] + }, + { + "id": "amrlib", + "slogan": "A python library that makes AMR parsing, generation and visualization simple.", + "description": "amrlib is a python module and spaCy add-in for Abstract Meaning Representation (AMR). The system can parse sentences to AMR graphs or generate text from existing graphs. It includes a GUI for visualization and experimentation.", + "github": "bjascob/amrlib", + "pip": "amrlib", + "code_example": [ + "import spacy", + "import amrlib", + "amrlib.setup_spacy_extension()", + "nlp = spacy.load('en_core_web_sm')", + "doc = nlp('This is a test of the spaCy extension. 
The test has multiple sentences.')", + "graphs = doc._.to_amr()", + "for graph in graphs:", + " print(graph)" + ], + "author": "Brad Jascob", + "author_links": { + "github": "bjascob" + }, + "category": ["pipeline"] + }, + { + "id": "blackstone", + "title": "Blackstone", + "slogan": "A spaCy pipeline and model for NLP on unstructured legal text", + "description": "Blackstone is a spaCy model and library for processing long-form, unstructured legal text. Blackstone is an experimental research project from the [Incorporated Council of Law Reporting for England and Wales'](https://iclr.co.uk/) research lab, [ICLR&D](https://research.iclr.co.uk/).", + "github": "ICLRandD/Blackstone", + "pip": "blackstone", + "thumb": "https://iclr.s3-eu-west-1.amazonaws.com/assets/iclrand/Blackstone/thumb.png", + "url": "https://research.iclr.co.uk", + "author": " ICLR&D", + "author_links": { + "github": "ICLRandD", + "twitter": "ICLRanD", + "website": "https://research.iclr.co.uk" + }, + "category": ["scientific", "models", "research"] + }, + { + "id": "NGym", + "title": "NeuralGym", + "slogan": "A little Windows GUI for training models with spaCy", + "description": "NeuralGym is a Python application for Windows with a graphical user interface to train models with spaCy. Run the application, select an output folder, a training data file in spaCy's data format, a spaCy model or blank model and press 'Start'.", + "github": "d5555/NeuralGym", + "url": "https://github.com/d5555/NeuralGym", + "image": "https://github.com/d5555/NeuralGym/raw/master/NGym.png", + "thumb": "https://github.com/d5555/NeuralGym/raw/master/NGym/web.png", + "author": "d5555", + "category": ["training"], + "tags": ["windows"] + }, + { + "id": "holmes", + "title": "Holmes", + "slogan": "Information extraction from English and German texts based on predicate logic", + "github": "msg-systems/holmes-extractor", + "url": "https://github.com/msg-systems/holmes-extractor", + "description": "Holmes is a Python 3 library that supports a number of use cases involving information extraction from English and German texts, including chatbot, structural extraction, topic matching and supervised document classification. There is a [website demonstrating intelligent search based on topic matching](https://holmes-demo.xt.msg.team).", + "pip": "holmes-extractor", + "category": ["conversational", "standalone"], + "tags": ["chatbots", "text-processing"], + "thumb": "https://raw.githubusercontent.com/msg-systems/holmes-extractor/master/docs/holmes_thumbnail.png", + "code_example": [ + "import holmes_extractor as holmes", + "holmes_manager = holmes.Manager(model='en_core_web_lg')", + "holmes_manager.register_search_phrase('A big dog chases a cat')", + "holmes_manager.start_chatbot_mode_console()" + ], + "author": "Richard Paul Hudson", + "author_links": { + "github": "richardpaulhudson" + } + }, + { + "id": "coreferee", + "title": "Coreferee", + "slogan": "Coreference resolution for multiple languages", + "github": "msg-systems/coreferee", + "url": "https://github.com/msg-systems/coreferee", + "description": "Coreferee is a pipeline plugin that performs coreference resolution for English, German and Polish. It is designed so that it is easy to add support for new languages and optimised for limited training data. It uses a mixture of neural networks and programmed rules. 
Please note you will need to [install models](https://github.com/msg-systems/coreferee#getting-started) before running the code example.", + "pip": "coreferee", + "category": ["pipeline", "models", "standalone"], + "tags": ["coreference-resolution", "anaphora"], + "code_example": [ + "import coreferee, spacy", + "nlp = spacy.load('en_core_web_trf')", + "nlp.add_pipe('coreferee')", + "doc = nlp('Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.')", + "doc._.coref_chains.print()", + "# Output:", + "#", + "# 0: he(1), his(6), Peter(9), He(16), his(18)", + "# 1: work(7), it(14)", + "# 2: [He(16); wife(19)], they(21), They(26), they(31)", + "# 3: Spain(29), country(34)", + "#", + "print(doc._.coref_chains.resolve(doc[31]))", + "# Output:", + "#", + "# [Peter, wife]" + ], + "author": "Richard Paul Hudson", + "author_links": { + "github": "richardpaulhudson" + } + }, + { + "id": "spacy-transformers", + "title": "spacy-transformers", + "slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2", + "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `transformers`](https://github.com/huggingface/transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.", + "github": "explosion/spacy-transformers", + "url": "https://explosion.ai/blog/spacy-transformers", + "pip": "spacy-transformers", + "category": ["pipeline", "models", "research"], + "code_example": [ + "import spacy", + "", + "nlp = spacy.load(\"en_core_web_trf\")", + "doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")" + ], + "author": "Explosion", + "author_links": { + "twitter": "explosion_ai", + "github": "explosion", + "website": "https://explosion.ai" + } + }, + { + "id": "spacy-huggingface-hub", + "title": "spacy-huggingface-hub", + "slogan": "Push your spaCy pipelines to the Hugging Face Hub", + "description": "This package provides a CLI command for uploading any trained spaCy pipeline packaged with [`spacy package`](https://spacy.io/api/cli#package) to the [Hugging Face Hub](https://huggingface.co). 
It auto-generates all meta information for you, uploads a pretty README (requires spaCy v3.1+) and handles version control under the hood.", + "github": "explosion/spacy-huggingface-hub", + "thumb": "https://i.imgur.com/j6FO9O6.jpg", + "url": "https://github.com/explosion/spacy-huggingface-hub", + "pip": "spacy-huggingface-hub", + "category": ["pipeline", "models"], + "author": "Explosion", + "author_links": { + "twitter": "explosion_ai", + "github": "explosion", + "website": "https://explosion.ai" + } + }, + { + "id": "spacy-clausie", + "title": "spacy-clausie", + "slogan": "Implementation of the ClausIE information extraction system for Python+spaCy", + "github": "mmxgn/spacy-clausie", + "url": "https://github.com/mmxgn/spacy-clausie", + "description": "ClausIE, a novel, clause-based approach to open information extraction, which extracts relations and their arguments from natural language text", + "category": ["pipeline", "scientific", "research"], + "code_example": [ + "import spacy", + "import claucy", + "", + "nlp = spacy.load(\"en\")", + "claucy.add_to_pipe(nlp)", + "", + "doc = nlp(\"AE died in Princeton in 1955.\")", + "", + "print(doc._.clauses)", + "# Output:", + "# ", + "", + "propositions = doc._.clauses[0].to_propositions(as_text=True)", + "", + "print(propositions)", + "# Output:", + "# [AE died in Princeton in 1955, AE died in 1955, AE died in Princeton" + ], + "author": "Emmanouil Theofanis Chourdakis", + "author_links": { + "github": "mmxgn" + } + }, + { + "id": "ipymarkup", + "slogan": "NER, syntax markup visualizations", + "description": "Collection of NLP visualizations for NER and syntax tree markup. Similar to [displaCy](https://explosion.ai/demos/displacy) and [displaCy ENT](https://explosion.ai/demos/displacy-ent).", + "github": "natasha/ipymarkup", + "image": "https://github.com/natasha/ipymarkup/blob/master/table.png?raw=true", + "pip":"pip install ipymarkup", + "code_example": [ + "from ipymarkup import show_span_ascii_markup, show_dep_ascii_markup", + "", + "text = 'В мероприятии примут участие не только российские учёные, но и зарубежные исследователи, в том числе, Крис Хелмбрехт - управляющий директор и совладелец креативного агентства Kollektiv (Германия, США), Ннека Угбома - руководитель проекта Mushroom works (Великобритания), Гергей Ковач - политик и лидер субкультурной партии «Dog with two tails» (Венгрия), Георг Жено - немецкий режиссёр, один из создателей экспериментального театра «Театр.doc», Театра им. Йозефа Бойса (Германия).'", + "spans = [(102, 116, 'PER'), (186, 194, 'LOC'), (196, 199, 'LOC'), (202, 214, 'PER'), (254, 268, 'LOC'), (271, 283, 'PER'), (324, 342, 'ORG'), (345, 352, 'LOC'), (355, 365, 'PER'), (445, 455, 'ORG'), (456, 468, 'PER'), (470, 478, 'LOC')]", + "show_span_ascii_markup(text, spans)" + ], + "author": "Alexander Kukushkin", + "author_links": { + "github": "kuk" + }, + "category": ["visualizers"] + }, + { + "id": "negspacy", + "title": "negspaCy", + "slogan": "spaCy pipeline object for negating concepts in text based on the NegEx algorithm.", + "github": "jenojp/negspacy", + "url": "https://github.com/jenojp/negspacy", + "description": "negspacy is a spaCy pipeline component that evaluates whether Named Entities are negated in text. 
It adds an extension to 'Span' objects.", + "pip": "negspacy", + "category": ["pipeline", "scientific"], + "tags": ["negation", "text-processing"], + "thumb": "https://github.com/jenojp/negspacy/blob/master/docs/thumb.png?raw=true", + "image": "https://github.com/jenojp/negspacy/blob/master/docs/icon.png?raw=true", + "code_example": [ + "import spacy", + "from negspacy.negation import Negex", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"negex\", config={\"ent_types\":[\"PERSON\",\"ORG\"]})", + "", + "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")", + "for e in doc.ents:", + " print(e.text, e._.negex)" + ], + "author": "Jeno Pizarro", + "author_links": { + "github": "jenojp", + "twitter": "jenojp" + } + }, + { + "id": "ronec", + "title": "RONEC - Romanian Named Entity Corpus", + "slogan": "Named Entity Recognition corpus for the Romanian language.", + "github": "dumitrescustefan/ronec", + "url": "https://github.com/dumitrescustefan/ronec", + "description": "The corpus holds 5127 sentences, annotated with 16 classes, with a total of 26376 annotated entities. The corpus comes in two formats: BRAT and CONLLUP.", + "category": ["standalone", "models"], + "tags": ["ner", "romanian"], + "thumb": "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/res/thumb.png", + "code_example": [ + "# to train a new model on ronec", + "python3 convert_spacy.py ronec/conllup/ronec.conllup output", + "python3 -m spacy train ro models output/train_ronec.json output/train_ronec.json -p ent", + "", + "# download the Romanian NER model", + "python -m spacy download ro_ner", + "", + "# load the model and print entities for a simple sentence", + "import spacy", + "", + "nlp = spacy.load(\"ro_ner\")", + "doc = nlp(\"Popescu Ion a fost la Cluj\")", + "", + "for ent in doc.ents:", + "\tprint(ent.text, ent.start_char, ent.end_char, ent.label_)" + ], + "author": "Stefan Daniel Dumitrescu, Andrei-Marius Avram" + }, + { + "id": "num_fh", + "title": "Numeric Fused-Head", + "slogan": "Numeric Fused-Head Identification and Resolution in English", + "description": "This package provides a wrapper for the Numeric Fused-Head in English.
It provides another information layer on numbers that refer to another entity which is not obvious from the syntactic tree.", + "github": "yanaiela/num_fh", + "pip": "num_fh", + "category": ["pipeline", "research"], + "code_example": [ + "import spacy", + "from num_fh import NFH", + "nlp = spacy.load('en_core_web_sm')", + "nfh = NFH(nlp)", + "nlp.add_pipe(nfh, first=False)", + "doc = nlp(\"I told you two, that only one of them is the one who will get 2 or 3 icecreams\")", + "", + "assert doc[16]._.is_nfh == True", + "assert doc[18]._.is_nfh == False", + "assert doc[3]._.is_deter_nfh == True", + "assert doc[16]._.is_deter_nfh == False", + "assert len(doc._.nfh) == 4" + ], + "author": "Yanai Elazar", + "author_links": { + "github": "yanaiela", + "twitter": "yanaiela", + "website": "https://yanaiela.github.io" + } + }, + { + "id": "Healthsea", + "title": "Healthsea", + "slogan": "Healthsea: an end-to-end spaCy pipeline for exploring health supplement effects", + "description": "This spaCy project trains an NER model and a custom Text Classification model with Clause Segmentation and Blinding capabilities to analyze supplement reviews and their potential effects on health.", + "github": "explosion/healthsea", + "thumb": "https://github.com/explosion/healthsea/blob/main/img/Jellyfish.png", + "category": ["pipeline", "research"], + "code_example": [ + "import spacy", + "", + "nlp = spacy.load(\"en_healthsea\")", + "doc = nlp(\"This is great for joint pain.\")", + "", + "# Clause Segmentation & Blinding", + "print(doc._.clauses)", + "", + "> {", + "> \"split_indices\": [0, 7],", + "> \"has_ent\": true,", + "> \"ent_indices\": [4, 6],", + "> \"blinder\": \"_CONDITION_\",", + "> \"ent_name\": \"joint pain\",", + "> \"cats\": {", + "> \"POSITIVE\": 0.9824668169021606,", + "> \"NEUTRAL\": 0.017364952713251114,", + "> \"NEGATIVE\": 0.00002889777533710003,", + "> \"ANAMNESIS\": 0.0001394189748680219", + "> \"prediction_text\": [\"This\", \"is\", \"great\", \"for\", \"_CONDITION_\", \"!\"]", + "> }", + "", + "# Aggregated results", + "> {", + "> \"joint_pain\": {", + "> \"effects\": [\"POSITIVE\"],", + "> \"effect\": \"POSITIVE\",", + "> \"label\": \"CONDITION\",", + "> \"text\": \"joint pain\"", + "> }", + "> }" + ], + "author": "Edward Schmuhl", + "author_links": { + "github": "thomashacker", + "twitter": "aestheticedwar1", + "website": "https://explosion.ai/" + } + }, + { + "id": "presidio", + "title": "Presidio", + "slogan": "Context aware, pluggable and customizable data protection and PII data anonymization", + "description": "Presidio *(Origin from Latin praesidium ‘protection, garrison’)* helps to ensure sensitive text is properly managed and governed. It provides fast ***analytics*** and ***anonymization*** for sensitive text such as credit card numbers, names, locations, social security numbers, bitcoin wallets, US phone numbers and financial data. 
Presidio analyzes the text using predefined or custom recognizers to identify entities, patterns, formats, and checksums with relevant context.", + "url": "https://aka.ms/presidio", + "image": "https://raw.githubusercontent.com/microsoft/presidio/master/docs/assets/before-after.png", + "github": "microsoft/presidio", + "category": ["standalone"], + "thumb": "https://avatars0.githubusercontent.com/u/6154722", + "author": "Microsoft", + "author_links": { + "github": "microsoft" + } + }, + { + "id": "presidio-research", + "title": "Presidio Research", + "slogan": "Toolbox for developing and evaluating PII detectors, NER models for PII and generating fake PII data", + "description": "This package features data-science related tasks for developing new recognizers for Microsoft Presidio. It is used for the evaluation of the entire system, as well as for evaluating specific PII recognizers or PII detection models. Anyone interested in evaluating an existing Microsoft Presidio instance, a specific PII recognizer or to develop new models or logic for detecting PII could leverage the preexisting work in this package. Additionally, anyone interested in generating new data based on previous datasets (e.g. to increase the coverage of entity values) for Named Entity Recognition models could leverage the data generator contained in this package.", + "url": "https://aka.ms/presidio-research", + "github": "microsoft/presidio-research", + "category": ["standalone"], + "thumb": "https://avatars0.githubusercontent.com/u/6154722", + "author": "Microsoft", + "author_links": { + "github": "microsoft" + } + }, + { + "id": "python-sentence-boundary-disambiguation", + "title": "pySBD - python Sentence Boundary Disambiguation", + "slogan": "Rule-based sentence boundary detection that works out-of-the-box", + "github": "nipunsadvilkar/pySBD", + "description": "pySBD is a 'real-world' sentence segmenter which extracts reasonable sentences when the format and domain of the input text are unknown. It is a rules-based algorithm based on [The Golden Rules](https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt) - a set of tests to check the accuracy of a segmenter with regard to edge case scenarios, developed by the [TM-Town](https://www.tm-town.com/) dev team. pySBD is a Python port of the Ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter).", + "pip": "pysbd", + "category": ["scientific"], + "tags": ["sentence segmentation"], + "code_example": [ + "import spacy", + "from pysbd.utils import PySBDFactory", + "", + "nlp = spacy.blank('en')", + "nlp.add_pipe(PySBDFactory(nlp))", + "", + "doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')", + "print(list(doc.sents))", + "# [My name is Jonas E. Smith., Please turn to p. 55.]" + ], + "author": "Nipun Sadvilkar", + "author_links": { + "twitter": "nipunsadvilkar", + "github": "nipunsadvilkar", + "website": "https://nipunsadvilkar.github.io" + } + }, + { + "id": "cookiecutter-spacy-fastapi", + "title": "cookiecutter-spacy-fastapi", + "slogan": "Docker-based cookiecutter for easy spaCy APIs using FastAPI", + "description": "Docker-based cookiecutter for easy spaCy APIs using FastAPI. The default endpoints expect batch requests with a list of Records in the Azure Search Cognitive Skill format. So out of the box, this cookiecutter can be set up as a Custom Cognitive Skill.
For more on Azure Search and Cognitive Skills [see this page](https://docs.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-interface).", + "url": "https://github.com/microsoft/cookiecutter-spacy-fastapi", + "image": "https://raw.githubusercontent.com/microsoft/cookiecutter-spacy-fastapi/master/images/cookiecutter-docs.png", + "github": "microsoft/cookiecutter-spacy-fastapi", + "category": ["apis"], + "thumb": "https://avatars0.githubusercontent.com/u/6154722", + "author": "Microsoft", + "author_links": { + "github": "microsoft" + } + }, + { + "id": "dframcy", + "title": "Dframcy", + "slogan": "Dataframe Integration with spaCy NLP", + "github": "yash1994/dframcy", + "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.", + "pip": "dframcy", + "category": ["pipeline", "training"], + "tags": ["pandas"], + "code_example": [ + "import spacy", + "from dframcy import DframCy", + "", + "nlp = spacy.load('en_core_web_sm')", + "dframcy = DframCy(nlp)", + "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')", + "annotation_dataframe = dframcy.to_dataframe(doc)" + ], + "author": "Yash Patadia", + "author_links": { + "twitter": "PatadiaYash", + "github": "yash1994" + } + }, + { + "id": "spacy-pytextrank", + "title": "PyTextRank", + "slogan": "Py impl of TextRank for lightweight phrase extraction", + "description": "An implementation of TextRank in Python for use in spaCy pipelines which provides fast, effective phrase extraction from texts, along with extractive summarization. The graph algorithm works independent of a specific natural language and does not require domain knowledge. See (Mihalcea 2004) https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf", + "github": "DerwenAI/pytextrank", + "pip": "pytextrank", + "code_example": [ + "import spacy", + "import pytextrank", + "", + "nlp = spacy.load('en_core_web_sm')", + "", + "tr = pytextrank.TextRank()", + "nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)", + "", + "text = 'Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered.'", + "doc = nlp(text)", + "", + "# examine the top-ranked phrases in the document", + "for p in doc._.phrases:", + " print('{:.4f} {:5d} {}'.format(p.rank, p.count, p.text))", + " print(p.chunks)" + ], + "code_language": "python", + "url": "https://github.com/DerwenAI/pytextrank/wiki", + "thumb": "https://memegenerator.net/img/instances/66942896.jpg", + "image": "https://memegenerator.net/img/instances/66942896.jpg", + "author": "Paco Nathan", + "author_links": { + "twitter": "pacoid", + "github": "ceteri", + "website": "https://derwen.ai/paco" + }, + "category": ["pipeline"], + "tags": ["phrase extraction", "ner", "summarization", "graph algorithms", "textrank"] + }, + { + "id": "spacy_syllables", + "title": "Spacy Syllables", + "slogan": "Multilingual syllable annotations", + "description": "Spacy Syllables is a pipeline component that adds multilingual syllable annotations to Tokens. 
It uses Pyphen under the hood and has support for a long list of languages.", + "github": "sloev/spacy-syllables", + "pip": "spacy_syllables", + "code_example": [ + "import spacy", + "from spacy_syllables import SpacySyllables", + "", + "nlp = spacy.load('en_core_web_sm')", + "syllables = SpacySyllables(nlp)", + "nlp.add_pipe(syllables, after='tagger')", + "", + "doc = nlp('terribly long')", + "", + "data = [", + " (token.text, token._.syllables, token._.syllables_count)", + " for token in doc", + "]", + "", + "assert data == [", + " ('terribly', ['ter', 'ri', 'bly'], 3),", + " ('long', ['long'], 1)", + "]" + ], + "thumb": "https://raw.githubusercontent.com/sloev/spacy-syllables/master/logo.png", + "author": "Johannes Valbjørn", + "author_links": { + "github": "sloev" + }, + "category": ["pipeline"], + "tags": ["syllables", "multilingual"] + }, + { + "id": "gobbli", + "title": "gobbli", + "slogan": "Deep learning for text classification doesn't have to be scary", + "description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models. spaCy's base text classification models, as well as models integrated from `spacy-transformers`, are available in the collection of classification models. In addition, spaCy is used for data augmentation and document embeddings.", + "url": "https://github.com/rtiinternational/gobbli", + "github": "rtiinternational/gobbli", + "pip": "gobbli", + "thumb": "https://i.postimg.cc/NGpzhrdr/gobbli-lg.png", + "code_example": [ + "from gobbli.io import PredictInput, TrainInput", + "from gobbli.model.bert import BERT", + "", + "train_input = TrainInput(", + " X_train=['This is a training document.', 'This is another training document.'],", + " y_train=['0', '1'],", + " X_valid=['This is a validation sentence.', 'This is another validation sentence.'],", + " y_valid=['1', '0'],", + ")", + "", + "clf = BERT()", + "", + "# Set up classifier resources -- Docker image, etc.", + "clf.build()", + "", + "# Train model", + "train_output = clf.train(train_input)", + "", + "predict_input = PredictInput(", + " X=['Which class is this document?'],", + " labels=train_output.labels,", + " checkpoint=train_output.checkpoint,", + ")", + "", + "predict_output = clf.predict(predict_input)" + ], + "category": ["standalone"] + }, + { + "id": "spacy_fastlang", + "title": "Spacy FastLang", + "slogan": "Language detection done fast", + "description": "Fast language detection using FastText and Spacy.", + "github": "thomasthiebaud/spacy-fastlang", + "pip": "spacy_fastlang", + "code_example": [ + "import spacy_fastlang", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"language_detector\")", + "doc = nlp('Life is like a box of chocolates. 
You never know what you are gonna get.')", + "", + "assert doc._.language == 'en'", + "assert doc._.language_score >= 0.8" + ], + "author": "Thomas Thiebaud", + "author_links": { + "github": "thomasthiebaud" + }, + "category": ["pipeline"] + }, + { + "id": "mlflow", + "title": "MLflow", + "slogan": "An open source platform for the machine learning lifecycle", + "description": "MLflow is an open source platform to manage the ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry. MLflow currently offers four components: Tracking, Projects, Models and Registry.", + "github": "mlflow/mlflow", + "pip": "mlflow", + "thumb": "https://www.mlflow.org/docs/latest/_static/MLflow-logo-final-black.png", + "image": "", + "url": "https://mlflow.org/", + "author": "Databricks", + "author_links": { + "github": "databricks", + "twitter": "databricks", + "website": "https://databricks.com/" + }, + "category": ["standalone", "apis"], + "code_example": [ + "import mlflow", + "import mlflow.spacy", + "import spacy", + "", + "# MLflow Tracking", + "nlp = spacy.load('my_best_model_path/output/model-best')", + "with mlflow.start_run(run_name='Spacy'):", + " mlflow.set_tag('model_flavor', 'spacy')", + " mlflow.spacy.log_model(spacy_model=nlp, artifact_path='model')", + " mlflow.log_metric('accuracy', 0.72)", + " my_run_id = mlflow.active_run().info.run_id", + "", + "", + "# MLflow Models", + "model_uri = f'runs:/{my_run_id}/model'", + "nlp2 = mlflow.spacy.load_model(model_uri=model_uri)" + ] + }, + { + "id": "pyate", + "title": "PyATE", + "slogan": "Python Automated Term Extraction", + "description": "PyATE is a term extraction library written in Python using spaCy POS tagging with Basic, Combo Basic, C-Value, TermExtractor, and Weirdness.", + "github": "kevinlu1248/pyate", + "pip": "pyate", + "code_example": [ + "import spacy", + "import pyate", + "", + "nlp = spacy.load('en_core_web_sm')", + "nlp.add_pipe(\"combo_basic\") # or any of `basic`, `weirdness`, `term_extractor` or `cvalue`", + "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", + "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer.
This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", + "", + "doc = nlp(string)", + "print(doc._.combo_basic.sort_values(ascending=False).head(5))", + "\"\"\"\"\"\"", + "dysfunctional tumor 1.443147", + "tumor suppressors 1.443147", + "genetic changes 1.386294", + "cancer cells 1.386294", + "dysfunctional tumor suppressors 1.298612", + "\"\"\"\"\"\"" + ], + "code_language": "python", + "url": "https://github.com/kevinlu1248/pyate", + "author": "Kevin Lu", + "author_links": { + "twitter": "kevinlu1248", + "github": "kevinlu1248", + "website": "https://github.com/kevinlu1248/pyate" + }, + "category": ["pipeline", "research"], + "tags": ["term_extraction"] + }, + { + "id": "contextualSpellCheck", + "title": "Contextual Spell Check", + "slogan": "Contextual spell correction using BERT (bidirectional representations)", + "description": "This package currently focuses on Out of Vocabulary (OOV) word or non-word error (NWE) correction using BERT model. The idea of using BERT was to use the context when correcting NWE.", + "github": "R1j1t/contextualSpellCheck", + "pip": "contextualSpellCheck", + "code_example": [ + "import spacy", + "import contextualSpellCheck", + "", + "nlp = spacy.load('en_core_web_sm')", + "contextualSpellCheck.add_to_pipe(nlp)", + "doc = nlp('Income was $9.4 milion compared to the prior year of $2.7 milion.')", + "", + "print(doc._.performed_spellCheck) #Should be True", + "print(doc._.outcome_spellCheck) #Income was $9.4 million compared to the prior year of $2.7 million." + ], + "code_language": "python", + "url": "https://github.com/R1j1t/contextualSpellCheck", + "thumb": "https://user-images.githubusercontent.com/22280243/82760949-98e68480-9e14-11ea-952e-4738620fd9e3.png", + "image": "https://user-images.githubusercontent.com/22280243/82138959-2852cd00-9842-11ea-918a-49b2a7873ef6.png", + "author": "Rajat Goel", + "author_links": { + "github": "r1j1t", + "website": "https://github.com/R1j1t" + }, + "category": ["pipeline", "conversational", "research"], + "tags": ["spell check", "correction", "preprocessing", "translation", "correction"] + }, + { + "id": "texthero", + "title": "Texthero", + "slogan": "Text preprocessing, representation and visualization from zero to hero.", + "description": "Texthero is a python package to work with text data efficiently. 
It empowers NLP developers with a tool to quickly understand any text-based dataset and it provides a solid pipeline to clean and represent text data, from zero to hero.", + "github": "jbesomi/texthero", + "pip": "texthero", + "code_example": [ + "import texthero as hero", + "import pandas as pd", + "", + "df = pd.read_csv('https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv')", + "df['named_entities'] = hero.named_entities(df['text'])", + "df.head()" + ], + "code_language": "python", + "url": "https://texthero.org", + "thumb": "https://texthero.org/img/T.png", + "image": "https://texthero.org/docs/assets/texthero.png", + "author": "Jonathan Besomi", + "author_links": { + "github": "jbesomi", + "website": "https://besomi.ai" + }, + "category": ["standalone"] + }, + { + "id": "cov-bsv", + "title": "VA COVID-19 NLP BSV", + "slogan": "spaCy pipeline for COVID-19 surveillance.", + "github": "abchapman93/VA_COVID-19_NLP_BSV", + "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.", + "pip": "cov-bsv", + "code_example": [ + "import cov_bsv", + "", + "nlp = cov_bsv.load()", + "doc = nlp('Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected')", + "", + "print(doc.ents)", + "print(doc._.cov_classification)", + "cov_bsv.visualize_doc(doc)" + ], + "category": ["pipeline", "standalone", "biomedical", "scientific"], + "tags": ["clinical", "epidemiology", "covid-19", "surveillance"], + "author": "Alec Chapman", + "author_links": { + "github": "abchapman93" + } + }, + { + "id": "medspacy", + "title": "medspaCy", + "thumb": "https://raw.githubusercontent.com/medspacy/medspacy/master/images/medspacy_logo.png", + "slogan": "A toolkit for clinical NLP with spaCy.", + "github": "medspacy/medspacy", + "description": "A toolkit for clinical NLP with spaCy. Features include sentence splitting, section detection, and asserting negation, family history, and uncertainty.", + "pip": "medspacy", + "code_example": [ + "import medspacy", + "from medspacy.ner import TargetRule", + "", + "nlp = medspacy.load()", + "print(nlp.pipe_names)", + "", + "nlp.get_pipe('target_matcher').add([TargetRule('stroke', 'CONDITION'), TargetRule('diabetes', 'CONDITION'), TargetRule('pna', 'CONDITION')])", + "doc = nlp('Patient has hx of stroke. Mother diagnosed with diabetes. No evidence of pna.')", + "", + "for ent in doc.ents:", + " print(ent, ent._.is_negated, ent._.is_family, ent._.is_historical)", + "medspacy.visualization.visualize_ent(doc)" + ], + "category": ["biomedical", "scientific", "research"], + "tags": ["clinical"], + "author": "medspacy", + "author_links": { + "github": "medspacy" + } + }, + { + "id": "rita-dsl", + "title": "RITA DSL", + "slogan": "Domain Specific Language for creating language rules", + "github": "zaibacu/rita-dsl", + "description": "A Domain Specific Language (DSL) for building language patterns. 
These can be later compiled into spaCy patterns, pure regex, or any other format", + "pip": "rita-dsl", + "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png", + "code_language": "python", + "code_example": [ + "import spacy", + "from rita.shortcuts import setup_spacy", + "", + "rules = \"\"\"", + "cuts = {\"fitted\", \"wide-cut\"}", + "lengths = {\"short\", \"long\", \"calf-length\", \"knee-length\"}", + "fabric_types = {\"soft\", \"airy\", \"crinkled\"}", + "fabrics = {\"velour\", \"chiffon\", \"knit\", \"woven\", \"stretch\"}", + "", + "{IN_LIST(cuts)?, IN_LIST(lengths), WORD(\"dress\")}->MARK(\"DRESS_TYPE\")", + "{IN_LIST(lengths), IN_LIST(cuts), WORD(\"dress\")}->MARK(\"DRESS_TYPE\")", + "{IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK(\"DRESS_FABRIC\")", + "\"\"\"", + "", + "nlp = spacy.load(\"en\")", + "setup_spacy(nlp, rules_string=rules)", + "r = nlp(\"She was wearing a short wide-cut dress\")", + "print(list([{\"label\": e.label_, \"text\": e.text} for e in r.ents]))" + ], + "category": ["standalone"], + "tags": ["dsl", "language-patterns", "language-rules", "nlp"], + "author": "Šarūnas Navickas", + "author_links": { + "github": "zaibacu" + } + }, + { + "id": "PatternOmatic", + "title": "PatternOmatic", + "slogan": "Finds linguistic patterns effortlessly", + "description": "Discover spaCy's linguistic patterns matching a given set of String samples to be used by the spaCy's Rule Based Matcher", + "github": "revuel/PatternOmatic", + "pip": "PatternOmatic", + "code_example": [ + "from PatternOmatic.api import find_patterns", + "", + "samples = ['I am a cat!', 'You are a dog!', 'She is an owl!']", + "", + "patterns_found, _ = find_patterns(samples)", + "", + "print(f'Patterns found: {patterns_found}')" + ], + "code_language": "python", + "thumb": "https://svgshare.com/i/R3P.svg", + "image": "https://svgshare.com/i/R3P.svg", + "author": "Miguel Revuelta Espinosa", + "author_links": { + "github": "revuel" + }, + "category": ["scientific", "research", "standalone"], + "tags": ["Evolutionary Computation", "Grammatical Evolution"] + }, + { + "id": "SpacyDotNet", + "title": "spaCy .NET Wrapper", + "slogan": "SpacyDotNet is a .NET Core compatible wrapper for spaCy, based on Python.NET", + "description": "This projects relies on [Python.NET](http://pythonnet.github.io/) to interop with spaCy. It's not meant to be a complete and exhaustive implementation of all spaCy features and [APIs](https://spacy.io/api). Although it should be enough for basic tasks, it's considered as a starting point if you need to build a complex project using spaCy in .NET Most of the basic features in _Spacy101_ are available. All `Container` classes are present (`Doc`, `Token`, `Span` and `Lexeme`) with their basic properties/methods running and also `Vocab` and `StringStore` in a limited form. Anyway, any developer should be ready to add the missing properties or classes in a very straightforward manner.", + "github": "AMArostegui/SpacyDotNet", + "thumb": "https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/master/cslogo.png", + "code_example": [ + "var spacy = new Spacy();", + "", + "var nlp = spacy.Load(\"en_core_web_sm\");", + "var doc = nlp.GetDocument(\"Apple is looking at buying U.K. 
startup for $1 billion\");", + "", + "foreach (Token token in doc.Tokens)", + " Console.WriteLine($\"{token.Text} {token.Lemma} {token.PoS} {token.Tag} {token.Dep} {token.Shape} {token.IsAlpha} {token.IsStop}\");", + "", + "Console.WriteLine(\"\");", + "foreach (Span ent in doc.Ents)", + " Console.WriteLine($\"{ent.Text} {ent.StartChar} {ent.EndChar} {ent.Label}\");", + "", + "nlp = spacy.Load(\"en_core_web_md\");", + "var tokens = nlp.GetDocument(\"dog cat banana afskfsd\");", + "", + "Console.WriteLine(\"\");", + "foreach (Token token in tokens.Tokens)", + " Console.WriteLine($\"{token.Text} {token.HasVector} {token.VectorNorm}, {token.IsOov}\");", + "", + "tokens = nlp.GetDocument(\"dog cat banana\");", + "Console.WriteLine(\"\");", + "foreach (Token token1 in tokens.Tokens)", + "{", + " foreach (Token token2 in tokens.Tokens)", + " Console.WriteLine($\"{token1.Text} {token2.Text} {token1.Similarity(token2) }\");", + "}", + "", + "doc = nlp.GetDocument(\"I love coffee\");", + "Console.WriteLine(\"\");", + "Console.WriteLine(doc.Vocab.Strings[\"coffee\"]);", + "Console.WriteLine(doc.Vocab.Strings[3197928453018144401]);", + "", + "Console.WriteLine(\"\");", + "foreach (Token word in doc.Tokens)", + "{", + " var lexeme = doc.Vocab[word.Text];", + " Console.WriteLine($@\"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}\");", + "}" + ], + "code_language": "csharp", + "author": "Antonio Miras", + "author_links": { + "github": "AMArostegui" + }, + "category": ["nonpython"] + }, + { + "id": "ruts", + "title": "ruTS", + "slogan": "A library for statistics extraction from texts in Russian", + "description": "The library allows extracting the following statistics from a text: basic statistics, readability metrics, lexical diversity metrics, morphological statistics", + "github": "SergeyShk/ruTS", + "pip": "ruts", + "code_example": [ + "import spacy", + "import ruts", + "", + "nlp = spacy.load('ru_core_news_sm')", + "nlp.add_pipe('basic', last=True)", + "doc = nlp('мама мыла раму')", + "doc._.basic.get_stats()" + ], + "code_language": "python", + "thumb": "https://habrastorage.org/webt/6z/le/fz/6zlefzjavzoqw_wymz7v3pwgfp4.png", + "image": "https://clipartart.com/images/free-tree-roots-clipart-black-and-white-2.png", + "author": "Sergey Shkarin", + "author_links": { + "twitter": "shk_sergey", + "github": "SergeyShk" + }, + "category": ["pipeline", "standalone"], + "tags": ["Text Analytics", "Russian"] + }, + { + "id": "trunajod", + "title": "TRUNAJOD", + "slogan": "A text complexity library for text analysis built on spaCy", + "description": "With all the basic NLP capabilities provided by spaCy (dependency parsing, POS tagging, tokenizing), `TRUNAJOD` focuses on extracting measurements from texts that might be interesting for different applications and use cases.", + "github": "dpalmasan/TRUNAJOD2.0", + "pip": "trunajod", + "code_example": [ + "import spacy", + "from TRUNAJOD.entity_grid import EntityGrid", + "", + "nlp = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])", + "example_text = (", + " 'El espectáculo del cielo nocturno cautiva la mirada y suscita preguntas'", + " 'sobre el universo, su origen y su funcionamiento. No es sorprendente que '", + " 'todas las civilizaciones y culturas hayan formado sus propias '", + " 'cosmologías. 
Unas relatan, por ejemplo, que el universo ha'", + " 'sido siempre tal como es, con ciclos que inmutablemente se repiten; '", + " 'otras explican que este universo ha tenido un principio, '", + " 'que ha aparecido por obra creadora de una divinidad.'", + ")", + "doc = nlp(example_text)", + "egrid = EntityGrid(doc)", + "print(egrid.get_egrid())" + ], + "code_language": "python", + "thumb": "https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_thumb.png", + "image": "https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_logo.png", + "author": "Diego Palma", + "author_links": { + "github": "dpalmasan" + }, + "category": ["research", "standalone", "scientific"], + "tags": ["Text Analytics", "Coherence", "Cohesion"] + }, + { + "id": "lingfeat", + "title": "LingFeat", + "slogan": "A Linguistic Feature Extraction (Text Analysis) Tool for Readability Assessment and Text Simplification", + "description": "LingFeat is a feature extraction library which currently extracts 255 linguistic features from English string input. Categories include syntax, semantics, discourse, and also traditional readability formulas. Published in EMNLP 2021.", + "github": "brucewlee/lingfeat", + "pip": "lingfeat", + "code_example": [ + "from lingfeat import extractor", + "", + "", + "text = 'TAEAN, South Chungcheong Province -- Just before sunup, Lee Young-ho, a seasoned fisherman with over 30 years of experience, silently waits for boats carrying blue crabs as the season for the seafood reaches its height. Soon afterward, small and big boats sail into Sinjin Port in Taean County, South Chungcheong Province, the second-largest source of blue crab after Incheon, accounting for 29 percent of total production of the country. A crane lifts 28 boxes filled with blue crabs weighing 40 kilograms each from the boat, worth about 10 million won ($8,500). “It has been a productive fall season for crabbing here. The water temperature is a very important factor affecting crab production. They hate cold water,” Lee said. The temperature of the sea off Taean appeared to have stayed at the level where crabs become active. If the sea temperature suddenly drops, crabs go into their winter dormancy mode, burrowing into the mud and sleeping through the cold months.'", + "", + "", + "#Pass text", + "LingFeat = extractor.pass_text(text)", + "", + "", + "#Preprocess text", + "LingFeat.preprocess()", + "", + "", + "#Extract features", + "#each method returns a dictionary of the corresponding features", + "#Advanced Semantic (AdSem) Features", + "WoKF = LingFeat.WoKF_() #Wikipedia Knowledge Features", + "WBKF = LingFeat.WBKF_() #WeeBit Corpus Knowledge Features", + "OSKF = LingFeat.OSKF_() #OneStopEng Corpus Knowledge Features", + "", + "#Discourse (Disco) Features", + "EnDF = LingFeat.EnDF_() #Entity Density Features", + "EnGF = LingFeat.EnGF_() #Entity Grid Features", + "", + "#Syntactic (Synta) Features", + "PhrF = LingFeat.PhrF_() #Noun/Verb/Adj/Adv/... Phrasal Features", + "TrSF = LingFeat.TrSF_() #(Parse) Tree Structural Features", + "POSF = LingFeat.POSF_() #Noun/Verb/Adj/Adv/... 
Part-of-Speech Features", + "", + "#Lexico Semantic (LxSem) Features", + "TTRF = LingFeat.TTRF_() #Type Token Ratio Features", + "VarF = LingFeat.VarF_() #Noun/Verb/Adj/Adv Variation Features", + "PsyF = LingFeat.PsyF_() #Psycholinguistic Difficulty of Words (AoA Kuperman)", + "WoLF = LingFeat.WorF_() #Word Familiarity from Frequency Count (SubtlexUS)", + "", + "Shallow Traditional (ShTra) Features", + "ShaF = LingFeat.ShaF_() #Shallow Features (e.g. avg number of tokens)", + "TraF = LingFeat.TraF_() #Traditional Formulas" + ], + "code_language": "python", + "thumb": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo2.png", + "image": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo.png", + "author": "Bruce W. Lee (이웅성)", + "author_links": { + "github": "brucewlee", + "website": "https://brucewlee.github.io/" + }, + "category": ["research", "scientific"], + "tags": ["Readability", "Simplification", "Feature Extraction", "Syntax", "Discourse", "Semantics", "Lexical"] + }, + { + "id": "hmrb", + "title": "Hammurabi", + "slogan": "Python Rule Processing Engine 🏺", + "description": "Hammurabi works as a rule engine to parse input using a defined set of rules. It uses a simple and readable syntax to define complex rules to handle phrase matching. The syntax supports nested logical statements, regular expressions, reusable or side-loaded variables and match triggered callback functions to modularize your rules. The latest version works with both spaCy 2.X and 3.X. For more information check the documentation on [ReadTheDocs](https://hmrb.readthedocs.io/en/latest/).", + "github": "babylonhealth/hmrb", + "pip": "hmrb", + "code_example": [ + "import spacy", + "from hmrb.core import SpacyCore", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "sentences = \"I love gorillas. Peter loves gorillas. 
Jane loves Tarzan.\"", + "", + "def conj_be(subj: str) -> str:", + " if subj == \"I\":", + " return \"am\"", + " elif subj == \"you\":", + " return \"are\"", + " else:", + " return \"is\"", + "", + "@spacy.registry.callbacks(\"gorilla_callback\")", + "def gorilla_clb(seq: list, span: slice, data: dict) -> None:", + " subj = seq[span.start].text", + " be = conj_be(subj)", + " print(f\"{subj} {be} a gorilla person.\")", + "@spacy.registry.callbacks(\"lover_callback\")", + "def lover_clb(seq: list, span: slice, data: dict) -> None:", + " print(f\"{seq[span][-1].text} is a love interest of {seq[span.start].text}.\")", + "", + "grammar = \"\"\"", + " Law:", + " - callback: \"loves_gorilla\"", + " (", + " ((pos: \"PROPN\") or (pos: \"PRON\"))", + " (lemma: \"love\")", + " (lemma: \"gorilla\")", + " )", + " Law:", + " - callback: \"loves_someone\"", + " (", + " (pos: \"PROPN\")", + " (lower: \"loves\")", + " (pos: \"PROPN\")", + " )", + "\"\"\"", + "", + "@spacy.registry.augmenters(\"jsonify_span\")", + "def jsonify_span(span):", + " return [{\"lemma\": token.lemma_, \"pos\": token.pos_, \"lower\": token.lower_} for token in span]", + "", + "conf = {", + " \"rules\": grammar,", + " \"callbacks\": {", + " \"loves_gorilla\": \"callbacks.gorilla_callback\",", + " \"loves_someone\": \"callbacks.lover_callback\",", + " },", + " \"map_doc\": \"augmenters.jsonify_span\",", + " \"sort_length\": True,", + "}", + "", + "nlp.add_pipe(\"hmrb\", config=conf)", + "nlp(sentences)" + ], + "code_language": "python", + "thumb": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", + "image": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", + "author": "Kristian Boda", + "author_links": { + "github": "bodak", + "twitter": "bodak", + "website": "https://github.com/babylonhealth/" + }, + "category": ["pipeline", "standalone", "scientific", "biomedical"], + "tags": ["babylonhealth", "rule-engine", "matcher"] + }, + { + "id": "forte", + "title": "Forte", + "slogan": "Forte is a toolkit for building Natural Language Processing pipelines, featuring cross-task interaction, adaptable data-model interfaces and composable pipelines.", + "description": "Forte provides a platform to assemble state-of-the-art NLP and ML technologies in a highly-composable fashion, including a wide spectrum of tasks ranging from Information Retrieval, Natural Language Understanding to Natural Language Generation.", + "github": "asyml/forte", + "pip": "forte.spacy stave torch", + "code_example": [ + "from fortex.spacy import SpacyProcessor", + "from forte.processors.stave import StaveProcessor", + "from forte import Pipeline", + "from forte.data.readers import StringReader", + "", + "pipeline = Pipeline()", + "pipeline.set_reader(StringReader())", + "pipeline.add(SpacyProcessor())", + "pipeline.add(StaveProcessor())", + "pipeline.run('Running SpaCy with Forte!')" + ], + "code_language": "python", + "url": "https://medium.com/casl-project/forte-building-modular-and-re-purposable-nlp-pipelines-cf5b5c5abbe9", + "thumb": "https://raw.githubusercontent.com/asyml/forte/master/docs/_static/img/forte_graphic.png", + "image": "https://raw.githubusercontent.com/asyml/forte/master/docs/_static/img/logo_h.png", + "author": "Petuum", + "author_links": { + "twitter": "PetuumInc", + "github": "asyml", + "website": "https://petuum.com" + }, + "category": ["pipeline", "standalone"], + "tags": ["pipeline"] + }, + { + "id": "spacy-api-docker-v3", + "slogan": 
"spaCy v3 REST API, wrapped in a Docker container", + "github": "bbieniek/spacy-api-docker", + "url": "https://hub.docker.com/r/bbieniek/spacyapi/", + "thumb": "https://i.imgur.com/NRnDKyj.jpg", + "code_example": [ + "version: '3'", + "", + "services:", + " spacyapi:", + " image: bbieniek/spacyapi:en_v3", + " ports:", + " - \"127.0.0.1:8080:80\"", + " restart: always" + ], + "code_language": "docker", + "author": "Baltazar Bieniek", + "author_links": { + "github": "bbieniek" + }, + "category": ["apis"] + }, + { + "id": "phruzz_matcher", + "title": "phruzz-matcher", + "slogan": "Phrase matcher using RapidFuzz", + "description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a Spacy doc and a list of phrases.", + "github": "mjvallone/phruzz-matcher", + "pip": "phruzz_matcher", + "code_example": [ + "import spacy", + "from spacy.language import Language", + "from phruzz_matcher.phrase_matcher import PhruzzMatcher", + "", + "famous_people = [", + " \"Brad Pitt\",", + " \"Demi Moore\",", + " \"Bruce Willis\",", + " \"Jim Carrey\",", + "]", + "", + "@Language.factory(\"phrase_matcher\")", + "def phrase_matcher(nlp: Language, name: str):", + " return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)", + "", + "nlp = spacy.blank('es')", + "nlp.add_pipe(\"phrase_matcher\")", + "", + "doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")", + "print(f\"doc.ents: {doc.ents}\")", + "", + "#OUTPUT", + "#doc.ents: (brad pit, Demi Moore)" + ], + "thumb": "https://avatars.githubusercontent.com/u/961296?v=4", + "image": "", + "code_language": "python", + "author": "Martin Vallone", + "author_links": { + "github": "mjvallone", + "twitter": "vallotin", + "website": "https://fiqus.coop/" + }, + "category": ["pipeline", "research", "standalone"], + "tags": ["spacy", "python", "nlp", "ner"] + }, + { + "id": "WordDumb", + "title": "WordDumb", + "slogan": "A calibre plugin that generates Word Wise and X-Ray files.", + "description": "A calibre plugin that generates Word Wise and X-Ray files then sends them to Kindle. Supports KFX, AZW3 and MOBI eBooks. 
X-Ray supports 18 languages.", + "github": "xxyzz/WordDumb", + "code_language": "python", + "thumb": "https://raw.githubusercontent.com/xxyzz/WordDumb/master/starfish.svg", + "image": "https://user-images.githubusercontent.com/21101839/130245435-b874f19a-7785-4093-9975-81596efc42bb.png", + "author": "xxyzz", + "author_links": { + "github": "xxyzz" + }, + "category": ["standalone"] + }, + { + "id": "eng_spacysentiment", + "title": "eng_spacysentiment", + "slogan": "Simple sentiment analysis using spaCy pipelines", + "description": "Sentiment analysis for simple english sentences using pre-trained spaCy pipelines", + "github": "vishnunkumar/spacysentiment", + "pip": "eng-spacysentiment", + "code_example": [ + "import eng_spacysentiment", + "nlp = eng_spacysentiment.load()", + "text = \"Welcome to Arsenals official YouTube channel Watch as we take you closer and show you the personality of the club\"", + "doc = nlp(text)", + "print(doc.cats)", + "# {'positive': 0.29878824949264526, 'negative': 0.7012117505073547}" + ], + "thumb": "", + "image": "", + "code_language": "python", + "author": "Vishnu Nandakumar", + "author_links": { + "github": "Vishnunkumar", + "twitter": "vishnun_uchiha" + }, + "category": ["pipeline"], + "tags": ["pipeline", "nlp", "sentiment"] + } + ], + + "categories": [ + { + "label": "Projects", + "items": [ + { + "id": "pipeline", + "title": "Pipeline", + "description": "Custom pipeline components and extensions" + }, + { + "id": "training", + "title": "Training", + "description": "Helpers and toolkits for training spaCy models" + }, + { + "id": "conversational", + "title": "Conversational", + "description": "Frameworks and utilities for working with conversational text, e.g. for chat bots" + }, + { + "id": "research", + "title": "Research", + "description": "Frameworks and utilities for developing better NLP models, especially using neural networks" + }, + { + "id": "scientific", + "title": "Scientific", + "description": "Frameworks and utilities for scientific text processing" + }, + { + "id": "biomedical", + "title": "Biomedical", + "description": "Frameworks and utilities for processing biomedical text" + }, + { + "id": "visualizers", + "title": "Visualizers", + "description": "Demos and tools to visualize NLP annotations or systems" + }, + { + "id": "apis", + "title": "Containers & APIs", + "description": "Infrastructure tools for managing or deploying spaCy" + }, + { + "id": "nonpython", + "title": "Non-Python", + "description": "Wrappers, bindings and implementations in other programming languages" + }, + { + "id": "standalone", + "title": "Standalone", + "description": "Self-contained libraries or tools that use spaCy under the hood" + }, + { + "id": "models", + "title": "Models", + "description": "Third-party pretrained models for different languages and domains" + } + ] + }, + { + "label": "Education", + "items": [ + { + "id": "books", + "title": "Books", + "description": "Books about or featuring spaCy" + }, + { + "id": "courses", + "title": "Courses", + "description": "Online courses and interactive tutorials" + }, + { + "id": "videos", + "title": "Videos", + "description": "Talks and tutorials in video format" + }, + { + "id": "podcasts", + "title": "Podcasts", + "description": "Episodes about spaCy or interviews with the spaCy team" + } + ] + } + ] +} From 9c00b287c12d1b63f3bdbbe15d6071bafd96b721 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 12 Sep 2022 10:44:57 +0200 Subject: [PATCH 17/35] Remove beta references. Delete universe.json. 
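The scorer change in this patch drops the configurable `beta` from `PRFScore`, so `fscore` now always returns plain F1 (the `beta == 1` special case of F-beta). For reference, a minimal standalone sketch of the before/after formulas as they appear in the diff below; the helper names `fbeta` and `f1` are illustrative only and not part of spaCy:

    def fbeta(p: float, r: float, beta: float = 1.0) -> float:
        # General F-beta score, as PRFScore.fscore computed it before this patch.
        return (1 + beta**2) * ((p * r) / ((beta**2 * p) + r + 1e-100))

    def f1(p: float, r: float) -> float:
        # Plain F1, the fixed beta == 1 case PRFScore.fscore returns after this patch.
        return 2 * ((p * r) / (p + r + 1e-100))

    # The two agree when beta == 1, so the removal is a no-op for callers
    # that never passed a custom beta.
    assert abs(fbeta(0.8, 0.6) - f1(0.8, 0.6)) < 1e-12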
--- spacy/cli/find_threshold.py | 8 +- spacy/scorer.py | 9 +- spacy/tests/universe/universe.json | 3831 ---------------------------- 3 files changed, 3 insertions(+), 3845 deletions(-) delete mode 100644 spacy/tests/universe/universe.json diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 0b8e6fbdbfc..6d89355124f 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -15,7 +15,6 @@ _DEFAULTS = { "average": "micro", "n_trials": 10, - "beta": 1, "use_gpu": -1, "gold_preproc": False, } @@ -33,7 +32,6 @@ def find_threshold_cli( threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"), scores_key: str = Arg(..., help="Name of score to metric to optimize"), n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"), - beta: float = Opt(_DEFAULTS["beta"], "--beta", help="Beta for F1 calculation. Ignored if different metric is used"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), @@ -48,7 +46,6 @@ def find_threshold_cli( threshold_key (str): Key of threshold attribute in component's configuration. scores_key (str): Name of score to metric to optimize. n_trials (int): Number of trials to determine optimal thresholds - beta (float): Beta for F-score calculation. code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported. use_gpu (int): GPU ID or -1 for CPU. gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the @@ -66,7 +63,6 @@ def find_threshold_cli( threshold_key=threshold_key, scores_key=scores_key, n_trials=n_trials, - beta=beta, use_gpu=use_gpu, gold_preproc=gold_preproc, silent=False, @@ -81,7 +77,6 @@ def find_threshold( scores_key: str, *, n_trials: int = _DEFAULTS["n_trials"], # type: ignore - beta: float = _DEFAULTS["beta"], # type: ignore use_gpu: int = _DEFAULTS["use_gpu"], # type: ignore gold_preproc: bool = _DEFAULTS["gold_preproc"], # type: ignore silent: bool = True, @@ -94,7 +89,6 @@ def find_threshold( threshold_key (str): Key of threshold attribute in component's configuration. scores_key (str): Name of score to metric to optimize. n_trials (int): Number of trials to determine optimal thresholds. - beta (float): Beta for F-score calculation. use_gpu (int): GPU ID or -1 for CPU. gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due @@ -121,7 +115,7 @@ def find_threshold( if not silent: wasabi.msg.info( title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} " - f"trials and beta = {beta}." + f"trials." ) # Load evaluation corpus. 
diff --git a/spacy/scorer.py b/spacy/scorer.py index 95258af9e17..74402b46615 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -21,13 +21,10 @@ class PRFScore: """A precision / recall / F score.""" - def __init__( - self, *, tp: int = 0, fp: int = 0, fn: int = 0, beta: float = 1 - ) -> None: + def __init__(self, *, tp: int = 0, fp: int = 0, fn: int = 0) -> None: self.tp = tp self.fp = fp self.fn = fn - self.beta = beta def __len__(self) -> int: return self.tp + self.fp + self.fn @@ -39,12 +36,10 @@ def __iadd__(self, other): return self def __add__(self, other): - assert self.beta == other.beta return PRFScore( tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn, - beta=self.beta, ) def score_set(self, cand: set, gold: set) -> None: @@ -64,7 +59,7 @@ def recall(self) -> float: def fscore(self) -> float: p = self.precision r = self.recall - return (1 + self.beta**2) * ((p * r) / ((self.beta**2 * p) + r + 1e-100)) + return 2 * ((p * r) / (p + r + 1e-100)) def to_dict(self) -> Dict[str, float]: return {"p": self.precision, "r": self.recall, "f": self.fscore} diff --git a/spacy/tests/universe/universe.json b/spacy/tests/universe/universe.json deleted file mode 100644 index b1a61598ecf..00000000000 --- a/spacy/tests/universe/universe.json +++ /dev/null @@ -1,3831 +0,0 @@ -{ - "resources": [ - { - "id": "spacypdfreader", - "title": "spadypdfreader", - "category": ["pipeline"], - "tags": ["PDF"], - "slogan": "Easy PDF to text to spaCy text extraction in Python.", - "description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built in parsers or bring your own parser. `Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.", - "github": "SamEdwardes/spacypdfreader", - "pip": "spacypdfreader", - "url": "https://samedwardes.github.io/spacypdfreader/", - "code_language": "python", - "author": "Sam Edwardes", - "author_links": { - "twitter": "TheReaLSamlam", - "github": "SamEdwardes", - "website": "https://samedwardes.com" - }, - "code_example": [ - "import spacy", - "from spacypdfreader import pdf_reader", - "", - "nlp = spacy.load('en_core_web_sm')", - "doc = pdf_reader('tests/data/test_pdf_01.pdf', nlp)", - "", - "# Get the page number of any token.", - "print(doc[0]._.page_number) # 1", - "print(doc[-1]._.page_number) # 4", - "", - "# Get page meta data about the PDF document.", - "print(doc._.pdf_file_name) # 'tests/data/test_pdf_01.pdf'", - "print(doc._.page_range) # (1, 4)", - "print(doc._.first_page) # 1", - "print(doc._.last_page) # 4", - "", - "# Get all of the text from a specific PDF page.", - "print(doc._.page(4)) # 'able to display the destination page (unless...'" - ] - }, - { - "id": "nlpcloud", - "title": "NLPCloud.io", - "slogan": "Production-ready API for spaCy models in production", - "description": "A highly-available hosted API to easily deploy and use spaCy models in production. 
Supports NER, POS tagging, dependency parsing, and tokenization.", - "github": "nlpcloud", - "pip": "nlpcloud", - "code_example": [ - "import nlpcloud", - "", - "client = nlpcloud.Client('en_core_web_lg', '4eC39HqLyjWDarjtT1zdp7dc')", - "client.entities('John Doe is a Go Developer at Google')", - "# [{'end': 8, 'start': 0, 'text': 'John Doe', 'type': 'PERSON'}, {'end': 25, 'start': 13, 'text': 'Go Developer', 'type': 'POSITION'}, {'end': 35,'start': 30, 'text': 'Google', 'type': 'ORG'}]" - ], - "thumb": "https://avatars.githubusercontent.com/u/77671902", - "image": "https://nlpcloud.io/assets/images/logo.svg", - "code_language": "python", - "author": "NLPCloud.io", - "author_links": { - "github": "nlpcloud", - "twitter": "cloud_nlp", - "website": "https://nlpcloud.io" - }, - "category": ["apis", "nonpython", "standalone"], - "tags": ["api", "deploy", "production"] - }, - { - "id": "eMFDscore", - "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python", - "slogan": "Extended Moral Foundation Dictionary Scoring for Python", - "description": "eMFDscore is a library for the fast and flexible extraction of various moral information metrics from textual input data. eMFDscore is built on spaCy for faster execution and performs minimal preprocessing consisting of tokenization, syntactic dependency parsing, lower-casing, and stopword/punctuation/whitespace removal. eMFDscore lets users score documents with multiple Moral Foundations Dictionaries, provides various metrics for analyzing moral information, and extracts moral patient, agent, and attribute words related to entities.", - "github": "medianeuroscience/emfdscore", - "code_example": [ - "from emfdscore.scoring import score_docs", - "import pandas as pd", - "template_input = pd.read_csv('emfdscore/template_input.csv', header=None)", - "DICT_TYPE = 'emfd'", - "PROB_MAP = 'single'", - "SCORE_METHOD = 'bow'", - "OUT_METRICS = 'vice-virtue'", - "OUT_CSV_PATH = 'single-vv.csv'", - "df = score_docs(template_input,DICT_TYPE,PROB_MAP,SCORE_METHOD,OUT_METRICS,num_docs)" - ], - "code_language": "python", - "author": "Media Neuroscience Lab", - "author_links": { - "github": "medianeuroscience", - "twitter": "medianeuro" - }, - "category": ["research", "teaching"], - "tags": ["morality", "dictionary", "sentiment"] - }, - { - "id": "skweak", - "title": "skweak", - "slogan": "Weak supervision for NLP", - "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. 
Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.", - "github": "NorskRegnesentral/skweak", - "pip": "skweak", - "code_example": [ - "import spacy, re", - "from skweak import heuristics, gazetteers, aggregation, utils", - "", - "# LF 1: heuristic to detect occurrences of MONEY entities", - "def money_detector(doc):", - " for tok in doc[1:]:", - " if tok.text[0].isdigit() and tok.nbor(-1).is_currency:", - " yield tok.i-1, tok.i+1, 'MONEY'", - "lf1 = heuristics.FunctionAnnotator('money', money_detector)", - "", - "# LF 2: detection of years with a regex", - "lf2= heuristics.TokenConstraintAnnotator ('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')", - "", - "# LF 3: a gazetteer with a few names", - "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]", - "trie = gazetteers.Trie(NAMES)", - "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})", - "", - "# We create a corpus (here with a single text)", - "nlp = spacy.load('en_core_web_sm')", - "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')", - "", - "# apply the labelling functions", - "doc = lf3(lf2(lf1(doc)))", - "", - "# and aggregate them", - "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])", - "hmm.fit_and_aggregate([doc])", - "", - "# we can then visualise the final result (in Jupyter)", - "utils.display_entities(doc, 'hmm')" - ], - "code_language": "python", - "url": "https://github.com/NorskRegnesentral/skweak", - "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg", - "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg", - "author": "Pierre Lison", - "author_links": { - "twitter": "plison2", - "github": "plison", - "website": "https://www.nr.no/~plison" - }, - "category": ["pipeline", "standalone", "research", "training"], - "tags": [], - "spacy_version": 3 - }, - { - "id": "numerizer", - "title": "numerizer", - "slogan": "Convert natural language numerics into ints and floats.", - "description": "A SpaCy extension for Docs, Spans and Tokens that converts numerical words and quantitative named entities into numeric strings.", - "github": "jaidevd/numerizer", - "pip": "numerizer", - "code_example": [ - "from spacy import load", - "import numerizer", - "nlp = load('en_core_web_sm') # or any other model", - "doc = nlp('The Hogwarts Express is at platform nine and three quarters')", - "doc._.numerize()", - "# {nine and three quarters: '9.75'}" - ], - "author": "Jaidev Deshpande", - "author_links": { - "github": "jaidevd", - "twitter": "jaidevd" - }, - "category": ["standalone"] - }, - { - "id": "spikex", - "title": "SpikeX - SpaCy Pipes for Knowledge Extraction", - "slogan": "Use SpikeX to build knowledge extraction tools with almost-zero effort", - "description": "SpikeX is a collection of pipes ready to be plugged in a spaCy pipeline. 
It aims to help in building knowledge extraction tools with almost-zero effort.", - "github": "erre-quadro/spikex", - "pip": "spikex", - "code_example": [ - "from spacy import load as spacy_load", - "from spikex.wikigraph import load as wg_load", - "from spikex.pipes import WikiPageX", - "", - "# load a spacy model and get a doc", - "nlp = spacy_load('en_core_web_sm')", - "doc = nlp('An apple a day keeps the doctor away')", - "# load a WikiGraph", - "wg = wg_load('simplewiki_core')", - "# get a WikiPageX and extract all pages", - "wikipagex = WikiPageX(wg)", - "doc = wikipagex(doc)", - "# see all pages extracted from the doc", - "for span in doc._.wiki_spans:", - " print(span._.wiki_pages)" - ], - "category": ["pipeline", "standalone"], - "author": "Erre Quadro", - "author_links": { - "github": "erre-quadro", - "website": "https://www.errequadrosrl.com" - } - }, - { - "id": "spacy-dbpedia-spotlight", - "title": "DBpedia Spotlight for SpaCy", - "slogan": "Use DBpedia Spotlight to link entities inside SpaCy", - "description": "This library links SpaCy with [DBpedia Spotlight](https://www.dbpedia-spotlight.org/). You can easily get the DBpedia entities from your documents, using the public web service or by using your own instance of DBpedia Spotlight. The `doc.ents` are populated with the entities and all their details (URI, type, ...).", - "github": "MartinoMensio/spacy-dbpedia-spotlight", - "pip": "spacy-dbpedia-spotlight", - "code_example": [ - "import spacy_dbpedia_spotlight", - "# load your model as usual", - "nlp = spacy.load('en_core_web_lg')", - "# add the pipeline stage", - "nlp.add_pipe('dbpedia_spotlight')", - "# get the document", - "doc = nlp('The president of USA is calling Boris Johnson to decide what to do about coronavirus')", - "# see the entities", - "print('Entities', [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])", - "# inspect the raw data from DBpedia spotlight", - "print(doc.ents[0]._.dbpedia_raw_result)" - ], - "category": ["models", "pipeline"], - "author": "Martino Mensio", - "author_links": { - "twitter": "MartinoMensio", - "github": "MartinoMensio", - "website": "https://martinomensio.github.io" - } - }, - { - "id": "spacy-textblob", - "title": "spaCyTextBlob", - "slogan": "Easy sentiment analysis for spaCy using TextBlob. Now supports spaCy 3.0!", - "thumb": "https://github.com/SamEdwardes/spaCyTextBlob/raw/main/website/static/img/logo-thumb-square-250x250.png", - "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extensions `._.polarity`, `._.subjectivity`, and `._.assessments` to `Doc`, `Span`, and `Token` objects. For spaCy 2 please use `pip install pip install spacytextblob==0.1.7`", - "github": "SamEdwardes/spaCyTextBlob", - "pip": "spacytextblob", - "code_example": [ - "import spacy", - "from spacytextblob.spacytextblob import SpacyTextBlob", - "", - "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe('spacytextblob')", - "text = 'I had a really horrible day. It was the worst day ever! 
But every now and then I have a really good day that makes me happy.'", - "doc = nlp(text)", - "doc._.polarity # Polarity: -0.125", - "doc._.subjectivity # Sujectivity: 0.9", - "doc._.assessments # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]" - ], - "code_language": "python", - "url": "https://spacytextblob.netlify.app/", - "author": "Sam Edwardes", - "author_links": { - "twitter": "TheReaLSamlam", - "github": "SamEdwardes", - "website": "https://samedwardes.com" - }, - "category": ["pipeline"], - "tags": ["sentiment", "textblob"] - }, - { - "id": "spacy-ray", - "title": "spacy-ray", - "slogan": "Parallel and distributed training with spaCy and Ray", - "description": "[Ray](https://ray.io/) is a fast and simple framework for building and running **distributed applications**. This very lightweight extension package lets you use Ray for parallel and distributed training with spaCy. If `spacy-ray` is installed in the same environment as spaCy, it will automatically add `spacy ray` commands to your spaCy CLI.", - "github": "explosion/spacy-ray", - "pip": "spacy-ray", - "category": ["training"], - "author": "Explosion / Anyscale", - "thumb": "https://i.imgur.com/7so6ZpS.png" - }, - { - "id": "spacy-sentence-bert", - "title": "spaCy - sentence-transformers", - "slogan": "Pipelines for pretrained sentence-transformers (BERT, RoBERTa, XLM-RoBERTa & Co.) directly within spaCy", - "description": "This library lets you use the embeddings from [sentence-transformers](https://github.com/UKPLab/sentence-transformers) of Docs, Spans and Tokens directly from spaCy. Most models are for the english language but three of them are multilingual.", - "github": "MartinoMensio/spacy-sentence-bert", - "pip": "spacy-sentence-bert", - "code_example": [ - "import spacy_sentence_bert", - "# load one of the models listed at https://github.com/MartinoMensio/spacy-sentence-bert/", - "nlp = spacy_sentence_bert.load_model('en_roberta_large_nli_stsb_mean_tokens')", - "# get two documents", - "doc_1 = nlp('Hi there, how are you?')", - "doc_2 = nlp('Hello there, how are you doing today?')", - "# use the similarity method that is based on the vectors, on Doc, Span or Token", - "print(doc_1.similarity(doc_2[0:7]))" - ], - "category": ["models", "pipeline"], - "author": "Martino Mensio", - "author_links": { - "twitter": "MartinoMensio", - "github": "MartinoMensio", - "website": "https://martinomensio.github.io" - } - }, - { - "id": "spacy-streamlit", - "title": "spacy-streamlit", - "slogan": "spaCy building blocks for Streamlit apps", - "github": "explosion/spacy-streamlit", - "description": "This package contains utilities for visualizing spaCy models and building interactive spaCy-powered apps with [Streamlit](https://streamlit.io). 
It includes various building blocks you can use in your own Streamlit app, like visualizers for **syntactic dependencies**, **named entities**, **text classification**, **semantic similarity** via word vectors, token attributes, and more.", - "pip": "spacy-streamlit", - "category": ["visualizers"], - "thumb": "https://i.imgur.com/mhEjluE.jpg", - "image": "https://user-images.githubusercontent.com/13643239/85388081-f2da8700-b545-11ea-9bd4-e303d3c5763c.png", - "code_example": [ - "import spacy_streamlit", - "", - "models = [\"en_core_web_sm\", \"en_core_web_md\"]", - "default_text = \"Sundar Pichai is the CEO of Google.\"", - "spacy_streamlit.visualize(models, default_text)" - ], - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines", - "website": "https://ines.io" - } - }, - { - "id": "spaczz", - "title": "spaczz", - "slogan": "Fuzzy matching and more for spaCy.", - "description": "Spaczz provides fuzzy matching and multi-token regex matching functionality for spaCy. Spaczz's components have similar APIs to their spaCy counterparts and spaczz pipeline components can integrate into spaCy pipelines where they can be saved/loaded as models.", - "github": "gandersen101/spaczz", - "pip": "spaczz", - "code_example": [ - "import spacy", - "from spaczz.pipeline import SpaczzRuler", - "", - "nlp = spacy.blank('en')", - "ruler = SpaczzRuler(nlp)", - "ruler.add_patterns([{'label': 'PERSON', 'pattern': 'Bill Gates', 'type': 'fuzzy'}])", - "nlp.add_pipe(ruler)", - "", - "doc = nlp('Oops, I spelled Bill Gatez wrong.')", - "print([(ent.text, ent.start, ent.end, ent.label_) for ent in doc.ents])" - ], - "code_language": "python", - "url": "https://spaczz.readthedocs.io/en/latest/", - "author": "Grant Andersen", - "author_links": { - "twitter": "gandersen101", - "github": "gandersen101" - }, - "category": ["pipeline"], - "tags": ["fuzzy-matching", "regex"] - }, - { - "id": "spacy-universal-sentence-encoder", - "title": "spaCy - Universal Sentence Encoder", - "slogan": "Make use of Google's Universal Sentence Encoder directly within spaCy", - "description": "This library lets you use Universal Sentence Encoder embeddings of Docs, Spans and Tokens directly from TensorFlow Hub", - "github": "MartinoMensio/spacy-universal-sentence-encoder", - "pip": "spacy-universal-sentence-encoder", - "code_example": [ - "import spacy_universal_sentence_encoder", - "# load one of the models: ['en_use_md', 'en_use_lg', 'xx_use_md', 'xx_use_lg']", - "nlp = spacy_universal_sentence_encoder.load_model('en_use_lg')", - "# get two documents", - "doc_1 = nlp('Hi there, how are you?')", - "doc_2 = nlp('Hello there, how are you doing today?')", - "# use the similarity method that is based on the vectors, on Doc, Span or Token", - "print(doc_1.similarity(doc_2[0:7]))" - ], - "category": ["models", "pipeline"], - "author": "Martino Mensio", - "author_links": { - "twitter": "MartinoMensio", - "github": "MartinoMensio", - "website": "https://martinomensio.github.io" - } - }, - { - "id": "whatlies", - "title": "whatlies", - "slogan": "Make interactive visualisations to figure out 'what lies' in word embeddings.", - "description": "This small library offers tools to make visualisation easier of both word embeddings as well as operations on them. It has support for spaCy prebuilt models as a first class citizen but also offers support for sense2vec. 
There's a convenient API to perform linear algebra as well as support for popular transformations like PCA/UMAP/etc.", - "github": "rasahq/whatlies", - "pip": "whatlies", - "thumb": "https://i.imgur.com/rOkOiLv.png", - "image": "https://raw.githubusercontent.com/RasaHQ/whatlies/master/docs/gif-two.gif", - "code_example": [ - "from whatlies import EmbeddingSet", - "from whatlies.language import SpacyLanguage", - "", - "lang = SpacyLanguage('en_core_web_md')", - "words = ['cat', 'dog', 'fish', 'kitten', 'man', 'woman', 'king', 'queen', 'doctor', 'nurse']", - "", - "emb = lang[words]", - "emb.plot_interactive(x_axis='man', y_axis='woman')" - ], - "category": ["visualizers", "research"], - "author": "Vincent D. Warmerdam", - "author_links": { - "twitter": "fishnets88", - "github": "koaning", - "website": "https://koaning.io" - } - }, - { - "id": "tokenwiser", - "title": "tokenwiser", - "slogan": "Connect vowpal-wabbit & scikit-learn models to spaCy to run simple classification benchmarks. Comes with many utility functions for spaCy pipelines.", - "github": "koaning/tokenwiser", - "pip": "tokenwiser", - "thumb": "https://koaning.github.io/tokenwiser/token.png", - "image": "https://koaning.github.io/tokenwiser/logo-tokw.png", - "code_example": [ - "import spacy", - "", - "from sklearn.pipeline import make_pipeline", - "from sklearn.feature_extraction.text import CountVectorizer", - "from sklearn.linear_model import LogisticRegression", - "", - "from tokenwiser.component import attach_sklearn_categoriser", - "", - "X = [", - " 'i really like this post',", - " 'thanks for that comment',", - " 'i enjoy this friendly forum',", - " 'this is a bad post',", - " 'i dislike this article',", - " 'this is not well written'", - "]", - "", - "y = ['pos', 'pos', 'pos', 'neg', 'neg', 'neg']", - "", - "# Note that we're training a pipeline here via a single-batch `.fit()` method", - "pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y)", - "", - "nlp = spacy.load('en_core_web_sm')", - "# This is where we attach our pre-trained model as a pipeline step.", - "attach_sklearn_categoriser(nlp, pipe_name='silly_sentiment', estimator=pipe)" - ], - "category": ["pipeline", "training"], - "author": "Vincent D. Warmerdam", - "author_links": { - "twitter": "fishnets88", - "github": "koaning", - "website": "https://koaning.io" - } - }, - { - "id": "spacy-stanza", - "title": "spacy-stanza", - "slogan": "Use the latest Stanza (StanfordNLP) research models directly in spaCy", - "description": "This package wraps the Stanza (formerly StanfordNLP) library, so you can use Stanford's models as a spaCy pipeline. Using this wrapper, you'll be able to use the following annotations, computed by your pretrained `stanza` model:\n\n- Statistical tokenization (reflected in the `Doc` and its tokens)\n - Lemmatization (`token.lemma` and `token.lemma_`)\n - Part-of-speech tagging (`token.tag`, `token.tag_`, `token.pos`, `token.pos_`)\n - Dependency parsing (`token.dep`, `token.dep_`, `token.head`)\n - Named entity recognition (`doc.ents`, `token.ent_type`, `token.ent_type_`, `token.ent_iob`, `token.ent_iob_`)\n - Sentence segmentation (`doc.sents`)", - "github": "explosion/spacy-stanza", - "pip": "spacy-stanza", - "thumb": "https://i.imgur.com/myhLjMJ.png", - "code_example": [ - "import stanza", - "import spacy_stanza", - "", - "stanza.download(\"en\")", - "nlp = spacy_stanza.load_pipeline(\"en\")", - "", - "doc = nlp(\"Barack Obama was born in Hawaii. 
He was elected president in 2008.\")", - "for token in doc:", - " print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)", - "print(doc.ents)" - ], - "category": ["pipeline", "standalone", "models", "research"], - "author": "Explosion", - "author_links": { - "twitter": "explosion_ai", - "github": "explosion", - "website": "https://explosion.ai" - } - }, - { - "id": "spacy-udpipe", - "title": "spacy-udpipe", - "slogan": "Use the latest UDPipe models directly in spaCy", - "description": "This package wraps the fast and efficient UDPipe language-agnostic NLP pipeline (via its Python bindings), so you can use UDPipe pre-trained models as a spaCy pipeline for 50+ languages out-of-the-box. Inspired by spacy-stanza, this package offers slightly less accurate models that are in turn much faster.", - "github": "TakeLab/spacy-udpipe", - "pip": "spacy-udpipe", - "code_example": [ - "import spacy_udpipe", - "", - "spacy_udpipe.download(\"en\") # download English model", - "", - "text = \"Wikipedia is a free online encyclopedia, created and edited by volunteers around the world.\"", - "nlp = spacy_udpipe.load(\"en\")", - "", - "doc = nlp(text)", - "for token in doc:", - " print(token.text, token.lemma_, token.pos_, token.dep_)" - ], - "category": ["pipeline", "standalone", "models", "research"], - "author": "TakeLab", - "author_links": { - "github": "TakeLab", - "website": "https://takelab.fer.hr/" - } - }, - { - "id": "spacy-server", - "title": "spaCy Server", - "slogan": "\uD83E\uDD9C Containerized HTTP API for spaCy NLP", - "description": "For developers who need programming language agnostic NLP, spaCy Server is a containerized HTTP API that provides industrial-strength natural language processing. Unlike other servers, our server is fast, idiomatic, and well documented.", - "github": "neelkamath/spacy-server", - "code_example": [ - "docker run --rm -dp 8080:8080 neelkamath/spacy-server", - "curl http://localhost:8080/ner -H 'Content-Type: application/json' -d '{\"sections\": [\"My name is John Doe. 
I grew up in California.\"]}'" - ], - "code_language": "shell", - "url": "https://hub.docker.com/r/neelkamath/spacy-server", - "author": "Neel Kamath", - "author_links": { - "github": "neelkamath", - "website": "https://neelkamath.com" - }, - "category": ["apis"], - "tags": ["docker"] - }, - { - "id": "nlp-architect", - "title": "NLP Architect", - "slogan": "Python lib for exploring Deep NLP & NLU by Intel AI", - "github": "NervanaSystems/nlp-architect", - "pip": "nlp-architect", - "thumb": "https://i.imgur.com/vMideRx.png", - "category": ["standalone", "research"], - "tags": ["pytorch"] - }, - { - "id": "NeuroNER", - "title": "NeuroNER", - "slogan": "Named-entity recognition using neural networks", - "github": "Franck-Dernoncourt/NeuroNER", - "category": ["models"], - "pip": "pyneuroner[cpu]", - "code_example": [ - "from neuroner import neuromodel", - "nn = neuromodel.NeuroNER(train_model=False, use_pretrained_model=True)" - ], - "tags": ["standalone"] - }, - { - "id": "NLPre", - "title": "NLPre", - "slogan": "Natural Language Preprocessing Library for health data and more", - "github": "NIHOPA/NLPre", - "pip": "nlpre", - "code_example": [ - "from nlpre import titlecaps, dedash, identify_parenthetical_phrases", - "from nlpre import replace_acronyms, replace_from_dictionary", - "ABBR = identify_parenthetical_phrases()(text)", - "parsers = [dedash(), titlecaps(), replace_acronyms(ABBR),", - " replace_from_dictionary(prefix='MeSH_')]", - "for f in parsers:", - " text = f(text)", - "print(text)" - ], - "category": ["scientific", "biomedical"], - "author": "Travis Hoppe", - "author_links": { - "github": "thoppe", - "twitter": "metasemantic", - "website": "http://thoppe.github.io/" - } - }, - { - "id": "Chatterbot", - "title": "Chatterbot", - "slogan": "A machine-learning based conversational dialog engine for creating chat bots", - "github": "gunthercox/ChatterBot", - "pip": "chatterbot", - "thumb": "https://i.imgur.com/eyAhwXk.jpg", - "code_example": [ - "from chatterbot import ChatBot", - "from chatterbot.trainers import ListTrainer", - "# Create a new chat bot named Charlie", - "chatbot = ChatBot('Charlie')", - "trainer = ListTrainer(chatbot)", - "trainer.train([", - "'Hi, can I help you?',", - "'Sure, I would like to book a flight to Iceland.',", - "'Your flight has been booked.'", - "])", - "", - "response = chatbot.get_response('I would like to book a flight.')" - ], - "author": "Gunther Cox", - "author_links": { - "github": "gunthercox" - }, - "category": ["conversational", "standalone"], - "tags": ["chatbots"] - }, - { - "id": "saber", - "title": "saber", - "slogan": "Deep-learning based tool for information extraction in the biomedical domain", - "github": "BaderLab/saber", - "pip": "saber", - "thumb": "https://raw.githubusercontent.com/BaderLab/saber/master/docs/img/saber_logo.png", - "code_example": [ - "from saber.saber import Saber", - "saber = Saber()", - "saber.load('PRGE')", - "saber.annotate('The phosphorylation of Hdm2 by MK2 promotes the ubiquitination of p53.')" - ], - "author": "Bader Lab, University of Toronto", - "category": ["scientific"], - "tags": ["keras", "biomedical"] - }, - { - "id": "alibi", - "title": "alibi", - "slogan": "Algorithms for monitoring and explaining machine learning models ", - "github": "SeldonIO/alibi", - "pip": "alibi", - "thumb": "https://i.imgur.com/YkzQHRp.png", - "code_example": [ - "from alibi.explainers import AnchorTabular", - "explainer = AnchorTabular(predict_fn, feature_names)", - "explainer.fit(X_train)", - "explainer.explain(x)" - ], 
- "author": "Seldon", - "category": ["standalone", "research"] - }, - { - "id": "spacymoji", - "slogan": "Emoji handling and meta data as a spaCy pipeline component", - "github": "ines/spacymoji", - "description": "spaCy extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.", - "pip": "spacymoji", - "category": ["pipeline"], - "tags": ["emoji", "unicode"], - "thumb": "https://i.imgur.com/XOTYIgn.jpg", - "code_example": [ - "import spacy", - "from spacymoji import Emoji", - "", - "nlp = spacy.load('en')", - "emoji = Emoji(nlp)", - "nlp.add_pipe(emoji, first=True)", - "", - "doc = nlp('This is a test 😻 👍🏿')", - "assert doc._.has_emoji == True", - "assert doc[2:5]._.has_emoji == True", - "assert doc[0]._.is_emoji == False", - "assert doc[4]._.is_emoji == True", - "assert doc[5]._.emoji_desc == 'thumbs up dark skin tone'", - "assert len(doc._.emoji) == 2", - "assert doc._.emoji[1] == ('👍🏿', 5, 'thumbs up dark skin tone')" - ], - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines", - "website": "https://ines.io" - } - }, - { - "id": "spacyopentapioca", - "title": "spaCyOpenTapioca", - "slogan": "Named entity linking on Wikidata in spaCy via OpenTapioca", - "description": "A spaCy wrapper of OpenTapioca for named entity linking on Wikidata", - "github": "UB-Mannheim/spacyopentapioca", - "pip": "spacyopentapioca", - "code_example": [ - "import spacy", - "nlp = spacy.blank('en')", - "nlp.add_pipe('opentapioca')", - "doc = nlp('Christian Drosten works in Germany.')", - "for span in doc.ents:", - " print((span.text, span.kb_id_, span.label_, span._.description, span._.score))", - "# ('Christian Drosten', 'Q1079331', 'PERSON', 'German virologist and university teacher', 3.6533377082098895)", - "# ('Germany', 'Q183', 'LOC', 'sovereign state in Central Europe', 2.1099332471902863)", - "## Check also span._.types, span._.aliases, span._.rank" - ], - "category": ["models", "pipeline"], - "tags": ["NER", "NEL"], - "author": "Renat Shigapov", - "author_links": { - "twitter": "_shigapov", - "github": "shigapov" - } - }, - { - "id": "spacy_hunspell", - "slogan": "Add spellchecking and spelling suggestions to your spaCy pipeline using Hunspell", - "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [Hunspell](http://hunspell.github.io) support for spellchecking.", - "github": "tokestermw/spacy_hunspell", - "pip": "spacy_hunspell", - "code_example": [ - "import spacy", - "from spacy_hunspell import spaCyHunSpell", - "", - "nlp = spacy.load('en_core_web_sm')", - "hunspell = spaCyHunSpell(nlp, 'mac')", - "nlp.add_pipe(hunspell)", - "doc = nlp('I can haz cheezeburger.')", - "haz = doc[2]", - "haz._.hunspell_spell # False", - "haz._.hunspell_suggest # ['ha', 'haze', 'hazy', 'has', 'hat', 'had', 'hag', 'ham', 'hap', 'hay', 'haw', 'ha z']" - ], - "author": "Motoki Wu", - "author_links": { - "github": "tokestermw", - "twitter": "plusepsilon" - }, - "category": ["pipeline"], - "tags": ["spellcheck"] - }, - { - "id": "spacy_grammar", - "slogan": 
"Language Tool style grammar handling with spaCy", - "description": "This packages leverages the [Matcher API](https://spacy.io/docs/usage/rule-based-matching) in spaCy to quickly match on spaCy tokens not dissimilar to regex. It reads a `grammar.yml` file to load up custom patterns and returns the results inside `Doc`, `Span`, and `Token`. It is extensible through adding rules to `grammar.yml` (though currently only the simple string matching is implemented).", - "github": "tokestermw/spacy_grammar", - "code_example": [ - "import spacy", - "from spacy_grammar.grammar import Grammar", - "", - "nlp = spacy.load('en')", - "grammar = Grammar(nlp)", - "nlp.add_pipe(grammar)", - "doc = nlp('I can haz cheeseburger.')", - "doc._.has_grammar_error # True" - ], - "author": "Motoki Wu", - "author_links": { - "github": "tokestermw", - "twitter": "plusepsilon" - }, - "category": ["pipeline"] - }, - { - "id": "spacy_kenlm", - "slogan": "KenLM extension for spaCy 2.0", - "github": "tokestermw/spacy_kenlm", - "pip": "spacy_kenlm", - "code_example": [ - "import spacy", - "from spacy_kenlm import spaCyKenLM", - "", - "nlp = spacy.load('en_core_web_sm')", - "spacy_kenlm = spaCyKenLM() # default model from test.arpa", - "nlp.add_pipe(spacy_kenlm)", - "doc = nlp('How are you?')", - "doc._.kenlm_score # doc score", - "doc[:2]._.kenlm_score # span score", - "doc[2]._.kenlm_score # token score" - ], - "author": "Motoki Wu", - "author_links": { - "github": "tokestermw", - "twitter": "plusepsilon" - }, - "category": ["pipeline"] - }, - { - "id": "spacy_readability", - "slogan": "Add text readability meta data to Doc objects", - "description": "spaCy v2.0 pipeline component for calculating readability scores of of text. Provides scores for Flesh-Kincaid grade level, Flesh-Kincaid reading ease, and Dale-Chall.", - "github": "mholtzscher/spacy_readability", - "pip": "spacy-readability", - "code_example": [ - "import spacy", - "from spacy_readability import Readability", - "", - "nlp = spacy.load('en')", - "read = Readability(nlp)", - "nlp.add_pipe(read, last=True)", - "doc = nlp(\"I am some really difficult text to read because I use obnoxiously large words.\")", - "doc._.flesch_kincaid_grade_level", - "doc._.flesch_kincaid_reading_ease", - "doc._.dale_chall" - ], - "author": "Michael Holtzscher", - "author_links": { - "github": "mholtzscher" - }, - "category": ["pipeline"] - }, - { - "id": "spacy-sentence-segmenter", - "title": "Sentence Segmenter", - "slogan": "Custom sentence segmentation for spaCy", - "code_example": [ - "from seg.newline.segmenter import NewLineSegmenter", - "import spacy", - "", - "nlseg = NewLineSegmenter()", - "nlp = spacy.load('en')", - "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')", - "doc = nlp(my_doc_text)" - ], - "author": "tc64", - "author_links": { - "github": "tc64" - }, - "category": ["pipeline"] - }, - { - "id": "spacy_cld", - "title": "spaCy-CLD", - "slogan": "Add language detection to your spaCy pipeline using CLD2", - "description": "spaCy-CLD operates on `Doc` and `Span` spaCy objects. 
When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1).\n\nspacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score for each language.", - "github": "nickdavidhaynes/spacy-cld", - "pip": "spacy_cld", - "code_example": [ - "import spacy", - "from spacy_cld import LanguageDetector", - "", - "nlp = spacy.load('en')", - "language_detector = LanguageDetector()", - "nlp.add_pipe(language_detector)", - "doc = nlp('This is some English text.')", - "", - "doc._.languages # ['en']", - "doc._.language_scores['en'] # 0.96" - ], - "author": "Nicholas D Haynes", - "author_links": { - "github": "nickdavidhaynes" - }, - "category": ["pipeline"] - }, - { - "id": "spacy-lookup", - "slogan": "A powerful entity matcher for very large dictionaries, using the FlashText module", - "description": "spaCy v2.0 extension and pipeline component for adding Named Entities metadata to `Doc` objects. Detects Named Entities using dictionaries. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_entity`, `._.entity_type`, `._.has_entities` and `._.entities`. Named Entities are matched using the python module `flashtext`, and looked up in the data provided by different dictionaries.", - "github": "mpuig/spacy-lookup", - "pip": "spacy-lookup", - "code_example": [ - "import spacy", - "from spacy_lookup import Entity", - "", - "nlp = spacy.load('en')", - "entity = Entity(keywords_list=['python', 'product manager', 'java platform'])", - "nlp.add_pipe(entity, last=True)", - "", - "doc = nlp(\"I am a product manager for a java and python.\")", - "assert doc._.has_entities == True", - "assert doc[0]._.is_entity == False", - "assert doc[3]._.entity_desc == 'product manager'", - "assert doc[3]._.is_entity == True", - "", - "print([(token.text, token._.canonical) for token in doc if token._.is_entity])" - ], - "author": "Marc Puig", - "author_links": { - "github": "mpuig" - }, - "category": ["pipeline"] - }, - { - "id": "spacy-iwnlp", - "slogan": "German lemmatization with IWNLP", - "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [IWNLP-py](https://github.com/Liebeck/iwnlp-py) as German lemmatizer directly into your spaCy pipeline.", - "github": "Liebeck/spacy-iwnlp", - "pip": "spacy-iwnlp", - "code_example": [ - "import spacy", - "from spacy_iwnlp import spaCyIWNLP", - "", - "nlp = spacy.load('de')", - "iwnlp = spaCyIWNLP(lemmatizer_path='data/IWNLP.Lemmatizer_20170501.json')", - "nlp.add_pipe(iwnlp)", - "doc = nlp('Wir mögen Fußballspiele mit ausgedehnten Verlängerungen.')", - "for token in doc:", - " print('POS: {}\tIWNLP:{}'.format(token.pos_, token._.iwnlp_lemmas))" - ], - "author": "Matthias Liebeck", - "author_links": { - "github": "Liebeck" - }, - "category": ["pipeline"], - "tags": ["lemmatizer", "german"] - }, - { - "id": "spacy-sentiws", - "slogan": "German sentiment scores with SentiWS", - "description": "This package uses the 
[spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add [SentiWS](http://wortschatz.uni-leipzig.de/en/download) as German sentiment score directly into your spaCy pipeline.", - "github": "Liebeck/spacy-sentiws", - "pip": "spacy-sentiws", - "code_example": [ - "import spacy", - "from spacy_sentiws import spaCySentiWS", - "", - "nlp = spacy.load('de')", - "sentiws = spaCySentiWS(sentiws_path='data/sentiws/')", - "nlp.add_pipe(sentiws)", - "doc = nlp('Die Dummheit der Unterwerfung blüht in hübschen Farben.')", - "", - "for token in doc:", - " print('{}, {}, {}'.format(token.text, token._.sentiws, token.pos_))" - ], - "author": "Matthias Liebeck", - "author_links": { - "github": "Liebeck" - }, - "category": ["pipeline"], - "tags": ["sentiment", "german"] - }, - { - "id": "spacy-lefff", - "slogan": "POS and French lemmatization with Lefff", - "description": "spaCy v2.0 extension and pipeline component for adding a French POS and lemmatizer based on [Lefff](https://hal.inria.fr/inria-00521242/).", - "github": "sammous/spacy-lefff", - "pip": "spacy-lefff", - "code_example": [ - "import spacy", - "from spacy_lefff import LefffLemmatizer, POSTagger", - "", - "nlp = spacy.load('fr')", - "pos = POSTagger()", - "french_lemmatizer = LefffLemmatizer(after_melt=True)", - "nlp.add_pipe(pos, name='pos', after='parser')", - "nlp.add_pipe(french_lemmatizer, name='lefff', after='pos')", - "doc = nlp(u\"Paris est une ville très chère.\")", - "for d in doc:", - " print(d.text, d.pos_, d._.melt_tagger, d._.lefff_lemma, d.tag_, d.lemma_)" - ], - "author": "Sami Moustachir", - "author_links": { - "github": "sammous" - }, - "category": ["pipeline"], - "tags": ["pos", "lemmatizer", "french"] - }, - { - "id": "lemmy", - "title": "Lemmy", - "slogan": "A Danish lemmatizer", - "description": "Lemmy is a lemmatizer for Danish 🇩🇰 . It comes already trained on Dansk Sprognævns (DSN) word list (‘fuldformliste’) and the Danish Universal Dependencies and is ready for use. Lemmy also supports training on your own dataset. The model currently included in Lemmy was evaluated on the Danish Universal Dependencies dev dataset and scored an accuracy > 99%.\n\nYou can use Lemmy as a spaCy extension, more specifically a spaCy pipeline component. This is highly recommended and makes the lemmas easily accessible from the spaCy tokens. Lemmy makes use of POS tags to predict the lemmas. When wired up to the spaCy pipeline, Lemmy has the benefit of using spaCy’s builtin POS tagger.", - "github": "sorenlind/lemmy", - "pip": "lemmy", - "code_example": [ - "import da_custom_model as da # name of your spaCy model", - "import lemmy.pipe", - "nlp = da.load()", - "", - "# create an instance of Lemmy's pipeline component for spaCy", - "pipe = lemmy.pipe.load()", - "", - "# add the component to the spaCy pipeline.", - "nlp.add_pipe(pipe, after='tagger')", - "", - "# lemmas can now be accessed using the `._.lemma` attribute on the tokens", - "nlp(\"akvariernes\")[0]._.lemma" - ], - "thumb": "https://i.imgur.com/RJVFRWm.jpg", - "author": "Søren Lind Kristiansen", - "author_links": { - "github": "sorenlind" - }, - "category": ["pipeline"], - "tags": ["lemmatizer", "danish"] - }, - { - "id": "dacy", - "title": "DaCy", - "slogan": "An efficient Pipeline for Danish NLP", - "description": "DaCy is a Danish preprocessing pipeline trained in spaCy. It has achieved State-of-the-Art performance on Named entity recognition, part-of-speech tagging and dependency parsing for Danish. 
This repository contains material for using DaCy, reproducing the results, and guides on usage of the package. Furthermore, it also contains a series of behavioural tests for biases and robustness of Danish NLP pipelines.", - "github": "centre-for-humanities-computing/DaCy", - "pip": "dacy", - "code_example": [ - "import dacy", - "print(dacy.models()) # get a list of dacy models", - "nlp = dacy.load('medium') # load your spacy pipeline", - "", - "# DaCy also includes functionality for adding other Danish models to the pipeline", - "# For instance you can add the BertTone model for classification of sentiment polarity to the pipeline:", - "nlp = add_berttone_polarity(nlp)" - ], - "thumb": "https://github.com/centre-for-humanities-computing/DaCy/blob/main/img/icon_no_title.png?raw=true", - "author": "Centre for Humanities Computing Aarhus", - "author_links": { - "github": "centre-for-humanities-computing", - "website": "https://chcaa.io/#/" - }, - "category": ["pipeline"], - "tags": ["pipeline", "danish"] - }, - { - "id": "spacy-wrap", - "title": "spaCy-wrap", - "slogan": "For Wrapping fine-tuned transformers in spaCy pipelines", - "description": "spaCy-wrap is a wrapper library for spaCy for including fine-tuned transformers from Huggingface in your spaCy pipeline, allowing inclusion of existing models within existing workflows.", - "github": "kennethenevoldsen/spacy-wrap", - "pip": "spacy_wrap", - "code_example": [ - "import spacy", - "import spacy_wrap", - "", - "nlp = spacy.blank('en')", - "config = {", - " 'doc_extension_trf_data': 'clf_trf_data', # document extension for the forward pass", - " 'doc_extension_prediction': 'sentiment', # document extension for the prediction", - " 'labels': ['negative', 'neutral', 'positive'],", - " 'model': {", - " 'name': 'cardiffnlp/twitter-roberta-base-sentiment', # the model name or path of huggingface model", - "},", - "}", - "", - "transformer = nlp.add_pipe('classification_transformer', config=config)", - "transformer.model.initialize()", - "", - "doc = nlp('spaCy is a wonderful tool')", - "", - "print(doc._.clf_trf_data)", - "# TransformerData(wordpieces=...", - "print(doc._.sentiment)", - "# 'positive'", - "print(doc._.sentiment_prob)", - "# {'prob': array([0.004, 0.028, 0.969], dtype=float32), 'labels': ['negative', 'neutral', 'positive']}" - ], - "thumb": "https://raw.githubusercontent.com/KennethEnevoldsen/spacy-wrap/main/docs/_static/icon.png", - "author": "Kenneth Enevoldsen", - "author_links": { - "github": "KennethEnevoldsen", - "website": "https://www.kennethenevoldsen.com" - }, - "category": ["pipeline", "models", "training"], - "tags": ["pipeline", "models", "transformers"] - }, - { - "id": "textdescriptives", - "title": "TextDescriptives", - "slogan": "Extraction of descriptive stats, readability, and syntactic complexity measures", - "description": "Pipeline component for spaCy v.3 that calculates descriptive statistics, readability metrics, and syntactic complexity (dependency distance).", - "github": "HLasse/TextDescriptives", - "pip": "textdescriptives", - "code_example": [ - "import spacy", - "import textdescriptives as td", - "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe('textdescriptives')", - "doc = nlp('This is a short test text')", - "doc._.readability # access some of the values", - "td.extract_df(doc) # extract all metrics to DataFrame" - ], - "author": "Lasse Hansen, Kenneth Enevoldsen, Ludvig Olsen", - "author_links": { - "github": "HLasse" - }, - "category": ["pipeline"], - "tags": ["pipeline", "readability", 
"syntactic complexity", "descriptive statistics"] - }, - { - "id": "wmd-relax", - "slogan": "Calculates word mover's distance insanely fast", - "description": "Calculates Word Mover's Distance as described in [From Word Embeddings To Document Distances](http://www.cs.cornell.edu/~kilian/papers/wmd_metric.pdf) by Matt Kusner, Yu Sun, Nicholas Kolkin and Kilian Weinberger.\n\n⚠️ **This package is currently only compatible with spaCy v.1x.**", - "github": "src-d/wmd-relax", - "thumb": "https://i.imgur.com/f91C3Lf.jpg", - "code_example": [ - "import spacy", - "import wmd", - "", - "nlp = spacy.load('en', create_pipeline=wmd.WMD.create_spacy_pipeline)", - "doc1 = nlp(\"Politician speaks to the media in Illinois.\")", - "doc2 = nlp(\"The president greets the press in Chicago.\")", - "print(doc1.similarity(doc2))" - ], - "author": "source{d}", - "author_links": { - "github": "src-d", - "twitter": "sourcedtech", - "website": "https://sourced.tech" - }, - "category": ["pipeline"] - }, - { - "id": "neuralcoref", - "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy", - "description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source.", - "github": "huggingface/neuralcoref", - "thumb": "https://i.imgur.com/j6FO9O6.jpg", - "code_example": [ - "import spacy", - "import neuralcoref", - "", - "nlp = spacy.load('en')", - "neuralcoref.add_to_pipe(nlp)", - "doc1 = nlp('My sister has a dog. She loves him.')", - "print(doc1._.coref_clusters)", - "", - "doc2 = nlp('Angela lives in Boston. She is quite happy in that city.')", - "for ent in doc2.ents:", - " print(ent._.coref_cluster)" - ], - "author": "Hugging Face", - "author_links": { - "github": "huggingface" - }, - "category": ["standalone", "conversational", "models"], - "tags": ["coref"] - }, - { - "id": "neuralcoref-vizualizer", - "title": "Neuralcoref Visualizer", - "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy", - "description": "In short, coreference is the fact that two or more expressions in a text – like pronouns or nouns – link to the same person or thing. It is a classical Natural language processing task, that has seen a revival of interest in the past two years as several research groups applied cutting-edge deep-learning and reinforcement-learning techniques to it. It is also one of the key building blocks to building conversational Artificial intelligences.", - "url": "https://huggingface.co/coref/", - "image": "https://i.imgur.com/3yy4Qyf.png", - "thumb": "https://i.imgur.com/j6FO9O6.jpg", - "github": "huggingface/neuralcoref", - "category": ["visualizers", "conversational"], - "tags": ["coref", "chatbots"], - "author": "Hugging Face", - "author_links": { - "github": "huggingface" - } - }, - { - "id": "spacy-vis", - "slogan": "A visualisation tool for spaCy using Hierplane", - "description": "A visualiser for spaCy annotations. 
This visualisation uses the [Hierplane](https://allenai.github.io/hierplane/) Library to render the dependency parse from spaCy's models. It also includes visualisation of entities and POS tags within nodes.", - "github": "DeNeutoy/spacy-vis", - "url": "http://spacyvis.allennlp.org/spacy-parser", - "thumb": "https://i.imgur.com/DAG9QFd.jpg", - "image": "https://raw.githubusercontent.com/DeNeutoy/spacy-vis/master/img/example.gif", - "author": "Mark Neumann", - "author_links": { - "twitter": "MarkNeumannnn", - "github": "DeNeutoy" - }, - "category": ["visualizers"] - }, - { - "id": "matcher-explorer", - "title": "Rule-based Matcher Explorer", - "slogan": "Test spaCy's rule-based Matcher by creating token patterns interactively", - "description": "Test spaCy's rule-based `Matcher` by creating token patterns interactively and running them over your text. Each token can set multiple attributes like text value, part-of-speech tag or boolean flags. The token-based view lets you explore how spaCy processes your text – and why your pattern matches, or why it doesn't. For more details on rule-based matching, see the [documentation](https://spacy.io/usage/rule-based-matching).", - "image": "https://explosion.ai/assets/img/demos/matcher.png", - "thumb": "https://i.imgur.com/rPK4AGt.jpg", - "url": "https://explosion.ai/demos/matcher", - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines", - "website": "https://ines.io" - }, - "category": ["visualizers"] - }, - { - "id": "displacy", - "title": "displaCy", - "slogan": "A modern syntactic dependency visualizer", - "description": "Visualize spaCy's guess at the syntactic structure of a sentence. Arrows point from children to heads, and are labelled by their relation type.", - "url": "https://explosion.ai/demos/displacy", - "thumb": "https://i.imgur.com/nxDcHaL.jpg", - "image": "https://explosion.ai/assets/img/demos/displacy.png", - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines", - "website": "https://ines.io" - }, - "category": ["visualizers"] - }, - { - "id": "displacy-ent", - "title": "displaCy ENT", - "slogan": "A modern named entity visualizer", - "description": "Visualize spaCy's guess at the named entities in the document. 
You can filter the displayed types, to only show the annotations you're interested in.", - "url": "https://explosion.ai/demos/displacy-ent", - "thumb": "https://i.imgur.com/A77Ecbs.jpg", - "image": "https://explosion.ai/assets/img/demos/displacy-ent.png", - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines", - "website": "https://ines.io" - }, - "category": ["visualizers"] - }, - { - "id": "explacy", - "slogan": "A small tool that explains spaCy parse results", - "github": "tylerneylon/explacy", - "thumb": "https://i.imgur.com/V1hCWmn.jpg", - "image": "https://raw.githubusercontent.com/tylerneylon/explacy/master/img/screenshot.png", - "code_example": [ - "import spacy", - "import explacy", - "", - "nlp = spacy.load('en')", - "explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')" - ], - "author": "Tyler Neylon", - "author_links": { - "github": "tylerneylon" - }, - "category": ["visualizers"] - }, - { - "id": "deplacy", - "slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis", - "description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.", - "github": "KoichiYasuoka/deplacy", - "image": "https://i.imgur.com/6uOI4Op.png", - "code_example": [ - "import spacy", - "import deplacy", - "", - "nlp=spacy.load('en_core_web_sm')", - "doc=nlp('I saw a horse yesterday which had no name.')", - "deplacy.render(doc)" - ], - "author": "Koichi Yasuoka", - "author_links": { - "github": "KoichiYasuoka" - }, - "category": ["visualizers"] - }, - { - "id": "scattertext", - "slogan": "Beautiful visualizations of how language differs among document types", - "description": "A tool for finding distinguishing terms in small-to-medium-sized corpora, and presenting them in a sexy, interactive scatter plot with non-overlapping term labels. Exploratory data analysis just got more fun.", - "github": "JasonKessler/scattertext", - "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png", - "code_example": [ - "import spacy", - "import scattertext as st", - "", - "nlp = spacy.load('en')", - "corpus = st.CorpusFromPandas(convention_df,", - " category_col='party',", - " text_col='text',", - " nlp=nlp).build()" - ], - "author": "Jason Kessler", - "author_links": { - "github": "JasonKessler", - "twitter": "jasonkessler" - }, - "category": ["visualizers"] - }, - { - "id": "rasa", - "title": "Rasa", - "slogan": "Turn natural language into structured data", - "description": "Machine learning tools for developers to build, improve, and deploy contextual chatbots and assistants. Powered by open source.", - "github": "RasaHQ/rasa", - "pip": "rasa", - "thumb": "https://i.imgur.com/TyZnpwL.png", - "url": "https://rasa.com/", - "author": "Rasa", - "author_links": { - "github": "RasaHQ" - }, - "category": ["conversational"], - "tags": ["chatbots"] - }, - { - "id": "mindmeld", - "title": "MindMeld - Conversational AI platform", - "slogan": "Conversational AI platform for deep-domain voice interfaces and chatbots", - "description": "The MindMeld Conversational AI platform is among the most advanced AI platforms for building production-quality conversational applications. It is a Python-based machine learning framework which encompasses all of the algorithms and utilities required for this purpose. 
(https://github.com/cisco/mindmeld)", - "github": "cisco/mindmeld", - "pip": "mindmeld", - "thumb": "https://www.mindmeld.com/img/mindmeld-logo.png", - "category": ["conversational", "ner"], - "tags": ["chatbots"], - "author": "Cisco", - "author_links": { - "github": "cisco/mindmeld", - "website": "https://www.mindmeld.com/" - } - }, - { - "id": "torchtext", - "title": "torchtext", - "slogan": "Data loaders and abstractions for text and NLP", - "github": "pytorch/text", - "pip": "torchtext", - "thumb": "https://i.imgur.com/WFkxuPo.png", - "code_example": [ - ">>> pos = data.TabularDataset(", - "... path='data/pos/pos_wsj_train.tsv', format='tsv',", - "... fields=[('text', data.Field()),", - "... ('labels', data.Field())])", - "...", - ">>> sentiment = data.TabularDataset(", - "... path='data/sentiment/train.json', format='json',", - "... fields={'sentence_tokenized': ('text', data.Field(sequential=True)),", - "... 'sentiment_gold': ('labels', data.Field(sequential=False))})" - ], - "category": ["standalone", "research"], - "tags": ["pytorch"] - }, - { - "id": "allennlp", - "title": "AllenNLP", - "slogan": "An open-source NLP research library, built on PyTorch and spaCy", - "description": "AllenNLP is a new library designed to accelerate NLP research, by providing a framework that supports modern deep learning workflows for cutting-edge language understanding problems. AllenNLP uses spaCy as a preprocessing component. You can also use Allen NLP to develop spaCy pipeline components, to add annotations to the `Doc` object.", - "github": "allenai/allennlp", - "pip": "allennlp", - "thumb": "https://i.imgur.com/U8opuDN.jpg", - "url": "http://allennlp.org", - "author": " Allen Institute for Artificial Intelligence", - "author_links": { - "github": "allenai", - "twitter": "allenai_org", - "website": "http://allenai.org" - }, - "category": ["standalone", "research"] - }, - { - "id": "scispacy", - "title": "scispaCy", - "slogan": "A full spaCy pipeline and models for scientific/biomedical documents", - "github": "allenai/scispacy", - "pip": "scispacy", - "thumb": "https://i.imgur.com/dJQSclW.png", - "url": "https://allenai.github.io/scispacy/", - "author": " Allen Institute for Artificial Intelligence", - "author_links": { - "github": "allenai", - "twitter": "allenai_org", - "website": "http://allenai.org" - }, - "category": ["scientific", "models", "research"] - }, - { - "id": "textacy", - "slogan": "NLP, before and after spaCy", - "description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. – delegated to another library, `textacy` focuses on the tasks that come before and follow after.", - "github": "chartbeat-labs/textacy", - "pip": "textacy", - "url": "https://github.com/chartbeat-labs/textacy", - "author": "Burton DeWilde", - "author_links": { - "github": "bdewilde", - "twitter": "bjdewilde" - }, - "category": ["standalone"] - }, - { - "id": "textpipe", - "slogan": "clean and extract metadata from text", - "description": "`textpipe` is a Python package for converting raw text in to clean, readable text and extracting metadata from that text. 
Its functionalities include transforming raw text into readable text by removing HTML tags and extracting metadata such as the number of words and named entities from the text.", - "github": "textpipe/textpipe", - "pip": "textpipe", - "author": "Textpipe Contributors", - "author_links": { - "github": "textpipe", - "website": "https://github.com/textpipe/textpipe/blob/master/CONTRIBUTORS.md" - }, - "category": ["standalone"], - "tags": ["text-processing", "named-entity-recognition"], - "thumb": "https://avatars0.githubusercontent.com/u/40492530", - "code_example": [ - "from textpipe import doc, pipeline", - "sample_text = 'Sample text! '", - "document = doc.Doc(sample_text)", - "print(document.clean)", - "'Sample text!'", - "print(document.language)", - "# 'en'", - "print(document.nwords)", - "# 2", - "", - "pipe = pipeline.Pipeline(['CleanText', 'NWords'])", - "print(pipe(sample_text))", - "# {'CleanText': 'Sample text!', 'NWords': 2}" - ] - }, - { - "id": "mordecai", - "slogan": "Full text geoparsing using spaCy, Geonames and Keras", - "description": "Extract the place names from a piece of text, resolve them to the correct place, and return their coordinates and structured geographic information.", - "github": "openeventdata/mordecai", - "pip": "mordecai", - "thumb": "https://i.imgur.com/gPJ9upa.jpg", - "code_example": [ - "from mordecai import Geoparser", - "geo = Geoparser()", - "geo.geoparse(\"I traveled from Oxford to Ottawa.\")" - ], - "author": "Andy Halterman", - "author_links": { - "github": "ahalterman", - "twitter": "ahalterman" - }, - "category": ["standalone", "scientific"] - }, - { - "id": "kindred", - "title": "Kindred", - "slogan": "Biomedical relation extraction using spaCy", - "description": "Kindred is a package for relation extraction in biomedical texts. Given some training data, it can build a model to identify relations between entities (e.g. drugs, genes, etc) in a sentence.", - "github": "jakelever/kindred", - "pip": "kindred", - "code_example": [ - "import kindred", - "", - "trainCorpus = kindred.bionlpst.load('2016-BB3-event-train')", - "devCorpus = kindred.bionlpst.load('2016-BB3-event-dev')", - "predictionCorpus = devCorpus.clone()", - "predictionCorpus.removeRelations()", - "classifier = kindred.RelationClassifier()", - "classifier.train(trainCorpus)", - "classifier.predict(predictionCorpus)", - "f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')" - ], - "author": "Jake Lever", - "author_links": { - "github": "jakelever" - }, - "category": ["standalone", "scientific"] - }, - { - "id": "sense2vec", - "slogan": "Use NLP to go beyond vanilla word2vec", - "description": "sense2vec ([Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn more interesting, detailed and context-sensitive word vectors. 
For an interactive example of the technology, see our [sense2vec demo](https://explosion.ai/demos/sense2vec) that lets you explore semantic similarities across all Reddit comments of 2015.", - "github": "explosion/sense2vec", - "pip": "sense2vec==1.0.0a1", - "thumb": "https://i.imgur.com/awfdhX6.jpg", - "image": "https://explosion.ai/assets/img/demos/sense2vec.png", - "url": "https://explosion.ai/demos/sense2vec", - "code_example": [ - "import spacy", - "", - "nlp = spacy.load(\"en_core_web_sm\")", - "s2v = nlp.add_pipe(\"sense2vec\")", - "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")", - "", - "doc = nlp(\"A sentence about natural language processing.\")", - "assert doc[3:6].text == \"natural language processing\"", - "freq = doc[3:6]._.s2v_freq", - "vector = doc[3:6]._.s2v_vec", - "most_similar = doc[3:6]._.s2v_most_similar(3)", - "# [(('machine learning', 'NOUN'), 0.8986967),", - "# (('computer vision', 'NOUN'), 0.8636297),", - "# (('deep learning', 'NOUN'), 0.8573361)]" - ], - "category": ["pipeline", "standalone", "visualizers"], - "tags": ["vectors"], - "author": "Explosion", - "author_links": { - "twitter": "explosion_ai", - "github": "explosion", - "website": "https://explosion.ai" - } - }, - { - "id": "spacyr", - "slogan": "An R wrapper for spaCy", - "github": "quanteda/spacyr", - "cran": "spacyr", - "code_example": [ - "library(\"spacyr\")", - "spacy_initialize()", - "", - "txt <- c(d1 = \"spaCy excels at large-scale information extraction tasks.\",", - " d2 = \"Mr. Smith goes to North Carolina.\")", - "", - "# process documents and obtain a data.table", - "parsedtxt <- spacy_parse(txt)" - ], - "code_language": "r", - "author": "Kenneth Benoit & Aki Matsuo", - "category": ["nonpython"] - }, - { - "id": "cleannlp", - "title": "CleanNLP", - "slogan": "A tidy data model for NLP in R", - "description": "The cleanNLP package is designed to make it as painless as possible to turn raw text into feature-rich data frames. the package offers four backends that can be used for parsing text: `tokenizers`, `udpipe`, `spacy` and `corenlp`.", - "github": "statsmaths/cleanNLP", - "cran": "cleanNLP", - "author": "Taylor B. Arnold", - "author_links": { - "github": "statsmaths" - }, - "category": ["nonpython"] - }, - { - "id": "spacy-cpp", - "slogan": "C++ wrapper library for spaCy", - "description": "The goal of spacy-cpp is to expose the functionality of spaCy to C++ applications, and to provide an API that is similar to that of spaCy, enabling rapid development in Python and simple porting to C++.", - "github": "d99kris/spacy-cpp", - "code_example": [ - "Spacy::Spacy spacy;", - "auto nlp = spacy.load(\"en_core_web_sm\");", - "auto doc = nlp.parse(\"This is a sentence.\");", - "for (auto& token : doc.tokens())", - " std::cout << token.text() << \" [\" << token.pos_() << \"]\\n\";" - ], - "code_language": "cpp", - "author": "Kristofer Berggren", - "author_links": { - "github": "d99kris" - }, - "category": ["nonpython"] - }, - { - "id": "spaCy.jl", - "slogan": "Julia interface for spaCy (work in progress)", - "github": "jekbradbury/SpaCy.jl", - "author": "James Bradbury", - "author_links": { - "github": "jekbradbury", - "twitter": "jekbradbury" - }, - "category": ["nonpython"] - }, - { - "id": "ruby-spacy", - "title": "ruby-spacy", - "slogan": "Wrapper module for using spaCy from Ruby via PyCall", - "description": "ruby-spacy is a wrapper module for using spaCy from the Ruby programming language via PyCall. 
This module aims to make it easy and natural for Ruby programmers to use spaCy.", - "github": "yohasebe/ruby-spacy", - "code_example": [ - "require \"ruby-spacy\"", - "require \"terminal-table\"", - "nlp = Spacy::Language.new(\"en_core_web_sm\")", - "doc = nlp.read(\"Apple is looking at buying U.K. startup for $1 billion\")", - "headings = [\"text\", \"lemma\", \"pos\", \"tag\", \"dep\"]", - "rows = []", - "doc.each do |token|", - " rows << [token.text, token.lemma, token.pos, token.tag, token.dep]", - "end", - "table = Terminal::Table.new rows: rows, headings: headings", - "puts table" - ], - "code_language": "ruby", - "url": "https://rubygems.org/gems/ruby-spacy", - "author": "Yoichiro Hasebe", - "author_links": { - "github": "yohasebe", - "twitter": "yohasebe" - }, - "category": ["nonpython"], - "tags": ["ruby"] - }, - { - "id": "spacy_api", - "slogan": "Server/client to load models in a separate, dedicated process", - "github": "kootenpv/spacy_api", - "pip": "spacy_api", - "code_example": [ - "from spacy_api import Client", - "", - "spacy_client = Client() # default args host/port", - "doc = spacy_client.single(\"How are you\")" - ], - "author": "Pascal van Kooten", - "author_links": { - "github": "kootenpv" - }, - "category": ["apis"] - }, - { - "id": "spacy-api-docker", - "slogan": "spaCy REST API, wrapped in a Docker container", - "github": "jgontrum/spacy-api-docker", - "url": "https://hub.docker.com/r/jgontrum/spacyapi/", - "thumb": "https://i.imgur.com/NRnDKyj.jpg", - "code_example": [ - "version: '2'", - "", - "services:", - " spacyapi:", - " image: jgontrum/spacyapi:en_v2", - " ports:", - " - \"127.0.0.1:8080:80\"", - " restart: always" - ], - "code_language": "docker", - "author": "Johannes Gontrum", - "author_links": { - "github": "jgontrum" - }, - "category": ["apis"] - }, - { - "id": "languagecrunch", - "slogan": "NLP server for spaCy, WordNet and NeuralCoref as a Docker image", - "github": "artpar/languagecrunch", - "code_example": [ - "docker run -it -p 8080:8080 artpar/languagecrunch", - "curl http://localhost:8080/nlp/parse?`echo -n \"The new twitter is so weird. Seriously. Why is there a new twitter? What was wrong with the old one? Fix it now.\" | python -c \"import urllib, sys; print(urllib.urlencode({'sentence': sys.stdin.read()}))\"`" - ], - "code_language": "bash", - "author": "Parth Mudgal", - "author_links": { - "github": "artpar" - }, - "category": ["apis"] - }, - { - "id": "spacy-nlp", - "slogan": " Expose spaCy NLP text parsing to Node.js (and other languages) via Socket.IO", - "github": "kengz/spacy-nlp", - "thumb": "https://i.imgur.com/w41VSr7.jpg", - "code_example": [ - "const spacyNLP = require(\"spacy-nlp\")", - "// default port 6466", - "// start the server with the python client that exposes spacyIO (or use an existing socketIO server at IOPORT)", - "var serverPromise = spacyNLP.server({ port: process.env.IOPORT });", - "// Loading spacy may take up to 15s" - ], - "code_language": "javascript", - "author": "Wah Loon Keng", - "author_links": { - "github": "kengz" - }, - "category": ["apis", "nonpython"] - }, - { - "id": "prodigy", - "title": "Prodigy", - "slogan": "Radically efficient machine teaching, powered by active learning", - "description": "Prodigy is an annotation tool so efficient that data scientists can do the annotation themselves, enabling a new level of rapid iteration. Whether you're working on entity recognition, intent detection or image classification, Prodigy can help you train and evaluate your models faster. 
Stream in your own examples or real-world data from live APIs, update your model in real-time and chain models together to build more complex systems.", - "thumb": "https://i.imgur.com/UVRtP6g.jpg", - "image": "https://i.imgur.com/Dt5vrY6.png", - "url": "https://prodi.gy", - "code_example": [ - "prodigy dataset ner_product \"Improve PRODUCT on Reddit data\"", - "✨ Created dataset 'ner_product'.", - "", - "prodigy ner.teach ner_product en_core_web_sm ~/data.jsonl --label PRODUCT", - "✨ Starting the web server on port 8080..." - ], - "code_language": "bash", - "category": ["standalone", "training"], - "author": "Explosion", - "author_links": { - "twitter": "explosion_ai", - "github": "explosion", - "website": "https://explosion.ai" - } - }, - { - "id": "dragonfire", - "title": "Dragonfire", - "slogan": "An open-source virtual assistant for Ubuntu based Linux distributions", - "github": "DragonComputer/Dragonfire", - "thumb": "https://i.imgur.com/5fqguKS.jpg", - "image": "https://raw.githubusercontent.com/DragonComputer/Dragonfire/master/docs/img/demo.gif", - "author": "Dragon Computer", - "author_links": { - "github": "DragonComputer", - "website": "http://dragon.computer" - }, - "category": ["standalone"] - }, - { - "id": "prefect", - "title": "Prefect", - "slogan": "Workflow management system designed for modern infrastructure", - "github": "PrefectHQ/prefect", - "pip": "prefect", - "thumb": "https://i.imgur.com/oLTwr0e.png", - "code_example": [ - "from prefect import Flow", - "from prefect.tasks.spacy.spacy_tasks import SpacyNLP", - "import spacy", - "", - "nlp = spacy.load(\"en_core_web_sm\")", - "", - "with Flow(\"Natural Language Processing\") as flow:", - " doc = SpacyNLP(text=\"This is some text\", nlp=nlp)", - "", - "flow.run()" - ], - "author": "Prefect", - "author_links": { - "website": "https://prefect.io" - }, - "category": ["standalone"] - }, - { - "id": "graphbrain", - "title": "Graphbrain", - "slogan": "Automated meaning extraction and text understanding", - "description": "Graphbrain is an Artificial Intelligence open-source software library and scientific research tool. Its aim is to facilitate automated meaning extraction and text understanding, as well as the exploration and inference of knowledge.", - "github": "graphbrain/graphbrain", - "pip": "graphbrain", - "thumb": "https://i.imgur.com/cct9W1E.png", - "author": "Graphbrain", - "category": ["standalone"] - }, - { - "type": "education", - "id": "nostarch-nlp-python", - "title": "Natural Language Processing Using Python", - "slogan": "No Starch Press, 2020", - "description": "Natural Language Processing Using Python is an introduction to natural language processing (NLP), the task of converting human language into data that a computer can process. The book uses spaCy, a leading Python library for NLP, to guide readers through common NLP tasks related to generating and understanding human language with code. 
It addresses problems like understanding a user's intent, continuing a conversation with a human, and maintaining the state of a conversation.", - "cover": "https://i.imgur.com/w0iycjl.jpg", - "url": "https://nostarch.com/NLPPython", - "author": "Yuli Vasiliev", - "category": ["books"] - }, - { - "type": "education", - "id": "oreilly-python-ds", - "title": "Introduction to Machine Learning with Python: A Guide for Data Scientists", - "slogan": "O'Reilly, 2016", - "description": "Machine learning has become an integral part of many commercial applications and research projects, but this field is not exclusive to large companies with extensive research teams. If you use Python, even as a beginner, this book will teach you practical ways to build your own machine learning solutions. With all the data available today, machine learning applications are limited only by your imagination.", - "cover": "https://covers.oreillystatic.com/images/0636920030515/lrg.jpg", - "url": "http://shop.oreilly.com/product/0636920030515.do", - "author": "Andreas Müller, Sarah Guido", - "category": ["books"] - }, - { - "type": "education", - "id": "text-analytics-python", - "title": "Text Analytics with Python", - "slogan": "Apress / Springer, 2016", - "description": "*Text Analytics with Python* teaches you the techniques related to natural language processing and text analytics, and you will gain the skills to know which technique is best suited to solve a particular problem. You will look at each technique and algorithm with both a bird's eye view to understand how it can be used as well as with a microscopic view to understand the mathematical concepts and to implement them to solve your own problems.", - "github": "dipanjanS/text-analytics-with-python", - "cover": "https://i.imgur.com/AOmzZu8.png", - "url": "https://www.amazon.com/Text-Analytics-Python-Real-World-Actionable/dp/148422387X", - "author": "Dipanjan Sarkar", - "category": ["books"] - }, - { - "type": "education", - "id": "practical-ml-python", - "title": "Practical Machine Learning with Python", - "slogan": "Apress, 2017", - "description": "Master the essential skills needed to recognize and solve complex problems with machine learning and deep learning. Using real-world examples that leverage the popular Python machine learning ecosystem, this book is your perfect companion for learning the art and science of machine learning to become a successful practitioner. The concepts, techniques, tools, frameworks, and methodologies used in this book will teach you how to think, design, build, and execute machine learning systems and projects successfully.", - "github": "dipanjanS/practical-machine-learning-with-python", - "cover": "https://i.imgur.com/5F4mkt7.jpg", - "url": "https://www.amazon.com/Practical-Machine-Learning-Python-Problem-Solvers/dp/1484232062", - "author": "Dipanjan Sarkar, Raghav Bali, Tushar Sharma", - "category": ["books"] - }, - { - "type": "education", - "id": "packt-nlp-computational-linguistics", - "title": "Natural Language Processing and Computational Linguistics", - "slogan": "Packt, 2018", - "description": "This book shows you how to use natural language processing, and computational linguistics algorithms, to make inferences and gain insights about data you have. These algorithms are based on statistical machine learning and artificial intelligence techniques. 
The tools to work with these algorithms are available to you right now - with Python, and tools like Gensim and spaCy.", - "cover": "https://i.imgur.com/aleMf1Y.jpg", - "url": "https://www.amazon.com/Natural-Language-Processing-Computational-Linguistics-ebook/dp/B07BWH779J", - "author": "Bhargav Srinivasa-Desikan", - "category": ["books"] - }, - { - "type": "education", - "id": "mastering-spacy", - "title": "Mastering spaCy", - "slogan": "Packt, 2021", - "description": "This is your ultimate spaCy book. Master the crucial skills to use spaCy components effectively to create real-world NLP applications with spaCy. Explaining linguistic concepts such as dependency parsing, POS-tagging and named entity extraction with many examples, this book will help you to conquer computational linguistics with spaCy. The book further focuses on ML topics with Keras and Tensorflow. You'll cover popular topics, including intent recognition, sentiment analysis and context resolution; and use them on popular datasets and interpret the results. A special hands-on section on chatbot design is included.", - "github": "PacktPublishing/Mastering-spaCy", - "cover": "https://tinyimg.io/i/aWEm0dh.jpeg", - "url": "https://www.amazon.com/Mastering-spaCy-end-end-implementing/dp/1800563353", - "author": "Duygu Altinok", - "author_links": { - "github": "DuyguA", - "website": "https://www.linkedin.com/in/duygu-altinok-4021389a" - }, - "category": ["books"] - }, - { - "type": "education", - "id": "applied-nlp-in-enterprise", - "title": "Applied Natural Language Processing in the Enterprise: Teaching Machines to Read, Write, and Understand", - "slogan": "O'Reilly, 2021", - "description": "Natural language processing (NLP) is one of the hottest topics in AI today. Having lagged behind other deep learning fields such as computer vision for years, NLP only recently gained mainstream popularity. Even though Google, Facebook, and OpenAI have open sourced large pretrained language models to make NLP easier, many organizations today still struggle with developing and productionizing NLP applications. This hands-on guide helps you learn the field quickly.", - "github": "nlpbook/nlpbook", - "cover": "https://i.imgur.com/6RxLBvf.jpg", - "url": "https://www.amazon.com/dp/149206257X", - "author": "Ankur A. Patel", - "author_links": { - "github": "aapatel09", - "website": "https://www.ankurapatel.io" - }, - "category": ["books"] - }, - { - "type": "education", - "id": "learning-path-spacy", - "title": "Learning Path: Mastering spaCy for Natural Language Processing", - "slogan": "O'Reilly, 2017", - "description": "spaCy, a fast, user-friendly library for teaching computers to understand text, simplifies NLP techniques, such as speech tagging and syntactic dependencies, so you can easily extract information, attributes, and objects from massive amounts of text to then document, measure, and analyze. This Learning Path is a hands-on introduction to using spaCy to discover insights through natural language processing. While end-to-end natural language processing solutions can be complex, you’ll learn the linguistics, algorithms, and machine learning skills to get the job done.", - "url": "https://www.safaribooksonline.com/library/view/learning-path-mastering/9781491986653/", - "thumb": "https://i.imgur.com/9MIgMAc.jpg", - "author": "Aaron Kramer", - "category": ["courses"] - }, - { - "type": "education", - "id": "introduction-into-spacy-3", - "title": "Introduction to spaCy 3", - "slogan": "A free course for beginners by Dr. W.J.B. 
Mattingly", - "url": "http://spacy.pythonhumanities.com/", - "thumb": "https://spacy.pythonhumanities.com/_static/freecodecamp_small.jpg", - "author": "Dr. W.J.B. Mattingly", - "category": ["courses"] - }, - { - "type": "education", - "id": "spacy-course", - "title": "Advanced NLP with spaCy", - "slogan": "A free online course", - "description": "In this free interactive course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", - "url": "https://course.spacy.io", - "image": "https://i.imgur.com/JC00pHW.jpg", - "thumb": "https://i.imgur.com/5RXLtrr.jpg", - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines", - "website": "https://ines.io" - }, - "category": ["courses"] - }, - { - "type": "education", - "id": "applt-course", - "title": "Applied Language Technology", - "slogan": "NLP for newcomers using spaCy and Stanza", - "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.", - "url": "https://applied-language-technology.mooc.fi", - "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", - "thumb": "https://www.mv.helsinki.fi/home/thiippal/images/applt-logo.png", - "author": "Tuomo Hiippala", - "author_links": { - "twitter": "tuomo_h", - "github": "thiippal", - "website": "https://www.mv.helsinki.fi/home/thiippal/" - }, - "category": ["courses"] - }, - { - "type": "education", - "id": "video-spacys-ner-model", - "title": "spaCy's NER model", - "slogan": "Incremental parsing with bloom embeddings and residual CNNs", - "description": "spaCy v2.0's Named Entity Recognition system features a sophisticated word embedding strategy using subword features and \"Bloom\" embeddings, a deep convolutional neural network with residual connections, and a novel transition-based approach to named entity parsing. The system is designed to give a good balance of efficiency, accuracy and adaptability. In this talk, I sketch out the components of the system, explaining the intuition behind the various choices. I also give a brief introduction to the named entity recognition problem, with an overview of what else Explosion AI is working on, and why.", - "youtube": "sqDHBH9IjRU", - "author": "Matthew Honnibal", - "author_links": { - "twitter": "honnibal", - "github": "honnibal", - "website": "https://explosion.ai" - }, - "category": ["videos"] - }, - { - "type": "education", - "id": "video-new-nlp-solutions", - "title": "Building new NLP solutions with spaCy and Prodigy", - "slogan": "PyData Berlin 2018", - "description": "In this talk, I will discuss how to address some of the most likely causes of failure for new Natural Language Processing (NLP) projects. 
My main recommendation is to take an iterative approach: don't assume you know what your pipeline should look like, let alone your annotation schemes or model architectures.", - "author": "Matthew Honnibal", - "author_links": { - "twitter": "honnibal", - "github": "honnibal", - "website": "https://explosion.ai" - }, - "youtube": "jpWqz85F_4Y", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-modern-nlp-in-python", - "title": "Modern NLP in Python", - "slogan": "PyData DC 2016", - "description": "Academic and industry research in Natural Language Processing (NLP) has progressed at an accelerating pace over the last several years. Members of the Python community have been hard at work moving cutting-edge research out of papers and into open source, \"batteries included\" software libraries that can be applied to practical problems. We'll explore some of these tools for modern NLP in Python.", - "author": "Patrick Harrison", - "youtube": "6zm9NC9uRkk", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-spacy-course", - "title": "Advanced NLP with spaCy · A free online course", - "description": "spaCy is a modern Python library for industrial-strength Natural Language Processing. In this free and interactive online course, you'll learn how to use spaCy to build advanced natural language understanding systems, using both rule-based and machine learning approaches.", - "url": "https://course.spacy.io/en", - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines" - }, - "youtube": "THduWAnG97k", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-spacy-course-de", - "title": "Modernes NLP mit spaCy · Ein Gratis-Onlinekurs", - "description": "spaCy ist eine moderne Python-Bibliothek für industriestarkes Natural Language Processing. In diesem kostenlosen und interaktiven Onlinekurs lernst du, mithilfe von spaCy fortgeschrittene Systeme für die Analyse natürlicher Sprache zu entwickeln und dabei sowohl regelbasierte Verfahren, als auch moderne Machine-Learning-Technologie einzusetzen.", - "url": "https://course.spacy.io/de", - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines" - }, - "youtube": "K1elwpgDdls", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-spacy-course-es", - "title": "NLP avanzado con spaCy · Un curso en línea gratis", - "description": "spaCy es un paquete moderno de Python para hacer Procesamiento de Lenguaje Natural de potencia industrial. En este curso en línea, interactivo y gratuito, aprenderás a usar spaCy para construir sistemas avanzados de comprensión de lenguaje natural usando enfoques basados en reglas y en machine learning.", - "url": "https://course.spacy.io/es", - "author": "Camila Gutiérrez", - "author_links": { - "twitter": "Mariacamilagl30" - }, - "youtube": "RNiLVCE5d4k", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-intro-to-nlp-episode-1", - "title": "Intro to NLP with spaCy (1)", - "slogan": "Episode 1: Data exploration", - "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", - "author": "Vincent Warmerdam", - "author_links": { - "twitter": "fishnets88", - "github": "koaning" - }, - "youtube": "WnGPv6HnBok", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-intro-to-nlp-episode-2", - "title": "Intro to NLP with spaCy (2)", - "slogan": "Episode 2: Rule-based Matching", - "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", - "author": "Vincent Warmerdam", - "author_links": { - "twitter": "fishnets88", - "github": "koaning" - }, - "youtube": "KL4-Mpgbahw", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-intro-to-nlp-episode-3", - "title": "Intro to NLP with spaCy (3)", - "slogan": "Episode 3: Evaluation", - "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", - "author": "Vincent Warmerdam", - "author_links": { - "twitter": "fishnets88", - "github": "koaning" - }, - "youtube": "4V0JDdohxAk", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-intro-to-nlp-episode-4", - "title": "Intro to NLP with spaCy (4)", - "slogan": "Episode 4: Named Entity Recognition", - "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", - "author": "Vincent Warmerdam", - "author_links": { - "twitter": "fishnets88", - "github": "koaning" - }, - "youtube": "IqOJU1-_Fi0", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-intro-to-nlp-episode-5", - "title": "Intro to NLP with spaCy (5)", - "slogan": "Episode 5: Rules vs. Machine Learning", - "description": "In this new video series, data science instructor Vincent Warmerdam gets started with spaCy, an open-source library for Natural Language Processing in Python. His mission: building a system to automatically detect programming languages in large volumes of text. 
Follow his process from the first idea to a prototype all the way to data collection and training a statistical named entity recognition model from scratch.", - "author": "Vincent Warmerdam", - "author_links": { - "twitter": "fishnets88", - "github": "koaning" - }, - "youtube": "f4sqeLRzkPg", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-spacy-irl-entity-linking", - "title": "Entity Linking functionality in spaCy", - "slogan": "spaCy IRL 2019", - "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", - "author": "Sofie Van Landeghem", - "author_links": { - "twitter": "OxyKodit", - "github": "svlandeg" - }, - "youtube": "PW3RJM8tDGo", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-spacy-irl-lemmatization", - "title": "Rethinking rule-based lemmatization", - "slogan": "spaCy IRL 2019", - "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", - "author": "Guadalupe Romero", - "author_links": { - "twitter": "_guadiromero", - "github": "guadi1994" - }, - "youtube": "88zcQODyuko", - "category": ["videos"] - }, - { - "type": "education", - "id": "video-spacy-irl-scispacy", - "title": "ScispaCy: A spaCy pipeline & models for scientific & biomedical text", - "slogan": "spaCy IRL 2019", - "url": "https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc", - "author": "Mark Neumann", - "author_links": { - "twitter": "MarkNeumannnn", - "github": "DeNeutoy" - }, - "youtube": "2_HSKDALwuw", - "category": ["videos"] - }, - { - "type": "education", - "id": "podcast-nlp-highlights", - "title": "NLP Highlights #78: Where do corpora come from?", - "slogan": "January 2019", - "description": "Most NLP projects rely crucially on the quality of annotations used for training and evaluating models. In this episode, Matt and Ines of Explosion AI tell us how Prodigy can improve data annotation and model development workflows. Prodigy is an annotation tool implemented as a Python library, and it comes with a web application and a command line interface. A developer can define input data streams and design simple annotation interfaces. Prodigy can help break down complex annotation decisions into a series of binary decisions, and it provides easy integration with spaCy models. Developers can specify how models should be modified as new annotations come in in an active learning framework.", - "soundcloud": "559200912", - "thumb": "https://i.imgur.com/hOBQEzc.jpg", - "url": "https://soundcloud.com/nlp-highlights/78-where-do-corpora-come-from-with-matt-honnibal-and-ines-montani", - "author": "Matt Gardner, Waleed Ammar (Allen AI)", - "author_links": { - "website": "https://soundcloud.com/nlp-highlights" - }, - "category": ["podcasts"] - }, - { - "type": "education", - "id": "podcast-init", - "title": "Podcast.__init__ #87: spaCy with Matthew Honnibal", - "slogan": "December 2017", - "description": "As the amount of text available on the internet and in businesses continues to increase, the need for fast and accurate language analysis becomes more prominent. 
This week Matthew Honnibal, the creator of spaCy, talks about his experiences researching natural language processing and creating a library to make his findings accessible to industry.", - "iframe": "https://www.pythonpodcast.com/wp-content/plugins/podlove-podcasting-plugin-for-wordpress/lib/modules/podlove_web_player/player_v4/dist/share.html?episode=https://www.pythonpodcast.com/?podlove_player4=176", - "iframe_height": 200, - "thumb": "https://i.imgur.com/rpo6BuY.png", - "url": "https://www.podcastinit.com/episode-87-spacy-with-matthew-honnibal/", - "author": "Tobias Macey", - "author_links": { - "website": "https://www.podcastinit.com" - }, - "category": ["podcasts"] - }, - { - "type": "education", - "id": "podcast-init2", - "title": "Podcast.__init__ #256: An Open Source Toolchain For NLP From Explosion AI", - "slogan": "March 2020", - "description": "The state of the art in natural language processing is a constantly moving target. With the rise of deep learning, previously cutting edge techniques have given way to robust language models. Through it all the team at Explosion AI have built a strong presence with the trifecta of spaCy, Thinc, and Prodigy to support fast and flexible data labeling to feed deep learning models and performant and scalable text processing. In this episode founder and open source author Matthew Honnibal shares his experience growing a business around cutting edge open source libraries for the machine learning developent process.", - "iframe": "https://cdn.podlove.org/web-player/share.html?episode=https%3A%2F%2Fwww.pythonpodcast.com%2F%3Fpodlove_player4%3D614", - "iframe_height": 200, - "thumb": "https://i.imgur.com/rpo6BuY.png", - "url": "https://www.pythonpodcast.com/explosion-ai-natural-language-processing-episode-256/", - "author": "Tobias Macey", - "author_links": { - "website": "https://www.podcastinit.com" - }, - "category": ["podcasts"] - }, - { - "type": "education", - "id": "talk-python-podcast", - "title": "Talk Python #202: Building a software business", - "slogan": "March 2019", - "description": "One core question around open source is how do you fund it? Well, there is always that PayPal donate button. But that's been a tremendous failure for many projects. Often the go-to answer is consulting. But what if you don't want to trade time for money? You could take things up a notch and change the equation, exchanging value for money. That's what Ines Montani and her co-founder did when they started Explosion AI with spaCy as the foundation.", - "thumb": "https://i.imgur.com/q1twuK8.png", - "url": "https://talkpython.fm/episodes/show/202/building-a-software-business", - "soundcloud": "588364857", - "author": "Michael Kennedy", - "author_links": { - "website": "https://talkpython.fm/" - }, - "category": ["podcasts"] - }, - { - "type": "education", - "id": "twimlai-podcast", - "title": "TWiML & AI: Practical NLP with spaCy and Prodigy", - "slogan": "May 2019", - "description": "\"Ines and I caught up to discuss her various projects, including the aforementioned spaCy, an open-source NLP library built with a focus on industry and production use cases. In our conversation, Ines gives us an overview of the spaCy Library, a look at some of the use cases that excite her, and the Spacy community and contributors. 
We also discuss her work with Prodigy, an annotation service tool that uses continuous active learning to train models, and finally, what other exciting projects she is working on.\"", - "thumb": "https://i.imgur.com/ng2F5gK.png", - "url": "https://twimlai.com/twiml-talk-262-practical-natural-language-processing-with-spacy-and-prodigy-w-ines-montani", - "iframe": "https://html5-player.libsyn.com/embed/episode/id/9691514/height/90/theme/custom/thumbnail/no/preload/no/direction/backward/render-playlist/no/custom-color/3e85b1/", - "iframe_height": 90, - "author": "Sam Charrington", - "author_links": { - "website": "https://twimlai.com" - }, - "category": ["podcasts"] - }, - { - "type": "education", - "id": "analytics-vidhya", - "title": "DataHack Radio #23: The Brains behind spaCy", - "slogan": "June 2019", - "description": "\"What would you do if you had the chance to pick the brains behind one of the most popular Natural Language Processing (NLP) libraries of our era? A library that has helped usher in the current boom in NLP applications and nurtured tons of NLP scientists? Well – you invite the creators on our popular DataHack Radio podcast and let them do the talking! We are delighted to welcome Ines Montani and Matt Honnibal, the developers of spaCy – a powerful and advanced library for NLP.\"", - "thumb": "https://i.imgur.com/3zJKZ1P.jpg", - "url": "https://www.analyticsvidhya.com/blog/2019/06/datahack-radio-ines-montani-matthew-honnibal-brains-behind-spacy/", - "soundcloud": "630741825", - "author": "Analytics Vidhya", - "author_links": { - "website": "https://www.analyticsvidhya.com", - "twitter": "analyticsvidhya" - }, - "category": ["podcasts"] - }, - { - "type": "education", - "id": "practical-ai-podcast", - "title": "Practical AI: Modern NLP with spaCy", - "slogan": "December 2019", - "description": "\"spaCy is awesome for NLP! It’s easy to use, has widespread adoption, is open source, and integrates the latest language models. Ines Montani and Matthew Honnibal (core developers of spaCy and co-founders of Explosion) join us to discuss the history of the project, its capabilities, and the latest trends in NLP. We also dig into the practicalities of taking NLP workflows to production. 
You don’t want to miss this episode!\"", - "thumb": "https://i.imgur.com/jn8Bcdw.png", - "url": "https://changelog.com/practicalai/68", - "author": "Daniel Whitenack & Chris Benson", - "author_links": { - "website": "https://changelog.com/practicalai", - "twitter": "https://twitter.com/PracticalAIFM" - }, - "category": ["podcasts"] - }, - { - "type": "education", - "id": "video-entity-linking", - "title": "Training a custom entity linking model with spaCy", - "author": "Sofie Van Landeghem", - "author_links": { - "twitter": "OxyKodit", - "github": "svlandeg" - }, - "youtube": "8u57WSXVpmw", - "category": ["videos"] - }, - { - "id": "adam_qas", - "title": "ADAM: Question Answering System", - "slogan": "A question answering system that extracts answers from Wikipedia to questions posed in natural language.", - "github": "5hirish/adam_qas", - "pip": "qas", - "code_example": [ - "git clone https://github.com/5hirish/adam_qas.git", - "cd adam_qas", - "pip install -r requirements.txt", - "python -m qas.adam 'When was linux kernel version 4.0 released ?'" - ], - "code_language": "bash", - "thumb": "https://shirishkadam.files.wordpress.com/2018/04/mini_alleviate.png", - "author": "Shirish Kadam", - "author_links": { - "twitter": "5hirish", - "github": "5hirish", - "website": "https://shirishkadam.com/" - }, - "category": ["standalone"], - "tags": ["question-answering", "elasticsearch"] - }, - { - "id": "epitator", - "title": "EpiTator", - "thumb": "https://i.imgur.com/NYFY1Km.jpg", - "slogan": "Extracts case counts, resolved location/species/disease names, date ranges and more", - "description": "EcoHealth Alliance uses EpiTator to catalog the what, where and when of infectious disease case counts reported in online news. Each of these aspects is extracted using independent annotators that can be applied to other domains. EpiTator organizes annotations by creating \"AnnoTiers\" for each type. AnnoTiers have methods for manipulating, combining and searching annotations. For instance, the `with_following_spans_from()` method can be used to create a new tier that combines a tier of one type (such as numbers), with another (say, kitchenware). The resulting tier will contain all the phrases in the document that match that pattern, like \"5 plates\" or \"2 cups.\"\n\nAnother commonly used method is `group_spans_by_containing_span()` which can be used to do things like find all the spaCy tokens in all the GeoNames a document mentions. spaCy tokens, named entities, sentences and noun chunks are exposed through the spaCy annotator, which will create an AnnoTier for each. These are the basis of many of the other annotators. EpiTator also includes an annotator for extracting tables embedded in free text articles. 
Another neat feature is that the lexicons used for entity resolution are all stored in an embedded sqlite database so there is no need to run any external services in order to use EpiTator.", - "url": "https://github.com/ecohealthalliance/EpiTator", - "github": "ecohealthalliance/EpiTator", - "pip": "EpiTator", - "code_example": [ - "from epitator.annotator import AnnoDoc", - "from epitator.geoname_annotator import GeonameAnnotator", - "", - "doc = AnnoDoc('Where is Chiang Mai?')", - "geoname_annotier = doc.require_tiers('geonames', via=GeonameAnnotator)", - "geoname = geoname_annotier.spans[0].metadata['geoname']", - "geoname['name']", - "# = 'Chiang Mai'", - "geoname['geonameid']", - "# = '1153671'", - "geoname['latitude']", - "# = 18.79038", - "geoname['longitude']", - "# = 98.98468", - "", - "from epitator.spacy_annotator import SpacyAnnotator", - "spacy_token_tier = doc.require_tiers('spacy.tokens', via=SpacyAnnotator)", - "list(geoname_annotier.group_spans_by_containing_span(spacy_token_tier))", - "# = [(AnnoSpan(9-19, Chiang Mai), [AnnoSpan(9-15, Chiang), AnnoSpan(16-19, Mai)])]" - ], - "author": "EcoHealth Alliance", - "author_links": { - "github": "ecohealthalliance", - "website": " https://ecohealthalliance.org/" - }, - "category": ["scientific", "standalone"] - }, - { - "id": "self-attentive-parser", - "title": "Berkeley Neural Parser", - "slogan": "Constituency Parsing with a Self-Attentive Encoder (ACL 2018)", - "description": "A Python implementation of the parsers described in *\"Constituency Parsing with a Self-Attentive Encoder\"* from ACL 2018.", - "url": "https://arxiv.org/abs/1805.01052", - "github": "nikitakit/self-attentive-parser", - "pip": "benepar", - "code_example": [ - "import benepar, spacy", - "nlp = spacy.load('en_core_web_md')", - "nlp.add_pipe('benepar', config={'model': 'benepar_en3'})", - "doc = nlp('The time for action is now. It is never too late to do something.')", - "sent = list(doc.sents)[0]", - "print(sent._.parse_string)", - "# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))", - "print(sent._.labels)", - "# ('S',)", - "print(list(sent._.children)[0])", - "# The time for action" - ], - "author": "Nikita Kitaev", - "author_links": { - "github": "nikitakit", - "website": " http://kitaev.io" - }, - "category": ["research", "pipeline"] - }, - { - "id": "excelcy", - "title": "ExcelCy", - "slogan": "Excel Integration with spaCy. Training NER using XLSX from PDF, DOCX, PPT, PNG or JPG.", - "description": "ExcelCy is a toolkit to integrate Excel to spaCy NLP training experiences. Training NER using XLSX from PDF, DOCX, PPT, PNG or JPG. 
ExcelCy has pipeline to match Entity with PhraseMatcher or Matcher in regular expression.", - "url": "https://github.com/kororo/excelcy", - "github": "kororo/excelcy", - "pip": "excelcy", - "code_example": [ - "from excelcy import ExcelCy", - "# collect sentences, annotate Entities and train NER using spaCy", - "excelcy = ExcelCy.execute(file_path='https://github.com/kororo/excelcy/raw/master/tests/data/test_data_01.xlsx')", - "# use the nlp object as per spaCy API", - "doc = excelcy.nlp('Google rebrands its business apps')", - "# or save it for faster bootstrap for application", - "excelcy.nlp.to_disk('/model')" - ], - "author": "Robertus Johansyah", - "author_links": { - "github": "kororo" - }, - "category": ["training"], - "tags": ["excel"] - }, - { - "id": "spacy-graphql", - "title": "spacy-graphql", - "slogan": "Query spaCy's linguistic annotations using GraphQL", - "github": "ines/spacy-graphql", - "description": "A very simple and experimental app that lets you query spaCy's linguistic annotations using [GraphQL](https://graphql.org/). The API currently supports most token attributes, named entities, sentences and text categories (if available as `doc.cats`, i.e. if you added a text classifier to a model). The `meta` field will return the model meta data. Models are only loaded once and kept in memory.", - "url": "https://explosion.ai/demos/spacy-graphql", - "category": ["apis"], - "tags": ["graphql"], - "thumb": "https://i.imgur.com/xC7zpTO.png", - "code_example": [ - "{", - " nlp(text: \"Zuckerberg is the CEO of Facebook.\", model: \"en_core_web_sm\") {", - " meta {", - " lang", - " description", - " }", - " doc {", - " text", - " tokens {", - " text", - " pos_", - " }", - " ents {", - " text", - " label_", - " }", - " }", - " }", - "}" - ], - "code_language": "json", - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines", - "website": "https://ines.io" - } - }, - { - "id": "spacy-js", - "title": "spacy-js", - "slogan": "JavaScript API for spaCy with Python REST API", - "github": "ines/spacy-js", - "description": "JavaScript interface for accessing linguistic annotations provided by spaCy. This project is mostly experimental and was developed for fun to play around with different ways of mimicking spaCy's Python API.\n\nThe results will still be computed in Python and made available via a REST API. 
The JavaScript API resembles spaCy's Python API as closely as possible (with a few exceptions, as the values are all pre-computed and it's tricky to express complex recursive relationships).", - "code_language": "javascript", - "code_example": [ - "const spacy = require('spacy');", - "", - "(async function() {", - " const nlp = spacy.load('en_core_web_sm');", - " const doc = await nlp('This is a text about Facebook.');", - " for (let ent of doc.ents) {", - " console.log(ent.text, ent.label);", - " }", - " for (let token of doc) {", - " console.log(token.text, token.pos, token.head.text);", - " }", - "})();" - ], - "author": "Ines Montani", - "author_links": { - "twitter": "_inesmontani", - "github": "ines", - "website": "https://ines.io" - }, - "category": ["nonpython"], - "tags": ["javascript"] - }, - { - "id": "spacy-raspberry", - "title": "spacy-raspberry", - "slogan": "64bit Raspberry Pi image for spaCy and neuralcoref", - "github": "boehm-e/spacy-raspberry", - "thumb": "https://i.imgur.com/VCJMrE6.png", - "image": "https://raw.githubusercontent.com/boehm-e/spacy-raspberry/master/imgs/preview.png", - "author": "Erwan Boehm", - "author_links": { - "github": "boehm-e" - }, - "category": ["apis"], - "tags": ["raspberrypi"] - }, - { - "id": "spacy-wordnet", - "title": "spacy-wordnet", - "slogan": "WordNet meets spaCy", - "description": "`spacy-wordnet` creates annotations that easily allow the use of WordNet and [WordNet Domains](http://wndomains.fbk.eu/) by using the [NLTK WordNet interface](http://www.nltk.org/howto/wordnet.html).", - "github": "recognai/spacy-wordnet", - "tags": ["wordnet", "synsets"], - "thumb": "https://i.imgur.com/ud4C7cj.png", - "code_example": [ - "import spacy", - "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ", - "", - "# Load a spaCy model (supported models are \"es\" and \"en\")", - "nlp = spacy.load('en')", - "# spaCy 3.x", - "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})", - "# spaCy 2.x", - "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')", - "token = nlp('prices')[0]", - "", - "# The wordnet extension links spaCy tokens to the NLTK WordNet interface, giving access to", - "# synsets and lemmas", - "token._.wordnet.synsets()", - "token._.wordnet.lemmas()", - "", - "# It also automatically tags tokens with WordNet domains", - "token._.wordnet.wordnet_domains()" - ], - "author": "recognai", - "author_links": { - "github": "recognai", - "twitter": "recogn_ai", - "website": "https://recogn.ai" - }, - "category": ["pipeline"] - }, - { - "id": "spacy-conll", - "title": "spacy_conll", - "slogan": "Parsing to CoNLL with spaCy, spacy-stanza, and spacy-udpipe", - "description": "This module allows you to parse text into CoNLL-U format. You can use it as a command line tool, or embed it in your own scripts by adding it as a custom pipeline component to a spaCy, spacy-stanfordnlp, spacy-stanza, or spacy-udpipe pipeline. It also provides an easy-to-use function to quickly initialize a parser. CoNLL-related properties are added to Doc elements, sentence Spans, and Tokens.", - "code_example": [ - "from spacy_conll import init_parser", - "", - "", - "# Initialise English parser, already including the ConllFormatter as a pipeline component.", - "# Indicate that we want to get the CoNLL headers in the string output.", - "# `use_gpu` and `verbose` are specific to stanza (and stanfordnlp). 
These keyword arguments", - "# are passed on to their Pipeline() initialisation", - "nlp = init_parser(\"stanza\",", - " \"en\",", - " parser_opts={\"use_gpu\": True, \"verbose\": False},", - " include_headers=True)", - "# Parse a given string", - "doc = nlp(\"A cookie is a baked or cooked food that is typically small, flat and sweet. It usually contains flour, sugar and some type of oil or fat.\")", - "", - "# Get the CoNLL representation of the whole document, including headers", - "conll = doc._.conll_str", - "print(conll)" - ], - "code_language": "python", - "author": "Bram Vanroy", - "author_links": { - "github": "BramVanroy", - "twitter": "BramVanroy", - "website": "http://bramvanroy.be" - }, - "github": "BramVanroy/spacy_conll", - "category": ["standalone", "pipeline"], - "tags": ["linguistics", "computational linguistics", "conll"] - }, - { - "id": "spacy-langdetect", - "title": "spacy-langdetect", - "slogan": "A fully customizable language detection pipeline for spaCy", - "description": "This module allows you to add language detection capabilities to your spaCy pipeline. Also supports custom language detectors!", - "pip": "spacy-langdetect", - "code_example": [ - "import spacy", - "from spacy_langdetect import LanguageDetector", - "nlp = spacy.load('en')", - "nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)", - "text = 'This is an English text.'", - "doc = nlp(text)", - "# document level language detection. Think of it like average language of the document!", - "print(doc._.language)", - "# sentence level language detection", - "for sent in doc.sents:", - " print(sent, sent._.language)" - ], - "code_language": "python", - "author": "Abhijit Balaji", - "author_links": { - "github": "Abhijit-2592", - "website": "https://abhijit-2592.github.io/" - }, - "github": "Abhijit-2592/spacy-langdetect", - "category": ["pipeline"], - "tags": ["language-detection"] - }, - { - "id": "ludwig", - "title": "Ludwig", - "slogan": "A code-free deep learning toolbox", - "description": "Ludwig makes it easy to build deep learning models for many applications, including NLP ones. It uses spaCy for tokenizing text in different languages.", - "pip": "ludwig", - "github": "uber/ludwig", - "thumb": "https://i.imgur.com/j1sORgD.png", - "url": "http://ludwig.ai", - "author": "Piero Molino @ Uber AI", - "author_links": { - "github": "w4nderlust", - "twitter": "w4nderlus7", - "website": "http://w4nderlu.st" - }, - "category": ["standalone", "research"] - }, - { - "id": "pic2phrase_bot", - "title": "pic2phrase_bot: Photo Description Generator", - "slogan": "A bot that generates descriptions for submitted photos, in a human-like manner.", - "description": "pic2phrase_bot runs inside Telegram messenger and can be used to generate a phrase describing a submitted photo, employing computer vision, web scraping, and syntactic dependency analysis powered by spaCy.", - "thumb": "https://i.imgur.com/ggVI02O.jpg", - "image": "https://i.imgur.com/z1yhWQR.jpg", - "url": "https://telegram.me/pic2phrase_bot", - "author": "Yuli Vasiliev", - "author_links": { - "twitter": "VasilievYuli" - }, - "category": ["standalone", "conversational"] - }, - { - "id": "gracyql", - "title": "gracyql", - "slogan": "A thin GraphQL wrapper around spaCy", - "github": "oterrier/gracyql", - "description": "An example of a basic [Starlette](https://github.com/encode/starlette) app using [Spacy](https://github.com/explosion/spaCy) and [Graphene](https://github.com/graphql-python/graphene). 
The main goal is to be able to use the amazing power of spaCy from other languages and retrieving only the information you need thanks to the GraphQL query definition. The GraphQL schema tries to mimic as much as possible the original Spacy API with classes Doc, Span and Token.", - "thumb": "https://i.imgur.com/xC7zpTO.png", - "category": ["apis"], - "tags": ["graphql"], - "code_example": [ - "query ParserDisabledQuery {", - " nlp(model: \"en\", disable: [\"parser\", \"ner\"]) {", - " doc(text: \"I live in Grenoble, France\") {", - " text", - " tokens {", - " id", - " pos", - " lemma", - " dep", - " }", - " ents {", - " start", - " end", - " label", - " }", - " }", - " }", - "}" - ], - "code_language": "json", - "author": "Olivier Terrier", - "author_links": { - "github": "oterrier" - } - }, - { - "id": "pyInflect", - "slogan": "A Python module for word inflections", - "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add word inflections to the system.", - "github": "bjascob/pyInflect", - "pip": "pyinflect", - "code_example": [ - "import spacy", - "import pyinflect", - "", - "nlp = spacy.load('en_core_web_sm')", - "doc = nlp('This is an example.')", - "doc[3].tag_ # NN", - "doc[3]._.inflect('NNS') # examples" - ], - "author": "Brad Jascob", - "author_links": { - "github": "bjascob" - }, - "category": ["pipeline"], - "tags": ["inflection"] - }, - { - "id": "lemminflect", - "slogan": "A Python module for English lemmatization and inflection", - "description": "LemmInflect uses a dictionary approach to lemmatize English words and inflect them into forms specified by a user supplied [Universal Dependencies](https://universaldependencies.org/u/pos/) or [Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) tag. The library works with out-of-vocabulary (OOV) words by applying neural network techniques to classify word forms and choose the appropriate morphing rules. The system acts as a standalone module or as an extension to spaCy.", - "github": "bjascob/LemmInflect", - "pip": "lemminflect", - "thumb": "https://raw.githubusercontent.com/bjascob/LemmInflect/master/docs/img/icons8-citrus-80.png", - "code_example": [ - "import spacy", - "import lemminflect", - "", - "nlp = spacy.load('en_core_web_sm')", - "doc = nlp('I am testing this example.')", - "doc[2]._.lemma() # 'test'", - "doc[4]._.inflect('NNS') # 'examples'" - ], - "author": "Brad Jascob", - "author_links": { - "github": "bjascob" - }, - "category": ["pipeline"], - "tags": ["inflection", "lemmatizer"] - }, - { - "id": "amrlib", - "slogan": "A python library that makes AMR parsing, generation and visualization simple.", - "description": "amrlib is a python module and spaCy add-in for Abstract Meaning Representation (AMR). The system can parse sentences to AMR graphs or generate text from existing graphs. It includes a GUI for visualization and experimentation.", - "github": "bjascob/amrlib", - "pip": "amrlib", - "code_example": [ - "import spacy", - "import amrlib", - "amrlib.setup_spacy_extension()", - "nlp = spacy.load('en_core_web_sm')", - "doc = nlp('This is a test of the spaCy extension. 
The test has multiple sentences.')", - "graphs = doc._.to_amr()", - "for graph in graphs:", - " print(graph)" - ], - "author": "Brad Jascob", - "author_links": { - "github": "bjascob" - }, - "category": ["pipeline"] - }, - { - "id": "blackstone", - "title": "Blackstone", - "slogan": "A spaCy pipeline and model for NLP on unstructured legal text", - "description": "Blackstone is a spaCy model and library for processing long-form, unstructured legal text. Blackstone is an experimental research project from the [Incorporated Council of Law Reporting for England and Wales'](https://iclr.co.uk/) research lab, [ICLR&D](https://research.iclr.co.uk/).", - "github": "ICLRandD/Blackstone", - "pip": "blackstone", - "thumb": "https://iclr.s3-eu-west-1.amazonaws.com/assets/iclrand/Blackstone/thumb.png", - "url": "https://research.iclr.co.uk", - "author": " ICLR&D", - "author_links": { - "github": "ICLRandD", - "twitter": "ICLRanD", - "website": "https://research.iclr.co.uk" - }, - "category": ["scientific", "models", "research"] - }, - { - "id": "NGym", - "title": "NeuralGym", - "slogan": "A little Windows GUI for training models with spaCy", - "description": "NeuralGym is a Python application for Windows with a graphical user interface to train models with spaCy. Run the application, select an output folder, a training data file in spaCy's data format, a spaCy model or blank model and press 'Start'.", - "github": "d5555/NeuralGym", - "url": "https://github.com/d5555/NeuralGym", - "image": "https://github.com/d5555/NeuralGym/raw/master/NGym.png", - "thumb": "https://github.com/d5555/NeuralGym/raw/master/NGym/web.png", - "author": "d5555", - "category": ["training"], - "tags": ["windows"] - }, - { - "id": "holmes", - "title": "Holmes", - "slogan": "Information extraction from English and German texts based on predicate logic", - "github": "msg-systems/holmes-extractor", - "url": "https://github.com/msg-systems/holmes-extractor", - "description": "Holmes is a Python 3 library that supports a number of use cases involving information extraction from English and German texts, including chatbot, structural extraction, topic matching and supervised document classification. There is a [website demonstrating intelligent search based on topic matching](https://holmes-demo.xt.msg.team).", - "pip": "holmes-extractor", - "category": ["conversational", "standalone"], - "tags": ["chatbots", "text-processing"], - "thumb": "https://raw.githubusercontent.com/msg-systems/holmes-extractor/master/docs/holmes_thumbnail.png", - "code_example": [ - "import holmes_extractor as holmes", - "holmes_manager = holmes.Manager(model='en_core_web_lg')", - "holmes_manager.register_search_phrase('A big dog chases a cat')", - "holmes_manager.start_chatbot_mode_console()" - ], - "author": "Richard Paul Hudson", - "author_links": { - "github": "richardpaulhudson" - } - }, - { - "id": "coreferee", - "title": "Coreferee", - "slogan": "Coreference resolution for multiple languages", - "github": "msg-systems/coreferee", - "url": "https://github.com/msg-systems/coreferee", - "description": "Coreferee is a pipeline plugin that performs coreference resolution for English, German and Polish. It is designed so that it is easy to add support for new languages and optimised for limited training data. It uses a mixture of neural networks and programmed rules. 
Please note you will need to [install models](https://github.com/msg-systems/coreferee#getting-started) before running the code example.", - "pip": "coreferee", - "category": ["pipeline", "models", "standalone"], - "tags": ["coreference-resolution", "anaphora"], - "code_example": [ - "import coreferee, spacy", - "nlp = spacy.load('en_core_web_trf')", - "nlp.add_pipe('coreferee')", - "doc = nlp('Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.')", - "doc._.coref_chains.print()", - "# Output:", - "#", - "# 0: he(1), his(6), Peter(9), He(16), his(18)", - "# 1: work(7), it(14)", - "# 2: [He(16); wife(19)], they(21), They(26), they(31)", - "# 3: Spain(29), country(34)", - "#", - "print(doc._.coref_chains.resolve(doc[31]))", - "# Output:", - "#", - "# [Peter, wife]" - ], - "author": "Richard Paul Hudson", - "author_links": { - "github": "richardpaulhudson" - } - }, - { - "id": "spacy-transformers", - "title": "spacy-transformers", - "slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2", - "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `transformers`](https://github.com/huggingface/transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.", - "github": "explosion/spacy-transformers", - "url": "https://explosion.ai/blog/spacy-transformers", - "pip": "spacy-transformers", - "category": ["pipeline", "models", "research"], - "code_example": [ - "import spacy", - "", - "nlp = spacy.load(\"en_core_web_trf\")", - "doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")" - ], - "author": "Explosion", - "author_links": { - "twitter": "explosion_ai", - "github": "explosion", - "website": "https://explosion.ai" - } - }, - { - "id": "spacy-huggingface-hub", - "title": "spacy-huggingface-hub", - "slogan": "Push your spaCy pipelines to the Hugging Face Hub", - "description": "This package provides a CLI command for uploading any trained spaCy pipeline packaged with [`spacy package`](https://spacy.io/api/cli#package) to the [Hugging Face Hub](https://huggingface.co). 
It auto-generates all meta information for you, uploads a pretty README (requires spaCy v3.1+) and handles version control under the hood.", - "github": "explosion/spacy-huggingface-hub", - "thumb": "https://i.imgur.com/j6FO9O6.jpg", - "url": "https://github.com/explosion/spacy-huggingface-hub", - "pip": "spacy-huggingface-hub", - "category": ["pipeline", "models"], - "author": "Explosion", - "author_links": { - "twitter": "explosion_ai", - "github": "explosion", - "website": "https://explosion.ai" - } - }, - { - "id": "spacy-clausie", - "title": "spacy-clausie", - "slogan": "Implementation of the ClausIE information extraction system for Python+spaCy", - "github": "mmxgn/spacy-clausie", - "url": "https://github.com/mmxgn/spacy-clausie", - "description": "ClausIE, a novel, clause-based approach to open information extraction, which extracts relations and their arguments from natural language text", - "category": ["pipeline", "scientific", "research"], - "code_example": [ - "import spacy", - "import claucy", - "", - "nlp = spacy.load(\"en\")", - "claucy.add_to_pipe(nlp)", - "", - "doc = nlp(\"AE died in Princeton in 1955.\")", - "", - "print(doc._.clauses)", - "# Output:", - "# ", - "", - "propositions = doc._.clauses[0].to_propositions(as_text=True)", - "", - "print(propositions)", - "# Output:", - "# [AE died in Princeton in 1955, AE died in 1955, AE died in Princeton" - ], - "author": "Emmanouil Theofanis Chourdakis", - "author_links": { - "github": "mmxgn" - } - }, - { - "id": "ipymarkup", - "slogan": "NER, syntax markup visualizations", - "description": "Collection of NLP visualizations for NER and syntax tree markup. Similar to [displaCy](https://explosion.ai/demos/displacy) and [displaCy ENT](https://explosion.ai/demos/displacy-ent).", - "github": "natasha/ipymarkup", - "image": "https://github.com/natasha/ipymarkup/blob/master/table.png?raw=true", - "pip":"pip install ipymarkup", - "code_example": [ - "from ipymarkup import show_span_ascii_markup, show_dep_ascii_markup", - "", - "text = 'В мероприятии примут участие не только российские учёные, но и зарубежные исследователи, в том числе, Крис Хелмбрехт - управляющий директор и совладелец креативного агентства Kollektiv (Германия, США), Ннека Угбома - руководитель проекта Mushroom works (Великобритания), Гергей Ковач - политик и лидер субкультурной партии «Dog with two tails» (Венгрия), Георг Жено - немецкий режиссёр, один из создателей экспериментального театра «Театр.doc», Театра им. Йозефа Бойса (Германия).'", - "spans = [(102, 116, 'PER'), (186, 194, 'LOC'), (196, 199, 'LOC'), (202, 214, 'PER'), (254, 268, 'LOC'), (271, 283, 'PER'), (324, 342, 'ORG'), (345, 352, 'LOC'), (355, 365, 'PER'), (445, 455, 'ORG'), (456, 468, 'PER'), (470, 478, 'LOC')]", - "show_span_ascii_markup(text, spans)" - ], - "author": "Alexander Kukushkin", - "author_links": { - "github": "kuk" - }, - "category": ["visualizers"] - }, - { - "id": "negspacy", - "title": "negspaCy", - "slogan": "spaCy pipeline object for negating concepts in text based on the NegEx algorithm.", - "github": "jenojp/negspacy", - "url": "https://github.com/jenojp/negspacy", - "description": "negspacy is a spaCy pipeline component that evaluates whether Named Entities are negated in text. 
It adds an extension to 'Span' objects.", - "pip": "negspacy", - "category": ["pipeline", "scientific"], - "tags": ["negation", "text-processing"], - "thumb": "https://github.com/jenojp/negspacy/blob/master/docs/thumb.png?raw=true", - "image": "https://github.com/jenojp/negspacy/blob/master/docs/icon.png?raw=true", - "code_example": [ - "import spacy", - "from negspacy.negation import Negex", - "", - "nlp = spacy.load(\"en_core_web_sm\")", - "nlp.add_pipe(\"negex\", config={\"ent_types\":[\"PERSON\",\"ORG\"]})", - "", - "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")", - "for e in doc.ents:", - " print(e.text, e._.negex)" - ], - "author": "Jeno Pizarro", - "author_links": { - "github": "jenojp", - "twitter": "jenojp" - } - }, - { - "id": "ronec", - "title": "RONEC - Romanian Named Entity Corpus", - "slogan": "Named Entity Recognition corpus for the Romanian language.", - "github": "dumitrescustefan/ronec", - "url": "https://github.com/dumitrescustefan/ronec", - "description": "The corpus holds 5127 sentences, annotated with 16 classes, with a total of 26376 annotated entities. The corpus comes in two formats: BRAT and CONLLUP.", - "category": ["standalone", "models"], - "tags": ["ner", "romanian"], - "thumb": "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/res/thumb.png", - "code_example": [ - "# to train a new model on ronec", - "python3 convert_spacy.py ronec/conllup/ronec.conllup output", - "python3 -m spacy train ro models output/train_ronec.json output/train_ronec.json -p ent", - "", - "# download the Romanian NER model", - "python -m spacy download ro_ner", - "", - "# load the model and print entities for a simple sentence", - "import spacy", - "", - "nlp = spacy.load(\"ro_ner\")", - "doc = nlp(\"Popescu Ion a fost la Cluj\")", - "", - "for ent in doc.ents:", - "\tprint(ent.text, ent.start_char, ent.end_char, ent.label_)" - ], - "author": "Stefan Daniel Dumitrescu, Andrei-Marius Avram" - }, - { - "id": "num_fh", - "title": "Numeric Fused-Head", - "slogan": "Numeric Fused-Head Identification and Resolution in English", - "description": "This package provides a wrapper for the Numeric Fused-Head in English. 
It provides another information layer on numbers that refer to another entity which is not obvious from the syntactic tree.", - "github": "yanaiela/num_fh", - "pip": "num_fh", - "category": ["pipeline", "research"], - "code_example": [ - "import spacy", - "from num_fh import NFH", - "nlp = spacy.load('en_core_web_sm')", - "nfh = NFH(nlp)", - "nlp.add_pipe(nfh, first=False)", - "doc = nlp(\"I told you two, that only one of them is the one who will get 2 or 3 icecreams\")", - "", - "assert doc[16]._.is_nfh == True", - "assert doc[18]._.is_nfh == False", - "assert doc[3]._.is_deter_nfh == True", - "assert doc[16]._.is_deter_nfh == False", - "assert len(doc._.nfh) == 4" - ], - "author": "Yanai Elazar", - "author_links": { - "github": "yanaiela", - "twitter": "yanaiela", - "website": "https://yanaiela.github.io" - } - }, - { - "id": "Healthsea", - "title": "Healthsea", - "slogan": "Healthsea: an end-to-end spaCy pipeline for exploring health supplement effects", - "description": "This spaCy project trains an NER model and a custom Text Classification model with Clause Segmentation and Blinding capabilities to analyze supplement reviews and their potential effects on health.", - "github": "explosion/healthsea", - "thumb": "https://github.com/explosion/healthsea/blob/main/img/Jellyfish.png", - "category": ["pipeline", "research"], - "code_example": [ - "import spacy", - "", - "nlp = spacy.load(\"en_healthsea\")", - "doc = nlp(\"This is great for joint pain.\")", - "", - "# Clause Segmentation & Blinding", - "print(doc._.clauses)", - "", - "> {", - "> \"split_indices\": [0, 7],", - "> \"has_ent\": true,", - "> \"ent_indices\": [4, 6],", - "> \"blinder\": \"_CONDITION_\",", - "> \"ent_name\": \"joint pain\",", - "> \"cats\": {", - "> \"POSITIVE\": 0.9824668169021606,", - "> \"NEUTRAL\": 0.017364952713251114,", - "> \"NEGATIVE\": 0.00002889777533710003,", - "> \"ANAMNESIS\": 0.0001394189748680219", - "> \"prediction_text\": [\"This\", \"is\", \"great\", \"for\", \"_CONDITION_\", \"!\"]", - "> }", - "", - "# Aggregated results", - "> {", - "> \"joint_pain\": {", - "> \"effects\": [\"POSITIVE\"],", - "> \"effect\": \"POSITIVE\",", - "> \"label\": \"CONDITION\",", - "> \"text\": \"joint pain\"", - "> }", - "> }" - ], - "author": "Edward Schmuhl", - "author_links": { - "github": "thomashacker", - "twitter": "aestheticedwar1", - "website": "https://explosion.ai/" - } - }, - { - "id": "presidio", - "title": "Presidio", - "slogan": "Context aware, pluggable and customizable data protection and PII data anonymization", - "description": "Presidio *(Origin from Latin praesidium ‘protection, garrison’)* helps to ensure sensitive text is properly managed and governed. It provides fast ***analytics*** and ***anonymization*** for sensitive text such as credit card numbers, names, locations, social security numbers, bitcoin wallets, US phone numbers and financial data. 
Presidio analyzes the text using predefined or custom recognizers to identify entities, patterns, formats, and checksums with relevant context.", - "url": "https://aka.ms/presidio", - "image": "https://raw.githubusercontent.com/microsoft/presidio/master/docs/assets/before-after.png", - "github": "microsoft/presidio", - "category": ["standalone"], - "thumb": "https://avatars0.githubusercontent.com/u/6154722", - "author": "Microsoft", - "author_links": { - "github": "microsoft" - } - }, - { - "id": "presidio-research", - "title": "Presidio Research", - "slogan": "Toolbox for developing and evaluating PII detectors, NER models for PII and generating fake PII data", - "description": "This package features data-science related tasks for developing new recognizers for Microsoft Presidio. It is used for the evaluation of the entire system, as well as for evaluating specific PII recognizers or PII detection models. Anyone interested in evaluating an existing Microsoft Presidio instance, a specific PII recognizer or to develop new models or logic for detecting PII could leverage the preexisting work in this package. Additionally, anyone interested in generating new data based on previous datasets (e.g. to increase the coverage of entity values) for Named Entity Recognition models could leverage the data generator contained in this package.", - "url": "https://aka.ms/presidio-research", - "github": "microsoft/presidio-research", - "category": ["standalone"], - "thumb": "https://avatars0.githubusercontent.com/u/6154722", - "author": "Microsoft", - "author_links": { - "github": "microsoft" - } - }, - { - "id": "python-sentence-boundary-disambiguation", - "title": "pySBD - python Sentence Boundary Disambiguation", - "slogan": "Rule-based sentence boundary detection that works out-of-the-box", - "github": "nipunsadvilkar/pySBD", - "description": "pySBD is 'real-world' sentence segmenter which extracts reasonable sentences when the format and domain of the input text are unknown. It is a rules-based algorithm based on [The Golden Rules](https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt) - a set of tests to check accuracy of segmenter in regards to edge case scenarios developed by [TM-Town](https://www.tm-town.com/) dev team. pySBD is python port of ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter).", - "pip": "pysbd", - "category": ["scientific"], - "tags": ["sentence segmentation"], - "code_example": [ - "from pysbd.utils import PySBDFactory", - "", - "nlp = spacy.blank('en')", - "nlp.add_pipe(PySBDFactory(nlp))", - "", - "doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')", - "print(list(doc.sents))", - "# [My name is Jonas E. Smith., Please turn to p. 55.]" - ], - "author": "Nipun Sadvilkar", - "author_links": { - "twitter": "nipunsadvilkar", - "github": "nipunsadvilkar", - "website": "https://nipunsadvilkar.github.io" - } - }, - { - "id": "cookiecutter-spacy-fastapi", - "title": "cookiecutter-spacy-fastapi", - "slogan": "Docker-based cookiecutter for easy spaCy APIs using FastAPI", - "description": "Docker-based cookiecutter for easy spaCy APIs using FastAPI. The default endpoints expect batch requests with a list of Records in the Azure Search Cognitive Skill format. So out of the box, this cookiecutter can be setup as a Custom Cognitive Skill. 
For more on Azure Search and Cognitive Skills [see this page](https://docs.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-interface).", - "url": "https://github.com/microsoft/cookiecutter-spacy-fastapi", - "image": "https://raw.githubusercontent.com/microsoft/cookiecutter-spacy-fastapi/master/images/cookiecutter-docs.png", - "github": "microsoft/cookiecutter-spacy-fastapi", - "category": ["apis"], - "thumb": "https://avatars0.githubusercontent.com/u/6154722", - "author": "Microsoft", - "author_links": { - "github": "microsoft" - } - }, - { - "id": "dframcy", - "title": "Dframcy", - "slogan": "Dataframe Integration with spaCy NLP", - "github": "yash1994/dframcy", - "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.", - "pip": "dframcy", - "category": ["pipeline", "training"], - "tags": ["pandas"], - "code_example": [ - "import spacy", - "from dframcy import DframCy", - "", - "nlp = spacy.load('en_core_web_sm')", - "dframcy = DframCy(nlp)", - "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')", - "annotation_dataframe = dframcy.to_dataframe(doc)" - ], - "author": "Yash Patadia", - "author_links": { - "twitter": "PatadiaYash", - "github": "yash1994" - } - }, - { - "id": "spacy-pytextrank", - "title": "PyTextRank", - "slogan": "Py impl of TextRank for lightweight phrase extraction", - "description": "An implementation of TextRank in Python for use in spaCy pipelines which provides fast, effective phrase extraction from texts, along with extractive summarization. The graph algorithm works independent of a specific natural language and does not require domain knowledge. See (Mihalcea 2004) https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf", - "github": "DerwenAI/pytextrank", - "pip": "pytextrank", - "code_example": [ - "import spacy", - "import pytextrank", - "", - "nlp = spacy.load('en_core_web_sm')", - "", - "tr = pytextrank.TextRank()", - "nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)", - "", - "text = 'Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered.'", - "doc = nlp(text)", - "", - "# examine the top-ranked phrases in the document", - "for p in doc._.phrases:", - " print('{:.4f} {:5d} {}'.format(p.rank, p.count, p.text))", - " print(p.chunks)" - ], - "code_language": "python", - "url": "https://github.com/DerwenAI/pytextrank/wiki", - "thumb": "https://memegenerator.net/img/instances/66942896.jpg", - "image": "https://memegenerator.net/img/instances/66942896.jpg", - "author": "Paco Nathan", - "author_links": { - "twitter": "pacoid", - "github": "ceteri", - "website": "https://derwen.ai/paco" - }, - "category": ["pipeline"], - "tags": ["phrase extraction", "ner", "summarization", "graph algorithms", "textrank"] - }, - { - "id": "spacy_syllables", - "title": "Spacy Syllables", - "slogan": "Multilingual syllable annotations", - "description": "Spacy Syllables is a pipeline component that adds multilingual syllable annotations to Tokens. 
It uses Pyphen under the hood and has support for a long list of languages.", - "github": "sloev/spacy-syllables", - "pip": "spacy_syllables", - "code_example": [ - "import spacy", - "from spacy_syllables import SpacySyllables", - "", - "nlp = spacy.load('en_core_web_sm')", - "syllables = SpacySyllables(nlp)", - "nlp.add_pipe(syllables, after='tagger')", - "", - "doc = nlp('terribly long')", - "", - "data = [", - " (token.text, token._.syllables, token._.syllables_count)", - " for token in doc", - "]", - "", - "assert data == [", - " ('terribly', ['ter', 'ri', 'bly'], 3),", - " ('long', ['long'], 1)", - "]" - ], - "thumb": "https://raw.githubusercontent.com/sloev/spacy-syllables/master/logo.png", - "author": "Johannes Valbjørn", - "author_links": { - "github": "sloev" - }, - "category": ["pipeline"], - "tags": ["syllables", "multilingual"] - }, - { - "id": "gobbli", - "title": "gobbli", - "slogan": "Deep learning for text classification doesn't have to be scary", - "description": "gobbli is a Python library which wraps several modern deep learning models in a uniform interface that makes it easy to evaluate feasibility and conduct analyses. It leverages the abstractive powers of Docker to hide nearly all dependency management and functional differences between models from the user. It also contains an interactive app for exploring text data and evaluating classification models. spaCy's base text classification models, as well as models integrated from `spacy-transformers`, are available in the collection of classification models. In addition, spaCy is used for data augmentation and document embeddings.", - "url": "https://github.com/rtiinternational/gobbli", - "github": "rtiinternational/gobbli", - "pip": "gobbli", - "thumb": "https://i.postimg.cc/NGpzhrdr/gobbli-lg.png", - "code_example": [ - "from gobbli.io import PredictInput, TrainInput", - "from gobbli.model.bert import BERT", - "", - "train_input = TrainInput(", - " X_train=['This is a training document.', 'This is another training document.'],", - " y_train=['0', '1'],", - " X_valid=['This is a validation sentence.', 'This is another validation sentence.'],", - " y_valid=['1', '0'],", - ")", - "", - "clf = BERT()", - "", - "# Set up classifier resources -- Docker image, etc.", - "clf.build()", - "", - "# Train model", - "train_output = clf.train(train_input)", - "", - "predict_input = PredictInput(", - " X=['Which class is this document?'],", - " labels=train_output.labels,", - " checkpoint=train_output.checkpoint,", - ")", - "", - "predict_output = clf.predict(predict_input)" - ], - "category": ["standalone"] - }, - { - "id": "spacy_fastlang", - "title": "Spacy FastLang", - "slogan": "Language detection done fast", - "description": "Fast language detection using FastText and Spacy.", - "github": "thomasthiebaud/spacy-fastlang", - "pip": "spacy_fastlang", - "code_example": [ - "import spacy_fastlang", - "", - "nlp = spacy.load(\"en_core_web_sm\")", - "nlp.add_pipe(\"language_detector\")", - "doc = nlp('Life is like a box of chocolates. 
You never know what you are gonna get.')", - "", - "assert doc._.language == 'en'", - "assert doc._.language_score >= 0.8" - ], - "author": "Thomas Thiebaud", - "author_links": { - "github": "thomasthiebaud" - }, - "category": ["pipeline"] - }, - { - "id": "mlflow", - "title": "MLflow", - "slogan": "An open source platform for the machine learning lifecycle", - "description": "MLflow is an open source platform to manage the ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry. MLflow currently offers four components: Tracking, Projects, Models and Registry.", - "github": "mlflow/mlflow", - "pip": "mlflow", - "thumb": "https://www.mlflow.org/docs/latest/_static/MLflow-logo-final-black.png", - "image": "", - "url": "https://mlflow.org/", - "author": "Databricks", - "author_links": { - "github": "databricks", - "twitter": "databricks", - "website": "https://databricks.com/" - }, - "category": ["standalone", "apis"], - "code_example": [ - "import mlflow", - "import mlflow.spacy", - "import spacy", - "", - "# MLflow Tracking", - "nlp = spacy.load('my_best_model_path/output/model-best')", - "with mlflow.start_run(run_name='Spacy'):", - " mlflow.set_tag('model_flavor', 'spacy')", - " mlflow.spacy.log_model(spacy_model=nlp, artifact_path='model')", - " mlflow.log_metric('accuracy', 0.72)", - " my_run_id = mlflow.active_run().info.run_id", - "", - "", - "# MLflow Models", - "model_uri = f'runs:/{my_run_id}/model'", - "nlp2 = mlflow.spacy.load_model(model_uri=model_uri)" - ] - }, - { - "id": "pyate", - "title": "PyATE", - "slogan": "Python Automated Term Extraction", - "description": "PyATE is a term extraction library written in Python using spaCy POS tagging with Basic, Combo Basic, C-Value, TermExtractor, and Weirdness.", - "github": "kevinlu1248/pyate", - "pip": "pyate", - "code_example": [ - "import spacy", - "import pyate", - "", - "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe(\"combo_basic\") # or any of `basic`, `weirdness`, `term_extractor` or `cvalue`", - "# source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1994795/", - "string = 'Central to the development of cancer are genetic changes that endow these “cancer cells” with many of the hallmarks of cancer, such as self-sufficient growth and resistance to anti-growth and pro-death signals. However, while the genetic changes that occur within cancer cells themselves, such as activated oncogenes or dysfunctional tumor suppressors, are responsible for many aspects of cancer development, they are not sufficient. Tumor promotion and progression are dependent on ancillary processes provided by cells of the tumor environment but that are not necessarily cancerous themselves. Inflammation has long been associated with the development of cancer. 
This review will discuss the reflexive relationship between cancer and inflammation with particular focus on how considering the role of inflammation in physiologic processes such as the maintenance of tissue homeostasis and repair may provide a logical framework for understanding the connection between the inflammatory response and cancer.'", - "", - "doc = nlp(string)", - "print(doc._.combo_basic.sort_values(ascending=False).head(5))", - "\"\"\"\"\"\"", - "dysfunctional tumor 1.443147", - "tumor suppressors 1.443147", - "genetic changes 1.386294", - "cancer cells 1.386294", - "dysfunctional tumor suppressors 1.298612", - "\"\"\"\"\"\"" - ], - "code_language": "python", - "url": "https://github.com/kevinlu1248/pyate", - "author": "Kevin Lu", - "author_links": { - "twitter": "kevinlu1248", - "github": "kevinlu1248", - "website": "https://github.com/kevinlu1248/pyate" - }, - "category": ["pipeline", "research"], - "tags": ["term_extraction"] - }, - { - "id": "contextualSpellCheck", - "title": "Contextual Spell Check", - "slogan": "Contextual spell correction using BERT (bidirectional representations)", - "description": "This package currently focuses on Out of Vocabulary (OOV) word or non-word error (NWE) correction using BERT model. The idea of using BERT was to use the context when correcting NWE.", - "github": "R1j1t/contextualSpellCheck", - "pip": "contextualSpellCheck", - "code_example": [ - "import spacy", - "import contextualSpellCheck", - "", - "nlp = spacy.load('en_core_web_sm')", - "contextualSpellCheck.add_to_pipe(nlp)", - "doc = nlp('Income was $9.4 milion compared to the prior year of $2.7 milion.')", - "", - "print(doc._.performed_spellCheck) #Should be True", - "print(doc._.outcome_spellCheck) #Income was $9.4 million compared to the prior year of $2.7 million." - ], - "code_language": "python", - "url": "https://github.com/R1j1t/contextualSpellCheck", - "thumb": "https://user-images.githubusercontent.com/22280243/82760949-98e68480-9e14-11ea-952e-4738620fd9e3.png", - "image": "https://user-images.githubusercontent.com/22280243/82138959-2852cd00-9842-11ea-918a-49b2a7873ef6.png", - "author": "Rajat Goel", - "author_links": { - "github": "r1j1t", - "website": "https://github.com/R1j1t" - }, - "category": ["pipeline", "conversational", "research"], - "tags": ["spell check", "correction", "preprocessing", "translation", "correction"] - }, - { - "id": "texthero", - "title": "Texthero", - "slogan": "Text preprocessing, representation and visualization from zero to hero.", - "description": "Texthero is a python package to work with text data efficiently. 
It empowers NLP developers with a tool to quickly understand any text-based dataset and it provides a solid pipeline to clean and represent text data, from zero to hero.", - "github": "jbesomi/texthero", - "pip": "texthero", - "code_example": [ - "import texthero as hero", - "import pandas as pd", - "", - "df = pd.read_csv('https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv')", - "df['named_entities'] = hero.named_entities(df['text'])", - "df.head()" - ], - "code_language": "python", - "url": "https://texthero.org", - "thumb": "https://texthero.org/img/T.png", - "image": "https://texthero.org/docs/assets/texthero.png", - "author": "Jonathan Besomi", - "author_links": { - "github": "jbesomi", - "website": "https://besomi.ai" - }, - "category": ["standalone"] - }, - { - "id": "cov-bsv", - "title": "VA COVID-19 NLP BSV", - "slogan": "spaCy pipeline for COVID-19 surveillance.", - "github": "abchapman93/VA_COVID-19_NLP_BSV", - "description": "A spaCy rule-based pipeline for identifying positive cases of COVID-19 from clinical text. A version of this system was deployed as part of the US Department of Veterans Affairs biosurveillance response to COVID-19.", - "pip": "cov-bsv", - "code_example": [ - "import cov_bsv", - "", - "nlp = cov_bsv.load()", - "doc = nlp('Pt tested for COVID-19. His wife was recently diagnosed with novel coronavirus. SARS-COV-2: Detected')", - "", - "print(doc.ents)", - "print(doc._.cov_classification)", - "cov_bsv.visualize_doc(doc)" - ], - "category": ["pipeline", "standalone", "biomedical", "scientific"], - "tags": ["clinical", "epidemiology", "covid-19", "surveillance"], - "author": "Alec Chapman", - "author_links": { - "github": "abchapman93" - } - }, - { - "id": "medspacy", - "title": "medspaCy", - "thumb": "https://raw.githubusercontent.com/medspacy/medspacy/master/images/medspacy_logo.png", - "slogan": "A toolkit for clinical NLP with spaCy.", - "github": "medspacy/medspacy", - "description": "A toolkit for clinical NLP with spaCy. Features include sentence splitting, section detection, and asserting negation, family history, and uncertainty.", - "pip": "medspacy", - "code_example": [ - "import medspacy", - "from medspacy.ner import TargetRule", - "", - "nlp = medspacy.load()", - "print(nlp.pipe_names)", - "", - "nlp.get_pipe('target_matcher').add([TargetRule('stroke', 'CONDITION'), TargetRule('diabetes', 'CONDITION'), TargetRule('pna', 'CONDITION')])", - "doc = nlp('Patient has hx of stroke. Mother diagnosed with diabetes. No evidence of pna.')", - "", - "for ent in doc.ents:", - " print(ent, ent._.is_negated, ent._.is_family, ent._.is_historical)", - "medspacy.visualization.visualize_ent(doc)" - ], - "category": ["biomedical", "scientific", "research"], - "tags": ["clinical"], - "author": "medspacy", - "author_links": { - "github": "medspacy" - } - }, - { - "id": "rita-dsl", - "title": "RITA DSL", - "slogan": "Domain Specific Language for creating language rules", - "github": "zaibacu/rita-dsl", - "description": "A Domain Specific Language (DSL) for building language patterns. 
These can later be compiled into spaCy patterns, pure regex, or any other format.", - "pip": "rita-dsl", - "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png", - "code_language": "python", - "code_example": [ - "import spacy", - "from rita.shortcuts import setup_spacy", - "", - "rules = \"\"\"", - "cuts = {\"fitted\", \"wide-cut\"}", - "lengths = {\"short\", \"long\", \"calf-length\", \"knee-length\"}", - "fabric_types = {\"soft\", \"airy\", \"crinkled\"}", - "fabrics = {\"velour\", \"chiffon\", \"knit\", \"woven\", \"stretch\"}", - "", - "{IN_LIST(cuts)?, IN_LIST(lengths), WORD(\"dress\")}->MARK(\"DRESS_TYPE\")", - "{IN_LIST(lengths), IN_LIST(cuts), WORD(\"dress\")}->MARK(\"DRESS_TYPE\")", - "{IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK(\"DRESS_FABRIC\")", - "\"\"\"", - "", - "nlp = spacy.load(\"en\")", - "setup_spacy(nlp, rules_string=rules)", - "r = nlp(\"She was wearing a short wide-cut dress\")", - "print(list([{\"label\": e.label_, \"text\": e.text} for e in r.ents]))" - ], - "category": ["standalone"], - "tags": ["dsl", "language-patterns", "language-rules", "nlp"], - "author": "Šarūnas Navickas", - "author_links": { - "github": "zaibacu" - } - }, - { - "id": "PatternOmatic", - "title": "PatternOmatic", - "slogan": "Finds linguistic patterns effortlessly", - "description": "Discover spaCy's linguistic patterns matching a given set of string samples, to be used by spaCy's Rule-Based Matcher", - "github": "revuel/PatternOmatic", - "pip": "PatternOmatic", - "code_example": [ - "from PatternOmatic.api import find_patterns", - "", - "samples = ['I am a cat!', 'You are a dog!', 'She is an owl!']", - "", - "patterns_found, _ = find_patterns(samples)", - "", - "print(f'Patterns found: {patterns_found}')" - ], - "code_language": "python", - "thumb": "https://svgshare.com/i/R3P.svg", - "image": "https://svgshare.com/i/R3P.svg", - "author": "Miguel Revuelta Espinosa", - "author_links": { - "github": "revuel" - }, - "category": ["scientific", "research", "standalone"], - "tags": ["Evolutionary Computation", "Grammatical Evolution"] - }, - { - "id": "SpacyDotNet", - "title": "spaCy .NET Wrapper", - "slogan": "SpacyDotNet is a .NET Core compatible wrapper for spaCy, based on Python.NET", - "description": "This project relies on [Python.NET](http://pythonnet.github.io/) to interop with spaCy. It's not meant to be a complete and exhaustive implementation of all spaCy features and [APIs](https://spacy.io/api). Although it should be enough for basic tasks, it's considered a starting point if you need to build a complex project using spaCy in .NET. Most of the basic features in _Spacy101_ are available. All `Container` classes are present (`Doc`, `Token`, `Span` and `Lexeme`) with their basic properties/methods running and also `Vocab` and `StringStore` in a limited form. Anyway, any developer should be ready to add the missing properties or classes in a very straightforward manner.", - "github": "AMArostegui/SpacyDotNet", - "thumb": "https://raw.githubusercontent.com/AMArostegui/SpacyDotNet/master/cslogo.png", - "code_example": [ - "var spacy = new Spacy();", - "", - "var nlp = spacy.Load(\"en_core_web_sm\");", - "var doc = nlp.GetDocument(\"Apple is looking at buying U.K. 
startup for $1 billion\");", - "", - "foreach (Token token in doc.Tokens)", - " Console.WriteLine($\"{token.Text} {token.Lemma} {token.PoS} {token.Tag} {token.Dep} {token.Shape} {token.IsAlpha} {token.IsStop}\");", - "", - "Console.WriteLine(\"\");", - "foreach (Span ent in doc.Ents)", - " Console.WriteLine($\"{ent.Text} {ent.StartChar} {ent.EndChar} {ent.Label}\");", - "", - "nlp = spacy.Load(\"en_core_web_md\");", - "var tokens = nlp.GetDocument(\"dog cat banana afskfsd\");", - "", - "Console.WriteLine(\"\");", - "foreach (Token token in tokens.Tokens)", - " Console.WriteLine($\"{token.Text} {token.HasVector} {token.VectorNorm}, {token.IsOov}\");", - "", - "tokens = nlp.GetDocument(\"dog cat banana\");", - "Console.WriteLine(\"\");", - "foreach (Token token1 in tokens.Tokens)", - "{", - " foreach (Token token2 in tokens.Tokens)", - " Console.WriteLine($\"{token1.Text} {token2.Text} {token1.Similarity(token2) }\");", - "}", - "", - "doc = nlp.GetDocument(\"I love coffee\");", - "Console.WriteLine(\"\");", - "Console.WriteLine(doc.Vocab.Strings[\"coffee\"]);", - "Console.WriteLine(doc.Vocab.Strings[3197928453018144401]);", - "", - "Console.WriteLine(\"\");", - "foreach (Token word in doc.Tokens)", - "{", - " var lexeme = doc.Vocab[word.Text];", - " Console.WriteLine($@\"{lexeme.Text} {lexeme.Orth} {lexeme.Shape} {lexeme.Prefix} {lexeme.Suffix} {lexeme.IsAlpha} {lexeme.IsDigit} {lexeme.IsTitle} {lexeme.Lang}\");", - "}" - ], - "code_language": "csharp", - "author": "Antonio Miras", - "author_links": { - "github": "AMArostegui" - }, - "category": ["nonpython"] - }, - { - "id": "ruts", - "title": "ruTS", - "slogan": "A library for statistics extraction from texts in Russian", - "description": "The library allows extracting the following statistics from a text: basic statistics, readability metrics, lexical diversity metrics, morphological statistics", - "github": "SergeyShk/ruTS", - "pip": "ruts", - "code_example": [ - "import spacy", - "import ruts", - "", - "nlp = spacy.load('ru_core_news_sm')", - "nlp.add_pipe('basic', last=True)", - "doc = nlp('мама мыла раму')", - "doc._.basic.get_stats()" - ], - "code_language": "python", - "thumb": "https://habrastorage.org/webt/6z/le/fz/6zlefzjavzoqw_wymz7v3pwgfp4.png", - "image": "https://clipartart.com/images/free-tree-roots-clipart-black-and-white-2.png", - "author": "Sergey Shkarin", - "author_links": { - "twitter": "shk_sergey", - "github": "SergeyShk" - }, - "category": ["pipeline", "standalone"], - "tags": ["Text Analytics", "Russian"] - }, - { - "id": "trunajod", - "title": "TRUNAJOD", - "slogan": "A text complexity library for text analysis built on spaCy", - "description": "With all the basic NLP capabilities provided by spaCy (dependency parsing, POS tagging, tokenizing), `TRUNAJOD` focuses on extracting measurements from texts that might be interesting for different applications and use cases.", - "github": "dpalmasan/TRUNAJOD2.0", - "pip": "trunajod", - "code_example": [ - "import spacy", - "from TRUNAJOD.entity_grid import EntityGrid", - "", - "nlp = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])", - "example_text = (", - " 'El espectáculo del cielo nocturno cautiva la mirada y suscita preguntas'", - " 'sobre el universo, su origen y su funcionamiento. No es sorprendente que '", - " 'todas las civilizaciones y culturas hayan formado sus propias '", - " 'cosmologías. 
Unas relatan, por ejemplo, que el universo ha'", - " 'sido siempre tal como es, con ciclos que inmutablemente se repiten; '", - " 'otras explican que este universo ha tenido un principio, '", - " 'que ha aparecido por obra creadora de una divinidad.'", - ")", - "doc = nlp(example_text)", - "egrid = EntityGrid(doc)", - "print(egrid.get_egrid())" - ], - "code_language": "python", - "thumb": "https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_thumb.png", - "image": "https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_logo.png", - "author": "Diego Palma", - "author_links": { - "github": "dpalmasan" - }, - "category": ["research", "standalone", "scientific"], - "tags": ["Text Analytics", "Coherence", "Cohesion"] - }, - { - "id": "lingfeat", - "title": "LingFeat", - "slogan": "A Linguistic Feature Extraction (Text Analysis) Tool for Readability Assessment and Text Simplification", - "description": "LingFeat is a feature extraction library which currently extracts 255 linguistic features from English string input. Categories include syntax, semantics, discourse, and also traditional readability formulas. Published in EMNLP 2021.", - "github": "brucewlee/lingfeat", - "pip": "lingfeat", - "code_example": [ - "from lingfeat import extractor", - "", - "", - "text = 'TAEAN, South Chungcheong Province -- Just before sunup, Lee Young-ho, a seasoned fisherman with over 30 years of experience, silently waits for boats carrying blue crabs as the season for the seafood reaches its height. Soon afterward, small and big boats sail into Sinjin Port in Taean County, South Chungcheong Province, the second-largest source of blue crab after Incheon, accounting for 29 percent of total production of the country. A crane lifts 28 boxes filled with blue crabs weighing 40 kilograms each from the boat, worth about 10 million won ($8,500). “It has been a productive fall season for crabbing here. The water temperature is a very important factor affecting crab production. They hate cold water,” Lee said. The temperature of the sea off Taean appeared to have stayed at the level where crabs become active. If the sea temperature suddenly drops, crabs go into their winter dormancy mode, burrowing into the mud and sleeping through the cold months.'", - "", - "", - "#Pass text", - "LingFeat = extractor.pass_text(text)", - "", - "", - "#Preprocess text", - "LingFeat.preprocess()", - "", - "", - "#Extract features", - "#each method returns a dictionary of the corresponding features", - "#Advanced Semantic (AdSem) Features", - "WoKF = LingFeat.WoKF_() #Wikipedia Knowledge Features", - "WBKF = LingFeat.WBKF_() #WeeBit Corpus Knowledge Features", - "OSKF = LingFeat.OSKF_() #OneStopEng Corpus Knowledge Features", - "", - "#Discourse (Disco) Features", - "EnDF = LingFeat.EnDF_() #Entity Density Features", - "EnGF = LingFeat.EnGF_() #Entity Grid Features", - "", - "#Syntactic (Synta) Features", - "PhrF = LingFeat.PhrF_() #Noun/Verb/Adj/Adv/... Phrasal Features", - "TrSF = LingFeat.TrSF_() #(Parse) Tree Structural Features", - "POSF = LingFeat.POSF_() #Noun/Verb/Adj/Adv/... 
Part-of-Speech Features", - "", - "#Lexico Semantic (LxSem) Features", - "TTRF = LingFeat.TTRF_() #Type Token Ratio Features", - "VarF = LingFeat.VarF_() #Noun/Verb/Adj/Adv Variation Features", - "PsyF = LingFeat.PsyF_() #Psycholinguistic Difficulty of Words (AoA Kuperman)", - "WoLF = LingFeat.WorF_() #Word Familiarity from Frequency Count (SubtlexUS)", - "", - "Shallow Traditional (ShTra) Features", - "ShaF = LingFeat.ShaF_() #Shallow Features (e.g. avg number of tokens)", - "TraF = LingFeat.TraF_() #Traditional Formulas" - ], - "code_language": "python", - "thumb": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo2.png", - "image": "https://raw.githubusercontent.com/brucewlee/lingfeat/master/img/lingfeat_logo.png", - "author": "Bruce W. Lee (이웅성)", - "author_links": { - "github": "brucewlee", - "website": "https://brucewlee.github.io/" - }, - "category": ["research", "scientific"], - "tags": ["Readability", "Simplification", "Feature Extraction", "Syntax", "Discourse", "Semantics", "Lexical"] - }, - { - "id": "hmrb", - "title": "Hammurabi", - "slogan": "Python Rule Processing Engine 🏺", - "description": "Hammurabi works as a rule engine to parse input using a defined set of rules. It uses a simple and readable syntax to define complex rules to handle phrase matching. The syntax supports nested logical statements, regular expressions, reusable or side-loaded variables and match triggered callback functions to modularize your rules. The latest version works with both spaCy 2.X and 3.X. For more information check the documentation on [ReadTheDocs](https://hmrb.readthedocs.io/en/latest/).", - "github": "babylonhealth/hmrb", - "pip": "hmrb", - "code_example": [ - "import spacy", - "from hmrb.core import SpacyCore", - "", - "nlp = spacy.load(\"en_core_web_sm\")", - "sentences = \"I love gorillas. Peter loves gorillas. 
Jane loves Tarzan.\"", - "", - "def conj_be(subj: str) -> str:", - " if subj == \"I\":", - " return \"am\"", - " elif subj == \"you\":", - " return \"are\"", - " else:", - " return \"is\"", - "", - "@spacy.registry.callbacks(\"gorilla_callback\")", - "def gorilla_clb(seq: list, span: slice, data: dict) -> None:", - " subj = seq[span.start].text", - " be = conj_be(subj)", - " print(f\"{subj} {be} a gorilla person.\")", - "@spacy.registry.callbacks(\"lover_callback\")", - "def lover_clb(seq: list, span: slice, data: dict) -> None:", - " print(f\"{seq[span][-1].text} is a love interest of {seq[span.start].text}.\")", - "", - "grammar = \"\"\"", - " Law:", - " - callback: \"loves_gorilla\"", - " (", - " ((pos: \"PROPN\") or (pos: \"PRON\"))", - " (lemma: \"love\")", - " (lemma: \"gorilla\")", - " )", - " Law:", - " - callback: \"loves_someone\"", - " (", - " (pos: \"PROPN\")", - " (lower: \"loves\")", - " (pos: \"PROPN\")", - " )", - "\"\"\"", - "", - "@spacy.registry.augmenters(\"jsonify_span\")", - "def jsonify_span(span):", - " return [{\"lemma\": token.lemma_, \"pos\": token.pos_, \"lower\": token.lower_} for token in span]", - "", - "conf = {", - " \"rules\": grammar,", - " \"callbacks\": {", - " \"loves_gorilla\": \"callbacks.gorilla_callback\",", - " \"loves_someone\": \"callbacks.lover_callback\",", - " },", - " \"map_doc\": \"augmenters.jsonify_span\",", - " \"sort_length\": True,", - "}", - "", - "nlp.add_pipe(\"hmrb\", config=conf)", - "nlp(sentences)" - ], - "code_language": "python", - "thumb": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", - "image": "https://user-images.githubusercontent.com/6807878/118643685-cae6b880-b7d4-11eb-976e-066aec9505da.png", - "author": "Kristian Boda", - "author_links": { - "github": "bodak", - "twitter": "bodak", - "website": "https://github.com/babylonhealth/" - }, - "category": ["pipeline", "standalone", "scientific", "biomedical"], - "tags": ["babylonhealth", "rule-engine", "matcher"] - }, - { - "id": "forte", - "title": "Forte", - "slogan": "Forte is a toolkit for building Natural Language Processing pipelines, featuring cross-task interaction, adaptable data-model interfaces and composable pipelines.", - "description": "Forte provides a platform to assemble state-of-the-art NLP and ML technologies in a highly-composable fashion, including a wide spectrum of tasks ranging from Information Retrieval, Natural Language Understanding to Natural Language Generation.", - "github": "asyml/forte", - "pip": "forte.spacy stave torch", - "code_example": [ - "from fortex.spacy import SpacyProcessor", - "from forte.processors.stave import StaveProcessor", - "from forte import Pipeline", - "from forte.data.readers import StringReader", - "", - "pipeline = Pipeline()", - "pipeline.set_reader(StringReader())", - "pipeline.add(SpacyProcessor())", - "pipeline.add(StaveProcessor())", - "pipeline.run('Running SpaCy with Forte!')" - ], - "code_language": "python", - "url": "https://medium.com/casl-project/forte-building-modular-and-re-purposable-nlp-pipelines-cf5b5c5abbe9", - "thumb": "https://raw.githubusercontent.com/asyml/forte/master/docs/_static/img/forte_graphic.png", - "image": "https://raw.githubusercontent.com/asyml/forte/master/docs/_static/img/logo_h.png", - "author": "Petuum", - "author_links": { - "twitter": "PetuumInc", - "github": "asyml", - "website": "https://petuum.com" - }, - "category": ["pipeline", "standalone"], - "tags": ["pipeline"] - }, - { - "id": "spacy-api-docker-v3", - "slogan": 
"spaCy v3 REST API, wrapped in a Docker container", - "github": "bbieniek/spacy-api-docker", - "url": "https://hub.docker.com/r/bbieniek/spacyapi/", - "thumb": "https://i.imgur.com/NRnDKyj.jpg", - "code_example": [ - "version: '3'", - "", - "services:", - " spacyapi:", - " image: bbieniek/spacyapi:en_v3", - " ports:", - " - \"127.0.0.1:8080:80\"", - " restart: always" - ], - "code_language": "docker", - "author": "Baltazar Bieniek", - "author_links": { - "github": "bbieniek" - }, - "category": ["apis"] - }, - { - "id": "phruzz_matcher", - "title": "phruzz-matcher", - "slogan": "Phrase matcher using RapidFuzz", - "description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a Spacy doc and a list of phrases.", - "github": "mjvallone/phruzz-matcher", - "pip": "phruzz_matcher", - "code_example": [ - "import spacy", - "from spacy.language import Language", - "from phruzz_matcher.phrase_matcher import PhruzzMatcher", - "", - "famous_people = [", - " \"Brad Pitt\",", - " \"Demi Moore\",", - " \"Bruce Willis\",", - " \"Jim Carrey\",", - "]", - "", - "@Language.factory(\"phrase_matcher\")", - "def phrase_matcher(nlp: Language, name: str):", - " return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)", - "", - "nlp = spacy.blank('es')", - "nlp.add_pipe(\"phrase_matcher\")", - "", - "doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")", - "print(f\"doc.ents: {doc.ents}\")", - "", - "#OUTPUT", - "#doc.ents: (brad pit, Demi Moore)" - ], - "thumb": "https://avatars.githubusercontent.com/u/961296?v=4", - "image": "", - "code_language": "python", - "author": "Martin Vallone", - "author_links": { - "github": "mjvallone", - "twitter": "vallotin", - "website": "https://fiqus.coop/" - }, - "category": ["pipeline", "research", "standalone"], - "tags": ["spacy", "python", "nlp", "ner"] - }, - { - "id": "WordDumb", - "title": "WordDumb", - "slogan": "A calibre plugin that generates Word Wise and X-Ray files.", - "description": "A calibre plugin that generates Word Wise and X-Ray files then sends them to Kindle. Supports KFX, AZW3 and MOBI eBooks. 
X-Ray supports 18 languages.", - "github": "xxyzz/WordDumb", - "code_language": "python", - "thumb": "https://raw.githubusercontent.com/xxyzz/WordDumb/master/starfish.svg", - "image": "https://user-images.githubusercontent.com/21101839/130245435-b874f19a-7785-4093-9975-81596efc42bb.png", - "author": "xxyzz", - "author_links": { - "github": "xxyzz" - }, - "category": ["standalone"] - }, - { - "id": "eng_spacysentiment", - "title": "eng_spacysentiment", - "slogan": "Simple sentiment analysis using spaCy pipelines", - "description": "Sentiment analysis for simple english sentences using pre-trained spaCy pipelines", - "github": "vishnunkumar/spacysentiment", - "pip": "eng-spacysentiment", - "code_example": [ - "import eng_spacysentiment", - "nlp = eng_spacysentiment.load()", - "text = \"Welcome to Arsenals official YouTube channel Watch as we take you closer and show you the personality of the club\"", - "doc = nlp(text)", - "print(doc.cats)", - "# {'positive': 0.29878824949264526, 'negative': 0.7012117505073547}" - ], - "thumb": "", - "image": "", - "code_language": "python", - "author": "Vishnu Nandakumar", - "author_links": { - "github": "Vishnunkumar", - "twitter": "vishnun_uchiha" - }, - "category": ["pipeline"], - "tags": ["pipeline", "nlp", "sentiment"] - } - ], - - "categories": [ - { - "label": "Projects", - "items": [ - { - "id": "pipeline", - "title": "Pipeline", - "description": "Custom pipeline components and extensions" - }, - { - "id": "training", - "title": "Training", - "description": "Helpers and toolkits for training spaCy models" - }, - { - "id": "conversational", - "title": "Conversational", - "description": "Frameworks and utilities for working with conversational text, e.g. for chat bots" - }, - { - "id": "research", - "title": "Research", - "description": "Frameworks and utilities for developing better NLP models, especially using neural networks" - }, - { - "id": "scientific", - "title": "Scientific", - "description": "Frameworks and utilities for scientific text processing" - }, - { - "id": "biomedical", - "title": "Biomedical", - "description": "Frameworks and utilities for processing biomedical text" - }, - { - "id": "visualizers", - "title": "Visualizers", - "description": "Demos and tools to visualize NLP annotations or systems" - }, - { - "id": "apis", - "title": "Containers & APIs", - "description": "Infrastructure tools for managing or deploying spaCy" - }, - { - "id": "nonpython", - "title": "Non-Python", - "description": "Wrappers, bindings and implementations in other programming languages" - }, - { - "id": "standalone", - "title": "Standalone", - "description": "Self-contained libraries or tools that use spaCy under the hood" - }, - { - "id": "models", - "title": "Models", - "description": "Third-party pretrained models for different languages and domains" - } - ] - }, - { - "label": "Education", - "items": [ - { - "id": "books", - "title": "Books", - "description": "Books about or featuring spaCy" - }, - { - "id": "courses", - "title": "Courses", - "description": "Online courses and interactive tutorials" - }, - { - "id": "videos", - "title": "Videos", - "description": "Talks and tutorials in video format" - }, - { - "id": "podcasts", - "title": "Podcasts", - "description": "Episodes about spaCy or interviews with the spaCy team" - } - ] - } - ] -} From 65e41a52ddfe3266039443743fcf6d197de55909 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 29 Sep 2022 10:39:36 +0200 Subject: [PATCH 18/35] Reverting unnecessary changes. 
Removing unused default values. Renaming variables in find-cli tests. --- spacy/cli/find_threshold.py | 1 - spacy/scorer.py | 12 ++++++++---- spacy/tests/test_cli.py | 14 +++++++------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 6d89355124f..41210a0c6f2 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -13,7 +13,6 @@ from .. import util _DEFAULTS = { - "average": "micro", "n_trials": 10, "use_gpu": -1, "gold_preproc": False, diff --git a/spacy/scorer.py b/spacy/scorer.py index 74402b46615..ac58b94734a 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -21,7 +21,13 @@ class PRFScore: """A precision / recall / F score.""" - def __init__(self, *, tp: int = 0, fp: int = 0, fn: int = 0) -> None: + def __init__( + self, + *, + tp: int = 0, + fp: int = 0, + fn: int = 0 + ) -> None: self.tp = tp self.fp = fp self.fn = fn @@ -37,9 +43,7 @@ def __iadd__(self, other): def __add__(self, other): return PRFScore( - tp=self.tp + other.tp, - fp=self.fp + other.fp, - fn=self.fn + other.fn, + tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn, ) def score_set(self, cand: set, gold: set) -> None: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 7b5bc88c3b0..c690e9e05bd 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -889,8 +889,8 @@ def make_examples(_nlp: Language) -> List[Example]: def init_nlp( components: Tuple[Tuple[str, Dict[str, Any]], ...] = () ) -> Tuple[Language, List[Example]]: - _nlp = English() - textcat: TextCategorizer = _nlp.add_pipe( # type: ignore + new_nlp = English() + textcat: TextCategorizer = new_nlp.add_pipe( # type: ignore factory_name="textcat_multilabel", name="tc_multi", config={"threshold": 0.9}, @@ -901,17 +901,17 @@ def init_nlp( # Append additional components to pipeline. for cfn, comp_config in components: - comp = _nlp.add_pipe(cfn, config=comp_config) + comp = new_nlp.add_pipe(cfn, config=comp_config) if isinstance(comp, TextCategorizer): for label in textcat_labels: comp.add_label(label) - _examples = make_examples(_nlp) - _nlp.initialize(get_examples=lambda: _examples) + new_examples = make_examples(new_nlp) + new_nlp.initialize(get_examples=lambda: new_examples) for i in range(5): - _nlp.update(_examples) + new_nlp.update(new_examples) - return _nlp, _examples + return new_nlp, new_examples with make_tempdir() as docs_dir: # Check whether find_threshold() identifies lowest threshold above 0 as (first) ideal threshold, as this matches From 604c5eab3d556540bc1fbf04cb68c7bd137ecef4 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 29 Sep 2022 10:40:30 +0200 Subject: [PATCH 19/35] Update spacy/cli/find_threshold.py Co-authored-by: Adriane Boyd --- spacy/cli/find_threshold.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 41210a0c6f2..a506d21d6b1 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -167,9 +167,7 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: ) scores[threshold] = nlp.evaluate(dev_dataset)[scores_key] - if not isinstance(scores[threshold], float) and not isinstance( - scores[threshold], int - ): + if not isinstance(scores[threshold], (float, int)): wasabi.msg.fail( f"Returned score for key '{scores_key}' is not numeric. 
Threshold optimization only works for numeric " f"scores.", From 58d5c99b939cb4525ee8df59957b93c6aba844bd Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 29 Sep 2022 10:43:11 +0200 Subject: [PATCH 20/35] Remove adding labels in tests. --- spacy/tests/test_cli.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c690e9e05bd..0e7413f1e87 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -36,7 +36,6 @@ from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs -from spacy.pipeline import TextCategorizer from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config from ..cli.init_pipeline import _init_labels @@ -890,21 +889,15 @@ def init_nlp( components: Tuple[Tuple[str, Dict[str, Any]], ...] = () ) -> Tuple[Language, List[Example]]: new_nlp = English() - textcat: TextCategorizer = new_nlp.add_pipe( # type: ignore + new_nlp.add_pipe( # type: ignore factory_name="textcat_multilabel", name="tc_multi", config={"threshold": 0.9}, ) - textcat_labels = ("ANGRY", "CONFUSED", "HAPPY") - for label in textcat_labels: - textcat.add_label(label) # Append additional components to pipeline. for cfn, comp_config in components: - comp = new_nlp.add_pipe(cfn, config=comp_config) - if isinstance(comp, TextCategorizer): - for label in textcat_labels: - comp.add_label(label) + new_nlp.add_pipe(cfn, config=comp_config) new_examples = make_examples(new_nlp) new_nlp.initialize(get_examples=lambda: new_examples) From 08c0c4140cac4b02995638e958bc4c57b72633b3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 21 Oct 2022 11:09:32 +0200 Subject: [PATCH 21/35] Remove unused error --- spacy/cli/find_threshold.py | 2 +- spacy/errors.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 283c6d4fb36..6e4141d7b97 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -109,7 +109,7 @@ def find_threshold( except KeyError as err: wasabi.msg.fail(title=str(err), exits=1) if not hasattr(pipe, "scorer"): - raise AttributeError(Errors.E1048) + raise AttributeError(Errors.E1047) if not silent: wasabi.msg.info( diff --git a/spacy/errors.py b/spacy/errors.py index 95a6e3ae2eb..836ad785380 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -952,8 +952,7 @@ class Errors(metaclass=ErrorsWithCodes): "sure it's overwritten on the subclass.") E1046 = ("{cls_name} is an abstract class and cannot be instantiated. 
If you are looking for spaCy's default " "knowledge base, use `InMemoryLookupKB`.") - E1047 = ("`find_threshold()` only supports components of type `TrainablePipe`.") - E1048 = ("`find_threshold()` only supports components with a `scorer` attribute.") + E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.") # Deprecated model shortcuts, only used in errors and warnings From 67596fc58adb2ffb630ff2c31465a78a8b568089 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 21 Oct 2022 11:21:58 +0200 Subject: [PATCH 22/35] Undo changes to PRFScorer --- spacy/scorer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index ac58b94734a..8cd755ac40c 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -26,7 +26,7 @@ def __init__( *, tp: int = 0, fp: int = 0, - fn: int = 0 + fn: int = 0, ) -> None: self.tp = tp self.fp = fp @@ -43,7 +43,7 @@ def __iadd__(self, other): def __add__(self, other): return PRFScore( - tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn, + tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn ) def score_set(self, cand: set, gold: set) -> None: From 9d947a42ced99ce64085748106119e05a85f9943 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 24 Oct 2022 11:25:51 +0200 Subject: [PATCH 23/35] Change default value for n_trials. Log table iteratively. --- spacy/cli/find_threshold.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 6e4141d7b97..9f1b144750c 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -13,7 +13,7 @@ from .. import util _DEFAULTS = { - "n_trials": 10, + "n_trials": 11, "use_gpu": -1, "gold_preproc": False, } @@ -109,7 +109,7 @@ def find_threshold( except KeyError as err: wasabi.msg.fail(title=str(err), exits=1) if not hasattr(pipe, "scorer"): - raise AttributeError(Errors.E1047) + raise AttributeError(Errors.E1045) if not silent: wasabi.msg.info( @@ -125,7 +125,7 @@ def find_threshold( def set_nested_item( config: Dict[str, Any], keys: List[str], value: float ) -> Dict[str, Any]: - """Set item in nested dictionary. Adapated from https://stackoverflow.com/a/54138200. + """Set item in nested dictionary. Adapted from https://stackoverflow.com/a/54138200. config (Dict[str, Any]): Configuration dictionary. keys (List[Any]): Path to value to set. value (float): Value to set. @@ -149,6 +149,8 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: # Evaluate with varying threshold values. scores: Dict[float, float] = {} config_keys_full = ["components", pipe_name, *config_keys] + table_col_widths = (10, 10) + print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths)) for threshold in numpy.linspace(0, 1, n_trials): # Reload pipeline with overrides specifying the new threshold. 
nlp = util.load_model( @@ -173,15 +175,17 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: f"scores.", exits=1, ) + print( + wasabi.row( + [round(threshold, 3), round(scores[threshold], 3)], + widths=table_col_widths, + ) + ) best_threshold = max(scores.keys(), key=(lambda key: scores[key])) if not silent: print( - f"Best threshold: {round(best_threshold, ndigits=4)} with value of {scores[best_threshold]}.", - wasabi.tables.table( - data=[(threshold, score) for threshold, score in scores.items()], - header=["Threshold", f"{scores_key}"], - ), + f"\nBest threshold: {round(best_threshold, ndigits=4)} with value of {scores[best_threshold]}." ) return best_threshold, scores[best_threshold], scores From 19dd45fe9753491476fc3639b24a6c8a43809cda Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 28 Oct 2022 12:27:23 +0200 Subject: [PATCH 24/35] Add warnings for pointless applications of find_threshold(). --- spacy/cli/find_threshold.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 9f1b144750c..b62da3b2b65 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -7,6 +7,7 @@ import numpy import wasabi.tables +from pipeline import TextCategorizer, MultiLabel_TextCategorizer from ..errors import Errors from ..training import Corpus from ._util import app, Arg, Opt, import_code, setup_gpu @@ -111,6 +112,12 @@ def find_threshold( if not hasattr(pipe, "scorer"): raise AttributeError(Errors.E1045) + if isinstance(pipe, TextCategorizer): + wasabi.msg.warn( + "The `textcat` component doesn't use a threshold as it's not applicable to the concept of " + "exclusive classes. All thresholds will yield the same results." + ) + if not silent: wasabi.msg.info( title=f"Optimizing for {scores_key} for component '{pipe_name}' with {n_trials} " @@ -150,8 +157,9 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: scores: Dict[float, float] = {} config_keys_full = ["components", pipe_name, *config_keys] table_col_widths = (10, 10) + thresholds = numpy.linspace(0, 1, n_trials) print(wasabi.tables.row(["Threshold", f"{scores_key}"], widths=table_col_widths)) - for threshold in numpy.linspace(0, 1, n_trials): + for threshold in thresholds: # Reload pipeline with overrides specifying the new threshold. nlp = util.load_model( model, @@ -183,9 +191,23 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: ) best_threshold = max(scores.keys(), key=(lambda key: scores[key])) - if not silent: - print( - f"\nBest threshold: {round(best_threshold, ndigits=4)} with value of {scores[best_threshold]}." + + # If all scores are identical, emit warning. + if all([score == scores[thresholds[0]] for score in scores.values()]): + wasabi.msg.warn( + title="All scores are identical. Verify that all settings are correct.", + text="" + if ( + not isinstance(pipe, MultiLabel_TextCategorizer) + or scores_key in ("cats_macro_f", "cats_micro_f") + ) + else "Use `cats_macro_f` or `cats_micro_f` when optimizing the threshold for `textcat_multilabel`.", ) + else: + if not silent: + print( + f"\nBest threshold: {round(best_threshold, ndigits=4)} with value of {scores[best_threshold]}." + ) + return best_threshold, scores[best_threshold], scores From 5bacad8f6713f7ebdd9344a22cfaf220945530fe Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 28 Oct 2022 12:59:06 +0200 Subject: [PATCH 25/35] Fix imports. 
--- spacy/cli/find_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index b62da3b2b65..6baa811d2bf 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -7,7 +7,7 @@ import numpy import wasabi.tables -from pipeline import TextCategorizer, MultiLabel_TextCategorizer +from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer from ..errors import Errors from ..training import Corpus from ._util import app, Arg, Opt, import_code, setup_gpu From 5de02dc903495c818d253cbf970ae0cd28b791b0 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 28 Oct 2022 13:13:38 +0200 Subject: [PATCH 26/35] Adjust type check of TextCategorizer to exclude subclasses. --- spacy/cli/find_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 6baa811d2bf..fdd3a955064 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -112,7 +112,7 @@ def find_threshold( if not hasattr(pipe, "scorer"): raise AttributeError(Errors.E1045) - if isinstance(pipe, TextCategorizer): + if type(pipe) == TextCategorizer: wasabi.msg.warn( "The `textcat` component doesn't use a threshold as it's not applicable to the concept of " "exclusive classes. All thresholds will yield the same results." From 34c6c3b7c9afeebfc09f22e47b432ed98b23ff1e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 11 Nov 2022 11:33:50 +0100 Subject: [PATCH 27/35] Change check of if there's only one unique value in scores. --- spacy/cli/find_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index fdd3a955064..df2e99017c1 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -193,7 +193,7 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: best_threshold = max(scores.keys(), key=(lambda key: scores[key])) # If all scores are identical, emit warning. - if all([score == scores[thresholds[0]] for score in scores.values()]): + if len(set(scores.values())) == 1: wasabi.msg.warn( title="All scores are identical. Verify that all settings are correct.", text="" From 5500a58c001392d86ded789795b494e4c9bb9074 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 10:48:10 +0100 Subject: [PATCH 28/35] Update spacy/cli/find_threshold.py Co-authored-by: Sofie Van Landeghem --- spacy/cli/find_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index df2e99017c1..bfb15f39adb 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -207,7 +207,7 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: else: if not silent: print( - f"\nBest threshold: {round(best_threshold, ndigits=4)} with value of {scores[best_threshold]}." + f"\nBest threshold: {round(best_threshold, ndigits=4)} with {scores_key} value of {scores[best_threshold]}." ) return best_threshold, scores[best_threshold], scores From d080808bf72bb886f8eedb48faef57eaeabe13d5 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 11:16:08 +0100 Subject: [PATCH 29/35] Incorporate feedback. 
--- spacy/cli/find_threshold.py | 45 ++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index bfb15f39adb..a37f276b415 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -30,7 +30,7 @@ def find_threshold_cli( data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"), threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"), - scores_key: str = Arg(..., help="Name of score to metric to optimize"), + scores_key: str = Arg(..., help="Metric to optimize"), n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"), code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), @@ -39,12 +39,12 @@ def find_threshold_cli( # fmt: on ): """ - Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric from CLI. + Runs prediction trials for models with varying tresholds to maximize the specified metric from CLI. model (Path): Path to file with trained model. data_path (Path): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. threshold_key (str): Key of threshold attribute in component's configuration. - scores_key (str): Name of score to metric to optimize. + scores_key (str): Metric to optimize. n_trials (int): Number of trials to determine optimal thresholds code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported. use_gpu (int): GPU ID or -1 for CPU. @@ -82,7 +82,7 @@ def find_threshold( silent: bool = True, ) -> Tuple[float, float, Dict[float, float]]: """ - Runs prediction trials for `textcat` models with varying tresholds to maximize the specified metric. + Runs prediction trials for models with varying tresholds to maximize the specified metric. model (Union[str, Path]): Path to file with trained model. data_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. @@ -104,11 +104,9 @@ def find_threshold( wasabi.msg.fail("Evaluation data not found", data_path, exits=1) nlp = util.load_model(model) - pipe: Optional[Any] = None - try: - pipe = nlp.get_pipe(pipe_name) - except KeyError as err: - wasabi.msg.fail(title=str(err), exits=1) + if pipe_name not in nlp.component_names: + raise AttributeError(Errors.E001.format(name=pipe_name)) + pipe = nlp.get_pipe(pipe_name) if not hasattr(pipe, "scorer"): raise AttributeError(Errors.E1045) @@ -141,14 +139,24 @@ def set_nested_item( functools.reduce(operator.getitem, keys[:-1], config)[keys[-1]] = value return config - def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: + def filter_config( + config: Dict[str, Any], keys: List[str], full_key: str + ) -> Dict[str, Any]: """Filters provided config dictionary so that only the specified keys path remains. config (Dict[str, Any]): Configuration dictionary. keys (List[Any]): Path to value to set. + full_key (str): Full user-specified key. RETURNS (Dict[str, Any]): Filtered dictionary. 
""" + if keys[0] not in config: + wasabi.msg.fail( + title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.", + text=f"Make sure you specified {[keys[0]]} correctly. The following sub-keys are available instead: " + f"{config.keys()}", + exits=1, + ) return { - keys[0]: filter_config(config[keys[0]], keys[1:]) + keys[0]: filter_config(config[keys[0]], keys[1:], full_key) if len(keys) > 1 else config[keys[0]] } @@ -164,7 +172,9 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: nlp = util.load_model( model, config=set_nested_item( - filter_config(nlp.config, config_keys_full).copy(), + filter_config( + nlp.config, config_keys_full, ".".join(config_keys_full) + ).copy(), config_keys_full, threshold, ), @@ -176,7 +186,16 @@ def filter_config(config: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: set_nested_item(getattr(pipe, "cfg"), config_keys, threshold), ) - scores[threshold] = nlp.evaluate(dev_dataset)[scores_key] + eval_scores = nlp.evaluate(dev_dataset) + if scores_key not in eval_scores: + wasabi.msg.fail( + title=f"Failed to look up score `{scores_key}` in evaluation results.", + text=f"Make sure you specified the correct value for `scores_key` correctly. The following scores are " + f"available: {eval_scores.keys()}", + exits=1, + ) + scores[threshold] = eval_scores[scores_key] + if not isinstance(scores[threshold], (float, int)): wasabi.msg.fail( f"Returned score for key '{scores_key}' is not numeric. Threshold optimization only works for numeric " From 7b4da3f36d3650f6f35d56657fb6c71425ec3e8f Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 11:50:42 +0100 Subject: [PATCH 30/35] Fix test issue. Update docstring. --- spacy/cli/find_threshold.py | 30 +++++++++++++++--------------- spacy/tests/test_cli.py | 3 +-- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index a37f276b415..32cb7a555e0 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -39,19 +39,17 @@ def find_threshold_cli( # fmt: on ): """ - Runs prediction trials for models with varying tresholds to maximize the specified metric from CLI. - model (Path): Path to file with trained model. - data_path (Path): Path to file with DocBin with docs to use for threshold search. - pipe_name (str): Name of pipe to examine thresholds for. - threshold_key (str): Key of threshold attribute in component's configuration. - scores_key (str): Metric to optimize. - n_trials (int): Number of trials to determine optimal thresholds - code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported. - use_gpu (int): GPU ID or -1 for CPU. - gold_preproc (bool): Whether to use gold preprocessing. Gold preprocessing helps the annotations align to the - tokenization, and may result in sequences of more consistent length. However, it may reduce runtime accuracy due - to train/test skew. - silent (bool): Display more information for debugging purposes + Runs prediction trials for models with varying tresholds to maximize the + specified metric. The search space for the threshold is traversed + linearly from 0 to 1 in n_trials steps. + + This is applicable only for components whose predictions are influenced + by thresholds (e.g. textcat_multilabel and spancat, but not textcat). + + Note that the full path to the corresponding threshold attribute in the + config has to be provided. 
+ + DOCS: https://spacy.io/api/cli#find-threshold """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) @@ -105,7 +103,9 @@ def find_threshold( nlp = util.load_model(model) if pipe_name not in nlp.component_names: - raise AttributeError(Errors.E001.format(name=pipe_name)) + raise AttributeError( + Errors.E001.format(name=pipe_name, opts=nlp.component_names) + ) pipe = nlp.get_pipe(pipe_name) if not hasattr(pipe, "scorer"): raise AttributeError(Errors.E1045) @@ -190,7 +190,7 @@ def filter_config( if scores_key not in eval_scores: wasabi.msg.fail( title=f"Failed to look up score `{scores_key}` in evaluation results.", - text=f"Make sure you specified the correct value for `scores_key` correctly. The following scores are " + text=f"Make sure you specified the correct value for `scores_key`. The following scores are " f"available: {eval_scores.keys()}", exits=1, ) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 6d45ba53b4e..f29568bab7a 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -966,7 +966,7 @@ def init_nlp( nlp, _ = init_nlp() with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) - with pytest.raises(SystemExit) as error: + with pytest.raises(AttributeError): find_threshold( model=nlp_dir, data_path=docs_dir / "docs.spacy", @@ -975,7 +975,6 @@ def init_nlp( scores_key="cats_macro_f", silent=True, ) - assert error.value.code == 1 @pytest.mark.parametrize( From 809588de308d5c3053c9900842cdf43f46e4a540 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 12:37:22 +0100 Subject: [PATCH 31/35] Update docs & docstring. --- spacy/cli/find_threshold.py | 21 +++++++++++---------- website/docs/api/cli.md | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 32cb7a555e0..f75b3aac832 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -39,15 +39,16 @@ def find_threshold_cli( # fmt: on ): """ - Runs prediction trials for models with varying tresholds to maximize the - specified metric. The search space for the threshold is traversed - linearly from 0 to 1 in n_trials steps. + Runs prediction trials for a trained model with varying tresholds to maximize + the specified metric. The search space for the threshold is traversed linearly + from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` + (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` + returns all results). - This is applicable only for components whose predictions are influenced - by thresholds (e.g. textcat_multilabel and spancat, but not textcat). - - Note that the full path to the corresponding threshold attribute in the - config has to be provided. + This is applicable only for components whose predictions are influenced by + thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note + that the full path to the corresponding threshold attribute in the config has to + be provided. DOCS: https://spacy.io/api/cli#find-threshold """ @@ -81,8 +82,8 @@ def find_threshold( ) -> Tuple[float, float, Dict[float, float]]: """ Runs prediction trials for models with varying tresholds to maximize the specified metric. - model (Union[str, Path]): Path to file with trained model. - data_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search. + model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory. 
+ data_path (Path): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. threshold_key (str): Key of threshold attribute in component's configuration. scores_key (str): Name of score to metric to optimize. diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index fc2c46022ff..23a7e7a8d3a 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -12,6 +12,7 @@ menu: - ['train', 'train'] - ['pretrain', 'pretrain'] - ['evaluate', 'evaluate'] + - ['find-threshold', 'find-threshold'] - ['assemble', 'assemble'] - ['package', 'package'] - ['project', 'project'] @@ -474,8 +475,7 @@ report span characteristics such as the average span length and the span (or span boundary) distinctiveness. The distinctiveness measure shows how different the tokens are with respect to the rest of the corpus using the KL-divergence of the token distributions. To learn more, you can check out Papay et al.'s work on -[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP -2020)](https://aclanthology.org/2020.emnlp-main.396/). +[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/). @@ -1163,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Training results and optional metrics and visualizations. | +## find-threshold {#find-threshold new="3.5" tag="command"} + +Runs prediction trials for a trained model with varying tresholds to maximize +the specified metric. The search space for the threshold is traversed linearly +from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` +(the corresponding API call to `spacy.cli.find_threshold.find_threshold()` +returns all results). + +This is applicable only for components whose predictions are influenced by +thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note +that the full path to the corresponding threshold attribute in the config has to +be provided. + +```cli +$ python -m spacy find-threshold [model] [data_path] [pipe_name] [threshold_key] [scores_key] [--n_trials] [--code] [--use-gpu] [--gold-preproc] [--verbose] +``` + +| Name | Description | +| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Path to file with DocBin with docs to use for threshold search. ~~Path (positional)~~ | +| `pipe_name` | Name of pipe to examine thresholds for. ~~str (positional)~~ | +| `threshold_key` | Key of threshold attribute in component's configuration. ~~str (positional)~~ | +| `scores_key` | Name of score to metric to optimize. ~~str (positional)~~ | +| `--n_trials`, `-n` | Number of trials to determine optimal thresholds. ~~int (option)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--silent`, `-V`, `-VV` | GPU to use, if any. 
Defaults to `-1` for CPU. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | + ## assemble {#assemble tag="command"} Assemble a pipeline from a config file without additional training. Expects a From 3f9d879bfe5589a96bc7b746203e75dbd586a0f5 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 12:53:27 +0100 Subject: [PATCH 32/35] Update spacy/tests/test_cli.py Co-authored-by: Adriane Boyd --- spacy/tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index f29568bab7a..affb20f251a 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -864,7 +864,7 @@ def test_span_length_freq_dist_output_must_be_correct(): def test_cli_find_threshold(capsys): thresholds = numpy.linspace(0, 1, 10) - def make_examples(_nlp: Language) -> List[Example]: + def make_examples(nlp: Language) -> List[Example]: docs: List[Example] = [] for t in [ From dd84d651c9424dc0645b4d9a7fb7e3a4de99fa2b Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 12:56:14 +0100 Subject: [PATCH 33/35] Add examples to docs. Rename _nlp to nlp in tests. --- spacy/tests/test_cli.py | 2 +- website/docs/api/cli.md | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index affb20f251a..1c4d0c98f85 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -883,7 +883,7 @@ def make_examples(nlp: Language) -> List[Example]: }, ), ]: - doc = _nlp.make_doc(t[0]) + doc = nlp.make_doc(t[0]) docs.append(Example.from_dict(doc, t[1])) return docs diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 5f5a8ce15c7..b42ba8a4f62 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -1175,9 +1175,18 @@ thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note that the full path to the corresponding threshold attribute in the config has to be provided. -```cli -$ python -m spacy find-threshold [model] [data_path] [pipe_name] [threshold_key] [scores_key] [--n_trials] [--code] [--use-gpu] [--gold-preproc] [--verbose] -``` +> #### Examples +> +> ```cli +> # For textcat_multilabel: +> $ python -m spacy find-threshold my_nlp data.spacy textcat_multilabel threshold cats_macro_f +> ``` +> +> ```cli +> # For spancat: +> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f +> ``` + | Name | Description | | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | From 0ee225763a0a3e36651ee66557d75a2166d9be1e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 16:33:03 +0100 Subject: [PATCH 34/35] Update spacy/cli/find_threshold.py Co-authored-by: Sofie Van Landeghem --- spacy/cli/find_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index f75b3aac832..0f3b6989b99 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -153,7 +153,7 @@ def filter_config( wasabi.msg.fail( title=f"Failed to look up `{full_key}` in config: sub-key {[keys[0]]} not found.", text=f"Make sure you specified {[keys[0]]} correctly. 
The following sub-keys are available instead: " - f"{config.keys()}", + f"{list(config.keys())}", exits=1, ) return { From bbfef28791e16ad168f62ef6f357667ab2c4758d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 17 Nov 2022 16:33:14 +0100 Subject: [PATCH 35/35] Update spacy/cli/find_threshold.py Co-authored-by: Sofie Van Landeghem --- spacy/cli/find_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 0f3b6989b99..efa664832f4 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -192,7 +192,7 @@ def filter_config( wasabi.msg.fail( title=f"Failed to look up score `{scores_key}` in evaluation results.", text=f"Make sure you specified the correct value for `scores_key`. The following scores are " - f"available: {eval_scores.keys()}", + f"available: {list(eval_scores.keys())}", exits=1, ) scores[threshold] = eval_scores[scores_key]
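
A minimal usage sketch of the `find_threshold` API that this patch series introduces, assuming the signature shown in the diffs above (`find_threshold(model, data_path, pipe_name, threshold_key, scores_key, ...)` returning `(best_threshold, best_score, scores_by_threshold)`). The model path, evaluation data path, pipe name, and metric below are placeholders for illustration, not values taken from the PR.

```python
# Illustrative sketch only: calling the find_threshold() API added by this patch series.
# All concrete values (paths, pipe name, metric) are assumptions for the example.
from spacy.cli.find_threshold import find_threshold

best_threshold, best_score, scores_by_threshold = find_threshold(
    model="training/model-best",      # trained pipeline directory or installed package (placeholder)
    data_path="corpus/dev.spacy",     # DocBin with evaluation docs (placeholder)
    pipe_name="textcat_multilabel",   # component whose threshold should be tuned
    threshold_key="threshold",        # path to the threshold attribute in the component's config
    scores_key="cats_macro_f",        # metric to maximize
    n_trials=11,                      # thresholds are sampled linearly from 0 to 1
    silent=False,                     # print the per-threshold score table
)
print(f"Best threshold: {best_threshold} with cats_macro_f={best_score}")
```

The equivalent command-line call, following the examples added to `website/docs/api/cli.md` in this series, would be along the lines of `python -m spacy find-threshold training/model-best corpus/dev.spacy textcat_multilabel threshold cats_macro_f`.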