From 97fa11f4d75348dda8743cc25898d165bc1b953d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 12 Jul 2023 10:17:27 +0200 Subject: [PATCH 1/8] Update 'spacy init config' to support 'llm' component. --- spacy/cli/init_config.py | 50 ++++++++++++++++++- spacy/cli/templates/quickstart_training.jinja | 25 +++++++++- spacy/errors.py | 2 + spacy/tests/test_cli.py | 23 ++++++++- 4 files changed, 96 insertions(+), 4 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index a7c03d00f90..2cb50ba33d9 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -1,7 +1,7 @@ import re from enum import Enum from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Any, Dict import srsly from jinja2 import Template @@ -9,9 +9,10 @@ from wasabi import Printer, diff_strings from .. import util +from ..errors import Errors from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema -from ..util import SimpleFrozenList +from ..util import SimpleFrozenList, registry from ._util import ( COMMAND, Arg, @@ -40,6 +41,8 @@ class InitValues: lang = "en" pipeline = SimpleFrozenList(["tagger", "parser", "ner"]) + llm_task: Optional[str] = None + llm_model: Optional[str] = None optimize = Optimizations.efficiency gpu = False pretraining = False @@ -52,6 +55,8 @@ def init_config_cli( output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"), pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), + llm_task: str = Opt(InitValues.llm_task, "--llm.task", "-p", help="Name of task for LLM pipeline components"), + llm_model: str = Opt(InitValues.llm_model, "--llm.model", "-p", help="Name of model for LLM pipeline components"), optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. 
This will impact the choice of architecture, pretrained weights and related hyperparameters."), pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), @@ -77,6 +82,8 @@ def init_config_cli( config = init_config( lang=lang, pipeline=pipeline, + llm_model=llm_model, + llm_task=llm_task, optimize=optimize.value, gpu=gpu, pretraining=pretraining, @@ -157,6 +164,8 @@ def init_config( *, lang: str = InitValues.lang, pipeline: List[str] = InitValues.pipeline, + llm_model: str = InitValues.llm_model, + llm_task: str = InitValues.llm_task, optimize: str = InitValues.optimize, gpu: bool = InitValues.gpu, pretraining: bool = InitValues.pretraining, @@ -165,8 +174,44 @@ def init_config( msg = Printer(no_print=silent) with TEMPLATE_PATH.open("r") as f: template = Template(f.read()) + # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] + + # Verify LLM arguments are consistent, if at least one `llm` component has been specified. + llm_spec: Dict[str, Dict[str, Any]] = {} + if "llm" in pipeline: + try: + import spacy_llm + except ImportError as ex: + raise ValueError(Errors.E1055) from ex + + if llm_model is None: + raise ValueError("Option `llm.model` must be set if `llm` component is in pipeline.") + if llm_task is None: + raise ValueError("Option `llm.task` must be set if `llm` component is in pipeline.") + + # Select registry handles for model(s) and task(s). Raise if no match found. + llm_spec = { + spec_type: { + "arg": llm_model if spec_type == "model" else llm_task, + "matched_reg_handle": None, + "reg_handles": getattr(registry, f"llm_{spec_type}s").get_all() + } + for spec_type in ("model", "task") + } + + for spec_type, spec in llm_spec.items(): + for reg_handle in spec["reg_handles"]: + if reg_handle.split(".")[1].lower() == spec["arg"].lower().replace(".", "-"): + spec["matched_reg_handle"] = reg_handle + break + + if not spec["matched_reg_handle"]: + arg = spec["arg"] + raise ValueError(f"Couldn't find a matching registration handle for {spec_type} '{spec}'. Double-check" + f" whether '{arg}' is spelled correctly.") + defaults = RECOMMENDATIONS["__default__"] reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict() variables = { @@ -175,6 +220,7 @@ def init_config( "optimize": optimize, "hardware": "gpu" if gpu else "cpu", "transformer_data": reco["transformer"], + "llm_spec": {key: llm_spec[key]["matched_reg_handle"] for key in llm_spec}, "word_vectors": reco["word_vectors"], "has_letters": reco["has_letters"], } diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 1937ea93533..304a8354a07 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and can help generate the best possible configuration, given a user's requirements. 
#} {%- set use_transformer = hardware != "cpu" and transformer_data -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} -{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%} +{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer", "llm"] -%} [paths] train = null dev = null @@ -328,6 +328,18 @@ grad_factor = 1.0 {%- endif %} {%- endif %} +{% if "llm" in components -%} +[components.llm] +factory = "llm" + +[components.llm.model] +@llm_models = "{{ llm_spec['model'] }}" + +[components.llm.task] +@llm_tasks = "{{ llm_spec['task'] }}" +{% endif -%} + + {# NON-TRANSFORMER PIPELINE #} {% else -%} {% if "tok2vec" in full_pipeline -%} @@ -585,6 +597,17 @@ no_output_layer = false {%- endif %} {% endif %} +{% if "llm" in components -%} +[components.llm] +factory = "llm" + +[components.llm.model] +@llm_models = "{{ llm_spec['model'] }}" + +[components.llm.task] +@llm_tasks = "{{ llm_spec['task'] }}" +{% endif -%} + {% for pipe in components %} {% if pipe not in listener_components %} {# Other components defined by the user: we just assume they're factories #} diff --git a/spacy/errors.py b/spacy/errors.py index db1a886aa8f..b35dc1aa2c8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -981,6 +981,8 @@ class Errors(metaclass=ErrorsWithCodes): " 'min_length': {min_length}, 'max_length': {max_length}") E1054 = ("The text, including whitespace, must match between reference and " "predicted docs when training {component}.") + E1055 = ("To use the `llm` component, `spacy-llm` needs to be installed. `spacy-llm` was not found in your " + "environment, install it with `pip install spacy-llm`.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8e1c9ca3215..66c4e308fca 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -628,7 +628,6 @@ def test_parse_cli_overrides(): "pipeline", [ ["tagger", "parser", "ner"], - [], ["ner", "textcat", "sentencizer"], ["morphologizer", "spancat", "entity_linker"], ["spancat_singlelabel", "textcat_multilabel"], @@ -651,6 +650,28 @@ def test_init_config(lang, pipeline, optimize, pretraining): load_model_from_config(config, auto_fill=True) +@pytest.mark.parametrize("pipeline", [["llm"]]) +@pytest.mark.parametrize("llm_model", ["noop"]) +@pytest.mark.parametrize("llm_task", ["ner", "sentiment"]) +def test_init_config_llm(pipeline, llm_model, llm_task): + config = init_config( + lang="en", + pipeline=pipeline, + llm_model=llm_model, + llm_task=llm_task, + optimize="accuracy", + pretraining=False, + gpu=False, + ) + assert isinstance(config, Config) + assert len(config["components"]) == 1 + assert "llm" in config["components"] + + load_model_from_config(config, auto_fill=True) + + + + def test_model_recommendations(): for lang, data in RECOMMENDATIONS.items(): assert RecommendationSchema(**data) From 564c2b04d9b760c0bbee0a8699e7db1704f204fa Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 13 Jul 2023 08:56:34 +0200 Subject: [PATCH 2/8] Format. 
--- spacy/cli/init_config.py | 20 ++++++++++++++------ spacy/tests/test_cli.py | 2 -- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 2cb50ba33d9..099fdcac1a7 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -187,30 +187,38 @@ def init_config( raise ValueError(Errors.E1055) from ex if llm_model is None: - raise ValueError("Option `llm.model` must be set if `llm` component is in pipeline.") + raise ValueError( + "Option `llm.model` must be set if `llm` component is in pipeline." + ) if llm_task is None: - raise ValueError("Option `llm.task` must be set if `llm` component is in pipeline.") + raise ValueError( + "Option `llm.task` must be set if `llm` component is in pipeline." + ) # Select registry handles for model(s) and task(s). Raise if no match found. llm_spec = { spec_type: { "arg": llm_model if spec_type == "model" else llm_task, "matched_reg_handle": None, - "reg_handles": getattr(registry, f"llm_{spec_type}s").get_all() + "reg_handles": getattr(registry, f"llm_{spec_type}s").get_all(), } for spec_type in ("model", "task") } for spec_type, spec in llm_spec.items(): for reg_handle in spec["reg_handles"]: - if reg_handle.split(".")[1].lower() == spec["arg"].lower().replace(".", "-"): + if reg_handle.split(".")[1].lower() == spec["arg"].lower().replace( + ".", "-" + ): spec["matched_reg_handle"] = reg_handle break if not spec["matched_reg_handle"]: arg = spec["arg"] - raise ValueError(f"Couldn't find a matching registration handle for {spec_type} '{spec}'. Double-check" - f" whether '{arg}' is spelled correctly.") + raise ValueError( + f"Couldn't find a matching registration handle for {spec_type} '{spec}'. Double-check" + f" whether '{arg}' is spelled correctly." + ) defaults = RECOMMENDATIONS["__default__"] reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict() diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 66c4e308fca..fdaaf5359ba 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -670,8 +670,6 @@ def test_init_config_llm(pipeline, llm_model, llm_task): load_model_from_config(config, auto_fill=True) - - def test_model_recommendations(): for lang, data in RECOMMENDATIONS.items(): assert RecommendationSchema(**data) From 423694206cadf109935b30f00257e235ac52c2ba Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 13 Jul 2023 08:58:39 +0200 Subject: [PATCH 3/8] Format. --- spacy/cli/init_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 099fdcac1a7..b6aff55ef50 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -1,7 +1,7 @@ import re from enum import Enum from pathlib import Path -from typing import List, Optional, Tuple, Any, Dict +from typing import Any, Dict, List, Optional, Tuple import srsly from jinja2 import Template From f9dddbb70d5950e05adf76491c7b7fd845c74eab Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 13 Jul 2023 09:03:02 +0200 Subject: [PATCH 4/8] Fix mypy issues. 
--- spacy/cli/init_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index b6aff55ef50..0ff431f6cc3 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -164,8 +164,8 @@ def init_config( *, lang: str = InitValues.lang, pipeline: List[str] = InitValues.pipeline, - llm_model: str = InitValues.llm_model, - llm_task: str = InitValues.llm_task, + llm_model: Optional[str] = InitValues.llm_model, + llm_task: Optional[str] = InitValues.llm_task, optimize: str = InitValues.optimize, gpu: bool = InitValues.gpu, pretraining: bool = InitValues.pretraining, From 9cf8ac87be923e267a7d59b46966f1d105661f0d Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 14 Jul 2023 21:44:13 +0200 Subject: [PATCH 5/8] Remove incorrect argument shorthands for LLM properties. --- spacy/cli/init_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 0ff431f6cc3..7c6464b6048 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -55,8 +55,8 @@ def init_config_cli( output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"), pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), - llm_task: str = Opt(InitValues.llm_task, "--llm.task", "-p", help="Name of task for LLM pipeline components"), - llm_model: str = Opt(InitValues.llm_model, "--llm.model", "-p", help="Name of model for LLM pipeline components"), + llm_task: str = Opt(InitValues.llm_task, "--llm.task", help="Name of task for LLM pipeline components"), + llm_model: str = Opt(InitValues.llm_model, "--llm.model", help="Name of model for LLM pipeline components"), optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."), pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), From 8da042ce17f3646afa4cf9f179edbaa4c9104a5b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 25 Jul 2023 18:05:14 +0200 Subject: [PATCH 6/8] use msg.fail instead of raising error to provide nicer output on the console --- spacy/cli/init_config.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 7c6464b6048..2d735eec58c 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -187,12 +187,14 @@ def init_config( raise ValueError(Errors.E1055) from ex if llm_model is None: - raise ValueError( - "Option `llm.model` must be set if `llm` component is in pipeline." 
+ msg.fail( + "Option `--llm.model` must be set if `llm` component is in pipeline.", + exits=1, ) if llm_task is None: - raise ValueError( - "Option `llm.task` must be set if `llm` component is in pipeline." + msg.fail( + "Option `--llm.task` must be set if `llm` component is in pipeline.", + exits=1, ) # Select registry handles for model(s) and task(s). Raise if no match found. @@ -215,9 +217,11 @@ def init_config( if not spec["matched_reg_handle"]: arg = spec["arg"] - raise ValueError( - f"Couldn't find a matching registration handle for {spec_type} '{spec}'. Double-check" - f" whether '{arg}' is spelled correctly." + valid_args = sorted(spec["reg_handles"].keys()) + msg.fail( + f"Couldn't find a matching registration handle for {spec_type} '{arg}'. " + f"Available functions: {valid_args}", + exits=1, ) defaults = RECOMMENDATIONS["__default__"] From 0948dc19fb18fbac56056404b98e73950333ade8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 25 Jul 2023 18:13:14 +0200 Subject: [PATCH 7/8] better representation of valid values --- spacy/cli/init_config.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 2d735eec58c..8cc67a80774 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -208,19 +208,20 @@ def init_config( } for spec_type, spec in llm_spec.items(): + valid_values = set() + user_value = spec["arg"].lower().replace(".", "-") for reg_handle in spec["reg_handles"]: - if reg_handle.split(".")[1].lower() == spec["arg"].lower().replace( - ".", "-" - ): + reg_name = reg_handle.split(".")[1] + valid_values.add(reg_name) + if reg_name.lower() == user_value: spec["matched_reg_handle"] = reg_handle break if not spec["matched_reg_handle"]: arg = spec["arg"] - valid_args = sorted(spec["reg_handles"].keys()) msg.fail( f"Couldn't find a matching registration handle for {spec_type} '{arg}'. " - f"Available functions: {valid_args}", + f"Available functions: {valid_values}", exits=1, ) From a3bf379cdde822b115dacf494b716063bcb5581c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 25 Jul 2023 18:15:47 +0200 Subject: [PATCH 8/8] update msg --- spacy/cli/init_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 8cc67a80774..b0198e6cadf 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -221,7 +221,7 @@ def init_config( arg = spec["arg"] msg.fail( f"Couldn't find a matching registration handle for {spec_type} '{arg}'. " - f"Available functions: {valid_values}", + f"Valid options are: {valid_values}", exits=1, )
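
For reviewers, a quick way to exercise the new code path end to end, mirroring the parameters used in test_init_config_llm: the snippet below is a minimal sketch that assumes spacy-llm is installed and registers an "ner" task and a "noop" model; the exact registered names and versions depend on the installed spacy-llm release, so treat them as placeholders rather than guaranteed values.

    from spacy.cli.init_config import init_config

    # Build a quickstart config containing a single `llm` component.
    # Roughly equivalent CLI call once this series is applied:
    #   python -m spacy init config config.cfg --lang en --pipeline llm \
    #       --llm.task ner --llm.model noop
    config = init_config(
        lang="en",
        pipeline=["llm"],
        llm_task="ner",
        llm_model="noop",
        optimize="efficiency",
        gpu=False,
        pretraining=False,
    )
    # The generated config has a [components.llm] block whose @llm_tasks /
    # @llm_models entries are the matched registry handles (e.g. an NER task
    # and a NoOp model entry from spacy-llm, version depending on the install).
    config.to_disk("config.cfg")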