diff --git a/README.md b/README.md index b72172d..7da6b27 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ SKLLMConfig.set_openai_key("") SKLLMConfig.set_openai_org("") ``` -**Important notice:** +**Important notice:** + - If you have a free trial OpenAI account, the [rate limits](https://platform.openai.com/docs/guides/rate-limits/overview) are not sufficient (specifically 3 requests per minute). Please switch to the "pay as you go" plan first. - When calling `SKLLMConfig.set_openai_org`, you have to provide your organization ID and **NOT** the name. You can find your ID [here](https://platform.openai.com/account/org-settings). @@ -121,6 +122,29 @@ clf.fit(None, [candidate_labels]) labels = clf.predict(X) ``` +### Few-Shot Text Classification + +With `FewShotGPTClassifier` it is possible to perform few-shot classification, which means that the training samples will be added to the prompt and passed to the model. + +```python +from skllm import FewShotGPTClassifier +from skllm.datasets import get_classification_dataset + +X, y = get_classification_dataset() + +clf = FewShotGPTClassifier(openai_model="gpt-3.5-turbo") +clf.fit(X, y) +labels = clf.predict(X) +``` + +While the API remains the same as for the zero-shot classifier, there are a few things to take into account: + +- the "training" requires some labelled training data; +- the training set should be small enough to fit into a single prompt (we recommend up to 10 samples per label); +- because of the significantly larger prompt, the inference takes longer and consumes a higher number of tokens. + +Note: as the model is not being re-trained, but uses the training data during inference, one could say that this is still a (different) zero-shot approach. + ### Text Vectorization As an alternative to using GPT as a classifier, it can be used solely for data preprocessing. 
`GPTVectorizer` allows to embed a chunk of text of arbitrary length to a fixed-dimensional vector, that can be used with virtually any classification or regression model. @@ -156,12 +180,13 @@ yh = clf.predict(X_test) GPT excels at performing summarization tasks. Therefore, we provide `GPTSummarizer` that can be used both as stand-alone estimator, or as a preprocessor (in this case we can make an analogy with a dimensionality reduction preprocessor). Example: + ```python from skllm.preprocessing import GPTSummarizer from skllm.datasets import get_summarization_dataset X = get_summarization_dataset() -s = GPTSummarizer(openai_model = 'gpt-3.5-turbo', max_words = 15) +s = GPTSummarizer(openai_model="gpt-3.5-turbo", max_words=15) summaries = s.fit_transform(X) ``` @@ -172,12 +197,16 @@ Please be aware that the `max_words` hyperparameter sets a soft limit, which is - [x] Zero-Shot Classification with OpenAI GPT 3/4 - [x] Multiclass classification - [x] Multi-label classification - - [x] ChatGPT models - - [ ] InstructGPT models -- [ ] Few shot classifier +- [ ] Few-Shot classifier + - [x] Multiclass classification + - [ ] Multi-label classification - [x] GPT Vectorizer -- [ ] GPT Fine-tuning (optional) -- [ ] Integration of other LLMs +- [x] ChatGPT models +- [ ] InstructGPT models +- [ ] InstructGPT Fine-tuning (optional) +- [ ] Open source models + +*The order of the elements in the roadmap is arbitrary and does not reflect the planned order of implementation.* ## Contributing diff --git a/pyproject.toml b/pyproject.toml index 2818764..be5c561 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ ignore = [ "E501", "N803", "N806", + "D104", ] extend-exclude = ["tests/*.py", "setup.py"] target-version = "py38" diff --git a/skllm/__init__.py b/skllm/__init__.py index 3893f0a..8502005 100644 --- a/skllm/__init__.py +++ b/skllm/__init__.py @@ -1 +1,5 @@ -from skllm.models.gpt_zero_shot_clf import ZeroShotGPTClassifier, MultiLabelZeroShotGPTClassifier \ No 
newline at end of file +from skllm.models.gpt_few_shot_clf import FewShotGPTClassifier +from skllm.models.gpt_zero_shot_clf import ( + MultiLabelZeroShotGPTClassifier, + ZeroShotGPTClassifier, +) diff --git a/skllm/models/gpt_few_shot_clf.py b/skllm/models/gpt_few_shot_clf.py new file mode 100644 index 0000000..b552f38 --- /dev/null +++ b/skllm/models/gpt_few_shot_clf.py @@ -0,0 +1,74 @@ +from typing import List, Union + +import numpy as np +import pandas as pd + +from skllm.models.gpt_zero_shot_clf import ( + ZeroShotGPTClassifier as _ZeroShotGPTClassifier, +) +from skllm.prompts.builders import build_few_shot_prompt_slc +from skllm.utils import to_numpy as _to_numpy + +_TRAINING_SAMPLE_PROMPT_TEMPLATE = """ +Sample input: +```{x}``` + +Sample target: {label} +""" + + +class FewShotGPTClassifier(_ZeroShotGPTClassifier): + """Few-shot single-label classifier that adds the stored training samples to every prompt.""" + + def fit( + self, + X: Union[np.ndarray, pd.Series, List[str]], + y: Union[np.ndarray, pd.Series, List[str]], + ): + """Fits the model by storing the training data and extracting the + unique targets. + + Parameters + ---------- + X : Union[np.ndarray, pd.Series, List[str]] + training samples; must have the same length as y, otherwise a ValueError is raised + y : Union[np.ndarray, pd.Series, List[str]] + training labels; must have the same length as X, otherwise a ValueError is raised + + Returns + ------- + FewShotGPTClassifier + self + """ + if not len(X) == len(y): + raise ValueError("X and y must have the same length.") + X = _to_numpy(X) + y = _to_numpy(y) + self.training_data_ = (X, y) + self.classes_, self.probabilities_ = self._get_unique_targets(y) + return self + + def _get_prompt(self, x: str) -> str: + """Generates the prompt for the given input. 
+ + Parameters + ---------- + x : str + sample to classify + + Returns + ------- + str + final prompt + """ + training_data = [] + for xt, yt in zip(*self.training_data_): + training_data.append( + _TRAINING_SAMPLE_PROMPT_TEMPLATE.format(x=xt, label=yt) + ) + + training_data_str = "\n".join(training_data) + + return build_few_shot_prompt_slc( + x=x, training_data=training_data_str, labels=repr(self.classes_) + ) diff --git a/skllm/prompts/builders.py b/skllm/prompts/builders.py index c9d7418..6fc57d0 100644 --- a/skllm/prompts/builders.py +++ b/skllm/prompts/builders.py @@ -1,6 +1,7 @@ from typing import Union from skllm.prompts.templates import ( + FEW_SHOT_CLF_PROMPT_TEMPLATE, SUMMARY_PROMPT_TEMPLATE, ZERO_SHOT_CLF_PROMPT_TEMPLATE, ZERO_SHOT_MLCLF_PROMPT_TEMPLATE, @@ -31,6 +32,33 @@ def build_zero_shot_prompt_slc( return template.format(x=x, labels=labels) +def build_few_shot_prompt_slc( + x: str, + labels: str, + training_data: str, + template: str = FEW_SHOT_CLF_PROMPT_TEMPLATE, +) -> str: + """Builds a prompt for few-shot single-label classification. + + Parameters + ---------- + x : str + sample to classify + labels : str + candidate labels in a list-like representation + training_data : str + training data to be used for few-shot learning + template : str + prompt template to use, must contain placeholders for all variables, by default FEW_SHOT_CLF_PROMPT_TEMPLATE + + Returns + ------- + str + prepared prompt + """ + return template.format(x=x, labels=labels, training_data=training_data) + + def build_zero_shot_prompt_mlc( x: str, labels: str, diff --git a/skllm/prompts/templates.py b/skllm/prompts/templates.py index 18785c0..fbe1cdb 100644 --- a/skllm/prompts/templates.py +++ b/skllm/prompts/templates.py @@ -15,6 +15,27 @@ Your JSON response: """ +FEW_SHOT_CLF_PROMPT_TEMPLATE = """ +You will be provided with the following information: +1. An arbitrary text sample. The sample is delimited with triple backticks. +2. 
List of categories the text sample can be assigned to. The list is delimited with square brackets. The categories in the list are enclosed in the single quotes and comma separated. +3. Examples of text samples and their assigned categories. The examples are delimited with triple backticks. The assigned categories are enclosed in a list-like structure. These examples are to be used as training data. + +Perform the following tasks: +1. Identify to which category the provided text belongs to with the highest probability. +2. Assign the provided text to that category. +3. Provide your response in a JSON format containing a single key `label` and a value corresponding to the assigned category. Do not provide any additional information except the JSON. + +List of categories: {labels} + +Training data: +{training_data} + +Text sample: ```{x}``` + +Your JSON response: +""" + ZERO_SHOT_MLCLF_PROMPT_TEMPLATE = """ You will be provided with the following information: 1. An arbitrary text sample. The sample is delimited with triple backticks.