Dynamic few shot classifier #33

Merged · 6 commits · Jun 11, 2023
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -13,7 +13,6 @@ repos:
      - id: check-executables-have-shebangs
      - id: check-case-conflict
      - id: check-added-large-files
      - id: detect-aws-credentials
      - id: detect-private-key
  # Formatter for Json and Yaml files
  - repo: https://github.com/pre-commit/mirrors-prettier
24 changes: 23 additions & 1 deletion README.md
@@ -62,7 +62,8 @@ ZeroShotGPTClassifier(openai_model="gpt4all::ggml-gpt4all-j-v1.3-groovy")

When running for the first time, the model file will be downloaded automatically.

At the moment only the following estimators support gpt4all as a backend:

- `ZeroShotGPTClassifier`
- `MultiLabelZeroShotGPTClassifier`
- `FewShotGPTClassifier`
@@ -179,6 +180,27 @@ While the api remains the same as for the zero shot classifier, there are a few

Note: as the model is not being re-trained, but uses the training data during inference, one could say that this is still a (different) zero-shot approach.

### Dynamic Few-Shot Text Classification

`DynamicFewShotGPTClassifier` dynamically selects N samples per class to include in the prompt. This allows the few-shot classifier to scale to datasets that are too large for the standard context window of LLMs.

*How does it work?*

During fitting, the whole dataset is partitioned by class, vectorized, and stored.

During inference, the [annoy](https://github.com/spotify/annoy) library is used for fast neighbor lookup, which allows including only the most similar examples in the prompt.

```python
from skllm import DynamicFewShotGPTClassifier
from skllm.datasets import get_classification_dataset

X, y = get_classification_dataset()

clf = DynamicFewShotGPTClassifier(n_examples=3)
clf.fit(X, y)
labels = clf.predict(X)
```
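Under the hood, the neighbor lookup is a plain annoy index query. A minimal, self-contained sketch of that mechanism (illustrative only, with toy vectors; scikit-llm wraps this in its own `AnnoyMemoryIndex`):

```python
from annoy import AnnoyIndex

# Build a tiny 3-dimensional euclidean index.
index = AnnoyIndex(3, "euclidean")
index.add_item(0, [1.0, 0.0, 0.0])
index.add_item(1, [0.0, 1.0, 0.0])
index.add_item(2, [0.9, 0.1, 0.0])
index.build(10)  # number of trees; more trees trade build time for accuracy

# The two stored vectors closest to the query.
print(index.get_nns_by_vector([1.0, 0.1, 0.0], 2))  # [0, 2]
```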

### Text Vectorization

As an alternative to using GPT as a classifier, it can be used solely for data preprocessing. `GPTVectorizer` embeds a chunk of text of arbitrary length into a fixed-dimensional vector that can be used with virtually any classification or regression model.
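A minimal sketch of that workflow (the `fit`/`transform` calls mirror how this PR uses `GPTVectorizer` internally; the downstream `LogisticRegression` is an arbitrary illustrative choice):

```python
from sklearn.linear_model import LogisticRegression

from skllm.preprocessing import GPTVectorizer
from skllm.datasets import get_classification_dataset

X, y = get_classification_dataset()

# Embed each text into a fixed-dimensional vector.
vectorizer = GPTVectorizer()
X_vec = vectorizer.fit(X).transform(X)

# The embeddings can feed any scikit-learn estimator.
clf = LogisticRegression().fit(X_vec, y)
```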
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -8,9 +8,10 @@ dependencies = [
"pandas>=1.5.0",
"openai>=0.27.0",
"tqdm>=4.60.0",
"annoy>=1.17.2",
]
name = "scikit-llm"
version = "0.1.1"
version = "0.2.0"
authors = [
{ name="Oleg Kostromin", email="kostromin97@gmail.com" },
{ name="Iryna Kondrashchenko", email="iryna230520@gmail.com" },
@@ -79,12 +80,13 @@ target-version = ['py38', 'py39', 'py310', 'py311']
profile = "black"
filter_files = true
known_first_party = ["skllm", "skllm.*"]
skip = ["__init__.py"]

[tool.docformatter]
close-quotes-on-newline = true # D209

[tool.interrogate]
fail-under = 80
fail-under = 65
ignore-module = true
ignore-nested-functions = true
ignore-private = true
4 changes: 3 additions & 1 deletion skllm/__init__.py
@@ -1,5 +1,7 @@
from skllm.models.gpt_few_shot_clf import FewShotGPTClassifier
# ordering is important here to prevent circular imports
from skllm.models.gpt_zero_shot_clf import (
    MultiLabelZeroShotGPTClassifier,
    ZeroShotGPTClassifier,
)
from skllm.models.gpt_few_shot_clf import FewShotGPTClassifier
from skllm.models.gpt_dyn_few_shot_clf import DynamicFewShotGPTClassifier
1 change: 1 addition & 0 deletions skllm/memory/__init__.py
@@ -0,0 +1 @@
from skllm.memory._annoy import AnnoyMemoryIndex
119 changes: 119 additions & 0 deletions skllm/memory/_annoy.py
@@ -0,0 +1,119 @@
import os
import tempfile
from typing import Any, List

from annoy import AnnoyIndex
from numpy import ndarray

from skllm.memory.base import _BaseMemoryIndex


class AnnoyMemoryIndex(_BaseMemoryIndex):
    """Memory index using Annoy.

    Parameters
    ----------
    dim : int
        dimensionality of the vectors
    metric : str, optional
        metric to use, by default "euclidean"
    """

    def __init__(self, dim: int, metric: str = "euclidean", **kwargs: Any) -> None:
        self._index = AnnoyIndex(dim, metric)
        self.metric = metric
        self.dim = dim
        self.built = False

    def add(self, id: int, vector: ndarray) -> None:
        """Adds a vector to the index.

        Parameters
        ----------
        id : int
            identifier for the vector
        vector : ndarray
            vector to add to the index
        """
        if self.built:
            raise RuntimeError("Cannot add vectors after index is built.")
        self._index.add_item(id, vector)

    def build(self) -> None:
        """Builds the index.

        No new vectors can be added after building.
        """
        self._index.build(-1)
        self.built = True

    def retrieve(self, vectors: ndarray, k: int) -> List[List[int]]:
        """Retrieves the k nearest neighbors for each vector.

        Parameters
        ----------
        vectors : ndarray
            vectors to retrieve nearest neighbors for
        k : int
            number of nearest neighbors to retrieve

        Returns
        -------
        List[List[int]]
            ids of the retrieved nearest neighbors, one list per query vector
        """
        if not self.built:
            raise RuntimeError("Cannot retrieve vectors before the index is built.")
        return [
            self._index.get_nns_by_vector(v, k, search_k=-1, include_distances=False)
            for v in vectors
        ]

    def __getstate__(self) -> dict:
        """Returns the state of the object. To store the actual annoy index, it
        has to be written to a temporary file.

        Returns
        -------
        dict
            state of the object
        """
        state = self.__dict__.copy()

        # save index to temporary file
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            temp_filename = tmp.name
            self._index.save(temp_filename)

        # read bytes from the file
        with open(temp_filename, "rb") as tmp:
            index_bytes = tmp.read()

        # store bytes representation in state
        state["_index"] = index_bytes

        # remove temporary file
        os.remove(temp_filename)

        return state

    def __setstate__(self, state: dict) -> None:
        """Sets the state of the object. It restores the annoy index from the
        bytes representation.

        Parameters
        ----------
        state : dict
            state of the object
        """
        self.__dict__.update(state)
        # restore index from bytes: at this point self._index still holds the
        # raw bytes produced by __getstate__, so write them to a temporary file
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            temp_filename = tmp.name
            tmp.write(self._index)

        self._index = AnnoyIndex(self.dim, self.metric)
        self._index.load(temp_filename)

        # remove temporary file
        os.remove(temp_filename)
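Because `__getstate__` serializes the annoy index to a byte string and `__setstate__` restores it through a temporary file, the whole object survives pickling. A minimal round-trip sketch (illustrative, with toy vectors):

```python
import pickle

import numpy as np

from skllm.memory import AnnoyMemoryIndex

index = AnnoyMemoryIndex(3)
for i, v in enumerate(np.eye(3)):
    index.add(i, v)
index.build()

# Serialize and restore; the annoy index survives via its bytes representation.
restored = pickle.loads(pickle.dumps(index))
print(restored.retrieve(np.array([[1.0, 0.1, 0.0]]), k=2))  # e.g. [[0, 1]]
```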
45 changes: 45 additions & 0 deletions skllm/memory/base.py
@@ -0,0 +1,45 @@
from abc import ABC, abstractmethod
from typing import Any, List

from numpy import ndarray


class _BaseMemoryIndex(ABC):
    @abstractmethod
    def add(self, id: Any, vector: ndarray):
        """Adds a vector to the index.

        Parameters
        ----------
        id : Any
            identifier for the vector
        vector : ndarray
            vector to add to the index
        """
        pass

    @abstractmethod
    def retrieve(self, vectors: ndarray, k: int) -> List:
        """Retrieves the k nearest neighbors for each vector.

        Parameters
        ----------
        vectors : ndarray
            vectors to retrieve nearest neighbors for
        k : int
            number of nearest neighbors to retrieve

        Returns
        -------
        List
            ids of retrieved nearest neighbors
        """
        pass

    @abstractmethod
    def build(self) -> None:
        """Builds the index.

        All build parameters should be passed to the constructor.
        """
        pass
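The abstract interface makes the backend swappable: any class providing `add`, `build`, and `retrieve` can stand in for annoy. A naive brute-force implementation, purely as an illustration (not part of this PR):

```python
from typing import Dict, List

import numpy as np
from numpy import ndarray

from skllm.memory.base import _BaseMemoryIndex


class ExactMemoryIndex(_BaseMemoryIndex):
    """Brute-force euclidean nearest-neighbor index (illustrative)."""

    def __init__(self) -> None:
        self._vectors: Dict[int, ndarray] = {}

    def add(self, id: int, vector: ndarray) -> None:
        # No approximate structure needed; vectors are simply stored.
        self._vectors[id] = np.asarray(vector, dtype=float)

    def build(self) -> None:
        # Materialize a matrix once; kept for interface compatibility.
        self._ids = list(self._vectors)
        self._matrix = np.stack([self._vectors[i] for i in self._ids])

    def retrieve(self, vectors: ndarray, k: int) -> List[List[int]]:
        results = []
        for v in np.atleast_2d(vectors):
            distances = np.linalg.norm(self._matrix - v, axis=1)
            nearest = np.argsort(distances)[:k]
            results.append([self._ids[i] for i in nearest])
        return results
```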
115 changes: 115 additions & 0 deletions skllm/models/gpt_dyn_few_shot_clf.py
@@ -0,0 +1,115 @@
from __future__ import annotations

import numpy as np
import pandas as pd

from skllm import FewShotGPTClassifier
from skllm.memory import AnnoyMemoryIndex
from skllm.models.gpt_few_shot_clf import _TRAINING_SAMPLE_PROMPT_TEMPLATE
from skllm.preprocessing import GPTVectorizer
from skllm.prompts.builders import build_few_shot_prompt_slc
from skllm.utils import to_numpy


class DynamicFewShotGPTClassifier(FewShotGPTClassifier):
    """Dynamic few-shot single-label classifier.

    Parameters
    ----------
    n_examples : int, optional
        number of examples per class, by default 3
    openai_key : Optional[str] , default : None
        Your OpenAI API key. If None, the key will be read from the SKLLM_CONFIG_OPENAI_KEY environment variable.
    openai_org : Optional[str] , default : None
        Your OpenAI organization. If None, the organization will be read from the SKLLM_CONFIG_OPENAI_ORG
        environment variable.
    openai_model : str , default : "gpt-3.5-turbo"
        The OpenAI model to use. See https://beta.openai.com/docs/api-reference/available-models for a list of
        available models.
    default_label : Optional[Union[List[str], str]] , default : 'Random'
        The default label to use if the LLM could not generate a response for a sample. If set to 'Random' a random
        label will be chosen based on probabilities from the training set.
    """

    def __init__(
        self,
        n_examples: int = 3,
        openai_key: str | None = None,
        openai_org: str | None = None,
        openai_model: str = "gpt-3.5-turbo",
        default_label: str | None = "Random",
    ):
        super().__init__(openai_key, openai_org, openai_model, default_label)
        self.n_examples = n_examples

    def fit(
        self,
        X: np.ndarray | pd.Series | list[str],
        y: np.ndarray | pd.Series | list[str],
    ) -> DynamicFewShotGPTClassifier:
        """Fits the model to the given data.

        Parameters
        ----------
        X : Union[np.ndarray, pd.Series, List[str]]
            training data
        y : Union[np.ndarray, pd.Series, List[str]]
            training labels

        Returns
        -------
        DynamicFewShotGPTClassifier
            self
        """
        X = to_numpy(X)
        y = to_numpy(y)
        self.embedding_model_ = GPTVectorizer().fit(X)
        self.classes_, self.probabilities_ = self._get_unique_targets(y)

        self.data_ = {}
        for cls in self.classes_:
            print(f"Building index for class `{cls}` ...")
            self.data_[cls] = {}
            partition = X[y == cls]
            self.data_[cls]["partition"] = partition
            embeddings = self.embedding_model_.transform(partition)
            index = AnnoyMemoryIndex(embeddings.shape[1])
            for i, embedding in enumerate(embeddings):
                index.add(i, embedding)
            index.build()
            self.data_[cls]["index"] = index

        return self

    def _get_prompt(self, x: str) -> str:
        """Generates the prompt for the given input.

        Parameters
        ----------
        x : str
            sample to classify

        Returns
        -------
        str
            final prompt
        """
        embedding = self.embedding_model_.transform([x])
        training_data = []
        for cls in self.classes_:
            index = self.data_[cls]["index"]
            partition = self.data_[cls]["partition"]
            neighbors = index.retrieve(embedding, min(self.n_examples, len(partition)))
            neighbors = [partition[i] for i in neighbors[0]]
            training_data.extend(
                [
                    _TRAINING_SAMPLE_PROMPT_TEMPLATE.format(x=neighbor, label=cls)
                    for neighbor in neighbors
                ]
            )

        training_data_str = "\n".join(training_data)

        return build_few_shot_prompt_slc(
            x=x, training_data=training_data_str, labels=repr(self.classes_)
        )
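Since the prompt is assembled at inference time, the effect of the dynamic selection can be inspected directly via the `_get_prompt` method added above; a small illustrative sketch (the input sentence is made up, and `_get_prompt` is private API):

```python
from skllm import DynamicFewShotGPTClassifier
from skllm.datasets import get_classification_dataset

X, y = get_classification_dataset()
clf = DynamicFewShotGPTClassifier(n_examples=1).fit(X, y)

# The returned prompt embeds one nearest training example per class.
print(clf._get_prompt("The plot was dull and the acting was worse."))
```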