Skip to content

Commit

Permalink
Replace fasttext and pymagnitude with staticvectors. Closes #857. Closes #858. Closes #859
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed Jan 27, 2025
1 parent 12aa955 commit f79d806
Show file tree
Hide file tree
Showing 13 changed files with 252 additions and 440 deletions.
7 changes: 0 additions & 7 deletions docs/embeddings/configuration/vectors.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,6 @@ Builds embeddings using a [Model2Vec](https://github.com/MinishLab/model2vec) mo

Builds embeddings using a word embeddings model and static vectors. While Transformers models are preferred in most cases, this method can be useful for low resource and historical languages where there isn't much linguistic data available.

#### storevectors
```yaml
storevectors: boolean
```

Enables copying of a vectors model set in path into the embeddings models output directory on save. This option enables a fully encapsulated index with no external file dependencies.

#### pca
```yaml
pca: int
Expand Down
2 changes: 1 addition & 1 deletion examples/01_Introducing_txtai.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"!pip install git+https://github.com/neuml/txtai#egg=txtai[graph]\n",
"\n",
"# Install translation pipeline dependencies for later examples\n",
"!pip install sentencepiece sacremoses fasttext"
"!pip install sentencepiece sacremoses staticvectors"
],
"execution_count": 1,
"outputs": []
Expand Down
404 changes: 164 additions & 240 deletions examples/03_Build_an_Embeddings_index_from_a_data_source.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/24_Whats_new_in_txtai_4_0.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@
"source": [
"# External vector models\n",
"\n",
"txtai supports generating vectors with [Hugging Face Transformers](https://github.com/huggingface/transformers), [PyTorch](https://github.com/pytorch/pytorch), [ONNX](https://github.com/microsoft/onnxruntime) and [Word Vector](https://github.com/neuml/magnitude) models.\n",
"txtai supports generating vectors with [Hugging Face Transformers](https://github.com/huggingface/transformers), [PyTorch](https://github.com/pytorch/pytorch), [ONNX](https://github.com/microsoft/onnxruntime) and [Word Vector](https://github.com/neuml/staticvectors) models.\n",
"\n",
"This release adds support for pre-computed vectors using external models. External models may be an API, custom library and/or another way to vectorize data. This adds flexibility given the high computation cost in building embeddings vectors. Embeddings generation could be outsourced or consolidated to a group of servers with GPUs, leaving index servers to run on lower resourced machines. \n",
"\n",
Expand Down
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@

extras["pipeline-llm"] = ["litellm>=1.37.16", "llama-cpp-python>=0.2.75"]

extras["pipeline-text"] = ["fasttext-wheel>=0.9.2", "sentencepiece>=0.1.91"]
extras["pipeline-text"] = ["sentencepiece>=0.1.91", "staticvectors>=0.2.0"]

extras["pipeline-train"] = [
"accelerate>=0.26.0",
Expand All @@ -93,14 +93,13 @@
extras["scoring"] = ["sqlalchemy>=2.0.20"]

extras["vectors"] = [
"fasttext-wheel>=0.9.2",
"litellm>=1.37.16",
"llama-cpp-python>=0.2.75",
"model2vec>=0.3.0",
"pymagnitude-lite>=0.1.43",
"scikit-learn>=0.23.1",
"sentence-transformers>=2.2.0",
"skops>=0.9.0",
"staticvectors>=0.2.0",
]

extras["workflow"] = [
Expand Down
7 changes: 0 additions & 7 deletions src/python/txtai/embeddings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import json
import os
import shutil
import tempfile

import numpy as np
Expand Down Expand Up @@ -618,12 +617,6 @@ def save(self, path, cloud=None, **kwargs):
# Create output directory, if necessary
os.makedirs(path, exist_ok=True)

# Copy vectors model
if self.config.get("storevectors"):
shutil.copyfile(self.config["path"], os.path.join(path, os.path.basename(self.config["path"])))

self.config["path"] = os.path.basename(self.config["path"])

# Save index configuration
Configuration().save(self.config, path)

Expand Down
4 changes: 0 additions & 4 deletions src/python/txtai/embeddings/index/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@ def load(self, path):
# Add format parameter
config["format"] = "json" if jsonconfig else "pickle"

# Build full path to embedding vectors file
if config.get("storevectors"):
config["path"] = os.path.join(path, config["path"])

return config

def save(self, config, path):
Expand Down
44 changes: 9 additions & 35 deletions src/python/txtai/pipeline/text/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,14 @@
Translation module
"""

import os

# Conditional import
try:
import fasttext
from staticvectors import StaticVectors

FASTTEXT = True
STATICVECTORS = True
except ImportError:
FASTTEXT = False
STATICVECTORS = False

from huggingface_hub import hf_hub_download
from huggingface_hub.hf_api import HfApi
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

Expand All @@ -26,7 +23,7 @@ class Translation(HFModel):
"""

# Default language detection model
DEFAULT_LANG_DETECT = "julien-c/fasttext-language-id/lid.176.ftz"
DEFAULT_LANG_DETECT = "neuml/language-id-quantized"

def __init__(self, path=None, quantize=False, gpu=True, batch=64, langdetect=None, findmodels=True):
"""
Expand Down Expand Up @@ -137,7 +134,7 @@ def detect(self, texts):

def defaultdetect(self, texts):
"""
Default fasttext language detection model.
Default language detection model.
Args:
texts: list of text
Expand All @@ -147,43 +144,20 @@ def defaultdetect(self, texts):
"""

if not self.detector:
if not FASTTEXT:
if not STATICVECTORS:
raise ImportError('Language detection is not available - install "pipeline" extra to enable')

# Suppress unnecessary warning
fasttext.FastText.eprint = lambda x: None

# Get model path
path = self.langdetect if self.langdetect else Translation.DEFAULT_LANG_DETECT

# Load language detection model
path = path if os.path.exists(path) else self.download(path)
self.detector = fasttext.load_model(path)
self.detector = StaticVectors(path)

# Transform texts to format expected by language detection model
texts = [x.lower().replace("\n", " ").replace("\r\n", " ") for x in texts]

return [x[0].split("__")[-1] for x in self.detector.predict(texts)[0]]

def download(self, path):
    """
    Fetches a model file from the Hugging Face Hub and returns where it was cached.

    The path is split on "/". When it has more than two segments, the first two form the
    repository id and the remainder is the file name within that repository. Otherwise, the
    first segment is the repository id and whatever follows is the file name.

    Args:
        path: full model path

    Returns:
        local cached model path
    """

    segments = path.split("/")

    # Number of leading segments that make up the repository id
    boundary = 1 if len(segments) <= 2 else 2

    repository = "/".join(segments[:boundary])
    filename = "/".join(segments[boundary:])

    # Download and cache file
    return hf_hub_download(repo_id=repository, filename=filename)
# Detect languages
return [x[0][0] for x in self.detector.predict(texts)]

def translate(self, texts, source, target, showmodels=False):
"""
Expand Down
2 changes: 1 addition & 1 deletion src/python/txtai/vectors/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def method(config):
method = "llama.cpp"
elif Model2Vec.ismodel(path):
method = "model2vec"
elif WordVectors.isdatabase(path):
elif WordVectors.ismodel(path):
method = "words"
else:
method = "transformers"
Expand Down
133 changes: 55 additions & 78 deletions src/python/txtai/vectors/words.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,24 @@
Word Vectors module
"""

import json
import logging
import os
import tempfile

from errno import ENOENT
from multiprocessing import Pool

import numpy as np

# Conditionally import Word Vector libraries as they aren't installed by default
from transformers.utils import cached_file

# Conditional import
try:
import fasttext
from pymagnitude import converter, Magnitude
from staticvectors import Database, StaticVectors

WORDS = True
STATICVECTORS = True
except ImportError:
WORDS = False
STATICVECTORS = False

from ..pipeline import Tokenizer

Expand Down Expand Up @@ -66,24 +67,59 @@ class WordVectors(Vectors):
Builds vectors using weighted word embeddings.
"""

@staticmethod
def ismodel(path):
    """
    Tests whether path refers to a WordVectors model.

    A path qualifies when it is a SQLite vectors database or when a config.json resolved
    for it declares a model_type of "staticvectors".

    Args:
        path: input path

    Returns:
        True if this is a WordVectors model, False otherwise
    """

    # SQLite databases are accepted without looking any further
    if WordVectors.isdatabase(path):
        return True

    try:
        # Resolve config.json for this path, nothing to inspect when it can't be found
        resolved = cached_file(path_or_repo_id=path, filename="config.json")
        if not resolved:
            return False

        with open(resolved, encoding="utf-8") as handle:
            settings = json.load(handle)

    # Ignore this error - invalid repo or directory
    except OSError:
        return False

    return settings.get("model_type") == "staticvectors"

@staticmethod
def isdatabase(path):
"""
Checks if this is a SQLite database file which is the file format used for word vectors databases.
Args:
path: path to check
Returns:
True if this is a SQLite database
"""

return isinstance(path, str) and Database.isdatabase(path)

def __init__(self, config, scoring, models):
# Check before parent constructor since it calls loadmodel
if not WORDS:
# Raise error if trying to create Word Vectors without vectors extra
raise ImportError(
'Word vector models are not available - install "vectors" extra to enable. Otherwise, specify '
+ 'method="transformers" to use transformer backed models'
)
if not STATICVECTORS:
raise ImportError('staticvectors is not available - install "vectors" extra to enable')

super().__init__(config, scoring, models)

def loadmodel(self, path):
# Ensure that vector path exists
if not path or not os.path.isfile(path):
raise IOError(ENOENT, "Vector model file not found", path)

# Load magnitude model. If this is a training run (uninitialized config), block until vectors are fully loaded
return Magnitude(path, case_insensitive=True, blocking=not self.initialized)
return StaticVectors(path)

def encode(self, data):
# Iterate over each data element, tokenize (if necessary) and build an aggregated embeddings vector
Expand Down Expand Up @@ -156,67 +192,8 @@ def lookup(self, tokens):
word vectors array
"""

return self.model.query(tokens)
return self.model.embeddings(tokens)

def tokens(self, data):
    """
    Returns data unchanged.

    Args:
        data: input data

    Returns:
        data, as passed in
    """

    # Skip tokenization rules
    return data

@staticmethod
def isdatabase(path):
"""
Checks if this is a SQLite database file which is the file format used for word vectors databases.
Args:
path: path to check
Returns:
True if this is a SQLite database
"""

if isinstance(path, str) and os.path.isfile(path) and os.path.getsize(path) >= 100:
# Read 100 byte SQLite header
with open(path, "rb") as f:
header = f.read(100)

# Check for SQLite header
return header.startswith(b"SQLite format 3\000")

return False

@staticmethod
def build(data, size, mincount, path):
    """
    Builds fastText vectors from a file.

    Trains an unsupervised fastText model on data, writes the word vectors in vec/txt format
    to path + ".txt" and converts that file to a magnitude database at path + ".magnitude".

    Args:
        data: path to input data file
        size: number of vector dimensions
        mincount: minimum number of occurrences required to register a token
        path: path to output file
    """

    # Train unsupervised fastText model on data file
    model = fasttext.train_unsupervised(data, dim=size, minCount=mincount)

    logger.info("Building %d dimension model", size)

    # Skip end of line token. Filter before writing the header so that the declared
    # word count matches the number of rows actually written.
    words = [word for word in model.get_words() if word != "</s>"]

    # Output vectors in vec/txt format
    with open(path + ".txt", "w", encoding="utf-8") as output:
        output.write(f"{len(words)} {model.get_dimension()}\n")

        for word in words:
            # One row per word: token followed by space separated vector components
            row = " ".join([word, *(str(v) for v in model.get_word_vector(word))])
            output.write(row + "\n")

    # Build magnitude vectors database
    logger.info("Converting vectors to magnitude format")
    converter.convert(path + ".txt", path + ".magnitude", subword=True)
Loading

0 comments on commit f79d806

Please sign in to comment.