Replace fasttext and pymagnitude with staticvectors. Closes #857. Closes #858. Closes #859
neuml · Jan 27, 2025 · f79d806 · f79d806
1 parent 12aa955
commit f79d806
Show file tree

Hide file tree

Showing 13 changed files with 252 additions and 440 deletions.
diff --git a/docs/embeddings/configuration/vectors.md b/docs/embeddings/configuration/vectors.md
@@ -56,13 +56,6 @@ Builds embeddings using a [Model2Vec](https://github.com/MinishLab/model2vec) mo
 
 Builds embeddings using a word embeddings model and static vectors. While Transformers models are preferred in most cases, this method can be useful for low resource and historical languages where there isn't much linguistic data available.
 
-#### storevectors
-```yaml
-storevectors: boolean
-```
-
-Enables copying of a vectors model set in path into the embeddings models output directory on save. This option enables a fully encapsulated index with no external file dependencies.
-
 #### pca
 ```yaml
 pca: int

diff --git a/examples/01_Introducing_txtai.ipynb b/examples/01_Introducing_txtai.ipynb
@@ -80,7 +80,7 @@
         "!pip install git+https://github.com/neuml/txtai#egg=txtai[graph]\n",
         "\n",
         "# Install translation pipeline dependencies for later examples\n",
-        "!pip install sentencepiece sacremoses fasttext"
+        "!pip install sentencepiece sacremoses staticvectors"
       ],
       "execution_count": 1,
       "outputs": []

diff --git a/examples/03_Build_an_Embeddings_index_from_a_data_source.ipynb b/examples/03_Build_an_Embeddings_index_from_a_data_source.ipynb
diff --git a/examples/24_Whats_new_in_txtai_4_0.ipynb b/examples/24_Whats_new_in_txtai_4_0.ipynb
@@ -429,7 +429,7 @@
       "source": [
         "# External vector models\n",
         "\n",
-        "txtai supports generating vectors with [Hugging Face Transformers](https://github.com/huggingface/transformers), [PyTorch](https://github.com/pytorch/pytorch), [ONNX](https://github.com/microsoft/onnxruntime) and [Word Vector](https://github.com/neuml/magnitude) models.\n",
+        "txtai supports generating vectors with [Hugging Face Transformers](https://github.com/huggingface/transformers), [PyTorch](https://github.com/pytorch/pytorch), [ONNX](https://github.com/microsoft/onnxruntime) and [Word Vector](https://github.com/neuml/staticvectors) models.\n",
         "\n",
         "This release adds support for pre-computed vectors using external models. External models may be an API, custom library and/or another way to vectorize data. This adds flexibility given the high computation cost in building embeddings vectors. Embeddings generation could be outsourced or consolidated to a group of servers with GPUs, leaving index servers to run on lower resourced machines. \n",
         "\n",

diff --git a/setup.py b/setup.py
@@ -69,7 +69,7 @@
 
 extras["pipeline-llm"] = ["litellm>=1.37.16", "llama-cpp-python>=0.2.75"]
 
-extras["pipeline-text"] = ["fasttext-wheel>=0.9.2", "sentencepiece>=0.1.91"]
+extras["pipeline-text"] = ["sentencepiece>=0.1.91", "staticvectors>=0.2.0"]
 
 extras["pipeline-train"] = [
     "accelerate>=0.26.0",
@@ -93,14 +93,13 @@
 extras["scoring"] = ["sqlalchemy>=2.0.20"]
 
 extras["vectors"] = [
-    "fasttext-wheel>=0.9.2",
     "litellm>=1.37.16",
     "llama-cpp-python>=0.2.75",
     "model2vec>=0.3.0",
-    "pymagnitude-lite>=0.1.43",
     "scikit-learn>=0.23.1",
     "sentence-transformers>=2.2.0",
     "skops>=0.9.0",
+    "staticvectors>=0.2.0",
 ]
 
 extras["workflow"] = [

diff --git a/src/python/txtai/embeddings/base.py b/src/python/txtai/embeddings/base.py
@@ -4,7 +4,6 @@
 
 import json
 import os
-import shutil
 import tempfile
 
 import numpy as np
@@ -618,12 +617,6 @@ def save(self, path, cloud=None, **kwargs):
             # Create output directory, if necessary
             os.makedirs(path, exist_ok=True)
 
-            # Copy vectors model
-            if self.config.get("storevectors"):
-                shutil.copyfile(self.config["path"], os.path.join(path, os.path.basename(self.config["path"])))
-
-                self.config["path"] = os.path.basename(self.config["path"])
-
             # Save index configuration
             Configuration().save(self.config, path)
 

diff --git a/src/python/txtai/embeddings/index/configuration.py b/src/python/txtai/embeddings/index/configuration.py
@@ -41,10 +41,6 @@ def load(self, path):
         # Add format parameter
         config["format"] = "json" if jsonconfig else "pickle"
 
-        # Build full path to embedding vectors file
-        if config.get("storevectors"):
-            config["path"] = os.path.join(path, config["path"])
-
         return config
 
     def save(self, config, path):

diff --git a/src/python/txtai/pipeline/text/translation.py b/src/python/txtai/pipeline/text/translation.py
@@ -2,17 +2,14 @@
 Translation module
 """
 
-import os
-
 # Conditional import
 try:
-    import fasttext
+    from staticvectors import StaticVectors
 
-    FASTTEXT = True
+    STATICVECTORS = True
 except ImportError:
-    FASTTEXT = False
+    STATICVECTORS = False
 
-from huggingface_hub import hf_hub_download
 from huggingface_hub.hf_api import HfApi
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 
@@ -26,7 +23,7 @@ class Translation(HFModel):
     """
 
     # Default language detection model
-    DEFAULT_LANG_DETECT = "julien-c/fasttext-language-id/lid.176.ftz"
+    DEFAULT_LANG_DETECT = "neuml/language-id-quantized"
 
     def __init__(self, path=None, quantize=False, gpu=True, batch=64, langdetect=None, findmodels=True):
         """
@@ -137,7 +134,7 @@ def detect(self, texts):
 
     def defaultdetect(self, texts):
         """
-        Default fasttext language detection model.
+        Default language detection model.
 
         Args:
             texts: list of text
@@ -147,43 +144,20 @@ def defaultdetect(self, texts):
         """
 
         if not self.detector:
-            if not FASTTEXT:
+            if not STATICVECTORS:
                 raise ImportError('Language detection is not available - install "pipeline" extra to enable')
 
-            # Suppress unnecessary warning
-            fasttext.FastText.eprint = lambda x: None
-
             # Get model path
             path = self.langdetect if self.langdetect else Translation.DEFAULT_LANG_DETECT
 
             # Load language detection model
-            path = path if os.path.exists(path) else self.download(path)
-            self.detector = fasttext.load_model(path)
+            self.detector = StaticVectors(path)
 
         # Transform texts to format expected by language detection model
         texts = [x.lower().replace("\n", " ").replace("\r\n", " ") for x in texts]
 
-        return [x[0].split("__")[-1] for x in self.detector.predict(texts)[0]]
-
-    def download(self, path):
-        """
-        Downloads path from the Hugging Face Hub.
-
-        Args:
-            path: full model path
-
-        Returns:
-            local cached model path
-        """
-
-        # Split into parts
-        parts = path.split("/")
-
-        # Calculate repo id split
-        repo = 2 if len(parts) > 2 else 1
-
-        # Download and cache file
-        return hf_hub_download(repo_id="/".join(parts[:repo]), filename="/".join(parts[repo:]))
+        # Detect languages
+        return [x[0][0] for x in self.detector.predict(texts)]
 
     def translate(self, texts, source, target, showmodels=False):
         """

diff --git a/src/python/txtai/vectors/factory.py b/src/python/txtai/vectors/factory.py
@@ -91,7 +91,7 @@ def method(config):
                     method = "llama.cpp"
                 elif Model2Vec.ismodel(path):
                     method = "model2vec"
-                elif WordVectors.isdatabase(path):
+                elif WordVectors.ismodel(path):
                     method = "words"
                 else:
                     method = "transformers"

diff --git a/src/python/txtai/vectors/words.py b/src/python/txtai/vectors/words.py
@@ -2,23 +2,24 @@
 Word Vectors module
 """
 
+import json
 import logging
 import os
 import tempfile
 
-from errno import ENOENT
 from multiprocessing import Pool
 
 import numpy as np
 
-# Conditionally import Word Vector libraries as they aren't installed by default
+from transformers.utils import cached_file
+
+# Conditional import
 try:
-    import fasttext
-    from pymagnitude import converter, Magnitude
+    from staticvectors import Database, StaticVectors
 
-    WORDS = True
+    STATICVECTORS = True
 except ImportError:
-    WORDS = False
+    STATICVECTORS = False
 
 from ..pipeline import Tokenizer
 
@@ -66,24 +67,59 @@ class WordVectors(Vectors):
     Builds vectors using weighted word embeddings.
     """
 
+    @staticmethod
+    def ismodel(path):
+        """
+        Checks if path is a WordVectors model.
+
+        Args:
+            path: input path
+
+        Returns:
+            True if this is a WordVectors model, False otherwise
+        """
+
+        # Check if this is a SQLite database
+        if WordVectors.isdatabase(path):
+            return True
+
+        try:
+            # Download file and parse JSON
+            path = cached_file(path_or_repo_id=path, filename="config.json")
+            if path:
+                with open(path, encoding="utf-8") as f:
+                    config = json.load(f)
+                    return config.get("model_type") == "staticvectors"
+
+        # Ignore this error - invalid repo or directory
+        except OSError:
+            pass
+
+        return False
+
+    @staticmethod
+    def isdatabase(path):
+        """
+        Checks if this is a SQLite database file which is the file format used for word vectors databases.
+
+        Args:
+            path: path to check
+
+        Returns:
+            True if this is a SQLite database
+        """
+
+        return isinstance(path, str) and Database.isdatabase(path)
+
     def __init__(self, config, scoring, models):
         # Check before parent constructor since it calls loadmodel
-        if not WORDS:
-            # Raise error if trying to create Word Vectors without vectors extra
-            raise ImportError(
-                'Word vector models are not available - install "vectors" extra to enable. Otherwise, specify '
-                + 'method="transformers" to use transformer backed models'
-            )
+        if not STATICVECTORS:
+            raise ImportError('staticvectors is not available - install "vectors" extra to enable')
 
         super().__init__(config, scoring, models)
 
     def loadmodel(self, path):
-        # Ensure that vector path exists
-        if not path or not os.path.isfile(path):
-            raise IOError(ENOENT, "Vector model file not found", path)
-
-        # Load magnitude model. If this is a training run (uninitialized config), block until vectors are fully loaded
-        return Magnitude(path, case_insensitive=True, blocking=not self.initialized)
+        return StaticVectors(path)
 
     def encode(self, data):
         # Iterate over each data element, tokenize (if necessary) and build an aggregated embeddings vector
@@ -156,67 +192,8 @@ def lookup(self, tokens):
             word vectors array
         """
 
-        return self.model.query(tokens)
+        return self.model.embeddings(tokens)
 
     def tokens(self, data):
         # Skip tokenization rules
         return data
-
-    @staticmethod
-    def isdatabase(path):
-        """
-        Checks if this is a SQLite database file which is the file format used for word vectors databases.
-
-        Args:
-            path: path to check
-
-        Returns:
-            True if this is a SQLite database
-        """
-
-        if isinstance(path, str) and os.path.isfile(path) and os.path.getsize(path) >= 100:
-            # Read 100 byte SQLite header
-            with open(path, "rb") as f:
-                header = f.read(100)
-
-            # Check for SQLite header
-            return header.startswith(b"SQLite format 3\000")
-
-        return False
-
-    @staticmethod
-    def build(data, size, mincount, path):
-        """
-        Builds fastText vectors from a file.
-
-        Args:
-            data: path to input data file
-            size: number of vector dimensions
-            mincount: minimum number of occurrences required to register a token
-            path: path to output file
-        """
-
-        # Train on data file using largest dimension size
-        model = fasttext.train_unsupervised(data, dim=size, minCount=mincount)
-
-        # Output file path
-        logger.info("Building %d dimension model", size)
-
-        # Output vectors in vec/txt format
-        with open(path + ".txt", "w", encoding="utf-8") as output:
-            words = model.get_words()
-            output.write(f"{len(words)} {model.get_dimension()}\n")
-
-            for word in words:
-                # Skip end of line token
-                if word != "</s>":
-                    vector = model.get_word_vector(word)
-                    data = ""
-                    for v in vector:
-                        data += " " + str(v)
-
-                    output.write(word + data + "\n")
-
-        # Build magnitude vectors database
-        logger.info("Converting vectors to magnitude format")
-        converter.convert(path + ".txt", path + ".magnitude", subword=True)