ucbepic · shreyashankar · Oct 12, 2024 · Oct 10, 2024 · Oct 11, 2024 · Oct 11, 2024
diff --git a/docetl/operations/outliers.py b/docetl/operations/outliers.py
@@ -0,0 +1,82 @@
+from jinja2 import Environment, Template
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
+from .base import BaseOperation
+from .utils import RichLoopBar
+from .clustering_utils import get_embeddings_for_clustering
+
+class OutliersOperation(BaseOperation):
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.max_batch_size: int = self.config.get(
+            "max_batch_size", kwargs.get("max_batch_size", float("inf"))
+        )
+
+    def syntax_check(self) -> None:
+        """
+        Checks the configuration of the OutlierOperation for required keys and valid structure.
+
+        Raises:
+            ValueError: If required keys are missing
+        """
+
+        pass
+
+
+    def execute(
+        self, input_data: List[Dict], is_build: bool = False
+    ) -> Tuple[List[Dict], float]:
+        """
+        Executes the cluster operation on the input data. Modifies the
+        input data and returns it in place.
+
+        Args:
+            input_data (List[Dict]): A list of dictionaries to process.
+            is_build (bool): Whether the operation is being executed
+              in the build phase. Defaults to False.
+
+        Returns:
+            Tuple[List[Dict], float]: A tuple containing the filtered
+              list of dictionaries and the total cost of the operation.
+        """
+
+        embeddings, cost = get_embeddings_for_clustering(
+            input_data, self.config, self.runner.api
+        )
+        embeddings = np.array(embeddings)
+
+        if self.config.get("center", None) is not None:
+            center_embeddings, cost2 = get_embeddings_for_clustering(
+                [self.config["center"]], self.config, self.runner.api
+            )
+            cost += cost2
+            center = np.array(center_embeddings[0])
+        else:
+            center = embeddings.mean(axis=0)
+
+        distances = np.sqrt(((embeddings - center)**2).sum(axis=1))
+
+        if "samples" in self.config:
+            distance_distribution = np.sort(distances)
+            samples = self.config["samples"]
+            if isinstance(samples, float):
+                samples = int(samples * (len(distance_distribution)-1))
+            cutoff = distance_distribution[samples]
+        elif "std" in self.config:
+            cutoff = np.sqrt((embeddings.std(axis=0)**2).sum()) * self.config["std"]
+
+        if not self.config.get("keep", False):
+            include = distances <= cutoff
+        else:
+            include = distances > cutoff
+
+        return [
+            item
+            for idx, item in enumerate(input_data)
+            if include[idx]], cost
+
diff --git a/docs/operators/outliers.md b/docs/operators/outliers.md
@@ -0,0 +1,60 @@
+# Outliers operation
+
+The Outliers operation in DocETL removes outliers from the input (or
+keeps only outliers).
+
+## 🚀 Example: 
+
+```yaml
+- name: remove-worst-10
+  type: outliers
+  samples: 0.9
+  embedding_keys:
+   - concept
+   - description
+```
+
+This keeps the 90 percent of items closest to the center (average)
+embedding of the keys provided. Alternatively, you can set `samples`
+to an integer count of items to keep (or a negative count of items to
+throw away). You can also assume a Gaussian distribution and set `std`
+to a number of standard deviations out from the center, instead of
+setting `samples`.
+
+A small note about embeddings: if the embedded values are too short,
+some embedding models will yield a very "sparse" distribution, where
+the vast majority of points lie on the surface of a hypersphere,
+meaning that this operation will not work very well!
+
+### Using it as a poor man's RAG
+```yaml
+- name: remove-worst-10
+  type: outliers
+  samples: 0.01
+  embedding_keys:
+   - concept
+   - description
+  center:
+    concept: Horse
+    description: A horse is a large steppe roaming and grazing animal. Humans have utilized horses for transport throughout historical times
+```
+
+If center is provided, it must have the same keys as those listed
+under embedding_keys, and their values will be used to calculate the
+"center" embedding, instead of using the average of all embeddings of
+the input items. This will effectively turn this into a search
+operation for items similar to the center provided.
+
+## Required Parameters
+
+- `name`: A unique name for the operation.
+- `type`: Must be set to "outliers".
+- `samples`: Either an integer count of items to keep, or a float fraction of items to keep (set `std` instead to choose the cutoff in standard deviations).
+- `embedding_keys`: A list of keys to use for the embedding distance calculation.
+
+## Optional Parameters
+
+| Parameter                 | Description                                                                      | Default                       |
+| ------------------------- | -------------------------------------------------------------------------------- | ----------------------------- |
+| `keep`                    | If set to true, return the outliers instead of the non-outliers | false |
+| `center`                  | An explicit center object to be used to calculate the center embedding instead of using the average | The average embedding of all input data |
diff --git a/pyproject.toml b/pyproject.toml
@@ -88,6 +88,7 @@ reduce = "docetl.operations.reduce:ReduceOperation"
 resolve = "docetl.operations.resolve:ResolveOperation"
 gather = "docetl.operations.gather:GatherOperation"
 cluster = "docetl.operations.cluster:ClusterOperation"
+outliers = "docetl.operations.outliers:OutliersOperation"
 sample = "docetl.operations.sample:SampleOperation"
 
 [tool.poetry.plugins."docetl.parser"]