improve docs for MinHashDedup

argilla-io · Nov 1, 2024 · 9ccda5e · 9ccda5e
1 parent 844165f
commit 9ccda5e
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 10 deletions.
diff --git a/src/distilabel/steps/filtering/_datasketch.py b/src/distilabel/steps/filtering/_datasketch.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 """
-`dataskech` (https://github.com/ekzhu/datasketch) doesn't offer a way to store the hash tables in disk. This
-is a custom implementation that uses `shelve` to store the hash tables in disk.
+`datasketch` (https://github.com/ekzhu/datasketch) doesn't offer a way to store the hash tables on disk. This
+is a custom implementation that uses `diskcache` to store the hash tables on disk.
 Note: This implementation is not optimized for performance, but could be worth
 creating a PR to `datasketch`.
 """
@@ -98,15 +98,15 @@ def insert(self, key, *vals, **kwargs):
 
 
 def ordered_storage(config, name=None):
-    """Copy of `datasketch.storage.ordered_storage` with the addition of `ShelveListStorage`."""
+    """Copy of `datasketch.storage.ordered_storage` with the addition of `DiskCacheListStorage`."""
     tp = config["type"]
     if tp == "disk":
         return DiskCacheListStorage(config, name=name)
     return _ordered_storage(config, name=name)
 
 
 def unordered_storage(config, name=None):
-    """Copy of `datasketch.storage.ordered_storage` with the addition of `ShelveSetStorage`."""
+    """Copy of `datasketch.storage.unordered_storage` with the addition of `DiskCacheSetStorage`."""
     tp = config["type"]
     if tp == "disk":
         return DiskCacheSetStorage(config, name=name)

diff --git a/src/distilabel/steps/filtering/minhash.py b/src/distilabel/steps/filtering/minhash.py
@@ -92,12 +92,11 @@ class MinHashDedup(Step):
 
     Attributes:
         num_perm: the number of permutations to use. Defaults to `128`.
-        seed: the seed to use for the MinHash. This seed must be the same
-            used for `MinHash`, keep in mind when both steps are created. Defaults to `1`.
+        seed: the seed to use for the MinHash. Defaults to `1`.
         tokenizer: the tokenizer to use. Available ones are `words` or `ngrams`.
-            If `words` is selected, it tokenize the text into words using nltk's
+            If `words` is selected, it tokenizes the text into words using nltk's
             word tokenizer. `ngrams` estimates the ngrams (together with the size
-            `n`) using. Defaults to `words`.
+            `n`). Defaults to `words`.
         n: the size of the ngrams to use. Only relevant if `tokenizer="ngrams"`. Defaults to `5`.
         threshold: the threshold to consider two MinHashes as duplicates.
             Values closer to 0 detect more duplicates. Defaults to `0.9`.
@@ -106,8 +105,6 @@ class MinHashDedup(Step):
             not defined in `datasketch`, that is based on DiskCache's `Index` class.
             It should work like a `dict` backed by disk, although depending on the system
             it can be slower. Defaults to `dict`.
-            which uses a custom `shelve` backend. Note the `disk`
-            is an experimetal feature that may cause issues. Defaults to `dict`.
 
     Input columns:
         - text (`str`): the texts to be filtered.