Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX: propagate dtype to idf_ for large corpuses in PandasNormalizedTfidfVectorizer #26

Merged
merged 17 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions emm/indexing/pandas_normalized_tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@
class PandasNormalizedTfidfVectorizer(TfidfVectorizer):
"""Implementation of customized TFIDF vectorizer"""

dtype = np.float32
mbaak marked this conversation as resolved.
Show resolved Hide resolved

def __init__(self, **kwargs: Any) -> None:
"""Implementation of customized TFIDF vectorizer

Expand All @@ -53,6 +51,7 @@ def __init__(self, **kwargs: Any) -> None:
Args:
kwargs: keyword arguments are the same as for TfidfVectorizer.
"""
kwargs.setdefault("dtype", np.float32)
kwargs.update({"norm": None, "smooth_idf": True, "lowercase": True})
if kwargs.get("analyzer") in {"word", None}:
kwargs["token_pattern"] = r"\w+"
Expand All @@ -74,6 +73,8 @@ def fit(self, X: pd.Series | pd.DataFrame) -> TfidfVectorizer:
with Timer("CustomizedTfidfVectorizer.fit") as timer:
timer.label("super fit")
super().fit(X)
# scikit-learn's TfidfVectorizer does not preserve dtype for large X, so we force it here
self.idf_ = self.idf_.astype(self.dtype)
chrispyl marked this conversation as resolved.
Show resolved Hide resolved

timer.label("normalize")
n_features = self.idf_.shape[0]
Expand Down
2 changes: 1 addition & 1 deletion emm/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,6 @@
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

VERSION = "2.1.5"
VERSION = "2.1.6"

__version__ = VERSION
20 changes: 20 additions & 0 deletions tests/integration/test_pandas_em.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import logging
import os
import uuid

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -135,6 +136,25 @@ def test_pandas_tfidf(dtype):
np.testing.assert_allclose(actual_value, exp_value, rtol=0, atol=0.001)


def test_pandas_tfidf_default_dtype():
    """Check that ``idf_`` is float32 when no ``dtype`` is passed to the vectorizer."""
    # 100 random UUID strings serve as a small corpus of unique names.
    corpus = pd.Series([str(uuid.uuid4()) for _ in range(100)])
    vectorizer = PandasNormalizedTfidfVectorizer()
    vectorizer.fit(corpus)
    assert vectorizer.idf_.dtype == np.float32


@pytest.mark.parametrize(
    ("dtype", "data_size"),
    [
        (np.float32, 100),
        (np.float64, 100),
        (np.float32, 1_000_000),
        (np.float64, 1_000_000),
    ],
)
def test_pandas_tfidf_dtype_for_different_input_sizes(dtype, data_size):
    """Check that ``idf_`` has the requested dtype for each parametrized corpus size."""
    # Random UUID strings give `data_size` unique names to fit on.
    corpus = pd.Series([str(uuid.uuid4()) for _ in range(data_size)])
    vectorizer = PandasNormalizedTfidfVectorizer(dtype=dtype)
    vectorizer.fit(corpus)
    assert vectorizer.idf_.dtype == dtype


def test_pandas_tfidf_ngram():
pandas_t = PandasNormalizedTfidfVectorizer(binary=True, analyzer="char", ngram_range=(3, 3))
gt_names = pd.Series(["aaab", "bbbc"])
Expand Down
Loading