Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: remove obsolete flake8 config #2066

Merged
merged 4 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .flake8

This file was deleted.

751 changes: 184 additions & 567 deletions bertopic/_bertopic.py

Large diffs are not rendered by default.

44 changes: 10 additions & 34 deletions bertopic/_save_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,7 @@ def push_to_hf_hub(
save_ctfidf: Whether to save c-TF-IDF information
"""
if not _has_hf_hub:
raise ValueError(
"Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`"
)
raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`")

# Create repo if it doesn't exist yet and infer complete repo_id
repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True)
Expand All @@ -156,9 +154,7 @@ def push_to_hf_hub(

# Add README if it does not exist
try:
get_hf_file_metadata(
hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)
)
get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
except: # noqa: E722
if model_card:
readme_text = generate_readme(model, repo_id)
Expand Down Expand Up @@ -241,13 +237,9 @@ def load_files_from_hf(path):

# c-TF-IDF
try:
ctfidf_config = load_cfg_from_json(
hf_hub_download(path, CTFIDF_CFG_NAME, revision=None)
)
ctfidf_config = load_cfg_from_json(hf_hub_download(path, CTFIDF_CFG_NAME, revision=None))
try:
ctfidf_tensors = hf_hub_download(
path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None
)
ctfidf_tensors = hf_hub_download(path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None)
ctfidf_tensors = load_safetensors(ctfidf_tensors)
except: # noqa: E722
ctfidf_tensors = hf_hub_download(path, CTFIDF_WEIGHTS_NAME, revision=None)
Expand All @@ -268,9 +260,7 @@ def load_files_from_hf(path):
topic_list = list(topics["topic_representations"].keys())
images = {}
for topic in topic_list:
image = Image.open(
hf_hub_download(path, f"images/{topic}.jpg", revision=None)
)
image = Image.open(hf_hub_download(path, f"images/{topic}.jpg", revision=None))
images[int(topic)] = image

return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
Expand All @@ -283,11 +273,7 @@ def generate_readme(model, repo_id: str):

# Get Statistics
model_name = repo_id.split("/")[-1]
params = {
param: value
for param, value in model.get_params().items()
if "model" not in param
}
params = {param: value for param, value in model.get_params().items() if "model" not in param}
params = "\n".join([f"* {param}: {value}" for param, value in params.items()])
topics = sorted(list(set(model.topics_)))
nr_topics = str(len(set(model.topics_)))
Expand All @@ -298,23 +284,15 @@ def generate_readme(model, repo_id: str):
nr_documents = ""

# Topic information
topic_keywords = [
" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics
]
topic_keywords = [" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics]
topic_freq = [model.get_topic_freq(topic) for topic in topics]
topic_labels = (
model.custom_labels_
if model.custom_labels_
else [model.topic_labels_[topic] for topic in topics]
)
topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics]
topics = [
f"| {topic} | {topic_keywords[index]} | {topic_freq[topic]} | {topic_labels[index]} | \n"
for index, topic in enumerate(topics)
]
topics = topic_table_head + "".join(topics)
frameworks = "\n".join(
[f"* {param}: {value}" for param, value in get_package_versions().items()]
)
frameworks = "\n".join([f"* {param}: {value}" for param, value in get_package_versions().items()])

# Fill Statistics into model card
model_card = model_card.replace("{MODEL_NAME}", model_name)
Expand All @@ -330,9 +308,7 @@ def generate_readme(model, repo_id: str):
if not has_visual_aspect:
model_card = model_card.replace("{PIPELINE_TAG}", "text-classification")
else:
model_card = model_card.replace(
"pipeline_tag: {PIPELINE_TAG}\n", ""
) # TODO add proper tag for this instance
model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n", "") # TODO add proper tag for this instance

return model_card

Expand Down
29 changes: 7 additions & 22 deletions bertopic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,14 @@ def check_documents_type(documents):
if not any([isinstance(doc, str) for doc in documents]):
raise TypeError("Make sure that the iterable only contains strings.")
else:
raise TypeError(
"Make sure that the documents variable is an iterable containing strings only."
)
raise TypeError("Make sure that the documents variable is an iterable containing strings only.")


def check_embeddings_shape(embeddings, docs):
"""Check if the embeddings have the correct shape."""
if embeddings is not None:
if not any(
[isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]
):
raise ValueError(
"Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. "
)
if not any([isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]):
raise ValueError("Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. ")
else:
if embeddings.shape[0] != len(docs):
raise ValueError(
Expand Down Expand Up @@ -137,16 +131,11 @@ def validate_distance_matrix(X, n_samples):
# check it has correct size
n = s[0]
if n != (n_samples * (n_samples - 1) / 2):
raise ValueError(
"The condensed distance matrix must have " "shape (n*(n-1)/2,)."
)
raise ValueError("The condensed distance matrix must have " "shape (n*(n-1)/2,).")
elif len(s) == 2:
# check it has correct size
if (s[0] != n_samples) or (s[1] != n_samples):
raise ValueError(
"The distance matrix must be of shape "
"(n, n) where n is the number of samples."
)
raise ValueError("The distance matrix must be of shape " "(n, n) where n is the number of samples.")
# force zero diagonal and convert to condensed
np.fill_diagonal(X, 0)
X = squareform(X)
Expand Down Expand Up @@ -182,15 +171,11 @@ def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array:
for i in range(dists.shape[0] - 1):
if dists[i] == dists[i + 1]:
# returns the next unique distance or the current distance with the added noise
next_unique_dist = next(
(d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max
)
next_unique_dist = next((d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max)

# the noise can never be larger than the difference between the next unique distance and the current one
curr_max_noise = min(noise_max, next_unique_dist - dists_cp[i])
dists_cp[i + 1] = np.random.uniform(
low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise
)
dists_cp[i + 1] = np.random.uniform(low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise)
return dists_cp


Expand Down
4 changes: 1 addition & 3 deletions bertopic/backend/_flair.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
embeddings = []
for document in tqdm(documents, disable=not verbose):
try:
sentence = (
Sentence(document) if document else Sentence("an empty document")
)
sentence = Sentence(document) if document else Sentence("an empty document")
self.embedding_model.embed(sentence)
except RuntimeError:
sentence = Sentence("an empty document")
Expand Down
4 changes: 1 addition & 3 deletions bertopic/backend/_gensim.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
Document/words embeddings with shape (n, m) with `n` documents/words
that each have an embeddings size of `m`
"""
vector_shape = self.embedding_model.get_vector(
list(self.embedding_model.index_to_key)[0]
).shape[0]
vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0]
empty_vector = np.zeros(vector_shape)

# Extract word embeddings and pool to document-level
Expand Down
14 changes: 5 additions & 9 deletions bertopic/backend/_hftransformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:

embeddings = []
for document, features in tqdm(
zip(
documents, self.embedding_model(dataset, truncation=True, padding=True)
),
zip(documents, self.embedding_model(dataset, truncation=True, padding=True)),
total=len(dataset),
disable=not verbose,
):
Expand All @@ -79,12 +77,10 @@ def _embed(self, document: str, features: np.ndarray) -> np.ndarray:
https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers
"""
token_embeddings = np.array(features)
attention_mask = self.embedding_model.tokenizer(
document, truncation=True, padding=True, return_tensors="np"
)["attention_mask"]
input_mask_expanded = np.broadcast_to(
np.expand_dims(attention_mask, -1), token_embeddings.shape
)
attention_mask = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")[
"attention_mask"
]
input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
sum_mask = np.clip(
input_mask_expanded.sum(1),
Expand Down
23 changes: 6 additions & 17 deletions bertopic/backend/_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,7 @@ def __init__(
except: # noqa: E722
self.tokenizer = None

def embed(
self, documents: List[str], images: List[str] = None, verbose: bool = False
) -> np.ndarray:
def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words or images into an n-dimensional
matrix of embeddings.

Expand Down Expand Up @@ -124,9 +122,7 @@ def embed(
elif image_embeddings is not None:
return image_embeddings

def embed_documents(
self, documents: List[str], verbose: bool = False
) -> np.ndarray:
def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand All @@ -139,9 +135,7 @@ def embed_documents(
that each have an embeddings size of `m`
"""
truncated_docs = [self._truncate_document(doc) for doc in documents]
embeddings = self.embedding_model.encode(
truncated_docs, show_progress_bar=verbose
)
embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
return embeddings

def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
Expand Down Expand Up @@ -170,15 +164,12 @@ def embed_images(self, images, verbose):
end_index = (i * self.batch_size) + self.batch_size

images_to_embed = [
Image.open(image) if isinstance(image, str) else image
for image in images[start_index:end_index]
Image.open(image) if isinstance(image, str) else image for image in images[start_index:end_index]
]
if self.image_model is not None:
img_emb = self.image_model.encode(images_to_embed)
else:
img_emb = self.embedding_model.encode(
images_to_embed, show_progress_bar=False
)
img_emb = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
embeddings.extend(img_emb.tolist())

# Close images
Expand All @@ -191,9 +182,7 @@ def embed_images(self, images, verbose):
if self.image_model is not None:
embeddings = self.image_model.encode(images_to_embed)
else:
embeddings = self.embedding_model.encode(
images_to_embed, show_progress_bar=False
)
embeddings = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
return embeddings

def _truncate_document(self, document):
Expand Down
8 changes: 2 additions & 6 deletions bertopic/backend/_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
response = self.client.embeddings.create(
input=batch, **self.generator_kwargs
)
response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
embeddings.extend([r.embedding for r in response.data])

# Delay subsequent calls
Expand All @@ -81,9 +79,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:

# Extract embeddings all at once
else:
response = self.client.embeddings.create(
input=prepared_documents, **self.generator_kwargs
)
response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
embeddings = [r.embedding for r in response.data]
return np.array(embeddings)

Expand Down
5 changes: 1 addition & 4 deletions bertopic/backend/_use.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,6 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
that each have an embeddings size of `m`
"""
embeddings = np.array(
[
self.embedding_model([doc]).cpu().numpy()[0]
for doc in tqdm(documents, disable=not verbose)
]
[self.embedding_model([doc]).cpu().numpy()[0] for doc in tqdm(documents, disable=not verbose)]
)
return embeddings
16 changes: 4 additions & 12 deletions bertopic/backend/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,7 @@
]


def select_backend(
embedding_model, language: str = None, verbose: bool = False
) -> BaseEmbedder:
def select_backend(embedding_model, language: str = None, verbose: bool = False) -> BaseEmbedder:
"""Select an embedding model based on language or a specific provided model.
When selecting a language, we choose all-MiniLM-L6-v2 for English and
paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it supports 100+ languages.
Expand Down Expand Up @@ -115,9 +113,7 @@ def select_backend(
return USEBackend(embedding_model)

# Sentence Transformer embeddings
if "sentence_transformers" in str(type(embedding_model)) or isinstance(
embedding_model, str
):
if "sentence_transformers" in str(type(embedding_model)) or isinstance(embedding_model, str):
from ._sentencetransformers import SentenceTransformerBackend

return SentenceTransformerBackend(embedding_model)
Expand All @@ -134,13 +130,9 @@ def select_backend(
from ._sentencetransformers import SentenceTransformerBackend

if language.lower() in ["English", "english", "en"]:
return SentenceTransformerBackend(
"sentence-transformers/all-MiniLM-L6-v2"
)
return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
elif language.lower() in languages or language == "multilingual":
return SentenceTransformerBackend(
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
else:
raise ValueError(
f"{language} is currently not supported. However, you can "
Expand Down
4 changes: 1 addition & 3 deletions bertopic/cluster/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):
if "cuml" in str_type_model and "hdbscan" in str_type_model:
from cuml.cluster import hdbscan as cuml_hdbscan

predictions, probabilities = cuml_hdbscan.approximate_predict(
model, embeddings
)
predictions, probabilities = cuml_hdbscan.approximate_predict(model, embeddings)
return predictions, probabilities

predictions = model.predict(embeddings)
Expand Down
4 changes: 1 addition & 3 deletions bertopic/plotting/_approximate_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,7 @@ def text_color(val):

def highligh_color(data, color="white"):
attr = "background-color: {}".format(color)
return pd.DataFrame(
np.where(data == 0, attr, ""), index=data.index, columns=data.columns
)
return pd.DataFrame(np.where(data == 0, attr, ""), index=data.index, columns=data.columns)

if len(df) == 0:
return df
Expand Down
Loading