Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed the Issue in utils.py #52

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions src/kwx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import gensim
import pandas as pd
import spacy
from spacy import __version__ as spacy_version
from googletrans import Translator
from nltk.stem.snowball import SnowballStemmer
from stopwordsiso import stopwords
Expand Down Expand Up @@ -63,24 +64,19 @@ def load_data(data, target_cols=None):
The texts as a df.
"""
if isinstance(data, str):
if data[-len("xlsx") :] == "xlsx":
df_texts = pd.read_excel(io=data)
elif data[-len("csv") :] == "csv":
df_texts = pd.read_csv(filepath_or_buffer=data)
if data.endswith("xlsx"):
df_texts = pd.read_excel(data)
elif data.endswith("csv"):
df_texts = pd.read_csv(data)
else:
ValueError("Strings passed should be paths to csv or xlsx files.")

raise ValueError("Strings passed should be paths to csv or xlsx files.")
elif isinstance(data, pd.DataFrame):
df_texts = data

elif isinstance(data, pd.Series):
df_texts = pd.DataFrame(data).reset_index(drop=True)
df_texts.columns = data.index.values.tolist()

else:
ValueError(
"The 'data' argument should be either the name of a csv/xlsx file a pandas dataframe."
)
raise ValueError("The 'data' argument should be either the name of a csv/xlsx file or a pandas dataframe.")

if target_cols is None:
target_cols = df_texts.columns
Expand Down Expand Up @@ -178,19 +174,20 @@ def _lemmatize(tokens, nlp=None, verbose=True):
Tokens to be lemmatized.

nlp : spacy.load object
A spacy language model.
A SpaCy language model.

verbose : bool (default=True)
Whether to show a tqdm progress bar for the query.

Returns
-------
base_tokens : list or list of lists
Tokens that have been lemmatized for nlp analysis.
Tokens that have been lemmatized for NLP analysis.
"""
allowed_pos_tags = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]

base_tokens = []

for t in tqdm(
tokens,
total=len(tokens),
Expand All @@ -200,7 +197,11 @@ def _lemmatize(tokens, nlp=None, verbose=True):
):
combined_texts = _combine_texts_to_str(text_corpus=t)

lem_tokens = nlp(combined_texts)
if spacy_version >= "3.0.0":
lem_tokens = nlp(combined_texts)
else:
lem_tokens = nlp.tokenizer(combined_texts)

lemmed_tokens = [
token.lemma_ for token in lem_tokens if token.pos_ in allowed_pos_tags
]
Expand Down