preparation.py
import pandas as pd
import numpy as np
from string import punctuation
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

tqdm.pandas()

# Download the NLTK resources used below (the 'stopwords' corpus is
# required for the STOPWORDS set).
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Map the first letter of a Penn Treebank POS tag to the corresponding
# WordNet POS constant (anything unmapped defaults to NOUN below).
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}
def lemmatize_words(tokens):
    """POS-tag the tokens and lemmatize each one with the matching WordNet POS."""
    pos_tagged_text = nltk.pos_tag(tokens)
    return " ".join(lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
                    for word, pos in pos_tagged_text)
def text_processing(text: str) -> str:
    # lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', punctuation))
    # remove stopwords
    tokens = [word for word in text.split() if word not in STOPWORDS]
    # lemmatization
    return lemmatize_words(tokens)
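
# Illustrative sanity check (hypothetical input; the exact output depends on
# how the POS tagger labels the remaining tokens):
#   text_processing("The cats are running quickly!")  ->  "cat run quickly"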
if __name__ == "__main__":
    df = pd.read_csv("data/ecommerceDataset.csv", names=["class", "text"])
    print(f"\nshape: {df.shape}\n")

    df["class"] = df["class"].astype(str)
    df["text"] = df["text"].astype(str)

    # Character and word counts per document.
    df.loc[:, "len"] = df.text.apply(len)
    df.loc[:, "n"] = df.text.apply(lambda x: len(x.split()))

    # Keep documents whose word count lies between the 2nd and 97th percentiles.
    min_n = np.percentile(df["n"], 2)
    max_n = np.percentile(df["n"], 97)
    print(f"percentiles 2%: {min_n}, 97%: {max_n}\n")
    df = df[(df["n"] > min_n) & (df["n"] < max_n)]
df["text"] = df["text"].progress_apply(lambda x: text_processing(x))
df = df[["text", "class"]]
df = df.drop_duplicates()
df.reset_index(inplace=True, drop=True)
print(f"\nEnd of processing shape: {df.shape}\n")
    train, test = train_test_split(
        df, test_size=0.33, shuffle=True, random_state=42, stratify=df["class"])

    # Write each split in fastText supervised format: "__label__<class> <text>",
    # pairing every split's texts with its own labels.
    with open('data/train.txt', 'w') as f:
        for each_text, each_label in zip(train['text'], train['class']):
            f.write(f'__label__{each_label.replace(" ", "_")} {each_text}\n')

    with open('data/test.txt', 'w') as f:
        for each_text, each_label in zip(test['text'], test['class']):
            f.write(f'__label__{each_label.replace(" ", "_")} {each_text}\n')

    print("wrote files for fastText training to data/train.txt and data/test.txt\n")