"""text_analytics_tool.py

Read text snippets from the first five columns of an Excel workbook and write
keyword-density counts, whole-text statistics, and per-sentence sentiment and
readability scores to a new workbook.
"""
import re
from collections import defaultdict

import nltk
import textstat
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.util import ngrams
from openpyxl import load_workbook, Workbook

# One-time NLTK resources (newer NLTK releases may also need "punkt_tab").
nltk.download("vader_lexicon", quiet=True)
nltk.download("punkt", quiet=True)


def get_ngrams(text, n):
    """Return the list of n-grams over the lowercased word tokens of text."""
    tokens = nltk.word_tokenize(text.lower())
    return list(ngrams(tokens, n))
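# Example (illustrative):
#   >>> get_ngrams("The quick brown fox", 2)
#   [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]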


def get_data_from_file(file_name):
    """Read the first five columns of the active sheet into a flat list of strings."""
    wb = load_workbook(file_name)
    ws = wb.active
    data = []
    for row in ws.iter_rows(min_row=1, max_col=5, values_only=True):
        # Cells may hold numbers or be empty; keep only non-empty values, as text.
        data.extend(str(cell) for cell in row if cell is not None)
    return data
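# Assumed input layout: each populated cell in columns A-E of the active sheet
# holds one sentence or short text snippet. Row 1 is treated as data, not as a
# header (min_row=1 above).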


def keyword_density(data):
    """Count every 1- to 6-gram across all sentences."""
    ngram_counts = defaultdict(int)
    for sentence in data:
        if not sentence:
            continue
        tokens = nltk.word_tokenize(sentence)
        # Strip punctuation from tokens and drop any that become empty.
        tokens = [re.sub(r"[^\w\s]", "", token) for token in tokens]
        tokens = [token for token in tokens if token]
        # Note: tokens keep their original case, so "Data" and "data"
        # are counted separately.
        for n in range(1, 7):
            for ngram in ngrams(tokens, n):
                ngram_counts[ngram] += 1
    return ngram_counts
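# Example (illustrative): keyword_density(["big data, big ideas"]) counts the
# unigram ('big',) twice and the bigram ('big', 'data') once, along with every
# other 1- to 6-gram in the sentence.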


def content_analysis(data):
    """Compute whole-text statistics and readability scores."""
    full_text = " ".join(sentence for sentence in data if sentence)
    tokens = nltk.word_tokenize(full_text)
    unique_tokens = set(tokens)
    # Share of distinct tokens, as a percentage; guard against empty input.
    lexical_density = len(unique_tokens) / len(tokens) * 100 if tokens else 0.0
    return {
        "Word Count": textstat.lexicon_count(full_text),
        "Character Length": len(full_text),
        "Letters": sum(c.isalpha() for c in full_text),
        "Sentences": textstat.sentence_count(full_text),
        "Syllables": textstat.syllable_count(full_text),
        "Average Words/Sentence": textstat.avg_sentence_length(full_text),
        "Average Syllables/Word": textstat.avg_syllables_per_word(full_text),
        "Lexical Density": lexical_density,
        # text_standard() reports a consensus grade level across the
        # readability tests, not a lexical-diversity measure.
        "Consensus Grade (text_standard)": textstat.text_standard(full_text, float_output=True),
        "Reading Ease": textstat.flesch_reading_ease(full_text),
        "Grade Level": textstat.flesch_kincaid_grade(full_text),
        "Gunning Fog": textstat.gunning_fog(full_text),
        "Coleman Liau Index": textstat.coleman_liau_index(full_text),
        "Smog Index": textstat.smog_index(full_text),
        "Automated Reading Index": textstat.automated_readability_index(full_text),
    }
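# Rough interpretation: Flesch Reading Ease around 60-70 reads as plain
# English (higher is easier), and Flesch-Kincaid Grade approximates the US
# school grade needed to follow the text comfortably.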


def sentence_analysis(data):
    """Score each sentence for VADER sentiment and per-sentence readability."""
    sia = SentimentIntensityAnalyzer()
    sentence_data = []
    for sentence in data:
        if not sentence:
            continue
        sentiment = sia.polarity_scores(sentence)
        readability_scores = {
            "Reading Ease": textstat.flesch_reading_ease(sentence),
            "Grade Level": textstat.flesch_kincaid_grade(sentence),
            "Gunning Fog": textstat.gunning_fog(sentence),
            "Coleman Liau Index": textstat.coleman_liau_index(sentence),
            # SMOG is designed for samples of 30+ sentences, so per-sentence
            # values are not meaningful on their own.
            "Smog Index": textstat.smog_index(sentence),
            "Automated Reading Index": textstat.automated_readability_index(sentence),
        }
        sentence_data.append({
            "Sentence": sentence,
            # VADER's compound score is normalized to the range [-1, 1].
            "Compound": sentiment["compound"],
            "Words": textstat.lexicon_count(sentence),
            **readability_scores,
        })
    return sentence_data
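# Illustrative behavior: the compound score is positive for "I love this.",
# negative for "I hate this.", and near zero for neutral statements.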


def write_to_excel(ngram_counts, content_data, sentence_data, output_file):
    """Write the three analyses to separate sheets of a new workbook."""
    wb = Workbook()

    keyword_density_ws = wb.active
    keyword_density_ws.title = "Keyword density"
    # The third column holds raw occurrence counts, not normalized densities.
    keyword_density_ws.append(["Phrase Length", "Keyword Phrase", "Occurrences"])
    for ngram, count in ngram_counts.items():
        keyword_density_ws.append([len(ngram), " ".join(ngram), count])

    # content_analysis() returns a flat dict, so each metric is a single row.
    content_analysis_ws = wb.create_sheet("Content Analysis")
    for key, value in content_data.items():
        content_analysis_ws.append([key, value])

    sentence_analysis_ws = wb.create_sheet("Sentence Analysis")
    headers = ["Sentence", "Compound", "Words", "Reading Ease", "Grade Level",
               "Gunning Fog", "Coleman Liau Index", "Smog Index",
               "Automated Reading Index"]
    sentence_analysis_ws.append(headers)
    for s_data in sentence_data:
        sentence_analysis_ws.append([s_data[h] for h in headers])

    wb.save(output_file)


def main():
    # Placeholder path; point this at the workbook you want to analyze.
    input_file = "your_file.xlsx"
    output_file = "data_analytics.xlsx"
    data = get_data_from_file(input_file)
    ngram_counts = keyword_density(data)
    content_data = content_analysis(data)
    sentence_data = sentence_analysis(data)
    write_to_excel(ngram_counts, content_data, sentence_data, output_file)


if __name__ == "__main__":
    main()
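# Usage sketch (assumes the dependencies are installed):
#   pip install nltk textstat openpyxl
#   python text_analytics_tool.py
# Results are written to data_analytics.xlsx with three sheets:
# "Keyword density", "Content Analysis", and "Sentence Analysis".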