-
Notifications
You must be signed in to change notification settings - Fork 2
/
word-count-hisgram.py
102 lines (75 loc) · 3.99 KB
/
word-count-hisgram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""Python script to create a histogram of words in a text file.
Usage: python word_frequency.py -f "/path/to/file.txt" -n 200
Specify the path to the text file as above. Manually specify the top N words to report (default 100).
Text file can contain punctuation, new lines, etc., but special characters aren't handled well.
"""
import os
import sys
import string
import argparse
import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
# Module metadata: author contact details, version string and creation date.
__author__ = 'Nick Powell (PhD student, CMIC & CABI, UCL, UK), nicholas.powell.11@ucl.ac.uk'
__version__ = '0.2.20150303'
__created__ = '2014-12-18, Thursday'
def _count_words(text):
    """Return a Counter mapping each lower-cased word in *text* to its frequency.

    Commas are treated as word separators; every other character in
    ``string.punctuation`` is deleted outright (so "don't" is counted as
    "dont"). The remaining text is split on whitespace.
    """
    punct = set(string.punctuation)
    # Turn commas into spaces first so "a,b" yields two words, not "ab".
    spaced = text.lower().replace(',', ' ')
    cleaned = ''.join(ch for ch in spaced if ch not in punct)
    return Counter(cleaned.split())


def _plot_histogram(words, counts, top_n):
    """Show a bar chart of *counts* labelled with *words*; blocks until closed."""
    positions = np.arange(len(words))
    bar_width = 0.9
    fig = plt.figure()
    fig.suptitle('Word frequency histogram, top {0}'.format(top_n), fontsize=16)
    plt.xlabel('word', fontsize=12)
    plt.ylabel('count', fontsize=12)
    # Centre the bars explicitly and put the ticks on the same positions, so
    # labels line up whichever alignment the installed matplotlib defaults to.
    plt.bar(positions, counts, bar_width, align='center', alpha=0.7, color='blue')
    plt.xticks(positions, list(words), rotation='vertical', fontsize=8)
    plt.show()


def main():
    """Count the words in a text file, print the top N and plot them.

    Command-line arguments:
        -f / --filepath  path to the text file to analyse (required).
        -n / --number    how many of the most frequent words to report
                         (default 100, must be at least 1).

    Exits via ``SystemExit`` when N is not positive (argparse usage error)
    or when the file contains no words at all.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '-f', '--filepath', dest='filepath', metavar='file path',
        help='Path to text input file to be analysed.', required=True,
    )
    parser.add_argument(
        '-n', '--number', dest='number', metavar='number',
        help='Most frequent n words will be displayed and plotted.',
        required=False, default=100, type=int,
    )
    args = parser.parse_args()

    top_n = args.number
    if top_n < 1:
        # A zero or negative N would otherwise be used as a slice bound and
        # silently report the wrong set of words.
        parser.error('-n/--number must be a positive integer (got {0})'.format(top_n))

    filepath = os.path.normpath(args.filepath)
    # The context manager guarantees the handle is closed even if reading fails.
    with open(filepath, 'r') as text_file:
        counts_all = _count_words(text_file.read())

    if not counts_all:
        # Nothing to unpack, tabulate or plot; stop with a clear message
        # rather than failing later on an empty zip().
        sys.exit('No words found in the text file, {0}'.format(filepath))

    # most_common() sorts by descending count and keeps first-seen order for ties.
    words_sorted_top, values_sorted_top = zip(*counts_all.most_common(top_n))

    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    print("{0} unique words identified in the text file, {1}".format(len(counts_all), filepath))
    print("The top {0} words are: \n{1}".format(top_n, words_sorted_top))
    print("... their respective frequencies: \n{0}".format(values_sorted_top))
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")

    # Pandas DataFrame just for a readable tabular print-out.
    df = pd.DataFrame({'count': values_sorted_top, 'word': words_sorted_top})
    print("{0}".format(df))
    # Flush before plt.show() blocks, so the report is visible while the
    # plot window is open.
    sys.stdout.flush()

    _plot_histogram(words_sorted_top, values_sorted_top, top_n)
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()