-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_collocates.py
111 lines (82 loc) · 4.21 KB
/
get_collocates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""Python script to get collocates of a node word in a directory of TXT files.
Earl K. Brown, ekbrown byu edu (add appropriate characters to create email)
"""
import os, re, time, pandas as pd
# record the wall-clock start time so the elapsed seconds can be printed at the end of the script
start = time.time()
def find_collocates(dir_with_txt, node_wd, stopwords=None, span=4, side="both", min_freq=2):
    """Get the collocates (neighboring words) of a node word.

    dir_with_txt: Pathway to directory with TXT files; other file types are ignored.
    node_wd: Node word whose collocates will be found, given as a regular
        expression string that is matched case-insensitively against each word.
    stopwords: An iterable of stopwords to exclude as possible collocates
        (default is None, meaning no stopwords are excluded).
    span: The span in number of words around the node word to look for collocates (default is 4).
    side: Which side, or both, of the node word to look for collocates,
        from among "both", "left", "right" (default is "both").
    min_freq: Minimum frequency that a word must have to be considered a collocate (default is 2).
    return value: A Pandas DataFrame with the columns "collocate" (uppercased word)
        and "freq", sorted by descending frequency and then by ascending collocate.
    raises: ValueError if 'span' is not a positive integer or 'side' is not one of
        the accepted values.

    Collocates are looked for within a single line only; the window never
    crosses a line boundary. The current working directory is left untouched.
    """

    ### verify arguments given by user
    # test the type first so that a non-numeric span gets the intended message
    # instead of failing on the "<=" comparison
    if not isinstance(span, int) or span <= 0:
        raise ValueError("In the call to find_collocates(), you need to supply a positive integer to the argument 'span'.")

    # offsets relative to the node word; 0 (the node word itself) is never included
    side_lc = side.lower()
    if side_lc == "both":
        offsets = [k for k in range(-span, span + 1) if k != 0]
    elif side_lc == "left":
        offsets = list(range(-span, 0))
    elif side_lc == "right":
        offsets = list(range(1, span + 1))
    else:
        raise ValueError("In the call to find_collocates(), you need to specify 'side' as either 'left', 'right', or 'both'.")

    # make stopwords uppercase; a set gives constant-time membership tests
    if stopwords is None:
        stopwords_uc = set()
    else:
        stopwords_uc = {w.upper() for w in stopwords}
    ### end data verification

    # compile the patterns once instead of on every line and word
    node_re = re.compile(node_wd, flags=re.I)
    splitter_re = re.compile(r"[^-'a-z]+", flags=re.I)

    # creates collector dictionary
    freqs_dict = {}

    # gets .txt filenames; paths are joined instead of calling os.chdir(), so
    # the caller's working directory is preserved even if an error occurs
    filenames = [f for f in os.listdir(dir_with_txt) if re.search(r"\.txt$", f, flags=re.I)]

    for filename in filenames:
        with open(os.path.join(dir_with_txt, filename)) as fin:
            for ln in fin:
                # skip lines that cannot contain the node word
                if not node_re.search(ln):
                    continue

                # split up current line into words, dropping empty strings
                wds = [w for w in splitter_re.split(ln) if w]
                n_wds = len(wds)

                for j, wd in enumerate(wds):
                    # only positions holding the node word have collocates
                    if not node_re.search(wd):
                        continue

                    for k in offsets:
                        idx = j + k
                        # explicit bounds check: a negative index must not wrap
                        # around to the end of the line
                        if idx < 0 or idx >= n_wds:
                            continue
                        collocate_wd = wds[idx].upper()
                        # don't add collocate if stopword
                        if collocate_wd not in stopwords_uc:
                            freqs_dict[collocate_wd] = freqs_dict.get(collocate_wd, 0) + 1

    freqs_df = pd.DataFrame(list(freqs_dict.items()), columns=["collocate", "freq"])

    # limit results to minimum frequency
    freqs_df = freqs_df[(freqs_df.freq >= min_freq)]

    # sort in descending order by frequency, then in ascending order by collocate
    freqs_df = freqs_df.sort_values(["freq", "collocate"], ascending=[False, True])

    return freqs_df
### test the function
from stop_words import get_stop_words  # mind the underscores

# search settings for the demo run
dir_with_txt = "/Users/ekb5/Corpora/gen_conf_trunc/"
node_wd = r"\bprophet\b"  # interpreted as a regular expression
span, side, min_freq = 4, "both", 3
stopwords = get_stop_words("en")

results = find_collocates(
    dir_with_txt,
    node_wd,
    stopwords=stopwords,
    span=span,
    side=side,
    min_freq=min_freq,
)

# show the first ten rows of the result, then the elapsed wall-clock time
print(results.head(10))
print(time.time() - start, "seconds")