-
Notifications
You must be signed in to change notification settings - Fork 3
/
enron.py
70 lines (55 loc) · 2 KB
/
enron.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
python enron.py
Takes a directory for a single user from a downloaded copy of the Enron Email dataset available here
https://www.cs.cmu.edu/~enron/
Originally released into the public domain by https://www.ferc.gov/ during their investigation.
May 7th 2015 version of data
https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz
"""
# *********************************************************************************************************************
# standard imports
import os
import pathlib
import datetime
# 3rd party imports
import click
# custom imports
import humanfirst
@click.command()
@click.option('-d', '--directory', type=str, required=True, help='root path single user')
def main(directory: str) -> None:
    """Convert the files under one user's directory into a HumanFirst workspace.

    Recursively collects every file below ``directory`` whose name matches the
    glob ``*.`` (i.e. ends with a dot), reads each one as WINDOWS-1252 text,
    wraps the text in an ``HFExample`` keyed by the file path, and writes the
    resulting workspace to ``<directory>/output.json``.

    Files that cannot be decoded are still added, with empty text, after a
    message is printed.

    Args:
        directory: Root path of a single user's folder.

    Raises:
        click.BadParameter: If ``directory`` is not an existing directory.
    """
    # Validate explicitly: an ``assert`` would be stripped under ``python -O``.
    if not os.path.isdir(directory):
        raise click.BadParameter(
            f'{directory} is not a directory',
            param_hint='--directory'
        )

    # build path list; skip anything matching the pattern that is not a
    # regular file, since it could not be opened for reading below
    file_list = [
        str(path)
        for path in pathlib.Path(directory).rglob("*.")
        if path.is_file()
    ]
    print(f'Length: {len(file_list)}')

    workspace = humanfirst.objects.HFWorkspace()
    for file in file_list:
        with open(file, mode='r', encoding='WINDOWS-1252') as filehandle:
            try:
                text = filehandle.read()
            except UnicodeDecodeError:
                # Best effort: keep the example (with empty text) so the file
                # is still represented in the output.
                text = ""
                print(f'Couldn\'t read {file} because of encoding')

        example = humanfirst.objects.HFExample(
            text=text,
            id=file,
            created_at=datetime.datetime.now().isoformat(),
            metadata={
                "file": file
            },
            context=humanfirst.objects.HFContext(file, 'conversation', 'client')
        )
        workspace.add_example(example)

    # write to output; the context manager closes the handle even if
    # write_json raises
    print("Commencing write")
    filename_out = os.path.join(directory, "output.json")
    with open(filename_out, mode='w', encoding='utf8') as file_out:
        workspace.write_json(file_out)
    print(f"Write complete to {filename_out}")
# Script entry point: click supplies `directory` from the command line,
# hence the pylint suppression for the apparently missing argument.
if __name__ == "__main__":
    main()  # pylint: disable=no-value-for-parameter