grab_context.py
#!/usr/bin/env python
"""Retrieve context data (publish time, category, tags, title, and author) for each
thing listed in a crawler summary CSV, and save it alongside the summary as
<summary>-context.csv and <summary>-tags.csv.
"""
import argparse
import csv
import datetime
import os
import re

import numpy as np

from thingiverse_crawler import get_url


def extract_publish_time(contents):
    pattern = r"<time datetime=\"([\w\s\-:]*)\">"
    r = re.findall(pattern, contents)
    if len(r) != 1:
        return None
    return datetime.datetime.strptime(r[0], "%Y-%m-%d %H:%M:%S GMT")


def extract_category(contents):
    pattern = r"\"/categories/([\w\-]*)(/([\w\-]*))?\""
    r = re.findall(pattern, contents)
    if len(r) == 0:
        return (None, None)
    assert len(r[0]) == 3
    return r[0][0], r[0][2]


def extract_tags(contents):
    pattern = r"\"/tag:([\w\-]*)\""
    # findall already returns an empty list when no tags are present.
    return re.findall(pattern, contents)


def extract_title_and_author(contents):
    # Alternative pattern: "<meta property=\"og:title\" content=\"([^<>]+)\s*by\s*([^<>]*)\" />"
    pattern = r"<title>([^<>]+) by ([^<>]*) - Thingiverse</title>"
    r = re.findall(pattern, contents)
    if r:
        return r[0]
    return None, None


def grab_context(thing_ids):
    contexts = []
    num_tries = 0
    # Retry pages that failed to download, up to three passes with a growing timeout.
    while len(thing_ids) > 0 and num_tries < 3:
        missing = []
        for thing_id in thing_ids:
            print("Thing id: {}".format(thing_id))
            url = "http://www.thingiverse.com/thing:{}".format(thing_id)
            contents = get_url(url, 30 + 10 * num_tries)
            if contents is None:
                missing.append(thing_id)
                continue
            publish_time = extract_publish_time(contents.text)
            category = extract_category(contents.text)
            tags = extract_tags(contents.text)
            title, author = extract_title_and_author(contents.text)
            print("Published time: {}".format(
                publish_time.isoformat() if publish_time is not None else None))
            print("Category : {}".format(category))
            print("Tags : {}".format(tags))
            print("Title : {}".format(title))
            print("Author : {}".format(author))
            contexts.append((
                thing_id, publish_time, category, tags, title, author))
        thing_ids = missing
        num_tries += 1
    return contexts


def parse_args():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("summary", help="summary file")
    return parser.parse_args()


def main():
    args = parse_args()
    summary_file = args.summary
    with open(summary_file, 'r') as fin:
        csv_reader = csv.reader(fin)
        header = next(csv_reader)
        header = [str(item).strip() for item in header]
        thing_id_idx = header.index("thing_id")
        thing_ids = [int(row[thing_id_idx]) for row in csv_reader]
    thing_ids = np.unique(thing_ids)
    contexts = grab_context(thing_ids)

    # Save context
    summary_name = os.path.splitext(summary_file)[0]
    with open(summary_name + "-context.csv", 'w') as fout:
        fout.write("thing_id, publish_time, category, subcategory, title, author\n")
        for cts in contexts:
            thing_id = cts[0]
            publish_date = cts[1]
            category = cts[2]
            title = cts[4]
            author = cts[5]
            if publish_date is not None:
                publish_date = publish_date.isoformat()
            # str() guards against None fields so the join never fails.
            fout.write("{}\n".format(
                ",".join([str(thing_id), str(publish_date),
                          str(category[0]), str(category[1]),
                          str(title), str(author)])))

    # Save tags
    with open(summary_name + "-tags.csv", 'w') as fout:
        fout.write("thing_id, tag\n")
        for cts in contexts:
            thing_id = cts[0]
            tags = cts[3]
            for tag in tags:
                fout.write("{},{}\n".format(thing_id, tag))


if __name__ == "__main__":
    main()
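
A minimal usage sketch, assuming the thingiverse_crawler module is importable and the site is reachable; the thing ID below is a placeholder, not taken from the source:

    from grab_context import grab_context

    # Fetch context for one hypothetical thing ID and print the recovered fields.
    contexts = grab_context([12345])  # placeholder ID
    for thing_id, publish_time, category, tags, title, author in contexts:
        print(thing_id, category, title, author)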