search.py
#!/usr/bin/env python3
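"""Search the University of Minnesota People Directory.

Scrapes the public lookup page (www.umn.edu/lookup) and parses the
result with BeautifulSoup. A search yields a list of dicts (multiple
matches), a single dict (one match), or None (no matches or too many).
"""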
import argparse

from bs4 import BeautifulSoup
from html.parser import HTMLParser
from pprint import pprint
from urllib.parse import quote_plus
from urllib.request import urlopen

class SearchUMN:

    class MLStripper(HTMLParser):
        """Accumulate the text nodes of an HTML fragment, discarding tags."""

        def __init__(self):
            super().__init__(convert_charrefs=True)
            self.fed = []

        def handle_data(self, d):
            self.fed.append(d)

        def get_data(self):
            return ''.join(self.fed)

    def strip_tags(self, html):
        """Return only the text content of an HTML fragment."""
        s = self.MLStripper()
        s.feed(html)
        return s.get_data()

    def replace_br_with_newline(self, html):
        # bs4 may serialize line breaks as <br> or <br/>; normalize all to \n.
        return (html.replace('<br>', '\n')
                    .replace('<br/>', '\n')
                    .replace('</br>', '\n'))

    def parse_multiple_results(self, soup):
        """Parse the table shown when a search matches several people."""
        rows = soup.find(id='pagecontent').find('table').find_all('tr')
        all_rows = []
        for row in rows:
            cells = row.find_all('td')
            row_data = {}
            for cell_num, cell_data in enumerate(cells):
                cell_text = cell_data.text
                cell_field = self.fields[cell_num]
                if cell_text.strip() != '':
                    # Normalize non-breaking spaces to plain spaces.
                    row_data[cell_field] = cell_text.replace('\xa0', ' ')
                if cell_num == self.url_field_num:
                    # The name cell links to the detail page, whose URL
                    # carries the person's x500 (Internet ID) after '&UID='.
                    url = cell_data.find('a')['href']
                    row_data[self.url_field] = url
                    x500 = url.split(self.x500_split)[1]
                    row_data[self.x500_field] = x500
            all_rows.append(row_data)
        all_rows.pop(0)  # The header row has no <td> cells; drop its empty dict.
        return all_rows

    def parse_single_result(self, soup):
        """Parse the detail page shown when a search matches one person."""
        data = soup.find(id='pagecontent').find('table').find_all('tr')
        parsed = {}
        for kv_pair in data:
            # Each row is a <th> label paired with a <td> value.
            key = kv_pair.find('th').text.strip().lower().replace(' ', '_')
            value_html = str(kv_pair.find('td'))
            value = self.strip_tags(self.replace_br_with_newline(value_html)).strip()
            parsed[key] = value
        return parsed

    def is_multiple_results(self, soup):
        # A results table without an <h2> header means a multi-person listing.
        try:
            content = soup.find(id='pagecontent')
            return content.find('table') is not None and content.find('h2') is None
        except AttributeError:  # catch None.find()
            return False

    def is_single_result(self, soup):
        # A single person's detail page is headed by an <h2> element.
        try:
            return soup.find(id='pagecontent').find('h2') is not None
        except AttributeError:  # catch None.find()
            return False

    def is_no_results(self, soup):
        try:
            results = soup.find(id='pagecontent').find_all('b')
            return any(result.text == 'No matches found.' for result in results)
        except AttributeError:  # catch None.find()
            return False

    def is_too_many_results(self, soup):
        # Note: the page's message includes a trailing space.
        try:
            results = soup.find(id='pagecontent').find_all('b')
            return any(result.text == ('Too many entries matched your search '
                                       'criteria. Please try again with more '
                                       'specific criteria. ')
                       for result in results)
        except AttributeError:  # catch None.find()
            return False

    # Placeholders are filled in order: type, CN (query), campus, role.
    search_url = 'http://www.umn.edu/lookup?SET_INSTITUTION=UMNTC&type=%s&CN=%s&campus=%s&role=%s'
    fields = ['name', 'email', 'work_phone', 'phone', 'dept_or_college']
    x500_field = 'x500'
    x500_split = '&UID='
    url_field = 'url'
    url_field_num = 0

    def load_results(self):
        """Fetch the search page and dispatch to the matching parser."""
        html = urlopen(self.search_url % (self.search_type, self.query,
                                          self.campus, self.role)).read()
        soup = BeautifulSoup(html, 'html.parser')
        if self.is_multiple_results(soup):
            return self.parse_multiple_results(soup)
        elif self.is_single_result(soup):
            return self.parse_single_result(soup)
        # No matches, too many matches, and unrecognized pages all yield None.
        return None

    def __init__(self, search_type='name', campus='t', role='any', query=None):
        self.search_type = search_type
        self.campus = campus
        self.role = role
        self.query = query  # expected to be URL-quoted (e.g. via quote_plus)
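
# A minimal library-usage sketch (assumes the lookup endpoint still serves
# the HTML layout this parser expects; 'Jane Doe' is a placeholder query):
#
#     from search import SearchUMN
#     from urllib.parse import quote_plus
#
#     searcher = SearchUMN(search_type='name', query=quote_plus('Jane Doe'))
#     results = searcher.load_results()  # list of dicts, dict, or None
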
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Search the UMN People Directory using a Python API.')
    parser.add_argument('-t', '--search_type', default='name',
                        help='Search by name or id (Default: name)',
                        choices=['name', 'id'])
    parser.add_argument('-c', '--campus', default='t',
                        help='Campus to search (Default: Twin Cities campus)',
                        choices=['a', 'c', 'd', 'm', 'r', 't', 'o'])
    parser.add_argument('-r', '--role', default='any',
                        help="The person's role (Default: any)",
                        choices=['any', 'sta', 'stu', 'alu', 'ret'])
    parser.add_argument('query',
                        help='Name or Internet ID of person to be searched')
    args = parser.parse_args()
    # The lookup page expects the literal type value 'Internet+ID' for ID searches.
    if args.search_type == 'id':
        search_type = 'Internet+ID'
    else:
        search_type = args.search_type
    searcher = SearchUMN(search_type=search_type, campus=args.campus,
                         role=args.role, query=quote_plus(args.query))
    results = searcher.load_results()
    pprint(results)
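
# Example invocations (hypothetical names/IDs):
#   ./search.py "Jane Doe"
#   ./search.py -t id -c t janed001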