-
Notifications
You must be signed in to change notification settings - Fork 0
/
csweb.py
92 lines (72 loc) · 3.08 KB
/
csweb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
import logging as logger
import re
from datetime import datetime, timedelta
from urllib.parse import urlparse, parse_qs
import feedparser
import requests
from bs4 import BeautifulSoup
class CSWeb:
    """Reader for the cs.kaist.ac.kr events RSS feed and its event pages.

    The feed is parsed when the object is created and kept in ``self.feed``;
    ``get_event_ids(refresh=True)`` re-parses it.
    """

    # Page URL of a single event; ``{}`` is replaced by the event id (``bbs_sn``).
    _EVENT_URL_PATTERN = 'https://cs.kaist.ac.kr/board/view?bbs_id=events&bbs_sn={}&menu=86'
    # Seconds to wait for the event page so a stalled server cannot hang the caller.
    _REQUEST_TIMEOUT = 10
    # Layout of "<date> @ <time>" strings built from the event page.
    _DATETIME_FORMAT = '%a, %b %d, %Y @ %H:%M'
    # Fallbacks used when the page gives no usable time information.
    _DEFAULT_START_TIME = '08:00'
    _DEFAULT_DURATION = timedelta(hours=1, minutes=30)
    # Placeholder returned when no location label is found on the page.
    _DEFAULT_LOCATION = 'TSIAK'
    # Event ids that are never fetched (8447 seems to be a stray event).
    _SKIPPED_EVENT_IDS = ('8447',)
    # Matches the text of the <strong> element that labels the location.
    _LOCATION_LABEL = re.compile('Location:*')

    def __init__(self, url='https://cs.kaist.ac.kr/rss/events/ko'):
        """Remember the feed URL and parse the feed once.

        Args:
            url: address of the events RSS feed.
        """
        self.url = url
        self.feed = feedparser.parse(url)

    @staticmethod
    def _url_to_event_id(url):
        """Return the first ``bbs_sn`` query parameter of *url*.

        Raises:
            KeyError: if the URL has no ``bbs_sn`` query parameter.
        """
        return parse_qs(urlparse(url).query)['bbs_sn'][0]

    def get_event_ids(self, refresh=False):
        """Return the event id of every entry in the feed, in feed order.

        Args:
            refresh: when true, re-parse the feed from ``self.url`` first.
        """
        if refresh:
            self.feed = feedparser.parse(self.url)
        return [self._url_to_event_id(entry['link'])
                for entry in self.feed.entries]

    @classmethod
    def _to_datetime(cls, date_str, time_str):
        """Combine a date string and an ``HH:MM`` string into a datetime.

        Raises:
            ValueError: if the combined text does not match the expected layout.
        """
        return datetime.strptime('{} @ {}'.format(date_str, time_str),
                                 cls._DATETIME_FORMAT)

    @classmethod
    def _parse_schedule(cls, schedule_line):
        """Parse ``'<date> @ <start>~<end>'`` into ``(start_dt, end_dt)``.

        The time part is optional.  A missing, empty or unparseable start
        time falls back to the default start time; a missing or unparseable
        end time falls back to start plus the default duration.

        Raises:
            ValueError: if the date part itself cannot be parsed.
        """
        fields = schedule_line.split('@')
        date_str = fields[0].strip()
        # Only the field right after the first '@' carries the time range.
        # Each token is stripped so "16:00 ~ 17:30" parses like "16:00~17:30".
        times = []
        if len(fields) > 1:
            times = [token.strip() for token in fields[1].split('~')]

        try:
            start_dt = cls._to_datetime(date_str, times[0])
        except (IndexError, ValueError):
            # Sometimes time information doesn't exist (or is free text):
            # assume the default start time and the default duration.
            start_dt = cls._to_datetime(date_str, cls._DEFAULT_START_TIME)
            return start_dt, start_dt + cls._DEFAULT_DURATION

        end_dt = None
        if len(times) == 2:
            try:
                end_dt = cls._to_datetime(date_str, times[1])
            except ValueError:
                end_dt = None
        if end_dt is None:
            # No usable end time: assume the default duration.
            end_dt = start_dt + cls._DEFAULT_DURATION
        return start_dt, end_dt

    @staticmethod
    def _location_from_label(label_text):
        """Return what follows the first ``:`` of a ``'Location: ...'`` label.

        Splitting only once keeps locations that contain a colon themselves
        (for example a URL) intact.  Text without a colon is returned stripped.
        """
        return label_text.split(':', 1)[-1].strip()

    def get_event_details(self, event_id):
        """Collect the details of one event from the feed and its web page.

        Title and description come from the cached feed entry whose link
        carries *event_id*; start/end time and location are scraped from the
        event page.

        Args:
            event_id: the ``bbs_sn`` value identifying the event.

        Returns: title, location, description, start_dt, end_dt
            All five are ``None`` for ids listed in ``_SKIPPED_EVENT_IDS``.
            ``title`` is ``None`` when the id is not in the cached feed.

        Raises:
            AttributeError: if the page has no ``<p class="seminarsInfo">``.
            IndexError: if that paragraph has no second line.
            ValueError: if the date on the page cannot be parsed.
            requests.exceptions.RequestException: on network failure/timeout.
        """
        title, location, description, start_dt, end_dt = None, None, None, None, None
        logger.warning('Fetching event: {}'.format(event_id))

        # exception: stray events are reported as "no details"
        if event_id in self._SKIPPED_EVENT_IDS:
            return title, location, description, start_dt, end_dt

        # title, description
        for entry in self.feed.entries:
            if event_id == self._url_to_event_id(entry['link']):
                title = entry['title']
                description = entry['description']
                break

        event_url = self._EVENT_URL_PATTERN.format(event_id)

        # prepend event URL to description; an event missing from the feed
        # gets an empty body rather than the literal text "None"
        description = 'Event URL: {}\n<br/>\n{}\n'.format(
            event_url, '' if description is None else description)

        r = requests.get(event_url, timeout=self._REQUEST_TIMEOUT)
        soup = BeautifulSoup(r.text, 'html.parser')

        # start_dt, end_dt: the schedule is read from the second line of the
        # seminarsInfo paragraph
        schedule_line = soup.find('p', class_='seminarsInfo').text.split('\n')[1]
        start_dt, end_dt = self._parse_schedule(schedule_line)

        # location
        try:
            label_text = soup.find('strong', text=self._LOCATION_LABEL).text
        except AttributeError:
            # find() returned None: the page has no location label
            location = self._DEFAULT_LOCATION
        else:
            location = self._location_from_label(label_text)

        return title, location, description, start_dt, end_dt