This repository has been archived by the owner on Feb 20, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
granicus.py
154 lines (122 loc) · 5.61 KB
/
granicus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import json
import os
import re
from collections import namedtuple
from datetime import datetime, date
from urllib.parse import urlparse, parse_qs
import pendulum
import pytz
from bs4 import BeautifulSoup
from typing import Iterable
from common import VideoProvider, VideoMetadata, PreparedVideoInfo
from ffmpeg import ffmpeg_concat
from segment_tools import download_clip, write_ffmpeg_concat_file
GranicusVideo = namedtuple('GranicusVideo',
['title', 'date', 'agenda_url', 'minutes_url', 'minutes_url_title', 'video_url'])
Streams = namedtuple('Streams', ['rtmp_url', 'm3u8_url'])
class GranicusVideoMetadata(VideoMetadata):
def __init__(self, video_id, title, category, start_ts, url, agenda_title, agenda_url, minutes_title, minutes_url):
super().__init__(video_id, title, category, start_ts, None, None, url)
self.agenda_title = agenda_title
self.agenda_url = agenda_url
self.minutes_title = minutes_title
self.minutes_url = minutes_url
def get_href_parts(td):
a = td.find('a')
if not a:
return None, None
return a['href'], remove_extraneous_spaces(a.string)
def get_url_from_onclick(onclick_value):
start_val = "open('"
start_index = onclick_value.find(start_val) + len(start_val)
return onclick_value[start_index:onclick_value.rfind("','player'")]
def remove_extraneous_spaces(value):
return ' '.join(value.split())
def uncommented_lines(full_text):
for line in full_text.split('\n'):
if line.startswith('#'):
continue
yield line
class GranicusScraperApi(VideoProvider):
def __init__(self, site_url, tz='America/Vancouver'):
"""
:param site_url: The Granicus page with the list of available videos.
"""
super().__init__(site_url)
self.tz = tz
def available_dates(self, start_date: date, end_date: date) -> Iterable[pendulum.Date]:
seen_dates = set()
for video in self.get_videos():
dt = pendulum.parse(video.start_ts)
if start_date <=dt <= end_date and video.start_ts not in seen_dates:
seen_dates.add(dt)
yield dt.date()
def get_metadata(self, for_date) -> Iterable[VideoMetadata]:
for video in self.get_videos():
if pendulum.parse(video.start_ts).date() != for_date:
continue
yield video
def download(self, url, destination_dir):
clip_guid = self.get_clip_id(url)
streams = self.get_streams(clip_guid)
download_clip(self.get_video_piece_urls(streams.m3u8_url), destination_dir, 16)
def postprocess(self, video_metadata: VideoMetadata, download_dir, destination_dir, **kwargs) -> PreparedVideoInfo:
concat_file_path = write_ffmpeg_concat_file(download_dir, None)
video_filename = video_metadata.video_id + '.ts'
video_path = os.path.join(destination_dir, video_filename)
ffmpeg_concat(concat_file_path, video_path, mono=kwargs.get('mono', False))
return PreparedVideoInfo(video_metadata, video_filename)
def _parse_html(self, url):
resp = self.session.get(url)
resp.raise_for_status()
return BeautifulSoup(resp.text, 'html.parser')
def get_videos(self):
parsed_html = self._parse_html(self.provider_url)
# The second table is the useful one.
previous_meetings_table = parsed_html.select('table.listingTable')[1]
for row in previous_meetings_table.find_all('tr'):
cells = row.find_all('td')
if not cells:
continue
date_value = remove_extraneous_spaces(cells[1].string).replace('.', '')
date_value = datetime.strptime(date_value, '%b %d, %Y')
date_value = pytz.timezone(self.tz).localize(date_value)
agenda_url, agenda_title = get_href_parts(cells[-3])
minutes_url, minutes_title = get_href_parts(cells[-2])
# Remove date from title.
title = remove_extraneous_spaces(next(cells[0].stripped_strings))
title = re.sub(r'\s\(\w+\.? \d+, \d+\)', '', title)
video_url = get_url_from_onclick(cells[-1].find('a')['onclick'])
qs = parse_qs(urlparse(video_url).query)
yield GranicusVideoMetadata(
qs['clip_id'][0],
title,
title,
date_value.isoformat(),
video_url,
agenda_title, agenda_url,
minutes_title, minutes_url,
)
def get_clip_id(self, video_url):
resp = self.session.get(video_url)
resp.raise_for_status()
match = re.search(r"\s+clipId:\s*'([\w\-]+)',", resp.text)
return match.group(1)
def get_streams(self, clip_id):
parsed_site = urlparse(self.provider_url)
streams_url = '{}://{}/player/GetStreams.php'.format(parsed_site.scheme, parsed_site.netloc)
resp = self.session.get(streams_url, params={'clip_id': clip_id})
js_text = resp.text.replace('\/', '/')
js = json.loads(js_text)
return Streams(js[0], js[1])
def get_video_piece_urls(self, m3u8_url):
resp = self.session.get(m3u8_url)
resp.raise_for_status()
dir_url_for_original_m3u8 = os.path.dirname(m3u8_url)
next_m3u8_url = os.path.join(dir_url_for_original_m3u8, next(uncommented_lines(resp.text)))
resp = self.session.get(next_m3u8_url)
resp.raise_for_status()
for ts_filename in uncommented_lines(resp.text):
if not ts_filename.endswith('.ts'):
continue
yield os.path.join(dir_url_for_original_m3u8, ts_filename)