-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEdxCourseParser.php
198 lines (173 loc) · 6.77 KB
/
EdxCourseParser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
<?php
/**
* Takes in HTML source code and extracts the relevant pieces of info.
* This code is specifically for edx.org
*
*
* @author Chris Rehfeld
*/
require_once 'AbstractCourseParser.php';
require_once 'phpQuery-onefile.php';
require_once 'EdxUniversityNameConverter.php';
/**
 * Course parser for edx.org course description pages.
 *
 * Feed it the page's html via the constructor, call parse(), then read the
 * extracted values through the getters inherited from AbstractCourseParser
 * (the properties written here are presumably declared there -- confirm
 * against the parent class).
 */
class EdxCourseParser extends AbstractCourseParser
{
    /** @var string full html source of the course page, as given to the constructor */
    protected $htmlText;

    /**
     * Initializes the object, making it ready for the parse() method to be called.
     *
     * @param string $homepageUrl the url to the detailed course description webpage.
     * @param string $htmlText the full html source code, utf8 encoded.
     * @param string $shortCourseDescription a short textual description of the course. probably 1 sentence.
     *
     * @throws InvalidArgumentException if $htmlText is not a string or is shorter than 20 bytes
     */
    public function __construct($homepageUrl, $htmlText, $shortCourseDescription)
    {
        if (!is_string($htmlText))
        {
            // $htmlText is the SECOND argument; the message names it explicitly
            throw new InvalidArgumentException("htmlText (arg2) must be string");
        }
        if (strlen($htmlText) < 20)
        {
            throw new InvalidArgumentException("html text too short");
        }

        $this->htmlText = $htmlText;
        $this->homepageUrl = $homepageUrl;
        $this->shortCourseDescription = $shortCourseDescription;
    }

    /**
     * Tries to extract data. Hopefully after this method is called,
     * the getters return valid info.
     *
     * Values that cannot be found in the document are simply left untouched,
     * except for the course photo, whose absence is treated as a fatal error.
     *
     * @throws CourseParsingException if a date, image url, photo url or video url
     *                                is present but malformed, or if no course photo is found
     */
    public function parse()
    {
        //marks that we attempted parsing
        $this->isParsed = true;

        //this inits the pq() function, setting the html it will operate on
        phpQuery::newDocument($this->htmlText);

        //course start date (left untouched when the text does not look like a date)
        $start = $this->parseEdxDate(pq('.start-date')->slice(0, 1)->text());
        if ($start !== null)
        {
            $this->startDate = $start;
        }

        //course end date (same rules as the start date)
        $end = $this->parseEdxDate(pq('.final-date')->slice(0, 1)->text());
        if ($end !== null)
        {
            $this->endDate = $end;
        }

        //calc duration in weeks, if possible
        if ($this->startDate && $this->endDate)
        {
            // divide diff by seconds per day
            $diffDays = ($this->endDate->format('U') - $this->startDate->format('U')) / (60 * 60 * 24);
            // partial weeks count as a full week
            $this->duration = ceil($diffDays / 7);
        }

        //staff/professors
        $staff = array();
        foreach (pq('.teacher h3') as $h3elem)
        {
            $name = pq($h3elem)->slice(0, 1)->text();
            $url = pq($h3elem)->prev('div.teacher-image')->find('img')->attr('src');
            $image = null;

            //check for a url. not all teachers have an image
            if ($url)
            {
                //url parsing failure is an exception. it means we scraped a non url
                //(they changed html doc structure)
                $image = $this->toAbsoluteEdxUrl($url, 'https://www.edx.org');
                if ($image === false)
                {
                    throw new CourseParsingException("parsing of image url failed. url was: '$url'");
                }
            }

            $staff[] = compact('name', 'image');
        }
        $this->otherProfessors = $staff;
        // a page without any staff must not trigger an undefined-offset error
        $this->primaryProfessor = isset($staff[0]) ? $staff[0] : null;

        //university name
        $this->universityName = pq('hgroup h1 a')->slice(0, 1)->text();
        // $this->universityName = EdxUniversityNameConverter::convert($this->universityName);

        //course name: the h1 text with its child elements (e.g. the university link) removed
        $courseName = trim(pq('hgroup h1')->clone()->children()->remove()->end()->text());
        //edx has leading course codes. they always contain a number,
        //and the code is always separated from the title with a space.
        //we strip that leading token (if there is one)
        $preparedName = preg_replace('~^\S*\d[^ ]* ~', '', $courseName);
        $this->courseName = trim($preparedName);

        //workload: first integer found in the "Estimated Effort" value
        $effort = pq('p:contains("Estimated Effort")')->next('.start-date')->slice(0, 1)->text();
        if ($effort && preg_match('~\d+~', $effort, $matches))
        {
            $this->workload = (int) $matches[0];
        }

        //long course description
        $this->longCourseDescription = pq('section.about p')->text();

        //category names....but edx doesnt categorize, so we just provide an empty list
        $this->categoryNames = array();

        //course photo (mandatory)
        //cast guards against a null attr() result when the element is missing
        $photoUrl = (string) pq('div.hero img')->slice(0, 1)->attr('src');
        if ($photoUrl === '')
        {
            throw new CourseParsingException("couldnt find photo url");
        }
        $absolutePhotoUrl = $this->toAbsoluteEdxUrl($photoUrl, 'http://www.edx.org');
        if ($absolutePhotoUrl === false)
        {
            throw new CourseParsingException("couldnt parse photo url");
        }
        $this->coursePhotoUrl = $absolutePhotoUrl;

        //course video (optional, stored exactly as found)
        $videoUrl = (string) pq('#video-modal iframe')->slice(0, 1)->attr('src');
        if ($videoUrl !== '')
        {
            if (parse_url($videoUrl) === false)
            {
                throw new CourseParsingException("couldnt parse video url");
            }
            $this->courseVideoUrl = $videoUrl;
        }
    }

    /**
     * Converts the date text scraped from an edx page into a DateTime.
     *
     * Format in html is usually "Sep 15, 2013", but sometimes just "Sept, 2013".
     *
     * @param string $rawText text content of the date element
     *
     * @return DateTime|null the parsed date, or null when the text does not look like a date at all
     *
     * @throws CourseParsingException if the text looks like a date but cannot be converted
     */
    private function parseEdxDate($rawText)
    {
        $dateStr = trim(str_replace(',', '', $rawText));
        if (!preg_match('~^\w+( \d{1,2})? \d{2,4}$~', $dateStr))
        {
            return null;
        }

        // strtotime() returns false on failure; check it instead of building a bogus "@" string
        $timestamp = strtotime($dateStr);
        $date = ($timestamp === false) ? false : date_create('@' . $timestamp);

        // if we still failed...this is an unknown date format
        if (!$date)
        {
            throw new CourseParsingException("Unknown date format. date_str='{$dateStr}'");
        }

        return $date;
    }

    /**
     * Makes sure a scraped url carries a hostname, prefixing $baseUrl when it does not.
     *
     * @param string $url     non-empty url as found in the document
     * @param string $baseUrl scheme + host to prepend, without a trailing slash
     *
     * @return string|false the url with a hostname, or false if the original or
     *                      the constructed url cannot be parsed by parse_url()
     */
    private function toAbsoluteEdxUrl($url, $baseUrl)
    {
        $parts = parse_url($url);
        if ($parts === false)
        {
            return false;
        }

        //already has a hostname: keep as is
        if (isset($parts['host']))
        {
            return $url;
        }

        //add the hostname, inserting the separating slash if the path lacks one
        $separator = (strncmp($url, '/', 1) === 0) ? '' : '/';
        $absoluteUrl = $baseUrl . $separator . $url;

        //make sure new url is well formed
        return (parse_url($absoluteUrl) === false) ? false : $absoluteUrl;
    }
}