-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCourseraCourseParser.php
233 lines (196 loc) · 7.46 KB
/
CourseraCourseParser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
<?php
/**
* Takes in JSON text and extracts the relevant pieces of info.
* This code is specifically for coursera.org
*
* @author Chris Rehfeld
*/
require_once 'AbstractCourseParser.php';
class CourseraCourseParser extends AbstractCourseParser
{
    /** @var string raw JSON text holding the general course info */
    protected $generalJsonText;

    /** @var string raw JSON text holding the list of instructors */
    protected $instructorJsonText;

    /**
     * Initializes the object, making it ready for the parse() method to be called.
     *
     * @param mixed  $homepageUrl        stored unvalidated on $this->homepageUrl
     *                                   (property presumably declared by AbstractCourseParser -- confirm)
     * @param string $generalJsonText    the json text that contains the general info about the course
     * @param string $instructorJsonText the json text that contains info about the professors for the course
     * @throws InvalidArgumentException if either json argument is not a string, or is too short
     *                                  (fewer than 20 / 2 bytes respectively) to be usable
     */
    public function __construct($homepageUrl, $generalJsonText, $instructorJsonText)
    {
        $this->homepageUrl = $homepageUrl;

        if (!is_string($generalJsonText) || !is_string($instructorJsonText))
        {
            // the json texts are the 2nd and 3rd arguments; the 1st is the homepage url
            throw new InvalidArgumentException("arg 2 and 3 must be string");
        }

        if (strlen($generalJsonText) < 20 || strlen($instructorJsonText) < 2)
        {
            throw new InvalidArgumentException("json text too short");
        }

        $this->generalJsonText = $generalJsonText;
        $this->instructorJsonText = $instructorJsonText;
    }

    /**
     * Tries to extract data from the two json texts given to the constructor.
     * After this method returns, the properties it fills in (description, names,
     * workload, dates, duration, professors, categories, video and photo urls)
     * hold whatever could be extracted.
     *
     * @throws RuntimeException       if either json text cannot be decoded
     * @throws CourseParsingException if a field has an unexpected format
     *                                (workload out of range, unknown date format, bad video id)
     */
    public function parse()
    {
        // marks that we attempted parsing, even if an exception is thrown below
        $this->isParsed = true;

        $generalObj = json_decode($this->generalJsonText);
        if (!$generalObj)
        {
            throw new RuntimeException("json parsing failed for generalJsonText");
        }

        $instructorObj = json_decode($this->instructorJsonText);
        if (!$instructorObj)
        {
            throw new RuntimeException("json parsing failed for instructorJsonText");
        }

        // course description (markup removed)
        $this->courseDescription = strip_tags($generalObj->about_the_course);

        // university name: only the first listed university is used
        $this->universityName = $generalObj->universities[0]->name;

        // course name
        $this->courseName = $generalObj->name;

        // workload: accepts both "5 hours/week" and "5-6 hours/week".
        // The "-6" part is optional; without it only the single value is used.
        if (isset($generalObj->estimated_class_workload)
            && preg_match('~(\d+)(?:-(\d+))? hours?/week~', $generalObj->estimated_class_workload, $matches))
        {
            if (isset($matches[2]) && $matches[2] !== '')
            {
                // case like: 5-6 hours/week -> average of both ends, rounded up
                $ave = (int) ceil(($matches[1] + $matches[2]) / 2);
            }
            else
            {
                // case like: 5 hours/week
                $ave = (int) $matches[1];
            }

            if ($ave < 0 || $ave > 100)
            {
                throw new CourseParsingException("parsing of workload failed for $ave '{$generalObj->estimated_class_workload}'");
            }

            $this->workload = $ave;
        }

        // start date
        // we try to pick the "active" course (first one with a truthy status), but
        // default to the first in the list because sometimes no course is marked active.
        // $course stays null when the list is missing or empty.
        $course = null;
        if (isset($generalObj->courses) && is_array($generalObj->courses) && isset($generalObj->courses[0]))
        {
            $course = $generalObj->courses[0];
            foreach ($generalObj->courses as $potentialActiveCourse)
            {
                if (!empty($potentialActiveCourse->status))
                {
                    $course = $potentialActiveCourse;
                    break;
                }
            }
        }

        if (!empty($course->start_date_string))
        {
            // remove commas
            $dateStr = trim(str_replace(',', '', $course->start_date_string));

            if (preg_match('~^(\d{1,2} )?\w+ \d{2,4}$~', $dateStr))
            {
                // shapes like "15 January 2013" or "January 2013"
                $timestamp = strtotime($dateStr);
                $startDate = ($timestamp === false) ? false : date_create('@' . $timestamp);

                // if we still failed...this is an unknown date format
                if (!$startDate)
                {
                    throw new CourseParsingException("Unknown date format. date_str='{$dateStr}'");
                }

                $this->startDate = $startDate;
            }
            elseif ($dateStr === 'Self-service')
            {
                // no start date is recorded for this marker value
                $this->startDate = null;
            }
            else
            {
                throw new CourseParsingException("Unknown date format. date_str='{$dateStr}'");
            }
        }
        elseif (!empty($course->start_year) && !empty($course->start_month))
        {
            // sometimes they omit the start day, assume first of the month
            $day = !empty($course->start_day) ? $course->start_day : 1;
            $startDate = date_create("{$course->start_year}-{$course->start_month}-$day 00:00:00");

            if (!$startDate)
            {
                throw new CourseParsingException("Unexpected date values. y:m:d = {$course->start_year}:{$course->start_month}:{$day}");
            }

            $this->startDate = $startDate;
        }
        else
        {
            // we assume this case is that the date is simply unspecified/"to be determined"
            $this->startDate = null;
        }

        // duration, stored as a number of weeks ("1 week" and "10 weeks" both accepted)
        if (!empty($course->duration_string)
            && preg_match('~(\d+) weeks?~', $course->duration_string, $matches))
        {
            $this->duration = (int) $matches[1];
        }

        // calc end date, if possible
        if ($this->startDate && $this->duration)
        {
            // add the duration (weeks) converted to seconds onto the start timestamp
            $endTimestamp = $this->startDate->format('U') + ($this->duration * 60 * 60 * 24 * 7);
            $this->endDate = new DateTime("@$endTimestamp");
        }

        // staff/professors
        $staff = array();
        foreach ($instructorObj as $prof)
        {
            $name = strlen($prof->middle_name)
                ? "{$prof->first_name} {$prof->middle_name} {$prof->last_name}"
                : "{$prof->first_name} {$prof->last_name}";
            $image = $prof->photo;

            // sometimes theres blank entries where the name is just whitespace. skip.
            if (strlen(trim($name)))
            {
                $staff[] = compact('name', 'image');
            }
        }

        // NOTE(review): otherProfessors also contains the primary professor -- confirm this is intended
        $this->otherProfessors = $staff;
        // every instructor entry may have been blank, so guard the first element
        $this->primaryProfessor = isset($staff[0]) ? $staff[0] : null;

        // categories
        $categoryNames = array();
        foreach ($generalObj->categories as $category)
        {
            // trim names and make sure not empty, otherwise skip
            $name = trim($category->name);
            if (strlen($name))
            {
                $categoryNames[] = $name;
            }
        }
        $this->categoryNames = $categoryNames;

        // short description
        $this->shortCourseDescription = $generalObj->short_description;

        // long description (markup kept, unlike courseDescription above)
        $this->longCourseDescription = $generalObj->about_the_course;

        // video url: the json carries only the id component, which is validated before use
        if (strlen($generalObj->video) > 0)
        {
            if (!preg_match('~^[a-zA-Z0-9_-]{5,50}$~Di', $generalObj->video))
            {
                throw new CourseParsingException("Unexpected youtube url component format. val='{$generalObj->video}'");
            }

            $this->courseVideoUrl = 'http://www.youtube.com/watch?v=' . urlencode($generalObj->video);
        }

        // photo url: only accepted when it parses as a url with a host part
        $photoUrl = $generalObj->photo;
        if (strlen($photoUrl) > 0)
        {
            $parts = parse_url($photoUrl);
            if (isset($parts['host']))
            {
                $this->coursePhotoUrl = $photoUrl;
            }
        }
    }
}