-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinsert.php
184 lines (150 loc) · 5.85 KB
/
insert.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
<?php
/**
*insert.php cleans alltables in the database, craws all URLs of Edx and Coursera,
*and adds all returned data into the course_data and coursedetails tables.
*
* @author Manzoor Ahmed
* @author Chris Rehfeld
* @author Tatiana Braginets
*/
header('content-type: text/plain;charset=utf-8');
require_once 'connection.php';
require_once 'AbstractCourseParser.php';
require_once 'ParserFactory.php';
require_once 'emptyTables.php';
require_once 'CourseraUrlsParser.php';
require_once 'EdxUrlsParser.php';
//set crawling time for 10 min, report all errors
set_time_limit(1200);
error_reporting(E_ALL);
ini_set('display_errors', 1);
$edxUrls = array();
$courseraUrls = array();
//get all urls from coursera
$coursera_urls_parser = new CourseraUrlsParser();
$coursera_urls_parser->parse();
$courseraUrls = $coursera_urls_parser->getUrls();
//get all urls from edx
$edx_urls_parser = new EdxUrlsParser();
$edx_urls_parser->parse();
$edxUrls = $edx_urls_parser->getUrls();
//clean tables before every crawl
//cleanTables();
removeOldCourses($edxUrls, "Edx");
removeOldCourses($courseraUrls, "Coursera");
//insert to coursedetails
foreach ($courseraUrls as $url) {
insertCourseDetails($url);
}
foreach ($edxUrls as $url) {
$extraInfo = array('shortCourseDescription' => $edx_urls_parser->getCourseShortDesc($url));
insertCourseDetails($url, $extraInfo,"Edx");
}
/**
* insertCourseDetails, adds primary professors to coursedetails database
* @param $url, array of site links
* @param $extraInfo, extra information [Optional]
* @param $website, the site name where the $url is being passed by
**/
function insertCourseDetails($url, $extraInfo = array(),$website="Coursera"){
//mysqli db connection
$dbc = $GLOBALS['dbc'];
$factory = new ParserFactory();
try {
$p = $factory->create($url, $extraInfo);
$p->parse();
} catch (Exception $e) {
//these really should be logged....but print to stdout for now
echo "parsing failure for $url\n";
echo $e->getMessage(), "\n", $e->getTraceAsString();
return false;
}
if (!$p->isValid()){
echo "invalid parser for $url\n";
return false;
}
//to store primary professors
$id;
$prim_prof = $p->getProfessors();
$title = $dbc->real_escape_string($p->getCourseName());
$short_desc = $dbc->real_escape_string($p->getShortCourseDescription());
$long_desc = $dbc->real_escape_string($p->getLongCourseDescription());
$course_link = $dbc->real_escape_string($p->getHomepageUrl());
$video_link = $dbc->real_escape_string($p->getVideoUrl());
$course_length = $p->getDuration();
$course_image = $dbc->real_escape_string($p->getCoursePhotoUrl());
$getcategories =$p->getCategoryNames();
$course_date = $p->getStartDate();
if ($course_date) {
$course_date = $course_date->format('Y-m-d 00:00:00');
}
$site = $dbc->real_escape_string($p->getUniversityName());
$category = $dbc->real_escape_string(join(', ', $p->getCategoryNames()));
$find_course_sql = "SELECT id, course_link from course_data
WHERE 1 AND course_link='$course_link'";
$result = $dbc->query($find_course_sql);
if ($result && $result->num_rows) {
$row = $result->fetch_array();
$id = $row['id'];
$update_course_sql = "UPDATE course_data SET title='$title', short_desc='$short_desc',
long_desc='$long_desc', video_link='$video_link',
start_date='$course_date', course_length='$course_length', course_image='$course_image',
category='$category', site='$website'
WHERE 1 AND id='$id'";
$dbc->query($update_course_sql) or die($dbc->error);
}
else {
//insert to course_data first
$que ="INSERT INTO `course_data` (`id`, `title`, `short_desc`, `long_desc`, `course_link`, `video_link`,
`start_date`, `course_length`, `course_image`, `category`, `site`)
VALUES ('0', '$title', '$short_desc', '$long_desc', '$course_link', '$video_link', '$course_date', '$course_length', '$course_image', '$category', '$website');";
//run query
$dbc->query($que) or die($dbc->error);
//get the last auto generated id, needed for insert to the next table
$id =mysqli_insert_id($dbc);
$hit_query = "INSERT INTO `trendingcourses` (`id` ,`hits`)
VALUES ('$id', '0');";
$dbc->query($hit_query) or die($dbc->error);
//get all the professors who are teaching the class
foreach($prim_prof as $row){
//get the profesor's name and image
$name = $dbc->real_escape_string($row['name']);
$image= $dbc->real_escape_string($row['image']);
//prepare query for inserting to coursedetails table
$sql = "INSERT INTO `coursedetails` (`id`, `profname`, `profimage`)
VALUES
( '$id', '$name', '$image');" or die($dbc->error);
$dbc->query($sql) or die($dbc->error);
}
// insert into new_courses
$new_courses_sql = "INSERT INTO new_courses (course_data_id, date_added) VALUES ('$id', now())";
$dbc->query($new_courses_sql) or die($dbc->error);
}
}//end function
/**
* Removes courses URLs from db that are not in
* the passed array
*
* @param array $newUrls new URLs
* @param string $website which website URLs belong to
* @return void
*/
function removeOldCourses($newUrls, $website)
{
$dbc = $GLOBALS['dbc'];
$old_urls_sql = "SELECT id, course_link from course_data
WHERE site='$website'";
$result = $dbc->query($old_urls_sql);
if ($result && $result->num_rows) {
while ($row = $result->fetch_array()) {
if (array_search($row['course_link'], $newUrls) === FALSE) {
$id = $row['id'];
$delete_sql = "DELETE from course_data WHERE id='$id'";
$dbc->query($delete_sql) or die($dbc->error);
}
}
}
return;
}
exit;
?>