-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbackend.js
207 lines (180 loc) · 5.27 KB
/
backend.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
// import fetch from 'node-fetch';
const express = require("express");
const cors = require("cors");
const fetch = require("node-fetch");
const cheerio = require("cheerio");
// Fetch a page over HTTP and return the trimmed contents of its <title> tag.
async function getTitle(url) {
  // Download the raw HTML for the given URL.
  const response = await fetch(url);
  const html = await response.text();
  // Parse the document and extract the <head> title text.
  const $ = cheerio.load(html);
  const pageTitle = $("head > title").text().trim();
  console.log(pageTitle);
  return pageTitle;
}
// Load environment variables (PORT, ATLAS_URI) before reading them.
require("dotenv").config();
const app = express();
const port = process.env.PORT || 8080;
const uri = process.env.ATLAS_URI;
// Create MongoDB client
const MongoClient = require("mongodb").MongoClient;
const client = new MongoClient(uri);
// Connecting to Database.
// client.connect() returns a Promise, so a synchronous try/catch can never
// observe its rejection; attach a .catch() handler to log connection errors.
client.connect().catch((e) => {
  console.log(e);
});
// Middleware: set CORS response headers so the local dev frontend
// (http://localhost:3000) can call this API. Update the origin to match
// the domain you will make requests from when deploying.
app.use(function (req, res, next) {
  const corsHeaders = {
    "Access-Control-Allow-Origin": "http://localhost:3000",
    "Access-Control-Allow-Headers":
      "Origin, X-Requested-With, Content-Type, Accept",
    "Access-Control-Allow-Methods": "GET",
  };
  for (const [name, value] of Object.entries(corsHeaders)) {
    res.header(name, value);
  }
  next();
});
// The infamous Merge sort by HydroxyHelium
// Merge two lists of [url, score] pairs, each already sorted in descending
// score order, into one descending-order list. Uses index cursors instead of
// the original shift()-in-a-loop, which was O(n) per element.
function merge(left, right) {
  const merged = [];
  let i = 0;
  let j = 0;
  while (i < left.length && j < right.length) {
    if (left[i][1] > right[j][1]) {
      merged.push(left[i]);
      i++;
    } else {
      merged.push(right[j]);
      j++;
    }
  }
  // Append whichever side still has elements remaining.
  return [...merged, ...left.slice(i), ...right.slice(j)];
}
// Sort [url, score] pairs in descending score order.
// Uses slice() rather than the original splice(), so the caller's array is
// no longer emptied as a side effect.
function mergeSort(array) {
  if (array.length < 2) {
    return array;
  }
  const half = Math.floor(array.length / 2);
  const left = array.slice(0, half);
  const right = array.slice(half);
  return merge(mergeSort(left), mergeSort(right));
}
// Merge Sort END
// Score and rank stored URLs for the given search words, then send one page
// (up to 20 results) of [url, score] pairs on `res`.
//
// dbWordTags documents appear to be { word, <rating>: [urls...] } with numeric
// rating-bucket keys, and dbDomainCount documents { url: <domain>, count } —
// inferred from the queries below; confirm against the crawler's schema.
async function getData(wordList, dbWordTags, dbDomainCount, res, pageNumber) {
  const dbResults = await dbWordTags.find({ word: { $in: wordList } }).toArray();
  const ratedLinks = {};
  // Multiply together the rating buckets each URL appears under.
  // forEach, not map-with-async: the callback is fully synchronous, and the
  // original's async map created floating promises for no benefit.
  dbResults.forEach((obj) => {
    for (const key in obj) {
      if (!isNaN(key)) {
        obj[key].forEach((url) => {
          ratedLinks[url] = (ratedLinks[url] || 1) * key;
        });
      }
    }
  });
  // Extract the host portion of a link (domain popularity weights the score).
  const extractHost = (link) =>
    link.replace("http://", "").replace("https://", "").split(/[/?#]/)[0];
  const domainList = Object.keys(ratedLinks).map(extractHost);
  // Retrieve domain counts from the database.
  const dbResultCount = await dbDomainCount
    .find({ url: { $in: domainList } })
    .toArray();
  const ratedDomains = {};
  dbResultCount.forEach((obj) => {
    ratedDomains[obj.url] = obj.count;
  });
  // Combine link rating and domain count into a log-scaled score.
  // Default a missing domain count to 1 so an absent "domains" entry yields
  // log(rating) instead of NaN, which previously corrupted the sort.
  let finalResults = [];
  for (const url in ratedLinks) {
    const domainCount = ratedDomains[extractHost(url)] || 1;
    finalResults.push([url, Math.log(ratedLinks[url] * domainCount)]);
  }
  // Sort descending by score.
  finalResults = mergeSort(finalResults);
  // Return the 20 results for the requested page (pages start at 1).
  // The original indexed `step * pageNumber`, which returned every
  // pageNumber-th result rather than a consecutive page of 20.
  const page = Math.max(1, Number(pageNumber) || 1);
  const start = (page - 1) * 20;
  const final = finalResults.slice(start, start + 20);
  console.log(final);
  res.send(final);
}
// GET /api/search?search=<words>&page=<n>
// Runs the ranking algorithm over the "web-map" database and responds with
// up to 20 [url, score] pairs for the requested page.
app.get("/api/search", async (req, res) => {
  console.log(req.query);
  const searchQuery = req.query.search;
  const pageNumber = req.query.page;
  // Guard: without a search string, .split() below would throw and crash
  // the handler with an unhandled rejection.
  if (!searchQuery) {
    res.status(400).send("Missing 'search' query parameter");
    return;
  }
  // Splitting the search into multiple words.
  const wordList = searchQuery.split(" ");
  // Connecting to Database "web-map"
  const dbConn = client.db("web-map");
  try {
    // Await the algorithm so database errors surface here instead of as an
    // unhandled promise rejection (the original fired and forgot).
    await getData(
      wordList,
      dbConn.collection("tags"),
      dbConn.collection("domains"),
      res,
      pageNumber
    );
  } catch (e) {
    console.log(e);
    res.status(500).send("Search failed");
  }
});
// GET /api/requestCrawl?url=<url>
// Queues a URL for the crawler: re-marks an existing seed as "pending", or
// inserts a new seed document with default crawl settings.
app.get("/api/requestCrawl", async (req, res) => {
  const url = req.query.url;
  // Guard: reject requests that omit the url parameter.
  if (!url) {
    res.status(400).send("Missing 'url' query parameter");
    return;
  }
  const dbConn = client.db("web-map");
  try {
    // Check if the url already exists in the database.
    const dbResult = await dbConn
      .collection("seed-url")
      .find({ url: url })
      .toArray();
    if (dbResult.length > 0) {
      // If it exists, reset its status so the crawler picks it up again.
      // Awaited so a write failure is reported instead of silently dropped.
      await dbConn
        .collection("seed-url")
        .updateOne({ url: url }, { $set: { status: "pending" } });
      res.send("Successfully requested to crawl again");
      return;
    }
    // `const` here — the original assigned `urlObject` as an implicit global.
    const urlObject = {
      url: url,
      status: "pending",
      depth: 100,
      parent: "",
      dateAdded: new Date(),
      // 'lastCrawled': , // add this later
    };
    await dbConn.collection("seed-url").insertOne(urlObject);
    res.send("Crawl Requested Successfully");
  } catch (e) {
    console.log(e);
    res.status(500).send("Crawl request failed");
  }
});
// NOTE(review): these middleware registrations run AFTER the routes above are
// defined; Express applies middleware in registration order, so cors() and
// express.json() never run for /api/search or /api/requestCrawl (only the
// manual header middleware earlier covers CORS). Move these app.use() calls
// above the route handlers.
app.use(cors());
app.use(express.json());
// Start the HTTP server on the configured port (PORT env var or 8080).
app.listen(port, () => {
  console.log(`Server is running on port ${port}`);
});