-
Notifications
You must be signed in to change notification settings - Fork 12
/
websiteContactHarvester.js
124 lines (99 loc) · 4.15 KB
/
websiteContactHarvester.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
'use strict';
var request = require("sync-request");
var cheerio = require("cheerio");
var urlParse = require("url-parse");
// Factory for a simple synchronous website crawler that harvests contact
// information (phone numbers, mailto links, twitter/facebook profile links)
// from a site's pages. Intended to be invoked with `new`; returns `self`.
var websiteContactHarvester = function () {
    var self = this;
    // Maximum number of link levels to follow from the starting page.
    self.maxCrawlDepth = 2;
    // North-American-style phone numbers, e.g. (555) 123-4567 / 555.123.4567.
    // Kept non-global here (public property); harvestContactInfo derives a
    // /g copy so scanning a page is stateless and finds every occurrence.
    self.phoneNumberRegEx = /\(?\d{3}\)?[-\s\.]?\d{3}[-\s\.]?\d{4}/;

    // Given the initial uri and the htmlContent fetched from it, parse all
    // anchor tags and return an array of unique same-host uris, excluding
    // the original uri itself.
    self.getSiblingUris = function (uri, htmlContent) {
        var initialUri = urlParse(uri);
        var $ = cheerio.load(htmlContent);
        var allAnchors = $("a");
        var siblingUris = [];
        var i = allAnchors.length;
        while (i--) {
            var a = allAnchors[i];
            var href = a.attribs && a.attribs.href;
            // Skip anchors with no href (e.g. <a name="top">):
            // urlParse(undefined) would produce a meaningless result.
            if (!href)
                continue;
            var aUri = urlParse(href);
            // Same host domain, not exactly the original uri,
            // and not already collected.
            if (aUri.host === initialUri.host && href !== uri) {
                if (siblingUris.indexOf(href) < 0)
                    siblingUris.push(href);
            }
        }
        return siblingUris;
    };

    // Fetch a uri synchronously and return its body as a utf8 string.
    // Best-effort: any failure (network error, non-2xx status) is logged
    // and an empty string is returned so the crawl can continue.
    self.getHtml = function (uri) {
        var htmlContent = "";
        try {
            var req = request("GET", uri);
            htmlContent = req.getBody('utf8');
        }
        catch (e) {
            console.log("exception while crawling: " + uri + " (" + e.message + ")");
        }
        return htmlContent;
    };

    // Start with the uri provided and crawl the site for other uris.
    // Crawls all sibling uris, gathering { uri, htmlContent } records as it
    // goes, until maxCrawlDepth is reached. Never crawls the same uri twice.
    // Returns the accumulated crawledPages array.
    self.recursiveCrawlSite = function (uriToCrawl, crawledPages, crawlDepth) {
        var htmlContent = self.getHtml(uriToCrawl);
        if (htmlContent === "")
            return crawledPages;
        crawledPages.push({ uri: uriToCrawl, htmlContent: htmlContent });
        // If we reached the max crawl depth, just return the collection.
        if (crawlDepth >= self.maxCrawlDepth)
            return crawledPages;
        // Get any sibling uris that exist on the page.
        var siblingUris = self.getSiblingUris(uriToCrawl, htmlContent);
        var crawledUris = crawledPages.map(function (d) { return d.uri; });
        // Filter the sibling uris, removing those we've already crawled.
        var siblingUrisNotYetCrawled = siblingUris.filter(function (uri) { return crawledUris.indexOf(uri) < 0; });
        // Recursively crawl each not-yet-crawled uri one level deeper.
        siblingUrisNotYetCrawled.forEach(function (uri) {
            crawledPages = self.recursiveCrawlSite(uri, crawledPages, (crawlDepth + 1));
        });
        return crawledPages;
    };

    // Extract contact info records ({ host, uri, infoType, value }) from a
    // single page's htmlContent: phone numbers in the body text, plus
    // mailto:, twitter and facebook anchor hrefs.
    self.harvestContactInfo = function (uri, htmlContent) {
        var infos = [];
        var i = 0;
        var url = urlParse(uri);
        var $ = cheerio.load(htmlContent);
        var bodyText = $("body").text();
        // Use a global copy of the configured pattern: RegExp.exec without
        // /g returns only the FIRST match, but we want every phone number
        // on the page. String.match with /g returns all matches (or null).
        var globalPhoneRegEx = new RegExp(self.phoneNumberRegEx.source, "g");
        var phoneMatches = bodyText.match(globalPhoneRegEx);
        if (phoneMatches) {
            phoneMatches.forEach(function (p) {
                infos.push({ host: url.host, uri: uri, infoType: "phone", value: p });
            });
        }
        // Anchors with mailto hrefs.
        var emailMatches = $("a[href^='mailto:']");
        i = emailMatches.length;
        while (i--) {
            var e = emailMatches[i];
            infos.push({ host: url.host, uri: uri, infoType: "email", value: e.attribs.href });
        }
        // Anchors linking to twitter.
        var twitterMatches = $("a[href*='twitter']");
        i = twitterMatches.length;
        while (i--) {
            var t = twitterMatches[i];
            infos.push({ host: url.host, uri: uri, infoType: "twitter", value: t.attribs.href });
        }
        // Anchors linking to facebook.
        var facebookMatches = $("a[href*='facebook']");
        i = facebookMatches.length;
        while (i--) {
            var f = facebookMatches[i];
            infos.push({ host: url.host, uri: uri, infoType: "facebook", value: f.attribs.href });
        }
        return infos;
    };
    return self;
};
module.exports = websiteContactHarvester;