-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaddressers.mjs
executable file
·71 lines (50 loc) · 1.69 KB
/
addressers.mjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!node_modules/.bin/zx
import { fs, path } from 'zx';
import _ from 'lodash';
import { MongoClient } from 'mongodb';
import { parseHTML } from 'linkedom';
// Stage 1: download the profile page of every id listed in
// addresses/IdList.json into addresses/outputs/<id>.html. Pages that are
// already on disk are skipped, so the script can be re-run without
// re-fetching everything.
// NOTE(review): `$` is not among the names imported from 'zx' above;
// presumably it is the global provided by the zx runner named in the
// shebang — confirm before running this file with plain `node`.
const jsource = path.join('addresses', 'IdList.json');
const ids = await fs.readJSON(jsource);

// `curl -o` does not create missing parent directories, so make sure the
// target directory exists before the first download.
const outputDir = path.join('addresses', 'outputs');
await fs.ensureDir(outputDir);

for (const id of ids) {
  const url = `https://www.europarl.europa.eu/meps/en/${id}`;
  const tmpo = path.join(outputDir, `${id}.html`);
  if (fs.existsSync(tmpo)) {
    // A file with this name is already present; no request is made.
    console.log(`File ${tmpo} exists, skipping`);
    continue;
  }
  // -L makes curl follow redirects; -o writes the response body to tmpo.
  await $`curl -o ${tmpo} -L ${url}`.quiet();
  console.log(`curl completed, output produced`);
  await $`ls -l ${tmpo}`;
  // Wait one second before issuing the next request.
  await $`sleep 1`.quiet();
}
// Connect to the MongoDB server at 127.0.0.1:27017; the connection string
// names the `faces` database. `client` is used by the statements further
// down in this file.
const mongoUri = 'mongodb://127.0.0.1:27017/faces';
const client = new MongoClient(mongoUri);
await client.connect();
/**
 * Decode an obfuscated address string.
 *
 * Every `[at]` token becomes `@` and every `[dot]` token becomes `.`; the
 * resulting string is then reversed character by character.
 *
 * NOTE(review): the caller in this file passes a raw `href` attribute value.
 * If that value carries a `mailto:` scheme, the scheme is reversed together
 * with the rest of the string — confirm whether it should be stripped first.
 *
 * @param {string} stripz - Obfuscated text containing `[at]` / `[dot]` tokens.
 * @returns {string} The de-tokenised text, reversed.
 */
function antiSpam(stripz) {
  // replaceAll (not a single replace) so that addresses with several
  // `[dot]` tokens are fully decoded.
  const replaced = stripz.replaceAll('[at]', '@').replaceAll('[dot]', '.');
  return [...replaced].reverse().join('');
}
// Stage 2: for every id, parse the downloaded profile page, read the href of
// the `a.link_twitt` and `a.link_email` anchors, and store them as `twitter`
// and `email` on the matching document of the `meps` collection.
try {
  // db() without a name uses the database given in the connection string.
  const meps = client.db().collection('meps');

  for (const id of ids) {
    const sourcepage = path.join('addresses', 'outputs', `${id}.html`);
    const htmlpage = await fs.readFile(sourcepage, 'utf-8');
    const { document } = parseHTML(htmlpage);

    const twitt = document.querySelector('a.link_twitt');
    const email = document.querySelector('a.link_email');

    const fields = {};
    if (twitt) {
      fields.twitter = twitt.getAttribute('href');
    }
    // Guard against an anchor without href: antiSpam needs a string.
    const emailHref = email?.getAttribute('href');
    if (emailHref) {
      fields.email = antiSpam(emailHref);
    }

    if (Object.keys(fields).length === 0) {
      // Nothing was found on the page, so there is nothing to write.
      continue;
    }

    // A single in-place update replaces the former delete + insert pair, so
    // the document can no longer be lost if the second step fails.
    const result = await meps.updateOne({ id }, { $set: fields });
    if (result.matchedCount === 0) {
      // Previously a missing document caused a TypeError on `mep.twitter`.
      console.warn(`No document with id ${id} in meps, skipping`);
    }
  }
} finally {
  // Release the connection even when an iteration throws.
  await client.close();
}
console.log("Aquisition done!");