-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.js
114 lines (93 loc) · 4.32 KB
/
test.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
const puppeteer = require('puppeteer');
const request = require('request-promise');
var fs = require('fs');
var XLSX = require('xlsx');
var sleep = require('system-sleep');
/**
 * Wikidata lookup crawler.
 *
 * Reads rows from one sheet of './124.xlsx', searches wikidata.org for each
 * row's `token`, and when one of the top search results has a title equal to
 * the token (and is not a property page, i.e. its id does not start with 'P'),
 * stores "B-<entity id>" in the row's `Wikidata` column. All rows are then
 * written to a new .xlsx file in the working directory.
 *
 * If anything fails mid-run, the rows loaded so far are still written out and
 * the file name records the row index at which processing stopped.
 */
(async () => {
  // Viewport && Window size
  const width = 1080;
  const height = 1080;

  const sheetToProcess = 0; // sheet number you want to process
  const startIndex = 31576; // start row index
  const endIndex = 59999; // end row index (exclusive)
  const maxCandidates = 4; // how many top search results are inspected per token
  const rowsPerTab = 50; // the tab is recycled every this many rows
  const throttleMs = 2 * 1000;

  const searchUrl = 'https://www.wikidata.org/w/index.php?search';
  const gotoOptions = { waitUntil: 'networkidle2', timeout: 3000000 };
  const entityUrlPattern = /https:\/\/www\.wikidata\.org\/wiki\/(.*)/;
  const titleWordsPattern = /[a-zA-Z ]+/;

  // Declared outside `try` so the catch/finally handlers can tell how far
  // initialisation got before a failure.
  let browser;
  let rows;
  let loopIndex = startIndex;

  // Non-blocking delay; keeps the event loop (and Puppeteer's connection) alive.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Opens a fresh tab already positioned on the Wikidata search page.
  const openSearchPage = async () => {
    const tab = await browser.newPage();
    await tab.setViewport({ width, height });
    await tab.goto(searchUrl, gotoOptions);
    return tab;
  };

  // Writes the current state of `rows` to a single-sheet workbook.
  const saveReport = (fileName) => {
    const ws = XLSX.utils.json_to_sheet(rows);
    const wb = XLSX.utils.book_new();
    XLSX.utils.book_append_sheet(wb, ws, 'report');
    XLSX.writeFile(wb, fileName);
  };

  try {
    browser = await puppeteer.launch({
      headless: true,
      args: [
        '--ignore-certificate-errors',
        '--ignore-certificate-errors-spki-list ',
      ],
    });

    const workbook = XLSX.readFile('./124.xlsx'); //Excel Sheet to read in
    const sheetNames = workbook.SheetNames;
    rows = XLSX.utils.sheet_to_json(workbook.Sheets[sheetNames[sheetToProcess]]);

    let page = await openSearchPage();

    // Never index past the end of the sheet, even if endIndex is larger.
    const lastIndex = Math.min(endIndex, rows.length);

    for (let x = startIndex; x < lastIndex; x++) {
      console.log(`processing ${x} out of ${rows.length}`);
      loopIndex = x;

      if (x !== startIndex && x % rowsPerTab === 0) {
        // Close the tab and open a new one to keep the browser's memory use bounded.
        await page.close();
        page = await openSearchPage();
      }

      const token = rows[x].token;
      // Skip rows without a token (missing cell or the literal text "null").
      if (token == null || token === 'null') {
        continue;
      }
      const tokenText = String(token);

      await page.type('input[name="search"]', tokenText);
      // Start waiting for the navigation before clicking; otherwise a fast
      // navigation can complete before waitForNavigation() is registered.
      await Promise.all([
        page.waitForNavigation(),
        page.click('button[type="submit"]'),
      ]);

      const results = await page.$$eval(
        '#mw-content-text > div.searchresults > ul > li > div.mw-search-result-heading > a',
        (anchors) => anchors.map((a) => ({ href: a.href, title: a.title })),
      );

      // Later matches overwrite earlier ones, so the last matching result
      // among the inspected candidates wins.
      for (const { href, title } of results.slice(0, maxCandidates)) {
        const titleMatch = title.match(titleWordsPattern);
        if (titleMatch === null) {
          continue; // title has no latin letters/spaces to compare against
        }
        if (titleMatch[0].trim() !== tokenText) {
          continue;
        }
        const entityMatch = href.match(entityUrlPattern);
        if (entityMatch === null) {
          continue; // link does not point at a /wiki/<id> page
        }
        const entityId = entityMatch[1];
        // Ids starting with 'P' are Wikidata properties, not items.
        if (!entityId.startsWith('P')) {
          rows[x].Wikidata = `B-${entityId}`;
        }
      }

      // Triple-click selects the current query so the next type() replaces it.
      await page.click('input[name="search"]', { clickCount: 3 });

      if (x % 5 === 0 || x % 2 === 0) {
        //Used to slow the crawler to prevent wikidata firewall from blocking us :D
        await pause(throttleMs);
      }
    }

    //Output excel to root folder
    saveReport(`out index from ${startIndex}-${endIndex} ${Date.now() % 171761}.xlsx`);
  } catch (e) {
    console.log(e);
    // If an error occurs, save the processed rows; loopIndex indicates the row
    // index at which the loop stopped. Nothing can be saved if the workbook was
    // never loaded.
    if (rows) {
      try {
        saveReport(
          `out index from ${startIndex}-${endIndex} end at ${loopIndex} ${Date.now() % 171761}.xlsx`,
        );
      } catch (saveError) {
        console.log(saveError);
      }
    }
  } finally {
    // Always shut the browser down so a failed run does not leave Chromium
    // running and the Node process hanging.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.log(closeError);
      }
    }
  }
})();