-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.js
66 lines (55 loc) · 1.83 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
const puppeteer = require('puppeteer');
const fs = require('fs');
const { Parser } = require('json2csv');
/**
 * Scrape entry point.
 *
 * Visits every domain/path combination, collects the `href` and trimmed text
 * of each `a.btn-primary` anchor on the page, and writes all collected rows
 * to a single CSV file so the results can be compared across domains.
 *
 * Any failure (browser launch, navigation, evaluation, CSV write) is logged,
 * aborts the run, and sets a non-zero process exit code.
 */
(async () => {
  const domains = [
    'https://www.example.com/',
    'https://www.test.com/',
  ];
  const paths = [
    'path1',
    'path2',
    'path3',
  ]; // Add more paths as needed

  // Output file path for the generated CSV.
  const outputFile = 'output.csv';

  // Explicit column list: keeps the column order stable and lets json2csv
  // emit a header-only file when no anchors were found (without `fields`
  // it throws on empty data).
  const fields = ['domain', 'path', 'href', 'text'];

  // Accumulates the rows scraped from every page.
  const allAnchorData = [];

  let browser;
  try {
    // Launched inside the try so a launch failure is reported like any other
    // error instead of surfacing as an unhandled rejection.
    browser = await puppeteer.launch();

    for (const domain of domains) {
      for (const path of paths) {
        const url = `${domain}${path}`;
        const page = await browser.newPage();
        try {
          // Generous timeout for pages with long loading times (lots of videos, etc.).
          await page.goto(url, { timeout: 120000 });

          // Runs in the page context; domain and path are passed in explicitly
          // because the callback cannot see variables from this script.
          const anchorData = await page.evaluate((domain, path) => {
            const anchorTags = Array.from(document.querySelectorAll('a.btn-primary'));
            // Include both domain and path in every row to compare differences.
            return anchorTags.map((tag) => ({
              domain,
              path,
              href: tag.getAttribute('href'),
              text: tag.textContent.trim(),
            }));
          }, domain, path);

          allAnchorData.push(...anchorData);
        } finally {
          // Close the page even when navigation or evaluation fails.
          await page.close();
        }
      }
    }

    // Convert the collected rows to CSV and write them to the output file.
    const json2csvParser = new Parser({ fields });
    const csv = json2csvParser.parse(allAnchorData);
    fs.writeFileSync(outputFile, csv);
    console.log(`Anchor data written to ${outputFile}`);
  } catch (error) {
    console.error('Error:', error);
    // Signal the failure to the caller (shell, CI) without skipping cleanup.
    process.exitCode = 1;
  } finally {
    // browser is undefined if the launch itself failed.
    if (browser) {
      await browser.close();
    }
  }
})();