Skip to content

Commit

Permalink
Improve scraper code
Browse files Browse the repository at this point in the history
  • Loading branch information
lachlanjc committed Mar 14, 2020
1 parent 491dccb commit dcee625
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 54 deletions.
110 changes: 57 additions & 53 deletions api/src/functions/scrape.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,18 @@ const COUNTRIES = [

// data returns in format: { "2020-02-28": 324, "2020-02-29": 500 }
const extractDataFromChart = function($, chartId) {
let htmlAttempt = $('div#' + chartId + ' + div > script').html()
let htmlAttempt = $(`div#${chartId} + div > script`).html()

// sometimes charts have a div wrapping the following script tag, sometimes they don't
if (!htmlAttempt) {
htmlAttempt = $('div#' + chartId + ' + script').html()
htmlAttempt = $(`div#${chartId} + script`).html()
}

const hackedChartJSON = htmlAttempt.replace(/^.*?{/, '{').replace(/\); ?$/, '').replace(/([A-z]+):/g, '"$1":').replace(/'/g, '"')
const hackedChartJSON = htmlAttempt
.replace(/^.*?{/, '{')
.replace(/\); ?$/, '')
.replace(/([A-z]+):/g, '"$1":')
.replace(/'/g, '"')

const json = JSON.parse(hackedChartJSON)
const xVals = json.xAxis.categories
Expand All @@ -46,12 +50,10 @@ const extractDataFromChart = function($, chartId) {
}

// ex. data = { "2020-02-28": { confirmedCases: 43 } }, toMergeIn (e.g. new cases) = { "2020-02-28": 23 }, keyForData = "newCases"
//
// returns { "2020-02-28": { confirmedCases: 43, newCases: 23 } }
const mergeChart = function(data, toMergeIn, keyForData) {
for (const day in toMergeIn) {
data[day] = data[day] || {}

data[day][keyForData] = toMergeIn[day]
}

Expand All @@ -60,7 +62,9 @@ const mergeChart = function(data, toMergeIn, keyForData) {

// total cases, new cases, currently infected, total deaths, new deaths
const scrapeCases = async function(country) {
const html = await fetch('https://www.worldometers.info/coronavirus/country/' + country + '/').then(r => r.text())
const html = await fetch(
`https://www.worldometers.info/coronavirus/country/${country}/`
).then((r) => r.text())

const $ = cheerio.load(html)

Expand All @@ -81,93 +85,93 @@ const scrapeCases = async function(country) {
return data
}

// Scrapes every slug listed in COUNTRIES, one request at a time, and resolves
// with an object keyed by slug whose values are whatever scrapeCases resolved
// with for that slug.
const scrapeAllCountryData = async function() {
  const resultsBySlug = {}

  for (let index = 0; index < COUNTRIES.length; index += 1) {
    const slug = COUNTRIES[index]
    // Awaiting inside the loop keeps the scrapes strictly sequential.
    resultsBySlug[slug] = await scrapeCases(slug)
  }

  return resultsBySlug
}

// Syncs scraped case counts into the database.
//
// For every row returned by db.country.findMany(), scrapes that country's
// per-day figures via scrapeCases(country.worldometersSlug), upserts a Day row
// for each scraped date, then creates the DailyCount linking that country and
// day, or updates it when exactly one already exists.
//
// Countries and days are processed one at a time. Resolves with undefined when
// everything has been processed; rejects if scraping or any db call rejects.
const dbBS = async function() {
  const countries = await db.country.findMany()

  console.log(countries)

  for (const country of countries) {
    console.log('\n###############')
    console.log(
      `Syncing ${country.name} (${country.iso}, ${country.worldometersSlug})`
    )
    console.log('###############')

    console.log('\nBeginning scraping...')
    const data = await scrapeCases(country.worldometersSlug)
    console.log('Done!')

    console.log('\nBeginning sync with DB...')
    for (const [day, scraped] of Object.entries(data)) {
      console.log(
        `Fetching Day for ${day} (and creating one if it doesn't exist)`
      )
      const date = new Date(day)
      const dbDay = await db.day.upsert({
        update: { date },
        where: { date },
        create: { date }
      })

      console.log(
        `\nBeginning DailyCount creation for ${day} (or update if it already exists)...`
      )
      const foundCounts = await db.dailyCount.findMany({
        where: {
          country: { id: country.id },
          date: { id: dbDay.id }
        }
      })

      // Copy only the five tracked figures so any extra keys on the scraped
      // entry are never forwarded to the database.
      const {
        totalCases,
        newCases,
        currentlyInfected,
        totalDeaths,
        newDeaths
      } = scraped

      if (foundCounts.length === 0) {
        console.log('No DailyCount found. Creating one.')
        await db.dailyCount.create({
          data: {
            country: { connect: { id: country.id } },
            date: { connect: { id: dbDay.id } },
            totalCases,
            newCases,
            currentlyInfected,
            totalDeaths,
            newDeaths
          }
        })
      } else if (foundCounts.length === 1) {
        console.log(`DailyCount #${foundCounts[0].id} found. Updating it...`)
        await db.dailyCount.update({
          where: { id: foundCounts[0].id },
          data: {
            totalCases,
            newCases,
            currentlyInfected,
            totalDeaths,
            newDeaths
          }
        })
      } else {
        // More than one row for the same country/day is reported but left
        // untouched; the sync carries on with the next day.
        console.error(
          'Multiple DailyCounts found. This should never happen. HORRIBLE ERROR.'
        )
      }
    }
  }
}

// Function entry point: runs the full scrape-and-sync (dbBS) and reports the
// outcome through the Node-style callback. The error argument is always null;
// failure is signalled in the response object instead.
//
// Returns the promise chain so a caller may also await completion.
//
// NOTE(review): the response uses a `status` key. If this runs as an AWS
// Lambda / Netlify function the platform presumably expects `statusCode` --
// confirm against the deployment target before changing it.
export const handler = (event, context, callback) =>
  dbBS()
    .then(() => callback(null, { status: 200, body: 'Success!' }))
    .catch((err) => callback(null, { status: 500, body: 'Error! ' + err }))
2 changes: 1 addition & 1 deletion prettier.config.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// https://prettier.io/docs/en/options.html
module.exports = {
trailingComma: 'es5',
trailingComma: 'none',
bracketSpacing: true,
tabWidth: 2,
semi: false,
Expand Down

0 comments on commit dcee625

Please sign in to comment.