Skip to content

Commit

Permalink
Merge pull request #13 from clessn/ajouter-scraper-national_post
Browse files (browse the repository at this point in the history)
Ajouter scraper national post
  • Loading branch information
p2xcode authored Mar 28, 2023
2 parents 6fde409 + 81ed4a9 commit f5d8d14
Showing 1 changed file with 47 additions and 21 deletions.
68 changes: 47 additions & 21 deletions pipelines/extractors/e_radar+.R
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,27 +19,34 @@
######################## Functions and Globals ######################
###############################################################################
medias_urls <- list(
cbcnews = list(
long_name = "CBC News",
short_name = "CBC",
cbcnews = list(
long_name = "CBC News",
short_name = "CBC",
country = "CAN",
base = "https://www.cbc.ca",
front = "/news"
),
jdm = list(
long_name = "Le Journal de Montréal",
short_name = "JDM",
country = "CAN",
base = "https://www.journaldemontreal.com",
front = "/"
),
radiocan = list(
long_name = "Radio-Canada Info",
short_name = "RCI",
country = "CAN",
base = "https://ici.radio-canada.ca",
front = "/info"
),
nationalPost = list(
long_name = "National Post",
short_name = "NP",
country = "CAN",
base = "https://www.cbc.ca",
front = "/news"
),
jdm = list(
long_name = "Le Journal de Montréal",
short_name = "JDM",
country = "CAN",
base = "https://www.journaldemontreal.com",
base = "https://nationalpost.com",
front = "/"
),
radiocan = list(
long_name = "Radio-Canada Info",
short_name = "RCI",
country = "CAN",
base = "https://ici.radio-canada.ca",
front = "/info"
),
tvaNouvelles = list(
long_name = "TVA Nouvelles",
short_name = "TVA",
Expand Down Expand Up @@ -83,13 +90,34 @@ harvest_headline <- function(r, m) {
rvest::html_nodes('a') %>%
rvest::html_attr("href")

if(length(CBC_extracted_headline) == 0){
CBC_extracted_headline <<- r %>%
rvest::html_nodes(xpath = '//*[@class="primaryHeadline desktopHeadline"]') %>%
rvest::html_nodes('a') %>%
rvest::html_attr("href")
}

if (grepl("^http.*", CBC_extracted_headline[[1]])) {
url <- CBC_extracted_headline[[1]]
} else {
url <- paste(m$base, CBC_extracted_headline[[1]], sep="")
}
found_supported_media <- TRUE
}

if (m$short_name == "NP") {
  # National Post: take the first article link found inside the hero column
  # of the front page and turn it into an absolute URL.
  #
  # The second XPath is relative (`.//`). A leading `//` is evaluated from the
  # document root, which would ignore the hero-column scoping and return every
  # `article-card__link` anchor on the page.
  # NOTE(review): the first predicate concatenates "=" after @class, which
  # looks like a typo of the usual space-padded class test; it is left
  # unchanged because it still matches any class containing the token.
  np_links <- r %>%
    rvest::html_nodes(xpath = '//*[contains(concat(" ", @class, "="), "hero-feed__hero-col")]') %>%
    rvest::html_nodes(xpath = './/a[@class="article-card__link"]') %>%
    rvest::html_attr("href")

  # NOTE(review): the original published this vector to the enclosing/global
  # environment with `<<-`; kept in case code outside this view reads it.
  # Confirm, then prefer a plain local variable.
  NP_extracted_headline <<- np_links

  # Fail with an explicit message instead of "subscript out of bounds" when
  # the page layout changed and no headline link was found.
  if (length(np_links) == 0) {
    stop("no National Post headline link found on front page", call. = FALSE)
  }

  np_first <- np_links[[1]]
  if (grepl("^http.*", np_first)) {
    # Already an absolute URL.
    url <- np_first
  } else {
    # Site-relative path: prefix with the media's base URL.
    url <- paste0(m$base, np_first)
  }
  found_supported_media <- TRUE
}

if(m$short_name == "TVA"){
TVA_extracted_headline <- r %>% rvest::html_nodes(xpath = '//*[@class="home-top-story"]') %>% rvest::html_nodes(xpath = '//*[@class="news_unit-link"]') %>% rvest::html_attr("href")
Expand All @@ -115,9 +143,7 @@ harvest_headline <- function(r, m) {
}
found_supported_media <- TRUE
}




if (!found_supported_media) {
clessnverse::logit(scriptname, paste("no supported media found", m$short_name), logger)
warning(paste("no supported media found", m$short_name))
Expand Down

0 comments on commit f5d8d14

Please sign in to comment.