diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 4fde44e..bf7d05a 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -19,27 +19,34 @@ ######################## Functions and Globals ###################### ############################################################################### medias_urls <- list( - cbcnews = list( - long_name = "CBC News", - short_name = "CBC", + cbcnews = list( + long_name = "CBC News", + short_name = "CBC", + country = "CAN", + base = "https://www.cbc.ca", + front = "/news" + ), + jdm = list( + long_name = "Le Journal de Montréal", + short_name = "JDM", + country = "CAN", + base = "https://www.journaldemontreal.com", + front = "/" + ), + radiocan = list( + long_name = "Radio-Canada Info", + short_name = "RCI", + country = "CAN", + base = "https://ici.radio-canada.ca", + front = "/info" + ), + nationalPost = list( + long_name = "National Post", + short_name = "NP", country = "CAN", - base = "https://www.cbc.ca", - front = "/news" - ), - jdm = list( - long_name = "Le Journal de Montréal", - short_name = "JDM", - country = "CAN", - base = "https://www.journaldemontreal.com", + base = "https://nationalpost.com", front = "/" ), - radiocan = list( - long_name = "Radio-Canada Info", - short_name = "RCI", - country = "CAN", - base = "https://ici.radio-canada.ca", - front = "/info" - ), tvaNouvelles = list( long_name = "TVA Nouvelles", short_name = "TVA", @@ -83,6 +90,13 @@ harvest_headline <- function(r, m) { rvest::html_nodes('a') %>% rvest::html_attr("href") + if(length(CBC_extracted_headline) == 0){ + CBC_extracted_headline <<- r %>% + rvest::html_nodes(xpath = '//*[@class="primaryHeadline desktopHeadline"]') %>% + rvest::html_nodes('a') %>% + rvest::html_attr("href") + } + if (grepl("^http.*", CBC_extracted_headline[[1]])) { url <- CBC_extracted_headline[[1]] } else { @@ -90,6 +104,20 @@ harvest_headline <- function(r, m) { } found_supported_media <- TRUE } + + if(m$short_name == "NP"){ + NP_extracted_headline <<- r %>% + rvest::html_nodes(xpath = '//*[contains(concat(" ", @class, "="), "hero-feed__hero-col")]') %>% + rvest::html_nodes(xpath = '//a[@class="article-card__link"]') %>% + rvest::html_attr("href") + + if (grepl("^http.*", NP_extracted_headline[[1]])) { + url <- NP_extracted_headline[[1]] + } else { + url <- paste(m$base, NP_extracted_headline[[1]], sep="") + } + found_supported_media <- TRUE + } if(m$short_name == "TVA"){ TVA_extracted_headline <- r %>% rvest::html_nodes(xpath = '//*[@class="home-top-story"]') %>% rvest::html_nodes(xpath = '//*[@class="news_unit-link"]') %>% rvest::html_attr("href") @@ -115,9 +143,7 @@ harvest_headline <- function(r, m) { } found_supported_media <- TRUE } - - - + if (!found_supported_media) { clessnverse::logit(scriptname, paste("no supported media found", m$short_name), logger) warning(paste("no supported media found", m$short_name))