diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 305b045..b1208f7 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -51,22 +51,29 @@ medias_urls <- list( long_name = "TVA Nouvelles", short_name = "TVA", country = "CAN", - base = "https://www.tvanouvelles.ca/", + base = "https://www.tvanouvelles.ca", front = "/" ), globeAndMail = list( long_name = "The Globe and Mail", short_name = "GAM", country = "CAN", - base = "https://www.theglobeandmail.com/", + base = "https://www.theglobeandmail.com", front = "/" ), vancouverSun = list( long_name = "Vancouver Sun", short_name = "VS", country = "CAN", - base = "https://vancouversun.com/", - front = "" + base = "https://vancouversun.com", + front = "/" + ), + laPresse = list( + long_name = "La Presse", + short_name = "LAP", + country = "CAN", + base = "https://www.lapresse.ca", + front = "/" ) ) @@ -179,6 +186,21 @@ harvest_headline <- function(r, m) { } found_supported_media <- TRUE } + + if(m$short_name == "LAP"){ + LAP_extracted_headline <- r %>% + rvest::html_nodes(xpath = '//div[@class="homeHeadlinesRow__main"]') %>% + rvest::html_nodes(xpath = '//article[@data-position="1"]') %>% + rvest::html_nodes(xpath = '//a[@class="storyCard__cover homeHeadlinesCard__cover"]') %>% + rvest::html_attr("href") + + if (grepl("^http.*", LAP_extracted_headline[[1]])) { + url <- LAP_extracted_headline[[1]] + } else { + url <- paste(m$base, LAP_extracted_headline[[1]], sep="") + } + found_supported_media <- TRUE + } if (!found_supported_media) { clessnverse::logit(scriptname, paste("no supported media found", m$short_name), logger) @@ -214,8 +236,12 @@ harvest_headline <- function(r, m) { clessnverse::logit(scriptname, paste("pushing headline", url, "to hub"), logger) + keyUrl <- url + if(substr(keyUrl, nchar(keyUrl) - 1 + 1, nchar(keyUrl)) == '/'){ + keyUrl <- substr(keyUrl, 1, nchar(keyUrl) - 1) + } #key = paste(digest::digest(url), gsub(" |-|:", "", Sys.time()), sep="_") - key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(url, "[^/]+$"), Sys.time(), sep="_")) + key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(keyUrl, "[^/]+$"), Sys.time(), sep="_")) hub_response <- clessnverse::commit_lake_item( data = list( @@ -282,7 +308,12 @@ main <- function() { clessnverse::logit(scriptname, paste("pushing frontpage", url, "to hub"), logger) #key = paste(digest::digest(url), gsub(" |-|:", "", Sys.time()), sep="_") - key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(url, "[^/]+$"), Sys.time(), sep="_")) + keyUrl <- url + if(substr(keyUrl, nchar(keyUrl) - 1 + 1, nchar(keyUrl)) == '/'){ + keyUrl <- substr(keyUrl, 1, nchar(keyUrl) - 1) + } + + key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(keyUrl, "[^/]+$"), Sys.time(), sep="_")) if (opt$refresh_data) mode <- "refresh" else mode <- "newonly" hub_response <- clessnverse::commit_lake_item(