From 7835af2babc6e8d1d267e4e1a85e947ae18cc096 Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Thu, 30 Mar 2023 15:13:22 +0000 Subject: [PATCH 1/4] =?UTF-8?q?Ajouter=20La=20Presse=20=C3=A0=20la=20liste?= =?UTF-8?q?=20des=20m=C3=A9dia?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/extractors/e_radar+.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 305b045..159831f 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -67,6 +67,13 @@ medias_urls <- list( country = "CAN", base = "https://vancouversun.com/", front = "" + ), + laPresse = list( + long_name = "La Presse", + short_name = "LAP", + country = "CAN", + base = "https://www.lapresse.ca/", + front = "" ) ) From cbf4017977fc136ec930028a719fc3a7162c536e Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Thu, 30 Mar 2023 15:39:20 +0000 Subject: [PATCH 2/4] =?UTF-8?q?Scraper=20La=20Presse=20Ajout=C3=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/extractors/e_radar+.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 159831f..d01be0e 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -186,6 +186,21 @@ harvest_headline <- function(r, m) { } found_supported_media <- TRUE } + + if(m$short_name == "LAP"){ + LAP_extracted_headline <- r %>% + rvest::html_nodes(xpath = '//div[@class="homeHeadlinesRow__main"]') %>% + rvest::html_nodes(xpath = '//article[@data-position="1"]') %>% + rvest::html_nodes(xpath = '//a[@class="storyCard__cover homeHeadlinesCard__cover"]') %>% + rvest::html_attr("href") + + if (grepl("^http.*", LAP_extracted_headline[[1]])) { + url <- LAP_extracted_headline[[1]] + } else { + url <- paste(m$base, LAP_extracted_headline[[1]], sep="") + } + found_supported_media <- TRUE + } if (!found_supported_media) { clessnverse::logit(scriptname, paste("no supported media found", m$short_name), logger) From 49e8f16a338f9d2744d98b0275240d23bc716446 Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Thu, 30 Mar 2023 15:53:59 +0000 Subject: [PATCH 3/4] =?UTF-8?q?Chang=C3=A9=20le=20pattern=20des=20sources?= =?UTF-8?q?=20m=C3=A9dias=20dans=20la=20liste?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/extractors/e_radar+.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index d01be0e..564106c 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -51,29 +51,29 @@ medias_urls <- list( long_name = "TVA Nouvelles", short_name = "TVA", country = "CAN", - base = "https://www.tvanouvelles.ca/", + base = "https://www.tvanouvelles.ca", front = "/" ), globeAndMail = list( long_name = "The Globe and Mail", short_name = "GAM", country = "CAN", - base = "https://www.theglobeandmail.com/", + base = "https://www.theglobeandmail.com", front = "/" ), vancouverSun = list( long_name = "Vancouver Sun", short_name = "VS", country = "CAN", - base = "https://vancouversun.com/", - front = "" + base = "https://vancouversun.com", + front = "/" ), laPresse = list( long_name = "La Presse", short_name = "LAP", country = "CAN", - base = "https://www.lapresse.ca/", - front = "" + base = "https://www.lapresse.ca", + front = "/" ) ) From dfcdc4f5fe26aa589628e49b51a19f51dc5f8f63 Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Thu, 30 Mar 2023 18:09:54 +0000 Subject: [PATCH 4/4] Fixed keyUrl for the key making --- pipelines/extractors/e_radar+.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 564106c..b1208f7 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -236,8 +236,12 @@ harvest_headline <- function(r, m) { clessnverse::logit(scriptname, paste("pushing headline", url, "to hub"), logger) + keyUrl <- url + if(substr(keyUrl, nchar(keyUrl) - 1 + 1, nchar(keyUrl)) == '/'){ + keyUrl <- substr(keyUrl, 1, nchar(keyUrl) - 1) + } #key = paste(digest::digest(url), gsub(" |-|:", "", Sys.time()), sep="_") - key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(url, "[^/]+$"), Sys.time(), sep="_")) + key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(keyUrl, "[^/]+$"), Sys.time(), sep="_")) hub_response <- clessnverse::commit_lake_item( data = list( @@ -304,7 +308,12 @@ main <- function() { clessnverse::logit(scriptname, paste("pushing frontpage", url, "to hub"), logger) #key = paste(digest::digest(url), gsub(" |-|:", "", Sys.time()), sep="_") - key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(url, "[^/]+$"), Sys.time(), sep="_")) + keyUrl <- url + if(substr(keyUrl, nchar(keyUrl) - 1 + 1, nchar(keyUrl)) == '/'){ + keyUrl <- substr(keyUrl, 1, nchar(keyUrl) - 1) + } + + key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(keyUrl, "[^/]+$"), Sys.time(), sep="_")) if (opt$refresh_data) mode <- "refresh" else mode <- "newonly" hub_response <- clessnverse::commit_lake_item(