From c1098576c1edba46e0a1a7a2259cdcef1bad51ad Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Tue, 18 Apr 2023 13:31:59 +0000 Subject: [PATCH 1/8] =?UTF-8?q?Ajouter=20m$short=5Fname=20dans=20la=20cl?= =?UTF-8?q?=C3=A9=20de=20frontpage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/extractors/e_radar+.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 0496396..482f8fe 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -472,7 +472,7 @@ main <- function() { keyUrl <- substr(keyUrl, 1, nchar(keyUrl) - 1) } - key <- gsub(" |-|:|/|\\.", "_", paste(stringr::str_match(keyUrl, "[^/]+$"), Sys.time(), sep="_")) + key <- gsub(" |-|:|/|\\.", "_", paste(m$short_name, stringr::str_match(keyUrl, "[^/]+$"), Sys.time(), sep="_")) if (opt$refresh_data) mode <- "refresh" else mode <- "newonly" hub_response <- clessnverse::commit_lake_item( From 28be0c5043b96a3846be4ef4c142edfb6e584abe Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Tue, 18 Apr 2023 14:19:05 +0000 Subject: [PATCH 2/8] =?UTF-8?q?Ajout=20d'une=20deuxi=C3=A8me=20fa=C3=A7on?= =?UTF-8?q?=20de=20scraper=20CTV?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pipelines/extractors/e_radar+.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 482f8fe..847a42a 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -296,6 +296,14 @@ harvest_headline <- function(r, m) { rvest::html_nodes("a") %>% rvest::html_attr("href") + if(length(CTV_extracted_headline) == 0){ + clessnverse::logit(scriptname, "CTV: Initial attempt failed, trying thorugh xpaths.", logger) + CTV_extracted_headline <- r %>% + rvest::html_nodes(xpath = '//div[@class="c-list__item__block"]') %>% + rvest::html_nodes(xpath = '//a[@class="c-list__item__image"]') %>% + rvest::html_attr("href") + } + if(length(CTV_extracted_headline) > 0){ if (grepl("^http.*", CTV_extracted_headline[[1]])) { url <- CTV_extracted_headline[[1]] From 6c1599d1f1127be686dd02cdb2b0893ae0cd4542 Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Tue, 18 Apr 2023 15:37:40 +0000 Subject: [PATCH 3/8] Retry on pushing to lake if it fails --- pipelines/extractors/e_radar+.R | 79 +++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 847a42a..597e3a2 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -405,24 +405,30 @@ harvest_headline <- function(r, m) { pushedHeadlines <<- append(pushedHeadlines, key) - hub_response <- clessnverse::commit_lake_item( - data = list( - key = key, - path = "radarplus/headline", - item = doc - ), - metadata = metadata, - mode = if (opt$refresh_data) "refresh" else "newonly", - credentials - ) + pushed <- FALSE - if (hub_response) { - clessnverse::logit(scriptname, paste("successfuly pushed headline", key, "to datalake"), logger) - nb_headline <<- nb_headline + 1 - } else { - clessnverse::logit(scriptname, paste("error while pushing headline", key, "to datalake"), logger) - warning(paste("error while pushing headline", key, "to datalake")) + while(!pushed){ + hub_response <- clessnverse::commit_lake_item( + data = list( + key = key, + path = "radarplus/headline", + item = doc + ), + metadata = metadata, + mode = if (opt$refresh_data) "refresh" else "newonly", + credentials + ) + + if (hub_response) { + clessnverse::logit(scriptname, paste("successfuly pushed headline", key, "to datalake"), logger) + nb_headline <<- nb_headline + 1 + pushed <- TRUE + } else { + clessnverse::logit(scriptname, paste("error while pushing headline", key, "to datalake"), logger) + warning(paste("error while pushing headline", key, "to datalake")) + } } + } else { clessnverse::logit(scriptname, paste("there was an error getting url", url), logger) @@ -483,24 +489,29 @@ main <- function() { key <- gsub(" |-|:|/|\\.", "_", paste(m$short_name, stringr::str_match(keyUrl, "[^/]+$"), Sys.time(), sep="_")) if (opt$refresh_data) mode <- "refresh" else mode <- "newonly" - hub_response <- clessnverse::commit_lake_item( - data = list( - key = key, - path = "radarplus/frontpage", - item = doc - ), - metadata = metadata, - mode = mode, - credentials = credentials - ) - - if (hub_response) { - clessnverse::logit(scriptname, paste("successfuly pushed frontpage", key, "to datalake"), logger) - nb_frontpage <<- nb_frontpage + 1 - harvest_headline(r, m) - } else { - clessnverse::logit(scriptname, paste("error while pushing frontpage", key, "to datalake"), logger) - warning(paste("error while pushing frontpage", key, "to datalake")) + pushed <- FALSE + + while(!pushed){ + hub_response <- clessnverse::commit_lake_item( + data = list( + key = key, + path = "radarplus/frontpage", + item = doc + ), + metadata = metadata, + mode = mode, + credentials = credentials + ) + + if (hub_response) { + clessnverse::logit(scriptname, paste("successfuly pushed frontpage", key, "to datalake"), logger) + nb_frontpage <<- nb_frontpage + 1 + pushed <- TRUE + harvest_headline(r, m) + } else { + clessnverse::logit(scriptname, paste("error while pushing frontpage", key, "to datalake"), logger) + warning(paste("error while pushing frontpage", key, "to datalake")) + } } } else { From 0c9e8e4e966c1fdee539b197b551f1b2c74a3cee Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Tue, 18 Apr 2023 16:05:23 +0000 Subject: [PATCH 4/8] Ajout champ metadata frontpage_root_key + fix typo --- pipelines/extractors/e_radar+.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 597e3a2..3194794 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -385,7 +385,8 @@ harvest_headline <- function(r, m) { storage_class = "lake", country = m$country, schema = opt$schema, - hashedHTML = NA + hashed_html = NA, + frontpage_root_key = NA ) if (r$response$status_code == 200) { @@ -466,7 +467,7 @@ main <- function() { storage_class = "lake", country = m$country, schema = opt$schema, - keysUne = NA + keys_une = NA ) r <<- rvest::session(url) From 176a477fc203c4d7fb20dd8373860c9511591b9b Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Tue, 18 Apr 2023 19:56:14 +0000 Subject: [PATCH 5/8] Ajouter counter max et sleep time --- pipelines/extractors/e_radar+.R | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 3194794..aaadd00 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -407,8 +407,12 @@ harvest_headline <- function(r, m) { pushedHeadlines <<- append(pushedHeadlines, key) pushed <- FALSE + counter <- 0 - while(!pushed){ + while(!pushed && counter < 20){ + if(counter > 0){ + Sys.sleep(20) + } hub_response <- clessnverse::commit_lake_item( data = list( key = key, @@ -426,11 +430,13 @@ harvest_headline <- function(r, m) { pushed <- TRUE } else { clessnverse::logit(scriptname, paste("error while pushing headline", key, "to datalake"), logger) - warning(paste("error while pushing headline", key, "to datalake")) + counter <- counter + 1 } } - + if(!pushed){ + warning(paste("error while pushing headline", key, "to datalake")) + } } else { clessnverse::logit(scriptname, paste("there was an error getting url", url), logger) warning(paste("there was an error getting url", url)) @@ -491,8 +497,12 @@ main <- function() { if (opt$refresh_data) mode <- "refresh" else mode <- "newonly" pushed <- FALSE + counter <- 0 while(!pushed){ + if(counter > 0){ + Sys.sleep(20) + } hub_response <- clessnverse::commit_lake_item( data = list( key = key, @@ -508,13 +518,18 @@ main <- function() { clessnverse::logit(scriptname, paste("successfuly pushed frontpage", key, "to datalake"), logger) nb_frontpage <<- nb_frontpage + 1 pushed <- TRUE - harvest_headline(r, m) } else { clessnverse::logit(scriptname, paste("error while pushing frontpage", key, "to datalake"), logger) - warning(paste("error while pushing frontpage", key, "to datalake")) + counter <- counter + 1 } } + if(pushed){ + harvest_headline(r, m) + } else { + warning(paste("error while pushing frontpage", key, "to datalake")) + } + } else { clessnverse::logit(scriptname, paste("there was an error getting url", url), logger) warning(paste("there was an error getting url", url)) From a58082ad7e824ec536a5c6691652ba8a7ef685b9 Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Tue, 18 Apr 2023 20:16:07 +0000 Subject: [PATCH 6/8] Changer le nom de keys_unes pour headline_keys --- pipelines/extractors/e_radar+.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index aaadd00..6666bfb 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -473,7 +473,7 @@ main <- function() { storage_class = "lake", country = m$country, schema = opt$schema, - keys_une = NA + headline_keys = NA ) r <<- rvest::session(url) From 0ece76fc35be2100375aac7021fa7ddc2ea58ac2 Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Tue, 18 Apr 2023 20:20:53 +0000 Subject: [PATCH 7/8] Enlever headline_keys --- pipelines/extractors/e_radar+.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index 6666bfb..aa393aa 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -472,8 +472,7 @@ main <- function() { content_type = "news_headline", storage_class = "lake", country = m$country, - schema = opt$schema, - headline_keys = NA + schema = opt$schema ) r <<- rvest::session(url) From 288ef92e3bf2aee6c4489c87e1dff46676677188 Mon Sep 17 00:00:00 2001 From: ClementCadieux Date: Wed, 19 Apr 2023 13:18:43 +0000 Subject: [PATCH 8/8] Rajouter headline_root_key --- pipelines/extractors/e_radar+.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/extractors/e_radar+.R b/pipelines/extractors/e_radar+.R index aa393aa..76f4f6f 100644 --- a/pipelines/extractors/e_radar+.R +++ b/pipelines/extractors/e_radar+.R @@ -472,7 +472,8 @@ main <- function() { content_type = "news_headline", storage_class = "lake", country = m$country, - schema = opt$schema + schema = opt$schema, + headline_root_key = NA ) r <<- rvest::session(url)