From 78493180a2a6e2edd0d34b4e2dd34f124c0b64a4 Mon Sep 17 00:00:00 2001 From: chreman Date: Thu, 18 Jun 2020 16:47:04 +0200 Subject: [PATCH] implement subject_orig field where missing Former-commit-id: 88e802cf4e2f19e311082992cb031db56250ee06 --- server/preprocessing/other-scripts/openaire.R | 1 + server/workers/gsheets/src/search_gsheets.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/server/preprocessing/other-scripts/openaire.R b/server/preprocessing/other-scripts/openaire.R index 22f64928d..69b2f3feb 100644 --- a/server/preprocessing/other-scripts/openaire.R +++ b/server/preprocessing/other-scripts/openaire.R @@ -114,6 +114,7 @@ get_return_values <- function(all_artifacts){ } preprocess_data <- function(all_artifacts){ + all_artifacts$subject_orig <- all_artifacts$subject all_artifacts$subject <- unlist(lapply(all_artifacts$subject, function(x) {gsub("\\[[A-Za-z \\.-]+\\]", "", x)})) # removes [ INFO.INFO-MA ] Computer Science [cs]/Multiagent Systems [cs.MA] all_artifacts$subject <- unlist(lapply(all_artifacts$subject, function(x) {gsub(" ?/", ";", x)})) all_artifacts$subject <- unlist(lapply(all_artifacts$subject, function(x) {gsub("\\:.*\\:\\:", "", x)})) # keeps only last part after :: ":Enginyeria de la telecomunicació::Processament del senyal::Reconeixement de formes [Àrees temàtiques de la UPC]" diff --git a/server/workers/gsheets/src/search_gsheets.py b/server/workers/gsheets/src/search_gsheets.py index 9fafacb67..a169bda6e 100644 --- a/server/workers/gsheets/src/search_gsheets.py +++ b/server/workers/gsheets/src/search_gsheets.py @@ -228,7 +228,8 @@ def create_input_data(df): metadata["year"] = df["Publication Date"] metadata["url"] = df.ID metadata["readers"] = 0 - metadata["subject"] = df.Keywords + metadata["subject_orig"] = df.Keywords + metadata["subject"] = metadata["subject_orig"] metadata["oa_state"] = df.Access metadata["link"] = df["Link to PDF"].map(lambda x: x.replace("N/A", "") if isinstance(x, str) else "") metadata["relevance"] = df.index