From 63f2d7263133cf4f2d472db33da7a55b937daf2f Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Sat, 16 Nov 2019 10:50:53 -0500 Subject: [PATCH 01/37] only count cites/references and inverses --- .../DatasetExternalCitationsServiceBean.java | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java index 0c1816241bb..fd374c5d96c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java @@ -33,7 +33,14 @@ public class DatasetExternalCitationsServiceBean implements java.io.Serializable @EJB DatasetServiceBean datasetService; - + + //Array of relationship types that are considered to be citations + static ArrayList relationships = new ArrayList( + Arrays.asList( + "is-cited-by", + "cites", + "is-referenced-by", + "references")); public List parseCitations(JsonObject report) { List datasetExternalCitations = new ArrayList<>(); @@ -44,18 +51,19 @@ public List parseCitations(JsonObject report) { exCit.setCitedByUrl(citation.getJsonObject("attributes").getString("subj-id")); String localDatasetDOI = citation.getJsonObject("attributes").getString("obj-id"); - - Dataset localDs = null; - if (localDatasetDOI.contains("doi")) { - String globalId = localDatasetDOI.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); - localDs = datasetService.findByGlobalId(globalId); - exCit.setDataset(localDs); - } + String relationship = citation.getJsonObject("attributes").getString("relation-type-id"); + if (relationships.contains(relationship)) { + Dataset localDs = null; + if (localDatasetDOI.contains("doi")) { + String globalId = localDatasetDOI.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); + localDs = datasetService.findByGlobalId(globalId); + exCit.setDataset(localDs); + } - if (localDs != null && !exCit.getCitedByUrl().isEmpty() ) { - datasetExternalCitations.add(exCit); + if (localDs != null && !exCit.getCitedByUrl().isEmpty()) { + datasetExternalCitations.add(exCit); + } } - } return datasetExternalCitations; } From bbb2722d799e8a4386695f366d737064e8c3c0bc Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 1 Apr 2020 11:13:05 -0400 Subject: [PATCH 02/37] MDC scripts --- conf/mdc/counter_daily.sh | 30 +++++++++++++++++++++++++++ conf/mdc/counter_weekly.sh | 42 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 conf/mdc/counter_daily.sh create mode 100644 conf/mdc/counter_weekly.sh diff --git a/conf/mdc/counter_daily.sh b/conf/mdc/counter_daily.sh new file mode 100644 index 00000000000..925737a47d6 --- /dev/null +++ b/conf/mdc/counter_daily.sh @@ -0,0 +1,30 @@ +#! /bin/bash +sudo -u counter +# control_daily.sh + +echo >>tmp/counter_daily.log +date >>tmp/counter_daily.log +echo >>tmp/counter_daily.log + +# "You should run Counter Processor once a day to create reports in SUSHI (JSON) format that are saved to disk for Dataverse to process and that are sent to the DataCite hub." 
+ +LAST=$(date -d "yesterday 13:00" '+%Y-%m-%d') +# echo $LAST +YEAR_MONTH=$(date -d "yesterday 13:00" '+%Y-%m') +# echo $YEAR_MONTH +d=$(date -I -d "$YEAR_MONTH-01") +#echo $d +while [ "$(date -d "$d" +%Y%m%d)" -le "$(date -d "$LAST" +%Y%m%d)" ]; +do + if [ -f "/srv/glassfish/dataverse/logs/mdc/counter_$d.log" ]; then +# echo "Found counter_$d.log" + else + touch "/srv/glassfish/dataverse/logs/mdc/counter_$d.log" + fi + d=$(date -I -d "$d + 1 day") +done + +cd /opt/counter-processor-0.0.1 +YEAR_MONTH=$YEAR_MONTH python3 main.py >>tmp/counter_daily.log + +curl -X POST "http://localhost:8080/api/admin/makeDataCount/addUsageMetricsFromSushiReport?reportOnDisk=/opt/counter-processor-0.0.1/tmp/make-data-count-report.json" diff --git a/conf/mdc/counter_weekly.sh b/conf/mdc/counter_weekly.sh new file mode 100644 index 00000000000..5384e77a237 --- /dev/null +++ b/conf/mdc/counter_weekly.sh @@ -0,0 +1,42 @@ +#!/bin/sh + +# This script iterates through all published Datasets in all Dataverses and calls the Make Data Count API to update their citations from DataCite +# Note: Requires curl and jq for parsing JSON responses form curl + +# A recursive method to process each Dataverse +processDV () { + +#Call the Dataverse API to get the contents of the Dataverse (without credentials, this will only list published datasets and dataverses +DVCONTENTS=$(curl -s http://localhost:8080/api/dataverses/$1/contents) + +# Iterate over all datasets, pulling the value of their DOIs (as part of the persistentUrl) from the json returned +for subds in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataset") | .persistentUrl'); do + +#The authority/identifier are preceded by a protocol/host, i.e. https://doi.org/ +DOI=`expr "$subds" : '.*:\/\/\doi\.org\/\(.*\)'` + +# Call the Dataverse API for this dataset and get the response +RESULT=$(curl -s -X POST "http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI" | jq -r '.status') + +# The status for a call that worked +OK='OK' + +# Check the status and report +if [ "$RESULT" = "$OK" ]; then + echo "Updated citations for doi:$DOI" +else + echo "Failed to update citations for doi:$DOI" + echo "Run curl -s -X POST 'http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI' to retry/see the error message" +fi + +done + +# Now iterate over any child Dataverses and recursively process them +for subdv in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataverse") | .id'); do +processDV $subdv +done +} + +# Call the function on the root dataverse to start processing +processDV 1 + From c1d37b3e334e59644c7c35ddceff84c48fd5d69d Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 8 May 2020 16:13:26 -0400 Subject: [PATCH 03/37] script updates --- conf/mdc/counter_daily.sh | 11 +++++++---- conf/mdc/counter_weekly.sh | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/conf/mdc/counter_daily.sh b/conf/mdc/counter_daily.sh index 925737a47d6..b9af62646e4 100644 --- a/conf/mdc/counter_daily.sh +++ b/conf/mdc/counter_daily.sh @@ -1,6 +1,8 @@ #! 
/bin/bash -sudo -u counter -# control_daily.sh + +# counter_daily.sh + +cd /opt/counter-processor-0.0.1 echo >>tmp/counter_daily.log date >>tmp/counter_daily.log @@ -24,7 +26,8 @@ do d=$(date -I -d "$d + 1 day") done -cd /opt/counter-processor-0.0.1 -YEAR_MONTH=$YEAR_MONTH python3 main.py >>tmp/counter_daily.log +#run counter-processor as counter user + +sudo -u counter YEAR_MONTH=$YEAR_MONTH python3 main.py >>tmp/counter_daily.log curl -X POST "http://localhost:8080/api/admin/makeDataCount/addUsageMetricsFromSushiReport?reportOnDisk=/opt/counter-processor-0.0.1/tmp/make-data-count-report.json" diff --git a/conf/mdc/counter_weekly.sh b/conf/mdc/counter_weekly.sh index 5384e77a237..0f8825ba89e 100644 --- a/conf/mdc/counter_weekly.sh +++ b/conf/mdc/counter_weekly.sh @@ -1,4 +1,5 @@ #!/bin/sh +#counter_weekly.sh # This script iterates through all published Datasets in all Dataverses and calls the Make Data Count API to update their citations from DataCite # Note: Requires curl and jq for parsing JSON responses form curl From 197095364e54ca8f9adc88818fb95a993e1c7816 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 8 May 2020 17:50:45 -0400 Subject: [PATCH 04/37] also report # of citation counts found --- conf/mdc/counter_weekly.sh | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/conf/mdc/counter_weekly.sh b/conf/mdc/counter_weekly.sh index 0f8825ba89e..7b63567cf7c 100644 --- a/conf/mdc/counter_weekly.sh +++ b/conf/mdc/counter_weekly.sh @@ -6,6 +6,7 @@ # A recursive method to process each Dataverse processDV () { +echo "Processing Dataverse ID#: $1" #Call the Dataverse API to get the contents of the Dataverse (without credentials, this will only list published datasets and dataverses DVCONTENTS=$(curl -s http://localhost:8080/api/dataverses/$1/contents) @@ -17,27 +18,31 @@ for subds in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataset" DOI=`expr "$subds" : '.*:\/\/\doi\.org\/\(.*\)'` # Call the Dataverse API for this dataset and get the response -RESULT=$(curl -s -X POST "http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI" | jq -r '.status') +RESULT=$(curl -s -X POST "http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI" ) +# Parse the status and number of citations found from the response +STATUS=$(echo "$RESULT" | jq -j '.status' ) +CITATIONS=$(echo "$RESULT" | jq -j '.data.citationCount') # The status for a call that worked OK='OK' # Check the status and report -if [ "$RESULT" = "$OK" ]; then - echo "Updated citations for doi:$DOI" +if [ "$STATUS" = "$OK" ]; then + echo "Updated: $CITATIONS citations for doi:$DOI" else echo "Failed to update citations for doi:$DOI" - echo "Run curl -s -X POST 'http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI' to retry/see the error message" + echo "Run curl -s -X POST 'http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI ' to retry/see the error message" fi - +#processDV $subds done # Now iterate over any child Dataverses and recursively process them for subdv in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataverse") | .id'); do +echo $subdv processDV $subdv done + } # Call the function on the root dataverse to start processing processDV 1 - From f0772e2967aa4a2ab34a5833eb2b4fe1bbea9777 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 8 May 2020 18:09:56 -0400 
Subject: [PATCH 05/37] fix cut/pasted names --- .../makedatacount/DatasetExternalCitationsServiceBean.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java index 19e88f10df3..93571214efc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java @@ -61,10 +61,10 @@ public List parseCitations(JsonArray citations) { public DatasetExternalCitations save(DatasetExternalCitations datasetExternalCitations) { //Replace existing if necessary - Dataset testDs = datasetExternalCitations.getDataset(); - String testMonth = datasetExternalCitations.getCitedByUrl(); + Dataset dataset = datasetExternalCitations.getDataset(); + String citedByUrl = datasetExternalCitations.getCitedByUrl(); - DatasetExternalCitations getExisting = getDatasetExternalCitationsByDatasetCitingPID(testDs, testMonth); + DatasetExternalCitations getExisting = getDatasetExternalCitationsByDatasetCitingPID(dataset, citedByUrl); if (getExisting != null){ em.remove(getExisting); } From f35f8d7a4659ea8753bc8ec47159a990fdb05a1c Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 8 May 2020 18:10:07 -0400 Subject: [PATCH 06/37] add todo note --- .../edu/harvard/iq/dataverse/api/MakeDataCountApi.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java index fec7b0b3b7f..eea1ef8ba80 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java @@ -170,7 +170,13 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws Mal } while (nextPage == true); JsonArray allData = dataBuilder.build(); List datasetExternalCitations = datasetExternalCitationsService.parseCitations(allData); - + /* + * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopuate them. + * As is, this call doesn't remove old citations if there are now none (legacy issue if we decide to stop counting certain types of citation + * as we've done for 'hasPart'). + * If there are some, this call individually checks each one and if a matching item exists, it removes it and adds it back. Faster and better to delete all and + * add the new ones. 
+ */ if (!datasetExternalCitations.isEmpty()) { for (DatasetExternalCitations dm : datasetExternalCitations) { datasetExternalCitationsService.save(dm); From 2f0fbd9560b77176423f8ec43c6c58f20a554647 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 4 Jun 2020 17:52:46 -0400 Subject: [PATCH 07/37] Updates to script location, made paths configurable, added documentation --- .../admin/counter-processor-config.yaml | 2 +- .../source/_static/util}/counter_daily.sh | 19 +++++++++++-------- .../source/_static/util}/counter_weekly.sh | 0 .../source/admin/make-data-count.rst | 8 ++++++-- 4 files changed, 18 insertions(+), 11 deletions(-) rename {conf/mdc => doc/sphinx-guides/source/_static/util}/counter_daily.sh (55%) rename {conf/mdc => doc/sphinx-guides/source/_static/util}/counter_weekly.sh (100%) diff --git a/doc/sphinx-guides/source/_static/admin/counter-processor-config.yaml b/doc/sphinx-guides/source/_static/admin/counter-processor-config.yaml index c4025ce07fb..db9fc393e29 100644 --- a/doc/sphinx-guides/source/_static/admin/counter-processor-config.yaml +++ b/doc/sphinx-guides/source/_static/admin/counter-processor-config.yaml @@ -2,7 +2,7 @@ # 4-digit year and 2-digit month and day # /usr/local/payara5/glassfish/domains/domain1/logs/counter_2019-01-11.log #log_name_pattern: sample_logs/counter_(yyyy-mm-dd).log -log_name_pattern: /usr/local/payara5/glassfish/domains/domain1/logs/counter_(yyyy-mm-dd).log +log_name_pattern: /usr/local/payara5/glassfish/domains/domain1/logs/mdc/counter_(yyyy-mm-dd).log # path_types regular expressions allow matching to classify page urls as either an investigation or request # based on specific URL structure for your system. diff --git a/conf/mdc/counter_daily.sh b/doc/sphinx-guides/source/_static/util/counter_daily.sh similarity index 55% rename from conf/mdc/counter_daily.sh rename to doc/sphinx-guides/source/_static/util/counter_daily.sh index b9af62646e4..d595c920d5f 100644 --- a/conf/mdc/counter_daily.sh +++ b/doc/sphinx-guides/source/_static/util/counter_daily.sh @@ -1,12 +1,15 @@ #! /bin/bash +COUNTER_PROCESSOR_DIRECTORY="/opt/counter-processor-0.0.1" +MDC_LOG_DIRECTORY="/usr/local/payara5/glassfish/domains/domain1/logs/mdc" + # counter_daily.sh -cd /opt/counter-processor-0.0.1 +cd $COUNTER_PROCESSOR_DIRECTORY -echo >>tmp/counter_daily.log -date >>tmp/counter_daily.log -echo >>tmp/counter_daily.log +echo >>/tmp/counter_daily.log +date >>/tmp/counter_daily.log +echo >>/tmp/counter_daily.log # "You should run Counter Processor once a day to create reports in SUSHI (JSON) format that are saved to disk for Dataverse to process and that are sent to the DataCite hub." 
@@ -18,16 +21,16 @@ d=$(date -I -d "$YEAR_MONTH-01") #echo $d while [ "$(date -d "$d" +%Y%m%d)" -le "$(date -d "$LAST" +%Y%m%d)" ]; do - if [ -f "/srv/glassfish/dataverse/logs/mdc/counter_$d.log" ]; then + if [ -f "$MDC_LOG_DIRECTORY/counter_$d.log" ]; then # echo "Found counter_$d.log" else - touch "/srv/glassfish/dataverse/logs/mdc/counter_$d.log" + touch "$MDC_LOG_DIRECTORY/counter_$d.log" fi d=$(date -I -d "$d + 1 day") done #run counter-processor as counter user -sudo -u counter YEAR_MONTH=$YEAR_MONTH python3 main.py >>tmp/counter_daily.log +sudo -u counter YEAR_MONTH=$YEAR_MONTH python3 main.py >>/tmp/counter_daily.log -curl -X POST "http://localhost:8080/api/admin/makeDataCount/addUsageMetricsFromSushiReport?reportOnDisk=/opt/counter-processor-0.0.1/tmp/make-data-count-report.json" +curl -X POST "http://localhost:8080/api/admin/makeDataCount/addUsageMetricsFromSushiReport?reportOnDisk=/tmp/make-data-count-report.json" diff --git a/conf/mdc/counter_weekly.sh b/doc/sphinx-guides/source/_static/util/counter_weekly.sh similarity index 100% rename from conf/mdc/counter_weekly.sh rename to doc/sphinx-guides/source/_static/util/counter_weekly.sh diff --git a/doc/sphinx-guides/source/admin/make-data-count.rst b/doc/sphinx-guides/source/admin/make-data-count.rst index aa19ce5ce8d..65de26f9759 100644 --- a/doc/sphinx-guides/source/admin/make-data-count.rst +++ b/doc/sphinx-guides/source/admin/make-data-count.rst @@ -72,6 +72,8 @@ Enable or Disable Display of Make Data Count Metrics By default, when MDC logging is enabled (when ``:MDCLogPath`` is set), Dataverse will display MDC metrics instead of it's internal (legacy) metrics. You can avoid this (e.g. to collect MDC metrics for some period of time before starting to display them) by setting ``:DisplayMDCMetrics`` to false. +The following discussion assumes ``:MDCLogPath`` has been set to ``/usr/local/payara5/glassfish/domains/domain1/logs/mdc`` + Configure Counter Processor ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -92,7 +94,7 @@ Configure Counter Processor Populate Views and Downloads for the First Time ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Soon we will be setting up a cron job to run nightly but we start with a single successful configuration and run of Counter Processor and calls to Dataverse APIs. +Soon we will be setting up a cron job to run nightly but we start with a single successful configuration and manual run of Counter Processor and calls to Dataverse APIs. (The scripts discussed in the next section automate the steps described here, including creating empty log files if you're starting mid-month.) * Change to the directory where you installed Counter Processor. @@ -100,7 +102,7 @@ Soon we will be setting up a cron job to run nightly but we start with a single * If you are running Counter Processor for the first time in the middle of a month, you will need create blank log files for the previous days. e.g.: - * ``cd /usr/local/payara5/glassfish/domains/domain1/logs`` + * ``cd /usr/local/payara5/glassfish/domains/domain1/logs/mdc`` * ``touch counter_2019-02-01.log`` @@ -127,6 +129,8 @@ Populate Views and Downloads Nightly Running ``main.py`` to create the SUSHI JSON file and the subsequent calling of the Dataverse API to process it should be added as a cron job. +Dataverse provides example scripts that run the steps to process new accesses and uploads and update Dataverse's database (`counter_daily.sh`) and to retrieve citations for all Datasets from DataCite (`counter_weekly.sh`). 
These scripts should be configured for your environment and can be run manually or as cron jobs. + Sending Usage Metrics to the DataCite Hub ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From dbdb3329ebc4be912502041d02ae6e192570a4b0 Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Thu, 4 Jun 2020 21:11:27 -0400 Subject: [PATCH 08/37] adding release note --- doc/release-notes/6784-mdc-scripts.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/release-notes/6784-mdc-scripts.md diff --git a/doc/release-notes/6784-mdc-scripts.md b/doc/release-notes/6784-mdc-scripts.md new file mode 100644 index 00000000000..1d913d51c87 --- /dev/null +++ b/doc/release-notes/6784-mdc-scripts.md @@ -0,0 +1 @@ +In the "Notes for Dataverse Installation Administrators" we should mention the new scripts for MDC. \ No newline at end of file From 81457d555dd75deb698b723e8ccb5b1a931c23a5 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 5 Jun 2020 14:40:46 -0400 Subject: [PATCH 09/37] Change default CP dir per comment --- doc/sphinx-guides/source/_static/util/counter_daily.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/_static/util/counter_daily.sh b/doc/sphinx-guides/source/_static/util/counter_daily.sh index d595c920d5f..597ff0ac737 100644 --- a/doc/sphinx-guides/source/_static/util/counter_daily.sh +++ b/doc/sphinx-guides/source/_static/util/counter_daily.sh @@ -1,6 +1,6 @@ #! /bin/bash -COUNTER_PROCESSOR_DIRECTORY="/opt/counter-processor-0.0.1" +COUNTER_PROCESSOR_DIRECTORY="/usr/local/counter-processor-0.0.1" MDC_LOG_DIRECTORY="/usr/local/payara5/glassfish/domains/domain1/logs/mdc" # counter_daily.sh From bf6ca00a8d1f4d0e1e941c3ac7bd40dcc29c7a59 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 10:26:17 -0400 Subject: [PATCH 10/37] add google archiver and dependencies --- pom.xml | 16 +- .../GoogleCloudSubmitToArchiveCommand.java | 228 ++++++++++++++++++ 2 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java diff --git a/pom.xml b/pom.xml index 6c9fa99dbc9..c86e1f0d608 100644 --- a/pom.xml +++ b/pom.xml @@ -57,7 +57,7 @@ - @@ -127,6 +127,13 @@ httpclient ${httpcomponents.client.version} + + com.google.cloud + google-cloud-bom + 0.115.0-alpha + pom + import + org.testcontainers testcontainers-bom @@ -137,7 +144,7 @@ @@ -581,6 +588,11 @@ opennlp-tools 1.9.1 + + com.google.cloud + google-cloud-storage + 1.97.0 + diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java new file mode 100644 index 00000000000..cb729a9807a --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -0,0 +1,228 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.DOIDataCiteRegisterService; +import edu.harvard.iq.dataverse.DataCitation; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DatasetLock.Reason; +import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.authorization.users.ApiToken; +import edu.harvard.iq.dataverse.engine.command.Command; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import 
edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.workflow.step.Failure; +import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; + +import java.io.BufferedInputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.nio.charset.Charset; +import java.security.DigestInputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Map; +import java.util.logging.Logger; + +import org.apache.commons.codec.binary.Hex; +import com.google.auth.oauth2.ServiceAccountCredentials; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.Bucket; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; + +@RequiredPermissions(Permission.PublishDataset) +public class GoogleCloudSubmitToArchiveCommand extends AbstractSubmitToArchiveCommand implements Command { + + private static final Logger logger = Logger.getLogger(GoogleCloudSubmitToArchiveCommand.class.getName()); + private static final String GOOGLECLOUD_BUCKET = ":GoogleCloudBucket"; + private static final String GOOGLECLOUD_PROJECT = ":GoogleCloudProject"; + + public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { + super(aRequest, version); + } + + @Override + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + logger.fine("In GoogleCloudSubmitToArchiveCommand..."); + String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); + String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); + logger.fine("Project: " + projectName + " Bucket: " + bucketName); + if (bucketName != null && projectName != null) { + Storage storage; + try { + FileInputStream fis = new FileInputStream(System.getProperty("dataverse.files.directory") + System.getProperty("file.separator")+ "googlecloudkey.json"); + storage = StorageOptions.newBuilder() + .setCredentials(ServiceAccountCredentials.fromStream(fis)) + .setProjectId(projectName) + .build() + .getService(); + Bucket bucket = storage.get(bucketName); + + Dataset dataset = dv.getDataset(); + if (dataset.getLockFor(Reason.finalizePublication) == null) { + + String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') + .replace('.', '-').toLowerCase(); + + DataCitation dc = new DataCitation(dv); + Map metadata = dc.getDataCiteMetadata(); + String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject( + dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset()); + String blobIdString = null; + MessageDigest messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream dataciteIn = new PipedInputStream(); DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { + // Add datacite.xml file + + new Thread(new Runnable() { + public void run() { + try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { + + dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8"))); + dataciteOut.close(); + } catch (Exception e) { + logger.severe("Error creating datacite.xml: " + e.getMessage()); + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("Error creating datacite.xml: " + e.getMessage()); + } + } + }).start(); + //Have seen broken pipe in 
PostPublishDataset workflow without this delay + int i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } + Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber()+".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + String checksum = dcXml.getMd5ToHexString(); + logger.fine("Content: datacite.xml added with checksum: " + checksum); + String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + if (!checksum.equals(localchecksum)) { + logger.severe(checksum + " not equal to " + localchecksum); + return new Failure("Error in transferring DataCite.xml file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete metadata transfer"); + } + + // Store BagIt file + String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + + // Add BagIt ZIP file + // Google uses MD5 as one way to verify the + // transfer + messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream in = new PipedInputStream(100000); DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest);) { + Thread writeThread = new Thread(new Runnable() { + public void run() { + try (PipedOutputStream out = new PipedOutputStream(in)) { + // Generate bag + BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setAuthenticationKey(token.getTokenString()); + bagger.generateBag(out); + } catch (Exception e) { + logger.severe("Error creating bag: " + e.getMessage()); + // TODO Auto-generated catch block + e.printStackTrace(); + try { + digestInputStream2.close(); + } catch(Exception ex) { + logger.warning(ex.getLocalizedMessage()); + } + throw new RuntimeException("Error creating bag: " + e.getMessage()); + } + } + }); + writeThread.start(); + /* + * The following loop handles two issues. First, with no delay, the + * bucket.create() call below can get started before the piped streams are set + * up, causing a failure (seen when triggered in a PostPublishDataset workflow). + * A minimal initial wait, e.g. until some bytes are available, would address + * this. Second, the BagGenerator class, due to it's use of parallel streaming + * creation of the zip file, has the characteristic that it makes a few bytes + * available - from setting up the directory structure for the zip file - + * significantly earlier than it is ready to stream file content (e.g. for + * thousands of files and GB of content). If, for these large datasets, + * bucket.create() is called as soon as bytes are available, the call can + * timeout before the bytes for all the zipped files are available. To manage + * this, the loop waits until 90K bytes are available, larger than any expected + * dir structure for the zip and implying that the main zipped content is + * available, or until the thread terminates, with all of its content written to + * the pipe. (Note the PipedInputStream buffer is set at 100K above - I didn't + * want to test whether that means that exactly 100K bytes will be available() + * for large datasets or not, so the test below is at 90K.) + * + * An additional sanity check limits the wait to 2K seconds. The BagGenerator + * has been used to archive >120K files, 2K directories, and ~600GB files on the + * SEAD project (streaming content to disk rather than over an internet + * connection) which would take longer than 2K seconds (10+ hours) and might + * produce an initial set of bytes for directories > 90K. 
If Dataverse ever + * needs to support datasets of this size, the numbers here would need to be + * increased, and/or a change in how archives are sent to google (e.g. as + * multiple blobs that get aggregated) would be required. + */ + i=0; + while(digestInputStream2.available()<=90000 && i<2000 && writeThread.isAlive()) { + Thread.sleep(1000); + logger.fine("avail: " + digestInputStream2.available() + " : " + writeThread.getState().toString()); + i++; + } + logger.fine("Bag: transfer started, i=" + i + ", avail = " + digestInputStream2.available()); + if(i==2000) { + throw new IOException("Stream not available"); + } + Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", Bucket.BlobWriteOption.doesNotExist()); + if(bag.getSize()==0) { + throw new IOException("Empty Bag"); + } + blobIdString = bag.getBlobId().getBucket() + "/" + bag.getBlobId().getName(); + checksum = bag.getMd5ToHexString(); + logger.fine("Bag: " + fileName + " added with checksum: " + checksum); + localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); + if (!checksum.equals(localchecksum)) { + logger.severe(checksum + " not equal to " + localchecksum); + return new Failure("Error in transferring Zip file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete archive transfer"); + } + } catch (RuntimeException rte) { + logger.severe("Error creating Bag during GoogleCloud archiving: " + rte.getMessage()); + return new Failure("Error in generating Bag", + "GoogleCloud Submission Failure: archive file not created"); + } + + logger.fine("GoogleCloud Submission step: Content Transferred"); + + // Document the location of dataset archival copy location (actually the URL + // where you can + // view it as an admin) + + StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/"); + sb.append(blobIdString); + dv.setArchivalCopyLocation(sb.toString()); + } catch (RuntimeException rte) { + logger.severe("Error creating datacite xml file during GoogleCloud Archiving: " + rte.getMessage()); + return new Failure("Error in generating datacite.xml file", + "GoogleCloud Submission Failure: metadata file not created"); + } + } else { + logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister"); + return new Failure("Dataset locked"); + } + } catch (Exception e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); + return new Failure("GoogleCloud Submission Failure", + e.getLocalizedMessage() + ": check log for details"); + + } + return WorkflowStepResult.OK; + } else { + return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); + } + } + +} From ee08e9c2e2ec518248e5bb252ab7f7f1cf41876d Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 13:32:57 -0400 Subject: [PATCH 11/37] documentation --- .../source/installation/config.rst | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 34da299528f..c9b3c681877 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -779,7 +779,7 @@ Dataverse may be configured to submit a copy of published Datasets, packaged as Dataverse offers an internal archive workflow which may be configured as a PostPublication workflow via an admin API call to manually submit previously published 
Datasets and prior versions to a configured archive such as Chronopolis. The workflow creates a `JSON-LD `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse web interface. -At present, the DPNSubmitToArchiveCommand and LocalSubmitToArchiveCommand are the only implementations extending the AbstractSubmitToArchiveCommand and using the configurable mechanisms discussed below. +At present, the DPNSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchive are the only implementations extending the AbstractSubmitToArchiveCommand and using the configurable mechanisms discussed below. .. _Duracloud Configuration: @@ -831,6 +831,32 @@ ArchiverClassName - the fully qualified class to be used for archiving. For exam :BagItLocalPath is the file path that you've set in :ArchiverSettings. +.. _Google Cloud Configuration: + +Google Cloud Configuration +++++++++++++++++++++++++++ + +The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's could, including those in the 'Coldline' Storage class (cheaper, with slower access) + +``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"`` + +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":":GoogleCloudBucket, :GoogleCloudProject"`` + +The Google Cloud archiver defines two custom settings, both are required: + +\:GoogleCloudBucket - the name of the bucket to use: + +``curl http://localhost:8080/api/admin/settings/:GoogleCloudBucket -X PUT -d "qdr-archive"`` + +\:GoogleCloudProject - the name of the project managing the bucket: + +``curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"`` + +In addition, the Google Cloud Archiver requires that the googlecloudkey.json file for the project be placed in the 'dataverse.files.directory' directory. This file can be created in the Google Could Console. + +.. 
_Local Path Configuration: + + API Call ++++++++ From 5dfbae394f875edf2ce8c724f9e3bcce45b8f7e0 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 13:33:33 -0400 Subject: [PATCH 12/37] update DuraCloud archiver with enhancements from Google archiver --- .../impl/DuraCloudSubmitToArchiveCommand.java | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index 66e8770a641..468e99f24c1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -99,7 +99,12 @@ public void run() { } } }).start(); - + //Have seen Pipe Closed errors for other archivers when used as a workflow without this delay loop + int i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } String checksum = store.addContent(spaceName, "datacite.xml", digestInputStream, -1l, null, null, null); logger.fine("Content: datacite.xml added with checksum: " + checksum); @@ -133,7 +138,11 @@ public void run() { } } }).start(); - + i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null); logger.fine("Content: " + fileName + " added with checksum: " + checksum); @@ -174,6 +183,9 @@ public void run() { logger.severe(rte.getMessage()); return new Failure("Error in generating datacite.xml file", "DuraCloud Submission Failure: metadata file not created"); + } catch (InterruptedException e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); } } catch (ContentStoreException e) { logger.warning(e.getMessage()); From fed5e456bd6381e3758d637cf69b5aec00641843 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 13:37:14 -0400 Subject: [PATCH 13/37] typos --- doc/sphinx-guides/source/installation/config.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index c9b3c681877..a9a532888aa 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -827,7 +827,7 @@ ArchiverClassName - the fully qualified class to be used for archiving. For exam \:ArchiverSettings - the archiver class can access required settings including existing Dataverse settings and dynamically defined ones specific to the class. This setting is a comma-separated list of those settings. For example\: -``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath”`` +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath"`` :BagItLocalPath is the file path that you've set in :ArchiverSettings. @@ -836,7 +836,7 @@ ArchiverClassName - the fully qualified class to be used for archiving. 
For exam Google Cloud Configuration ++++++++++++++++++++++++++ -The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's could, including those in the 'Coldline' Storage class (cheaper, with slower access) +The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's cloud, including those in the 'Coldline' Storage class (cheaper, with slower access) ``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"`` @@ -852,7 +852,7 @@ The Google Cloud archiver defines two custom settings, both are required: ``curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"`` -In addition, the Google Cloud Archiver requires that the googlecloudkey.json file for the project be placed in the 'dataverse.files.directory' directory. This file can be created in the Google Could Console. +In addition, the Google Cloud Archiver requires that the googlecloudkey.json file for the project be placed in the 'dataverse.files.directory' directory. This file can be created in the Google Cloud Console. .. _Local Path Configuration: From 4d1b4b00bc90389947192340ff9a8a226a0b457e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 13:39:08 -0400 Subject: [PATCH 14/37] capitalization --- doc/sphinx-guides/source/installation/config.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index a9a532888aa..826f3472ab3 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -836,13 +836,13 @@ ArchiverClassName - the fully qualified class to be used for archiving. For exam Google Cloud Configuration ++++++++++++++++++++++++++ -The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's cloud, including those in the 'Coldline' Storage class (cheaper, with slower access) +The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's cloud, including those in the 'Coldline' storage class (cheaper, with slower access) ``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"`` ``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":":GoogleCloudBucket, :GoogleCloudProject"`` -The Google Cloud archiver defines two custom settings, both are required: +The Google Cloud Archiver defines two custom settings, both are required: \:GoogleCloudBucket - the name of the bucket to use: From 3eb0ebcab2facd9d79c40821f2903b3ae1c72155 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 13:40:08 -0400 Subject: [PATCH 15/37] for example --- doc/sphinx-guides/source/installation/config.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 826f3472ab3..8aadeaf0601 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -844,11 +844,11 @@ The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's cloud, The Google Cloud Archiver defines two custom settings, both are required: -\:GoogleCloudBucket - the name of the bucket to use: +\:GoogleCloudBucket - the name of the bucket to use. 
For example: ``curl http://localhost:8080/api/admin/settings/:GoogleCloudBucket -X PUT -d "qdr-archive"`` -\:GoogleCloudProject - the name of the project managing the bucket: +\:GoogleCloudProject - the name of the project managing the bucket. For example: ``curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"`` From 3e5eecfae790dce775b194e5ca34ca0e61cbef50 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 15:47:05 -0400 Subject: [PATCH 16/37] adding settings to master list --- .../source/installation/config.rst | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 8aadeaf0601..d44690a6eaf 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -772,6 +772,8 @@ For Google Analytics, the example script at :download:`analytics-code.html ` above for details about this and for further explanation of the other archiving related settings below). +This setting specifies which storage system to use by identifying the particular Java class that should be run. Current options include DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand. + +``curl -X PUT -d 'LocalSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName`` + + +:ArchiverSettings + +Each Archiver class may have it's own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-spearate list of setting names. +For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting. To allow the class to use that setting, this setting must set as: + +``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` + +:DuraCloudHost +:DuraCloudPort +:DuraCloudContext + +These three settings define the host, port, and context used by the DuraCloudSubmitToArchiveCommand. :DuraCloudHost is required. The other settings have default values as noted in the :ref:`Duracloud Configuration ` section above. + +:BagItLocalPath + +This is the local file system path to be used with the LocalSubmitToArchiveCommand class. It is recommended to use an absolute path. See the :ref:`Local Path Configuration ` section above. + +:GoogleCloudBucket +:GoogleCloudProject + +These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration ` section above. 
\ No newline at end of file From ede2a221389839a4e6f70bcdacf6b4b58f45205e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 15:48:54 -0400 Subject: [PATCH 17/37] add formatting --- doc/sphinx-guides/source/installation/config.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index d44690a6eaf..d58e23a3c99 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2157,8 +2157,8 @@ This setting specifies which storage system to use by identifying the particular ``curl -X PUT -d 'LocalSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName`` - :ArchiverSettings ++++++++++++++++++ Each Archiver class may have it's own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-spearate list of setting names. For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting. To allow the class to use that setting, this setting must set as: @@ -2166,16 +2166,22 @@ For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setti ``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` :DuraCloudHost +++++++++++++++ :DuraCloudPort +++++++++++++++ :DuraCloudContext ++++++++++++++++++ These three settings define the host, port, and context used by the DuraCloudSubmitToArchiveCommand. :DuraCloudHost is required. The other settings have default values as noted in the :ref:`Duracloud Configuration ` section above. :BagItLocalPath ++++++++++++++++ This is the local file system path to be used with the LocalSubmitToArchiveCommand class. It is recommended to use an absolute path. See the :ref:`Local Path Configuration ` section above. -:GoogleCloudBucket +:GoogleCloudBucket +++++++++++++++++++ :GoogleCloudProject ++++++++++++++++++++ These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration ` section above. \ No newline at end of file From a33b9b167a1c0b825966a55515388dcfbfdbd9dc Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Fri, 2 Oct 2020 15:54:52 -0400 Subject: [PATCH 18/37] simplify links --- doc/sphinx-guides/source/installation/config.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index d58e23a3c99..a7268ad34ec 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2152,7 +2152,7 @@ To enable redirects to the zipper on a different server: :ArchiverClassName ++++++++++++++++++ -Dataverse can export archival "Bag' files to an extensible set of storage systems (see the :ref:`BagIt Export ` above for details about this and for further explanation of the other archiving related settings below). +Dataverse can export archival "Bag' files to an extensible set of storage systems (see :ref:`BagIt Export` above for details about this and for further explanation of the other archiving related settings below). This setting specifies which storage system to use by identifying the particular Java class that should be run. 
Current options include DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand. ``curl -X PUT -d 'LocalSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName`` @@ -2172,16 +2172,16 @@ For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setti :DuraCloudContext +++++++++++++++++ -These three settings define the host, port, and context used by the DuraCloudSubmitToArchiveCommand. :DuraCloudHost is required. The other settings have default values as noted in the :ref:`Duracloud Configuration ` section above. +These three settings define the host, port, and context used by the DuraCloudSubmitToArchiveCommand. :DuraCloudHost is required. The other settings have default values as noted in the :ref:`Duracloud Configuration` section above. :BagItLocalPath +++++++++++++++ -This is the local file system path to be used with the LocalSubmitToArchiveCommand class. It is recommended to use an absolute path. See the :ref:`Local Path Configuration ` section above. +This is the local file system path to be used with the LocalSubmitToArchiveCommand class. It is recommended to use an absolute path. See the :ref:`Local Path Configuration` section above. :GoogleCloudBucket ++++++++++++++++++ :GoogleCloudProject +++++++++++++++++++ -These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration ` section above. \ No newline at end of file +These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration` section above. \ No newline at end of file From 4aeccfdf861cb3fb4e5710e8248a3ad5fe658ccb Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Fri, 2 Oct 2020 16:07:47 -0400 Subject: [PATCH 19/37] add release notes --- doc/release-notes/7140-google-cloud.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 doc/release-notes/7140-google-cloud.md diff --git a/doc/release-notes/7140-google-cloud.md b/doc/release-notes/7140-google-cloud.md new file mode 100644 index 00000000000..62aef73acd0 --- /dev/null +++ b/doc/release-notes/7140-google-cloud.md @@ -0,0 +1,12 @@ +## Google Cloud Archiver + +Dataverse Bags can now be sent to a bucket in Google Cloud, including those in the 'Coldline' storage class, which provide less expensive but slower access. + +## Use Cases + +- As an Administrator I can set up a regular export to Google Cloud so that my users' data is preserved. + +## New Settings + +:GoogleCloudProject - the name of the project managing the bucket. 
+:GoogleCloudBucket - the name of the bucket to use \ No newline at end of file From c257a1ea5e2960459bf5bd9325994afee55f7a7c Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Mon, 5 Oct 2020 13:29:09 -0400 Subject: [PATCH 20/37] Update doc/sphinx-guides/source/installation/config.rst Co-authored-by: Philip Durbin --- doc/sphinx-guides/source/installation/config.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index a7268ad34ec..414796aa7ca 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2160,7 +2160,7 @@ This setting specifies which storage system to use by identifying the particular :ArchiverSettings +++++++++++++++++ -Each Archiver class may have it's own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-spearate list of setting names. +Each Archiver class may have its own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-separated list of setting names. For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting. To allow the class to use that setting, this setting must set as: ``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` @@ -2184,4 +2184,4 @@ This is the local file system path to be used with the LocalSubmitToArchiveComma :GoogleCloudProject +++++++++++++++++++ -These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration` section above. \ No newline at end of file +These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration` section above. From 57e6b426f48d1a0952694a451391d51fc8e85af6 Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Mon, 5 Oct 2020 13:29:28 -0400 Subject: [PATCH 21/37] Update doc/sphinx-guides/source/installation/config.rst Co-authored-by: Philip Durbin --- doc/sphinx-guides/source/installation/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 414796aa7ca..a984bcfa0f4 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2152,7 +2152,7 @@ To enable redirects to the zipper on a different server: :ArchiverClassName ++++++++++++++++++ -Dataverse can export archival "Bag' files to an extensible set of storage systems (see :ref:`BagIt Export` above for details about this and for further explanation of the other archiving related settings below). +Dataverse can export archival "Bag" files to an extensible set of storage systems (see :ref:`BagIt Export` above for details about this and for further explanation of the other archiving related settings below). This setting specifies which storage system to use by identifying the particular Java class that should be run. Current options include DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand. 
``curl -X PUT -d 'LocalSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName`` From babb316d729817421c93421d8e5edda29e2dac90 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 5 Oct 2020 13:47:41 -0400 Subject: [PATCH 22/37] typo fixes --- doc/sphinx-guides/source/installation/config.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index a7268ad34ec..a984bcfa0f4 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2152,7 +2152,7 @@ To enable redirects to the zipper on a different server: :ArchiverClassName ++++++++++++++++++ -Dataverse can export archival "Bag' files to an extensible set of storage systems (see :ref:`BagIt Export` above for details about this and for further explanation of the other archiving related settings below). +Dataverse can export archival "Bag" files to an extensible set of storage systems (see :ref:`BagIt Export` above for details about this and for further explanation of the other archiving related settings below). This setting specifies which storage system to use by identifying the particular Java class that should be run. Current options include DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand. ``curl -X PUT -d 'LocalSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName`` @@ -2160,7 +2160,7 @@ This setting specifies which storage system to use by identifying the particular :ArchiverSettings +++++++++++++++++ -Each Archiver class may have it's own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-spearate list of setting names. +Each Archiver class may have its own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-separated list of setting names. For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting. To allow the class to use that setting, this setting must set as: ``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` @@ -2184,4 +2184,4 @@ This is the local file system path to be used with the LocalSubmitToArchiveComma :GoogleCloudProject +++++++++++++++++++ -These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration` section above. \ No newline at end of file +These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration` section above. 
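A minimal sketch of how the archiver settings documented in the patches above fit together, assuming the Dataverse admin settings API is reachable at localhost:8080; the bucket and project names are placeholders, not values taken from the patches:

#!/bin/bash
# Sketch: apply the Google Cloud archiver settings documented above via the admin settings API.
API=http://localhost:8080/api/admin/settings

# Choose the archiver implementation (class name as documented above).
curl -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand" "$API/:ArchiverClassName"

# List the custom settings the archiver class is allowed to read.
curl -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject" "$API/:ArchiverSettings"

# Placeholder bucket and project names; substitute real values for your installation.
curl -X PUT -d "my-archive-bucket" "$API/:GoogleCloudBucket"
curl -X PUT -d "my-gcp-project" "$API/:GoogleCloudProject"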
From 700382110ddc1b22c0906e3a0eda0ec4f146861c Mon Sep 17 00:00:00 2001 From: "don.sizemore" Date: Tue, 6 Oct 2020 10:51:37 -0400 Subject: [PATCH 23/37] #21 bump Apache Tika to 1.24.1 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a115e67fb19..e454b4c169b 100644 --- a/pom.xml +++ b/pom.xml @@ -573,7 +573,7 @@ org.apache.tika tika-parsers - 1.22 + 1.24.1 From b6db8c50e04ad477672ab754e4b56e80c77afec6 Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Wed, 7 Oct 2020 17:13:20 -0400 Subject: [PATCH 24/37] removing unused dependency --- pom.xml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pom.xml b/pom.xml index a115e67fb19..596a0d5ffae 100644 --- a/pom.xml +++ b/pom.xml @@ -440,11 +440,6 @@ slf4j-log4j12 1.6.1 - - axis - axis - 1.4 - io.searchbox jest From 342277fe3dfb24fdcb629e7c0a5c298a61c0235e Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 8 Oct 2020 14:49:41 -0400 Subject: [PATCH 25/37] filter directionally and add supplemented pair --- .../DatasetExternalCitationsServiceBean.java | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java index 965802332c3..957378c11ad 100644 --- a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java @@ -36,29 +36,48 @@ public class DatasetExternalCitationsServiceBean implements java.io.Serializable DatasetServiceBean datasetService; //Array of relationship types that are considered to be citations - static ArrayList relationships = new ArrayList( + static ArrayList inboundRelationships = new ArrayList( Arrays.asList( - "is-cited-by", "cites", + "references", + "supplements")); + static ArrayList outboundRelationships = new ArrayList( + Arrays.asList( + "is-cited-by", "is-referenced-by", - "references")); + "is-supplemented-by")); + public List parseCitations(JsonArray citations) { List datasetExternalCitations = new ArrayList<>(); for (JsonValue citationValue : citations) { DatasetExternalCitations exCit = new DatasetExternalCitations(); JsonObject citation = (JsonObject) citationValue; - exCit.setCitedByUrl(citation.getJsonObject("attributes").getString("subj-id")); - - String localDatasetDOI = citation.getJsonObject("attributes").getString("obj-id"); + String subjectUri = citation.getJsonObject("attributes").getString("subj-id"); + + String objectUri = citation.getJsonObject("attributes").getString("obj-id"); String relationship = citation.getJsonObject("attributes").getString("relation-type-id"); - if (relationships.contains(relationship)) { + if (inboundRelationships.contains(relationship)) { Dataset localDs = null; - if (localDatasetDOI.contains("doi")) { - String globalId = localDatasetDOI.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); + if (objectUri.contains("doi")) { + String globalId = objectUri.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); localDs = datasetService.findByGlobalId(globalId); exCit.setDataset(localDs); } - + exCit.setCitedByUrl(subjectUri); + + if (localDs != null && !exCit.getCitedByUrl().isEmpty()) { + datasetExternalCitations.add(exCit); + } + } + if (outboundRelationships.contains(relationship)) { + Dataset localDs = 
null; + if (subjectUri.contains("doi")) { + String globalId = subjectUri.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); + localDs = datasetService.findByGlobalId(globalId); + exCit.setDataset(localDs); + } + exCit.setCitedByUrl(objectUri); + if (localDs != null && !exCit.getCitedByUrl().isEmpty()) { datasetExternalCitations.add(exCit); } From ab29f15fa9b34b0806ad79f463a3f4339fed9d72 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 8 Oct 2020 15:27:29 -0400 Subject: [PATCH 26/37] typo --- doc/sphinx-guides/source/installation/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 7733e8fc7d9..47cb2c797c9 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -846,7 +846,7 @@ The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's cloud, ``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"`` -``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":":GoogleCloudBucket, :GoogleCloudProject"`` +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject"`` The Google Cloud Archiver defines two custom settings, both are required: From af6dc98af51abe8b241dc00d4417e89ec4248e64 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 8 Oct 2020 16:22:10 -0400 Subject: [PATCH 27/37] expanded directions --- doc/sphinx-guides/source/installation/config.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 47cb2c797c9..677ac3f90f3 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -848,7 +848,9 @@ The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's cloud, ``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject"`` -The Google Cloud Archiver defines two custom settings, both are required: +The Google Cloud Archiver defines two custom settings, both are required. The credentials for your account, in the form of a json key file, must also be obtained and stored locally (see below): + +In order to use the Google Cloud Archiver, you must have a Google account. You will need to create a project and bucket within that account and provide those values in the settings: \:GoogleCloudBucket - the name of the bucket to use. For example: @@ -858,7 +860,11 @@ The Google Cloud Archiver defines two custom settings, both are required: ``curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"`` -In addition, the Google Cloud Archiver requires that the googlecloudkey.json file for the project be placed in the 'dataverse.files.directory' directory. This file can be created in the Google Cloud Console. +The Google Cloud Archiver also requires a key file that must be renamed to 'googlecloudkey.json' file and placed in the directory identified by your 'dataverse.files.directory' jvm option. This file can be created in the Google Cloud Console. 
(One method: Navigate to your Project 'Settings'/'Service Accounts', create an account, give this account the 'Cloud Storage'/'Storage Admin' role, and once it's created, use the 'Actions' menu to 'Create Key', selecting the 'JSON' format option. Use this as the 'googlecloudkey.json' file.) + +For example: + +``cp /usr/local/payara5/glassfish/domains/domain1/files/googlecloudkey.json`` .. _Local Path Configuration: From 3484bb6b6a2cdd08a2e361d8d31de9dc56840939 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 8 Oct 2020 16:25:37 -0400 Subject: [PATCH 28/37] typos --- doc/sphinx-guides/source/installation/config.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 677ac3f90f3..de8fbad3687 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -860,14 +860,13 @@ In order to use the Google Cloud Archiver, you must have a Google account. You w ``curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"`` -The Google Cloud Archiver also requires a key file that must be renamed to 'googlecloudkey.json' file and placed in the directory identified by your 'dataverse.files.directory' jvm option. This file can be created in the Google Cloud Console. (One method: Navigate to your Project 'Settings'/'Service Accounts', create an account, give this account the 'Cloud Storage'/'Storage Admin' role, and once it's created, use the 'Actions' menu to 'Create Key', selecting the 'JSON' format option. Use this as the 'googlecloudkey.json' file.) +The Google Cloud Archiver also requires a key file that must be renamed to 'googlecloudkey.json' and placed in the directory identified by your 'dataverse.files.directory' jvm option. This file can be created in the Google Cloud Console. (One method: Navigate to your Project 'Settings'/'Service Accounts', create an account, give this account the 'Cloud Storage'/'Storage Admin' role, and once it's created, use the 'Actions' menu to 'Create Key', selecting the 'JSON' format option. Use this as the 'googlecloudkey.json' file.) For example: ``cp /usr/local/payara5/glassfish/domains/domain1/files/googlecloudkey.json`` -.. _Local Path Configuration: - +.. _Archiving API Call: API Call ++++++++ From 45172469e2f7042bc2183957b876452eacee0c04 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 9 Oct 2020 10:40:18 -0400 Subject: [PATCH 29/37] persist debug=true in session, toggle with query parameter #7273 --- .../harvard/iq/dataverse/DataverseSession.java | 18 +++++++++++++++++- .../search/SearchIncludeFragment.java | 12 +++++++----- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java b/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java index e15badc994b..9e4bfb1b110 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java @@ -49,6 +49,14 @@ public class DataverseSession implements Serializable{ private static final Logger logger = Logger.getLogger(DataverseSession.class.getCanonicalName()); private boolean statusDismissed = false; + + /** + * If debug is set to true, some pages show extra debugging information. + * + * The way to set the boolean to true is to pass debug=true as a query + * parameter. The boolean will remain true until debug=false is passed. 
+ */ + private boolean debug; public User getUser() { if ( user == null ) { @@ -82,7 +90,15 @@ public boolean isStatusDismissed() { public void setStatusDismissed(boolean status) { statusDismissed = status; //MAD: Set to true to enable code! } - + + public boolean isDebug() { + return debug; + } + + public void setDebug(boolean debug) { + this.debug = debug; + } + public StaticPermissionQuery on( Dataverse d ) { return permissionsService.userOn(user, d); } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java index f5430ae32bb..464cdd05773 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java @@ -130,7 +130,7 @@ public class SearchIncludeFragment implements java.io.Serializable { Map staticSolrFieldFriendlyNamesBySolrField = new HashMap<>(); private boolean solrIsDown = false; private Map numberOfFacets = new HashMap<>(); - private boolean debug = false; + private Boolean debug; // private boolean showUnpublished; List filterQueriesDebug = new ArrayList<>(); // private Map friendlyName = new HashMap<>(); @@ -1017,13 +1017,15 @@ public void setRootDv(boolean rootDv) { this.rootDv = rootDv; } - public boolean isDebug() { - return (debug && session.getUser().isSuperuser()) + public Boolean getDebug() { + return (session.isDebug() && session.getUser().isSuperuser()) || settingsWrapper.isTrueForKey(":Debug", false); } - public void setDebug(boolean debug) { - this.debug = debug; + public void setDebug(Boolean debug) { + if (debug != null) { + session.setDebug(debug); + } } public List getFilterQueriesDebug() { From 2743f30580da5b121e392ec54c4bc4e03b4efded Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Oct 2020 17:33:50 +0000 Subject: [PATCH 30/37] Bump junit from 4.12 to 4.13.1 Bumps [junit](https://github.com/junit-team/junit4) from 4.12 to 4.13.1. - [Release notes](https://github.com/junit-team/junit4/releases) - [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.12.md) - [Commits](https://github.com/junit-team/junit4/compare/r4.12...r4.13.1) Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index f95f97af112..e1cfdf22740 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,7 @@ 1.11.762 1.2 4.5.5 - 4.12 + 4.13.1 5.5.2 ${junit.jupiter.version} 1.13.0 From a4582f05e18a95f744c0db39b26afed4d1ed45bb Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 14 Oct 2020 13:45:57 -0400 Subject: [PATCH 31/37] consolidate logic, remove :Debug database setting #7273 --- .../java/edu/harvard/iq/dataverse/DataverseSession.java | 7 ++++++- .../harvard/iq/dataverse/search/SearchIncludeFragment.java | 3 +-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java b/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java index 9e4bfb1b110..41c6db6f02d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java @@ -51,7 +51,8 @@ public class DataverseSession implements Serializable{ private boolean statusDismissed = false; /** - * If debug is set to true, some pages show extra debugging information. 
+ * If debug is set to true, some pages show extra debugging information to + * superusers. * * The way to set the boolean to true is to pass debug=true as a query * parameter. The boolean will remain true until debug=false is passed. @@ -92,6 +93,10 @@ public void setStatusDismissed(boolean status) { } public boolean isDebug() { + // Only superusers get extra debugging information. + if (!getUser().isSuperuser()) { + return false; + } return debug; } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java index 464cdd05773..541ab79c36d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java @@ -1018,8 +1018,7 @@ public void setRootDv(boolean rootDv) { } public Boolean getDebug() { - return (session.isDebug() && session.getUser().isSuperuser()) - || settingsWrapper.isTrueForKey(":Debug", false); + return session.isDebug(); } public void setDebug(Boolean debug) { From ce78897f7c630dca5261f5b9bba42a9b3537db0e Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 14 Oct 2020 16:09:32 -0400 Subject: [PATCH 32/37] switch to getters and setters in session bean #7273 --- .../iq/dataverse/DataverseSession.java | 20 +++++++++++++------ .../search/SearchIncludeFragment.java | 11 ---------- src/main/webapp/dataverse.xhtml | 2 +- src/main/webapp/search-include-fragment.xhtml | 10 +++++----- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java b/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java index 41c6db6f02d..d8d73ceaf3e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataverseSession.java @@ -54,10 +54,15 @@ public class DataverseSession implements Serializable{ * If debug is set to true, some pages show extra debugging information to * superusers. * - * The way to set the boolean to true is to pass debug=true as a query - * parameter. The boolean will remain true until debug=false is passed. + * The way to set the Boolean to true is to pass debug=true as a query + * parameter. The Boolean will remain true (even if nothing is passed to it) + * until debug=false is passed. + * + * Because a boolean is false by default when it comes from a viewParam we + * use a Boolean instead. That way, if the debug viewParam is null, we can + * leave the state alone (see setDebug()). */ - private boolean debug; + private Boolean debug; public User getUser() { if ( user == null ) { @@ -92,7 +97,7 @@ public void setStatusDismissed(boolean status) { statusDismissed = status; //MAD: Set to true to enable code! } - public boolean isDebug() { + public Boolean getDebug() { // Only superusers get extra debugging information. if (!getUser().isSuperuser()) { return false; @@ -100,8 +105,11 @@ public boolean isDebug() { return debug; } - public void setDebug(boolean debug) { - this.debug = debug; + public void setDebug(Boolean debug) { + // Leave the debug state alone if nothing is passed. 
+ if (debug != null) { + this.debug = debug; + } } public StaticPermissionQuery on( Dataverse d ) { diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java index 541ab79c36d..ac70072b7bb 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchIncludeFragment.java @@ -130,7 +130,6 @@ public class SearchIncludeFragment implements java.io.Serializable { Map staticSolrFieldFriendlyNamesBySolrField = new HashMap<>(); private boolean solrIsDown = false; private Map numberOfFacets = new HashMap<>(); - private Boolean debug; // private boolean showUnpublished; List filterQueriesDebug = new ArrayList<>(); // private Map friendlyName = new HashMap<>(); @@ -1017,16 +1016,6 @@ public void setRootDv(boolean rootDv) { this.rootDv = rootDv; } - public Boolean getDebug() { - return session.isDebug(); - } - - public void setDebug(Boolean debug) { - if (debug != null) { - session.setDebug(debug); - } - } - public List getFilterQueriesDebug() { return filterQueriesDebug; } diff --git a/src/main/webapp/dataverse.xhtml b/src/main/webapp/dataverse.xhtml index 6d73b95b949..fbbe7563baf 100644 --- a/src/main/webapp/dataverse.xhtml +++ b/src/main/webapp/dataverse.xhtml @@ -44,7 +44,7 @@ - + diff --git a/src/main/webapp/search-include-fragment.xhtml b/src/main/webapp/search-include-fragment.xhtml index f3a4220bed6..3260495c4e7 100644 --- a/src/main/webapp/search-include-fragment.xhtml +++ b/src/main/webapp/search-include-fragment.xhtml @@ -232,7 +232,7 @@
-
+

@@ -448,7 +448,7 @@ - + @@ -461,7 +461,7 @@ - + @@ -519,7 +519,7 @@ - + @@ -586,7 +586,7 @@ - + From af06ae3fa856490d6661a665e16f55803fecff9a Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 14 Oct 2020 20:11:14 -0400 Subject: [PATCH 33/37] fix for handling a folder in a shapefile archive that's not explicitly listed in the zip directory. (#7331) --- .../edu/harvard/iq/dataverse/util/ShapefileHandler.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java index d5e40dffaf3..3af562882f3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java @@ -335,7 +335,14 @@ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File String unzipFilePath = unzipFileName; if (unzipFolderName != null) { - unzipFilePath = unzipFolderName + "/" + unzipFileName; + unzipFilePath = unzipFolderName + "/" + unzipFileName; + + // There's a chance we haven't created this folder yet + // in the destination directory (this happens if the folder + // is not explicitly listed in the Zip archive directory). + String dirpath = target_directory.getAbsolutePath() + "/" + unzipFolderName; + // (and if it already exists, it'll be skipped) + createDirectory(dirpath); } if (unzipFileName==null){ From 049a2d2fb6a6cccf0515948983e4f5d786ab9c4b Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 15 Oct 2020 12:13:22 -0400 Subject: [PATCH 34/37] giving an extra couple of seconds to the ingest in a test that's occasionally failing. --- src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java index 908beeac941..dde79574b87 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java @@ -350,7 +350,7 @@ public void downloadAllFilesTabular() throws IOException { .body("data.files[0].label", equalTo("50by1000.dta")); // UtilIT.MAXIMUM_INGEST_LOCK_DURATION is 3 but not long enough. - assertTrue("Failed test if Ingest Lock exceeds max duration " + pathToFile, UtilIT.sleepForLock(datasetId.longValue(), "Ingest", apiToken, 4)); + assertTrue("Failed test if Ingest Lock exceeds max duration " + pathToFile, UtilIT.sleepForLock(datasetId.longValue(), "Ingest", apiToken, UtilIT.MAXIMUM_INGEST_LOCK_DURATION + 3)); Response downloadFiles1 = UtilIT.downloadFiles(datasetPid, apiToken); downloadFiles1.then().assertThat() From 0c4fccc45b542aac2173ca46551bd1fe6da1d31c Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 15 Oct 2020 17:22:39 -0400 Subject: [PATCH 35/37] A quick fix for the optimized datafile info retrieval method issue; this was as a result of the investigation into #7310 - ldjson export failure inside dataset page. 
--- .../edu/harvard/iq/dataverse/DataFileServiceBean.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java index 65d26d2eb63..a9a16e60ae2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @@ -578,7 +578,7 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion int i = 0; - List dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT, t0.ORIGINALFILESIZE FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList(); + List dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT, t0.ORIGINALFILESIZE, t0.ORIGINALFILENAME FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList(); for (Object[] result : dataTableResults) { DataTable dataTable = new DataTable(); @@ -596,6 +596,8 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion dataTable.setOriginalFileSize((Long)result[6]); + dataTable.setOriginalFileName((String)result[7]); + dataTables.add(dataTable); datatableMap.put(fileId, i++); @@ -856,8 +858,10 @@ private List retrieveFileMetadataForVersion(Dataset dataset, Datas fileMetadata.setDatasetVersion(version); - //fileMetadata.setDataFile(dataset.getFiles().get(file_list_id)); + // Link the FileMetadata object to the DataFile: fileMetadata.setDataFile(dataFiles.get(file_list_id)); + // ... 
and the DataFile back to the FileMetadata: + fileMetadata.getDataFile().getFileMetadatas().add(fileMetadata); String description = (String) result[2]; From 8f8ff854754831296c6656f5332e56c3f034a215 Mon Sep 17 00:00:00 2001 From: Stephen Kraffmiller Date: Tue, 20 Oct 2020 13:58:05 -0400 Subject: [PATCH 36/37] #7097 move dv indexing to on success --- .../FinalizeDatasetPublicationCommand.java | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index ce407d8986b..7ab83a27746 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -32,6 +32,7 @@ import edu.harvard.iq.dataverse.batch.util.LoggingUtil; import edu.harvard.iq.dataverse.engine.command.Command; import edu.harvard.iq.dataverse.util.FileUtil; +import java.util.ArrayList; import java.util.concurrent.Future; import org.apache.solr.client.solrj.SolrServerException; @@ -52,6 +53,8 @@ public class FinalizeDatasetPublicationCommand extends AbstractPublishDatasetCom */ final boolean datasetExternallyReleased; + List dataversesToIndex = new ArrayList<>(); + public static final String FILE_VALIDATION_ERROR = "FILE VALIDATION ERROR"; public FinalizeDatasetPublicationCommand(Dataset aDataset, DataverseRequest aRequest) { @@ -224,6 +227,20 @@ public boolean onSuccess(CommandContext ctxt, Object r) { LoggingUtil.writeOnSuccessFailureLog(this, failureLogText, dataset); retVal = false; } + + //re-indexing dataverses that have additional subjects + if (!dataversesToIndex.isEmpty()){ + for (Dataverse dv : dataversesToIndex) { + try { + Future indexString = ctxt.index().indexDataverse(dv); + } catch (IOException | SolrServerException e) { + String failureLogText = "Post-publication indexing failed. You can kick off a re-index of this dataverse with: \r\n curl http://localhost:8080/api/admin/index/dataverses/" + dv.getId().toString(); + failureLogText += "\r\n" + e.getLocalizedMessage(); + LoggingUtil.writeOnSuccessFailureLog(this, failureLogText, dataset); + retVal = false; + } + } + } exportMetadata(dataset, ctxt.settings()); @@ -255,13 +272,13 @@ private void exportMetadata(Dataset dataset, SettingsServiceBean settingsService * add the dataset subjects to all parent dataverses. 
*/ private void updateParentDataversesSubjectsField(Dataset savedDataset, CommandContext ctxt) throws SolrServerException, IOException { + for (DatasetField dsf : savedDataset.getLatestVersion().getDatasetFields()) { if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.subject)) { Dataverse dv = savedDataset.getOwner(); while (dv != null) { boolean newSubjectsAdded = false; - for (ControlledVocabularyValue cvv : dsf.getControlledVocabularyValues()) { - + for (ControlledVocabularyValue cvv : dsf.getControlledVocabularyValues()) { if (!dv.getDataverseSubjects().contains(cvv)) { logger.fine("dv "+dv.getAlias()+" does not have subject "+cvv.getStrValue()); newSubjectsAdded = true; @@ -271,10 +288,11 @@ private void updateParentDataversesSubjectsField(Dataset savedDataset, CommandCo } } if (newSubjectsAdded) { - logger.fine("new dataverse subjects added - saving and reindexing"); + logger.fine("new dataverse subjects added - saving and reindexing in OnSuccess"); Dataverse dvWithSubjectJustAdded = ctxt.em().merge(dv); ctxt.em().flush(); - ctxt.index().indexDataverse(dvWithSubjectJustAdded); // need to reindex to capture the new subjects + //adding dv to list of those we need to re-index for new subjects + dataversesToIndex.add(dvWithSubjectJustAdded); } else { logger.fine("no new subjects added to the dataverse; skipping reindexing"); } From 4bb0f6ebbf3b886c6b2063eb6e32f220ec2253b0 Mon Sep 17 00:00:00 2001 From: landreev Date: Tue, 20 Oct 2020 16:01:19 -0400 Subject: [PATCH 37/37] bumping up jupiter version, to resolve conflict --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e1cfdf22740..cec5551096f 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ 1.2 4.5.5 4.13.1 - 5.5.2 + 5.7.0 ${junit.jupiter.version} 1.13.0 2.28.2
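
If the deferred dataverse re-indexing introduced by the "#7097 move dv indexing to on success" patch fails, the command writes a failure log naming the manual recovery call. A minimal sketch of that recovery, assuming the instance runs on localhost:8080 and using a placeholder dataverse id of 5:

# Re-run indexing for a dataverse whose post-publication indexing failed.
# The id (5) is a placeholder; use the id reported in the failure log.
curl http://localhost:8080/api/admin/index/dataverses/5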