Merge branch 'develop' into 6919-preview-tools #6919
pdurbin committed Oct 21, 2020
2 parents 767882b + d8b1f0d commit 6b62d0d
Showing 20 changed files with 558 additions and 59 deletions.
1 change: 1 addition & 0 deletions doc/release-notes/6784-mdc-scripts.md
@@ -0,0 +1 @@
In the "Notes for Dataverse Installation Administrators" we should mention the new scripts for MDC.
12 changes: 12 additions & 0 deletions doc/release-notes/7140-google-cloud.md
@@ -0,0 +1,12 @@
## Google Cloud Archiver

Dataverse Bags can now be sent to a bucket in Google Cloud, including buckets in the 'Coldline' storage class, which provides less expensive but slower access.

## Use Cases

- As an Administrator I can set up a regular export to Google Cloud so that my users' data is preserved.

## New Settings

:GoogleCloudProject - the name of the project managing the bucket.
:GoogleCloudBucket - the name of the bucket to use.
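
For example (the values are placeholders taken from the Google Cloud Configuration section of the Installation Guide; substitute your own project and bucket names):

    curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"
    curl http://localhost:8080/api/admin/settings/:GoogleCloudBucket -X PUT -d "qdr-archive"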
@@ -2,7 +2,7 @@
# 4-digit year and 2-digit month and day
# /usr/local/payara5/glassfish/domains/domain1/logs/counter_2019-01-11.log
#log_name_pattern: sample_logs/counter_(yyyy-mm-dd).log
log_name_pattern: /usr/local/payara5/glassfish/domains/domain1/logs/counter_(yyyy-mm-dd).log
log_name_pattern: /usr/local/payara5/glassfish/domains/domain1/logs/mdc/counter_(yyyy-mm-dd).log

# path_types regular expressions allow matching to classify page urls as either an investigation or request
# based on specific URL structure for your system.
36 changes: 36 additions & 0 deletions doc/sphinx-guides/source/_static/util/counter_daily.sh
@@ -0,0 +1,36 @@
#! /bin/bash

COUNTER_PROCESSOR_DIRECTORY="/usr/local/counter-processor-0.0.1"
MDC_LOG_DIRECTORY="/usr/local/payara5/glassfish/domains/domain1/logs/mdc"

# counter_daily.sh

cd $COUNTER_PROCESSOR_DIRECTORY

echo >>/tmp/counter_daily.log
date >>/tmp/counter_daily.log
echo >>/tmp/counter_daily.log

# "You should run Counter Processor once a day to create reports in SUSHI (JSON) format that are saved to disk for Dataverse to process and that are sent to the DataCite hub."

LAST=$(date -d "yesterday 13:00" '+%Y-%m-%d')
# echo $LAST
YEAR_MONTH=$(date -d "yesterday 13:00" '+%Y-%m')
# echo $YEAR_MONTH
d=$(date -I -d "$YEAR_MONTH-01")
#echo $d
while [ "$(date -d "$d" +%Y%m%d)" -le "$(date -d "$LAST" +%Y%m%d)" ];
do
if [ -f "$MDC_LOG_DIRECTORY/counter_$d.log" ]; then
# echo "Found counter_$d.log"
else
touch "$MDC_LOG_DIRECTORY/counter_$d.log"
fi
d=$(date -I -d "$d + 1 day")
done

#run counter-processor as counter user

sudo -u counter YEAR_MONTH=$YEAR_MONTH python3 main.py >>/tmp/counter_daily.log

curl -X POST "http://localhost:8080/api/admin/makeDataCount/addUsageMetricsFromSushiReport?reportOnDisk=/tmp/make-data-count-report.json"
48 changes: 48 additions & 0 deletions doc/sphinx-guides/source/_static/util/counter_weekly.sh
@@ -0,0 +1,48 @@
#!/bin/sh
#counter_weekly.sh

# This script iterates through all published Datasets in all Dataverses and calls the Make Data Count API to update their citations from DataCite
# Note: Requires curl and jq for parsing JSON responses from curl

# A recursive method to process each Dataverse
processDV () {
echo "Processing Dataverse ID#: $1"

#Call the Dataverse API to get the contents of the Dataverse (without credentials, this will only list published datasets and dataverses)
DVCONTENTS=$(curl -s http://localhost:8080/api/dataverses/$1/contents)

# Iterate over all datasets, pulling the value of their DOIs (as part of the persistentUrl) from the json returned
for subds in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataset") | .persistentUrl'); do

#The authority/identifier are preceded by a protocol/host, i.e. https://doi.org/
DOI=$(expr "$subds" : '.*://doi\.org/\(.*\)')

# Call the Dataverse API for this dataset and get the response
RESULT=$(curl -s -X POST "http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI" )
# Parse the status and number of citations found from the response
STATUS=$(echo "$RESULT" | jq -j '.status' )
CITATIONS=$(echo "$RESULT" | jq -j '.data.citationCount')

# The status for a call that worked
OK='OK'

# Check the status and report
if [ "$STATUS" = "$OK" ]; then
echo "Updated: $CITATIONS citations for doi:$DOI"
else
echo "Failed to update citations for doi:$DOI"
echo "Run curl -s -X POST 'http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI ' to retry/see the error message"
fi
#processDV $subds
done

# Now iterate over any child Dataverses and recursively process them
for subdv in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataverse") | .id'); do
echo $subdv
processDV $subdv
done

}

# Call the function on the root dataverse to start processing
processDV 1
8 changes: 6 additions & 2 deletions doc/sphinx-guides/source/admin/make-data-count.rst
@@ -72,6 +72,8 @@ Enable or Disable Display of Make Data Count Metrics

By default, when MDC logging is enabled (when ``:MDCLogPath`` is set), Dataverse will display MDC metrics instead of its internal (legacy) metrics. You can avoid this (e.g. to collect MDC metrics for some period of time before starting to display them) by setting ``:DisplayMDCMetrics`` to false.
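
For example, to keep showing the legacy metrics for now, using the same settings API pattern as the other settings in this guide:

``curl -X PUT -d 'false' http://localhost:8080/api/admin/settings/:DisplayMDCMetrics``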

The following discussion assumes ``:MDCLogPath`` has been set to ``/usr/local/payara5/glassfish/domains/domain1/logs/mdc``.
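
If you have not already set it, ``:MDCLogPath`` can be pointed at that directory in the usual way (the path shown is the example used throughout this page; adjust it for your installation):

``curl -X PUT -d '/usr/local/payara5/glassfish/domains/domain1/logs/mdc' http://localhost:8080/api/admin/settings/:MDCLogPath``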

Configure Counter Processor
~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -92,15 +94,15 @@ Configure Counter Processor
Populate Views and Downloads for the First Time
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Soon we will be setting up a cron job to run nightly but we start with a single successful configuration and run of Counter Processor and calls to Dataverse APIs.
Soon we will be setting up a cron job to run nightly, but we start with a single successful configuration and manual run of Counter Processor and calls to Dataverse APIs. (The scripts discussed in the next section automate the steps described here, including creating empty log files if you're starting mid-month.)

* Change to the directory where you installed Counter Processor.

* ``cd /usr/local/counter-processor-0.0.1``

* If you are running Counter Processor for the first time in the middle of a month, you will need to create blank log files for the previous days, e.g.:

* ``cd /usr/local/payara5/glassfish/domains/domain1/logs``
* ``cd /usr/local/payara5/glassfish/domains/domain1/logs/mdc``

* ``touch counter_2019-02-01.log``

@@ -127,6 +129,8 @@ Populate Views and Downloads Nightly

Running ``main.py`` to create the SUSHI JSON file and the subsequent calling of the Dataverse API to process it should be added as a cron job.

Dataverse provides example scripts that run the steps to process new accesses and uploads and update Dataverse's database (:download:`counter_daily.sh </_static/util/counter_daily.sh>`) and to retrieve citations for all Datasets from DataCite (:download:`counter_weekly.sh </_static/util/counter_weekly.sh>`). These scripts should be configured for your environment and can be run manually or as cron jobs, as sketched below.
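
Assuming you have copied the scripts into the Counter Processor directory and made them executable, a crontab along the following lines could run them. The schedule, script locations, and the choice of a user able to ``sudo`` to the counter user (e.g. root) are illustrative assumptions; adjust them for your installation::

    # nightly at 01:00: process new MDC logs and load the SUSHI report into Dataverse
    0 1 * * * /usr/local/counter-processor-0.0.1/counter_daily.sh
    # weekly on Sundays at 02:00: refresh citation counts from DataCite
    0 2 * * 0 /usr/local/counter-processor-0.0.1/counter_weekly.sh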

Sending Usage Metrics to the DataCite Hub
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

74 changes: 72 additions & 2 deletions doc/sphinx-guides/source/installation/config.rst
@@ -776,14 +776,16 @@ For Google Analytics, the example script at :download:`analytics-code.html </_st

Once this script is running, you can look in the Google Analytics console (Realtime/Events or Behavior/Events) and view events by type and/or the Dataset or File the event involves.

.. _BagIt Export:

BagIt Export
------------

Dataverse may be configured to submit a copy of published Datasets, packaged as `Research Data Alliance conformant <https://www.rd-alliance.org/system/files/Research%20Data%20Repository%20Interoperability%20WG%20-%20Final%20Recommendations_reviewed_0.pdf>`_ zipped `BagIt <https://tools.ietf.org/html/draft-kunze-bagit-17>`_ bags to `Chronopolis <https://libraries.ucsd.edu/chronopolis/>`_ via `DuraCloud <https://duraspace.org/duracloud/>`_ or alternately to any folder on the local filesystem.

Dataverse offers an internal archive workflow which may be configured as a PostPublication workflow via an admin API call to manually submit previously published Datasets and prior versions to a configured archive such as Chronopolis. The workflow creates a `JSON-LD <http://www.openarchives.org/ore/0.9/jsonld>`_ serialized `OAI-ORE <https://www.openarchives.org/ore/>`_ map file, which is also available as a metadata export format in the Dataverse web interface.

At present, the DPNSubmitToArchiveCommand and LocalSubmitToArchiveCommand are the only implementations extending the AbstractSubmitToArchiveCommand and using the configurable mechanisms discussed below.
At present, the DPNSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand are the only implementations extending the AbstractSubmitToArchiveCommand and using the configurable mechanisms discussed below.

.. _Duracloud Configuration:

@@ -831,10 +833,41 @@ ArchiverClassName - the fully qualified class to be used for archiving. For exam

\:ArchiverSettings - the archiver class can access required settings including existing Dataverse settings and dynamically defined ones specific to the class. This setting is a comma-separated list of those settings. For example\:

``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath``
``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath"``

:BagItLocalPath is the file path that you've set in :ArchiverSettings.
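
For example (the path shown is only an illustration; choose a directory that the application server can write to):

``curl http://localhost:8080/api/admin/settings/:BagItLocalPath -X PUT -d "/home/dataverse/bags"``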

.. _Google Cloud Configuration:

Google Cloud Configuration
++++++++++++++++++++++++++

The Google Cloud Archiver can send Dataverse Bags to a bucket in Google's cloud, including buckets in the 'Coldline' storage class (cheaper, with slower access).

``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"``

``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject"``

The Google Cloud Archiver defines two custom settings, both of which are required. The credentials for your account, in the form of a JSON key file, must also be obtained and stored locally (see below).

In order to use the Google Cloud Archiver, you must have a Google account. You will need to create a project and bucket within that account and provide those values in the settings:

\:GoogleCloudBucket - the name of the bucket to use. For example:

``curl http://localhost:8080/api/admin/settings/:GoogleCloudBucket -X PUT -d "qdr-archive"``

\:GoogleCloudProject - the name of the project managing the bucket. For example:

``curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"``

The Google Cloud Archiver also requires a key file that must be renamed to 'googlecloudkey.json' and placed in the directory identified by your 'dataverse.files.directory' JVM option. This file can be created in the Google Cloud Console. (One method: Navigate to your Project 'Settings'/'Service Accounts', create an account, give this account the 'Cloud Storage'/'Storage Admin' role, and once it's created, use the 'Actions' menu to 'Create Key', selecting the 'JSON' format option. Use this as the 'googlecloudkey.json' file.)

For example:

``cp <your key file> /usr/local/payara5/glassfish/domains/domain1/files/googlecloudkey.json``
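
Since this key grants access to your bucket, you may wish to restrict the copied file to the user the application server runs as, e.g. (assuming Payara runs as a ``dataverse`` user, which is an assumption about your installation; substitute the account you actually use):

``sudo chown dataverse /usr/local/payara5/glassfish/domains/domain1/files/googlecloudkey.json && sudo chmod 600 /usr/local/payara5/glassfish/domains/domain1/files/googlecloudkey.json``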

.. _Archiving API Call:

API Call
++++++++

@@ -2124,3 +2157,40 @@ To enable redirects to the zipper installed on the same server as the main Datav
To enable redirects to the zipper on a different server:

``curl -X PUT -d 'https://zipper.example.edu/cgi-bin/zipdownload' http://localhost:8080/api/admin/settings/:CustomZipDownloadServiceUrl``

:ArchiverClassName
++++++++++++++++++

Dataverse can export archival "Bag" files to an extensible set of storage systems (see :ref:`BagIt Export` above for details about this and for further explanation of the other archiving related settings below).
This setting specifies which storage system to use by identifying the particular Java class that should be run. Current options include DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand.

``curl -X PUT -d 'LocalSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName``
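
To select the Google Cloud archiver instead, use its fully qualified class name, as shown in the :ref:`Google Cloud Configuration` section above:

``curl -X PUT -d 'edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName``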

:ArchiverSettings
+++++++++++++++++

Each Archiver class may have its own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-separated list of setting names.
For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting. To allow the class to use that setting, this setting must be set as:

``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings``
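
If the Google Cloud archiver is selected, its two settings must be listed instead, matching the :ref:`Google Cloud Configuration` section above:

``curl -X PUT -d ':GoogleCloudBucket, :GoogleCloudProject' http://localhost:8080/api/admin/settings/:ArchiverSettings``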

:DuraCloudHost
++++++++++++++
:DuraCloudPort
++++++++++++++
:DuraCloudContext
+++++++++++++++++

These three settings define the host, port, and context used by the DuraCloudSubmitToArchiveCommand. :DuraCloudHost is required. The other settings have default values as noted in the :ref:`Duracloud Configuration` section above.
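
For example (the host name is only an illustration; use the DuraCloud host assigned to your account):

``curl -X PUT -d 'example.duracloud.org' http://localhost:8080/api/admin/settings/:DuraCloudHost``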

:BagItLocalPath
+++++++++++++++

This is the local file system path to be used with the LocalSubmitToArchiveCommand class. It is recommended to use an absolute path. See the :ref:`Local Path Configuration` section above.

:GoogleCloudBucket
++++++++++++++++++
:GoogleCloudProject
+++++++++++++++++++

These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration` section above.
27 changes: 17 additions & 10 deletions pom.xml
@@ -30,8 +30,8 @@
<aws.version>1.11.762</aws.version>
<commons.logging.version>1.2</commons.logging.version>
<httpcomponents.client.version>4.5.5</httpcomponents.client.version>
<junit.version>4.12</junit.version>
<junit.jupiter.version>5.5.2</junit.jupiter.version>
<junit.version>4.13.1</junit.version>
<junit.jupiter.version>5.7.0</junit.jupiter.version>
<junit.vintage.version>${junit.jupiter.version}</junit.vintage.version>
<testcontainers.version>1.13.0</testcontainers.version>
<mockito.version>2.28.2</mockito.version>
@@ -57,7 +57,7 @@
</releases>
</pluginRepository>
</pluginRepositories>
<!--Maven checks for dependendies from these repos in the order shown in the pom.xml
<!--Maven checks for dependencies from these repos in the order shown in the pom.xml
This isn't well documented and seems to change between maven versions -MAD 4.9.4 -->
<repositories>
<repository>
@@ -127,6 +127,13 @@
<artifactId>httpclient</artifactId>
<version>${httpcomponents.client.version}</version>
</dependency>
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-bom</artifactId>
<version>0.115.0-alpha</version>
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>org.testcontainers</groupId>
<artifactId>testcontainers-bom</artifactId>
@@ -137,7 +144,7 @@
</dependencies>
</dependencyManagement>
<!-- Declare any DIRECT dependencies here.
In case the depency is both transitive and direct (e. g. some common lib for logging),
In case the dependency is both transitive and direct (e. g. some common lib for logging),
manage the version above and add the direct dependency here WITHOUT version tag, too.
-->
<!-- TODO: Housekeeping is utterly needed. -->
@@ -440,11 +447,6 @@
<artifactId>slf4j-log4j12</artifactId>
<version>1.6.1</version>
</dependency -->
<dependency>
<groupId>axis</groupId>
<artifactId>axis</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>io.searchbox</groupId>
<artifactId>jest</artifactId>
@@ -573,14 +575,19 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.22</version>
<version>1.24.1</version>
</dependency>
<!-- Named Entity Recognition -->
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>1.9.1</version>
</dependency>
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-storage</artifactId>
<version>1.97.0</version>
</dependency>


<!-- TESTING DEPENDENCIES -->
@@ -578,7 +578,7 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion

int i = 0;

List<Object[]> dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT, t0.ORIGINALFILESIZE FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList();
List<Object[]> dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT, t0.ORIGINALFILESIZE, t0.ORIGINALFILENAME FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList();

for (Object[] result : dataTableResults) {
DataTable dataTable = new DataTable();
@@ -596,6 +596,8 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion

dataTable.setOriginalFileSize((Long)result[6]);

dataTable.setOriginalFileName((String)result[7]);

dataTables.add(dataTable);
datatableMap.put(fileId, i++);

@@ -856,8 +858,10 @@ private List<FileMetadata> retrieveFileMetadataForVersion(Dataset dataset, Datas

fileMetadata.setDatasetVersion(version);

//fileMetadata.setDataFile(dataset.getFiles().get(file_list_id));
// Link the FileMetadata object to the DataFile:
fileMetadata.setDataFile(dataFiles.get(file_list_id));
// ... and the DataFile back to the FileMetadata:
fileMetadata.getDataFile().getFileMetadatas().add(fileMetadata);

String description = (String) result[2];
