Skip to content

Commit

Permalink
Support SHA1 rather than MD5 as a checksum on a per file basis #3354
Browse files Browse the repository at this point in the history
A dependency for rsync support (#3145) is the ability to persist SHA-1
checksums for files rather than MD5 checksums.

A new installation-wide configuration setting called
":FileFixityChecksumAlgorithm" has been added which can be set to
"SHA-1" to have Dataverse calculate and show SHA-1 checksums rather than
MD5 checksums.

In order to run this branch you must run the provided SQL upgrade
script: scripts/database/upgrades/3354-alt-checksum.sql

In addition, the Solr schema should be updated to the version in this
branch.
  • Loading branch information
pdurbin committed Sep 20, 2016
1 parent 4e88d77 commit b703e27
Show file tree
Hide file tree
Showing 37 changed files with 567 additions and 164 deletions.
2 changes: 2 additions & 0 deletions conf/solr/4.6.0/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@
<field name="unf" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="fileSizeInBytes" type="long" stored="true" indexed="true" multiValued="false"/>
<field name="fileMd5" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="fileChecksumType" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="fileChecksumValue" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="fileContentType" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="deaccessionReason" type="string" stored="true" indexed="false" multiValued="false"/>

Expand Down
7 changes: 7 additions & 0 deletions doc/sphinx-guides/source/installation/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -454,3 +454,10 @@ This setting is experimental per :doc:`/installation/shibboleth`.
++++++++++++

Set to false to disallow the creation of local accounts if you are using :doc:`shibboleth`. This setting should not be used in production until https://github.com/IQSS/dataverse/issues/2838 has been fixed.

:FileFixityChecksumAlgorithm
++++++++++++++++++++++++++++

Dataverse calculates checksums for uploaded files so that users can determine if their file was corrupted via upload or download. This is sometimes called "file fixity": https://en.wikipedia.org/wiki/File_Fixity

The default checksum algorithm used is MD5 and should be sufficient for establishing file fixity. "SHA-1" is an experimental alternate value for this setting.
7 changes: 7 additions & 0 deletions scripts/database/upgrades/3354-alt-checksum.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- Upgrade script for issue #3354: store a checksum type next to each file's
-- checksum value so that algorithms other than MD5 can be recorded.
--
-- Step 1: add the new column as nullable. Existing rows have no value yet, so
-- the NOT NULL constraint cannot be applied until they have been backfilled.
ALTER TABLE datafile ADD COLUMN checksumtype character varying(255);
-- Step 2: backfill every existing row. All checksums recorded so far are MD5.
UPDATE datafile SET checksumtype = 'MD5';
-- alternate statement for sbgrid.org and others interested in SHA-1 support
-- note that in the database we use "SHA1" (no hyphen) but the GUI will show "SHA-1"
--UPDATE datafile SET checksumtype = 'SHA1';
-- Step 3: now that no row holds NULL, the constraint can be enforced.
ALTER TABLE datafile ALTER COLUMN checksumtype SET NOT NULL;
-- Step 4: the old "md5" column now holds the value for whichever algorithm
-- checksumtype names, so give it a neutral name.
ALTER TABLE datafile RENAME md5 TO checksumvalue;
5 changes: 5 additions & 0 deletions scripts/issues/3354/createDatasetWithSha1Files.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/sh
# Creates a dataset in the "root" dataverse from datasetWithSha1Files.json,
# a fixture whose file entry declares a SHA-1 checksum.
#
# The JSON path below is relative, so invoke this script from a directory
# where scripts/issues/3354/ resolves. $API_TOKEN is sent as the "key" query
# parameter.
#
# Earlier variant that creates a dataset without files (known to work, kept
# for reference):
#curl -s -X POST -H "Content-type:application/json" -d @scripts/search/tests/data/dataset-finch1.json "http://localhost:8080/api/dataverses/root/datasets/?key=$API_TOKEN"
curl --silent --request POST \
  --header "Content-type:application/json" \
  --data @scripts/issues/3354/datasetWithSha1Files.json \
  "http://localhost:8080/api/dataverses/root/datasets/?key=$API_TOKEN"
86 changes: 86 additions & 0 deletions scripts/issues/3354/datasetWithSha1Files.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{
"datasetVersion": {
"files": [
{
"label": "foo.txt",
"dataFile": {
"filename": "foo.txt",
"contentType": "text/plain",
"storageIdentifier": "157484f9d6c-c36006fa39e5",
"originalFormatLabel": "UNKNOWN",
"checksum": {
"type": "SHA-1",
"value": "f1d2d2f924e986ac86fdf7b36c94bcdf32beec15"
}
}
}
],
"metadataBlocks": {
"citation": {
"fields": [
{
"value": "Dataset with SHA-1 files",
"typeClass": "primitive",
"multiple": false,
"typeName": "title"
},
{
"value": [
{
"authorName": {
"value": "Finch, Fiona",
"typeClass": "primitive",
"multiple": false,
"typeName": "authorName"
},
"authorAffiliation": {
"value": "Birds Inc.",
"typeClass": "primitive",
"multiple": false,
"typeName": "authorAffiliation"
}
}
],
"typeClass": "compound",
"multiple": true,
"typeName": "author"
},
{
"value": [
{ "datasetContactEmail" : {
"typeClass": "primitive",
"multiple": false,
"typeName": "datasetContactEmail",
"value" : "finch@mailinator.com"
}
}],
"typeClass": "compound",
"multiple": true,
"typeName": "datasetContact"
},
{
"value": [ {
"dsDescriptionValue":{
"value": "Some people prefer SHA-1 to MD5 for file fixity.",
"multiple":false,
"typeClass": "primitive",
"typeName": "dsDescriptionValue"
}}],
"typeClass": "compound",
"multiple": true,
"typeName": "dsDescription"
},
{
"value": [
"Other"
],
"typeClass": "controlledVocabulary",
"multiple": true,
"typeName": "subject"
}
],
"displayName": "Citation Metadata"
}
}
}
}
3 changes: 3 additions & 0 deletions scripts/issues/3354/mydata
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# FIXME: Make this into a REST Assured test.
#
# Queries the "mydata" retrieve endpoint for DataFile objects in every
# publication state, authenticating with $API_TOKEN as the "key" query
# parameter, and prints only the .data.items part of the JSON response.
# Requires jq on the PATH.
curl --silent "http://localhost:8080/api/mydata/retrieve?key=$API_TOKEN&role_ids=1&dvobject_types=DataFile&published_states=Published&published_states=Unpublished&published_states=Draft&published_states=In+Review&published_states=Deaccessioned" \
  | jq .data.items
4 changes: 2 additions & 2 deletions scripts/search/search
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/bin/sh
if [ -z "$1" ]; then
curl -s 'http://localhost:8080/api/search?q=*'
curl -H "X-Dataverse-key: $API_TOKEN" -s 'http://localhost:8080/api/search?q=*'
#curl -s 'http://localhost:8080/api/search?q=*&key=pete'
else
# i.e. ./search 'q=*&fq=filetype_s:"image"&fq=dvtype:files'
# i.e. ./search 'q=*&start=10'
# i.e. ./search 'q=*&sort=name_sort&order=asc'
# i.e. ./search 'q=*&sort=name_sort&order=asc' | jq '.itemsJson[] | {name_sort}'
curl -s "http://localhost:8080/api/search?$1"
curl -H "X-Dataverse-key: $API_TOKEN" -s "http://localhost:8080/api/search?$1"
fi
7 changes: 2 additions & 5 deletions src/main/java/Bundle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1076,7 +1076,6 @@ dataset.metadata.persistentId.tip=The unique persistent identifier for a Dataset
dataset.versionDifferences.termsOfUseAccess=Terms of Use and Access
dataset.versionDifferences.termsOfUseAccessChanged=Terms of Use/Access Changed
file.viewDiffDialog.restricted=Restricted
file.viewDiffDialog.md5=MD5

dataset.template.tip=Changing the template will clear any fields you may have entered data into.
dataset.noTemplate.label=None
Expand Down Expand Up @@ -1133,9 +1132,8 @@ file.download.header=Download
file.preview=Preview:
file.fileName=File Name
file.type.tabularData=Tabular Data
file.MD5=MD5
file.MD5.origal=Original File MD5
file.MD5.exists.tip=A file with this MD5 already exists in the dataset.
file.originalChecksumType=Original File {0}
file.checksum.exists.tip=A file with this checksum already exists in the dataset.
file.selectedThumbnail=Thumbnail
file.selectedThumbnail.tip=The thumbnail for this file is used as the default thumbnail for the dataset. Click 'Advanced Options' button of another file to select that file.

Expand Down Expand Up @@ -1371,7 +1369,6 @@ file.tags.label=Tags

file.metadataTab.fileMetadata.header=File Metadata
file.metadataTab.fileMetadata.persistentid.label=Data File Persistent ID
file.metadataTab.fileMetadata.md5.label=MD5
file.metadataTab.fileMetadata.unf.label=UNF
file.metadataTab.fileMetadata.size.label=Size
file.metadataTab.fileMetadata.type.label=Type
Expand Down
80 changes: 71 additions & 9 deletions src/main/java/edu/harvard/iq/dataverse/DataFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import edu.harvard.iq.dataverse.dataaccess.DataFileIO;
import edu.harvard.iq.dataverse.ingest.IngestReport;
import edu.harvard.iq.dataverse.ingest.IngestRequest;
import edu.harvard.iq.dataverse.util.BundleUtil;
import edu.harvard.iq.dataverse.util.FileUtil;
import edu.harvard.iq.dataverse.util.ShapefileHandler;
import java.io.IOException;
Expand All @@ -16,11 +17,14 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.Files;
import java.util.Arrays;
import javax.persistence.Entity;
import javax.persistence.OneToMany;
import javax.persistence.OneToOne;
import javax.persistence.CascadeType;
import javax.persistence.Column;
import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import javax.persistence.Index;
import javax.persistence.JoinColumn;
import javax.persistence.JoinTable;
Expand Down Expand Up @@ -61,9 +65,56 @@ public class DataFile extends DvObject {

@Column( nullable = false )
private String fileSystemName;

@Column( nullable = false )
private String md5;

/**
 * The checksum algorithms that can be recorded for a file.
 *
 * Each constant has two spellings. The constant name (for example "SHA1",
 * no hyphen) is what gets persisted in the "datafile" table, which keeps the
 * stored value tied to the enum keys. The display text (for example "SHA-1",
 * with a hyphen) is what end users see in the GUI and API, what is persisted
 * in the "setting" table, and what matches the "Algorithm Name" list at
 * https://docs.oracle.com/javase/8/docs/technotes/guides/security/StandardNames.html#MessageDigest
 *
 * Only add algorithms that appear in that list: the display text is passed
 * to MessageDigest.getInstance(), which does not accept arbitrary strings.
 */
public enum ChecksumType {

    MD5("MD5"),
    SHA1("SHA-1");

    /** Display text; also the algorithm name handed to MessageDigest. */
    private final String text;

    private ChecksumType(final String text) {
        this.text = text;
    }

    /**
     * Looks up a constant by its display text (for example "SHA-1"), not by
     * its constant name; use valueOf() for the latter.
     *
     * @param text display text to look up; matched exactly, case-sensitively
     * @return the constant whose display text equals {@code text}
     * @throws IllegalArgumentException if {@code text} is null or matches no
     * constant
     */
    public static ChecksumType fromString(String text) {
        final ChecksumType[] known = ChecksumType.values();
        // A null argument skips the scan entirely and falls through to the
        // same exception as an unrecognized string.
        for (int i = 0; text != null && i < known.length; i++) {
            if (known[i].text.equals(text)) {
                return known[i];
            }
        }
        throw new IllegalArgumentException("ChecksumType must be one of these values: " + Arrays.asList(known) + ".");
    }

    /**
     * @return the display text (for example "SHA-1"), not the constant name
     */
    @Override
    public String toString() {
        return text;
    }
}

@Column(nullable = false)
@Enumerated(EnumType.STRING)
private ChecksumType checksumType;

/**
* Examples include "f622da34d54bdc8ee541d6916ac1c16f" as an MD5 value or
* "3a484dfdb1b429c2e15eb2a735f1f5e4d5b04ec6" as a SHA-1 value.
*/
@Column(nullable = false)
private String checksumValue;

@Column(nullable=true)
private Long filesize; // Number of bytes in file. Allows 0 and null, negative numbers not permitted
Expand Down Expand Up @@ -364,15 +415,26 @@ public void setRestricted(boolean restricted) {
this.restricted = restricted;
}

/**
 * @return the checksum algorithm currently held in the checksumType field
 */
public ChecksumType getChecksumType() {
return checksumType;
}

public String getmd5() {
return this.md5;
public void setChecksumType(ChecksumType checksumType) {
this.checksumType = checksumType;
}
public void setmd5(String md5) {
this.md5 = md5;

/**
 * @return the checksum string currently held in the checksumValue field
 */
public String getChecksumValue() {
return this.checksumValue;
}


/**
 * Stores the given checksum string in the checksumValue field. No validation
 * is performed here.
 *
 * @param checksumValue the checksum string to store
 */
public void setChecksumValue(String checksumValue) {
this.checksumValue = checksumValue;
}

/**
 * Builds a label from the "file.originalChecksumType" bundle entry, passing
 * the display text of this file's checksum type (ChecksumType.toString(),
 * for example "SHA-1") as the single argument.
 *
 * NOTE(review): checksumType is dereferenced without a null check, so this
 * throws NullPointerException if the type has not been set on this object.
 *
 * @return the string produced by BundleUtil for that key and argument
 */
public String getOriginalChecksumType() {
return BundleUtil.getStringFromBundle("file.originalChecksumType", Arrays.asList(this.checksumType.toString()) );
}

public DataFileIO getAccessObject() throws IOException {
DataFileIO dataAccess = DataAccess.createDataAccessObject(this);

Expand Down
32 changes: 27 additions & 5 deletions src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ public DataFile findCheapAndEasy(Long id) {
Object[] result = null;

try {
result = (Object[]) em.createNativeQuery("SELECT t0.ID, t0.CREATEDATE, t0.INDEXTIME, t0.MODIFICATIONTIME, t0.PERMISSIONINDEXTIME, t0.PERMISSIONMODIFICATIONTIME, t0.PUBLICATIONDATE, t0.CREATOR_ID, t0.RELEASEUSER_ID, t0.PREVIEWIMAGEAVAILABLE, t1.CONTENTTYPE, t1.FILESYSTEMNAME, t1.FILESIZE, t1.INGESTSTATUS, t1.MD5, t1.RESTRICTED, t3.ID, t3.AUTHORITY, t3.IDENTIFIER FROM DVOBJECT t0, DATAFILE t1, DVOBJECT t2, DATASET t3 WHERE ((t0.ID = " + id + ") AND (t0.OWNER_ID = t2.ID) AND (t2.ID = t3.ID) AND (t1.ID = t0.ID))").getSingleResult();
result = (Object[]) em.createNativeQuery("SELECT t0.ID, t0.CREATEDATE, t0.INDEXTIME, t0.MODIFICATIONTIME, t0.PERMISSIONINDEXTIME, t0.PERMISSIONMODIFICATIONTIME, t0.PUBLICATIONDATE, t0.CREATOR_ID, t0.RELEASEUSER_ID, t0.PREVIEWIMAGEAVAILABLE, t1.CONTENTTYPE, t1.FILESYSTEMNAME, t1.FILESIZE, t1.INGESTSTATUS, t1.CHECKSUMVALUE, t1.RESTRICTED, t3.ID, t3.AUTHORITY, t3.IDENTIFIER, t1.CHECKSUMTYPE FROM DVOBJECT t0, DATAFILE t1, DVOBJECT t2, DATASET t3 WHERE ((t0.ID = " + id + ") AND (t0.OWNER_ID = t2.ID) AND (t2.ID = t3.ID) AND (t1.ID = t0.ID))").getSingleResult();
} catch (Exception ex) {
return null;
}
Expand Down Expand Up @@ -346,14 +346,14 @@ public DataFile findCheapAndEasy(Long id) {
String md5 = (String) result[14];

if (md5 != null) {
dataFile.setmd5(md5);
dataFile.setChecksumValue(md5);
}

Boolean restricted = (Boolean) result[15];
if (restricted != null) {
dataFile.setRestricted(restricted);
}


Dataset owner = new Dataset();

Expand All @@ -362,6 +362,17 @@ public DataFile findCheapAndEasy(Long id) {
owner.setId((Long)result[16]);
owner.setAuthority((String)result[17]);
owner.setIdentifier((String)result[18]);

String checksumType = (String) result[19];
if (checksumType != null) {
try {
// In the database we store "SHA1" rather than "SHA-1".
DataFile.ChecksumType typeFromStringInDatabase = DataFile.ChecksumType.valueOf(checksumType);
dataFile.setChecksumType(typeFromStringInDatabase);
} catch (IllegalArgumentException ex) {
logger.info("Exception trying to convert " + checksumType + " to enum: " + ex);
}
}

dataFile.setOwner(owner);

Expand Down Expand Up @@ -465,7 +476,7 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion

i = 0;

List<Object[]> fileResults = em.createNativeQuery("SELECT t0.ID, t0.CREATEDATE, t0.INDEXTIME, t0.MODIFICATIONTIME, t0.PERMISSIONINDEXTIME, t0.PERMISSIONMODIFICATIONTIME, t0.PUBLICATIONDATE, t0.CREATOR_ID, t0.RELEASEUSER_ID, t1.CONTENTTYPE, t1.FILESYSTEMNAME, t1.FILESIZE, t1.INGESTSTATUS, t1.MD5, t1.RESTRICTED FROM DVOBJECT t0, DATAFILE t1 WHERE ((t0.OWNER_ID = " + owner.getId() + ") AND ((t1.ID = t0.ID) AND (t0.DTYPE = 'DataFile')))").getResultList();
List<Object[]> fileResults = em.createNativeQuery("SELECT t0.ID, t0.CREATEDATE, t0.INDEXTIME, t0.MODIFICATIONTIME, t0.PERMISSIONINDEXTIME, t0.PERMISSIONMODIFICATIONTIME, t0.PUBLICATIONDATE, t0.CREATOR_ID, t0.RELEASEUSER_ID, t1.CONTENTTYPE, t1.FILESYSTEMNAME, t1.FILESIZE, t1.INGESTSTATUS, t1.CHECKSUMVALUE, t1.RESTRICTED, t1.CHECKSUMTYPE FROM DVOBJECT t0, DATAFILE t1 WHERE ((t0.OWNER_ID = " + owner.getId() + ") AND ((t1.ID = t0.ID) AND (t0.DTYPE = 'DataFile')))").getResultList();

for (Object[] result : fileResults) {
Integer file_id = (Integer) result[0];
Expand Down Expand Up @@ -544,13 +555,24 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion
String md5 = (String) result[13];

if (md5 != null) {
dataFile.setmd5(md5);
dataFile.setChecksumValue(md5);
}

Boolean restricted = (Boolean) result[14];
if (restricted != null) {
dataFile.setRestricted(restricted);
}

String checksumType = (String) result[15];
if (checksumType != null) {
try {
// In the database we store "SHA1" rather than "SHA-1".
DataFile.ChecksumType typeFromStringInDatabase = DataFile.ChecksumType.valueOf(checksumType);
dataFile.setChecksumType(typeFromStringInDatabase);
} catch (IllegalArgumentException ex) {
logger.info("Exception trying to convert " + checksumType + " to enum: " + ex);
}
}

// TODO:
// - if ingest status is "bad", look up the ingest report;
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -2837,7 +2837,7 @@ public String cancel() {
}

public boolean isDuplicate(FileMetadata fileMetadata) {
String thisMd5 = fileMetadata.getDataFile().getmd5();
String thisMd5 = fileMetadata.getDataFile().getChecksumValue();
if (thisMd5 == null) {
return false;
}
Expand All @@ -2854,7 +2854,7 @@ public boolean isDuplicate(FileMetadata fileMetadata) {
Iterator<FileMetadata> fmIt = workingVersion.getFileMetadatas().iterator();
while (fmIt.hasNext()) {
FileMetadata fm = fmIt.next();
String md5 = fm.getDataFile().getmd5();
String md5 = fm.getDataFile().getChecksumValue();
if (md5 != null) {
if (MD5Map.get(md5) != null) {
MD5Map.put(md5, MD5Map.get(md5).intValue() + 1);
Expand Down
Loading

0 comments on commit b703e27

Please sign in to comment.