Skip to content

Commit

Permalink
Change remote file url handling when harvesting (#2586)
Browse files Browse the repository at this point in the history
* Remove query portion of effective urls when harvesting

* Save json url for remote files rather than effective url

* Re-work harvested resource title for better results

* Change test to not rely on external file

* Add documentation

* Add test for the 255 char limit fallback to effective url

* Update boot.php
  • Loading branch information
janette authored and dafeder committed Jul 26, 2018
1 parent 644d4bf commit 258baf3
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 9 deletions.
11 changes: 9 additions & 2 deletions docs/components/harvest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,19 @@ Click **Harvest Now**. The datasets that were cached will now be imported into y

Harvest Source nodes are viewable by the public and provide some basic metadata to the user.

.. note::
Some behaviors of the Topics field on *harvest sources* to be aware of:
.. warning::
Some behaviors of the **Topics** field on harvest sources to be aware of:

- Changing the Topic on the source and re-harvesting will not update the Topic on harvested datasets if nothing else has changed. The Harvester will only re-import a dataset if it detects changes from the source.
- If you manually add additional topics to a harvested dataset, and there *is* a change at the source, the next time the dataset is harvested your topics will be overwritten.

.. warning::
Some behaviors of harvesting **resources** to be aware of:

- If only an **accessURL** value is given, the url will be saved to the **API or Website URL** field.
- If a **downloadURL** value is given, the url will be saved to the **Remote file** field.
- The maximum size of a managed file field is 255 characters. It is not possible to increase the size of the field, as this would force MySQL to auto-convert the VARCHAR(255) to a TEXT datatype, which subsequently fails with error 1170 on key length if the column is used as a primary key or as a unique or non-unique index. Therefore the Harvester checks the length of the URL and, if it exceeds 255 characters, falls back to using the *effective URL*, which is the last URL in a redirect chain.

Managing Harvest Sources
************************

Expand Down
11 changes: 7 additions & 4 deletions modules/dkan/dkan_harvest/dkan_harvest.migrate.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1159,10 +1159,12 @@ class HarvestMigration extends MigrateDKAN {
return FALSE;
}

// If the URL is determined to be field_link_remote_file material,
// use the effective URL unless it is more than 255 characters.
// If the URL is determined to be a remote file,
// check that the URL is no more than 255 characters.
// More than 255 will give the 'Data too long for column' error.
if (strlen($remoteFileInfo->getEffectiveUrl()) < 256) {
if (strlen($resource->url) > 255 &&
strlen($remoteFileInfo->getEffectiveUrl()) < 256) {
// Switch to the effective url if the json url is too long.
$resource->url = $remoteFileInfo->getEffectiveUrl();
}
$resource->url_type = 'file';
Expand All @@ -1175,7 +1177,8 @@ class HarvestMigration extends MigrateDKAN {
$resource->format = isset($format) ? strtolower($format) : $format_detected;

// Title.
$resource->title = isset($title) ? $title : $resource->format;
$name = isset($title) ? $title : $remoteFileInfo->getName();
$resource->title = isset($name) ? $name : $resource->format;

// Created.
$resource->created = isset($created) ? $created : time();
Expand Down
2 changes: 1 addition & 1 deletion test/phpunit/boot.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
$dir = implode('/', array(__DIR__, '..', '..', '..', 'docroot'));

// Host.
$uri = getenv('DKAN_WEB_1_PORT_80_TCP_ADDR') ? 'http://' . getenv('DKAN_WEB_1_PORT_80_TCP_ADDR') : 'http://127.0.0.1:8888';
$uri = getenv('DKAN_WEB_1_PORT_80_TCP_ADDR') ? 'http://' . getenv('DKAN_WEB_1_PORT_80_TCP_ADDR') : 'http://web';

$driver = new DrupalDriver($dir, $uri);
$driver->setCoreFromVersion();
Expand Down
11 changes: 9 additions & 2 deletions test/phpunit/dkan_dataset/getRemoteFileInfoTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,37 +40,44 @@ public function getHeaders($url) {
* Run test URLs through the getRemoteFileInfo class.
*/
public function testUrls() {
  global $base_url;

  // Each case pairs a URL with the values getRemoteFileInfo is expected to
  // report for it: mime type, file extension, file name and effective URL.
  $urls = [
    [
      'url' => 'https://data.wa.gov/api/views/mu24-67ke/rows.csv?accessType=DOWNLOAD',
      'type' => 'text/csv',
      'extension' => 'csv',
      'name' => 'Hospital_Inpatient_Discharges_by_DRG__Northwest__FY2011.csv',
      'effective_url' => 'https://data.wa.gov/api/views/mu24-67ke/rows.csv?accessType=DOWNLOAD',
    ],
    [
      // Served by the local phpunit_redirect_test.php script, which
      // redirects to a CSV in the test files directory, so this case does
      // not depend on an external file.
      // NOTE(review): the repeated UUID in the query string presumably pads
      // the URL past the 255 character limit - confirm.
      'url' => $base_url . '/profiles/dkan/test/phpunit/phpunit_redirect_test.php?test=33f45be7-970c-4d57-b6e1-e20c80b60588-33f45be7-970c-4d57-b6e1-e20c80b60588-33f45be7-970c-4d57-b6e1-e20c80b60588-33f45be7-970c-4d57-b6e1-e20c80b60588-33f45be7-970c-4d57-b6e1-e20c80b60588',
      'type' => 'text/csv',
      'extension' => 'csv',
      'name' => 'Polling_Places_Madison_test.csv',
      'effective_url' => '/profiles/dkan/test/files/dkan/Polling_Places_Madison_test.csv',
    ],
    [
      'url' => 'https://s3.amazonaws.com/dkan-default-content-files/files/albo.xls',
      'type' => 'application/vnd.ms-excel',
      'extension' => 'xls',
      'name' => 'albo.xls',
      'effective_url' => 'https://s3.amazonaws.com/dkan-default-content-files/files/albo.xls',
    ],
    [
      'url' => 'https://data.chhs.ca.gov/dataset/596b5eed-31de-4fd8-a645-249f3f9b19c4/resource/57da6c9a-41a7-44b0-ab8d-815ff2cd5913/download/cscpopendata.csv',
      'type' => 'text/csv',
      'extension' => 'csv',
      'name' => 'cscpopendata.csv',
      'effective_url' => 'https://data.chhs.ca.gov/dataset/596b5eed-31de-4fd8-a645-249f3f9b19c4/resource/57da6c9a-41a7-44b0-ab8d-815ff2cd5913/download/cscpopendata.csv',
    ],
    [
      'url' => 'https://developers.google.com/kml/documentation/KML_Samples.kml',
      'type' => 'application/vnd.google-earth.kml+xml',
      'extension' => 'kml',
      'name' => 'KML_Samples.kml',
      'effective_url' => 'https://developers.google.com/kml/documentation/KML_Samples.kml',
    ],
  ];

  foreach ($urls as $info) {
    $fileInfo = new getRemoteFileInfo($info['url'], 'test', TRUE);

    // PHPUnit expects (expected, actual); the message identifies the case
    // that failed.
    $case = ' for ' . $info['url'];
    $this->assertEquals($info['type'], $fileInfo->getType(), 'Unexpected type' . $case);
    $this->assertEquals($info['extension'], $fileInfo->getExtension(), 'Unexpected extension' . $case);
    $this->assertEquals($info['name'], $fileInfo->getName(), 'Unexpected name' . $case);
    $this->assertEquals($info['effective_url'], $fileInfo->getEffectiveUrl(), 'Unexpected effective URL' . $case);
  }
}

Expand Down
11 changes: 11 additions & 0 deletions test/phpunit/phpunit_redirect_test.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

/**
 * @file
 * Redirect fixture used by the getRemoteFileInfo PHPUnit test.
 *
 * Answers every request with a Location header pointing at the
 * Polling_Places_Madison_test.csv test file, then stops.
 */

// NOTE(review): nothing in this script assigns $base_url, so when it is
// requested directly the Location value appears to be path-only - confirm
// that this is intended.
global $base_url;

$redirect = 'Location: ' . $base_url . '/profiles/dkan/test/files/dkan/Polling_Places_Madison_test.csv';

// Send the redirect, then stop so no further output is produced.
header($redirect);

exit;

0 comments on commit 258baf3

Please sign in to comment.