diff --git a/docs/components/harvest.rst b/docs/components/harvest.rst index 0e045a4339..3beb35f74a 100644 --- a/docs/components/harvest.rst +++ b/docs/components/harvest.rst @@ -65,12 +65,19 @@ Click **Harvest Now**. The datasets that were cached will now be imported into y Harvest Source nodes are viewable by the public and provide some basic metadata to the user. -.. note:: - Some behaviors of the Topics field on *harvest sources* to be aware of: +.. warning:: + Some behaviors of the **Topics** field on harvest sources to be aware of: - Changing the Topic on the source and re-harvesting will not update the Topic on harvested datasets if nothing else has changed. The Harvester will only re-import a dataset if it detects changes from the source. - If you manually add additional topics to a harvested dataset, and there *is* a change at the source, the next time the dataset is harvested your topics will be overwritten. +.. warning:: + Some behaviors of harvesting **resources** to be aware of: + + - If only an **accessURL** value is given, the URL will be saved to the **API or Website URL** field. + - If a **downloadURL** value is given, the URL will be saved to the **Remote file** field. + - The maximum size of a managed file field is 255 characters. It is not possible to increase the size of the field as this would force MySQL to auto-convert the VARCHAR(255) to a TEXT datatype, which subsequently fails with error 1170 on key length if the column is used as a primary key or in a unique or non-unique index. Therefore the Harvester will check the length of the URL and if it exceeds 255 characters, will fall back to using the *effective URL*, which is the last URL in a redirect chain. 
+ Managing Harvest Sources ************************ diff --git a/modules/dkan/dkan_harvest/dkan_harvest.migrate.inc b/modules/dkan/dkan_harvest/dkan_harvest.migrate.inc index 255e2e91cf..340b2b22d3 100644 --- a/modules/dkan/dkan_harvest/dkan_harvest.migrate.inc +++ b/modules/dkan/dkan_harvest/dkan_harvest.migrate.inc @@ -1159,10 +1159,12 @@ class HarvestMigration extends MigrateDKAN { return FALSE; } - // If the URL is determined to be field_link_remote_file material, - // use the effective URL unless it is more than 255 characters. + // If the URL is determined to be a remote file, + // check that the URL is no more than 255 characters. // More than 255 will give the 'Data too long for column' error. - if (strlen($remoteFileInfo->getEffectiveUrl()) < 256) { + if (strlen($resource->url) > 255 && + strlen($remoteFileInfo->getEffectiveUrl()) < 256) { + // Switch to the effective url if the json url is too long. $resource->url = $remoteFileInfo->getEffectiveUrl(); } $resource->url_type = 'file'; @@ -1175,7 +1177,8 @@ class HarvestMigration extends MigrateDKAN { $resource->format = isset($format) ? strtolower($format) : $format_detected; // Title. - $resource->title = isset($title) ? $title : $resource->format; + $name = isset($title) ? $title : $remoteFileInfo->getName(); + $resource->title = isset($name) ? $name : $resource->format; // Created. $resource->created = isset($created) ? $created : time(); diff --git a/test/phpunit/boot.php b/test/phpunit/boot.php index 927a51c5bd..8874a81706 100644 --- a/test/phpunit/boot.php +++ b/test/phpunit/boot.php @@ -13,7 +13,7 @@ $dir = implode('/', array(__DIR__, '..', '..', '..', 'docroot')); // Host. -$uri = getenv('DKAN_WEB_1_PORT_80_TCP_ADDR') ? 'http://' . getenv('DKAN_WEB_1_PORT_80_TCP_ADDR') : 'http://127.0.0.1:8888'; +$uri = getenv('DKAN_WEB_1_PORT_80_TCP_ADDR') ? 'http://' . 
getenv('DKAN_WEB_1_PORT_80_TCP_ADDR') : 'http://web'; $driver = new DrupalDriver($dir, $uri); $driver->setCoreFromVersion(); diff --git a/test/phpunit/dkan_dataset/getRemoteFileInfoTest.php b/test/phpunit/dkan_dataset/getRemoteFileInfoTest.php index b22ccd504b..7c3b9bcf72 100644 --- a/test/phpunit/dkan_dataset/getRemoteFileInfoTest.php +++ b/test/phpunit/dkan_dataset/getRemoteFileInfoTest.php @@ -40,37 +40,44 @@ public function getHeaders($url) { * Run test URLs threw the getRemoteFileInfo class. */ public function testUrls() { + global $base_url; $urls = []; $urls[0]['url'] = 'https://data.wa.gov/api/views/mu24-67ke/rows.csv?accessType=DOWNLOAD'; $urls[0]['type'] = 'text/csv'; $urls[0]['extension'] = 'csv'; $urls[0]['name'] = "Hospital_Inpatient_Discharges_by_DRG__Northwest__FY2011.csv"; + $urls[0]['effective_url'] = 'https://data.wa.gov/api/views/mu24-67ke/rows.csv?accessType=DOWNLOAD'; - $urls[1]['url'] = "https://data.ca.gov/node/1801/download"; + $urls[1]['url'] = $base_url . '/profiles/dkan/test/phpunit/phpunit_redirect_test.php?test=33f45be7-970c-4d57-b6e1-e20c80b60588-33f45be7-970c-4d57-b6e1-e20c80b60588-33f45be7-970c-4d57-b6e1-e20c80b60588-33f45be7-970c-4d57-b6e1-e20c80b60588-33f45be7-970c-4d57-b6e1-e20c80b60588'; $urls[1]['type'] = 'text/csv'; $urls[1]['extension'] = 'csv'; - $urls[1]['name'] = "uw_supplier_data060518.csv"; + $urls[1]['name'] = 'Polling_Places_Madison_test.csv'; + $urls[1]['effective_url'] = '/profiles/dkan/test/files/dkan/Polling_Places_Madison_test.csv'; $urls[2]['url'] = "https://s3.amazonaws.com/dkan-default-content-files/files/albo.xls"; $urls[2]['type'] = 'application/vnd.ms-excel'; $urls[2]['extension'] = 'xls'; $urls[2]['name'] = "albo.xls"; + $urls[2]['effective_url'] = 'https://s3.amazonaws.com/dkan-default-content-files/files/albo.xls'; $urls[3]['url'] = "https://data.chhs.ca.gov/dataset/596b5eed-31de-4fd8-a645-249f3f9b19c4/resource/57da6c9a-41a7-44b0-ab8d-815ff2cd5913/download/cscpopendata.csv"; $urls[3]['type'] = 'text/csv'; 
$urls[3]['extension'] = 'csv'; $urls[3]['name'] = "cscpopendata.csv"; + $urls[3]['effective_url'] = 'https://data.chhs.ca.gov/dataset/596b5eed-31de-4fd8-a645-249f3f9b19c4/resource/57da6c9a-41a7-44b0-ab8d-815ff2cd5913/download/cscpopendata.csv'; $urls[4]['url'] = "https://developers.google.com/kml/documentation/KML_Samples.kml"; $urls[4]['type'] = 'application/vnd.google-earth.kml+xml'; $urls[4]['extension'] = 'kml'; $urls[4]['name'] = "KML_Samples.kml"; + $urls[4]['effective_url'] = "https://developers.google.com/kml/documentation/KML_Samples.kml"; foreach ($urls as $key => $info) { $fileInfo = new getRemoteFileInfo($info['url'], 'test', TRUE); $this->assertEquals($fileInfo->getType(), $info['type']); $this->assertEquals($fileInfo->getExtension(), $info['extension']); $this->assertEquals($fileInfo->getName(), $info['name']); + $this->assertEquals($fileInfo->getEffectiveUrl(), $info['effective_url']); } } diff --git a/test/phpunit/phpunit_redirect_test.php b/test/phpunit/phpunit_redirect_test.php new file mode 100644 index 0000000000..0f0f53ac9b --- /dev/null +++ b/test/phpunit/phpunit_redirect_test.php @@ -0,0 +1,11 @@ +