Skip to content

Commit

Permalink
Add URL support to D.O. load task (#1804) (#1808)
Browse files Browse the repository at this point in the history
Added functionality to the digital object load task to import URLs in
addition to files.

Did minor cleanup and improved CLI help to include a list of valid CSV
columns.
  • Loading branch information
mcantelon authored May 2, 2024
1 parent 11ee01f commit 778beab
Showing 1 changed file with 90 additions and 33 deletions.
123 changes: 90 additions & 33 deletions lib/task/digitalobject/digitalObjectLoadTask.class.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,19 @@
*/

/**
* Load a csv list of digital objects.
* Load a CSV list of digital objects.
*
* @author David Juhasz <david@artefactual.com>
*/
class digitalObjectLoadTask extends arBaseTask
{
public const IO_SLUG_COLUMN = 'slug';
public const IO_IDENTIFIER_COLUMN = 'identifier';
public const IO_ID_COLUMN = 'information_object_id';
public const PATH_COLUMN = 'filename';

public const IO_SPECIFIER_COLUMNS = [self::IO_SLUG_COLUMN, self::IO_IDENTIFIER_COLUMN, self::IO_ID_COLUMN];

protected static $count = 0;

private $curObjNum = 0;
Expand Down Expand Up @@ -72,22 +79,20 @@ public function execute($arguments = [], $options = [])
// Get header (first) row
$header = fgetcsv($fh, 1000);

if ((!in_array('information_object_id', $header) && !in_array('identifier', $header) && !in_array('slug', $header)) || !in_array('filename', $header)) {
throw new sfException('Import file must contain an \'information_object_id\', an \'identifier\' or a \'slug\' column, and a \'filename\' column');
}
self::validateColumns($header);

$fileKey = array_search('filename', $header);
$fileKey = array_search(self::PATH_COLUMN, $header);

// If information_object_id column is available, use it for id
if (false !== $idKey = array_search('information_object_id', $header)) {
if (false !== $idKey = array_search(self::IO_ID_COLUMN, $header)) {
$idType = 'id';
}
// If no id, then lookup by identifier
elseif (false !== $idKey = array_search('identifier', $header)) {
elseif (false !== $idKey = array_search(self::IO_IDENTIFIER_COLUMN, $header)) {
$idType = 'identifier';
}
// Lookup by slug
elseif (false !== $idKey = array_search('slug', $header)) {
elseif (false !== $idKey = array_search(self::IO_SLUG_COLUMN, $header)) {
$idType = 'slug';
}

Expand Down Expand Up @@ -159,14 +164,14 @@ public function execute($arguments = [], $options = [])
$digitalObjectName = !is_array($item) ? $item : end($item);

if (null !== $results[1]) {
if (file_exists($path = self::getPath($digitalObjectName, $options))) {
if (self::validUrlOrFilePath($digitalObjectName, $options)) {
// get digital object and delete it.
if (null !== $do = QubitDigitalObject::getById($results[1])) {
$do->delete();
++$this->deletedCount;
}
} else {
$this->log(sprintf("Couldn't read file '{$digitalObjectName}'"));
$this->log(sprintf("Couldn't read file or URL '{$digitalObjectName}'"));
++$this->skippedCount;

continue;
Expand All @@ -186,8 +191,8 @@ public function execute($arguments = [], $options = [])
continue;
}

if (!file_exists($path = self::getPath($item, $options))) {
$this->log(sprintf("Couldn't read file '{$item}'"));
if (!self::validUrlOrFilePath($item, $options)) {
$this->log(sprintf("Couldn't read file of URL '{$item}'"));
++$this->skippedCount;

continue;
Expand All @@ -196,8 +201,8 @@ public function execute($arguments = [], $options = [])
self::addDigitalObject($results[0], $item, $options);
} else {
if (!is_array($item)) {
if (!file_exists($path = self::getPath($item, $options))) {
$this->log(sprintf("Couldn't read file '{$item}'"));
if (!self::validUrlOrFilePath($item, $options)) {
$this->log(sprintf("Couldn't read file of URL '{$item}'"));
++$this->skippedCount;

continue;
Expand All @@ -207,8 +212,8 @@ public function execute($arguments = [], $options = [])
} else {
// If more than one digital object linked to this information object
for ($i = 0; $i < count($item); ++$i) {
if (!file_exists($path = self::getPath($item[$i], $options))) {
$this->log(sprintf("Couldn't read file '{$item[$i]}'"));
if (!self::validUrlOrFilePath($item[$i], $options)) {
$this->log(sprintf("Couldn't read file of URL '{$item[$i]}'"));
++$this->skippedCount;

continue;
Expand Down Expand Up @@ -244,8 +249,8 @@ protected function configure()
new sfCommandOption('application', null, sfCommandOption::PARAMETER_OPTIONAL, 'The application name', true),
new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'cli'),
new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'propel'),
new sfCommandOption('link-source', 's', sfCommandOption::PARAMETER_NONE, 'Link source', null),
new sfCommandOption('path', 'p', sfCommandOption::PARAMETER_OPTIONAL, 'Path prefix for digital objects', null),
new sfCommandOption('link-source', 's', sfCommandOption::PARAMETER_NONE, 'Link source (if importing a file)', null),
new sfCommandOption('path', 'p', sfCommandOption::PARAMETER_OPTIONAL, 'Path or URL prefix for all digital objects', null),
new sfCommandOption('limit', 'l', sfCommandOption::PARAMETER_OPTIONAL, 'Limit number of digital objects imported to n', null),
new sfCommandOption('attach-only', 'a', sfCommandOption::PARAMETER_NONE, 'Always attach digital objects to a new child description', null),
new sfCommandOption('replace', 'r', sfCommandOption::PARAMETER_NONE, 'Delete and replace digital objects', null),
Expand All @@ -255,11 +260,15 @@ protected function configure()

$this->namespace = 'digitalobject';
$this->name = 'load';
$this->briefDescription = 'Load a csv list of digital objects';
$this->briefDescription = 'Load a CSV list of digital objects';

$this->detailedDescription = <<<'EOF'
Load a csv list of digital objects
EOF;
$this->detailedDescription = "Load a CSV list of digital objects\n\n";

$this->detailedDescription .= sprintf(
"Valid CSV columns are '%s' and one of: '%s'",
self::PATH_COLUMN,
implode("', '", self::IO_SPECIFIER_COLUMNS),
);
}

protected function attachDigitalObject($item, $informationObjectId, $options = [])
Expand All @@ -275,6 +284,22 @@ protected function attachDigitalObject($item, $informationObjectId, $options = [
self::addDigitalObject($informationObject->id, $item, $options);
}

/**
 * Ensure the CSV header row contains the columns needed for an import.
 *
 * The header must contain the path/URL column (self::PATH_COLUMN) and at
 * least one of the information object specifier columns
 * (self::IO_SPECIFIER_COLUMNS).
 *
 * @param mixed $columns header row as returned by fgetcsv(); any non-array
 *                       value (fgetcsv() returns false when no row could be
 *                       read) is treated as an invalid header
 *
 * @throws sfException if the header is missing a required column
 */
protected function validateColumns($columns)
{
    // An empty or unreadable file makes fgetcsv() return false; treat that as
    // an invalid header so the sfException below is raised rather than a
    // TypeError/warning from in_array() being handed a non-array
    $valid = is_array($columns);

    // First check for existence of column indicating file path or URL
    if ($valid) {
        $valid = in_array(self::PATH_COLUMN, $columns, true);
    }

    // Second check for existence of an information object specifier column
    if ($valid) {
        $valid = count(array_intersect(self::IO_SPECIFIER_COLUMNS, $columns)) > 0;
    }

    // Throw error if columns aren't valid
    if (!$valid) {
        throw new sfException("Import file must contain a '".self::PATH_COLUMN."' column and one of the following: '".implode("', '", self::IO_SPECIFIER_COLUMNS)."'");
    }
}

protected function getPath($path, $options = [])
{
if (isset($options['path'])) {
Expand All @@ -284,20 +309,44 @@ protected function getPath($path, $options = [])
return $path;
}

protected function addDigitalObject($objectId, $path, $options = [])
protected function validUrlOrFilePath($url_or_path, $options)
{
++$this->curObjNum;
$url_or_path = self::getPath($url_or_path, $options);

$path = self::getPath($path, $options);
// Check first for a file (as this is fastest and most likely)
if (file_exists($url_or_path)) {
return true;
}

$filename = basename($path);
// If it's not a file, assume it's a URL and dismiss if invalid
if (!filter_var($url_or_path, FILTER_VALIDATE_URL)) {
return false;
}

if (!file_exists($path)) {
$this->log("Couldn't read file '{$path}'");
// Check if URL exists
$headers = @get_headers($url_or_path);

if ($headers && strpos($headers[0], '200')) {
return true;
}

// Not a file path or valid, existing URL
return false;
}

protected function addDigitalObject($objectId, $path, $options = [])
{
++$this->curObjNum;

if (!self::validUrlOrFilePath($path, $options)) {
$this->log("Couldn't read file or URL '{$path}'");

return;
}

$path = self::getPath($path, $options);
$filename = basename($path);

$remainingImportCount = $this->totalObjCount - $this->skippedCount - $importedCount;
$operation = $options['replace'] ? 'Replacing with' : 'Loading';
$message = sprintf("%s '%s' (%d of %d remaining", $operation, $filename, $this->curObjNum, $remainingImportCount);
Expand All @@ -313,13 +362,21 @@ protected function addDigitalObject($objectId, $path, $options = [])
$do = new QubitDigitalObject();
$do->objectId = $objectId;

if ($options['link-source']) {
if (false === $do->importFromFile($path)) {
return;
if (file_exists($path)) {
// Add digital object from file
if ($options['link-source']) {
if (false === $do->importFromFile($path)) {
return;
}
} else {
$do->usageId = QubitTerm::MASTER_ID;
$do->assets[] = new QubitAsset($path);
}
} else {
$do->usageId = QubitTerm::MASTER_ID;
$do->assets[] = new QubitAsset($path);
// Add digital object from URL
if (false === $do->importFromURI($path)) {
return;
}
}

$do->save($options['conn']);
Expand Down

0 comments on commit 778beab

Please sign in to comment.