From 4687f32e8d98ee22d469efd9b9253ee3d62f1f22 Mon Sep 17 00:00:00 2001 From: Steve Gallo Date: Mon, 12 Jun 2017 16:22:23 -0400 Subject: [PATCH 1/7] Remove commented out code, improve control structure flow --- classes/ETL/DataEndpoint/DirectoryScanner.php | 343 +++++++++--------- 1 file changed, 171 insertions(+), 172 deletions(-) diff --git a/classes/ETL/DataEndpoint/DirectoryScanner.php b/classes/ETL/DataEndpoint/DirectoryScanner.php index f7c86afdab..7af1919760 100644 --- a/classes/ETL/DataEndpoint/DirectoryScanner.php +++ b/classes/ETL/DataEndpoint/DirectoryScanner.php @@ -175,8 +175,6 @@ public function __construct(DataEndpointOptions $options, Log $logger = null) foreach ( $options as $property => $value ) { - // $this->logger->debug("OPT: $property => " . print_r($value, true)); - // Skip null values and use property defaults if ( null === $value ) { @@ -376,209 +374,210 @@ public function connect() { // The first time a connection is made the endpoint handle should be set. - if ( null === $this->handle ) { - - // The PHP docs on SPL iterators at http://php.net/manual/en/spl.iterators.php - // are sparse so these are some notes on usage of RecursiveDirectoryIterator, - // RecursiveIteratorIterator, and filtering. - // - // A note on recursive iterators: If a directory doesn't match the filter - // (i.e., it returns FALSE) then the directory will not be recursed - // into. Rather than using RecursiveCallbackFilterIterator it may be better to - // use CallbackFilterIterator after RecursiveIteratorIterator depending on the - // situation. - // - // We want to be able to filter on the path and file separately so we are using an - // instance of CallbackFilterIterator to do this instead of RegexIterator. - // - // RecursiveDirectoryIterator provides a mechanism to recursively iterate over - // directories and the files that they contain. It does not iterate beyond the - // *root* directory automatically. For this we need to roll our own or use - // RecursiveIteratorIterator. - // - // RecursiveRegexIterator can be attached to the RecursiveDirectoryIterator to - // filter paths. Note that if a directory doesn't match the regex it will not - // be recursed into so this is not well suited for regexes that should be - // applied to files. - // - // RecursiveCallbackFilterIterator needs to operate on a RecursiveIterator so - // it cannot be applied to a RecursiveIteratorIterator, use - // CallbackFilterIterator on a RecursiveIteratorIterator. - // - // RecursiveIteratorIterator operates on classes implementing - // RecursiveIterator and will handle the recursion into the children. The - // mode can be specified as LEAVES_ONLY (default), SELF_FIRST, CHILD_FIRST. - // Note that LEAVES_ONLY will include "." and ".." directories so these will - // need to be filtered out AFTER applying this iterator. - // - // RegexIterator can be attached to RecursiveIteratorIterator to filter the - // paths that it returns, but note that the regex applies to the full path, - // not just the name of the file itself. - // - // Note that we want to be able to filter on the path and file separately and are using - // the CallbackFilterIterator to do this instead of RegexIterator. - - // We are conditionally creating multiple iterators that will consume earlier - // iterators. Keep track of the current iterator. - - $iterator = null; + if ( null !== $this->handle ) { + return $this->handle; + } + + // The PHP docs on SPL iterators at http://php.net/manual/en/spl.iterators.php + // are sparse so these are some notes on usage of RecursiveDirectoryIterator, + // RecursiveIteratorIterator, and filtering. + // + // A note on recursive iterators: If a directory doesn't match the filter + // (i.e., it returns FALSE) then the directory will not be recursed + // into. Rather than using RecursiveCallbackFilterIterator it may be better to + // use CallbackFilterIterator after RecursiveIteratorIterator depending on the + // situation. + // + // We want to be able to filter on the path and file separately so we are using an + // instance of CallbackFilterIterator to do this instead of RegexIterator. + // + // RecursiveDirectoryIterator provides a mechanism to recursively iterate over + // directories and the files that they contain. It does not iterate beyond the + // *root* directory automatically. For this we need to roll our own or use + // RecursiveIteratorIterator. + // + // RecursiveRegexIterator can be attached to the RecursiveDirectoryIterator to + // filter paths. Note that if a directory doesn't match the regex it will not + // be recursed into so this is not well suited for regexes that should be + // applied to files. + // + // RecursiveCallbackFilterIterator needs to operate on a RecursiveIterator so + // it cannot be applied to a RecursiveIteratorIterator, use + // CallbackFilterIterator on a RecursiveIteratorIterator. + // + // RecursiveIteratorIterator operates on classes implementing + // RecursiveIterator and will handle the recursion into the children. The + // mode can be specified as LEAVES_ONLY (default), SELF_FIRST, CHILD_FIRST. + // Note that LEAVES_ONLY will include "." and ".." directories so these will + // need to be filtered out AFTER applying this iterator. + // + // RegexIterator can be attached to RecursiveIteratorIterator to filter the + // paths that it returns, but note that the regex applies to the full path, + // not just the name of the file itself. + // + // Note that we want to be able to filter on the path and file separately and are using + // the CallbackFilterIterator to do this instead of RegexIterator. + + // We are conditionally creating multiple iterators that will consume earlier + // iterators. Keep track of the current iterator. + + $iterator = null; + $this->logger->debug( + sprintf("Connecting directory scanner to %s", $this->path) + ); + + try { + $directoryIterator = new \RecursiveDirectoryIterator($this->path); + $iterator = $directoryIterator; + } catch ( Exception $e ) { + $this->logAndThrowException( + sprintf("Error opening directory '%s': %s", $this->path, $e->getMessage()) + ); + } + + // Apply the recursion iterator that will traverse the directory iterator + + try { + $flattenedIterator = new \RecursiveIteratorIterator($iterator); + } catch ( Exception $e ) { + $this->logAndThrowException( + sprintf("Error creating RecursiveIteratorIterator: %s", $e->getMessage()) + ); + } + + if ( null !== $this->maxRecursionDepth ) { $this->logger->debug( - sprintf("Connecting directory scanner to %s", $this->path) + sprintf("Set max recursion depth: %d", $this->maxRecursionDepth) ); + $flattenedIterator->setMaxDepth($this->maxRecursionDepth); + } - try { - $directoryIterator = new \RecursiveDirectoryIterator($this->path); - $iterator = $directoryIterator; - } catch ( Exception $e ) { - $this->logAndThrowException( - sprintf("Error opening directory '%s': %s", $this->path, $e->getMessage()) - ); - } + $iterator = $flattenedIterator; - // Apply the recursion iterator that will traverse the directory iterator + // Filter out directories "." and "..". This and other filters could be + // included in a single CallbackFilterIterator bit I've decided to keep them + // split out for readability, debugging, and error reporting. - try { - $flattenedIterator = new \RecursiveIteratorIterator($iterator); - } catch ( Exception $e ) { - $this->logAndThrowException( - sprintf("Error creating RecursiveIteratorIterator: %s", $e->getMessage()) - ); - } + // For the CallbackFilter classes, the types of the callback parameters depend + // on the flags passed to RecursiveDirectoryIterator::__construct(). In our + // case, $current is a SplFileInfo object and the key is the fill path to the + // file. - if ( null !== $this->maxRecursionDepth ) { - $this->logger->debug( - sprintf("Set max recursion depth: %d", $this->maxRecursionDepth) - ); - $flattenedIterator->setMaxDepth($this->maxRecursionDepth); - } + try { + $dotDirFilterIterator = new \CallbackFilterIterator( + $iterator, + function ($current, $key, $iterator) { + if ( $iterator->isDot() ) { + return false; + } + return true; + } + ); + $iterator = $dotDirFilterIterator; + } catch ( Exception $e ) { + $this->logAndThrowException( + sprintf("Error applying dot directory filters: %s", $e->getMessage()) + ); + } + + // We do not want to return directories as part of the traversal so we need to + // apply directory/file patterns and other checks AFTER traversing the + // directory tree. Otherwise, directories may be inadvertantly filtered and + // the files missed. - $iterator = $flattenedIterator; + if ( null !== $this->directoryPattern || null !== $this->filePattern ) { - // Filter out directories "." and "..". This and other filters could be - // included in a single CallbackFilterIterator bit I've decided to keep them - // split out for readability, debugging, and error reporting. + // PHP 5.3 does not allow us to reference the object in the callback + $dirPattern = $this->directoryPattern; + $filePattern = $this->filePattern; - // For the CallbackFilter classes, the types of the callback parameters depend - // on the flags passed to RecursiveDirectoryIterator::__construct(). In our - // case, $current is a SplFileInfo object and the key is the fill path to the - // file. + $this->logger->info( + sprintf( + "Applying pattern filters: (directory: %s, file: %s)", + ( null === $dirPattern ? "null" : $dirPattern ), + ( null === $filePattern ? "null" : $filePattern ) + ) + ); try { - $dotDirFilterIterator = new \CallbackFilterIterator( + $patternCallbackIterator = new \CallbackFilterIterator( $iterator, - function ($current, $key, $iterator) { - if ( $iterator->isDot() ) { + function ($current, $key, $iterator) use ($dirPattern, $filePattern) { + if ( + null !== $dirPattern + && ! preg_match($dirPattern, $current->getPath()) + ) { + return false; + } + + if ( + null !== $filePattern + && ! preg_match($filePattern, $current->getFilename()) + ) { return false; } + return true; } ); - $iterator = $dotDirFilterIterator; + $iterator = $patternCallbackIterator; } catch ( Exception $e ) { $this->logAndThrowException( - sprintf("Error applying dot directory filters: %s", $e->getMessage()) - ); - } - - // We do not want to return directories as part of the traversal so we need to - // apply directory/file patterns and other checks AFTER traversing the - // directory tree. Otherwise, directories may be inadvertantly filtered and - // the files missed. - - if ( null !== $this->directoryPattern || null !== $this->filePattern ) { - - // PHP 5.3 does not allow us to reference the object in the callback - $dirPattern = $this->directoryPattern; - $filePattern = $this->filePattern; - - $this->logger->info( sprintf( - "Applying pattern filters: (directory: %s, file: %s)", - ( null === $dirPattern ? "null" : $dirPattern ), - ( null === $filePattern ? "null" : $filePattern ) + "Error applying pattern filters (directory: %s, file: %s): %s", + ( null === $this->directoryPattern ? "null" : $this->directoryPattern ), + ( null === $this->filePattern ? "null" : $this->filePattern ), + $e->getMessage() ) ); - - try { - $patternCallbackIterator = new \CallbackFilterIterator( - $iterator, - function ($current, $key, $iterator) use ($dirPattern, $filePattern) { - if ( - null !== $dirPattern - && ! preg_match($dirPattern, $current->getPath()) - ) { - return false; - } - - if ( - null !== $filePattern - && ! preg_match($filePattern, $current->getFilename()) - ) { - return false; - } - - return true; - } - ); - $iterator = $patternCallbackIterator; - } catch ( Exception $e ) { - $this->logAndThrowException( - sprintf( - "Error applying pattern filters (directory: %s, file: %s): %s", - ( null === $this->directoryPattern ? "null" : $this->directoryPattern ), - ( null === $this->filePattern ? "null" : $this->filePattern ), - $e->getMessage() - ) - ); - } } + } + + if ( null !== $this->lastModifiedStartTimestamp || null !== $this->lastModifiedEndTimestamp ) { - if ( null !== $this->lastModifiedStartTimestamp || null !== $this->lastModifiedEndTimestamp ) { + // PHP 5.3 does not allow us to reference the object in the callback + $lmStartTs = $this->lastModifiedStartTimestamp; + $lmEndTs = $this->lastModifiedEndTimestamp; - // PHP 5.3 does not allow us to reference the object in the callback - $lmStartTs = $this->lastModifiedStartTimestamp; - $lmEndTs = $this->lastModifiedEndTimestamp; + $this->logger->info( + sprintf( + "Applying mtime filter: (start: %s, end: %s)", + ( null === $lmStartTs ? "null" : $lmStartTs ), + ( null === $lmEndTs ? "null" : $lmEndTs ) + ) + ); - $this->logger->info( + try { + $callbackIterator = new \CallbackFilterIterator( + $iterator, + function ($current, $key, $iterator) use ($lmStartTs, $lmEndTs) { + if ( null !== $lmStartTs && null !== $lmEndTs ) { + return $current->getMTime() >= $lmStartTs && $current->getMTime() <= $lmEndTs; + } elseif ( null !== $lmStartTs ) { + return $current->getMTime() >= $lmStartTs; + } elseif ( null !== $lmEndTs ) { + return $current->getMTime() <= $lmEndTs; + } else { + return false; + } + } + ); + $iterator = $callbackIterator; + } catch ( Exception $e ) { + $this->logAndThrowException( sprintf( - "Applying mtime filter: (start: %s, end: %s)", - ( null === $lmStartTs ? "null" : $lmStartTs ), - ( null === $lmEndTs ? "null" : $lmEndTs ) + "Error applying last modified filter (start: %s, end: %s): %s", + ( null === $this->lastModifiedStartTimestamp ? "null" : $this->lastModifiedStartTimestamp ), + ( null === $this->lastModifiedEndTimestamp ? "null" : $this->lastModifiedEndTimestamp ), + $e->getMessage() ) ); - - try { - $callbackIterator = new \CallbackFilterIterator( - $iterator, - function ($current, $key, $iterator) use ($lmStartTs, $lmEndTs) { - if ( null !== $lmStartTs && null !== $lmEndTs ) { - return $current->getMTime() >= $lmStartTs && $current->getMTime() <= $lmEndTs; - } elseif ( null !== $lmStartTs ) { - return $current->getMTime() >= $lmStartTs; - } elseif ( null !== $lmEndTs ) { - return $current->getMTime() <= $lmEndTs; - } else { - return false; - } - } - ); - $iterator = $callbackIterator; - } catch ( Exception $e ) { - $this->logAndThrowException( - sprintf( - "Error applying last modified filter (start: %s, end: %s): %s", - ( null === $this->lastModifiedStartTimestamp ? "null" : $this->lastModifiedStartTimestamp ), - ( null === $this->lastModifiedEndTimestamp ? "null" : $this->lastModifiedEndTimestamp ), - $e->getMessage() - ) - ); - } } - - $this->handle = $iterator; } + $this->handle = $iterator; + // Rewind the handle so it is ready to use. $this->handle->rewind(); From 495daacdfb5a9d117186293e56398ec1666b8bb2 Mon Sep 17 00:00:00 2001 From: Steve Gallo Date: Mon, 12 Jun 2017 16:23:23 -0400 Subject: [PATCH 2/7] Allow DataEndpoint to auto-discover classes implementing iDataEndpoint --- classes/ETL/DataEndpoint.php | 234 ++++++++++++++---- classes/ETL/DataEndpoint/DirectoryScanner.php | 8 + classes/ETL/DataEndpoint/File.php | 4 + classes/ETL/DataEndpoint/JsonFile.php | 9 + classes/ETL/DataEndpoint/Mysql.php | 10 +- classes/ETL/DataEndpoint/Oracle.php | 6 +- classes/ETL/DataEndpoint/Postgres.php | 10 +- classes/ETL/DataEndpoint/Rest.php | 7 +- 8 files changed, 239 insertions(+), 49 deletions(-) diff --git a/classes/ETL/DataEndpoint.php b/classes/ETL/DataEndpoint.php index 58f1dae511..44735a75dd 100644 --- a/classes/ETL/DataEndpoint.php +++ b/classes/ETL/DataEndpoint.php @@ -1,7 +1,7 @@ * @date 2015-09-25 @@ -12,47 +12,49 @@ use ETL\DataEndpoint\DataEndpointOptions; use ETL\DataEndpoint\iDataEndpoint; -use \Exception; -use \Log; +use Exception; +use Log; class DataEndpoint { + /** + * Namesapce, relative to the current namespace, where data endpoint classes are + * defined. This is used to automatically search for defined endpoints. + * + * @var string + */ + + private static $dataEndpointRelativeNs = 'DataEndpoint'; + + /** + * Fully namespaced interface that all data endpoints must implement + * + * @var string + */ + + private static $dataEndpointRequiredInterface = 'ETL\\DataEndpoint\\iDataEndpoint'; - // Default namespace for ingestors, can be overriden in the ETL configuration file - private static $defaultNs = "ETL\\DataEndpoint\\"; - - // List of defined data endpoint types. These are defined as constants so they can be used in the - // ETL configuration file. - - const TYPE_MYSQL = "mysql"; - const TYPE_POSTGRES = "postgres"; - const TYPE_ORACLE = "oracle"; - const TYPE_FILE = "file"; - const TYPE_JSONFILE = "jsonfile"; - const TYPE_DIRECTORY_SCANNER = "directoryscanner"; - const TYPE_REST = "rest"; - - private static $supportedTypes = array( - self::TYPE_MYSQL, - self::TYPE_ORACLE, - self::TYPE_POSTGRES, - self::TYPE_FILE, - self::TYPE_JSONFILE, - self::TYPE_DIRECTORY_SCANNER, - self::TYPE_REST - ); - - private static $classmap = array( - self::TYPE_MYSQL => 'ETL\DataEndpoint\Mysql', - self::TYPE_POSTGRES => 'ETL\DataEndpoint\Postgres', - self::TYPE_ORACLE => 'ETL\DataEndpoint\Oracle', - self::TYPE_FILE => 'ETL\DataEndpoint\File', - self::TYPE_JSONFILE => 'ETL\DataEndpoint\JsonFile', - self::TYPE_DIRECTORY_SCANNER => 'ETL\DataEndpoint\DirectoryScanner', - self::TYPE_REST => 'ETL\DataEndpoint\Rest' - ); - - /* ------------------------------------------------------------------------------------------ + /** + * The name of the constant expected to be defined in all data endpoint classes. This + * is used to identify the name that will be used to refer to the endpoint in + * configuration files. + * + * @var string + */ + + private static $endpointNameConstant = 'ENDPOINT_NAME'; + + /** + * Associative array where the keys are data endpoint names and the values are fully + * namespaced class names that implement those endpoints. This will be null until it + * is initialized. + * + * @var array | null + */ + + private static $endpointClassMap = null; + + /** ----------------------------------------------------------------------------------------- * Private constructor ensures the singleton can't be instantiated. * ------------------------------------------------------------------------------------------ */ @@ -61,10 +63,151 @@ private function __construct() { } - /* ------------------------------------------------------------------------------------------ + /** ----------------------------------------------------------------------------------------- + * Return the list of data endpoint names that are currently configured/supported. + * + * @return array A list of data endpoint names + * ------------------------------------------------------------------------------------------ + */ + + public static function getDataEndpointNames() + { + return array_keys(self::getDataEndpointInfo()); + } // getDataEndpointNames() + + /** ----------------------------------------------------------------------------------------- + * Return an associative array where the keys are data endpoint names and the values + * are fully namespaced class names that implement those endpoints. + * + * @return array A list of data endpoint names + * ------------------------------------------------------------------------------------------ + */ + + public static function getDataEndpointInfo() + { + self::discover(); + return self::$endpointClassMap; + } // getDataEndpointInfo() + + /** ----------------------------------------------------------------------------------------- + * Discover the list of currently supported data endpoints and constuct a list mapping + * their names to the classes that implement them. All data endpoints must implement + * the interface specified in self::$dataEndpointRequiredInterface. By automatically + * discovering the data endpoints we do not need to modify this file when new + * endpoints are created. + * + * @param boolean $force Set to TRUE to force re-discovery of endpoints + * @param Log $logger A PEAR Log object or null to use the null logger. + * ------------------------------------------------------------------------------------------ + */ + + public static function discover($force = false, Log $logger = null) + { + if ( null !== self::$endpointClassMap && ! $force ) { + return; + } + + // As per PSR-4 (http://www.php-fig.org/psr/psr-4/) the contiguous sub-namespace + // names after the "namespace prefix" correspond to a subdirectory within a "base + // directory", in which the namespace separators represent directory separators. + // This means that we can assume subdirectories under the directory where this + // file resides represent sub-namespaces. + + // The endpoint directory is relative to the directory where this file is found + $endpointDir = dirname(__FILE__) . '/' . strtr(self::$dataEndpointRelativeNs, '\\', '/'); + $endpointDirLength = strlen($endpointDir); + + // Recursively traverse the directory where the endpoints live and discover any + // defined endpoints. + + $dirIterator = new \RecursiveDirectoryIterator($endpointDir); + $flattenedIterator = new \RecursiveIteratorIterator($dirIterator); + self::$endpointClassMap = array(); + + // The iterator returns SplFileInfo objects where the keys are the path to the + // file. + + foreach ( $flattenedIterator as $path => $fileInfo ) { + + if ( $flattenedIterator->isDot() ) { + continue; + } + + // Set up an array so we can programmatically construct the namespace based on + // the path to the class file + + $constructedNamespace = array(__NAMESPACE__, self::$dataEndpointRelativeNs); + + // Construct any additional sub-namespaces for subdirectories discovered under + // the endpoint namespace (e.g. the subdirectory DataEndpoint/Filters + // translates to the namespace DataEndpoint\Filters. + + $subDirNs = strtr(substr($fileInfo->getPath(), $endpointDirLength + 1), '/', '\\'); + if ( "" !== $subDirNs) { + $constructedNamespace[] = $subDirNs; + } + + // Discover the class name based on the file name. The file name is expected + // to be named . + + $extension = $fileInfo->getExtension(); + $filename = $fileInfo->getFilename(); + // Handle the case where there is no extension + $length = ( strlen($extension) > 0 ? -1 * (strlen($extension) + 1) : strlen($filename) ); + $className = substr($filename, 0, $length); + $constructedNamespace[] = $className; + + $nsClass = implode('\\', $constructedNamespace); + + try { + $r = new \ReflectionClass($nsClass); + + // Ensure that the class is not abstract and implements the required + // interface + + if ( ! $r->isAbstract() && $r->implementsInterface(self::$dataEndpointRequiredInterface) ) { + + if ( $r->hasConstant(self::$endpointNameConstant) ) { + + $name = $r->getConstant(self::$endpointNameConstant); + if ( ! array_key_exists($name, self::$endpointClassMap) ) { + self::$endpointClassMap[$name] = $r->getName(); + } elseif ( null !== $logger ) { + $logger->warning( + sprintf( + "%s Endpoint with name '%s' already exists, skipping", + static::class, + $name + ) + ); + } + + } else if ( null !== $logger ) { + + $logger->warning( + sprintf( + "%s Class '%s' does not define %s, skipping", + static::class, + $r->getName(), + self::$endpointNameConstant + ) + ); + + } + } + } catch ( \ReflectionException $e ) { + // The class does not exist + continue; + } + } + } // discover() + + /** ----------------------------------------------------------------------------------------- * Factory pattern to instantiate a DataEndpoint based on the options. * - * @param $options A DataEndpointOptions object containing options parsed from the ETL config. + * @param DataEndpointOptions $options A DataEndpointOptions object containing options + * parsed from the ETL config. + * @param Log $logger A PEAR Log object or null to use the null logger. * * @return A data endpoint object implementing the iDataEndpoint interface. * @@ -76,12 +219,13 @@ private function __construct() public static function factory(DataEndpointOptions $options, Log $logger = null) { + self::discover(false, $logger); $options->verify(); // If the type is defined and has a mapping to an implementation, create a class for the type. - if ( ! array_key_exists($options->type, self::$classmap) ) { - $msg = __CLASS__ . ": Undefined data endpoint type: '{$options->type}'"; + if ( ! array_key_exists($options->type, self::$endpointClassMap) ) { + $msg = sprintf("%s: Undefined data endpoint type: '%s'", static::class, $options->type); if ( null !== $logger ) { $logger->err($msg); } @@ -89,12 +233,12 @@ public static function factory(DataEndpointOptions $options, Log $logger = null) } // Ensure that the class implements the interface - $className = self::$classmap[$options->type]; + $className = self::$endpointClassMap[$options->type]; $endpoint = new $className($options, $logger); if ( ! $endpoint instanceof iDataEndpoint ) { - $msg = __CLASS__ . ": $className does not implement iDataEndpoint"; + $msg = sprintf("%s: %s does not implement iDataEndpoint", static::class, $className); if ( null !== $logger ) { $logger->err($msg); } diff --git a/classes/ETL/DataEndpoint/DirectoryScanner.php b/classes/ETL/DataEndpoint/DirectoryScanner.php index 7af1919760..9a94b9dbb8 100644 --- a/classes/ETL/DataEndpoint/DirectoryScanner.php +++ b/classes/ETL/DataEndpoint/DirectoryScanner.php @@ -22,6 +22,14 @@ class DirectoryScanner extends aDataEndpoint implements iDataEndpoint, \Iterator { + /** ----------------------------------------------------------------------------------------- + * The ENDPOINT_NAME constant defines the name for this endpoint that should be used + * in configuration files. It also allows us to implement auto-discovery. + * + * @const string + */ + + const ENDPOINT_NAME = 'directoryscanner'; /** ----------------------------------------------------------------------------------------- * Numeric key to use for the default file extension handler. This should be the only diff --git a/classes/ETL/DataEndpoint/File.php b/classes/ETL/DataEndpoint/File.php index 20412e8f47..e877acd234 100644 --- a/classes/ETL/DataEndpoint/File.php +++ b/classes/ETL/DataEndpoint/File.php @@ -12,6 +12,10 @@ class File extends aDataEndpoint implements iDataEndpoint { + // The ENDPOINT_NAME constant defines the name for this endpoint that should be used + // in configuration files. It also allows us to implement auto-discovery. + const ENDPOINT_NAME = 'file'; + // The path to the file. protected $path = null; diff --git a/classes/ETL/DataEndpoint/JsonFile.php b/classes/ETL/DataEndpoint/JsonFile.php index 8007761dfc..1f98ef59e5 100644 --- a/classes/ETL/DataEndpoint/JsonFile.php +++ b/classes/ETL/DataEndpoint/JsonFile.php @@ -18,6 +18,15 @@ class JsonFile extends aStructuredFile implements iStructuredFile { + /** ----------------------------------------------------------------------------------------- + * The ENDPOINT_NAME constant defines the name for this endpoint that should be used + * in configuration files. It also allows us to implement auto-discovery. + * + * @const string + */ + + const ENDPOINT_NAME = 'jsonfile'; + /** ----------------------------------------------------------------------------------------- * @see iDataEndpoint::__construct() * ------------------------------------------------------------------------------------------ diff --git a/classes/ETL/DataEndpoint/Mysql.php b/classes/ETL/DataEndpoint/Mysql.php index c93acf29e3..9870af2075 100644 --- a/classes/ETL/DataEndpoint/Mysql.php +++ b/classes/ETL/DataEndpoint/Mysql.php @@ -10,10 +10,18 @@ namespace ETL\DataEndpoint; use ETL\DataEndpoint\DataEndpointOptions; -use \Log; +use Log; class Mysql extends aRdbmsEndpoint implements iRdbmsEndpoint { + /** ----------------------------------------------------------------------------------------- + * The ENDPOINT_NAME constant defines the name for this endpoint that should be used + * in configuration files. It also allows us to implement auto-discovery. + * + * @const string + */ + + const ENDPOINT_NAME = 'mysql'; /* ------------------------------------------------------------------------------------------ * @see iDataEndpoint::__construct() diff --git a/classes/ETL/DataEndpoint/Oracle.php b/classes/ETL/DataEndpoint/Oracle.php index 4d88fdcd49..7c84e8bdec 100644 --- a/classes/ETL/DataEndpoint/Oracle.php +++ b/classes/ETL/DataEndpoint/Oracle.php @@ -10,11 +10,15 @@ namespace ETL\DataEndpoint; use ETL\DataEndpoint\DataEndpointOptions; -use \Log; +use Log; class Oracle extends aRdbmsEndpoint implements iRdbmsEndpoint { + // The ENDPOINT_NAME constant defines the name for this endpoint that should be used + // in configuration files. It also allows us to implement auto-discovery. + const ENDPOINT_NAME = 'oracle'; + public function __construct(DataEndpointOptions $options, Log $logger = null) { parent::__construct($options, $logger); diff --git a/classes/ETL/DataEndpoint/Postgres.php b/classes/ETL/DataEndpoint/Postgres.php index 3f5b27b2d6..a205b77555 100644 --- a/classes/ETL/DataEndpoint/Postgres.php +++ b/classes/ETL/DataEndpoint/Postgres.php @@ -10,10 +10,18 @@ namespace ETL\DataEndpoint; use ETL\DataEndpoint\DataEndpointOptions; -use \Log; +use Log; class Postgres extends aRdbmsEndpoint implements iRdbmsEndpoint { + /** ----------------------------------------------------------------------------------------- + * The ENDPOINT_NAME constant defines the name for this endpoint that should be used + * in configuration files. It also allows us to implement auto-discovery. + * + * @const string + */ + + const ENDPOINT_NAME = 'postgres'; public function __construct(DataEndpointOptions $options, Log $logger = null) { diff --git a/classes/ETL/DataEndpoint/Rest.php b/classes/ETL/DataEndpoint/Rest.php index 27a841af0a..bdab26ea11 100644 --- a/classes/ETL/DataEndpoint/Rest.php +++ b/classes/ETL/DataEndpoint/Rest.php @@ -12,10 +12,15 @@ namespace ETL\DataEndpoint; use ETL\DataEndpoint\DataEndpointOptions; -use \Log; +use Log; class Rest extends aDataEndpoint implements iDataEndpoint { + // The ENDPOINT_NAME constant defines the name for this endpoint that should be used + // in configuration files. It also allows us to implement auto-discovery. + + const ENDPOINT_NAME = 'rest'; + // The base url for this endpoint protected $baseUrl = null; From 29292afbc91493f03e7bb3e131cc65a6aea93a66 Mon Sep 17 00:00:00 2001 From: Steve Gallo Date: Mon, 12 Jun 2017 16:24:04 -0400 Subject: [PATCH 3/7] Add overseer option to list available data endpoint types --- tools/etl/etl_overseer.php | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/etl/etl_overseer.php b/tools/etl/etl_overseer.php index 65968e2b62..b82430808d 100755 --- a/tools/etl/etl_overseer.php +++ b/tools/etl/etl_overseer.php @@ -52,8 +52,10 @@ 'list-actions' => false, // List available aggregators 'list-aggregators' => false, - // List available data endpoints - 'list-endpoints' => false, + // List the available data endpoint types (e.g., classes) + 'list-endpoint-types' => false, + // List endpoints that have been configured + 'list-configured-endpoints' => false, // List available ETL groups 'list-groups' => false, // List available Ingestors @@ -489,7 +491,18 @@ function ($key) { } break; - case 'list-endpoints': + case 'list-endpoint-types': + \ETL\DataEndpoint::discover(false, $logger); + $endpointInfo = \ETL\DataEndpoint::getDataEndpointInfo(false, $logger); + $headings = array("Name", "Class"); + print implode(LIST_SEPARATOR, $headings) . "\n"; + ksort($endpointInfo); + foreach ( $endpointInfo as $name => $class) { + print "$name\t$class\n"; + } + break; + + case 'list-configured-endpoints': $endpoints = $etlConfig->getDataEndpoints(); $endpointSummary = array(); @@ -643,7 +656,7 @@ function usage_and_exit($msg = null) -k, --chunk-size {none, day, week, month, year} Break up ingestion into chunks of this size. Helps to make more recent data available faster. [default year] - -l, --list {resources, sections, actions, endpoints} | + -l, --list {resources, sections, actions, endpoint-types, configured-endpoints} | List available actions in the specified section, resources, data endpoints, or sections. If a section name is provided list all actions in that section. -m, --last-modified-start-date From c5767ede6863dca2f23b75efb8cb19bc98a85eac Mon Sep 17 00:00:00 2001 From: Steve Gallo Date: Mon, 12 Jun 2017 22:40:08 -0400 Subject: [PATCH 4/7] Satisfy the PHP 5.3 demons --- classes/ETL/DataEndpoint.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/classes/ETL/DataEndpoint.php b/classes/ETL/DataEndpoint.php index 44735a75dd..49196e0d25 100644 --- a/classes/ETL/DataEndpoint.php +++ b/classes/ETL/DataEndpoint.php @@ -176,18 +176,18 @@ public static function discover($force = false, Log $logger = null) $logger->warning( sprintf( "%s Endpoint with name '%s' already exists, skipping", - static::class, + __CLASS__, $name ) ); } - } else if ( null !== $logger ) { + } elseif ( null !== $logger ) { $logger->warning( sprintf( "%s Class '%s' does not define %s, skipping", - static::class, + __CLASS__, $r->getName(), self::$endpointNameConstant ) @@ -225,7 +225,7 @@ public static function factory(DataEndpointOptions $options, Log $logger = null) // If the type is defined and has a mapping to an implementation, create a class for the type. if ( ! array_key_exists($options->type, self::$endpointClassMap) ) { - $msg = sprintf("%s: Undefined data endpoint type: '%s'", static::class, $options->type); + $msg = sprintf("%s: Undefined data endpoint type: '%s'", __CLASS__, $options->type); if ( null !== $logger ) { $logger->err($msg); } @@ -238,7 +238,7 @@ public static function factory(DataEndpointOptions $options, Log $logger = null) $endpoint = new $className($options, $logger); if ( ! $endpoint instanceof iDataEndpoint ) { - $msg = sprintf("%s: %s does not implement iDataEndpoint", static::class, $className); + $msg = sprintf("%s: %s does not implement iDataEndpoint", __CLASS__, $className); if ( null !== $logger ) { $logger->err($msg); } From 637067952526d7035e7be485a7ed56fe02b7c437 Mon Sep 17 00:00:00 2001 From: Steve Gallo Date: Tue, 13 Jun 2017 09:32:38 -0400 Subject: [PATCH 5/7] PHP 5.3 hates me --- classes/ETL/DataEndpoint.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/classes/ETL/DataEndpoint.php b/classes/ETL/DataEndpoint.php index 49196e0d25..14dc2c9e52 100644 --- a/classes/ETL/DataEndpoint.php +++ b/classes/ETL/DataEndpoint.php @@ -150,8 +150,14 @@ public static function discover($force = false, Log $logger = null) // Discover the class name based on the file name. The file name is expected // to be named . - $extension = $fileInfo->getExtension(); + // SplFileInfo::getExtension() is not defined until PHP 5.3.6 + $filename = $fileInfo->getFilename(); + $extension = ''; + if ( false !== ($pos = strrpos($filename, '.')) ) { + $extension = substr($filename, $pos + 1); + } + // Handle the case where there is no extension $length = ( strlen($extension) > 0 ? -1 * (strlen($extension) + 1) : strlen($filename) ); $className = substr($filename, 0, $length); From f047a095e2b44d7819fea2c528a2b6a082bfdd90 Mon Sep 17 00:00:00 2001 From: Steve Gallo Date: Thu, 15 Jun 2017 16:11:22 -0400 Subject: [PATCH 6/7] Changes as per @ryanrath code review --- classes/ETL/DataEndpoint.php | 44 ++++++++++--------- classes/ETL/DataEndpoint/DirectoryScanner.php | 23 +++++----- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/classes/ETL/DataEndpoint.php b/classes/ETL/DataEndpoint.php index 14dc2c9e52..5c6d59be73 100644 --- a/classes/ETL/DataEndpoint.php +++ b/classes/ETL/DataEndpoint.php @@ -18,31 +18,31 @@ class DataEndpoint { /** - * Namesapce, relative to the current namespace, where data endpoint classes are + * Namespace, relative to the current namespace, where data endpoint classes are * defined. This is used to automatically search for defined endpoints. * - * @var string + * @const string */ - private static $dataEndpointRelativeNs = 'DataEndpoint'; + const DATA_ENDPOINT_RELATIVE_NS = 'DataEndpoint'; /** * Fully namespaced interface that all data endpoints must implement * - * @var string + * @const string */ - private static $dataEndpointRequiredInterface = 'ETL\\DataEndpoint\\iDataEndpoint'; + const DATA_ENDPOINT_REQUIRED_INTERFACE = 'ETL\\DataEndpoint\\iDataEndpoint'; /** * The name of the constant expected to be defined in all data endpoint classes. This * is used to identify the name that will be used to refer to the endpoint in * configuration files. * - * @var string + * @const string */ - private static $endpointNameConstant = 'ENDPOINT_NAME'; + const ENDPOINT_NAME_CONSTANT = 'ENDPOINT_NAME'; /** * Associative array where the keys are data endpoint names and the values are fully @@ -90,11 +90,13 @@ public static function getDataEndpointInfo() } // getDataEndpointInfo() /** ----------------------------------------------------------------------------------------- - * Discover the list of currently supported data endpoints and constuct a list mapping - * their names to the classes that implement them. All data endpoints must implement - * the interface specified in self::$dataEndpointRequiredInterface. By automatically - * discovering the data endpoints we do not need to modify this file when new - * endpoints are created. + * Discover the list of currently supported data endpoints and construct a list + * mapping their names to the classes that implement them. All data endpoints must + * implement the interface specified in self::DATA_ENDPOINT_REQUIRED_INTERFACE and + * also define a constant referenced by self::ENDPOINT_NAME_CONSTANT that is set to + * the name of the endpoint (e.g., const ENDPOINT_NAME = 'file.json'. By + * automatically discovering the data endpoints we do not need to modify this file + * when new endpoints are created. * * @param boolean $force Set to TRUE to force re-discovery of endpoints * @param Log $logger A PEAR Log object or null to use the null logger. @@ -114,7 +116,9 @@ public static function discover($force = false, Log $logger = null) // file resides represent sub-namespaces. // The endpoint directory is relative to the directory where this file is found - $endpointDir = dirname(__FILE__) . '/' . strtr(self::$dataEndpointRelativeNs, '\\', '/'); + $endpointDir = dirname(__FILE__) + . DIRECTORY_SEPARATOR + . strtr(self::DATA_ENDPOINT_RELATIVE_NS, '\\', DIRECTORY_SEPARATOR); $endpointDirLength = strlen($endpointDir); // Recursively traverse the directory where the endpoints live and discover any @@ -136,13 +140,13 @@ public static function discover($force = false, Log $logger = null) // Set up an array so we can programmatically construct the namespace based on // the path to the class file - $constructedNamespace = array(__NAMESPACE__, self::$dataEndpointRelativeNs); + $constructedNamespace = array(__NAMESPACE__, self::DATA_ENDPOINT_RELATIVE_NS); // Construct any additional sub-namespaces for subdirectories discovered under // the endpoint namespace (e.g. the subdirectory DataEndpoint/Filters // translates to the namespace DataEndpoint\Filters. - $subDirNs = strtr(substr($fileInfo->getPath(), $endpointDirLength + 1), '/', '\\'); + $subDirNs = strtr(substr($fileInfo->getPath(), $endpointDirLength + 1), DIRECTORY_SEPARATOR, '\\'); if ( "" !== $subDirNs) { $constructedNamespace[] = $subDirNs; } @@ -171,11 +175,11 @@ public static function discover($force = false, Log $logger = null) // Ensure that the class is not abstract and implements the required // interface - if ( ! $r->isAbstract() && $r->implementsInterface(self::$dataEndpointRequiredInterface) ) { + if ( ! $r->isAbstract() && $r->implementsInterface(self::DATA_ENDPOINT_REQUIRED_INTERFACE) ) { - if ( $r->hasConstant(self::$endpointNameConstant) ) { + if ( $r->hasConstant(self::ENDPOINT_NAME_CONSTANT) ) { - $name = $r->getConstant(self::$endpointNameConstant); + $name = $r->getConstant(self::ENDPOINT_NAME_CONSTANT); if ( ! array_key_exists($name, self::$endpointClassMap) ) { self::$endpointClassMap[$name] = $r->getName(); } elseif ( null !== $logger ) { @@ -195,7 +199,7 @@ public static function discover($force = false, Log $logger = null) "%s Class '%s' does not define %s, skipping", __CLASS__, $r->getName(), - self::$endpointNameConstant + self::ENDPOINT_NAME_CONSTANT ) ); @@ -215,7 +219,7 @@ public static function discover($force = false, Log $logger = null) * parsed from the ETL config. * @param Log $logger A PEAR Log object or null to use the null logger. * - * @return A data endpoint object implementing the iDataEndpoint interface. + * @return iDataEndpoint A data endpoint object implementing the iDataEndpoint interface. * * @throws Exception If required options were not provided * @throws Exception If the requested class could not be found diff --git a/classes/ETL/DataEndpoint/DirectoryScanner.php b/classes/ETL/DataEndpoint/DirectoryScanner.php index 9a94b9dbb8..1b493fcc3f 100644 --- a/classes/ETL/DataEndpoint/DirectoryScanner.php +++ b/classes/ETL/DataEndpoint/DirectoryScanner.php @@ -448,21 +448,21 @@ public function connect() try { $flattenedIterator = new \RecursiveIteratorIterator($iterator); + + if ( null !== $this->maxRecursionDepth ) { + $this->logger->debug( + sprintf("Set max recursion depth: %d", $this->maxRecursionDepth) + ); + $flattenedIterator->setMaxDepth($this->maxRecursionDepth); + } + + $iterator = $flattenedIterator; } catch ( Exception $e ) { $this->logAndThrowException( sprintf("Error creating RecursiveIteratorIterator: %s", $e->getMessage()) ); } - if ( null !== $this->maxRecursionDepth ) { - $this->logger->debug( - sprintf("Set max recursion depth: %d", $this->maxRecursionDepth) - ); - $flattenedIterator->setMaxDepth($this->maxRecursionDepth); - } - - $iterator = $flattenedIterator; - // Filter out directories "." and "..". This and other filters could be // included in a single CallbackFilterIterator bit I've decided to keep them // split out for readability, debugging, and error reporting. @@ -476,10 +476,7 @@ public function connect() $dotDirFilterIterator = new \CallbackFilterIterator( $iterator, function ($current, $key, $iterator) { - if ( $iterator->isDot() ) { - return false; - } - return true; + return ( ! $iterator->isDot() ); } ); $iterator = $dotDirFilterIterator; From e6dfe125d427bf9beecdfc7f1098c732dc5485fc Mon Sep 17 00:00:00 2001 From: Steve Gallo Date: Thu, 15 Jun 2017 16:13:10 -0400 Subject: [PATCH 7/7] Fix comment grammar --- classes/ETL/DataEndpoint.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/classes/ETL/DataEndpoint.php b/classes/ETL/DataEndpoint.php index 5c6d59be73..713e47f853 100644 --- a/classes/ETL/DataEndpoint.php +++ b/classes/ETL/DataEndpoint.php @@ -94,7 +94,7 @@ public static function getDataEndpointInfo() * mapping their names to the classes that implement them. All data endpoints must * implement the interface specified in self::DATA_ENDPOINT_REQUIRED_INTERFACE and * also define a constant referenced by self::ENDPOINT_NAME_CONSTANT that is set to - * the name of the endpoint (e.g., const ENDPOINT_NAME = 'file.json'. By + * the name of the endpoint (e.g., const ENDPOINT_NAME = 'file.json'). By * automatically discovering the data endpoints we do not need to modify this file * when new endpoints are created. *