Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move to using custom Ingestor for cloud resource specifications #1489

Merged
merged 10 commits into from
Feb 25, 2021
140 changes: 140 additions & 0 deletions classes/ETL/Ingestor/CloudResourceSpecsStateTransformIngestor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
<?php
/* ==========================================================================================
* This class simulates a Finite State Machine to reconstruct the start and end time of a specific
* set of vcpus and memory for a cloud host. It get a set of rows that contains the vcpus and memory
* sorted by resource, host and start time of the configuration. This list is iterated over and an end
* time is set anytime a changed is detected in the number of vcpus or memory for a host.
*
* If no changes are found the current date is considered the end date of the configuration
*
* A -1 value for the vcpus and memory_mb means the host was not available on that day
*
* @author Greg Dean <gmdean@buffalo.edu>
* @date 2021-01-27
*/

namespace ETL\Ingestor;

use ETL\aOptions;
use ETL\iAction;
use ETL\aAction;
use ETL\Configuration\EtlConfiguration;
use ETL\EtlOverseerOptions;

use Psr\Log\LoggerInterface;

class CloudResourceSpecsStateTransformIngestor extends pdoIngestor implements iAction
{

private $_instance_state;

/**
* @see ETL\Ingestor\pdoIngestor::__construct()
*/
public function __construct(aOptions $options, EtlConfiguration $etlConfig, LoggerInterface $logger = null)
{
parent::__construct($options, $etlConfig, $logger);

$this->_end_time = $etlConfig->getVariableStore()->endDate ? date('Y-m-d H:i:s', strtotime($etlConfig->getVariableStore()->endDate)) : null;

$this->resetInstance();
}

private function initInstance($srcRecord)
{
// Since we only get information for when a configuration changes we assume a configuration has an end date
// of today unless we have a row that tells us otherwise
$default_end_time = isset($this->_end_time) ? $this->_end_time : date('Y-m-d') . ' 23:59:59';

$this->_instance_state = array(
'resource_id' => $srcRecord['resource_id'],
'hostname' => $srcRecord['hostname'],
'vcpus' => $srcRecord['vcpus'],
'memory_mb' => $srcRecord['memory_mb'],
'start_date_ts' => strtotime($srcRecord['fact_date'] . " 00:00:00"),
'end_date_ts' => strtotime($default_end_time),
'start_day_id' => date('Y', strtotime($srcRecord['fact_date'])) * 100000 + date('z', strtotime($srcRecord['fact_date'])) + 1,
'end_day_id' => date('Y', strtotime($default_end_time)) * 100000 + date('z', strtotime($default_end_time)) + 1
);
}

private function resetInstance()
{
$this->_instance_state = null;
}

private function updateInstance($srcRecord)
{
// The -1 is to make sure we use the last second of the previous day
$end_date_timestamp = strtotime($srcRecord['fact_date'] . " 00:00:00") - 1;
$this->_instance_state['end_date_ts'] = $end_date_timestamp;

// date(z) is zero indexed so +1 is needed to get the correct day of the year
$this->_instance_state['end_day_id'] = date('Y', $end_date_timestamp) * 100000 + date('z', $end_date_timestamp) + 1;
}

/**
* @see ETL\Ingestor\pdoIngestor::transform()
*/
protected function transform(array $srcRecord, &$orderId)
{
// We want to just flush when we hit the dummy row
if ($srcRecord['fact_date'] === 0) {
if (isset($this->_instance_state)) {
return array($this->_instance_state);
} else {
return array();
}
}

if ($this->_instance_state === null) {
if($srcRecord['vcpus'] == -1 && $srcRecord['memory_mb'] == -1) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What scenario is handled here? When would you have the first record have vcpus and memory_mb as -1?

return array();
}

$this->initInstance($srcRecord);
}

$transformedRecord = array();

if (($this->_instance_state['hostname'] != $srcRecord['hostname']) || ($this->_instance_state['resource_id'] != $srcRecord['resource_id']) || ($this->_instance_state['vcpus'] != $srcRecord['vcpus'] || $this->_instance_state['memory_mb'] != $srcRecord['memory_mb'])) {

// Only update the instance if the only thing that is different between $srcRecord and $this->_instance_state is that either the memory or vcpus changed
if (($this->_instance_state['vcpus'] != $srcRecord['vcpus'] || $this->_instance_state['memory_mb'] != $srcRecord['memory_mb'])
&& ($this->_instance_state['hostname'] == $srcRecord['hostname']) && ($this->_instance_state['resource_id'] == $srcRecord['resource_id'])) {
$this->updateInstance($srcRecord);
}

$transformedRecord[] = $this->_instance_state;
$this->resetInstance();

// Under most circumstances when we detect a change we want to start a new row with data from the row that has changed. This is not
// the case when the change detected is a -1 value for vcpus or memory_mb. When vcpus or memory_mb is -1 it means the host has been
// removed and we just want to end the row and not create a new row.
if($srcRecord['vcpus'] != -1 && $srcRecord['memory_mb'] != -1) {
$this->initInstance($srcRecord);
}
}

return $transformedRecord;
}

protected function getSourceQueryString()
{
$sql = parent::getSourceQueryString();

// Due to the way the Finite State Machine handles the rows in event reconstruction, the last row
// is lost. To work around this we add a dummy row filled with zeroes.
$colCount = count($this->etlSourceQuery->records);
$unionValues = array_fill(0, $colCount, 0);
$sql .= "\nUNION ALL\nSELECT " . implode(',', $unionValues) . "\nORDER BY 1 DESC, 2 ASC, 5 ASC";

return $sql;
}

public function transformHelper(array $srcRecord)
{
$orderId = 0;
return $this->transform($srcRecord, $orderId);
}
}
4 changes: 2 additions & 2 deletions configuration/etl/etl.d/cloud_ingest_resource_specs.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@
"ingest-cloud-resource-specs": [
{
"name": "CloudResourceSpecsReconstructor",
"class": "DatabaseIngestor",
"definition_file": "cloud_common/resource_specifications.json",
"class": "CloudResourceSpecsStateTransformIngestor",
"definition_file": "cloud_common/resource_specifications_transformer.json",
"description": "Sets a start and end time for memory and vcpu paring for a compute node on a cloud resource"
}
],
Expand Down
93 changes: 54 additions & 39 deletions configuration/etl/etl.d/xdmod-migration-9_0_0-9_5_0.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,25 @@
"namespace": "ETL\\Maintenance",
"class": "ExecuteSql",
"options_class": "MaintenanceOptions"
},
"cloud-migration-9_0_0-9_5_0": {
},
"cloud-migration-9_0_0-9_5_0": {
"namespace": "ETL\\Ingestor",
"options_class": "IngestorOptions",
"endpoints": {
"source": {
"type": "mysql",
"name": "Cloud DB",
"config": "datawarehouse",
"schema": "modw_cloud"
},
"destination": {
"type": "mysql",
"name": "Cloud DB",
"config": "datawarehouse",
"schema": "modw_cloud"
}
}
}
"source": {
"type": "mysql",
"name": "Cloud DB",
"config": "datawarehouse",
"schema": "modw_cloud"
},
"destination": {
"type": "mysql",
"name": "Cloud DB",
"config": "datawarehouse",
"schema": "modw_cloud"
}
}
}
},
"migration-9_0_0-9_5_0": [
{
Expand Down Expand Up @@ -104,28 +104,43 @@
}
],
"cloud-migration-9_0_0-9_5_0": [
{
"#": "Asset data must be aggregated post ingestion",
"name": "CloudResourceSpecsAggregator",
"class": "SimpleAggregator",
"namespace": "ETL\\Aggregator",
"options_class": "AggregatorOptions",
"description": "Aggregate cloud records.",
"definition_file": "cloud_common/cloud_resource_specs_aggregation.json",
"table_prefix": "resourcespecsfact_by_",
"aggregation_units": [
"day", "month", "quarter", "year"
],
"endpoints": {
"destination": {
"type": "mysql",
"name": "Aggregate DB",
"config": "datawarehouse",
"schema": "modw_aggregates",
"create_schema_if_not_exists": true,
"truncate_destination": true
{
"name": "CloudResourceSpecsReconstructor",
"class": "CloudResourceSpecsStateTransformIngestor",
"definition_file": "cloud_common/resource_specifications_transformer.json",
"description": "Sets a start and end time for memory and vcpu paring for a compute node on a cloud resource",
"endpoints": {
"destination": {
"type": "mysql",
"name": "Cloud DB",
"config": "datawarehouse",
"schema": "modw_cloud",
"truncate_destination": true
}
}
}
}
]
},
{
"#": "Asset data must be aggregated post ingestion",
"name": "CloudResourceSpecsAggregator",
"class": "SimpleAggregator",
"namespace": "ETL\\Aggregator",
"options_class": "AggregatorOptions",
"description": "Aggregate cloud records.",
"definition_file": "cloud_common/cloud_resource_specs_aggregation.json",
"table_prefix": "resourcespecsfact_by_",
"aggregation_units": [
"day", "month", "quarter", "year"
],
"endpoints": {
"destination": {
"type": "mysql",
"name": "Aggregate DB",
"config": "datawarehouse",
"schema": "modw_aggregates",
"create_schema_if_not_exists": true,
"truncate_destination": true
}
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"table_definition": {
"$ref": "${table_definition_dir}/cloud_common/cloud_resource_specs.json#/table_definition"
},
"destination_record_map": {
"cloud_resource_specs": {
"resource_id": "resource_id",
"hostname": "hostname",
"vcpus": "vcpus",
"memory_mb": "memory_mb",
"start_date_ts": "start_date_ts",
"end_date_ts": "end_date_ts",
"start_day_id": "start_day_id",
"end_day_id": "end_day_id"
}
},
"source_query": {
"records": {
"resource_id": "srs.resource_id",
"hostname": "srs.hostname",
"vcpus": "srs.vcpus",
"memory_mb": "srs.memory_mb",
"fact_date": "srs.fact_date",
"start_date_ts": -1,
"end_date_ts": -1,
"start_day_id": -1,
"end_day_id": -1
},
"joins": [{
"name": "staging_resource_specifications",
"schema": "${SOURCE_SCHEMA}",
"alias": "srs"
}]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@
{
"name": "vcpus",
"type": "int(5)",
"nullable": false,
"default": null
"nullable": false
},
{
"name": "start_date_ts",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"table_definition": {
"$ref": "../etl_tables_8.0.0.d/cloud_resource_specs.json#/table_definition"
},
"destination_record_map": {
"cloud_resource_specs": {
"resource_id": "resource_id",
"hostname": "hostname",
"vcpus": "vcpus",
"memory_mb": "memory_mb",
"start_date_ts": "start_date_ts",
"end_date_ts": "end_date_ts",
"start_day_id": "start_day_id",
"end_day_id": "end_day_id"
}
},
"source_query": {
"records": {
"resource_id": "srs.resource_id",
"hostname": "srs.hostname",
"vcpus": "srs.vcpus",
"memory_mb": "srs.memory_mb",
"start_date_ts": "srs.fact_date",
"end_date_ts": -1,
"start_day_id": -1,
"end_day_id": -1
},
"joins": [{
"name": "staging_resource_specifications",
"schema": "${SOURCE_SCHEMA}",
"alias": "srs"
}]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,34 @@
{
"name": "vcpus",
"type": "int(5)",
"nullable": true,
"default": null
"nullable": false
},
{
"name": "start_date_ts",
"type": "int(12) unsigned",
"nullable": false
},
{
"name": "end_date_ts",
"type": "int(12) unsigned",
"nullable": true
},
{
"name": "start_date",
"type": "date",
"name": "start_day_id",
"type": "int(11) unsigned",
"nullable": false
},
{
"name": "end_date",
"type": "date",
"name": "end_day_id",
"type": "int(11) unsigned",
"nullable": true
},
{
"name": "last_modified",
"type": "timestamp",
"nullable": false,
"default": "CURRENT_TIMESTAMP",
"extra": "on update CURRENT_TIMESTAMP"
}
],

Expand All @@ -59,7 +75,7 @@
"hostname",
"memory_mb",
"vcpus",
"start_date"
"start_date_ts"
],
"is_unique": true
},
Expand Down
Loading