[Schema][Utilization] Add schema tables for job utilization (#6183)
# Overview
Add two tables to the misc database for utilization data.
- oss_ci_utilization_metadata: metadata about each job's utilization collection
- oss_ci_time_series: time-series table that stores the utilization data points

Utilization Data Pipeline Steps:
1. Modify the monitor script for the final data model (Done)
2. Add an S3 bucket for ready-to-insert files (Done)
3. **Add ClickHouse database schemas (This PR)**
4. Set up logic in upload_artifact to process the raw log data and insert
clean data into the ready-to-insert S3 bucket
- Note that we generate two files: one for the metadata table and one for the
time-series table. The metadata table takes a single insertion, while the
time-series table is a batch operation (see the sketch below).
5. Set up the S3 replicator to insert the data into the tables
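
As a rough illustration of step 4 (not part of this PR), the two ready-to-insert files map to two different insert shapes. The IDs, workflow/job names, and json_data payload below are made-up placeholders:

```sql
-- Illustrative only: hypothetical IDs and values.
-- Metadata is one row per job (single insertion).
INSERT INTO misc.oss_ci_utilization_metadata
    (created_at, repo, workflow_id, run_attempt, job_id, workflow_name, job_name,
     usage_collect_interval, data_model_version, gpu_count, cpu_count, gpu_type,
     start_at, end_at)
VALUES
    (now(), 'pytorch/pytorch', 111, 1, 222, 'pull', 'linux-test', 1.0, 'v1.0',
     0, 8, 'None', now() - INTERVAL 10 MINUTE, now());

-- The time-series file is inserted as a batch of rows.
INSERT INTO misc.oss_ci_time_series
    (created_at, type, time_stamp, repo, workflow_id, run_attempt, job_id,
     workflow_name, job_name, json_data)
VALUES
    (now(), 'utilization', now() - INTERVAL 2 MINUTE, 'pytorch/pytorch', 111, 1,
     222, 'pull', 'linux-test', '{"cpu_percent": 35.2}'),
    (now(), 'utilization', now() - INTERVAL 1 MINUTE, 'pytorch/pytorch', 111, 1,
     222, 'pull', 'linux-test', '{"cpu_percent": 40.1}');
```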

Design doc:

https://docs.google.com/document/d/151uzLPpOTVcfdfDgFHmGqztiyWwHLI8OR-U3W9QH0lA/edit?tab=t.0

# Details
TTL (time to live)
All records are given a time to live of one year based on the created_at timestamp;
this gives us the flexibility to re-insert hot data in the future.
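
A minimal sketch of that re-insertion idea (the backup source table and the WHERE filter below are hypothetical): because the TTL is computed from created_at (insertion time) rather than time_stamp (event time), re-inserted rows get a fresh one-year retention window.

```sql
-- Minimal sketch, not part of this PR.
INSERT INTO misc.oss_ci_time_series
    (created_at, type, tags, time_stamp, repo, workflow_id, run_attempt, job_id,
     workflow_name, job_name, json_data)
SELECT
    now() AS created_at,  -- new insertion time => new TTL window
    type, tags, time_stamp, repo, workflow_id, run_attempt, job_id,
    workflow_name, job_name, json_data
FROM oss_ci_time_series_backup  -- hypothetical backup source
WHERE workflow_id = 111;        -- hypothetical hot subset
```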

The data is backed up in S3. We use the S3 replicator approach to insert data;
see the guidance here:

https://github.com/pytorch/test-infra/wiki/How-to-add-a-new-custom-table-on-ClickHouse

See the data pipeline below:

![image](https://github.com/user-attachments/assets/87e1792b-6638-48d2-8613-efd7236f6426)
yangw-dev authored and Camyll committed Jan 21, 2025
1 parent 929e0fe commit abf8016
Showing 2 changed files with 72 additions and 0 deletions.
@@ -0,0 +1,35 @@
-- This query creates the oss_ci_time_series table on ClickHouse
CREATE TABLE misc.oss_ci_time_series
(
    -- created_at is the DateTime when the record is processed in the db.
    `created_at` DateTime64(0, 'UTC'),
    -- type of the time series; for instance, utilization log data is 'utilization'.
    `type` String,
    `tags` Array(String) DEFAULT [],
    `time_stamp` DateTime64(0, 'UTC'),
    `repo` String DEFAULT 'pytorch/pytorch',
    `workflow_id` UInt64,
    `run_attempt` UInt32,
    `job_id` UInt64,
    `workflow_name` String,
    `job_name` String,
    -- the data is stored as a raw JSON string.
    -- Notice that in ClickHouse the length of the String type is not limited.
    `json_data` String DEFAULT '{}',
    -- The raw records on S3; this is populated by the S3 replicator.
    `_meta` Tuple(bucket String, key String)
)
ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
PARTITION BY toYYYYMM(time_stamp)
ORDER BY
(
    workflow_id,
    job_id,
    repo,
    workflow_name,
    job_name,
    type,
    time_stamp
)
-- data exists in the db for a year.
-- time to live is based on created_at, which is when the record is inserted in the db.
TTL toDate(created_at) + toIntervalYear(1)
SETTINGS index_granularity = 8192
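
An illustrative query against this table (the IDs are placeholders, not from this PR): filtering on time_stamp prunes partitions via toYYYYMM(time_stamp), and the leading ORDER BY columns (workflow_id, job_id) let the job lookup use the primary index.

```sql
-- Illustrative only: fetch the utilization series for one job.
SELECT
    time_stamp,
    json_data
FROM misc.oss_ci_time_series
WHERE workflow_id = 111
  AND job_id = 222
  AND type = 'utilization'
  AND time_stamp >= toDateTime64('2025-01-01 00:00:00', 0, 'UTC')
ORDER BY time_stamp;
```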
@@ -0,0 +1,37 @@
-- This query creates the oss_ci_utilization_metadata table on ClickHouse
CREATE TABLE misc.oss_ci_utilization_metadata
(
    `created_at` DateTime64(0, 'UTC'),
    -- GitHub info
    `repo` String DEFAULT 'pytorch/pytorch',
    `workflow_id` UInt64,
    `run_attempt` UInt32,
    `job_id` UInt64,
    `workflow_name` String,
    `job_name` String,
    -- metadata
    `usage_collect_interval` Float32,
    `data_model_version` String,
    `gpu_count` UInt32,
    `cpu_count` UInt32,
    `gpu_type` String DEFAULT 'None',
    `start_at` DateTime64(0, 'UTC'),
    `end_at` DateTime64(0, 'UTC'),
    -- segments are post-job processed data that identify detected test intervals
    `segments` Array(Tuple(level String, name String, start_at DateTime64(0, 'UTC'), end_at DateTime64(0, 'UTC'), extra_info Map(String, String))) DEFAULT [],
    -- The raw records on S3; this is populated by the S3 replicator.
    `_meta` Tuple(bucket String, key String)
)
ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')
PARTITION BY toYYYYMM(start_at)
ORDER BY (
    workflow_id,
    job_id,
    repo,
    workflow_name,
    job_name,
    start_at
)
-- data exists in the db for a year.
-- time to live is based on created_at, which is when the record is inserted in the db.
TTL toDate(created_at) + toIntervalYear(1)
SETTINGS index_granularity = 8192
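
An illustrative query against this table (IDs are placeholders): the segments column is an array of named tuples, so the detected test intervals for a job can be listed by expanding the array with ARRAY JOIN and reading the tuple fields.

```sql
-- Illustrative only: list the detected test segments for one job.
SELECT
    tupleElement(seg, 'level')    AS level,
    tupleElement(seg, 'name')     AS name,
    tupleElement(seg, 'start_at') AS segment_start,
    tupleElement(seg, 'end_at')   AS segment_end
FROM misc.oss_ci_utilization_metadata
ARRAY JOIN segments AS seg
WHERE workflow_id = 111
  AND job_id = 222;
```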
