fix: Timestamp update (#2486)
* Timestamps

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Fix

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Update md files

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Update more

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Update batch source creators

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Fix

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Fix data source

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Fix

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* fix

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Fix

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Fix

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Temp fix

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>

* Fixed

Signed-off-by: Kevin Zhang <kzhang@tecton.ai>
kevjumba authored Apr 6, 2022
1 parent 0c9e5b7 commit bf23111
Showing 44 changed files with 216 additions and 180 deletions.
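At a glance, the user-facing effect of this commit is a keyword rename across Feast data source constructors: `event_timestamp_column` becomes `timestamp_field`. A minimal before/after sketch (the parquet path and column names are illustrative, not taken from this commit):

```python
from feast import FileSource

# Old spelling, removed by this commit:
# source = FileSource(
#     path="data/driver_stats.parquet",
#     event_timestamp_column="event_timestamp",
#     created_timestamp_column="created",
# )

# New spelling:
source = FileSource(
    path="data/driver_stats.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)
```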
docs/how-to-guides/adding-a-new-offline-store.md (6 additions & 6 deletions)

@@ -54,7 +54,7 @@ There are two methods that deal with reading data from the offline stores`get_hi
         data_source: DataSource,
         join_key_columns: List[str],
         feature_name_columns: List[str],
-        event_timestamp_column: str,
+        timestamp_field: str,
         created_timestamp_column: Optional[str],
         start_date: datetime,
         end_date: datetime) -> RetrievalJob:

@@ -63,7 +63,7 @@ There are two methods that deal with reading data from the offline stores`get_hi
         data_source,
         join_key_columns,
         feature_name_columns,
-        event_timestamp_column,
+        timestamp_field=timestamp_field,
         created_timestamp_column,
         start_date,
         end_date)

@@ -165,14 +165,14 @@ class CustomFileDataSource(FileSource):
     """Custom data source class for local files"""
     def __init__(
         self,
-        event_timestamp_column: Optional[str] = "",
+        timestamp_field: Optional[str] = "",
         path: Optional[str] = None,
         field_mapping: Optional[Dict[str, str]] = None,
         created_timestamp_column: Optional[str] = "",
         date_partition_column: Optional[str] = "",
     ):
         super(CustomFileDataSource, self).__init__(
-            event_timestamp_column,
+            timestamp_field=timestamp_field,
             created_timestamp_column,
             field_mapping,
             date_partition_column,

@@ -189,7 +189,7 @@ class CustomFileDataSource(FileSource):
         return CustomFileDataSource(
             field_mapping=dict(data_source.field_mapping),
             path=path,
-            event_timestamp_column=data_source.event_timestamp_column,
+            timestamp_field=data_source.timestamp_field,
             created_timestamp_column=data_source.created_timestamp_column,
             date_partition_column=data_source.date_partition_column,
         )

@@ -203,7 +203,7 @@ class CustomFileDataSource(FileSource):
             ),
         )
-        data_source_proto.event_timestamp_column = self.event_timestamp_column
+        data_source_proto.timestamp_field = self.timestamp_field
         data_source_proto.created_timestamp_column = self.created_timestamp_column
         data_source_proto.date_partition_column = self.date_partition_column
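For authors of custom offline stores, the hunks above mean the read-path contract now spells the parameter `timestamp_field`. A skeleton under that assumption (the store class is hypothetical; only the signature is of interest):

```python
from datetime import datetime
from typing import List, Optional

from feast.data_source import DataSource
from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalJob
from feast.repo_config import RepoConfig


class MyOfflineStore(OfflineStore):
    @staticmethod
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        timestamp_field: str,  # renamed from event_timestamp_column
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        # A real implementation would build a query over [start_date, end_date]
        # keyed on timestamp_field and return a store-specific RetrievalJob.
        raise NotImplementedError
```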
docs/reference/data-sources/spark.md (1 addition & 1 deletion)

@@ -45,7 +45,7 @@ from feast.infra.offline_stores.contrib.spark_offline_store.spark_source import
 my_spark_source = SparkSource(
     path=f"{CURRENT_DIR}/data/driver_hourly_stats",
     file_format="parquet",
-    event_timestamp_column="event_timestamp",
+    timestamp_field="event_timestamp",
     created_timestamp_column="created",
 )
 ```
docs/reference/feature-repository/README.md (2 additions & 2 deletions)

@@ -26,7 +26,7 @@ Typically, users store their feature repositories in a Git repository, especiall
 The structure of a feature repository is as follows:
 
 * The root of the repository should contain a `feature_store.yaml` file and may contain a `.feastignore` file.
-* The repository should contain Python files that contain feature definitions.
+* The repository should contain Python files that contain feature definitions.
 * The repository can contain other files as well, including documentation and potentially data files.
 
 An example structure of a feature repository is shown below:

@@ -98,7 +98,7 @@ from feast import BigQuerySource, Entity, Feature, FeatureView, ValueType
 driver_locations_source = BigQuerySource(
     table_ref="rh_prod.ride_hailing_co.drivers",
-    event_timestamp_column="event_timestamp",
+    timestamp_field="event_timestamp",
     created_timestamp_column="created_timestamp",
 )
docs/reference/feature-repository/registration-inferencing.md (2 additions & 2 deletions)

@@ -2,6 +2,6 @@
 
 ## Overview
 
-* FeatureView - When the `features` parameter is left out of the feature view definition, upon a `feast apply` call, Feast will automatically consider every column in the data source as a feature to be registered other than the specific timestamp columns associated with the underlying data source definition (e.g. event_timestamp_column) and the columns associated with the feature view's entities.
-* DataSource - When the `event_timestamp_column` parameter is left out of the data source definition, upon a 'feast apply' call, Feast will automatically find the sole timestamp column in the table underlying the data source and use that as the `event_timestamp_column`. If there are no columns of timestamp type or multiple columns of timestamp type, `feast apply` will throw an exception.
+* FeatureView - When the `features` parameter is left out of the feature view definition, upon a `feast apply` call, Feast will automatically consider every column in the data source as a feature to be registered other than the specific timestamp columns associated with the underlying data source definition (e.g. timestamp_field) and the columns associated with the feature view's entities.
+* DataSource - When the `timestamp_field` parameter is left out of the data source definition, upon a 'feast apply' call, Feast will automatically find the sole timestamp column in the table underlying the data source and use that as the `timestamp_field`. If there are no columns of timestamp type or multiple columns of timestamp type, `feast apply` will throw an exception.
 * Entity - When the `value_type` parameter is left out of the entity definition, upon a `feast apply` call, Feast will automatically find the column corresponding with the entity's `join_key` and take that column's data type to be the `value_type`. If the column doesn't exist, `feast apply` will throw an exception.
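As a quick illustration of the inference rule described above, a sketch (file path hypothetical):

```python
from feast import FileSource

# No timestamp_field passed: on `feast apply`, Feast inspects the file's
# schema and adopts the single timestamp-typed column as timestamp_field.
# Zero or several timestamp-typed columns make `feast apply` raise instead.
inferred_source = FileSource(path="data/driver_stats.parquet")
```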
docs/tutorials/validating-historical-features.md (22 additions & 22 deletions)

@@ -1,12 +1,12 @@
 # Validating historical features with Great Expectations
 
-In this tutorial, we will use the public dataset of Chicago taxi trips to present data validation capabilities of Feast.
-- The original dataset is stored in BigQuery and consists of raw data for each taxi trip (one row per trip) since 2013.
+In this tutorial, we will use the public dataset of Chicago taxi trips to present data validation capabilities of Feast.
+- The original dataset is stored in BigQuery and consists of raw data for each taxi trip (one row per trip) since 2013.
 - We will generate several training datasets (aka historical features in Feast) for different periods and evaluate expectations made on one dataset against another.
 
 Types of features we're ingesting and generating:
-- Features that aggregate raw data with daily intervals (eg, trips per day, average fare or speed for a specific day, etc.).
-- Features using SQL while pulling data from BigQuery (like total trips time or total miles travelled).
+- Features that aggregate raw data with daily intervals (eg, trips per day, average fare or speed for a specific day, etc.).
+- Features using SQL while pulling data from BigQuery (like total trips time or total miles travelled).
 - Features calculated on the fly when requested using Feast's on-demand transformations
 
 Our plan:

@@ -31,7 +31,7 @@ Install Feast Python SDK and great expectations:
 ```
 
 
-### 1. Dataset preparation (Optional)
+### 1. Dataset preparation (Optional)
 
 **You can skip this step if you don't have GCP account. Please use parquet files that are coming with this tutorial instead**

@@ -56,15 +56,15 @@ Running some basic aggregations while pulling data from BigQuery. Grouping by ta
 
 
 ```python
-data_query = """SELECT
+data_query = """SELECT
     taxi_id,
     TIMESTAMP_TRUNC(trip_start_timestamp, DAY) as day,
     SUM(trip_miles) as total_miles_travelled,
     SUM(trip_seconds) as total_trip_seconds,
     SUM(fare) as total_earned,
     COUNT(*) as trip_count
-FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
-WHERE
+FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
+WHERE
     trip_miles > 0 AND trip_seconds > 60 AND
     trip_start_timestamp BETWEEN '2019-01-01' and '2020-12-31' AND
     trip_total < 1000

@@ -84,7 +84,7 @@ pyarrow.parquet.write_table(driver_stats_table, "trips_stats.parquet")
 def entities_query(year):
     return f"""SELECT
     distinct taxi_id
-FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
+FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
 WHERE
     trip_miles > 0 AND trip_seconds > 0 AND
     trip_start_timestamp BETWEEN '{year}-01-01' and '{year}-12-31'

@@ -120,7 +120,7 @@ from google.protobuf.duration_pb2 import Duration
 
 ```python
 batch_source = FileSource(
-    event_timestamp_column="day",
+    timestamp_field="day",
     path="trips_stats.parquet",  # using parquet file that we created on previous step
     file_format=ParquetFormat()
 )

@@ -141,7 +141,7 @@ trips_stats_fv = FeatureView(
         Feature("total_trip_seconds", ValueType.DOUBLE),
         Feature("total_earned", ValueType.DOUBLE),
         Feature("trip_count", ValueType.INT64),
-
+
     ],
     ttl=Duration(seconds=86400),
     batch_source=batch_source,

@@ -317,8 +317,8 @@ store.create_saved_dataset(
 
 Dataset profiler is a function that accepts dataset and generates set of its characteristics. This charasteristics will be then used to evaluate (validate) next datasets.
 
-**Important: datasets are not compared to each other!
-Feast use a reference dataset and a profiler function to generate a reference profile.
+**Important: datasets are not compared to each other!
+Feast use a reference dataset and a profiler function to generate a reference profile.
 This profile will be then used during validation of the tested dataset.**
 

@@ -523,37 +523,37 @@ def stats_profiler(ds: PandasDataset) -> ExpectationSuite:
         max_value=60,
         mostly=0.99  # allow some outliers
     )
 
     ds.expect_column_values_to_be_between(
         "total_miles_travelled",
         min_value=0,
         max_value=500,
         mostly=0.99  # allow some outliers
     )
 
     # expectation of means based on observed values
     observed_mean = ds.trip_count.mean()
     ds.expect_column_mean_to_be_between("trip_count",
                                         min_value=observed_mean * (1 - DELTA),
                                         max_value=observed_mean * (1 + DELTA))
 
     observed_mean = ds.earned_per_hour.mean()
     ds.expect_column_mean_to_be_between("earned_per_hour",
                                         min_value=observed_mean * (1 - DELTA),
                                         max_value=observed_mean * (1 + DELTA))
 
 
     # expectation of quantiles
     qs = [0.5, 0.75, 0.9, 0.95]
     observed_quantiles = ds.avg_fare.quantile(qs)
 
     ds.expect_column_quantile_values_to_be_between(
         "avg_fare",
         quantile_ranges={
             "quantiles": qs,
             "value_ranges": [[None, max_value] for max_value in observed_quantiles]
-        })
+        })
 
     return ds.get_expectation_suite()

@@ -663,7 +663,7 @@ _ = job.to_df(validation_reference=validation_reference)
 Validation successfully passed as no exception were raised.
 
 
-### 5. Validating new historical retrieval
+### 5. Validating new historical retrieval
 
 Creating new timestamps for Dec 2020:
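The profiler pattern shown in this tutorial's hunks can be exercised standalone with Great Expectations' `PandasDataset`, which the tutorial already uses; a toy sketch with invented data:

```python
import pandas as pd
from great_expectations.dataset import PandasDataset

ds = PandasDataset(pd.DataFrame({"trip_count": [10, 12, 11, 13, 500]}))

# mostly=0.8 tolerates the single outlier: 4 of 5 rows (80%) fall in range.
result = ds.expect_column_values_to_be_between(
    "trip_count", min_value=0, max_value=100, mostly=0.8
)
print(result.success)  # True

# As in stats_profiler above, the recorded expectations form a suite.
suite = ds.get_expectation_suite()
```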
examples/java-demo/feature_repo/driver_repo.py (1 addition & 1 deletion)

@@ -7,7 +7,7 @@
 
 driver_hourly_stats = FileSource(
     path="data/driver_stats_with_string.parquet",
-    event_timestamp_column="event_timestamp",
+    timestamp_field="event_timestamp",
     created_timestamp_column="created",
 )
 driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id",)
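Mirroring the FeatureView style used in the tutorial diff above, the renamed source would be wired up roughly as follows (the feature name and TTL are illustrative, not from driver_repo.py):

```python
from google.protobuf.duration_pb2 import Duration
from feast import Entity, Feature, FeatureView, FileSource, ValueType

driver_hourly_stats = FileSource(
    path="data/driver_stats_with_string.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)
driver = Entity(name="driver_id", value_type=ValueType.INT64, description="driver id")

driver_hourly_stats_view = FeatureView(
    name="driver_hourly_stats",
    entities=["driver_id"],
    ttl=Duration(seconds=86400),  # illustrative TTL
    features=[Feature("conv_rate", ValueType.FLOAT)],  # hypothetical feature
    batch_source=driver_hourly_stats,
)
```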
java/serving/src/test/java/feast/serving/util/DataGenerator.java (5 additions & 5 deletions)

@@ -158,7 +158,7 @@ public static FeatureTableSpec createFeatureTableSpec(
         .setMaxAge(Duration.newBuilder().setSeconds(3600).build())
         .setBatchSource(
             DataSource.newBuilder()
-                .setEventTimestampColumn("ts")
+                .setTimestampField("ts")
                 .setType(DataSource.SourceType.BATCH_FILE)
                 .setFileOptions(
                     FileOptions.newBuilder()

@@ -204,7 +204,7 @@ public static DataSource createFileDataSourceSpec(
         .setType(DataSource.SourceType.BATCH_FILE)
         .setFileOptions(
             FileOptions.newBuilder().setFileFormat(createParquetFormat()).setUri(fileURL).build())
-        .setEventTimestampColumn(timestampColumn)
+        .setTimestampField(timestampColumn)
         .setDatePartitionColumn(datePartitionColumn)
         .build();
   }

@@ -215,7 +215,7 @@ public static DataSource createBigQueryDataSourceSpec(
         .setType(DataSource.SourceType.BATCH_BIGQUERY)
         .setBigqueryOptions(
             DataSource.BigQueryOptions.newBuilder().setTableRef(bigQueryTableRef).build())
-        .setEventTimestampColumn(timestampColumn)
+        .setTimestampField(timestampColumn)
         .setDatePartitionColumn(datePartitionColumn)
         .build();
   }

@@ -230,7 +230,7 @@ public static DataSource createKafkaDataSourceSpec(
                 .setBootstrapServers(servers)
                 .setMessageFormat(createProtoFormat("class.path"))
                 .build())
-        .setEventTimestampColumn(timestampColumn)
+        .setTimestampField(timestampColumn)
         .build();
   }
 
@@ -292,7 +292,7 @@ public static DataSource createKinesisDataSourceSpec(
                 .setStreamName("stream")
                 .setRecordFormat(createProtoFormat(classPath))
                 .build())
-        .setEventTimestampColumn(timestampColumn)
+        .setTimestampField(timestampColumn)
         .build();
   }
protos/feast/core/DataSource.proto (1 addition & 1 deletion)

@@ -66,7 +66,7 @@ message DataSource {
   map<string, string> field_mapping = 2;
 
   // Must specify event timestamp column name
-  string event_timestamp_column = 3;
+  string timestamp_field = 3;
 
   // (Optional) Specify partition column
   // useful for file sources
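Worth noting: renaming a protobuf field is wire-compatible as long as the field number (here, 3) is unchanged, because serialization encodes field numbers rather than names; only the generated accessors change. A sketch of the Python-side effect, assuming the regenerated Feast protos:

```python
from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto

proto = DataSourceProto()
proto.timestamp_field = "event_timestamp"  # accessor was event_timestamp_column

# Field number 3 is what goes on the wire, so this payload stays readable
# by binaries generated from the old .proto with the old field name.
payload = proto.SerializeToString()
```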
[Diff truncated: the remaining changed files are not shown.]