Skip to content

Commit

Permalink
feat(MLOP-2236): add NTZ (#360)
Browse files Browse the repository at this point in the history
* feat: NTZ and new tests
  • Loading branch information
ralphrass authored Jun 14, 2024
1 parent cbda73d commit 2a5a6e8
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 3 deletions.
2 changes: 2 additions & 0 deletions butterfree/constants/data_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
IntegerType,
LongType,
StringType,
TimestampNTZType,
TimestampType,
)
from typing_extensions import final
Expand All @@ -21,6 +22,7 @@
class DataType(Enum):
"""Holds constants for data types within Butterfree."""

TIMESTAMP_NTZ = (TimestampNTZType(), "timestamp", "TIMESTAMP_NTZ")
TIMESTAMP = (TimestampType(), "timestamp", "TIMESTAMP")
BINARY = (BinaryType(), "boolean", "BINARY")
BOOLEAN = (BooleanType(), "boolean", "BOOLEAN")
Expand Down
3 changes: 2 additions & 1 deletion butterfree/transform/features/timestamp_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class TimestampFeature(Feature):

def __init__(
self,
dtype: Optional[DataType] = DataType.TIMESTAMP,
from_column: Optional[str] = None,
transformation: Optional[TransformComponent] = None,
from_ms: bool = False,
Expand All @@ -51,7 +52,7 @@ def __init__(
name=TIMESTAMP_COLUMN,
description=description,
from_column=from_column,
dtype=DataType.TIMESTAMP,
dtype=dtype,
transformation=transformation,
)
self.from_ms = from_ms
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def create_temp_view(dataframe: DataFrame, name):


def create_db_and_table(spark, table_reader_id, table_reader_db, table_reader_table):
spark.sql(f"drop schema {table_reader_db} cascade")
spark.sql(f"drop schema if exists {table_reader_db} cascade")
spark.sql(f"create database {table_reader_db}")
spark.sql(f"use {table_reader_db}")
spark.sql(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
from pyspark.sql.types import StringType
from datetime import datetime

import pytz
from pyspark.sql.types import StringType, StructField, StructType

from butterfree.clients import SparkClient
from butterfree.constants import DataType
from butterfree.constants.columns import TIMESTAMP_COLUMN
from butterfree.transform.features import TimestampFeature

# from pyspark.sql.types import *


class TestTimestampFeature:
def test_args_without_transformation(self):

test_key = TimestampFeature(from_column="ts")
test_key_ntz = TimestampFeature(dtype=DataType.TIMESTAMP_NTZ, from_column="ts")

assert test_key.name == TIMESTAMP_COLUMN
assert test_key.from_column == "ts"
assert test_key.dtype == DataType.TIMESTAMP
assert test_key_ntz.dtype == DataType.TIMESTAMP_NTZ

def test_transform(self, feature_set_dataframe):

Expand Down Expand Up @@ -70,3 +78,73 @@ def test_transform_mask(self, feature_set_dataframe_date):

assert df[0]["timestamp"] == "2020-02-07 00:00:00"
assert df[1]["timestamp"] == "2020-02-08 00:00:00"

def test_timezone_configs(self):

spark = SparkClient()
now = datetime.now()

# Testing a new timezone
spark.conn.conf.set("spark.sql.session.timeZone", "GMT-5")

time_list = [(now, now)]
rdd = spark.conn.sparkContext.parallelize(time_list)

schema = StructType(
[
StructField("ts", DataType.TIMESTAMP.spark, True),
StructField("ts_ntz", DataType.TIMESTAMP_NTZ.spark, True),
]
)
df = spark.conn.createDataFrame(rdd, schema)
df.createOrReplaceTempView("temp_tz_table")

df1 = spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""")
df2 = df1.withColumns(
{"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())}
)
df2_vals = df2.collect()[0]

assert df2_vals.ts != df2_vals.ts_ntz

# New TZ. Column with TZ must have a != value; Column NTZ must keep its value
spark.conn.conf.set("spark.sql.session.timeZone", "GMT-7")

df3 = spark.conn.sql("""SELECT ts, ts_ntz FROM temp_tz_table""")
df4 = df3.withColumns(
{"ts": df1.ts.cast(StringType()), "ts_ntz": df1.ts_ntz.cast(StringType())}
)
df4_vals = df4.collect()[0]

assert df4_vals.ts != df2_vals.ts
assert df4_vals.ts_ntz == df2_vals.ts_ntz

def test_timezone(self):

spark = SparkClient()

my_date = datetime.now(pytz.timezone("US/Pacific"))

datetime_mask = "%Y-%m-%d %H:%M"

data = [
{"id": 1, TIMESTAMP_COLUMN: str(my_date), "feature": 100},
{"id": 2, TIMESTAMP_COLUMN: str(my_date), "feature": 200},
]

df = spark.conn.read.json(spark.conn._sc.parallelize(data, 1))
df.createOrReplaceTempView("time_table")

df2 = spark.sql("SELECT TIMESTAMP AS ts FROM time_table")

time_value = datetime.fromisoformat(df2.collect()[0].ts).strftime(datetime_mask)

df_different_timezone = df2.withColumn(
"ts", df2.ts.cast(DataType.TIMESTAMP.spark)
)
df_no_timezone = df2.withColumn("ts", df2.ts.cast(DataType.TIMESTAMP_NTZ.spark))

assert (
df_different_timezone.collect()[0].ts.strftime(datetime_mask) != time_value
)
assert df_no_timezone.collect()[0].ts.strftime(datetime_mask) == time_value

1 comment on commit 2a5a6e8

@chip-n-dale
Copy link

@chip-n-dale chip-n-dale bot commented on 2a5a6e8 Jun 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @ralphrass!

The GitLeaks SecTool reported some possibly exposed credentials/secrets, how about giving them a look?

GitLeaks Alert Sync
[
  {
    "line": "    webhook: REDACTED",
    "lineNumber": 141,
    "offender": "REDACTED",
    "offenderEntropy": -1,
    "commit": "b6a5daf28abc035f74b9685aab573d384680b9d1",
    "repo": "butterfree",
    "repoURL": "",
    "leakURL": "",
    "rule": "Slack Webhook",
    "commitMessage": "initial commit\n",
    "author": "Alvaro",
    "email": "alvaro.marques.andrade@gmail.com",
    "file": ".drone.yml",
    "date": "2020-01-03T14:21:51-03:00",
    "tags": "key, slack"
  },
  {
    "line": "    webhook: REDACTED",
    "lineNumber": 159,
    "offender": "REDACTED",
    "offenderEntropy": -1,
    "commit": "b6697aa708fec0c5a9e3af0b2713cee6f45ff675",
    "repo": "butterfree",
    "repoURL": "",
    "leakURL": "",
    "rule": "Slack Webhook",
    "commitMessage": "hail to the butterfree\n",
    "author": "Alvaro",
    "email": "alvaro.marques.andrade@gmail.com",
    "file": ".drone.yml",
    "date": "2020-01-03T11:07:44-03:00",
    "tags": "key, slack"
  }
]

In case of false-positives, more information is available on GitLeaks FAQ
If you had any other problem or question during this process, be sure to contact us on the Security space on GChat!

Please sign in to comment.