implemented numerous existing operations (#723)

progress on #722 Added: * hash * lower * round * sqrt * upper * zip_max * zip_min
kaskada-ai · Sep 4, 2023 · 4f89615 · 4f89615
1 parent 5cbbbff
commit 4f89615
Show file tree

Hide file tree

Showing 21 changed files with 326 additions and 4 deletions.
diff --git a/python/docs/source/reference/timestream/arithmetic.md b/python/docs/source/reference/timestream/arithmetic.md
@@ -19,11 +19,15 @@ See the notes on the specific functions for more information.
     Timestream.add
     Timestream.ceil
     Timestream.clamp
+    Timestream.div
     Timestream.exp
     Timestream.floor
-    Timestream.powf
-    Timestream.sub
+    Timestream.greatest
+    Timestream.least
     Timestream.mul
-    Timestream.div
     Timestream.neg
+    Timestream.powf
+    Timestream.round
+    Timestream.sqrt
+    Timestream.sub
 ```
diff --git a/python/docs/source/reference/timestream/index.md b/python/docs/source/reference/timestream/index.md
@@ -28,5 +28,6 @@ grouping
 logical
 misc
 records
+string
 time
 ```
diff --git a/python/docs/source/reference/timestream/misc.md b/python/docs/source/reference/timestream/misc.md
@@ -10,6 +10,7 @@
     Timestream.coalesce
     Timestream.else_
     Timestream.filter
+    Timestream.hash
     Timestream.if_
     Timestream.lag
     Timestream.null_if

diff --git a/python/docs/source/reference/timestream/string.md b/python/docs/source/reference/timestream/string.md
@@ -0,0 +1,11 @@
+# String
+
+```{eval-rst}
+.. currentmodule:: kaskada
+
+.. autosummary::
+   :toctree: ../apidocs/
+
+    Timestream.lower
+    Timestream.upper
+```
diff --git a/python/noxfile.py b/python/noxfile.py
@@ -136,7 +136,7 @@ def docs_build(session: nox.Session) -> None:
 @nox.session(python=python_versions[0])
 def docs(session: nox.Session) -> None:
     """Build and serve the documentation with live reloading on file changes."""
-    args = ["--open-browser", "docs/source", "docs/_build", "-j", "auto", "--ignore", "*/apidocs/*"]
+    args = ["--open-browser", "docs/source", "docs/_build", "-j", "auto", "--ignore", "*/apidocs/*", "--watch", "pysrc/kaskada"]
     install(session, groups=["typecheck", "docs"])
 
     build_dir = Path("docs", "_build")

diff --git a/python/pysrc/kaskada/_timestream.py b/python/pysrc/kaskada/_timestream.py
@@ -301,6 +301,20 @@ def floor(self) -> Timestream:
         """Return a Timestream of self rounded down to the nearest integer."""
         return Timestream._call("floor", self)
 
+    def hash(self) -> Timestream:
+        """Return a Timestream containing the hash of the input.
+
+        Notes:
+            This will only return `null` when interpolated at points where
+            the input is not defined. At other points, it will return the
+            hash of `null`, which is `0` (not `null`).
+        """
+        return Timestream._call("hash", self)
+
+    def lower(self) -> Timestream:
+        """Return a Timestream with all values converted to lower case."""
+        return Timestream._call("lower", self)
+
     def mul(self, rhs: Arg) -> Timestream:
         """Return a Timestream multiplying this and `rhs`.
 
@@ -803,6 +817,11 @@ def max(self, window: Optional[kd.windows.Window] = None) -> Timestream:
 
         Args:
             window: The window to use for the aggregation. Defaults to the entire Timestream.
+
+        See Also:
+            This returns the maximum of values in a column. See
+            :func:`greatest` to get the maximum value
+            between Timestreams at each point.
         """
         return _aggregation("max", self, window)
 
@@ -813,6 +832,11 @@ def min(self, window: Optional[kd.windows.Window] = None) -> Timestream:
 
         Args:
             window: The window to use for the aggregation. Defaults to the entire Timestream.
+
+        See Also:
+            This returns the minimum of values in a column. See
+            :func:`least` to get the minimum value
+            between Timestreams at each point.
         """
         return _aggregation("min", self, window)
 
@@ -923,6 +947,71 @@ def record(self, fields: Callable[[Timestream], Mapping[str, Arg]]) -> Timestrea
         """
         return record(fields(self))
 
+    def round(self) -> Timestream:
+        """Return a Timestream with all values rounded to the nearest integer.
+
+        Returns:
+            A Timestream of the same type as `self`. The result contains `null`
+            if the value was `null` at that point. Otherwise, it contains
+            the result of rounding the value to the nearest integer.
+
+        Notes:
+            This method may be applied to any numeric type. For anything other
+            than `float32` and `float64` it has no affect since the values
+            are already integers.
+
+        See Also:
+            - :func:`ceil`
+            - :func:`floor`
+        """
+        return Timestream._call("round", self)
+
+    def sqrt(self) -> Timestream:
+        """Return a Timestream with the square root of all values."""
+        return Timestream._call("sqrt", self)
+
+    def upper(self) -> Timestream:
+        """Return a Timestream with all values converted to upper case."""
+        return Timestream._call("upper", self)
+
+    def greatest(self, rhs: Arg) -> Timestream:
+        """Return a Timestream with the maximum value of `self` and `rhs` at each point.
+
+        Args:
+            rhs: The Timestream or literal value to compare to this.
+
+        Returns:
+            Each point contains the value from `self` if `self`
+            is greater than `rhs`, otherwise it contains `rhs`.
+            If any input is `null` or `NaN`, then that will be
+            the result.
+
+        See Also:
+            This returns the greatest of two values. See
+            :func:`max` for the maximum of values in
+            a column.
+        """
+        return Timestream._call("zip_max", self, rhs)
+
+    def least(self, rhs: Arg) -> Timestream:
+        """Return a Timestream with the minimum value of `self` and `rhs` at each point.
+
+        Args:
+            rhs: The Timestream or literal value to compare to this.
+
+        Returns:
+            Each point contains the value from `self` if `self`
+            is less than `rhs`, otherwise it contains `rhs`.
+            If any input is `null` or `NaN`, then that will be
+            the result.
+
+        See Also:
+            This returns the least of two values. See
+            :func:`min` for the minimum of values in
+            a column.
+        """
+        return Timestream._call("zip_min", self, rhs)
+
     def preview(
         self,
         limit: int = 10,

diff --git a/python/pytests/golden/greatest_test/test_greatest.jsonl b/python/pytests/golden/greatest_test/test_greatest.jsonl
@@ -0,0 +1,5 @@
+{"_time":"2021-01-01T00:00:00.000000000","_key":"A","a":5.7,"b":1.2,"a_greatest_b":5.7}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"A","a":6.3,"b":0.4,"a_greatest_b":6.3}
+{"_time":"2021-01-03T00:00:00.000000000","_key":"B","a":null,"b":3.7,"a_greatest_b":null}
+{"_time":"2021-01-04T00:00:00.000000000","_key":"A","a":13.2,"b":null,"a_greatest_b":null}
+{"_time":"2021-01-05T00:00:00.000000000","_key":"A","a":2.0,"b":5.4,"a_greatest_b":5.4}
diff --git a/python/pytests/golden/hash_test/test_hash_integer.jsonl b/python/pytests/golden/hash_test/test_hash_integer.jsonl
@@ -0,0 +1,6 @@
+{"_time":"2021-01-01T00:00:00.000000000","_key":"Ben","m":5.0,"hash_m":16461383214845928621}
+{"_time":"2021-01-01T00:00:00.000000000","_key":"Ryan","m":8.0,"hash_m":6794973171266502674}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"Ryan","m":9.0,"hash_m":15653042715643359010}
+{"_time":"2021-01-03T00:00:00.000000000","_key":"Ben","m":8.0,"hash_m":6794973171266502674}
+{"_time":"2021-01-04T00:00:00.000000000","_key":"Ben","m":null,"hash_m":0}
+{"_time":"2021-01-04T00:00:00.000000000","_key":"Ryan","m":9.0,"hash_m":15653042715643359010}
diff --git a/python/pytests/golden/hash_test/test_hash_string.jsonl b/python/pytests/golden/hash_test/test_hash_string.jsonl
@@ -0,0 +1,6 @@
+{"_time":"2021-01-01T00:00:00.000000000","_key":"Ben","m":"hello","hash_m":1472103086483932002}
+{"_time":"2021-01-01T00:00:00.000000000","_key":"Ryan","m":null,"hash_m":0}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"Ryan","m":"world","hash_m":8057155968893317866}
+{"_time":"2021-01-03T00:00:00.000000000","_key":"Ben","m":"hi","hash_m":2460612554838835252}
+{"_time":"2021-01-04T00:00:00.000000000","_key":"Ben","m":" ","hash_m":14894517190786516170}
+{"_time":"2021-01-04T00:00:00.000000000","_key":"Ryan","m":"earth","hash_m":14489671231712828724}
diff --git a/python/pytests/golden/least_test/test_least.jsonl b/python/pytests/golden/least_test/test_least.jsonl
@@ -0,0 +1,5 @@
+{"_time":"2021-01-01T00:00:00.000000000","_key":"A","a":5.7,"b":1.2,"a_least_b":1.2}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"A","a":6.3,"b":0.4,"a_least_b":0.4}
+{"_time":"2021-01-03T00:00:00.000000000","_key":"B","a":null,"b":3.7,"a_least_b":null}
+{"_time":"2021-01-04T00:00:00.000000000","_key":"A","a":13.2,"b":null,"a_least_b":null}
+{"_time":"2021-01-05T00:00:00.000000000","_key":"A","a":2.0,"b":5.4,"a_least_b":2.0}
diff --git a/python/pytests/golden/lower_test/test_lower.jsonl b/python/pytests/golden/lower_test/test_lower.jsonl
@@ -0,0 +1,6 @@
+{"_time":"2021-01-01T00:00:00.000000000","_key":"Ben","m":"Hello World","lower_m":"hello world"}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"Ryan","m":null,"lower_m":null}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"Ryan","m":"Hi Earth","lower_m":"hi earth"}
+{"_time":"2021-01-03T00:00:00.000000000","_key":"Ben","m":"Hello","lower_m":"hello"}
+{"_time":"2021-01-03T00:00:00.000000000","_key":"Ben","m":null,"lower_m":null}
+{"_time":"2021-01-04T00:00:00.000000000","_key":"Ryan","m":"hi","lower_m":"hi"}
diff --git a/python/pytests/golden/round_test/test_round.jsonl b/python/pytests/golden/round_test/test_round.jsonl
@@ -0,0 +1,3 @@
+{"_time":"2021-01-01T00:00:00.000000000","_key":"A","m":5.7,"round_m":6.0}
+{"_time":"2021-01-01T00:00:00.000000000","_key":"A","m":6.3,"round_m":6.0}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"B","m":null,"round_m":null}
diff --git a/python/pytests/golden/sqrt_test/test_sqrt.jsonl b/python/pytests/golden/sqrt_test/test_sqrt.jsonl
@@ -0,0 +1,3 @@
+{"_time":"2021-01-01T00:00:00.000000000","_key":"A","m":5.7,"sqrt_m":2.3874672773}
+{"_time":"2021-01-01T00:00:00.000000000","_key":"A","m":6.3,"sqrt_m":2.5099800796}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"B","m":null,"sqrt_m":null}
diff --git a/python/pytests/golden/upper_test/test_upper.jsonl b/python/pytests/golden/upper_test/test_upper.jsonl
@@ -0,0 +1,6 @@
+{"_time":"2021-01-01T00:00:00.000000000","_key":"Ben","m":"Hello World","upper_m":"HELLO WORLD"}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"Ryan","m":null,"upper_m":null}
+{"_time":"2021-01-02T00:00:00.000000000","_key":"Ryan","m":"Hi Earth","upper_m":"HI EARTH"}
+{"_time":"2021-01-03T00:00:00.000000000","_key":"Ben","m":"Hello","upper_m":"HELLO"}
+{"_time":"2021-01-03T00:00:00.000000000","_key":"Ben","m":null,"upper_m":null}
+{"_time":"2021-01-04T00:00:00.000000000","_key":"Ryan","m":"hi","upper_m":"HI"}
diff --git a/python/pytests/greatest_test.py b/python/pytests/greatest_test.py
@@ -0,0 +1,23 @@
+import kaskada as kd
+import pytest
+
+
+@pytest.fixture(scope="module")
+def source() -> kd.sources.CsvString:
+    content = "\n".join(
+        [
+            "time,key,a,b",
+            "2021-01-01T00:00:00,A,5.7,1.2",
+            "2021-01-02T00:00:00,A,6.3,0.4",
+            "2021-01-03T00:00:00,B,,3.7",
+            "2021-01-04T00:00:00,A,13.2,",
+            "2021-01-05T00:00:00,A,2,5.4",
+        ]
+    )
+    return kd.sources.CsvString(content, time_column="time", key_column="key")
+
+
+def test_greatest(source, golden) -> None:
+    a = source.col("a")
+    b = source.col("b")
+    golden.jsonl(kd.record({"a": a, "b": b, "a_greatest_b": a.greatest(b)}))
diff --git a/python/pytests/hash_test.py b/python/pytests/hash_test.py
@@ -0,0 +1,44 @@
+import kaskada as kd
+import pytest
+
+
+@pytest.fixture(scope="module")
+def string_source() -> kd.sources.CsvString:
+    content = "\n".join(
+        [
+            "time,key,m,n",
+            "2021-01-01T00:00:00,Ben,hello,",
+            "2021-01-01T00:00:00,Ryan,,",
+            "2021-01-02T00:00:00,Ryan,world,",
+            "2021-01-03T00:00:00,Ben,hi,",
+            "2021-01-04T00:00:00,Ben, ,",
+            "2021-01-04T00:00:00,Ryan,earth,",
+        ]
+    )
+    return kd.sources.CsvString(content, time_column="time", key_column="key")
+
+
+def test_hash_string(string_source, golden) -> None:
+    m = string_source.col("m")
+    golden.jsonl(kd.record({"m": m, "hash_m": m.hash()}))
+
+
+@pytest.fixture(scope="module")
+def integer_source() -> kd.sources.CsvString:
+    content = "\n".join(
+        [
+            "time,key,m,n",
+            "2021-01-01T00:00:00,Ben,5,",
+            "2021-01-01T00:00:00,Ryan,8,",
+            "2021-01-02T00:00:00,Ryan,9,",
+            "2021-01-03T00:00:00,Ben,8,",
+            "2021-01-04T00:00:00,Ben,,",
+            "2021-01-04T00:00:00,Ryan,9,",
+        ]
+    )
+    return kd.sources.CsvString(content, time_column="time", key_column="key")
+
+
+def test_hash_integer(integer_source, golden) -> None:
+    m = integer_source.col("m")
+    golden.jsonl(kd.record({"m": m, "hash_m": m.hash()}))
diff --git a/python/pytests/least_test.py b/python/pytests/least_test.py
@@ -0,0 +1,23 @@
+import kaskada as kd
+import pytest
+
+
+@pytest.fixture(scope="module")
+def source() -> kd.sources.CsvString:
+    content = "\n".join(
+        [
+            "time,key,a,b",
+            "2021-01-01T00:00:00,A,5.7,1.2",
+            "2021-01-02T00:00:00,A,6.3,0.4",
+            "2021-01-03T00:00:00,B,,3.7",
+            "2021-01-04T00:00:00,A,13.2,",
+            "2021-01-05T00:00:00,A,2,5.4",
+        ]
+    )
+    return kd.sources.CsvString(content, time_column="time", key_column="key")
+
+
+def test_least(source, golden) -> None:
+    a = source.col("a")
+    b = source.col("b")
+    golden.jsonl(kd.record({"a": a, "b": b, "a_least_b": a.least(b)}))
diff --git a/python/pytests/lower_test.py b/python/pytests/lower_test.py
@@ -0,0 +1,23 @@
+import kaskada as kd
+import pytest
+
+
+@pytest.fixture(scope="module")
+def source() -> kd.sources.CsvString:
+    content = "\n".join(
+        [
+            "time,key,m",
+            "2021-01-01T00:00:00,Ben,Hello World",
+            "2021-01-02T00:00:00,Ryan,",
+            "2021-01-02T00:00:00,Ryan,Hi Earth",
+            "2021-01-03T00:00:00,Ben,Hello",
+            "2021-01-03T00:00:00,Ben,",
+            "2021-01-04T00:00:00,Ryan,hi",
+        ]
+    )
+    return kd.sources.CsvString(content, time_column="time", key_column="key")
+
+
+def test_lower(source, golden) -> None:
+    m = source.col("m")
+    golden.jsonl(kd.record({"m": m, "lower_m": m.lower()}))
diff --git a/python/pytests/round_test.py b/python/pytests/round_test.py
@@ -0,0 +1,20 @@
+import kaskada as kd
+import pytest
+
+
+@pytest.fixture(scope="module")
+def source() -> kd.sources.CsvString:
+    content = "\n".join(
+        [
+            "time,key,m",
+            "2021-01-01T00:00:00,A,5.7",
+            "2021-01-01T00:00:00,A,6.3",
+            "2021-01-02T00:00:00,B,",
+        ]
+    )
+    return kd.sources.CsvString(content, time_column="time", key_column="key")
+
+
+def test_round(source, golden) -> None:
+    m = source.col("m")
+    golden.jsonl(kd.record({"m": m, "round_m": m.round()}))
diff --git a/python/pytests/sqrt_test.py b/python/pytests/sqrt_test.py
@@ -0,0 +1,20 @@
+import kaskada as kd
+import pytest
+
+
+@pytest.fixture(scope="module")
+def source() -> kd.sources.CsvString:
+    content = "\n".join(
+        [
+            "time,key,m",
+            "2021-01-01T00:00:00,A,5.7",
+            "2021-01-01T00:00:00,A,6.3",
+            "2021-01-02T00:00:00,B,",
+        ]
+    )
+    return kd.sources.CsvString(content, time_column="time", key_column="key")
+
+
+def test_sqrt(source, golden) -> None:
+    m = source.col("m")
+    golden.jsonl(kd.record({"m": m, "sqrt_m": m.sqrt()}))
-Original file line number
+Diff line change
@@ Expand Up / @@ -28,5 +28,6 @@ grouping @@
     logical
     misc
     records
+    string
     time
     ```