Merge pull request #141 from google/gbm

[Code wide] Fix pending todo + schema + several small improvements
google · Jun 6, 2023 · 3772e34 · 3772e34
2 parents 09511ad + 603bca8
commit 3772e34
Show file tree

Hide file tree

Showing 162 changed files with 7,065 additions and 7,015 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -22,9 +22,7 @@
   "python.linting.flake8Enabled": false,
   "python.linting.pylintEnabled": true,
   "python.linting.enabled": true,
-  "editor.rulers": [
-    80
-  ],
+  "editor.rulers": [80],
   "editor.codeActionsOnSave": {
     "source.organizeImports": false
   },
@@ -84,4 +82,4 @@
     "span": "cpp",
     "algorithm": "cpp"
   }
-}
+}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -66,8 +66,8 @@ Benchmarking and profiling of pre-configured scripts is available as follow:
 #### Time and memory profiling
 
 ```shell
-bazel run -c opt benchmark:profile_time -- [name]
-bazel run -c opt benchmark:profile_memory -- [name] [-p]
+bazel run -c opt //benchmark:profile_time -- [name]
+bazel run -c opt //benchmark:profile_memory -- [name] [-p]
 ```
 
 where `[name]` is the name of one of the python scripts in
@@ -79,7 +79,7 @@ consumption.
 #### Time benchmarking
 
 ```shell
-bazel run -c opt benchmark:benchmark_time
+bazel run -c opt //benchmark:benchmark_time
 ```
 
 ### Running docs server

diff --git a/benchmark/benchmark_time.py b/benchmark/benchmark_time.py
@@ -23,14 +23,12 @@
 import pandas as pd
 import temporian as tp
 
-from temporian.implementation.numpy.data.event_set import EventSet
-
 # TODO(gbm): Add flag to control which benchmark to run.
 
 
 def _build_toy_dataset(
     n: int, data_prefix="", data2_is_categorical_integer=False
-) -> EventSet:
+) -> tp.EventSet:
     """Builds a toy dataset with two features.
 
     Args:
@@ -54,7 +52,7 @@ def _build_toy_dataset(
     else:
         data_2 = np.random.randn(n)
 
-    return EventSet.from_dataframe(
+    return tp.pd_dataframe_to_event_set(
         pd.DataFrame(
             {
                 "timestamp": timestamps,
@@ -65,7 +63,6 @@ def _build_toy_dataset(
             }
         ),
         index_names=["index_1", "index_2"],
-        is_sorted=True,
     )
 
 
@@ -104,7 +101,9 @@ def benchmark_calendar_day_of_month(runner):
         timestamps = np.sort(np.random.randn(n) * 1_700_000_000).astype(
             "datetime64[s]"
         )
-        ds = EventSet.from_dataframe(pd.DataFrame({"timestamp": timestamps}))
+        ds = tp.pd_dataframe_to_event_set(
+            pd.DataFrame({"timestamp": timestamps})
+        )
 
         node = ds.node()
         output = tp.calendar_day_of_month(node)
@@ -124,7 +123,7 @@ def benchmark_sample(runner):
 
             node_1 = ds_1.node()
             node_2 = ds_2.node()
-            output = tp.sample(node_1, node_2)
+            output = tp.resample(node_1, node_2)
 
             runner.benchmark(
                 f"sample:e{m:_}_s{n:_}",
@@ -219,13 +218,11 @@ def benchmark_from_dataframe(runner):
 
                     runner.benchmark(
                         benchmark_name,
-                        lambda: EventSet.from_dataframe(
-                            df, index_names, is_sorted=True
-                        ),
+                        lambda: tp.pd_dataframe_to_event_set(df, index_names),
                     )
 
 
-def benchmark_set_index(runner):
+def benchmark_add_index(runner):
     runner.add_separator()
 
     np.random.seed(0)
@@ -246,7 +243,7 @@ def benchmark_set_index(runner):
         feature_5 = np.random.choice(feature_values, number_timestamps)
         feature_6 = np.random.choice(feature_values, number_timestamps)
 
-        evset = EventSet.from_dataframe(
+        evset = tp.pd_dataframe_to_event_set(
             pd.DataFrame(
                 {
                     "timestamp": timestamps,
@@ -260,7 +257,6 @@ def benchmark_set_index(runner):
                     "feature_6": feature_6,
                 }
             ),
-            is_sorted=True,
             index_names=["index_1", "index_2"],
         )
 
@@ -275,12 +271,11 @@ def benchmark_set_index(runner):
         ]
 
         for index in possible_indexes:
-            for append in [False]:
-                output = tp.set_index(node, index, append=append)
-                runner.benchmark(
-                    f"set_index:s:{number_timestamps:_}:num_idx:{len(index)}:append:{append}",
-                    lambda: tp.evaluate(output, input={node: evset}),
-                )
+            output = tp.add_index(node, index)
+            runner.benchmark(
+                f"add_index:s:{number_timestamps:_}:num_idx:{len(index)}",
+                lambda: tp.evaluate(output, input={node: evset}),
+            )
 
 
 class BenchmarkResult(NamedTuple):
@@ -366,7 +361,7 @@ def main():
         "propagate",
         "cast",
         "unique_timestamps",
-        "set_index",
+        "add_index",
     ]
     if args.functions is not None:
         benchmarks_to_run = args.functions

diff --git a/benchmark/scripts/basic.py b/benchmark/scripts/basic.py
@@ -47,7 +47,7 @@ def main():
     product_ids = np.random.choice(ids, N)
     store_ids = np.random.choice(ids, N)
 
-    evset_1 = EventSet.from_dataframe(
+    evset_1 = tp.pd_dataframe_to_event_set(
         pd.DataFrame(
             {
                 STORE: store_ids,
@@ -59,7 +59,7 @@ def main():
         index_names=[STORE, PRODUCT],
     )
 
-    evset_2 = EventSet.from_dataframe(
+    evset_2 = tp.pd_dataframe_to_event_set(
         pd.DataFrame(
             {
                 STORE: store_ids,
@@ -77,7 +77,7 @@ def main():
 
     a = tp.glue(node_1, node_2)
     b = tp.prefix("sma_", tp.simple_moving_average(a, window_length=10.0))
-    c = tp.glue(a, tp.sample(b, a))
+    c = tp.glue(a, tp.resample(b, a))
 
     res: EventSet = tp.evaluate(
         c,

diff --git a/benchmark/scripts/sma.py b/benchmark/scripts/sma.py
@@ -47,7 +47,7 @@ def main():
     product_ids = np.random.choice(ids, N)
     store_ids = np.random.choice(ids, N)
 
-    evset = EventSet.from_dataframe(
+    evset = tp.pd_dataframe_to_event_set(
         pd.DataFrame(
             {
                 STORE: store_ids,
@@ -65,14 +65,12 @@ def main():
 
     res: EventSet = tp.evaluate(
         sma,
-        input={
-            node: evset,
-        },
+        input={node: evset},
         check_execution=False,
     )
 
     # Print output's first row, useful to check reproducibility
-    print(res.first_index_data())
+    print(res)
 
 
 if __name__ == "__main__":