Skip to content

Commit

Permalink
Merge pull request #141 from google/gbm
Browse files Browse the repository at this point in the history
[Code wide] Fix pending todo + schema + several small improvements
  • Loading branch information
achoum authored Jun 6, 2023
2 parents 09511ad + 603bca8 commit 3772e34
Show file tree
Hide file tree
Showing 162 changed files with 7,065 additions and 7,015 deletions.
6 changes: 2 additions & 4 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
"python.linting.flake8Enabled": false,
"python.linting.pylintEnabled": true,
"python.linting.enabled": true,
"editor.rulers": [
80
],
"editor.rulers": [80],
"editor.codeActionsOnSave": {
"source.organizeImports": false
},
Expand Down Expand Up @@ -84,4 +82,4 @@
"span": "cpp",
"algorithm": "cpp"
}
}
}
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ Benchmarking and profiling of pre-configured scripts is available as follows:
#### Time and memory profiling

```shell
bazel run -c opt benchmark:profile_time -- [name]
bazel run -c opt benchmark:profile_memory -- [name] [-p]
bazel run -c opt //benchmark:profile_time -- [name]
bazel run -c opt //benchmark:profile_memory -- [name] [-p]
```

where `[name]` is the name of one of the python scripts in
Expand All @@ -79,7 +79,7 @@ consumption.
#### Time benchmarking

```shell
bazel run -c opt benchmark:benchmark_time
bazel run -c opt //benchmark:benchmark_time
```

### Running docs server
Expand Down
35 changes: 15 additions & 20 deletions benchmark/benchmark_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,12 @@
import pandas as pd
import temporian as tp

from temporian.implementation.numpy.data.event_set import EventSet

# TODO(gbm): Add flag to control which benchmark to run.


def _build_toy_dataset(
n: int, data_prefix="", data2_is_categorical_integer=False
) -> EventSet:
) -> tp.EventSet:
"""Builds a toy dataset with two features.
Args:
Expand All @@ -54,7 +52,7 @@ def _build_toy_dataset(
else:
data_2 = np.random.randn(n)

return EventSet.from_dataframe(
return tp.pd_dataframe_to_event_set(
pd.DataFrame(
{
"timestamp": timestamps,
Expand All @@ -65,7 +63,6 @@ def _build_toy_dataset(
}
),
index_names=["index_1", "index_2"],
is_sorted=True,
)


Expand Down Expand Up @@ -104,7 +101,9 @@ def benchmark_calendar_day_of_month(runner):
timestamps = np.sort(np.random.randn(n) * 1_700_000_000).astype(
"datetime64[s]"
)
ds = EventSet.from_dataframe(pd.DataFrame({"timestamp": timestamps}))
ds = tp.pd_dataframe_to_event_set(
pd.DataFrame({"timestamp": timestamps})
)

node = ds.node()
output = tp.calendar_day_of_month(node)
Expand All @@ -124,7 +123,7 @@ def benchmark_sample(runner):

node_1 = ds_1.node()
node_2 = ds_2.node()
output = tp.sample(node_1, node_2)
output = tp.resample(node_1, node_2)

runner.benchmark(
f"sample:e{m:_}_s{n:_}",
Expand Down Expand Up @@ -219,13 +218,11 @@ def benchmark_from_dataframe(runner):

runner.benchmark(
benchmark_name,
lambda: EventSet.from_dataframe(
df, index_names, is_sorted=True
),
lambda: tp.pd_dataframe_to_event_set(df, index_names),
)


def benchmark_set_index(runner):
def benchmark_add_index(runner):
runner.add_separator()

np.random.seed(0)
Expand All @@ -246,7 +243,7 @@ def benchmark_set_index(runner):
feature_5 = np.random.choice(feature_values, number_timestamps)
feature_6 = np.random.choice(feature_values, number_timestamps)

evset = EventSet.from_dataframe(
evset = tp.pd_dataframe_to_event_set(
pd.DataFrame(
{
"timestamp": timestamps,
Expand All @@ -260,7 +257,6 @@ def benchmark_set_index(runner):
"feature_6": feature_6,
}
),
is_sorted=True,
index_names=["index_1", "index_2"],
)

Expand All @@ -275,12 +271,11 @@ def benchmark_set_index(runner):
]

for index in possible_indexes:
for append in [False]:
output = tp.set_index(node, index, append=append)
runner.benchmark(
f"set_index:s:{number_timestamps:_}:num_idx:{len(index)}:append:{append}",
lambda: tp.evaluate(output, input={node: evset}),
)
output = tp.add_index(node, index)
runner.benchmark(
f"add_index:s:{number_timestamps:_}:num_idx:{len(index)}",
lambda: tp.evaluate(output, input={node: evset}),
)


class BenchmarkResult(NamedTuple):
Expand Down Expand Up @@ -366,7 +361,7 @@ def main():
"propagate",
"cast",
"unique_timestamps",
"set_index",
"add_index",
]
if args.functions is not None:
benchmarks_to_run = args.functions
Expand Down
6 changes: 3 additions & 3 deletions benchmark/scripts/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def main():
product_ids = np.random.choice(ids, N)
store_ids = np.random.choice(ids, N)

evset_1 = EventSet.from_dataframe(
evset_1 = tp.pd_dataframe_to_event_set(
pd.DataFrame(
{
STORE: store_ids,
Expand All @@ -59,7 +59,7 @@ def main():
index_names=[STORE, PRODUCT],
)

evset_2 = EventSet.from_dataframe(
evset_2 = tp.pd_dataframe_to_event_set(
pd.DataFrame(
{
STORE: store_ids,
Expand All @@ -77,7 +77,7 @@ def main():

a = tp.glue(node_1, node_2)
b = tp.prefix("sma_", tp.simple_moving_average(a, window_length=10.0))
c = tp.glue(a, tp.sample(b, a))
c = tp.glue(a, tp.resample(b, a))

res: EventSet = tp.evaluate(
c,
Expand Down
8 changes: 3 additions & 5 deletions benchmark/scripts/sma.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def main():
product_ids = np.random.choice(ids, N)
store_ids = np.random.choice(ids, N)

evset = EventSet.from_dataframe(
evset = tp.pd_dataframe_to_event_set(
pd.DataFrame(
{
STORE: store_ids,
Expand All @@ -65,14 +65,12 @@ def main():

res: EventSet = tp.evaluate(
sma,
input={
node: evset,
},
input={node: evset},
check_execution=False,
)

# Print output's first row, useful to check reproducibility
print(res.first_index_data())
print(res)


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 3772e34

Please sign in to comment.