From 150ee4b04daff456ec369a7d8a3126602e66a369 Mon Sep 17 00:00:00 2001
From: Matt Seddon
Date: Tue, 30 Jul 2024 16:20:20 +1000
Subject: [PATCH] add examples smoke tests

---
 .github/workflows/benchmarks.yml        |  2 +-
 .github/workflows/examples.yml          | 37 +++++++++++++++++++
 examples/get_started/json-csv-reader.py | 19 +++++-----
 examples/get_started/torch-loader.py    |  2 +-
 examples/multimodal/wds.py              | 31 ++++++++++------
 noxfile.py                              | 11 ++++++
 pyproject.toml                          |  6 ++--
 tests/examples/test_examples.py         | 48 +++++++++++++++++++++++++
 8 files changed, 131 insertions(+), 25 deletions(-)
 create mode 100644 .github/workflows/examples.yml
 create mode 100644 tests/examples/test_examples.py

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 564fb54fc..233dd85d6 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -11,7 +11,7 @@ env:
   FORCE_COLOR: "1"
 
 jobs:
-  build:
+  run:
     if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') }}
     runs-on: ubuntu-latest
 
diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml
new file mode 100644
index 000000000..3bddcdef1
--- /dev/null
+++ b/.github/workflows/examples.yml
@@ -0,0 +1,37 @@
+name: Examples
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 3 * * *'
+  push: # to remove
+
+env:
+  FORCE_COLOR: "1"
+
+jobs:
+  run:
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest-8-cores, macos-latest, windows-latest-8-cores]
+        pyv: ['3.9', '3.12']
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.pyv }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.pyv }}
+          cache: 'pip'
+
+      - name: Upgrade nox and uv
+        run: |
+          python -m pip install --upgrade 'nox[uv]'
+          nox --version
+          uv --version
+
+      - name: Run examples
+        run: nox -s examples -p ${{ matrix.pyv }}
diff --git a/examples/get_started/json-csv-reader.py b/examples/get_started/json-csv-reader.py
index c4507fe0c..1dc457151 100644
--- a/examples/get_started/json-csv-reader.py
+++ b/examples/get_started/json-csv-reader.py
@@ -36,7 +36,7 @@ def main():
     print("========================================================================")
     uri = "gs://datachain-demo/jsonl/object.jsonl"
     jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", show_schema=True)
-    print(jsonl_ds.to_pandas())
+    jsonl_ds.show()
 
     print()
     print("========================================================================")
@@ -49,8 +49,7 @@ def main():
     json_pairs_ds = DataChain.from_json(
         uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage"
     )
-    print(json_pairs_ds.to_pandas())
-    # print(list(json_pairs_ds.collect())[0])
+    json_pairs_ds.show()
 
     uri = "gs://datachain-demo/coco2017/annotations_captions/"
 
@@ -72,14 +71,14 @@ def main():
     static_json_ds = DataChain.from_json(
         uri, jmespath="licenses", spec=LicenseFeature, nrows=3
     )
-    print(static_json_ds.to_pandas())
+    static_json_ds.show()
 
     print()
     print("========================================================================")
     print("dynamic JSON schema test parsing 5K objects")
     print("========================================================================")
     dynamic_json_ds = DataChain.from_json(uri, jmespath="images", show_schema=True)
-    print(dynamic_json_ds.to_pandas())
+    dynamic_json_ds.show()
 
     uri = "gs://datachain-demo/chatbot-csv/"
     print()
     print("========================================================================")
     print("static CSV with header schema test parsing 3.5K objects")
     print("========================================================================")
     static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat")
     static_csv_ds.print_schema()
-    print(static_csv_ds.to_pandas())
+    static_csv_ds.show()
 
-    uri = "gs://datachain-demo/laion-aesthetics-csv"
+    uri = "gs://datachain-demo/laion-aesthetics-csv/"
     print()
     print("========================================================================")
-    print("dynamic CSV with header schema test parsing 3/3M objects")
+    print("dynamic CSV with header schema test parsing 3M objects")
     print("========================================================================")
-    dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
+    dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion")
     dynamic_csv_ds.print_schema()
-    print(dynamic_csv_ds.to_pandas())
+    dynamic_csv_ds.show()
 
 
 if __name__ == "__main__":
diff --git a/examples/get_started/torch-loader.py b/examples/get_started/torch-loader.py
index fdbf7541b..e9f3fa7da 100644
--- a/examples/get_started/torch-loader.py
+++ b/examples/get_started/torch-loader.py
@@ -64,7 +64,7 @@ def forward(self, x):
     optimizer = optim.Adam(model.parameters(), lr=0.001)
 
     # Train the model
-    num_epochs = 10
+    num_epochs = 3
     for epoch in range(num_epochs):
         for i, data in enumerate(train_loader):
             inputs, labels = data
diff --git a/examples/multimodal/wds.py b/examples/multimodal/wds.py
index 6078747a8..72e3c2fca 100644
--- a/examples/multimodal/wds.py
+++ b/examples/multimodal/wds.py
@@ -1,5 +1,3 @@
-import pandas as pd
-
 from datachain import C, DataChain
 from datachain.lib.webdataset import process_webdataset
 from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
@@ -9,25 +7,36 @@
     .filter(C("file.name").glob("00000000.tar"))
     .settings(cache=True)
     .gen(laion=process_webdataset(spec=WDSLaion), params="file")
+    .save()  # materialize chain to avoid downloading data multiple times
+)
+
+meta_pq = (
+    DataChain.from_parquet("gs://datachain-demo/datacomp-small/metadata/0020f*.parquet")
+    .filter(
+        C("uid").in_(values[0] for values in wds.select("laion.json.uid").collect())
+    )
+    .map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
+    .save()
 )
 
 meta_emd = (
-    DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata")
-    .filter(C("file.name").glob("0020f*.npz"))
+    DataChain.from_storage("gs://datachain-demo/datacomp-small/metadata/0020f*.npz")
     .gen(emd=process_laion_meta)
+    .filter(
+        C("emd.index").in_(
+            values[0] for values in meta_pq.select("source.index").collect()
+        )
+    )
     .map(stem=lambda file: file.get_file_stem(), params=["emd.file"], output=str)
 )
 
-meta_pq = DataChain.from_parquet(
-    "gs://datachain-demo/datacomp-small/metadata/0020f*.parquet"
-).map(stem=lambda file: file.get_file_stem(), params=["source.file"], output=str)
 
 meta = meta_emd.merge(
-    meta_pq, on=["stem", "emd.index"], right_on=["stem", "source.index"]
+    meta_pq,
+    on=["stem", "emd.index"],
+    right_on=["stem", "source.index"],
 )
 
 res = wds.merge(meta, on="laion.json.uid", right_on="uid")
 
-df = res.limit(10).to_pandas()
-with pd.option_context("display.max_columns", None):
-    print(df)
+res.show()
diff --git a/noxfile.py b/noxfile.py
index 5c3650475..7132ce7dd 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -74,3 +74,14 @@ def dev(session: nox.Session) -> None:
 
     python = os.path.join(venv_dir, "bin/python")
     session.run(python, "-m", "pip", "install", "-e", ".[dev]", external=True)
+
+
+@nox.session(python=["3.9", "3.10", "3.11", "3.12", "pypy3.9", "pypy3.10"])
+def examples(session: nox.Session) -> None:
+    session.install(".[tests]")
+    session.run(
+        "pytest",
+        "-m",
+        "examples",
+        *session.posargs,
+    )
diff --git a/pyproject.toml b/pyproject.toml
index 1dac0892b..3a912e614 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -83,7 +83,9 @@ tests = [
   "hypothesis",
   "open_clip_torch",
   "aiotools>=1.7.0",
-  "requests-mock"
+  "requests-mock",
+  "unstructured[all-docs]",
+  "libmagic"
 ]
 dev = [
   "datachain[docs,tests]",
@@ -110,7 +112,7 @@ namespaces = false
 [tool.setuptools_scm]
 
 [tool.pytest.ini_options]
-addopts = "-rfEs -m 'not benchmark'"
+addopts = "-rfEs -m 'not benchmark and not examples'"
 markers = [
   "benchmark: benchmarks.",
   "e2e: End-to-end tests"
diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py
new file mode 100644
index 000000000..f16ff1dbd
--- /dev/null
+++ b/tests/examples/test_examples.py
@@ -0,0 +1,48 @@
+import glob
+import os
+import subprocess
+import sys
+
+import pytest
+
+get_started_examples = glob.glob("examples/get_started/**/*.py", recursive=True)
+llm_and_nlp_examples = filter(
+    # no anthropic token
+    lambda filename: "claude" not in filename,
+    glob.glob("examples/llm_and_nlp/**/*.py", recursive=True),
+)
+multimodal_examples = filter(
+    # no OpenAI token and hf download painfully slow
+    lambda filename: "openai" not in filename and "hf" not in filename,
+    glob.glob("examples/multimodal/**/*.py", recursive=True),
+)
+
+
+def smoke_test(example: str):
+    completed_process = subprocess.run(  # noqa: S603
+        [sys.executable, example],
+        capture_output=True,
+        cwd=os.path.abspath(os.path.join(__file__, "..", "..", "..")),
+        check=True,
+    )
+
+    assert completed_process.stdout
+    assert completed_process.stderr
+
+
+@pytest.mark.examples
+@pytest.mark.parametrize("example", get_started_examples)
+def test_get_started_examples(example):
+    smoke_test(example)
+
+
+@pytest.mark.examples
+@pytest.mark.parametrize("example", llm_and_nlp_examples)
+def test_llm_and_nlp_examples(example):
+    smoke_test(example)
+
+
+@pytest.mark.examples
+@pytest.mark.parametrize("example", multimodal_examples)
+def test_multimodal(example):
+    smoke_test(example)
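
Local reproduction (a sketch, not part of the patch): the CI job boils down to the new nox session, so the commands below should mirror the "Run examples" step, assuming 'nox[uv]' is available:

    python -m pip install --upgrade 'nox[uv]'
    nox -s examples -p 3.12        # the session wraps: pytest -m examples
    # or, inside an existing dev environment with the tests extra installed:
    pip install -e '.[tests]' && pytest -m examples

The explicit -m examples is required because the default addopts now deselect the examples marker.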