From a18b30a7ffbe6c26e99184e5ff4d6c642fa1261b Mon Sep 17 00:00:00 2001 From: Sammy Sidhu Date: Fri, 23 Aug 2024 14:00:17 -0700 Subject: [PATCH] Update PreCommit Hooks (#2715) --- .pre-commit-config.yaml | 12 +- benchmarking/tpch/data_generation.py | 2 +- .../tpch/pipelined_data_generation.py | 2 +- daft/context.py | 2 +- daft/dataframe/dataframe.py | 2 +- daft/expressions/expressions.py | 1 + docs/source/10-min.ipynb | 76 +- .../fotw/fotw-000-data-access.ipynb | 44 +- .../user_guide/fotw/fotw-001-images.ipynb | 58 +- tests/expressions/test_udf.py | 6 +- .../io/test_list_files_s3_minio.py | 2 +- .../1-local-image-batch-inference.ipynb | 15 +- .../2-distributed-batch-inference.ipynb | 14 +- .../3-pytorch-ray-single-node-training.ipynb | 1033 +++++++++- ...ft_tutorial_embeddings_stackexchange.ipynb | 47 +- tutorials/flyte/notebook.ipynb | 4 +- .../image_querying/top_n_red_color.ipynb | 22 +- tutorials/intro.ipynb | 4 +- tutorials/mnist.ipynb | 1817 +++++++++-------- .../talks_and_demos/data-ai-summit-2024.ipynb | 46 +- .../talks_and_demos/iceberg_summit_2024.ipynb | 15 +- .../talks_and_demos/linkedin-03-05-2024.ipynb | 9 +- .../talks_and_demos/pydata_global_2023.ipynb | 12 +- .../text_to_image_generation.ipynb | 633 +++--- .../text_to_image/using_cloud_with_ray.ipynb | 444 ++-- 25 files changed, 2669 insertions(+), 1653 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fab1ffa4bc..1f82930614 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.991 + rev: v1.11.1 hooks: - id: mypy additional_dependencies: [types-requests, types-PyYAML, types-tabulate] @@ -10,7 +10,7 @@ repos: exclude: daft/pickle/.*\.py - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: detect-private-key - id: trailing-whitespace @@ -40,7 +40,7 @@ repos: - id: check-toml - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks - rev: v2.6.0 + rev: v2.14.0 hooks: - id: pretty-format-toml args: [--autofix] @@ -49,7 +49,7 @@ repos: args: [--autofix] - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell additional_dependencies: @@ -57,7 +57,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.3.7 + rev: v0.6.2 hooks: # Run the linter. 
- id: ruff @@ -111,6 +111,6 @@ repos: pass_filenames: false - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.10.1 + rev: v0.19 hooks: - id: validate-pyproject diff --git a/benchmarking/tpch/data_generation.py b/benchmarking/tpch/data_generation.py index 1908828a40..84a7adca1f 100644 --- a/benchmarking/tpch/data_generation.py +++ b/benchmarking/tpch/data_generation.py @@ -253,7 +253,7 @@ def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str: Returns: str: path to folder with generated CSV files """ - cachedir = os.path.join(basedir, ("%.1f" % scale_factor).replace(".", "_"), str(num_parts)) + cachedir = os.path.join(basedir, (f"{scale_factor:.1f}").replace(".", "_"), str(num_parts)) if not os.path.exists(cachedir): # If running in CI, use a scale factor of 0.2 # Otherwise, check for SCALE_FACTOR env variable or default to 1 diff --git a/benchmarking/tpch/pipelined_data_generation.py b/benchmarking/tpch/pipelined_data_generation.py index 36d3629c67..f28063a990 100644 --- a/benchmarking/tpch/pipelined_data_generation.py +++ b/benchmarking/tpch/pipelined_data_generation.py @@ -48,7 +48,7 @@ def pipelined_data_generation( ): assert num_parts > 1, "script should only be used if num_parts > 1" - cachedir = pathlib.Path(scratch_dir) / ("%.1f" % scale_factor).replace(".", "_") / str(num_parts) + cachedir = pathlib.Path(scratch_dir) / (f"{scale_factor:.1f}").replace(".", "_") / str(num_parts) if not cachedir.exists(): logger.info("Cloning tpch dbgen repo") diff --git a/daft/context.py b/daft/context.py index f286c77c7b..38ef8545d5 100644 --- a/daft/context.py +++ b/daft/context.py @@ -17,7 +17,7 @@ class _RunnerConfig: - name = ClassVar[str] + name: ClassVar[str] @dataclasses.dataclass(frozen=True) diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py index 3dd7458db4..37dea4d822 100644 --- a/daft/dataframe/dataframe.py +++ b/daft/dataframe/dataframe.py @@ -1984,7 +1984,7 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) """ result = func(self, *args, **kwargs) assert isinstance(result, DataFrame), ( - "Func returned an instance of type [%s], " "should have been DataFrame." % type(result) + f"Func returned an instance of type [{type(result)}], " "should have been DataFrame." 
) return result diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index 8de584035b..9f5085ac3a 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -116,6 +116,7 @@ def lit(value: object) -> Expression: lit_value = _time_lit(i64_value, time_unit) elif isinstance(value, Decimal): sign, digits, exponent = value.as_tuple() + assert isinstance(exponent, int) lit_value = _decimal_lit(sign == 1, digits, exponent) elif isinstance(value, Series): lit_value = _series_lit(value._series) diff --git a/docs/source/10-min.ipynb b/docs/source/10-min.ipynb index 8ce2ec4fcc..238b56eca7 100644 --- a/docs/source/10-min.ipynb +++ b/docs/source/10-min.ipynb @@ -75,7 +75,7 @@ "outputs": [], "source": [ "import daft\n", - "from daft import DataType, col, udf" + "from daft import DataType, udf" ] }, { @@ -136,16 +136,23 @@ "source": [ "import datetime\n", "\n", - "df = daft.from_pydict({\n", - " \"integers\": [1, 2, 3, 4],\n", - " \"floats\": [1.5, 2.5, 3.5, 4.5],\n", - " \"bools\": [True, True, False, False],\n", - " \"strings\": [\"a\", \"b\", \"c\", \"d\"],\n", - " \"bytes\": [b\"a\", b\"b\", b\"c\", b\"d\"],\n", - " \"dates\": [datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3), datetime.date(1994, 1, 4)],\n", - " \"lists\": [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],\n", - " \"nulls\": [None, None, None, None],\n", - "})\n", + "df = daft.from_pydict(\n", + " {\n", + " \"integers\": [1, 2, 3, 4],\n", + " \"floats\": [1.5, 2.5, 3.5, 4.5],\n", + " \"bools\": [True, True, False, False],\n", + " \"strings\": [\"a\", \"b\", \"c\", \"d\"],\n", + " \"bytes\": [b\"a\", b\"b\", b\"c\", b\"d\"],\n", + " \"dates\": [\n", + " datetime.date(1994, 1, 1),\n", + " datetime.date(1994, 1, 2),\n", + " datetime.date(1994, 1, 3),\n", + " datetime.date(1994, 1, 4),\n", + " ],\n", + " \"lists\": [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],\n", + " \"nulls\": [None, None, None, None],\n", + " }\n", + ")\n", "\n", "df" ] @@ -236,9 +243,7 @@ "# Set IO Configurations to use anonymous data access mode\n", "daft.set_planning_config(default_io_config=daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True)))\n", "\n", - "df = daft.read_parquet(\n", - " \"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\"\n", - ")\n", + "df = daft.read_parquet(\"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\")\n", "df" ] }, @@ -620,7 +625,7 @@ } ], "source": [ - "df = df.with_column(\"full_name\", daft.col('first_name') + ' ' + daft.col('last_name'))\n", + "df = df.with_column(\"full_name\", daft.col(\"first_name\") + \" \" + daft.col(\"last_name\"))\n", "df.select(\"full_name\", \"age\", \"country\", \"has_dog\").show()" ] }, @@ -868,7 +873,7 @@ } ], "source": [ - "#select only columns for grouping\n", + "# select only columns for grouping\n", "grouping_df = df.select(df[\"country\"], df[\"first_name\"].alias(\"counts\"))\n", "\n", "# groupby country column and count the number of countries\n", @@ -932,12 +937,14 @@ } ], "source": [ - "missing_data_df = daft.from_pydict({\n", - " \"floats\": [1.5, None, float(\"nan\")],\n", - "})\n", - "missing_data_df = missing_data_df \\\n", - " .with_column(\"floats_is_null\", missing_data_df[\"floats\"].is_null()) \\\n", - " .with_column(\"floats_is_nan\", missing_data_df[\"floats\"].float.is_nan())\n", + "missing_data_df = daft.from_pydict(\n", + " {\n", + " \"floats\": [1.5, None, float(\"nan\")],\n", + " }\n", + ")\n", + "missing_data_df = 
missing_data_df.with_column(\"floats_is_null\", missing_data_df[\"floats\"].is_null()).with_column(\n", + " \"floats_is_nan\", missing_data_df[\"floats\"].float.is_nan()\n", + ")\n", "\n", "missing_data_df.show()" ] @@ -1184,9 +1191,7 @@ } ], "source": [ - "df2 = daft.read_parquet(\n", - " \"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\"\n", - ")\n", + "df2 = daft.read_parquet(\"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\")\n", "df2.where(df[\"country\"] == \"Canada\").explain(show_all=True)" ] }, @@ -1561,8 +1566,8 @@ "outputs": [], "source": [ "# import additional libraries, these are necessary for PyTorch\n", - "import torch\n", - "import warnings" + "\n", + "import torch" ] }, { @@ -1580,26 +1585,21 @@ "metadata": {}, "outputs": [], "source": [ - "@udf(return_dtype=DataType.fixed_size_list(dtype=DataType.string() , size=2))\n", + "@udf(return_dtype=DataType.fixed_size_list(dtype=DataType.string(), size=2))\n", "class ClassifyImages:\n", - " \n", " def __init__(self):\n", - " # Perform expensive initializations - create and load the pre-trained model \n", - " self.model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)\n", - " self.utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n", + " # Perform expensive initializations - create and load the pre-trained model\n", + " self.model = torch.hub.load(\"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_resnet50\", pretrained=True)\n", + " self.utils = torch.hub.load(\"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_convnets_processing_utils\")\n", " self.model.eval().to(torch.device(\"cpu\"))\n", "\n", " def __call__(self, images_urls):\n", " uris = images_urls.to_pylist()\n", - " batch = torch.cat(\n", - " [self.utils.prepare_input_from_uri(uri) for uri in uris]\n", - " ).to(\n", - " torch.device(\"cpu\")\n", - " )\n", + " batch = torch.cat([self.utils.prepare_input_from_uri(uri) for uri in uris]).to(torch.device(\"cpu\"))\n", "\n", " with torch.no_grad():\n", " output = torch.nn.functional.softmax(self.model(batch), dim=1)\n", - " \n", + "\n", " results = self.utils.pick_n_best(predictions=output, n=1)\n", " return [result[0] for result in results]" ] diff --git a/docs/source/user_guide/fotw/fotw-000-data-access.ipynb b/docs/source/user_guide/fotw/fotw-000-data-access.ipynb index ab1bc73c48..3aa33dca8e 100644 --- a/docs/source/user_guide/fotw/fotw-000-data-access.ipynb +++ b/docs/source/user_guide/fotw/fotw-000-data-access.ipynb @@ -34,6 +34,7 @@ "# Skip this notebook execution in CI because it hits data in a relative path\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -529,10 +530,7 @@ "MY_ANONYMOUS_IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True))\n", "\n", "# Read this file using `MY_ANONYMOUS_IO_CONFIG`\n", - "df = daft.read_csv(\n", - " \"s3://daft-public-data/melbourne-airbnb/melbourne_airbnb.csv\", \n", - " io_config=MY_ANONYMOUS_IO_CONFIG\n", - ")" + "df = daft.read_csv(\"s3://daft-public-data/melbourne-airbnb/melbourne_airbnb.csv\", io_config=MY_ANONYMOUS_IO_CONFIG)" ] }, { @@ -657,10 +655,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = daft.read_parquet(\n", - " bucket,\n", - " io_config=io_config\n", - ")" + "df = daft.read_parquet(bucket, io_config=io_config)" ] }, { @@ -805,9 +800,7 @@ "import sqlite3\n", "\n", "connection = sqlite3.connect(\"example.db\")\n", - "connection.execute(\n", - " \"CREATE TABLE IF NOT EXISTS books 
(title TEXT, author TEXT, year INTEGER)\"\n" - ")\n", + "connection.execute(\"CREATE TABLE IF NOT EXISTS books (title TEXT, author TEXT, year INTEGER)\")\n", "connection.execute(\n", "\"\"\"\n", "INSERT INTO books (title, author, year)\n", @@ -894,6 +887,7 @@ "outputs": [], "source": [ "from sqlalchemy import create_engine\n", + "\n", "# substitute the uri below with the engine path on your local machine\n", "engine_uri = \"sqlite:////Users/rpelgrim/daft_sql\"\n", "engine = create_engine(engine_uri, echo=True)" ] @@ -908,17 +902,12 @@ }, "outputs": [], "source": [ - "import pandas as pd \n", + "import pandas as pd\n", + "\n", "csv_file_path = \"data/census-01.csv\"\n", "df = pd.read_csv(csv_file_path)\n", "\n", - "sql_df = df.to_sql(\n", - " name=\"censustable\",\n", - " con=engine,\n", - " index=False,\n", - " index_label=\"id\",\n", - " if_exists=\"replace\"\n", - ")" ] + "sql_df = df.to_sql(name=\"censustable\", con=engine, index=False, index_label=\"id\", if_exists=\"replace\")" ] }, { @@ -947,7 +936,7 @@ "outputs": [], "source": [ "# Read from local SQLite database\n", - "uri = \"sqlite:////Users/rpelgrim/daft_sql\" #replace with your local uri\n", + "uri = \"sqlite:////Users/rpelgrim/daft_sql\" # replace with your local uri\n", "\n", "df = daft.read_sql(\"SELECT * FROM censustable\", uri)" ] @@ -1079,10 +1068,10 @@ ], "source": [ "df = daft.read_sql(\n", - " \"SELECT * FROM censustable\", \n", + " \"SELECT * FROM censustable\",\n", " uri,\n", " partition_col=\"education\",\n", "# num_partitions=12\n", + " # num_partitions=12\n", ")\n", "\n", "df.show()" ] @@ -1234,6 +1223,7 @@ "outputs": [], "source": [ "import boto3\n", + "\n", "session = boto3.session.Session()\n", "creds = session.get_credentials()\n", "\n", @@ -1247,14 +1237,9 @@ ")\n", "\n", "# Read Delta Lake table in S3 into a Daft DataFrame.\n", - "table_uri = (\n", - " \"s3://avriiil/delta-test-daft/\"\n", - ")\n", + "table_uri = \"s3://avriiil/delta-test-daft/\"\n", "\n", - "df = daft.read_deltalake(\n", - " table_uri,\n", - " io_config=io_config\n", - ")" ] + "df = daft.read_deltalake(table_uri, io_config=io_config)" ] }, { @@ -1572,6 +1557,7 @@ "source": [ "# Use the boto3 library to generate temporary credentials which can be used for S3 access\n", "import boto3\n", + "\n", "session = boto3.session.Session()\n", "creds = session.get_credentials()\n", "\n", diff --git a/docs/source/user_guide/fotw/fotw-001-images.ipynb b/docs/source/user_guide/fotw/fotw-001-images.ipynb index 80fda030b8..e6a88905e8 100644 --- a/docs/source/user_guide/fotw/fotw-001-images.ipynb +++ b/docs/source/user_guide/fotw/fotw-001-images.ipynb @@ -36,6 +36,7 @@ "# Skip this notebook execution in CI because it uses torch and private buckets\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -176,8 +177,8 @@ ], "source": [ "df = daft.from_glob_path(\n", - " \"s3://avriiil/images-dogs/*.jpg\", # substitute with a path to your own private bucket\n", - " io_config=io_config\n", + " \"s3://avriiil/images-dogs/*.jpg\", # substitute with a path to your own private bucket\n", + " io_config=io_config,\n", ")\n", "df.show()" ] @@ -504,7 +505,7 @@ } ], "source": [ - "df_img = df_img.with_column(\"thumbnail\", daft.col(\"image\").image.resize(32,32))\n", + "df_img = df_img.with_column(\"thumbnail\", daft.col(\"image\").image.resize(32, 32))\n", "df_img.show()" ] }, @@ -543,10 +544,11 @@ "outputs": [], "source": [ "# import additional libraries, these are necessary for PyTorch\n", - "import torch\n", "import numpy as np\n", + "import torch\n", "from PIL 
import Image\n", - "from daft import udf, DataType" + "\n", + "from daft import DataType, udf" ] }, { @@ -566,35 +568,30 @@ "metadata": {}, "outputs": [], "source": [ - "#@udf(return_dtype=DataType.fixed_size_list(dtype=DataType.string(), size=2))\n", + "# @udf(return_dtype=DataType.fixed_size_list(dtype=DataType.string(), size=2))\n", "@udf(return_dtype=DataType.struct({\"top_prediction\": DataType.string(), \"confidence\": DataType.float32()}))\n", "class ClassifyImages:\n", - " \n", " def __init__(self):\n", " # Perform expensive initializations - create and load the pre-trained model\n", - " self.model = torch.hub.load(\n", - " \"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_resnet50\", pretrained=True\n", - " )\n", - " self.utils = torch.hub.load(\n", - " \"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_convnets_processing_utils\"\n", - " )\n", + " self.model = torch.hub.load(\"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_resnet50\", pretrained=True)\n", + " self.utils = torch.hub.load(\"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_convnets_processing_utils\")\n", " self.model.eval().to(torch.device(\"cpu\"))\n", - " \n", + "\n", " def __call__(self, tensors):\n", - " tensors = torch.tensor(np.array(tensors.to_pylist())) #get tensors into correct format\n", - " \n", + " tensors = torch.tensor(np.array(tensors.to_pylist())) # get tensors into correct format\n", + "\n", " with torch.no_grad():\n", " output = torch.nn.functional.softmax(self.model(tensors), dim=1)\n", "\n", " results = self.utils.pick_n_best(predictions=output, n=1)\n", - " \n", - " #post-process results into StructType format\n", + "\n", + " # post-process results into StructType format\n", " list_res = [result[0] for result in results]\n", " new_list = []\n", " for pred, conf in list_res:\n", - " conf = float(conf.strip('%'))/100\n", - " new_list.append({\"top_prediction\": pred, \"confidence\": round(conf,2)})\n", - " \n", + " conf = float(conf.strip(\"%\")) / 100\n", + " new_list.append({\"top_prediction\": pred, \"confidence\": round(conf, 2)})\n", + "\n", " return new_list" ] }, @@ -617,17 +614,20 @@ "source": [ "from torchvision import transforms\n", "\n", + "\n", "def transform_image(image):\n", " # img = Image.fromarray(image)\n", " img = Image.fromarray(np.array(image))\n", - " preprocess = transforms.Compose([\n", - " transforms.Resize(256),\n", - " transforms.CenterCrop(224),\n", - " transforms.ToTensor(),\n", - " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", - " ])\n", - " tensor = preprocess(img) \n", - " return tensor " + " preprocess = transforms.Compose(\n", + " [\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + " ]\n", + " )\n", + " tensor = preprocess(img)\n", + " return tensor" ] }, { diff --git a/tests/expressions/test_udf.py b/tests/expressions/test_udf.py index 1c6c00d3aa..2572eb1adc 100644 --- a/tests/expressions/test_udf.py +++ b/tests/expressions/test_udf.py @@ -154,11 +154,11 @@ def test_udf_return_containers(container, batch_size): @udf(return_dtype=DataType.string(), batch_size=batch_size) def identity(data): - if container == Series: + if container is Series: return data - elif container == list: + elif container is list: return data.to_pylist() - elif container == np.ndarray: + elif container is np.ndarray: return np.array(data.to_arrow()) else: raise NotImplementedError(f"Test not implemented for 
container type: {container}") diff --git a/tests/integration/io/test_list_files_s3_minio.py b/tests/integration/io/test_list_files_s3_minio.py index b98100ef5c..5cdfd59c68 100644 --- a/tests/integration/io/test_list_files_s3_minio.py +++ b/tests/integration/io/test_list_files_s3_minio.py @@ -213,7 +213,7 @@ def test_directory_globbing_fragment_wildcard(minio_io_config, path_expect_pair, for name in files: fs.touch(f"bucket/{name}") - if type(expect) == type and issubclass(expect, BaseException): + if type(expect) is type and issubclass(expect, BaseException): with pytest.raises(expect): io_glob(globpath, io_config=minio_io_config, fanout_limit=fanout_limit) else: diff --git a/tutorials/delta_lake/1-local-image-batch-inference.ipynb b/tutorials/delta_lake/1-local-image-batch-inference.ipynb index 4cf9547f60..96d2562975 100644 --- a/tutorials/delta_lake/1-local-image-batch-inference.ipynb +++ b/tutorials/delta_lake/1-local-image-batch-inference.ipynb @@ -42,6 +42,7 @@ "# Skip this notebook execution in CI because it hits non-public buckets\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -63,6 +64,7 @@ "outputs": [], "source": [ "import boto3\n", + "\n", "import daft\n", "\n", "session = boto3.session.Session()\n", @@ -248,8 +250,7 @@ "outputs": [], "source": [ "df = df.with_column(\n", - " \"image_url\",\n", - " \"s3://daft-public-datasets/imagenet/val-10k-sample-deltalake/images/\" + df[\"filename\"] + \".jpeg\"\n", + " \"image_url\", \"s3://daft-public-datasets/imagenet/val-10k-sample-deltalake/images/\" + df[\"filename\"] + \".jpeg\"\n", ")\n", "df = df.with_column(\"image\", df[\"image_url\"].url.download().image.decode())" ] @@ -350,10 +351,12 @@ "metadata": {}, "outputs": [], "source": [ - "import daft\n", "import numpy as np\n", "import torch\n", - "from torchvision.models import resnet50, ResNet50_Weights\n", + "from torchvision.models import ResNet50_Weights, resnet50\n", + "\n", + "import daft\n", + "\n", "\n", "@daft.udf(return_dtype=daft.DataType.string())\n", "class ClassifyImage:\n", @@ -378,8 +381,8 @@ " batch = self.preprocess(images_array)\n", " prediction = self.model(batch).softmax(0)\n", " class_ids = prediction.argmax(1)\n", - " scores = prediction[:, class_ids]\n", - " return [self.category_map[class_id] for class_id in class_ids]\n" + " prediction[:, class_ids]\n", + " return [self.category_map[class_id] for class_id in class_ids]" ] }, { diff --git a/tutorials/delta_lake/2-distributed-batch-inference.ipynb b/tutorials/delta_lake/2-distributed-batch-inference.ipynb index 3959e21133..8462a74e0f 100644 --- a/tutorials/delta_lake/2-distributed-batch-inference.ipynb +++ b/tutorials/delta_lake/2-distributed-batch-inference.ipynb @@ -40,6 +40,7 @@ "# Skip this notebook execution in CI because it hits non-public buckets\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -118,6 +119,7 @@ "source": [ "# Provision Cloud Credentials\n", "import boto3\n", + "\n", "import daft\n", "\n", "session = boto3.session.Session()\n", @@ -180,8 +182,7 @@ "source": [ "# Retrieve images and run preprocessing\n", "df = df.with_column(\n", - " \"image_url\",\n", - " \"s3://daft-public-datasets/imagenet/val-10k-sample-deltalake/images/\" + df[\"filename\"] + \".jpeg\"\n", + " \"image_url\", \"s3://daft-public-datasets/imagenet/val-10k-sample-deltalake/images/\" + df[\"filename\"] + \".jpeg\"\n", ")\n", "df = df.with_column(\"image\", df[\"image_url\"].url.download().image.decode())\n", "df = df.with_column(\"image_resized_small\", df[\"image\"].image.resize(32, 32))\n", 
@@ -306,10 +307,12 @@ ], "source": [ "# Run batch inference over the entire dataset\n", - "import daft\n", "import numpy as np\n", "import torch\n", - "from torchvision.models import resnet50, ResNet50_Weights\n", + "from torchvision.models import ResNet50_Weights, resnet50\n", + "\n", + "import daft\n", + "\n", "\n", "@daft.udf(return_dtype=daft.DataType.string())\n", "class ClassifyImage:\n", @@ -334,9 +337,10 @@ " batch = self.preprocess(images_array)\n", " prediction = self.model(batch).softmax(0)\n", " class_ids = prediction.argmax(1)\n", - " scores = prediction[:, class_ids]\n", + " prediction[:, class_ids]\n", " return [self.category_map[class_id] for class_id in class_ids]\n", "\n", + "\n", "# Filter out rows where the channel != 3\n", "df = df.where(df[\"image\"].apply(lambda img: img.shape[2] == 3, return_dtype=daft.DataType.bool()))\n", "\n", diff --git a/tutorials/delta_lake/3-pytorch-ray-single-node-training.ipynb b/tutorials/delta_lake/3-pytorch-ray-single-node-training.ipynb index 98172fd109..5102360ba2 100644 --- a/tutorials/delta_lake/3-pytorch-ray-single-node-training.ipynb +++ b/tutorials/delta_lake/3-pytorch-ray-single-node-training.ipynb @@ -38,6 +38,7 @@ "# Skip this notebook execution in CI because it hits non-public buckets\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -59,6 +60,7 @@ "outputs": [], "source": [ "import boto3\n", + "\n", "import daft\n", "\n", "session = boto3.session.Session()\n", @@ -239,8 +241,7 @@ "source": [ "# Download and resize images\n", "df = df.with_column(\n", - " \"image_url\",\n", - " \"s3://daft-public-datasets/imagenet/val-10k-sample-deltalake/images/\" + df[\"filename\"] + \".jpeg\"\n", + " \"image_url\", \"s3://daft-public-datasets/imagenet/val-10k-sample-deltalake/images/\" + df[\"filename\"] + \".jpeg\"\n", ")\n", "df = df.with_column(\"image\", df[\"image_url\"].url.download().image.decode())\n", "df = df.with_column(\"image\", df[\"image\"].image.resize(256, 256))\n", @@ -248,10 +249,7 @@ "# Convert the images to a Tensor datatype\n", "df = df.with_column(\n", " \"arr\",\n", - " (\n", - " df[\"image\"]\n", - " .cast(daft.DataType.tensor(daft.DataType.uint8(), shape=(256, 256, 3)))\n", - " ),\n", + " (df[\"image\"].cast(daft.DataType.tensor(daft.DataType.uint8(), shape=(256, 256, 3)))),\n", ")" ] }, @@ -263,21 +261,1024 @@ "outputs": [], "source": [ "# Map class names to human-readable names and numeric IDs\n", - "classes = [('n01440764', 'tench'), ('n01443537', 'goldfish'), ('n01484850', 'great_white_shark'), ('n01491361', 'tiger_shark'), ('n01494475', 'hammerhead'), ('n01496331', 'electric_ray'), ('n01498041', 'stingray'), ('n01514668', 'cock'), ('n01514859', 'hen'), ('n01518878', 'ostrich'), ('n01530575', 'brambling'), ('n01531178', 'goldfinch'), ('n01532829', 'house_finch'), ('n01534433', 'junco'), ('n01537544', 'indigo_bunting'), ('n01558993', 'robin'), ('n01560419', 'bulbul'), ('n01580077', 'jay'), ('n01582220', 'magpie'), ('n01592084', 'chickadee'), ('n01601694', 'water_ouzel'), ('n01608432', 'kite'), ('n01614925', 'bald_eagle'), ('n01616318', 'vulture'), ('n01622779', 'great_grey_owl'), ('n01629819', 'European_fire_salamander'), ('n01630670', 'common_newt'), ('n01631663', 'eft'), ('n01632458', 'spotted_salamander'), ('n01632777', 'axolotl'), ('n01641577', 'bullfrog'), ('n01644373', 'tree_frog'), ('n01644900', 'tailed_frog'), ('n01664065', 'loggerhead'), ('n01665541', 'leatherback_turtle'), ('n01667114', 'mud_turtle'), ('n01667778', 'terrapin'), ('n01669191', 'box_turtle'), ('n01675722', 'banded_gecko'), 
('n01677366', 'common_iguana'), ('n01682714', 'American_chameleon'), ('n01685808', 'whiptail'), ('n01687978', 'agama'), ('n01688243', 'frilled_lizard'), ('n01689811', 'alligator_lizard'), ('n01692333', 'Gila_monster'), ('n01693334', 'green_lizard'), ('n01694178', 'African_chameleon'), ('n01695060', 'Komodo_dragon'), ('n01697457', 'African_crocodile'), ('n01698640', 'American_alligator'), ('n01704323', 'triceratops'), ('n01728572', 'thunder_snake'), ('n01728920', 'ringneck_snake'), ('n01729322', 'hognose_snake'), ('n01729977', 'green_snake'), ('n01734418', 'king_snake'), ('n01735189', 'garter_snake'), ('n01737021', 'water_snake'), ('n01739381', 'vine_snake'), ('n01740131', 'night_snake'), ('n01742172', 'boa_constrictor'), ('n01744401', 'rock_python'), ('n01748264', 'Indian_cobra'), ('n01749939', 'green_mamba'), ('n01751748', 'sea_snake'), ('n01753488', 'horned_viper'), ('n01755581', 'diamondback'), ('n01756291', 'sidewinder'), ('n01768244', 'trilobite'), ('n01770081', 'harvestman'), ('n01770393', 'scorpion'), ('n01773157', 'black_and_gold_garden_spider'), ('n01773549', 'barn_spider'), ('n01773797', 'garden_spider'), ('n01774384', 'black_widow'), ('n01774750', 'tarantula'), ('n01775062', 'wolf_spider'), ('n01776313', 'tick'), ('n01784675', 'centipede'), ('n01795545', 'black_grouse'), ('n01796340', 'ptarmigan'), ('n01797886', 'ruffed_grouse'), ('n01798484', 'prairie_chicken'), ('n01806143', 'peacock'), ('n01806567', 'quail'), ('n01807496', 'partridge'), ('n01817953', 'African_grey'), ('n01818515', 'macaw'), ('n01819313', 'sulphur-crested_cockatoo'), ('n01820546', 'lorikeet'), ('n01824575', 'coucal'), ('n01828970', 'bee_eater'), ('n01829413', 'hornbill'), ('n01833805', 'hummingbird'), ('n01843065', 'jacamar'), ('n01843383', 'toucan'), ('n01847000', 'drake'), ('n01855032', 'red-breasted_merganser'), ('n01855672', 'goose'), ('n01860187', 'black_swan'), ('n01871265', 'tusker'), ('n01872401', 'echidna'), ('n01873310', 'platypus'), ('n01877812', 'wallaby'), ('n01882714', 'koala'), ('n01883070', 'wombat'), ('n01910747', 'jellyfish'), ('n01914609', 'sea_anemone'), ('n01917289', 'brain_coral'), ('n01924916', 'flatworm'), ('n01930112', 'nematode'), ('n01943899', 'conch'), ('n01944390', 'snail'), ('n01945685', 'slug'), ('n01950731', 'sea_slug'), ('n01955084', 'chiton'), ('n01968897', 'chambered_nautilus'), ('n01978287', 'Dungeness_crab'), ('n01978455', 'rock_crab'), ('n01980166', 'fiddler_crab'), ('n01981276', 'king_crab'), ('n01983481', 'American_lobster'), ('n01984695', 'spiny_lobster'), ('n01985128', 'crayfish'), ('n01986214', 'hermit_crab'), ('n01990800', 'isopod'), ('n02002556', 'white_stork'), ('n02002724', 'black_stork'), ('n02006656', 'spoonbill'), ('n02007558', 'flamingo'), ('n02009229', 'little_blue_heron'), ('n02009912', 'American_egret'), ('n02011460', 'bittern'), ('n02012849', 'crane'), ('n02013706', 'limpkin'), ('n02017213', 'European_gallinule'), ('n02018207', 'American_coot'), ('n02018795', 'bustard'), ('n02025239', 'ruddy_turnstone'), ('n02027492', 'red-backed_sandpiper'), ('n02028035', 'redshank'), ('n02033041', 'dowitcher'), ('n02037110', 'oystercatcher'), ('n02051845', 'pelican'), ('n02056570', 'king_penguin'), ('n02058221', 'albatross'), ('n02066245', 'grey_whale'), ('n02071294', 'killer_whale'), ('n02074367', 'dugong'), ('n02077923', 'sea_lion'), ('n02085620', 'Chihuahua'), ('n02085782', 'Japanese_spaniel'), ('n02085936', 'Maltese_dog'), ('n02086079', 'Pekinese'), ('n02086240', 'Shih-Tzu'), ('n02086646', 'Blenheim_spaniel'), ('n02086910', 'papillon'), ('n02087046', 'toy_terrier'), 
('n02087394', 'Rhodesian_ridgeback'), ('n02088094', 'Afghan_hound'), ('n02088238', 'basset'), ('n02088364', 'beagle'), ('n02088466', 'bloodhound'), ('n02088632', 'bluetick'), ('n02089078', 'black-and-tan_coonhound'), ('n02089867', 'Walker_hound'), ('n02089973', 'English_foxhound'), ('n02090379', 'redbone'), ('n02090622', 'borzoi'), ('n02090721', 'Irish_wolfhound'), ('n02091032', 'Italian_greyhound'), ('n02091134', 'whippet'), ('n02091244', 'Ibizan_hound'), ('n02091467', 'Norwegian_elkhound'), ('n02091635', 'otterhound'), ('n02091831', 'Saluki'), ('n02092002', 'Scottish_deerhound'), ('n02092339', 'Weimaraner'), ('n02093256', 'Staffordshire_bullterrier'), ('n02093428', 'American_Staffordshire_terrier'), ('n02093647', 'Bedlington_terrier'), ('n02093754', 'Border_terrier'), ('n02093859', 'Kerry_blue_terrier'), ('n02093991', 'Irish_terrier'), ('n02094114', 'Norfolk_terrier'), ('n02094258', 'Norwich_terrier'), ('n02094433', 'Yorkshire_terrier'), ('n02095314', 'wire-haired_fox_terrier'), ('n02095570', 'Lakeland_terrier'), ('n02095889', 'Sealyham_terrier'), ('n02096051', 'Airedale'), ('n02096177', 'cairn'), ('n02096294', 'Australian_terrier'), ('n02096437', 'Dandie_Dinmont'), ('n02096585', 'Boston_bull'), ('n02097047', 'miniature_schnauzer'), ('n02097130', 'giant_schnauzer'), ('n02097209', 'standard_schnauzer'), ('n02097298', 'Scotch_terrier'), ('n02097474', 'Tibetan_terrier'), ('n02097658', 'silky_terrier'), ('n02098105', 'soft-coated_wheaten_terrier'), ('n02098286', 'West_Highland_white_terrier'), ('n02098413', 'Lhasa'), ('n02099267', 'flat-coated_retriever'), ('n02099429', 'curly-coated_retriever'), ('n02099601', 'golden_retriever'), ('n02099712', 'Labrador_retriever'), ('n02099849', 'Chesapeake_Bay_retriever'), ('n02100236', 'German_short-haired_pointer'), ('n02100583', 'vizsla'), ('n02100735', 'English_setter'), ('n02100877', 'Irish_setter'), ('n02101006', 'Gordon_setter'), ('n02101388', 'Brittany_spaniel'), ('n02101556', 'clumber'), ('n02102040', 'English_springer'), ('n02102177', 'Welsh_springer_spaniel'), ('n02102318', 'cocker_spaniel'), ('n02102480', 'Sussex_spaniel'), ('n02102973', 'Irish_water_spaniel'), ('n02104029', 'kuvasz'), ('n02104365', 'schipperke'), ('n02105056', 'groenendael'), ('n02105162', 'malinois'), ('n02105251', 'briard'), ('n02105412', 'kelpie'), ('n02105505', 'komondor'), ('n02105641', 'Old_English_sheepdog'), ('n02105855', 'Shetland_sheepdog'), ('n02106030', 'collie'), ('n02106166', 'Border_collie'), ('n02106382', 'Bouvier_des_Flandres'), ('n02106550', 'Rottweiler'), ('n02106662', 'German_shepherd'), ('n02107142', 'Doberman'), ('n02107312', 'miniature_pinscher'), ('n02107574', 'Greater_Swiss_Mountain_dog'), ('n02107683', 'Bernese_mountain_dog'), ('n02107908', 'Appenzeller'), ('n02108000', 'EntleBucher'), ('n02108089', 'boxer'), ('n02108422', 'bull_mastiff'), ('n02108551', 'Tibetan_mastiff'), ('n02108915', 'French_bulldog'), ('n02109047', 'Great_Dane'), ('n02109525', 'Saint_Bernard'), ('n02109961', 'Eskimo_dog'), ('n02110063', 'malamute'), ('n02110185', 'Siberian_husky'), ('n02110341', 'dalmatian'), ('n02110627', 'affenpinscher'), ('n02110806', 'basenji'), ('n02110958', 'pug'), ('n02111129', 'Leonberg'), ('n02111277', 'Newfoundland'), ('n02111500', 'Great_Pyrenees'), ('n02111889', 'Samoyed'), ('n02112018', 'Pomeranian'), ('n02112137', 'chow'), ('n02112350', 'keeshond'), ('n02112706', 'Brabancon_griffon'), ('n02113023', 'Pembroke'), ('n02113186', 'Cardigan'), ('n02113624', 'toy_poodle'), ('n02113712', 'miniature_poodle'), ('n02113799', 'standard_poodle'), ('n02113978', 
'Mexican_hairless'), ('n02114367', 'timber_wolf'), ('n02114548', 'white_wolf'), ('n02114712', 'red_wolf'), ('n02114855', 'coyote'), ('n02115641', 'dingo'), ('n02115913', 'dhole'), ('n02116738', 'African_hunting_dog'), ('n02117135', 'hyena'), ('n02119022', 'red_fox'), ('n02119789', 'kit_fox'), ('n02120079', 'Arctic_fox'), ('n02120505', 'grey_fox'), ('n02123045', 'tabby'), ('n02123159', 'tiger_cat'), ('n02123394', 'Persian_cat'), ('n02123597', 'Siamese_cat'), ('n02124075', 'Egyptian_cat'), ('n02125311', 'cougar'), ('n02127052', 'lynx'), ('n02128385', 'leopard'), ('n02128757', 'snow_leopard'), ('n02128925', 'jaguar'), ('n02129165', 'lion'), ('n02129604', 'tiger'), ('n02130308', 'cheetah'), ('n02132136', 'brown_bear'), ('n02133161', 'American_black_bear'), ('n02134084', 'ice_bear'), ('n02134418', 'sloth_bear'), ('n02137549', 'mongoose'), ('n02138441', 'meerkat'), ('n02165105', 'tiger_beetle'), ('n02165456', 'ladybug'), ('n02167151', 'ground_beetle'), ('n02168699', 'long-horned_beetle'), ('n02169497', 'leaf_beetle'), ('n02172182', 'dung_beetle'), ('n02174001', 'rhinoceros_beetle'), ('n02177972', 'weevil'), ('n02190166', 'fly'), ('n02206856', 'bee'), ('n02219486', 'ant'), ('n02226429', 'grasshopper'), ('n02229544', 'cricket'), ('n02231487', 'walking_stick'), ('n02233338', 'cockroach'), ('n02236044', 'mantis'), ('n02256656', 'cicada'), ('n02259212', 'leafhopper'), ('n02264363', 'lacewing'), ('n02268443', 'dragonfly'), ('n02268853', 'damselfly'), ('n02276258', 'admiral'), ('n02277742', 'ringlet'), ('n02279972', 'monarch'), ('n02280649', 'cabbage_butterfly'), ('n02281406', 'sulphur_butterfly'), ('n02281787', 'lycaenid'), ('n02317335', 'starfish'), ('n02319095', 'sea_urchin'), ('n02321529', 'sea_cucumber'), ('n02325366', 'wood_rabbit'), ('n02326432', 'hare'), ('n02328150', 'Angora'), ('n02342885', 'hamster'), ('n02346627', 'porcupine'), ('n02356798', 'fox_squirrel'), ('n02361337', 'marmot'), ('n02363005', 'beaver'), ('n02364673', 'guinea_pig'), ('n02389026', 'sorrel'), ('n02391049', 'zebra'), ('n02395406', 'hog'), ('n02396427', 'wild_boar'), ('n02397096', 'warthog'), ('n02398521', 'hippopotamus'), ('n02403003', 'ox'), ('n02408429', 'water_buffalo'), ('n02410509', 'bison'), ('n02412080', 'ram'), ('n02415577', 'bighorn'), ('n02417914', 'ibex'), ('n02422106', 'hartebeest'), ('n02422699', 'impala'), ('n02423022', 'gazelle'), ('n02437312', 'Arabian_camel'), ('n02437616', 'llama'), ('n02441942', 'weasel'), ('n02442845', 'mink'), ('n02443114', 'polecat'), ('n02443484', 'black-footed_ferret'), ('n02444819', 'otter'), ('n02445715', 'skunk'), ('n02447366', 'badger'), ('n02454379', 'armadillo'), ('n02457408', 'three-toed_sloth'), ('n02480495', 'orangutan'), ('n02480855', 'gorilla'), ('n02481823', 'chimpanzee'), ('n02483362', 'gibbon'), ('n02483708', 'siamang'), ('n02484975', 'guenon'), ('n02486261', 'patas'), ('n02486410', 'baboon'), ('n02487347', 'macaque'), ('n02488291', 'langur'), ('n02488702', 'colobus'), ('n02489166', 'proboscis_monkey'), ('n02490219', 'marmoset'), ('n02492035', 'capuchin'), ('n02492660', 'howler_monkey'), ('n02493509', 'titi'), ('n02493793', 'spider_monkey'), ('n02494079', 'squirrel_monkey'), ('n02497673', 'Madagascar_cat'), ('n02500267', 'indri'), ('n02504013', 'Indian_elephant'), ('n02504458', 'African_elephant'), ('n02509815', 'lesser_panda'), ('n02510455', 'giant_panda'), ('n02514041', 'barracouta'), ('n02526121', 'eel'), ('n02536864', 'coho'), ('n02606052', 'rock_beauty'), ('n02607072', 'anemone_fish'), ('n02640242', 'sturgeon'), ('n02641379', 'gar'), ('n02643566', 'lionfish'), 
('n02655020', 'puffer'), ('n02666196', 'abacus'), ('n02667093', 'abaya'), ('n02669723', 'academic_gown'), ('n02672831', 'accordion'), ('n02676566', 'acoustic_guitar'), ('n02687172', 'aircraft_carrier'), ('n02690373', 'airliner'), ('n02692877', 'airship'), ('n02699494', 'altar'), ('n02701002', 'ambulance'), ('n02704792', 'amphibian'), ('n02708093', 'analog_clock'), ('n02727426', 'apiary'), ('n02730930', 'apron'), ('n02747177', 'ashcan'), ('n02749479', 'assault_rifle'), ('n02769748', 'backpack'), ('n02776631', 'bakery'), ('n02777292', 'balance_beam'), ('n02782093', 'balloon'), ('n02783161', 'ballpoint'), ('n02786058', 'Band_Aid'), ('n02787622', 'banjo'), ('n02788148', 'bannister'), ('n02790996', 'barbell'), ('n02791124', 'barber_chair'), ('n02791270', 'barbershop'), ('n02793495', 'barn'), ('n02794156', 'barometer'), ('n02795169', 'barrel'), ('n02797295', 'barrow'), ('n02799071', 'baseball'), ('n02802426', 'basketball'), ('n02804414', 'bassinet'), ('n02804610', 'bassoon'), ('n02807133', 'bathing_cap'), ('n02808304', 'bath_towel'), ('n02808440', 'bathtub'), ('n02814533', 'beach_wagon'), ('n02814860', 'beacon'), ('n02815834', 'beaker'), ('n02817516', 'bearskin'), ('n02823428', 'beer_bottle'), ('n02823750', 'beer_glass'), ('n02825657', 'bell_cote'), ('n02834397', 'bib'), ('n02835271', 'bicycle-built-for-two'), ('n02837789', 'bikini'), ('n02840245', 'binder'), ('n02841315', 'binoculars'), ('n02843684', 'birdhouse'), ('n02859443', 'boathouse'), ('n02860847', 'bobsled'), ('n02865351', 'bolo_tie'), ('n02869837', 'bonnet'), ('n02870880', 'bookcase'), ('n02871525', 'bookshop'), ('n02877765', 'bottlecap'), ('n02879718', 'bow'), ('n02883205', 'bow_tie'), ('n02892201', 'brass'), ('n02892767', 'brassiere'), ('n02894605', 'breakwater'), ('n02895154', 'breastplate'), ('n02906734', 'broom'), ('n02909870', 'bucket'), ('n02910353', 'buckle'), ('n02916936', 'bulletproof_vest'), ('n02917067', 'bullet_train'), ('n02927161', 'butcher_shop'), ('n02930766', 'cab'), ('n02939185', 'caldron'), ('n02948072', 'candle'), ('n02950826', 'cannon'), ('n02951358', 'canoe'), ('n02951585', 'can_opener'), ('n02963159', 'cardigan'), ('n02965783', 'car_mirror'), ('n02966193', 'carousel'), ('n02966687', \"carpenter's_kit\"), ('n02971356', 'carton'), ('n02974003', 'car_wheel'), ('n02977058', 'cash_machine'), ('n02978881', 'cassette'), ('n02979186', 'cassette_player'), ('n02980441', 'castle'), ('n02981792', 'catamaran'), ('n02988304', 'CD_player'), ('n02992211', 'cello'), ('n02992529', 'cellular_telephone'), ('n02999410', 'chain'), ('n03000134', 'chainlink_fence'), ('n03000247', 'chain_mail'), ('n03000684', 'chain_saw'), ('n03014705', 'chest'), ('n03016953', 'chiffonier'), ('n03017168', 'chime'), ('n03018349', 'china_cabinet'), ('n03026506', 'Christmas_stocking'), ('n03028079', 'church'), ('n03032252', 'cinema'), ('n03041632', 'cleaver'), ('n03042490', 'cliff_dwelling'), ('n03045698', 'cloak'), ('n03047690', 'clog'), ('n03062245', 'cocktail_shaker'), ('n03063599', 'coffee_mug'), ('n03063689', 'coffeepot'), ('n03065424', 'coil'), ('n03075370', 'combination_lock'), ('n03085013', 'computer_keyboard'), ('n03089624', 'confectionery'), ('n03095699', 'container_ship'), ('n03100240', 'convertible'), ('n03109150', 'corkscrew'), ('n03110669', 'cornet'), ('n03124043', 'cowboy_boot'), ('n03124170', 'cowboy_hat'), ('n03125729', 'cradle'), ('n03126707', 'crane'), ('n03127747', 'crash_helmet'), ('n03127925', 'crate'), ('n03131574', 'crib'), ('n03133878', 'Crock_Pot'), ('n03134739', 'croquet_ball'), ('n03141823', 'crutch'), ('n03146219', 'cuirass'), 
('n03160309', 'dam'), ('n03179701', 'desk'), ('n03180011', 'desktop_computer'), ('n03187595', 'dial_telephone'), ('n03188531', 'diaper'), ('n03196217', 'digital_clock'), ('n03197337', 'digital_watch'), ('n03201208', 'dining_table'), ('n03207743', 'dishrag'), ('n03207941', 'dishwasher'), ('n03208938', 'disk_brake'), ('n03216828', 'dock'), ('n03218198', 'dogsled'), ('n03220513', 'dome'), ('n03223299', 'doormat'), ('n03240683', 'drilling_platform'), ('n03249569', 'drum'), ('n03250847', 'drumstick'), ('n03255030', 'dumbbell'), ('n03259280', 'Dutch_oven'), ('n03271574', 'electric_fan'), ('n03272010', 'electric_guitar'), ('n03272562', 'electric_locomotive'), ('n03290653', 'entertainment_center'), ('n03291819', 'envelope'), ('n03297495', 'espresso_maker'), ('n03314780', 'face_powder'), ('n03325584', 'feather_boa'), ('n03337140', 'file'), ('n03344393', 'fireboat'), ('n03345487', 'fire_engine'), ('n03347037', 'fire_screen'), ('n03355925', 'flagpole'), ('n03372029', 'flute'), ('n03376595', 'folding_chair'), ('n03379051', 'football_helmet'), ('n03384352', 'forklift'), ('n03388043', 'fountain'), ('n03388183', 'fountain_pen'), ('n03388549', 'four-poster'), ('n03393912', 'freight_car'), ('n03394916', 'French_horn'), ('n03400231', 'frying_pan'), ('n03404251', 'fur_coat'), ('n03417042', 'garbage_truck'), ('n03424325', 'gasmask'), ('n03425413', 'gas_pump'), ('n03443371', 'goblet'), ('n03444034', 'go-kart'), ('n03445777', 'golf_ball'), ('n03445924', 'golfcart'), ('n03447447', 'gondola'), ('n03447721', 'gong'), ('n03450230', 'gown'), ('n03452741', 'grand_piano'), ('n03457902', 'greenhouse'), ('n03459775', 'grille'), ('n03461385', 'grocery_store'), ('n03467068', 'guillotine'), ('n03476684', 'hair_slide'), ('n03476991', 'hair_spray'), ('n03478589', 'half_track'), ('n03481172', 'hammer'), ('n03482405', 'hamper'), ('n03483316', 'hand_blower'), ('n03485407', 'hand-held_computer'), ('n03485794', 'handkerchief'), ('n03492542', 'hard_disc'), ('n03494278', 'harmonica'), ('n03495258', 'harp'), ('n03496892', 'harvester'), ('n03498962', 'hatchet'), ('n03527444', 'holster'), ('n03529860', 'home_theater'), ('n03530642', 'honeycomb'), ('n03532672', 'hook'), ('n03534580', 'hoopskirt'), ('n03535780', 'horizontal_bar'), ('n03538406', 'horse_cart'), ('n03544143', 'hourglass'), ('n03584254', 'iPod'), ('n03584829', 'iron'), ('n03590841', \"jack-o'-lantern\"), ('n03594734', 'jean'), ('n03594945', 'jeep'), ('n03595614', 'jersey'), ('n03598930', 'jigsaw_puzzle'), ('n03599486', 'jinrikisha'), ('n03602883', 'joystick'), ('n03617480', 'kimono'), ('n03623198', 'knee_pad'), ('n03627232', 'knot'), ('n03630383', 'lab_coat'), ('n03633091', 'ladle'), ('n03637318', 'lampshade'), ('n03642806', 'laptop'), ('n03649909', 'lawn_mower'), ('n03657121', 'lens_cap'), ('n03658185', 'letter_opener'), ('n03661043', 'library'), ('n03662601', 'lifeboat'), ('n03666591', 'lighter'), ('n03670208', 'limousine'), ('n03673027', 'liner'), ('n03676483', 'lipstick'), ('n03680355', 'Loafer'), ('n03690938', 'lotion'), ('n03691459', 'loudspeaker'), ('n03692522', 'loupe'), ('n03697007', 'lumbermill'), ('n03706229', 'magnetic_compass'), ('n03709823', 'mailbag'), ('n03710193', 'mailbox'), ('n03710637', 'maillot'), ('n03710721', 'maillot'), ('n03717622', 'manhole_cover'), ('n03720891', 'maraca'), ('n03721384', 'marimba'), ('n03724870', 'mask'), ('n03729826', 'matchstick'), ('n03733131', 'maypole'), ('n03733281', 'maze'), ('n03733805', 'measuring_cup'), ('n03742115', 'medicine_chest'), ('n03743016', 'megalith'), ('n03759954', 'microphone'), ('n03761084', 'microwave'), 
('n03763968', 'military_uniform'), ('n03764736', 'milk_can'), ('n03769881', 'minibus'), ('n03770439', 'miniskirt'), ('n03770679', 'minivan'), ('n03773504', 'missile'), ('n03775071', 'mitten'), ('n03775546', 'mixing_bowl'), ('n03776460', 'mobile_home'), ('n03777568', 'Model_T'), ('n03777754', 'modem'), ('n03781244', 'monastery'), ('n03782006', 'monitor'), ('n03785016', 'moped'), ('n03786901', 'mortar'), ('n03787032', 'mortarboard'), ('n03788195', 'mosque'), ('n03788365', 'mosquito_net'), ('n03791053', 'motor_scooter'), ('n03792782', 'mountain_bike'), ('n03792972', 'mountain_tent'), ('n03793489', 'mouse'), ('n03794056', 'mousetrap'), ('n03796401', 'moving_van'), ('n03803284', 'muzzle'), ('n03804744', 'nail'), ('n03814639', 'neck_brace'), ('n03814906', 'necklace'), ('n03825788', 'nipple'), ('n03832673', 'notebook'), ('n03837869', 'obelisk'), ('n03838899', 'oboe'), ('n03840681', 'ocarina'), ('n03841143', 'odometer'), ('n03843555', 'oil_filter'), ('n03854065', 'organ'), ('n03857828', 'oscilloscope'), ('n03866082', 'overskirt'), ('n03868242', 'oxcart'), ('n03868863', 'oxygen_mask'), ('n03871628', 'packet'), ('n03873416', 'paddle'), ('n03874293', 'paddlewheel'), ('n03874599', 'padlock'), ('n03876231', 'paintbrush'), ('n03877472', 'pajama'), ('n03877845', 'palace'), ('n03884397', 'panpipe'), ('n03887697', 'paper_towel'), ('n03888257', 'parachute'), ('n03888605', 'parallel_bars'), ('n03891251', 'park_bench'), ('n03891332', 'parking_meter'), ('n03895866', 'passenger_car'), ('n03899768', 'patio'), ('n03902125', 'pay-phone'), ('n03903868', 'pedestal'), ('n03908618', 'pencil_box'), ('n03908714', 'pencil_sharpener'), ('n03916031', 'perfume'), ('n03920288', 'Petri_dish'), ('n03924679', 'photocopier'), ('n03929660', 'pick'), ('n03929855', 'pickelhaube'), ('n03930313', 'picket_fence'), ('n03930630', 'pickup'), ('n03933933', 'pier'), ('n03935335', 'piggy_bank'), ('n03937543', 'pill_bottle'), ('n03938244', 'pillow'), ('n03942813', 'ping-pong_ball'), ('n03944341', 'pinwheel'), ('n03947888', 'pirate'), ('n03950228', 'pitcher'), ('n03954731', 'plane'), ('n03956157', 'planetarium'), ('n03958227', 'plastic_bag'), ('n03961711', 'plate_rack'), ('n03967562', 'plow'), ('n03970156', 'plunger'), ('n03976467', 'Polaroid_camera'), ('n03976657', 'pole'), ('n03977966', 'police_van'), ('n03980874', 'poncho'), ('n03982430', 'pool_table'), ('n03983396', 'pop_bottle'), ('n03991062', 'pot'), ('n03992509', \"potter's_wheel\"), ('n03995372', 'power_drill'), ('n03998194', 'prayer_rug'), ('n04004767', 'printer'), ('n04005630', 'prison'), ('n04008634', 'projectile'), ('n04009552', 'projector'), ('n04019541', 'puck'), ('n04023962', 'punching_bag'), ('n04026417', 'purse'), ('n04033901', 'quill'), ('n04033995', 'quilt'), ('n04037443', 'racer'), ('n04039381', 'racket'), ('n04040759', 'radiator'), ('n04041544', 'radio'), ('n04044716', 'radio_telescope'), ('n04049303', 'rain_barrel'), ('n04065272', 'recreational_vehicle'), ('n04067472', 'reel'), ('n04069434', 'reflex_camera'), ('n04070727', 'refrigerator'), ('n04074963', 'remote_control'), ('n04081281', 'restaurant'), ('n04086273', 'revolver'), ('n04090263', 'rifle'), ('n04099969', 'rocking_chair'), ('n04111531', 'rotisserie'), ('n04116512', 'rubber_eraser'), ('n04118538', 'rugby_ball'), ('n04118776', 'rule'), ('n04120489', 'running_shoe'), ('n04125021', 'safe'), ('n04127249', 'safety_pin'), ('n04131690', 'saltshaker'), ('n04133789', 'sandal'), ('n04136333', 'sarong'), ('n04141076', 'sax'), ('n04141327', 'scabbard'), ('n04141975', 'scale'), ('n04146614', 'school_bus'), ('n04147183', 
'schooner'), ('n04149813', 'scoreboard'), ('n04152593', 'screen'), ('n04153751', 'screw'), ('n04154565', 'screwdriver'), ('n04162706', 'seat_belt'), ('n04179913', 'sewing_machine'), ('n04192698', 'shield'), ('n04200800', 'shoe_shop'), ('n04201297', 'shoji'), ('n04204238', 'shopping_basket'), ('n04204347', 'shopping_cart'), ('n04208210', 'shovel'), ('n04209133', 'shower_cap'), ('n04209239', 'shower_curtain'), ('n04228054', 'ski'), ('n04229816', 'ski_mask'), ('n04235860', 'sleeping_bag'), ('n04238763', 'slide_rule'), ('n04239074', 'sliding_door'), ('n04243546', 'slot'), ('n04251144', 'snorkel'), ('n04252077', 'snowmobile'), ('n04252225', 'snowplow'), ('n04254120', 'soap_dispenser'), ('n04254680', 'soccer_ball'), ('n04254777', 'sock'), ('n04258138', 'solar_dish'), ('n04259630', 'sombrero'), ('n04263257', 'soup_bowl'), ('n04264628', 'space_bar'), ('n04265275', 'space_heater'), ('n04266014', 'space_shuttle'), ('n04270147', 'spatula'), ('n04273569', 'speedboat'), ('n04275548', 'spider_web'), ('n04277352', 'spindle'), ('n04285008', 'sports_car'), ('n04286575', 'spotlight'), ('n04296562', 'stage'), ('n04310018', 'steam_locomotive'), ('n04311004', 'steel_arch_bridge'), ('n04311174', 'steel_drum'), ('n04317175', 'stethoscope'), ('n04325704', 'stole'), ('n04326547', 'stone_wall'), ('n04328186', 'stopwatch'), ('n04330267', 'stove'), ('n04332243', 'strainer'), ('n04335435', 'streetcar'), ('n04336792', 'stretcher'), ('n04344873', 'studio_couch'), ('n04346328', 'stupa'), ('n04347754', 'submarine'), ('n04350905', 'suit'), ('n04355338', 'sundial'), ('n04355933', 'sunglass'), ('n04356056', 'sunglasses'), ('n04357314', 'sunscreen'), ('n04366367', 'suspension_bridge'), ('n04367480', 'swab'), ('n04370456', 'sweatshirt'), ('n04371430', 'swimming_trunks'), ('n04371774', 'swing'), ('n04372370', 'switch'), ('n04376876', 'syringe'), ('n04380533', 'table_lamp'), ('n04389033', 'tank'), ('n04392985', 'tape_player'), ('n04398044', 'teapot'), ('n04399382', 'teddy'), ('n04404412', 'television'), ('n04409515', 'tennis_ball'), ('n04417672', 'thatch'), ('n04418357', 'theater_curtain'), ('n04423845', 'thimble'), ('n04428191', 'thresher'), ('n04429376', 'throne'), ('n04435653', 'tile_roof'), ('n04442312', 'toaster'), ('n04443257', 'tobacco_shop'), ('n04447861', 'toilet_seat'), ('n04456115', 'torch'), ('n04458633', 'totem_pole'), ('n04461696', 'tow_truck'), ('n04462240', 'toyshop'), ('n04465501', 'tractor'), ('n04467665', 'trailer_truck'), ('n04476259', 'tray'), ('n04479046', 'trench_coat'), ('n04482393', 'tricycle'), ('n04483307', 'trimaran'), ('n04485082', 'tripod'), ('n04486054', 'triumphal_arch'), ('n04487081', 'trolleybus'), ('n04487394', 'trombone'), ('n04493381', 'tub'), ('n04501370', 'turnstile'), ('n04505470', 'typewriter_keyboard'), ('n04507155', 'umbrella'), ('n04509417', 'unicycle'), ('n04515003', 'upright'), ('n04517823', 'vacuum'), ('n04522168', 'vase'), ('n04523525', 'vault'), ('n04525038', 'velvet'), ('n04525305', 'vending_machine'), ('n04532106', 'vestment'), ('n04532670', 'viaduct'), ('n04536866', 'violin'), ('n04540053', 'volleyball'), ('n04542943', 'waffle_iron'), ('n04548280', 'wall_clock'), ('n04548362', 'wallet'), ('n04550184', 'wardrobe'), ('n04552348', 'warplane'), ('n04553703', 'washbasin'), ('n04554684', 'washer'), ('n04557648', 'water_bottle'), ('n04560804', 'water_jug'), ('n04562935', 'water_tower'), ('n04579145', 'whiskey_jug'), ('n04579432', 'whistle'), ('n04584207', 'wig'), ('n04589890', 'window_screen'), ('n04590129', 'window_shade'), ('n04591157', 'Windsor_tie'), ('n04591713', 'wine_bottle'), 
('n04592741', 'wing'), ('n04596742', 'wok'), ('n04597913', 'wooden_spoon'), ('n04599235', 'wool'), ('n04604644', 'worm_fence'), ('n04606251', 'wreck'), ('n04612504', 'yawl'), ('n04613696', 'yurt'), ('n06359193', 'web_site'), ('n06596364', 'comic_book'), ('n06785654', 'crossword_puzzle'), ('n06794110', 'street_sign'), ('n06874185', 'traffic_light'), ('n07248320', 'book_jacket'), ('n07565083', 'menu'), ('n07579787', 'plate'), ('n07583066', 'guacamole'), ('n07584110', 'consomme'), ('n07590611', 'hot_pot'), ('n07613480', 'trifle'), ('n07614500', 'ice_cream'), ('n07615774', 'ice_lolly'), ('n07684084', 'French_loaf'), ('n07693725', 'bagel'), ('n07695742', 'pretzel'), ('n07697313', 'cheeseburger'), ('n07697537', 'hotdog'), ('n07711569', 'mashed_potato'), ('n07714571', 'head_cabbage'), ('n07714990', 'broccoli'), ('n07715103', 'cauliflower'), ('n07716358', 'zucchini'), ('n07716906', 'spaghetti_squash'), ('n07717410', 'acorn_squash'), ('n07717556', 'butternut_squash'), ('n07718472', 'cucumber'), ('n07718747', 'artichoke'), ('n07720875', 'bell_pepper'), ('n07730033', 'cardoon'), ('n07734744', 'mushroom'), ('n07742313', 'Granny_Smith'), ('n07745940', 'strawberry'), ('n07747607', 'orange'), ('n07749582', 'lemon'), ('n07753113', 'fig'), ('n07753275', 'pineapple'), ('n07753592', 'banana'), ('n07754684', 'jackfruit'), ('n07760859', 'custard_apple'), ('n07768694', 'pomegranate'), ('n07802026', 'hay'), ('n07831146', 'carbonara'), ('n07836838', 'chocolate_sauce'), ('n07860988', 'dough'), ('n07871810', 'meat_loaf'), ('n07873807', 'pizza'), ('n07875152', 'potpie'), ('n07880968', 'burrito'), ('n07892512', 'red_wine'), ('n07920052', 'espresso'), ('n07930864', 'cup'), ('n07932039', 'eggnog'), ('n09193705', 'alp'), ('n09229709', 'bubble'), ('n09246464', 'cliff'), ('n09256479', 'coral_reef'), ('n09288635', 'geyser'), ('n09332890', 'lakeside'), ('n09399592', 'promontory'), ('n09421951', 'sandbar'), ('n09428293', 'seashore'), ('n09468604', 'valley'), ('n09472597', 'volcano'), ('n09835506', 'ballplayer'), ('n10148035', 'groom'), ('n10565667', 'scuba_diver'), ('n11879895', 'rapeseed'), ('n11939491', 'daisy'), ('n12057211', \"yellow_lady's_slipper\"), ('n12144580', 'corn'), ('n12267677', 'acorn'), ('n12620546', 'hip'), ('n12768682', 'buckeye'), ('n12985857', 'coral_fungus'), ('n12998815', 'agaric'), ('n13037406', 'gyromitra'), ('n13040303', 'stinkhorn'), ('n13044778', 'earthstar'), ('n13052670', 'hen-of-the-woods'), ('n13054560', 'bolete'), ('n13133613', 'ear'), ('n15075141', 'toilet_tissue')]\n", + "classes = [\n", + " (\"n01440764\", \"tench\"),\n", + " (\"n01443537\", \"goldfish\"),\n", + " (\"n01484850\", \"great_white_shark\"),\n", + " (\"n01491361\", \"tiger_shark\"),\n", + " (\"n01494475\", \"hammerhead\"),\n", + " (\"n01496331\", \"electric_ray\"),\n", + " (\"n01498041\", \"stingray\"),\n", + " (\"n01514668\", \"cock\"),\n", + " (\"n01514859\", \"hen\"),\n", + " (\"n01518878\", \"ostrich\"),\n", + " (\"n01530575\", \"brambling\"),\n", + " (\"n01531178\", \"goldfinch\"),\n", + " (\"n01532829\", \"house_finch\"),\n", + " (\"n01534433\", \"junco\"),\n", + " (\"n01537544\", \"indigo_bunting\"),\n", + " (\"n01558993\", \"robin\"),\n", + " (\"n01560419\", \"bulbul\"),\n", + " (\"n01580077\", \"jay\"),\n", + " (\"n01582220\", \"magpie\"),\n", + " (\"n01592084\", \"chickadee\"),\n", + " (\"n01601694\", \"water_ouzel\"),\n", + " (\"n01608432\", \"kite\"),\n", + " (\"n01614925\", \"bald_eagle\"),\n", + " (\"n01616318\", \"vulture\"),\n", + " (\"n01622779\", \"great_grey_owl\"),\n", + " (\"n01629819\", 
\"European_fire_salamander\"),\n", + " (\"n01630670\", \"common_newt\"),\n", + " (\"n01631663\", \"eft\"),\n", + " (\"n01632458\", \"spotted_salamander\"),\n", + " (\"n01632777\", \"axolotl\"),\n", + " (\"n01641577\", \"bullfrog\"),\n", + " (\"n01644373\", \"tree_frog\"),\n", + " (\"n01644900\", \"tailed_frog\"),\n", + " (\"n01664065\", \"loggerhead\"),\n", + " (\"n01665541\", \"leatherback_turtle\"),\n", + " (\"n01667114\", \"mud_turtle\"),\n", + " (\"n01667778\", \"terrapin\"),\n", + " (\"n01669191\", \"box_turtle\"),\n", + " (\"n01675722\", \"banded_gecko\"),\n", + " (\"n01677366\", \"common_iguana\"),\n", + " (\"n01682714\", \"American_chameleon\"),\n", + " (\"n01685808\", \"whiptail\"),\n", + " (\"n01687978\", \"agama\"),\n", + " (\"n01688243\", \"frilled_lizard\"),\n", + " (\"n01689811\", \"alligator_lizard\"),\n", + " (\"n01692333\", \"Gila_monster\"),\n", + " (\"n01693334\", \"green_lizard\"),\n", + " (\"n01694178\", \"African_chameleon\"),\n", + " (\"n01695060\", \"Komodo_dragon\"),\n", + " (\"n01697457\", \"African_crocodile\"),\n", + " (\"n01698640\", \"American_alligator\"),\n", + " (\"n01704323\", \"triceratops\"),\n", + " (\"n01728572\", \"thunder_snake\"),\n", + " (\"n01728920\", \"ringneck_snake\"),\n", + " (\"n01729322\", \"hognose_snake\"),\n", + " (\"n01729977\", \"green_snake\"),\n", + " (\"n01734418\", \"king_snake\"),\n", + " (\"n01735189\", \"garter_snake\"),\n", + " (\"n01737021\", \"water_snake\"),\n", + " (\"n01739381\", \"vine_snake\"),\n", + " (\"n01740131\", \"night_snake\"),\n", + " (\"n01742172\", \"boa_constrictor\"),\n", + " (\"n01744401\", \"rock_python\"),\n", + " (\"n01748264\", \"Indian_cobra\"),\n", + " (\"n01749939\", \"green_mamba\"),\n", + " (\"n01751748\", \"sea_snake\"),\n", + " (\"n01753488\", \"horned_viper\"),\n", + " (\"n01755581\", \"diamondback\"),\n", + " (\"n01756291\", \"sidewinder\"),\n", + " (\"n01768244\", \"trilobite\"),\n", + " (\"n01770081\", \"harvestman\"),\n", + " (\"n01770393\", \"scorpion\"),\n", + " (\"n01773157\", \"black_and_gold_garden_spider\"),\n", + " (\"n01773549\", \"barn_spider\"),\n", + " (\"n01773797\", \"garden_spider\"),\n", + " (\"n01774384\", \"black_widow\"),\n", + " (\"n01774750\", \"tarantula\"),\n", + " (\"n01775062\", \"wolf_spider\"),\n", + " (\"n01776313\", \"tick\"),\n", + " (\"n01784675\", \"centipede\"),\n", + " (\"n01795545\", \"black_grouse\"),\n", + " (\"n01796340\", \"ptarmigan\"),\n", + " (\"n01797886\", \"ruffed_grouse\"),\n", + " (\"n01798484\", \"prairie_chicken\"),\n", + " (\"n01806143\", \"peacock\"),\n", + " (\"n01806567\", \"quail\"),\n", + " (\"n01807496\", \"partridge\"),\n", + " (\"n01817953\", \"African_grey\"),\n", + " (\"n01818515\", \"macaw\"),\n", + " (\"n01819313\", \"sulphur-crested_cockatoo\"),\n", + " (\"n01820546\", \"lorikeet\"),\n", + " (\"n01824575\", \"coucal\"),\n", + " (\"n01828970\", \"bee_eater\"),\n", + " (\"n01829413\", \"hornbill\"),\n", + " (\"n01833805\", \"hummingbird\"),\n", + " (\"n01843065\", \"jacamar\"),\n", + " (\"n01843383\", \"toucan\"),\n", + " (\"n01847000\", \"drake\"),\n", + " (\"n01855032\", \"red-breasted_merganser\"),\n", + " (\"n01855672\", \"goose\"),\n", + " (\"n01860187\", \"black_swan\"),\n", + " (\"n01871265\", \"tusker\"),\n", + " (\"n01872401\", \"echidna\"),\n", + " (\"n01873310\", \"platypus\"),\n", + " (\"n01877812\", \"wallaby\"),\n", + " (\"n01882714\", \"koala\"),\n", + " (\"n01883070\", \"wombat\"),\n", + " (\"n01910747\", \"jellyfish\"),\n", + " (\"n01914609\", \"sea_anemone\"),\n", + " (\"n01917289\", \"brain_coral\"),\n", + " 
(\"n01924916\", \"flatworm\"),\n", + " (\"n01930112\", \"nematode\"),\n", + " (\"n01943899\", \"conch\"),\n", + " (\"n01944390\", \"snail\"),\n", + " (\"n01945685\", \"slug\"),\n", + " (\"n01950731\", \"sea_slug\"),\n", + " (\"n01955084\", \"chiton\"),\n", + " (\"n01968897\", \"chambered_nautilus\"),\n", + " (\"n01978287\", \"Dungeness_crab\"),\n", + " (\"n01978455\", \"rock_crab\"),\n", + " (\"n01980166\", \"fiddler_crab\"),\n", + " (\"n01981276\", \"king_crab\"),\n", + " (\"n01983481\", \"American_lobster\"),\n", + " (\"n01984695\", \"spiny_lobster\"),\n", + " (\"n01985128\", \"crayfish\"),\n", + " (\"n01986214\", \"hermit_crab\"),\n", + " (\"n01990800\", \"isopod\"),\n", + " (\"n02002556\", \"white_stork\"),\n", + " (\"n02002724\", \"black_stork\"),\n", + " (\"n02006656\", \"spoonbill\"),\n", + " (\"n02007558\", \"flamingo\"),\n", + " (\"n02009229\", \"little_blue_heron\"),\n", + " (\"n02009912\", \"American_egret\"),\n", + " (\"n02011460\", \"bittern\"),\n", + " (\"n02012849\", \"crane\"),\n", + " (\"n02013706\", \"limpkin\"),\n", + " (\"n02017213\", \"European_gallinule\"),\n", + " (\"n02018207\", \"American_coot\"),\n", + " (\"n02018795\", \"bustard\"),\n", + " (\"n02025239\", \"ruddy_turnstone\"),\n", + " (\"n02027492\", \"red-backed_sandpiper\"),\n", + " (\"n02028035\", \"redshank\"),\n", + " (\"n02033041\", \"dowitcher\"),\n", + " (\"n02037110\", \"oystercatcher\"),\n", + " (\"n02051845\", \"pelican\"),\n", + " (\"n02056570\", \"king_penguin\"),\n", + " (\"n02058221\", \"albatross\"),\n", + " (\"n02066245\", \"grey_whale\"),\n", + " (\"n02071294\", \"killer_whale\"),\n", + " (\"n02074367\", \"dugong\"),\n", + " (\"n02077923\", \"sea_lion\"),\n", + " (\"n02085620\", \"Chihuahua\"),\n", + " (\"n02085782\", \"Japanese_spaniel\"),\n", + " (\"n02085936\", \"Maltese_dog\"),\n", + " (\"n02086079\", \"Pekinese\"),\n", + " (\"n02086240\", \"Shih-Tzu\"),\n", + " (\"n02086646\", \"Blenheim_spaniel\"),\n", + " (\"n02086910\", \"papillon\"),\n", + " (\"n02087046\", \"toy_terrier\"),\n", + " (\"n02087394\", \"Rhodesian_ridgeback\"),\n", + " (\"n02088094\", \"Afghan_hound\"),\n", + " (\"n02088238\", \"basset\"),\n", + " (\"n02088364\", \"beagle\"),\n", + " (\"n02088466\", \"bloodhound\"),\n", + " (\"n02088632\", \"bluetick\"),\n", + " (\"n02089078\", \"black-and-tan_coonhound\"),\n", + " (\"n02089867\", \"Walker_hound\"),\n", + " (\"n02089973\", \"English_foxhound\"),\n", + " (\"n02090379\", \"redbone\"),\n", + " (\"n02090622\", \"borzoi\"),\n", + " (\"n02090721\", \"Irish_wolfhound\"),\n", + " (\"n02091032\", \"Italian_greyhound\"),\n", + " (\"n02091134\", \"whippet\"),\n", + " (\"n02091244\", \"Ibizan_hound\"),\n", + " (\"n02091467\", \"Norwegian_elkhound\"),\n", + " (\"n02091635\", \"otterhound\"),\n", + " (\"n02091831\", \"Saluki\"),\n", + " (\"n02092002\", \"Scottish_deerhound\"),\n", + " (\"n02092339\", \"Weimaraner\"),\n", + " (\"n02093256\", \"Staffordshire_bullterrier\"),\n", + " (\"n02093428\", \"American_Staffordshire_terrier\"),\n", + " (\"n02093647\", \"Bedlington_terrier\"),\n", + " (\"n02093754\", \"Border_terrier\"),\n", + " (\"n02093859\", \"Kerry_blue_terrier\"),\n", + " (\"n02093991\", \"Irish_terrier\"),\n", + " (\"n02094114\", \"Norfolk_terrier\"),\n", + " (\"n02094258\", \"Norwich_terrier\"),\n", + " (\"n02094433\", \"Yorkshire_terrier\"),\n", + " (\"n02095314\", \"wire-haired_fox_terrier\"),\n", + " (\"n02095570\", \"Lakeland_terrier\"),\n", + " (\"n02095889\", \"Sealyham_terrier\"),\n", + " (\"n02096051\", \"Airedale\"),\n", + " (\"n02096177\", \"cairn\"),\n", + " 
(\"n02096294\", \"Australian_terrier\"),\n", + " (\"n02096437\", \"Dandie_Dinmont\"),\n", + " (\"n02096585\", \"Boston_bull\"),\n", + " (\"n02097047\", \"miniature_schnauzer\"),\n", + " (\"n02097130\", \"giant_schnauzer\"),\n", + " (\"n02097209\", \"standard_schnauzer\"),\n", + " (\"n02097298\", \"Scotch_terrier\"),\n", + " (\"n02097474\", \"Tibetan_terrier\"),\n", + " (\"n02097658\", \"silky_terrier\"),\n", + " (\"n02098105\", \"soft-coated_wheaten_terrier\"),\n", + " (\"n02098286\", \"West_Highland_white_terrier\"),\n", + " (\"n02098413\", \"Lhasa\"),\n", + " (\"n02099267\", \"flat-coated_retriever\"),\n", + " (\"n02099429\", \"curly-coated_retriever\"),\n", + " (\"n02099601\", \"golden_retriever\"),\n", + " (\"n02099712\", \"Labrador_retriever\"),\n", + " (\"n02099849\", \"Chesapeake_Bay_retriever\"),\n", + " (\"n02100236\", \"German_short-haired_pointer\"),\n", + " (\"n02100583\", \"vizsla\"),\n", + " (\"n02100735\", \"English_setter\"),\n", + " (\"n02100877\", \"Irish_setter\"),\n", + " (\"n02101006\", \"Gordon_setter\"),\n", + " (\"n02101388\", \"Brittany_spaniel\"),\n", + " (\"n02101556\", \"clumber\"),\n", + " (\"n02102040\", \"English_springer\"),\n", + " (\"n02102177\", \"Welsh_springer_spaniel\"),\n", + " (\"n02102318\", \"cocker_spaniel\"),\n", + " (\"n02102480\", \"Sussex_spaniel\"),\n", + " (\"n02102973\", \"Irish_water_spaniel\"),\n", + " (\"n02104029\", \"kuvasz\"),\n", + " (\"n02104365\", \"schipperke\"),\n", + " (\"n02105056\", \"groenendael\"),\n", + " (\"n02105162\", \"malinois\"),\n", + " (\"n02105251\", \"briard\"),\n", + " (\"n02105412\", \"kelpie\"),\n", + " (\"n02105505\", \"komondor\"),\n", + " (\"n02105641\", \"Old_English_sheepdog\"),\n", + " (\"n02105855\", \"Shetland_sheepdog\"),\n", + " (\"n02106030\", \"collie\"),\n", + " (\"n02106166\", \"Border_collie\"),\n", + " (\"n02106382\", \"Bouvier_des_Flandres\"),\n", + " (\"n02106550\", \"Rottweiler\"),\n", + " (\"n02106662\", \"German_shepherd\"),\n", + " (\"n02107142\", \"Doberman\"),\n", + " (\"n02107312\", \"miniature_pinscher\"),\n", + " (\"n02107574\", \"Greater_Swiss_Mountain_dog\"),\n", + " (\"n02107683\", \"Bernese_mountain_dog\"),\n", + " (\"n02107908\", \"Appenzeller\"),\n", + " (\"n02108000\", \"EntleBucher\"),\n", + " (\"n02108089\", \"boxer\"),\n", + " (\"n02108422\", \"bull_mastiff\"),\n", + " (\"n02108551\", \"Tibetan_mastiff\"),\n", + " (\"n02108915\", \"French_bulldog\"),\n", + " (\"n02109047\", \"Great_Dane\"),\n", + " (\"n02109525\", \"Saint_Bernard\"),\n", + " (\"n02109961\", \"Eskimo_dog\"),\n", + " (\"n02110063\", \"malamute\"),\n", + " (\"n02110185\", \"Siberian_husky\"),\n", + " (\"n02110341\", \"dalmatian\"),\n", + " (\"n02110627\", \"affenpinscher\"),\n", + " (\"n02110806\", \"basenji\"),\n", + " (\"n02110958\", \"pug\"),\n", + " (\"n02111129\", \"Leonberg\"),\n", + " (\"n02111277\", \"Newfoundland\"),\n", + " (\"n02111500\", \"Great_Pyrenees\"),\n", + " (\"n02111889\", \"Samoyed\"),\n", + " (\"n02112018\", \"Pomeranian\"),\n", + " (\"n02112137\", \"chow\"),\n", + " (\"n02112350\", \"keeshond\"),\n", + " (\"n02112706\", \"Brabancon_griffon\"),\n", + " (\"n02113023\", \"Pembroke\"),\n", + " (\"n02113186\", \"Cardigan\"),\n", + " (\"n02113624\", \"toy_poodle\"),\n", + " (\"n02113712\", \"miniature_poodle\"),\n", + " (\"n02113799\", \"standard_poodle\"),\n", + " (\"n02113978\", \"Mexican_hairless\"),\n", + " (\"n02114367\", \"timber_wolf\"),\n", + " (\"n02114548\", \"white_wolf\"),\n", + " (\"n02114712\", \"red_wolf\"),\n", + " (\"n02114855\", \"coyote\"),\n", + " (\"n02115641\", 
\"dingo\"),\n", + " (\"n02115913\", \"dhole\"),\n", + " (\"n02116738\", \"African_hunting_dog\"),\n", + " (\"n02117135\", \"hyena\"),\n", + " (\"n02119022\", \"red_fox\"),\n", + " (\"n02119789\", \"kit_fox\"),\n", + " (\"n02120079\", \"Arctic_fox\"),\n", + " (\"n02120505\", \"grey_fox\"),\n", + " (\"n02123045\", \"tabby\"),\n", + " (\"n02123159\", \"tiger_cat\"),\n", + " (\"n02123394\", \"Persian_cat\"),\n", + " (\"n02123597\", \"Siamese_cat\"),\n", + " (\"n02124075\", \"Egyptian_cat\"),\n", + " (\"n02125311\", \"cougar\"),\n", + " (\"n02127052\", \"lynx\"),\n", + " (\"n02128385\", \"leopard\"),\n", + " (\"n02128757\", \"snow_leopard\"),\n", + " (\"n02128925\", \"jaguar\"),\n", + " (\"n02129165\", \"lion\"),\n", + " (\"n02129604\", \"tiger\"),\n", + " (\"n02130308\", \"cheetah\"),\n", + " (\"n02132136\", \"brown_bear\"),\n", + " (\"n02133161\", \"American_black_bear\"),\n", + " (\"n02134084\", \"ice_bear\"),\n", + " (\"n02134418\", \"sloth_bear\"),\n", + " (\"n02137549\", \"mongoose\"),\n", + " (\"n02138441\", \"meerkat\"),\n", + " (\"n02165105\", \"tiger_beetle\"),\n", + " (\"n02165456\", \"ladybug\"),\n", + " (\"n02167151\", \"ground_beetle\"),\n", + " (\"n02168699\", \"long-horned_beetle\"),\n", + " (\"n02169497\", \"leaf_beetle\"),\n", + " (\"n02172182\", \"dung_beetle\"),\n", + " (\"n02174001\", \"rhinoceros_beetle\"),\n", + " (\"n02177972\", \"weevil\"),\n", + " (\"n02190166\", \"fly\"),\n", + " (\"n02206856\", \"bee\"),\n", + " (\"n02219486\", \"ant\"),\n", + " (\"n02226429\", \"grasshopper\"),\n", + " (\"n02229544\", \"cricket\"),\n", + " (\"n02231487\", \"walking_stick\"),\n", + " (\"n02233338\", \"cockroach\"),\n", + " (\"n02236044\", \"mantis\"),\n", + " (\"n02256656\", \"cicada\"),\n", + " (\"n02259212\", \"leafhopper\"),\n", + " (\"n02264363\", \"lacewing\"),\n", + " (\"n02268443\", \"dragonfly\"),\n", + " (\"n02268853\", \"damselfly\"),\n", + " (\"n02276258\", \"admiral\"),\n", + " (\"n02277742\", \"ringlet\"),\n", + " (\"n02279972\", \"monarch\"),\n", + " (\"n02280649\", \"cabbage_butterfly\"),\n", + " (\"n02281406\", \"sulphur_butterfly\"),\n", + " (\"n02281787\", \"lycaenid\"),\n", + " (\"n02317335\", \"starfish\"),\n", + " (\"n02319095\", \"sea_urchin\"),\n", + " (\"n02321529\", \"sea_cucumber\"),\n", + " (\"n02325366\", \"wood_rabbit\"),\n", + " (\"n02326432\", \"hare\"),\n", + " (\"n02328150\", \"Angora\"),\n", + " (\"n02342885\", \"hamster\"),\n", + " (\"n02346627\", \"porcupine\"),\n", + " (\"n02356798\", \"fox_squirrel\"),\n", + " (\"n02361337\", \"marmot\"),\n", + " (\"n02363005\", \"beaver\"),\n", + " (\"n02364673\", \"guinea_pig\"),\n", + " (\"n02389026\", \"sorrel\"),\n", + " (\"n02391049\", \"zebra\"),\n", + " (\"n02395406\", \"hog\"),\n", + " (\"n02396427\", \"wild_boar\"),\n", + " (\"n02397096\", \"warthog\"),\n", + " (\"n02398521\", \"hippopotamus\"),\n", + " (\"n02403003\", \"ox\"),\n", + " (\"n02408429\", \"water_buffalo\"),\n", + " (\"n02410509\", \"bison\"),\n", + " (\"n02412080\", \"ram\"),\n", + " (\"n02415577\", \"bighorn\"),\n", + " (\"n02417914\", \"ibex\"),\n", + " (\"n02422106\", \"hartebeest\"),\n", + " (\"n02422699\", \"impala\"),\n", + " (\"n02423022\", \"gazelle\"),\n", + " (\"n02437312\", \"Arabian_camel\"),\n", + " (\"n02437616\", \"llama\"),\n", + " (\"n02441942\", \"weasel\"),\n", + " (\"n02442845\", \"mink\"),\n", + " (\"n02443114\", \"polecat\"),\n", + " (\"n02443484\", \"black-footed_ferret\"),\n", + " (\"n02444819\", \"otter\"),\n", + " (\"n02445715\", \"skunk\"),\n", + " (\"n02447366\", \"badger\"),\n", + " (\"n02454379\", 
\"armadillo\"),\n", + " (\"n02457408\", \"three-toed_sloth\"),\n", + " (\"n02480495\", \"orangutan\"),\n", + " (\"n02480855\", \"gorilla\"),\n", + " (\"n02481823\", \"chimpanzee\"),\n", + " (\"n02483362\", \"gibbon\"),\n", + " (\"n02483708\", \"siamang\"),\n", + " (\"n02484975\", \"guenon\"),\n", + " (\"n02486261\", \"patas\"),\n", + " (\"n02486410\", \"baboon\"),\n", + " (\"n02487347\", \"macaque\"),\n", + " (\"n02488291\", \"langur\"),\n", + " (\"n02488702\", \"colobus\"),\n", + " (\"n02489166\", \"proboscis_monkey\"),\n", + " (\"n02490219\", \"marmoset\"),\n", + " (\"n02492035\", \"capuchin\"),\n", + " (\"n02492660\", \"howler_monkey\"),\n", + " (\"n02493509\", \"titi\"),\n", + " (\"n02493793\", \"spider_monkey\"),\n", + " (\"n02494079\", \"squirrel_monkey\"),\n", + " (\"n02497673\", \"Madagascar_cat\"),\n", + " (\"n02500267\", \"indri\"),\n", + " (\"n02504013\", \"Indian_elephant\"),\n", + " (\"n02504458\", \"African_elephant\"),\n", + " (\"n02509815\", \"lesser_panda\"),\n", + " (\"n02510455\", \"giant_panda\"),\n", + " (\"n02514041\", \"barracouta\"),\n", + " (\"n02526121\", \"eel\"),\n", + " (\"n02536864\", \"coho\"),\n", + " (\"n02606052\", \"rock_beauty\"),\n", + " (\"n02607072\", \"anemone_fish\"),\n", + " (\"n02640242\", \"sturgeon\"),\n", + " (\"n02641379\", \"gar\"),\n", + " (\"n02643566\", \"lionfish\"),\n", + " (\"n02655020\", \"puffer\"),\n", + " (\"n02666196\", \"abacus\"),\n", + " (\"n02667093\", \"abaya\"),\n", + " (\"n02669723\", \"academic_gown\"),\n", + " (\"n02672831\", \"accordion\"),\n", + " (\"n02676566\", \"acoustic_guitar\"),\n", + " (\"n02687172\", \"aircraft_carrier\"),\n", + " (\"n02690373\", \"airliner\"),\n", + " (\"n02692877\", \"airship\"),\n", + " (\"n02699494\", \"altar\"),\n", + " (\"n02701002\", \"ambulance\"),\n", + " (\"n02704792\", \"amphibian\"),\n", + " (\"n02708093\", \"analog_clock\"),\n", + " (\"n02727426\", \"apiary\"),\n", + " (\"n02730930\", \"apron\"),\n", + " (\"n02747177\", \"ashcan\"),\n", + " (\"n02749479\", \"assault_rifle\"),\n", + " (\"n02769748\", \"backpack\"),\n", + " (\"n02776631\", \"bakery\"),\n", + " (\"n02777292\", \"balance_beam\"),\n", + " (\"n02782093\", \"balloon\"),\n", + " (\"n02783161\", \"ballpoint\"),\n", + " (\"n02786058\", \"Band_Aid\"),\n", + " (\"n02787622\", \"banjo\"),\n", + " (\"n02788148\", \"bannister\"),\n", + " (\"n02790996\", \"barbell\"),\n", + " (\"n02791124\", \"barber_chair\"),\n", + " (\"n02791270\", \"barbershop\"),\n", + " (\"n02793495\", \"barn\"),\n", + " (\"n02794156\", \"barometer\"),\n", + " (\"n02795169\", \"barrel\"),\n", + " (\"n02797295\", \"barrow\"),\n", + " (\"n02799071\", \"baseball\"),\n", + " (\"n02802426\", \"basketball\"),\n", + " (\"n02804414\", \"bassinet\"),\n", + " (\"n02804610\", \"bassoon\"),\n", + " (\"n02807133\", \"bathing_cap\"),\n", + " (\"n02808304\", \"bath_towel\"),\n", + " (\"n02808440\", \"bathtub\"),\n", + " (\"n02814533\", \"beach_wagon\"),\n", + " (\"n02814860\", \"beacon\"),\n", + " (\"n02815834\", \"beaker\"),\n", + " (\"n02817516\", \"bearskin\"),\n", + " (\"n02823428\", \"beer_bottle\"),\n", + " (\"n02823750\", \"beer_glass\"),\n", + " (\"n02825657\", \"bell_cote\"),\n", + " (\"n02834397\", \"bib\"),\n", + " (\"n02835271\", \"bicycle-built-for-two\"),\n", + " (\"n02837789\", \"bikini\"),\n", + " (\"n02840245\", \"binder\"),\n", + " (\"n02841315\", \"binoculars\"),\n", + " (\"n02843684\", \"birdhouse\"),\n", + " (\"n02859443\", \"boathouse\"),\n", + " (\"n02860847\", \"bobsled\"),\n", + " (\"n02865351\", \"bolo_tie\"),\n", + " (\"n02869837\", \"bonnet\"),\n", 
+ " (\"n02870880\", \"bookcase\"),\n", + " (\"n02871525\", \"bookshop\"),\n", + " (\"n02877765\", \"bottlecap\"),\n", + " (\"n02879718\", \"bow\"),\n", + " (\"n02883205\", \"bow_tie\"),\n", + " (\"n02892201\", \"brass\"),\n", + " (\"n02892767\", \"brassiere\"),\n", + " (\"n02894605\", \"breakwater\"),\n", + " (\"n02895154\", \"breastplate\"),\n", + " (\"n02906734\", \"broom\"),\n", + " (\"n02909870\", \"bucket\"),\n", + " (\"n02910353\", \"buckle\"),\n", + " (\"n02916936\", \"bulletproof_vest\"),\n", + " (\"n02917067\", \"bullet_train\"),\n", + " (\"n02927161\", \"butcher_shop\"),\n", + " (\"n02930766\", \"cab\"),\n", + " (\"n02939185\", \"caldron\"),\n", + " (\"n02948072\", \"candle\"),\n", + " (\"n02950826\", \"cannon\"),\n", + " (\"n02951358\", \"canoe\"),\n", + " (\"n02951585\", \"can_opener\"),\n", + " (\"n02963159\", \"cardigan\"),\n", + " (\"n02965783\", \"car_mirror\"),\n", + " (\"n02966193\", \"carousel\"),\n", + " (\"n02966687\", \"carpenter's_kit\"),\n", + " (\"n02971356\", \"carton\"),\n", + " (\"n02974003\", \"car_wheel\"),\n", + " (\"n02977058\", \"cash_machine\"),\n", + " (\"n02978881\", \"cassette\"),\n", + " (\"n02979186\", \"cassette_player\"),\n", + " (\"n02980441\", \"castle\"),\n", + " (\"n02981792\", \"catamaran\"),\n", + " (\"n02988304\", \"CD_player\"),\n", + " (\"n02992211\", \"cello\"),\n", + " (\"n02992529\", \"cellular_telephone\"),\n", + " (\"n02999410\", \"chain\"),\n", + " (\"n03000134\", \"chainlink_fence\"),\n", + " (\"n03000247\", \"chain_mail\"),\n", + " (\"n03000684\", \"chain_saw\"),\n", + " (\"n03014705\", \"chest\"),\n", + " (\"n03016953\", \"chiffonier\"),\n", + " (\"n03017168\", \"chime\"),\n", + " (\"n03018349\", \"china_cabinet\"),\n", + " (\"n03026506\", \"Christmas_stocking\"),\n", + " (\"n03028079\", \"church\"),\n", + " (\"n03032252\", \"cinema\"),\n", + " (\"n03041632\", \"cleaver\"),\n", + " (\"n03042490\", \"cliff_dwelling\"),\n", + " (\"n03045698\", \"cloak\"),\n", + " (\"n03047690\", \"clog\"),\n", + " (\"n03062245\", \"cocktail_shaker\"),\n", + " (\"n03063599\", \"coffee_mug\"),\n", + " (\"n03063689\", \"coffeepot\"),\n", + " (\"n03065424\", \"coil\"),\n", + " (\"n03075370\", \"combination_lock\"),\n", + " (\"n03085013\", \"computer_keyboard\"),\n", + " (\"n03089624\", \"confectionery\"),\n", + " (\"n03095699\", \"container_ship\"),\n", + " (\"n03100240\", \"convertible\"),\n", + " (\"n03109150\", \"corkscrew\"),\n", + " (\"n03110669\", \"cornet\"),\n", + " (\"n03124043\", \"cowboy_boot\"),\n", + " (\"n03124170\", \"cowboy_hat\"),\n", + " (\"n03125729\", \"cradle\"),\n", + " (\"n03126707\", \"crane\"),\n", + " (\"n03127747\", \"crash_helmet\"),\n", + " (\"n03127925\", \"crate\"),\n", + " (\"n03131574\", \"crib\"),\n", + " (\"n03133878\", \"Crock_Pot\"),\n", + " (\"n03134739\", \"croquet_ball\"),\n", + " (\"n03141823\", \"crutch\"),\n", + " (\"n03146219\", \"cuirass\"),\n", + " (\"n03160309\", \"dam\"),\n", + " (\"n03179701\", \"desk\"),\n", + " (\"n03180011\", \"desktop_computer\"),\n", + " (\"n03187595\", \"dial_telephone\"),\n", + " (\"n03188531\", \"diaper\"),\n", + " (\"n03196217\", \"digital_clock\"),\n", + " (\"n03197337\", \"digital_watch\"),\n", + " (\"n03201208\", \"dining_table\"),\n", + " (\"n03207743\", \"dishrag\"),\n", + " (\"n03207941\", \"dishwasher\"),\n", + " (\"n03208938\", \"disk_brake\"),\n", + " (\"n03216828\", \"dock\"),\n", + " (\"n03218198\", \"dogsled\"),\n", + " (\"n03220513\", \"dome\"),\n", + " (\"n03223299\", \"doormat\"),\n", + " (\"n03240683\", \"drilling_platform\"),\n", + " (\"n03249569\", \"drum\"),\n", 
+ " (\"n03250847\", \"drumstick\"),\n", + " (\"n03255030\", \"dumbbell\"),\n", + " (\"n03259280\", \"Dutch_oven\"),\n", + " (\"n03271574\", \"electric_fan\"),\n", + " (\"n03272010\", \"electric_guitar\"),\n", + " (\"n03272562\", \"electric_locomotive\"),\n", + " (\"n03290653\", \"entertainment_center\"),\n", + " (\"n03291819\", \"envelope\"),\n", + " (\"n03297495\", \"espresso_maker\"),\n", + " (\"n03314780\", \"face_powder\"),\n", + " (\"n03325584\", \"feather_boa\"),\n", + " (\"n03337140\", \"file\"),\n", + " (\"n03344393\", \"fireboat\"),\n", + " (\"n03345487\", \"fire_engine\"),\n", + " (\"n03347037\", \"fire_screen\"),\n", + " (\"n03355925\", \"flagpole\"),\n", + " (\"n03372029\", \"flute\"),\n", + " (\"n03376595\", \"folding_chair\"),\n", + " (\"n03379051\", \"football_helmet\"),\n", + " (\"n03384352\", \"forklift\"),\n", + " (\"n03388043\", \"fountain\"),\n", + " (\"n03388183\", \"fountain_pen\"),\n", + " (\"n03388549\", \"four-poster\"),\n", + " (\"n03393912\", \"freight_car\"),\n", + " (\"n03394916\", \"French_horn\"),\n", + " (\"n03400231\", \"frying_pan\"),\n", + " (\"n03404251\", \"fur_coat\"),\n", + " (\"n03417042\", \"garbage_truck\"),\n", + " (\"n03424325\", \"gasmask\"),\n", + " (\"n03425413\", \"gas_pump\"),\n", + " (\"n03443371\", \"goblet\"),\n", + " (\"n03444034\", \"go-kart\"),\n", + " (\"n03445777\", \"golf_ball\"),\n", + " (\"n03445924\", \"golfcart\"),\n", + " (\"n03447447\", \"gondola\"),\n", + " (\"n03447721\", \"gong\"),\n", + " (\"n03450230\", \"gown\"),\n", + " (\"n03452741\", \"grand_piano\"),\n", + " (\"n03457902\", \"greenhouse\"),\n", + " (\"n03459775\", \"grille\"),\n", + " (\"n03461385\", \"grocery_store\"),\n", + " (\"n03467068\", \"guillotine\"),\n", + " (\"n03476684\", \"hair_slide\"),\n", + " (\"n03476991\", \"hair_spray\"),\n", + " (\"n03478589\", \"half_track\"),\n", + " (\"n03481172\", \"hammer\"),\n", + " (\"n03482405\", \"hamper\"),\n", + " (\"n03483316\", \"hand_blower\"),\n", + " (\"n03485407\", \"hand-held_computer\"),\n", + " (\"n03485794\", \"handkerchief\"),\n", + " (\"n03492542\", \"hard_disc\"),\n", + " (\"n03494278\", \"harmonica\"),\n", + " (\"n03495258\", \"harp\"),\n", + " (\"n03496892\", \"harvester\"),\n", + " (\"n03498962\", \"hatchet\"),\n", + " (\"n03527444\", \"holster\"),\n", + " (\"n03529860\", \"home_theater\"),\n", + " (\"n03530642\", \"honeycomb\"),\n", + " (\"n03532672\", \"hook\"),\n", + " (\"n03534580\", \"hoopskirt\"),\n", + " (\"n03535780\", \"horizontal_bar\"),\n", + " (\"n03538406\", \"horse_cart\"),\n", + " (\"n03544143\", \"hourglass\"),\n", + " (\"n03584254\", \"iPod\"),\n", + " (\"n03584829\", \"iron\"),\n", + " (\"n03590841\", \"jack-o'-lantern\"),\n", + " (\"n03594734\", \"jean\"),\n", + " (\"n03594945\", \"jeep\"),\n", + " (\"n03595614\", \"jersey\"),\n", + " (\"n03598930\", \"jigsaw_puzzle\"),\n", + " (\"n03599486\", \"jinrikisha\"),\n", + " (\"n03602883\", \"joystick\"),\n", + " (\"n03617480\", \"kimono\"),\n", + " (\"n03623198\", \"knee_pad\"),\n", + " (\"n03627232\", \"knot\"),\n", + " (\"n03630383\", \"lab_coat\"),\n", + " (\"n03633091\", \"ladle\"),\n", + " (\"n03637318\", \"lampshade\"),\n", + " (\"n03642806\", \"laptop\"),\n", + " (\"n03649909\", \"lawn_mower\"),\n", + " (\"n03657121\", \"lens_cap\"),\n", + " (\"n03658185\", \"letter_opener\"),\n", + " (\"n03661043\", \"library\"),\n", + " (\"n03662601\", \"lifeboat\"),\n", + " (\"n03666591\", \"lighter\"),\n", + " (\"n03670208\", \"limousine\"),\n", + " (\"n03673027\", \"liner\"),\n", + " (\"n03676483\", \"lipstick\"),\n", + " (\"n03680355\", 
\"Loafer\"),\n", + " (\"n03690938\", \"lotion\"),\n", + " (\"n03691459\", \"loudspeaker\"),\n", + " (\"n03692522\", \"loupe\"),\n", + " (\"n03697007\", \"lumbermill\"),\n", + " (\"n03706229\", \"magnetic_compass\"),\n", + " (\"n03709823\", \"mailbag\"),\n", + " (\"n03710193\", \"mailbox\"),\n", + " (\"n03710637\", \"maillot\"),\n", + " (\"n03710721\", \"maillot\"),\n", + " (\"n03717622\", \"manhole_cover\"),\n", + " (\"n03720891\", \"maraca\"),\n", + " (\"n03721384\", \"marimba\"),\n", + " (\"n03724870\", \"mask\"),\n", + " (\"n03729826\", \"matchstick\"),\n", + " (\"n03733131\", \"maypole\"),\n", + " (\"n03733281\", \"maze\"),\n", + " (\"n03733805\", \"measuring_cup\"),\n", + " (\"n03742115\", \"medicine_chest\"),\n", + " (\"n03743016\", \"megalith\"),\n", + " (\"n03759954\", \"microphone\"),\n", + " (\"n03761084\", \"microwave\"),\n", + " (\"n03763968\", \"military_uniform\"),\n", + " (\"n03764736\", \"milk_can\"),\n", + " (\"n03769881\", \"minibus\"),\n", + " (\"n03770439\", \"miniskirt\"),\n", + " (\"n03770679\", \"minivan\"),\n", + " (\"n03773504\", \"missile\"),\n", + " (\"n03775071\", \"mitten\"),\n", + " (\"n03775546\", \"mixing_bowl\"),\n", + " (\"n03776460\", \"mobile_home\"),\n", + " (\"n03777568\", \"Model_T\"),\n", + " (\"n03777754\", \"modem\"),\n", + " (\"n03781244\", \"monastery\"),\n", + " (\"n03782006\", \"monitor\"),\n", + " (\"n03785016\", \"moped\"),\n", + " (\"n03786901\", \"mortar\"),\n", + " (\"n03787032\", \"mortarboard\"),\n", + " (\"n03788195\", \"mosque\"),\n", + " (\"n03788365\", \"mosquito_net\"),\n", + " (\"n03791053\", \"motor_scooter\"),\n", + " (\"n03792782\", \"mountain_bike\"),\n", + " (\"n03792972\", \"mountain_tent\"),\n", + " (\"n03793489\", \"mouse\"),\n", + " (\"n03794056\", \"mousetrap\"),\n", + " (\"n03796401\", \"moving_van\"),\n", + " (\"n03803284\", \"muzzle\"),\n", + " (\"n03804744\", \"nail\"),\n", + " (\"n03814639\", \"neck_brace\"),\n", + " (\"n03814906\", \"necklace\"),\n", + " (\"n03825788\", \"nipple\"),\n", + " (\"n03832673\", \"notebook\"),\n", + " (\"n03837869\", \"obelisk\"),\n", + " (\"n03838899\", \"oboe\"),\n", + " (\"n03840681\", \"ocarina\"),\n", + " (\"n03841143\", \"odometer\"),\n", + " (\"n03843555\", \"oil_filter\"),\n", + " (\"n03854065\", \"organ\"),\n", + " (\"n03857828\", \"oscilloscope\"),\n", + " (\"n03866082\", \"overskirt\"),\n", + " (\"n03868242\", \"oxcart\"),\n", + " (\"n03868863\", \"oxygen_mask\"),\n", + " (\"n03871628\", \"packet\"),\n", + " (\"n03873416\", \"paddle\"),\n", + " (\"n03874293\", \"paddlewheel\"),\n", + " (\"n03874599\", \"padlock\"),\n", + " (\"n03876231\", \"paintbrush\"),\n", + " (\"n03877472\", \"pajama\"),\n", + " (\"n03877845\", \"palace\"),\n", + " (\"n03884397\", \"panpipe\"),\n", + " (\"n03887697\", \"paper_towel\"),\n", + " (\"n03888257\", \"parachute\"),\n", + " (\"n03888605\", \"parallel_bars\"),\n", + " (\"n03891251\", \"park_bench\"),\n", + " (\"n03891332\", \"parking_meter\"),\n", + " (\"n03895866\", \"passenger_car\"),\n", + " (\"n03899768\", \"patio\"),\n", + " (\"n03902125\", \"pay-phone\"),\n", + " (\"n03903868\", \"pedestal\"),\n", + " (\"n03908618\", \"pencil_box\"),\n", + " (\"n03908714\", \"pencil_sharpener\"),\n", + " (\"n03916031\", \"perfume\"),\n", + " (\"n03920288\", \"Petri_dish\"),\n", + " (\"n03924679\", \"photocopier\"),\n", + " (\"n03929660\", \"pick\"),\n", + " (\"n03929855\", \"pickelhaube\"),\n", + " (\"n03930313\", \"picket_fence\"),\n", + " (\"n03930630\", \"pickup\"),\n", + " (\"n03933933\", \"pier\"),\n", + " (\"n03935335\", \"piggy_bank\"),\n", + " 
(\"n03937543\", \"pill_bottle\"),\n", + " (\"n03938244\", \"pillow\"),\n", + " (\"n03942813\", \"ping-pong_ball\"),\n", + " (\"n03944341\", \"pinwheel\"),\n", + " (\"n03947888\", \"pirate\"),\n", + " (\"n03950228\", \"pitcher\"),\n", + " (\"n03954731\", \"plane\"),\n", + " (\"n03956157\", \"planetarium\"),\n", + " (\"n03958227\", \"plastic_bag\"),\n", + " (\"n03961711\", \"plate_rack\"),\n", + " (\"n03967562\", \"plow\"),\n", + " (\"n03970156\", \"plunger\"),\n", + " (\"n03976467\", \"Polaroid_camera\"),\n", + " (\"n03976657\", \"pole\"),\n", + " (\"n03977966\", \"police_van\"),\n", + " (\"n03980874\", \"poncho\"),\n", + " (\"n03982430\", \"pool_table\"),\n", + " (\"n03983396\", \"pop_bottle\"),\n", + " (\"n03991062\", \"pot\"),\n", + " (\"n03992509\", \"potter's_wheel\"),\n", + " (\"n03995372\", \"power_drill\"),\n", + " (\"n03998194\", \"prayer_rug\"),\n", + " (\"n04004767\", \"printer\"),\n", + " (\"n04005630\", \"prison\"),\n", + " (\"n04008634\", \"projectile\"),\n", + " (\"n04009552\", \"projector\"),\n", + " (\"n04019541\", \"puck\"),\n", + " (\"n04023962\", \"punching_bag\"),\n", + " (\"n04026417\", \"purse\"),\n", + " (\"n04033901\", \"quill\"),\n", + " (\"n04033995\", \"quilt\"),\n", + " (\"n04037443\", \"racer\"),\n", + " (\"n04039381\", \"racket\"),\n", + " (\"n04040759\", \"radiator\"),\n", + " (\"n04041544\", \"radio\"),\n", + " (\"n04044716\", \"radio_telescope\"),\n", + " (\"n04049303\", \"rain_barrel\"),\n", + " (\"n04065272\", \"recreational_vehicle\"),\n", + " (\"n04067472\", \"reel\"),\n", + " (\"n04069434\", \"reflex_camera\"),\n", + " (\"n04070727\", \"refrigerator\"),\n", + " (\"n04074963\", \"remote_control\"),\n", + " (\"n04081281\", \"restaurant\"),\n", + " (\"n04086273\", \"revolver\"),\n", + " (\"n04090263\", \"rifle\"),\n", + " (\"n04099969\", \"rocking_chair\"),\n", + " (\"n04111531\", \"rotisserie\"),\n", + " (\"n04116512\", \"rubber_eraser\"),\n", + " (\"n04118538\", \"rugby_ball\"),\n", + " (\"n04118776\", \"rule\"),\n", + " (\"n04120489\", \"running_shoe\"),\n", + " (\"n04125021\", \"safe\"),\n", + " (\"n04127249\", \"safety_pin\"),\n", + " (\"n04131690\", \"saltshaker\"),\n", + " (\"n04133789\", \"sandal\"),\n", + " (\"n04136333\", \"sarong\"),\n", + " (\"n04141076\", \"sax\"),\n", + " (\"n04141327\", \"scabbard\"),\n", + " (\"n04141975\", \"scale\"),\n", + " (\"n04146614\", \"school_bus\"),\n", + " (\"n04147183\", \"schooner\"),\n", + " (\"n04149813\", \"scoreboard\"),\n", + " (\"n04152593\", \"screen\"),\n", + " (\"n04153751\", \"screw\"),\n", + " (\"n04154565\", \"screwdriver\"),\n", + " (\"n04162706\", \"seat_belt\"),\n", + " (\"n04179913\", \"sewing_machine\"),\n", + " (\"n04192698\", \"shield\"),\n", + " (\"n04200800\", \"shoe_shop\"),\n", + " (\"n04201297\", \"shoji\"),\n", + " (\"n04204238\", \"shopping_basket\"),\n", + " (\"n04204347\", \"shopping_cart\"),\n", + " (\"n04208210\", \"shovel\"),\n", + " (\"n04209133\", \"shower_cap\"),\n", + " (\"n04209239\", \"shower_curtain\"),\n", + " (\"n04228054\", \"ski\"),\n", + " (\"n04229816\", \"ski_mask\"),\n", + " (\"n04235860\", \"sleeping_bag\"),\n", + " (\"n04238763\", \"slide_rule\"),\n", + " (\"n04239074\", \"sliding_door\"),\n", + " (\"n04243546\", \"slot\"),\n", + " (\"n04251144\", \"snorkel\"),\n", + " (\"n04252077\", \"snowmobile\"),\n", + " (\"n04252225\", \"snowplow\"),\n", + " (\"n04254120\", \"soap_dispenser\"),\n", + " (\"n04254680\", \"soccer_ball\"),\n", + " (\"n04254777\", \"sock\"),\n", + " (\"n04258138\", \"solar_dish\"),\n", + " (\"n04259630\", \"sombrero\"),\n", + " (\"n04263257\", 
\"soup_bowl\"),\n", + " (\"n04264628\", \"space_bar\"),\n", + " (\"n04265275\", \"space_heater\"),\n", + " (\"n04266014\", \"space_shuttle\"),\n", + " (\"n04270147\", \"spatula\"),\n", + " (\"n04273569\", \"speedboat\"),\n", + " (\"n04275548\", \"spider_web\"),\n", + " (\"n04277352\", \"spindle\"),\n", + " (\"n04285008\", \"sports_car\"),\n", + " (\"n04286575\", \"spotlight\"),\n", + " (\"n04296562\", \"stage\"),\n", + " (\"n04310018\", \"steam_locomotive\"),\n", + " (\"n04311004\", \"steel_arch_bridge\"),\n", + " (\"n04311174\", \"steel_drum\"),\n", + " (\"n04317175\", \"stethoscope\"),\n", + " (\"n04325704\", \"stole\"),\n", + " (\"n04326547\", \"stone_wall\"),\n", + " (\"n04328186\", \"stopwatch\"),\n", + " (\"n04330267\", \"stove\"),\n", + " (\"n04332243\", \"strainer\"),\n", + " (\"n04335435\", \"streetcar\"),\n", + " (\"n04336792\", \"stretcher\"),\n", + " (\"n04344873\", \"studio_couch\"),\n", + " (\"n04346328\", \"stupa\"),\n", + " (\"n04347754\", \"submarine\"),\n", + " (\"n04350905\", \"suit\"),\n", + " (\"n04355338\", \"sundial\"),\n", + " (\"n04355933\", \"sunglass\"),\n", + " (\"n04356056\", \"sunglasses\"),\n", + " (\"n04357314\", \"sunscreen\"),\n", + " (\"n04366367\", \"suspension_bridge\"),\n", + " (\"n04367480\", \"swab\"),\n", + " (\"n04370456\", \"sweatshirt\"),\n", + " (\"n04371430\", \"swimming_trunks\"),\n", + " (\"n04371774\", \"swing\"),\n", + " (\"n04372370\", \"switch\"),\n", + " (\"n04376876\", \"syringe\"),\n", + " (\"n04380533\", \"table_lamp\"),\n", + " (\"n04389033\", \"tank\"),\n", + " (\"n04392985\", \"tape_player\"),\n", + " (\"n04398044\", \"teapot\"),\n", + " (\"n04399382\", \"teddy\"),\n", + " (\"n04404412\", \"television\"),\n", + " (\"n04409515\", \"tennis_ball\"),\n", + " (\"n04417672\", \"thatch\"),\n", + " (\"n04418357\", \"theater_curtain\"),\n", + " (\"n04423845\", \"thimble\"),\n", + " (\"n04428191\", \"thresher\"),\n", + " (\"n04429376\", \"throne\"),\n", + " (\"n04435653\", \"tile_roof\"),\n", + " (\"n04442312\", \"toaster\"),\n", + " (\"n04443257\", \"tobacco_shop\"),\n", + " (\"n04447861\", \"toilet_seat\"),\n", + " (\"n04456115\", \"torch\"),\n", + " (\"n04458633\", \"totem_pole\"),\n", + " (\"n04461696\", \"tow_truck\"),\n", + " (\"n04462240\", \"toyshop\"),\n", + " (\"n04465501\", \"tractor\"),\n", + " (\"n04467665\", \"trailer_truck\"),\n", + " (\"n04476259\", \"tray\"),\n", + " (\"n04479046\", \"trench_coat\"),\n", + " (\"n04482393\", \"tricycle\"),\n", + " (\"n04483307\", \"trimaran\"),\n", + " (\"n04485082\", \"tripod\"),\n", + " (\"n04486054\", \"triumphal_arch\"),\n", + " (\"n04487081\", \"trolleybus\"),\n", + " (\"n04487394\", \"trombone\"),\n", + " (\"n04493381\", \"tub\"),\n", + " (\"n04501370\", \"turnstile\"),\n", + " (\"n04505470\", \"typewriter_keyboard\"),\n", + " (\"n04507155\", \"umbrella\"),\n", + " (\"n04509417\", \"unicycle\"),\n", + " (\"n04515003\", \"upright\"),\n", + " (\"n04517823\", \"vacuum\"),\n", + " (\"n04522168\", \"vase\"),\n", + " (\"n04523525\", \"vault\"),\n", + " (\"n04525038\", \"velvet\"),\n", + " (\"n04525305\", \"vending_machine\"),\n", + " (\"n04532106\", \"vestment\"),\n", + " (\"n04532670\", \"viaduct\"),\n", + " (\"n04536866\", \"violin\"),\n", + " (\"n04540053\", \"volleyball\"),\n", + " (\"n04542943\", \"waffle_iron\"),\n", + " (\"n04548280\", \"wall_clock\"),\n", + " (\"n04548362\", \"wallet\"),\n", + " (\"n04550184\", \"wardrobe\"),\n", + " (\"n04552348\", \"warplane\"),\n", + " (\"n04553703\", \"washbasin\"),\n", + " (\"n04554684\", \"washer\"),\n", + " (\"n04557648\", 
\"water_bottle\"),\n", + " (\"n04560804\", \"water_jug\"),\n", + " (\"n04562935\", \"water_tower\"),\n", + " (\"n04579145\", \"whiskey_jug\"),\n", + " (\"n04579432\", \"whistle\"),\n", + " (\"n04584207\", \"wig\"),\n", + " (\"n04589890\", \"window_screen\"),\n", + " (\"n04590129\", \"window_shade\"),\n", + " (\"n04591157\", \"Windsor_tie\"),\n", + " (\"n04591713\", \"wine_bottle\"),\n", + " (\"n04592741\", \"wing\"),\n", + " (\"n04596742\", \"wok\"),\n", + " (\"n04597913\", \"wooden_spoon\"),\n", + " (\"n04599235\", \"wool\"),\n", + " (\"n04604644\", \"worm_fence\"),\n", + " (\"n04606251\", \"wreck\"),\n", + " (\"n04612504\", \"yawl\"),\n", + " (\"n04613696\", \"yurt\"),\n", + " (\"n06359193\", \"web_site\"),\n", + " (\"n06596364\", \"comic_book\"),\n", + " (\"n06785654\", \"crossword_puzzle\"),\n", + " (\"n06794110\", \"street_sign\"),\n", + " (\"n06874185\", \"traffic_light\"),\n", + " (\"n07248320\", \"book_jacket\"),\n", + " (\"n07565083\", \"menu\"),\n", + " (\"n07579787\", \"plate\"),\n", + " (\"n07583066\", \"guacamole\"),\n", + " (\"n07584110\", \"consomme\"),\n", + " (\"n07590611\", \"hot_pot\"),\n", + " (\"n07613480\", \"trifle\"),\n", + " (\"n07614500\", \"ice_cream\"),\n", + " (\"n07615774\", \"ice_lolly\"),\n", + " (\"n07684084\", \"French_loaf\"),\n", + " (\"n07693725\", \"bagel\"),\n", + " (\"n07695742\", \"pretzel\"),\n", + " (\"n07697313\", \"cheeseburger\"),\n", + " (\"n07697537\", \"hotdog\"),\n", + " (\"n07711569\", \"mashed_potato\"),\n", + " (\"n07714571\", \"head_cabbage\"),\n", + " (\"n07714990\", \"broccoli\"),\n", + " (\"n07715103\", \"cauliflower\"),\n", + " (\"n07716358\", \"zucchini\"),\n", + " (\"n07716906\", \"spaghetti_squash\"),\n", + " (\"n07717410\", \"acorn_squash\"),\n", + " (\"n07717556\", \"butternut_squash\"),\n", + " (\"n07718472\", \"cucumber\"),\n", + " (\"n07718747\", \"artichoke\"),\n", + " (\"n07720875\", \"bell_pepper\"),\n", + " (\"n07730033\", \"cardoon\"),\n", + " (\"n07734744\", \"mushroom\"),\n", + " (\"n07742313\", \"Granny_Smith\"),\n", + " (\"n07745940\", \"strawberry\"),\n", + " (\"n07747607\", \"orange\"),\n", + " (\"n07749582\", \"lemon\"),\n", + " (\"n07753113\", \"fig\"),\n", + " (\"n07753275\", \"pineapple\"),\n", + " (\"n07753592\", \"banana\"),\n", + " (\"n07754684\", \"jackfruit\"),\n", + " (\"n07760859\", \"custard_apple\"),\n", + " (\"n07768694\", \"pomegranate\"),\n", + " (\"n07802026\", \"hay\"),\n", + " (\"n07831146\", \"carbonara\"),\n", + " (\"n07836838\", \"chocolate_sauce\"),\n", + " (\"n07860988\", \"dough\"),\n", + " (\"n07871810\", \"meat_loaf\"),\n", + " (\"n07873807\", \"pizza\"),\n", + " (\"n07875152\", \"potpie\"),\n", + " (\"n07880968\", \"burrito\"),\n", + " (\"n07892512\", \"red_wine\"),\n", + " (\"n07920052\", \"espresso\"),\n", + " (\"n07930864\", \"cup\"),\n", + " (\"n07932039\", \"eggnog\"),\n", + " (\"n09193705\", \"alp\"),\n", + " (\"n09229709\", \"bubble\"),\n", + " (\"n09246464\", \"cliff\"),\n", + " (\"n09256479\", \"coral_reef\"),\n", + " (\"n09288635\", \"geyser\"),\n", + " (\"n09332890\", \"lakeside\"),\n", + " (\"n09399592\", \"promontory\"),\n", + " (\"n09421951\", \"sandbar\"),\n", + " (\"n09428293\", \"seashore\"),\n", + " (\"n09468604\", \"valley\"),\n", + " (\"n09472597\", \"volcano\"),\n", + " (\"n09835506\", \"ballplayer\"),\n", + " (\"n10148035\", \"groom\"),\n", + " (\"n10565667\", \"scuba_diver\"),\n", + " (\"n11879895\", \"rapeseed\"),\n", + " (\"n11939491\", \"daisy\"),\n", + " (\"n12057211\", \"yellow_lady's_slipper\"),\n", + " (\"n12144580\", \"corn\"),\n", + " (\"n12267677\", 
\"acorn\"),\n", + " (\"n12620546\", \"hip\"),\n", + " (\"n12768682\", \"buckeye\"),\n", + " (\"n12985857\", \"coral_fungus\"),\n", + " (\"n12998815\", \"agaric\"),\n", + " (\"n13037406\", \"gyromitra\"),\n", + " (\"n13040303\", \"stinkhorn\"),\n", + " (\"n13044778\", \"earthstar\"),\n", + " (\"n13052670\", \"hen-of-the-woods\"),\n", + " (\"n13054560\", \"bolete\"),\n", + " (\"n13133613\", \"ear\"),\n", + " (\"n15075141\", \"toilet_tissue\"),\n", + "]\n", "classes_human_readable = {v0: v1 for (v0, v1) in classes}\n", "classes_id = {v0: int(k) for k, (v0, _) in enumerate(classes)}\n", "\n", "df = df.with_column(\n", " \"class_human_readable\",\n", - " df[\"object\"].list.get(0).struct.get(\"name\").apply(\n", - " lambda name: classes_human_readable[name], return_dtype=daft.DataType.string()\n", - " ),\n", + " df[\"object\"]\n", + " .list.get(0)\n", + " .struct.get(\"name\")\n", + " .apply(lambda name: classes_human_readable[name], return_dtype=daft.DataType.string()),\n", ")\n", "df = df.with_column(\n", " \"class_id\",\n", - " df[\"object\"].list.get(0).struct.get(\"name\").apply(\n", - " lambda name: classes_id[name], return_dtype=daft.DataType.int64()\n", - " ),\n", + " df[\"object\"]\n", + " .list.get(0)\n", + " .struct.get(\"name\")\n", + " .apply(lambda name: classes_id[name], return_dtype=daft.DataType.int64()),\n", ")" ] }, @@ -401,9 +1402,9 @@ "metadata": {}, "outputs": [], "source": [ + "import torch\n", "import torchvision.models as models\n", "from torch import nn\n", - "import torch\n", "\n", "model = models.__dict__[\"resnet18\"](weights=models.ResNet18_Weights.DEFAULT)\n", "criterion = nn.CrossEntropyLoss()\n", @@ -481,7 +1482,9 @@ "source": [ "df.with_column(\n", " \"model_predictions\",\n", - " df[\"arr\"].apply(lambda arr: model(torch.tensor(arr).permute(2,0,1).unsqueeze(0).float()), return_dtype=daft.DataType.python())\n", + " df[\"arr\"].apply(\n", + " lambda arr: model(torch.tensor(arr).permute(2, 0, 1).unsqueeze(0).float()), return_dtype=daft.DataType.python()\n", + " ),\n", ").show(2)" ] }, diff --git a/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb b/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb index 83e27fee23..109d040234 100644 --- a/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb +++ b/tutorials/embeddings/daft_tutorial_embeddings_stackexchange.ipynb @@ -103,7 +103,9 @@ "import daft\n", "\n", "SAMPLE_DATA_PATH = \"s3://daft-public-data/redpajama-1t-sample/stackexchange_sample.jsonl\"\n", - "IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True, region_name=\"us-west-2\")) # Use anonymous-mode for accessing AWS S3\n", + "IO_CONFIG = daft.io.IOConfig(\n", + " s3=daft.io.S3Config(anonymous=True, region_name=\"us-west-2\")\n", + ") # Use anonymous-mode for accessing AWS S3\n", "\n", "df = daft.read_json(SAMPLE_DATA_PATH, io_config=IO_CONFIG)\n", "\n", @@ -134,17 +136,16 @@ "source": [ "MODEL_NAME = \"all-MiniLM-L6-v2\"\n", "\n", + "\n", "@daft.udf(return_dtype=daft.DataType.python())\n", "class EncodingUDF:\n", " def __init__(self):\n", " from sentence_transformers import SentenceTransformer\n", + "\n", " self.model = SentenceTransformer(MODEL_NAME)\n", "\n", " def __call__(self, text_col):\n", - " return [\n", - " self.model.encode(text, convert_to_tensor=True)\n", - " for text in text_col.to_pylist()\n", - " ]" + " return [self.model.encode(text, convert_to_tensor=True) for text in text_col.to_pylist()]" ] }, { @@ -244,13 +245,10 @@ "outputs": [], "source": [ "import math\n", + "\n", 
"NUM_TOP_QUESTIONS = math.ceil(math.sqrt(len(df)))\n", "\n", - "top_questions = (\n", - " df\n", - " .sort(df[\"question_score\"], desc=True)\n", - " .limit(NUM_TOP_QUESTIONS)\n", - ").to_pydict()" + "top_questions = (df.sort(df[\"question_score\"], desc=True).limit(NUM_TOP_QUESTIONS)).to_pydict()" ] }, { @@ -270,10 +268,11 @@ "source": [ "@daft.udf(return_dtype=daft.DataType.python())\n", "def similarity_search(embedding_col, top_embeddings, top_urls):\n", - " if len(embedding_col) == 0: return []\n", - " \n", - " from sentence_transformers import util\n", + " if len(embedding_col) == 0:\n", + " return []\n", + "\n", " import torch\n", + " from sentence_transformers import util\n", "\n", " # Tensor prep\n", " query_embedding_t = torch.stack(embedding_col.to_pylist())\n", @@ -283,7 +282,7 @@ "\n", " # Do semantic search\n", " results = util.semantic_search(query_embedding_t, top_embeddings, top_k=1)\n", - " \n", + "\n", " # Extract URL and score from search results\n", " results = [res[0] for res in results]\n", " results = [\n", @@ -295,25 +294,25 @@ " ]\n", " return results\n", "\n", + "\n", "import torch\n", + "\n", "df = df.with_column(\n", - " \"search_result\", \n", + " \"search_result\",\n", " similarity_search(\n", - " df[\"embedding\"], \n", - " top_embeddings=torch.stack(top_questions[\"embedding\"]), \n", + " df[\"embedding\"],\n", + " top_embeddings=torch.stack(top_questions[\"embedding\"]),\n", " top_urls=top_questions[\"URL\"],\n", - " )\n", + " ),\n", ")\n", "\n", "df = df.select(\n", " df[\"URL\"],\n", " df[\"question_score\"],\n", " df[\"search_result\"]\n", - " .apply(lambda x: x[\"related_top_question\"], return_dtype=daft.DataType.string())\n", - " .alias(\"related_top_question\"),\n", - " df[\"search_result\"]\n", - " .apply(lambda x: x[\"similarity\"], return_dtype=daft.DataType.float64())\n", - " .alias(\"similarity\"),\n", + " .apply(lambda x: x[\"related_top_question\"], return_dtype=daft.DataType.string())\n", + " .alias(\"related_top_question\"),\n", + " df[\"search_result\"].apply(lambda x: x[\"similarity\"], return_dtype=daft.DataType.float64()).alias(\"similarity\"),\n", ")" ] }, @@ -402,7 +401,7 @@ ], "source": [ "df = df.where(df[\"similarity\"] < 0.99) # To ignore duplicate questions.\n", - "df = df.sort(df[\"similarity\"], desc=True) \n", + "df = df.sort(df[\"similarity\"], desc=True)\n", "df.show()" ] }, diff --git a/tutorials/flyte/notebook.ipynb b/tutorials/flyte/notebook.ipynb index 41558b6f29..21131561cd 100644 --- a/tutorials/flyte/notebook.ipynb +++ b/tutorials/flyte/notebook.ipynb @@ -8,7 +8,9 @@ "source": [ "import daft\n", "\n", - "IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True, region_name=\"us-west-2\")) # Use anonymous-mode for accessing AWS S3\n", + "IO_CONFIG = daft.io.IOConfig(\n", + " s3=daft.io.S3Config(anonymous=True, region_name=\"us-west-2\")\n", + ") # Use anonymous-mode for accessing AWS S3\n", "PARQUET_PATH = \"s3://daft-public-data/tutorials/laion-parquet/train-00000-of-00001-6f24a7497df494ae.parquet\"" ] }, diff --git a/tutorials/image_querying/top_n_red_color.ipynb b/tutorials/image_querying/top_n_red_color.ipynb index 2e9b90a1a6..0b0274beb6 100644 --- a/tutorials/image_querying/top_n_red_color.ipynb +++ b/tutorials/image_querying/top_n_red_color.ipynb @@ -78,9 +78,10 @@ "###\n", "\n", "if USE_RAY:\n", - " import daft.context\n", " import ray\n", "\n", + " import daft.context\n", + "\n", " # NOTE: Replace with the address to an existing running Ray cluster, or None to start a local Ray cluster\n", " 
RAY_CLUSTER_ADDRESS = \"ray://localhost:10001\"\n", "\n", @@ -89,7 +90,7 @@ " runtime_env={\"pip\": [\"getdaft\", \"pillow\", \"s3fs\"]},\n", " )\n", "\n", - " daft.context.set_runner_ray(address=RAY_CLUSTER_ADDRESS)\n" + " daft.context.set_runner_ray(address=RAY_CLUSTER_ADDRESS)" ] }, { @@ -393,21 +394,26 @@ }, "outputs": [], "source": [ + "import numpy as np\n", "import PIL\n", "from PIL import ImageFilter\n", - "import numpy as np\n", "\n", "\n", "def magic_red_detector(img: np.ndarray) -> PIL.Image.Image:\n", " \"\"\"Gets a new image which is a mask covering all 'red' areas in the image\"\"\"\n", " img = PIL.Image.fromarray(img)\n", " lower = np.array([245, 100, 100])\n", - " upper = np.array([10,255,255])\n", + " upper = np.array([10, 255, 255])\n", " lower_hue, upper_hue = lower[0, np.newaxis, np.newaxis], upper[0, np.newaxis, np.newaxis]\n", - " lower_saturation_intensity, upper_saturation_intensity = lower[1:, np.newaxis, np.newaxis], upper[1:, np.newaxis, np.newaxis] \n", - " hsv = img.convert('HSV')\n", + " lower_saturation_intensity, upper_saturation_intensity = (\n", + " lower[1:, np.newaxis, np.newaxis],\n", + " upper[1:, np.newaxis, np.newaxis],\n", + " )\n", + " hsv = img.convert(\"HSV\")\n", " hsv = np.asarray(hsv).T\n", - " mask = np.all((hsv[1:, ...] >= lower_saturation_intensity) & (hsv[1:, ...] <= upper_saturation_intensity), axis=0) & ((hsv[0, ...] >= lower_hue) | (hsv[0, ...] <= upper_hue))\n", + " mask = np.all(\n", + " (hsv[1:, ...] >= lower_saturation_intensity) & (hsv[1:, ...] <= upper_saturation_intensity), axis=0\n", + " ) & ((hsv[0, ...] >= lower_hue) | (hsv[0, ...] <= upper_hue))\n", " img = PIL.Image.fromarray(mask.T)\n", " img = img.filter(ImageFilter.ModeFilter(size=5))\n", " return img\n", @@ -452,10 +458,12 @@ "source": [ "import numpy as np\n", "\n", + "\n", "def sum_mask(mask: PIL.Image.Image) -> int:\n", " val = np.asarray(mask).sum()\n", " return int(val)\n", "\n", + "\n", "df = df.with_column(\n", " \"num_pixels_red\",\n", " df[\"red_mask\"].apply(sum_mask, return_dtype=daft.DataType.int64()),\n", diff --git a/tutorials/intro.ipynb b/tutorials/intro.ipynb index 22b5a79323..ae3a541b74 100644 --- a/tutorials/intro.ipynb +++ b/tutorials/intro.ipynb @@ -46,7 +46,7 @@ "\n", "# Daft also supports reading from many other sources:\n", "# df = daft.read_csv(...)\n", - "# df = daft.read_parquet(...) \n", + "# df = daft.read_parquet(...)\n", "# df = daft.read_json(...)\n", "# df = daft.read_iceberg(...) 
# " ] @@ -129,7 +129,7 @@ "outputs": [], "source": [ "df = df.with_column(\"data\", df[\"path\"].url.download()) # Utf8 -> Binary\n", - "df = df.with_column(\"image\", df[\"data\"].image.decode()) # Binary -> Image" + "df = df.with_column(\"image\", df[\"data\"].image.decode()) # Binary -> Image" ] }, { diff --git a/tutorials/mnist.ipynb b/tutorials/mnist.ipynb index 95725af804..28973d1b47 100644 --- a/tutorials/mnist.ipynb +++ b/tutorials/mnist.ipynb @@ -1,910 +1,911 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "d1b56860-db41-4829-b395-176e11987cdc", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install getdaft\n", - "%pip install Pillow torch torchvision" - ] - }, - { - "cell_type": "markdown", - "id": "c571e01d", - "metadata": {}, - "source": [ - "```{hint}\n", - "✨✨✨ **Run this notebook on Google Colab** ✨✨✨\n", - "\n", - "You can [run this notebook yourself with Google Colab](https://colab.research.google.com/github/Eventual-Inc/Daft/blob/main/tutorials/mnist.ipynb)!\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "9b14abf5-a183-4bfb-9b15-a9a54b744fce", - "metadata": {}, - "source": [ - "# MNIST Daft Tutorial\n", - "\n", - "The MNIST Dataset is a \"large database of handwritten digits that is commonly used for training various image processing systems\"." - ] - }, - { - "cell_type": "markdown", - "id": "252b5128-99c2-49dd-b624-6e4b21275959", - "metadata": {}, - "source": [ - "## Loading Data\n", - "\n", - "This is a JSON file containing all the data for the MNIST test set. Let's load it up into a Daft Dataframe!" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "fc63a3ad-0e0a-4ab3-9cc0-cbec8bdd0632", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-04-21 11:44:02.554 | INFO | daft.context:runner:88 - Using PyRunner\n" - ] - } - ], - "source": [ - "import daft\n", - "from daft import col, udf, DataType\n", - "\n", - "URL = \"https://github.com/Eventual-Inc/mnist-json/raw/master/mnist_handwritten_test.json.gz\"\n", - "images_df = daft.read_json(URL)" - ] - }, - { - "cell_type": "markdown", - "id": "d52f6032-6619-4682-8305-2ed65bdc194c", - "metadata": {}, - "source": [ - "To peek at the dataset, simply have your notebook display the images_df that was just created." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "73a71adf-3b2e-4ec5-a0d2-34ad8eec734c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " \n", - "\n", - "\n", - "\n", - "
image
List[Int64]
label
Int64
\n", - " (No data to display: Dataframe not materialized)\n", - "
" - ], - "text/plain": [ - "+---------------+---------+\n", - "| image | label |\n", - "| List[Int64] | Int64 |\n", - "+===============+=========+\n", - "+---------------+---------+\n", - "(No data to display: Dataframe not materialized)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "images_df" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4787caab-d7d1-4fd4-9a76-ffb08a404a31", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
image
List[Int64]
label
Int64
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9
\n", - " (Showing first 10 rows)\n", - "
" - ], - "text/plain": [ - "+----------------------+---------+\n", - "| image | label |\n", - "| List[Int64] | Int64 |\n", - "+======================+=========+\n", - "| [0, 0, 0, 0, 0, 0, | 7 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 2 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 1 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 1 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 5 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | |\n", - "| 0, 0, 0, 0, 0, 0,... | |\n", - "+----------------------+---------+\n", - "(Showing first 10 rows)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "images_df.show(10)" - ] - }, - { - "cell_type": "markdown", - "id": "426f1bbb-e1c0-4fd6-b84e-cbb1ab309ff9", - "metadata": {}, - "source": [ - "You just loaded your first DaFt Dataframe! It consists of two columns:\n", - "1. The \"image\" column is a Python column of type `list` - where it looks like each row contains a list of digits representing the pixels of each image\n", - "2. The \"label\" column is an Integer column, consisting of just the label of that image." - ] - }, - { - "cell_type": "markdown", - "id": "9a7872e3-9860-4867-8a8c-61a69f69e334", - "metadata": {}, - "source": [ - "## Processing Columns with User-Defined Functions (UDF)\n", - "\n", - "It seems our JSON file has provided us with a one-dimensional array of pixels instead of two-dimensional images. We can easily modify data in this column by instructing Daft to run a method on every row in the column like so:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "af857589-b28a-4ee0-91cd-dc7a01ff4c07", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "images_df = images_df.with_column(\n", - " \"image_2d\",\n", - " col(\"image\").apply(lambda l: np.array(l).reshape(28, 28), return_dtype=DataType.python()),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d1212a7e-949a-4881-ba54-9d7e7eb31e6f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
image
List[Int64]
label
Int64
image_2d
Python
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 0<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\n", - " (Showing first 10 rows)\n", - "
" - ], - "text/plain": [ - "+----------------------+---------+----------------------+\n", - "| image | label | image_2d |\n", - "| List[Int64] | Int64 | Python |\n", - "+======================+=========+======================+\n", - "| [0, 0, 0, 0, 0, 0, | 7 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 0 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 5 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", - "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", - "+----------------------+---------+----------------------+\n", - "(Showing first 10 rows)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "images_df.show(10)" - ] - }, - { - "cell_type": "markdown", - "id": "cd0d2664-12d8-4964-85cd-a67f8fee1384", - "metadata": {}, - "source": [ - "Great, but we can do one better - let's convert these two-dimensional arrays into Images. Computers speak in pixels and arrays, but humans do much better with visual patterns!\n", - "\n", - "To do this, we can leverage the `.apply` expression method. Similar to the `.as_py` method, this allows us to run a single function on all rows of a given column, but provides us with more flexibility as it takes as input any arbitrary function." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e585303a-7c83-4a31-afbb-461c951481f7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from PIL import Image\n", - "\n", - "images_df = images_df.with_column(\"pil_image\", col(\"image_2d\").apply(lambda arr: Image.fromarray(arr.astype(np.uint8)), return_dtype=DataType.python()))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "59b655ed-13aa-4764-acd4-a00beb91ec2f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
image
List[Int64]
label
Int64
image_2d
Python
pil_image
Python
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 0<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
\n", - " (Showing first 10 rows)\n", - "
" - ], - "text/plain": [ - "+----------------------+---------+----------------------+------------------+\n", - "| image | label | image_2d | pil_image |\n", - "| List[Int64] | Int64 | Python | Python |\n", - "+======================+=========+======================+==================+\n", - "| [0, 0, 0, 0, 0, 0, | 7 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 0 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 5 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | |\n", - "+----------------------+---------+----------------------+------------------+\n", - "(Showing first 10 rows)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "images_df.show(10)" - ] - }, - { - "cell_type": "markdown", - "id": "e6b633f4-3d9d-4c25-9075-bc815d8e357f", - "metadata": {}, - "source": [ - "Amazing! This looks great and we can finally get some idea of what the dataset truly looks like." - ] - }, - { - "cell_type": "markdown", - "id": "cd7e6774-9fb7-4827-a324-c116c8c812e1", - "metadata": {}, - "source": [ - "## Running a model with UDFs\n", - "\n", - "Next, let's try to run a deep learning model to classify each image. Models are expensive to initialize and load, so we want to do this as few times as possible, and share a model across multiple invocations.\n", - "\n", - "For the convenience of this quickstart tutorial, we pre-trained a model using a PyTorch-provided example script and saved the trained weights at https://github.com/Eventual-Inc/mnist-json/raw/master/mnist_cnn.pt. 
We need to define the same deep learning model \"scaffold\" as the trained model that we want to load (this part is all PyTorch and is not specific at all to DaFt)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "5ff43066-8a42-4773-974f-160ca4a9bc49", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "###\n", - "# Model was trained using a script provided in PyTorch Examples: https://github.com/pytorch/examples/blob/main/mnist/main.py\n", - "###\n", - "\n", - "from __future__ import print_function\n", - "import argparse\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torch.optim as optim\n", - "import torch.hub\n", - "from torchvision import datasets, transforms\n", - "from torch.optim.lr_scheduler import StepLR\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super(Net, self).__init__()\n", - " self.conv1 = nn.Conv2d(1, 32, 3, 1)\n", - " self.conv2 = nn.Conv2d(32, 64, 3, 1)\n", - " self.dropout1 = nn.Dropout(0.25)\n", - " self.dropout2 = nn.Dropout(0.5)\n", - " self.fc1 = nn.Linear(9216, 128)\n", - " self.fc2 = nn.Linear(128, 10)\n", - "\n", - " def forward(self, x):\n", - " x = self.conv1(x)\n", - " x = F.relu(x)\n", - " x = self.conv2(x)\n", - " x = F.relu(x)\n", - " x = F.max_pool2d(x, 2)\n", - " x = self.dropout1(x)\n", - " x = torch.flatten(x, 1)\n", - " x = self.fc1(x)\n", - " x = F.relu(x)\n", - " x = self.dropout2(x)\n", - " x = self.fc2(x)\n", - " output = F.log_softmax(x, dim=1)\n", - " return output" - ] - }, - { - "cell_type": "markdown", - "id": "266c1cf8-bf9a-4990-8182-97b072f15b57", - "metadata": {}, - "source": [ - "Now comes the fun part - we can define a UDF using the `@udf` decorator. Notice that for a batch of data we only initialize our model once!" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "fda097ea-4946-483c-bcc0-5271e0b033c3", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "@udf(return_dtype=DataType.int64())\n", - "class ClassifyImages:\n", - " \n", - " def __init__(self):\n", - " # Perform expensive initializations - create the model, download model weights and load up the model with weights\n", - " self.model = Net()\n", - " state_dict = torch.hub.load_state_dict_from_url(\"https://github.com/Eventual-Inc/mnist-json/raw/master/mnist_cnn.pt\")\n", - " self.model.load_state_dict(state_dict)\n", - " \n", - " def __call__(self, images_2d_col):\n", - " images_arr = np.array(images_2d_col.to_pylist())\n", - " normalized_image_2d = images_arr / 255\n", - " normalized_image_2d = normalized_image_2d[:, np.newaxis, :, :]\n", - " classifications = self.model(torch.from_numpy(normalized_image_2d).float())\n", - " return classifications.detach().numpy().argmax(axis=1)" - ] - }, - { - "cell_type": "markdown", - "id": "3605d3a6-f9ce-4e81-9e0f-5190f981bbd4", - "metadata": {}, - "source": [ - "Using this UDF is really easy, we simply run it on the columns that we want to process:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "4f9fd9f8-a231-44fb-a519-0288f670a34a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
image
List[Int64]
label
Int64
image_2d
Python
pil_image
Python
model_classification
Int64
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 7
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 0<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 6
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 9
\n", - " (Showing first 10 rows)\n", - "
" - ], - "text/plain": [ - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| image | label | image_2d | pil_image | model_classification |\n", - "| List[Int64] | Int64 | Python | Python | Int64 |\n", - "+======================+=========+======================+==================+========================+\n", - "| [0, 0, 0, 0, 0, 0, | 7 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 0 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 5 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "(Showing first 10 rows)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "classified_images_df = images_df.with_column(\"model_classification\", ClassifyImages(col(\"image_2d\")))\n", - "\n", - "classified_images_df.show(10)" - ] - }, - { - "cell_type": "markdown", - "id": "2e6fb5fc-957d-414b-bfac-961ea64dad68", - "metadata": {}, - "source": [ - "Our model ran successfully, and produced a new classification column. These look pretty good - let's filter our Dataframe to show only rows that the model predicted wrongly." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "69344d63-7db4-496f-a0b2-949dfd947e4f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
image
List[Int64]
label
Int64
image_2d
Python
pil_image
Python
model_classification
Int64
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 7
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 3<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 8
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 8
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 8
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 6<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 5
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 8
\n", - " (Showing first 10 rows)\n", - "
" - ], - "text/plain": [ - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| image | label | image_2d | pil_image | model_classification |\n", - "| List[Int64] | Int64 | Python | Python | Int64 |\n", - "+======================+=========+======================+==================+========================+\n", - "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 3 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 7 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 5 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 6 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | | |\n", - "+----------------------+---------+----------------------+------------------+------------------------+\n", - "(Showing first 10 rows)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "classified_images_df.where(col(\"label\") != col(\"model_classification\")).show(10)" - ] - }, - { - "cell_type": "markdown", - "id": "bb7ca72b-0743-451d-a3bf-e492a73ad7d6", - "metadata": {}, - "source": [ - "Some of these look hard indeed, even for a human!" - ] - }, - { - "cell_type": "markdown", - "id": "5482e99e-cf3a-4d54-93e3-6e468db03eef", - "metadata": {}, - "source": [ - "## Analytics\n", - "\n", - "We just managed to run our model, but how well did it actually do? Dataframes expose a powerful set of operations in Groupbys/Aggregations to help us report on aggregates of our data.\n", - "\n", - "Let's group our data by the true labels and calculate how many mistakes our model made per label." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8b60eef9-eeab-435e-9f5d-c775af9afe3f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
label
Int64
num_rows
UInt64
correct
Int64
wrong
Int64
0 980 957 23
1 1135 1123 12
2 1032 996 36
3 1010 965 45
4 982 951 31
5 892 830 62
6 958 925 33
7 1028 971 57
\n", - " (Showing first 8 rows)\n", - "
" - ], - "text/plain": [ - "+---------+------------+-----------+---------+\n", - "| label | num_rows | correct | wrong |\n", - "| Int64 | UInt64 | Int64 | Int64 |\n", - "+=========+============+===========+=========+\n", - "| 0 | 980 | 957 | 23 |\n", - "+---------+------------+-----------+---------+\n", - "| 1 | 1135 | 1123 | 12 |\n", - "+---------+------------+-----------+---------+\n", - "| 2 | 1032 | 996 | 36 |\n", - "+---------+------------+-----------+---------+\n", - "| 3 | 1010 | 965 | 45 |\n", - "+---------+------------+-----------+---------+\n", - "| 4 | 982 | 951 | 31 |\n", - "+---------+------------+-----------+---------+\n", - "| 5 | 892 | 830 | 62 |\n", - "+---------+------------+-----------+---------+\n", - "| 6 | 958 | 925 | 33 |\n", - "+---------+------------+-----------+---------+\n", - "| 7 | 1028 | 971 | 57 |\n", - "+---------+------------+-----------+---------+\n", - "(Showing first 8 rows)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analysis_df = classified_images_df \\\n", - " .with_column(\"correct\", (col(\"model_classification\") == col(\"label\")).cast(DataType.int64())) \\\n", - " .with_column(\"wrong\", (col(\"model_classification\") != col(\"label\")).cast(DataType.int64())) \\\n", - " .groupby(col(\"label\")) \\\n", - " .agg(\n", - " col(\"label\").count().alias(\"num_rows\"),\n", - " col(\"correct\").sum(),\n", - " col(\"wrong\").sum(),\n", - " ) \\\n", - " .sort(col(\"label\"))\n", - "\n", - "analysis_df.show()" - ] - }, - { - "cell_type": "markdown", - "id": "05f7df20-6dbc-4115-9acf-8d863cac93af", - "metadata": {}, - "source": [ - "Pretty impressive, given that the model only actually trained for one epoch!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4acf0191-8bb2-4c50-9d19-7a6bc97840d2", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "vscode": { - "interpreter": { - "hash": "e5d77f7bd5a748e4f6412a25f9708ab7af36936de941fc795d1a6b75eb2da082" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d1b56860-db41-4829-b395-176e11987cdc", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install getdaft\n", + "%pip install Pillow torch torchvision" + ] + }, + { + "cell_type": "markdown", + "id": "c571e01d", + "metadata": {}, + "source": [ + "```{hint}\n", + "✨✨✨ **Run this notebook on Google Colab** ✨✨✨\n", + "\n", + "You can [run this notebook yourself with Google Colab](https://colab.research.google.com/github/Eventual-Inc/Daft/blob/main/tutorials/mnist.ipynb)!\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "9b14abf5-a183-4bfb-9b15-a9a54b744fce", + "metadata": {}, + "source": [ + "# MNIST Daft Tutorial\n", + "\n", + "The MNIST Dataset is a \"large database of handwritten digits that is commonly used for training various image processing systems\"." + ] + }, + { + "cell_type": "markdown", + "id": "252b5128-99c2-49dd-b624-6e4b21275959", + "metadata": {}, + "source": [ + "## Loading Data\n", + "\n", + "This is a JSON file containing all the data for the MNIST test set. 
Let's load it up into a Daft Dataframe!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fc63a3ad-0e0a-4ab3-9cc0-cbec8bdd0632", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-04-21 11:44:02.554 | INFO | daft.context:runner:88 - Using PyRunner\n" + ] + } + ], + "source": [ + "import daft\n", + "from daft import DataType, col, udf\n", + "\n", + "URL = \"https://github.com/Eventual-Inc/mnist-json/raw/master/mnist_handwritten_test.json.gz\"\n", + "images_df = daft.read_json(URL)" + ] + }, + { + "cell_type": "markdown", + "id": "d52f6032-6619-4682-8305-2ed65bdc194c", + "metadata": {}, + "source": [ + "To peek at the dataset, simply have your notebook display the images_df that was just created." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "73a71adf-3b2e-4ec5-a0d2-34ad8eec734c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + "\n", + "\n", + "\n", + "
image
List[Int64]
label
Int64
\n", + " (No data to display: Dataframe not materialized)\n", + "
" + ], + "text/plain": [ + "+---------------+---------+\n", + "| image | label |\n", + "| List[Int64] | Int64 |\n", + "+===============+=========+\n", + "+---------------+---------+\n", + "(No data to display: Dataframe not materialized)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "images_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4787caab-d7d1-4fd4-9a76-ffb08a404a31", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
image
List[Int64]
label
Int64
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9
\n", + " (Showing first 10 rows)\n", + "
" + ], + "text/plain": [ + "+----------------------+---------+\n", + "| image | label |\n", + "| List[Int64] | Int64 |\n", + "+======================+=========+\n", + "| [0, 0, 0, 0, 0, 0, | 7 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 2 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 1 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 1 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 5 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | |\n", + "| 0, 0, 0, 0, 0, 0,... | |\n", + "+----------------------+---------+\n", + "(Showing first 10 rows)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "images_df.show(10)" + ] + }, + { + "cell_type": "markdown", + "id": "426f1bbb-e1c0-4fd6-b84e-cbb1ab309ff9", + "metadata": {}, + "source": [ + "You just loaded your first DaFt Dataframe! It consists of two columns:\n", + "1. The \"image\" column is a Python column of type `list` - where it looks like each row contains a list of digits representing the pixels of each image\n", + "2. The \"label\" column is an Integer column, consisting of just the label of that image." + ] + }, + { + "cell_type": "markdown", + "id": "9a7872e3-9860-4867-8a8c-61a69f69e334", + "metadata": {}, + "source": [ + "## Processing Columns with User-Defined Functions (UDF)\n", + "\n", + "It seems our JSON file has provided us with a one-dimensional array of pixels instead of two-dimensional images. We can easily modify data in this column by instructing Daft to run a method on every row in the column like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "af857589-b28a-4ee0-91cd-dc7a01ff4c07", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "images_df = images_df.with_column(\n", + " \"image_2d\",\n", + " col(\"image\").apply(lambda img: np.array(img).reshape(28, 28), return_dtype=DataType.python()),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d1212a7e-949a-4881-ba54-9d7e7eb31e6f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
image
List[Int64]
label
Int64
image_2d
Python
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 0<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5<np.ndarray
shape=(28, 28)
dtype=int64>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\n", + " (Showing first 10 rows)\n", + "
" + ], + "text/plain": [ + "+----------------------+---------+----------------------+\n", + "| image | label | image_2d |\n", + "| List[Int64] | Int64 | Python |\n", + "+======================+=========+======================+\n", + "| [0, 0, 0, 0, 0, 0, | 7 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 0 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 5 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0, 0, | | 0 0 0 0 0 |\n", + "| 0, 0, 0, 0, 0, 0,... | | 0 0 0 0 0... |\n", + "+----------------------+---------+----------------------+\n", + "(Showing first 10 rows)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "images_df.show(10)" + ] + }, + { + "cell_type": "markdown", + "id": "cd0d2664-12d8-4964-85cd-a67f8fee1384", + "metadata": {}, + "source": [ + "Great, but we can do one better - let's convert these two-dimensional arrays into Images. Computers speak in pixels and arrays, but humans do much better with visual patterns!\n", + "\n", + "To do this, we can leverage the `.apply` expression method. Similar to the `.as_py` method, this allows us to run a single function on all rows of a given column, but provides us with more flexibility as it takes as input any arbitrary function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e585303a-7c83-4a31-afbb-461c951481f7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "\n", + "images_df = images_df.with_column(\n", + " \"pil_image\",\n", + " col(\"image_2d\").apply(lambda arr: Image.fromarray(arr.astype(np.uint8)), return_dtype=DataType.python()),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "59b655ed-13aa-4764-acd4-a00beb91ec2f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
image
List[Int64]
label
Int64
image_2d
Python
pil_image
Python
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 0<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" />
\n", + " (Showing first 10 rows)\n", + "
" + ], + "text/plain": [ + "+----------------------+---------+----------------------+------------------+\n", + "| image | label | image_2d | pil_image |\n", + "| List[Int64] | Int64 | Python | Python |\n", + "+======================+=========+======================+==================+\n", + "| [0, 0, 0, 0, 0, 0, | 7 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 0 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 5 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | |\n", + "+----------------------+---------+----------------------+------------------+\n", + "(Showing first 10 rows)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "images_df.show(10)" + ] + }, + { + "cell_type": "markdown", + "id": "e6b633f4-3d9d-4c25-9075-bc815d8e357f", + "metadata": {}, + "source": [ + "Amazing! This looks great and we can finally get some idea of what the dataset truly looks like." + ] + }, + { + "cell_type": "markdown", + "id": "cd7e6774-9fb7-4827-a324-c116c8c812e1", + "metadata": {}, + "source": [ + "## Running a model with UDFs\n", + "\n", + "Next, let's try to run a deep learning model to classify each image. Models are expensive to initialize and load, so we want to do this as few times as possible, and share a model across multiple invocations.\n", + "\n", + "For the convenience of this quickstart tutorial, we pre-trained a model using a PyTorch-provided example script and saved the trained weights at https://github.com/Eventual-Inc/mnist-json/raw/master/mnist_cnn.pt. 
We need to define the same deep learning model \"scaffold\" as the trained model that we want to load (this part is all PyTorch and is not specific at all to DaFt)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5ff43066-8a42-4773-974f-160ca4a9bc49", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "###\n", + "# Model was trained using a script provided in PyTorch Examples: https://github.com/pytorch/examples/blob/main/mnist/main.py\n", + "###\n", + "\n", + "import torch\n", + "import torch.hub\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "\n", + "class Net(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.conv1 = nn.Conv2d(1, 32, 3, 1)\n", + " self.conv2 = nn.Conv2d(32, 64, 3, 1)\n", + " self.dropout1 = nn.Dropout(0.25)\n", + " self.dropout2 = nn.Dropout(0.5)\n", + " self.fc1 = nn.Linear(9216, 128)\n", + " self.fc2 = nn.Linear(128, 10)\n", + "\n", + " def forward(self, x):\n", + " x = self.conv1(x)\n", + " x = F.relu(x)\n", + " x = self.conv2(x)\n", + " x = F.relu(x)\n", + " x = F.max_pool2d(x, 2)\n", + " x = self.dropout1(x)\n", + " x = torch.flatten(x, 1)\n", + " x = self.fc1(x)\n", + " x = F.relu(x)\n", + " x = self.dropout2(x)\n", + " x = self.fc2(x)\n", + " output = F.log_softmax(x, dim=1)\n", + " return output" + ] + }, + { + "cell_type": "markdown", + "id": "266c1cf8-bf9a-4990-8182-97b072f15b57", + "metadata": {}, + "source": [ + "Now comes the fun part - we can define a UDF using the `@udf` decorator. Notice that for a batch of data we only initialize our model once!" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fda097ea-4946-483c-bcc0-5271e0b033c3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "@udf(return_dtype=DataType.int64())\n", + "class ClassifyImages:\n", + " def __init__(self):\n", + " # Perform expensive initializations - create the model, download model weights and load up the model with weights\n", + " self.model = Net()\n", + " state_dict = torch.hub.load_state_dict_from_url(\n", + " \"https://github.com/Eventual-Inc/mnist-json/raw/master/mnist_cnn.pt\"\n", + " )\n", + " self.model.load_state_dict(state_dict)\n", + "\n", + " def __call__(self, images_2d_col):\n", + " images_arr = np.array(images_2d_col.to_pylist())\n", + " normalized_image_2d = images_arr / 255\n", + " normalized_image_2d = normalized_image_2d[:, np.newaxis, :, :]\n", + " classifications = self.model(torch.from_numpy(normalized_image_2d).float())\n", + " return classifications.detach().numpy().argmax(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "3605d3a6-f9ce-4e81-9e0f-5190f981bbd4", + "metadata": {}, + "source": [ + "Using this UDF is really easy, we simply run it on the columns that we want to process:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4f9fd9f8-a231-44fb-a519-0288f670a34a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
image
List[Int64]
label
Int64
image_2d
Python
pil_image
Python
model_classification
Int64
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 7
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 0<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 1<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 4
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 6
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 9
\n", + " (Showing first 10 rows)\n", + "
" + ], + "text/plain": [ + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| image | label | image_2d | pil_image | model_classification |\n", + "| List[Int64] | Int64 | Python | Python | Int64 |\n", + "+======================+=========+======================+==================+========================+\n", + "| [0, 0, 0, 0, 0, 0, | 7 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 0 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 1 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 5 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "(Showing first 10 rows)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classified_images_df = images_df.with_column(\"model_classification\", ClassifyImages(col(\"image_2d\")))\n", + "\n", + "classified_images_df.show(10)" + ] + }, + { + "cell_type": "markdown", + "id": "2e6fb5fc-957d-414b-bfac-961ea64dad68", + "metadata": {}, + "source": [ + "Our model ran successfully, and produced a new classification column. These look pretty good - let's filter our Dataframe to show only rows that the model predicted wrongly." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "69344d63-7db4-496f-a0b2-949dfd947e4f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
image
List[Int64]
label
Int64
image_2d
Python
pil_image
Python
model_classification
Int64
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 7
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 3<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 9<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 8
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 8
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 7<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 5<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 8
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 6<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 5
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 4<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... 2<np.ndarray
shape=(28, 28)
dtype=int64>
\"<PIL.Image.Image\" /> 8
\n", + " (Showing first 10 rows)\n", + "
" + ], + "text/plain": [ + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| image | label | image_2d | pil_image | model_classification |\n", + "| List[Int64] | Int64 | Python | Python | Int64 |\n", + "+======================+=========+======================+==================+========================+\n", + "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 3 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 9 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 7 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 5 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 6 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 4 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "| [0, 0, 0, 0, 0, 0, | 2 | [[ 0 0 0 0 | | |\n", + "+----------------------+---------+----------------------+------------------+------------------------+\n", + "(Showing first 10 rows)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classified_images_df.where(col(\"label\") != col(\"model_classification\")).show(10)" + ] + }, + { + "cell_type": "markdown", + "id": "bb7ca72b-0743-451d-a3bf-e492a73ad7d6", + "metadata": {}, + "source": [ + "Some of these look hard indeed, even for a human!" + ] + }, + { + "cell_type": "markdown", + "id": "5482e99e-cf3a-4d54-93e3-6e468db03eef", + "metadata": {}, + "source": [ + "## Analytics\n", + "\n", + "We just managed to run our model, but how well did it actually do? Dataframes expose a powerful set of operations in Groupbys/Aggregations to help us report on aggregates of our data.\n", + "\n", + "Let's group our data by the true labels and calculate how many mistakes our model made per label." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8b60eef9-eeab-435e-9f5d-c775af9afe3f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
label
Int64
num_rows
UInt64
correct
Int64
wrong
Int64
0 980 957 23
1 1135 1123 12
2 1032 996 36
3 1010 965 45
4 982 951 31
5 892 830 62
6 958 925 33
7 1028 971 57
\n", + " (Showing first 8 rows)\n", + "
" + ], + "text/plain": [ + "+---------+------------+-----------+---------+\n", + "| label | num_rows | correct | wrong |\n", + "| Int64 | UInt64 | Int64 | Int64 |\n", + "+=========+============+===========+=========+\n", + "| 0 | 980 | 957 | 23 |\n", + "+---------+------------+-----------+---------+\n", + "| 1 | 1135 | 1123 | 12 |\n", + "+---------+------------+-----------+---------+\n", + "| 2 | 1032 | 996 | 36 |\n", + "+---------+------------+-----------+---------+\n", + "| 3 | 1010 | 965 | 45 |\n", + "+---------+------------+-----------+---------+\n", + "| 4 | 982 | 951 | 31 |\n", + "+---------+------------+-----------+---------+\n", + "| 5 | 892 | 830 | 62 |\n", + "+---------+------------+-----------+---------+\n", + "| 6 | 958 | 925 | 33 |\n", + "+---------+------------+-----------+---------+\n", + "| 7 | 1028 | 971 | 57 |\n", + "+---------+------------+-----------+---------+\n", + "(Showing first 8 rows)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analysis_df = (\n", + " classified_images_df.with_column(\"correct\", (col(\"model_classification\") == col(\"label\")).cast(DataType.int64()))\n", + " .with_column(\"wrong\", (col(\"model_classification\") != col(\"label\")).cast(DataType.int64()))\n", + " .groupby(col(\"label\"))\n", + " .agg(\n", + " col(\"label\").count().alias(\"num_rows\"),\n", + " col(\"correct\").sum(),\n", + " col(\"wrong\").sum(),\n", + " )\n", + " .sort(col(\"label\"))\n", + ")\n", + "\n", + "analysis_df.show()" + ] + }, + { + "cell_type": "markdown", + "id": "05f7df20-6dbc-4115-9acf-8d863cac93af", + "metadata": {}, + "source": [ + "Pretty impressive, given that the model only actually trained for one epoch!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4acf0191-8bb2-4c50-9d19-7a6bc97840d2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "vscode": { + "interpreter": { + "hash": "e5d77f7bd5a748e4f6412a25f9708ab7af36936de941fc795d1a6b75eb2da082" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/talks_and_demos/data-ai-summit-2024.ipynb b/tutorials/talks_and_demos/data-ai-summit-2024.ipynb index 9c4ee92375..4ba9d9555c 100644 --- a/tutorials/talks_and_demos/data-ai-summit-2024.ipynb +++ b/tutorials/talks_and_demos/data-ai-summit-2024.ipynb @@ -44,6 +44,7 @@ "# Skip this notebook execution in CI because it requires AWS credentials for presigned URL generation\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -388,21 +389,19 @@ }, "outputs": [], "source": [ - "import base64\n", - "import requests\n", "import json\n", "import os\n", + "\n", "import boto3\n", + "import requests\n", "\n", "DEFAULT_PROMPT = \"What’s in this image?\"\n", "api_key = os.getenv(\"OPENAI_API_KEY\")\n", "if api_key is None:\n", " raise RuntimeError(\"Please specify your OpenAI API key as the environment variable `OPENAI_API_KEY`.\")\n", "\n", - "headers = {\n", - " \"Content-Type\": \"application/json\",\n", - " \"Authorization\": f\"Bearer {api_key}\"\n", - "}\n", + "headers = {\"Content-Type\": \"application/json\", \"Authorization\": f\"Bearer {api_key}\"}\n", + "\n", 
"\n", "@daft.udf(return_dtype=daft.DataType.string())\n", "def generate_presigned_url(s3_urls, expires_in=3600):\n", @@ -419,37 +418,30 @@ " presigned_urls.append(url)\n", " return presigned_urls\n", "\n", + "\n", "@daft.udf(return_dtype=daft.DataType.string())\n", "def run_gpt4o_on_urls(images_urls, prompt=DEFAULT_PROMPT):\n", " \"\"\"Run the gpt-4o LLM by making an API call to OpenAI\"\"\"\n", " results = []\n", " for url in images_urls.to_pylist():\n", " payload = {\n", - " \"model\": \"gpt-4o\",\n", - " \"messages\": [\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": [\n", + " \"model\": \"gpt-4o\",\n", + " \"messages\": [\n", " {\n", - " \"type\": \"text\",\n", - " \"text\": \"What’s in this image?\"\n", - " },\n", - " {\n", - " \"type\": \"image_url\",\n", - " \"image_url\": {\n", - " \"url\": url\n", - " }\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n", + " {\"type\": \"image_url\", \"image_url\": {\"url\": url}},\n", + " ],\n", " }\n", - " ]\n", - " }\n", - " ],\n", - " \"max_tokens\": 300\n", + " ],\n", + " \"max_tokens\": 300,\n", " }\n", "\n", " response = requests.post(\"https://api.openai.com/v1/chat/completions\", headers=headers, json=payload)\n", " results.append(json.dumps(response.json()))\n", "\n", - " return results\n" + " return results" ] }, { @@ -654,9 +646,9 @@ "metadata": {}, "outputs": [], "source": [ - "read_df = read_df \\\n", - " .with_column(\"image_thumbnail\", daft.col(\"image_thumbnail\").image.decode()) \\\n", - " .where(read_df[\"description\"].str.contains(\"dog\"))" + "read_df = read_df.with_column(\"image_thumbnail\", daft.col(\"image_thumbnail\").image.decode()).where(\n", + " read_df[\"description\"].str.contains(\"dog\")\n", + ")" ] }, { diff --git a/tutorials/talks_and_demos/iceberg_summit_2024.ipynb b/tutorials/talks_and_demos/iceberg_summit_2024.ipynb index a85481877e..176c684567 100644 --- a/tutorials/talks_and_demos/iceberg_summit_2024.ipynb +++ b/tutorials/talks_and_demos/iceberg_summit_2024.ipynb @@ -48,6 +48,7 @@ "# Skip this notebook execution in CI because it hits non-public buckets\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -124,7 +125,6 @@ "metadata": {}, "outputs": [], "source": [ - "import pyiceberg\n", "from pyiceberg.catalog.sql import SqlCatalog\n", "\n", "warehouse_path = \"/tmp/warehouse\"\n", @@ -174,7 +174,7 @@ "metadata": {}, "outputs": [], "source": [ - "from pyiceberg.schema import Schema, NestedField, StringType, IntegerType\n", + "from pyiceberg.schema import IntegerType, NestedField, Schema, StringType\n", "\n", "SCHEMA = Schema(\n", " NestedField(1, \"name\", StringType(), required=False),\n", @@ -213,10 +213,12 @@ "source": [ "import daft\n", "\n", - "df = daft.from_pydict({\n", - " \"name\": [\"jay\", \"sammy\", \"brian\"],\n", - " \"age\": [30, 31, 32],\n", - "})\n", + "df = daft.from_pydict(\n", + " {\n", + " \"name\": [\"jay\", \"sammy\", \"brian\"],\n", + " \"age\": [30, 31, 32],\n", + " }\n", + ")\n", "\n", "df" ] @@ -458,6 +460,7 @@ "outputs": [], "source": [ "import datetime\n", + "\n", "from pyiceberg.catalog.glue import GlueCatalog\n", "\n", "catalog = GlueCatalog(\"my_glue_catalog\")\n", diff --git a/tutorials/talks_and_demos/linkedin-03-05-2024.ipynb b/tutorials/talks_and_demos/linkedin-03-05-2024.ipynb index 3440a7e4e5..16a758519d 100644 --- a/tutorials/talks_and_demos/linkedin-03-05-2024.ipynb +++ b/tutorials/talks_and_demos/linkedin-03-05-2024.ipynb @@ -43,6 +43,7 @@ "# Skip this notebook execution in CI 
because it hits non-public data in AWS\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -105,7 +106,9 @@ "source": [ "### DeltaLake\n", "\n", - "delta_df = daft.read_deltalake(\"s3://daft-public-data/nyc-taxi-dataset-2023-jan-deltalake/\", io_config=ANONYMOUS_IO_CONFIG)\n", + "delta_df = daft.read_deltalake(\n", + " \"s3://daft-public-data/nyc-taxi-dataset-2023-jan-deltalake/\", io_config=ANONYMOUS_IO_CONFIG\n", + ")\n", "delta_df.show()" ] }, @@ -117,7 +120,7 @@ "source": [ "### Daft also supports reading from many other file sources:\n", "# df = daft.read_csv(...)\n", - "# df = daft.read_parquet(...) \n", + "# df = daft.read_parquet(...)\n", "# df = daft.read_json(...)\n", "\n", "### Read from SQL Databases\n", @@ -194,7 +197,7 @@ "outputs": [], "source": [ "laion_df = laion_df.with_column(\"data\", laion_df[\"path\"].url.download()) # Utf8 -> Binary\n", - "laion_df = laion_df.with_column(\"image\", laion_df[\"data\"].image.decode()) # Binary -> Image" + "laion_df = laion_df.with_column(\"image\", laion_df[\"data\"].image.decode()) # Binary -> Image" ] }, { diff --git a/tutorials/talks_and_demos/pydata_global_2023.ipynb b/tutorials/talks_and_demos/pydata_global_2023.ipynb index 1376774270..a162b0d959 100644 --- a/tutorials/talks_and_demos/pydata_global_2023.ipynb +++ b/tutorials/talks_and_demos/pydata_global_2023.ipynb @@ -22,6 +22,7 @@ "# Skip this notebook execution in CI because it hits non-public buckets\n", "if CI:\n", " import sys\n", + "\n", " sys.exit()" ] }, @@ -147,6 +148,7 @@ "%%time\n", "\n", "import boto3\n", + "\n", "client = boto3.client(\"s3\")\n", "kwargs = {\"Bucket\": \"daft-public-datasets\", \"Prefix\": \"tpch-lineitem/10k-1mb-csv-files\"}\n", "response = client.list_objects_v2(**kwargs)\n", @@ -159,7 +161,7 @@ " response = client.list_objects_v2(**kwargs)\n", " data.extend(response[\"Contents\"])\n", " token = response.get(\"NextContinuationToken\")\n", - " \n", + "\n", "print(f\"Retrieved {len(data)} results.\")" ] }, @@ -258,10 +260,13 @@ "metadata": {}, "outputs": [], "source": [ - "import daft\n", "import os\n", "\n", + "import daft\n", + "\n", "PARQUET_FOLDER = \"s3://eventual-dev-benchmarking-fixtures/uncompressed-smaller-rg/tpch-dbgen/1000_0/512/parquet/\"\n", + "\n", + "\n", "def get_df(table_name: str) -> daft.DataFrame:\n", " return daft.read_parquet(os.path.join(PARQUET_FOLDER, table_name, \"*.parquet\"))" ] @@ -272,9 +277,10 @@ "metadata": {}, "outputs": [], "source": [ - "from daft import col\n", "import datetime\n", "\n", + "from daft import col\n", + "\n", "lineitem = get_df(\"lineitem\")\n", "\n", "discounted_price = col(\"L_EXTENDEDPRICE\") * (1 - col(\"L_DISCOUNT\"))\n", diff --git a/tutorials/text_to_image/text_to_image_generation.ipynb b/tutorials/text_to_image/text_to_image_generation.ipynb index a8316c3e9d..d0e604028d 100644 --- a/tutorials/text_to_image/text_to_image_generation.ipynb +++ b/tutorials/text_to_image/text_to_image_generation.ipynb @@ -1,318 +1,321 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "019805d9-4e9f-4306-8f18-a565cb1e8845", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "019805d9-4e9f-4306-8f18-a565cb1e8845", - "outputId": "f48e4a66-21cd-4b93-e8cb-261ae8c8aec8" - }, - "outputs": [], - "source": [ - "!pip install getdaft --pre --extra-index-url https://pypi.anaconda.org/daft-nightly/simple\n", - "!pip install min-dalle torch Pillow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9536868c", - 
"metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "CI = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ff9d08a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import daft\n", - "\n", - "# Flip this flag if you want to see the performance of running on CPU vs GPU\n", - "USE_GPU = False if CI else True\n", - "IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True, region_name=\"us-west-2\")) # Use anonymous-mode for accessing AWS S3\n", - "PARQUET_PATH = \"s3://daft-public-data/tutorials/laion-parquet/train-00000-of-00001-6f24a7497df494ae.parquet\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "d5a31f06", - "metadata": {}, - "source": [ - "```{hint}\n", - "✨✨✨ **Run this notebook on Google Colab** ✨✨✨\n", - "\n", - "You can [run this notebook yourself with Google Colab](https://colab.research.google.com/github/Eventual-Inc/Daft/blob/main/tutorials/text_to_image/text_to_image_generation.ipynb)!\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "78db424a-96b5-46f3-bd32-484f5c6b92a3", - "metadata": { - "id": "78db424a-96b5-46f3-bd32-484f5c6b92a3" - }, - "source": [ - "# Generating Images from Text with DALL-E\n", - "\n", - "In this tutorial, we will be using the DALL-E model to generate images from text. We will explore how to use GPUs with Daft to accelerate computations.\n", - "\n", - "To run this tutorial:\n", - "\n", - "1. You will need access to a GPU. If you are on Google Colab, you may switch to a GPU runtime by going to the menu `Runtime -> Change runtime type -> Hardware accelerator -> GPU -> Save`.\n", - "\n", - "Let's get started!" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "4da65a96-e4fe-4795-92d0-a5e631b58e33", - "metadata": { - "id": "4da65a96-e4fe-4795-92d0-a5e631b58e33" - }, - "source": [ - "## Setting Up\n", - "\n", - "First, let's load a Parquet file into Daft. This particular file is hosted in HuggingFace at a https URL." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "806451f8-68af-462a-af7b-ff5480425a3a", - "metadata": { - "id": "806451f8-68af-462a-af7b-ff5480425a3a", - "tags": [] - }, - "outputs": [], - "source": [ - "import daft\n", - "daft.context.set_runner_py(use_thread_pool=False)\n", - "\n", - "parquet_df = daft.read_parquet(PARQUET_PATH, io_config=IO_CONFIG)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "a1e20f90", - "metadata": {}, - "source": [ - "Let's go ahead and `.collect()` this DataFrame. This will download the Parquet file and materialize the data in memory so that all our subsequent operations will be cached!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1e3b619-beaf-465e-83f2-5ab71638dcc1", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 544 - }, - "id": "e1e3b619-beaf-465e-83f2-5ab71638dcc1", - "outputId": "e52133d2-5694-49a0-e385-758cf5b1b203", - "tags": [] - }, - "outputs": [], - "source": [ - "parquet_df.collect()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b257cd91-db90-4803-afd9-9fdf571cf755", - "metadata": { - "id": "b257cd91-db90-4803-afd9-9fdf571cf755", - "tags": [] - }, - "outputs": [], - "source": [ - "parquet_df = parquet_df.select(parquet_df[\"URL\"], parquet_df[\"TEXT\"], parquet_df[\"AESTHETIC_SCORE\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f28047df-bf05-47df-b4d4-3507a8f7d2ac", - "metadata": { - "id": "f28047df-bf05-47df-b4d4-3507a8f7d2ac" - }, - "source": [ - "## Downloading Images\n", - "\n", - "Like many datasets, instead of storing the actual images in the dataset's files it looks like the Dataset authors have instead opted to store a URL to the image.\n", - "\n", - "Let's use Daft's builtin functionality to download the images and open them as PIL Images - all in just a few lines of code!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1e5cd84-4526-4a91-9fd5-f4e78f35965d", - "metadata": { - "id": "f1e5cd84-4526-4a91-9fd5-f4e78f35965d", - "tags": [] - }, - "outputs": [], - "source": [ - "# Filter for images with longer descriptions\n", - "parquet_df_with_long_strings = parquet_df.where(parquet_df[\"TEXT\"].str.length() > 50)\n", - "\n", - "# Download images\n", - "images_df = parquet_df_with_long_strings.with_column(\n", - " \"image\",\n", - " parquet_df[\"URL\"].url.download().image.decode(),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1361728-8b1a-4e6e-9632-ddd17cad948b", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 802 - }, - "id": "c1361728-8b1a-4e6e-9632-ddd17cad948b", - "outputId": "1c2ce3a4-63a1-4f77-ce2e-e3ecea2a3e1f", - "tags": [] - }, - "outputs": [], - "source": [ - "images_df.show(5)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6e6f59ee", - "metadata": {}, - "source": [ - "Great! Now we have a pretty good idea of what our dataset looks like." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "gCTmONUl81Vw", - "metadata": { - "id": "gCTmONUl81Vw" - }, - "source": [ - "# Running the Mini DALL-E model on a GPU using Daft UDFs\n", - "\n", - "Let's now run the Mini DALL-E model over the `\"TEXT\"` column, and generate images for those texts!\n", - "\n", - "Using GPUs with Daft UDFs is simple. Just specify `num_gpus=N`, where `N` is the number of GPUs that your UDF is going to use." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b500e7f5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import torch\n", - "from min_dalle import MinDalle\n", - "\n", - "from daft import ResourceRequest\n", - "\n", - "\n", - "@daft.udf(return_dtype=daft.DataType.python())\n", - "class GenerateImageFromText:\n", - " def __init__(self):\n", - " self.model = MinDalle(\n", - " models_root='./pretrained',\n", - " dtype=torch.float32,\n", - " # Tell the min-dalle library to load model on GPU or GPU\n", - " device=\"cuda\" if USE_GPU else \"cpu\",\n", - " is_mega=False, \n", - " is_reusable=True\n", - " )\n", - " \n", - " def __call__(self, text_col):\n", - " return [\n", - " self.model.generate_image(\n", - " t,\n", - " seed=-1,\n", - " grid_size=1,\n", - " is_seamless=False,\n", - " temperature=1,\n", - " top_k=256,\n", - " supercondition_factor=32,\n", - " ) for t in text_col.to_pylist()\n", - " ]\n", - "\n", - "if USE_GPU:\n", - " GenerateImageFromText = GenerateImageFromText.override_options(num_gpus=1)\n", - "\n", - "images_df.with_column(\n", - " \"generated_image\",\n", - " GenerateImageFromText(images_df[\"TEXT\"]),\n", - ").show(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5764ae83-9100-47c7-95d3-4880d9f1fe7c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "vscode": { - "interpreter": { - "hash": "e5d77f7bd5a748e4f6412a25f9708ab7af36936de941fc795d1a6b75eb2da082" - } - } + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "019805d9-4e9f-4306-8f18-a565cb1e8845", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, - "nbformat": 4, - "nbformat_minor": 5 + "id": "019805d9-4e9f-4306-8f18-a565cb1e8845", + "outputId": "f48e4a66-21cd-4b93-e8cb-261ae8c8aec8" + }, + "outputs": [], + "source": [ + "!pip install getdaft --pre --extra-index-url https://pypi.anaconda.org/daft-nightly/simple\n", + "!pip install min-dalle torch Pillow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9536868c", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "CI = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ff9d08a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import daft\n", + "\n", + "# Flip this flag if you want to see the performance of running on CPU vs GPU\n", + "USE_GPU = False if CI else True\n", + "IO_CONFIG = daft.io.IOConfig(\n", + " s3=daft.io.S3Config(anonymous=True, region_name=\"us-west-2\")\n", + ") # Use anonymous-mode for accessing AWS S3\n", + "PARQUET_PATH = \"s3://daft-public-data/tutorials/laion-parquet/train-00000-of-00001-6f24a7497df494ae.parquet\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d5a31f06", + "metadata": {}, + "source": [ + "```{hint}\n", + "✨✨✨ **Run this notebook on Google Colab** ✨✨✨\n", + "\n", + "You can [run this notebook yourself with Google 
Colab](https://colab.research.google.com/github/Eventual-Inc/Daft/blob/main/tutorials/text_to_image/text_to_image_generation.ipynb)!\n",
+    "```"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "78db424a-96b5-46f3-bd32-484f5c6b92a3",
+   "metadata": {
+    "id": "78db424a-96b5-46f3-bd32-484f5c6b92a3"
+   },
+   "source": [
+    "# Generating Images from Text with DALL-E\n",
+    "\n",
+    "In this tutorial, we will be using the DALL-E model to generate images from text. We will explore how to use GPUs with Daft to accelerate computations.\n",
+    "\n",
+    "To run this tutorial:\n",
+    "\n",
+    "1. You will need access to a GPU. If you are on Google Colab, you may switch to a GPU runtime by going to the menu `Runtime -> Change runtime type -> Hardware accelerator -> GPU -> Save`.\n",
+    "\n",
+    "Let's get started!"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "4da65a96-e4fe-4795-92d0-a5e631b58e33",
+   "metadata": {
+    "id": "4da65a96-e4fe-4795-92d0-a5e631b58e33"
+   },
+   "source": [
+    "## Setting Up\n",
+    "\n",
+    "First, let's load a Parquet file into Daft. This particular file is hosted on Hugging Face at an https URL."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "806451f8-68af-462a-af7b-ff5480425a3a",
+   "metadata": {
+    "id": "806451f8-68af-462a-af7b-ff5480425a3a",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import daft\n",
+    "\n",
+    "daft.context.set_runner_py(use_thread_pool=False)\n",
+    "\n",
+    "parquet_df = daft.read_parquet(PARQUET_PATH, io_config=IO_CONFIG)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "a1e20f90",
+   "metadata": {},
+   "source": [
+    "Let's go ahead and `.collect()` this DataFrame. This will download the Parquet file and materialize the data in memory so that all our subsequent operations will be cached!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1e3b619-beaf-465e-83f2-5ab71638dcc1",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 544
+    },
+    "id": "e1e3b619-beaf-465e-83f2-5ab71638dcc1",
+    "outputId": "e52133d2-5694-49a0-e385-758cf5b1b203",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "parquet_df.collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b257cd91-db90-4803-afd9-9fdf571cf755",
+   "metadata": {
+    "id": "b257cd91-db90-4803-afd9-9fdf571cf755",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "parquet_df = parquet_df.select(parquet_df[\"URL\"], parquet_df[\"TEXT\"], parquet_df[\"AESTHETIC_SCORE\"])"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "f28047df-bf05-47df-b4d4-3507a8f7d2ac",
+   "metadata": {
+    "id": "f28047df-bf05-47df-b4d4-3507a8f7d2ac"
+   },
+   "source": [
+    "## Downloading Images\n",
+    "\n",
+    "Like many datasets, rather than storing the actual images in the dataset's files, the dataset authors have opted to store a URL to each image.\n",
+    "\n",
+    "Let's use Daft's built-in functionality to download the images and open them as PIL Images - all in just a few lines of code!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1e5cd84-4526-4a91-9fd5-f4e78f35965d",
+   "metadata": {
+    "id": "f1e5cd84-4526-4a91-9fd5-f4e78f35965d",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Filter for images with longer descriptions\n",
+    "parquet_df_with_long_strings = parquet_df.where(parquet_df[\"TEXT\"].str.length() > 50)\n",
+    "\n",
+    "# Download images\n",
+    "images_df = parquet_df_with_long_strings.with_column(\n",
+    "    \"image\",\n",
+    "    parquet_df[\"URL\"].url.download().image.decode(),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1361728-8b1a-4e6e-9632-ddd17cad948b",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 802
+    },
+    "id": "c1361728-8b1a-4e6e-9632-ddd17cad948b",
+    "outputId": "1c2ce3a4-63a1-4f77-ce2e-e3ecea2a3e1f",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "images_df.show(5)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "6e6f59ee",
+   "metadata": {},
+   "source": [
+    "Great! Now we have a pretty good idea of what our dataset looks like."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "gCTmONUl81Vw",
+   "metadata": {
+    "id": "gCTmONUl81Vw"
+   },
+   "source": [
+    "# Running the Mini DALL-E model on a GPU using Daft UDFs\n",
+    "\n",
+    "Let's now run the Mini DALL-E model over the `\"TEXT\"` column and generate images for those texts!\n",
+    "\n",
+    "Using GPUs with Daft UDFs is simple. Just specify `num_gpus=N`, where `N` is the number of GPUs that your UDF is going to use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b500e7f5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from min_dalle import MinDalle\n",
+    "\n",
+    "\n",
+    "@daft.udf(return_dtype=daft.DataType.python())\n",
+    "class GenerateImageFromText:\n",
+    "    def __init__(self):\n",
+    "        self.model = MinDalle(\n",
+    "            models_root=\"./pretrained\",\n",
+    "            dtype=torch.float32,\n",
+    "            # Tell the min-dalle library to load the model on GPU or CPU\n",
+    "            device=\"cuda\" if USE_GPU else \"cpu\",\n",
+    "            is_mega=False,\n",
+    "            is_reusable=True,\n",
+    "        )\n",
+    "\n",
+    "    def __call__(self, text_col):\n",
+    "        return [\n",
+    "            self.model.generate_image(\n",
+    "                t,\n",
+    "                seed=-1,\n",
+    "                grid_size=1,\n",
+    "                is_seamless=False,\n",
+    "                temperature=1,\n",
+    "                top_k=256,\n",
+    "                supercondition_factor=32,\n",
+    "            )\n",
+    "            for t in text_col.to_pylist()\n",
+    "        ]\n",
+    "\n",
+    "\n",
+    "if USE_GPU:\n",
+    "    GenerateImageFromText = GenerateImageFromText.override_options(num_gpus=1)\n",
+    "\n",
+    "images_df.with_column(\n",
+    "    \"generated_image\",\n",
+    "    GenerateImageFromText(images_df[\"TEXT\"]),\n",
+    ").show(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5764ae83-9100-47c7-95d3-4880d9f1fe7c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "e5d77f7bd5a748e4f6412a25f9708ab7af36936de941fc795d1a6b75eb2da082"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
 }
diff --git 
a/tutorials/text_to_image/using_cloud_with_ray.ipynb b/tutorials/text_to_image/using_cloud_with_ray.ipynb index c14aa26be9..fc9c55e893 100644 --- a/tutorials/text_to_image/using_cloud_with_ray.ipynb +++ b/tutorials/text_to_image/using_cloud_with_ray.ipynb @@ -1,223 +1,225 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "1b8a5f16-3d51-4690-80b3-95c7899c5474", - "metadata": {}, - "source": [ - "# Using Ray for Scaling Up\n", - "\n", - "Daft's default PyRunner is great for experimentation on your laptop, but when it comes times to running much more computationally expensive jobs that need to take advantage of large scale parallelism, you can run Daft on a [Ray](https://www.ray.io/) cluster instead.\n", - "\n", - "## What is a Ray Cluster, and why do I need it?\n", - "\n", - "Ray is a framework that exposes a Python interface for running distributed computation over a cluster of machines. Daft is built to use Ray as a backend for running dataframe operations, allowing it to scale to huge amounts of data and computation.\n", - "\n", - "However even if you do not have a big cluster to use Ray, you can run Ray locally on your laptop (in which case it would spin up a Ray cluster of just a single machine: your laptop), and using Daft's Ray backend would allow Daft to fully utilize your machine's cores.\n", - "\n", - "## Let's get started!\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbe8f7c1-ae08-49b9-96c7-4b16cf48f479", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install getdaft[ray]\n", - "!pip install Pillow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28d1d421", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "CI = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb73a22b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import daft\n", - "\n", - "USE_RAY = False if CI else True\n", - "NUM_ROWS_LIMIT = 16 if CI else 160\n", - "IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True, region_name=\"us-west-2\")) # Use anonymous-mode for accessing AWS S3\n", - "PARQUET_URL = \"s3://daft-public-data/tutorials/laion-parquet/train-00000-of-00001-6f24a7497df494ae.parquet\"\n", - "\n", - "daft.set_planning_config(default_io_config=IO_CONFIG)" - ] - }, - { - "cell_type": "markdown", - "id": "8a3ef6f2-550b-4668-9d41-0417e98e22f7", - "metadata": {}, - "source": [ - "By default, Daft uses the \"Python Runner\" which runs all processing in a single Python process.\n", - "\n", - "To activate the RayRunner, you can either:\n", - "\n", - "1. Use the `DAFT_RUNNER=ray` and optionally the `RAY_ADDRESS` environment variables\n", - "2. Call `daft.context.set_runner_ray(...)` at the start of your program.\n", - "\n", - "We'll demonstrate option 2 here!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d73050cd", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import daft\n", - "\n", - "if USE_RAY:\n", - " RAY_ADDRESS = None\n", - " daft.context.set_runner_ray(\n", - " # You may provide Daft with the address to an existing Ray cluster if you have one!\n", - " # If this is not provided, Daft will default to spinning up a single-node Ray cluster consisting of just your current local machine\n", - " address=RAY_ADDRESS,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "5bb090a9", - "metadata": {}, - "source": [ - "Let's try to download the images from our previous [Text-to-Image Generatation tutorial](https://colab.research.google.com/github/Eventual-Inc/Daft/blob/main/tutorials/text_to_image/text_to_image_generation.ipynb) with the RayRunner instead." - ] - }, - { - "cell_type": "markdown", - "id": "448057cd-52f7-449d-9c1c-6d6368de9cb2", - "metadata": {}, - "source": [ - "We limit the dataset to 160 rows and repartition it into 8 partitions for demonstration purposes. This just means that our data will be divided into 8 approximately equal-sized \"chunks\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2639220-6c8f-48e2-9d8b-8301de21b8f2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from daft import col, udf\n", - "\n", - "parquet_df = daft.read_parquet(PARQUET_URL, io_config=IO_CONFIG).limit(NUM_ROWS_LIMIT).repartition(8)\n", - "parquet_df.collect()" - ] - }, - { - "cell_type": "markdown", - "id": "a601555b-ab09-4c6a-87dd-77a761b351b3", - "metadata": {}, - "source": [ - "## Download data from URLs\n", - "\n", - "Now, let's try downloading the data from the URLs with `.url.download()`!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01bf3fde-5a64-49e3-8f32-cbf180905efe", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "images_df = parquet_df.with_column(\"images\", col(\"URL\").url.download(on_error=\"null\"))\n", - "images_df.collect()" - ] - }, - { - "cell_type": "markdown", - "id": "d1ece805-b991-4b75-a3c0-ca00354365f2", - "metadata": {}, - "source": [ - "On Google Colab, it should take approximately 10 seconds, vs about 20 seconds with the Py Runner!" - ] - }, - { - "cell_type": "markdown", - "id": "c2278cc6-f3c6-44da-9f38-684e220a43e1", - "metadata": {}, - "source": [ - "With exactly the same code, we were able to achieve a 2x speedup in execution - what happened here?\n", - "\n", - "It turns out that our workload is [IO Bound](https://en.wikipedia.org/wiki/I/O_bound) because most of the time is spent waiting for data to be downloaded from the URL.\n", - "\n", - "By default, the `.url.download()` UDF requests `num_cpus=1`. Since our Google Colab machine has 2 CPUs, the RayRunner is able to run two of these UDFs in parallel, hence achieving a 2x increase in throughput!" - ] - }, - { - "cell_type": "markdown", - "id": "8430f756-f641-4bfa-9425-8d72e200d726", - "metadata": {}, - "source": [ - "## Remote Ray Clusters\n", - "\n", - "We have seen that using the RayRunner even locally provides us with some speedup already. 
However, the real power of distributed computing is in allowing us to access thousands of CPUs and GPUs in the cloud, on a remote Ray cluster.\n",
-    "\n",
-    "For example, UDFs that request for a single GPU with can run in parallel across hundreds of GPUs on a remote Ray cluster, effortlessly scaling your workloads up to take full advantage of the available hardware.\n",
-    "\n",
-    "To run Daft on large clusters, check out [Eventual](https://www.eventualcomputing.com) where you have access to a fully managed platform for running Daft at scale."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6f159a45-dbf5-4e19-b2c6-0d15474b270c",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.18"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "e5d77f7bd5a748e4f6412a25f9708ab7af36936de941fc795d1a6b75eb2da082"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1b8a5f16-3d51-4690-80b3-95c7899c5474",
+   "metadata": {},
+   "source": [
+    "# Using Ray for Scaling Up\n",
+    "\n",
+    "Daft's default PyRunner is great for experimentation on your laptop, but when it comes time to run much more computationally expensive jobs that need to take advantage of large-scale parallelism, you can run Daft on a [Ray](https://www.ray.io/) cluster instead.\n",
+    "\n",
+    "## What is a Ray Cluster, and why do I need it?\n",
+    "\n",
+    "Ray is a framework that exposes a Python interface for running distributed computation over a cluster of machines. 
Daft is built to use Ray as a backend for running dataframe operations, allowing it to scale to huge amounts of data and computation.\n",
+    "\n",
+    "However, even if you do not have a big cluster to run Ray on, you can run Ray locally on your laptop (in which case it would spin up a Ray cluster of just a single machine: your laptop), and using Daft's Ray backend would allow Daft to fully utilize your machine's cores.\n",
+    "\n",
+    "## Let's get started!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bbe8f7c1-ae08-49b9-96c7-4b16cf48f479",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install getdaft[ray]\n",
+    "!pip install Pillow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28d1d421",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "CI = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb73a22b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import daft\n",
+    "\n",
+    "USE_RAY = False if CI else True\n",
+    "NUM_ROWS_LIMIT = 16 if CI else 160\n",
+    "IO_CONFIG = daft.io.IOConfig(\n",
+    "    s3=daft.io.S3Config(anonymous=True, region_name=\"us-west-2\")\n",
+    ")  # Use anonymous mode for accessing AWS S3\n",
+    "PARQUET_URL = \"s3://daft-public-data/tutorials/laion-parquet/train-00000-of-00001-6f24a7497df494ae.parquet\"\n",
+    "\n",
+    "daft.set_planning_config(default_io_config=IO_CONFIG)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8a3ef6f2-550b-4668-9d41-0417e98e22f7",
+   "metadata": {},
+   "source": [
+    "By default, Daft uses the \"Python Runner\" which runs all processing in a single Python process.\n",
+    "\n",
+    "To activate the RayRunner, you can either:\n",
+    "\n",
+    "1. Use the `DAFT_RUNNER=ray` and optionally the `RAY_ADDRESS` environment variables\n",
+    "2. Call `daft.context.set_runner_ray(...)` at the start of your program.\n",
+    "\n",
+    "We'll demonstrate option 2 here!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d73050cd",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import daft\n",
+    "\n",
+    "if USE_RAY:\n",
+    "    RAY_ADDRESS = None\n",
+    "    daft.context.set_runner_ray(\n",
+    "        # You may provide Daft with the address to an existing Ray cluster if you have one!\n",
+    "        # If this is not provided, Daft will default to spinning up a single-node Ray cluster consisting of just your current local machine\n",
+    "        address=RAY_ADDRESS,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5bb090a9",
+   "metadata": {},
+   "source": [
+    "Let's try to download the images from our previous [Text-to-Image Generation tutorial](https://colab.research.google.com/github/Eventual-Inc/Daft/blob/main/tutorials/text_to_image/text_to_image_generation.ipynb) with the RayRunner instead."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "448057cd-52f7-449d-9c1c-6d6368de9cb2",
+   "metadata": {},
+   "source": [
+    "We limit the dataset to 160 rows and repartition it into 8 partitions for demonstration purposes. This just means that our data will be divided into 8 approximately equal-sized \"chunks\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f2639220-6c8f-48e2-9d8b-8301de21b8f2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from daft import col\n",
+    "\n",
+    "parquet_df = daft.read_parquet(PARQUET_URL, io_config=IO_CONFIG).limit(NUM_ROWS_LIMIT).repartition(8)\n",
+    "parquet_df.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a601555b-ab09-4c6a-87dd-77a761b351b3",
+   "metadata": {},
+   "source": [
+    "## Download data from URLs\n",
+    "\n",
+    "Now, let's try downloading the data from the URLs with `.url.download()`!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01bf3fde-5a64-49e3-8f32-cbf180905efe",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "images_df = parquet_df.with_column(\"images\", col(\"URL\").url.download(on_error=\"null\"))\n",
+    "images_df.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d1ece805-b991-4b75-a3c0-ca00354365f2",
+   "metadata": {},
+   "source": [
+    "On Google Colab, it should take approximately 10 seconds, vs about 20 seconds with the PyRunner!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c2278cc6-f3c6-44da-9f38-684e220a43e1",
+   "metadata": {},
+   "source": [
+    "With exactly the same code, we were able to achieve a 2x speedup in execution - what happened here?\n",
+    "\n",
+    "It turns out that our workload is [IO Bound](https://en.wikipedia.org/wiki/I/O_bound) because most of the time is spent waiting for data to be downloaded from the URL.\n",
+    "\n",
+    "By default, the `.url.download()` UDF requests `num_cpus=1`. Since our Google Colab machine has 2 CPUs, the RayRunner is able to run two of these UDFs in parallel, hence achieving a 2x increase in throughput!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8430f756-f641-4bfa-9425-8d72e200d726",
+   "metadata": {},
+   "source": [
+    "## Remote Ray Clusters\n",
+    "\n",
+    "We have seen that using the RayRunner even locally provides us with some speedup already. However, the real power of distributed computing is in allowing us to access thousands of CPUs and GPUs in the cloud, on a remote Ray cluster.\n",
+    "\n",
+    "For example, UDFs that request a single GPU can run in parallel across hundreds of GPUs on a remote Ray cluster, effortlessly scaling your workloads up to take full advantage of the available hardware.\n",
+    "\n",
+    "To run Daft on large clusters, check out [Eventual](https://www.eventualcomputing.com), where you have access to a fully managed platform for running Daft at scale."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f159a45-dbf5-4e19-b2c6-0d15474b270c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "e5d77f7bd5a748e4f6412a25f9708ab7af36936de941fc795d1a6b75eb2da082"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
 }