Update PreCommit Hooks (#2715)
samster25 authored Aug 23, 2024
1 parent ab6d1a5 commit a18b30a
Showing 25 changed files with 2,669 additions and 1,653 deletions.
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -2,15 +2,15 @@
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.991
rev: v1.11.1
hooks:
- id: mypy
additional_dependencies: [types-requests, types-PyYAML, types-tabulate]
files: daft
exclude: daft/pickle/.*\.py

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.6.0
hooks:
- id: detect-private-key
- id: trailing-whitespace
@@ -40,7 +40,7 @@ repos:
- id: check-toml

- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
rev: v2.6.0
rev: v2.14.0
hooks:
- id: pretty-format-toml
args: [--autofix]
@@ -49,15 +49,15 @@ repos:
args: [--autofix]

- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
rev: v2.3.0
hooks:
- id: codespell
additional_dependencies:
- tomli

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.3.7
rev: v0.6.2
hooks:
# Run the linter.
- id: ruff
@@ -111,6 +111,6 @@ repos:
pass_filenames: false

- repo: https://github.com/abravalheri/validate-pyproject
rev: v0.10.1
rev: v0.19
hooks:
- id: validate-pyproject
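
These revision bumps are what a routine hook refresh produces. As a hedged sketch, such an update is typically generated and then re-applied across the repository with the two standard pre-commit commands (assuming pre-commit is already installed locally):

pre-commit autoupdate        # bump each hook's rev to its latest tagged release
pre-commit run --all-files   # re-run every hook so the resulting reformatting lands in the same commit
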
2 changes: 1 addition & 1 deletion benchmarking/tpch/data_generation.py
@@ -253,7 +253,7 @@ def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str:
Returns:
str: path to folder with generated CSV files
"""
cachedir = os.path.join(basedir, ("%.1f" % scale_factor).replace(".", "_"), str(num_parts))
cachedir = os.path.join(basedir, (f"{scale_factor:.1f}").replace(".", "_"), str(num_parts))
if not os.path.exists(cachedir):
# If running in CI, use a scale factor of 0.2
# Otherwise, check for SCALE_FACTOR env variable or default to 1
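
The only change here swaps printf-style "%" formatting for an f-string with the same .1f format spec, so the generated cache-directory name is unchanged (the same substitution appears in pipelined_data_generation.py below). A minimal check of the equivalence, using 0.1 as an illustrative scale factor:

scale_factor = 0.1
old_name = ("%.1f" % scale_factor).replace(".", "_")
new_name = f"{scale_factor:.1f}".replace(".", "_")
assert old_name == new_name == "0_1"  # both render the scale factor as "0_1"
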
2 changes: 1 addition & 1 deletion benchmarking/tpch/pipelined_data_generation.py
@@ -48,7 +48,7 @@ def pipelined_data_generation(
):
assert num_parts > 1, "script should only be used if num_parts > 1"

cachedir = pathlib.Path(scratch_dir) / ("%.1f" % scale_factor).replace(".", "_") / str(num_parts)
cachedir = pathlib.Path(scratch_dir) / (f"{scale_factor:.1f}").replace(".", "_") / str(num_parts)

if not cachedir.exists():
logger.info("Cloning tpch dbgen repo")
2 changes: 1 addition & 1 deletion daft/context.py
@@ -17,7 +17,7 @@


class _RunnerConfig:
name = ClassVar[str]
name: ClassVar[str]


@dataclasses.dataclass(frozen=True)
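
This one-character change matters: the original line assigned the typing construct ClassVar[str] to name as a value, while the new line annotates name as a class-level string attribute, which is the kind of mistake the bumped mypy hook surfaces. A standalone sketch of the difference (illustrative classes, not the actual Daft ones):

from typing import ClassVar

class BadRunnerConfig:
    name = ClassVar[str]  # assigns the typing object itself; a type checker flags this

class GoodRunnerConfig:
    name: ClassVar[str]  # declares a class-level str attribute without binding a value

class PyRunnerConfig(GoodRunnerConfig):
    name = "py"  # concrete subclasses supply the value
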
2 changes: 1 addition & 1 deletion daft/dataframe/dataframe.py
@@ -1984,7 +1984,7 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any)
"""
result = func(self, *args, **kwargs)
assert isinstance(result, DataFrame), (
"Func returned an instance of type [%s], " "should have been DataFrame." % type(result)
f"Func returned an instance of type [{type(result)}], " "should have been DataFrame."
)
return result

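
The rewritten assertion message places an f-string next to a plain string literal; Python joins adjacent literals at compile time, so the message still reads as one sentence. A small illustration (the int value stands in for a bad return type):

result = 42  # pretend the user function returned an int instead of a DataFrame
message = f"Func returned an instance of type [{type(result)}], " "should have been DataFrame."
print(message)  # Func returned an instance of type [<class 'int'>], should have been DataFrame.
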
1 change: 1 addition & 0 deletions daft/expressions/expressions.py
@@ -116,6 +116,7 @@ def lit(value: object) -> Expression:
lit_value = _time_lit(i64_value, time_unit)
elif isinstance(value, Decimal):
sign, digits, exponent = value.as_tuple()
assert isinstance(exponent, int)
lit_value = _decimal_lit(sign == 1, digits, exponent)
elif isinstance(value, Series):
lit_value = _series_lit(value._series)
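
The added assert narrows the exponent type: Decimal.as_tuple() returns an int exponent for finite values but a string code for NaN and infinity, and the updated mypy will not pass that union along, presumably because _decimal_lit expects an integer exponent. A quick illustration of both cases:

from decimal import Decimal

sign, digits, exponent = Decimal("-1.25").as_tuple()
print(sign, digits, exponent)  # 1 (1, 2, 5) -2 -> exponent is an int for finite values

print(Decimal("NaN").as_tuple().exponent)  # 'n' -> special values carry a string code, hence the isinstance check
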
76 changes: 38 additions & 38 deletions docs/source/10-min.ipynb
@@ -75,7 +75,7 @@
"outputs": [],
"source": [
"import daft\n",
"from daft import DataType, col, udf"
"from daft import DataType, udf"
]
},
{
@@ -136,16 +136,23 @@
"source": [
"import datetime\n",
"\n",
"df = daft.from_pydict({\n",
" \"integers\": [1, 2, 3, 4],\n",
" \"floats\": [1.5, 2.5, 3.5, 4.5],\n",
" \"bools\": [True, True, False, False],\n",
" \"strings\": [\"a\", \"b\", \"c\", \"d\"],\n",
" \"bytes\": [b\"a\", b\"b\", b\"c\", b\"d\"],\n",
" \"dates\": [datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3), datetime.date(1994, 1, 4)],\n",
" \"lists\": [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],\n",
" \"nulls\": [None, None, None, None],\n",
"})\n",
"df = daft.from_pydict(\n",
" {\n",
" \"integers\": [1, 2, 3, 4],\n",
" \"floats\": [1.5, 2.5, 3.5, 4.5],\n",
" \"bools\": [True, True, False, False],\n",
" \"strings\": [\"a\", \"b\", \"c\", \"d\"],\n",
" \"bytes\": [b\"a\", b\"b\", b\"c\", b\"d\"],\n",
" \"dates\": [\n",
" datetime.date(1994, 1, 1),\n",
" datetime.date(1994, 1, 2),\n",
" datetime.date(1994, 1, 3),\n",
" datetime.date(1994, 1, 4),\n",
" ],\n",
" \"lists\": [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],\n",
" \"nulls\": [None, None, None, None],\n",
" }\n",
")\n",
"\n",
"df"
]
@@ -236,9 +243,7 @@
"# Set IO Configurations to use anonymous data access mode\n",
"daft.set_planning_config(default_io_config=daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True)))\n",
"\n",
"df = daft.read_parquet(\n",
" \"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\"\n",
")\n",
"df = daft.read_parquet(\"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\")\n",
"df"
]
},
@@ -620,7 +625,7 @@
}
],
"source": [
"df = df.with_column(\"full_name\", daft.col('first_name') + ' ' + daft.col('last_name'))\n",
"df = df.with_column(\"full_name\", daft.col(\"first_name\") + \" \" + daft.col(\"last_name\"))\n",
"df.select(\"full_name\", \"age\", \"country\", \"has_dog\").show()"
]
},
@@ -868,7 +873,7 @@
}
],
"source": [
"#select only columns for grouping\n",
"# select only columns for grouping\n",
"grouping_df = df.select(df[\"country\"], df[\"first_name\"].alias(\"counts\"))\n",
"\n",
"# groupby country column and count the number of countries\n",
@@ -932,12 +937,14 @@
}
],
"source": [
"missing_data_df = daft.from_pydict({\n",
" \"floats\": [1.5, None, float(\"nan\")],\n",
"})\n",
"missing_data_df = missing_data_df \\\n",
" .with_column(\"floats_is_null\", missing_data_df[\"floats\"].is_null()) \\\n",
" .with_column(\"floats_is_nan\", missing_data_df[\"floats\"].float.is_nan())\n",
"missing_data_df = daft.from_pydict(\n",
" {\n",
" \"floats\": [1.5, None, float(\"nan\")],\n",
" }\n",
")\n",
"missing_data_df = missing_data_df.with_column(\"floats_is_null\", missing_data_df[\"floats\"].is_null()).with_column(\n",
" \"floats_is_nan\", missing_data_df[\"floats\"].float.is_nan()\n",
")\n",
"\n",
"missing_data_df.show()"
]
@@ -1184,9 +1191,7 @@
}
],
"source": [
"df2 = daft.read_parquet(\n",
" \"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\"\n",
")\n",
"df2 = daft.read_parquet(\"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\")\n",
"df2.where(df[\"country\"] == \"Canada\").explain(show_all=True)"
]
},
@@ -1561,8 +1566,8 @@
"outputs": [],
"source": [
"# import additional libraries, these are necessary for PyTorch\n",
"import torch\n",
"import warnings"
"\n",
"import torch"
]
},
{
@@ -1580,26 +1585,21 @@
"metadata": {},
"outputs": [],
"source": [
"@udf(return_dtype=DataType.fixed_size_list(dtype=DataType.string() , size=2))\n",
"@udf(return_dtype=DataType.fixed_size_list(dtype=DataType.string(), size=2))\n",
"class ClassifyImages:\n",
" \n",
" def __init__(self):\n",
" # Perform expensive initializations - create and load the pre-trained model \n",
" self.model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)\n",
" self.utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n",
" # Perform expensive initializations - create and load the pre-trained model\n",
" self.model = torch.hub.load(\"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_resnet50\", pretrained=True)\n",
" self.utils = torch.hub.load(\"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_convnets_processing_utils\")\n",
" self.model.eval().to(torch.device(\"cpu\"))\n",
"\n",
" def __call__(self, images_urls):\n",
" uris = images_urls.to_pylist()\n",
" batch = torch.cat(\n",
" [self.utils.prepare_input_from_uri(uri) for uri in uris]\n",
" ).to(\n",
" torch.device(\"cpu\")\n",
" )\n",
" batch = torch.cat([self.utils.prepare_input_from_uri(uri) for uri in uris]).to(torch.device(\"cpu\"))\n",
"\n",
" with torch.no_grad():\n",
" output = torch.nn.functional.softmax(self.model(batch), dim=1)\n",
" \n",
"\n",
" results = self.utils.pick_n_best(predictions=output, n=1)\n",
" return [result[0] for result in results]"
]
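
The churn in this notebook is consistent with the bumped ruff formatter being applied to code cells (Ruff 0.6 formats Jupyter notebooks by default). For context on the reformatted UDF itself: a @udf-decorated class is applied by calling it on a column expression. A hedged sketch of such a call — the urls column and the one-row DataFrame are assumptions, not taken from the notebook:

import daft

# Hypothetical input; the real notebook builds its own DataFrame of image URLs.
df = daft.from_pydict({"urls": ["https://example.com/dog.jpg"]})

# ClassifyImages is the UDF class defined in the cell above; calling it on an
# expression returns a new expression that Daft evaluates in batches.
df = df.with_column("classification", ClassifyImages(df["urls"]))
df.show()
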
44 changes: 15 additions & 29 deletions docs/source/user_guide/fotw/fotw-000-data-access.ipynb
@@ -34,6 +34,7 @@
"# Skip this notebook execution in CI because it hits data in a relative path\n",
"if CI:\n",
" import sys\n",
"\n",
" sys.exit()"
]
},
@@ -529,10 +530,7 @@
"MY_ANONYMOUS_IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True))\n",
"\n",
"# Read this file using `MY_ANONYMOUS_IO_CONFIG`\n",
"df = daft.read_csv(\n",
" \"s3://daft-public-data/melbourne-airbnb/melbourne_airbnb.csv\", \n",
" io_config=MY_ANONYMOUS_IO_CONFIG\n",
")"
"df = daft.read_csv(\"s3://daft-public-data/melbourne-airbnb/melbourne_airbnb.csv\", io_config=MY_ANONYMOUS_IO_CONFIG)"
]
},
{
@@ -657,10 +655,7 @@
"metadata": {},
"outputs": [],
"source": [
"df = daft.read_parquet(\n",
" bucket,\n",
" io_config=io_config\n",
")"
"df = daft.read_parquet(bucket, io_config=io_config)"
]
},
{
@@ -805,9 +800,7 @@
"import sqlite3\n",
"\n",
"connection = sqlite3.connect(\"example.db\")\n",
"connection.execute(\n",
" \"CREATE TABLE IF NOT EXISTS books (title TEXT, author TEXT, year INTEGER)\"\n",
")\n",
"connection.execute(\"CREATE TABLE IF NOT EXISTS books (title TEXT, author TEXT, year INTEGER)\")\n",
"connection.execute(\n",
" \"\"\"\n",
"INSERT INTO books (title, author, year)\n",
@@ -894,6 +887,7 @@
"outputs": [],
"source": [
"from sqlalchemy import create_engine\n",
"\n",
"# substitue the uri below with the engine path on your local machine\n",
"engine_uri = \"sqlite:////Users/rpelgrim/daft_sql\"\n",
"engine = create_engine(engine_uri, echo=True)"
@@ -908,17 +902,12 @@
},
"outputs": [],
"source": [
"import pandas as pd \n",
"import pandas as pd\n",
"\n",
"csv_file_path = \"data/census-01.csv\"\n",
"df = pd.read_csv(csv_file_path)\n",
"\n",
"sql_df = df.to_sql(\n",
" name=\"censustable\",\n",
" con=engine,\n",
" index=False,\n",
" index_label=\"id\",\n",
" if_exists=\"replace\"\n",
")"
"sql_df = df.to_sql(name=\"censustable\", con=engine, index=False, index_label=\"id\", if_exists=\"replace\")"
]
},
{
@@ -947,7 +936,7 @@
"outputs": [],
"source": [
"# Read from local SQLite database\n",
"uri = \"sqlite:////Users/rpelgrim/daft_sql\" #replace with your local uri\n",
"uri = \"sqlite:////Users/rpelgrim/daft_sql\" # replace with your local uri\n",
"\n",
"df = daft.read_sql(\"SELECT * FROM censustable\", uri)"
]
@@ -1079,10 +1068,10 @@
],
"source": [
"df = daft.read_sql(\n",
" \"SELECT * FROM censustable\", \n",
" \"SELECT * FROM censustable\",\n",
" uri,\n",
" partition_col=\"education\",\n",
"# num_partitions=12\n",
" # num_partitions=12\n",
")\n",
"\n",
"df.show()"
@@ -1234,6 +1223,7 @@
"outputs": [],
"source": [
"import boto3\n",
"\n",
"session = boto3.session.Session()\n",
"creds = session.get_credentials()\n",
"\n",
@@ -1247,14 +1237,9 @@
")\n",
"\n",
"# Read Delta Lake table in S3 into a Daft DataFrame.\n",
"table_uri = (\n",
" \"s3://avriiil/delta-test-daft/\"\n",
")\n",
"table_uri = \"s3://avriiil/delta-test-daft/\"\n",
"\n",
"df = daft.read_deltalake(\n",
" table_uri,\n",
" io_config=io_config\n",
")"
"df = daft.read_deltalake(table_uri, io_config=io_config)"
]
},
{
@@ -1572,6 +1557,7 @@
"source": [
"# Use the boto3 library to generate temporary credentials which can be used for S3 access\n",
"import boto3\n",
"\n",
"session = boto3.session.Session()\n",
"creds = session.get_credentials()\n",
"\n",
Diffs for the remaining 17 changed files are not shown here.