Update PreCommit Hooks (#2715)
samster25 authored Aug 23, 2024
1 parent ab6d1a5 commit a18b30a
Showing 25 changed files with 2,669 additions and 1,653 deletions.
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -2,15 +2,15 @@
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.991
rev: v1.11.1
hooks:
- id: mypy
additional_dependencies: [types-requests, types-PyYAML, types-tabulate]
files: daft
exclude: daft/pickle/.*\.py

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.6.0
hooks:
- id: detect-private-key
- id: trailing-whitespace
@@ -40,7 +40,7 @@ repos:
- id: check-toml

- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks
rev: v2.6.0
rev: v2.14.0
hooks:
- id: pretty-format-toml
args: [--autofix]
@@ -49,15 +49,15 @@ repos:
args: [--autofix]

- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
rev: v2.3.0
hooks:
- id: codespell
additional_dependencies:
- tomli

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.3.7
rev: v0.6.2
hooks:
# Run the linter.
- id: ruff
@@ -111,6 +111,6 @@ repos:
pass_filenames: false

- repo: https://github.com/abravalheri/validate-pyproject
rev: v0.10.1
rev: v0.19
hooks:
- id: validate-pyproject
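
These revision bumps are what a routine hook refresh produces. As a hedged sketch, such an update is typically generated and then re-applied across the repository with the two standard pre-commit commands (assuming pre-commit is already installed locally):

pre-commit autoupdate        # bump each hook's rev to its latest tagged release
pre-commit run --all-files   # re-run every hook so the resulting reformatting lands in the same commit
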
2 changes: 1 addition & 1 deletion benchmarking/tpch/data_generation.py
@@ -253,7 +253,7 @@ def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str:
Returns:
str: path to folder with generated CSV files
"""
cachedir = os.path.join(basedir, ("%.1f" % scale_factor).replace(".", "_"), str(num_parts))
cachedir = os.path.join(basedir, (f"{scale_factor:.1f}").replace(".", "_"), str(num_parts))
if not os.path.exists(cachedir):
# If running in CI, use a scale factor of 0.2
# Otherwise, check for SCALE_FACTOR env variable or default to 1
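
The only change here swaps printf-style "%" formatting for an f-string with the same .1f format spec, so the generated cache-directory name is unchanged (the same substitution appears in pipelined_data_generation.py below). A minimal check of the equivalence, using 0.1 as an illustrative scale factor:

scale_factor = 0.1
old_name = ("%.1f" % scale_factor).replace(".", "_")
new_name = f"{scale_factor:.1f}".replace(".", "_")
assert old_name == new_name == "0_1"  # both render the scale factor as "0_1"
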
2 changes: 1 addition & 1 deletion benchmarking/tpch/pipelined_data_generation.py
@@ -48,7 +48,7 @@ def pipelined_data_generation(
):
assert num_parts > 1, "script should only be used if num_parts > 1"

cachedir = pathlib.Path(scratch_dir) / ("%.1f" % scale_factor).replace(".", "_") / str(num_parts)
cachedir = pathlib.Path(scratch_dir) / (f"{scale_factor:.1f}").replace(".", "_") / str(num_parts)

if not cachedir.exists():
logger.info("Cloning tpch dbgen repo")
2 changes: 1 addition & 1 deletion daft/context.py
@@ -17,7 +17,7 @@


class _RunnerConfig:
name = ClassVar[str]
name: ClassVar[str]


@dataclasses.dataclass(frozen=True)
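
This one-character change matters: the original line assigned the typing construct ClassVar[str] to name as a value, while the new line annotates name as a class-level string attribute, which is the kind of mistake the bumped mypy hook surfaces. A standalone sketch of the difference (illustrative classes, not the actual Daft ones):

from typing import ClassVar

class BadRunnerConfig:
    name = ClassVar[str]  # assigns the typing object itself; a type checker flags this

class GoodRunnerConfig:
    name: ClassVar[str]  # declares a class-level str attribute without binding a value

class PyRunnerConfig(GoodRunnerConfig):
    name = "py"  # concrete subclasses supply the value
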
2 changes: 1 addition & 1 deletion daft/dataframe/dataframe.py
@@ -1984,7 +1984,7 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any)
"""
result = func(self, *args, **kwargs)
assert isinstance(result, DataFrame), (
"Func returned an instance of type [%s], " "should have been DataFrame." % type(result)
f"Func returned an instance of type [{type(result)}], " "should have been DataFrame."
)
return result

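
The rewritten assertion message places an f-string next to a plain string literal; Python joins adjacent literals at compile time, so the message still reads as one sentence. A small illustration (the int value stands in for a bad return type):

result = 42  # pretend the user function returned an int instead of a DataFrame
message = f"Func returned an instance of type [{type(result)}], " "should have been DataFrame."
print(message)  # Func returned an instance of type [<class 'int'>], should have been DataFrame.
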
1 change: 1 addition & 0 deletions daft/expressions/expressions.py
@@ -116,6 +116,7 @@ def lit(value: object) -> Expression:
lit_value = _time_lit(i64_value, time_unit)
elif isinstance(value, Decimal):
sign, digits, exponent = value.as_tuple()
assert isinstance(exponent, int)
lit_value = _decimal_lit(sign == 1, digits, exponent)
elif isinstance(value, Series):
lit_value = _series_lit(value._series)
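
The added assert narrows the exponent type: Decimal.as_tuple() returns an int exponent for finite values but a string code for NaN and infinity, and the updated mypy will not pass that union along, presumably because _decimal_lit expects an integer exponent. A quick illustration of both cases:

from decimal import Decimal

sign, digits, exponent = Decimal("-1.25").as_tuple()
print(sign, digits, exponent)  # 1 (1, 2, 5) -2 -> exponent is an int for finite values

print(Decimal("NaN").as_tuple().exponent)  # 'n' -> special values carry a string code, hence the isinstance check
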
76 changes: 38 additions & 38 deletions docs/source/10-min.ipynb
@@ -75,7 +75,7 @@
"outputs": [],
"source": [
"import daft\n",
"from daft import DataType, col, udf"
"from daft import DataType, udf"
]
},
{
@@ -136,16 +136,23 @@
"source": [
"import datetime\n",
"\n",
"df = daft.from_pydict({\n",
" \"integers\": [1, 2, 3, 4],\n",
" \"floats\": [1.5, 2.5, 3.5, 4.5],\n",
" \"bools\": [True, True, False, False],\n",
" \"strings\": [\"a\", \"b\", \"c\", \"d\"],\n",
" \"bytes\": [b\"a\", b\"b\", b\"c\", b\"d\"],\n",
" \"dates\": [datetime.date(1994, 1, 1), datetime.date(1994, 1, 2), datetime.date(1994, 1, 3), datetime.date(1994, 1, 4)],\n",
" \"lists\": [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],\n",
" \"nulls\": [None, None, None, None],\n",
"})\n",
"df = daft.from_pydict(\n",
" {\n",
" \"integers\": [1, 2, 3, 4],\n",
" \"floats\": [1.5, 2.5, 3.5, 4.5],\n",
" \"bools\": [True, True, False, False],\n",
" \"strings\": [\"a\", \"b\", \"c\", \"d\"],\n",
" \"bytes\": [b\"a\", b\"b\", b\"c\", b\"d\"],\n",
" \"dates\": [\n",
" datetime.date(1994, 1, 1),\n",
" datetime.date(1994, 1, 2),\n",
" datetime.date(1994, 1, 3),\n",
" datetime.date(1994, 1, 4),\n",
" ],\n",
" \"lists\": [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],\n",
" \"nulls\": [None, None, None, None],\n",
" }\n",
")\n",
"\n",
"df"
]
@@ -236,9 +243,7 @@
"# Set IO Configurations to use anonymous data access mode\n",
"daft.set_planning_config(default_io_config=daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True)))\n",
"\n",
"df = daft.read_parquet(\n",
" \"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\"\n",
")\n",
"df = daft.read_parquet(\"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\")\n",
"df"
]
},
@@ -620,7 +625,7 @@
}
],
"source": [
"df = df.with_column(\"full_name\", daft.col('first_name') + ' ' + daft.col('last_name'))\n",
"df = df.with_column(\"full_name\", daft.col(\"first_name\") + \" \" + daft.col(\"last_name\"))\n",
"df.select(\"full_name\", \"age\", \"country\", \"has_dog\").show()"
]
},
@@ -868,7 +873,7 @@
}
],
"source": [
"#select only columns for grouping\n",
"# select only columns for grouping\n",
"grouping_df = df.select(df[\"country\"], df[\"first_name\"].alias(\"counts\"))\n",
"\n",
"# groupby country column and count the number of countries\n",
@@ -932,12 +937,14 @@
}
],
"source": [
"missing_data_df = daft.from_pydict({\n",
" \"floats\": [1.5, None, float(\"nan\")],\n",
"})\n",
"missing_data_df = missing_data_df \\\n",
" .with_column(\"floats_is_null\", missing_data_df[\"floats\"].is_null()) \\\n",
" .with_column(\"floats_is_nan\", missing_data_df[\"floats\"].float.is_nan())\n",
"missing_data_df = daft.from_pydict(\n",
" {\n",
" \"floats\": [1.5, None, float(\"nan\")],\n",
" }\n",
")\n",
"missing_data_df = missing_data_df.with_column(\"floats_is_null\", missing_data_df[\"floats\"].is_null()).with_column(\n",
" \"floats_is_nan\", missing_data_df[\"floats\"].float.is_nan()\n",
")\n",
"\n",
"missing_data_df.show()"
]
@@ -1184,9 +1191,7 @@
}
],
"source": [
"df2 = daft.read_parquet(\n",
" \"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\"\n",
")\n",
"df2 = daft.read_parquet(\"s3://daft-public-data/tutorials/10-min/sample-data-dog-owners-partitioned.pq/**\")\n",
"df2.where(df[\"country\"] == \"Canada\").explain(show_all=True)"
]
},
@@ -1561,8 +1566,8 @@
"outputs": [],
"source": [
"# import additional libraries, these are necessary for PyTorch\n",
"import torch\n",
"import warnings"
"\n",
"import torch"
]
},
{
@@ -1580,26 +1585,21 @@
"metadata": {},
"outputs": [],
"source": [
"@udf(return_dtype=DataType.fixed_size_list(dtype=DataType.string() , size=2))\n",
"@udf(return_dtype=DataType.fixed_size_list(dtype=DataType.string(), size=2))\n",
"class ClassifyImages:\n",
" \n",
" def __init__(self):\n",
" # Perform expensive initializations - create and load the pre-trained model \n",
" self.model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)\n",
" self.utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')\n",
" # Perform expensive initializations - create and load the pre-trained model\n",
" self.model = torch.hub.load(\"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_resnet50\", pretrained=True)\n",
" self.utils = torch.hub.load(\"NVIDIA/DeepLearningExamples:torchhub\", \"nvidia_convnets_processing_utils\")\n",
" self.model.eval().to(torch.device(\"cpu\"))\n",
"\n",
" def __call__(self, images_urls):\n",
" uris = images_urls.to_pylist()\n",
" batch = torch.cat(\n",
" [self.utils.prepare_input_from_uri(uri) for uri in uris]\n",
" ).to(\n",
" torch.device(\"cpu\")\n",
" )\n",
" batch = torch.cat([self.utils.prepare_input_from_uri(uri) for uri in uris]).to(torch.device(\"cpu\"))\n",
"\n",
" with torch.no_grad():\n",
" output = torch.nn.functional.softmax(self.model(batch), dim=1)\n",
" \n",
"\n",
" results = self.utils.pick_n_best(predictions=output, n=1)\n",
" return [result[0] for result in results]"
]
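
The churn in this notebook is consistent with the bumped ruff formatter being applied to code cells (Ruff 0.6 formats Jupyter notebooks by default). For context on the reformatted UDF itself: a @udf-decorated class is applied by calling it on a column expression. A hedged sketch of such a call — the urls column and the one-row DataFrame are assumptions, not taken from the notebook:

import daft

# Hypothetical input; the real notebook builds its own DataFrame of image URLs.
df = daft.from_pydict({"urls": ["https://example.com/dog.jpg"]})

# ClassifyImages is the UDF class defined in the cell above; calling it on an
# expression returns a new expression that Daft evaluates in batches.
df = df.with_column("classification", ClassifyImages(df["urls"]))
df.show()
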
44 changes: 15 additions & 29 deletions docs/source/user_guide/fotw/fotw-000-data-access.ipynb
@@ -34,6 +34,7 @@
"# Skip this notebook execution in CI because it hits data in a relative path\n",
"if CI:\n",
" import sys\n",
"\n",
" sys.exit()"
]
},
@@ -529,10 +530,7 @@
"MY_ANONYMOUS_IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True))\n",
"\n",
"# Read this file using `MY_ANONYMOUS_IO_CONFIG`\n",
"df = daft.read_csv(\n",
" \"s3://daft-public-data/melbourne-airbnb/melbourne_airbnb.csv\", \n",
" io_config=MY_ANONYMOUS_IO_CONFIG\n",
")"
"df = daft.read_csv(\"s3://daft-public-data/melbourne-airbnb/melbourne_airbnb.csv\", io_config=MY_ANONYMOUS_IO_CONFIG)"
]
},
{
@@ -657,10 +655,7 @@
"metadata": {},
"outputs": [],
"source": [
"df = daft.read_parquet(\n",
" bucket,\n",
" io_config=io_config\n",
")"
"df = daft.read_parquet(bucket, io_config=io_config)"
]
},
{
@@ -805,9 +800,7 @@
"import sqlite3\n",
"\n",
"connection = sqlite3.connect(\"example.db\")\n",
"connection.execute(\n",
" \"CREATE TABLE IF NOT EXISTS books (title TEXT, author TEXT, year INTEGER)\"\n",
")\n",
"connection.execute(\"CREATE TABLE IF NOT EXISTS books (title TEXT, author TEXT, year INTEGER)\")\n",
"connection.execute(\n",
" \"\"\"\n",
"INSERT INTO books (title, author, year)\n",
@@ -894,6 +887,7 @@
"outputs": [],
"source": [
"from sqlalchemy import create_engine\n",
"\n",
"# substitue the uri below with the engine path on your local machine\n",
"engine_uri = \"sqlite:////Users/rpelgrim/daft_sql\"\n",
"engine = create_engine(engine_uri, echo=True)"
@@ -908,17 +902,12 @@
},
"outputs": [],
"source": [
"import pandas as pd \n",
"import pandas as pd\n",
"\n",
"csv_file_path = \"data/census-01.csv\"\n",
"df = pd.read_csv(csv_file_path)\n",
"\n",
"sql_df = df.to_sql(\n",
" name=\"censustable\",\n",
" con=engine,\n",
" index=False,\n",
" index_label=\"id\",\n",
" if_exists=\"replace\"\n",
")"
"sql_df = df.to_sql(name=\"censustable\", con=engine, index=False, index_label=\"id\", if_exists=\"replace\")"
]
},
{
@@ -947,7 +936,7 @@
"outputs": [],
"source": [
"# Read from local SQLite database\n",
"uri = \"sqlite:////Users/rpelgrim/daft_sql\" #replace with your local uri\n",
"uri = \"sqlite:////Users/rpelgrim/daft_sql\" # replace with your local uri\n",
"\n",
"df = daft.read_sql(\"SELECT * FROM censustable\", uri)"
]
@@ -1079,10 +1068,10 @@
],
"source": [
"df = daft.read_sql(\n",
" \"SELECT * FROM censustable\", \n",
" \"SELECT * FROM censustable\",\n",
" uri,\n",
" partition_col=\"education\",\n",
"# num_partitions=12\n",
" # num_partitions=12\n",
")\n",
"\n",
"df.show()"
@@ -1234,6 +1223,7 @@
"outputs": [],
"source": [
"import boto3\n",
"\n",
"session = boto3.session.Session()\n",
"creds = session.get_credentials()\n",
"\n",
@@ -1247,14 +1237,9 @@
")\n",
"\n",
"# Read Delta Lake table in S3 into a Daft DataFrame.\n",
"table_uri = (\n",
" \"s3://avriiil/delta-test-daft/\"\n",
")\n",
"table_uri = \"s3://avriiil/delta-test-daft/\"\n",
"\n",
"df = daft.read_deltalake(\n",
" table_uri,\n",
" io_config=io_config\n",
")"
"df = daft.read_deltalake(table_uri, io_config=io_config)"
]
},
{
@@ -1572,6 +1557,7 @@
"source": [
"# Use the boto3 library to generate temporary credentials which can be used for S3 access\n",
"import boto3\n",
"\n",
"session = boto3.session.Session()\n",
"creds = session.get_credentials()\n",
"\n",
Diffs for the remaining 17 changed files are not shown here.