Skip to content

Commit

Permalink
Provide an size bytes estimate for mongodb block (#31930)
Browse files Browse the repository at this point in the history
Fix the CI failure (#31929) caused by enabling new execution backend.
  • Loading branch information
jianoaix authored Jan 26, 2023
1 parent fe65c3e commit f8e26de
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .buildkite/pipeline.ml.yml
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-client python/ray/util/dask/...

- label: ":potable_water: Dataset datasource integration tests (Python 3.7)"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED"]
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DATA_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- DATA_PROCESSING_TESTING=1 ARROW_VERSION=9.* ARROW_MONGO_VERSION=0.5.* ./ci/env/install-dependencies.sh
Expand Down
6 changes: 5 additions & 1 deletion python/ray/data/datasource/mongo_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ def __init__(
self._client = pymongo.MongoClient(uri)
_validate_database_collection_exist(self._client, database, collection)

self._avg_obj_size = self._client[database].command("collstats", collection)[
"avgObjSize"
]

def estimate_inmemory_data_size(self) -> Optional[int]:
# TODO(jian): Add memory size estimation to improve auto-tune of parallelism.
return None
Expand Down Expand Up @@ -154,7 +158,7 @@ def make_block(
for i, partition in enumerate(partitions_ids):
metadata = BlockMetadata(
num_rows=partition["count"],
size_bytes=None,
size_bytes=partition["count"] * self._avg_obj_size,
schema=None,
input_files=None,
exec_stats=None,
Expand Down

0 comments on commit f8e26de

Please sign in to comment.