You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
import xarray as xr
import rioxarray
import dask_gateway
from dask.distributed import Lock

# Create Dask distributed client here however you want;
# this example uses Dask Gateway.
# NOTE(review): `cluster_options` is not defined in this snippet —
# it must be created beforehand (e.g. via gateway.cluster_options()).
gateway = dask_gateway.Gateway()
cluster = gateway.new_cluster(cluster_options)
client = cluster.get_client()

# Read file, with chunks so it uses Dask.
img = rioxarray.open_rasterio(
    "https://naipeuwest.blob.core.windows.net/naip/v002/md/2013/md_100cm_2013/39076/m_3907617_ne_18_1_20130924.tif",
    chunks=(500, 500),
)

# Write file, also using Dask. The distributed Lock serializes window
# writes across workers — this only works if all workers share the
# filesystem that "output.tif" lives on.
img.rio.to_raster('output.tif', tiled=True, lock=Lock("rio", client=client))
Problem description
This gives the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_373/522236255.py in <module>
----> 1 img.rio.to_raster("output.tif", tiled=True, lock=Lock("rio", client=client))
/srv/conda/envs/notebook/lib/python3.8/site-packages/rioxarray/raster_array.py in to_raster(self, raster_path, driver, dtype, tags, windowed, recalc_transform, lock, compute, **profile_kwargs)
967 )
968
--> 969 return RasterioWriter(raster_path=raster_path).to_raster(
970 xarray_dataarray=self._obj,
971 tags=tags,
/srv/conda/envs/notebook/lib/python3.8/site-packages/rioxarray/raster_writer.py in to_raster(self, xarray_dataarray, tags, windowed, lock, compute, **kwargs)
255
256 if lock and is_dask_collection(xarray_dataarray.data):
--> 257 return dask.array.store(
258 encode_cf_variable(xarray_dataarray).data.astype(numpy_dtype),
259 self,
/srv/conda/envs/notebook/lib/python3.8/site-packages/dask/array/core.py in store(sources, targets, lock, regions, compute, return_stored, **kwargs)
1038 else:
1039 if compute:
-> 1040 compute_as_if_collection(Array, store_dsk, store_keys, **kwargs)
1041 return None
1042 else:
/srv/conda/envs/notebook/lib/python3.8/site-packages/dask/base.py in compute_as_if_collection(cls, dsk, keys, scheduler, get, **kwargs)
311 schedule = get_scheduler(scheduler=scheduler, cls=cls, get=get)
312 dsk2 = optimization_function(cls)(dsk, keys, **kwargs)
--> 313 return schedule(dsk2, keys, **kwargs)
314
315
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2669 should_rejoin = False
2670 try:
-> 2671 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2672 finally:
2673 for f in futures.values():
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1946 else:
1947 local_worker = None
-> 1948 return self.sync(
1949 self._gather,
1950 futures,
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
843 return future
844 else:
--> 845 return sync(
846 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
847 )
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
324 if error[0]:
325 typ, exc, tb = error[0]
--> 326 raise exc.with_traceback(tb)
327 else:
328 return result[0]
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/utils.py in f()
307 if callback_timeout is not None:
308 future = asyncio.wait_for(future, callback_timeout)
--> 309 result[0] = yield future
310 except Exception:
311 error[0] = sys.exc_info()
/srv/conda/envs/notebook/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1811 exc = CancelledError(key)
1812 else:
-> 1813 raise exception.with_traceback(traceback)
1814 raise exc
1815 if errors == "skip":
/srv/conda/envs/notebook/lib/python3.8/site-packages/dask/array/core.py in store_chunk()
4010
4011 def store_chunk(x, out, index, lock, return_stored):
-> 4012 return load_store_chunk(x, out, index, lock, return_stored, False)
4013
4014
/srv/conda/envs/notebook/lib/python3.8/site-packages/dask/array/core.py in load_store_chunk()
3997 if x is not None:
3998 if is_arraylike(x):
-> 3999 out[index] = x
4000 else:
4001 out[index] = np.asanyarray(x)
/srv/conda/envs/notebook/lib/python3.8/site-packages/rioxarray/raster_writer.py in __setitem__()
193 chx = xxx.stop - xxx.start
194
--> 195 with rasterio.open(self.raster_path, "r+") as rds:
196 rds.write(item, window=Window(chx_off, chy_off, chx, chy), indexes=indexes)
197
/srv/conda/envs/notebook/lib/python3.8/site-packages/rasterio/env.py in wrapper()
435
436 with env_ctor(session=session):
--> 437 return f(*args, **kwds)
438
439 return wrapper
/srv/conda/envs/notebook/lib/python3.8/site-packages/rasterio/__init__.py in open()
220 s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)
221 elif mode == "r+":
--> 222 s = get_writer_for_path(path, driver=driver)(
223 path, mode, driver=driver, sharing=sharing, **kwargs
224 )
/srv/conda/envs/notebook/lib/python3.8/site-packages/rasterio/io.py in get_writer_for_path()
196 """Return the writer class appropriate for the existing dataset."""
197 if not driver:
--> 198 driver = get_dataset_driver(path)
199 return get_writer_for_driver(driver)
rasterio/_base.pyx in rasterio._base.get_dataset_driver()
TypeError: output.tif: No such file or directory
It seems like it's trying to read the output file before it has written it, and therefore can't find the file.
Everything works fine if I remove the lock argument.
This seems to work fine on a local machine when using a local Dask distributed client, created using:
from dask.distributed import Client
client = Client()
So it seems like it is something about the cluster actually being distributed that is causing the problem.
It seems you are using a multi-machine cluster. The only way for this to work is if they are sharing the same disk space. Are you able to write to a shared drive?
That seems obvious in retrospect @snowman2 - thank you.
I had actually started by trying to write a COG to a cloud storage bucket, and when that failed I tried writing locally in parallel, without realising that there wasn't a shared place for the workers to write to.
The only way for this to work is if they are sharing the same disk space
@snowman2 do you know if that's a limitation of rasterio / GDAL / COG? I'm guessing it's just not possible to write in parallel to non-overlapping windows of the same COG file?
Code Sample, a copy-pastable example if possible
Problem description
This gives the following error:
It seems like it's trying to read the output file before it has written it, and therefore can't find the file.
Everything works fine if I remove the
lock
argument. This seems to work fine on a local machine when using a local Dask distributed client, created using:
So it seems like it is something about the cluster actually being distributed that is causing the problem.
Expected Output
The write succeeds, in parallel using Dask.
Environment Information
rioxarray (0.7.1) deps:
rasterio: 1.2.9
xarray: 0.19.0
GDAL: 3.3.2
Other python deps:
scipy: 1.7.1
pyproj: 3.2.1
System:
python: 3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:52:28) [GCC 9.4.0]
executable: /srv/conda/envs/notebook/bin/python
machine: Linux-5.4.0-1040-azure-x86_64-with-glibc2.10
Installation method
Unknown - using Microsoft Planetary Computer
The text was updated successfully, but these errors were encountered: