From 34ede8a48bf3c9c901acc6ae7db3f12fbe2d0f2b Mon Sep 17 00:00:00 2001 From: Dan Allan Date: Wed, 27 Jul 2022 19:25:25 -0400 Subject: [PATCH] Add mimetype_detection_hook. (#259) * Add mimetype_detection_hook. * Resolve object from config. * Change precedence and signature. * WIP: Document custom file types. * Add new docs page to index. * Bump max overflow. We misunderstood what this is for. * Cover end-to-end story for custom formats. * Consistent headers * more polish --- .../explanations/specialized-formats.md | 2 +- docs/source/how-to/read-custom-formats.md | 288 ++++++++++++++++++ docs/source/index.md | 1 + tiled/_tests/test_directory_walker.py | 45 ++- tiled/adapters/files.py | 46 ++- .../config_schemas/service_configuration.yml | 2 +- tiled/server/settings.py | 2 +- 7 files changed, 375 insertions(+), 11 deletions(-) create mode 100644 docs/source/how-to/read-custom-formats.md diff --git a/docs/source/explanations/specialized-formats.md b/docs/source/explanations/specialized-formats.md index d94b35ad8..72ea0225f 100644 --- a/docs/source/explanations/specialized-formats.md +++ b/docs/source/explanations/specialized-formats.md @@ -1,4 +1,4 @@ -# Reading and Exporting Specialized Formats +# Case Study: Reading and Exporting a Specialized Format diff --git a/docs/source/how-to/read-custom-formats.md b/docs/source/how-to/read-custom-formats.md new file mode 100644 index 000000000..a74d8b061 --- /dev/null +++ b/docs/source/how-to/read-custom-formats.md @@ -0,0 +1,288 @@ +# Serve Files with Custom Formats + +Out of the box, Tiled can serve a directory of files that have common formats +with recognizable file names like `*.csv`, `*.tiff`, or `*.h5`. In this guide, +we will configure it to recognize files that have nonstandard (e.g. custom) +names and/or custom formats. + +```{note} + +Tiled is not limited to serving data from files. 
+ +Large deployments typically involve a database, supporting fast search on +metadata, and perhaps external files or "blob stores" with large data. + +But starting with files is a good way to get rolling with Tiled. +``` + +## Formats are named using "MIME types" + +Tiled refers to formats using a web standard called +[MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types). +MIME types look like: + +``` +text/csv +image/png +application/x-hdf5 +``` + +There is an +[official list](https://www.iana.org/assignments/media-types/media-types.xhtml) +of registered MIME types, and if an official one exists we use it. If +a format is not registered, then the standard tells us to use `text/x-SOMETHING` if the +format is textual or `application/x-SOMETHING` if it is binary. For example, +we use `text/x-xdi` for XDI and `application/x-hdf5` for HDF5, formats which +are not registered. + +## Case 1: Unfamiliar File Extension + +Suppose you have data files that are formatted in a supported format like CSVs. +If they were named `*.csv` then Tiled could handle them natively without any +additional configuration: + +``` +tiled serve directory path/to/directory +``` + +But if they use some unusual extension like `*.stuff` Tiled needs to be +told that it should read `*.stuff` files like CSVs. + +### Map the unfamiliar file extension to a MIME type + +We use a configuration file like this: + +```yaml +# config.yml +trees: + tree: files + args: + directory: path/to/directory + mimetypes_by_file_ext: + .stuff: text/csv +``` + +We are mapping the file extension, `.stuff` (including the leading `.`) to +the MIME type `text/csv`. + +Multiple file extensions can be mapped to the same MIME type. For example, +Tiled's default configuration maps both `.tif` and `.tiff` to `image/tiff`. + +We then use the configuration file like this: + +``` +tiled serve config config.yml +``` + +The configuration file `config.yml` can be named anything you like. 
+ +## Case 2: No File Extension + +Not all files have a name like `name.extension`. Some have no dot, like: + +``` +data0001 +data0002 +data0003 +``` + +Others do have a dot, but the part after the dot is not really a file +extension; it does not signify the _format_. Instead, it's scientific metadata +of some kind, as in: + +``` +polymer_10_new_Ck150V.2050 +polymer_10_new_Ck150V.3050 +polymer_10_new_Ck150V.4050 +``` + +### Write a custom function for detecting the MIME type + +The best solution is to avoid naming files like this, but we cannot always +control how our files are named. To cope with this, we need to write a +Python function. + +```python +# custom.py + +def detect_mimetype(filepath, mimetype): + if mimetype is None: + # If we are here, detection based on file extension came up empty. + ... + mimetype = "text/csv" + return mimetype +``` + +The function `detect_mimetype` will be passed the full `filepath` (e.g. +`path/to/filename`) not just the filename. It can use this to examine the +filename or even open the file to, for example, look for a +[file signature](https://en.wikipedia.org/wiki/List_of_file_signatures). The +function will also be passed the `mimetype`, if any, that was detected based on +its file extension. Therefore, this function can be used to catch files that +have no file extension or to _override_ the determination based on file extension +if it is wrong. + +If the Python script `custom.py` is placed in the same directory as +`config.yml`, Tiled will find it. (Tiled temporarily adds the directory +containing the configuration file(s) to the Python import path while +it parses the configuration.) 
+ +```yaml +# config.yml +trees: + tree: files + args: + directory: path/to/directory + mimetype_detection_hook: custom:detect_mimetype +``` + +Alternatively, the function can be defined in some external Python package +like `my_package.my_module.func` and configured like + +``` +mimetype_detection_hook: my_package.my_module:func +``` + +Note that the packages are separated by `.` but the final object (`func`) is +preceded by a `:`. If you forget this, Tiled will raise a clear error to remind +you. + +The names `custom.py` and `detect_mimetype` are arbitrary. The +`mimetype_detection_hook` may be used in combination with +`mimetypes_by_file_ext`. + +As in Case 1, we use the configuration file like this: + +``` +tiled serve config config.yml +``` + +## Case 3: Custom Format + +In this case we have a format that Tiled cannot read. It's not just a familiar +format with an unfamiliar name; it's a new format that Tiled needs to +be taught how to read. + +### Choose a MIME type + +Referring back to the top of this guide, we need to choose a MIME type +to refer to this format by. As an example, we'll call our format + +``` +application/x-stuff +``` + +There is, of course, some risk of name collisions when we invent names outside of +the +[official list](https://www.iana.org/assignments/media-types/media-types.xhtml) +of MIME types, so be specific. + +### Write a custom adapter + +Tiled must represent the content of your file as: + +* An array + a dictionary of metadata +* A table (dataframe) + dictionary of metadata +* A nested structure (i.e. directory-like hierarchy) of the above + +You must choose which is appropriate for this data format. Examples +for each structure follow. + +#### Simple Array example + +```py +# custom.py +from tiled.adapters.array import ArrayAdapter + +def read_custom_format(filepath): + # Extract an array and an optional dictionary of metadata + # from your file. + array = ... # a numpy array + metadata = ... 
# a dictionary or None + return ArrayAdapter.from_array(array, metadata=metadata) +``` + +#### Simple Tabular (DataFrame) example + +```py +# custom.py +from tiled.adapters.dataframe import DataFrameAdapter + +def read_custom_format(filepath): + # Extract a DataFrame and an optional dictionary of metadata + # from your file. + df = ... # a pandas DataFrame + metadata = ... # a dictionary or None + return DataFrameAdapter.from_pandas(df, npartitions=1, metadata=metadata) +``` + +#### Simple Nested Structure example + +```py +# custom.py +from tiled.adapters.array import ArrayAdapter +from tiled.adapters.dataframe import DataFrameAdapter +from tiled.adapters.mapping import MapAdapter + +def read_custom_format(filepath): + + # Build a dictionary (potentially nested) of arrays and/or dataframes. + # See examples above for ArrayAdapter and DataFrameAdapter usage. + + return MapAdapter( + { + "stuff": ArrayAdapter.from_array(...), + "things": DataFrameAdapter.from_pandas(...), + }, + metadata={...}, + ) +``` + +#### Advanced: Delay I/O + +See the implementations in the package `tiled.adapters` for more advanced +examples, especially ways to defer reading the entire file up front if the user +only wants to read part of it. + +#### Advanced: Mark up Structure with optional "Specs" + +If the array, table, or nested structure follows some convention or standard +for its internal layout or naming scheme, it can be useful to notate that. +Some Tiled clients may be able to use that information to provide additional +functionality or performance. + +See :doc:`../explanations/metadata` for more information on Specs. + +Specify them as an argument to the Adapter, as in: + +```py +DataFrameAdapter(..., specs=["xdi"]) +``` + +### Configure Tiled to use this Adapter + +Our configuration file should use `mimetypes_by_file_ext` (Case 1) or +`mimetype_detection_hook` (Case 2) to recognize this custom file. 
+Additionally, it should add a section `readers_by_mimetype` to +map our MIME type `application/x-stuff` to our custom function. + +Again, Tiled will find `custom.py` if it is placed in the same directory as +`config.yml`. The name is arbitrary, and you can have multiple such files if +needed. + +```yaml +# config.yml +trees: + tree: files + args: + directory: path/to/directory + mimetype_detection_hook: custom:detect_mimetype + readers_by_mimetype: + application/x-stuff: custom:read_custom_format +``` + +We then use the configuration file like this: + +``` +tiled serve config config.yml +``` diff --git a/docs/source/index.md b/docs/source/index.md index 38dcd073e..890ced211 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -20,6 +20,7 @@ tutorials/plotly-integration how-to/client-logger how-to/configuration +how-to/read-custom-formats how-to/custom-export-formats how-to/profiles how-to/api-keys diff --git a/tiled/_tests/test_directory_walker.py b/tiled/_tests/test_directory_walker.py index 658944dab..3c7b25dc0 100644 --- a/tiled/_tests/test_directory_walker.py +++ b/tiled/_tests/test_directory_walker.py @@ -7,7 +7,7 @@ import tifffile from ..adapters.array import ArrayAdapter -from ..adapters.files import Change, strip_suffixes +from ..adapters.files import Change, identity, strip_suffixes from ..client import from_config from ..examples.generate_files import data, df1, generate_files from .utils import force_update @@ -294,3 +294,46 @@ def test_sort(example_data_dir): } client = from_config(config) list(client.sort(("does_not_exsit", 1))) + + +def test_mimetype_detection_hook(tmpdir): + content = "a, b, c\n1, 2 ,3\n4, 5, 6\n" + with open(Path(tmpdir / "a0"), "w") as file: + file.write(content) + with open(Path(tmpdir / "b0"), "w") as file: + file.write(content) + with open(Path(tmpdir / "c.csv"), "w") as file: + file.write(content) + with open(Path(tmpdir / "a.0.asfwoeijviojefeiofw"), "w") as file: + file.write(content) + with open(Path(tmpdir / 
"b.0.asfwoeijviojefeiofw"), "w") as file: + file.write(content) + + def detect_mimetype(path, mimetype): + filename = Path(path).name + # If detection based on file extension worked, + # we should get that in the mimetype. Otherwise, + # mimetype should be None. + if filename.endswith(".csv"): + assert mimetype == "text/csv" + else: + assert mimetype is None + if filename.startswith("a"): + return "text/csv" + return mimetype + + config = { + "trees": [ + { + "tree": "tiled.adapters.files:DirectoryAdapter.from_directory", + "path": "/", + "args": { + "directory": str(tmpdir), + "mimetype_detection_hook": detect_mimetype, + "key_from_filename": identity, + }, + } + ] + } + client = from_config(config) + assert set(client) == {"a0", "a.0.asfwoeijviojefeiofw", "c.csv"} diff --git a/tiled/adapters/files.py b/tiled/adapters/files.py index 63e97e888..cecbfd155 100644 --- a/tiled/adapters/files.py +++ b/tiled/adapters/files.py @@ -119,6 +119,7 @@ def from_directory( ignore_re_files=None, readers_by_mimetype=None, mimetypes_by_file_ext=None, + mimetype_detection_hook=None, subdirectory_handler=None, key_from_filename=strip_suffixes, metadata=None, @@ -146,6 +147,11 @@ def from_directory( Map a mimetype to a Reader suitable for that mimetype mimetypes_by_file_ext : dict, optional Map a file extension (e.g. '.tif') to a mimetype (e.g. 'image/tiff') + mimetype_detection_hook: callable, optional + Signature: f(filepath) -> str + + It may return a registered mimetype like 'text/csv' or + a custom unregistered mimetype 'text/x-specfile'. subdirectory_handler : callable, optional Given a (relative) filepath to a direj key_from_filename : callable[str] -> str, @@ -183,6 +189,8 @@ def from_directory( "appearing later, use error_if_missing=False." 
) readers_by_mimetype = readers_by_mimetype or {} + if mimetype_detection_hook is not None: + mimetype_detection_hook = import_object(mimetype_detection_hook) # If readers_by_mimetype comes from a configuration file, # objects are given as importable strings, like "package.module:Reader". for key, value in list(readers_by_mimetype.items()): @@ -234,6 +242,7 @@ def from_directory( subdirectory_handler, merged_readers_by_mimetype, merged_mimetypes_by_file_ext, + mimetype_detection_hook, key_from_filename, initial_scan_complete, watcher_thread_kill_switch, @@ -323,6 +332,7 @@ def from_directory( reader_factory = _reader_factory_for_file( merged_readers_by_mimetype, merged_mimetypes_by_file_ext, + mimetype_detection_hook, Path(root, filename), ) except NoReaderAvailable: @@ -431,6 +441,7 @@ def _watch( subdirectory_handler, readers_by_mimetype, mimetypes_by_file_ext, + mimetype_detection_hook, key_from_filename, initial_scan_complete, watcher_thread_kill_switch, @@ -452,6 +463,7 @@ def _watch( directory, readers_by_mimetype, mimetypes_by_file_ext, + mimetype_detection_hook, key_from_filename, index, subdirectory_trie, @@ -465,6 +477,7 @@ def _watch( directory, readers_by_mimetype, mimetypes_by_file_ext, + mimetype_detection_hook, key_from_filename, index, subdirectory_trie, @@ -490,6 +503,7 @@ def _process_changes( directory, readers_by_mimetype, mimetypes_by_file_ext, + mimetype_detection_hook, key_from_filename, index, subdirectory_trie, @@ -574,7 +588,10 @@ def _process_changes( ) try: reader_factory = _reader_factory_for_file( - readers_by_mimetype, mimetypes_by_file_ext, path + readers_by_mimetype, + mimetypes_by_file_ext, + mimetype_detection_hook, + path, ) except NoReaderAvailable: # Ignore this file in the future. 
@@ -606,6 +623,7 @@ def _process_changes( directory, readers_by_mimetype, mimetypes_by_file_ext, + mimetype_detection_hook, key_from_filename, index, subdirectory_trie, @@ -629,7 +647,10 @@ def _process_changes( # that this could be the first time we see this path. try: reader_factory = _reader_factory_for_file( - readers_by_mimetype, mimetypes_by_file_ext, path + readers_by_mimetype, + mimetypes_by_file_ext, + mimetype_detection_hook, + path, ) except NoReaderAvailable: # Ignore this file in the future. @@ -648,8 +669,13 @@ def _process_changes( changes_callback(changes) -def _reader_factory_for_file(readers_by_mimetype, mimetypes_by_file_ext, path): +def _reader_factory_for_file( + readers_by_mimetype, mimetypes_by_file_ext, mimetype_detection_hook, path +): + # First, try to infer the mimetype from the file extension. ext = "".join(path.suffixes) # e.g. ".h5" or ".tar.gz" + # User-specified mapping from file extension to mimetype + # gets priority. if ext in mimetypes_by_file_ext: mimetype = mimetypes_by_file_ext[ext] else: @@ -657,14 +683,20 @@ def _reader_factory_for_file(readers_by_mimetype, mimetypes_by_file_ext, path): # from file extension. This loads data about mimetypes from # the operating system the first time it is used. mimetype, _ = mimetypes.guess_type(str(path)) + # Finally, user-specified function has the opportunity to + # look at more than just the file extension. This gets access to the full + # path, so it can consider the file name and even open the file. It is also + # passed the mimetype determined above, or None if no match was found. + if mimetype_detection_hook is not None: + mimetype = mimetype_detection_hook(path, mimetype) if mimetype is None: msg = ( f"The file at {path} has a file extension {ext} this is not " - "recognized. The file will be skipped, pass in a mimetype " + "recognized. The file will be skipped. 
Pass in a mimetype " "for this file extension via the parameter " - "DirectoryAdapter.from_directory(..., mimetypes_by_file_ext={...}) and " - "pass in a Reader than handles this mimetype via " - "the parameter DirectoryAdapter.from_directory(..., readers_by_mimetype={...})." + "DirectoryAdapter.from_directory(..., mimetypes_by_file_ext={...}) " + "or a function for determining the mimetype based the full filepath " + "DirectoryAdapter.from_directory(..., mimetype_detection_hook=func)." ) warnings.warn(msg) raise NoReaderAvailable diff --git a/tiled/config_schemas/service_configuration.yml b/tiled/config_schemas/service_configuration.yml index 33435e86e..50d4ae94e 100644 --- a/tiled/config_schemas/service_configuration.yml +++ b/tiled/config_schemas/service_configuration.yml @@ -315,7 +315,7 @@ properties: minimum: 2 # sqlalchemy raises if you try a pool_size of 1 max_overflow: type: integer - description: Connection pool max overflow. Default is 0. + description: Connection pool max overflow. Default is 5. access_control: type: object additionalProperties: false diff --git a/tiled/server/settings.py b/tiled/server/settings.py index 6c3ace03d..46a372c1f 100644 --- a/tiled/server/settings.py +++ b/tiled/server/settings.py @@ -58,7 +58,7 @@ class Settings(BaseSettings): int(os.getenv("TILED_DATABASE_POOL_PRE_PING", 1)) ) database_max_overflow: Optional[int] = int( - os.getenv("TILED_DATABASE_MAX_OVERFLOW", 0) + os.getenv("TILED_DATABASE_MAX_OVERFLOW", 5) ) @property