# Build documentation and commit to gh-pages branch.
#
# On every push to master: check out the source and the gh-pages branch side
# by side, install DELTA and pdoc3, regenerate the HTML docs into the gh-pages
# checkout, and push the result if (and only if) something changed.

name: Build and Push Documentation to gh-pages Branch

on:
  push:
    branches: [ 'master']

jobs:
  build_and_push_docs:
    runs-on: ubuntu-latest

    steps:
    - name: Checkout
      uses: actions/checkout@v2
      with:
        path: repo/
    - name: Checkout gh-pages
      uses: actions/checkout@v2
      with:
        path: docs/
        ref: gh-pages
    - name: Set up Python 3.8
      uses: actions/setup-python@v2
      with:
        # Quoted so YAML keeps it a string (an unquoted 3.10 would become 3.1).
        python-version: '3.8'
    - name: Install pdoc3
      run: |
        python3 -m pip install pdoc3
    - name: Install DELTA
      run: |
        cd repo
        ./scripts/setup.sh
        python3 -m pip install .
    - name: Build Documentation
      run: |
        ./repo/scripts/docs.sh ./docs/
    - name: Commit and Push
      run: |
        # Attribute the docs commit to the author of the source commit.
        EMAIL="$(git -C repo show -s --format='%ae' HEAD)"
        NAME="$(git -C repo show -s --format='%an' HEAD)"
        cd docs/
        git add .
        git config user.email "$EMAIL"
        git config user.name "$NAME"
        # `git commit` exits non-zero when nothing is staged, which would fail
        # the job for every push that does not change the generated docs.
        if git diff --cached --quiet; then
          echo "Documentation unchanged; nothing to commit."
        else
          git commit -m "Automatic update for $GITHUB_SHA."
          git push origin gh-pages
        fi
import sys -import argparse -from delta.config import config -from delta.subcommands import commands - -def main(args): - parser = argparse.ArgumentParser(description='DELTA Machine Learning Toolkit') - subparsers = parser.add_subparsers() - - for d in commands.SETUP_COMMANDS: - d(subparsers) - - try: - options = parser.parse_args(args[1:]) - except argparse.ArgumentError: - parser.print_help(sys.stderr) - sys.exit(1) - - if not hasattr(options, 'function'): - parser.print_help(sys.stderr) - sys.exit(1) - - config.initialize(options) - return options.function(options) +from delta.subcommands import main if __name__ == "__main__": - sys.exit(main(sys.argv)) + sys.exit(main.main(sys.argv)) diff --git a/delta/config/README.md b/delta/config/README.md index f3985eaa..e7ff0f42 100644 --- a/delta/config/README.md +++ b/delta/config/README.md @@ -107,7 +107,7 @@ Used in the `delta train` and `delta mlflow_ui` commands to keep track of traini networks from different stages of training. * `frequency`: Frequency in batches to save a checkpoint. Networks can require a fair amount of disk space, so don't save too often. - * `save_latest`: If true, only keep the network file from the most recent checkpoint. + * `only_save_latest`: If true, only keep the network file from the most recent checkpoint. TensorBoard ----------- diff --git a/delta/config/config.py b/delta/config/config.py index bf47c894..bbb2127f 100644 --- a/delta/config/config.py +++ b/delta/config/config.py @@ -35,6 +35,9 @@ def validate_positive(num, _): raise ValueError('%d is not positive' % (num)) return num +class _NotSpecified: #pylint:disable=too-few-public-methods + pass + class DeltaConfigComponent: """ DELTA configuration component. 
@@ -78,7 +81,7 @@ def register_component(self, component, name : str, attr_name = None): attr_name = name setattr(self, attr_name, component) - def register_field(self, name : str, types, accessor = None, cmd_arg = None, validate_fn = None, desc = None): + def register_field(self, name : str, types, accessor = None, validate_fn = None, desc = None): """ Register a field in this component of the configuration. @@ -92,7 +95,6 @@ def register_field(self, name : str, types, accessor = None, cmd_arg = None, val self._fields.append(name) self._validate[name] = validate_fn self._types[name] = types - self._cmd_args[name] = cmd_arg self._descs[name] = desc if accessor: def access(self) -> types: @@ -101,14 +103,44 @@ def access(self) -> types: access.__doc__ = desc setattr(self.__class__, accessor, access) + def register_arg(self, field, argname, **kwargs): + """ + Registers a command line argument in this component. + + field is the (registered) field this argument modifies. + argname is the name of the flag on the command line (i.e., '--flag') + **kwargs are arguments to ArgumentParser.add_argument. + + If help and type are not specified, will use the ones for the field. + If default is not specified, will use the value from the config files. + """ + assert field in self._fields, 'Field %s not registered.' % (field) + if 'help' not in kwargs: + kwargs['help'] = self._descs[field] + if 'type' not in kwargs: + kwargs['type'] = self._types[field] + elif kwargs['type'] is None: + del kwargs['type'] + if 'default' not in kwargs: + kwargs['default'] = _NotSpecified + self._cmd_args[argname] = (field, kwargs) + + def to_dict(self) -> dict: + """ + Returns a dictionary representing the config object. + """ + if isinstance(self._config_dict, dict): + exp = self._config_dict.copy() + for (name, c) in self._components.items(): + exp[name] = c.to_dict() + return exp + return self._config_dict + def export(self) -> str: """ - Returns a YAML string of all configuration options. 
+ Returns a YAML string of all configuration options, from to_dict. """ - exp = self._config_dict.copy() - for (name, c) in self._components.items(): - exp[name] = c.export() - return yaml.dump(exp) + return yaml.dump(self.to_dict()) def _set_field(self, name : str, value : str, base_dir : str): if name not in self._fields: @@ -139,12 +171,9 @@ def setup_arg_parser(self, parser, components = None) -> None: """ if self._section_header is not None: parser = parser.add_argument_group(self._section_header) - for name in self._fields: - c = self._cmd_args[name] - if c is None: - continue - parser.add_argument(c, dest=c.replace('-', '_'), required=False, - type=self._types[name], help=self._descs[name]) + for (arg, value) in self._cmd_args.items(): + (field, kwargs) = value + parser.add_argument(arg, dest=field, **kwargs) for (name, c) in self._components.items(): if components is None or name in components: @@ -157,14 +186,12 @@ def parse_args(self, options): configuration values. """ d = {} - for name in self._fields: - c = self._cmd_args[name] - if c is None: + for (field, _) in self._cmd_args.values(): + if not hasattr(options, field) or getattr(options, field) is None: continue - n = c.replace('-', '_') - if not hasattr(options, n) or getattr(options, n) is None: + if getattr(options, field) is _NotSpecified: continue - d[name] = getattr(options, n) + d[field] = getattr(options, field) self._load_dict(d, None) for c in self._components.values(): @@ -183,6 +210,7 @@ def load(self, yaml_file: str = None, yaml_str: str = None): """ base_path = None if yaml_file: + #print("Loading config file: " + yaml_file) if not os.path.exists(yaml_file): raise Exception('Config file does not exist: ' + yaml_file) with open(yaml_file, 'r') as f: diff --git a/delta/config/delta.yaml b/delta/config/delta.yaml index 55d3b29c..82dbf2f8 100644 --- a/delta/config/delta.yaml +++ b/delta/config/delta.yaml @@ -1,6 +1,7 @@ general: # negative is all gpus: -1 + stop_on_input_error: true # If 
false skip past bad input files without halting training io: threads: 1 @@ -10,12 +11,16 @@ io: interleave_images: 5 # ratio of tile width and height when loading images tile_ratio: 5.0 + # When resuming training with a log_folder, skip input image where we have + # already loaded this many tiles. + resume_cutoff: 5000 cache: # default is OS-specific, in Linux, ~/.cache/delta dir: default limit: 8 dataset: + log_folder: ~ # Storage location for any record keeping files about the input dataset images: type: tiff # preprocess the images when loading (i.e., scaling) @@ -39,11 +44,33 @@ dataset: file_list: ~ files: ~ + # can either be a list of classes or the number of classes, + # if the labels are 0, 1, 2, 3 + classes: 4 + # labels are 1, 2, 3, 4; with names and display colors + # weight is optional, but required for either all or none + #classes: + # - 1: + # name: Water + # color: 0x67a9cf + # weight: 5.0 + # - 2: + # name: No Water + # color: 0xf6eff7 + # weight: 1.0 + # - 3: + # name: Maybe Water + # color: 0xbdc9e1 + # weight: 1.0 + # - 4: + # name: Cloud + # color: 0x02818a + # weight: 1.0 + train: network: chunk_size: 16 output_size: 8 - classes: 4 model: yaml_file: networks/convpool.yaml params: ~ @@ -91,8 +118,8 @@ mlflow: experiment_name: Default # rate in batches to save model checkpoints checkpoints: - frequency: 10000 - save_latest: true + frequency: 10000 + only_save_latest: true tensorboard: enabled: false diff --git a/delta/config/modules.py b/delta/config/modules.py new file mode 100644 index 00000000..2b53419d --- /dev/null +++ b/delta/config/modules.py @@ -0,0 +1,33 @@ +# Copyright © 2020, United States Government, as represented by the +# Administrator of the National Aeronautics and Space Administration. +# All rights reserved. +# +# The DELTA (Deep Earth Learning, Tools, and Analysis) platform is +# licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0. +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Registers all config modules. +""" + +import delta.imagery.imagery_config +import delta.ml.ml_config + +_config_initialized = False +def register_all(): + global _config_initialized #pylint: disable=global-statement + # needed to call twice when testing subcommands and when not + if _config_initialized: + return + delta.imagery.imagery_config.register() + delta.ml.ml_config.register() + _config_initialized = True diff --git a/delta/config/networks/autoencoder_conv.yaml b/delta/config/networks/autoencoder_conv.yaml index c1dd1938..751c0c1c 100644 --- a/delta/config/networks/autoencoder_conv.yaml +++ b/delta/config/networks/autoencoder_conv.yaml @@ -2,28 +2,28 @@ layers: - Input: shape: in_shape - Conv2D: - filters: 300 + filters: 50 kernel_size: [3, 3] activation: relu padding: same - MaxPooling2D: pool_size: [2, 2] - Conv2D: - filters: 150 + filters: 50 kernel_size: [3, 3] activation: relu padding: same - MaxPooling2D: pool_size: [2, 2] - Conv2D: - filters: 75 + filters: 50 kernel_size: [3, 3] activation: relu padding: same - UpSampling2D: size: [2, 2] - Conv2D: - filters: 150 + filters: 50 kernel_size: [3, 3] activation: relu padding: same diff --git a/delta/config/networks/autoencoder_conv_med_filters.yaml b/delta/config/networks/autoencoder_conv_med_filters.yaml new file mode 100644 index 00000000..505deaf6 --- /dev/null +++ b/delta/config/networks/autoencoder_conv_med_filters.yaml @@ -0,0 +1,36 @@ +layers: + - Input: + shape: in_shape + - Conv2D: + filters: 50 + kernel_size: [5, 5] + activation: relu + padding: same + - MaxPooling2D: + 
pool_size: [2, 2] + - Conv2D: + filters: 50 + kernel_size: [5, 5] + activation: relu + padding: same + - MaxPooling2D: + pool_size: [2, 2] + - Conv2D: + filters: 50 + kernel_size: [5, 5] + activation: relu + padding: same + - UpSampling2D: + size: [2, 2] + - Conv2D: + filters: 50 + kernel_size: [5, 5] + activation: relu + padding: same + - UpSampling2D: + size: [2, 2] + - Conv2D: + filters: num_bands + kernel_size: [5, 5] + activation: relu + padding: same diff --git a/delta/config/networks/autoencoder_conv_wide_filters.yaml b/delta/config/networks/autoencoder_conv_wide_filters.yaml new file mode 100644 index 00000000..7548cd72 --- /dev/null +++ b/delta/config/networks/autoencoder_conv_wide_filters.yaml @@ -0,0 +1,36 @@ +layers: + - Input: + shape: in_shape + - Conv2D: + filters: 50 + kernel_size: [7, 7] + activation: relu + padding: same + - MaxPooling2D: + pool_size: [2, 2] + - Conv2D: + filters: 50 + kernel_size: [7, 7] + activation: relu + padding: same + - MaxPooling2D: + pool_size: [2, 2] + - Conv2D: + filters: 50 + kernel_size: [7, 7] + activation: relu + padding: same + - UpSampling2D: + size: [2, 2] + - Conv2D: + filters: 50 + kernel_size: [7, 7] + activation: relu + padding: same + - UpSampling2D: + size: [2, 2] + - Conv2D: + filters: num_bands + kernel_size: [7, 7] + activation: relu + padding: same diff --git a/delta/config/networks/convpool.yaml b/delta/config/networks/convpool.yaml index bf4fb806..c1f8d0c9 100644 --- a/delta/config/networks/convpool.yaml +++ b/delta/config/networks/convpool.yaml @@ -1,6 +1,6 @@ params: dropout_rate: 0.3 - num_filters: 100 + num_filters: 64 layers: - Input: shape: in_shape diff --git a/delta/config/networks/segnet-medium.yaml b/delta/config/networks/segnet-medium.yaml index 21c0f6f5..749b66e3 100644 --- a/delta/config/networks/segnet-medium.yaml +++ b/delta/config/networks/segnet-medium.yaml @@ -2,7 +2,7 @@ layers: - Input: shape: in_shape - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same 
use_bias: false @@ -11,7 +11,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -24,7 +24,7 @@ layers: strides: 2 name: pooling_1 - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -33,7 +33,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -49,7 +49,7 @@ layers: size: [2, 2] name: upsampling_1 - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -58,7 +58,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -70,7 +70,7 @@ layers: size: [2, 2] name: upsampling_2 - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -79,7 +79,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same diff --git a/delta/config/networks/segnet-short-fewer-filters.yaml b/delta/config/networks/segnet-short-fewer-filters.yaml new file mode 100644 index 00000000..c044bcac --- /dev/null +++ b/delta/config/networks/segnet-short-fewer-filters.yaml @@ -0,0 +1,51 @@ +layers: + - Input: + shape: in_shape + - Conv2D: + filters: 64 + kernel_size: [7, 7] + padding: same + use_bias: false + name: conv_1_1 + - BatchNormalization: + - Activation: + activation: relu + - Conv2D: + filters: 64 + kernel_size: [7, 7] + padding: same + use_bias: false + name: conv_1_2 + - BatchNormalization: + - Activation: + activation: relu + - MaxPooling2D: + pool_size: [2, 2] + strides: 2 + name: pooling_1 + - UpSampling2D: + size: [2, 2] + name: upsampling_5 + - Conv2DTranspose: + filters: 64 + kernel_size: [7, 7] + strides: [1, 1] + padding: same + name: conv_T_5_1 + - BatchNormalization: + - Activation: + activation: relu + - Conv2DTranspose: 
+ filters: 64 + kernel_size: [7, 7] + strides: [1, 1] + padding: same + name: conv_T_5_2 + - BatchNormalization: + - Activation: + activation: relu + - Conv2D: + filters: num_bands + kernel_size: [7, 7] + activation: relu + padding: same diff --git a/delta/config/networks/segnet-short-small-filters.yaml b/delta/config/networks/segnet-short-small-filters.yaml new file mode 100644 index 00000000..4b0b9498 --- /dev/null +++ b/delta/config/networks/segnet-short-small-filters.yaml @@ -0,0 +1,51 @@ +layers: + - Input: + shape: in_shape + - Conv2D: + filters: 100 + kernel_size: [5, 5] + padding: same + use_bias: false + name: conv_1_1 + - BatchNormalization: + - Activation: + activation: relu + - Conv2D: + filters: 100 + kernel_size: [5, 5] + padding: same + use_bias: false + name: conv_1_2 + - BatchNormalization: + - Activation: + activation: relu + - MaxPooling2D: + pool_size: [2, 2] + strides: 2 + name: pooling_1 + - UpSampling2D: + size: [2, 2] + name: upsampling_5 + - Conv2DTranspose: + filters: 100 + kernel_size: [5, 5] + strides: [1, 1] + padding: same + name: conv_T_5_1 + - BatchNormalization: + - Activation: + activation: relu + - Conv2DTranspose: + filters: 100 + kernel_size: [5, 5] + strides: [1, 1] + padding: same + name: conv_T_5_2 + - BatchNormalization: + - Activation: + activation: relu + - Conv2D: + filters: num_bands + kernel_size: [5, 5] + activation: relu + padding: same diff --git a/delta/config/networks/segnet-short.yaml b/delta/config/networks/segnet-short.yaml index 11fb3113..c044bcac 100644 --- a/delta/config/networks/segnet-short.yaml +++ b/delta/config/networks/segnet-short.yaml @@ -2,7 +2,7 @@ layers: - Input: shape: in_shape - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -11,7 +11,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -27,7 +27,7 @@ layers: size: [2, 2] name: upsampling_5 - Conv2DTranspose: - filters: 
100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -36,7 +36,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same diff --git a/delta/config/networks/segnet.yaml b/delta/config/networks/segnet.yaml index 605afee8..a693b238 100644 --- a/delta/config/networks/segnet.yaml +++ b/delta/config/networks/segnet.yaml @@ -2,7 +2,7 @@ layers: - Input: shape: in_shape - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -11,7 +11,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -24,7 +24,7 @@ layers: strides: 2 name: pooling_1 - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -33,7 +33,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -46,7 +46,7 @@ layers: strides: 2 name: pooling_2 - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -55,7 +55,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -64,7 +64,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -77,7 +77,7 @@ layers: strides: 2 name: pooling_3 - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -86,7 +86,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -95,7 +95,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -108,7 +108,7 @@ layers: strides: 2 name: pooling_4 - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: 
false @@ -117,7 +117,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -126,7 +126,7 @@ layers: - Activation: activation: relu - Conv2D: - filters: 100 + filters: 64 kernel_size: [7, 7] padding: same use_bias: false @@ -142,7 +142,7 @@ layers: size: [2, 2] name: upsampling_1 - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -154,7 +154,7 @@ layers: size: [2, 2] name: upsampling_1 - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -163,7 +163,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -172,7 +172,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -184,7 +184,7 @@ layers: size: [2, 2] name: upsampling_2 - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -193,7 +193,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -202,7 +202,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -214,7 +214,7 @@ layers: size: [2, 2] name: upsampling_3 - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -223,7 +223,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -232,7 +232,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -244,7 +244,7 @@ layers: size: [2, 2] name: upsampling_4 - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: 
[7, 7] strides: [1, 1] padding: same @@ -253,7 +253,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -265,7 +265,7 @@ layers: size: [2, 2] name: upsampling_5 - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same @@ -274,7 +274,7 @@ layers: - Activation: activation: relu - Conv2DTranspose: - filters: 100 + filters: 64 kernel_size: [7, 7] strides: [1, 1] padding: same diff --git a/delta/imagery/imagery_config.py b/delta/imagery/imagery_config.py index a7f912c1..9720a37b 100644 --- a/delta/imagery/imagery_config.py +++ b/delta/imagery/imagery_config.py @@ -142,9 +142,10 @@ def __preprocess_function(image_comp): return None return lambda data, _, dummy: data / np.float32(f) -def load_images_labels(images_comp, labels_comp): +def load_images_labels(images_comp, labels_comp, classes_comp): ''' - Takes two configuration subsections and returns (image set, label set) + Takes two configuration subsections and returns (image set, label set). Also takes classes + configuration to apply preprocessing function to labels. ''' images_dict = images_comp._config_dict #pylint:disable=protected-access labels_dict = labels_comp._config_dict #pylint:disable=protected-access @@ -170,15 +171,18 @@ def load_images_labels(images_comp, labels_comp): if len(labels) != len(images): raise ValueError('%d images found, but %d labels found.' 
% (len(images), len(labels))) - pre = __preprocess_function(labels_comp) + pre = pre_orig = __preprocess_function(labels_comp) + conv = classes_comp.classes_to_indices_func() + if conv is not None: + pre = lambda data, _, dummy: conv(pre_orig(data, _, dummy) if pre_orig is not None else data) return (imageset, ImageSet(labels, labels_dict['type'], pre, labels_dict['nodata_value'])) class ImagePreprocessConfig(DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('enabled', bool, 'enabled', None, None, 'Turn on preprocessing.') - self.register_field('scale_factor', (float, str), 'scale_factor', None, None, 'Image scale factor.') + self.register_field('enabled', bool, 'enabled', None, 'Turn on preprocessing.') + self.register_field('scale_factor', (float, str), 'scale_factor', None, 'Image scale factor.') def _validate_paths(paths, base_dir): out = [] @@ -189,15 +193,18 @@ def _validate_paths(paths, base_dir): class ImageSetConfig(DeltaConfigComponent): def __init__(self, name=None): super().__init__() - self.register_field('type', str, 'type', '--' + name + '-type' if name else None, None, 'Image type.') - self.register_field('files', list, None, None, _validate_paths, 'List of image files.') - self.register_field('file_list', list, None, '--' + name + '-file-list' if name else None, - validate_path, 'File listing image files.') - self.register_field('directory', str, None, '--' + name + '-dir' if name else None, - validate_path, 'Directory of image files.') - self.register_field('extension', str, None, '--' + name + '-extension' if name else None, - None, 'Image file extension.') - self.register_field('nodata_value', float, None, None, None, 'Value of pixels to ignore.') + self.register_field('type', str, 'type', None, 'Image type.') + self.register_field('files', list, None, _validate_paths, 'List of image files.') + self.register_field('file_list', list, None, validate_path, 'File listing image files.') + 
class LabelClass:
    """
    One label class: the pixel value used in label images, plus a display
    name, a display color and an optional training weight.
    """
    # Fallback display colors, indexed by pixel value.
    _COLOR_ORDER = (0x1f77b4, 0xff7f0e, 0x2ca02c, 0xd62728, 0x9467bd, 0x8c564b,
                    0xe377c2, 0x7f7f7f, 0xbcbd22, 0x17becf)

    def __init__(self, value, name=None, color=None, weight=None):
        """
        value is the pixel value of the class in label images.
        name defaults to 'Class <value>'.
        color defaults to the palette entry for value, or 0 if value is
        outside the palette.
        weight is optional and stored as given.
        """
        if name is None:
            name = 'Class ' + str(value)
        if color is None:
            # Require 0 <= value so a negative pixel value does not silently
            # wrap around and pick a color from the end of the palette.
            in_palette = 0 <= value < len(self._COLOR_ORDER)
            color = self._COLOR_ORDER[value] if in_palette else 0
        self.value = value
        self.name = name
        self.color = color
        self.weight = weight

    def __repr__(self):
        return 'Color: ' + self.name

class ClassesConfig(DeltaConfigComponent):
    """
    Configuration component holding the list of label classes.

    Iterating yields the `LabelClass` objects sorted by pixel value; `len()`
    gives the number of classes. When the pixel values are not exactly
    0..n-1, conversion functions between pixel values and class indices
    (positions in the sorted list) are available.
    """
    def __init__(self):
        super().__init__()
        self._classes = []
        # Sorted pixel values of all classes when they differ from 0..n-1,
        # otherwise empty (no conversion needed).
        self._conversions = []

    def __iter__(self):
        return iter(self._classes)

    def __len__(self):
        return len(self._classes)

    def _load_dict(self, d : dict, base_dir):
        """
        Replace the class list with the one described by d.

        d is either an int (the number of classes, with pixel values
        0..d-1) or a list. Each list entry is a bare int pixel value or a
        single-key dict {pixel value: {name, color, weight}}. The previous
        class list is discarded, not merged. A falsy d (None, 0, empty list)
        leaves the current classes untouched. base_dir is unused here.

        Raises ValueError if d is neither an int nor a list.
        """
        if not d:
            return
        self._config_dict = d
        self._classes = []
        if isinstance(d, int):
            self._classes.extend(LabelClass(i) for i in range(d))
        elif isinstance(d, list):
            for c in d:
                if isinstance(c, int): # just pixel value
                    # Use the listed value itself, not its position in the list.
                    self._classes.append(LabelClass(c))
                    continue
                keys = c.keys()
                assert len(keys) == 1, 'Dict should have name of pixel value.'
                k = next(iter(keys))
                assert isinstance(k, int), 'Class label value must be int.'
                # An entry such as "- 3:" with no body parses to None.
                inner_dict = c[k] or {}
                name = inner_dict.get('name')
                # Keep a missing name as None so LabelClass applies its
                # default instead of naming the class the string 'None'.
                self._classes.append(LabelClass(k, None if name is None else str(name),
                                                inner_dict.get('color'), inner_dict.get('weight')))
        else:
            raise ValueError('Expected classes to be an int or list in config, was ' + str(d))
        # Sort so the class order (and hence each class index) is consistent
        # for the same set of values regardless of the order in the config.
        self._classes = sorted(self._classes, key=lambda x: x.value)
        values = [c.value for c in self._classes]
        # Record every value as soon as any value differs from its index, so
        # that position i in _conversions always belongs to class index i.
        self._conversions = [] if values == list(range(len(values))) else values

    def weights(self):
        """
        Returns the list of class weights in class index order, or None if
        no class has a weight. Asserts that either all or none are weighted.
        """
        weights = [c.weight for c in self._classes if c.weight is not None]
        if not weights:
            return None
        assert len(weights) == len(self._classes), 'For class weights, either all or none must be specified.'
        return weights

    def classes_to_indices_func(self):
        """
        Returns a function converting a numpy array of pixel values to class
        indices in place (the same array is returned), or None if the pixel
        values already are the indices. Pixels matching no class are left
        unchanged.
        """
        if not self._conversions:
            return None
        def convert(data):
            assert isinstance(data, np.ndarray)
            # Match against a snapshot so a pixel that was just remapped can
            # never be matched again by a later class value.
            original = data.copy()
            for (i, c) in enumerate(self._conversions):
                if c != i:
                    data[original == c] = i
            return data
        return convert

    def indices_to_classes_func(self):
        """
        Returns a function converting a numpy array of class indices back to
        pixel values in place (the same array is returned), or None if the
        pixel values already are the indices. Values that are not a class
        index are left unchanged.
        """
        if not self._conversions:
            return None
        def convert(data):
            assert isinstance(data, np.ndarray)
            # Same snapshot approach as classes_to_indices_func.
            original = data.copy()
            for (i, c) in enumerate(self._conversions):
                if c != i:
                    data[original == i] = c
            return data
        return convert
self._components['labels']) + self._components['labels'], + self._components['classes']) return self.__images def labels(self) -> ImageSet: @@ -243,14 +342,15 @@ def labels(self) -> ImageSet: """ if self.__labels is None: (self.__images, self.__labels) = load_images_labels(self._components['images'], - self._components['labels']) + self._components['labels'], + self._components['classes']) return self.__labels class CacheConfig(DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('dir', str, None, None, validate_path, 'Cache directory.') - self.register_field('limit', int, None, None, validate_positive, 'Number of items to cache.') + self.register_field('dir', str, None, validate_path, 'Cache directory.') + self.register_field('limit', int, None, validate_positive, 'Number of items to cache.') self._cache_manager = None @@ -272,13 +372,20 @@ def manager(self) -> disk_folder_cache.DiskCache: class IOConfig(DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('threads', int, 'threads', '--threads', None, 'Number of threads to use.') - self.register_field('block_size_mb', int, 'block_size_mb', '--block-size-mb', validate_positive, + self.register_field('threads', int, 'threads', None, 'Number of threads to use.') + self.register_field('block_size_mb', int, 'block_size_mb', validate_positive, 'Size of an image block to load in memory at once.') - self.register_field('interleave_images', int, 'interleave_images', None, validate_positive, + self.register_field('interleave_images', int, 'interleave_images', validate_positive, 'Number of images to interleave at a time when training.') - self.register_field('tile_ratio', float, 'tile_ratio', '--tile-ratio', validate_positive, + self.register_field('tile_ratio', float, 'tile_ratio', validate_positive, 'Width to height ratio of blocks to load in images.') + self.register_field('resume_cutoff', int, 'resume_cutoff', None, + 'When resuming a dataset, skip images 
where we have read this many tiles.') + + self.register_arg('threads', '--threads') + self.register_arg('block_size_mb', '--block-size-mb') + self.register_arg('tile_ratio', '--tile-ratio') + self.register_component(CacheConfig(), 'cache') def register(): diff --git a/delta/imagery/imagery_dataset.py b/delta/imagery/imagery_dataset.py index 607decd0..00bae651 100644 --- a/delta/imagery/imagery_dataset.py +++ b/delta/imagery/imagery_dataset.py @@ -22,7 +22,9 @@ import math import random import sys - +import os +import portalocker +import numpy as np import tensorflow as tf from delta.config import config @@ -33,19 +35,25 @@ class ImageryDataset: """Create dataset with all files as described in the provided config file. """ - def __init__(self, images, labels, chunk_size, output_size, chunk_stride=1): + def __init__(self, images, labels, chunk_size, output_size, chunk_stride=1, + resume_mode=False, log_folder=None): """ Initialize the dataset based on the specified image and label ImageSets """ + self._resume_mode = resume_mode + self._log_folder = log_folder + if self._log_folder and not os.path.exists(self._log_folder): + os.mkdir(self._log_folder) + # Record some of the config values assert (chunk_size % 2) == (output_size % 2), 'Chunk size and output size must both be either even or odd.' - self._chunk_size = chunk_size - self._output_size = output_size - self._output_dims = 1 + self._chunk_size = chunk_size + self._output_size = output_size + self._output_dims = 1 self._chunk_stride = chunk_stride - self._data_type = tf.float32 - self._label_type = tf.uint8 + self._data_type = tf.float32 + self._label_type = tf.uint8 if labels: assert len(images) == len(labels) @@ -55,13 +63,52 @@ def __init__(self, images, labels, chunk_size, output_size, chunk_stride=1): # Load the first image to get the number of bands for the input files. 
self._num_bands = loader.load_image(images, 0).num_bands() + def _get_image_read_log_path(self, image_path): + """Return the path to the read log for an input image""" + if not self._log_folder: + return None + image_name = os.path.basename(image_path) + file_name = os.path.splitext(image_name)[0] + '_read.log' + log_path = os.path.join(self._log_folder, file_name) + return log_path + + def _get_image_read_count(self, image_path): + """Return the number of ROIs we have read from an image""" + log_path = self._get_image_read_log_path(image_path) + if (not log_path) or not os.path.exists(log_path): + return 0 + counter = 0 + with portalocker.Lock(log_path, 'r', timeout=300) as f: + for line in f: #pylint: disable=W0612 + counter += 1 + return counter + def _load_tensor_imagery(self, is_labels, image_index, bbox): """Loads a single image as a tensor.""" - image = loader.load_image(self._labels if is_labels else self._images, image_index.numpy()) - w = int(bbox[2]) - h = int(bbox[3]) - rect = rectangle.Rectangle(int(bbox[0]), int(bbox[1]), w, h) - r = image.read(rect) + data = self._labels if is_labels else self._images + + if not is_labels: # Record each time we write a tile + file_path = data[image_index.numpy()] + log_path = self._get_image_read_log_path(file_path) + if log_path: + with portalocker.Lock(log_path, 'a', timeout=300) as f: + f.write(str(bbox) + '\n') + # TODO: What to write and when to clear it? 
+ + try: + image = loader.load_image(data, image_index.numpy()) + w = int(bbox[2]) + h = int(bbox[3]) + rect = rectangle.Rectangle(int(bbox[0]), int(bbox[1]), w, h) + r = image.read(rect) + except Exception as e: #pylint: disable=W0703 + print('Caught exception loading tile from image: ' + data[image_index.numpy()] + ' -> ' + str(e) + + '\nSkipping tile: ' + str(bbox)) + if config.general.stop_on_input_error(): + print('Aborting processing, set --bypass-input-errors to bypass this error.') + raise + # Else just skip this tile + r = np.zeros(shape=(0,0,0), dtype=np.float32) return r def _tile_images(self): @@ -69,22 +116,47 @@ def _tile_images(self): def tile_generator(): tgs = [] for i in range(len(self._images)): - img = loader.load_image(self._images, i) - # w * h * bands * 4 * chunk * chunk = max_block_bytes - tile_width = int(math.sqrt(max_block_bytes / img.num_bands() / self._data_type.size / - config.io.tile_ratio())) - tile_height = int(config.io.tile_ratio() * tile_width) - min_block_size = self._chunk_size ** 2 * config.io.tile_ratio() * img.num_bands() * 4 - if max_block_bytes < min_block_size: - print('Warning: max_block_bytes=%g MB, but %g MB is recommended (minimum: %g MB)' % ( \ - max_block_bytes / 1024 / 1024, min_block_size * 2 / 1024 / 1024, min_block_size / 1024/ 1024), - file=sys.stderr) - if tile_width < self._chunk_size or tile_height < self._chunk_size: - raise ValueError('max_block_bytes is too low.') - tiles = img.tiles(tile_width, tile_height, min_width=self._chunk_size, min_height=self._chunk_size, - overlap=self._chunk_size - 1) + + if self._resume_mode: + # TODO: Improve feature to work with multiple epochs + # Skip images which we have already read some number of tiles from + if self._get_image_read_count(self._images[i]) > config.io.resume_cutoff(): + continue + + try: + img = loader.load_image(self._images, i) + + if self._labels: # If we have labels make sure they are the same size as the input images + label = 
loader.load_image(self._labels, i) + if label.size() != img.size(): + raise Exception('Label file ' + self._labels[i] + ' with size ' + str(label.size()) + + ' does not match input image size of ' + str(img.size())) + # w * h * bands * 4 * chunk * chunk = max_block_bytes + tile_width = int(math.sqrt(max_block_bytes / img.num_bands() / self._data_type.size / + config.io.tile_ratio())) + tile_height = int(config.io.tile_ratio() * tile_width) + min_block_size = self._chunk_size ** 2 * config.io.tile_ratio() * img.num_bands() * 4 + if max_block_bytes < min_block_size: + print('Warning: max_block_bytes=%g MB, but %g MB is recommended (minimum: %g MB)' + % (max_block_bytes / 1024 / 1024, + min_block_size * 2 / 1024 / 1024, min_block_size / 1024/ 1024), + file=sys.stderr) + if tile_width < self._chunk_size or tile_height < self._chunk_size: + raise ValueError('max_block_bytes is too low.') + tiles = img.tiles(tile_width, tile_height, min_width=self._chunk_size, min_height=self._chunk_size, + overlap=self._chunk_size - 1) + except Exception as e: #pylint: disable=W0703 + print('Caught exception tiling image: ' + self._images[i] + ' -> ' + str(e) + + '\nWill not load any tiles from this image') + if config.general.stop_on_input_error(): + print('Aborting processing, set --bypass-input-errors to bypass this error.') + raise + tiles = [] # Else move past this image without loading any tiles + random.Random(0).shuffle(tiles) # gives consistent random ordering so labels will match tgs.append((i, tiles)) + if not tgs: + return while tgs: cur = tgs[:config.io.interleave_images()] tgs = tgs[config.io.interleave_images():] @@ -114,15 +186,18 @@ def load_tile(image_index, x1, y1, x2, y2): is_labels), [image_index, [x1, y1, x2, y2]], data_type) return img - ret = ds_input.map(load_tile, num_parallel_calls=config.io.threads()) + ret = ds_input.map(load_tile, num_parallel_calls=tf.data.experimental.AUTOTUNE)#config.io.threads()) - return ret.prefetch(tf.data.experimental.AUTOTUNE) + # 
Don't let the entire session be taken down by one bad dataset input. + # - Would be better to handle this somehow but it is not clear if TF supports that. +# ret = ret.apply(tf.data.experimental.ignore_errors()) + + return ret def _chunk_image(self, image): """Split up a tensor image into tensor chunks""" - ksizes = [1, self._chunk_size, self._chunk_size, 1] # Size of the chunks - strides = [1, self._chunk_stride, self._chunk_stride, 1] # SPacing between chunk starts + strides = [1, self._chunk_stride, self._chunk_stride, 1] # Spacing between chunk starts rates = [1, 1, 1, 1] result = tf.image.extract_patches(tf.expand_dims(image, 0), ksizes, strides, rates, padding='VALID') @@ -134,7 +209,8 @@ def _chunk_image(self, image): def _reshape_labels(self, labels): """Reshape the labels to account for the chunking process.""" w = (self._chunk_size - self._output_size) // 2 - labels = tf.image.crop_to_bounding_box(labels, w, w, tf.shape(labels)[0] - 2 * w, tf.shape(labels)[1] - 2 * w) + labels = tf.image.crop_to_bounding_box(labels, w, w, tf.shape(labels)[0] - 2 * w, + tf.shape(labels)[1] - 2 * w) #pylint: disable=C0330 ksizes = [1, self._output_size, self._output_size, 1] strides = [1, self._chunk_stride, self._chunk_stride, 1] @@ -148,7 +224,7 @@ def data(self): Unbatched dataset of image chunks. """ ret = self._load_images(False, self._data_type) - ret = ret.map(self._chunk_image, num_parallel_calls=config.io.threads()) + ret = ret.map(self._chunk_image, num_parallel_calls=tf.data.experimental.AUTOTUNE) return ret.unbatch() def labels(self): @@ -156,13 +232,14 @@ def labels(self): Unbatched dataset of labels. 
""" label_set = self._load_images(True, self._label_type) - label_set = label_set.map(self._reshape_labels) - + label_set = label_set.map(self._reshape_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE) #pylint: disable=C0301 return label_set.unbatch() - def dataset(self): + def dataset(self, class_weights=None): """ Return the underlying TensorFlow dataset object that this class creates. + + class_weights: a list of weights to apply to the samples in each class, if specified. """ # Pair the data and labels in our dataset @@ -170,7 +247,9 @@ def dataset(self): # ignore labels with no data if self._labels.nodata_value(): ds = ds.filter(lambda x, y: tf.math.not_equal(y, self._labels.nodata_value())) - + if class_weights is not None: + lookup = tf.constant(class_weights) + ds = ds.map(lambda x, y: (x, y, tf.gather(lookup, tf.cast(y, tf.int32), axis=None))) return ds def num_bands(self): @@ -203,11 +282,12 @@ def label_set(self): class AutoencoderDataset(ImageryDataset): """Slightly modified dataset class for the Autoencoder which does not use separate label files""" - def __init__(self, images, chunk_size, chunk_stride=1): + def __init__(self, images, chunk_size, chunk_stride=1, resume_mode=False, log_folder=None): """ The images are used as labels as well. """ - super(AutoencoderDataset, self).__init__(images, None, chunk_size, chunk_size, chunk_stride=chunk_stride) + super(AutoencoderDataset, self).__init__(images, None, chunk_size, chunk_size, chunk_stride=chunk_stride, + resume_mode=resume_mode, log_folder=log_folder) self._labels = self._images self._output_dims = self.num_bands() diff --git a/delta/imagery/sources/delta_image.py b/delta/imagery/sources/delta_image.py index 4e877812..fb984196 100644 --- a/delta/imagery/sources/delta_image.py +++ b/delta/imagery/sources/delta_image.py @@ -127,7 +127,6 @@ def roi_generator(self, requested_rois: Iterator[rectangle.Rectangle]) -> Iterat # gdal doesn't work reading multithreading. 
But this let's a thread # take care of IO input while we do computation. - exe = concurrent.futures.ThreadPoolExecutor(1) jobs = [] total_rois = len(block_rois) @@ -148,18 +147,25 @@ def roi_generator(self, requested_rois: Iterator[rectangle.Rectangle]) -> Iterat continue applicable_rois.append(block_rois.pop(index)) - buf = exe.submit(functools.partial(self.read, read_roi)) - jobs.append((buf, read_roi, applicable_rois)) + jobs.append((read_roi, applicable_rois)) + # only do a few reads ahead since otherwise we will exhaust our memory + pending = [] + exe = concurrent.futures.ThreadPoolExecutor(1) + NUM_AHEAD = 2 + for i in range(min(NUM_AHEAD, len(jobs))): + pending.append(exe.submit(functools.partial(self.read, jobs[i][0]))) num_remaining = total_rois - for (buf_exe, read_roi, rois) in jobs: - buf = buf_exe.result() + for (i, (read_roi, rois)) in enumerate(jobs): + buf = pending.pop(0).result() for roi in rois: x0 = roi.min_x - read_roi.min_x y0 = roi.min_y - read_roi.min_y num_remaining -= 1 yield (roi, buf[x0:x0 + roi.width(), y0:y0 + roi.height(), :], (total_rois - num_remaining, total_rois)) + if i + NUM_AHEAD < len(jobs): + pending.append(exe.submit(functools.partial(self.read, jobs[i + NUM_AHEAD][0]))) def process_rois(self, requested_rois: Iterator[rectangle.Rectangle], callback_function: Callable[[rectangle.Rectangle, np.ndarray], None], diff --git a/delta/imagery/sources/tiff.py b/delta/imagery/sources/tiff.py index f8db711e..3b868370 100644 --- a/delta/imagery/sources/tiff.py +++ b/delta/imagery/sources/tiff.py @@ -260,13 +260,17 @@ def _prep(self, paths): os.system(cmd) return [output_path] -def numpy_dtype_to_gdal_type(dtype): +def numpy_dtype_to_gdal_type(dtype): #pylint: disable=R0911 if dtype == np.uint8: return gdal.GDT_Byte if dtype == np.uint16: return gdal.GDT_UInt16 if dtype == np.uint32: return gdal.GDT_UInt32 + if dtype == np.int16: + return gdal.GDT_Int16 + if dtype == np.int32: + return gdal.GDT_Int32 if dtype == np.float32: return 
gdal.GDT_Float32 if dtype == np.float64: @@ -413,7 +417,7 @@ def initialize(self, size, numpy_dtype, metadata=None): """ Prepare for writing with the given size and dtype. """ - assert len(size) == 3 + assert (len(size) == 3), ('Error: len(size) of '+str(size)+' != 3') TILE_SIZE = 256 self._tiff_w = TiffWriter(self._filename, size[0], size[1], num_bands=size[2], data_type=numpy_dtype_to_gdal_type(numpy_dtype), metadata=metadata, diff --git a/delta/imagery/sources/worldview.py b/delta/imagery/sources/worldview.py index 279af9a8..fb1b0923 100644 --- a/delta/imagery/sources/worldview.py +++ b/delta/imagery/sources/worldview.py @@ -20,9 +20,14 @@ """ import math +import zipfile import functools import os +import sys import numpy as np +import portalocker + +import tensorflow as tf from delta.config import config from delta.imagery import utilities @@ -64,19 +69,29 @@ def __init__(self, paths): def _unpack(self, paths): # Get the folder where this will be stored from the cache manager - name = '_'.join([self._sensor, self._date]) - unpack_folder = config.io.cache.manager().register_item(name) - - # Check if we already unpacked this data - (tif_path, imd_path) = _get_files_from_unpack_folder(unpack_folder) - - if imd_path and tif_path: - #print('Already have unpacked files in ' + unpack_folder) - pass - else: - print('Unpacking file ' + paths + ' to folder ' + unpack_folder) - utilities.unpack_to_folder(paths, unpack_folder) + unpack_folder = config.io.cache.manager().register_item(self._name) + + with portalocker.Lock(paths, 'r', timeout=300) as unused: #pylint: disable=W0612 + # Check if we already unpacked this data (tif_path, imd_path) = _get_files_from_unpack_folder(unpack_folder) + + if imd_path and tif_path: + #tf.print('Already have unpacked files in ' + unpack_folder, + # output_stream=sys.stdout) + pass + else: + tf.print('Unpacking file ' + paths + ' to folder ' + unpack_folder, + output_stream=sys.stdout) + utilities.unpack_to_folder(paths, unpack_folder) + # 
some worldview zip files have a subdirectory with the name of the image + if not os.path.exists(os.path.join(unpack_folder, 'vendor_metadata')): + subdir = os.path.join(unpack_folder, os.path.splitext(os.path.basename(paths))[0]) + if not os.path.exists(os.path.join(subdir, 'vendor_metadata')): + raise Exception('vendor_metadata not found in %s.' % (paths)) + for filename in os.listdir(subdir): + os.rename(os.path.join(subdir, filename), os.path.join(unpack_folder, filename)) + os.rmdir(subdir) + (tif_path, imd_path) = _get_files_from_unpack_folder(unpack_folder) return (tif_path, imd_path) # This function is currently set up for the HDDS archived WV data, files from other @@ -87,9 +102,20 @@ def _prep(self, paths): TODO: Apply TOA conversion! """ assert isinstance(paths, str) - parts = os.path.basename(paths).split('_') + (_, ext) = os.path.splitext(paths) + assert '.zip' in ext, f'Error: Was assuming a zip file. Found {paths}' + + zip_file = zipfile.ZipFile(paths, 'r') + tif_names = list(filter(lambda x: '.tif' in x, zip_file.namelist())) + assert len(tif_names) > 0, f'Error: no tif files in the file {paths}' + assert len(tif_names) == 1, f'Error: too many tif files in {paths}: {tif_names}' + tif_name = tif_names[0] + + + parts = os.path.basename(tif_name).split('_') self._sensor = parts[0][0:4] - self._date = parts[2][6:14] + self._date = parts[2][6:14] + self._name = os.path.splitext(os.path.basename(tif_name))[0] (tif_path, imd_path) = self._unpack(paths) diff --git a/delta/imagery/utilities.py b/delta/imagery/utilities.py index 35b067d4..b64ca206 100644 --- a/delta/imagery/utilities.py +++ b/delta/imagery/utilities.py @@ -39,11 +39,11 @@ def unpack_to_folder(compressed_path, unpack_folder): else: # Assume a tar file with tarfile.TarFile(compressed_path, 'r') as tf: tf.extractall(tmpdir) - except: - shutil.rmtree(tmpdir) - raise - # make this atomic so we don't have incomplete data - os.rename(tmpdir, unpack_folder) + except Exception as e: + 
shutil.rmtree(tmpdir) # Clear any partially unpacked results + raise RuntimeError('Caught exception unpacking compressed file: ' + compressed_path + + '\n' + str(e)) + os.rename(tmpdir, unpack_folder) # Clean up def progress_bar(text, fill_amount, prefix = '', length = 80): #pylint: disable=W0613 """ diff --git a/delta/ml/io.py b/delta/ml/io.py new file mode 100644 index 00000000..12e4733f --- /dev/null +++ b/delta/ml/io.py @@ -0,0 +1,32 @@ +# Copyright © 2020, United States Government, as represented by the +# Administrator of the National Aeronautics and Space Administration. +# All rights reserved. +# +# The DELTA (Deep Earth Learning, Tools, and Analysis) platform is +# licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0. +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functions for IO specific to ML. +""" + +import h5py + +from delta.config import config + +def save_model(model, filename): + """ + Save a model. Includes DELTA configuration. + """ + model.save(filename, save_format='h5') + with h5py.File(filename, 'r+') as f: + f.attrs['delta'] = config.export() diff --git a/delta/ml/ml_config.py b/delta/ml/ml_config.py index 537ea4b0..10ea3bcf 100644 --- a/delta/ml/ml_config.py +++ b/delta/ml/ml_config.py @@ -18,6 +18,8 @@ """ Configuration options specific to machine learning. 
""" +# Please do not put any tensorflow imports in this file as it will greatly slow loading +# when tensorflow isn't needed import os.path import appdirs @@ -27,6 +29,35 @@ from delta.imagery.imagery_config import ImageSet, ImageSetConfig, load_images_labels import delta.config as config +def loss_function_factory(loss_spec): + ''' + loss_function_factory - Creates a loss function object, if an object is specified in the + config file, or a string if that is all that is specified. + + :param: loss_spec Specification of the loss function. Either a string that is compatible + with the keras interface (e.g. 'categorical_crossentropy') or an object defined by a dict + of the form {'LossFunctionName': {'arg1':arg1_val, ...,'argN',argN_val}} + ''' + import tensorflow.keras.losses # pylint: disable=import-outside-toplevel + + if isinstance(loss_spec, str): + return loss_spec + + if isinstance(loss_spec, list): + assert len(loss_spec) == 1, 'Too many loss functions specified' + assert isinstance(loss_spec[0], dict), '''Loss functions objects and parameters must + be specified as a yaml dictionary object + ''' + assert len(loss_spec[0].keys()) == 1, f'Too many loss functions specified: {dict.keys()}' + loss_type = list(loss_spec[0].keys())[0] + loss_fn_args = loss_spec[0][loss_type] + + loss_class = getattr(tensorflow.keras.losses, loss_type, None) + return loss_class(**loss_fn_args) + + raise RuntimeError(f'Did not recognize the loss function specification: {loss_spec}') + + class ValidationSet:#pylint:disable=too-few-public-methods """ Specifies the images and labels in a validation set. 
@@ -63,10 +94,10 @@ def __init__(self, batch_size, epochs, loss_function, metrics, validation=None, class NetworkModelConfig(config.DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('yaml_file', str, 'yaml_file', None, config.validate_path, + self.register_field('yaml_file', str, 'yaml_file', config.validate_path, 'A YAML file describing the network to train.') - self.register_field('params', dict, None, None, None, None) - self.register_field('layers', list, None, None, None, None) + self.register_field('params', dict, None, None, None) + self.register_field('layers', list, None, None, None) # overwrite model entirely if updated (don't want combined layers from multiple files) def _load_dict(self, d : dict, base_dir): @@ -75,34 +106,28 @@ def _load_dict(self, d : dict, base_dir): self._config_dict['layers'] = None elif 'layers' in d: self._config_dict['yaml_file'] = None - - def as_dict(self) -> dict: - """ - Returns a dictionary representing the network model for use by `delta.ml.model_parser`. 
- """ - yaml_file = self._config_dict['yaml_file'] - if yaml_file is not None: - if self._config_dict['layers'] is not None: - raise ValueError('Specified both yaml file and layers in model.') - + if 'yaml_file' in d and 'layers' in d and d['yaml_file'] is not None and d['layers'] is not None: + raise ValueError('Specified both yaml file and layers in model.') + if 'yaml_file' in d and d['yaml_file'] is not None: + yaml_file = d['yaml_file'] resource = os.path.join('config', yaml_file) if not os.path.exists(yaml_file) and pkg_resources.resource_exists('delta', resource): yaml_file = pkg_resources.resource_filename('delta', resource) if not os.path.exists(yaml_file): raise ValueError('Model yaml_file does not exist: ' + yaml_file) with open(yaml_file, 'r') as f: - return yaml.safe_load(f) - return self._config_dict + self._config_dict.update(yaml.safe_load(f)) class NetworkConfig(config.DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('chunk_size', int, 'chunk_size', '--chunk-size', config.validate_positive, + self.register_field('chunk_size', int, 'chunk_size', config.validate_positive, 'Width of an image chunk to input to the neural network.') - self.register_field('output_size', int, 'output_size', '--output-size', config.validate_positive, + self.register_field('output_size', int, 'output_size', config.validate_positive, 'Width of an image chunk to output from the neural network.') - self.register_field('classes', int, 'classes', '--classes', config.validate_positive, - 'Number of label classes.') + + self.register_arg('chunk_size', '--chunk-size') + self.register_arg('output_size', '--output-size') self.register_component(NetworkModelConfig(), 'model') def setup_arg_parser(self, parser, components = None) -> None: @@ -112,12 +137,12 @@ def setup_arg_parser(self, parser, components = None) -> None: class ValidationConfig(config.DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('steps', int, 
'steps', None, config.validate_positive, + self.register_field('steps', int, 'steps', config.validate_positive, 'If from training, validate for this many steps.') - self.register_field('from_training', bool, 'from_training', None, None, + self.register_field('from_training', bool, 'from_training', None, 'Take validation data from training data.') - self.register_component(ImageSetConfig(), 'images') - self.register_component(ImageSetConfig(), 'labels') + self.register_component(ImageSetConfig(), 'images', '__image_comp') + self.register_component(ImageSetConfig(), 'labels', '__label_comp') self.__images = None self.__labels = None @@ -132,7 +157,8 @@ def images(self) -> ImageSet: """ if self.__images is None: (self.__images, self.__labels) = load_images_labels(self._components['images'], - self._components['labels']) + self._components['labels'], + config.config.dataset.classes) return self.__images def labels(self) -> ImageSet: @@ -141,22 +167,28 @@ def labels(self) -> ImageSet: """ if self.__labels is None: (self.__images, self.__labels) = load_images_labels(self._components['images'], - self._components['labels']) + self._components['labels'], + config.config.dataset.classes) return self.__labels class TrainingConfig(config.DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('chunk_stride', int, None, '--chunk-stride', config.validate_positive, + self.register_field('chunk_stride', int, None, config.validate_positive, 'Pixels to skip when iterating over chunks. 
A value of 1 means to take every chunk.') - self.register_field('epochs', int, None, '--epochs', config.validate_positive, + self.register_field('epochs', int, None, config.validate_positive, 'Number of times to repeat training on the dataset.') - self.register_field('batch_size', int, None, '--batch-size', config.validate_positive, + self.register_field('batch_size', int, None, config.validate_positive, 'Features to group into each training batch.') - self.register_field('loss_function', str, None, None, None, 'Keras loss function.') - self.register_field('metrics', list, None, None, None, 'List of metrics to apply.') - self.register_field('steps', int, None, '--steps', config.validate_positive, 'Batches to train per epoch.') - self.register_field('optimizer', str, None, None, None, 'Keras optimizer to use.') + self.register_field('loss_function', (str, list), None, None, 'Keras loss function.') + self.register_field('metrics', list, None, None, 'List of metrics to apply.') + self.register_field('steps', int, None, config.validate_positive, 'Batches to train per epoch.') + self.register_field('optimizer', str, None, None, 'Keras optimizer to use.') + + self.register_arg('chunk_stride', '--chunk-stride') + self.register_arg('epochs', '--epochs') + self.register_arg('batch_size', '--batch-size') + self.register_arg('steps', '--steps') self.register_component(ValidationConfig(), 'validation') self.register_component(NetworkConfig(), 'network') self.__training = None @@ -176,9 +208,10 @@ def spec(self) -> TrainingSpec: if not from_training: (vimg, vlabels) = (self._components['validation'].images(), self._components['validation'].labels()) validation = ValidationSet(vimg, vlabels, from_training, vsteps) + loss_fn = loss_function_factory(self._config_dict['loss_function']) self.__training = TrainingSpec(batch_size=self._config_dict['batch_size'], epochs=self._config_dict['epochs'], - loss_function=self._config_dict['loss_function'], + loss_function=loss_fn, 
metrics=self._config_dict['metrics'], validation=validation, steps=self._config_dict['steps'], @@ -190,19 +223,22 @@ def spec(self) -> TrainingSpec: class MLFlowCheckpointsConfig(config.DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('frequency', int, 'frequency', None, None, + self.register_field('frequency', int, 'frequency', None, 'Frequency in batches to store neural network checkpoints.') - self.register_field('save_latest', bool, 'save_latest', None, None, + self.register_field('only_save_latest', bool, 'only_save_latest', None, 'If true, only keep the most recent checkpoint.') class MLFlowConfig(config.DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('enabled', bool, 'enabled', None, None, 'Enable MLFlow.') - self.register_field('uri', str, None, None, None, 'URI to store MLFlow data.') - self.register_field('frequency', int, 'frequency', None, config.validate_positive, + self.register_field('enabled', bool, 'enabled', None, 'Enable MLFlow.') + self.register_field('uri', str, None, None, 'URI to store MLFlow data.') + self.register_field('frequency', int, 'frequency', config.validate_positive, 'Frequency to store metrics.') - self.register_field('experiment_name', str, 'experiment', None, None, 'Experiment name in MLFlow.') + self.register_field('experiment_name', str, 'experiment', None, 'Experiment name in MLFlow.') + + self.register_arg('enabled', '--disable-mlflow', action='store_const', const=False, type=None) + self.register_arg('enabled', '--enable-mlflow', action='store_const', const=True, type=None) self.register_component(MLFlowCheckpointsConfig(), 'checkpoints') def uri(self) -> str: @@ -217,8 +253,8 @@ def uri(self) -> str: class TensorboardConfig(config.DeltaConfigComponent): def __init__(self): super().__init__() - self.register_field('enabled', bool, 'enabled', None, None, 'Enable Tensorboard.') - self.register_field('dir', str, None, None, None, 'Directory to store 
Tensorboard data.') + self.register_field('enabled', bool, 'enabled', None, 'Enable Tensorboard.') + self.register_field('dir', str, None, None, 'Directory to store Tensorboard data.') def dir(self) -> str: """ @@ -237,7 +273,15 @@ def register(): """ if not hasattr(config.config, 'general'): config.config.register_component(config.DeltaConfigComponent('General'), 'general') - config.config.general.register_field('gpus', int, 'gpus', '--gpus', None, 'Number of gpus to use.') + + config.config.general.register_field('gpus', int, 'gpus', None, 'Number of gpus to use.') + config.config.general.register_arg('gpus', '--gpus') + config.config.general.register_field('stop_on_input_error', bool, 'stop_on_input_error', None, + 'If false, skip past bad input images.') + config.config.general.register_arg('stop_on_input_error', '--bypass-input-errors', + action='store_const', const=False, type=None) + config.config.general.register_arg('stop_on_input_error', '--stop-on-input-error', + action='store_const', const=True, type=None) config.config.register_component(TrainingConfig(), 'train') config.config.register_component(MLFlowConfig(), 'mlflow') diff --git a/delta/ml/model_parser.py b/delta/ml/model_parser.py index 0d6fc078..3ba37ff3 100644 --- a/delta/ml/model_parser.py +++ b/delta/ml/model_parser.py @@ -146,7 +146,7 @@ def config_model(num_bands: int) -> Callable[[], tensorflow.keras.models.Sequent """ in_data_shape = (config.train.network.chunk_size(), config.train.network.chunk_size(), num_bands) out_data_shape = (config.train.network.output_size(), config.train.network.output_size(), - config.train.network.classes()) + len(config.dataset.classes)) params_exposed = {'out_shape' : out_data_shape, 'out_dims' : out_data_shape[0] * out_data_shape[1] * out_data_shape[2], @@ -154,4 +154,4 @@ def config_model(num_bands: int) -> Callable[[], tensorflow.keras.models.Sequent 'in_dims' : in_data_shape[0] * in_data_shape[1] * in_data_shape[2], 'num_bands' : in_data_shape[2]} - return 
model_from_dict(config.train.network.model.as_dict(), params_exposed) + return model_from_dict(config.train.network.model.to_dict(), params_exposed) diff --git a/delta/ml/predict.py b/delta/ml/predict.py index 574adac0..6c3e752e 100644 --- a/delta/ml/predict.py +++ b/delta/ml/predict.py @@ -64,15 +64,15 @@ def _process_block(self, pred_image, x, y, labels): """ def _predict_array(self, data): - net_input_shape = self._model.get_input_shape_at(0)[1:] - net_output_shape = self._model.get_output_shape_at(0)[1:] + net_input_shape = self._model.input_shape[1:] + net_output_shape = self._model.output_shape[1:] assert net_input_shape[2] == data.shape[2],\ 'Model expects %d input channels, data has %d channels' % (net_input_shape[2], data.shape[2]) out_shape = (data.shape[0] - net_input_shape[0] + net_output_shape[0], data.shape[1] - net_input_shape[1] + net_output_shape[1]) - out_type = self._model.get_output_at(0).dtype + out_type = tf.dtypes.as_dtype(self._model.dtype) image = tf.convert_to_tensor(data) image = tf.expand_dims(image, 0) chunks = tf.image.extract_patches(image, [1, net_input_shape[0], net_input_shape[1], 1], @@ -101,8 +101,8 @@ def predict(self, image, label=None, input_bounds=None): Results are limited to `input_bounds`. Returns output, the meaning of which depends on the subclass. """ - net_input_shape = self._model.get_input_shape_at(0)[1:] - net_output_shape = self._model.get_output_shape_at(0)[1:] + net_input_shape = self._model.input_shape[1:] + net_output_shape = self._model.output_shape[1:] offset_r = -net_input_shape[0] + net_output_shape[0] offset_c = -net_input_shape[1] + net_output_shape[1] block_size_x = net_input_shape[0] * (_TILE_SIZE // net_input_shape[0]) @@ -141,11 +141,12 @@ def callback_function(roi, data): raise return self._complete() + class LabelPredictor(Predictor): """ Predicts integer labels for an image. 
""" - def __init__(self, model, output_image=None, show_progress=False, + def __init__(self, model, output_image=None, show_progress=False, nodata_value=None, # pylint:disable=too-many-arguments colormap=None, prob_image=None, error_image=None, error_colors=None): """ output_image, prob_image, and error_image are all DeltaImageWriter's. @@ -155,6 +156,16 @@ def __init__(self, model, output_image=None, show_progress=False, self._confusion_matrix = None self._num_classes = None self._output_image = output_image + if colormap is not None: + # convert python list to numpy array + if not isinstance(colormap, np.ndarray): + a = np.zeros(shape=(len(colormap), 3), dtype=np.uint8) + for (i, v) in enumerate(colormap): + a[i][0] = (v >> 16) & 0xFF + a[i][1] = (v >> 8) & 0xFF + a[i][2] = v & 0xFF + colormap = a + self._nodata_value = nodata_value self._colormap = colormap self._prob_image = prob_image self._error_image = error_image @@ -166,7 +177,7 @@ def __init__(self, model, output_image=None, show_progress=False, self._errors = None def _initialize(self, shape, label, image): - net_output_shape = self._model.get_output_shape_at(0)[1:] + net_output_shape = self._model.output_shape[1:] self._num_classes = net_output_shape[-1] if label: self._errors = np.zeros(shape, dtype=np.bool) @@ -179,7 +190,7 @@ def _initialize(self, shape, label, image): self._output_image.initialize((shape[0], shape[1], self._colormap.shape[1]), self._colormap.dtype, image.metadata()) else: - self._output_image.initialize((shape[0], shape[1]), np.int32, image.metadata()) + self._output_image.initialize((shape[0], shape[1], 1), np.int32, image.metadata()) if self._prob_image: self._prob_image.initialize((shape[0], shape[1], self._num_classes), np.float32, image.metadata()) if self._error_image: @@ -215,7 +226,13 @@ def _process_block(self, pred_image, x, y, labels): self._output_image.write(pred_image, x, y) if labels is not None: - self._error_image.write(self._error_colors[(labels != 
pred_image).astype(int)], x, y) + eimg = self._error_colors[(labels != pred_image).astype(int)] + if self._nodata_value is not None: + valid = (labels != self._nodata_value) + eimg[np.logical_not(valid)] = np.zeros(eimg.shape[-1:], dtype=eimg.dtype) + labels = labels[valid] + pred_image = pred_image[valid] + self._error_image.write(eimg, x, y) cm = tf.math.confusion_matrix(np.ndarray.flatten(labels), np.ndarray.flatten(pred_image), self._num_classes) @@ -244,7 +261,7 @@ def __init__(self, model, output_image=None, show_progress=False, transform=None self._transform = transform def _initialize(self, shape, label, image): - net_output_shape = self._model.get_output_shape_at(0)[1:] + net_output_shape = self._model.output_shape[1:] if self._output_image is not None: dtype = np.float32 if self._transform is None else self._transform[1] bands = net_output_shape[-1] if self._transform is None else self._transform[2] diff --git a/delta/ml/train.py b/delta/ml/train.py index 43973380..6e76843b 100644 --- a/delta/ml/train.py +++ b/delta/ml/train.py @@ -28,7 +28,9 @@ from delta.config import config from delta.imagery.imagery_dataset import ImageryDataset +from delta.imagery.imagery_dataset import AutoencoderDataset from .layers import DeltaLayer +from .io import save_model def _devices(num_gpus): ''' @@ -59,8 +61,10 @@ def _strategy(devices): return strategy def _prep_datasets(ids, tc, chunk_size, output_size): - ds = ids.dataset() + ds = ids.dataset(config.dataset.classes.weights()) ds = ds.batch(tc.batch_size) + #ds = ds.cache() + ds = ds.prefetch(tf.data.experimental.AUTOTUNE) if tc.validation: if tc.validation.from_training: validation = ds.take(tc.validation.steps) @@ -68,15 +72,24 @@ def _prep_datasets(ids, tc, chunk_size, output_size): else: vimg = tc.validation.images vlabel = tc.validation.labels - if not vimg or not vlabel: + if not vimg: validation = None else: - vimagery = ImageryDataset(vimg, vlabel, chunk_size, output_size, tc.chunk_stride) - validation = 
vimagery.dataset().batch(tc.batch_size).take(tc.validation.steps) + if vlabel: + vimagery = ImageryDataset(vimg, vlabel, chunk_size, output_size, tc.chunk_stride, + resume_mode=False) + else: + vimagery = AutoencoderDataset(vimg, chunk_size, tc.chunk_stride, resume_mode=False) + validation = vimagery.dataset().batch(tc.batch_size) + if tc.validation.steps: + validation = validation.take(tc.validation.steps) + #validation = validation.prefetch(4)#tf.data.experimental.AUTOTUNE) else: + validation = None if tc.steps: ds = ds.take(tc.steps) + #ds = ds.prefetch(4)#tf.data.experimental.AUTOTUNE) ds = ds.repeat(tc.epochs) return (ds, validation) @@ -95,7 +108,7 @@ def _log_mlflow_params(model, dataset, training_spec): mlflow.log_param('Batch Size', training_spec.batch_size) mlflow.log_param('Optimizer', training_spec.optimizer) mlflow.log_param('Model Layers', len(model.layers)) - #mlflow.log_param('Status', 'Running') + #mlflow.log_param('Status', 'Running') Illegal to change the value! class _MLFlowCallback(tf.keras.callbacks.Callback): """ @@ -116,11 +129,11 @@ def on_train_batch_end(self, batch, logs=None): for k in logs.keys(): if k in ('batch', 'size'): continue - mlflow.log_metric(k, logs[k].item(), step=batch) + mlflow.log_metric(k, logs[k], step=batch) if config.mlflow.checkpoints.frequency() and batch % config.mlflow.checkpoints.frequency() == 0: filename = os.path.join(self.temp_dir, '%d.h5' % (batch)) - self.model.save(filename, save_format='h5') - if config.mlflow.checkpoints.save_latest(): + save_model(self.model, filename) + if config.mlflow.checkpoints.only_save_latest(): old = filename filename = os.path.join(self.temp_dir, 'latest.h5') os.rename(old, filename) @@ -165,8 +178,8 @@ def train(model_fn, dataset : ImageryDataset, training_spec): model.compile(optimizer=training_spec.optimizer, loss=loss, metrics=training_spec.metrics) - input_shape = model.get_input_at(0).shape - output_shape = model.get_output_at(0).shape + input_shape = model.input_shape + 
output_shape = model.output_shape chunk_size = input_shape[1] assert len(input_shape) == 4, 'Input to network is wrong shape.' @@ -181,7 +194,7 @@ def train(model_fn, dataset : ImageryDataset, training_spec): (ds, validation) = _prep_datasets(dataset, training_spec, chunk_size, output_shape[1]) - callbacks = [] + callbacks = [tf.keras.callbacks.TerminateOnNaN()] # add callbacks from DeltaLayers for l in model.layers: if isinstance(l, DeltaLayer): @@ -199,6 +212,7 @@ def train(model_fn, dataset : ImageryDataset, training_spec): if config.mlflow.enabled(): mcb = _mlflow_train_setup(model, dataset, training_spec) callbacks.append(mcb) + #print('Using mlflow folder: ' + mlflow.get_artifact_uri()) try: history = model.fit(ds, @@ -206,11 +220,13 @@ def train(model_fn, dataset : ImageryDataset, training_spec): callbacks=callbacks, validation_data=validation, validation_steps=training_spec.validation.steps if training_spec.validation else None, - steps_per_epoch=training_spec.steps) + steps_per_epoch=training_spec.steps, + verbose=1) + if config.mlflow.enabled(): model_path = os.path.join(mcb.temp_dir, 'final_model.h5') print('\nFinished, saving model to %s.' % (mlflow.get_artifact_uri() + '/final_model.h5')) - model.save(model_path, save_format='h5') + save_model(model, model_path) mlflow.log_artifact(model_path) os.remove(model_path) mlflow.log_param('Status', 'Completed') @@ -220,7 +236,7 @@ def train(model_fn, dataset : ImageryDataset, training_spec): mlflow.end_run('FAILED') model_path = os.path.join(mcb.temp_dir, 'aborted_model.h5') print('\nAborting, saving current model to %s.' 
% (mlflow.get_artifact_uri() + '/aborted_model.h5')) - model.save(model_path, save_format='h5') + save_model(model, model_path) mlflow.log_artifact(model_path) os.remove(model_path) raise diff --git a/delta/subcommands/classify.py b/delta/subcommands/classify.py index 497d4995..45ec4dc2 100644 --- a/delta/subcommands/classify.py +++ b/delta/subcommands/classify.py @@ -20,6 +20,7 @@ """ import os.path +import time import numpy as np import matplotlib.pyplot as plt import tensorflow as tf @@ -32,16 +33,18 @@ import delta.imagery.imagery_config import delta.ml.ml_config -def save_confusion(cm, filename): +def save_confusion(cm, class_labels, filename): f = plt.figure() ax = f.add_subplot(1, 1, 1) image = ax.imshow(cm, interpolation='nearest', cmap=plt.get_cmap('inferno')) ax.set_title('Confusion Matrix') f.colorbar(image) + ax.set_xlim(-0.5, cm.shape[0] - 0.5) + ax.set_ylim(-0.5, cm.shape[0] - 0.5) ax.set_xticks(range(cm.shape[0])) ax.set_yticks(range(cm.shape[0])) - ax.set_xlim(-0.5, cm.shape[0]-0.5) - ax.set_ylim(-0.5, cm.shape[0]-0.5) + ax.set_xticklabels(class_labels) + ax.set_yticklabels(class_labels) m = cm.max() total = cm.sum() @@ -50,23 +53,30 @@ def save_confusion(cm, filename): ax.text(j, i, '%d\n%.2g%%' % (cm[i, j], cm[i, j] / total * 100), horizontalalignment='center', color='white' if cm[i, j] < m / 2 else 'black') ax.set_ylabel('True Label') - ax.set_xlabel('Predicated Label') + ax.set_xlabel('Predicted Label') f.savefig(filename) def ae_convert(data): return (data[:, :, [4, 2, 1]] * 256.0).astype(np.uint8) def main(options): - model = tf.keras.models.load_model(options.model, custom_objects=delta.ml.layers.ALL_LAYERS) - colors = np.array([[0x0, 0x0, 0x0], - [0x67, 0xa9, 0xcf], - [0xf6, 0xef, 0xf7], - [0xbd, 0xc9, 0xe1], - [0x02, 0x81, 0x8a]], dtype=np.uint8) + # TODO: Share the way this is done with in ml/train.py + cpuOnly = (config.general.gpus()==0) + + if cpuOnly: + with tf.device('/cpu:0'): + model = tf.keras.models.load_model(options.model, 
custom_objects=delta.ml.layers.ALL_LAYERS) + else: + model = tf.keras.models.load_model(options.model, custom_objects=delta.ml.layers.ALL_LAYERS) + + colors = list(map(lambda x: x.color, config.dataset.classes)) error_colors = np.array([[0x0, 0x0, 0x0], [0xFF, 0x00, 0x00]], dtype=np.uint8) + if options.noColormap: + colors=None # Forces raw one channel output + start_time = time.time() images = config.dataset.images() labels = config.dataset.labels() @@ -86,15 +96,21 @@ def main(options): label = None if labels: label = loader.load_image(config.dataset.labels(), i) + if options.autoencoder: label = image predictor = predict.ImagePredictor(model, output_image, True, (ae_convert, np.uint8, 3)) else: - predictor = predict.LabelPredictor(model, output_image, True, colormap=colors, prob_image=prob_image, - error_image=error_image, error_colors=error_colors) + predictor = predict.LabelPredictor(model, output_image, True, labels.nodata_value(), colormap=colors, + prob_image=prob_image, error_image=error_image, + error_colors=error_colors) try: - predictor.predict(image, label) + if cpuOnly: + with tf.device('/cpu:0'): + predictor.predict(image, label) + else: + predictor.predict(image, label) except KeyboardInterrupt: print('\nAborted.') return 0 @@ -102,9 +118,11 @@ def main(options): if labels: cm = predictor.confusion_matrix() print('%.2g%% Correct: %s' % (np.sum(np.diag(cm)) / np.sum(cm) * 100, path)) - save_confusion(cm, 'confusion_' + base_name + '.pdf') + save_confusion(cm, map(lambda x: x.name, config.dataset.classes), 'confusion_' + base_name + '.pdf') if options.autoencoder: tiff.write_tiff('orig_' + base_name + '.tiff', ae_convert(image.read()), metadata=image.metadata()) + stop_time = time.time() + print('Elapsed time = ', stop_time - start_time) return 0 diff --git a/delta/subcommands/commands.py b/delta/subcommands/commands.py index 6401e6a7..5e56b825 100644 --- a/delta/subcommands/commands.py +++ b/delta/subcommands/commands.py @@ -18,15 +18,10 @@ """ Lists 
all avaiable commands. """ -import delta.imagery.imagery_config -import delta.ml.ml_config from delta.config import config #pylint:disable=import-outside-toplevel -delta.imagery.imagery_config.register() -delta.ml.ml_config.register() - # we put this here because tensorflow takes so long to load, we don't do it unless we have to def main_classify(options): from . import classify @@ -46,6 +41,8 @@ def setup_classify(subparsers): sub.add_argument('--prob', dest='prob', action='store_true', help='Save image of class probabilities.') sub.add_argument('--autoencoder', dest='autoencoder', action='store_true', help='Classify with the autoencoder.') + sub.add_argument('--no-colormap', dest='noColormap', action='store_true', + help='Save raw classification values instead of colormapped values.') sub.add_argument('model', help='File to save the network to.') sub.set_defaults(function=main_classify) diff --git a/delta/subcommands/main.py b/delta/subcommands/main.py new file mode 100644 index 00000000..15bca996 --- /dev/null +++ b/delta/subcommands/main.py @@ -0,0 +1,44 @@ +# Copyright © 2020, United States Government, as represented by the +# Administrator of the National Aeronautics and Space Administration. +# All rights reserved. +# +# The DELTA (Deep Earth Learning, Tools, and Analysis) platform is +# licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0. +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import argparse + +from delta.config import config +import delta.config.modules +from delta.subcommands import commands + +def main(args): + delta.config.modules.register_all() + parser = argparse.ArgumentParser(description='DELTA Machine Learning Toolkit') + subparsers = parser.add_subparsers() + + for d in commands.SETUP_COMMANDS: + d(subparsers) + + try: + options = parser.parse_args(args[1:]) + except argparse.ArgumentError: + parser.print_help(sys.stderr) + sys.exit(1) + + if not hasattr(options, 'function'): + parser.print_help(sys.stderr) + sys.exit(1) + + config.initialize(options) + return options.function(options) diff --git a/delta/subcommands/train.py b/delta/subcommands/train.py index 8116b51e..85b0ee31 100644 --- a/delta/subcommands/train.py +++ b/delta/subcommands/train.py @@ -20,6 +20,11 @@ """ import sys +import time +import os + +#import logging +#logging.getLogger("tensorflow").setLevel(logging.DEBUG) import tensorflow as tf @@ -28,22 +33,39 @@ from delta.ml.train import train from delta.ml.model_parser import config_model from delta.ml.layers import ALL_LAYERS +from delta.ml.io import save_model + +#tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG) def main(options): + + log_folder = config.dataset.log_folder() + if log_folder: + if not options.resume: # Start fresh and clear the read logs + os.system('rm ' + log_folder + '/*') + print('Dataset progress recording in: ' + log_folder) + else: + print('Resuming dataset progress recorded in: ' + log_folder) + + start_time = time.time() images = config.dataset.images() if not images: print('No images specified.', file=sys.stderr) return 1 tc = config.train.spec() if options.autoencoder: - ids = imagery_dataset.AutoencoderDataset(images, config.train.network.chunk_size(), tc.chunk_stride) + ids = imagery_dataset.AutoencoderDataset(images, config.train.network.chunk_size(), + tc.chunk_stride, resume_mode=options.resume, + log_folder=log_folder) else: labels = 
config.dataset.labels() if not labels: print('No labels specified.', file=sys.stderr) return 1 ids = imagery_dataset.ImageryDataset(images, labels, config.train.network.chunk_size(), - config.train.network.output_size(), tc.chunk_stride) + config.train.network.output_size(), tc.chunk_stride, + resume_mode=options.resume, + log_folder=log_folder) try: if options.resume is not None: @@ -53,9 +75,11 @@ def main(options): model, _ = train(model, ids, tc) if options.model is not None: - model.save(options.model) + save_model(model, options.model) except KeyboardInterrupt: print() print('Training cancelled.') + stop_time = time.time() + print('Elapsed time = ', stop_time-start_time) return 0 diff --git a/scripts/docs.sh b/scripts/docs.sh index 0af389d9..c8b68735 100755 --- a/scripts/docs.sh +++ b/scripts/docs.sh @@ -2,5 +2,6 @@ SCRIPT=$(readlink -f "$0") SCRIPTPATH=$(dirname "$SCRIPT") +OUT_DIR=$(readlink -m ${1:-./html}) cd $SCRIPTPATH/.. -pdoc3 --html -c show_type_annotations=True delta --force +pdoc3 --html -c show_type_annotations=True delta --force -o $OUT_DIR diff --git a/scripts/fetch/convert_image_list.py b/scripts/fetch/convert_image_list.py new file mode 100755 index 00000000..9f5ba888 --- /dev/null +++ b/scripts/fetch/convert_image_list.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +# Copyright © 2020, United States Government, as represented by the +# Administrator of the National Aeronautics and Space Administration. +# All rights reserved. +# +# The DELTA (Deep Earth Learning, Tools, and Analysis) platform is +# licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0. +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#pylint: disable=R0915,R0914,R0912 +""" +Script to extract a list of associated image files from the label file csv list. +""" +import sys + + +def main(argsIn): #pylint: disable=R0914,R0912 + + if len(argsIn) != 2: + print("usage: convert_image_list.py ") + + input_path = argsIn[0] + output_path = argsIn[1] + + # Just find the image name for every line with a label ID (integer) + output_list = [] + with open(input_path, 'r') as f: + for line in f: + parts = line.split(',') + try: + label_num = int(parts[0]) #pylint: disable=W0612 + image_name = parts[1] + output_list.append(image_name) + #print('%s -> %s' % (label_num, image_name)) + # Header lines etc will throw exceptions trying to cast the integer + except: #pylint: disable=W0702 + pass + + # Write out a text file with all of the image names. + with open(output_path, 'w') as f: + for line in output_list: + f.write(line+'\n') + print('Wrote out ' + str(len(output_list)) + ' items.') + + return 0 + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/fetch/fetch_hdds_images.py b/scripts/fetch/fetch_hdds_images.py index 7093a3c3..112e813d 100755 --- a/scripts/fetch/fetch_hdds_images.py +++ b/scripts/fetch/fetch_hdds_images.py @@ -53,14 +53,14 @@ def get_dataset_list(options): # Each event is a dataset, start by fetching the list of all HDDS datasets. print('Submitting HDDS dataset query...') results = api.datasets("", CATALOG) - + print(results) if not results['data']: raise Exception('Did not find any HDDS data!') print('Found ' + str(len(results['data'])) + ' matching datasets.') # Go through all the datasets and identify the events we are interested in. TARGET_TYPES = ['flood', 'hurricane', 'cyclone', 'tsunami', 'dam_collapse', 'storm'] - SKIP = ['test', 'snowstorm', 'adhoc', 'ad hoc', 'ad_hoc'] # TODO: What is ad hoc here? 
+ SKIP = ['test', 'icestorm', 'snowstorm', 'adhoc', 'ad hoc', 'ad_hoc'] # TODO: What is ad hoc here? handle = open(dataset_cache_path, 'w') @@ -140,7 +140,7 @@ def main(argsIn): #pylint: disable=R0914,R0912 try: - usage = "usage: get_landsat_dswe_labels [options]" + usage = "usage: fetch_hdds_images.py [options]" parser = argparse.ArgumentParser(usage=usage) parser.add_argument("--output-folder", dest="output_folder", required=True, @@ -163,6 +163,12 @@ def main(argsIn): #pylint: disable=R0914,R0912 dest="refetch_scenes", default=False, help="Force refetches of scene lists for each dataset.") + parser.add_argument("--image-list-path", dest="image_list_path", default=None, + help="Path to text file containing list of image IDs to download, one per line.") + + parser.add_argument("--event-name", dest="event_name", default=None, + help="Only download images from this event.") + options = parser.parse_args(argsIn) except argparse.ArgumentError: @@ -172,6 +178,12 @@ def main(argsIn): #pylint: disable=R0914,R0912 if options.output_folder and not os.path.exists(options.output_folder): os.mkdir(options.output_folder) + images_to_use = [] + if options.image_list_path: + with open(options.image_list_path, 'r') as f: + for line in f: + images_to_use.append(line.strip()) + # Only log in if our session expired (ugly function use to check!) 
if options.force_login or (not api._get_api_key(None)): #pylint: disable=W0212 print('Logging in to USGS EarthExplorer...') @@ -197,6 +209,10 @@ def main(argsIn): #pylint: disable=R0914,R0912 #if counter == 1: # continue + if options.event_name: # Only download images from the specified event + if options.event_name.lower() not in full_name.lower(): + continue + dataset_folder = os.path.join(options.output_folder, full_name) scene_list_path = os.path.join(dataset_folder, 'scene_list.dat') done_flag_path = os.path.join(dataset_folder, 'done.flag') @@ -209,12 +225,35 @@ def main(argsIn): #pylint: disable=R0914,R0912 print('--> Search scenes for: ' + full_name) + BATCH_SIZE = 10000 if not os.path.exists(scene_list_path) or options.refetch_scenes: # Request the scene list from USGS #details = {'Agency - Platform - Vendor':'WORLDVIEW', 'Sensor Type':'MS'} #details = {'sensor_type':'MS'} details = {} # TODO: How do these work?? - results = api.search(dataset, CATALOG, where=details, max_results=5000, extended=False) + + # Large sets of results require multiple queries in order to get all of the data + done = False + error = False + all_scenes = [] # Acculumate all scene data here + while not done: + print('Searching with start offset = ' + str(len(all_scenes))) + results = api.search(dataset, CATALOG, where=details, + max_results=BATCH_SIZE, + starting_number=len(all_scenes), extended=False) + + if 'results' not in results['data']: + print('ERROR: Failed to get any results for dataset: ' + full_name) + error = True + break + if len(results['data']['results']) < BATCH_SIZE: + done = True + all_scenes += results['data']['results'] + + if error: + continue + + results['data']['results'] = all_scenes # Cache the results to disk with open(scene_list_path, 'wb') as f: @@ -224,17 +263,12 @@ def main(argsIn): #pylint: disable=R0914,R0912 with open(scene_list_path, 'rb') as f: results = pickle.load(f) - if 'results' not in results['data']: - print('ERROR: Failed to get any 
results for dataset: ' + full_name) - continue print('Got ' + str(len(results['data']['results'])) + ' scene results.') for scene in results['data']['results']: - #print(scene) - fail = False - REQUIRED_PARTS = ['displayId', 'summary'] + REQUIRED_PARTS = ['displayId', 'summary', 'entityId', 'displayId'] for p in REQUIRED_PARTS: if (p not in scene) or (not scene[p]): print('scene object is missing element: ' + p) @@ -243,6 +277,9 @@ def main(argsIn): #pylint: disable=R0914,R0912 if fail: continue + # If image list was provided skip other image names + if images_to_use and (scene['displayId'] not in images_to_use): + continue # Figure out the downloaded file path for this image file_name = scene['displayId'] + '.zip' @@ -269,6 +306,7 @@ def main(argsIn): #pylint: disable=R0914,R0912 print('Undesired sensor: ' + scene['summary']) continue + # Investigate the number of bands PLATFORM_BAND_COUNTS = {'worldview':8, 'TODO':1} min_num_bands = PLATFORM_BAND_COUNTS[platform] @@ -287,7 +325,7 @@ def main(argsIn): #pylint: disable=R0914,R0912 if not num_bands: raise KeyError() # Treat like the except case if num_bands < min_num_bands: - print('Skipping, too few bands: ' + str(num_bands)) + print('Skipping %s, too few bands: %d' % (scene['displayId'], num_bands)) continue except KeyError: print('Unable to perform metadata check!') @@ -311,6 +349,7 @@ def main(argsIn): #pylint: disable=R0914,R0912 if not ready: raise Exception('Missing download option for scene: ' + str(types)) + # Get the download URL of the file we want. r = api.download(dataset, CATALOG, [scene['entityId']], product=download_type) diff --git a/scripts/fetch/random_folder_split.py b/scripts/fetch/random_folder_split.py index e6bb3f87..d050663b 100644 --- a/scripts/fetch/random_folder_split.py +++ b/scripts/fetch/random_folder_split.py @@ -17,6 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+#pylint: disable=R0914 + """ Given folders of input image/label files, create a new pair of train/validate folders which contain symlinks to random non-overlapping subsets of the input files. @@ -48,7 +50,7 @@ def main(argsIn): parser.add_argument("--image-folder", dest="image_folder", required=True, help="Folder containing the input image files.") - parser.add_argument("--label-folder", dest="label_folder", required=True, + parser.add_argument("--label-folder", dest="label_folder", default=None, help="Folder containing the input label files.") parser.add_argument("--output-folder", dest="output_folder", required=True, @@ -62,6 +64,12 @@ def main(argsIn): parser.add_argument("--label-ext", dest="label_extension", default='.tif', help="Extension for label files.") + parser.add_argument("--image-limit", dest="image_limit", default=None, type=int, + help="Only use this many image files total.") + + parser.add_argument("--file-list-path", dest="file_list_path", default=None, + help="Path to text file containing list of image file names to use, one per line.") + parser.add_argument("--config-file", dest="config_path", default=None, help="Make a copy of this config file with paths changed. The config " + "file must be fully set up, as only the directory entries will be updated.") @@ -86,38 +94,57 @@ def main(argsIn): os.mkdir(out_train_folder) os.mkdir(out_valid_folder) os.mkdir(train_image_folder) - os.mkdir(train_label_folder) os.mkdir(valid_image_folder) - os.mkdir(valid_label_folder) - - input_image_list = os.listdir(options.image_folder) + if options.label_folder: + os.mkdir(train_label_folder) + os.mkdir(valid_label_folder) + + # Recursively find image files, obtaining the full path for each file. 
+ input_image_list = [os.path.join(root, name) + for root, dirs, files in os.walk(options.image_folder) + for name in files + if name.endswith((options.image_extension))] + + images_to_use = [] + if options.file_list_path: + with open(options.file_list_path, 'r') as f: + for line in f: + images_to_use.append(line.strip()) train_count = 0 valid_count = 0 - for f in input_image_list: - # Skip other files - ext = os.path.splitext(f)[1] - if ext != options.image_extension: - continue + for image_path in input_image_list: - # Get file names - image_path = os.path.join(options.image_folder, f) - label_path = get_label_path(f, options) - label_name = os.path.basename(label_path) + # If an image list was provided skip images which are not in the list. + image_name = os.path.basename(image_path) + if images_to_use and (os.path.splitext(image_name)[0] not in images_to_use): + continue - # Decide where to make the symlinks, train or label + # Use for validation or for training? use_for_valid = (random.random() < options.validate_fraction) + + # Handle the image file if use_for_valid: - image_dest = os.path.join(valid_image_folder, f) - label_dest = os.path.join(valid_label_folder, label_name) + image_dest = os.path.join(valid_image_folder, image_name) valid_count += 1 else: - image_dest = os.path.join(train_image_folder, f) - label_dest = os.path.join(train_label_folder, label_name) + image_dest = os.path.join(train_image_folder, image_name) train_count += 1 - os.symlink(image_path, image_dest) - os.symlink(label_path, label_dest) + + if options.label_folder: # Handle the label file + label_path = get_label_path(image_name, options) + label_name = os.path.basename(label_path) + if use_for_valid: + label_dest = os.path.join(valid_label_folder, label_name) + else: + label_dest = os.path.join(train_label_folder, label_name) + os.symlink(label_path, label_dest) + + # Check the image limit if it was specified + total_count = valid_count + train_count + if options.image_limit and 
(total_count >= options.image_limit): + break # Copy config file if provided if options.config_path: @@ -129,13 +156,16 @@ def main(argsIn): config_data = yaml.load(f, Loader=yaml.FullLoader) config_data['images']['directory'] = train_image_folder - config_data['labels']['directory'] = train_label_folder config_data['train']['validation']['images']['directory'] = valid_image_folder - config_data['train']['validation']['labels']['directory'] = valid_label_folder + + if options.label_folder: + config_data['labels']['directory'] = train_label_folder + config_data['train']['validation']['labels']['directory'] = valid_label_folder with open(config_out_path, 'w') as f: yaml.dump(config_data, f) print('Wrote config file: ' + config_out_path) + except Exception as e: #pylint: disable=broad-except print('Failed to copy config file!') print(str(e)) diff --git a/scripts/label-img-info b/scripts/label-img-info new file mode 100755 index 00000000..5bc326d3 --- /dev/null +++ b/scripts/label-img-info @@ -0,0 +1,19 @@ +#!/usr/bin/env python + +import sys +import pathlib +import numpy as np +from osgeo import gdal + +if __name__=='__main__': + assert len(sys.argv) > 1, 'Need to supply a file' + filename = pathlib.Path(sys.argv[1]) + + tif_file = gdal.Open(str(filename)) + assert tif_file is not None, f'Could not open file {filename}' + tif_data = tif_file.ReadAsArray() + unique_labels = np.unique(tif_data) + print(np.any(np.isnan(tif_data)), tif_data.min(), tif_data.max(), tif_data.shape, unique_labels) + print(np.histogram(tif_data, bins=len(unique_labels))) + + diff --git a/scripts/model2config b/scripts/model2config new file mode 100755 index 00000000..3b74504c --- /dev/null +++ b/scripts/model2config @@ -0,0 +1,32 @@ +#!/usr/bin/env python + +import tensorflow as tf +from argparse import ArgumentParser +import h5py +import pathlib + +parser = ArgumentParser(description='Converts a neural network in a *.h5 file to the DELTA configuration langauge') 
+parser.add_argument('model_name', type=pathlib.Path, help='The model to convert') + +args = parser.parse_args() + +print('Configuration File') +with h5py.File(args.model_name, 'r') as f: + if 'delta' not in f.attrs: + print(' - Not Available\n') + else: + print('\n' + f.attrs['delta'] + '\n') + +a = tf.keras.models.load_model(args.model_name) +print('Network Structure') +for l in a.layers: + print('\t- ', type(l).__name__) + configs = l.get_config() + if isinstance(l.input, list): + print('\t\t- input: ['+ ', '.join([x.name.replace('/Identity:0','') for x in l.input])+ ']') + else: + print('\t\t- input:', l.input.name.replace('/Identity:0','')) + for k in configs.keys(): + if isinstance(configs[k], dict) or configs[k] is None: + continue + print(f'\t\t- {k}: {configs[k]}') diff --git a/setup.py b/setup.py index 1a9a62ca..36ba9160 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,8 @@ 'mlflow', 'portalocker', 'appdirs', - 'gdal' + 'gdal', + 'h5py' ], scripts=scripts, include_package_data = True, diff --git a/tests/conftest.py b/tests/conftest.py index 30362e9f..d7bffbf0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + # Copyright © 2020, United States Government, as represented by the # Administrator of the National Aeronautics and Space Administration. # All rights reserved. @@ -19,6 +21,7 @@ import os import random import shutil +import sys import tempfile import zipfile @@ -27,12 +30,10 @@ from delta.imagery.sources import tiff -import delta.imagery.imagery_config -import delta.ml.ml_config +import delta.config.modules +delta.config.modules.register_all() -# initialize config files -delta.imagery.imagery_config.register() -delta.ml.ml_config.register() +assert 'tensorflow' not in sys.modules, 'For speed of command line tool, tensorflow should not be imported by config!' 
def generate_tile(width=32, height=32, blocks=50): """Generate a widthXheightX3 image, with blocks pixels surrounded by ones and the rest zeros in band 0""" diff --git a/tests/test_config.py b/tests/test_config.py index cdfe8fa3..5586e471 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -21,13 +21,12 @@ import pytest import yaml +import numpy as np import tensorflow as tf from delta.config import config from delta.ml import model_parser -#pylint: disable=import-outside-toplevel - def test_general(): config.reset() @@ -94,6 +93,48 @@ def test_images_files(): assert len(im) == 1 assert im[0] == file_path +def test_classes(): + config.reset() + test_str = ''' + dataset: + classes: 2 + ''' + config.load(yaml_str=test_str) + assert len(config.dataset.classes) == 2 + for (i, c) in enumerate(config.dataset.classes): + assert c.value == i + assert config.dataset.classes.weights() is None + config.reset() + test_str = ''' + dataset: + classes: + - 2: + name: 2 + color: 2 + weight: 5.0 + - 1: + name: 1 + color: 1 + weight: 1.0 + - 5: + name: 5 + color: 5 + weight: 2.0 + ''' + config.load(yaml_str=test_str) + assert config.dataset.classes + values = [1, 2, 5] + for (i, c) in enumerate(config.dataset.classes): + e = values[i] + assert c.value == e + assert c.name == str(e) + assert c.color == e + assert config.dataset.classes.weights() == [1.0, 5.0, 2.0] + arr = np.array(values) + ind = config.dataset.classes.classes_to_indices_func()(arr) + assert np.max(ind) == 2 + assert (config.dataset.classes.indices_to_classes_func()(ind) == values).all() + def test_model_from_dict(): config.reset() test_str = ''' @@ -171,20 +212,20 @@ def test_pretrained_layer(): def test_network_file(): config.reset() test_str = ''' + dataset: + classes: 3 train: network: chunk_size: 5 - classes: 3 model: yaml_file: networks/convpool.yaml ''' config.load(yaml_str=test_str) assert config.train.network.chunk_size() == 5 - assert config.train.network.classes() == 3 model = 
model_parser.config_model(2)() assert model.input_shape == (None, config.train.network.chunk_size(), config.train.network.chunk_size(), 2) assert model.output_shape == (None, config.train.network.output_size(), - config.train.network.output_size(), config.train.network.classes()) + config.train.network.output_size(), len(config.dataset.classes)) def test_validate(): config.reset() @@ -207,11 +248,12 @@ def test_validate(): def test_network_inline(): config.reset() test_str = ''' + dataset: + classes: 3 train: network: chunk_size: 5 output_size: 1 - classes: 3 model: params: v1 : 10 @@ -227,10 +269,10 @@ def test_network_inline(): ''' config.load(yaml_str=test_str) assert config.train.network.chunk_size() == 5 - assert config.train.network.classes() == 3 + assert len(config.dataset.classes) == 3 model = model_parser.config_model(2)() assert model.input_shape == (None, config.train.network.chunk_size(), config.train.network.chunk_size(), 2) - assert model.output_shape == (None, config.train.network.classes()) + assert model.output_shape == (None, len(config.dataset.classes)) def test_train(): config.reset() @@ -309,7 +351,6 @@ def test_argparser(): assert config.train.network.chunk_size() == 5 im = config.dataset.images() - print(im.preprocess()) assert im.preprocess() is not None assert im.type() == 'tiff' assert len(im) == 1 diff --git a/tests/test_imagery_dataset.py b/tests/test_imagery_dataset.py index b194414c..fece4a32 100644 --- a/tests/test_imagery_dataset.py +++ b/tests/test_imagery_dataset.py @@ -127,14 +127,15 @@ def model_fn(): model, _ = train.train(model_fn, dataset, TrainingSpec(100, 5, 'sparse_categorical_crossentropy', ['accuracy'])) ret = model.evaluate(x=dataset.dataset().batch(1000)) - assert ret[1] > 0.90 + assert ret[1] > 0.70 (test_image, test_label) = conftest.generate_tile() test_label = test_label[1:-1, 1:-1] output_image = npy.NumpyImageWriter() predictor = predict.LabelPredictor(model, output_image=output_image) 
predictor.predict(npy.NumpyImage(test_image)) - assert sum(sum(np.logical_xor(output_image.buffer(), test_label))) < 200 # very easy test since we don't train much + # very easy test since we don't train much + assert sum(sum(np.logical_xor(output_image.buffer()[:,:,0], test_label))) < 200 @pytest.fixture(scope="function") def autoencoder(all_sources):