From 1de4368b5b83b894079d3f5f9ec7b8d9b541a45d Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Mon, 21 Oct 2019 11:52:26 -0700 Subject: [PATCH 01/26] Added CsvExampleGen component --- components/tfx/CsvExampleGen.component.yaml | 138 ++++++++++++ components/tfx/CsvExampleGen.py | 89 ++++++++ .../tfx/CsvExampleGen_GCS.component.yaml | 173 ++++++++++++++ components/tfx/CsvExampleGen_GCS.py | 92 ++++++++ components/tfx/TFX_pipeline.ipynb | 211 ++++++++++++++++++ 5 files changed, 703 insertions(+) create mode 100644 components/tfx/CsvExampleGen.component.yaml create mode 100644 components/tfx/CsvExampleGen.py create mode 100644 components/tfx/CsvExampleGen_GCS.component.yaml create mode 100644 components/tfx/CsvExampleGen_GCS.py create mode 100644 components/tfx/TFX_pipeline.ipynb diff --git a/components/tfx/CsvExampleGen.component.yaml b/components/tfx/CsvExampleGen.component.yaml new file mode 100644 index 00000000000..c65808353d1 --- /dev/null +++ b/components/tfx/CsvExampleGen.component.yaml @@ -0,0 +1,138 @@ +description: | + Executes the CsvExampleGen component. + + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with csv files inside (required). + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + ??? input: Forwards compatibility alias for the 'input_base' argument. + ??? instance_name: Optional unique instance name. Necessary if multiple + CsvExampleGen components are declared in the same pipeline. 
+implementation: + container: + args: + - --input-base + - inputPath: input_base + - if: + cond: + isPresent: input_config + then: + - --input-config + - inputValue: input_config + - if: + cond: + isPresent: output_config + then: + - --output-config + - inputValue: output_config + - --output-examples + - outputPath: output_examples + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'tfx==0.14' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet + --no-warn-script-location 'tfx==0.14' --user) && "$0" "$@" + - python3 + - -u + - -c + - "class InputPath:\n '''When creating component from function, InputPath should\ + \ be used as function parameter annotation to tell the system to pass the *data\ + \ file path* to the function instead of passing the actual data.'''\n def\ + \ __init__(self, type=None):\n self.type = type\n\nclass OutputPath:\n\ + \ '''When creating component from function, OutputPath should be used as\ + \ function parameter annotation to tell the system that the function wants to\ + \ output data by writing it into a file with the given path instead of returning\ + \ the data from the function.'''\n def __init__(self, type=None):\n \ + \ self.type = type\n\ndef _make_parent_dirs_and_return_path(file_path: str):\n\ + \ import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n\ + \ return file_path\n\ndef CsvExampleGen(\n # Inputs\n input_base_path:\ + \ InputPath('ExternalPath'),\n #input_base_path: 'ExternalPath', # A Channel\ + \ of 'ExternalPath' type, which includes one artifact whose uri is an external\ + \ directory with csv files inside (required).\n\n # Outputs\n output_examples_path:\ + \ OutputPath('ExamplesPath'),\n #output_examples_path: 'ExamplesPath',\n\n\ + \ # Execution properties\n #input_config_splits: {'List' : {'item_type':\ + \ 'ExampleGen.Input.Split'}},\n input_config: 'ExampleGen.Input' = '{\"splits\"\ + : []}', # JSON-serialized 
example_gen_pb2.Input instance, providing input configuration.\ + \ If unset, the files under input_base will be treated as a single split.\n\ + \ #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}},\n\ + \ output_config: 'ExampleGen.Output' = '{\"splitConfig\": {\"splits\": []}}',\ + \ # JSON-serialized example_gen_pb2.Output instance, providing output configuration.\ + \ If unset, default splits will be 'train' and 'eval' with size 2:1.\n #custom_config:\ + \ 'ExampleGen.CustomConfig' = None,\n):\n \"\"\"Executes the CsvExampleGen\ + \ component.\n\n Args:\n input_base: A Channel of 'ExternalPath' type,\ + \ which includes one artifact\n whose uri is an external directory with\ + \ csv files inside (required).\n input_config: An example_gen_pb2.Input\ + \ instance, providing input\n configuration. If unset, the files under\ + \ input_base will be treated as a\n single split.\n output_config:\ + \ An example_gen_pb2.Output instance, providing output\n configuration.\ + \ If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n\ + \ ??? example_artifacts: Optional channel of 'ExamplesPath' for output\ + \ train and\n eval examples.\n ??? input: Forwards compatibility\ + \ alias for the 'input_base' argument.\n ??? instance_name: Optional unique\ + \ instance name. 
Necessary if multiple\n CsvExampleGen components are\ + \ declared in the same pipeline.\n \"\"\"\n\n import json\n import\ + \ os\n from tfx.components.example_gen import utils\n from tfx.components.example_gen.csv_example_gen.component\ + \ import CsvExampleGen\n from tfx.types import standard_artifacts\n\n \ + \ # Create input dict.\n # input_dict['input_base'] always has a single entry\n\ + \ input_base = standard_artifacts.ExternalArtifact()\n input_base.uri\ + \ = input_base_path\n input_dict = {\n 'input_base': [input_base],\n\ + \ }\n\n # Create output dict.\n input_config_dict = json.loads(input_config)\n\ + \ output_config_dict = json.loads(output_config)\n split_names = utils.generate_output_split_names(input_config_dict,\ + \ output_config_dict)\n output_dict_examples = []\n for split_name in\ + \ split_names:\n output_split_examples = standard_artifacts.Examples(split=split_name)\n\ + \ output_split_examples.uri = os.path.join(output_examples_path, split_name)\n\ + \ output_dict_examples.append(output_split_examples)\n output_dict\ + \ = {\n 'examples': output_dict_examples,\n }\n\n # Create exec\ + \ proterties.\n exec_properties = {\n 'input_config': input_config,\n\ + \ 'output_config': output_config\n }\n\n executor = CsvExampleGen.EXECUTOR_SPEC.executor_class()\n\ + \ executor.Do(\n input_dict=input_dict,\n output_dict=output_dict,\n\ + \ exec_properties=exec_properties,\n )\n\nimport argparse\n_parser\ + \ = argparse.ArgumentParser(prog='Csvexamplegen', description=\"Executes the\ + \ CsvExampleGen component.\\n\\n Args:\\n input_base: A Channel of 'ExternalPath'\ + \ type, which includes one artifact\\n whose uri is an external directory\ + \ with csv files inside (required).\\n input_config: An example_gen_pb2.Input\ + \ instance, providing input\\n configuration. 
If unset, the files under\ + \ input_base will be treated as a\\n single split.\\n output_config:\ + \ An example_gen_pb2.Output instance, providing output\\n configuration.\ + \ If unset, default splits will be 'train' and 'eval' with\\n size 2:1.\\\ + n ??? example_artifacts: Optional channel of 'ExamplesPath' for output\ + \ train and\\n eval examples.\\n ??? input: Forwards compatibility\ + \ alias for the 'input_base' argument.\\n ??? instance_name: Optional unique\ + \ instance name. Necessary if multiple\\n CsvExampleGen components are\ + \ declared in the same pipeline.\\n\")\n_parser.add_argument(\"--input-base\"\ + , dest=\"input_base_path\", type=str, required=True, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--input-config\", dest=\"input_config\", type=str, required=False,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--output-config\", dest=\"\ + output_config\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --output-examples\", dest=\"output_examples_path\", type=_make_parent_dirs_and_return_path,\ + \ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ + _output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = CsvExampleGen(**_parsed_args)\n\ + \nif not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):\n \ + \ _outputs = [_outputs]\n\n_output_serializers = [\n \n]\n\nimport os\n\ + for idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n\ + \ except OSError:\n pass\n with open(output_file, 'w') as f:\n\ + \ f.write(_output_serializers[idx](_outputs[idx]))\n" + image: tensorflow/tensorflow:1.14.0-py3 +inputs: +- name: input_base + type: ExternalPath +- default: '{"splits": []}' + name: input_config + optional: true + type: ExampleGen.Input +- default: '{"splitConfig": {"splits": []}}' + name: output_config + optional: true + type: ExampleGen.Output +name: Csvexamplegen +outputs: +- name: 
output_examples + type: ExamplesPath diff --git a/components/tfx/CsvExampleGen.py b/components/tfx/CsvExampleGen.py new file mode 100644 index 00000000000..1932ea6f129 --- /dev/null +++ b/components/tfx/CsvExampleGen.py @@ -0,0 +1,89 @@ +from kfp.components import InputPath, OutputPath + +def CsvExampleGen( + # Inputs + input_base_path: InputPath('ExternalPath'), + #input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required). + + # Outputs + output_examples_path: OutputPath('ExamplesPath'), + #output_examples_path: 'ExamplesPath', + + # Execution properties + #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, + input_config: 'ExampleGen.Input' = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. + #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}}, + output_config: 'ExampleGen.Output' = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. + #custom_config: 'ExampleGen.CustomConfig' = None, +): + """Executes the CsvExampleGen component. + + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with csv files inside (required). + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + ??? input: Forwards compatibility alias for the 'input_base' argument. 
+ ??? instance_name: Optional unique instance name. Necessary if multiple + CsvExampleGen components are declared in the same pipeline. + """ + + import json + import os + from google.protobuf import json_format + from tfx.components.example_gen import utils + from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen + from tfx.proto import example_gen_pb2 + from tfx.types import standard_artifacts + + # Create input dict. + # input_dict['input_base'] always has a single entry + input_base = standard_artifacts.ExternalArtifact() + input_base.uri = input_base_path + input_dict = { + 'input_base': [input_base], + } + + # Create output dict. + input_config_obj = example_gen_pb2.Input() + output_config_obj = example_gen_pb2.Output() + json_format.Parse(input_config, input_config_obj) + json_format.Parse(output_config, output_config_obj) + split_names = utils.generate_output_split_names(input_config_obj, output_config_obj) + output_dict_examples = [] + for split_name in split_names: + output_split_examples = standard_artifacts.Examples(split=split_name) + output_split_examples.uri = os.path.join(output_examples_path, split_name) + output_dict_examples.append(output_split_examples) + output_dict = { + 'examples': output_dict_examples, + } + + # Create exec proterties. 
+ exec_properties = { + 'input_config': input_config, + 'output_config': output_config + } + + executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + CsvExampleGen, + base_image='tensorflow/tensorflow:1.14.0-py3', + packages_to_install=['tfx==0.14', 'six>=1.12.0'], + output_component_file='CsvExampleGen.component.yaml' + ) \ No newline at end of file diff --git a/components/tfx/CsvExampleGen_GCS.component.yaml b/components/tfx/CsvExampleGen_GCS.component.yaml new file mode 100644 index 00000000000..8a685423425 --- /dev/null +++ b/components/tfx/CsvExampleGen_GCS.component.yaml @@ -0,0 +1,173 @@ +description: | + Executes the CsvExampleGen component. + + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with csv files inside (required). + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + ??? input: Forwards compatibility alias for the 'input_base' argument. + ??? instance_name: Optional unique instance name. Necessary if multiple + CsvExampleGen components are declared in the same pipeline. 
+implementation: + container: + args: + - --input-base-path + - inputValue: input_base_path + - --output-examples-path + - inputValue: output_examples_path + - if: + cond: + isPresent: input_config + then: + - --input-config + - inputValue: input_config + - if: + cond: + isPresent: output_config + then: + - --output-config + - inputValue: output_config + - '----output-paths' + - outputPath: examples_path + command: + - sh + - -c + - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location + 'tfx==0.14' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet + --no-warn-script-location 'tfx==0.14' --user) && "$0" "$@" + - python3 + - -u + - -c + - | + from typing import NamedTuple + + def CsvExampleGen_GCS( # + # Inputs + #input_base_path: InputPath('ExternalPath'), + input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required). + + # Outputs + #output_examples_path: OutputPath('ExamplesPath'), + output_examples_path: 'ExamplesPath', + + # Execution properties + #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, + input_config: 'ExampleGen.Input' = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. + #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}}, + output_config: 'ExampleGen.Output' = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. + #custom_config: 'ExampleGen.CustomConfig' = None, + ) -> NamedTuple('Outputs', [ + ('examples_path', 'ExamplesPath'), + ]): + """Executes the CsvExampleGen component. 
+ + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with csv files inside (required). + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + ??? input: Forwards compatibility alias for the 'input_base' argument. + ??? instance_name: Optional unique instance name. Necessary if multiple + CsvExampleGen components are declared in the same pipeline. + """ + + import json + import os + from tfx.components.example_gen import utils + from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen + from tfx.types import standard_artifacts + + # Create input dict. + # input_dict['input_base'] always has a single entry + input_base = standard_artifacts.ExternalArtifact() + input_base.uri = input_base_path + input_dict = { + 'input_base': [input_base], + } + + # Create output dict. + input_config_dict = json.loads(input_config) + output_config_dict = json.loads(output_config) + split_names = utils.generate_output_split_names(input_config_dict, output_config_dict) + output_dict_examples = [] + for split_name in split_names: + output_split_examples = standard_artifacts.Examples(split=split_name) + output_split_examples.uri = os.path.join(output_examples_path, split_name) + output_dict_examples.append(output_split_examples) + output_dict = { + 'examples': output_dict_examples, + } + + # Create exec proterties. 
+ exec_properties = { + 'input_config': input_config, + 'output_config': output_config + } + + executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + return (output_examples_path,) + + import argparse + _parser = argparse.ArgumentParser(prog='CsvExampleGen_GCS', description="Executes the CsvExampleGen component.\n\n Args:\n input_base: A Channel of 'ExternalPath' type, which includes one artifact\n whose uri is an external directory with csv files inside (required).\n input_config: An example_gen_pb2.Input instance, providing input\n configuration. If unset, the files under input_base will be treated as a\n single split.\n output_config: An example_gen_pb2.Output instance, providing output\n configuration. If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and\n eval examples.\n ??? input: Forwards compatibility alias for the 'input_base' argument.\n ??? instance_name: Optional unique instance name. 
Necessary if multiple\n CsvExampleGen components are declared in the same pipeline.\n") + _parser.add_argument("--input-base-path", dest="input_base_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--output-examples-path", dest="output_examples_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--input-config", dest="input_config", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--output-config", dest="output_config", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = CsvExampleGen_GCS(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + str + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + image: tensorflow/tensorflow:1.14.0-py3 +inputs: +- name: input_base_path + type: ExternalPath +- name: output_examples_path + type: ExamplesPath +- default: '{"splits": []}' + name: input_config + optional: true + type: ExampleGen.Input +- default: '{"splitConfig": {"splits": []}}' + name: output_config + optional: true + type: ExampleGen.Output +name: CsvExampleGen_GCS +outputs: +- name: examples_path + type: ExamplesPath diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/CsvExampleGen_GCS.py new file mode 100644 index 00000000000..a33495adda0 --- /dev/null +++ b/components/tfx/CsvExampleGen_GCS.py @@ -0,0 +1,92 @@ +from typing import NamedTuple + +def CsvExampleGen_GCS( # + # Inputs + #input_base_path: InputPath('ExternalPath'), + input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, 
which includes one artifact whose uri is an external directory with csv files inside (required). + + # Outputs + #output_examples_path: OutputPath('ExamplesPath'), + output_examples_path: 'ExamplesPath', + + # Execution properties + #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, + input_config: 'ExampleGen.Input' = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. + #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}}, + output_config: 'ExampleGen.Output' = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. + #custom_config: 'ExampleGen.CustomConfig' = None, +) -> NamedTuple('Outputs', [ + ('examples_path', 'ExamplesPath'), +]): + """Executes the CsvExampleGen component. + + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with csv files inside (required). + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + ??? input: Forwards compatibility alias for the 'input_base' argument. + ??? instance_name: Optional unique instance name. Necessary if multiple + CsvExampleGen components are declared in the same pipeline. 
+ """ + + import json + import os + from google.protobuf import json_format + from tfx.components.example_gen import utils + from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen + from tfx.proto import example_gen_pb2 + from tfx.types import standard_artifacts + + # Create input dict. + # input_dict['input_base'] always has a single entry + input_base = standard_artifacts.ExternalArtifact() + input_base.uri = input_base_path + input_dict = { + 'input_base': [input_base], + } + + # Create output dict. + input_config_obj = example_gen_pb2.Input() + output_config_obj = example_gen_pb2.Output() + json_format.Parse(input_config, input_config_obj) + json_format.Parse(output_config, output_config_obj) + split_names = utils.generate_output_split_names(input_config_obj, output_config_obj) + output_dict_examples = [] + for split_name in split_names: + output_split_examples = standard_artifacts.Examples(split=split_name) + output_split_examples.uri = os.path.join(output_examples_path, split_name) + output_dict_examples.append(output_split_examples) + output_dict = { + 'examples': output_dict_examples, + } + + # Create exec proterties. 
+ exec_properties = { + 'input_config': input_config, + 'output_config': output_config + } + + executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + return (output_examples_path,) + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + CsvExampleGen_GCS, + base_image='tensorflow/tensorflow:1.14.0-py3', + packages_to_install=['tfx==0.14', 'six>=1.12.0'], + output_component_file='CsvExampleGen_GCS.component.yaml' + ) \ No newline at end of file diff --git a/components/tfx/TFX_pipeline.ipynb b/components/tfx/TFX_pipeline.ipynb new file mode 100644 index 00000000000..0290c6f6421 --- /dev/null +++ b/components/tfx/TFX_pipeline.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import NamedTuple\n", + "\n", + "def CsvExampleGen_GCS( #\n", + " # Inputs\n", + " #input_base_path: InputPath('ExternalPath'),\n", + " input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required).\n", + "\n", + " # Outputs\n", + " #output_examples_path: OutputPath('ExamplesPath'),\n", + " output_examples_path: 'ExamplesPath',\n", + "\n", + " # Execution properties\n", + " #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}},\n", + " input_config: 'ExampleGen.Input' = '{\"splits\": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split.\n", + " #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}},\n", + " output_config: 'ExampleGen.Output' = '{\"splitConfig\": {\"splits\": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. 
If unset, default splits will be 'train' and 'eval' with size 2:1.\n", + " #custom_config: 'ExampleGen.CustomConfig' = None,\n", + ") -> NamedTuple('Outputs', [\n", + " ('examples_path', 'ExamplesPath'),\n", + "]):\n", + " \"\"\"Executes the CsvExampleGen component.\n", + "\n", + " Args:\n", + " input_base: A Channel of 'ExternalPath' type, which includes one artifact\n", + " whose uri is an external directory with csv files inside (required).\n", + " input_config: An example_gen_pb2.Input instance, providing input\n", + " configuration. If unset, the files under input_base will be treated as a\n", + " single split.\n", + " output_config: An example_gen_pb2.Output instance, providing output\n", + " configuration. If unset, default splits will be 'train' and 'eval' with\n", + " size 2:1.\n", + " ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and\n", + " eval examples.\n", + " ??? input: Forwards compatibility alias for the 'input_base' argument.\n", + " ??? instance_name: Optional unique instance name. 
Necessary if multiple\n", + " CsvExampleGen components are declared in the same pipeline.\n", + " \"\"\"\n", + "\n", + " import json\n", + " import os\n", + " from google.protobuf import json_format\n", + " from tfx.components.example_gen import utils\n", + " from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen\n", + " from tfx.proto import example_gen_pb2\n", + " from tfx.types import standard_artifacts\n", + "\n", + " # Create input dict.\n", + " # input_dict['input_base'] always has a single entry\n", + " input_base = standard_artifacts.ExternalArtifact()\n", + " input_base.uri = input_base_path\n", + " input_dict = {\n", + " 'input_base': [input_base],\n", + " }\n", + "\n", + " # Create output dict.\n", + " input_config_obj = example_gen_pb2.Input()\n", + " output_config_obj = example_gen_pb2.Output()\n", + " json_format.Parse(input_config, input_config_obj)\n", + " json_format.Parse(output_config, output_config_obj)\n", + " split_names = utils.generate_output_split_names(input_config_obj, output_config_obj)\n", + " output_dict_examples = []\n", + " for split_name in split_names:\n", + " output_split_examples = standard_artifacts.Examples(split=split_name)\n", + " output_split_examples.uri = os.path.join(output_examples_path, split_name)\n", + " output_dict_examples.append(output_split_examples)\n", + " output_dict = {\n", + " 'examples': output_dict_examples,\n", + " }\n", + "\n", + " # Create exec proterties.\n", + " exec_properties = {\n", + " 'input_config': input_config,\n", + " 'output_config': output_config\n", + " }\n", + "\n", + " executor = CsvExampleGen.EXECUTOR_SPEC.executor_class()\n", + " executor.Do(\n", + " input_dict=input_dict,\n", + " output_dict=output_dict,\n", + " exec_properties=exec_properties,\n", + " )\n", + "\n", + " return (output_examples_path,)\n", + "\n", + "if __name__ == '__main__':\n", + " import kfp\n", + " kfp.components.func_to_container_op(\n", + " CsvExampleGen_GCS,\n", + " 
base_image='tensorflow/tensorflow:1.14.0-py3',\n", + " packages_to_install=['tfx==0.14'],\n", + " output_component_file='CsvExampleGen_GCS.component.yaml'\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Experiment link here" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run link here" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "RunPipelineResult(run_id=84697c60-f477-11e9-93ae-42010a800216)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#from .CsvExampleGen import CsvExampleGen_GCS\n", + "import kfp\n", + "import json\n", + "\n", + "CsvExampleGen_op = kfp.components.func_to_container_op(\n", + " func=CsvExampleGen_GCS,\n", + " base_image='tensorflow/tensorflow:1.14.0-py3',\n", + " packages_to_install=['tfx==0.14', 'six>=1.12.0'],\n", + " output_component_file='CsvExampleGen_GCS.component.yaml'\n", + ")\n", + "\n", + "output_path_template = 'gs://avolkov/tmp/tfx_pipeline/' + kfp.dsl.EXECUTION_ID_PLACEHOLDER\n", + "def tfx_pipeline():\n", + " CsvExampleGen_op(\n", + " input_base_path='gs://avolkov/tensorflow-tfx/tfx/components/testdata/external',\n", + " output_examples_path=output_path_template,\n", + " input_config=json.dumps({\n", + " \"splits\": [\n", + " {'name': 'data', 'pattern': 'csv/*.csv'},\n", + " ]\n", + " }),\n", + " output_config=json.dumps({\n", + " \"splitConfig\": {\n", + " \"splits\": [\n", + " {'name': 'train', 'hash_buckets': 2},\n", + " {'name': 'eval', 'hash_buckets': 1},\n", + " ]\n", + " }\n", + " }),\n", + " )\n", + "\n", + "from kfp.gcp import use_gcp_secret\n", + "pipeline_conf = kfp.dsl.PipelineConf()\n", + "pipeline_conf.add_op_transformer(use_gcp_secret('user-gcp-sa'))\n", + " \n", + 
"kfp.Client().create_run_from_pipeline_func(tfx_pipeline, arguments={}, pipeline_conf=pipeline_conf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1751f99b65d8eb7282de41740ccb320c12bff74e Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Wed, 30 Oct 2019 23:30:06 -0700 Subject: [PATCH 02/26] Switched to using some processing code from the component class Needs testing --- components/tfx/CsvExampleGen.component.yaml | 139 +++++++++--------- components/tfx/CsvExampleGen.py | 54 +++---- .../tfx/CsvExampleGen_GCS.component.yaml | 61 ++++---- components/tfx/CsvExampleGen_GCS.py | 50 ++++--- 4 files changed, 159 insertions(+), 145 deletions(-) diff --git a/components/tfx/CsvExampleGen.component.yaml b/components/tfx/CsvExampleGen.component.yaml index c65808353d1..f0051e534a5 100644 --- a/components/tfx/CsvExampleGen.component.yaml +++ b/components/tfx/CsvExampleGen.component.yaml @@ -38,81 +38,86 @@ implementation: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location - 'tfx==0.14' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet - --no-warn-script-location 'tfx==0.14' --user) && "$0" "$@" + 'tfx==0.14' 'six>=1.12.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip + install --quiet --no-warn-script-location 'tfx==0.14' 'six>=1.12.0' --user) + && "$0" "$@" - python3 - -u - -c - - "class InputPath:\n '''When creating component from function, InputPath should\ - \ be used as function parameter annotation to tell the 
system to pass the *data\ - \ file path* to the function instead of passing the actual data.'''\n def\ - \ __init__(self, type=None):\n self.type = type\n\nclass OutputPath:\n\ + - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n \ + \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ + \nclass InputPath:\n '''When creating component from function, InputPath\ + \ should be used as function parameter annotation to tell the system to pass\ + \ the *data file path* to the function instead of passing the actual data.'''\n\ + \ def __init__(self, type=None):\n self.type = type\n\nclass OutputPath:\n\ \ '''When creating component from function, OutputPath should be used as\ \ function parameter annotation to tell the system that the function wants to\ \ output data by writing it into a file with the given path instead of returning\ \ the data from the function.'''\n def __init__(self, type=None):\n \ - \ self.type = type\n\ndef _make_parent_dirs_and_return_path(file_path: str):\n\ - \ import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n\ - \ return file_path\n\ndef CsvExampleGen(\n # Inputs\n input_base_path:\ - \ InputPath('ExternalPath'),\n #input_base_path: 'ExternalPath', # A Channel\ + \ self.type = type\n\ndef CsvExampleGen(\n # Inputs\n input_base_path:\ + \ InputPath('ExternalPath'),\n#input_base_path: 'ExternalPath', # A Channel\ \ of 'ExternalPath' type, which includes one artifact whose uri is an external\ \ directory with csv files inside (required).\n\n # Outputs\n output_examples_path:\ \ OutputPath('ExamplesPath'),\n #output_examples_path: 'ExamplesPath',\n\n\ \ # Execution properties\n #input_config_splits: {'List' : {'item_type':\ - \ 'ExampleGen.Input.Split'}},\n input_config: 'ExampleGen.Input' = '{\"splits\"\ - : []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration.\ - \ If unset, the files under input_base will be treated as a single split.\n\ - \ 
#output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}},\n\ - \ output_config: 'ExampleGen.Output' = '{\"splitConfig\": {\"splits\": []}}',\ - \ # JSON-serialized example_gen_pb2.Output instance, providing output configuration.\ - \ If unset, default splits will be 'train' and 'eval' with size 2:1.\n #custom_config:\ - \ 'ExampleGen.CustomConfig' = None,\n):\n \"\"\"Executes the CsvExampleGen\ - \ component.\n\n Args:\n input_base: A Channel of 'ExternalPath' type,\ - \ which includes one artifact\n whose uri is an external directory with\ - \ csv files inside (required).\n input_config: An example_gen_pb2.Input\ - \ instance, providing input\n configuration. If unset, the files under\ - \ input_base will be treated as a\n single split.\n output_config:\ - \ An example_gen_pb2.Output instance, providing output\n configuration.\ - \ If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n\ - \ ??? example_artifacts: Optional channel of 'ExamplesPath' for output\ - \ train and\n eval examples.\n ??? input: Forwards compatibility\ - \ alias for the 'input_base' argument.\n ??? instance_name: Optional unique\ - \ instance name. 
Necessary if multiple\n CsvExampleGen components are\ - \ declared in the same pipeline.\n \"\"\"\n\n import json\n import\ - \ os\n from tfx.components.example_gen import utils\n from tfx.components.example_gen.csv_example_gen.component\ - \ import CsvExampleGen\n from tfx.types import standard_artifacts\n\n \ - \ # Create input dict.\n # input_dict['input_base'] always has a single entry\n\ - \ input_base = standard_artifacts.ExternalArtifact()\n input_base.uri\ - \ = input_base_path\n input_dict = {\n 'input_base': [input_base],\n\ - \ }\n\n # Create output dict.\n input_config_dict = json.loads(input_config)\n\ - \ output_config_dict = json.loads(output_config)\n split_names = utils.generate_output_split_names(input_config_dict,\ - \ output_config_dict)\n output_dict_examples = []\n for split_name in\ - \ split_names:\n output_split_examples = standard_artifacts.Examples(split=split_name)\n\ - \ output_split_examples.uri = os.path.join(output_examples_path, split_name)\n\ - \ output_dict_examples.append(output_split_examples)\n output_dict\ - \ = {\n 'examples': output_dict_examples,\n }\n\n # Create exec\ - \ proterties.\n exec_properties = {\n 'input_config': input_config,\n\ - \ 'output_config': output_config\n }\n\n executor = CsvExampleGen.EXECUTOR_SPEC.executor_class()\n\ - \ executor.Do(\n input_dict=input_dict,\n output_dict=output_dict,\n\ - \ exec_properties=exec_properties,\n )\n\nimport argparse\n_parser\ - \ = argparse.ArgumentParser(prog='Csvexamplegen', description=\"Executes the\ - \ CsvExampleGen component.\\n\\n Args:\\n input_base: A Channel of 'ExternalPath'\ - \ type, which includes one artifact\\n whose uri is an external directory\ - \ with csv files inside (required).\\n input_config: An example_gen_pb2.Input\ - \ instance, providing input\\n configuration. 
If unset, the files under\ - \ input_base will be treated as a\\n single split.\\n output_config:\ - \ An example_gen_pb2.Output instance, providing output\\n configuration.\ - \ If unset, default splits will be 'train' and 'eval' with\\n size 2:1.\\\ - n ??? example_artifacts: Optional channel of 'ExamplesPath' for output\ - \ train and\\n eval examples.\\n ??? input: Forwards compatibility\ - \ alias for the 'input_base' argument.\\n ??? instance_name: Optional unique\ - \ instance name. Necessary if multiple\\n CsvExampleGen components are\ - \ declared in the same pipeline.\\n\")\n_parser.add_argument(\"--input-base\"\ - , dest=\"input_base_path\", type=str, required=True, default=argparse.SUPPRESS)\n\ - _parser.add_argument(\"--input-config\", dest=\"input_config\", type=str, required=False,\ - \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--output-config\", dest=\"\ - output_config\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ - --output-examples\", dest=\"output_examples_path\", type=_make_parent_dirs_and_return_path,\ + \ 'ExampleGen.Input.Split'}},\n input_config: 'ExampleGen.Input' = None,\ + \ # = '{\"splits\": []}', # JSON-serialized example_gen_pb2.Input instance,\ + \ providing input configuration. If unset, the files under input_base will be\ + \ treated as a single split.\n #output_config_splits: {'List' : {'item_type':\ + \ 'ExampleGen.SplitConfig'}},\n output_config: 'ExampleGen.Output' = None,\ + \ # = '{\"splitConfig\": {\"splits\": []}}', # JSON-serialized example_gen_pb2.Output\ + \ instance, providing output configuration. 
If unset, default splits will be\ + \ 'train' and 'eval' with size 2:1.\n #custom_config: 'ExampleGen.CustomConfig'\ + \ = None,\n):\n \"\"\"Executes the CsvExampleGen component.\n\n Args:\n\ + \ input_base: A Channel of 'ExternalPath' type, which includes one artifact\n\ + \ whose uri is an external directory with csv files inside (required).\n\ + \ input_config: An example_gen_pb2.Input instance, providing input\n \ + \ configuration. If unset, the files under input_base will be treated as\ + \ a\n single split.\n output_config: An example_gen_pb2.Output instance,\ + \ providing output\n configuration. If unset, default splits will be\ + \ 'train' and 'eval' with\n size 2:1.\n ??? example_artifacts: Optional\ + \ channel of 'ExamplesPath' for output train and\n eval examples.\n \ + \ ??? input: Forwards compatibility alias for the 'input_base' argument.\n\ + \ ??? instance_name: Optional unique instance name. Necessary if multiple\n\ + \ CsvExampleGen components are declared in the same pipeline.\n \"\ + \"\"\n\n import json\n import os\n from google.protobuf import json_format\n\ + \ from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen\n\ + \ from tfx.proto import example_gen_pb2\n from tfx.types import standard_artifacts\n\ + \ from tfx.types import channel_utils\n\n # Create input dict.\n input_base\ + \ = standard_artifacts.ExternalArtifact()\n input_base.uri = input_base_path\n\ + \ input_base_channel = channel_utils.as_channel([input_base])\n\n input_config_obj\ + \ = None\n if input_config:\n input_config_obj = example_gen_pb2.Input()\n\ + \ json_format.Parse(input_config, input_config_obj)\n\n output_config_obj\ + \ = None\n if output_config:\n output_config_obj = example_gen_pb2.Output()\n\ + \ json_format.Parse(output_config, output_config_obj)\n\n component_class_instance\ + \ = CsvExampleGen(\n input=input_base_channel,\n input_config=input_config_obj,\n\ + \ output_config=output_config_obj,\n )\n\n input_dict = {name:\ + \ 
channel.artifacts for name, channel in component_class_instance.inputs.items()}\n\ + \ output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()}\n\ + \ exec_properties = component_class_instance.exec_properties\n\n # Generating\ + \ paths for output artifacts\n for output_artifact in output_dict['examples']:\n\ + \ output_artifact.uri = output_examples_path\n if output_artifact.split:\n\ + \ output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split)\n\ + \n executor = CsvExampleGen.EXECUTOR_SPEC.executor_class()\n executor.Do(\n\ + \ input_dict=input_dict,\n output_dict=output_dict,\n exec_properties=exec_properties,\n\ + \ )\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Csvexamplegen',\ + \ description=\"Executes the CsvExampleGen component.\\n\\n Args:\\n \ + \ input_base: A Channel of 'ExternalPath' type, which includes one artifact\\\ + n whose uri is an external directory with csv files inside (required).\\\ + n input_config: An example_gen_pb2.Input instance, providing input\\n \ + \ configuration. If unset, the files under input_base will be treated\ + \ as a\\n single split.\\n output_config: An example_gen_pb2.Output\ + \ instance, providing output\\n configuration. If unset, default splits\ + \ will be 'train' and 'eval' with\\n size 2:1.\\n ??? example_artifacts:\ + \ Optional channel of 'ExamplesPath' for output train and\\n eval examples.\\\ + n ??? input: Forwards compatibility alias for the 'input_base' argument.\\\ + n ??? instance_name: Optional unique instance name. 
Necessary if multiple\\\ + n CsvExampleGen components are declared in the same pipeline.\\n\")\n\ + _parser.add_argument(\"--input-base\", dest=\"input_base_path\", type=str, required=True,\ + \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-config\", dest=\"\ + input_config\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ + --output-config\", dest=\"output_config\", type=str, required=False, default=argparse.SUPPRESS)\n\ + _parser.add_argument(\"--output-examples\", dest=\"output_examples_path\", type=_make_parent_dirs_and_return_path,\ \ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ _output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = CsvExampleGen(**_parsed_args)\n\ \nif not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):\n \ @@ -124,12 +129,10 @@ implementation: inputs: - name: input_base type: ExternalPath -- default: '{"splits": []}' - name: input_config +- name: input_config optional: true type: ExampleGen.Input -- default: '{"splitConfig": {"splits": []}}' - name: output_config +- name: output_config optional: true type: ExampleGen.Output name: Csvexamplegen diff --git a/components/tfx/CsvExampleGen.py b/components/tfx/CsvExampleGen.py index 1932ea6f129..854d64afc3f 100644 --- a/components/tfx/CsvExampleGen.py +++ b/components/tfx/CsvExampleGen.py @@ -11,9 +11,9 @@ def CsvExampleGen( # Execution properties #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, - input_config: 'ExampleGen.Input' = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. + input_config: 'ExampleGen.Input' = None, # = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. 
#output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}}, - output_config: 'ExampleGen.Output' = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. + output_config: 'ExampleGen.Output' = None, # = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. #custom_config: 'ExampleGen.CustomConfig' = None, ): """Executes the CsvExampleGen component. @@ -37,39 +37,41 @@ def CsvExampleGen( import json import os from google.protobuf import json_format - from tfx.components.example_gen import utils from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen from tfx.proto import example_gen_pb2 from tfx.types import standard_artifacts + from tfx.types import channel_utils # Create input dict. - # input_dict['input_base'] always has a single entry input_base = standard_artifacts.ExternalArtifact() input_base.uri = input_base_path - input_dict = { - 'input_base': [input_base], - } + input_base_channel = channel_utils.as_channel([input_base]) - # Create output dict. 
- input_config_obj = example_gen_pb2.Input() - output_config_obj = example_gen_pb2.Output() - json_format.Parse(input_config, input_config_obj) - json_format.Parse(output_config, output_config_obj) - split_names = utils.generate_output_split_names(input_config_obj, output_config_obj) - output_dict_examples = [] - for split_name in split_names: - output_split_examples = standard_artifacts.Examples(split=split_name) - output_split_examples.uri = os.path.join(output_examples_path, split_name) - output_dict_examples.append(output_split_examples) - output_dict = { - 'examples': output_dict_examples, - } + input_config_obj = None + if input_config: + input_config_obj = example_gen_pb2.Input() + json_format.Parse(input_config, input_config_obj) - # Create exec proterties. - exec_properties = { - 'input_config': input_config, - 'output_config': output_config - } + output_config_obj = None + if output_config: + output_config_obj = example_gen_pb2.Output() + json_format.Parse(output_config, output_config_obj) + + component_class_instance = CsvExampleGen( + input=input_base_channel, + input_config=input_config_obj, + output_config=output_config_obj, + ) + + input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} + output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for output_artifact in output_dict['examples']: + output_artifact.uri = output_examples_path + if output_artifact.split: + output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() executor.Do( diff --git a/components/tfx/CsvExampleGen_GCS.component.yaml b/components/tfx/CsvExampleGen_GCS.component.yaml index 8a685423425..7f10fa83663 100644 --- a/components/tfx/CsvExampleGen_GCS.component.yaml +++ 
b/components/tfx/CsvExampleGen_GCS.component.yaml @@ -40,8 +40,9 @@ implementation: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location - 'tfx==0.14' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet - --no-warn-script-location 'tfx==0.14' --user) && "$0" "$@" + 'tfx==0.14' 'six>=1.12.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip + install --quiet --no-warn-script-location 'tfx==0.14' 'six>=1.12.0' --user) + && "$0" "$@" - python3 - -u - -c @@ -86,36 +87,42 @@ implementation: import json import os - from tfx.components.example_gen import utils + from google.protobuf import json_format from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen + from tfx.proto import example_gen_pb2 from tfx.types import standard_artifacts + from tfx.types import channel_utils # Create input dict. - # input_dict['input_base'] always has a single entry input_base = standard_artifacts.ExternalArtifact() input_base.uri = input_base_path - input_dict = { - 'input_base': [input_base], - } - - # Create output dict. - input_config_dict = json.loads(input_config) - output_config_dict = json.loads(output_config) - split_names = utils.generate_output_split_names(input_config_dict, output_config_dict) - output_dict_examples = [] - for split_name in split_names: - output_split_examples = standard_artifacts.Examples(split=split_name) - output_split_examples.uri = os.path.join(output_examples_path, split_name) - output_dict_examples.append(output_split_examples) - output_dict = { - 'examples': output_dict_examples, - } - - # Create exec proterties. 
- exec_properties = { - 'input_config': input_config, - 'output_config': output_config - } + input_base_channel = channel_utils.as_channel([input_base]) + + input_config_obj = None + if input_config: + input_config_obj = example_gen_pb2.Input() + json_format.Parse(input_config, input_config_obj) + + output_config_obj = None + if output_config: + output_config_obj = example_gen_pb2.Output() + json_format.Parse(output_config, output_config_obj) + + component_class_instance = CsvExampleGen( + input=input_base_channel, + input_config=input_config_obj, + output_config=output_config_obj, + ) + + input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} + output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for output_artifact in output_dict['examples']: + output_artifact.uri = output_examples_path + if output_artifact.split: + output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() executor.Do( @@ -127,7 +134,7 @@ implementation: return (output_examples_path,) import argparse - _parser = argparse.ArgumentParser(prog='CsvExampleGen_GCS', description="Executes the CsvExampleGen component.\n\n Args:\n input_base: A Channel of 'ExternalPath' type, which includes one artifact\n whose uri is an external directory with csv files inside (required).\n input_config: An example_gen_pb2.Input instance, providing input\n configuration. If unset, the files under input_base will be treated as a\n single split.\n output_config: An example_gen_pb2.Output instance, providing output\n configuration. If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and\n eval examples.\n ??? 
input: Forwards compatibility alias for the 'input_base' argument.\n ??? instance_name: Optional unique instance name. Necessary if multiple\n CsvExampleGen components are declared in the same pipeline.\n") + _parser = argparse.ArgumentParser(prog='Csvexamplegen gcs', description="Executes the CsvExampleGen component.\n\n Args:\n input_base: A Channel of 'ExternalPath' type, which includes one artifact\n whose uri is an external directory with csv files inside (required).\n input_config: An example_gen_pb2.Input instance, providing input\n configuration. If unset, the files under input_base will be treated as a\n single split.\n output_config: An example_gen_pb2.Output instance, providing output\n configuration. If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and\n eval examples.\n ??? input: Forwards compatibility alias for the 'input_base' argument.\n ??? instance_name: Optional unique instance name. 
Necessary if multiple\n CsvExampleGen components are declared in the same pipeline.\n") _parser.add_argument("--input-base-path", dest="input_base_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--output-examples-path", dest="output_examples_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-config", dest="input_config", type=str, required=False, default=argparse.SUPPRESS) diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/CsvExampleGen_GCS.py index a33495adda0..a47b10bcf5f 100644 --- a/components/tfx/CsvExampleGen_GCS.py +++ b/components/tfx/CsvExampleGen_GCS.py @@ -39,39 +39,41 @@ def CsvExampleGen_GCS( # import json import os from google.protobuf import json_format - from tfx.components.example_gen import utils from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen from tfx.proto import example_gen_pb2 from tfx.types import standard_artifacts + from tfx.types import channel_utils # Create input dict. - # input_dict['input_base'] always has a single entry input_base = standard_artifacts.ExternalArtifact() input_base.uri = input_base_path - input_dict = { - 'input_base': [input_base], - } + input_base_channel = channel_utils.as_channel([input_base]) - # Create output dict. 
- input_config_obj = example_gen_pb2.Input() - output_config_obj = example_gen_pb2.Output() - json_format.Parse(input_config, input_config_obj) - json_format.Parse(output_config, output_config_obj) - split_names = utils.generate_output_split_names(input_config_obj, output_config_obj) - output_dict_examples = [] - for split_name in split_names: - output_split_examples = standard_artifacts.Examples(split=split_name) - output_split_examples.uri = os.path.join(output_examples_path, split_name) - output_dict_examples.append(output_split_examples) - output_dict = { - 'examples': output_dict_examples, - } + input_config_obj = None + if input_config: + input_config_obj = example_gen_pb2.Input() + json_format.Parse(input_config, input_config_obj) - # Create exec proterties. - exec_properties = { - 'input_config': input_config, - 'output_config': output_config - } + output_config_obj = None + if output_config: + output_config_obj = example_gen_pb2.Output() + json_format.Parse(output_config, output_config_obj) + + component_class_instance = CsvExampleGen( + input=input_base_channel, + input_config=input_config_obj, + output_config=output_config_obj, + ) + + input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} + output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for output_artifact in output_dict['examples']: + output_artifact.uri = output_examples_path + if output_artifact.split: + output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() executor.Do( From 0ccd7c3bf780278eedda4a2d3b1f0b18904ca7f8 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Wed, 30 Oct 2019 23:33:39 -0700 Subject: [PATCH 03/26] Renamed output_examples to example_artifacts for consistency with the original component --- 
components/tfx/CsvExampleGen.py | 6 +++--- components/tfx/CsvExampleGen_GCS.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/components/tfx/CsvExampleGen.py b/components/tfx/CsvExampleGen.py index 854d64afc3f..38986520038 100644 --- a/components/tfx/CsvExampleGen.py +++ b/components/tfx/CsvExampleGen.py @@ -6,8 +6,8 @@ def CsvExampleGen( #input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required). # Outputs - output_examples_path: OutputPath('ExamplesPath'), - #output_examples_path: 'ExamplesPath', + example_artifacts_path: OutputPath('ExamplesPath'), + #example_artifacts_path: 'ExamplesPath', # Execution properties #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, @@ -69,7 +69,7 @@ def CsvExampleGen( # Generating paths for output artifacts for output_artifact in output_dict['examples']: - output_artifact.uri = output_examples_path + output_artifact.uri = example_artifacts_path if output_artifact.split: output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/CsvExampleGen_GCS.py index a47b10bcf5f..5e7163b4d1d 100644 --- a/components/tfx/CsvExampleGen_GCS.py +++ b/components/tfx/CsvExampleGen_GCS.py @@ -6,8 +6,8 @@ def CsvExampleGen_GCS( # input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required). 
# Outputs - #output_examples_path: OutputPath('ExamplesPath'), - output_examples_path: 'ExamplesPath', + #example_artifacts_path: OutputPath('ExamplesPath'), + example_artifacts_path: 'ExamplesPath', # Execution properties #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, @@ -16,7 +16,7 @@ def CsvExampleGen_GCS( # output_config: 'ExampleGen.Output' = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. #custom_config: 'ExampleGen.CustomConfig' = None, ) -> NamedTuple('Outputs', [ - ('examples_path', 'ExamplesPath'), + ('example_artifacts', 'ExamplesPath'), ]): """Executes the CsvExampleGen component. @@ -71,7 +71,7 @@ def CsvExampleGen_GCS( # # Generating paths for output artifacts for output_artifact in output_dict['examples']: - output_artifact.uri = output_examples_path + output_artifact.uri = example_artifacts_path if output_artifact.split: output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) @@ -82,7 +82,7 @@ def CsvExampleGen_GCS( # exec_properties=exec_properties, ) - return (output_examples_path,) + return (example_artifacts_path,) if __name__ == '__main__': import kfp From ace062ef2acdf1e538cbdf9da19206b4fc8b51c0 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Wed, 30 Oct 2019 23:38:44 -0700 Subject: [PATCH 04/26] Fixed the docstring a bit --- components/tfx/CsvExampleGen.py | 7 +++---- components/tfx/CsvExampleGen_GCS.py | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/components/tfx/CsvExampleGen.py b/components/tfx/CsvExampleGen.py index 38986520038..d1ca417590b 100644 --- a/components/tfx/CsvExampleGen.py +++ b/components/tfx/CsvExampleGen.py @@ -27,11 +27,10 @@ def CsvExampleGen( output_config: An example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. - ??? 
example_artifacts: Optional channel of 'ExamplesPath' for output train and - eval examples. ??? input: Forwards compatibility alias for the 'input_base' argument. - ??? instance_name: Optional unique instance name. Necessary if multiple - CsvExampleGen components are declared in the same pipeline. + Returns: + example_artifacts: Artifact of type 'ExamplesPath' for output train and + eval examples. """ import json diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/CsvExampleGen_GCS.py index 5e7163b4d1d..bab3246c8aa 100644 --- a/components/tfx/CsvExampleGen_GCS.py +++ b/components/tfx/CsvExampleGen_GCS.py @@ -29,11 +29,10 @@ def CsvExampleGen_GCS( # output_config: An example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. - ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and - eval examples. ??? input: Forwards compatibility alias for the 'input_base' argument. - ??? instance_name: Optional unique instance name. Necessary if multiple - CsvExampleGen components are declared in the same pipeline. + Returns: + example_artifacts: Artifact of type 'ExamplesPath' for output train and + eval examples. 
""" import json From 8e30a62a60c7536cebb242a80cae049e82bce4bd Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 31 Oct 2019 00:11:22 -0700 Subject: [PATCH 05/26] Added StatisticsGen First draft --- components/tfx/StatisticsGen.py | 73 +++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 components/tfx/StatisticsGen.py diff --git a/components/tfx/StatisticsGen.py b/components/tfx/StatisticsGen.py new file mode 100644 index 00000000000..ba5e073b6b6 --- /dev/null +++ b/components/tfx/StatisticsGen.py @@ -0,0 +1,73 @@ +from kfp.components import InputPath, OutputPath + + +def StatisticsGen( + # Inputs + input_data_path: InputPath('ExamplesPath'), + #input_data_path: 'ExamplesPath', + + # Outputs + output_path: OutputPath('ExampleStatistics'), + #output_path: 'ExampleStatistics', +): +#) -> NamedTuple('Outputs', [ +# ('output', 'ExampleStatistics'), +#]): + """Construct a StatisticsGen component. + + Args: + input_data: A Channel of `ExamplesPath` type, likely generated by the + [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen). + This needs to contain two splits labeled `train` and `eval`. _required_ + # examples: Forwards compatibility alias for the `input_data` argument. + Returns: + output: `ExampleStatistics` channel for statistics of each split + provided in the input examples. + """ + + import json + import os + from google.protobuf import json_format + from tfx.types import standard_artifacts + from tfx.types import channel_utils + + # Create input dict. 
+ # Recovering splits + splits = sorted(os.listdir(input_data_path)) + input_data_artifacts = [] + for split in splits: + artifact = standard_artifacts.Examples() + artifact.uri = os.path.join(input_data_path, split) + input_data_artifacts.append(artifact) + input_data_channel = channel_utils.as_channel(input_data_artifacts) + + from tfx.components.statistics_gen.component import StatisticsGen + component_class_instance = StatisticsGen( + input_data=input_data_channel, + ) + + input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} + output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for output_artifact in output_dict['output']: + output_artifact.uri = os.path.join(output_path, output_artifact.split) # Default split is '' + + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + #return (output_path,) + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + StatisticsGen, + base_image='tensorflow/tensorflow:1.14.0-py3', + packages_to_install=['tfx==0.14', 'six>=1.12.0'], + output_component_file='StatisticsGen.component.yaml' + ) \ No newline at end of file From b8fd5a7b5ce8fab9c7964d87dbe560a01d8a3c7a Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 31 Oct 2019 00:28:44 -0700 Subject: [PATCH 06/26] Added SchemaGen First draft --- components/tfx/SchemaGen.py | 77 +++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 components/tfx/SchemaGen.py diff --git a/components/tfx/SchemaGen.py b/components/tfx/SchemaGen.py new file mode 100644 index 00000000000..dba8f957e0c --- /dev/null +++ b/components/tfx/SchemaGen.py @@ -0,0 +1,77 @@ +from kfp.components import InputPath, OutputPath + 
+ +def SchemaGen( + stats_path: InputPath('ExampleStatistics'), + #statistics_path: InputPath('ExampleStatistics'), + output_path: OutputPath('Schema'), + #schema_path: InputPath('Schema') = None, + infer_feature_shape: bool = False, +): + """Constructs a SchemaGen component. + + Args: + stats: A Channel of `ExampleStatistics` type (required if spec is not + passed). This should contain at least a `train` split. Other splits are + currently ignored. + # Exactly one of 'stats'/'statistics' or 'schema' is required. + #schema: A Channel of `Schema` type that provides an instance of Schema. + # If provided, pass through this schema artifact as the output. Exactly + # one of 'stats'/'statistics' or 'schema' is required. + infer_feature_shape: Boolean value indicating whether or not to infer the + shape of features. If the feature shape is not inferred, downstream + Tensorflow Transform component using the schema will parse input + as tf.SparseTensor. + #statistics: Future replacement of the 'stats' argument. + #Either `statistics` or `stats` must be present in the input arguments. + Returns: + output: Output `Schema` channel for schema result. + """ + + import json + import os + from google.protobuf import json_format + from tfx.types import standard_artifacts + from tfx.types import channel_utils + + # Create input dict. 
+ # Recovering splits + input_base_path = stats_path + splits = sorted(os.listdir(input_base_path)) + input_data_artifacts = [] + for split in splits: + artifact = standard_artifacts.ExampleStatistics() + artifact.uri = os.path.join(input_base_path, split) + input_data_artifacts.append(artifact) + input_data_channel = channel_utils.as_channel(input_data_artifacts) + + from tfx.components.schema_gen.component import SchemaGen + component_class_instance = SchemaGen( + input_data=input_data_channel, + ) + + input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} + output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for output_artifact in output_dict['output']: + output_artifact.uri = os.path.join(output_path, output_artifact.split) # Default split is '' + + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + #return (output_path,) + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + SchemaGen, + base_image='tensorflow/tensorflow:1.14.0-py3', + packages_to_install=['tfx==0.14', 'six>=1.12.0'], + output_component_file='SchemaGen.component.yaml' + ) \ No newline at end of file From 35ab2e08998a8ebfa9c640346391d8f7177d545a Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Fri, 1 Nov 2019 15:40:31 -0700 Subject: [PATCH 07/26] Fixed the input_dict construction --- components/tfx/CsvExampleGen.py | 9 ++++++--- components/tfx/CsvExampleGen_GCS.py | 8 +++++--- components/tfx/SchemaGen.py | 6 +++--- components/tfx/StatisticsGen.py | 6 +++--- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/components/tfx/CsvExampleGen.py b/components/tfx/CsvExampleGen.py index d1ca417590b..1690412d05e 100644 --- 
a/components/tfx/CsvExampleGen.py +++ b/components/tfx/CsvExampleGen.py @@ -62,8 +62,11 @@ def CsvExampleGen( output_config=output_config_obj, ) - input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} - output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + # component_class_instance.inputs/outputs are wrappers that do not behave like real dictionaries. The underlying dict can be accessed using .get_all() + # Channel artifacts can be accessed by calling .get() + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties # Generating paths for output artifacts @@ -87,4 +90,4 @@ def CsvExampleGen( base_image='tensorflow/tensorflow:1.14.0-py3', packages_to_install=['tfx==0.14', 'six>=1.12.0'], output_component_file='CsvExampleGen.component.yaml' - ) \ No newline at end of file + ) diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/CsvExampleGen_GCS.py index bab3246c8aa..7e14da266d9 100644 --- a/components/tfx/CsvExampleGen_GCS.py +++ b/components/tfx/CsvExampleGen_GCS.py @@ -64,8 +64,10 @@ def CsvExampleGen_GCS( # output_config=output_config_obj, ) - input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} - output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + # component_class_instance.inputs/outputs are wrappers that do not behave like real dictionaries. 
The underlying dict can be accessed using .get_all() + # Channel artifacts can be accessed by calling .get() + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} exec_properties = component_class_instance.exec_properties # Generating paths for output artifacts @@ -90,4 +92,4 @@ def CsvExampleGen_GCS( # base_image='tensorflow/tensorflow:1.14.0-py3', packages_to_install=['tfx==0.14', 'six>=1.12.0'], output_component_file='CsvExampleGen_GCS.component.yaml' - ) \ No newline at end of file + ) diff --git a/components/tfx/SchemaGen.py b/components/tfx/SchemaGen.py index dba8f957e0c..fed984b087b 100644 --- a/components/tfx/SchemaGen.py +++ b/components/tfx/SchemaGen.py @@ -50,8 +50,8 @@ def SchemaGen( input_data=input_data_channel, ) - input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} - output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} exec_properties = component_class_instance.exec_properties # Generating paths for output artifacts @@ -74,4 +74,4 @@ def SchemaGen( base_image='tensorflow/tensorflow:1.14.0-py3', packages_to_install=['tfx==0.14', 'six>=1.12.0'], output_component_file='SchemaGen.component.yaml' - ) \ No newline at end of file + ) diff --git a/components/tfx/StatisticsGen.py b/components/tfx/StatisticsGen.py index ba5e073b6b6..ef4b73fd709 100644 --- a/components/tfx/StatisticsGen.py +++ b/components/tfx/StatisticsGen.py @@ -46,8 +46,8 @@ def StatisticsGen( input_data=input_data_channel, ) - input_dict = {name: channel.artifacts for name, channel in 
component_class_instance.inputs.items()} - output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} exec_properties = component_class_instance.exec_properties # Generating paths for output artifacts @@ -70,4 +70,4 @@ def StatisticsGen( base_image='tensorflow/tensorflow:1.14.0-py3', packages_to_install=['tfx==0.14', 'six>=1.12.0'], output_component_file='StatisticsGen.component.yaml' - ) \ No newline at end of file + ) From fcef47349a395aedd4d4d1207d94b1bb004cb0f0 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Fri, 1 Nov 2019 15:52:09 -0700 Subject: [PATCH 08/26] Use None defaults --- components/tfx/CsvExampleGen_GCS.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/CsvExampleGen_GCS.py index 7e14da266d9..23f5ea7fcff 100644 --- a/components/tfx/CsvExampleGen_GCS.py +++ b/components/tfx/CsvExampleGen_GCS.py @@ -11,9 +11,9 @@ def CsvExampleGen_GCS( # # Execution properties #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, - input_config: 'ExampleGen.Input' = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. + input_config: 'ExampleGen.Input' = None, # = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}}, - output_config: 'ExampleGen.Output' = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. 
If unset, default splits will be 'train' and 'eval' with size 2:1. + output_config: 'ExampleGen.Output' = None, # = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. #custom_config: 'ExampleGen.CustomConfig' = None, ) -> NamedTuple('Outputs', [ ('example_artifacts', 'ExamplesPath'), From 8a1d1e544591f4f2cff5767bb666eb49c69c5a38 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Fri, 1 Nov 2019 15:53:04 -0700 Subject: [PATCH 09/26] Switched to TFX container image --- components/tfx/CsvExampleGen.py | 3 +-- components/tfx/CsvExampleGen_GCS.py | 3 +-- components/tfx/SchemaGen.py | 3 +-- components/tfx/StatisticsGen.py | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/components/tfx/CsvExampleGen.py b/components/tfx/CsvExampleGen.py index 1690412d05e..93adfd570b9 100644 --- a/components/tfx/CsvExampleGen.py +++ b/components/tfx/CsvExampleGen.py @@ -87,7 +87,6 @@ def CsvExampleGen( import kfp kfp.components.func_to_container_op( CsvExampleGen, - base_image='tensorflow/tensorflow:1.14.0-py3', - packages_to_install=['tfx==0.14', 'six>=1.12.0'], + base_image='tensorflow/tfx:0.15.0rc0', output_component_file='CsvExampleGen.component.yaml' ) diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/CsvExampleGen_GCS.py index 23f5ea7fcff..3c2300a4da8 100644 --- a/components/tfx/CsvExampleGen_GCS.py +++ b/components/tfx/CsvExampleGen_GCS.py @@ -89,7 +89,6 @@ def CsvExampleGen_GCS( # import kfp kfp.components.func_to_container_op( CsvExampleGen_GCS, - base_image='tensorflow/tensorflow:1.14.0-py3', - packages_to_install=['tfx==0.14', 'six>=1.12.0'], + base_image='tensorflow/tfx:0.15.0rc0', output_component_file='CsvExampleGen_GCS.component.yaml' ) diff --git a/components/tfx/SchemaGen.py b/components/tfx/SchemaGen.py index fed984b087b..81f4a24e77e 100644 --- a/components/tfx/SchemaGen.py +++ 
b/components/tfx/SchemaGen.py @@ -71,7 +71,6 @@ def SchemaGen( import kfp kfp.components.func_to_container_op( SchemaGen, - base_image='tensorflow/tensorflow:1.14.0-py3', - packages_to_install=['tfx==0.14', 'six>=1.12.0'], + base_image='tensorflow/tfx:0.15.0rc0', output_component_file='SchemaGen.component.yaml' ) diff --git a/components/tfx/StatisticsGen.py b/components/tfx/StatisticsGen.py index ef4b73fd709..d4137c25b0f 100644 --- a/components/tfx/StatisticsGen.py +++ b/components/tfx/StatisticsGen.py @@ -67,7 +67,6 @@ def StatisticsGen( import kfp kfp.components.func_to_container_op( StatisticsGen, - base_image='tensorflow/tensorflow:1.14.0-py3', - packages_to_install=['tfx==0.14', 'six>=1.12.0'], + base_image='tensorflow/tfx:0.15.0rc0', output_component_file='StatisticsGen.component.yaml' ) From d6e6b52084420a50e05f997697cdf0ac5f98c84f Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Fri, 1 Nov 2019 15:53:19 -0700 Subject: [PATCH 10/26] Updated component definitions --- components/tfx/CsvExampleGen.component.yaml | 265 ++++++++++-------- .../tfx/CsvExampleGen_GCS.component.yaml | 110 ++++---- components/tfx/SchemaGen.component.yaml | 151 ++++++++++ components/tfx/StatisticsGen.component.yaml | 130 +++++++++ 4 files changed, 475 insertions(+), 181 deletions(-) create mode 100644 components/tfx/SchemaGen.component.yaml create mode 100644 components/tfx/StatisticsGen.component.yaml diff --git a/components/tfx/CsvExampleGen.component.yaml b/components/tfx/CsvExampleGen.component.yaml index f0051e534a5..29e745e05ae 100644 --- a/components/tfx/CsvExampleGen.component.yaml +++ b/components/tfx/CsvExampleGen.component.yaml @@ -1,3 +1,10 @@ +name: CsvExampleGen +inputs: +- {name: input_base, type: ExternalPath} +- {name: input_config, optional: true, type: ExampleGen.Input} +- {name: output_config, optional: true, type: ExampleGen.Output} +outputs: +- {name: example_artifacts, type: ExamplesPath} description: | Executes the CsvExampleGen component. 
@@ -10,132 +17,154 @@ description: | output_config: An example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. - ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and - eval examples. ??? input: Forwards compatibility alias for the 'input_base' argument. - ??? instance_name: Optional unique instance name. Necessary if multiple - CsvExampleGen components are declared in the same pipeline. + Returns: + example_artifacts: Artifact of type 'ExamplesPath' for output train and + eval examples. implementation: container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + class InputPath: + '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.''' + def __init__(self, type=None): + self.type = type + + def CsvExampleGen( + # Inputs + input_base_path: InputPath('ExternalPath'), + #input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required). 
+ + # Outputs + example_artifacts_path: OutputPath('ExamplesPath'), + #example_artifacts_path: 'ExamplesPath', + + # Execution properties + #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, + input_config: 'ExampleGen.Input' = None, # = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. + #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}}, + output_config: 'ExampleGen.Output' = None, # = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. + #custom_config: 'ExampleGen.CustomConfig' = None, + ): + """\ + Executes the CsvExampleGen component. + + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with csv files inside (required). + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + ??? input: Forwards compatibility alias for the 'input_base' argument. + Returns: + example_artifacts: Artifact of type 'ExamplesPath' for output train and + eval examples. + """ + + import json + import os + from google.protobuf import json_format + from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen + from tfx.proto import example_gen_pb2 + from tfx.types import standard_artifacts + from tfx.types import channel_utils + + # Create input dict. 
+ input_base = standard_artifacts.ExternalArtifact() + input_base.uri = input_base_path + input_base_channel = channel_utils.as_channel([input_base]) + + input_config_obj = None + if input_config: + input_config_obj = example_gen_pb2.Input() + json_format.Parse(input_config, input_config_obj) + + output_config_obj = None + if output_config: + output_config_obj = example_gen_pb2.Output() + json_format.Parse(output_config, output_config_obj) + + component_class_instance = CsvExampleGen( + input=input_base_channel, + input_config=input_config_obj, + output_config=output_config_obj, + ) + + # component_class_instance.inputs/outputs are wrappers that do not behave like real dictionaries. The underlying dict can be accessed using .get_all() + # Channel artifacts can be accessed by calling .get() + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for output_artifact in output_dict['examples']: + output_artifact.uri = example_artifacts_path + if output_artifact.split: + output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) + + executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + import argparse + _parser = argparse.ArgumentParser(prog='Csvexamplegen', description="Executes the CsvExampleGen component.\n\n Args:\n input_base: A Channel of 'ExternalPath' type, which includes one artifact\n whose uri is an external directory with csv files inside (required).\n input_config: An example_gen_pb2.Input instance, providing input\n configuration. 
If unset, the files under input_base will be treated as a\n single split.\n output_config: An example_gen_pb2.Output instance, providing output\n configuration. If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n ??? input: Forwards compatibility alias for the 'input_base' argument.\n Returns:\n example_artifacts: Artifact of type 'ExamplesPath' for output train and\n eval examples.\n") + _parser.add_argument("--input-base", dest="input_base_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--input-config", dest="input_config", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--output-config", dest="output_config", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--example-artifacts", dest="example_artifacts_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = CsvExampleGen(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) args: - --input-base - - inputPath: input_base + - {inputPath: input_base} - if: - cond: - isPresent: input_config + cond: {isPresent: input_config} then: - --input-config - - inputValue: input_config + - {inputValue: input_config} - if: - cond: - isPresent: output_config + cond: {isPresent: output_config} then: - --output-config - - inputValue: output_config - - --output-examples - - outputPath: output_examples - command: - - sh - - -c - - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location - 'tfx==0.14' 'six>=1.12.0' || 
PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip - install --quiet --no-warn-script-location 'tfx==0.14' 'six>=1.12.0' --user) - && "$0" "$@" - - python3 - - -u - - -c - - "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n \ - \ os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ - \nclass InputPath:\n '''When creating component from function, InputPath\ - \ should be used as function parameter annotation to tell the system to pass\ - \ the *data file path* to the function instead of passing the actual data.'''\n\ - \ def __init__(self, type=None):\n self.type = type\n\nclass OutputPath:\n\ - \ '''When creating component from function, OutputPath should be used as\ - \ function parameter annotation to tell the system that the function wants to\ - \ output data by writing it into a file with the given path instead of returning\ - \ the data from the function.'''\n def __init__(self, type=None):\n \ - \ self.type = type\n\ndef CsvExampleGen(\n # Inputs\n input_base_path:\ - \ InputPath('ExternalPath'),\n#input_base_path: 'ExternalPath', # A Channel\ - \ of 'ExternalPath' type, which includes one artifact whose uri is an external\ - \ directory with csv files inside (required).\n\n # Outputs\n output_examples_path:\ - \ OutputPath('ExamplesPath'),\n #output_examples_path: 'ExamplesPath',\n\n\ - \ # Execution properties\n #input_config_splits: {'List' : {'item_type':\ - \ 'ExampleGen.Input.Split'}},\n input_config: 'ExampleGen.Input' = None,\ - \ # = '{\"splits\": []}', # JSON-serialized example_gen_pb2.Input instance,\ - \ providing input configuration. If unset, the files under input_base will be\ - \ treated as a single split.\n #output_config_splits: {'List' : {'item_type':\ - \ 'ExampleGen.SplitConfig'}},\n output_config: 'ExampleGen.Output' = None,\ - \ # = '{\"splitConfig\": {\"splits\": []}}', # JSON-serialized example_gen_pb2.Output\ - \ instance, providing output configuration. 
If unset, default splits will be\ - \ 'train' and 'eval' with size 2:1.\n #custom_config: 'ExampleGen.CustomConfig'\ - \ = None,\n):\n \"\"\"Executes the CsvExampleGen component.\n\n Args:\n\ - \ input_base: A Channel of 'ExternalPath' type, which includes one artifact\n\ - \ whose uri is an external directory with csv files inside (required).\n\ - \ input_config: An example_gen_pb2.Input instance, providing input\n \ - \ configuration. If unset, the files under input_base will be treated as\ - \ a\n single split.\n output_config: An example_gen_pb2.Output instance,\ - \ providing output\n configuration. If unset, default splits will be\ - \ 'train' and 'eval' with\n size 2:1.\n ??? example_artifacts: Optional\ - \ channel of 'ExamplesPath' for output train and\n eval examples.\n \ - \ ??? input: Forwards compatibility alias for the 'input_base' argument.\n\ - \ ??? instance_name: Optional unique instance name. Necessary if multiple\n\ - \ CsvExampleGen components are declared in the same pipeline.\n \"\ - \"\"\n\n import json\n import os\n from google.protobuf import json_format\n\ - \ from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen\n\ - \ from tfx.proto import example_gen_pb2\n from tfx.types import standard_artifacts\n\ - \ from tfx.types import channel_utils\n\n # Create input dict.\n input_base\ - \ = standard_artifacts.ExternalArtifact()\n input_base.uri = input_base_path\n\ - \ input_base_channel = channel_utils.as_channel([input_base])\n\n input_config_obj\ - \ = None\n if input_config:\n input_config_obj = example_gen_pb2.Input()\n\ - \ json_format.Parse(input_config, input_config_obj)\n\n output_config_obj\ - \ = None\n if output_config:\n output_config_obj = example_gen_pb2.Output()\n\ - \ json_format.Parse(output_config, output_config_obj)\n\n component_class_instance\ - \ = CsvExampleGen(\n input=input_base_channel,\n input_config=input_config_obj,\n\ - \ output_config=output_config_obj,\n )\n\n input_dict = {name:\ - \ 
channel.artifacts for name, channel in component_class_instance.inputs.items()}\n\ - \ output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()}\n\ - \ exec_properties = component_class_instance.exec_properties\n\n # Generating\ - \ paths for output artifacts\n for output_artifact in output_dict['examples']:\n\ - \ output_artifact.uri = output_examples_path\n if output_artifact.split:\n\ - \ output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split)\n\ - \n executor = CsvExampleGen.EXECUTOR_SPEC.executor_class()\n executor.Do(\n\ - \ input_dict=input_dict,\n output_dict=output_dict,\n exec_properties=exec_properties,\n\ - \ )\n\nimport argparse\n_parser = argparse.ArgumentParser(prog='Csvexamplegen',\ - \ description=\"Executes the CsvExampleGen component.\\n\\n Args:\\n \ - \ input_base: A Channel of 'ExternalPath' type, which includes one artifact\\\ - n whose uri is an external directory with csv files inside (required).\\\ - n input_config: An example_gen_pb2.Input instance, providing input\\n \ - \ configuration. If unset, the files under input_base will be treated\ - \ as a\\n single split.\\n output_config: An example_gen_pb2.Output\ - \ instance, providing output\\n configuration. If unset, default splits\ - \ will be 'train' and 'eval' with\\n size 2:1.\\n ??? example_artifacts:\ - \ Optional channel of 'ExamplesPath' for output train and\\n eval examples.\\\ - n ??? input: Forwards compatibility alias for the 'input_base' argument.\\\ - n ??? instance_name: Optional unique instance name. 
Necessary if multiple\\\ - n CsvExampleGen components are declared in the same pipeline.\\n\")\n\ - _parser.add_argument(\"--input-base\", dest=\"input_base_path\", type=str, required=True,\ - \ default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-config\", dest=\"\ - input_config\", type=str, required=False, default=argparse.SUPPRESS)\n_parser.add_argument(\"\ - --output-config\", dest=\"output_config\", type=str, required=False, default=argparse.SUPPRESS)\n\ - _parser.add_argument(\"--output-examples\", dest=\"output_examples_path\", type=_make_parent_dirs_and_return_path,\ - \ required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\ - _output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = CsvExampleGen(**_parsed_args)\n\ - \nif not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):\n \ - \ _outputs = [_outputs]\n\n_output_serializers = [\n \n]\n\nimport os\n\ - for idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n\ - \ except OSError:\n pass\n with open(output_file, 'w') as f:\n\ - \ f.write(_output_serializers[idx](_outputs[idx]))\n" - image: tensorflow/tensorflow:1.14.0-py3 -inputs: -- name: input_base - type: ExternalPath -- name: input_config - optional: true - type: ExampleGen.Input -- name: output_config - optional: true - type: ExampleGen.Output -name: Csvexamplegen -outputs: -- name: output_examples - type: ExamplesPath + - {inputValue: output_config} + - --example-artifacts + - {outputPath: example_artifacts} diff --git a/components/tfx/CsvExampleGen_GCS.component.yaml b/components/tfx/CsvExampleGen_GCS.component.yaml index 7f10fa83663..b500938787e 100644 --- a/components/tfx/CsvExampleGen_GCS.component.yaml +++ b/components/tfx/CsvExampleGen_GCS.component.yaml @@ -1,3 +1,11 @@ +name: CsvExampleGen GCS +inputs: +- {name: input_base_path, type: ExternalPath} +- {name: example_artifacts_path, type: ExamplesPath} +- {name: input_config, 
optional: true, type: ExampleGen.Input} +- {name: output_config, optional: true, type: ExampleGen.Output} +outputs: +- {name: example_artifacts, type: ExamplesPath} description: | Executes the CsvExampleGen component. @@ -10,39 +18,14 @@ description: | output_config: An example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. - ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and - eval examples. ??? input: Forwards compatibility alias for the 'input_base' argument. - ??? instance_name: Optional unique instance name. Necessary if multiple - CsvExampleGen components are declared in the same pipeline. + Returns: + example_artifacts: Artifact of type 'ExamplesPath' for output train and + eval examples. implementation: container: - args: - - --input-base-path - - inputValue: input_base_path - - --output-examples-path - - inputValue: output_examples_path - - if: - cond: - isPresent: input_config - then: - - --input-config - - inputValue: input_config - - if: - cond: - isPresent: output_config - then: - - --output-config - - inputValue: output_config - - '----output-paths' - - outputPath: examples_path + image: tensorflow/tfx:0.15.0rc0 command: - - sh - - -c - - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location - 'tfx==0.14' 'six>=1.12.0' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip - install --quiet --no-warn-script-location 'tfx==0.14' 'six>=1.12.0' --user) - && "$0" "$@" - python3 - -u - -c @@ -55,17 +38,17 @@ implementation: input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required). 
# Outputs - #output_examples_path: OutputPath('ExamplesPath'), - output_examples_path: 'ExamplesPath', + #example_artifacts_path: OutputPath('ExamplesPath'), + example_artifacts_path: 'ExamplesPath', # Execution properties #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}}, - input_config: 'ExampleGen.Input' = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. + input_config: 'ExampleGen.Input' = None, # = '{"splits": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split. #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}}, - output_config: 'ExampleGen.Output' = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. + output_config: 'ExampleGen.Output' = None, # = '{"splitConfig": {"splits": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. #custom_config: 'ExampleGen.CustomConfig' = None, ) -> NamedTuple('Outputs', [ - ('examples_path', 'ExamplesPath'), + ('example_artifacts', 'ExamplesPath'), ]): """Executes the CsvExampleGen component. @@ -78,11 +61,10 @@ implementation: output_config: An example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1. - ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and - eval examples. ??? input: Forwards compatibility alias for the 'input_base' argument. - ??? instance_name: Optional unique instance name. Necessary if multiple - CsvExampleGen components are declared in the same pipeline. 
+ Returns: + example_artifacts: Artifact of type 'ExamplesPath' for output train and + eval examples. """ import json @@ -114,13 +96,15 @@ implementation: output_config=output_config_obj, ) - input_dict = {name: channel.artifacts for name, channel in component_class_instance.inputs.items()} - output_dict = {name: channel.artifacts for name, channel in component_class_instance.outputs.items()} + # component_class_instance.inputs/outputs are wrappers that do not behave like real dictionaries. The underlying dict can be accessed using .get_all() + # Channel artifacts can be accessed by calling .get() + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} exec_properties = component_class_instance.exec_properties # Generating paths for output artifacts for output_artifact in output_dict['examples']: - output_artifact.uri = output_examples_path + output_artifact.uri = example_artifacts_path if output_artifact.split: output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) @@ -131,12 +115,12 @@ implementation: exec_properties=exec_properties, ) - return (output_examples_path,) + return (example_artifacts_path,) import argparse - _parser = argparse.ArgumentParser(prog='Csvexamplegen gcs', description="Executes the CsvExampleGen component.\n\n Args:\n input_base: A Channel of 'ExternalPath' type, which includes one artifact\n whose uri is an external directory with csv files inside (required).\n input_config: An example_gen_pb2.Input instance, providing input\n configuration. If unset, the files under input_base will be treated as a\n single split.\n output_config: An example_gen_pb2.Output instance, providing output\n configuration. If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n ??? 
example_artifacts: Optional channel of 'ExamplesPath' for output train and\n eval examples.\n ??? input: Forwards compatibility alias for the 'input_base' argument.\n ??? instance_name: Optional unique instance name. Necessary if multiple\n CsvExampleGen components are declared in the same pipeline.\n") + _parser = argparse.ArgumentParser(prog='Csvexamplegen gcs', description="Executes the CsvExampleGen component.\n\n Args:\n input_base: A Channel of 'ExternalPath' type, which includes one artifact\n whose uri is an external directory with csv files inside (required).\n input_config: An example_gen_pb2.Input instance, providing input\n configuration. If unset, the files under input_base will be treated as a\n single split.\n output_config: An example_gen_pb2.Output instance, providing output\n configuration. If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n ??? input: Forwards compatibility alias for the 'input_base' argument.\n Returns:\n example_artifacts: Artifact of type 'ExamplesPath' for output train and\n eval examples.\n") _parser.add_argument("--input-base-path", dest="input_base_path", type=str, required=True, default=argparse.SUPPRESS) - _parser.add_argument("--output-examples-path", dest="output_examples_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--example-artifacts-path", dest="example_artifacts_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-config", dest="input_config", type=str, required=False, default=argparse.SUPPRESS) _parser.add_argument("--output-config", dest="output_config", type=str, required=False, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) @@ -149,7 +133,8 @@ implementation: _outputs = [_outputs] _output_serializers = [ - str + str, + ] import os @@ -160,21 +145,20 @@ implementation: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) - 
image: tensorflow/tensorflow:1.14.0-py3 -inputs: -- name: input_base_path - type: ExternalPath -- name: output_examples_path - type: ExamplesPath -- default: '{"splits": []}' - name: input_config - optional: true - type: ExampleGen.Input -- default: '{"splitConfig": {"splits": []}}' - name: output_config - optional: true - type: ExampleGen.Output -name: CsvExampleGen_GCS -outputs: -- name: examples_path - type: ExamplesPath + args: + - --input-base-path + - {inputValue: input_base_path} + - --example-artifacts-path + - {inputValue: example_artifacts_path} + - if: + cond: {isPresent: input_config} + then: + - --input-config + - {inputValue: input_config} + - if: + cond: {isPresent: output_config} + then: + - --output-config + - {inputValue: output_config} + - '----output-paths' + - {outputPath: example_artifacts} diff --git a/components/tfx/SchemaGen.component.yaml b/components/tfx/SchemaGen.component.yaml new file mode 100644 index 00000000000..c41e8da2459 --- /dev/null +++ b/components/tfx/SchemaGen.component.yaml @@ -0,0 +1,151 @@ +name: Schemagen +inputs: +- {name: stats, type: ExampleStatistics} +- {default: 'False', name: infer_feature_shape, optional: true, type: Boolean} +outputs: +- {name: output, type: Schema} +description: | + Constructs a SchemaGen component. + + Args: + stats: A Channel of `ExampleStatistics` type (required if spec is not + passed). This should contain at least a `train` split. Other splits are + currently ignored. + # Exactly one of 'stats'/'statistics' or 'schema' is required. + #schema: A Channel of `Schema` type that provides an instance of Schema. + # If provided, pass through this schema artifact as the output. Exactly + # one of 'stats'/'statistics' or 'schema' is required. + infer_feature_shape: Boolean value indicating whether or not to infer the + shape of features. If the feature shape is not inferred, downstream + Tensorflow Transform component using the schema will parse input + as tf.SparseTensor. 
+ #statistics: Future replacement of the 'stats' argument. + #Either `statistics` or `stats` must be present in the input arguments. + Returns: + output: Output `Schema` channel for schema result. +implementation: + container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + class InputPath: + '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.''' + def __init__(self, type=None): + self.type = type + + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + def SchemaGen( + stats_path: InputPath('ExampleStatistics'), + #statistics_path: InputPath('ExampleStatistics'), + output_path: OutputPath('Schema'), + #schema_path: InputPath('Schema') = None, + infer_feature_shape: bool = False, + ): + """Constructs a SchemaGen component. + + Args: + stats: A Channel of `ExampleStatistics` type (required if spec is not + passed). This should contain at least a `train` split. Other splits are + currently ignored. + # Exactly one of 'stats'/'statistics' or 'schema' is required. + #schema: A Channel of `Schema` type that provides an instance of Schema. + # If provided, pass through this schema artifact as the output. Exactly + # one of 'stats'/'statistics' or 'schema' is required. + infer_feature_shape: Boolean value indicating whether or not to infer the + shape of features. 
If the feature shape is not inferred, downstream + Tensorflow Transform component using the schema will parse input + as tf.SparseTensor. + #statistics: Future replacement of the 'stats' argument. + #Either `statistics` or `stats` must be present in the input arguments. + Returns: + output: Output `Schema` channel for schema result. + """ + + import json + import os + from google.protobuf import json_format + from tfx.types import standard_artifacts + from tfx.types import channel_utils + + # Create input dict. + # Recovering splits + input_base_path = stats_path + splits = sorted(os.listdir(input_base_path)) + input_data_artifacts = [] + for split in splits: + artifact = standard_artifacts.ExampleStatistics() + artifact.uri = os.path.join(input_base_path, split) + input_data_artifacts.append(artifact) + input_data_channel = channel_utils.as_channel(input_data_artifacts) + + from tfx.components.schema_gen.component import SchemaGen + component_class_instance = SchemaGen( + input_data=input_data_channel, + ) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for output_artifact in output_dict['output']: + output_artifact.uri = os.path.join(output_path, output_artifact.split) # Default split is '' + + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + def _deserialize_bool(s) -> bool: + from distutils.util import strtobool + return strtobool(s) == 1 + + import argparse + _parser = argparse.ArgumentParser(prog='Schemagen', description="Constructs a SchemaGen component.\n\n Args:\n stats: A Channel of `ExampleStatistics` type (required if spec is not\n passed). 
This should contain at least a `train` split. Other splits are\n currently ignored.\n # Exactly one of 'stats'/'statistics' or 'schema' is required.\n #schema: A Channel of `Schema` type that provides an instance of Schema.\n # If provided, pass through this schema artifact as the output. Exactly\n # one of 'stats'/'statistics' or 'schema' is required.\n infer_feature_shape: Boolean value indicating whether or not to infer the\n shape of features. If the feature shape is not inferred, downstream\n Tensorflow Transform component using the schema will parse input\n as tf.SparseTensor.\n #statistics: Future replacement of the 'stats' argument.\n #Either `statistics` or `stats` must be present in the input arguments.\n Returns:\n output: Output `Schema` channel for schema result.\n") + _parser.add_argument("--stats", dest="stats_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--infer-feature-shape", dest="infer_feature_shape", type=_deserialize_bool, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--output", dest="output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = SchemaGen(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + args: + - --stats + - {inputPath: stats} + - if: + cond: {isPresent: infer_feature_shape} + then: + - --infer-feature-shape + - {inputValue: infer_feature_shape} + - --output + - {outputPath: output} diff --git a/components/tfx/StatisticsGen.component.yaml b/components/tfx/StatisticsGen.component.yaml new file mode 
100644 index 00000000000..bc592a4d506 --- /dev/null +++ b/components/tfx/StatisticsGen.component.yaml @@ -0,0 +1,130 @@ +name: Statisticsgen +inputs: +- name: input_data + type: ExamplesPath +outputs: +- name: output + type: ExampleStatistics +description: | + Construct a StatisticsGen component. + + Args: + input_data: A Channel of `ExamplesPath` type, likely generated by the + [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen). + This needs to contain two splits labeled `train` and `eval`. _required_ + # examples: Forwards compatibility alias for the `input_data` argument. + Returns: + output: `ExampleStatistics` channel for statistics of each split + provided in the input examples. +implementation: + container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + class InputPath: + '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.''' + def __init__(self, type=None): + self.type = type + + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + def StatisticsGen( + # Inputs + input_data_path: InputPath('ExamplesPath'), + #input_data_path: 'ExamplesPath', + + # Outputs + output_path: OutputPath('ExampleStatistics'), + #output_path: 'ExampleStatistics', + ): + #) -> NamedTuple('Outputs', [ + # ('output', 'ExampleStatistics'), + #]): + """Construct a StatisticsGen component. 
+ + Args: + input_data: A Channel of `ExamplesPath` type, likely generated by the + [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen). + This needs to contain two splits labeled `train` and `eval`. _required_ + # examples: Forwards compatibility alias for the `input_data` argument. + Returns: + output: `ExampleStatistics` channel for statistics of each split + provided in the input examples. + """ + + import json + import os + from google.protobuf import json_format + from tfx.types import standard_artifacts + from tfx.types import channel_utils + + # Create input dict. + # Recovering splits + splits = sorted(os.listdir(input_data_path)) + input_data_artifacts = [] + for split in splits: + artifact = standard_artifacts.Examples() + artifact.uri = os.path.join(input_data_path, split) + input_data_artifacts.append(artifact) + input_data_channel = channel_utils.as_channel(input_data_artifacts) + + from tfx.components.statistics_gen.component import StatisticsGen + component_class_instance = StatisticsGen( + input_data=input_data_channel, + ) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for output_artifact in output_dict['output']: + output_artifact.uri = os.path.join(output_path, output_artifact.split) # Default split is '' + + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + import argparse + _parser = argparse.ArgumentParser(prog='Statisticsgen', description='Construct a StatisticsGen component.\n\n Args:\n input_data: A Channel of `ExamplesPath` type, likely generated by the\n [ExampleGen 
component](https://www.tensorflow.org/tfx/guide/examplegen).\n This needs to contain two splits labeled `train` and `eval`. _required_\n # examples: Forwards compatibility alias for the `input_data` argument.\n Returns:\n output: `ExampleStatistics` channel for statistics of each split\n provided in the input examples.\n') + _parser.add_argument("--input-data", dest="input_data_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--output", dest="output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = StatisticsGen(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + args: + - --input-data + - inputPath: input_data + - --output + - outputPath: output From fa7374cd6257d9af63f56b220ccef9258dd360c6 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Fri, 1 Nov 2019 18:27:15 -0700 Subject: [PATCH 11/26] Fixed StatisticsGen and SchemaGen Input artifacts must have splits. Split URIs should end with "/'. The ciomponents now work. Also printing component_class_instance for debugging. 
--- components/tfx/SchemaGen.component.yaml | 13 +++++++++---- components/tfx/SchemaGen.py | 12 ++++++++---- components/tfx/StatisticsGen.component.yaml | 10 ++++++++-- components/tfx/StatisticsGen.py | 9 +++++++-- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/components/tfx/SchemaGen.component.yaml b/components/tfx/SchemaGen.component.yaml index c41e8da2459..d747ab8f5f5 100644 --- a/components/tfx/SchemaGen.component.yaml +++ b/components/tfx/SchemaGen.component.yaml @@ -80,19 +80,21 @@ implementation: from tfx.types import channel_utils # Create input dict. - # Recovering splits input_base_path = stats_path + input_artifact_class = standard_artifacts.ExampleStatistics + # Recovering splits splits = sorted(os.listdir(input_base_path)) input_data_artifacts = [] for split in splits: - artifact = standard_artifacts.ExampleStatistics() - artifact.uri = os.path.join(input_base_path, split) + artifact = input_artifact_class() + artifact.split = split + artifact.uri = os.path.join(input_base_path, split) + '/' input_data_artifacts.append(artifact) input_data_channel = channel_utils.as_channel(input_data_artifacts) from tfx.components.schema_gen.component import SchemaGen component_class_instance = SchemaGen( - input_data=input_data_channel, + stats=input_data_channel, ) input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} @@ -103,12 +105,15 @@ implementation: for output_artifact in output_dict['output']: output_artifact.uri = os.path.join(output_path, output_artifact.split) # Default split is '' + print('component instance: ' + str(component_class_instance)) + executor = component_class_instance.executor_spec.executor_class() executor.Do( input_dict=input_dict, output_dict=output_dict, exec_properties=exec_properties, ) + #return (output_path,) def _deserialize_bool(s) -> bool: from distutils.util import strtobool diff --git a/components/tfx/SchemaGen.py b/components/tfx/SchemaGen.py index 
81f4a24e77e..634031228f3 100644 --- a/components/tfx/SchemaGen.py +++ b/components/tfx/SchemaGen.py @@ -35,19 +35,21 @@ def SchemaGen( from tfx.types import channel_utils # Create input dict. - # Recovering splits input_base_path = stats_path + input_artifact_class = standard_artifacts.ExampleStatistics + # Recovering splits splits = sorted(os.listdir(input_base_path)) input_data_artifacts = [] for split in splits: - artifact = standard_artifacts.ExampleStatistics() - artifact.uri = os.path.join(input_base_path, split) + artifact = input_artifact_class() + artifact.split = split + artifact.uri = os.path.join(input_base_path, split) + '/' input_data_artifacts.append(artifact) input_data_channel = channel_utils.as_channel(input_data_artifacts) from tfx.components.schema_gen.component import SchemaGen component_class_instance = SchemaGen( - input_data=input_data_channel, + stats=input_data_channel, ) input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} @@ -58,6 +60,8 @@ def SchemaGen( for output_artifact in output_dict['output']: output_artifact.uri = os.path.join(output_path, output_artifact.split) # Default split is '' + print('component instance: ' + str(component_class_instance)) + executor = component_class_instance.executor_spec.executor_class() executor.Do( input_dict=input_dict, diff --git a/components/tfx/StatisticsGen.component.yaml b/components/tfx/StatisticsGen.component.yaml index bc592a4d506..8d441f4c1bd 100644 --- a/components/tfx/StatisticsGen.component.yaml +++ b/components/tfx/StatisticsGen.component.yaml @@ -70,12 +70,15 @@ implementation: from tfx.types import channel_utils # Create input dict. 
+ input_base_path = input_data_path + input_artifact_class = standard_artifacts.Examples # Recovering splits splits = sorted(os.listdir(input_data_path)) input_data_artifacts = [] for split in splits: - artifact = standard_artifacts.Examples() - artifact.uri = os.path.join(input_data_path, split) + artifact = input_artifact_class() + artifact.split = split + artifact.uri = os.path.join(input_base_path, split) + '/' input_data_artifacts.append(artifact) input_data_channel = channel_utils.as_channel(input_data_artifacts) @@ -92,12 +95,15 @@ implementation: for output_artifact in output_dict['output']: output_artifact.uri = os.path.join(output_path, output_artifact.split) # Default split is '' + print('Component instance: ' + str(component_class_instance)) + executor = component_class_instance.executor_spec.executor_class() executor.Do( input_dict=input_dict, output_dict=output_dict, exec_properties=exec_properties, ) + #return (output_path,) import argparse _parser = argparse.ArgumentParser(prog='Statisticsgen', description='Construct a StatisticsGen component.\n\n Args:\n input_data: A Channel of `ExamplesPath` type, likely generated by the\n [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen).\n This needs to contain two splits labeled `train` and `eval`. _required_\n # examples: Forwards compatibility alias for the `input_data` argument.\n Returns:\n output: `ExampleStatistics` channel for statistics of each split\n provided in the input examples.\n') diff --git a/components/tfx/StatisticsGen.py b/components/tfx/StatisticsGen.py index d4137c25b0f..b43815ec977 100644 --- a/components/tfx/StatisticsGen.py +++ b/components/tfx/StatisticsGen.py @@ -32,12 +32,15 @@ def StatisticsGen( from tfx.types import channel_utils # Create input dict. 
+ input_base_path = input_data_path + input_artifact_class = standard_artifacts.Examples # Recovering splits splits = sorted(os.listdir(input_data_path)) input_data_artifacts = [] for split in splits: - artifact = standard_artifacts.Examples() - artifact.uri = os.path.join(input_data_path, split) + artifact = input_artifact_class() + artifact.split = split + artifact.uri = os.path.join(input_base_path, split) + '/' input_data_artifacts.append(artifact) input_data_channel = channel_utils.as_channel(input_data_artifacts) @@ -54,6 +57,8 @@ def StatisticsGen( for output_artifact in output_dict['output']: output_artifact.uri = os.path.join(output_path, output_artifact.split) # Default split is '' + print('Component instance: ' + str(component_class_instance)) + executor = component_class_instance.executor_spec.executor_class() executor.Do( input_dict=input_dict, From a9e784ec93c570527a80c0ad1ac9c6a1319f75f3 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Fri, 1 Nov 2019 18:50:50 -0700 Subject: [PATCH 12/26] Printing component instance in CsvExampleGen --- components/tfx/CsvExampleGen.component.yaml | 2 ++ components/tfx/CsvExampleGen.py | 2 ++ components/tfx/CsvExampleGen_GCS.component.yaml | 2 ++ components/tfx/CsvExampleGen_GCS.py | 2 ++ 4 files changed, 8 insertions(+) diff --git a/components/tfx/CsvExampleGen.component.yaml b/components/tfx/CsvExampleGen.component.yaml index 29e745e05ae..b608826d948 100644 --- a/components/tfx/CsvExampleGen.component.yaml +++ b/components/tfx/CsvExampleGen.component.yaml @@ -120,6 +120,8 @@ implementation: if output_artifact.split: output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) + print('component instance: ' + str(component_class_instance)) + executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() executor.Do( input_dict=input_dict, diff --git a/components/tfx/CsvExampleGen.py b/components/tfx/CsvExampleGen.py index 93adfd570b9..3a679aac42f 100644 --- a/components/tfx/CsvExampleGen.py +++ 
b/components/tfx/CsvExampleGen.py @@ -75,6 +75,8 @@ def CsvExampleGen( if output_artifact.split: output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) + print('component instance: ' + str(component_class_instance)) + executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() executor.Do( input_dict=input_dict, diff --git a/components/tfx/CsvExampleGen_GCS.component.yaml b/components/tfx/CsvExampleGen_GCS.component.yaml index b500938787e..13ef94187b0 100644 --- a/components/tfx/CsvExampleGen_GCS.component.yaml +++ b/components/tfx/CsvExampleGen_GCS.component.yaml @@ -108,6 +108,8 @@ implementation: if output_artifact.split: output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) + print('component instance: ' + str(component_class_instance)) + executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() executor.Do( input_dict=input_dict, diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/CsvExampleGen_GCS.py index 3c2300a4da8..4fb239481bb 100644 --- a/components/tfx/CsvExampleGen_GCS.py +++ b/components/tfx/CsvExampleGen_GCS.py @@ -76,6 +76,8 @@ def CsvExampleGen_GCS( # if output_artifact.split: output_artifact.uri = os.path.join(output_artifact.uri, output_artifact.split) + print('component instance: ' + str(component_class_instance)) + executor = CsvExampleGen.EXECUTOR_SPEC.executor_class() executor.Do( input_dict=input_dict, From 3a1159a0b361ed495532918158554df62e77f38e Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Fri, 1 Nov 2019 19:08:17 -0700 Subject: [PATCH 13/26] Moved components to directories --- .../{CsvExampleGen.py => ExampleGen/CsvExampleGen/component.py} | 0 .../CsvExampleGen/component.yaml} | 0 .../CsvExampleGen/with_URI_IO/component.py} | 0 .../CsvExampleGen/with_URI_IO/component.yaml} | 0 components/tfx/{SchemaGen.py => SchemaGen/component.py} | 0 .../tfx/{SchemaGen.component.yaml => SchemaGen/component.yaml} | 0 components/tfx/{StatisticsGen.py => StatisticsGen/component.py} | 0 
.../component.yaml} | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename components/tfx/{CsvExampleGen.py => ExampleGen/CsvExampleGen/component.py} (100%) rename components/tfx/{CsvExampleGen.component.yaml => ExampleGen/CsvExampleGen/component.yaml} (100%) rename components/tfx/{CsvExampleGen_GCS.py => ExampleGen/CsvExampleGen/with_URI_IO/component.py} (100%) rename components/tfx/{CsvExampleGen_GCS.component.yaml => ExampleGen/CsvExampleGen/with_URI_IO/component.yaml} (100%) rename components/tfx/{SchemaGen.py => SchemaGen/component.py} (100%) rename components/tfx/{SchemaGen.component.yaml => SchemaGen/component.yaml} (100%) rename components/tfx/{StatisticsGen.py => StatisticsGen/component.py} (100%) rename components/tfx/{StatisticsGen.component.yaml => StatisticsGen/component.yaml} (100%) diff --git a/components/tfx/CsvExampleGen.py b/components/tfx/ExampleGen/CsvExampleGen/component.py similarity index 100% rename from components/tfx/CsvExampleGen.py rename to components/tfx/ExampleGen/CsvExampleGen/component.py diff --git a/components/tfx/CsvExampleGen.component.yaml b/components/tfx/ExampleGen/CsvExampleGen/component.yaml similarity index 100% rename from components/tfx/CsvExampleGen.component.yaml rename to components/tfx/ExampleGen/CsvExampleGen/component.yaml diff --git a/components/tfx/CsvExampleGen_GCS.py b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py similarity index 100% rename from components/tfx/CsvExampleGen_GCS.py rename to components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py diff --git a/components/tfx/CsvExampleGen_GCS.component.yaml b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.yaml similarity index 100% rename from components/tfx/CsvExampleGen_GCS.component.yaml rename to components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.yaml diff --git a/components/tfx/SchemaGen.py b/components/tfx/SchemaGen/component.py similarity index 100% rename from components/tfx/SchemaGen.py rename 
to components/tfx/SchemaGen/component.py diff --git a/components/tfx/SchemaGen.component.yaml b/components/tfx/SchemaGen/component.yaml similarity index 100% rename from components/tfx/SchemaGen.component.yaml rename to components/tfx/SchemaGen/component.yaml diff --git a/components/tfx/StatisticsGen.py b/components/tfx/StatisticsGen/component.py similarity index 100% rename from components/tfx/StatisticsGen.py rename to components/tfx/StatisticsGen/component.py diff --git a/components/tfx/StatisticsGen.component.yaml b/components/tfx/StatisticsGen/component.yaml similarity index 100% rename from components/tfx/StatisticsGen.component.yaml rename to components/tfx/StatisticsGen/component.yaml From 5645997a498481e79b1971285317fac2220c6471 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Fri, 1 Nov 2019 19:41:56 -0700 Subject: [PATCH 14/26] Updated the sample TFX pipeline --- components/tfx/TFX_pipeline.ipynb | 211 --------------------- components/tfx/_samples/TFX_pipeline.ipynb | 105 ++++++++++ 2 files changed, 105 insertions(+), 211 deletions(-) delete mode 100644 components/tfx/TFX_pipeline.ipynb create mode 100644 components/tfx/_samples/TFX_pipeline.ipynb diff --git a/components/tfx/TFX_pipeline.ipynb b/components/tfx/TFX_pipeline.ipynb deleted file mode 100644 index 0290c6f6421..00000000000 --- a/components/tfx/TFX_pipeline.ipynb +++ /dev/null @@ -1,211 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import NamedTuple\n", - "\n", - "def CsvExampleGen_GCS( #\n", - " # Inputs\n", - " #input_base_path: InputPath('ExternalPath'),\n", - " input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required).\n", - "\n", - " # Outputs\n", - " #output_examples_path: OutputPath('ExamplesPath'),\n", - " output_examples_path: 'ExamplesPath',\n", - "\n", - " # Execution properties\n", - 
" #input_config_splits: {'List' : {'item_type': 'ExampleGen.Input.Split'}},\n", - " input_config: 'ExampleGen.Input' = '{\"splits\": []}', # JSON-serialized example_gen_pb2.Input instance, providing input configuration. If unset, the files under input_base will be treated as a single split.\n", - " #output_config_splits: {'List' : {'item_type': 'ExampleGen.SplitConfig'}},\n", - " output_config: 'ExampleGen.Output' = '{\"splitConfig\": {\"splits\": []}}', # JSON-serialized example_gen_pb2.Output instance, providing output configuration. If unset, default splits will be 'train' and 'eval' with size 2:1.\n", - " #custom_config: 'ExampleGen.CustomConfig' = None,\n", - ") -> NamedTuple('Outputs', [\n", - " ('examples_path', 'ExamplesPath'),\n", - "]):\n", - " \"\"\"Executes the CsvExampleGen component.\n", - "\n", - " Args:\n", - " input_base: A Channel of 'ExternalPath' type, which includes one artifact\n", - " whose uri is an external directory with csv files inside (required).\n", - " input_config: An example_gen_pb2.Input instance, providing input\n", - " configuration. If unset, the files under input_base will be treated as a\n", - " single split.\n", - " output_config: An example_gen_pb2.Output instance, providing output\n", - " configuration. If unset, default splits will be 'train' and 'eval' with\n", - " size 2:1.\n", - " ??? example_artifacts: Optional channel of 'ExamplesPath' for output train and\n", - " eval examples.\n", - " ??? input: Forwards compatibility alias for the 'input_base' argument.\n", - " ??? instance_name: Optional unique instance name. 
Necessary if multiple\n", - " CsvExampleGen components are declared in the same pipeline.\n", - " \"\"\"\n", - "\n", - " import json\n", - " import os\n", - " from google.protobuf import json_format\n", - " from tfx.components.example_gen import utils\n", - " from tfx.components.example_gen.csv_example_gen.component import CsvExampleGen\n", - " from tfx.proto import example_gen_pb2\n", - " from tfx.types import standard_artifacts\n", - "\n", - " # Create input dict.\n", - " # input_dict['input_base'] always has a single entry\n", - " input_base = standard_artifacts.ExternalArtifact()\n", - " input_base.uri = input_base_path\n", - " input_dict = {\n", - " 'input_base': [input_base],\n", - " }\n", - "\n", - " # Create output dict.\n", - " input_config_obj = example_gen_pb2.Input()\n", - " output_config_obj = example_gen_pb2.Output()\n", - " json_format.Parse(input_config, input_config_obj)\n", - " json_format.Parse(output_config, output_config_obj)\n", - " split_names = utils.generate_output_split_names(input_config_obj, output_config_obj)\n", - " output_dict_examples = []\n", - " for split_name in split_names:\n", - " output_split_examples = standard_artifacts.Examples(split=split_name)\n", - " output_split_examples.uri = os.path.join(output_examples_path, split_name)\n", - " output_dict_examples.append(output_split_examples)\n", - " output_dict = {\n", - " 'examples': output_dict_examples,\n", - " }\n", - "\n", - " # Create exec proterties.\n", - " exec_properties = {\n", - " 'input_config': input_config,\n", - " 'output_config': output_config\n", - " }\n", - "\n", - " executor = CsvExampleGen.EXECUTOR_SPEC.executor_class()\n", - " executor.Do(\n", - " input_dict=input_dict,\n", - " output_dict=output_dict,\n", - " exec_properties=exec_properties,\n", - " )\n", - "\n", - " return (output_examples_path,)\n", - "\n", - "if __name__ == '__main__':\n", - " import kfp\n", - " kfp.components.func_to_container_op(\n", - " CsvExampleGen_GCS,\n", - " 
base_image='tensorflow/tensorflow:1.14.0-py3',\n", - " packages_to_install=['tfx==0.14'],\n", - " output_component_file='CsvExampleGen_GCS.component.yaml'\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Experiment link here" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Run link here" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "RunPipelineResult(run_id=84697c60-f477-11e9-93ae-42010a800216)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#from .CsvExampleGen import CsvExampleGen_GCS\n", - "import kfp\n", - "import json\n", - "\n", - "CsvExampleGen_op = kfp.components.func_to_container_op(\n", - " func=CsvExampleGen_GCS,\n", - " base_image='tensorflow/tensorflow:1.14.0-py3',\n", - " packages_to_install=['tfx==0.14', 'six>=1.12.0'],\n", - " output_component_file='CsvExampleGen_GCS.component.yaml'\n", - ")\n", - "\n", - "output_path_template = 'gs://avolkov/tmp/tfx_pipeline/' + kfp.dsl.EXECUTION_ID_PLACEHOLDER\n", - "def tfx_pipeline():\n", - " CsvExampleGen_op(\n", - " input_base_path='gs://avolkov/tensorflow-tfx/tfx/components/testdata/external',\n", - " output_examples_path=output_path_template,\n", - " input_config=json.dumps({\n", - " \"splits\": [\n", - " {'name': 'data', 'pattern': 'csv/*.csv'},\n", - " ]\n", - " }),\n", - " output_config=json.dumps({\n", - " \"splitConfig\": {\n", - " \"splits\": [\n", - " {'name': 'train', 'hash_buckets': 2},\n", - " {'name': 'eval', 'hash_buckets': 1},\n", - " ]\n", - " }\n", - " }),\n", - " )\n", - "\n", - "from kfp.gcp import use_gcp_secret\n", - "pipeline_conf = kfp.dsl.PipelineConf()\n", - "pipeline_conf.add_op_transformer(use_gcp_secret('user-gcp-sa'))\n", - " \n", - 
"kfp.Client().create_run_from_pipeline_func(tfx_pipeline, arguments={}, pipeline_conf=pipeline_conf)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/components/tfx/_samples/TFX_pipeline.ipynb b/components/tfx/_samples/TFX_pipeline.ipynb new file mode 100644 index 00000000000..e5a93d1ae32 --- /dev/null +++ b/components/tfx/_samples/TFX_pipeline.ipynb @@ -0,0 +1,105 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TFX Components\n", + "\n", + "This notebook shows how to create pipeline that uses TFX components:\n", + "\n", + "* CsvExampleGen\n", + "* StatisticsGen\n", + "* SchemaGen" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import kfp\n", + "\n", + "# Initializing the client\n", + "client = kfp.Client()\n", + "\n", + "# ! 
Use kfp.Client(host='https://xxxxx.notebooks.googleusercontent.com/') if working from GCP notebooks (or local notebooks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from kfp.components import load_component_from_url\n", + "\n", + "download_from_gcs_op = load_component_from_url('https://raw.githubusercontent.com/Ark-kun/pipelines/290fa55/components/google-cloud/storage/download/component.yaml')\n", + "CsvExampleGen_op = load_component_from_url('https://raw.githubusercontent.com/Ark-kun/pipelines/3a1159a/components/tfx/ExampleGen/CsvExampleGen/component.yaml')\n", + "StatisticsGen_op = load_component_from_url('https://raw.githubusercontent.com/Ark-kun/pipelines/3a1159a/components/tfx/StatisticsGen/component.yaml')\n", + "SchemaGen_op = load_component_from_url('https://raw.githubusercontent.com/Ark-kun/pipelines/3a1159a/components/tfx/SchemaGen/component.yaml')\n", + "\n", + "def tfx_pipeline(\n", + " input_data_uri,\n", + "):\n", + " download_task = download_from_gcs_op(\n", + " input_data_uri,\n", + " )\n", + " examples_task = CsvExampleGen_op(\n", + " input_base=download_task.output,\n", + " input_config=json.dumps({\n", + " \"splits\": [\n", + " {'name': 'data', 'pattern': '*.csv'},\n", + " ]\n", + " }),\n", + " output_config=json.dumps({\n", + " \"splitConfig\": {\n", + " \"splits\": [\n", + " {'name': 'train', 'hash_buckets': 2},\n", + " {'name': 'eval', 'hash_buckets': 1},\n", + " ]\n", + " }\n", + " }),\n", + " )\n", + " \n", + " statistics_task = StatisticsGen_op(\n", + " examples_task.output,\n", + " )\n", + " \n", + " schema_task = SchemaGen_op(\n", + " statistics_task.output,\n", + " )\n", + " \n", + "client.create_run_from_pipeline_func(\n", + " tfx_pipeline,\n", + " arguments={\n", + " 'input_data_uri': 'gs://avolkov/tensorflow-tfx/tfx/components/testdata/external/csv',\n", + " },\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 
3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From eb8f28137dae8c1b72ab030ce4bc6ad02cf24b66 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Wed, 6 Nov 2019 22:18:15 -0800 Subject: [PATCH 15/26] Renamed ExamplesPath to Examples for data passing components --- components/tfx/ExampleGen/CsvExampleGen/component.py | 4 ++-- components/tfx/ExampleGen/CsvExampleGen/component.yaml | 4 ++-- components/tfx/StatisticsGen/component.py | 4 ++-- components/tfx/StatisticsGen/component.yaml | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/components/tfx/ExampleGen/CsvExampleGen/component.py b/components/tfx/ExampleGen/CsvExampleGen/component.py index 3a679aac42f..0ca94ae41b7 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/component.py +++ b/components/tfx/ExampleGen/CsvExampleGen/component.py @@ -6,7 +6,7 @@ def CsvExampleGen( #input_base_path: 'ExternalPath', # A Channel of 'ExternalPath' type, which includes one artifact whose uri is an external directory with csv files inside (required). # Outputs - example_artifacts_path: OutputPath('ExamplesPath'), + example_artifacts_path: OutputPath('Examples'), #example_artifacts_path: 'ExamplesPath', # Execution properties @@ -29,7 +29,7 @@ def CsvExampleGen( size 2:1. ??? input: Forwards compatibility alias for the 'input_base' argument. Returns: - example_artifacts: Artifact of type 'ExamplesPath' for output train and + example_artifacts: Artifact of type 'Examples' for output train and eval examples. 
""" diff --git a/components/tfx/ExampleGen/CsvExampleGen/component.yaml b/components/tfx/ExampleGen/CsvExampleGen/component.yaml index b608826d948..7ae96f73412 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/component.yaml +++ b/components/tfx/ExampleGen/CsvExampleGen/component.yaml @@ -4,7 +4,7 @@ inputs: - {name: input_config, optional: true, type: ExampleGen.Input} - {name: output_config, optional: true, type: ExampleGen.Output} outputs: -- {name: example_artifacts, type: ExamplesPath} +- {name: example_artifacts, type: Examples} description: | Executes the CsvExampleGen component. @@ -19,7 +19,7 @@ description: | size 2:1. ??? input: Forwards compatibility alias for the 'input_base' argument. Returns: - example_artifacts: Artifact of type 'ExamplesPath' for output train and + example_artifacts: Artifact of type 'Examples' for output train and eval examples. implementation: container: diff --git a/components/tfx/StatisticsGen/component.py b/components/tfx/StatisticsGen/component.py index b43815ec977..1fdad0d564a 100644 --- a/components/tfx/StatisticsGen/component.py +++ b/components/tfx/StatisticsGen/component.py @@ -3,7 +3,7 @@ def StatisticsGen( # Inputs - input_data_path: InputPath('ExamplesPath'), + input_data_path: InputPath('Examples'), #input_data_path: 'ExamplesPath', # Outputs @@ -16,7 +16,7 @@ def StatisticsGen( """Construct a StatisticsGen component. Args: - input_data: A Channel of `ExamplesPath` type, likely generated by the + input_data: A Channel of `Examples` type, likely generated by the [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen). This needs to contain two splits labeled `train` and `eval`. _required_ # examples: Forwards compatibility alias for the `input_data` argument. 
diff --git a/components/tfx/StatisticsGen/component.yaml b/components/tfx/StatisticsGen/component.yaml index 8d441f4c1bd..41bc2915019 100644 --- a/components/tfx/StatisticsGen/component.yaml +++ b/components/tfx/StatisticsGen/component.yaml @@ -1,7 +1,7 @@ name: Statisticsgen inputs: - name: input_data - type: ExamplesPath + type: Examples outputs: - name: output type: ExampleStatistics From f84d7c90bbfbf99455efe793b7074416c5f7b0ea Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Wed, 6 Nov 2019 22:32:32 -0800 Subject: [PATCH 16/26] Corrected output_component_file paths --- components/tfx/ExampleGen/CsvExampleGen/component.py | 2 +- .../tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py | 2 +- components/tfx/SchemaGen/component.py | 2 +- components/tfx/StatisticsGen/component.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/tfx/ExampleGen/CsvExampleGen/component.py b/components/tfx/ExampleGen/CsvExampleGen/component.py index 0ca94ae41b7..409fc6a38cb 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/component.py +++ b/components/tfx/ExampleGen/CsvExampleGen/component.py @@ -90,5 +90,5 @@ def CsvExampleGen( kfp.components.func_to_container_op( CsvExampleGen, base_image='tensorflow/tfx:0.15.0rc0', - output_component_file='CsvExampleGen.component.yaml' + output_component_file='component.yaml' ) diff --git a/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py index 4fb239481bb..9ac5e95096f 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py +++ b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py @@ -92,5 +92,5 @@ def CsvExampleGen_GCS( # kfp.components.func_to_container_op( CsvExampleGen_GCS, base_image='tensorflow/tfx:0.15.0rc0', - output_component_file='CsvExampleGen_GCS.component.yaml' + output_component_file='component.yaml' ) diff --git a/components/tfx/SchemaGen/component.py 
b/components/tfx/SchemaGen/component.py index 634031228f3..b975f094e80 100644 --- a/components/tfx/SchemaGen/component.py +++ b/components/tfx/SchemaGen/component.py @@ -76,5 +76,5 @@ def SchemaGen( kfp.components.func_to_container_op( SchemaGen, base_image='tensorflow/tfx:0.15.0rc0', - output_component_file='SchemaGen.component.yaml' + output_component_file='component.yaml' ) diff --git a/components/tfx/StatisticsGen/component.py b/components/tfx/StatisticsGen/component.py index 1fdad0d564a..abc93c4eef9 100644 --- a/components/tfx/StatisticsGen/component.py +++ b/components/tfx/StatisticsGen/component.py @@ -73,5 +73,5 @@ def StatisticsGen( kfp.components.func_to_container_op( StatisticsGen, base_image='tensorflow/tfx:0.15.0rc0', - output_component_file='StatisticsGen.component.yaml' + output_component_file='component.yaml' ) From 1cc4a0f781a1aa162eff7b594ba4b19a922d64cd Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 7 Nov 2019 00:25:40 -0800 Subject: [PATCH 17/26] Added the Transform component The component uses almost completely generic code. 
--- components/tfx/Transform/component.py | 139 +++++++++++++ components/tfx/Transform/component.yaml | 264 ++++++++++++++++++++++++ 2 files changed, 403 insertions(+) create mode 100644 components/tfx/Transform/component.py create mode 100644 components/tfx/Transform/component.yaml diff --git a/components/tfx/Transform/component.py b/components/tfx/Transform/component.py new file mode 100644 index 00000000000..ac273887bfe --- /dev/null +++ b/components/tfx/Transform/component.py @@ -0,0 +1,139 @@ +from kfp.components import InputPath, OutputPath + + +def Transform( + input_data_path: InputPath('Examples'), + #examples: InputPath('Examples'), + schema_path: InputPath('Schema'), + + transform_output_path: OutputPath('TransformGraph'), + #transform_graph_path: OutputPath('TransformGraph'), + transformed_examples_path: OutputPath('Examples'), + + module_file: 'Uri' = None, + preprocessing_fn: str = None, +): + """A TFX component to transform the input examples. + + The Transform component wraps TensorFlow Transform (tf.Transform) to + preprocess data in a TFX pipeline. This component will load the + preprocessing_fn from input module file, preprocess both 'train' and 'eval' + splits of input examples, generate the `tf.Transform` output, and save both + transform function and transformed examples to orchestrator desired locations. + + ## Providing a preprocessing function + The TFX executor will use the estimator provided in the `module_file` file + to train the model. The Transform executor will look specifically for the + `preprocessing_fn()` function within that file. + + An example of `preprocessing_fn()` can be found in the [user-supplied + code]((https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)) + of the TFX Chicago Taxi pipeline example. + + Args: + input_data: A Channel of 'Examples' type (required). This should + contain the two splits 'train' and 'eval'. 
+ #examples: Forwards compatibility alias for the 'input_data' argument. + schema: A Channel of 'SchemaPath' type. This should contain a single + schema artifact. + module_file: The file path to a python module file, from which the + 'preprocessing_fn' function will be loaded. The function must have the + following signature. + + def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]: + ... + + where the values of input and returned Dict are either tf.Tensor or + tf.SparseTensor. Exactly one of 'module_file' or 'preprocessing_fn' + must be supplied. + preprocessing_fn: The path to python function that implements a + 'preprocessing_fn'. See 'module_file' for expected signature of the + function. Exactly one of 'module_file' or 'preprocessing_fn' must + be supplied. + + Returns: + transform_output: Optional output 'TransformPath' channel for output of + 'tf.Transform', which includes an exported Tensorflow graph suitable for + both training and serving; + transformed_examples: Optional output 'ExamplesPath' channel for + materialized transformed examples, which includes both 'train' and + 'eval' splits. + + Raises: + ValueError: When both or neither of 'module_file' and 'preprocessing_fn' + is supplied. 
+ """ + from tfx.components.transform.component import Transform + component_class = Transform + input_channels_with_splits = {'input_data', 'examples'} + output_channels_with_splits = {'transformed_examples'} + + + import json + import os + import tfx + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # Maybe FIX: execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? 
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + Transform, + base_image='tensorflow/tfx:0.15.0rc0', + output_component_file='component.yaml' + ) diff --git a/components/tfx/Transform/component.yaml b/components/tfx/Transform/component.yaml new file mode 100644 index 00000000000..ed021a553e0 --- /dev/null +++ b/components/tfx/Transform/component.yaml @@ -0,0 +1,264 @@ +name: Transform +description: | + A TFX component to transform the input examples. + + The Transform component wraps TensorFlow Transform (tf.Transform) to + preprocess data in a TFX pipeline. This component will load the + preprocessing_fn from input module file, preprocess both 'train' and 'eval' + splits of input examples, generate the `tf.Transform` output, and save both + transform function and transformed examples to orchestrator desired locations. 
+ + ## Providing a preprocessing function + The TFX executor will use the estimator provided in the `module_file` file + to train the model. The Transform executor will look specifically for the + `preprocessing_fn()` function within that file. + + An example of `preprocessing_fn()` can be found in the [user-supplied + code]((https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)) + of the TFX Chicago Taxi pipeline example. + + Args: + input_data: A Channel of 'Examples' type (required). This should + contain the two splits 'train' and 'eval'. + #examples: Forwards compatibility alias for the 'input_data' argument. + schema: A Channel of 'SchemaPath' type. This should contain a single + schema artifact. + module_file: The file path to a python module file, from which the + 'preprocessing_fn' function will be loaded. The function must have the + following signature. + + def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]: + ... + + where the values of input and returned Dict are either tf.Tensor or + tf.SparseTensor. Exactly one of 'module_file' or 'preprocessing_fn' + must be supplied. + preprocessing_fn: The path to python function that implements a + 'preprocessing_fn'. See 'module_file' for expected signature of the + function. Exactly one of 'module_file' or 'preprocessing_fn' must + be supplied. + + Returns: + transform_output: Optional output 'TransformPath' channel for output of + 'tf.Transform', which includes an exported Tensorflow graph suitable for + both training and serving; + transformed_examples: Optional output 'ExamplesPath' channel for + materialized transformed examples, which includes both 'train' and + 'eval' splits. + + Raises: + ValueError: When both or neither of 'module_file' and 'preprocessing_fn' + is supplied. 
+inputs: +- name: input_data + type: Examples +- name: schema + type: Schema +- name: module_file + type: Uri + optional: true +- name: preprocessing_fn + type: String + optional: true +outputs: +- name: transform_output + type: TransformGraph +- name: transformed_examples + type: Examples +implementation: + container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + class InputPath: + '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.''' + def __init__(self, type=None): + self.type = type + + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + def Transform( + input_data_path: InputPath('Examples'), + #examples: InputPath('Examples'), + schema_path: InputPath('Schema'), + + transform_output_path: OutputPath('TransformGraph'), + #transform_graph_path: OutputPath('TransformGraph'), + transformed_examples_path: OutputPath('Examples'), + + module_file: 'Uri' = None, + preprocessing_fn: str = None, + ): + """A TFX component to transform the input examples. + + The Transform component wraps TensorFlow Transform (tf.Transform) to + preprocess data in a TFX pipeline. This component will load the + preprocessing_fn from input module file, preprocess both 'train' and 'eval' + splits of input examples, generate the `tf.Transform` output, and save both + transform function and transformed examples to orchestrator desired locations. 
+ + ## Providing a preprocessing function + The TFX executor will use the estimator provided in the `module_file` file + to train the model. The Transform executor will look specifically for the + `preprocessing_fn()` function within that file. + + An example of `preprocessing_fn()` can be found in the [user-supplied + code]((https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)) + of the TFX Chicago Taxi pipeline example. + + Args: + input_data: A Channel of 'Examples' type (required). This should + contain the two splits 'train' and 'eval'. + #examples: Forwards compatibility alias for the 'input_data' argument. + schema: A Channel of 'SchemaPath' type. This should contain a single + schema artifact. + module_file: The file path to a python module file, from which the + 'preprocessing_fn' function will be loaded. The function must have the + following signature. + + def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]: + ... + + where the values of input and returned Dict are either tf.Tensor or + tf.SparseTensor. Exactly one of 'module_file' or 'preprocessing_fn' + must be supplied. + preprocessing_fn: The path to python function that implements a + 'preprocessing_fn'. See 'module_file' for expected signature of the + function. Exactly one of 'module_file' or 'preprocessing_fn' must + be supplied. + + Returns: + transform_output: Optional output 'TransformPath' channel for output of + 'tf.Transform', which includes an exported Tensorflow graph suitable for + both training and serving; + transformed_examples: Optional output 'ExamplesPath' channel for + materialized transformed examples, which includes both 'train' and + 'eval' splits. + + Raises: + ValueError: When both or neither of 'module_file' and 'preprocessing_fn' + is supplied. 
+ """ + from tfx.components.transform.component import Transform + component_class = Transform + input_channels_with_splits = {'input_data', 'examples'} + output_channels_with_splits = {'transformed_examples'} + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? 
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + import argparse + _parser = argparse.ArgumentParser(prog='Transform', description="A TFX component to transform the input examples.\n\n The Transform component wraps TensorFlow Transform (tf.Transform) to\n preprocess data in a TFX pipeline. This component will load the\n preprocessing_fn from input module file, preprocess both 'train' and 'eval'\n splits of input examples, generate the `tf.Transform` output, and save both\n transform function and transformed examples to orchestrator desired locations.\n\n ## Providing a preprocessing function\n The TFX executor will use the estimator provided in the `module_file` file\n to train the model. 
The Transform executor will look specifically for the\n `preprocessing_fn()` function within that file.\n\n An example of `preprocessing_fn()` can be found in the [user-supplied\n code]((https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py))\n of the TFX Chicago Taxi pipeline example.\n\n Args:\n input_data: A Channel of 'Examples' type (required). This should\n contain the two splits 'train' and 'eval'.\n #examples: Forwards compatibility alias for the 'input_data' argument.\n schema: A Channel of 'SchemaPath' type. This should contain a single\n schema artifact.\n module_file: The file path to a python module file, from which the\n 'preprocessing_fn' function will be loaded. The function must have the\n following signature.\n\n def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:\n ...\n\n where the values of input and returned Dict are either tf.Tensor or\n tf.SparseTensor. Exactly one of 'module_file' or 'preprocessing_fn'\n must be supplied.\n preprocessing_fn: The path to python function that implements a\n 'preprocessing_fn'. See 'module_file' for expected signature of the\n function. 
Exactly one of 'module_file' or 'preprocessing_fn' must\n be supplied.\n\n Returns:\n transform_output: Optional output 'TransformPath' channel for output of\n 'tf.Transform', which includes an exported Tensorflow graph suitable for\n both training and serving;\n transformed_examples: Optional output 'ExamplesPath' channel for\n materialized transformed examples, which includes both 'train' and\n 'eval' splits.\n\n Raises:\n ValueError: When both or neither of 'module_file' and 'preprocessing_fn'\n is supplied.\n") + _parser.add_argument("--input-data", dest="input_data_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--schema", dest="schema_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--module-file", dest="module_file", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--preprocessing-fn", dest="preprocessing_fn", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--transform-output", dest="transform_output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--transformed-examples", dest="transformed_examples_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = Transform(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + args: + - --input-data + - inputPath: input_data + - --schema + - inputPath: schema + - if: + cond: + isPresent: module_file + then: + - --module-file + - inputValue: module_file + - if: + cond: 
+ isPresent: preprocessing_fn + then: + - --preprocessing-fn + - inputValue: preprocessing_fn + - --transform-output + - outputPath: transform_output + - --transformed-examples + - outputPath: transformed_examples From 7cc33500eb9e4b37a6b1f24d4e845cef91703d89 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 7 Nov 2019 00:50:59 -0800 Subject: [PATCH 18/26] Added the Trainer component --- components/tfx/Trainer/component.py | 171 +++++++++++++ components/tfx/Trainer/component.yaml | 347 ++++++++++++++++++++++++++ 2 files changed, 518 insertions(+) create mode 100644 components/tfx/Trainer/component.py create mode 100644 components/tfx/Trainer/component.yaml diff --git a/components/tfx/Trainer/component.py b/components/tfx/Trainer/component.py new file mode 100644 index 00000000000..b672890f5d0 --- /dev/null +++ b/components/tfx/Trainer/component.py @@ -0,0 +1,171 @@ +from kfp.components import InputPath, OutputPath + + +def Trainer( + examples_path: InputPath('Examples'), + transform_output_path: InputPath('TransformGraph'), # ? = None + #transform_graph_path: InputPath('TransformGraph'), + schema_path: InputPath('Schema'), + + output_path: OutputPath('Model'), + + module_file: str = None, + trainer_fn: str = None, + train_args: 'JsonObject: tfx.proto.trainer_pb2.TrainArgs' = None, + eval_args: 'JsonObject: tfx.proto.trainer_pb2.EvalArgs' = None, + #custom_config: dict = None, + #custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None, +): + """ + A TFX component to train a TensorFlow model. + + The Trainer component is used to train and eval a model using given inputs and + a user-supplied estimator. This component includes a custom driver to + optionally grab previous model to warm start from. + + ## Providing an estimator + The TFX executor will use the estimator provided in the `module_file` file + to train the model. The Trainer executor will look specifically for the + `trainer_fn()` function within that file. 
Before training, the executor will + call that function expecting the following returned as a dictionary: + + - estimator: The + [estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator) + to be used by TensorFlow to train the model. + - train_spec: The + [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/TrainSpec) + to be used by the "train" part of the TensorFlow `train_and_evaluate()` + call. + - eval_spec: The + [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/EvalSpec) + to be used by the "eval" part of the TensorFlow `train_and_evaluate()` call. + - eval_input_receiver_fn: The + [configuration](https://www.tensorflow.org/tfx/model_analysis/get_started#modify_an_existing_model) + to be used + by the [ModelValidator](https://www.tensorflow.org/tfx/guide/modelval) + component when validating the model. + + An example of `trainer_fn()` can be found in the [user-supplied + code]((https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)) + of the TFX Chicago Taxi pipeline example. + + + Args: + examples: A Channel of 'ExamplesPath' type, serving as the source of + examples that are used in training (required). May be raw or + transformed. + transform_output: An optional Channel of 'TransformPath' type, serving as + the input transform graph if present. + #transform_graph: Forwards compatibility alias for the 'transform_output' + # argument. + schema: A Channel of 'SchemaPath' type, serving as the schema of training + and eval data. + module_file: A path to python module file containing UDF model definition. + The module_file must implement a function named `trainer_fn` at its + top level. The function must have the following signature. + + def trainer_fn(tf.contrib.training.HParams, + tensorflow_metadata.proto.v0.schema_pb2) -> Dict: + ... + + where the returned Dict has the following key-values. 
+ 'estimator': an instance of tf.estimator.Estimator + 'train_spec': an instance of tf.estimator.TrainSpec + 'eval_spec': an instance of tf.estimator.EvalSpec + 'eval_input_receiver_fn': an instance of tfma.export.EvalInputReceiver + + Exactly one of 'module_file' or 'trainer_fn' must be supplied. + trainer_fn: A python path to UDF model definition function. See + 'module_file' for the required signature of the UDF. + Exactly one of 'module_file' or 'trainer_fn' must be supplied. + train_args: A trainer_pb2.TrainArgs instance, containing args used for + training. Current only num_steps is available. + eval_args: A trainer_pb2.EvalArgs instance, containing args used for eval. + Current only num_steps is available. + #custom_config: A dict which contains the training job parameters to be + # passed to Google Cloud ML Engine. For the full set of parameters + # supported by Google Cloud ML Engine, refer to + # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job + #custom_executor_spec: Optional custom executor spec. + Returns: + output: Optional 'ModelExportPath' channel for result of exported models. + Raises: + ValueError: + - When both or neither of 'module_file' and 'trainer_fn' is supplied. + - When both or neither of 'examples' and 'transformed_examples' + is supplied. + - When 'transformed_examples' is supplied but 'transform_output' + is not supplied. 
+ """ + from tfx.components.trainer.component import Trainer + component_class = Trainer + input_channels_with_splits = {'examples'} + output_channels_with_splits = {} + + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? 
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + Trainer, + base_image='tensorflow/tfx:0.15.0rc0', + output_component_file='component.yaml' + ) diff --git a/components/tfx/Trainer/component.yaml b/components/tfx/Trainer/component.yaml new file mode 100644 index 00000000000..93791114755 --- /dev/null +++ b/components/tfx/Trainer/component.yaml @@ -0,0 +1,347 @@ +name: Trainer +description: | + A TFX component to train a TensorFlow model. + + The Trainer component is used to train and eval a model using given inputs and + a user-supplied estimator. This component includes a custom driver to + optionally grab previous model to warm start from. + + ## Providing an estimator + The TFX executor will use the estimator provided in the `module_file` file + to train the model. The Trainer executor will look specifically for the + `trainer_fn()` function within that file. 
Before training, the executor will + call that function expecting the following returned as a dictionary: + + - estimator: The + [estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator) + to be used by TensorFlow to train the model. + - train_spec: The + [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/TrainSpec) + to be used by the "train" part of the TensorFlow `train_and_evaluate()` + call. + - eval_spec: The + [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/EvalSpec) + to be used by the "eval" part of the TensorFlow `train_and_evaluate()` call. + - eval_input_receiver_fn: The + [configuration](https://www.tensorflow.org/tfx/model_analysis/get_started#modify_an_existing_model) + to be used + by the [ModelValidator](https://www.tensorflow.org/tfx/guide/modelval) + component when validating the model. + + An example of `trainer_fn()` can be found in the [user-supplied + code]((https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)) + of the TFX Chicago Taxi pipeline example. + + + Args: + examples: A Channel of 'ExamplesPath' type, serving as the source of + examples that are used in training (required). May be raw or + transformed. + transform_output: An optional Channel of 'TransformPath' type, serving as + the input transform graph if present. + #transform_graph: Forwards compatibility alias for the 'transform_output' + # argument. + schema: A Channel of 'SchemaPath' type, serving as the schema of training + and eval data. + module_file: A path to python module file containing UDF model definition. + The module_file must implement a function named `trainer_fn` at its + top level. The function must have the following signature. + + def trainer_fn(tf.contrib.training.HParams, + tensorflow_metadata.proto.v0.schema_pb2) -> Dict: + ... + + where the returned Dict has the following key-values. 
+ 'estimator': an instance of tf.estimator.Estimator + 'train_spec': an instance of tf.estimator.TrainSpec + 'eval_spec': an instance of tf.estimator.EvalSpec + 'eval_input_receiver_fn': an instance of tfma.export.EvalInputReceiver + + Exactly one of 'module_file' or 'trainer_fn' must be supplied. + trainer_fn: A python path to UDF model definition function. See + 'module_file' for the required signature of the UDF. + Exactly one of 'module_file' or 'trainer_fn' must be supplied. + train_args: A trainer_pb2.TrainArgs instance, containing args used for + training. Current only num_steps is available. + eval_args: A trainer_pb2.EvalArgs instance, containing args used for eval. + Current only num_steps is available. + #custom_config: A dict which contains the training job parameters to be + # passed to Google Cloud ML Engine. For the full set of parameters + # supported by Google Cloud ML Engine, refer to + # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job + #custom_executor_spec: Optional custom executor spec. + Returns: + output: Optional 'ModelExportPath' channel for result of exported models. + Raises: + ValueError: + - When both or neither of 'module_file' and 'trainer_fn' is supplied. + - When both or neither of 'examples' and 'transformed_examples' + is supplied. + - When 'transformed_examples' is supplied but 'transform_output' + is not supplied. 
+inputs: +- name: examples + type: Examples +- name: transform_output + type: TransformGraph +- name: schema + type: Schema +- name: module_file + type: String + optional: true +- name: trainer_fn + type: String + optional: true +- name: train_args + type: 'JsonObject: tfx.proto.trainer_pb2.TrainArgs' + optional: true +- name: eval_args + type: 'JsonObject: tfx.proto.trainer_pb2.EvalArgs' + optional: true +outputs: +- name: output + type: Model +implementation: + container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + class InputPath: + '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.''' + def __init__(self, type=None): + self.type = type + + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + def Trainer( + examples_path: InputPath('Examples'), + transform_output_path: InputPath('TransformGraph'), # ? = None + #transform_graph_path: InputPath('TransformGraph'), + schema_path: InputPath('Schema'), + + output_path: OutputPath('Model'), + + module_file: str = None, + trainer_fn: str = None, + train_args: 'JsonObject: tfx.proto.trainer_pb2.TrainArgs' = None, + eval_args: 'JsonObject: tfx.proto.trainer_pb2.EvalArgs' = None, + #custom_config: dict = None, + #custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None, + ): + """ + A TFX component to train a TensorFlow model. 
+ + The Trainer component is used to train and eval a model using given inputs and + a user-supplied estimator. This component includes a custom driver to + optionally grab previous model to warm start from. + + ## Providing an estimator + The TFX executor will use the estimator provided in the `module_file` file + to train the model. The Trainer executor will look specifically for the + `trainer_fn()` function within that file. Before training, the executor will + call that function expecting the following returned as a dictionary: + + - estimator: The + [estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator) + to be used by TensorFlow to train the model. + - train_spec: The + [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/TrainSpec) + to be used by the "train" part of the TensorFlow `train_and_evaluate()` + call. + - eval_spec: The + [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/EvalSpec) + to be used by the "eval" part of the TensorFlow `train_and_evaluate()` call. + - eval_input_receiver_fn: The + [configuration](https://www.tensorflow.org/tfx/model_analysis/get_started#modify_an_existing_model) + to be used + by the [ModelValidator](https://www.tensorflow.org/tfx/guide/modelval) + component when validating the model. + + An example of `trainer_fn()` can be found in the [user-supplied + code]((https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)) + of the TFX Chicago Taxi pipeline example. + + Args: + examples: A Channel of 'ExamplesPath' type, serving as the source of + examples that are used in training (required). May be raw or + transformed. + transform_output: An optional Channel of 'TransformPath' type, serving as + the input transform graph if present. + #transform_graph: Forwards compatibility alias for the 'transform_output' + # argument. + schema: A Channel of 'SchemaPath' type, serving as the schema of training + and eval data. 
+ module_file: A path to python module file containing UDF model definition. + The module_file must implement a function named `trainer_fn` at its + top level. The function must have the following signature. + + def trainer_fn(tf.contrib.training.HParams, + tensorflow_metadata.proto.v0.schema_pb2) -> Dict: + ... + + where the returned Dict has the following key-values. + 'estimator': an instance of tf.estimator.Estimator + 'train_spec': an instance of tf.estimator.TrainSpec + 'eval_spec': an instance of tf.estimator.EvalSpec + 'eval_input_receiver_fn': an instance of tfma.export.EvalInputReceiver + + Exactly one of 'module_file' or 'trainer_fn' must be supplied. + trainer_fn: A python path to UDF model definition function. See + 'module_file' for the required signature of the UDF. + Exactly one of 'module_file' or 'trainer_fn' must be supplied. + train_args: A trainer_pb2.TrainArgs instance, containing args used for + training. Current only num_steps is available. + eval_args: A trainer_pb2.EvalArgs instance, containing args used for eval. + Current only num_steps is available. + #custom_config: A dict which contains the training job parameters to be + # passed to Google Cloud ML Engine. For the full set of parameters + # supported by Google Cloud ML Engine, refer to + # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job + #custom_executor_spec: Optional custom executor spec. + Returns: + output: Optional 'ModelExportPath' channel for result of exported models. + Raises: + ValueError: + - When both or neither of 'module_file' and 'trainer_fn' is supplied. + - When both or neither of 'examples' and 'transformed_examples' + is supplied. + - When 'transformed_examples' is supplied but 'transform_output' + is not supplied. 
+ """ + from tfx.components.trainer.component import Trainer + component_class = Trainer + input_channels_with_splits = {'examples'} + output_channels_with_splits = {} + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? 
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + import argparse + _parser = argparse.ArgumentParser(prog='Trainer', description='A TFX component to train a TensorFlow model.\n\n The Trainer component is used to train and eval a model using given inputs and\n a user-supplied estimator. This component includes a custom driver to\n optionally grab previous model to warm start from.\n\n ## Providing an estimator\n The TFX executor will use the estimator provided in the `module_file` file\n to train the model. The Trainer executor will look specifically for the\n `trainer_fn()` function within that file. 
Before training, the executor will\n call that function expecting the following returned as a dictionary:\n\n - estimator: The\n [estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)\n to be used by TensorFlow to train the model.\n - train_spec: The\n [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/TrainSpec)\n to be used by the "train" part of the TensorFlow `train_and_evaluate()`\n call.\n - eval_spec: The\n [configuration](https://www.tensorflow.org/api_docs/python/tf/estimator/EvalSpec)\n to be used by the "eval" part of the TensorFlow `train_and_evaluate()` call.\n - eval_input_receiver_fn: The\n [configuration](https://www.tensorflow.org/tfx/model_analysis/get_started#modify_an_existing_model)\n to be used\n by the [ModelValidator](https://www.tensorflow.org/tfx/guide/modelval)\n component when validating the model.\n\n An example of `trainer_fn()` can be found in the [user-supplied\n code]((https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py))\n of the TFX Chicago Taxi pipeline example.\n\n\n Args:\n examples: A Channel of \'ExamplesPath\' type, serving as the source of\n examples that are used in training (required). May be raw or\n transformed.\n transform_output: An optional Channel of \'TransformPath\' type, serving as\n the input transform graph if present.\n #transform_graph: Forwards compatibility alias for the \'transform_output\'\n # argument.\n schema: A Channel of \'SchemaPath\' type, serving as the schema of training\n and eval data.\n module_file: A path to python module file containing UDF model definition.\n The module_file must implement a function named `trainer_fn` at its\n top level. 
The function must have the following signature.\n\n def trainer_fn(tf.contrib.training.HParams,\n tensorflow_metadata.proto.v0.schema_pb2) -> Dict:\n ...\n\n where the returned Dict has the following key-values.\n \'estimator\': an instance of tf.estimator.Estimator\n \'train_spec\': an instance of tf.estimator.TrainSpec\n \'eval_spec\': an instance of tf.estimator.EvalSpec\n \'eval_input_receiver_fn\': an instance of tfma.export.EvalInputReceiver\n\n Exactly one of \'module_file\' or \'trainer_fn\' must be supplied.\n trainer_fn: A python path to UDF model definition function. See\n \'module_file\' for the required signature of the UDF.\n Exactly one of \'module_file\' or \'trainer_fn\' must be supplied.\n train_args: A trainer_pb2.TrainArgs instance, containing args used for\n training. Current only num_steps is available.\n eval_args: A trainer_pb2.EvalArgs instance, containing args used for eval.\n Current only num_steps is available.\n #custom_config: A dict which contains the training job parameters to be\n # passed to Google Cloud ML Engine. 
For the full set of parameters\n # supported by Google Cloud ML Engine, refer to\n # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job\n #custom_executor_spec: Optional custom executor spec.\n Returns:\n output: Optional \'ModelExportPath\' channel for result of exported models.\n Raises:\n ValueError:\n - When both or neither of \'module_file\' and \'trainer_fn\' is supplied.\n - When both or neither of \'examples\' and \'transformed_examples\'\n is supplied.\n - When \'transformed_examples\' is supplied but \'transform_output\'\n is not supplied.\n') + _parser.add_argument("--examples", dest="examples_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--transform-output", dest="transform_output_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--schema", dest="schema_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--module-file", dest="module_file", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--trainer-fn", dest="trainer_fn", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--train-args", dest="train_args", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--eval-args", dest="eval_args", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--output", dest="output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = Trainer(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + 
f.write(_output_serializers[idx](_outputs[idx])) + args: + - --examples + - inputPath: examples + - --transform-output + - inputPath: transform_output + - --schema + - inputPath: schema + - if: + cond: + isPresent: module_file + then: + - --module-file + - inputValue: module_file + - if: + cond: + isPresent: trainer_fn + then: + - --trainer-fn + - inputValue: trainer_fn + - if: + cond: + isPresent: train_args + then: + - --train-args + - inputValue: train_args + - if: + cond: + isPresent: eval_args + then: + - --eval-args + - inputValue: eval_args + - --output + - outputPath: output From 9f5fe9cb9de059c8fb17b5809767b96913d0b419 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 7 Nov 2019 01:22:55 -0800 Subject: [PATCH 19/26] Added the BigQueryExampleGen component --- .../BigQueryExampleGen/component.py | 105 ++++++++++ .../BigQueryExampleGen/component.yaml | 193 ++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 components/tfx/ExampleGen/BigQueryExampleGen/component.py create mode 100644 components/tfx/ExampleGen/BigQueryExampleGen/component.yaml diff --git a/components/tfx/ExampleGen/BigQueryExampleGen/component.py b/components/tfx/ExampleGen/BigQueryExampleGen/component.py new file mode 100644 index 00000000000..5ad5f13791d --- /dev/null +++ b/components/tfx/ExampleGen/BigQueryExampleGen/component.py @@ -0,0 +1,105 @@ +from kfp.components import InputPath, OutputPath + + +def BigQueryExampleGen( + example_artifacts_path: OutputPath('Examples'), + + query: str = None, + input_config: 'JsonObject: example_gen_pb2.Input' = None, + output_config: 'JsonObject: example_gen_pb2.Output' = None, +): + """ + Official TFX BigQueryExampleGen component. + + The BigQuery examplegen component takes a query, and generates train + and eval examples for downsteam components. + + + Args: + query: BigQuery sql string, query result will be treated as a single + split, can be overwritten by input_config. 
+ input_config: An example_gen_pb2.Input instance with Split.pattern as
+ BigQuery sql string. If set, it overwrites the 'query' arg, and allows
+ different queries per split.
+ output_config: An example_gen_pb2.Output instance, providing output
+ configuration. If unset, default splits will be 'train' and 'eval' with
+ size 2:1.
+ Returns:
+ example_artifacts: Optional channel of 'ExamplesPath' for output train and
+ eval examples.
+
+ Raises:
+ RuntimeError: Only one of query and input_config should be set.
+ """
+ from tfx.components.example_gen.big_query_example_gen.component import BigQueryExampleGen
+ component_class = BigQueryExampleGen
+ input_channels_with_splits = {}
+ output_channels_with_splits = {'example_artifacts'}
+
+
+ import json
+ import os
+ from google.protobuf import json_format, message
+ from tfx.types import Artifact, channel_utils
+
+ arguments = locals().copy()
+
+ component_class_args = {}
+
+ for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items():
+ argument_value_obj = argument_value = arguments.get(name, None)
+ if argument_value is None:
+ continue
+ parameter_type = execution_parameter.type
+ if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple
+ argument_value_obj = parameter_type()
+ json_format.Parse(argument_value, argument_value_obj)
+ component_class_args[name] = argument_value_obj
+
+ for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items():
+ artifact_path = arguments[name + '_path']
+ artifacts = []
+ if name in input_channels_with_splits:
+ # Recovering splits
+ splits = sorted(os.listdir(artifact_path))
+ for split in splits:
+ artifact = Artifact(type_name=channel_parameter.type_name)
+ artifact.split = split
+ artifact.uri = os.path.join(artifact_path, split) + '/'
+ artifacts.append(artifact)
+ else:
+ artifact = Artifact(type_name=channel_parameter.type_name)
+ artifact.uri = artifact_path + '/' # ?
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + BigQueryExampleGen, + base_image='tensorflow/tfx:0.15.0rc0', + output_component_file='component.yaml' + ) diff --git a/components/tfx/ExampleGen/BigQueryExampleGen/component.yaml b/components/tfx/ExampleGen/BigQueryExampleGen/component.yaml new file mode 100644 index 00000000000..721139ce34b --- /dev/null +++ b/components/tfx/ExampleGen/BigQueryExampleGen/component.yaml @@ -0,0 +1,193 @@ +name: Bigqueryexamplegen +description: | + Official TFX BigQueryExampleGen component. + + The BigQuery examplegen component takes a query, and generates train + and eval examples for downsteam components. + + + Args: + query: BigQuery sql string, query result will be treated as a single + split, can be overwritten by input_config. + input_config: An example_gen_pb2.Input instance with Split.pattern as + BigQuery sql string. 
If set, it overwrites the 'query' arg, and allows + different queries per split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + Returns: + example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + + Raises: + RuntimeError: Only one of query and input_config should be set. +inputs: +- name: query + type: String + optional: true +- name: input_config + type: 'JsonObject: example_gen_pb2.Input' + optional: true +- name: output_config + type: 'JsonObject: example_gen_pb2.Output' + optional: true +outputs: +- name: example_artifacts + type: Examples +implementation: + container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + def BigQueryExampleGen( + example_artifacts_path: OutputPath('Examples'), + + query: str = None, + input_config: 'JsonObject: example_gen_pb2.Input' = None, + output_config: 'JsonObject: example_gen_pb2.Output' = None, + ): + """ + Official TFX BigQueryExampleGen component. + + The BigQuery examplegen component takes a query, and generates train + and eval examples for downsteam components. + + Args: + query: BigQuery sql string, query result will be treated as a single + split, can be overwritten by input_config. + input_config: An example_gen_pb2.Input instance with Split.pattern as + BigQuery sql string. If set, it overwrites the 'query' arg, and allows + different queries per split. 
+ output_config: An example_gen_pb2.Output instance, providing output
+ configuration. If unset, default splits will be 'train' and 'eval' with
+ size 2:1.
+ Returns:
+ example_artifacts: Optional channel of 'ExamplesPath' for output train and
+ eval examples.
+
+ Raises:
+ RuntimeError: Only one of query and input_config should be set.
+ """
+ from tfx.components.example_gen.big_query_example_gen.component import BigQueryExampleGen
+ component_class = BigQueryExampleGen
+ input_channels_with_splits = {}
+ output_channels_with_splits = {'example_artifacts'}
+
+ import json
+ import os
+ from google.protobuf import json_format, message
+ from tfx.types import Artifact, channel_utils
+
+ arguments = locals().copy()
+
+ component_class_args = {}
+
+ for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items():
+ argument_value_obj = argument_value = arguments.get(name, None)
+ if argument_value is None:
+ continue
+ parameter_type = execution_parameter.type
+ if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple
+ argument_value_obj = parameter_type()
+ json_format.Parse(argument_value, argument_value_obj)
+ component_class_args[name] = argument_value_obj
+
+ for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items():
+ artifact_path = arguments[name + '_path']
+ artifacts = []
+ if name in input_channels_with_splits:
+ # Recovering splits
+ splits = sorted(os.listdir(artifact_path))
+ for split in splits:
+ artifact = Artifact(type_name=channel_parameter.type_name)
+ artifact.split = split
+ artifact.uri = os.path.join(artifact_path, split) + '/'
+ artifacts.append(artifact)
+ else:
+ artifact = Artifact(type_name=channel_parameter.type_name)
+ artifact.uri = artifact_path + '/' # ?
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + import argparse + _parser = argparse.ArgumentParser(prog='Bigqueryexamplegen', description="Official TFX BigQueryExampleGen component.\n\n The BigQuery examplegen component takes a query, and generates train\n and eval examples for downsteam components.\n\n\n Args:\n query: BigQuery sql string, query result will be treated as a single\n split, can be overwritten by input_config.\n input_config: An example_gen_pb2.Input instance with Split.pattern as\n BigQuery sql string. If set, it overwrites the 'query' arg, and allows\n different queries per split.\n output_config: An example_gen_pb2.Output instance, providing output\n configuration. 
If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n Returns:\n example_artifacts: Optional channel of 'ExamplesPath' for output train and\n eval examples.\n\n Raises:\n RuntimeError: Only one of query and input_config should be set.\n") + _parser.add_argument("--query", dest="query", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--input-config", dest="input_config", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--output-config", dest="output_config", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--example-artifacts", dest="example_artifacts_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = BigQueryExampleGen(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + args: + - if: + cond: + isPresent: query + then: + - --query + - inputValue: query + - if: + cond: + isPresent: input_config + then: + - --input-config + - inputValue: input_config + - if: + cond: + isPresent: output_config + then: + - --output-config + - inputValue: output_config + - --example-artifacts + - outputPath: example_artifacts From 91ec94d8a8442d1bc0c2149115e11cf1ac09347f Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 7 Nov 2019 01:26:32 -0800 Subject: [PATCH 20/26] Added the ImportExampleGen component --- .../ExampleGen/ImportExampleGen/component.py | 110 ++++++++++ .../ImportExampleGen/component.yaml | 202 ++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 
components/tfx/ExampleGen/ImportExampleGen/component.py create mode 100644 components/tfx/ExampleGen/ImportExampleGen/component.yaml diff --git a/components/tfx/ExampleGen/ImportExampleGen/component.py b/components/tfx/ExampleGen/ImportExampleGen/component.py new file mode 100644 index 00000000000..7f863e61b7d --- /dev/null +++ b/components/tfx/ExampleGen/ImportExampleGen/component.py @@ -0,0 +1,110 @@ +from kfp.components import InputPath, OutputPath + + +def ImportExampleGen( + input_base_path: InputPath('ExternalPath'), + #input_path: InputPath('ExternalPath'), + + example_artifacts_path: OutputPath('Examples'), + + input_config: 'JsonObject: example_gen_pb2.Input' = None, + output_config: 'JsonObject: example_gen_pb2.Output' = None, +): + """ + Official TFX ImportExampleGen component. + + The ImportExampleGen component takes TFRecord files with TF Example data + format, and generates train and eval examples for downsteam components. + This component provides consistent and configurable partition, and it also + shuffle the dataset for ML best practice. + + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with TFRecord files inside + (required). + #input: Forwards compatibility alias for the 'input_base' argument. + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + Returns: + example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + + Raises: + RuntimeError: Only one of query and input_config should be set. 
+ """ + from tfx.components.example_gen.import_example_gen.component import ImportExampleGen + component_class = ImportExampleGen + input_channels_with_splits = {} + output_channels_with_splits = {'example_artifacts'} + + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? 
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + ImportExampleGen, + base_image='tensorflow/tfx:0.15.0rc0', + output_component_file='component.yaml' + ) diff --git a/components/tfx/ExampleGen/ImportExampleGen/component.yaml b/components/tfx/ExampleGen/ImportExampleGen/component.yaml new file mode 100644 index 00000000000..87f4a2aa0ec --- /dev/null +++ b/components/tfx/ExampleGen/ImportExampleGen/component.yaml @@ -0,0 +1,202 @@ +name: Importexamplegen +description: | + Official TFX ImportExampleGen component. + + The ImportExampleGen component takes TFRecord files with TF Example data + format, and generates train and eval examples for downsteam components. + This component provides consistent and configurable partition, and it also + shuffle the dataset for ML best practice. 
+ + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with TFRecord files inside + (required). + #input: Forwards compatibility alias for the 'input_base' argument. + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + Returns: + example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + + Raises: + RuntimeError: Only one of query and input_config should be set. +inputs: +- name: input_base + type: ExternalPath +- name: input_config + type: 'JsonObject: example_gen_pb2.Input' + optional: true +- name: output_config + type: 'JsonObject: example_gen_pb2.Output' + optional: true +outputs: +- name: example_artifacts + type: Examples +implementation: + container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + class InputPath: + '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.''' + def __init__(self, type=None): + self.type = type + + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + def ImportExampleGen( + input_base_path: InputPath('ExternalPath'), + #input_path: InputPath('ExternalPath'), + + 
example_artifacts_path: OutputPath('Examples'), + + input_config: 'JsonObject: example_gen_pb2.Input' = None, + output_config: 'JsonObject: example_gen_pb2.Output' = None, + ): + """ + Official TFX ImportExampleGen component. + + The ImportExampleGen component takes TFRecord files with TF Example data + format, and generates train and eval examples for downsteam components. + This component provides consistent and configurable partition, and it also + shuffle the dataset for ML best practice. + + Args: + input_base: A Channel of 'ExternalPath' type, which includes one artifact + whose uri is an external directory with TFRecord files inside + (required). + #input: Forwards compatibility alias for the 'input_base' argument. + input_config: An example_gen_pb2.Input instance, providing input + configuration. If unset, the files under input_base will be treated as a + single split. + output_config: An example_gen_pb2.Output instance, providing output + configuration. If unset, default splits will be 'train' and 'eval' with + size 2:1. + Returns: + example_artifacts: Optional channel of 'ExamplesPath' for output train and + eval examples. + + Raises: + RuntimeError: Only one of query and input_config should be set. 
+ """ + from tfx.components.example_gen.import_example_gen.component import ImportExampleGen + component_class = ImportExampleGen + input_channels_with_splits = {} + output_channels_with_splits = {'example_artifacts'} + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? 
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + import argparse + _parser = argparse.ArgumentParser(prog='Importexamplegen', description="Official TFX ImportExampleGen component.\n\n The ImportExampleGen component takes TFRecord files with TF Example data\n format, and generates train and eval examples for downsteam components.\n This component provides consistent and configurable partition, and it also\n shuffle the dataset for ML best practice.\n\n Args:\n input_base: A Channel of 'ExternalPath' type, which includes one artifact\n whose uri is an external directory with TFRecord files inside\n (required).\n #input: Forwards compatibility alias for the 'input_base' argument.\n input_config: An example_gen_pb2.Input instance, providing input\n configuration. If unset, the files under input_base will be treated as a\n single split.\n output_config: An example_gen_pb2.Output instance, providing output\n configuration. 
If unset, default splits will be 'train' and 'eval' with\n size 2:1.\n Returns:\n example_artifacts: Optional channel of 'ExamplesPath' for output train and\n eval examples.\n\n Raises:\n RuntimeError: Only one of query and input_config should be set.\n") + _parser.add_argument("--input-base", dest="input_base_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--input-config", dest="input_config", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--output-config", dest="output_config", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--example-artifacts", dest="example_artifacts_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = ImportExampleGen(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + args: + - --input-base + - inputPath: input_base + - if: + cond: + isPresent: input_config + then: + - --input-config + - inputValue: input_config + - if: + cond: + isPresent: output_config + then: + - --output-config + - inputValue: output_config + - --example-artifacts + - outputPath: example_artifacts From 4892bbf9b4fa67ef85d3f4abf6a50e4dccf45c35 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 7 Nov 2019 01:11:37 -0800 Subject: [PATCH 21/26] Added the Evaluator component --- components/tfx/Evaluator/component.py | 128 +++++++++++++ components/tfx/Evaluator/component.yaml | 232 ++++++++++++++++++++++++ 2 files changed, 360 insertions(+) create mode 100644 components/tfx/Evaluator/component.py create mode 
100644 components/tfx/Evaluator/component.yaml diff --git a/components/tfx/Evaluator/component.py b/components/tfx/Evaluator/component.py new file mode 100644 index 00000000000..d26c904fa00 --- /dev/null +++ b/components/tfx/Evaluator/component.py @@ -0,0 +1,128 @@ +from kfp.components import InputPath, OutputPath + + +def Evaluator( + examples_path: InputPath('Examples'), + model_exports_path: InputPath('Model'), + #model_path: InputPath('Model'), + + output_path: OutputPath('ModelEval'), + + feature_slicing_spec: 'JsonObject: evaluator_pb2.FeatureSlicingSpec' = None, +): + """ + A TFX component to evaluate models trained by a TFX Trainer component. + + The Evaluator component performs model evaluations in the TFX pipeline and + the resultant metrics can be viewed in a Jupyter notebook. It uses the + input examples generated from the + [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen) + component to evaluate the models. + + Specifically, it can provide: + - metrics computed on entire training and eval dataset + - tracking metrics over time + - model quality performance on different feature slices + + ## Exporting the EvalSavedModel in Trainer + + In order to setup Evaluator in a TFX pipeline, an EvalSavedModel needs to be + exported during training, which is a special SavedModel containing + annotations for the metrics, features, labels, and so on in your model. + Evaluator uses this EvalSavedModel to compute metrics. + + As part of this, the Trainer component creates eval_input_receiver_fn, + analogous to the serving_input_receiver_fn, which will extract the features + and labels from the input data. As with serving_input_receiver_fn, there are + utility functions to help with this. + + Please see https://www.tensorflow.org/tfx/model_analysis for more details. + + Args: + examples: A Channel of 'ExamplesPath' type, usually produced by ExampleGen + component. @Ark-kun: Must have the eval split. 
_required_ + model_exports: A Channel of 'ModelExportPath' type, usually produced by + Trainer component. Will be deprecated in the future for the `model` + parameter. + #model: Future replacement of the `model_exports` argument. + feature_slicing_spec: + [evaluator_pb2.FeatureSlicingSpec](https://github.com/tensorflow/tfx/blob/master/tfx/proto/evaluator.proto) + instance that describes how Evaluator should slice the data. + Returns: + output: Channel of `ModelEvalPath` to store the evaluation results. + + Either `model_exports` or `model` must be present in the input arguments. + + """ + from tfx.components.evaluator.component import Evaluator + component_class = Evaluator + input_channels_with_splits = {'examples'} + output_channels_with_splits = {} + + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = 
artifact_path + '/' # ? + artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + Evaluator, + base_image='tensorflow/tfx:0.15.0rc0', + output_component_file='component.yaml' + ) diff --git a/components/tfx/Evaluator/component.yaml b/components/tfx/Evaluator/component.yaml new file mode 100644 index 00000000000..43e13337f64 --- /dev/null +++ b/components/tfx/Evaluator/component.yaml @@ -0,0 +1,232 @@ +name: Evaluator +description: | + A TFX component to evaluate models trained by a TFX Trainer component. + + The Evaluator component performs model evaluations in the TFX pipeline and + the resultant metrics can be viewed in a Jupyter notebook. It uses the + input examples generated from the + [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen) + component to evaluate the models. 
+ + Specifically, it can provide: + - metrics computed on entire training and eval dataset + - tracking metrics over time + - model quality performance on different feature slices + + ## Exporting the EvalSavedModel in Trainer + + In order to setup Evaluator in a TFX pipeline, an EvalSavedModel needs to be + exported during training, which is a special SavedModel containing + annotations for the metrics, features, labels, and so on in your model. + Evaluator uses this EvalSavedModel to compute metrics. + + As part of this, the Trainer component creates eval_input_receiver_fn, + analogous to the serving_input_receiver_fn, which will extract the features + and labels from the input data. As with serving_input_receiver_fn, there are + utility functions to help with this. + + Please see https://www.tensorflow.org/tfx/model_analysis for more details. + + Args: + examples: A Channel of 'ExamplesPath' type, usually produced by ExampleGen + component. @Ark-kun: Must have the eval split. _required_ + model_exports: A Channel of 'ModelExportPath' type, usually produced by + Trainer component. Will be deprecated in the future for the `model` + parameter. + #model: Future replacement of the `model_exports` argument. + feature_slicing_spec: + [evaluator_pb2.FeatureSlicingSpec](https://github.com/tensorflow/tfx/blob/master/tfx/proto/evaluator.proto) + instance that describes how Evaluator should slice the data. + Returns: + output: Channel of `ModelEvalPath` to store the evaluation results. + + Either `model_exports` or `model` must be present in the input arguments. 
+inputs: +- name: examples + type: Examples +- name: model_exports + type: Model +- name: feature_slicing_spec + type: 'JsonObject: evaluator_pb2.FeatureSlicingSpec' + optional: true +outputs: +- name: output + type: ModelEval +implementation: + container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + class InputPath: + '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.''' + def __init__(self, type=None): + self.type = type + + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + def Evaluator( + examples_path: InputPath('Examples'), + model_exports_path: InputPath('Model'), + #model_path: InputPath('Model'), + + output_path: OutputPath('ModelEval'), + + feature_slicing_spec: 'JsonObject: evaluator_pb2.FeatureSlicingSpec' = None, + ): + """ + A TFX component to evaluate models trained by a TFX Trainer component. + + The Evaluator component performs model evaluations in the TFX pipeline and + the resultant metrics can be viewed in a Jupyter notebook. It uses the + input examples generated from the + [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen) + component to evaluate the models. 
+ + Specifically, it can provide: + - metrics computed on entire training and eval dataset + - tracking metrics over time + - model quality performance on different feature slices + + ## Exporting the EvalSavedModel in Trainer + + In order to setup Evaluator in a TFX pipeline, an EvalSavedModel needs to be + exported during training, which is a special SavedModel containing + annotations for the metrics, features, labels, and so on in your model. + Evaluator uses this EvalSavedModel to compute metrics. + + As part of this, the Trainer component creates eval_input_receiver_fn, + analogous to the serving_input_receiver_fn, which will extract the features + and labels from the input data. As with serving_input_receiver_fn, there are + utility functions to help with this. + + Please see https://www.tensorflow.org/tfx/model_analysis for more details. + + Args: + examples: A Channel of 'ExamplesPath' type, usually produced by ExampleGen + component. @Ark-kun: Must have the eval split. _required_ + model_exports: A Channel of 'ModelExportPath' type, usually produced by + Trainer component. Will be deprecated in the future for the `model` + parameter. + #model: Future replacement of the `model_exports` argument. + feature_slicing_spec: + [evaluator_pb2.FeatureSlicingSpec](https://github.com/tensorflow/tfx/blob/master/tfx/proto/evaluator.proto) + instance that describes how Evaluator should slice the data. + Returns: + output: Channel of `ModelEvalPath` to store the evaluation results. + + Either `model_exports` or `model` must be present in the input arguments. 
+ + """ + from tfx.components.evaluator.component import Evaluator + component_class = Evaluator + input_channels_with_splits = {'examples'} + output_channels_with_splits = {} + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? 
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + import argparse + _parser = argparse.ArgumentParser(prog='Evaluator', description="A TFX component to evaluate models trained by a TFX Trainer component.\n\n The Evaluator component performs model evaluations in the TFX pipeline and\n the resultant metrics can be viewed in a Jupyter notebook. 
It uses the\n input examples generated from the\n [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen)\n component to evaluate the models.\n\n Specifically, it can provide:\n - metrics computed on entire training and eval dataset\n - tracking metrics over time\n - model quality performance on different feature slices\n\n ## Exporting the EvalSavedModel in Trainer\n\n In order to setup Evaluator in a TFX pipeline, an EvalSavedModel needs to be\n exported during training, which is a special SavedModel containing\n annotations for the metrics, features, labels, and so on in your model.\n Evaluator uses this EvalSavedModel to compute metrics.\n\n As part of this, the Trainer component creates eval_input_receiver_fn,\n analogous to the serving_input_receiver_fn, which will extract the features\n and labels from the input data. As with serving_input_receiver_fn, there are\n utility functions to help with this.\n\n Please see https://www.tensorflow.org/tfx/model_analysis for more details.\n\n Args:\n examples: A Channel of 'ExamplesPath' type, usually produced by ExampleGen\n component. @Ark-kun: Must have the eval split. _required_\n model_exports: A Channel of 'ModelExportPath' type, usually produced by\n Trainer component. 
Will be deprecated in the future for the `model`\n parameter.\n #model: Future replacement of the `model_exports` argument.\n feature_slicing_spec:\n [evaluator_pb2.FeatureSlicingSpec](https://github.com/tensorflow/tfx/blob/master/tfx/proto/evaluator.proto)\n instance that describes how Evaluator should slice the data.\n Returns:\n output: Channel of `ModelEvalPath` to store the evaluation results.\n\n Either `model_exports` or `model` must be present in the input arguments.\n") + _parser.add_argument("--examples", dest="examples_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--model-exports", dest="model_exports_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--feature-slicing-spec", dest="feature_slicing_spec", type=str, required=False, default=argparse.SUPPRESS) + _parser.add_argument("--output", dest="output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = Evaluator(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + f.write(_output_serializers[idx](_outputs[idx])) + args: + - --examples + - inputPath: examples + - --model-exports + - inputPath: model_exports + - if: + cond: + isPresent: feature_slicing_spec + then: + - --feature-slicing-spec + - inputValue: feature_slicing_spec + - --output + - outputPath: output From bda697868ddab4c3a666ffc44ef0fe3cac1df3ce Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Thu, 7 Nov 2019 01:16:36 -0800 Subject: [PATCH 22/26] Added the ExampleValidator component --- components/tfx/ExampleValidator/component.py | 117 ++++++++++ 
.../tfx/ExampleValidator/component.yaml | 203 ++++++++++++++++++ 2 files changed, 320 insertions(+) create mode 100644 components/tfx/ExampleValidator/component.py create mode 100644 components/tfx/ExampleValidator/component.yaml diff --git a/components/tfx/ExampleValidator/component.py b/components/tfx/ExampleValidator/component.py new file mode 100644 index 00000000000..7cf7fac9ffd --- /dev/null +++ b/components/tfx/ExampleValidator/component.py @@ -0,0 +1,117 @@ +from kfp.components import InputPath, OutputPath + + +def ExampleValidator( + stats_path: InputPath('ExampleStatistics'), + #statistics_path: InputPath('ExampleStatistics'), + schema_path: InputPath('Schema'), + + output_path: OutputPath('ExampleValidation'), +): + """ + A TFX component to validate input examples. + + The ExampleValidator component uses [Tensorflow Data + Validation](https://www.tensorflow.org/tfx/data_validation) to + validate the statistics of some splits on input examples against a schema. + + The ExampleValidator component identifies anomalies in training and serving + data. The component can be configured to detect different classes of anomalies + in the data. It can: + - perform validity checks by comparing data statistics against a schema that + codifies expectations of the user. + - detect data drift by looking at a series of data. + - detect changes in dataset-wide data (i.e., num_examples) across spans or + versions. + + Schema Based Example Validation + The ExampleValidator component identifies any anomalies in the example data by + comparing data statistics computed by the StatisticsGen component against a + schema. The schema codifies properties which the input data is expected to + satisfy, and is provided and maintained by the user. + + Please see https://www.tensorflow.org/tfx/data_validation for more details. + + Args: + stats: A Channel of 'ExampleStatisticsPath` type. This should contain at + least 'eval' split. Other splits are ignored currently. 
Will be + deprecated in the future for the `statistics` parameter. + #statistics: Future replacement of the 'stats' argument. + schema: A Channel of "SchemaPath' type. _required_ + Returns: + output: Output channel of 'ExampleValidationPath' type. + + Either `stats` or `statistics` must be present in the arguments. + """ + from tfx.components.example_validator.component import ExampleValidator + component_class = ExampleValidator + input_channels_with_splits = {'stats', 'statistics'} + output_channels_with_splits = {} + + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? 
+ artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + +if __name__ == '__main__': + import kfp + kfp.components.func_to_container_op( + ExampleValidator, + base_image='tensorflow/tfx:0.15.0rc0', + output_component_file='component.yaml' + ) diff --git a/components/tfx/ExampleValidator/component.yaml b/components/tfx/ExampleValidator/component.yaml new file mode 100644 index 00000000000..4fbf968cd60 --- /dev/null +++ b/components/tfx/ExampleValidator/component.yaml @@ -0,0 +1,203 @@ +name: Examplevalidator +description: | + A TFX component to validate input examples. + + The ExampleValidator component uses [Tensorflow Data + Validation](https://www.tensorflow.org/tfx/data_validation) to + validate the statistics of some splits on input examples against a schema. + + The ExampleValidator component identifies anomalies in training and serving + data. The component can be configured to detect different classes of anomalies + in the data. 
It can: + - perform validity checks by comparing data statistics against a schema that + codifies expectations of the user. + - detect data drift by looking at a series of data. + - detect changes in dataset-wide data (i.e., num_examples) across spans or + versions. + + Schema Based Example Validation + The ExampleValidator component identifies any anomalies in the example data by + comparing data statistics computed by the StatisticsGen component against a + schema. The schema codifies properties which the input data is expected to + satisfy, and is provided and maintained by the user. + + Please see https://www.tensorflow.org/tfx/data_validation for more details. + + Args: + stats: A Channel of 'ExampleStatisticsPath` type. This should contain at + least 'eval' split. Other splits are ignored currently. Will be + deprecated in the future for the `statistics` parameter. + #statistics: Future replacement of the 'stats' argument. + schema: A Channel of "SchemaPath' type. _required_ + Returns: + output: Output channel of 'ExampleValidationPath' type. + + Either `stats` or `statistics` must be present in the arguments. 
+inputs: +- name: stats + type: ExampleStatistics +- name: schema + type: Schema +outputs: +- name: output + type: ExampleValidation +implementation: + container: + image: tensorflow/tfx:0.15.0rc0 + command: + - python3 + - -u + - -c + - | + class InputPath: + '''When creating component from function, InputPath should be used as function parameter annotation to tell the system to pass the *data file path* to the function instead of passing the actual data.''' + def __init__(self, type=None): + self.type = type + + def _make_parent_dirs_and_return_path(file_path: str): + import os + os.makedirs(os.path.dirname(file_path), exist_ok=True) + return file_path + + class OutputPath: + '''When creating component from function, OutputPath should be used as function parameter annotation to tell the system that the function wants to output data by writing it into a file with the given path instead of returning the data from the function.''' + def __init__(self, type=None): + self.type = type + + def ExampleValidator( + stats_path: InputPath('ExampleStatistics'), + #statistics_path: InputPath('ExampleStatistics'), + schema_path: InputPath('Schema'), + + output_path: OutputPath('ExampleValidation'), + ): + """ + A TFX component to validate input examples. + + The ExampleValidator component uses [Tensorflow Data + Validation](https://www.tensorflow.org/tfx/data_validation) to + validate the statistics of some splits on input examples against a schema. + + The ExampleValidator component identifies anomalies in training and serving + data. The component can be configured to detect different classes of anomalies + in the data. It can: + - perform validity checks by comparing data statistics against a schema that + codifies expectations of the user. + - detect data drift by looking at a series of data. + - detect changes in dataset-wide data (i.e., num_examples) across spans or + versions. 
+ + Schema Based Example Validation + The ExampleValidator component identifies any anomalies in the example data by + comparing data statistics computed by the StatisticsGen component against a + schema. The schema codifies properties which the input data is expected to + satisfy, and is provided and maintained by the user. + + Please see https://www.tensorflow.org/tfx/data_validation for more details. + + Args: + stats: A Channel of 'ExampleStatisticsPath` type. This should contain at + least 'eval' split. Other splits are ignored currently. Will be + deprecated in the future for the `statistics` parameter. + #statistics: Future replacement of the 'stats' argument. + schema: A Channel of "SchemaPath' type. _required_ + Returns: + output: Output channel of 'ExampleValidationPath' type. + + Either `stats` or `statistics` must be present in the arguments. + """ + from tfx.components.example_validator.component import ExampleValidator + component_class = ExampleValidator + input_channels_with_splits = {'stats', 'statistics'} + output_channels_with_splits = {} + + import json + import os + from google.protobuf import json_format, message + from tfx.types import Artifact, channel_utils + + arguments = locals().copy() + + component_class_args = {} + + for name, execution_parameter in component_class.SPEC_CLASS.PARAMETERS.items(): + argument_value_obj = argument_value = arguments.get(name, None) + if argument_value is None: + continue + parameter_type = execution_parameter.type + if isinstance(parameter_type, type) and issubclass(parameter_type, message.Message): # execution_parameter.type can also be a tuple + argument_value_obj = parameter_type() + json_format.Parse(argument_value, argument_value_obj) + component_class_args[name] = argument_value_obj + + for name, channel_parameter in component_class.SPEC_CLASS.INPUTS.items(): + artifact_path = arguments[name + '_path'] + artifacts = [] + if name in input_channels_with_splits: + # Recovering splits + splits = 
sorted(os.listdir(artifact_path)) + for split in splits: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.split = split + artifact.uri = os.path.join(artifact_path, split) + '/' + artifacts.append(artifact) + else: + artifact = Artifact(type_name=channel_parameter.type_name) + artifact.uri = artifact_path + '/' # ? + artifacts.append(artifact) + component_class_args[name] = channel_utils.as_channel(artifacts) + + component_class_instance = component_class(**component_class_args) + + input_dict = {name: channel.get() for name, channel in component_class_instance.inputs.get_all().items()} + output_dict = {name: channel.get() for name, channel in component_class_instance.outputs.get_all().items()} + exec_properties = component_class_instance.exec_properties + + # Generating paths for output artifacts + for name, artifacts in output_dict.items(): + base_artifact_path = arguments[name + '_path'] + for artifact in artifacts: + artifact.uri = os.path.join(base_artifact_path, artifact.split) # Default split is '' + + print('component instance: ' + str(component_class_instance)) + + #executor = component_class.EXECUTOR_SPEC.executor_class() # Same + executor = component_class_instance.executor_spec.executor_class() + executor.Do( + input_dict=input_dict, + output_dict=output_dict, + exec_properties=exec_properties, + ) + + import argparse + _parser = argparse.ArgumentParser(prog='Examplevalidator', description='A TFX component to validate input examples.\n\n The ExampleValidator component uses [Tensorflow Data\n Validation](https://www.tensorflow.org/tfx/data_validation) to\n validate the statistics of some splits on input examples against a schema.\n\n The ExampleValidator component identifies anomalies in training and serving\n data. The component can be configured to detect different classes of anomalies\n in the data. 
It can:\n - perform validity checks by comparing data statistics against a schema that\n codifies expectations of the user.\n - detect data drift by looking at a series of data.\n - detect changes in dataset-wide data (i.e., num_examples) across spans or\n versions.\n\n Schema Based Example Validation\n The ExampleValidator component identifies any anomalies in the example data by\n comparing data statistics computed by the StatisticsGen component against a\n schema. The schema codifies properties which the input data is expected to\n satisfy, and is provided and maintained by the user.\n\n Please see https://www.tensorflow.org/tfx/data_validation for more details.\n\n Args:\n stats: A Channel of \'ExampleStatisticsPath` type. This should contain at\n least \'eval\' split. Other splits are ignored currently. Will be\n deprecated in the future for the `statistics` parameter.\n #statistics: Future replacement of the \'stats\' argument.\n schema: A Channel of "SchemaPath\' type. _required_\n Returns:\n output: Output channel of \'ExampleValidationPath\' type.\n\n Either `stats` or `statistics` must be present in the arguments.\n') + _parser.add_argument("--stats", dest="stats_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--schema", dest="schema_path", type=str, required=True, default=argparse.SUPPRESS) + _parser.add_argument("--output", dest="output_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) + _parsed_args = vars(_parser.parse_args()) + _output_files = _parsed_args.pop("_output_paths", []) + + _outputs = ExampleValidator(**_parsed_args) + + if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str): + _outputs = [_outputs] + + _output_serializers = [ + + ] + + import os + for idx, output_file in enumerate(_output_files): + try: + os.makedirs(os.path.dirname(output_file)) + except OSError: + pass + with open(output_file, 'w') as f: + 
f.write(_output_serializers[idx](_outputs[idx])) + args: + - --stats + - inputPath: stats + - --schema + - inputPath: schema + - --output + - outputPath: output From 093e2d2867442d7763778be60e4e525dcedde0ea Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Tue, 26 Nov 2019 15:48:40 -0800 Subject: [PATCH 23/26] Updated the sample Added ExampleValidator, Transform, Trainer, Evaluator --- components/tfx/_samples/TFX_pipeline.ipynb | 79 +++++++++++++++++++--- 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/components/tfx/_samples/TFX_pipeline.ipynb b/components/tfx/_samples/TFX_pipeline.ipynb index e5a93d1ae32..747e608e9a8 100644 --- a/components/tfx/_samples/TFX_pipeline.ipynb +++ b/components/tfx/_samples/TFX_pipeline.ipynb @@ -10,7 +10,11 @@ "\n", "* CsvExampleGen\n", "* StatisticsGen\n", - "* SchemaGen" + "* SchemaGen\n", + "* ExampleValidator\n", + "* Transform\n", + "* Trainer\n", + "* Evaluator" ] }, { @@ -27,6 +31,19 @@ "# ! Use kfp.Client(host='https://xxxxx.notebooks.googleusercontent.com/') if working from GCP notebooks (or local notebooks)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_data_uri = 'gs:///tensorflow-tfx/tfx/components/testdata/external/csv'\n", + "\n", + "#Only S3/GCS is supported. 
Replace with downloading component or GIT clone\n", + "#module_file = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py'\n", + "module_file = 'gs:///tensorflow-tfx/tfx/examples/chicago_taxi_pipeline/taxi_utils.py'" + ] + }, { "cell_type": "code", "execution_count": null, @@ -36,10 +53,15 @@ "import json\n", "from kfp.components import load_component_from_url\n", "\n", - "download_from_gcs_op = load_component_from_url('https://raw.githubusercontent.com/Ark-kun/pipelines/290fa55/components/google-cloud/storage/download/component.yaml')\n", - "CsvExampleGen_op = load_component_from_url('https://raw.githubusercontent.com/Ark-kun/pipelines/3a1159a/components/tfx/ExampleGen/CsvExampleGen/component.yaml')\n", - "StatisticsGen_op = load_component_from_url('https://raw.githubusercontent.com/Ark-kun/pipelines/3a1159a/components/tfx/StatisticsGen/component.yaml')\n", - "SchemaGen_op = load_component_from_url('https://raw.githubusercontent.com/Ark-kun/pipelines/3a1159a/components/tfx/SchemaGen/component.yaml')\n", + "download_from_gcs_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/d013b8535666641ca5a5be6ce67e69e044bbf076/components/google-cloud/storage/download/component.yaml')\n", + "\n", + "CsvExampleGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/ExampleGen/CsvExampleGen/component.yaml')\n", + "StatisticsGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/StatisticsGen/component.yaml')\n", + "SchemaGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/SchemaGen/component.yaml')\n", + "ExampleValidator_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/ExampleValidator/component.yaml')\n", + "Transform_op = 
load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/Transform/component.yaml')\n", + "Trainer_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/Trainer/component.yaml')\n", + "Evaluator_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/Evaluator/component.yaml')\n", "\n", "def tfx_pipeline(\n", " input_data_uri,\n", @@ -47,6 +69,7 @@ " download_task = download_from_gcs_op(\n", " input_data_uri,\n", " )\n", + "\n", " examples_task = CsvExampleGen_op(\n", " input_base=download_task.output,\n", " input_config=json.dumps({\n", @@ -71,12 +94,46 @@ " schema_task = SchemaGen_op(\n", " statistics_task.output,\n", " )\n", - " \n", + "\n", + " # Performs anomaly detection based on statistics and data schema.\n", + " validator_task = ExampleValidator_op(\n", + " stats=statistics_task.outputs['output'],\n", + " schema=schema_task.outputs['output'],\n", + " )\n", + "\n", + " # Performs transformations and feature engineering in training and serving.\n", + " transform_task = Transform_op(\n", + " input_data=examples_task.outputs['example_artifacts'],\n", + " schema=schema_task.outputs['output'],\n", + " module_file=module_file,\n", + " )\n", + "\n", + " trainer_task = Trainer_op(\n", + " module_file=module_file,\n", + " examples=transform_task.outputs['transformed_examples'],\n", + " schema=schema_task.outputs['output'],\n", + " transform_output=transform_task.outputs['transform_output'],\n", + " train_args=json.dumps({'num_steps': 10000}),\n", + " eval_args=json.dumps({'num_steps': 5000}),\n", + " )\n", + "\n", + " # Uses TFMA to compute a evaluation statistics over features of a model.\n", + " model_analyzer = Evaluator_op(\n", + " examples=examples_task.outputs['example_artifacts'],\n", + " model_exports=trainer_task.outputs['output'],\n", + " feature_slicing_spec=json.dumps({\n", + " 'specs': [\n", + " 
{'column_for_slicing': ['trip_start_hour']},\n", + " ],\n", + " }),\n", + " )\n", + "\n", + "\n", "client.create_run_from_pipeline_func(\n", " tfx_pipeline,\n", - " arguments={\n", - " 'input_data_uri': 'gs://avolkov/tensorflow-tfx/tfx/components/testdata/external/csv',\n", - " },\n", + " arguments=dict(\n", + " input_data_uri=input_data_uri,\n", + " ),\n", ")" ] } @@ -97,9 +154,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.5.3" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From 034cd328c429cc3cdbc89479cbb23615208c57ef Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Wed, 27 Nov 2019 16:39:36 -0800 Subject: [PATCH 24/26] Upgraded to TFX 0.15.0 --- components/tfx/Evaluator/component.py | 2 +- components/tfx/Evaluator/component.yaml | 2 +- components/tfx/ExampleGen/BigQueryExampleGen/component.py | 2 +- components/tfx/ExampleGen/BigQueryExampleGen/component.yaml | 2 +- components/tfx/ExampleGen/CsvExampleGen/component.py | 2 +- components/tfx/ExampleGen/CsvExampleGen/component.yaml | 2 +- .../tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py | 2 +- .../tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.yaml | 2 +- components/tfx/ExampleGen/ImportExampleGen/component.py | 2 +- components/tfx/ExampleGen/ImportExampleGen/component.yaml | 2 +- components/tfx/ExampleValidator/component.py | 2 +- components/tfx/ExampleValidator/component.yaml | 2 +- components/tfx/SchemaGen/component.py | 2 +- components/tfx/SchemaGen/component.yaml | 2 +- components/tfx/StatisticsGen/component.py | 2 +- components/tfx/StatisticsGen/component.yaml | 2 +- components/tfx/Trainer/component.py | 2 +- components/tfx/Trainer/component.yaml | 2 +- components/tfx/Transform/component.py | 2 +- components/tfx/Transform/component.yaml | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/components/tfx/Evaluator/component.py b/components/tfx/Evaluator/component.py index 
d26c904fa00..ed7c2488d23 100644 --- a/components/tfx/Evaluator/component.py +++ b/components/tfx/Evaluator/component.py @@ -123,6 +123,6 @@ def Evaluator( import kfp kfp.components.func_to_container_op( Evaluator, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/Evaluator/component.yaml b/components/tfx/Evaluator/component.yaml index 43e13337f64..7fa0ad842fe 100644 --- a/components/tfx/Evaluator/component.yaml +++ b/components/tfx/Evaluator/component.yaml @@ -54,7 +54,7 @@ outputs: type: ModelEval implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git a/components/tfx/ExampleGen/BigQueryExampleGen/component.py b/components/tfx/ExampleGen/BigQueryExampleGen/component.py index 5ad5f13791d..96192e7a6b7 100644 --- a/components/tfx/ExampleGen/BigQueryExampleGen/component.py +++ b/components/tfx/ExampleGen/BigQueryExampleGen/component.py @@ -100,6 +100,6 @@ def BigQueryExampleGen( import kfp kfp.components.func_to_container_op( BigQueryExampleGen, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/ExampleGen/BigQueryExampleGen/component.yaml b/components/tfx/ExampleGen/BigQueryExampleGen/component.yaml index 721139ce34b..f0c8e76b3a6 100644 --- a/components/tfx/ExampleGen/BigQueryExampleGen/component.yaml +++ b/components/tfx/ExampleGen/BigQueryExampleGen/component.yaml @@ -36,7 +36,7 @@ outputs: type: Examples implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git a/components/tfx/ExampleGen/CsvExampleGen/component.py b/components/tfx/ExampleGen/CsvExampleGen/component.py index 409fc6a38cb..5dd76a55e40 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/component.py +++ b/components/tfx/ExampleGen/CsvExampleGen/component.py @@ -89,6 
+89,6 @@ def CsvExampleGen( import kfp kfp.components.func_to_container_op( CsvExampleGen, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/ExampleGen/CsvExampleGen/component.yaml b/components/tfx/ExampleGen/CsvExampleGen/component.yaml index 7ae96f73412..b5a3c52a8d2 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/component.yaml +++ b/components/tfx/ExampleGen/CsvExampleGen/component.yaml @@ -23,7 +23,7 @@ description: | eval examples. implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git a/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py index 9ac5e95096f..64c859711e8 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py +++ b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py @@ -91,6 +91,6 @@ def CsvExampleGen_GCS( # import kfp kfp.components.func_to_container_op( CsvExampleGen_GCS, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.yaml b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.yaml index 13ef94187b0..54a6868ad67 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.yaml +++ b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.yaml @@ -24,7 +24,7 @@ description: | eval examples. 
implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git a/components/tfx/ExampleGen/ImportExampleGen/component.py b/components/tfx/ExampleGen/ImportExampleGen/component.py index 7f863e61b7d..8b390618472 100644 --- a/components/tfx/ExampleGen/ImportExampleGen/component.py +++ b/components/tfx/ExampleGen/ImportExampleGen/component.py @@ -105,6 +105,6 @@ def ImportExampleGen( import kfp kfp.components.func_to_container_op( ImportExampleGen, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/ExampleGen/ImportExampleGen/component.yaml b/components/tfx/ExampleGen/ImportExampleGen/component.yaml index 87f4a2aa0ec..8328e455b50 100644 --- a/components/tfx/ExampleGen/ImportExampleGen/component.yaml +++ b/components/tfx/ExampleGen/ImportExampleGen/component.yaml @@ -38,7 +38,7 @@ outputs: type: Examples implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git a/components/tfx/ExampleValidator/component.py b/components/tfx/ExampleValidator/component.py index 7cf7fac9ffd..3d1f350d55b 100644 --- a/components/tfx/ExampleValidator/component.py +++ b/components/tfx/ExampleValidator/component.py @@ -112,6 +112,6 @@ def ExampleValidator( import kfp kfp.components.func_to_container_op( ExampleValidator, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/ExampleValidator/component.yaml b/components/tfx/ExampleValidator/component.yaml index 4fbf968cd60..0c7036a2ef3 100644 --- a/components/tfx/ExampleValidator/component.yaml +++ b/components/tfx/ExampleValidator/component.yaml @@ -43,7 +43,7 @@ outputs: type: ExampleValidation implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git 
a/components/tfx/SchemaGen/component.py b/components/tfx/SchemaGen/component.py index b975f094e80..ba630436e5c 100644 --- a/components/tfx/SchemaGen/component.py +++ b/components/tfx/SchemaGen/component.py @@ -75,6 +75,6 @@ def SchemaGen( import kfp kfp.components.func_to_container_op( SchemaGen, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/SchemaGen/component.yaml b/components/tfx/SchemaGen/component.yaml index d747ab8f5f5..dc0a5a32a5f 100644 --- a/components/tfx/SchemaGen/component.yaml +++ b/components/tfx/SchemaGen/component.yaml @@ -25,7 +25,7 @@ description: | output: Output `Schema` channel for schema result. implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git a/components/tfx/StatisticsGen/component.py b/components/tfx/StatisticsGen/component.py index abc93c4eef9..acdffed8eab 100644 --- a/components/tfx/StatisticsGen/component.py +++ b/components/tfx/StatisticsGen/component.py @@ -72,6 +72,6 @@ def StatisticsGen( import kfp kfp.components.func_to_container_op( StatisticsGen, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/StatisticsGen/component.yaml b/components/tfx/StatisticsGen/component.yaml index 41bc2915019..5d21d7d3d0f 100644 --- a/components/tfx/StatisticsGen/component.yaml +++ b/components/tfx/StatisticsGen/component.yaml @@ -18,7 +18,7 @@ description: | provided in the input examples. 
implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git a/components/tfx/Trainer/component.py b/components/tfx/Trainer/component.py index b672890f5d0..152288dc893 100644 --- a/components/tfx/Trainer/component.py +++ b/components/tfx/Trainer/component.py @@ -166,6 +166,6 @@ def trainer_fn(tf.contrib.training.HParams, import kfp kfp.components.func_to_container_op( Trainer, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/Trainer/component.yaml b/components/tfx/Trainer/component.yaml index 93791114755..0e45ee12ceb 100644 --- a/components/tfx/Trainer/component.yaml +++ b/components/tfx/Trainer/component.yaml @@ -103,7 +103,7 @@ outputs: type: Model implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u diff --git a/components/tfx/Transform/component.py b/components/tfx/Transform/component.py index ac273887bfe..af38b1afc58 100644 --- a/components/tfx/Transform/component.py +++ b/components/tfx/Transform/component.py @@ -134,6 +134,6 @@ def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]: import kfp kfp.components.func_to_container_op( Transform, - base_image='tensorflow/tfx:0.15.0rc0', + base_image='tensorflow/tfx:0.15.0', output_component_file='component.yaml' ) diff --git a/components/tfx/Transform/component.yaml b/components/tfx/Transform/component.yaml index ed021a553e0..b3302d14387 100644 --- a/components/tfx/Transform/component.yaml +++ b/components/tfx/Transform/component.yaml @@ -67,7 +67,7 @@ outputs: type: Examples implementation: container: - image: tensorflow/tfx:0.15.0rc0 + image: tensorflow/tfx:0.15.0 command: - python3 - -u From 8fec5f1c5d27621cebf78149166257ac665feb1d Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Wed, 27 Nov 2019 16:40:40 -0800 Subject: [PATCH 25/26] Upgraded the sample to 0.15.0 --- 
components/tfx/_samples/TFX_pipeline.ipynb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/components/tfx/_samples/TFX_pipeline.ipynb b/components/tfx/_samples/TFX_pipeline.ipynb index 747e608e9a8..f3e63dbf251 100644 --- a/components/tfx/_samples/TFX_pipeline.ipynb +++ b/components/tfx/_samples/TFX_pipeline.ipynb @@ -55,13 +55,13 @@ "\n", "download_from_gcs_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/d013b8535666641ca5a5be6ce67e69e044bbf076/components/google-cloud/storage/download/component.yaml')\n", "\n", - "CsvExampleGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/ExampleGen/CsvExampleGen/component.yaml')\n", - "StatisticsGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/StatisticsGen/component.yaml')\n", - "SchemaGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/SchemaGen/component.yaml')\n", - "ExampleValidator_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/ExampleValidator/component.yaml')\n", - "Transform_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/Transform/component.yaml')\n", - "Trainer_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/Trainer/component.yaml')\n", - "Evaluator_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/bda69786/components/tfx/Evaluator/component.yaml')\n", + "CsvExampleGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/025c424a/components/tfx/ExampleGen/CsvExampleGen/component.yaml')\n", + "StatisticsGen_op = 
load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/025c424a/components/tfx/StatisticsGen/component.yaml')\n", + "SchemaGen_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/025c424a/components/tfx/SchemaGen/component.yaml')\n", + "ExampleValidator_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/025c424a/components/tfx/ExampleValidator/component.yaml')\n", + "Transform_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/025c424a/components/tfx/Transform/component.yaml')\n", + "Trainer_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/025c424a/components/tfx/Trainer/component.yaml')\n", + "Evaluator_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/025c424a/components/tfx/Evaluator/component.yaml')\n", "\n", "def tfx_pipeline(\n", " input_data_uri,\n", From bbf11e3d2e473321973ffab76f9fbb1c8cd329da Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Wed, 27 Nov 2019 17:39:54 -0800 Subject: [PATCH 26/26] Silence Flake8 for annotations --- components/tfx/Evaluator/component.py | 2 ++ components/tfx/ExampleGen/BigQueryExampleGen/component.py | 2 ++ components/tfx/ExampleGen/CsvExampleGen/component.py | 2 ++ .../tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py | 2 ++ components/tfx/ExampleGen/ImportExampleGen/component.py | 2 ++ components/tfx/Trainer/component.py | 2 ++ components/tfx/Transform/component.py | 2 ++ 7 files changed, 14 insertions(+) diff --git a/components/tfx/Evaluator/component.py b/components/tfx/Evaluator/component.py index ed7c2488d23..a21deb8f5d6 100644 --- a/components/tfx/Evaluator/component.py +++ b/components/tfx/Evaluator/component.py @@ -1,3 +1,5 @@ +# flake8: noqa TODO + from kfp.components import InputPath, OutputPath diff --git a/components/tfx/ExampleGen/BigQueryExampleGen/component.py b/components/tfx/ExampleGen/BigQueryExampleGen/component.py 
index 96192e7a6b7..a7473e0bf81 100644 --- a/components/tfx/ExampleGen/BigQueryExampleGen/component.py +++ b/components/tfx/ExampleGen/BigQueryExampleGen/component.py @@ -1,3 +1,5 @@ +# flake8: noqa TODO + from kfp.components import InputPath, OutputPath diff --git a/components/tfx/ExampleGen/CsvExampleGen/component.py b/components/tfx/ExampleGen/CsvExampleGen/component.py index 5dd76a55e40..e956b29a883 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/component.py +++ b/components/tfx/ExampleGen/CsvExampleGen/component.py @@ -1,3 +1,5 @@ +# flake8: noqa TODO + from kfp.components import InputPath, OutputPath def CsvExampleGen( diff --git a/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py index 64c859711e8..8e78dd05bfb 100644 --- a/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py +++ b/components/tfx/ExampleGen/CsvExampleGen/with_URI_IO/component.py @@ -1,3 +1,5 @@ +# flake8: noqa TODO + from typing import NamedTuple def CsvExampleGen_GCS( # diff --git a/components/tfx/ExampleGen/ImportExampleGen/component.py b/components/tfx/ExampleGen/ImportExampleGen/component.py index 8b390618472..02c4dac8a8f 100644 --- a/components/tfx/ExampleGen/ImportExampleGen/component.py +++ b/components/tfx/ExampleGen/ImportExampleGen/component.py @@ -1,3 +1,5 @@ +# flake8: noqa TODO + from kfp.components import InputPath, OutputPath diff --git a/components/tfx/Trainer/component.py b/components/tfx/Trainer/component.py index 152288dc893..d5cc637aa5f 100644 --- a/components/tfx/Trainer/component.py +++ b/components/tfx/Trainer/component.py @@ -1,3 +1,5 @@ +# flake8: noqa TODO + from kfp.components import InputPath, OutputPath diff --git a/components/tfx/Transform/component.py b/components/tfx/Transform/component.py index af38b1afc58..ed4d6300c4d 100644 --- a/components/tfx/Transform/component.py +++ b/components/tfx/Transform/component.py @@ -1,3 +1,5 @@ +# flake8: noqa TODO + from 
kfp.components import InputPath, OutputPath