From 76532caf14dd1856b782a71338af15557298a038 Mon Sep 17 00:00:00 2001
From: James Fletcher
Date: Wed, 4 Dec 2019 00:38:08 +0000
Subject: [PATCH] Refactor encoding of graphs to reduce code complexity and
 make the pipeline clearer

---
 kglib/kgcn/examples/diagnosis/diagnosis.py |   6 +-
 kglib/kgcn/pipeline/encode.py              | 115 +++++++++++----------
 kglib/kgcn/pipeline/encode_test.py         |  18 ++--
 kglib/kgcn/pipeline/pipeline.py            |  26 +----
 4 files changed, 75 insertions(+), 90 deletions(-)

diff --git a/kglib/kgcn/examples/diagnosis/diagnosis.py b/kglib/kgcn/examples/diagnosis/diagnosis.py
index 72dca923..76149361 100644
--- a/kglib/kgcn/examples/diagnosis/diagnosis.py
+++ b/kglib/kgcn/examples/diagnosis/diagnosis.py
@@ -108,12 +108,12 @@ def create_concept_graphs(example_indices, grakn_session):
 # Existing elements in the graph are those that pre-exist in the graph, and should be predicted to continue to exist
 PREEXISTS = dict(input=1, solution=0)
 
-# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
-TO_INFER = dict(input=0, solution=2)
-
 # Candidates are neither present in the input nor in the solution, they are negative examples
 CANDIDATE = dict(input=0, solution=1)
 
+# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
+TO_INFER = dict(input=0, solution=2)
+
 
 class QueryHandler:
     """
diff --git a/kglib/kgcn/pipeline/encode.py b/kglib/kgcn/pipeline/encode.py
index 48969ea3..78a8224a 100644
--- a/kglib/kgcn/pipeline/encode.py
+++ b/kglib/kgcn/pipeline/encode.py
@@ -23,86 +23,87 @@ multidigraph_edge_data_iterator
 
 
-def encode_types(graph, node_types, edge_types):
-    node_iterator = multidigraph_node_data_iterator(graph)
-    encode_categorically(node_iterator, node_types, 'type', 'categorical_type')
-
-    edge_iterator = multidigraph_edge_data_iterator(graph)
-    encode_categorically(edge_iterator, edge_types, 'type', 'categorical_type')
-    return graph
+def encode_values(graph, categorical_attributes, continuous_attributes):
+    for node_data in multidigraph_node_data_iterator(graph):
+        typ = node_data['type']
 
+        if categorical_attributes is not None and typ in categorical_attributes.keys():
+            # Add the integer value of the category for each categorical attribute instance
+            category_values = categorical_attributes[typ]
+            node_data['encoded_value'] = category_values.index(node_data['value'])
 
-def create_input_graph(graph, features_field="features"):
-    input_graph = graph.copy()
-    augment_data_fields(multidigraph_data_iterator(input_graph),
-                        ("input", "categorical_type", "encoded_value"),
-                        features_field)
-    input_graph.graph[features_field] = np.array([0.0] * 5, dtype=np.float32)
-    return input_graph
+        elif continuous_attributes is not None and typ in continuous_attributes.keys():
+            min_val, max_val = continuous_attributes[typ]
+            node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)
+        else:
+            node_data['encoded_value'] = 0
 
+    for edge_data in multidigraph_edge_data_iterator(graph):
+        edge_data['encoded_value'] = 0
 
-def create_target_graph(graph, features_field="features"):
-    target_graph = graph.copy()
-    target_graph = encode_solutions(target_graph, solution_field="solution", encoded_solution_field="encoded_solution",
-                                    encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]))
-    augment_data_fields(multidigraph_data_iterator(target_graph),
-                        ("encoded_solution",),
-                        features_field)
-    target_graph.graph[features_field] = np.array([0.0] * 5, dtype=np.float32)
-    return target_graph
+    return graph
 
 
-def augment_data_fields(graph_data_iterator, fields_to_augment, augmented_field):
+def encode_types(graph, iterator_func, types):
     """
-    Returns a graph with features built from augmenting data fields found in the graph
-
+    Encodes the type found in graph data as an integer according to the index it is found in `types`
     Args:
-        graph_data_iterator: iterator over the data for elements in a graph
-        fields_to_augment: the fields of the data dictionaries to augment together
-        augmented_field: the field in which to store the augmented fields
-
+        graph: The graph to encode
+        iterator_func: A function to create an iterator of data in the graph (node data, edge data or combined node and edge data)
+        types: The full list of types to be encoded in this order
+
     Returns:
-        None, updates the graph in-place
+        The graph, which is also updated in-place
     """
+    iterator = iterator_func(graph)
 
-    for data in graph_data_iterator:
-        data[augmented_field] = np.hstack([np.array(data[field], dtype=np.float32) for field in fields_to_augment])
+    for data in iterator:
+        data['categorical_type'] = types.index(data['type'])
 
+    return graph
 
-def encode_solutions(graph, solution_field="solution", encoded_solution_field="encoded_solution",
-                     encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])):
-    """
-    Determines the encoding to use for a solution category
-    Args:
-        graph: Graph to update
-        solution_field: The property in the graph that holds the value of the solution
-        encoded_solution_field: The property in the graph to use to hold the new solution value
-        encodings: An array, a row from which will be picked as the new solution based on using the current solution
-            as a row index
 
-    Returns: Graph with updated `encoded_solution_field`
+def create_input_graph(graph):
+    input_graph = graph.copy()
 
-    """
+    for data in multidigraph_data_iterator(input_graph):
+        if data["solution"] == 0:
+            preexists = 1
+        else:
+            preexists = 0
 
-    for data in multidigraph_data_iterator(graph):
-        solution = data[solution_field]
-        data[encoded_solution_field] = encodings[solution]
+        features = stack_features([preexists, data["categorical_type"], data["encoded_value"]])
+        data.clear()
+        data["features"] = features
 
-    return graph
+    input_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
+    return input_graph
+
+
+def create_target_graph(graph):
+    target_graph = graph.copy()
+    solution_one_hot_encoding = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=np.float32)
 
+    for data in multidigraph_data_iterator(target_graph):
+        features = solution_one_hot_encoding[data["solution"]]
+        data.clear()
+        data["features"] = features
 
-def encode_categorically(graph_data_iterator, all_categories, category_field, encoding_field):
+    target_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
+    return target_graph
+
+
+def stack_features(features):
     """
-    Encodes the type found in graph data as an integer according to the index it is found in `all_types`
+    Stacks features together into a single vector
+
     Args:
-        graph_data_iterator: An iterator of data in the graph (node data, edge data or combined node and edge data)
-        all_categories: The full list of categories to be encoded in this order
-        category_field: The data field containing the category to encode
-        encoding_field: The data field to use to store the encoding
+        features: iterable of features, features can be a single value or iterable
 
     Returns:
+        Numpy array (vector) of stacked features
 
     """
-    for data in graph_data_iterator:
-        data[encoding_field] = all_categories.index(data[category_field])
+
+    return np.hstack([np.array(feature, dtype=np.float32) for feature in features])
diff --git a/kglib/kgcn/pipeline/encode_test.py b/kglib/kgcn/pipeline/encode_test.py
index b9741d1b..e318d8f6 100644
--- a/kglib/kgcn/pipeline/encode_test.py
+++ b/kglib/kgcn/pipeline/encode_test.py
@@ -21,28 +21,28 @@
 
 import numpy as np
 
-from kglib.kgcn.pipeline.encode import augment_data_fields
+from kglib.kgcn.pipeline.encode import stack_features
 
 
 class TestAugmentDataFields(unittest.TestCase):
     def test_numpy_fields_augmented_as_expected(self):
-        data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]))]
+        features = [np.array([0, 1, 0]), np.array([5])]
 
-        augment_data_fields(data, ('attr1', 'attr2'), 'features')
+        stacked = stack_features(features)
 
-        expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]), features=np.array([0, 1, 0, 5]))]
+        expected = np.array([0, 1, 0, 5])
 
-        np.testing.assert_equal(expected_data, data)
+        np.testing.assert_equal(expected, stacked)
 
     def test_augmenting_non_numpy_numeric(self):
-        data = [dict(attr1=np.array([0, 1, 0]), attr2=5)]
+        data = [np.array([0, 1, 0]), 5]
 
-        augment_data_fields(data, ('attr1', 'attr2'), 'features')
+        stacked = stack_features(data)
 
-        expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=5, features=np.array([0, 1, 0, 5]))]
+        expected = np.array([0, 1, 0, 5])
 
-        np.testing.assert_equal(expected_data, data)
+        np.testing.assert_equal(stacked, expected)
 
 
 if __name__ == "__main__":
diff --git a/kglib/kgcn/pipeline/pipeline.py b/kglib/kgcn/pipeline/pipeline.py
index ecc725df..321946c7 100644
--- a/kglib/kgcn/pipeline/pipeline.py
+++ b/kglib/kgcn/pipeline/pipeline.py
@@ -24,7 +24,7 @@
 from kglib.kgcn.learn.learn import KGCNLearner
 from kglib.kgcn.models.attribute import ContinuousAttribute, CategoricalAttribute, BlankAttribute
 from kglib.kgcn.models.core import softmax, KGCN
-from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph
+from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph, encode_values
 from kglib.kgcn.pipeline.utils import apply_logits_to_graphs, duplicate_edges_in_reverse
 from kglib.kgcn.plot.plotting import plot_across_training, plot_predictions
 from kglib.utils.graph.iterate import multidigraph_node_data_iterator, multidigraph_data_iterator, \
@@ -51,29 +51,13 @@ def pipeline(graphs,
     ############################################################
     # Encode attribute values
 
-    for graph in graphs:
-        for node_data in multidigraph_node_data_iterator(graph):
-            typ = node_data['type']
-
-            if categorical_attributes is not None and typ in categorical_attributes.keys():
-                # Add the integer value of the category for each categorical attribute instance
-                category_values = categorical_attributes[typ]
-                node_data['encoded_value'] = category_values.index(node_data['value'])
-
-            elif continuous_attributes is not None and typ in continuous_attributes.keys():
-                min_val, max_val = continuous_attributes[typ]
-                node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)
-
-            else:
-                node_data['encoded_value'] = 0
-
-        for edge_data in multidigraph_edge_data_iterator(graph):
-            edge_data['encoded_value'] = 0
+    graphs = [encode_values(graph, categorical_attributes, continuous_attributes) for graph in graphs]
 
     indexed_graphs = [nx.convert_node_labels_to_integers(graph, label_attribute='concept') for graph in graphs]
     graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]
 
-    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]
+    graphs = [encode_types(graph, multidigraph_node_data_iterator, node_types) for graph in graphs]
+    graphs = [encode_types(graph, multidigraph_edge_data_iterator, edge_types) for graph in graphs]
 
     input_graphs = [create_input_graph(graph) for graph in graphs]
     target_graphs = [create_target_graph(graph) for graph in graphs]
@@ -109,7 +93,7 @@ def pipeline(graphs,
                                               log_dir=output_dir)
 
     plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
-    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
+    plot_predictions(graphs[tr_ge_split:], test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
 
     logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])
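
For reference, a minimal usage sketch of how the refactored helpers in kglib/kgcn/pipeline/encode.py chain together after this change, mirroring the order used in the updated pipeline.py. The toy graph, the attribute dictionaries and the type lists below are illustrative assumptions only, not values taken from the repository.

# Minimal sketch (illustrative, not part of the patch): encode values and types,
# then build the input and target graphs the learner consumes.
import networkx as nx

from kglib.kgcn.pipeline.encode import (encode_values, encode_types,
                                        create_input_graph, create_target_graph)
from kglib.utils.graph.iterate import (multidigraph_node_data_iterator,
                                       multidigraph_edge_data_iterator)

# Hypothetical two-node graph: a 'person' node linked to an 'age' attribute node.
graph = nx.MultiDiGraph()
graph.add_node(0, type='person', value='', solution=0)
graph.add_node(1, type='age', value=42, solution=0)
graph.add_edge(0, 1, type='has', solution=0)

categorical_attributes = None              # e.g. {'name': ['Alice', 'Bob']}
continuous_attributes = {'age': (0, 100)}  # (min, max) used to scale the value
node_types = ['age', 'person']
edge_types = ['has']

# 1. Encode attribute values into 'encoded_value' (category index or scaled float)
graph = encode_values(graph, categorical_attributes, continuous_attributes)

# 2. Encode node and edge types into 'categorical_type' (index into the type lists)
graph = encode_types(graph, multidigraph_node_data_iterator, node_types)
graph = encode_types(graph, multidigraph_edge_data_iterator, edge_types)

# 3. Build the graphs for learning:
#    input features  = [preexists, categorical_type, encoded_value]
#    target features = one-hot encoding of the 'solution' field
input_graph = create_input_graph(graph)
target_graph = create_target_graph(graph)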