From 76532caf14dd1856b782a71338af15557298a038 Mon Sep 17 00:00:00 2001
From: James Fletcher
Date: Wed, 4 Dec 2019 00:38:08 +0000
Subject: [PATCH] Refactor encoding of graphs to reduce code complexity and
 make the pipeline clearer

---
 kglib/kgcn/examples/diagnosis/diagnosis.py |   6 +-
 kglib/kgcn/pipeline/encode.py              | 115 +++++++++++----------
 kglib/kgcn/pipeline/encode_test.py         |  18 ++--
 kglib/kgcn/pipeline/pipeline.py            |  26 +----
 4 files changed, 75 insertions(+), 90 deletions(-)

diff --git a/kglib/kgcn/examples/diagnosis/diagnosis.py b/kglib/kgcn/examples/diagnosis/diagnosis.py
index 72dca923..76149361 100644
--- a/kglib/kgcn/examples/diagnosis/diagnosis.py
+++ b/kglib/kgcn/examples/diagnosis/diagnosis.py
@@ -108,12 +108,12 @@ def create_concept_graphs(example_indices, grakn_session):
 # Existing elements in the graph are those that pre-exist in the graph, and should be predicted to continue to exist
 PREEXISTS = dict(input=1, solution=0)
 
-# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
-TO_INFER = dict(input=0, solution=2)
-
 # Candidates are neither present in the input nor in the solution, they are negative examples
 CANDIDATE = dict(input=0, solution=1)
 
+# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
+TO_INFER = dict(input=0, solution=2)
+
 
 class QueryHandler:
     """
diff --git a/kglib/kgcn/pipeline/encode.py b/kglib/kgcn/pipeline/encode.py
index 48969ea3..78a8224a 100644
--- a/kglib/kgcn/pipeline/encode.py
+++ b/kglib/kgcn/pipeline/encode.py
@@ -23,86 +23,87 @@ multidigraph_edge_data_iterator
 
 
-def encode_types(graph, node_types, edge_types):
-    node_iterator = multidigraph_node_data_iterator(graph)
-    encode_categorically(node_iterator, node_types, 'type', 'categorical_type')
-
-    edge_iterator = multidigraph_edge_data_iterator(graph)
-    encode_categorically(edge_iterator, edge_types, 'type', 'categorical_type')
-    return graph
+def encode_values(graph, categorical_attributes, continuous_attributes):
+    for node_data in multidigraph_node_data_iterator(graph):
+        typ = node_data['type']
 
+        if categorical_attributes is not None and typ in categorical_attributes.keys():
+            # Add the integer value of the category for each categorical attribute instance
+            category_values = categorical_attributes[typ]
+            node_data['encoded_value'] = category_values.index(node_data['value'])
 
-def create_input_graph(graph, features_field="features"):
-    input_graph = graph.copy()
-    augment_data_fields(multidigraph_data_iterator(input_graph),
-                        ("input", "categorical_type", "encoded_value"),
-                        features_field)
-    input_graph.graph[features_field] = np.array([0.0] * 5, dtype=np.float32)
-    return input_graph
+        elif continuous_attributes is not None and typ in continuous_attributes.keys():
+            min_val, max_val = continuous_attributes[typ]
+            node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)
+        else:
+            node_data['encoded_value'] = 0
 
+    for edge_data in multidigraph_edge_data_iterator(graph):
+        edge_data['encoded_value'] = 0
 
-def create_target_graph(graph, features_field="features"):
-    target_graph = graph.copy()
-    target_graph = encode_solutions(target_graph, solution_field="solution", encoded_solution_field="encoded_solution",
-                                    encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]))
-    augment_data_fields(multidigraph_data_iterator(target_graph),
-                        ("encoded_solution",),
-                        features_field)
-    target_graph.graph[features_field] = np.array([0.0] * 5, dtype=np.float32)
-    return target_graph
+    return graph
 
 
-def augment_data_fields(graph_data_iterator, fields_to_augment, augmented_field):
+def encode_types(graph, iterator_func, types):
     """
-    Returns a graph with features built from augmenting data fields found in the graph
-
+    Encodes the type found in graph data as an integer according to the index it is found in `types`
     Args:
-        graph_data_iterator: iterator over the data for elements in a graph
-        fields_to_augment: the fields of the data dictionaries to augment together
-        augmented_field: the field in which to store the augmented fields
-
+        graph: The graph to encode
+        iterator_func: A function to create an iterator of data in the graph (node data, edge data or combined node and edge data)
+        types: The full list of types to be encoded in this order
+
     Returns:
-        None, updates the graph in-place
+        The graph, which is also updated in-place
     """
+    iterator = iterator_func(graph)
 
-    for data in graph_data_iterator:
-        data[augmented_field] = np.hstack([np.array(data[field], dtype=np.float32) for field in fields_to_augment])
+    for data in iterator:
+        data['categorical_type'] = types.index(data['type'])
 
+    return graph
 
-def encode_solutions(graph, solution_field="solution", encoded_solution_field="encoded_solution",
-                     encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])):
-    """
-    Determines the encoding to use for a solution category
-    Args:
-        graph: Graph to update
-        solution_field: The property in the graph that holds the value of the solution
-        encoded_solution_field: The property in the graph to use to hold the new solution value
-        encodings: An array, a row from which will be picked as the new solution based on using the current solution
-            as a row index
 
-    Returns: Graph with updated `encoded_solution_field`
+def create_input_graph(graph):
+    input_graph = graph.copy()
 
-    """
+    for data in multidigraph_data_iterator(input_graph):
+        if data["solution"] == 0:
+            preexists = 1
+        else:
+            preexists = 0
 
-    for data in multidigraph_data_iterator(graph):
-        solution = data[solution_field]
-        data[encoded_solution_field] = encodings[solution]
+        features = stack_features([preexists, data["categorical_type"], data["encoded_value"]])
+        data.clear()
+        data["features"] = features
 
-    return graph
+    input_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
+    return input_graph
+
+
+def create_target_graph(graph):
+    target_graph = graph.copy()
+    solution_one_hot_encoding = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=np.float32)
 
+    for data in multidigraph_data_iterator(target_graph):
+        features = solution_one_hot_encoding[data["solution"]]
+        data.clear()
+        data["features"] = features
 
-def encode_categorically(graph_data_iterator, all_categories, category_field, encoding_field):
+    target_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
+    return target_graph
+
+
+def stack_features(features):
     """
-    Encodes the type found in graph data as an integer according to the index it is found in `all_types`
+    Stacks features together into a single vector
+
     Args:
-        graph_data_iterator: An iterator of data in the graph (node data, edge data or combined node and edge data)
-        all_categories: The full list of categories to be encoded in this order
-        category_field: The data field containing the category to encode
-        encoding_field: The data field to use to store the encoding
+        features: iterable of features, features can be a single value or iterable
 
     Returns:
+        Numpy array (vector) of stacked features
 
     """
-    for data in graph_data_iterator:
-        data[encoding_field] = all_categories.index(data[category_field])
+
+    return np.hstack([np.array(feature, dtype=np.float32) for feature in features])
diff --git a/kglib/kgcn/pipeline/encode_test.py b/kglib/kgcn/pipeline/encode_test.py
index b9741d1b..e318d8f6 100644
--- a/kglib/kgcn/pipeline/encode_test.py
+++ b/kglib/kgcn/pipeline/encode_test.py
@@ -21,28 +21,28 @@
 
 import numpy as np
 
-from kglib.kgcn.pipeline.encode import augment_data_fields
+from kglib.kgcn.pipeline.encode import stack_features
 
 
 class TestAugmentDataFields(unittest.TestCase):
     def test_numpy_fields_augmented_as_expected(self):
-        data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]))]
+        features = [np.array([0, 1, 0]), np.array([5])]
 
-        augment_data_fields(data, ('attr1', 'attr2'), 'features')
+        stacked = stack_features(features)
 
-        expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]), features=np.array([0, 1, 0, 5]))]
+        expected = np.array([0, 1, 0, 5])
 
-        np.testing.assert_equal(expected_data, data)
+        np.testing.assert_equal(expected, stacked)
 
     def test_augmenting_non_numpy_numeric(self):
-        data = [dict(attr1=np.array([0, 1, 0]), attr2=5)]
+        data = [np.array([0, 1, 0]), 5]
 
-        augment_data_fields(data, ('attr1', 'attr2'), 'features')
+        stacked = stack_features(data)
 
-        expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=5, features=np.array([0, 1, 0, 5]))]
+        expected = np.array([0, 1, 0, 5])
 
-        np.testing.assert_equal(expected_data, data)
+        np.testing.assert_equal(stacked, expected)
 
 
 if __name__ == "__main__":
diff --git a/kglib/kgcn/pipeline/pipeline.py b/kglib/kgcn/pipeline/pipeline.py
index ecc725df..321946c7 100644
--- a/kglib/kgcn/pipeline/pipeline.py
+++ b/kglib/kgcn/pipeline/pipeline.py
@@ -24,7 +24,7 @@
 from kglib.kgcn.learn.learn import KGCNLearner
 from kglib.kgcn.models.attribute import ContinuousAttribute, CategoricalAttribute, BlankAttribute
 from kglib.kgcn.models.core import softmax, KGCN
-from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph
+from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph, encode_values
 from kglib.kgcn.pipeline.utils import apply_logits_to_graphs, duplicate_edges_in_reverse
 from kglib.kgcn.plot.plotting import plot_across_training, plot_predictions
 from kglib.utils.graph.iterate import multidigraph_node_data_iterator, multidigraph_data_iterator, \
@@ -51,29 +51,13 @@ def pipeline(graphs,
     ############################################################
     # Encode attribute values
 
-    for graph in graphs:
-        for node_data in multidigraph_node_data_iterator(graph):
-            typ = node_data['type']
-
-            if categorical_attributes is not None and typ in categorical_attributes.keys():
-                # Add the integer value of the category for each categorical attribute instance
-                category_values = categorical_attributes[typ]
-                node_data['encoded_value'] = category_values.index(node_data['value'])
-
-            elif continuous_attributes is not None and typ in continuous_attributes.keys():
-                min_val, max_val = continuous_attributes[typ]
-                node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)
-
-            else:
-                node_data['encoded_value'] = 0
-
-        for edge_data in multidigraph_edge_data_iterator(graph):
-            edge_data['encoded_value'] = 0
+    graphs = [encode_values(graph, categorical_attributes, continuous_attributes) for graph in graphs]
 
     indexed_graphs = [nx.convert_node_labels_to_integers(graph, label_attribute='concept') for graph in graphs]
     graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]
 
-    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]
+    graphs = [encode_types(graph, multidigraph_node_data_iterator, node_types) for graph in graphs]
+    graphs = [encode_types(graph, multidigraph_edge_data_iterator, edge_types) for graph in graphs]
 
     input_graphs = [create_input_graph(graph) for graph in graphs]
     target_graphs = [create_target_graph(graph) for graph in graphs]
@@ -109,7 +93,7 @@ def pipeline(graphs,
                                               log_dir=output_dir)
 
     plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
-    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
+    plot_predictions(graphs[tr_ge_split:], test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
 
     logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])
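
For reference, a minimal usage sketch of how the refactored helpers in kglib/kgcn/pipeline/encode.py chain together after this change, mirroring the order used in the updated pipeline.py. The toy graph, the attribute dictionaries and the type lists below are illustrative assumptions only, not values taken from the repository.

# Minimal sketch (illustrative, not part of the patch): encode values and types,
# then build the input and target graphs the learner consumes.
import networkx as nx

from kglib.kgcn.pipeline.encode import (encode_values, encode_types,
                                        create_input_graph, create_target_graph)
from kglib.utils.graph.iterate import (multidigraph_node_data_iterator,
                                       multidigraph_edge_data_iterator)

# Hypothetical two-node graph: a 'person' node linked to an 'age' attribute node.
graph = nx.MultiDiGraph()
graph.add_node(0, type='person', value='', solution=0)
graph.add_node(1, type='age', value=42, solution=0)
graph.add_edge(0, 1, type='has', solution=0)

categorical_attributes = None              # e.g. {'name': ['Alice', 'Bob']}
continuous_attributes = {'age': (0, 100)}  # (min, max) used to scale the value
node_types = ['age', 'person']
edge_types = ['has']

# 1. Encode attribute values into 'encoded_value' (category index or scaled float)
graph = encode_values(graph, categorical_attributes, continuous_attributes)

# 2. Encode node and edge types into 'categorical_type' (index into the type lists)
graph = encode_types(graph, multidigraph_node_data_iterator, node_types)
graph = encode_types(graph, multidigraph_edge_data_iterator, edge_types)

# 3. Build the graphs for learning:
#    input features  = [preexists, categorical_type, encoded_value]
#    target features = one-hot encoding of the 'solution' field
input_graph = create_input_graph(graph)
target_graph = create_target_graph(graph)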