
Refactor Graph Encoding
## What is the goal of this PR?

Refactor encoding of graphs to reduce code complexity and make the pipeline clearer

## What are the changes implemented in this PR?

- Moves attribute value encoding out of `pipeline.py` and into `encode.py`
- Condenses and de-generifies the encoding functions, making each function's purpose more explicit
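
Sketched out, the encoding steps now read as a sequence of small, single-purpose calls. This is a simplified sketch assembled from the diff below; the `encode_graphs` wrapper is only for illustration (in `pipeline.py` these calls appear inline), and the relabelling/reverse-edge steps between them are omitted:

```python
from kglib.kgcn.pipeline.encode import (create_input_graph, create_target_graph,
                                        encode_types, encode_values)
from kglib.utils.graph.iterate import (multidigraph_edge_data_iterator,
                                       multidigraph_node_data_iterator)


def encode_graphs(graphs, node_types, edge_types, categorical_attributes, continuous_attributes):
    # Attribute values first (moved out of pipeline.py into encode.py)
    graphs = [encode_values(graph, categorical_attributes, continuous_attributes) for graph in graphs]
    # Then types, once for node data and once for edge data (de-generified encode_types)
    graphs = [encode_types(graph, multidigraph_node_data_iterator, node_types) for graph in graphs]
    graphs = [encode_types(graph, multidigraph_edge_data_iterator, edge_types) for graph in graphs]
    # Finally, build the input and target graphs consumed by the learner
    # (node relabelling and reverse-edge duplication steps are omitted here)
    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]
    return input_graphs, target_graphs
```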
jmsfltchr committed Dec 4, 2019
1 parent 7724fe7 commit 86b49b0
Showing 4 changed files with 75 additions and 90 deletions.
6 changes: 3 additions & 3 deletions kglib/kgcn/examples/diagnosis/diagnosis.py
@@ -108,12 +108,12 @@ def create_concept_graphs(example_indices, grakn_session):
 # Existing elements in the graph are those that pre-exist in the graph, and should be predicted to continue to exist
 PREEXISTS = dict(input=1, solution=0)
 
-# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
-TO_INFER = dict(input=0, solution=2)
-
 # Candidates are neither present in the input nor in the solution, they are negative examples
 CANDIDATE = dict(input=0, solution=1)
 
+# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
+TO_INFER = dict(input=0, solution=2)
+
 
 class QueryHandler:
     """
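
For context on how these constants are consumed downstream: the `solution` value assigned here is what `create_target_graph` in `encode.py` (next file) uses as the index into its one-hot target encoding. A minimal illustration, using only the values shown above:

```python
import numpy as np

# Element roles as defined above in diagnosis.py
PREEXISTS = dict(input=1, solution=0)  # pre-existing elements, predicted to continue to exist
CANDIDATE = dict(input=0, solution=1)  # negative examples
TO_INFER = dict(input=0, solution=2)   # positive examples, whose existence should be predicted

# create_target_graph (encode.py, below) picks the target row by `solution`
solution_one_hot_encoding = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=np.float32)

for role in (PREEXISTS, CANDIDATE, TO_INFER):
    print(role, '->', solution_one_hot_encoding[role['solution']])
# {'input': 1, 'solution': 0} -> [1. 0. 0.]
# {'input': 0, 'solution': 1} -> [0. 1. 0.]
# {'input': 0, 'solution': 2} -> [0. 0. 1.]
```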
115 changes: 58 additions & 57 deletions kglib/kgcn/pipeline/encode.py
@@ -23,86 +23,87 @@
     multidigraph_edge_data_iterator
 
 
-def encode_types(graph, node_types, edge_types):
-    node_iterator = multidigraph_node_data_iterator(graph)
-    encode_categorically(node_iterator, node_types, 'type', 'categorical_type')
-
-    edge_iterator = multidigraph_edge_data_iterator(graph)
-    encode_categorically(edge_iterator, edge_types, 'type', 'categorical_type')
-    return graph
-
-
-def create_input_graph(graph, features_field="features"):
-    input_graph = graph.copy()
-    augment_data_fields(multidigraph_data_iterator(input_graph),
-                        ("input", "categorical_type", "encoded_value"),
-                        features_field)
-    input_graph.graph[features_field] = np.array([0.0] * 5, dtype=np.float32)
-    return input_graph
-
-
-def create_target_graph(graph, features_field="features"):
-    target_graph = graph.copy()
-    target_graph = encode_solutions(target_graph, solution_field="solution", encoded_solution_field="encoded_solution",
-                                    encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]))
-    augment_data_fields(multidigraph_data_iterator(target_graph),
-                        ("encoded_solution",),
-                        features_field)
-    target_graph.graph[features_field] = np.array([0.0] * 5, dtype=np.float32)
-    return target_graph
-
-
-def augment_data_fields(graph_data_iterator, fields_to_augment, augmented_field):
-    """
-    Returns a graph with features built from augmenting data fields found in the graph
-    Args:
-        graph_data_iterator: iterator over the data for elements in a graph
-        fields_to_augment: the fields of the data dictionaries to augment together
-        augmented_field: the field in which to store the augmented fields
-    Returns:
-        None, updates the graph in-place
-    """
-
-    for data in graph_data_iterator:
-        data[augmented_field] = np.hstack([np.array(data[field], dtype=np.float32) for field in fields_to_augment])
-
-
-def encode_solutions(graph, solution_field="solution", encoded_solution_field="encoded_solution",
-                     encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])):
-    """
-    Determines the encoding to use for a solution category
-    Args:
-        graph: Graph to update
-        solution_field: The property in the graph that holds the value of the solution
-        encoded_solution_field: The property in the graph to use to hold the new solution value
-        encodings: An array, a row from which will be picked as the new solution based on using the current solution
-            as a row index
-
-    Returns: Graph with updated `encoded_solution_field`
-
-    """
-
-    for data in multidigraph_data_iterator(graph):
-        solution = data[solution_field]
-        data[encoded_solution_field] = encodings[solution]
-
-    return graph
-
-
-def encode_categorically(graph_data_iterator, all_categories, category_field, encoding_field):
-    """
-    Encodes the type found in graph data as an integer according to the index it is found in `all_types`
-    Args:
-        graph_data_iterator: An iterator of data in the graph (node data, edge data or combined node and edge data)
-        all_categories: The full list of categories to be encoded in this order
-        category_field: The data field containing the category to encode
-        encoding_field: The data field to use to store the encoding
-    Returns:
-        None, updates the graph in-place
-    """
-    for data in graph_data_iterator:
-        data[encoding_field] = all_categories.index(data[category_field])
+def encode_values(graph, categorical_attributes, continuous_attributes):
+    for node_data in multidigraph_node_data_iterator(graph):
+        typ = node_data['type']
+
+        if categorical_attributes is not None and typ in categorical_attributes.keys():
+            # Add the integer value of the category for each categorical attribute instance
+            category_values = categorical_attributes[typ]
+            node_data['encoded_value'] = category_values.index(node_data['value'])
+
+        elif continuous_attributes is not None and typ in continuous_attributes.keys():
+            min_val, max_val = continuous_attributes[typ]
+            node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)
+
+        else:
+            node_data['encoded_value'] = 0
+
+    for edge_data in multidigraph_edge_data_iterator(graph):
+        edge_data['encoded_value'] = 0
+
+    return graph
+
+
+def encode_types(graph, iterator_func, types):
+    """
+    Encodes the type found in graph data as an integer according to the index it is found in `all_types`
+    Args:
+        graph: The graph to encode
+        iterator_func: A function to create an iterator of data in the graph (node data, edge data or combined node and edge data)
+        types: The full list of types to be encoded in this order
+    Returns:
+        The graph, which is also updated in-place
+    """
+    iterator = iterator_func(graph)
+
+    for data in iterator:
+        data['categorical_type'] = types.index(data['type'])
+
+    return graph
+
+
+def create_input_graph(graph):
+    input_graph = graph.copy()
+
+    for data in multidigraph_data_iterator(input_graph):
+        if data["solution"] == 0:
+            preexists = 1
+        else:
+            preexists = 0
+
+        features = stack_features([preexists, data["categorical_type"], data["encoded_value"]])
+        data.clear()
+        data["features"] = features
+
+    input_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
+    return input_graph
+
+
+def create_target_graph(graph):
+    target_graph = graph.copy()
+    solution_one_hot_encoding = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=np.float32)
+
+    for data in multidigraph_data_iterator(target_graph):
+        features = solution_one_hot_encoding[data["solution"]]
+        data.clear()
+        data["features"] = features
+
+    target_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
+    return target_graph
+
+
+def stack_features(features):
+    """
+    Stacks features together into a single vector
+    Args:
+        features: iterable of features, features can be a single value or iterable
+    Returns:
+        Numpy array (vector) of stacked features
+    """
+    return np.hstack([np.array(feature, dtype=np.float32) for feature in features])
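
A minimal, hypothetical usage sketch of the refactored functions on a toy `networkx.MultiDiGraph`. The types, attribute range and `input`/`solution` flags are invented for illustration and are not taken from the codebase:

```python
import networkx as nx

from kglib.kgcn.pipeline.encode import (create_input_graph, create_target_graph,
                                        encode_types, encode_values)
from kglib.utils.graph.iterate import (multidigraph_edge_data_iterator,
                                       multidigraph_node_data_iterator)

# Toy graph with one entity node, one attribute node and one edge between them
graph = nx.MultiDiGraph()
graph.add_node(0, type='person', value='', input=1, solution=0)
graph.add_node(1, type='age', value=42, input=1, solution=0)
graph.add_edge(0, 1, type='has', value='', input=1, solution=0)

# 1. Encode attribute values ('age' treated as continuous, scaled into [0, 1])
graph = encode_values(graph, categorical_attributes=None, continuous_attributes={'age': (0, 100)})

# 2. Encode node and edge types as integer indices into the supplied type lists
graph = encode_types(graph, multidigraph_node_data_iterator, ['person', 'age'])
graph = encode_types(graph, multidigraph_edge_data_iterator, ['has'])

# 3. Build the graphs fed to the learner:
#    input features  = [preexists, categorical_type, encoded_value]
#    target features = one-hot encoding of `solution`
input_graph = create_input_graph(graph)
target_graph = create_target_graph(graph)
```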
18 changes: 9 additions & 9 deletions kglib/kgcn/pipeline/encode_test.py
@@ -21,28 +21,28 @@
 
 import numpy as np
 
-from kglib.kgcn.pipeline.encode import augment_data_fields
+from kglib.kgcn.pipeline.encode import stack_features
 
 
 class TestAugmentDataFields(unittest.TestCase):
 
     def test_numpy_fields_augmented_as_expected(self):
-        data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]))]
+        features = [np.array([0, 1, 0]), np.array([5])]
 
-        augment_data_fields(data, ('attr1', 'attr2'), 'features')
+        stacked = stack_features(features)
 
-        expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]), features=np.array([0, 1, 0, 5]))]
+        expected = np.array([0, 1, 0, 5])
 
-        np.testing.assert_equal(expected_data, data)
+        np.testing.assert_equal(expected, stacked)
 
     def test_augmenting_non_numpy_numeric(self):
-        data = [dict(attr1=np.array([0, 1, 0]), attr2=5)]
+        data = [np.array([0, 1, 0]), 5]
 
-        augment_data_fields(data, ('attr1', 'attr2'), 'features')
+        stacked = stack_features(data)
 
-        expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=5, features=np.array([0, 1, 0, 5]))]
+        expected = np.array([0, 1, 0, 5])
 
-        np.testing.assert_equal(expected_data, data)
+        np.testing.assert_equal(stacked, expected)
 
 
 if __name__ == "__main__":
26 changes: 5 additions & 21 deletions kglib/kgcn/pipeline/pipeline.py
@@ -24,7 +24,7 @@
 from kglib.kgcn.learn.learn import KGCNLearner
 from kglib.kgcn.models.attribute import ContinuousAttribute, CategoricalAttribute, BlankAttribute
 from kglib.kgcn.models.core import softmax, KGCN
-from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph
+from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph, encode_values
 from kglib.kgcn.pipeline.utils import apply_logits_to_graphs, duplicate_edges_in_reverse
 from kglib.kgcn.plot.plotting import plot_across_training, plot_predictions
 from kglib.utils.graph.iterate import multidigraph_node_data_iterator, multidigraph_data_iterator, \
@@ -51,29 +51,13 @@ def pipeline(graphs,
     ############################################################
 
     # Encode attribute values
-    for graph in graphs:
-        for node_data in multidigraph_node_data_iterator(graph):
-            typ = node_data['type']
-
-            if categorical_attributes is not None and typ in categorical_attributes.keys():
-                # Add the integer value of the category for each categorical attribute instance
-                category_values = categorical_attributes[typ]
-                node_data['encoded_value'] = category_values.index(node_data['value'])
-
-            elif continuous_attributes is not None and typ in continuous_attributes.keys():
-                min_val, max_val = continuous_attributes[typ]
-                node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)
-
-            else:
-                node_data['encoded_value'] = 0
-
-        for edge_data in multidigraph_edge_data_iterator(graph):
-            edge_data['encoded_value'] = 0
+    graphs = [encode_values(graph, categorical_attributes, continuous_attributes) for graph in graphs]
 
     indexed_graphs = [nx.convert_node_labels_to_integers(graph, label_attribute='concept') for graph in graphs]
     graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]
 
-    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]
+    graphs = [encode_types(graph, multidigraph_node_data_iterator, node_types) for graph in graphs]
+    graphs = [encode_types(graph, multidigraph_edge_data_iterator, edge_types) for graph in graphs]
 
     input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]
@@ -109,7 +93,7 @@ def pipeline(graphs,
                                  log_dir=output_dir)
 
     plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
-    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
+    plot_predictions(graphs[tr_ge_split:], test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
 
     logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])
