
Refactor Graph Encoding
## What is the goal of this PR?

Refactor encoding of graphs to reduce code complexity and make the pipeline clearer

## What are the changes implemented in this PR?

- Moves attribute value encoding out of `pipeline.py` and into `encode.py`
- Condenses and de-generifies the encoding functions, making each function's purpose more explicit
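
Sketched out, the encoding steps now read as a sequence of small, single-purpose calls. This is a simplified sketch assembled from the diff below; the `encode_graphs` wrapper is only for illustration (in `pipeline.py` these calls appear inline), and the relabelling/reverse-edge steps between them are omitted:

```python
from kglib.kgcn.pipeline.encode import (create_input_graph, create_target_graph,
                                        encode_types, encode_values)
from kglib.utils.graph.iterate import (multidigraph_edge_data_iterator,
                                       multidigraph_node_data_iterator)


def encode_graphs(graphs, node_types, edge_types, categorical_attributes, continuous_attributes):
    # Attribute values first (moved out of pipeline.py into encode.py)
    graphs = [encode_values(graph, categorical_attributes, continuous_attributes) for graph in graphs]
    # Then types, once for node data and once for edge data (de-generified encode_types)
    graphs = [encode_types(graph, multidigraph_node_data_iterator, node_types) for graph in graphs]
    graphs = [encode_types(graph, multidigraph_edge_data_iterator, edge_types) for graph in graphs]
    # Finally, build the input and target graphs consumed by the learner
    # (node relabelling and reverse-edge duplication steps are omitted here)
    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]
    return input_graphs, target_graphs
```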
jmsfltchr committed Dec 4, 2019
1 parent 7724fe7 commit 86b49b0
Showing 4 changed files with 75 additions and 90 deletions.
6 changes: 3 additions & 3 deletions kglib/kgcn/examples/diagnosis/diagnosis.py
@@ -108,12 +108,12 @@ def create_concept_graphs(example_indices, grakn_session):
 # Existing elements in the graph are those that pre-exist in the graph, and should be predicted to continue to exist
 PREEXISTS = dict(input=1, solution=0)
 
-# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
-TO_INFER = dict(input=0, solution=2)
-
 # Candidates are neither present in the input nor in the solution, they are negative examples
 CANDIDATE = dict(input=0, solution=1)
 
+# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
+TO_INFER = dict(input=0, solution=2)
+
 
 class QueryHandler:
     """
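
For context on how these constants are consumed downstream: the `solution` value assigned here is what `create_target_graph` in `encode.py` (next file) uses as the index into its one-hot target encoding. A minimal illustration, using only the values shown above:

```python
import numpy as np

# Element roles as defined above in diagnosis.py
PREEXISTS = dict(input=1, solution=0)  # pre-existing elements, predicted to continue to exist
CANDIDATE = dict(input=0, solution=1)  # negative examples
TO_INFER = dict(input=0, solution=2)   # positive examples, whose existence should be predicted

# create_target_graph (encode.py, below) picks the target row by `solution`
solution_one_hot_encoding = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=np.float32)

for role in (PREEXISTS, CANDIDATE, TO_INFER):
    print(role, '->', solution_one_hot_encoding[role['solution']])
# {'input': 1, 'solution': 0} -> [1. 0. 0.]
# {'input': 0, 'solution': 1} -> [0. 1. 0.]
# {'input': 0, 'solution': 2} -> [0. 0. 1.]
```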
115 changes: 58 additions & 57 deletions kglib/kgcn/pipeline/encode.py
@@ -23,86 +23,87 @@
     multidigraph_edge_data_iterator
 
 
-def encode_types(graph, node_types, edge_types):
-    node_iterator = multidigraph_node_data_iterator(graph)
-    encode_categorically(node_iterator, node_types, 'type', 'categorical_type')
-
-    edge_iterator = multidigraph_edge_data_iterator(graph)
-    encode_categorically(edge_iterator, edge_types, 'type', 'categorical_type')
-    return graph
-
-
-def create_input_graph(graph, features_field="features"):
-    input_graph = graph.copy()
-    augment_data_fields(multidigraph_data_iterator(input_graph),
-                        ("input", "categorical_type", "encoded_value"),
-                        features_field)
-    input_graph.graph[features_field] = np.array([0.0] * 5, dtype=np.float32)
-    return input_graph
-
-
-def create_target_graph(graph, features_field="features"):
-    target_graph = graph.copy()
-    target_graph = encode_solutions(target_graph, solution_field="solution", encoded_solution_field="encoded_solution",
-                                    encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]))
-    augment_data_fields(multidigraph_data_iterator(target_graph),
-                        ("encoded_solution",),
-                        features_field)
-    target_graph.graph[features_field] = np.array([0.0] * 5, dtype=np.float32)
-    return target_graph
-
-
-def augment_data_fields(graph_data_iterator, fields_to_augment, augmented_field):
-    """
-    Returns a graph with features built from augmenting data fields found in the graph
-    Args:
-        graph_data_iterator: iterator over the data for elements in a graph
-        fields_to_augment: the fields of the data dictionaries to augment together
-        augmented_field: the field in which to store the augmented fields
-    Returns:
-        None, updates the graph in-place
-    """
-
-    for data in graph_data_iterator:
-        data[augmented_field] = np.hstack([np.array(data[field], dtype=np.float32) for field in fields_to_augment])
-
-
-def encode_solutions(graph, solution_field="solution", encoded_solution_field="encoded_solution",
-                     encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])):
-    """
-    Determines the encoding to use for a solution category
-    Args:
-        graph: Graph to update
-        solution_field: The property in the graph that holds the value of the solution
-        encoded_solution_field: The property in the graph to use to hold the new solution value
-        encodings: An array, a row from which will be picked as the new solution based on using the current solution
-            as a row index
-
-    Returns: Graph with updated `encoded_solution_field`
-
-    """
-
-    for data in multidigraph_data_iterator(graph):
-        solution = data[solution_field]
-        data[encoded_solution_field] = encodings[solution]
-
-    return graph
-
-
-def encode_categorically(graph_data_iterator, all_categories, category_field, encoding_field):
-    """
-    Encodes the type found in graph data as an integer according to the index it is found in `all_types`
-    Args:
-        graph_data_iterator: An iterator of data in the graph (node data, edge data or combined node and edge data)
-        all_categories: The full list of categories to be encoded in this order
-        category_field: The data field containing the category to encode
-        encoding_field: The data field to use to store the encoding
-    Returns:
-        None, updates the graph in-place
-    """
-    for data in graph_data_iterator:
-        data[encoding_field] = all_categories.index(data[category_field])
+def encode_values(graph, categorical_attributes, continuous_attributes):
+    for node_data in multidigraph_node_data_iterator(graph):
+        typ = node_data['type']
+
+        if categorical_attributes is not None and typ in categorical_attributes.keys():
+            # Add the integer value of the category for each categorical attribute instance
+            category_values = categorical_attributes[typ]
+            node_data['encoded_value'] = category_values.index(node_data['value'])
+
+        elif continuous_attributes is not None and typ in continuous_attributes.keys():
+            min_val, max_val = continuous_attributes[typ]
+            node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)
+
+        else:
+            node_data['encoded_value'] = 0
+
+    for edge_data in multidigraph_edge_data_iterator(graph):
+        edge_data['encoded_value'] = 0
+
+    return graph
+
+
+def encode_types(graph, iterator_func, types):
+    """
+    Encodes the type found in graph data as an integer according to the index it is found in `all_types`
+    Args:
+        graph: The graph to encode
+        iterator_func: A function to create an iterator of data in the graph (node data, edge data or combined node and edge data)
+        types: The full list of types to be encoded in this order
+    Returns:
+        The graph, which is also updated in-place
+    """
+    iterator = iterator_func(graph)
+
+    for data in iterator:
+        data['categorical_type'] = types.index(data['type'])
+
+    return graph
+
+
+def create_input_graph(graph):
+    input_graph = graph.copy()
+
+    for data in multidigraph_data_iterator(input_graph):
+        if data["solution"] == 0:
+            preexists = 1
+        else:
+            preexists = 0
+
+        features = stack_features([preexists, data["categorical_type"], data["encoded_value"]])
+        data.clear()
+        data["features"] = features
+
+    input_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
+    return input_graph
+
+
+def create_target_graph(graph):
+    target_graph = graph.copy()
+    solution_one_hot_encoding = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=np.float32)
+
+    for data in multidigraph_data_iterator(target_graph):
+        features = solution_one_hot_encoding[data["solution"]]
+        data.clear()
+        data["features"] = features
+
+    target_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
+    return target_graph
+
+
+def stack_features(features):
+    """
+    Stacks features together into a single vector
+    Args:
+        features: iterable of features, features can be a single value or iterable
+    Returns:
+        Numpy array (vector) of stacked features
+    """
+    return np.hstack([np.array(feature, dtype=np.float32) for feature in features])
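
A minimal, hypothetical usage sketch of the refactored functions on a toy `networkx.MultiDiGraph`. The types, attribute range and `input`/`solution` flags are invented for illustration and are not taken from the codebase:

```python
import networkx as nx

from kglib.kgcn.pipeline.encode import (create_input_graph, create_target_graph,
                                        encode_types, encode_values)
from kglib.utils.graph.iterate import (multidigraph_edge_data_iterator,
                                       multidigraph_node_data_iterator)

# Toy graph with one entity node, one attribute node and one edge between them
graph = nx.MultiDiGraph()
graph.add_node(0, type='person', value='', input=1, solution=0)
graph.add_node(1, type='age', value=42, input=1, solution=0)
graph.add_edge(0, 1, type='has', value='', input=1, solution=0)

# 1. Encode attribute values ('age' treated as continuous, scaled into [0, 1])
graph = encode_values(graph, categorical_attributes=None, continuous_attributes={'age': (0, 100)})

# 2. Encode node and edge types as integer indices into the supplied type lists
graph = encode_types(graph, multidigraph_node_data_iterator, ['person', 'age'])
graph = encode_types(graph, multidigraph_edge_data_iterator, ['has'])

# 3. Build the graphs fed to the learner:
#    input features  = [preexists, categorical_type, encoded_value]
#    target features = one-hot encoding of `solution`
input_graph = create_input_graph(graph)
target_graph = create_target_graph(graph)
```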
18 changes: 9 additions & 9 deletions kglib/kgcn/pipeline/encode_test.py
@@ -21,28 +21,28 @@
 
 import numpy as np
 
-from kglib.kgcn.pipeline.encode import augment_data_fields
+from kglib.kgcn.pipeline.encode import stack_features
 
 
 class TestAugmentDataFields(unittest.TestCase):
 
     def test_numpy_fields_augmented_as_expected(self):
-        data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]))]
+        features = [np.array([0, 1, 0]), np.array([5])]
 
-        augment_data_fields(data, ('attr1', 'attr2'), 'features')
+        stacked = stack_features(features)
 
-        expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]), features=np.array([0, 1, 0, 5]))]
+        expected = np.array([0, 1, 0, 5])
 
-        np.testing.assert_equal(expected_data, data)
+        np.testing.assert_equal(expected, stacked)
 
     def test_augmenting_non_numpy_numeric(self):
-        data = [dict(attr1=np.array([0, 1, 0]), attr2=5)]
+        data = [np.array([0, 1, 0]), 5]
 
-        augment_data_fields(data, ('attr1', 'attr2'), 'features')
+        stacked = stack_features(data)
 
-        expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=5, features=np.array([0, 1, 0, 5]))]
+        expected = np.array([0, 1, 0, 5])
 
-        np.testing.assert_equal(expected_data, data)
+        np.testing.assert_equal(stacked, expected)
 
 
 if __name__ == "__main__":
26 changes: 5 additions & 21 deletions kglib/kgcn/pipeline/pipeline.py
@@ -24,7 +24,7 @@
 from kglib.kgcn.learn.learn import KGCNLearner
 from kglib.kgcn.models.attribute import ContinuousAttribute, CategoricalAttribute, BlankAttribute
 from kglib.kgcn.models.core import softmax, KGCN
-from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph
+from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph, encode_values
 from kglib.kgcn.pipeline.utils import apply_logits_to_graphs, duplicate_edges_in_reverse
 from kglib.kgcn.plot.plotting import plot_across_training, plot_predictions
 from kglib.utils.graph.iterate import multidigraph_node_data_iterator, multidigraph_data_iterator, \
@@ -51,29 +51,13 @@ def pipeline(graphs,
     ############################################################
 
     # Encode attribute values
-    for graph in graphs:
-        for node_data in multidigraph_node_data_iterator(graph):
-            typ = node_data['type']
-
-            if categorical_attributes is not None and typ in categorical_attributes.keys():
-                # Add the integer value of the category for each categorical attribute instance
-                category_values = categorical_attributes[typ]
-                node_data['encoded_value'] = category_values.index(node_data['value'])
-
-            elif continuous_attributes is not None and typ in continuous_attributes.keys():
-                min_val, max_val = continuous_attributes[typ]
-                node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)
-
-            else:
-                node_data['encoded_value'] = 0
-
-        for edge_data in multidigraph_edge_data_iterator(graph):
-            edge_data['encoded_value'] = 0
+    graphs = [encode_values(graph, categorical_attributes, continuous_attributes) for graph in graphs]
 
     indexed_graphs = [nx.convert_node_labels_to_integers(graph, label_attribute='concept') for graph in graphs]
     graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]
 
-    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]
+    graphs = [encode_types(graph, multidigraph_node_data_iterator, node_types) for graph in graphs]
+    graphs = [encode_types(graph, multidigraph_edge_data_iterator, edge_types) for graph in graphs]
 
     input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]
@@ -109,7 +93,7 @@ def pipeline(graphs,
                                  log_dir=output_dir)
 
     plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
-    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
+    plot_predictions(graphs[tr_ge_split:], test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
 
     logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])
