Skip to content
This repository has been archived by the owner on Nov 18, 2023. It is now read-only.

Refactor Graph Encoding #109

Merged
merged 1 commit into from
Dec 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions kglib/kgcn/examples/diagnosis/diagnosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,12 @@ def create_concept_graphs(example_indices, grakn_session):
# Existing elements in the graph are those that pre-exist in the graph, and should be predicted to continue to exist
PREEXISTS = dict(input=1, solution=0)

# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
TO_INFER = dict(input=0, solution=2)

# Candidates are neither present in the input nor in the solution, they are negative examples
CANDIDATE = dict(input=0, solution=1)

# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive examples
TO_INFER = dict(input=0, solution=2)


class QueryHandler:
"""
Expand Down
115 changes: 58 additions & 57 deletions kglib/kgcn/pipeline/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,86 +23,87 @@
multidigraph_edge_data_iterator


def encode_types(graph, node_types, edge_types):
    """
    Categorically encodes the `type` of every node and every edge in the graph

    Args:
        graph: The graph whose node and edge data are encoded
        node_types: The full list of node types, passed to `encode_categorically` as the node categories
        edge_types: The full list of edge types, passed to `encode_categorically` as the edge categories

    Returns:
        The same graph object that was passed in

    """
    # Nodes are handled first, then edges; each kind is encoded against its own list of types
    encoding_passes = (
        (multidigraph_node_data_iterator, node_types),
        (multidigraph_edge_data_iterator, edge_types),
    )
    for make_iterator, all_types in encoding_passes:
        encode_categorically(make_iterator(graph), all_types, 'type', 'categorical_type')

    return graph
def encode_values(graph, categorical_attributes, continuous_attributes):
for node_data in multidigraph_node_data_iterator(graph):
typ = node_data['type']

if categorical_attributes is not None and typ in categorical_attributes.keys():
# Add the integer value of the category for each categorical attribute instance
category_values = categorical_attributes[typ]
node_data['encoded_value'] = category_values.index(node_data['value'])

def create_input_graph(graph, features_field="features"):
    """
    Builds the input graph from a copy of the given graph

    Args:
        graph: The graph to copy; the copy is what gets modified and returned
        features_field: The data field in which the augmented features are stored, both for the graph's
            elements and for the graph-level attributes

    Returns:
        The copied graph with `features_field` populated

    """
    input_graph = graph.copy()

    # The fields handed to `augment_data_fields`, in this order, for every element of the copy
    input_fields = ("input", "categorical_type", "encoded_value")
    augment_data_fields(multidigraph_data_iterator(input_graph), input_fields, features_field)

    # Graph-level features: five float32 zeros
    input_graph.graph[features_field] = np.zeros(5, dtype=np.float32)
    return input_graph
elif continuous_attributes is not None and typ in continuous_attributes.keys():
min_val, max_val = continuous_attributes[typ]
node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)

else:
node_data['encoded_value'] = 0
for edge_data in multidigraph_edge_data_iterator(graph):
edge_data['encoded_value'] = 0

def create_target_graph(graph, features_field="features"):
    """
    Builds the target graph from a copy of the given graph

    Args:
        graph: The graph to copy; the copy is what gets modified and returned
        features_field: The data field in which the augmented features are stored, both for the graph's
            elements and for the graph-level attributes

    Returns:
        The graph returned by `encode_solutions` for the copy, with `features_field` populated

    """
    # One row per solution category; passed to `encode_solutions` as its `encodings`
    solution_encodings = np.array([[1., 0., 0.],
                                   [0., 1., 0.],
                                   [0., 0., 1.]])

    target_graph = encode_solutions(graph.copy(),
                                    solution_field="solution",
                                    encoded_solution_field="encoded_solution",
                                    encodings=solution_encodings)

    # Only the encoded solution contributes to the target features
    augment_data_fields(multidigraph_data_iterator(target_graph), ("encoded_solution",), features_field)

    # Graph-level features: five float32 zeros
    target_graph.graph[features_field] = np.zeros(5, dtype=np.float32)
    return target_graph
return graph


def augment_data_fields(graph_data_iterator, fields_to_augment, augmented_field):
def encode_types(graph, iterator_func, types):
"""
Returns a graph with features built from augmenting data fields found in the graph

Encodes the type found in graph data as an integer according to the index at which it is found in `types`
Args:
graph_data_iterator: iterator over the data for elements in a graph
fields_to_augment: the fields of the data dictionaries to augment together
augmented_field: the field in which to store the augmented fields

graph: The graph to encode
iterator_func: A function that creates an iterator of data in the graph (node data, edge data or combined node and edge data)
types: The full list of types to be encoded in this order
Returns:
None, updates the graph in-place
The graph, which is also updated in-place

"""
iterator = iterator_func(graph)

for data in graph_data_iterator:
data[augmented_field] = np.hstack([np.array(data[field], dtype=np.float32) for field in fields_to_augment])
for data in iterator:
data['categorical_type'] = types.index(data['type'])

return graph

def encode_solutions(graph, solution_field="solution", encoded_solution_field="encoded_solution",
encodings=np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])):
"""
Determines the encoding to use for a solution category
Args:
graph: Graph to update
solution_field: The property in the graph that holds the value of the solution
encoded_solution_field: The property in the graph to use to hold the new solution value
encodings: An array, a row from which will be picked as the new solution based on using the current solution
as a row index

Returns: Graph with updated `encoded_solution_field`
def create_input_graph(graph):
input_graph = graph.copy()

"""
for data in multidigraph_data_iterator(input_graph):
if data["solution"] == 0:
preexists = 1
else:
preexists = 0

for data in multidigraph_data_iterator(graph):
solution = data[solution_field]
data[encoded_solution_field] = encodings[solution]
features = stack_features([preexists, data["categorical_type"], data["encoded_value"]])
data.clear()
data["features"] = features

return graph
input_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
return input_graph


def create_target_graph(graph):
target_graph = graph.copy()
solution_one_hot_encoding = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=np.float32)

for data in multidigraph_data_iterator(target_graph):
features = solution_one_hot_encoding[data["solution"]]
data.clear()
data["features"] = features

def encode_categorically(graph_data_iterator, all_categories, category_field, encoding_field):
target_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32)
return target_graph


def stack_features(features):
"""
Encodes the type found in graph data as an integer according to the index it is found in `all_types`
Stacks features together into a single vector

Args:
graph_data_iterator: An iterator of data in the graph (node data, edge data or combined node and edge data)
all_categories: The full list of categories to be encoded in this order
category_field: The data field containing the category to encode
encoding_field: The data field to use to store the encoding
features: iterable of features; each feature can be a single value or an iterable of values

Returns:
Numpy array (vector) of stacked features

"""
for data in graph_data_iterator:
data[encoding_field] = all_categories.index(data[category_field])

return np.hstack([np.array(feature, dtype=np.float32) for feature in features])
18 changes: 9 additions & 9 deletions kglib/kgcn/pipeline/encode_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,28 @@

import numpy as np

from kglib.kgcn.pipeline.encode import augment_data_fields
from kglib.kgcn.pipeline.encode import stack_features


class TestAugmentDataFields(unittest.TestCase):

def test_numpy_fields_augmented_as_expected(self):
data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]))]
features = [np.array([0, 1, 0]), np.array([5])]

augment_data_fields(data, ('attr1', 'attr2'), 'features')
stacked = stack_features(features)

expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=np.array([5]), features=np.array([0, 1, 0, 5]))]
expected = np.array([0, 1, 0, 5])

np.testing.assert_equal(expected_data, data)
np.testing.assert_equal(expected, stacked)

def test_augmenting_non_numpy_numeric(self):
data = [dict(attr1=np.array([0, 1, 0]), attr2=5)]
data = [np.array([0, 1, 0]), 5]

augment_data_fields(data, ('attr1', 'attr2'), 'features')
stacked = stack_features(data)

expected_data = [dict(attr1=np.array([0, 1, 0]), attr2=5, features=np.array([0, 1, 0, 5]))]
expected = np.array([0, 1, 0, 5])

np.testing.assert_equal(expected_data, data)
np.testing.assert_equal(stacked, expected)


if __name__ == "__main__":
Expand Down
26 changes: 5 additions & 21 deletions kglib/kgcn/pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from kglib.kgcn.learn.learn import KGCNLearner
from kglib.kgcn.models.attribute import ContinuousAttribute, CategoricalAttribute, BlankAttribute
from kglib.kgcn.models.core import softmax, KGCN
from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph
from kglib.kgcn.pipeline.encode import encode_types, create_input_graph, create_target_graph, encode_values
from kglib.kgcn.pipeline.utils import apply_logits_to_graphs, duplicate_edges_in_reverse
from kglib.kgcn.plot.plotting import plot_across_training, plot_predictions
from kglib.utils.graph.iterate import multidigraph_node_data_iterator, multidigraph_data_iterator, \
Expand All @@ -51,29 +51,13 @@ def pipeline(graphs,
############################################################

# Encode attribute values
for graph in graphs:
for node_data in multidigraph_node_data_iterator(graph):
typ = node_data['type']

if categorical_attributes is not None and typ in categorical_attributes.keys():
# Add the integer value of the category for each categorical attribute instance
category_values = categorical_attributes[typ]
node_data['encoded_value'] = category_values.index(node_data['value'])

elif continuous_attributes is not None and typ in continuous_attributes.keys():
min_val, max_val = continuous_attributes[typ]
node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)

else:
node_data['encoded_value'] = 0

for edge_data in multidigraph_edge_data_iterator(graph):
edge_data['encoded_value'] = 0
graphs = [encode_values(graph, categorical_attributes, continuous_attributes) for graph in graphs]

indexed_graphs = [nx.convert_node_labels_to_integers(graph, label_attribute='concept') for graph in graphs]
graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]
graphs = [encode_types(graph, multidigraph_node_data_iterator, node_types) for graph in graphs]
graphs = [encode_types(graph, multidigraph_edge_data_iterator, edge_types) for graph in graphs]

input_graphs = [create_input_graph(graph) for graph in graphs]
target_graphs = [create_target_graph(graph) for graph in graphs]
Expand Down Expand Up @@ -109,7 +93,7 @@ def pipeline(graphs,
log_dir=output_dir)

plot_across_training(*tr_info, output_file=f'{output_dir}learning.png')
plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')
plot_predictions(graphs[tr_ge_split:], test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png')

logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

Expand Down