Merge pull request #44 from jajupmochi/v0.2.x
V0.2.x
jajupmochi authored Jan 18, 2021
2 parents 44bdcd6 + b72fb5d commit 830f96a
Showing 25 changed files with 6,784 additions and 755 deletions.
15 changes: 14 additions & 1 deletion README.md
@@ -139,7 +139,20 @@ Fork the library and open a pull request! Make your own contribution to the community!

## Citation

Still waiting...
If you have used `graphkit-learn` in your publication, please cite the following paper:
```
@article{JIA2021,
title = "graphkit-learn: A Python Library for Graph Kernels Based on Linear Patterns",
journal = "Pattern Recognition Letters",
year = "2021",
issn = "0167-8655",
doi = "https://doi.org/10.1016/j.patrec.2021.01.003",
url = "http://www.sciencedirect.com/science/article/pii/S0167865521000131",
author = "Linlin Jia and Benoit Gaüzère and Paul Honeine",
keywords = "Graph Kernels, Linear Patterns, Python Implementation",
abstract = "This paper presents graphkit-learn, the first Python library for efficient computation of graph kernels based on linear patterns, able to address various types of graphs. Graph kernels based on linear patterns are thoroughly implemented, each with specific computing methods, as well as two well-known graph kernels based on non-linear patterns for comparative analysis. Since computational complexity is an Achilles’ heel of graph kernels, we provide several strategies to address this critical issue, including parallelization, the trie data structure, and the FCSP method that we extend to other kernels and edge comparison. All proposed strategies save orders of magnitudes of computing time and memory usage. Moreover, all the graph kernels can be simply computed with a single Python statement, thus are appealing to researchers and practitioners. For the convenience of use, an advanced model selection procedure is provided for both regression and classification problems. Experiments on synthesized datasets and 11 real-world benchmark datasets show the relevance of the proposed library."
}
```

## Acknowledgments

6 changes: 4 additions & 2 deletions gklearn/dataset/dataset.py
@@ -14,7 +14,7 @@
class Dataset(object):


def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs):
def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', remove_null_graphs=True, clean_labels=True, reload=False, verbose=False, **kwargs):
self._substructures = None
self._node_label_dim = None
self._edge_label_dim = None
@@ -82,6 +82,8 @@ def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=
else:
raise TypeError('The "inputs" argument cannot be recognized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.')

if remove_null_graphs:
self.trim_dataset(edge_required=False)


def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs):
@@ -537,7 +539,7 @@ def cut_graphs(self, range_):


def trim_dataset(self, edge_required=False):
if edge_required:
if edge_required: # @todo: there is a possibility that some node labels will be removed.
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
else:
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
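
A minimal usage sketch of the new flag (not part of this commit; it assumes 'MUTAG' resolves as a predefined dataset name and that `Dataset` is re-exported by `gklearn.dataset`):
```
from gklearn.dataset import Dataset

# With remove_null_graphs=True (the new default), graphs with zero nodes
# are dropped right after loading.
dataset = Dataset('MUTAG', root='datasets', remove_null_graphs=True)

# The equivalent manual call; edge_required=True would additionally drop
# graphs that have no edges.
dataset.trim_dataset(edge_required=False)
```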
3 changes: 2 additions & 1 deletion gklearn/dataset/file_managers.py
@@ -332,7 +332,8 @@ def get_class_label_map(label_map_strings):
content_targets = ga.read().splitlines() # targets (regression)
targets = [int(i) for i in content_targets]
else:
raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.')
exp_msg = 'Cannot find the targets file. Please make sure there is a "' + ds_name + '_graph_labels.txt" or a "' + ds_name + '_graph_attributes.txt" file in your dataset folder.'
raise Exception(exp_msg)
if class_label_map is not None:
targets = [class_label_map[t] for t in targets]

Expand Down
11 changes: 11 additions & 0 deletions gklearn/experiments/__init__.py
@@ -0,0 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 15 18:22:34 2020
@author: ljia
"""

import os
EXP_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/'
DATASET_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/datasets/'
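
A small, hypothetical usage of these constants (nothing below is in the diff; `os.path.join` tolerates the trailing '/' the constants carry):
```
import os
from gklearn.experiments import EXP_ROOT, DATASET_ROOT

# e.g. resolve a dataset folder and an output path under the experiments tree
mutag_dir = os.path.join(DATASET_ROOT, 'MUTAG')
out_dir = os.path.join(EXP_ROOT, 'outputs')
```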
@@ -4,7 +4,7 @@
Created on Wed Oct 20 11:48:02 2020
@author: ljia
"""
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1].

import os
@@ -13,15 +13,15 @@
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset
from utils import get_dataset, set_edit_cost_consts
import sys
from group_results import group_trials
from group_results import group_trials, check_group_existence, update_group_marker


def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):

save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

# Return if the file exists.
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
return None, None
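
For concreteness, the file-name suffix assembled above expands like this (values chosen purely for illustration):
```
# A worked example of save_file_suffix:
suffix = '.' + 'MUTAG' + '.num_sols_' + str(20) + '.ratio_' + "{:.2f}".format(0.5) + '.trial_' + str(3)
assert suffix == '.MUTAG.num_sols_20.ratio_0.50.trial_3'
```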
@@ -41,8 +41,11 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
'threads': multiprocessing.cpu_count(),
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}

edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]

edit_cost_constants = set_edit_cost_consts(ratio,
node_labeled=len(dataset.node_labels),
edge_labeled=len(dataset.edge_labels),
mode='uniform')
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))
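
For context, `set_edit_cost_consts` lives in the local `utils` module and its body is not shown in this diff; below is a sketch of what 'uniform' mode plausibly computes, based only on the inline expression it replaces (everything here is an assumption, not the real implementation):
```
def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='uniform'):
    # Assumed reconstruction: the replaced inline version was
    #   [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    # i.e. node insertion/deletion/substitution costs scaled by `ratio`,
    # edge costs fixed at 1. The *_labeled arguments presumably adjust
    # costs for unlabeled graphs; they are ignored in this sketch.
    if mode == 'uniform':
        return [1 * ratio] * 3 + [1] * 3
    raise ValueError('unknown mode: %s' % mode)
```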

@@ -53,7 +56,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
options['node_attrs'] = dataset.node_attrs
options['edge_attrs'] = dataset.edge_attrs
parallel = True # if num_solutions == 1 else False

"""**5. Compute GED matrix.**"""
ged_mat = 'error'
runtime = 0
@@ -67,76 +70,86 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception(save_file_suffix)
print(repr(exp))

"""**6. Get results.**"""

with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(ged_mat, f)
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(runtime, f)

return ged_mat, runtime


def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
# Return if the group file exists.
name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
if os.path.isfile(name_group):
if check_group_existence(name_group):
return

ged_mats = []
runtimes = []
for trial in range(1, 101):
num_trials = 100
for trial in range(1, num_trials + 1):
print()
print('Trial:', trial)
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
ged_mats.append(ged_mat)
runtimes.append(runtime)

# Group trials and remove single files.
# @todo: if the program stops between the following lines, then there may be errors.
name_prefix = 'ged_matrix' + name_middle
group_trials(save_dir, name_prefix, True, True, False)
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
name_prefix = 'runtime' + name_middle
group_trials(save_dir, name_prefix, True, True, False)
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
update_group_marker(name_group)
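
`check_group_existence` and `update_group_marker` come from `group_results` and their bodies are not in this diff; addressing the @todo above, one plausible marker-file scheme looks like this (the '.marker' suffix and all details are assumptions):
```
import os

def check_group_existence(name_group):
    # Only trust the group file if a sidecar marker says grouping finished;
    # an interrupted run then recomputes instead of reusing a half-written group.
    return os.path.isfile(name_group) and os.path.isfile(name_group + '.marker')

def update_group_marker(name_group):
    # Written last, after trials are grouped and single-trial files removed.
    with open(name_group + '.marker', 'w') as f:
        f.write('done')
```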


def results_for_a_dataset(ds_name):
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)
for num_solutions in num_solutions_list:

for ratio in ratio_list:
print()
print('# of solutions:', num_solutions)
for ratio in ratio_list:
print('Ratio:', ratio)
for num_solutions in num_solutions_list:
print()
print('Ratio:', ratio)
print('# of solutions:', num_solutions)
save_trials_as_group(dataset, ds_name, num_solutions, ratio)


def get_param_lists(ds_name):


def get_param_lists(ds_name, test=False):
if test:
num_solutions_list = [1, 10, 20, 30, 40, 50]
ratio_list = [10]
return num_solutions_list, ratio_list

if ds_name == 'AIDS_symb':
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
else:
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1]

return num_solutions_list, ratio_list


if __name__ == '__main__':
if len(sys.argv) > 1:
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled']
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)

for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
num_solutions_list, ratio_list = get_param_lists(ds_name)
num_solutions_list, ratio_list = get_param_lists(ds_name, test=False)
results_for_a_dataset(ds_name)