Merge pull request #44 from jajupmochi/v0.2.x
V0.2.x
jajupmochi authored Jan 18, 2021
2 parents 44bdcd6 + b72fb5d commit 830f96a
Showing 25 changed files with 6,784 additions and 755 deletions.
15 changes: 14 additions & 1 deletion README.md
@@ -139,7 +139,20 @@ Fork the library and open a pull request! Make your own contribution to the community!

## Citation

Still waiting...
If you have used `graphkit-learn` in your publication, please cite the following paper:
```
@article{JIA2021,
title = "graphkit-learn: A Python Library for Graph Kernels Based on Linear Patterns",
journal = "Pattern Recognition Letters",
year = "2021",
issn = "0167-8655",
doi = "https://doi.org/10.1016/j.patrec.2021.01.003",
url = "http://www.sciencedirect.com/science/article/pii/S0167865521000131",
author = "Linlin Jia and Benoit Gaüzère and Paul Honeine",
keywords = "Graph Kernels, Linear Patterns, Python Implementation",
abstract = "This paper presents graphkit-learn, the first Python library for efficient computation of graph kernels based on linear patterns, able to address various types of graphs. Graph kernels based on linear patterns are thoroughly implemented, each with specific computing methods, as well as two well-known graph kernels based on non-linear patterns for comparative analysis. Since computational complexity is an Achilles’ heel of graph kernels, we provide several strategies to address this critical issue, including parallelization, the trie data structure, and the FCSP method that we extend to other kernels and edge comparison. All proposed strategies save orders of magnitudes of computing time and memory usage. Moreover, all the graph kernels can be simply computed with a single Python statement, thus are appealing to researchers and practitioners. For the convenience of use, an advanced model selection procedure is provided for both regression and classification problems. Experiments on synthesized datasets and 11 real-world benchmark datasets show the relevance of the proposed library."
}
```

## Acknowledgments

6 changes: 4 additions & 2 deletions gklearn/dataset/dataset.py
@@ -14,7 +14,7 @@
class Dataset(object):


def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', clean_labels=True, reload=False, verbose=False, **kwargs):
def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=None, mode='networkx', remove_null_graphs=True, clean_labels=True, reload=False, verbose=False, **kwargs):
self._substructures = None
self._node_label_dim = None
self._edge_label_dim = None
@@ -82,6 +82,8 @@ def __init__(self, inputs=None, root='datasets', filename_targets=None, targets=
else:
raise TypeError('The "inputs" argument cannot be recognized. "Inputs" can be a list of graphs, a predefined dataset name, or a file name of a dataset.')

if remove_null_graphs:
self.trim_dataset(edge_required=False)


def load_dataset(self, filename, filename_targets=None, clean_labels=True, **kwargs):
@@ -537,7 +539,7 @@ def cut_graphs(self, range_):


def trim_dataset(self, edge_required=False):
if edge_required:
if edge_required: # @todo: there is a possibility that some node labels will be removed.
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if (nx.number_of_nodes(g) != 0 and nx.number_of_edges(g) != 0)]
else:
trimed_pairs = [(idx, g) for idx, g in enumerate(self._graphs) if nx.number_of_nodes(g) != 0]
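
A minimal usage sketch of the new flag (not part of this commit; it assumes 'MUTAG' resolves as a predefined dataset name and that `Dataset` is re-exported by `gklearn.dataset`):
```
from gklearn.dataset import Dataset

# With remove_null_graphs=True (the new default), graphs with zero nodes
# are dropped right after loading.
dataset = Dataset('MUTAG', root='datasets', remove_null_graphs=True)

# The equivalent manual call; edge_required=True would additionally drop
# graphs that have no edges.
dataset.trim_dataset(edge_required=False)
```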
3 changes: 2 additions & 1 deletion gklearn/dataset/file_managers.py
@@ -332,7 +332,8 @@ def get_class_label_map(label_map_strings):
content_targets = ga.read().splitlines() # targets (regression)
targets = [int(i) for i in content_targets]
else:
raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.')
exp_msg = 'Cannot find the targets file. Please make sure there is a "' + ds_name + '_graph_labels.txt" or a "' + ds_name + '_graph_attributes.txt" file in your dataset folder.'
raise Exception(exp_msg)
if class_label_map is not None:
targets = [class_label_map[t] for t in targets]

Expand Down
11 changes: 11 additions & 0 deletions gklearn/experiments/__init__.py
@@ -0,0 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 15 18:22:34 2020
@author: ljia
"""

import os
EXP_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/'
DATASET_ROOT = os.path.dirname(os.path.realpath(__file__)) + '/datasets/'
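
A small, hypothetical usage of these constants (nothing below is in the diff; `os.path.join` tolerates the trailing '/' the constants carry):
```
import os
from gklearn.experiments import EXP_ROOT, DATASET_ROOT

# e.g. resolve a dataset folder and an output path under the experiments tree
mutag_dir = os.path.join(DATASET_ROOT, 'MUTAG')
out_dir = os.path.join(EXP_ROOT, 'outputs')
```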
@@ -4,7 +4,7 @@
Created on Wed Oct 20 11:48:02 2020
@author: ljia
"""
"""
# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1].

import os
@@ -13,15 +13,15 @@
import logging
from gklearn.ged.util import compute_geds
import time
from utils import get_dataset
from utils import get_dataset, set_edit_cost_consts
import sys
from group_results import group_trials
from group_results import group_trials, check_group_existence, update_group_marker


def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):

save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)

# Return if the file exists.
if os.path.isfile(save_dir + 'ged_matrix' + save_file_suffix + '.pkl'):
return None, None
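
For concreteness, the file-name suffix assembled above expands like this (values chosen purely for illustration):
```
# A worked example of save_file_suffix:
suffix = '.' + 'MUTAG' + '.num_sols_' + str(20) + '.ratio_' + "{:.2f}".format(0.5) + '.trial_' + str(3)
assert suffix == '.MUTAG.num_sols_20.ratio_0.50.trial_3'
```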
@@ -41,8 +41,11 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
'threads': multiprocessing.cpu_count(),
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
}

edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]

edit_cost_constants = set_edit_cost_consts(ratio,
node_labeled=len(dataset.node_labels),
edge_labeled=len(dataset.edge_labels),
mode='uniform')
# edit_cost_constants = [item * 0.01 for item in edit_cost_constants]
# pickle.dump(edit_cost_constants, open(save_dir + "edit_costs" + save_file_suffix + ".pkl", "wb"))
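
For context, `set_edit_cost_consts` lives in the local `utils` module and its body is not shown in this diff; below is a sketch of what 'uniform' mode plausibly computes, based only on the inline expression it replaces (everything here is an assumption, not the real implementation):
```
def set_edit_cost_consts(ratio, node_labeled=True, edge_labeled=True, mode='uniform'):
    # Assumed reconstruction: the replaced inline version was
    #   [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
    # i.e. node insertion/deletion/substitution costs scaled by `ratio`,
    # edge costs fixed at 1. The *_labeled arguments presumably adjust
    # costs for unlabeled graphs; they are ignored in this sketch.
    if mode == 'uniform':
        return [1 * ratio] * 3 + [1] * 3
    raise ValueError('unknown mode: %s' % mode)
```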

@@ -53,7 +56,7 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
options['node_attrs'] = dataset.node_attrs
options['edge_attrs'] = dataset.edge_attrs
parallel = True # if num_solutions == 1 else False

"""**5. Compute GED matrix.**"""
ged_mat = 'error'
runtime = 0
@@ -67,76 +70,86 @@ def xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial):
logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
logging.exception(save_file_suffix)
print(repr(exp))

"""**6. Get results.**"""

with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(ged_mat, f)
with open(save_dir + 'runtime' + save_file_suffix + '.pkl', 'wb') as f:
pickle.dump(runtime, f)

return ged_mat, runtime


def save_trials_as_group(dataset, ds_name, num_solutions, ratio):
# Return if the group file exists.
name_middle = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.'
name_group = save_dir + 'groups/ged_mats' + name_middle + 'npy'
if os.path.isfile(name_group):
if check_group_existence(name_group):
return

ged_mats = []
runtimes = []
for trial in range(1, 101):
num_trials = 100
for trial in range(1, num_trials + 1):
print()
print('Trial:', trial)
ged_mat, runtime = xp_compute_ged_matrix(dataset, ds_name, num_solutions, ratio, trial)
ged_mats.append(ged_mat)
runtimes.append(runtime)

# Group trials and remove single files.
# @todo: if the program stops between the following lines, then there may be errors.
name_prefix = 'ged_matrix' + name_middle
group_trials(save_dir, name_prefix, True, True, False)
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
name_prefix = 'runtime' + name_middle
group_trials(save_dir, name_prefix, True, True, False)
group_trials(save_dir, name_prefix, True, True, False, num_trials=num_trials)
update_group_marker(name_group)
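
`check_group_existence` and `update_group_marker` come from `group_results` and their bodies are not in this diff; addressing the @todo above, one plausible marker-file scheme looks like this (the '.marker' suffix and all details are assumptions):
```
import os

def check_group_existence(name_group):
    # Only trust the group file if a sidecar marker says grouping finished;
    # an interrupted run then recomputes instead of reusing a half-written group.
    return os.path.isfile(name_group) and os.path.isfile(name_group + '.marker')

def update_group_marker(name_group):
    # Written last, after trials are grouped and single-trial files removed.
    with open(name_group + '.marker', 'w') as f:
        f.write('done')
```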


def results_for_a_dataset(ds_name):
"""**1. Get dataset.**"""
dataset = get_dataset(ds_name)
for num_solutions in num_solutions_list:

for ratio in ratio_list:
print()
print('# of solutions:', num_solutions)
for ratio in ratio_list:
print('Ratio:', ratio)
for num_solutions in num_solutions_list:
print()
print('Ratio:', ratio)
print('# of solutions:', num_solutions)
save_trials_as_group(dataset, ds_name, num_solutions, ratio)


def get_param_lists(ds_name):


def get_param_lists(ds_name, test=False):
if test:
num_solutions_list = [1, 10, 20, 30, 40, 50]
ratio_list = [10]
return num_solutions_list, ratio_list

if ds_name == 'AIDS_symb':
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
else:
num_solutions_list = [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9]
num_solutions_list = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # [1, 20, 40, 60, 80, 100]
ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 3, 5, 7, 9, 10][::-1]

return num_solutions_list, ratio_list


if __name__ == '__main__':
if len(sys.argv) > 1:
ds_name_list = sys.argv[1:]
else:
ds_name_list = ['MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
ds_name_list = ['Acyclic', 'Alkane_unlabeled', 'MAO_lite', 'Monoterpenoides', 'MUTAG']
# ds_name_list = ['Acyclic'] # 'Alkane_unlabeled']
# ds_name_list = ['Acyclic', 'MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb']

save_dir = 'outputs/edit_costs.real_data.num_sols.ratios.IPFP/'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_dir + 'groups/', exist_ok=True)

for ds_name in ds_name_list:
print()
print('Dataset:', ds_name)
num_solutions_list, ratio_list = get_param_lists(ds_name)
num_solutions_list, ratio_list = get_param_lists(ds_name, test=False)
results_for_a_dataset(ds_name)