Source code for geometric2dr.decomposition.shortest_path_patterns

"""Shortest path based decomposition algorithm to create graph documents. 
Inspired and adapted from Yanardag and Vishwanathan "Deep Graph Kernels" [2]_.

"""

# Copyright (c) 2016 Pinar Yanardag
#               2019 Paul Scherer

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Standard libraries
import sys
import os
import glob
import pickle
import itertools
import random
from collections import defaultdict
from time import time

# 3rd party
import numpy as np
import networkx as nx

# Internal
from .utils import get_files

# Random seeds from Yanardag et al.
random.seed(314124)
np.random.seed(2312312)

[docs]def load_graph(file_handle):
	""" Load the gexf format file as a nx_graph and an adjacency matrix.

	Parameters
	----------
	file_handle : str
	    path to gexf file

	Returns
	-------
	graph : networkx graph
	    Networkx graph instance of input gexf file
	adj_matrix : numpy ndarray
	    The adjacency matrix of the graph

	"""
	graph = nx.read_gexf(file_handle)
	adj_matrix = nx.to_numpy_matrix(graph)
	return graph, adj_matrix

[docs]def save_sp_doc(gexf_fh, gidx, coocurrence_corpus):
	"""Saves a shortest path graph doc with the extension .spp
	
	Parameters
	----------
	gexf_fh : str
		sdf
	gidx : int
		int id of the gexf (note: deprecated and will 
		be removed in next version)
	cooccurrence_corpus : list
		list of lists containing cooccuring shortest paths 
		(ie those starting from the same starting node)

	Returns
	-------
	None : None
		Saves the induced shortest path patterns into a graph document for the graph specified
		in the `gexf_fh` 

	"""

	open_fname = gexf_fh + ".spp"
	# if os.path.isfile(open_fname):
	# 	return
	with open(open_fname,'w') as fh:
		for spp_neighbourhood in coocurrence_corpus:
			sentence = str.join(" ", map(str, spp_neighbourhood))
			print (sentence, file=fh)

[docs]def sp_corpus(corpus_dir):
	"""Function which induces the shortest path patterns over the graphs inside a given directory.

	The main use case for this script is for the user to supply a path to the directory
	containing the .gexf graph files of a dataset. The decomposition function will produce
	a .spp file for each .gexf file which contains the shortest path patterns of a graph
	given the vocabulary of patterns across the dataset.

	Parameters
	----------
	corpus_dir : str
		path to directory containing graph files of dataset in .gexf format

	Returns
	-------
	corpus : list of lists of str
	    a list of lists, with each inner list containing all the shortest path patterns in one graph of the dataset
	vocabulary : list
	    a set of the unique shortest path pattern ids
	prob_map : dict
	    a map {gidx: {path: normalized_prob}} of normalized probabilities of a shortest path pattern appearing in a graph based on counts made in generation
	num_graphs : int
	    the number of graphs in the dataset
	graph_map : dict
	    a map {gidx: {path: count}} of the number of times a certain shortest path pattern appeared in a graph for each graph gidx in the dataset
	
	"""
	graph_files = get_files(corpus_dir, extension=".gexf")
	vocabulary = set() 
	count_map = {}
	graph_map = {}
	corpus = []

	for gexf_fh in graph_files:
		open_fname = gexf_fh + ".spp"
		if os.path.exists(open_fname):
			continue
		
		gidx = int((os.path.basename(gexf_fh)).replace(".gexf", ""))
		graph, am = load_graph(gexf_fh)

		count_map[gidx] = {}
		label_map = [graph.nodes[nidx]["Label"] for nidx in sorted(list(graph.nodes()))]
		coocurrence_corpus = []

		G = graph
		all_shortest_paths = nx.all_pairs_shortest_path(G)
		tmp_corpus = []
		
		# For each source and all the path endpoints add the label
		# We (Yanardag and subsequently Burnaev) consider paths to 
		# "cooccur" if they share the same source node
		for source, sink_map in list(all_shortest_paths):
			paths_cooccurrence = []
			for sink, path in sink_map.items():
				sp_length=len(path)-1
				label = "_".join(map(str, sorted([label_map[int(source)-1], label_map[int(sink)-1]]))) + "_" + str(sp_length)
				tmp_corpus.append(label)
				paths_cooccurrence.append(label)
				count_map[gidx][label] = count_map[gidx].get(label,0) + 1
				vocabulary.add(label)
			coocurrence_corpus.append(paths_cooccurrence)
		
		# Record and save the information
		save_sp_doc(gexf_fh, gidx, coocurrence_corpus)
		corpus.append(tmp_corpus)

	graph_map = count_map
	# When we are using a straight up MLE kernel we use the graph_map with the
	# counts instead
	prob_map = {gidx: {path: count/float(sum(paths.values())) \
		for path, count in paths.items()} for gidx, paths in count_map.items()}

	num_graphs = len(count_map)
	return corpus, vocabulary, prob_map, num_graphs, graph_map

if __name__ == '__main__':
	corpus_dir = "/home/morio/workspace/geo2dr/geometric2dr/data/dortmund_gexf/MUTAG/"
	corpus, vocabulary, prob_map, num_graphs, graph_map = sp_corpus(corpus_dir)