"""Shortest path based decomposition algorithm to create graph documents.
Inspired and adapted from Yanardag and Vishwanathan "Deep Graph Kernels" [2]_.
"""
# Copyright (c) 2016 Pinar Yanardag
# 2019 Paul Scherer
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Standard libraries
import sys
import os
import glob
import pickle
import itertools
import random
from collections import defaultdict
from time import time
# 3rd party
import numpy as np
import networkx as nx
# Internal
from .utils import get_files
# Random seeds from Yanardag et al.
random.seed(314124)
np.random.seed(2312312)
[docs]def load_graph(file_handle):
""" Load the gexf format file as a nx_graph and an adjacency matrix.
Parameters
----------
file_handle : str
path to gexf file
Returns
-------
graph : networkx graph
Networkx graph instance of input gexf file
adj_matrix : numpy ndarray
The adjacency matrix of the graph
"""
graph = nx.read_gexf(file_handle)
adj_matrix = nx.to_numpy_matrix(graph)
return graph, adj_matrix
[docs]def save_sp_doc(gexf_fh, gidx, coocurrence_corpus):
"""Saves a shortest path graph doc with the extension .spp
Parameters
----------
gexf_fh : str
sdf
gidx : int
int id of the gexf (note: deprecated and will
be removed in next version)
cooccurrence_corpus : list
list of lists containing cooccuring shortest paths
(ie those starting from the same starting node)
Returns
-------
None : None
Saves the induced shortest path patterns into a graph document for the graph specified
in the `gexf_fh`
"""
open_fname = gexf_fh + ".spp"
# if os.path.isfile(open_fname):
# return
with open(open_fname,'w') as fh:
for spp_neighbourhood in coocurrence_corpus:
sentence = str.join(" ", map(str, spp_neighbourhood))
print (sentence, file=fh)
[docs]def sp_corpus(corpus_dir):
"""Function which induces the shortest path patterns over the graphs inside a given directory.
The main use case for this script is for the user to supply a path to the directory
containing the .gexf graph files of a dataset. The decomposition function will produce
a .spp file for each .gexf file which contains the shortest path patterns of a graph
given the vocabulary of patterns across the dataset.
Parameters
----------
corpus_dir : str
path to directory containing graph files of dataset in .gexf format
Returns
-------
corpus : list of lists of str
a list of lists, with each inner list containing all the shortest path patterns in one graph of the dataset
vocabulary : list
a set of the unique shortest path pattern ids
prob_map : dict
a map {gidx: {path: normalized_prob}} of normalized probabilities of a shortest path pattern appearing in a graph based on counts made in generation
num_graphs : int
the number of graphs in the dataset
graph_map : dict
a map {gidx: {path: count}} of the number of times a certain shortest path pattern appeared in a graph for each graph gidx in the dataset
"""
graph_files = get_files(corpus_dir, extension=".gexf")
vocabulary = set()
count_map = {}
graph_map = {}
corpus = []
for gexf_fh in graph_files:
open_fname = gexf_fh + ".spp"
if os.path.exists(open_fname):
continue
gidx = int((os.path.basename(gexf_fh)).replace(".gexf", ""))
graph, am = load_graph(gexf_fh)
count_map[gidx] = {}
label_map = [graph.nodes[nidx]["Label"] for nidx in sorted(list(graph.nodes()))]
coocurrence_corpus = []
G = graph
all_shortest_paths = nx.all_pairs_shortest_path(G)
tmp_corpus = []
# For each source and all the path endpoints add the label
# We (Yanardag and subsequently Burnaev) consider paths to
# "cooccur" if they share the same source node
for source, sink_map in list(all_shortest_paths):
paths_cooccurrence = []
for sink, path in sink_map.items():
sp_length=len(path)-1
label = "_".join(map(str, sorted([label_map[int(source)-1], label_map[int(sink)-1]]))) + "_" + str(sp_length)
tmp_corpus.append(label)
paths_cooccurrence.append(label)
count_map[gidx][label] = count_map[gidx].get(label,0) + 1
vocabulary.add(label)
coocurrence_corpus.append(paths_cooccurrence)
# Record and save the information
save_sp_doc(gexf_fh, gidx, coocurrence_corpus)
corpus.append(tmp_corpus)
graph_map = count_map
# When we are using a straight up MLE kernel we use the graph_map with the
# counts instead
prob_map = {gidx: {path: count/float(sum(paths.values())) \
for path, count in paths.items()} for gidx, paths in count_map.items()}
num_graphs = len(count_map)
return corpus, vocabulary, prob_map, num_graphs, graph_map
if __name__ == '__main__':
corpus_dir = "/home/morio/workspace/geo2dr/geometric2dr/data/dortmund_gexf/MUTAG/"
corpus, vocabulary, prob_map, num_graphs, graph_map = sp_corpus(corpus_dir)