Source code for geometric2dr.decomposition.graphlet_patterns

"""Graphlet based graph decomposition algorithm to create graph documents. 
Inspired and adapted from Yanardag and Vishwanathan "Deep Graph Kernels" [2]_.

"""

# We adopt the original license and extend it here.
# Copyright (c) 2016 Pinar Yanardag
#               2019 Paul Scherer

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


# System libraries
import sys
import os
import glob
import pickle
import itertools
import random
from collections import defaultdict
from time import time

# 3rd party
import numpy as np
import networkx as nx
import pynauty # make sure to install this

# Internal
from .utils import get_files

# Random seeds from Yanardag et al.
random.seed(314124)
np.random.seed(2312312)


def load_graph(file_handle):
    """Load a gexf format file as a networkx graph and an adjacency matrix.

    Parameters
    ----------
    file_handle : str
        path to gexf file

    Returns
    -------
    graph : networkx graph
        Networkx graph instance of input gexf file
    adj_matrix : numpy ndarray
        The adjacency matrix of the graph

    """
    graph = nx.read_gexf(file_handle)
    # to_numpy_array returns the ndarray the docstring promises and survives
    # the removal of nx.to_numpy_matrix in NetworkX 3.0
    adj_matrix = nx.to_numpy_array(graph)
    return graph, adj_matrix
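
# A minimal usage sketch (the path and helper below are illustrative, not part
# of the original API; any .gexf file will do):
def _example_load_graph(gexf_path="dataset_gexf/1.gexf"):
    graph, adj_matrix = load_graph(gexf_path)
    # the adjacency matrix is square over the graph's nodes
    assert adj_matrix.shape == (graph.number_of_nodes(), graph.number_of_nodes())
    return graph, adj_matrix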
def get_maps(num_graphlets):
    """Load certificates created by the canonical representations of graphlets
    into a canonical maps dictionary, for quick isomorphism checks when we
    extract graphlets from the graphs in the user's dataset.

    Parameters
    ----------
    num_graphlets : int
        The number of nodes in the graphlet pattern, i.e. the size of a
        graphlet pattern

    Returns
    -------
    canonical_map : dict
        A dict of Nauty certificates of the canonical representations of
        graphlets of size `num_graphlets`.

    """
    data_path = os.path.join(os.path.dirname(__file__), "canonical_maps")
    with open(data_path + "/canonical_map_n%s.p" % (num_graphlets), "rb") as handle:
        # canonical_map : {canonical string id: {"graph", "idx", "n"}}
        canonical_map = pickle.load(handle, encoding="latin1")
    return canonical_map
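
# A sketch of how the certificates key the canonical map (assumes the bundled
# canonical_maps pickles are installed with the package; the helper is
# illustrative, not part of the original API):
def _example_lookup_graphlet_id():
    canonical_map = get_maps(3)
    triangle = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
    cert = get_graphlet(triangle, 3)  # get_graphlet is defined below
    # each entry carries the canonical graphlet, its integer id, and its size
    return canonical_map[str(cert, "latin1")]["idx"]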
def get_graphlet(window, num_graphlets):
    """Compute the Nauty certificate of the graphlet within a window of the
    adjacency matrix.

    This function takes the upper triangle of an nxn matrix and computes its
    hash certificate using Nauty. Given the parameters this usually involves
    computing the certificate of a graphlet of `num_graphlets` size. This is
    used for comparison with the bank of known certificates loaded in
    get_maps().

    Parameters
    ----------
    window : numpy ndarray
        submatrix inside the adjacency matrix of a graph
    num_graphlets : int
        the size of the graphlets to extract

    Returns
    -------
    cert : byte str
        certificate of the graphlet produced by finding its canonical
        representation with Nauty.

    """
    adj_mat = {idx: [i for i in list(np.where(edge)[0]) if i != idx]
               for idx, edge in enumerate(window)}
    g = pynauty.Graph(number_of_vertices=num_graphlets, directed=False,
                      adjacency_dict=adj_mat)
    cert = pynauty.certificate(g)
    return cert
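
# A minimal sketch showing that non-isomorphic windows yield different
# certificates while isomorphic ones collide (illustrative helper, not part
# of the original API):
def _example_certificates():
    triangle = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
    path = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]])
    relabelled = path[np.ix_([2, 0, 1], [2, 0, 1])]  # the same path, relabelled
    assert get_graphlet(triangle, 3) != get_graphlet(path, 3)
    assert get_graphlet(path, 3) == get_graphlet(relabelled, 3)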
def graphlet_corpus(corpus_dir, num_graphlets, samplesize):
    """Induce the graphlet patterns over the graphs inside a given directory.

    The main use case is for the user to input a path containing individual
    graphs of the dataset in gexf format. The decomposition algorithm will
    induce graphlet patterns for the graphs, recording the dataset/"global"
    vocabulary of patterns within a dictionary. Each graph and its associated
    patterns (by IDs given through our hash function) are saved into a
    <graphid>.gexf.graphlet_ng_<num_graphlets>_ss_<samplesize> file which
    contains a line-delimited list of all the substructure pattern ids.

    Parameters
    ----------
    corpus_dir : str
        path to directory containing graph files of dataset in .gexf format.
    num_graphlets : int
        the size of the graphlet patterns to be extracted
    samplesize : int
        the number of samples to take from a graph.

    Returns
    -------
    corpus : list of lists of str
        a list of lists, with each inner list containing all the
        samplesize*(1 + num_graphlets) patterns in one graph of the dataset
    vocabulary : set
        a set of the unique graphlet pattern ids
    prob_map : dict
        a map {gidx: {graphlet_idx: normalized_prob}} of normalized
        probabilities of a graphlet pattern appearing in a graph based on
        counts made in generation
    num_graphs : int
        the number of graphs in the dataset
    graph_map : dict
        a map {gidx: {graphlet_idx: count}} of the number of times a certain
        graphlet pattern appeared in a graph for each graph gidx in the
        dataset

    """
    fallback_map = {1: 1, 2: 2, 3: 4, 4: 8, 5: 19, 6: 53, 7: 209, 8: 1253, 9: 13599}
    canonical_map = get_maps(num_graphlets)
    vocabulary = set()
    corpus = []

    # randomly sample graphlets
    graph_map = {}
    graph_files = get_files(corpus_dir, extension=".gexf")
    for gexf_fh in graph_files:
        gidx = int((os.path.basename(gexf_fh)).replace(".gexf", ""))
        nx_graph, adj_mat = load_graph(gexf_fh)
        num_nodes = len(adj_mat)
        count_map = {}  # graphlet count map
        tmp_corpus = []  # temporary corpus of graphlet patterns for a single graph
        cooccurence_corpus = []  # corpus which preserves cooccurence as in Yanardag et al.

        # Only sample graphlets if the number of nodes in the graph is at
        # least the graphlet size
        if num_nodes >= num_graphlets:
            for _ in range(samplesize):
                r = np.random.permutation(range(num_nodes))
                for n in [num_graphlets]:
                    # "Main graphlet": choose a random set of num_graphlets
                    # nodes, find the graphlet of the desired size and add it
                    # to the count_map
                    window = adj_mat[np.ix_(r[0:n], r[0:n])]
                    g_type = canonical_map[str(get_graphlet(window, n), "latin1")]
                    graphlet_idx = g_type["idx"]
                    count_map[graphlet_idx] = count_map.get(graphlet_idx, 0) + 1
                    vocabulary.add(graphlet_idx)
                    tmp_corpus.append(graphlet_idx)
                    cooccurence_corpus.append([graphlet_idx])

                    # "Co-occurring graphlets"
                    for node in r[0:n]:
                        # select a non-overlapping window of n-1 nodes plus one
                        # of the nodes in the "main graphlet", to find a
                        # neighbouring graphlet
                        new_n_arr = r[n:][0:n - 1]
                        r2 = np.array(list(new_n_arr) + [node])
                        window2 = adj_mat[np.ix_(r2, r2)]
                        g_type2 = canonical_map[str(get_graphlet(window2, n), "latin1")]
                        graphlet_idx2 = g_type2["idx"]
                        count_map[graphlet_idx2] = count_map.get(graphlet_idx2, 0) + 1
                        vocabulary.add(graphlet_idx2)
                        tmp_corpus.append(graphlet_idx2)
                        cooccurence_corpus[-1].append(graphlet_idx2)
            corpus.append(tmp_corpus)
        else:
            count_map[fallback_map[num_graphlets]] = samplesize

        # Record and save graphlet information for the one graph
        graph_map[gidx] = count_map
        save_graphlet_document(gexf_fh, gidx, num_graphlets, samplesize, cooccurence_corpus)

    # Normalise the counts of each graphlet in a graph into probabilities.
    prob_map = {gidx: {graphlet: count / float(sum(graphlets.values()))
                       for graphlet, count in graphlets.items()}
                for gidx, graphlets in graph_map.items()}
    num_graphs = len(prob_map)
    return corpus, vocabulary, prob_map, num_graphs, graph_map
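
# A usage sketch over a whole dataset (hypothetical directory of "<gidx>.gexf"
# files; the helper is illustrative, not part of the original API):
def _example_graphlet_corpus(corpus_dir="dataset_gexf/"):
    corpus, vocabulary, prob_map, num_graphs, graph_map = graphlet_corpus(
        corpus_dir, num_graphlets=4, samplesize=10)
    # per-graph counts are normalised into a distribution over graphlet ids
    for gidx in prob_map:
        assert abs(sum(prob_map[gidx].values()) - 1.0) < 1e-9
    return prob_map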
def save_graphlet_document(gexf_fh, gidx, num_graphlets, samplesize, cooccurence_corpus):
    """Save the induced graphlet patterns into the dataset folder; it is only
    used in conjunction with graphlet_corpus().

    Parameters
    ----------
    gexf_fh : str
        path to gexf file of graph
    gidx : int
        integer id of the graph, typically matches number in gexf_fh, if
        using TU Dortmund benchmark datasets
    num_graphlets : int
        size of graphlet patterns induced
    samplesize : int
        number of graphlet patterns sampled from graph
    cooccurence_corpus : list
        list of graphlet patterns induced under cooccurrence rules, in this
        case all graphlets immediately adjacent to an induced graphlet
        pattern.

    Returns
    -------
    None : None
        The graph's induced graphlet patterns (by IDs given through our hash
        function) are saved into a
        <gexf_fh>.graphlet_ng_<num_graphlets>_ss_<samplesize> file which
        contains a line-delimited list of the substructure pattern ids.

    """
    open_fname = gexf_fh + ".graphlet" + "_ng_" + str(num_graphlets) + "_ss_" + str(samplesize)
    with open(open_fname, "w") as fh:
        for graphlet_neighbourhood in cooccurence_corpus:
            sentence = str.join(" ", map(str, graphlet_neighbourhood))
            print(sentence, file=fh)
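
# Shape of a saved graphlet document (hypothetical ids, num_graphlets=3,
# samplesize=2): one line per sampled "main" graphlet, its id followed by the
# ids of its num_graphlets co-occurring neighbour graphlets, so samplesize
# lines of num_graphlets + 1 ids each:
#
#   3 1 2 3
#   2 2 1 1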
if __name__ == '__main__':
    corpus_dir = "../data/dortmund_gexf/MUTAG/"
    corpus, vocabulary, prob_map, num_graphs, graph_map = graphlet_corpus(
        corpus_dir, num_graphlets=3, samplesize=6)
    # should result in files with 6 lines, each with num_graphlets+1 items