"""Graphlet based graph decomposition algorithm to create graph documents.
Inspired and adapted from Yanardag and Vishwanathan "Deep Graph Kernels" [2]_.
"""
# We adopt the original license and extend it here.
# Copyright (c) 2016 Pinar Yanardag
# 2019 Paul Scherer
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# System libraries
import sys
import os
import glob
import pickle
import itertools
import random
from collections import defaultdict
from time import time
# 3rd party
import numpy as np
import networkx as nx
import pynauty # make sure to install this
# Internal
from .utils import get_files
# Random seeds from Yanardag et al.
random.seed(314124)
np.random.seed(2312312)
def load_graph(file_handle):
    """Load the gexf format file as a nx_graph and an adjacency matrix.

    Parameters
    ----------
    file_handle : str
        path to gexf file

    Returns
    -------
    graph : networkx graph
        Networkx graph instance of input gexf file
    adj_matrix : numpy ndarray
        The adjacency matrix of the graph
    """
    graph = nx.read_gexf(file_handle)
    # nx.to_numpy_matrix was deprecated and removed in NetworkX 3.0.
    # to_numpy_array returns a plain 2-D ndarray, which matches the documented
    # return type and, unlike np.matrix, makes row iteration in get_graphlet
    # yield 1-D rows so np.where(row)[0] gives column (neighbour) indices.
    adj_matrix = nx.to_numpy_array(graph)
    return graph, adj_matrix
def get_maps(num_graphlets):
    """Load certificates created by the canonical representations of
    graphlets into canonical maps dictionary for quick isomorphism
    check when we extract graphlets from graphs in the users dataset.

    Parameters
    ----------
    num_graphlets : int
        The number of nodes in the graphlet pattern, aka the size of a graphlet pattern

    Returns
    -------
    canonical_map : dict
        A dict of Nauty certificates of the canonical representations of
        graphlets of the size `num_graphlets`.
    """
    maps_dir = os.path.join(os.path.dirname(__file__), 'canonical_maps')
    pickle_fname = os.path.join(maps_dir, "canonical_map_n%s.p" % (num_graphlets))
    with open(pickle_fname, 'rb') as handle:
        # canonical_map : {canonical string id: {"graph", "idx", "n"}}
        return pickle.load(handle, encoding="latin1")
def get_graphlet(window, num_graphlets):
    """Compute the Nauty certificate of the graphlet within a window of the
    adjacency matrix.

    This function takes an n x n submatrix and computes its hash certificate
    using nauty; given the parameters this usually involves computing the
    graphlet of `num_graphlets` size. The certificate is used for comparison
    with the bank of known certificates loaded in get_maps().

    Parameters
    ----------
    window : numpy ndarray
        submatrix inside the adjacency matrix of a graph
    num_graphlets : int
        the size of the graphlets to extract

    Returns
    -------
    cert : byte str
        certificate of the graphlet produced by finding its canonical
        representation with Nauty.
    """
    # Build {node: [neighbours]} from each row of the window, dropping
    # self-loops so nauty sees a simple undirected graph.
    adjacency = {}
    for row_idx, row in enumerate(window):
        neighbours = list(np.where(row)[0])
        adjacency[row_idx] = [col for col in neighbours if col != row_idx]
    nauty_graph = pynauty.Graph(number_of_vertices=num_graphlets,
                                directed=False,
                                adjacency_dict=adjacency)
    return pynauty.certificate(nauty_graph)
def graphlet_corpus(corpus_dir, num_graphlets, samplesize):
    """Function which induces the graphlet patterns over the graphs inside a given directory.

    The main use case is for the user to input a path containing individual
    graphs of the dataset in gexf format. The decomposition algorithm will
    induce graphlet patterns for graphs recording the dataset/"global"
    vocabulary of patterns within a dictionary. The graph and its associated
    patterns (by IDs given through our hash function) are saved into a
    <graphid>.wldr<depth> file which contains a line delimited list of all
    the substructure pattern ids.

    Parameters
    ----------
    corpus_dir : str
        path to directory containing graph files of dataset in .gexf format.
    num_graphlets : int
        the size of the graphlet patterns to be extracted
    samplesize : int
        the number of samples to take from a graph.

    Returns
    -------
    corpus : list of lists of str
        a list of lists, with each inner list containing all the
        samplesize*(1 + num_graphlets) patterns in one graph of the dataset.
        Note that graphs with fewer than `num_graphlets` nodes contribute no
        entry to `corpus` (they only appear in `graph_map` via the fallback id).
    vocabulary : set
        a set of the unique graphlet pattern ids
    prob_map : dict
        a map {gidx: {graphlet_idx: normalized_prob}} of normalized probabilities of a graphlet pattern appearing in a graph based on counts made in generation
    num_graphs : int
        the number of graphs in the dataset
    graph_map : dict
        a map {gidx: {graphlet_idx: count}} of the number of times a certain graphlet pattern appeared in a graph for each graph gidx in the dataset
    """
    # NOTE(review): per-size fallback pattern ids, used when a graph is too
    # small to sample from — presumably the id of a canonical pattern for each
    # graphlet size; confirm against the canonical_maps data files.
    fallback_map = {1: 1, 2: 2, 3: 4, 4: 8, 5: 19, 6: 53, 7: 209, 8: 1253, 9: 13599}
    canonical_map = get_maps(num_graphlets)  # nauty certificate -> {"graph", "idx", "n"}
    vocabulary = set()
    corpus = []
    # randomly sample graphlets
    graph_map = {}
    graph_map2 = {}  # NOTE(review): never written or read after this line
    graph_files = get_files(corpus_dir, extension=".gexf")
    for gexf_fh in graph_files:
        # Graph id is taken from the file name, e.g. "123.gexf" -> 123
        gidx = int((os.path.basename(gexf_fh)).replace(".gexf", ""))
        nx_graph, adj_mat = load_graph(gexf_fh)
        num_nodes = len(adj_mat)
        count_map = {} # graphlet countmap
        tmp_corpus = [] # temporary corpus of graphlet patterns for a single graph
        cooccurence_corpus = [] # corpus which preserves cooccurence as in Yanardag et al.
        # Only sample graphlets if the number of nodes in the graph is larger than the
        # maximum graphlet size
        if num_nodes >= num_graphlets:
            for _ in range(samplesize):
                # Random node ordering: the first n positions form the sampled window,
                # positions n..2n-2 supply the non-overlapping neighbour window below.
                r = np.random.permutation(range(num_nodes))
                for n in [num_graphlets]:
                    # "Main Graphlet"
                    # Choose a random set of num_graphlet nodes, find the graphlets of
                    # desired size and add it to the count_map
                    window = adj_mat[np.ix_(r[0:n],r[0:n])]
                    g_type = canonical_map[str(get_graphlet(window, n), 'latin1')]
                    graphlet_idx = g_type["idx"]
                    count_map[graphlet_idx] = count_map.get(graphlet_idx, 0) + 1
                    vocabulary.add(graphlet_idx)
                    tmp_corpus.append(graphlet_idx)
                    # Each main graphlet starts a new co-occurrence "sentence"
                    cooccurence_corpus.append([graphlet_idx])
                    # "Co-occuring Graphlets"
                    for node in r[0:n]:
                        # select a non-overlapping window n-1 in size and one of
                        # the nodes in the "main graphlet", to find a neighbouring graphlet
                        new_n_arr = r[n:][0:n-1]
                        r2 = np.array(list(new_n_arr) + [node])
                        window2 = adj_mat[np.ix_(r2,r2)]
                        g_type2 = canonical_map[str(get_graphlet(window2, n), 'latin1')]
                        graphlet_idx2 = g_type2["idx"]
                        count_map[graphlet_idx2] = count_map.get(graphlet_idx2, 0) + 1
                        vocabulary.add(graphlet_idx2)
                        tmp_corpus.append(graphlet_idx2)
                        # co-occurring ids share a line with their main graphlet
                        cooccurence_corpus[-1].append(graphlet_idx2)
            corpus.append(tmp_corpus)
        else:
            # Graph too small to sample from: attribute all samples to the fallback id
            count_map[fallback_map[num_graphlets]] = samplesize
        # Record and save graphlet information for the one graph
        graph_map[gidx] = count_map
        save_graphlet_document(gexf_fh, gidx, num_graphlets, samplesize, cooccurence_corpus)
    # Normalise the probabilities of a graphlet in a graph.
    prob_map = {gidx: {graphlet: count/float(sum(graphlets.values())) \
        for graphlet, count in graphlets.items()} for gidx, graphlets in graph_map.items()}
    num_graphs = len(prob_map)
    return corpus, vocabulary, prob_map, num_graphs, graph_map
def save_graphlet_document(gexf_fh, gidx, num_graphlets, samplesize, cooccurence_corpus):
    """Saves the induced graphlet patterns into dataset folder, it is only
    used in conjunction with graphlet_corpus().

    Parameters
    ----------
    gexf_fh : str
        path to gexf file of graph
    gidx : int
        integer id of the graph, typically matches number in
        gexf_fh, if using TU Dortmund benchmark datasets
        (currently unused; kept for interface stability)
    num_graphlets : int
        size of graphlet patterns induced
    samplesize : int
        number of graphlet patterns sampled from graph
    cooccurence_corpus : list
        list of graphlet patterns induced under cooccurrence
        rules, in this case all graphlets immediately adjacent
        to an induced graphlet pattern.

    Returns
    -------
    None : None
        Side effect: writes a <gexf_fh>.graphlet_ng_<n>_ss_<s> file with one
        space-separated line of pattern ids per sampled neighbourhood.
    """
    out_fname = f"{gexf_fh}.graphlet_ng_{num_graphlets}_ss_{samplesize}"
    with open(out_fname, 'w') as fh:
        # One line per neighbourhood: the main graphlet id followed by the ids
        # of its co-occurring graphlets; identical output to the original
        # per-line print() loop, including the empty-corpus case.
        fh.writelines(
            " ".join(map(str, neighbourhood)) + "\n"
            for neighbourhood in cooccurence_corpus
        )
if __name__ == '__main__':
    # Smoke test over the MUTAG benchmark: each graph should yield a
    # <name>.gexf.graphlet_ng_3_ss_6 file with 6 lines of num_graphlets+1
    # pattern ids each. (Original had a duplicated `corpus_dir = corpus_dir =`
    # assignment — harmless but a typo.)
    corpus_dir = "../data/dortmund_gexf/MUTAG/"
    corpus, vocabulary, prob_map, num_graphs, graph_map = graphlet_corpus(
        corpus_dir, num_graphlets=3, samplesize=6)