
"""Functions for inducing Weisfeiler Lehman graph decomposition algorithm via node relabeling
as described in Shervashidze et al. [3]_.

Based on the implementation available in the original source code of Graph2Vec [4]_
and adapted for Geo2DR https://github.com/MLDroid/graph2vec_tf which has no license

"""

# Author: Paul Scherer 2019.


import os
import glob
from time import time
import networkx as nx
from tqdm import tqdm

# Global label-to-compressed-label map (the WL hash function), in its initial empty state
label_to_compressed_label_map = {}

# Get the compressed node label as an int, dropping the "<wl_iteration>+" prefix before it
get_int_node_label = lambda l: int(l.split('+')[-1])
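# e.g. get_int_node_label("2+13") returns 13: the iteration prefix "2" is dropped
# and the compressed label "13" is kept.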

def initial_relabel(graph, node_label_attr_name="Label"):
    """The initial relabeling of the graphs in the dataset.

    Taking the attributed label of the node as stated by the attr in the gexf file,
    it gives new labels to these node types (without regard for neighbours), hence it
    really is just a relabeling of the initial labels into our own "0+<newlabel>"
    format (to be used with the WL relabeling scheme afterwards).

    Parameters
    ----------
    graph : networkx graph
        a networkx graph
    node_label_attr_name : str
        string literal of the attribute name used as a label in the gexf file
        NB: "label" in the gexf refers to the nodeID, "Label" refers to the dataset node label

    Returns
    -------
    graph : networkx graph
        the same nx graph but with a new "relabel" attribute holding the 0th wlk-h entry label

    """
    global label_to_compressed_label_map  # the global WL hash function for compressed labels

    graph = nx.convert_node_labels_to_integers(graph, first_label=0)
    for node in graph.nodes():
        graph.nodes[node]['relabel'] = {}  # make a dictionary attribute

    # Check for previous labelings, otherwise we relabel
    for node in graph.nodes():
        try:
            label = graph.nodes[node][node_label_attr_name]
        except KeyError:
            # no node label under node_label_attr_name pulled from the gexf file
            graph.nodes[node]['relabel'][0] = '0+0'
            continue

        if label not in label_to_compressed_label_map:
            # start with 1 and increment every time a new node label is seen
            compressed_label = len(label_to_compressed_label_map) + 1
            label_to_compressed_label_map[label] = compressed_label
            graph.nodes[node]['relabel'][0] = '0+' + str(compressed_label)
        else:
            # the label has been seen before, so we reuse its compressed label
            graph.nodes[node]['relabel'][0] = '0+' + str(label_to_compressed_label_map[label])
    return graph
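
# A minimal sketch of initial_relabel's effect (hypothetical two-node graph; the
# exact compressed labels depend on the state of the global map, here assumed empty):
#
# >>> g = nx.Graph()
# >>> g.add_node(0, Label="C")
# >>> g.add_node(1, Label="N")
# >>> g.add_edge(0, 1)
# >>> g = initial_relabel(g)
# >>> g.nodes[0]['relabel'], g.nodes[1]['relabel']
# ({0: '0+1'}, {0: '0+2'})
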
def wl_relabel(graph, it):
    """Runs an iteration of the WL relabeling algorithm on the graph using the
    global label_to_compressed_label_map.

    Parameters
    ----------
    graph : networkx graph
        a networkx graph from the dataset which we want to relabel (it must have been
        initially relabeled, i.e. have the graph.nodes[node]['relabel'] attribute)
    it : int
        an int signifying the iteration of the WL relabeling algorithm

    Returns
    -------
    graph : networkx graph
        the input nx graph with more labels in the "relabel" attribute

    """
    global label_to_compressed_label_map  # the global hash function for compression

    prev_iter = it - 1
    for node in graph.nodes():
        prev_iter_node_label = get_int_node_label(graph.nodes[node]['relabel'][prev_iter])  # just an int ("1") in the first iteration
        node_label = [prev_iter_node_label]
        neighbours = list(nx.all_neighbors(graph, node))
        neighbourhood_label = sorted([get_int_node_label(graph.nodes[nei]['relabel'][prev_iter]) for nei in neighbours])
        node_neighbourhood_label = tuple(node_label + neighbourhood_label)

        if node_neighbourhood_label not in label_to_compressed_label_map:
            compressed_label = len(label_to_compressed_label_map) + 1
            label_to_compressed_label_map[node_neighbourhood_label] = compressed_label
            graph.nodes[node]['relabel'][it] = str(it) + '+' + str(compressed_label)
        else:
            graph.nodes[node]['relabel'][it] = str(it) + '+' + str(label_to_compressed_label_map[node_neighbourhood_label])
    return graph
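
# Continuing the sketch above: one WL iteration hashes each node's
# (own label, sorted neighbour labels) tuple through the same global map, which
# at this point still holds the two initial labels:
#
# >>> g = wl_relabel(g, 1)
# >>> g.nodes[0]['relabel']
# {0: '0+1', 1: '1+3'}     # tuple (1, 2) compressed to the next free id, 3
# >>> g.nodes[1]['relabel']
# {0: '0+2', 1: '1+4'}     # tuple (2, 1) compressed to 4
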
def save_wl_doc(fname, max_h, graph=None):
    """Saves the induced rooted subgraph patterns.

    Saves the subgraph sentences in the format <center> <context> <context> ...
    In other words we are saving the relabelings of nodes from the WL algorithm into
    a text document which can be fed into our skipgram architecture.

    Parameters
    ----------
    fname : str
        path/filename of the graph
    max_h : int
        highest wlk_h iteration specified (i.e. depth of the rooted subgraphs)
    graph : networkx graph
        the nx graph of the filename

    Returns
    -------
    None : None
        The rooted subgraph patterns are saved into a text file in the format
        <center> <context> <context> <context> ...

    """
    open_fname = fname + '.wld' + str(max_h)
    with open(open_fname, 'w') as fh:
        for n, d in graph.nodes(data=True):
            for it in range(0, max_h + 1):
                try:
                    center = d['relabel'][it]
                except KeyError:
                    continue
                # the neighbours' labels at the same iteration form the context
                neis_labels_same_deg = list(set([graph.nodes[nei]['relabel'][it] for nei in nx.all_neighbors(graph, n)]))
                nei_list = ' '.join(neis_labels_same_deg)
                sentence = center + ' ' + nei_list
                print(sentence, file=fh)
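
# For the two-node sketch above, save_wl_doc(fname, 1, g) would write the file
# fname + '.wld1' with one "<center> <context> ..." line per node per iteration:
#
#   0+1 0+2
#   1+3 1+4
#   0+2 0+1
#   1+4 1+3
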
def wl_corpus(fnames, max_h, node_label_attr_name='Label'):
    """Induce rooted subgraph patterns using the WL node relabeling algorithm
    given a list of gexf files, and save the corresponding graph documents.

    Given a set of graphs from the dataset, a maximum h for WL, and the label attribute
    name used in the gexf files, we initially relabel the original labels into a compliant
    relabeling (a Caesar shift for ease), then perform max_h iterations of the WL relabeling
    algorithm (1968) to create new labels which are compressed versions of the rooted
    subgraphs for each node in the graph. These are all present in the nx graph objects'
    nodes as attributes, with the original label being 'Label' and our subsequent
    relabelings in the "relabel" attribute.

    The main use case is for the user to input a path containing individual graphs of the
    dataset in gexf format. The decomposition algorithm will induce substructure patterns
    for the graphs, recording the dataset/"global" vocabulary of patterns within a dictionary.
    Each graph and its associated patterns (by IDs given through our hash function) are
    saved into a <graphid>.wld<depth> file which contains a line-delimited list of all
    the substructure pattern ids.

    Parameters
    ----------
    fnames : list
        list of gexf file paths for the graphs in the dataset
    max_h : int
        the maximum depth of rooted subgraph pattern to induce across the dataset of graphs
    node_label_attr_name : str
        string literal of the attribute name used as a label in the gexf file
        NB: "label" in the gexf refers to the nodeID, "Label" refers to the dataset node label

    Returns
    -------
    corpus : list of lists of str
        a list of lists, with each inner list containing all the rooted subgraph patterns
        in one graph of the dataset
    vocabulary : set
        a set of the unique rooted subgraph pattern ids
    prob_map : dict
        a map {gidx: {wl_pattern: normalized_prob}} of normalized probabilities of a rooted
        subgraph pattern appearing in a graph, based on counts made during generation
    num_graphs : int
        the number of graphs in the dataset
    graph_map : dict
        a map {gidx: {wl_pattern: count}} of the number of times a certain rooted subgraph
        pattern appeared in a graph, for each graph gidx in the dataset

    As a side effect, the rooted subgraph patterns are also saved into a text file for each
    graph in fnames, in the format <center> <context> <context> <context> ...

    """
    global label_to_compressed_label_map
    compressed_labels_map_list = []  # list of compressed label maps that can be used to go backwards

    # Read each graph as a networkx graph
    print('#... Loading graphs')
    graphs = [nx.read_gexf(fname) for fname in tqdm(fnames)]
    assert len(graphs) > 0, "fnames parameter does not contain valid .gexf files"
    print('#... Loaded all the graphs')

    # Do an initial relabeling of each nx graph g in graphs
    t0 = time()
    graphs = [initial_relabel(g, node_label_attr_name) for g in graphs]
    print('#... Initial relabeling done in {} sec.'.format(round(time() - t0, 2)))

    # Perform the Weisfeiler-Lehman relabeling process for max_h iterations (up to depth-max_h rooted subgraphs)
    for it in range(1, max_h + 1):
        t0 = time()
        compressed_labels_map_list.append(label_to_compressed_label_map)
        label_to_compressed_label_map = {}
        graphs = [wl_relabel(g, it) for g in tqdm(graphs)]
        print('WL iteration {} done in {} sec.'.format(it, round(time() - t0, 2)))
        print('num of WL rooted subgraphs in iter {} is {}'.format(it, len(label_to_compressed_label_map)))

    # Save the patterns into graph documents
    for fname, g in zip(fnames, graphs):
        save_wl_doc(fname, max_h, g)

    # Match return signatures of other decomposition algorithms
    corpus = []
    vocabulary = set()
    graph_map = {}
    for fname, g in zip(fnames, graphs):
        gidx = int((os.path.basename(fname)).replace(".gexf", ""))
        tmp_corpus = []
        count_map = {}
        for n, d in g.nodes(data=True):
            for it in range(0, max_h + 1):
                try:
                    pattern_at_node = d['relabel'][it]
                except KeyError:
                    continue
                vocabulary.add(pattern_at_node)
                tmp_corpus.append(pattern_at_node)
                count_map[pattern_at_node] = count_map.get(pattern_at_node, 0) + 1
                # neighbours' patterns at iteration it also count towards the graph's corpus
                neis_labels_same_deg = list(set([g.nodes[nei]['relabel'][it] for nei in nx.all_neighbors(g, n)]))
                for nei_pattern in neis_labels_same_deg:
                    vocabulary.add(nei_pattern)
                    tmp_corpus.append(nei_pattern)
                    count_map[nei_pattern] = count_map.get(nei_pattern, 0) + 1
        corpus.append(tmp_corpus)
        graph_map[gidx] = count_map

    # Normalise the probabilities of a pattern appearing in a graph
    prob_map = {gidx: {pattern: count / float(sum(patterns.values()))
                       for pattern, count in patterns.items()}
                for gidx, patterns in graph_map.items()}
    num_graphs = len(prob_map)
    return corpus, vocabulary, prob_map, num_graphs, graph_map
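
# A usage sketch (paths hypothetical): wl_corpus expects gexf files named by an
# integer graph id (e.g. data/MUTAG/1.gexf), which becomes the gidx key in
# graph_map and prob_map:
#
# >>> fnames = sorted(glob.glob("data/MUTAG/*.gexf"))
# >>> corpus, vocab, prob_map, num_graphs, graph_map = wl_corpus(fnames, max_h=2)
# >>> prob_map[1]   # pattern id -> frequency within graph 1; values sum to 1
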
# Manual test
if __name__ == "__main__":
    ip_folder = "../data/dortmund_gexf/MUTAG"
    max_h = 2

    all_files = sorted(glob.glob(os.path.join(ip_folder, '*gexf')))
    print("Loaded %s files in total" % (str(len(all_files))))
    corpus, vocabulary, prob_map, num_graphs, graph_map = wl_corpus(all_files, max_h)