# Source code for geometric2dr.data.dortmund_formatter

"""Gexifier for TU Dortmund graph kernel based datasets.

"""

import pickle
import sys
import os
import networkx as nx
from collections import defaultdict
from tqdm import tqdm


################################################
################################################
# Explanation of the TU graph kernel files

# n = total number of nodes
# m = total number of edges
# N = number of graphs

# DS_A.txt (m lines):	represents a sparse (block diagonal) adjacency matrix for all graphs, 
#						each line corresponds to (row, col) position resp. (node_id, node_id).
#						All graphs are undirected. Hence, DS_A.txt contains two entries for 
#						each edge.

# DS_graph_indicator.txt (n lines): column vector of graph identifiers for all nodes of all graphs
#									the value in the i-th line is the graph_id of the node with 
#									node_id i

# DS_graph_labels.txt (N lines):	class labels for all graphs in the dataset, the value in the i-th
#									line is the class label of the graph with graph_id i

# DS_node_labels.txt (n lines):		column vector of node labels, the value in the i-th line is the
#									node label for node i

# There are optional files if the respective information is available:
# DS_edge_labels.txt (m lines): same size as DS_A.txt: labels for the edges in DS_A.txt
# DS_edge_attributes.txt (m lines): same size as DS_A.txt, attributes for the edges in DS_A.txt
# DS_node_attributes.txt (n lines): matrix of node attributes, the comma separated values in the i-th line are the feature vector of the node with node_id i
# DS_graph_attributes.txt (N lines): regression values for all graphs in the dataset, the value in the i-th line is the attribute of the graph with graph_id i


class DortmundGexf(object):
    """Convert a TU Dortmund style dataset into GEXF graphs and a .Labels file.

    The class reads ``<dataset>_A.txt``, ``<dataset>_graph_indicator.txt``,
    ``<dataset>_graph_labels.txt`` and, when present,
    ``<dataset>_node_labels.txt`` from ``path_to_dataset/<dataset>/`` and
    writes one GEXF file per graph plus a file mapping each GEXF file name to
    its class label.

    The saved format is

    ``output_dir_for_graph_files/<dataset>/<graph_id>.gexf``
        one GEXF file per graph, named after its 1-based graph id.
    ``output_dir_for_graph_files/<dataset>.Labels``
        one ``"<graph_id>.gexf <class label>"`` line per graph label.

    Parameters
    ----------
    dataset : str
        string name of directory containing dataset, eg. "MUTAG".
    path_to_dataset : str
        path to directory containing directory of dataset.
    output_dir_for_graph_files : str
        path to where new dataset and labels file will be saved.

    Notes
    -----
    Graphs are assembled from the edge list only. Nodes that take part in no
    edge, and graphs without any edge, therefore do not appear in the GEXF
    output, while the .Labels file still lists every entry of the graph
    labels file.
    """

    def __init__(self, dataset, path_to_dataset, output_dir_for_graph_files):
        super(DortmundGexf, self).__init__()
        self.dataset = dataset

        # os.path.join keeps the paths valid whether or not the caller
        # supplied a trailing separator on the directory arguments.
        dataset_dir = os.path.join(path_to_dataset, dataset)
        self.graph_A_fname = os.path.join(dataset_dir, dataset + "_A.txt")
        self.graph_indicator_fname = os.path.join(dataset_dir, dataset + "_graph_indicator.txt")
        self.graph_labels_fname = os.path.join(dataset_dir, dataset + "_graph_labels.txt")
        self.node_labels_fname = os.path.join(dataset_dir, dataset + "_node_labels.txt")

        self.output_dir_for_graph_files = output_dir_for_graph_files
        self.folder_for_graph_files = os.path.join(output_dir_for_graph_files, dataset)

    def format_dataset(self):
        """Format the TU Dortmund dataset into GEXF files and a .Labels file.

        If the output folder for the dataset already exists nothing is read
        or written and the method returns immediately.

        Returns
        -------
        None
            The formatted dataset is saved under `output_dir_for_graph_files`
            in the layout described in the class docstring.
        """
        if os.path.isdir(self.folder_for_graph_files):
            print("#... The dataset %s already exists, closing program ...#" % (self.folder_for_graph_files))
            return

        print("#... Starting gexification ...#")

        print("#... Generating graph nodes dictionary ...#")
        nodes_to_graph = self._read_node_to_graph_map()

        # Choke point of the algorithm (but realistically only done once).
        print("#... Generating graph edges dictionary ...#")
        graph_edges = self._read_edges_per_graph(nodes_to_graph)
        del nodes_to_graph  # no longer needed; release before building graphs

        print("#... Generating NX Graph dictionary ...#")
        graph_nx = self._build_graphs(graph_edges)
        del graph_edges

        print("#... Relabeling nodes as necessary via attribute file ...#")
        self._attach_node_labels(graph_nx)

        # Dataset-wide node ids are replaced by per-graph ids starting at 1;
        # the 'Label' attribute set above travels with each node.
        print("#... Physical relabeling of nodes as necessary ...#")
        for gindex in tqdm(sorted(graph_nx)):
            graph_nx[gindex] = nx.convert_node_labels_to_integers(graph_nx[gindex], first_label=1)

        print("#... Generating graph classification labels ...#")
        graph_classes = self._read_graph_classes()
        print("#... Generating graph dataset dictionaries completed ...#")

        print("#... Writing to gexf files ...#")
        self._write_gexf_files(graph_nx)

        print("Writing classification labels file for graphs in dataset")
        self._write_labels_file(graph_classes)

    def _read_node_to_graph_map(self):
        """Return {node_id: graph_id} read from the graph indicator file."""
        with open(self.graph_indicator_fname) as fh:
            # Node ids are the 1-based positions of the non-blank lines.
            graph_ids = (int(line) for line in fh if line.strip())
            return dict(enumerate(graph_ids, 1))

    def _read_edges_per_graph(self, nodes_to_graph):
        """Return {graph_id: [(u, v), ...]} read from the adjacency file."""
        edges = []
        with open(self.graph_A_fname) as fh:
            for line in fh:
                if not line.strip():
                    continue  # tolerate blank lines instead of crashing
                src, dst = line.split(",")
                edges.append((int(src), int(dst)))

        graph_edges = defaultdict(list)
        for src, dst in tqdm(edges):
            gindex = nodes_to_graph[src]
            # Only keep edges whose endpoints belong to the same graph.
            if gindex == nodes_to_graph[dst]:
                graph_edges[gindex].append((src, dst))
        return graph_edges

    def _build_graphs(self, graph_edges):
        """Return {graph_id: nx.Graph} built from the per-graph edge lists."""
        graph_nx = {}
        for gindex in tqdm(graph_edges):
            graph = nx.Graph()
            graph.add_edges_from(graph_edges[gindex])
            graph_nx[gindex] = graph
        return graph_nx

    def _attach_node_labels(self, graph_nx):
        """Set the 'Label' node attribute from the node labels file, else from node degree."""
        if os.path.isfile(self.node_labels_fname):
            print("#... Relabeling nodes using %s ...#" % (self.node_labels_fname))
            with open(self.node_labels_fname) as fh:
                node_labels = dict(enumerate((line.strip() for line in fh), 1))

            for gindex, graph in graph_nx.items():
                if gindex % 1000 == 0:
                    print("Setting node label att %s" % (str(gindex)))
                # Hand networkx only this graph's nodes rather than the
                # dataset-wide mapping, which it would scan in full per graph.
                labels_for_graph = {
                    node: node_labels[node] for node in graph.nodes if node in node_labels
                }
                nx.set_node_attributes(graph, labels_for_graph, 'Label')
        else:
            print("#... Could not find node labeling file, will label by degree ...#")
            for gindex, graph in graph_nx.items():
                if gindex % 50 == 0:
                    print("Setting node label att %s" % (str(gindex)))
                degree_labels = {node: graph.degree(node) for node in graph.nodes}
                nx.set_node_attributes(graph, degree_labels, 'Label')

    def _read_graph_classes(self):
        """Return {graph_id: class label string} read from the graph labels file."""
        with open(self.graph_labels_fname) as fh:
            return dict(enumerate((line.strip() for line in fh), 1))

    def _write_gexf_files(self, graph_nx):
        """Create the dataset folder and write one '<graph_id>.gexf' file per graph."""
        print("Dont have folder %s, creating and writing gexf files there" % (self.folder_for_graph_files))
        os.makedirs(self.folder_for_graph_files)
        for gindex in sorted(graph_nx):
            graph_gexf_path = os.path.join(self.folder_for_graph_files, str(gindex) + ".gexf")
            nx.write_gexf(graph_nx[gindex], graph_gexf_path)

    def _write_labels_file(self, graph_classes):
        """Write the '<dataset>.Labels' file mapping GEXF file names to class labels."""
        name_of_labels_file = os.path.join(self.output_dir_for_graph_files, self.dataset + ".Labels")
        with open(name_of_labels_file, "w") as fh:
            for gindex in sorted(graph_classes):
                fh.write("%s %s\n" % (str(gindex) + ".gexf", graph_classes[gindex]))
if __name__ == '__main__':
    # Example run: convert the MUTAG dataset located under dortmund_data/
    # and save the resulting GEXF folder and labels file under /tmp/.
    gexifier = DortmundGexf(
        dataset="MUTAG",
        path_to_dataset="dortmund_data/",
        output_dir_for_graph_files="/tmp/",
    )
    gexifier.format_dataset()