# Source code for geometric2dr.embedding_methods.utils

"""General purpose utilities for I/O

Currently includes:

- Getting all files in a directory with a given extension
- Saving graph and subgraph embeddings in JSON format
- Mapping graph files to their classification labels
- Pairing kernel matrix row indices with classification labels

"""

import os
import json

def get_files(dname, extension, max_files=0):
    """Return the sorted paths of all files under a directory that
    end with the given extension

    The directory is searched recursively, so files inside sub-directories
    are included. Every returned path is prefixed with `dname`.

    Parameters
    ----------
    dname : str
        directory with files
    extension : str
        string denoting which extension should be matched in search for files
    max_files : int (default=0)
        the maximum number of files to get, the default of 0 means all files

    Returns
    -------
    all_files : list
        sorted list of paths of the files matching extension inside the
        directory dname, truncated to `max_files` entries when it is non-zero

    Raises
    ------
    FileNotFoundError
        if `dname` does not exist
    NotADirectoryError
        if `dname` exists but is not a directory

    """
    if not os.path.isdir(dname):
        # os.walk silently yields nothing for a bad root directory, which
        # would hide a mistyped path behind an empty result; fail loudly.
        if os.path.exists(dname):
            raise NotADirectoryError("Not a directory: %r" % (dname,))
        raise FileNotFoundError("No such directory: %r" % (dname,))

    # os.walk visits dname itself before descending, so a separate top-level
    # listing is unnecessary and every file path is produced exactly once.
    # Only entries reported as files are considered, so a directory whose
    # name happens to end with the extension is not returned.
    all_files = sorted(
        os.path.join(root, fname)
        for root, _dirs, fnames in os.walk(dname)
        for fname in fnames
        if fname.endswith(extension)
    )

    if max_files:
        return all_files[:max_files]
    return all_files

def save_graph_embeddings(corpus, final_embeddings, opfname):
    """Saves the trained embeddings of a corpus into a dictionary
    and saves this into a json file on the path given by opfname

    The dictionary maps each graph name, looked up through
    `corpus._id_to_graph_name_map`, to its embedding as a plain list.

    Parameters
    ----------
    corpus : corpus
        any corpus class such as `PVDBOWCorpus`; it must expose an
        `_id_to_graph_name_map` lookup from row index to graph name
    final_embeddings : numpy ndarray
        matrix of target embeddings to be saved, row `i` being the
        embedding of the graph with id `i`
    opfname : str
        path to file where embeddings should be saved in json format (extension optional in Unix)

    Returns
    -------
    None
        embeddings will be saved into path denoted by `opfname`
    """
    # .tolist() turns each row into a plain Python list so that the json
    # module can serialise it.
    dict_to_save = {
        corpus._id_to_graph_name_map[i]: graph_embedding.tolist()
        for i, graph_embedding in enumerate(final_embeddings)
    }

    with open(opfname, 'w') as filehandler:
        json.dump(dict_to_save, filehandler, indent=4)

def save_subgraph_embeddings(corpus, final_embeddings, opfname):
    """Save the subgraph embeddings of a corpus into a json file, keyed
    by the subgraph pattern each embedding belongs to

    The dictionary maps each subgraph name, looked up through
    `corpus._id_to_subgraph_map`, to its embedding as a plain list.

    Parameters
    ----------
    corpus : corpus
        a corpus class such as SkipgramCorpus; it must expose an
        `_id_to_subgraph_map` lookup from row index to subgraph name
    final_embeddings : numpy ndarray
        matrix of target embeddings to be saved, row `i` being the
        embedding of the subgraph with id `i`
    opfname : str
        path to file where embeddings should be saved in json format

    Returns
    -------
    None
        embeddings will be saved into path denoted by `opfname`

    """
    # .tolist() turns each row into a plain Python list so that the json
    # module can serialise it.
    dict_to_save = {
        corpus._id_to_subgraph_map[i]: subgraph_embedding.tolist()
        for i, subgraph_embedding in enumerate(final_embeddings)
    }

    with open(opfname, 'w') as filehandler:
        json.dump(dict_to_save, filehandler, indent=4)

def get_class_labels(graph_files, class_labels_fname):
    """Given the list of graph files (as in get_files) and
    path of the associated class labels returns the list
    of labels associated with each graph file in graph_files

    Each non-blank line of the labels file is read as whitespace separated
    fields: the first is a graph file name, the second its integer class
    label. Graph files and label entries are matched on the part of the
    base name that precedes the first '.'.

    Parameters
    ----------
    graph_files : list
        list of paths to graph_files
    class_labels_fname : str
        path to class labels file (.Labels typically) with file names in `graph_files`

    Returns
    -------
    labels : list
        list of class labels for corresponding to graph files in `graph_files`

    Raises
    ------
    KeyError
        if a graph file has no entry in the class labels file

    """
    graph_to_class_label_map = {}
    # The context manager closes the labels file even if parsing fails.
    with open(class_labels_fname) as labels_file:
        for line in labels_file:
            fields = line.split()
            if not fields:
                # tolerate blank lines such as a trailing empty line
                continue
            graph_to_class_label_map[fields[0].split('.')[0]] = int(fields[1])

    return [graph_to_class_label_map[os.path.basename(g).split('.')[0]] for g in graph_files]

def get_class_labels_tuples(graph_files, class_labels_fname):
    """Returns list of tuples associating each of the graph files
    to their classification labels

    Each non-blank line of the labels file is read as whitespace separated
    fields: the first is a graph file name, the second its integer class
    label. Graph files and label entries are matched on the part of the
    base name that precedes the first '.', and that part must be an
    integer because it is converted with `int` for the output tuple.

    Parameters
    ----------
    graph_files : list
        list of paths to graph_files
    class_labels_fname : str
        path to class labels file (.Labels typically) with file names in `graph_files`

    Returns
    -------
    labels : list
        list of tuples (base_name_of_graph_file_as_int, class_label)

    Raises
    ------
    KeyError
        if a graph file has no entry in the class labels file
    ValueError
        if the base name of a graph file is not an integer

    """
    graph_to_class_label_map = {}
    # The context manager closes the labels file even if parsing fails.
    with open(class_labels_fname) as labels_file:
        for line in labels_file:
            fields = line.split()
            if not fields:
                # tolerate blank lines such as a trailing empty line
                continue
            graph_to_class_label_map[fields[0].split('.')[0]] = int(fields[1])

    labels = []
    for g in graph_files:
        g_num = os.path.basename(g).split('.')[0]
        labels.append((int(g_num), graph_to_class_label_map[g_num]))
    return labels


def get_kernel_matrix_row_idx_with_class(corpus, extension, graph_files, class_labels_fname):
    """Returns two tuples, the first holds integers each referencing a row in
    a kernel matrix and thereby a kernel vector corresponding to one of the graphs
    in the dataset, the second holds class labels whose value is the classification
    of the graph in the same index of the first

    The row index of a graph is looked up in `corpus._graph_name_to_id_map`
    under the key `graph_fname + extension`; its class label is looked up in
    the labels file under the base name of `graph_fname`. Both outputs are
    ordered by ascending row index.

    Parameters
    ----------
    corpus : corpus
        a corpus instance (such as SkipgramCorpus); it must expose a
        `_graph_name_to_id_map` lookup from graph document path to row index
    extension : str
        extension of graph document under study, appended to each entry of
        `graph_files` to form the corpus lookup key
    graph_files : list
        list of paths to graph file
    class_labels_fname : str
        path to graph class label file, each non-blank line holding a graph
        file name followed by its integer class label

    Returns
    -------
    tuple
        kernel_row_x_id, kernel_row_y_id. The first is a tuple of integers each referencing a row in
        a kernel matrix and thereby a kernel vector corresponding to one of the graphs
        in the dataset, the second is a tuple of class labels whose value is the classification
        of the graph in the same index of the first. Both are empty tuples when
        `graph_files` is empty.

    Raises
    ------
    KeyError
        if a graph file is missing from the class labels file or from the
        corpus graph name map

    """
    graph_to_class_label_map = {}
    # The context manager closes the labels file even if parsing fails.
    with open(class_labels_fname) as labels_file:
        for line in labels_file:
            fields = line.split()
            if not fields:
                # tolerate blank lines such as a trailing empty line
                continue
            graph_to_class_label_map[fields[0]] = int(fields[1])

    graph_id_to_class_tuples = []
    for graph_fname in graph_files:
        clabel = graph_to_class_label_map[os.path.basename(graph_fname)]
        gidx = corpus._graph_name_to_id_map[graph_fname + extension]
        graph_id_to_class_tuples.append((gidx, clabel))

    if not graph_id_to_class_tuples:
        # zip(*[]) yields nothing, so the two-way unpacking below would fail
        return (), ()

    # order the pairs by kernel matrix row so both outputs line up with it
    graph_id_to_class_tuples.sort(key=lambda tup: tup[0])
    kernel_row_x_id, kernel_row_y_id = zip(*graph_id_to_class_tuples)

    return kernel_row_x_id, kernel_row_y_id