Source code for geometric2dr.embedding_methods.classify

"""Module containing various functions for classification (on top of the learned embeddings)
mainly useful for providing convenience functions on common benchmark classification methods

"""
import json

# Sklearn SVC (for "fair" comparison with existing methods)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV

from random import randint
import numpy as np
import logging

from .utils import get_files, get_class_labels

[docs]def linear_svm_classify(X_train, X_test, Y_train, Y_test): """Utility function for quickly performing Scikit Learn GridSearchCV over a linear SVM with 10 fold CrossVal given the train test splits Parameters ---------- X_train : numpy ndarray training feature vectors X_test : numpy ndarray testing feature vectors Y_train : numpy ndarray training set labels Y_test : numpy ndarray test set labels Returns ------- tuple tuple with accuracy, precision, recall, fbeta_score as applicable """ params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]} if len(set(Y_train)) == 2: classifier = GridSearchCV(LinearSVC(max_iter=100000000), params, cv=10, scoring='f1', verbose=1, n_jobs=-1) else: classifier = GridSearchCV(LinearSVC(max_iter=100000000), params, cv=10, scoring='f1_weighted', verbose=1, n_jobs=-1) classifier.fit(X_train, Y_train) logging.info('best classifier models hyperparameters', classifier.best_params_) Y_pred = classifier.predict(X_test) acc = accuracy_score(Y_test, Y_pred) logging.info('Linear SVM accuracy: {}'.format(acc)) report = classification_report(Y_test, Y_pred) logging.info(report) precision, recall, fbeta_score, support = precision_recall_fscore_support(Y_test, Y_pred) return (acc, precision, recall, fbeta_score)
[docs]def rbf_svm_classify(X_train, X_test, Y_train, Y_test): """Utility function for quickly performing Scikit Learn GridSearchCV over a rbf kernel SVM with 10 fold CrossVal given the train test splits Parameters ---------- X_train : numpy ndarray training feature vectors X_test : numpy ndarray testing feature vectors Y_train : numpy ndarray training set labels Y_test : numpy ndarray test set labels Returns ------- tuple tuple with accuracy, precision, recall, fbeta_score as applicable """ params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]} if len(set(Y_train)) == 2: classifier = GridSearchCV(SVC(gamma="scale"), params, cv=10, scoring='f1', verbose=1, n_jobs=-1) else: classifier = GridSearchCV(SVC(gamma="scale"), params, cv=10, scoring='f1_weighted', verbose=1, n_jobs=-1) classifier.fit(X_train, Y_train) Y_pred = classifier.predict(X_test) acc = accuracy_score(Y_test, Y_pred) precision, recall, fbeta_score, support = precision_recall_fscore_support(Y_test, Y_pred) return (acc, precision, recall, fbeta_score)
[docs]def perform_classification(corpus_dir, extension, embedding_fname, class_labels_fname): """Perform classification over the graph files of dataset given they have corresponding embeddings in the saved embedding file and class labels Parameters ---------- corpus_dir : str folder containing graphdoc files extension : str extension of the graphdoc files embedding_fname : str file containing embeddings class_labels_fname : str files containing labels of each graph Returns ------- tuple tuple with accuracy, precision, recall, fbeta_score as applicable """ wlk_files = get_files(corpus_dir, extension) Y = np.array(get_class_labels(wlk_files, class_labels_fname)) logging.info('Y (label) matrix shape: {}'.format(Y.shape)) seed = randint(0,1000) with open(embedding_fname, 'r') as fh: graph_embedding_dict = json.load(fh) X = np.array([graph_embedding_dict[fname] for fname in wlk_files]) X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = seed) logging.info('Training and Test Matrix Shapes: {}. {}. {}. {} '.format(X_train.shape , X_test.shape, Y_train.shape, Y_test.shape)) scores = rbf_svm_classify(X_train, X_test, Y_train, Y_test) return scores
[docs]def cross_val_accuracy(corpus_dir, extension, embedding_fname, class_labels_fname, cv=10, mode=None): """ Performs 10 (default) fold cross validation, returns the mean accuracy and associated standard deviation Parameters ---------- corpus_dir : str folder containing graphdoc files extension : str extension of the graphdoc files embedding_fname : str file containing embeddings class_labels_fname : str files containing labels of each graph cv : int integer stating number of folds and therefore experiments to carry out Returns ------- tuple : (acc, std) tuple containing the mean accuracies of performing 10 fold cross validation 10 times. This gives a better picture of usual performance expected performance in a Monte Carlo fashion instead of presenting just best performance. """ # our accuracies acc_results = [] wlk_files = get_files(corpus_dir, extension) Y = np.array(get_class_labels(wlk_files, class_labels_fname)) for i in range(cv): seed = randint(0,1000) with open(embedding_fname, 'r') as fh: graph_embedding_dict = json.load(fh) X = np.array([graph_embedding_dict[fname] for fname in wlk_files]) X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = seed) if mode == "linear": scores = linear_svm_classify(X_train, X_test, Y_train, Y_test) else: scores = rbf_svm_classify(X_train, X_test, Y_train, Y_test) acc_results.append(scores[0]) return np.mean(acc_results), np.std(acc_results)
[docs]def cross_val_accuracy_rbf_bag_of_words(P, y_ids, cv=10): r"""cv times Monte Carlo experimentation of 10 fold cross validation, used on given dataset matrix returns overall mean accuracy and associated standard deviation. Terminology and method name will be updated in future version to address overloading term and generalizability of function. Parameters ---------- P : numpy ndarray a obs x num_features matrix showing dataset y_ids : numpy ndarray numpy 1 x obs array of class labels for the rows of `P` cv : int (default=10) overloaded term of monte carlo restarts of the SVM evaluation over 10 fold CV Returns ------- tuple : (acc, std) tuple containing the mean accuracies of performing 10 fold cross validation 10 times. This gives a better picture of usual performance expected performance in a Monte Carlo fashion instead of presenting just best performance. """ acc_results = [] Y = np.array(y_ids) seeds = range(cv) for i in range(cv): seed = seeds[i] X_train, X_test, Y_train, Y_test = train_test_split(P, Y, test_size = 0.1, random_state = seed) scores = rbf_svm_classify(X_train, X_test, Y_train, Y_test) acc_results.append(scores[0]) return np.mean(acc_results), np.std(acc_results)