Source code for geocoding.clf_utilities

import numpy as np
from itertools import product
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier

from geocoding.config import Config


clf_callable_map = {
    'NaiveBayes': GaussianNB(),
    'NearestNeighbors': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(solver='liblinear', multi_class='auto'),
    'SVM': SVC(),
    'MLP': MLPClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'ExtraTrees': ExtraTreesClassifier(),
    'XGBoost': XGBClassifier()
}

clf_hparams_map = {
    'NaiveBayes': [Config.NB_hparams, Config.NB_hparams_dist],
    'NearestNeighbors': [Config.NN_hparams, Config.NN_hparams_dist],
    'LogisticRegression': [Config.LR_hparams, Config.LR_hparams_dist],
    'SVM': [Config.SVM_hparams, Config.SVM_hparams_dist],
    'MLP': [Config.MLP_hparams, Config.MLP_hparams_dist],
    'DecisionTree': [Config.DT_hparams, Config.DT_hparams_dist],
    'RandomForest': [Config.RF_hparams, Config.RF_hparams_dist],
    'ExtraTrees': [Config.ET_hparams, Config.RF_hparams_dist],
    'XGBoost': [Config.XGB_hparams, Config.XGB_hparams_dist],
}


[docs]def train_classifier(clf_name, X_train, y_train):
    """
    Trains a classifier through grid search.

    Args:
        clf_name (str): Classifier's name to be trained
        X_train (numpy.ndarray): Train features array
        y_train (numpy.ndarray): Train labels array

    Returns:
        object: The trained classifier
    """
    clf = clf_callable_map[clf_name]
    if Config.hyperparams_search_method.lower() == 'grid':
        params = clf_hparams_map[clf_name][0]
        clf = GridSearchCV(clf, params, cv=Config.n_folds, n_jobs=Config.n_jobs, verbose=Config.verbose)
        # elif self.search_method.lower() == 'hyperband' and clf_key in ['XGBoost', 'Extra-Trees', 'Random Forest']:
        #     HyperbandSearchCV(
        #         clf_val[0](probability=True) if clf_key == 'SVM' else clf_val[0](), clf_val[2].copy().pop('n_estimators'),
        #         resource_param='n_estimators',
        #         min_iter=500 if clf_key == 'XGBoost' else 200,
        #         max_iter=3000 if clf_key == 'XGBoost' else 1000,
        #         cv=self.inner_cv, random_state=seed_no, scoring=score
        #     )
    else:  # randomized is used as default
        params = clf_hparams_map[clf_name][1]
        clf = RandomizedSearchCV(
            clf, params, cv=Config.n_folds, n_jobs=Config.n_jobs, verbose=Config.verbose, n_iter=Config.max_iter
        )
    clf.fit(X_train, y_train)
    return clf


[docs]def evaluate(y_test, y_pred):
    """
    Evaluates model predictions through a series of metrics.

    Args:
        y_test (numpy.ndarray): True labels
        y_pred (numpy.ndarray): Predicted labels

    Returns:
        dict: Contains metrics names as keys and the corresponding values as \
        values
    """
    y_pred = y_pred[:, :1]
    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'f1_macro': f1_score(y_test, y_pred, average='macro'),
        'f1_micro': f1_score(y_test, y_pred, average='micro'),
        'f1_weighted': f1_score(y_test, y_pred, average='weighted'),
    }
    return scores


[docs]def normalize_scores(scores):
    """
    Normalizes predictions scores to a probabilities-like format.

    Args:
        scores (list): Contains the predictions scores as predicted by the \
            model

    Returns:
        list: The normalized scores
    """
    s = sum(scores)
    normalized = [score/s for score in scores]
    return normalized


[docs]def get_predictions(model, X_test):
    """
    Makes predictions utilizing *model* over *X_test*.

    Args:
        model (object): The model to be used for predictions
        X_test (numpy.ndarray): The test features array

    Returns:
        list: Contains predictions in (label, score) pairs
    """
    preds = model.predict_proba(X_test)
    y_preds = []
    for pred in preds:
        labels = np.argsort(-pred)
        scores = normalize_scores(pred[labels])
        y_preds.append(zip(labels, scores))
    return y_preds


[docs]def inverse_transform_labels(encoder, preds):
    """
    Utilizes *encoder* to transform encoded labels back to the original \
    strings.

    Args:
        encoder (sklearn.preprocessing.LabelEncoder): The encoder to be \
            utilized
        k_preds (list): Contains predictions in (label, score) pairs

    Returns:
        list: Contains predictions in (label, score) pairs, where label is \
            now in the original string format
    """
    label_mapping = dict(
        zip(encoder.transform(encoder.classes_), encoder.classes_))
    k_preds_new = [(label_mapping[pred[0]], pred[1]) for k_pred in preds
                   for pred in k_pred]
    return k_preds_new


[docs]def is_valid(clf_name):
    """
    Checks whether *clf_name* is a valid classifier's name with respect to \
    the experiment setup.

    Args:
        clf_name (str): Classifier's name

    Returns:
        bool: Returns True if given classifier's name is valid
    """
    supported_clfs = [
        clf for clf in Config.supported_classifiers if clf != 'Baseline'
    ]
    if clf_name not in supported_clfs:
        print('Supported classifiers:', supported_clfs)
        return False
    return True


[docs]def create_clf_params_product_generator(params_grid):
    """
    Generates all possible combinations of classifier's hyperparameters values.

    Args:
        params_grid (dict): Contains classifier's hyperparameters names as \
            keys and the correspoding search space as values

    Yields:
        dict: Contains a classifier's hyperparameters configuration
    """
    keys = params_grid.keys()
    vals = params_grid.values()
    for instance in product(*vals):
        yield dict(zip(keys, instance))