Source code for interlinking.hyperparam_tuning

# Author: vkaff
# E-mail: vkaffes@imis.athena-innovation.gr

from interlinking import config
import numpy as np

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold


np.random.seed(config.seed_no)


[docs]class ParamTuning:
    """
    This class provides all main methods for selecting, fine tuning hyperparameters, training and testing the best
    classifier for toponym matching. The following classifiers are examined:

    * Support Vector Machine (SVM)
    * Decision Trees
    * Multi-Layer Perceptron (MLP)
    * Random Forest
    * Extra-Trees
    * eXtreme Gradient Boosting (XGBoost)
    """
    clf_names = {
        'SVM': [SVC, config.MLConf.SVM_hyperparameters, config.MLConf.SVM_hyperparameters_dist],
        'DecisionTree': [DecisionTreeClassifier, config.MLConf.DecisionTree_hyperparameters,
                          config.MLConf.DecisionTree_hyperparameters_dist],
        'MLP': [MLPClassifier, config.MLConf.MLP_hyperparameters, config.MLConf.MLP_hyperparameters_dist],
        'RandomForest': [RandomForestClassifier, config.MLConf.RandomForest_hyperparameters,
                          config.MLConf.RandomForest_hyperparameters_dist],
        'ExtraTrees': [ExtraTreesClassifier, config.MLConf.RandomForest_hyperparameters,
                        config.MLConf.RandomForest_hyperparameters_dist],
        'XGBoost': [XGBClassifier, config.MLConf.XGBoost_hyperparameters, config.MLConf.XGBoost_hyperparameters_dist]
    }

    def __init__(self):
        # To be used in outer CV
        self.outer_cv = StratifiedKFold(n_splits=config.MLConf.kfold_no, shuffle=False)

        self.kfold = config.MLConf.kfold_no
        self.n_jobs = config.MLConf.n_jobs

        self.search_method = config.MLConf.hyperparams_search_method
        self.n_iter = config.MLConf.max_iter

[docs]    def fineTuneClassifiers(self, X, y):
        """Search over specified parameter values for various estimators/classifiers and choose the best one.

        This method searches over specified values and selects the classifier that
        achieves the best avg accuracy score for all evaluations. The supported search methods are:

        * *GridSearchCV*: Exhaustive search over specified parameter values for supported estimators.
          The following variables are defined in :func:`~interlinking.config.MLConf` :

         * :attr:`~interlinking.config.MLConf.MLP_hyperparameters`
         * :attr:`~interlinking.config.MLConf.RandomForests_hyperparameters`
         * :attr:`~interlinking.config.MLConf.XGBoost_hyperparameters`
         * :attr:`~interlinking.config.MLConf.SVM_hyperparameters`
         * :attr:`~interlinking.config.MLConf.DecisionTree_hyperparameters`

        * *RandomizedSearchCV*: Randomized search over continuous distribution space. :attr:`~interlinking.config.MLConf.max_iter`
          defines the number of parameter settings that are sampled. :py:attr:`~interlinking.config.MLConf.max_iter` trades off
          runtime vs quality of the solution. The following variables are defined in :func:`~interlinking.config.MLConf` :

         * :attr:`~interlinking.config.MLConf.MLP_hyperparameters_dist`
         * :attr:`~interlinking.config.MLConf.RandomForests_hyperparameters_dist`
         * :attr:`~interlinking.config.MLConf.XGBoost_hyperparameters_dist`
         * :attr:`~interlinking.config.MLConf.SVM_hyperparameters_dist`
         * :attr:`~interlinking.config.MLConf.DecisionTree_hyperparameters_dist`

        Parameters
        ----------
        X: array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples.
        y: array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values, i.e. class labels.

        Returns
        -------
        out: :obj:`dict` of {:obj:`str`: :obj:`int`, :obj:`str`: :obj:`str`}
            It returns a dictionary with keys *accuracy*, i.e., the used similarity score, and *classifier*, i.e.,
            the name of the model in reference.
        """
        hyperparams_data = list()

        for clf_key in config.MLConf.classifiers:
            try:
                clf = None
                if self.search_method.lower() == 'grid':
                    clf = GridSearchCV(
                        self.clf_names[clf_key][0](), self.clf_names[clf_key][1],
                        cv=self.outer_cv, scoring=config.MLConf.score, verbose=1, n_jobs=self.n_jobs
                    )
                # elif self.search_method.lower() == 'hyperband' and clf_key in ['XGBoost', 'Extra-Trees', 'Random Forest']:
                #     HyperbandSearchCV(
                #         clf_val[0](probability=True) if clf_key == 'SVM' else clf_val[0](), clf_val[2].copy().pop('n_estimators'),
                #         resource_param='n_estimators',
                #         min_iter=500 if clf_key == 'XGBoost' else 200,
                #         max_iter=3000 if clf_key == 'XGBoost' else 1000,
                #         cv=self.inner_cv, random_state=seed_no, scoring=score
                #     )
                else:  # randomized is used as default
                    clf = RandomizedSearchCV(
                        self.clf_names[clf_key][0](), self.clf_names[clf_key][2],
                        cv=self.outer_cv, scoring=config.MLConf.score, verbose=1, n_jobs=self.n_jobs, n_iter=self.n_iter
                    )
                clf.fit(X, y)

                hyperparams_found = dict()
                hyperparams_found['score'] = clf.best_score_
                hyperparams_found['results'] = clf.cv_results_
                hyperparams_found['hyperparams'] = clf.best_params_
                hyperparams_found['estimator'] = clf.best_estimator_
                hyperparams_found['classifier'] = clf_key
                hyperparams_found['scorers'] = clf.scorer_

                hyperparams_data.append(hyperparams_found)
            except KeyError as e:
                print("type error: {} for key: {}".format(str(e), clf_key))

        _, best_clf = max(enumerate(hyperparams_data), key=(lambda x: x[1]['score']))

        return best_clf

[docs]    def trainClassifier(self, X_train, y_train, model):
        """Build a classifier from the training set (X_train, y_train).

        Parameters
        ----------
        X_train: array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples.
        y_train: array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values, i.e. class labels.
        model: classifier object
            An instance of a classifier.

        Returns
        -------
        classifier object
            It returns a trained classifier.
        """
        if hasattr(model, "n_jobs"): model.set_params(n_jobs=config.MLConf.n_jobs)

        model.fit(X_train, y_train)
        return model

[docs]    def testClassifier(self, X_test, y_test, model):
        """Evaluate a classifier on a testing set (X_test, y_test).

        Parameters
        ----------
        X_test: array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples.
        y_test: array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values, i.e. class labels.
        model: classifier object
            A trained classifier.

        Returns
        -------
        tuple of (float, float, float, float)
            Returns the computed metrics, i.e., *accuracy*, *precision*, *recall* and *f1*, for the specified model on the test
            dataset.
        """
        y_pred = model.predict(X_test)

        metrics = dict()
        metrics['accuracy'] = accuracy_score(y_test, y_pred)
        metrics['precision'] = precision_score(y_test, y_pred)
        metrics['recall'] = recall_score(y_test, y_pred)
        metrics['f1_score'] = f1_score(y_test, y_pred)

        return metrics