Source code for polygon_classification.param_tuning

# Author: vkaff
# E-mail: vkaffes@imis.athena-innovation.gr

from polygon_classification import config

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# np.random.seed(config.seed_no)


class ParamTuning:
    """
    This class provides the methods for fine-tuning hyperparameters, and for
    training and testing various classification models for polygon
    classification.

    The following classifiers are examined:

    * Support Vector Machine (SVM)
    * Decision Trees
    * Multi-Layer Perceptron (MLP)
    * Random Forest
    * Extra-Trees
    * eXtreme Gradient Boosting (XGBoost)
    """

    clf_names = {
        'SVM': [SVC, config.MLConf.SVM_hyperparameters, config.MLConf.SVM_hyperparameters_dist],
        'DecisionTree': [
            DecisionTreeClassifier,
            config.MLConf.DecisionTree_hyperparameters,
            config.MLConf.DecisionTree_hyperparameters_dist
        ],
        'MLP': [MLPClassifier, config.MLConf.MLP_hyperparameters, config.MLConf.MLP_hyperparameters_dist],
        'RandomForest': [
            RandomForestClassifier,
            config.MLConf.RandomForest_hyperparameters,
            config.MLConf.RandomForest_hyperparameters_dist
        ],
        'ExtraTrees': [
            ExtraTreesClassifier,
            config.MLConf.RandomForest_hyperparameters,
            config.MLConf.RandomForest_hyperparameters_dist
        ],
        'XGBoost': [XGBClassifier, config.MLConf.XGBoost_hyperparameters, config.MLConf.XGBoost_hyperparameters_dist]
    }

    def __init__(self):
        # Stratified k-fold splitter, to be used in CV by both search methods
        self.outer_cv = StratifiedKFold(
            n_splits=config.MLConf.kfold_parameter, shuffle=True, random_state=config.seed_no
        )

        self.kfold = config.MLConf.kfold_parameter
        self.n_jobs = config.MLConf.n_jobs

        self.search_method = config.MLConf.hyperparams_search_method
        self.n_iter = config.MLConf.max_iter
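    # Illustrative note (added; not part of the original source): each
    # clf_names entry bundles an estimator class with its grid-search and
    # randomized-search hyperparameter specifications, and is indexed
    # positionally by fineTuneClassifiers below, e.g.:
    #
    #     estimator_cls, grid_params, dist_params = ParamTuning.clf_names['SVM']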
    def fineTuneClassifiers(self, X, y, classifiers):
        """Search over specified parameter values for various estimators/classifiers
        and choose the best one.

        This method searches over the specified values and selects the classifier
        that achieves the best average score, as defined by
        :attr:`~polygon_classification.config.MLConf.score`, across all evaluations.
        The supported search methods are:

        * *GridSearchCV*: Exhaustive search over specified parameter values for
          supported estimators. The following variables are defined in
          :class:`~polygon_classification.config.MLConf`:

          * :attr:`~polygon_classification.config.MLConf.MLP_hyperparameters`
          * :attr:`~polygon_classification.config.MLConf.RandomForest_hyperparameters`
          * :attr:`~polygon_classification.config.MLConf.XGBoost_hyperparameters`
          * :attr:`~polygon_classification.config.MLConf.SVM_hyperparameters`
          * :attr:`~polygon_classification.config.MLConf.DecisionTree_hyperparameters`

        * *RandomizedSearchCV*: Randomized search over a continuous distribution
          space. :attr:`~polygon_classification.config.MLConf.max_iter` defines the
          number of parameter settings that are sampled, and trades off runtime vs
          quality of the solution. The following variables are defined in
          :class:`~polygon_classification.config.MLConf`:

          * :attr:`~polygon_classification.config.MLConf.MLP_hyperparameters_dist`
          * :attr:`~polygon_classification.config.MLConf.RandomForest_hyperparameters_dist`
          * :attr:`~polygon_classification.config.MLConf.XGBoost_hyperparameters_dist`
          * :attr:`~polygon_classification.config.MLConf.SVM_hyperparameters_dist`
          * :attr:`~polygon_classification.config.MLConf.DecisionTree_hyperparameters_dist`

        Parameters
        ----------
        X: array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples.
        y: array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values, i.e. class labels.
        classifiers: list of str
            Names of the classifiers to tune; each must be a key of
            :attr:`clf_names`.

        Returns
        -------
        :obj:`dict`
            The best-performing configuration found, with keys *score* (best CV
            score), *results* (the full ``cv_results_``), *hyperparams* (best
            parameters), *estimator* (the fitted best estimator), *clf_name*
            (the name of the model in reference) and *scorers*.
        """
        hyperparams_data = list()

        for clf_key in classifiers:
            try:
                print(f'Tuning {clf_key}...')

                clf = None
                if self.search_method.lower() == 'grid':
                    clf = GridSearchCV(
                        self.clf_names[clf_key][0](random_state=config.seed_no),
                        self.clf_names[clf_key][1],
                        cv=self.outer_cv, scoring=config.MLConf.score,
                        verbose=1, n_jobs=self.n_jobs
                    )
                # elif self.search_method.lower() == 'hyperband' and clf_key in ['XGBoost', 'Extra-Trees', 'Random Forest']:
                #     HyperbandSearchCV(
                #         clf_val[0](probability=True) if clf_key == 'SVM' else clf_val[0](),
                #         clf_val[2].copy().pop('n_estimators'),
                #         resource_param='n_estimators',
                #         min_iter=500 if clf_key == 'XGBoost' else 200,
                #         max_iter=3000 if clf_key == 'XGBoost' else 1000,
                #         cv=self.inner_cv, random_state=seed_no, scoring=score
                #     )
                else:  # randomized search is used as the default
                    clf = RandomizedSearchCV(
                        self.clf_names[clf_key][0](),
                        self.clf_names[clf_key][2],
                        cv=self.outer_cv, scoring=config.MLConf.score,
                        verbose=1, n_jobs=self.n_jobs, n_iter=self.n_iter
                    )
                clf.fit(X, y)

                hyperparams_found = dict()
                hyperparams_found['score'] = clf.best_score_
                hyperparams_found['results'] = clf.cv_results_
                hyperparams_found['hyperparams'] = clf.best_params_
                hyperparams_found['estimator'] = clf.best_estimator_
                hyperparams_found['clf_name'] = clf_key
                hyperparams_found['scorers'] = clf.scorer_

                hyperparams_data.append(hyperparams_found)
            except KeyError as e:
                print("Unsupported classifier key {}: {}".format(clf_key, str(e)))

        # Keep the classifier with the best cross-validation score
        best_clf = max(hyperparams_data, key=lambda x: x['score'])

        return best_clf
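    # Illustrative sketch (added; not part of the original source): the two
    # search modes expect differently shaped hyperparameter specifications in
    # config.MLConf. The concrete values below are hypothetical examples, not
    # the project's actual configuration:
    #
    #     # GridSearchCV enumerates every combination of explicit values:
    #     SVM_hyperparameters = {'kernel': ['rbf', 'linear'], 'C': [1, 10, 100]}
    #
    #     # RandomizedSearchCV samples n_iter settings, optionally drawing
    #     # from scipy.stats distributions:
    #     from scipy.stats import expon
    #     SVM_hyperparameters_dist = {'kernel': ['rbf', 'linear'], 'C': expon(scale=100)}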
    def trainClassifier(self, X_train, y_train, model):
        """Build a classifier from the training set (X_train, y_train).

        Parameters
        ----------
        X_train: array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples.
        y_train: array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values, i.e. class labels.
        model: classifier object
            An instance of a classifier.

        Returns
        -------
        classifier object
            It returns a trained classifier.
        """
        # if hasattr(model, "n_jobs"): model.set_params(n_jobs=config.MLConf.n_jobs)
        model.fit(X_train, y_train)

        return model
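    # Note (added): the commented-out guard above is needed because not every
    # estimator exposes an ``n_jobs`` parameter (e.g. SVC and MLPClassifier do
    # not, while RandomForestClassifier and XGBClassifier do); calling
    # set_params(n_jobs=...) on an estimator without it raises a ValueError.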
    def testClassifier(self, X_test, y_test, model, proba=False):
        """Evaluate a classifier on a test set (X_test, y_test).

        Parameters
        ----------
        X_test: array-like or sparse matrix, shape = [n_samples, n_features]
            The test input samples.
        y_test: array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values, i.e. class labels.
        model: classifier object
            A trained classifier.
        proba: bool
            Predict class probabilities for X_test if it is ``True``.

        Returns
        -------
        :obj:`dict`
            A dictionary with keys *metrics* (the computed *accuracy*,
            *precision*, *recall* and *f1_score*, plus a per-sample boolean
            *stats* array marking correct predictions), *feature_imp* (the
            feature importances, if the model exposes them) and *proba* (the
            predicted class probabilities, if requested).
        """
        y_pred = model.predict(X_test)

        y_proba = None
        if proba:
            y_proba = model.predict_proba(X_test)

        acc = accuracy_score(y_test, y_pred)
        pre = precision_score(y_test, y_pred, average='macro')
        rec = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        # Per-sample correctness; assumes y_test is a pandas Series
        res = np.equal(y_test.to_numpy(), y_pred)

        fimportance = None
        if hasattr(model, 'feature_importances_'):
            fimportance = model.feature_importances_
        # elif hasattr(model, 'coef_'):
        #     fimportance = model.coef_

        return dict(
            metrics=dict(accuracy=acc, precision=pre, recall=rec, f1_score=f1, stats=res),
            feature_imp=fimportance,
            proba=y_proba
        )
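
# Minimal usage sketch (added for illustration; not part of the original
# module). It assumes polygon_classification.config defines the MLConf
# attributes referenced above, and substitutes a synthetic dataset for real
# polygon features; y is wrapped in a pandas Series because testClassifier
# calls y_test.to_numpy().
if __name__ == '__main__':
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=500, n_features=10, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, pd.Series(y), test_size=0.25, stratify=y, random_state=0
    )

    pt = ParamTuning()
    # Tune a subset of the supported classifiers and keep the best one
    best = pt.fineTuneClassifiers(X_train, y_train, ['RandomForest', 'XGBoost'])
    model = pt.trainClassifier(X_train, y_train, best['estimator'])
    results = pt.testClassifier(X_test, y_test, model, proba=True)
    print(best['clf_name'], results['metrics'])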