Source code for polygon_classification.core

# -*- coding: utf-8 -*-
# Author: vkaff
# E-mail: vkaffes@imis.athena-innovation.gr

import time
from sklearn.model_selection import train_test_split
from joblib import dump, load
import numpy as np
import pandas as pd
# We'll use this library to make the display pretty
from tabulate import tabulate
import os

from polygon_classification import param_tuning, config
from polygon_classification.features import Features
from polygon_classification.helpers import StaticValues


class StrategyEvaluator:
    """Drive the tuning, training and evaluation pipelines for polygon classification.

    Every public method reads a CSV dataset that contains a ``status`` label column, builds features with
    :class:`~polygon_classification.features.Features` and delegates model selection, fitting and scoring to
    :class:`~polygon_classification.param_tuning.ParamTuning`. Progress, timings and scores are printed to
    stdout; nothing is returned.
    """
    # Upper bound on the number of feature importances listed by ``_print_stats``.
    max_features_toshow = 10

    def __init__(self):
        pass

    def hyperparamTuning(self, dataset, classifiers):
        """Tune candidate classifiers on a train split, refit the best one and score it on the test split.

        Parameters
        ----------
        dataset : str
            Path of the CSV dataset; it is split into train/test parts by ``_load_and_split_data``.
        classifiers
            Candidate classifiers, forwarded unchanged to ``ParamTuning.fineTuneClassifiers``.
        """
        pt = param_tuning.ParamTuning()
        f = Features()

        tot_time = time.time()
        start_time = time.time()
        Xtrain, Xtest, ytrain, ytest = self._load_and_split_data(dataset)
        print("Loaded train/test datasets in {} sec.".format(time.time() - start_time))

        # Restart the clock so the reported duration covers feature building only, not data loading.
        start_time = time.time()
        fX = f.build(Xtrain)
        print("Build features from train data in {} sec.".format(time.time() - start_time))

        start_time = time.time()
        # 1st phase: find and fine tune the best classifier from a list of candidate ones
        best_clf = pt.fineTuneClassifiers(fX, ytrain, classifiers)
        estimator = best_clf['estimator']
        print("Best hyperparams, {}, with score {}; {} sec.".format(
            best_clf['hyperparams'], best_clf['score'], time.time() - start_time))

        start_time = time.time()
        # 2nd phase: train the fine tuned best classifier on the whole train dataset (no folds)
        estimator = pt.trainClassifier(fX, ytrain, estimator)
        print("Finished training model on dataset; {} sec.".format(time.time() - start_time))

        start_time = time.time()
        fX = f.build(Xtest)
        print("Build features from test data in {} sec".format(time.time() - start_time))

        start_time = time.time()
        # 3rd phase: test the fine tuned best classifier on the test dataset
        res = pt.testClassifier(fX, ytest, estimator)
        self._print_stats(best_clf['clf_name'], res['metrics'], res['feature_imp'], start_time)

        print("The whole process took {} sec.".format(time.time() - tot_time))

    def exec_classifiers(self, dataset):
        """Train and evaluate every classifier listed in ``config.MLConf.clf_custom_params``.

        Features are built once for the train and test splits and reused for all classifiers.

        Parameters
        ----------
        dataset : str
            Path of the CSV dataset; it is split into train/test parts by ``_load_and_split_data``.
        """
        f = Features()
        pt = param_tuning.ParamTuning()

        start_time = time.time()
        Xtrain, Xtest, ytrain, ytest = self._load_and_split_data(dataset)
        print("Loaded train/test datasets in {} sec.".format(time.time() - start_time))

        # Restart the clock so the reported duration covers feature building only, not data loading.
        start_time = time.time()
        fX_train = f.build(Xtrain)
        fX_test = f.build(Xtest)
        print("Build features from train/test data in {} sec".format(time.time() - start_time))

        for clf in config.MLConf.clf_custom_params:
            print('Method {}'.format(clf))
            # Underline exactly as wide as the 'Method <name>' heading above.
            print('=======' + len(clf) * '=')

            tot_time = time.time()
            start_time = time.time()
            # 1st phase: train each classifier on the whole train dataset (no folds).
            # The estimator is created with the project seed first; the custom params are applied afterwards.
            estimator = pt.clf_names[clf][0](random_state=config.seed_no)
            estimator.set_params(**config.MLConf.clf_custom_params[clf])
            estimator = pt.trainClassifier(fX_train, ytrain, estimator)

            print("Finished training model on dataset; {} sec.".format(time.time() - start_time))

            start_time = time.time()
            # 2nd phase: test each classifier on the test dataset
            res = pt.testClassifier(fX_test, ytest, estimator)
            self._print_stats(clf, res['metrics'], res['feature_imp'], start_time)

            print("The whole process took {} sec.\n".format(time.time() - tot_time))

    def train(self, dataset, classifiers):
        """Tune the candidate classifiers on a train dataset, refit the best one and persist it.

        The fitted model is written to ``<cwd>/models/<clf_name>_model.joblib``, which is the location
        ``evaluate`` reads from.

        Parameters
        ----------
        dataset : str
            Path of the train dataset (CSV with a ``status`` label column)
        classifiers : str
            Comma separated classifiers to tune
        """
        pt = param_tuning.ParamTuning()
        f = Features()

        tot_time = time.time()
        start_time = time.time()
        Xtrain, ytrain = self._read_labelled_csv(dataset)
        print("Loaded train dataset in {} sec.".format(time.time() - start_time))

        # Restart the clock so the reported duration covers feature building only, not data loading.
        start_time = time.time()
        fX = f.build(Xtrain)
        print("Build features from train data in {} sec.".format(time.time() - start_time))

        start_time = time.time()
        # 1st phase: find and fine tune the best classifier from a list of candidate ones
        best_clf = pt.fineTuneClassifiers(fX, ytrain, classifiers)
        estimator = best_clf['estimator']
        print("Best hyperparams for {}, {}, with score {}; {} sec.".format(
            best_clf['clf_name'], best_clf['hyperparams'], best_clf['score'], time.time() - start_time))

        # 2nd phase: refit on the whole train dataset and store the model for later evaluation
        estimator = pt.trainClassifier(fX, ytrain, estimator)
        models_dir = os.path.join(os.getcwd(), 'models')
        os.makedirs(models_dir, exist_ok=True)
        dump(estimator, os.path.join(models_dir, best_clf['clf_name'] + '_model.joblib'))

        print("The whole process took {} sec.".format(time.time() - tot_time))

    def evaluate(self, dataset, classifier):
        """Score a previously trained model on new unseen data and export its predictions.

        The model is loaded from ``<cwd>/models/<classifier>_model.joblib`` (as written by ``train``).
        Predictions are saved to ``output/predictions.csv`` together with the ``pst_geom`` and ``dian_geom``
        columns of the dataset, so both columns must be present in the CSV.

        Parameters
        ----------
        dataset : str
            Path of the test dataset (CSV with a ``status`` label column)
        classifier : str
            Name of the classifier whose saved model is loaded and evaluated
        """
        pt = param_tuning.ParamTuning()
        f = Features()

        tot_time = time.time()
        start_time = time.time()
        Xtest, ytest = self._read_labelled_csv(dataset)
        print("Loaded test dataset in {} sec.".format(time.time() - start_time))

        start_time = time.time()
        fX = f.build(Xtest)
        print("Build features from test data in {} sec".format(time.time() - start_time))

        start_time = time.time()
        # Test the stored, fine tuned classifier on the test dataset.
        # NOTE(review): joblib.load unpickles the file; only load models from a trusted location.
        estimator = load(os.path.join(os.getcwd(), 'models', classifier + '_model.joblib'))
        # NOTE(review): the trailing True presumably asks for class probabilities, since res['proba'] is
        # read below -- confirm against ParamTuning.testClassifier.
        res = pt.testClassifier(fX, ytest, estimator, True)
        self._print_stats(classifier, res['metrics'], res['feature_imp'], start_time)

        # A positional index lets the probabilities line up row by row in the concat below; drop=True
        # avoids adding a throwaway 'index' column to the frame.
        Xtest = Xtest.reset_index(drop=True)
        proba_df = pd.DataFrame(res['proba'], columns=['none_origin_pred', 'dian_origin_pred'])
        Xtest = pd.concat([Xtest, proba_df], axis=1)
        os.makedirs('output', exist_ok=True)
        Xtest[['pst_geom', 'dian_geom', 'none_origin_pred', 'dian_origin_pred']].to_csv(
            os.path.join('output', 'predictions.csv'), index=False)

        print("The whole process took {} sec.".format(time.time() - tot_time))

    def _print_stats(self, clf, params, fimp, tt):
        """Print one row of evaluation metrics and, when available, the top feature importances.

        Parameters
        ----------
        clf : str
            Name of the method shown in the first column.
        params : dict
            Metrics with the keys ``accuracy``, ``precision``, ``recall`` and ``f1_score``.
        fimp : array-like or None
            Feature importances; zero-valued entries are skipped and at most ``max_features_toshow``
            of the remaining ones are listed in descending order. ``None`` prints no table.
        tt : float
            Start timestamp (as returned by ``time.time()``); the elapsed time since then is printed.
        """
        print("| Method\t\t& Accuracy\t& Precision\t& Recall\t& F1-Score\t& Time (sec)")
        print("||{0}\t& {1}\t& {2}\t& {3}\t& {4}\t& {5}".format(
            clf, params['accuracy'], params['precision'], params['recall'], params['f1_score'],
            time.time() - tt))

        if fimp is not None:
            importances = np.ma.masked_equal(fimp, 0.0)
            # With nothing masked numpy stores the scalar `nomask`; expand it to a full boolean array so
            # it can be inverted and used to index the feature names below.
            if importances.mask is np.ma.nomask:
                importances.mask = np.zeros(importances.shape, dtype=bool)

            scores = importances.compressed()
            top_n = min(scores.shape[0], self.max_features_toshow)
            indices = np.argsort(scores)[::-1][:top_n]
            headers = ["name", "score"]

            if config.MLConf.extra_features is False:
                fcols = StaticValues.featureCols
            else:
                fcols = StaticValues.featureCols + StaticValues.extra_featureCols
            names = np.asarray(fcols, object)[~importances.mask][indices]
            print(tabulate(zip(names, scores[indices]), headers, tablefmt="simple"))

        print()

    @staticmethod
    def _read_labelled_csv(dataset):
        """Read ``dataset`` with pandas and return ``(features, labels)``.

        ``labels`` is the ``status`` column; ``features`` is the frame without that column.
        """
        data_df = pd.read_csv(dataset)
        return data_df.drop('status', axis=1), data_df['status']

    def _load_and_split_data(self, dataset):
        """Read ``dataset`` and split it into stratified, shuffled train/test parts.

        The test fraction and the random seed come from ``config.test_split_thres`` and ``config.seed_no``.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``. The label series are re-indexed from 0, while the
            feature frames keep the row index they had in the full dataset.
        """
        X, y = self._read_labelled_csv(dataset)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=config.test_split_thres, random_state=config.seed_no, stratify=y, shuffle=True
        )

        y_train.reset_index(drop=True, inplace=True)
        y_test.reset_index(drop=True, inplace=True)

        return X_train, X_test, y_train, y_test