# -*- coding: utf-8 -*-
# Author: vkaff
# E-mail: vkaffes@imis.athena-innovation.gr
import time
from sklearn.model_selection import train_test_split
from joblib import dump, load
import numpy as np
import pandas as pd
# We'll use this library to make the display pretty
from tabulate import tabulate
import os
from polygon_classification import param_tuning, config
from polygon_classification.features import Features
from polygon_classification.helpers import StaticValues
class StrategyEvaluator:
    """Pipelines for tuning, training and evaluating polygon classifiers.

    Every public method reads a CSV dataset that must contain a ``status``
    column (used as the target), builds features from the remaining columns
    with :class:`Features` and delegates the model handling to
    :class:`param_tuning.ParamTuning`. Progress and timings are printed to
    standard output.
    """

    # Upper bound on the number of (non-zero) feature importances that
    # ``_print_stats`` lists.
    max_features_toshow = 10

    def __init__(self):
        pass

    def hyperparamTuning(self, dataset, classifiers):
        """Tune candidate classifiers, retrain the best one and test it.

        The dataset is split into train/test parts; the best classifier is
        selected and fine tuned on the train part, retrained on the whole
        train part and finally scored on the test part.

        Parameters
        ----------
        dataset : str
            Path of the CSV dataset; it must contain a ``status`` column.
        classifiers
            Candidate classifiers, forwarded unchanged to
            ``ParamTuning.fineTuneClassifiers``.
        """
        pt = param_tuning.ParamTuning()
        f = Features()

        tot_time = start_time = time.time()
        Xtrain, Xtest, ytrain, ytest = self._load_and_split_data(dataset)
        print("Loaded train/test datasets in {} sec.".format(time.time() - start_time))

        # Restart the timer so the next figure covers feature building only
        # and not the loading step that was already reported above.
        start_time = time.time()
        fX = f.build(Xtrain)
        print("Build features from train data in {} sec.".format(time.time() - start_time))

        start_time = time.time()
        # 1st phase: find and fine tune the best classifier from a list of candidate ones
        best_clf = pt.fineTuneClassifiers(fX, ytrain, classifiers)
        estimator = best_clf['estimator']
        print("Best hyperparams, {}, with score {}; {} sec.".format(
            best_clf['hyperparams'], best_clf['score'], time.time() - start_time))

        start_time = time.time()
        # 2nd phase: train the fine tuned best classifier on the whole train dataset (no folds)
        estimator = pt.trainClassifier(fX, ytrain, estimator)
        print("Finished training model on dataset; {} sec.".format(time.time() - start_time))

        start_time = time.time()
        fX = f.build(Xtest)
        print("Build features from test data in {} sec".format(time.time() - start_time))

        start_time = time.time()
        # 3rd phase: test the fine tuned best classifier on the test dataset
        res = pt.testClassifier(fX, ytest, estimator)
        self._print_stats(best_clf['clf_name'], res['metrics'], res['feature_imp'], start_time)

        print("The whole process took {} sec.".format(time.time() - tot_time))

    def exec_classifiers(self, dataset):
        """Train and evaluate selected ML algorithms with custom hyper-parameters on dataset.

        One estimator is built, trained and tested for every entry of
        ``config.MLConf.clf_custom_params``; no hyper-parameter search is
        performed.

        Parameters
        ----------
        dataset : str
            Path of the CSV dataset; it must contain a ``status`` column.
        """
        f = Features()
        pt = param_tuning.ParamTuning()

        start_time = time.time()
        Xtrain, Xtest, ytrain, ytest = self._load_and_split_data(dataset)
        print("Loaded train/test datasets in {} sec.".format(time.time() - start_time))

        # Restart the timer so the next figure covers feature building only.
        start_time = time.time()
        fX_train = f.build(Xtrain)
        fX_test = f.build(Xtest)
        print("Build features from train/test data in {} sec".format(time.time() - start_time))

        for clf in config.MLConf.clf_custom_params:
            print('Method {}'.format(clf))
            print('=======', end='')
            print(len(clf) * '=')

            # Timings are tracked per classifier, hence both timers restart here.
            tot_time = start_time = time.time()
            # 1st phase: train each classifier on the whole train dataset (no folds)
            estimator = pt.clf_names[clf][0](random_state=config.seed_no)
            estimator.set_params(**config.MLConf.clf_custom_params[clf])
            estimator = pt.trainClassifier(fX_train, ytrain, estimator)
            print("Finished training model on dataset; {} sec.".format(time.time() - start_time))

            start_time = time.time()
            # 2nd phase: test each classifier on the test dataset
            res = pt.testClassifier(fX_test, ytest, estimator)
            self._print_stats(clf, res['metrics'], res['feature_imp'], start_time)

            print("The whole process took {} sec.\n".format(time.time() - tot_time))

    def train(self, dataset, classifiers):
        """Find the best classifier for the data at hand, train it and persist it.

        The whole dataset is used for training (no train/test split). The
        trained model is written to ``models/<clf_name>_model.joblib`` under
        the current working directory.

        Parameters
        ----------
        dataset : str
            Name of train dataset
        classifiers : str
            Comma separated classifiers to tune
        """
        pt = param_tuning.ParamTuning()
        f = Features()

        tot_time = start_time = time.time()
        data_df = pd.read_csv(dataset)
        ytrain = data_df['status']
        Xtrain = data_df.drop('status', axis=1)
        print("Loaded train dataset in {} sec.".format(time.time() - start_time))

        # Restart the timer so the next figure covers feature building only.
        start_time = time.time()
        fX = f.build(Xtrain)
        print("Build features from train data in {} sec.".format(time.time() - start_time))

        start_time = time.time()
        # 1st phase: find and fine tune the best classifier from a list of candidate ones
        best_clf = pt.fineTuneClassifiers(fX, ytrain, classifiers)
        estimator = best_clf['estimator']
        print("Best hyperparams for {}, {}, with score {}; {} sec.".format(
            best_clf['clf_name'], best_clf['hyperparams'], best_clf['score'], time.time() - start_time))

        # 2nd phase: train the tuned classifier on the whole dataset and store it
        estimator = pt.trainClassifier(fX, ytrain, estimator)
        models_dir = os.path.join(os.getcwd(), 'models')
        os.makedirs(models_dir, exist_ok=True)
        dump(estimator, os.path.join(models_dir, best_clf['clf_name'] + '_model.joblib'))

        print("The whole process took {} sec.".format(time.time() - tot_time))

    def evaluate(self, dataset, classifier):
        """Evaluate a previously trained and persisted classifier on new unseen data.

        The model is loaded from ``models/<classifier>_model.joblib`` under the
        current working directory (as written by :meth:`train`). Besides
        printing the scores, the per-row predictions are written to
        ``output/predictions.csv``.

        Parameters
        ----------
        dataset : str
            Name of test dataset. Apart from ``status`` it must contain the
            ``pst_geom`` and ``dian_geom`` columns, which are copied to the
            predictions file.
        classifier : str
            Name of the persisted classifier to load and evaluate
        """
        pt = param_tuning.ParamTuning()
        f = Features()

        tot_time = start_time = time.time()
        data_df = pd.read_csv(dataset)
        ytest = data_df['status']
        Xtest = data_df.drop('status', axis=1)
        print("Loaded test dataset in {} sec.".format(time.time() - start_time))

        start_time = time.time()
        fX = f.build(Xtest)
        print("Build features from test data in {} sec".format(time.time() - start_time))

        start_time = time.time()
        # Load the persisted model and test it on the test dataset
        estimator = load(os.path.join(os.getcwd(), 'models', classifier + '_model.joblib'))
        # NOTE(review): the trailing ``True`` presumably asks testClassifier to
        # also return the ``proba`` entry used below - confirm in ParamTuning.
        res = pt.testClassifier(fX, ytest, estimator, True)
        self._print_stats(classifier, res['metrics'], res['feature_imp'], start_time)

        # The two prediction columns are hard-coded, i.e. ``res['proba']`` is
        # expected to have exactly two columns.
        Xtest.reset_index(inplace=True)
        proba_df = pd.DataFrame(res['proba'], columns=['none_origin_pred', 'dian_origin_pred'])
        Xtest = pd.concat([Xtest, proba_df], axis=1)

        os.makedirs('output', exist_ok=True)
        Xtest[['pst_geom', 'dian_geom', 'none_origin_pred', 'dian_origin_pred']].to_csv(
            os.path.join('output', 'predictions.csv'), index=False)

        print("The whole process took {} sec.".format(time.time() - tot_time))

    def _print_stats(self, clf, params, fimp, tt):
        """Print the scores of a classifier and its most important features.

        Parameters
        ----------
        clf : str
            Name of the classifier, printed in the first column.
        params : dict
            Scores; the keys ``accuracy``, ``precision``, ``recall`` and
            ``f1_score`` are read.
        fimp : array-like or None
            Feature importances aligned with the feature columns. When
            ``None`` no importance table is printed.
        tt : float
            Start timestamp (as returned by ``time.time()``); the elapsed
            time since then is printed in the last column.
        """
        print("| Method\t\t& Accuracy\t& Precision\t& Recall\t& F1-Score\t& Time (sec)")
        print("||{0}\t& {1}\t& {2}\t& {3}\t& {4}\t& {5}".format(
            clf, params['accuracy'], params['precision'], params['recall'], params['f1_score'],
            time.time() - tt))

        if fimp is not None:
            # Features with zero importance are masked out and never listed.
            importances = np.ma.masked_equal(fimp, 0.0)
            if importances.mask is np.ma.nomask:
                # Nothing was masked: expand the scalar ``nomask`` into a full
                # boolean array so that it can be inverted and used as an index.
                importances.mask = np.zeros(importances.shape, dtype=bool)

            nonzero = importances.compressed()
            # Positions (within the non-zero entries) of the top scores,
            # highest first; slicing already copes with fewer entries.
            indices = np.argsort(nonzero)[::-1][:self.max_features_toshow]

            headers = ["name", "score"]
            fcols = StaticValues.featureCols if config.MLConf.extra_features is False \
                else StaticValues.featureCols + StaticValues.extra_featureCols

            names = np.asarray(fcols, object)[~importances.mask][indices]
            print(tabulate(zip(names, nonzero[indices]), headers, tablefmt="simple"))

        print()

    def _load_and_split_data(self, dataset):
        """Read a CSV dataset and split it into stratified train/test parts.

        Parameters
        ----------
        dataset : str
            Path of the CSV dataset; it must contain a ``status`` column.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``. The target series get a
            fresh 0-based index; the feature frames keep the index assigned
            by the split.
        """
        data_df = pd.read_csv(dataset)

        X = data_df.drop('status', axis=1)
        y = data_df['status']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=config.test_split_thres, random_state=config.seed_no, stratify=y, shuffle=True
        )

        y_train.reset_index(drop=True, inplace=True)
        y_test.reset_index(drop=True, inplace=True)

        return X_train, X_test, y_train, y_test