# -*- coding: utf-8 -*-
# Author: vkaff
# E-mail: vkaffes@imis.athena-innovation.gr

import time
import os

from interlinking import hyperparam_tuning, config
from interlinking.features import Features
from interlinking.sim_measures import LGMSimVars


class StrategyEvaluator:
    """
    This class implements the pipeline for various strategies.
    """

    def __init__(self, encoding='latin'):
        self.encoding = encoding

    def hyperparamTuning(self, train_data, test_data):
        """Run the complete sequence of steps that identifies the best ML algorithm, and its best
        hyperparameters, for the toponym interlinking problem.

        :param train_data: Train dataset filename.
        :type train_data: str
        :param test_data: Test dataset filename.
        :type test_data: str
        """
        LGMSimVars.per_metric_optValues = config.MLConf.opt_values[self.encoding.lower()]

        assert os.path.isfile(os.path.join(config.default_data_path, train_data)), \
            f'{train_data} dataset does not exist'
        assert os.path.isfile(os.path.join(config.default_data_path, test_data)), \
            f'{test_data} dataset does not exist'

        f = Features()
        pt = hyperparam_tuning.ParamTuning()

        tot_time = time.time()

        start_time = time.time()
        f.load_data(os.path.join(config.default_data_path, train_data), self.encoding)
        fX, y = f.build()
        print("Loaded train dataset and built features for {} setup; {} sec.".format(
            config.MLConf.classification_method, time.time() - start_time))
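
        # fX holds the computed feature vectors and y the corresponding labels,
        # as returned by Features.build().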

        start_time = time.time()
        # 1st phase: find the best classifier among a list of candidate ones
        best_clf = pt.fineTuneClassifiers(fX, y)
        print("Best classifier {} with hyperparams {} and score {}; {} sec.".format(
            best_clf['classifier'], best_clf['hyperparams'], best_clf['score'], time.time() - start_time))
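        # best_clf bundles the winning model: its name ('classifier'), the selected
        # hyper-parameters ('hyperparams'), its tuning score ('score') and the
        # estimator object ('estimator') that is trained on the full train set below.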

        start_time = time.time()
        # 2nd phase: train the fine-tuned best classifier on the whole train dataset (no folds)
        estimator = pt.trainClassifier(fX, y, best_clf['estimator'])
        print("Finished training model on the dataset; {} sec.".format(time.time() - start_time))

        start_time = time.time()
        f.load_data(os.path.join(config.default_data_path, test_data), self.encoding)
        fX, y = f.build()
        print("Loaded test dataset and built features; {} sec.".format(time.time() - start_time))

        start_time = time.time()
        # 3rd phase: test the fine-tuned best classifier on the test dataset
        metrics = pt.testClassifier(fX, y, estimator)
        self._print_stats({'classifier': best_clf['classifier'], **metrics, 'time': start_time})

        print("The whole process took {} sec.".format(time.time() - tot_time))

    def evaluate(self, train_data, test_data):
        """Train and evaluate the selected ML algorithms, using custom hyper-parameters, on the dataset.

        :param train_data: Train dataset filename.
        :type train_data: str
        :param test_data: Test dataset filename.
        :type test_data: str
        """
        tot_time = time.time()

        LGMSimVars.per_metric_optValues = config.MLConf.opt_values[self.encoding.lower()]

        assert os.path.isfile(os.path.join(config.default_data_path, train_data)), \
            f'{train_data} dataset does not exist'
        assert os.path.isfile(os.path.join(config.default_data_path, test_data)), \
            f'{test_data} dataset does not exist'

        f = Features()
        pt = hyperparam_tuning.ParamTuning()

        start_time = time.time()
        f.load_data(os.path.join(config.default_data_path, train_data), self.encoding)
        fX_train, y_train = f.build()
        print("Loaded train dataset and built features for {} setup; {} sec.".format(
            config.MLConf.classification_method, time.time() - start_time))

        start_time = time.time()
        f.load_data(os.path.join(config.default_data_path, test_data), self.encoding)
        fX_test, y_test = f.build()
        print("Loaded test dataset and built features; {} sec.".format(time.time() - start_time))

        for clf in config.MLConf.clf_custom_params:
            print('Method {}'.format(clf))
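            # underline the 'Method <name>' header: 7 '=' for the 'Method ' prefix
            # plus one per character of the classifier name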
            print('=======', end='')
            print(len(clf) * '=')

            start_time = time.time()
            # 1st phase: train each classifier on the whole train dataset (no folds)
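            # pt.clf_names maps each method name to a tuple whose first element is
            # the classifier class; instantiate it with the user-supplied params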
            estimator = pt.clf_names[clf][0](**config.MLConf.clf_custom_params[clf])
            estimator = pt.trainClassifier(fX_train, y_train, estimator)
            print("Finished training model on dataset; {} sec.".format(time.time() - start_time))

            start_time = time.time()
            # 2nd phase: test each classifier on the test dataset
            metrics = pt.testClassifier(fX_test, y_test, estimator)
            self._print_stats({'classifier': clf, **metrics, 'time': start_time})

        print("The whole process took {} sec.\n".format(time.time() - tot_time))

    @staticmethod
    def _print_stats(params):
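        # Print the evaluation metrics as one tab-separated, table-like row;
        # params['time'] is the timestamp taken just before testing started, so
        # the last column reports the elapsed test time.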
print("| Method\t\t& Accuracy\t& Precision\t& Recall\t& F1-Score\t& Time (sec)")
print("||{0}\t& {1}\t& {2}\t& {3}\t& {4}\t& {5}".format(
params['classifier'], params['accuracy'], params['precision'], params['recall'], params['f1_score'],
time.time() - params['time']))
# if params['feature_importances'] is not None:
# importances = np.ma.masked_equal(params['feature_importances'], 0.0)
# if importances.mask is np.ma.nomask: importances.mask = np.zeros(importances.shape, dtype=bool)
#
# indices = np.argsort(importances.compressed())[::-1][
# :min(importances.compressed().shape[0], self.max_features_toshow)]
# headers = ["name", "score"]
#
# fcols = StaticValues.featureCols if config.MLConf.extra_features is False \
# else StaticValues.featureCols + StaticValues.extra_featureCols
# print(tabulate(zip(
# np.asarray(fcols, object)[~importances.mask][indices], importances.compressed()[indices]
# ), headers, tablefmt="simple"))
print()
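

# A minimal end-to-end driver sketch, not part of the class above. The dataset
# filenames are hypothetical and must exist under config.default_data_path for
# either call to succeed.
if __name__ == '__main__':
    se = StrategyEvaluator(encoding='latin')
    # Full pipeline: pick the best classifier, tune it, train it, test it.
    se.hyperparamTuning('train.csv', 'test.csv')
    # Alternative: evaluate the pre-configured classifiers from
    # config.MLConf.clf_custom_params with their custom hyper-parameters.
    se.evaluate('train.csv', 'test.csv')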