Source code for interlinking.core

# -*- coding: utf-8 -*-
# Author: vkaff
# E-mail: vkaffes@imis.athena-innovation.gr

import time
import os

from interlinking import hyperparam_tuning, config
from interlinking.features import Features
from interlinking.sim_measures import LGMSimVars


class StrategyEvaluator:
    """This class implements the pipeline for various strategies."""

    def __init__(self, encoding='latin'):
        self.encoding = encoding
    def hyperparamTuning(self, train_data, test_data):
        """A complete process of distinct steps for figuring out the best ML algorithm with the
        best hyperparameters for the toponym interlinking problem.

        :param train_data: Train dataset filename.
        :type train_data: str
        :param test_data: Test dataset filename.
        :type test_data: str
        """
        LGMSimVars.per_metric_optValues = config.MLConf.opt_values[self.encoding.lower()]
        assert os.path.isfile(os.path.join(config.default_data_path, train_data)), \
            f'{train_data} dataset does not exist'
        assert os.path.isfile(os.path.join(config.default_data_path, test_data)), \
            f'{test_data} dataset does not exist'

        f = Features()
        pt = hyperparam_tuning.ParamTuning()

        tot_time = time.time()
        start_time = time.time()
        f.load_data(os.path.join(config.default_data_path, train_data), self.encoding)
        fX, y = f.build()
        print("Loaded train dataset and built features for {} setup; {} sec.".format(
            config.MLConf.classification_method, time.time() - start_time))

        start_time = time.time()
        # 1st phase: find the best classifier among a list of candidate ones
        best_clf = pt.fineTuneClassifiers(fX, y)
        print("Best classifier {} with hyperparams {} and score {}; {} sec.".format(
            best_clf['classifier'], best_clf['hyperparams'], best_clf['score'], time.time() - start_time))

        start_time = time.time()
        # 2nd phase: train the fine-tuned best classifier on the whole train dataset (no folds)
        estimator = pt.trainClassifier(fX, y, best_clf['estimator'])
        print("Finished training model on the dataset; {} sec.".format(time.time() - start_time))

        start_time = time.time()
        f.load_data(os.path.join(config.default_data_path, test_data), self.encoding)
        fX, y = f.build()
        print("Loaded test dataset and built features; {} sec.".format(time.time() - start_time))

        start_time = time.time()
        # 3rd phase: test the fine-tuned best classifier on the test dataset
        metrics = pt.testClassifier(fX, y, estimator)
        self._print_stats({'classifier': best_clf['classifier'], **metrics, 'time': start_time})

        print("The whole process took {} sec.".format(time.time() - tot_time))
    def evaluate(self, train_data, test_data):
        """Train and evaluate the selected ML algorithms with custom hyper-parameters on a dataset.

        :param train_data: Train dataset filename.
        :type train_data: str
        :param test_data: Test dataset filename.
        :type test_data: str
        """
        tot_time = time.time()

        LGMSimVars.per_metric_optValues = config.MLConf.opt_values[self.encoding.lower()]
        assert os.path.isfile(os.path.join(config.default_data_path, train_data)), \
            f'{train_data} dataset does not exist'
        assert os.path.isfile(os.path.join(config.default_data_path, test_data)), \
            f'{test_data} dataset does not exist'

        f = Features()
        pt = hyperparam_tuning.ParamTuning()

        start_time = time.time()
        f.load_data(os.path.join(config.default_data_path, train_data), self.encoding)
        fX_train, y_train = f.build()
        print("Loaded train dataset and built features for {} setup; {} sec.".format(
            config.MLConf.classification_method, time.time() - start_time))

        start_time = time.time()
        f.load_data(os.path.join(config.default_data_path, test_data), self.encoding)
        fX_test, y_test = f.build()
        print("Loaded test dataset and built features; {} sec.".format(time.time() - start_time))

        for clf in config.MLConf.clf_custom_params:
            print('Method {}'.format(clf))
            print('=======', end='')
            print(len(clf) * '=')

            start_time = time.time()
            # 1st phase: train each classifier on the whole train dataset (no folds)
            estimator = pt.clf_names[clf][0](**config.MLConf.clf_custom_params[clf])
            estimator = pt.trainClassifier(fX_train, y_train, estimator)
            print("Finished training model on the dataset; {} sec.".format(time.time() - start_time))

            start_time = time.time()
            # 2nd phase: test each classifier on the test dataset
            metrics = pt.testClassifier(fX_test, y_test, estimator)
            self._print_stats({'classifier': clf, **metrics, 'time': start_time})

        print("The whole process took {} sec.\n".format(time.time() - tot_time))
    @staticmethod
    def _print_stats(params):
        print("| Method\t\t& Accuracy\t& Precision\t& Recall\t& F1-Score\t& Time (sec)")
        print("||{0}\t& {1}\t& {2}\t& {3}\t& {4}\t& {5}".format(
            params['classifier'], params['accuracy'], params['precision'], params['recall'],
            params['f1_score'], time.time() - params['time']))

        # if params['feature_importances'] is not None:
        #     importances = np.ma.masked_equal(params['feature_importances'], 0.0)
        #     if importances.mask is np.ma.nomask: importances.mask = np.zeros(importances.shape, dtype=bool)
        #
        #     indices = np.argsort(importances.compressed())[::-1][
        #         :min(importances.compressed().shape[0], self.max_features_toshow)]
        #     headers = ["name", "score"]
        #
        #     fcols = StaticValues.featureCols if config.MLConf.extra_features is False \
        #         else StaticValues.featureCols + StaticValues.extra_featureCols
        #     print(tabulate(zip(
        #         np.asarray(fcols, object)[~importances.mask][indices], importances.compressed()[indices]
        #     ), headers, tablefmt="simple"))

        print()
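
A minimal usage sketch, not part of the module: it shows how the two pipelines above could be invoked. The dataset filenames train.csv and test.csv are hypothetical placeholders; the files must already exist under config.default_data_path, as enforced by the assertions in both methods.

if __name__ == '__main__':
    evaluator = StrategyEvaluator(encoding='latin')

    # Search for the best classifier and its hyperparameters, then test it
    # on the held-out dataset (filenames are placeholders).
    evaluator.hyperparamTuning('train.csv', 'test.csv')

    # Alternatively, train and evaluate the classifiers configured with
    # custom hyperparameters in config.MLConf.clf_custom_params.
    evaluator.evaluate('train.csv', 'test.csv')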