Source code for interlinking.learning

import os
import time
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import itertools

from interlinking import config, helpers
from interlinking import sim_measures


[docs]def learn_thres(fname, sim_group='basic'):
    """Compute optimal thresholds for each similarity metric in the specified group regarding the
    `fname` dataset.

    Parameters
    ----------
    fname: :obj:`str`
        File name of the dataset.
    sim_group: :obj:`str`
        Name of the group of the similarity metrics. Accepted values are *basic* or *sorted*.
    """
    low_thres = 30
    high_thres = 91
    step = 5

    assert (os.path.isfile(os.path.join(config.default_data_path, fname))), \
        f'{fname} dataset does not exist'

    start_time = time.time()
    data_df = pd.read_csv(os.path.join(config.default_data_path, fname), sep=config.delimiter,
                          names=config.fieldnames, na_filter=False, encoding='utf8')
    print(f'The train data loaded in {(time.time() - start_time):.2f} sec.')

    sim_res = None
    if sim_group == 'basic':
        sim_res = np.asarray(list(
            map(_compute_basic_similarities, data_df[config.use_cols['s1']], data_df[config.use_cols['s2']])
        ), dtype=float)
    elif sim_group == 'sorted':
        sim_res = np.asarray(list(
            map(_compute_sorted_similarities, data_df[config.use_cols['s1']], data_df[config.use_cols['s2']])
        ), dtype=float)
    print(f'The similarity scores were computed in {(time.time() - start_time):.2f} sec.')

    res = {}
    for m in helpers.StaticValues.sim_metrics.keys():
        res[m] = []

    separator = ''
    print('Computing stats for thresholds', end='')
    for i in range(low_thres, high_thres, step):
        sim_thres = float(i / 100.0)
        print('{0} {1}'.format(separator, sim_thres), end='', flush=True)
        separator = ','

        idx = 0
        for sim, val in helpers.StaticValues.sim_metrics.items():
            if sim_group in val:
                acc = accuracy_score(data_df[config.use_cols['status']], sim_res[:, idx] >= sim_thres)
                res[sim].append([acc, float(i / 100.0)])
                idx += 1

    print('\nThe process took {0:.2f} sec\n'.format(time.time() - start_time))

    # Report, for each metric of the group, the threshold that achieved the best accuracy.
    for key, val in res.items():
        if len(val) == 0:
            print('{0} is empty'.format(key))
            continue
        print(key, max(val, key=lambda x: x[0]))
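
# A minimal usage sketch (not part of the module) showing how `learn_thres` could be driven
# for both metric groups. The file name 'train_dataset.csv' is hypothetical; per the code
# above, the only real requirements are that the file lives under `config.default_data_path`
# and follows `config.delimiter`/`config.fieldnames`.
#
#   learn_thres('train_dataset.csv', sim_group='basic')   # tune thresholds for the plain metrics
#   learn_thres('train_dataset.csv', sim_group='sorted')  # tune thresholds for the sorted variants
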
[docs]def learn_params_for_lgm(fname, encoding):
    """Compute optimal thresholds and weights for each similarity metric in the ``LGM-Sim`` group only
    regarding the `fname` dataset.

    :param fname: File name of the dataset.
    :type fname: str
    :param encoding: Encoding of the `fname` dataset.
    :type encoding: str
    """
    low_thres = 30
    high_thres = 91
    step = 5

    low_split_thres = 50
    high_split_thres = 91
    split_step = 10

    sim_group = 'lgm'

    assert (os.path.isfile(os.path.join(config.default_data_path, fname))), \
        f'{fname} dataset does not exist'

    gstart_time = time.time()
    data_df = pd.read_csv(os.path.join(config.default_data_path, fname), sep=config.delimiter,
                          names=config.fieldnames, na_filter=False, encoding='utf8')
    sim_measures.LGMSimVars().load_freq_terms(encoding)
    print(f'The train data and frequent terms loaded in {(time.time() - gstart_time):.2f} sec.')

    res = {}
    for m in helpers.StaticValues.sim_metrics.keys():
        res[m] = []

    for s in range(low_split_thres, high_split_thres, split_step):
        split_thres = float(s / 100.0)

        start_time = time.time()
        sim_res = np.asarray(
            list(map(
                _compute_lgm_similarities,
                data_df[config.use_cols['s1']], data_df[config.use_cols['s2']],
                [split_thres] * len(data_df.index)
            )), dtype=float
        )
        fscore = np.zeros(sim_res.shape[0])
        print(f'The similarity scores were computed in {(time.time() - start_time):.2f} sec.')

        print(f'Computing stats for thresholds split: {split_thres}', end='')
        separator = ' and similarity:'
        # print('Computing stats for sim thres ', end='', flush=True)
        for i in range(low_thres, high_thres, step):
            sim_thres = float(i / 100.0)
            print('{0} {1}'.format(separator, sim_thres), end='', flush=True)
            separator = ','

            for n in [3.34] + list(range(2, 8)):
                weight_combs = [
                    tuple(float(x / 10.0) for x in seq)
                    for seq in itertools.product([1, 2, 3, 4, 5, 6, 2.5, 3.33], repeat=2)
                    if sum(seq) == (10 - n)
                ]
                # print('Computing stats for weights ({})'.format(','.join(map(str, w))))
                for w in weight_combs:
                    w = (float(n / 10.0),) + w

                    idx = 0
                    for sim, val in helpers.StaticValues.sim_metrics.items():
                        if sim_group in val:
                            scols = [idx*9 + 1, idx*9 + 2, idx*9 + 4, idx*9 + 5, idx*9 + 7, idx*9 + 8]
                            lweights = sim_measures.recalculate_weights_opt(
                                sim_res[:, scols[0:2]], sim_res[:, scols[2:4]], sim_res[:, scols[4:6]],
                                avg=True, weights=np.full((sim_res.shape[0], 3), list(w))
                            )
                            fscore = sim_res[:, idx*9] * lweights[:, 0] + \
                                     sim_res[:, idx*9 + 3] * lweights[:, 1] + \
                                     sim_res[:, idx*9 + 6] * lweights[:, 2]

                            acc = accuracy_score(data_df[config.use_cols['status']], fscore >= sim_thres)
                            res[sim].append([acc, float(i / 100.0), [split_thres, list(w)]])
                            idx += 1
        print()

    print('\nThe process took {0:.2f} sec\n'.format(time.time() - gstart_time))

    for key, val in res.items():
        if len(val) == 0:
            print('{0} is empty'.format(key))
            continue
        max_val = max(val, key=lambda x: x[0])
        print('{}: {}'.format(key, list(max_val)))
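
# A usage sketch for the LGM grid search (illustration only). The dataset name is the same
# hypothetical 'train_dataset.csv' as above, and 'latin' is merely an example value for
# `encoding`; whichever encoding is passed must match the frequent-term resources loaded by
# `sim_measures.LGMSimVars().load_freq_terms`.
#
#   learn_params_for_lgm('train_dataset.csv', encoding='latin')
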
def _compute_basic_similarities(a, b):
    # Score every metric tagged as 'basic'; the *_reversed variants call the base metric
    # on the reversed strings.
    f = []
    for sim, val in helpers.StaticValues.sim_metrics.items():
        if 'basic' in val:
            if '_reversed' in sim:
                f.append(getattr(sim_measures, sim[:-len('_reversed')])(a[::-1], b[::-1]))
            else:
                f.append(getattr(sim_measures, sim)(a, b))
    return f


def _compute_sorted_similarities(a, b):
    # Apply the sorted/canonical transformation before scoring the metrics tagged as 'sorted'.
    a, b = helpers.transform(a, b, sorting=True, canonical=True, simple_sorting=True)

    f = []
    for sim, val in helpers.StaticValues.sim_metrics.items():
        if 'sorted' in val:
            if '_reversed' in sim:
                f.append(getattr(sim_measures, sim[:-len('_reversed')])(a[::-1], b[::-1]))
            else:
                f.append(getattr(sim_measures, sim)(a, b))
    return f


def _compute_lgm_similarities(a, b, split_thres):
    # Split each pair into base/mismatch/special terms and emit, per LGM metric, the three
    # per-term scores together with their `len`/`char_len` statistics (9 values per metric).
    a, b = helpers.transform(a, b, sorting=True, canonical=True)

    f = []
    for sim, val in helpers.StaticValues.sim_metrics.items():
        if 'lgm' in val:
            if '_reversed' in sim:
                base_t, mis_t, special_t = sim_measures.lgm_sim_split(a[::-1], b[::-1], split_thres)
                base_score, mis_score, special_score = sim_measures.score_per_term(
                    base_t, mis_t, special_t, sim[:-len('_reversed')])
            else:
                base_t, mis_t, special_t = sim_measures.lgm_sim_split(a, b, split_thres)
                base_score, mis_score, special_score = sim_measures.score_per_term(
                    base_t, mis_t, special_t, sim)

            f.extend([
                base_score, base_t['len'], base_t['char_len'],
                mis_score, mis_t['len'], mis_t['char_len'],
                special_score, special_t['len'], special_t['char_len'],
            ])
    return f
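
# Feature layout note, derived from `_compute_lgm_similarities` above: every LGM metric
# contributes 9 consecutive columns to `sim_res`, i.e. for metric index `idx`:
#
#   idx*9 + 0  base_score            idx*9 + 3  mis_score            idx*9 + 6  special_score
#   idx*9 + 1  base_t['len']         idx*9 + 4  mis_t['len']         idx*9 + 7  special_t['len']
#   idx*9 + 2  base_t['char_len']    idx*9 + 5  mis_t['char_len']    idx*9 + 8  special_t['char_len']
#
# This is why `learn_params_for_lgm` builds `scols` from the +1/+2, +4/+5 and +7/+8 offsets
# (the counts handed to `recalculate_weights_opt`) and blends the scores at offsets
# +0, +3 and +6 with the recalculated per-row weights.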