Source code for geocoding.config

import numpy as np
from scipy.stats import randint as sp_randint, expon, truncnorm, uniform


class Config:
    """
    Class that configures the execution process.

    Attributes:
        n_folds (int): The number of folds in the experiment
        source_crs (int): The EPSG CRS code used in input files
        target_crs (int): The EPSG CRS code to transform the data
        clusters_pct (float): Percentage of data points, indicating how many \
            clusters to create in order to query Overpass API for streets
        osm_buffer (float): A buffer distance (in meters) to consider around \
            each bounding box when querying Overpass API
        osm_timeout (int): Timeout (in seconds) applied after every five \
            requests to Overpass API
        max_overpass_tries (int): Maximum number of failed tries to extract \
            the road network when querying the Overpass API before quitting
        distance_thr (float): Distances in features greater than this value \
            will be converted to this threshold
        baseline_service (str): The name of the service to consider when \
            measuring baseline scores
        experiments_path (str): Path to folder that stores the experiments
        services (list): The services (geocoders) used in the setup
        supported_features (list): List of the supported features to choose \
            from
        included_features (list): List of the features to be included in the \
            experiment
        normalized_features (list): List of features to be normalized
        supported_classifiers (list): List of the supported classifiers to \
            choose from
        included_classifiers (list): List of the classifiers to be included \
            in the experiment
        NB_hparams (dict): Parameters search space for Naive Bayes classifier
        NN_hparams (dict): Parameters search space for Nearest Neighbors \
            classifier
        LR_hparams (dict): Parameters search space for Logistic Regression \
            classifier
        SVM_hparams (list): Parameters search space for SVM classifier
        MLP_hparams (dict): Parameters search space for MLP classifier
        DT_hparams (dict): Parameters search space for Decision Tree classifier
        RF_hparams (dict): Parameters search space for Random Forest classifier
        ET_hparams (dict): Parameters search space for Extra Trees classifier
        XGB_hparams (dict): Parameters search space for XGBoost classifier
    """

    n_folds = 5

    #: int: Number of parallel jobs to be initiated. -1 means to utilize all
    #: available processors.
    n_jobs = 4

    # accepted values: 'randomized', 'grid' ('hyperband' is not yet implemented)
    hyperparams_search_method = 'grid'
    """str: Search method to use for finding the best hyperparameters
    (*randomized* | *grid*).
    """

    #: int: Number of iterations that RandomizedSearchCV should execute. It
    #: applies only when :attr:`hyperparams_search_method` equals 'randomized'.
    max_iter = 30

    verbose = True

    source_crs = 4326
    target_crs = 3857
    clusters_pct = 0.015
    osm_buffer = 0.001
    osm_timeout = 50
    max_overpass_tries = 5
    distance_thr = 5000.0
    square_thr = 500000.0

    baseline_service = 'original'
    #: int: Seed to use by random number generators.
    seed_no = 13

    base_dir = '/media/disk/LGM-Geocoding'

    services = [
        'original',
        'arcgis',
        'nominatim',
    ]

    supported_features = [
        'normalized_coords',
        'pairwise_coords_distances',
        'pairwise_points_distances',
        'centroid_coords_distances',
        'centroid_points_distances',
        'mean_centroids_coords_distances',
        'mean_centroids_points_distances',
        'nearest_street_distance_per_service',
        'nearest_street_distance_by_centroid',
        'zip_codes',
        'common_nearest_street_distance',
        'intersects_on_common_nearest_street',
        'points_area',
        'polar_coords',
    ]

    included_features = [
        # 'normalized_coords',
        'pairwise_coords_distances',
        'pairwise_points_distances',
        'centroid_coords_distances',
        'centroid_points_distances',
        'mean_centroids_coords_distances',
        'mean_centroids_points_distances',
        'nearest_street_distance_per_service',
        'nearest_street_distance_by_centroid',
        # 'zip_codes',
        'common_nearest_street_distance',
        'intersects_on_common_nearest_street',
        'points_area',
        'polar_coords',
    ]

    normalized_features = [
        # 'normalized_coords',
        'pairwise_coords_distances',
        'pairwise_points_distances',
        'centroid_coords_distances',
        'centroid_points_distances',
        'mean_centroids_coords_distances',
        'mean_centroids_points_distances',
        'nearest_street_distance_per_service',
        'common_nearest_street_distance',
        'points_area',
        'polar_coords',
    ]

    supported_classifiers = [
        'Baseline',
        'NaiveBayes',
        'NearestNeighbors',
        'LogisticRegression',
        'SVM',
        'MLP',
        'DecisionTree',
        'RandomForest',
        'ExtraTrees',
        'XGBoost',
    ]

    included_classifiers = [
        'Baseline',
        'NaiveBayes',
        'NearestNeighbors',
        'LogisticRegression',
        'SVM',
        'MLP',
        'DecisionTree',
        'RandomForest',
        'ExtraTrees',
        'XGBoost',
    ]

    NB_hparams = {}

    NN_hparams = {
        'n_neighbors': [2, 3, 5, 10],
    }

    LR_hparams = {
        'max_iter': [100, 500],
        'C': [0.001, 0.1, 1, 10, 1000],
    }

    SVM_hparams = [
        {
            'kernel': ['rbf', 'sigmoid'],
            'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
            'C': [0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 1000],
            'probability': [True],
        },
        {
            'kernel': ['poly'],
            'degree': [1, 2, 3],
            'gamma': ['scale', 'auto'],
            'C': [0.1, 1, 10, 25, 50, 100, 1000],
            'max_iter': [10000],
            'probability': [True],
        },
    ]

    MLP_hparams = {
        'hidden_layer_sizes': [(100,), (50, 50)],
        # 'learning_rate_init': [0.0001, 0.01, 0.1],
        'max_iter': [500, 1000],
        'solver': ['sgd', 'adam'],
    }

    DT_hparams = {
        'max_depth': [1, 4, 16, 32, 64],
        'min_samples_split': [2, 5, 10, 20, 50, 100, 200],
        'min_samples_leaf': [1, 2, 5, 10],
        'max_features': list(np.arange(2, 11, 2)) + ['sqrt', 'log2', None],
    }

    RF_hparams = {
        'max_depth': [5, 10, 50, 100, 250, 300],
        'n_estimators': [100, 250, 500, 1000],
        # 'min_samples_leaf': [1, 5, 10],
        'min_samples_split': [2, 10],
    }

    ET_hparams = {
        'max_depth': [5, 10, 50, 100, 250, 300],
        'n_estimators': [100, 250, 500, 1000],
        # 'min_samples_leaf': [1, 5, 10],
        'min_samples_split': [2, 10, 50],
    }

    XGB_hparams = {
        'n_estimators': [500, 1000, 3000],
        'max_depth': [5, 10, 50, 100, 250, 300],
        # hyperparameters to avoid overfitting:
        # 'eta': list(np.linspace(0.01, 0.3, 10)),  # alias of 'learning_rate'
        # 'gamma': [0, 1, 5],
        # 'subsample': [0.8, 0.9, 1],
        # # values from 0.3 to 0.8 if you have many columns (especially after
        # # one-hot encoding), or 0.8 to 1 if you only have a few columns
        # 'colsample_bytree': list(np.linspace(0.8, 1, 3)),
        # 'min_child_weight': [1, 5, 10],
    }
    # These parameters constitute the search space for RandomizedSearchCV in
    # our experiments.

    NB_hparams_dist = {}

    NN_hparams_dist = {
        'n_neighbors': sp_randint(1, 20),
    }

    LR_hparams_dist = {
        'max_iter': sp_randint(100, 1000),
        'C': expon(scale=200),
    }

    SVM_hparams_dist = {
        'C': expon(loc=0.01, scale=20),
        # 'C': uniform(2, 10),
        'gamma': uniform(1e-5, 1e-2),
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'degree': sp_randint(1, 3),
        'class_weight': ['balanced', None],
        'tol': [1e-3, 1e-4],
        'max_iter': [100000],
        'probability': [True],
        # 'dual': [True, False],
    }

    DT_hparams_dist = {
        'max_depth': sp_randint(10, 200),
        'min_samples_split': sp_randint(2, 51),
        'min_samples_leaf': sp_randint(1, 15),
        # 'max_features': sp_randint(1, 11),
    }

    RF_hparams_dist = {
        'bootstrap': [True, False],
        # 'max_depth': [10, 20, 30, 40, 50, 60, 100, None],
        'max_depth': sp_randint(10, 300),
        'n_estimators': sp_randint(250, 2000),
        # 'criterion': ['gini', 'entropy'],
        # 'max_features': ['sqrt', 'log2'],  # or sp_randint(1, 11)
        'min_samples_leaf': sp_randint(1, 10),
        'min_samples_split': sp_randint(2, 50),
    }

    XGB_hparams_dist = {
        'n_estimators': sp_randint(500, 4000),
        'max_depth': sp_randint(3, 300),
        # 'eta': expon(loc=0.01, scale=0.1),  # alias of 'learning_rate'
        # hyperparameters to avoid overfitting:
        'gamma': sp_randint(0, 5),
        # truncnorm(0.8, 1) truncates a standard normal to [0.8, 1.0] in
        # z-units, so sampled values stay within [0.8, 1.0]
        'subsample': truncnorm(0.8, 1),
        'colsample_bytree': truncnorm(0.8, 1),
        'min_child_weight': sp_randint(1, 10),
    }

    MLP_hparams_dist = {
        'learning_rate_init': expon(loc=0.0001, scale=0.1),
        'max_iter': sp_randint(500, 2000),
        'solver': ['sgd', 'adam'],
    }
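

# ---------------------------------------------------------------------------
# The sketches below are editorial additions, not part of the original
# module. First, a minimal sanity check a consumer of this config could run:
# the `included_*` lists are meant to be drawn from the corresponding
# `supported_*` lists, so a subset check catches typos early.
if __name__ == '__main__':
    assert set(Config.included_features) <= set(Config.supported_features)
    assert set(Config.included_classifiers) <= set(Config.supported_classifiers)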
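
# Next, a minimal sketch of how `hyperparams_search_method` is meant to steer
# tuning, assuming scikit-learn's GridSearchCV / RandomizedSearchCV as the
# consumers (this helper code and the RandomForest choice are hypothetical;
# the project's actual training code lives elsewhere). The `*_hparams` grids
# feed grid search, while the `*_hparams_dist` distributions feed randomized
# search, capped at `Config.max_iter` sampled candidates.
if __name__ == '__main__':
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

    clf = RandomForestClassifier(random_state=Config.seed_no)
    if Config.hyperparams_search_method == 'grid':
        search = GridSearchCV(
            clf, Config.RF_hparams,
            cv=Config.n_folds, n_jobs=Config.n_jobs)
    else:  # 'randomized'
        search = RandomizedSearchCV(
            clf, Config.RF_hparams_dist, n_iter=Config.max_iter,
            cv=Config.n_folds, n_jobs=Config.n_jobs,
            random_state=Config.seed_no)
    # search.fit(X, y) would then run Config.n_folds-fold cross-validation
    # over the chosen search space.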
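
# Finally, a note on the `truncnorm(0.8, 1)` entries in XGB_hparams_dist:
# with the default loc=0/scale=1, the two arguments are truncation bounds in
# standard-normal units, so samples do fall inside [0.8, 1.0] but follow a
# truncated standard normal (mildly denser toward 0.8) rather than a
# uniform. A quick check:
if __name__ == '__main__':
    samples = truncnorm(0.8, 1).rvs(size=1000, random_state=Config.seed_no)
    assert samples.min() >= 0.8 and samples.max() <= 1.0
    print(samples.mean())  # slightly below the interval midpoint of 0.9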