Source code for geocoding.config

import numpy as np
from scipy.stats import randint as sp_randint, expon, truncnorm, uniform


class Config:
    """
    Class that configures the execution process.

    Attributes:
        n_folds (int): The number of folds in the experiment
        source_crs (int): The EPSG CRS code used in input files
        target_crs (int): The EPSG CRS code to transform the data
        clusters_pct (float): Percentage of data points, indicating how many \
            clusters to create in order to query Overpass API for streets
        osm_buffer (float): A buffer distance (in meters) to consider around \
            each bounding box when querying Overpass API
        osm_timeout (int): Timeout (in seconds) applied after every five \
            requests to Overpass API
        max_overpass_tries (int): Maximum number of failed tries to extract \
            the road network when querying the Overpass API before quitting
        distance_thr (float): Distances in features greater than this value \
            will be converted to this threshold
        baseline_service (str): The name of the service to consider when \
            measuring baseline scores
        experiments_path (str): Path to folder that stores the experiments
        services (list): The services (geocoders) used in the setup
        supported_features (list): List of the supported features to choose \
            from
        included_features (list): List of the features to be included in the \
            experiment
        normalized_features (list): List of features to be normalized
        supported_classifiers (list): List of the supported classifiers to \
            choose from
        included_classifiers (list): List of the classifiers to be included \
            in the experiment
        NB_hparams (dict): Parameters search space for Naive Bayes classifier
        NN_hparams (dict): Parameters search space for Nearest Neighbors \
            classifier
        LR_hparams (dict): Parameters search space for Logistic Regression \
            classifier
        SVM_hparams (list): Parameters search space for SVM classifier
        MLP_hparams (dict): Parameters search space for MLP classifier
        DT_hparams (dict): Parameters search space for Decision Tree classifier
        RF_hparams (dict): Parameters search space for Random Forest classifier
        ET_hparams (dict): Parameters search space for Extra Trees classifier
        XGB_hparams (dict): Parameters search space for XGBoost classifier
    """

    n_folds = 5

    #: int: Number of parallel jobs to be initiated. -1 means to utilize all
    #: available processors.
    n_jobs = 4

    # accepted values: 'randomized', 'grid' ('hyperband' is not yet implemented)
    hyperparams_search_method = 'grid'
    """str: Search method to use for finding the best hyperparameters
    (*randomized* | *grid*).
    """

    #: int: Number of iterations that RandomizedSearchCV should execute. It
    #: applies only when :attr:`hyperparams_search_method` equals 'randomized'.
    max_iter = 30

    verbose = True

    source_crs = 4326
    target_crs = 3857
    clusters_pct = 0.015
    osm_buffer = 0.001
    osm_timeout = 50
    max_overpass_tries = 5
    distance_thr = 5000.0
    square_thr = 500000.0

    baseline_service = 'original'
    #: int: Seed to use by random number generators.
    seed_no = 13

    base_dir = '/media/disk/LGM-Geocoding'

    services = [
        'original',
        'arcgis',
        'nominatim',
    ]

    supported_features = [
        'normalized_coords',
        'pairwise_coords_distances',
        'pairwise_points_distances',
        'centroid_coords_distances',
        'centroid_points_distances',
        'mean_centroids_coords_distances',
        'mean_centroids_points_distances',
        'nearest_street_distance_per_service',
        'nearest_street_distance_by_centroid',
        'zip_codes',
        'common_nearest_street_distance',
        'intersects_on_common_nearest_street',
        'points_area',
        'polar_coords',
    ]

    included_features = [
        # 'normalized_coords',
        'pairwise_coords_distances',
        'pairwise_points_distances',
        'centroid_coords_distances',
        'centroid_points_distances',
        'mean_centroids_coords_distances',
        'mean_centroids_points_distances',
        'nearest_street_distance_per_service',
        'nearest_street_distance_by_centroid',
        # 'zip_codes',
        'common_nearest_street_distance',
        'intersects_on_common_nearest_street',
        'points_area',
        'polar_coords',
    ]

    normalized_features = [
        # 'normalized_coords',
        'pairwise_coords_distances',
        'pairwise_points_distances',
        'centroid_coords_distances',
        'centroid_points_distances',
        'mean_centroids_coords_distances',
        'mean_centroids_points_distances',
        'nearest_street_distance_per_service',
        'common_nearest_street_distance',
        'points_area',
        'polar_coords',
    ]

    supported_classifiers = [
        'Baseline',
        'NaiveBayes',
        'NearestNeighbors',
        'LogisticRegression',
        'SVM',
        'MLP',
        'DecisionTree',
        'RandomForest',
        'ExtraTrees',
        'XGBoost',
    ]

    included_classifiers = [
        'Baseline',
        'NaiveBayes',
        'NearestNeighbors',
        'LogisticRegression',
        'SVM',
        'MLP',
        'DecisionTree',
        'RandomForest',
        'ExtraTrees',
        'XGBoost',
    ]

    NB_hparams = {}

    NN_hparams = {
        'n_neighbors': [2, 3, 5, 10],
    }

    LR_hparams = {
        'max_iter': [100, 500],
        'C': [0.001, 0.1, 1, 10, 1000],
    }

    SVM_hparams = [
        {
            'kernel': ['rbf', 'sigmoid'],
            'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
            'C': [0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 1000],
            'probability': [True],
        },
        {
            'kernel': ['poly'],
            'degree': [1, 2, 3],
            'gamma': ['scale', 'auto'],
            'C': [0.1, 1, 10, 25, 50, 100, 1000],
            'max_iter': [10000],
            'probability': [True],
        },
    ]

    MLP_hparams = {
        'hidden_layer_sizes': [(100,), (50, 50)],
        # 'learning_rate_init': [0.0001, 0.01, 0.1],
        'max_iter': [500, 1000],
        'solver': ['sgd', 'adam'],
    }

    DT_hparams = {
        'max_depth': [1, 4, 16, 32, 64],
        'min_samples_split': [2, 5, 10, 20, 50, 100, 200],
        'min_samples_leaf': [1, 2, 5, 10],
        'max_features': list(np.arange(2, 11, 2)) + ['sqrt', 'log2', None],
    }

    RF_hparams = {
        'max_depth': [5, 10, 50, 100, 250, 300],
        'n_estimators': [100, 250, 500, 1000],
        # 'min_samples_leaf': [1, 5, 10],
        'min_samples_split': [2, 10],
    }

    ET_hparams = {
        'max_depth': [5, 10, 50, 100, 250, 300],
        'n_estimators': [100, 250, 500, 1000],
        # 'min_samples_leaf': [1, 5, 10],
        'min_samples_split': [2, 10, 50],
    }

    XGB_hparams = {
        'n_estimators': [500, 1000, 3000],
        'max_depth': [5, 10, 50, 100, 250, 300],
        # hyperparameters to avoid overfitting:
        # 'eta': list(np.linspace(0.01, 0.3, 10)),  # alias of 'learning_rate'
        # 'gamma': [0, 1, 5],
        # 'subsample': [0.8, 0.9, 1],
        # # values from 0.3 to 0.8 if you have many columns (especially after
        # # one-hot encoding), or 0.8 to 1 if you only have a few columns
        # 'colsample_bytree': list(np.linspace(0.8, 1, 3)),
        # 'min_child_weight': [1, 5, 10],
    }
    # These parameters constitute the search space for RandomizedSearchCV in
    # our experiments.

    NB_hparams_dist = {}

    NN_hparams_dist = {
        'n_neighbors': sp_randint(1, 20),
    }

    LR_hparams_dist = {
        'max_iter': sp_randint(100, 1000),
        'C': expon(scale=200),
    }

    SVM_hparams_dist = {
        'C': expon(loc=0.01, scale=20),
        # 'C': uniform(2, 10),
        'gamma': uniform(1e-5, 1e-2),
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'degree': sp_randint(1, 3),
        'class_weight': ['balanced', None],
        'tol': [1e-3, 1e-4],
        'max_iter': [100000],
        'probability': [True],
        # 'dual': [True, False],
    }

    DT_hparams_dist = {
        'max_depth': sp_randint(10, 200),
        'min_samples_split': sp_randint(2, 51),
        'min_samples_leaf': sp_randint(1, 15),
        # 'max_features': sp_randint(1, 11),
    }

    RF_hparams_dist = {
        'bootstrap': [True, False],
        # 'max_depth': [10, 20, 30, 40, 50, 60, 100, None],
        'max_depth': sp_randint(10, 300),
        'n_estimators': sp_randint(250, 2000),
        # 'criterion': ['gini', 'entropy'],
        # 'max_features': ['sqrt', 'log2'],  # or sp_randint(1, 11)
        'min_samples_leaf': sp_randint(1, 10),
        'min_samples_split': sp_randint(2, 50),
    }

    XGB_hparams_dist = {
        'n_estimators': sp_randint(500, 4000),
        'max_depth': sp_randint(3, 300),
        # 'eta': expon(loc=0.01, scale=0.1),  # alias of 'learning_rate'
        # hyperparameters to avoid overfitting:
        'gamma': sp_randint(0, 5),
        # truncnorm(0.8, 1) truncates a standard normal to [0.8, 1.0] in
        # z-units, so sampled values stay within [0.8, 1.0]
        'subsample': truncnorm(0.8, 1),
        'colsample_bytree': truncnorm(0.8, 1),
        'min_child_weight': sp_randint(1, 10),
    }

    MLP_hparams_dist = {
        'learning_rate_init': expon(loc=0.0001, scale=0.1),
        'max_iter': sp_randint(500, 2000),
        'solver': ['sgd', 'adam'],
    }
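

# ---------------------------------------------------------------------------
# The sketches below are editorial additions, not part of the original
# module. First, a minimal sanity check a consumer of this config could run:
# the `included_*` lists are meant to be drawn from the corresponding
# `supported_*` lists, so a subset check catches typos early.
if __name__ == '__main__':
    assert set(Config.included_features) <= set(Config.supported_features)
    assert set(Config.included_classifiers) <= set(Config.supported_classifiers)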
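
# Next, a minimal sketch of how `hyperparams_search_method` is meant to steer
# tuning, assuming scikit-learn's GridSearchCV / RandomizedSearchCV as the
# consumers (this helper code and the RandomForest choice are hypothetical;
# the project's actual training code lives elsewhere). The `*_hparams` grids
# feed grid search, while the `*_hparams_dist` distributions feed randomized
# search, capped at `Config.max_iter` sampled candidates.
if __name__ == '__main__':
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

    clf = RandomForestClassifier(random_state=Config.seed_no)
    if Config.hyperparams_search_method == 'grid':
        search = GridSearchCV(
            clf, Config.RF_hparams,
            cv=Config.n_folds, n_jobs=Config.n_jobs)
    else:  # 'randomized'
        search = RandomizedSearchCV(
            clf, Config.RF_hparams_dist, n_iter=Config.max_iter,
            cv=Config.n_folds, n_jobs=Config.n_jobs,
            random_state=Config.seed_no)
    # search.fit(X, y) would then run Config.n_folds-fold cross-validation
    # over the chosen search space.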
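
# Finally, a note on the `truncnorm(0.8, 1)` entries in XGB_hparams_dist:
# with the default loc=0/scale=1, the two arguments are truncation bounds in
# standard-normal units, so samples do fall inside [0.8, 1.0] but follow a
# truncated standard normal (mildly denser toward 0.8) rather than a
# uniform. A quick check:
if __name__ == '__main__':
    samples = truncnorm(0.8, 1).rvs(size=1000, random_state=Config.seed_no)
    assert samples.min() >= 0.8 and samples.max() <= 1.0
    print(samples.mean())  # slightly below the interval midpoint of 0.9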