Source code for config

class config:
    """
    Class that defines the experiment configuration.

    All settings are plain class attributes, read directly off the class
    (e.g. ``config.n_folds``).

    Attributes:
        poi_fpath (str): Path to csv file containing train pois
        experiments_path (str): Path to folder that stores the experiments

        supported_adjacency_features (list): List of the supported adjacency \
            features to choose from
        supported_textual_features (list): List of the supported textual \
            features to choose from
        included_adjacency_features (list): List of the adjacency features to \
            be included in the experiment
        included_textual_features (list): List of the textual features to \
            be included in the experiment
        normalized_features (list): List of features to be normalized

        classes_in_radius_thr (list): Parameter space for \
            'classes_in_radius_bln' and 'classes_in_radius_cnt' features
        classes_in_street_and_radius_thr (list): Parameter space for \
            'classes_in_street_and_radius_bln' and \
            'classes_in_street_and_radius_cnt' features
        classes_in_neighbors_thr (list): Parameter space for \
            'classes_in_neighbors_bln' and 'classes_in_neighbors_cnt' features
        classes_in_street_radius_thr (list): Parameter space for \
            'classes_in_street_radius_bln' and \
            'classes_in_street_radius_cnt' features (both features are \
            commented out of the supported and included lists)

        top_k_terms_pct (list): Parameter space for 'top_k_terms' feature
        top_k_trigrams_pct (list): Parameter space for 'top_k_trigrams' feature
        top_k_fourgrams_pct (list): Parameter space for 'top_k_fourgrams' \
            feature

        n_folds (int): The number of folds in the experiment

        supported_classifiers (list): List of the supported classifiers to \
            choose from
        included_classifiers (list): List of the classifiers to be included \
            in the experiment

        NaiveBayes_hyperparameters (dict): Parameters search space for Naive \
            Bayes classifier
        GaussianProcess_hyperparameters (dict): Parameters search space for \
            Gaussian Process classifier
        AdaBoost_hyperparameters (dict): Parameters search space for AdaBoost \
            classifier
        kNN_hyperparameters (dict): Parameters search space for Nearest \
            Neighbors classifier
        LogisticRegression_hyperparameters (dict): Parameters search space \
            for Logistic Regression classifier
        SVM_hyperparameters (list): Parameters search space for SVM \
            classifier, given as a list of dicts (one grid per kernel)
        MLP_hyperparameters (dict): Parameters search space for MLP classifier
        DecisionTree_hyperparameters (dict): Parameters search space for \
            Decision Tree classifier
        RandomForest_hyperparameters (dict): Parameters search space for \
            Random Forest classifier

        top_k (list): List of different *k*, in order to measure \
            top-*k*-accuracy
        k_preds (int): Number of top predictions to take into consideration
        osm_crs (int): The EPSG crs code that OSM uses

        id_col (str): Column name referring to poi's id
        name_col (str): Column name referring to poi's name
        label_col (str): Column name referring to poi's label
        lon_col (str): Column name referring to poi's longitude
        lat_col (str): Column name referring to poi's latitude
        poi_crs (int): The EPSG crs code used in the pois csv file
    """

    # ------------------------------------------------------------------
    # Input / output locations
    # ------------------------------------------------------------------
    poi_fpath = '/media/disk/LGM-Classification-utils/data/toronto/yelp_toronto_train.csv'

    experiments_path = '/media/disk/LGM-Classification-utils/experiments'

    # ------------------------------------------------------------------
    # Feature catalogue: what may be chosen ("supported") versus what is
    # actually used in this experiment ("included").
    # ------------------------------------------------------------------
    supported_adjacency_features = [
        'classes_in_radius_bln', 'classes_in_radius_cnt',
        'classes_in_street_and_radius_bln', 'classes_in_street_and_radius_cnt',
        'classes_in_neighbors_bln', 'classes_in_neighbors_cnt',
        # Street-radius features are currently disabled:
        # 'classes_in_street_radius_bln', 'classes_in_street_radius_cnt',
    ]

    supported_textual_features = [
        'similarity_per_class', 'top_k_terms',
        'top_k_trigrams', 'top_k_fourgrams',
    ]

    # Geometric features are currently disabled:
    # supported_geometric_features = [
    #     'area', 'perimeter', 'n_vertices',
    #     'mean_edge_length', 'var_edge_length'
    # ]

    # Toggle a feature by (un)commenting its line; only the '_cnt' variants
    # are active here.
    included_adjacency_features = [
        # 'classes_in_radius_bln',
        'classes_in_radius_cnt',
        # 'classes_in_street_and_radius_bln',
        'classes_in_street_and_radius_cnt',
        # 'classes_in_neighbors_bln',
        'classes_in_neighbors_cnt',
        # 'classes_in_street_radius_bln',
        # 'classes_in_street_radius_cnt',
    ]

    included_textual_features = [
        'similarity_per_class',
        'top_k_terms',
        'top_k_trigrams',
        'top_k_fourgrams'
    ]

    # Geometric features are currently disabled:
    # included_geometric_features = [
    #     'area',
    #     'perimeter',
    #     'n_vertices',
    #     'mean_edge_length',
    #     'var_edge_length'
    # ]

    # Empty on purpose: no feature is normalized unless its line is
    # uncommented.
    normalized_features = [
        # 'classes_in_radius_cnt',
        # 'classes_in_street_and_radius_cnt',
        # 'classes_in_neighbors_cnt',
        # 'classes_in_street_radius_cnt',
        # 'similarity_per_class'
    ]

    # ------------------------------------------------------------------
    # Feature parameter spaces (each list holds the candidate values)
    # ------------------------------------------------------------------
    classes_in_radius_thr = [200, 500]
    classes_in_street_and_radius_thr = [300, 500]
    classes_in_neighbors_thr = [5, 20]
    # Kept although the matching street-radius features are disabled above.
    classes_in_street_radius_thr = [100]

    top_k_terms_pct = [0.05]
    top_k_trigrams_pct = [0.01]
    top_k_fourgrams_pct = [0.01]

    # Matching strategy is currently disabled:
    # matching_strategy = [
    #     {1: ['within', ['named', 20000], ['avg_lgm_sim_dl', 0.5, None]],
    #      2: ['within', ['unnamed', 10000], [None, None, None]],
    #      3: ['nearby', ['named', 20000], ['lgm_sim_jw', 0.5, 50]],
    #      4: ['nearby', ['unnamed', 10000], [None, None, 50]]}
    # ]

    # ------------------------------------------------------------------
    # Cross-validation and classifiers
    # ------------------------------------------------------------------
    n_folds = 5

    supported_classifiers = [
        'Baseline',
        'Naive Bayes',
        'Gaussian Process',
        'AdaBoost',
        'Nearest Neighbors',
        'Logistic Regression',
        'SVM',
        'MLP',
        'Decision Tree',
        'Random Forest',
        'Extra Trees'
    ]

    # Toggle a classifier by (un)commenting its line.
    included_classifiers = [
        'Baseline',
        'Naive Bayes',
        # 'Gaussian Process',
        # 'AdaBoost',
        'Nearest Neighbors',
        'Logistic Regression',
        'SVM',
        'MLP',
        'Decision Tree',
        'Random Forest',
        'Extra Trees'
    ]

    # ------------------------------------------------------------------
    # Hyperparameter search spaces (an empty dict defines no values to
    # search over)
    # ------------------------------------------------------------------
    NaiveBayes_hyperparameters = {}

    GaussianProcess_hyperparameters = {}

    AdaBoost_hyperparameters = {}

    kNN_hyperparameters = {'n_neighbors': [3, 5, 10]}

    LogisticRegression_hyperparameters = {
        'max_iter': [100, 500],
        'C': [0.1, 1, 10]}

    # A list of grids rather than a single dict, so that 'gamma' is only
    # combined with the rbf kernel and 'degree' only with the poly kernel.
    SVM_hyperparameters = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [0.01, 0.1, 1, 10, 100]},
        {'kernel': ['poly'], 'degree': [1, 2, 3], 'C': [0.01, 0.1, 1, 10, 100]},
    ]

    MLP_hyperparameters = {
        'hidden_layer_sizes': [(100, ), (50, 50, )],
        'learning_rate_init': [0.0001, 0.01, 0.1],
        'max_iter': [100, 200, 500]}

    DecisionTree_hyperparameters = {
        'max_depth': [1, 4, 16],
        'min_samples_split': [0.1, 0.5, 1.0]}

    RandomForest_hyperparameters = {
        'max_depth': [10, 100, None],
        'n_estimators': [250, 1000]}

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    top_k = [1, 5, 10]
    k_preds = 5
    osm_crs = 4326

    # ------------------------------------------------------------------
    # Dataset schema: keep exactly one of the alternatives below active.
    # ------------------------------------------------------------------
    # # Marousi
    # id_col = 'poi_id'
    # name_col = 'name'
    # label_col = 'class_name'
    # lon_col = 'x'
    # lat_col = 'y'
    # poi_crs = 2100

    # Yelp
    id_col = 'business_id'
    name_col = 'name'
    label_col = 'category'
    lon_col = 'longitude'
    lat_col = 'latitude'
    poi_crs = 4326