Source code for geocoding.features_utilities

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads
import pickle
import os

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler

from geocoding import features as feats, osm_utilities as osm_ut
from geocoding.config import Config


features_getter_map = {
    'normalized_coords': 'get_normalized_coords',
    'pairwise_coords_distances': 'get_pairwise_coords_distances',
    'pairwise_points_distances': 'get_pairwise_points_distances',
    'centroid_coords_distances': 'get_centroid_coords_distances',
    'centroid_points_distances': 'get_centroid_points_distances',
    'mean_centroids_coords_distances': 'get_mean_centroids_coords_distances',
    'mean_centroids_points_distances': 'get_mean_centroids_points_distances',
    'nearest_street_distance_per_service': 'get_nearest_street_distance_per_service',
    'nearest_street_distance_by_centroid': 'get_nearest_street_distance_by_centroid',
    'zip_codes': 'get_zip_codes',
    'common_nearest_street_distance': 'get_common_nearest_street_distance',
    'intersects_on_common_nearest_street': 'get_intersects_on_common_nearest_street',
    'points_area': 'get_points_area',
    'polar_coords': 'get_polar_coords',
}

features_getter_args_map = {
    'normalized_coords': ['df'],
    'pairwise_coords_distances': ['df'],
    'pairwise_points_distances': ['df'],
    'centroid_coords_distances': ['df'],
    'centroid_points_distances': ['df'],
    'mean_centroids_coords_distances': ['df'],
    'mean_centroids_points_distances': ['df'],
    'nearest_street_distance_per_service': ['df', 'street_gdf'],
    'nearest_street_distance_by_centroid': ['df', 'street_gdf'],
    'zip_codes': ['df'],
    'common_nearest_street_distance': ['df', 'street_gdf'],
    'intersects_on_common_nearest_street': ['df', 'street_gdf'],
    'points_area': ['df'],
    'polar_coords': ['df'],
}
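
# A sketch of how the two maps above are used together (this mirrors the
# dispatch in create_train_features below): each feature name resolves to a
# getter on the `feats` module via getattr, and its argument names index into
# a prepared `args` dict (see prepare_feats_args):
#
#     args = {'df': df, 'street_gdf': street_gdf}
#     name = 'nearest_street_distance_per_service'
#     getter = getattr(feats, features_getter_map[name])
#     X = getter(*[args[arg] for arg in features_getter_args_map[name]])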


def load_points_df(points_fpath):
    """
    Loads the points in *points_fpath* into a pandas.DataFrame and projects
    their geometries.

    Args:
        points_fpath (str): Path to file containing the points

    Returns:
        pandas.DataFrame
    """
    df = pd.read_csv(points_fpath)
    for service in Config.services:
        service_gdf = gpd.GeoDataFrame(
            df[[f'x_{service}', f'y_{service}']],
            geometry=gpd.points_from_xy(df[f'x_{service}'], df[f'y_{service}']),
            crs=f'epsg:{Config.source_crs}'
        )
        service_gdf = service_gdf.to_crs(f'epsg:{Config.target_crs}')
        df[f'lon_{service}'] = service_gdf.geometry.x
        df[f'lat_{service}'] = service_gdf.geometry.y
    return df


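# Minimal usage sketch for load_points_df, assuming a CSV that carries
# x_<service>/y_<service> columns for every service in Config.services
# (the path below is hypothetical):
#
#     df = load_points_df('data/points.csv')
#     # df now also carries lon_<service>/lat_<service> columns, reprojected
#     # from Config.source_crs to Config.target_crs

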
def encode_labels(df, encoder=None):
    """
    Encodes the label column with integer values.

    Args:
        df (pandas.DataFrame): The DataFrame containing the column to be
            encoded
        encoder (sklearn.preprocessing.LabelEncoder, optional): The label
            encoder to be utilized

    Returns:
        tuple:
            pandas.DataFrame: The DataFrame with the encoded column
            sklearn.preprocessing.LabelEncoder: The label encoder utilized
    """
    if encoder is None:
        encoder = LabelEncoder()
        df['target'] = encoder.fit_transform(df['label'])
    else:
        df['target'] = encoder.transform(df['label'])
    return df, encoder


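# Typical train/test usage for encode_labels: fit the encoder on the train
# labels, then reuse it so the test targets share the same integer mapping
# (the DataFrame names are hypothetical):
#
#     train_df, encoder = encode_labels(train_df)
#     test_df, _ = encode_labels(test_df, encoder=encoder)

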
def load_street_gdf(street_fpath):
    """
    Loads the streets in *street_fpath* into a geopandas.GeoDataFrame and
    projects their geometries.

    Args:
        street_fpath (str): Path to file containing the streets

    Returns:
        geopandas.GeoDataFrame
    """
    street_df = pd.read_csv(street_fpath)
    street_df['geometry'] = street_df['geometry'].apply(loads)
    street_gdf = gpd.GeoDataFrame(street_df, geometry='geometry',
                                  crs=f'epsg:{Config.source_crs}')
    street_gdf = street_gdf.to_crs(f'epsg:{Config.target_crs}')
    return street_gdf


def prepare_feats_args(df, required_args, path):
    """
    Prepares the required arguments for features extraction.

    Args:
        df (pandas.DataFrame): Contains the points for which features will
            be created
        required_args (set): Contains the names of the required args
        path (str): Path to read from

    Returns:
        dict: Containing argument names as keys and their corresponding
            structures as values
    """
    args = {'df': df}
    if 'street_gdf' in required_args:
        args['street_gdf'] = load_street_gdf(os.path.join(path, 'osm_streets.csv'))
    return args


def create_train_features(df, in_path, out_path, features=None):
    """
    Creates all the included train features arrays and saves them in
    *out_path*.

    Args:
        df (pandas.DataFrame): Contains the train points
        in_path (str): Path to read required items from
        out_path (str): Path to write to
        features (list, optional): Contains the names of the features to
            extract

    Returns:
        numpy.ndarray: The train features array
    """
    included_features = Config.included_features if features is None else features
    required_args = set(
        arg for f in included_features for arg in features_getter_args_map[f]
    )
    args = prepare_feats_args(df, required_args, in_path)
    Xs, cols = [], []
    non_scaled_X_min, non_scaled_X_max = [], []
    for f in included_features:
        X = getattr(feats, features_getter_map[f])(
            *[args[arg] for arg in features_getter_args_map[f]])
        cols.extend(get_feature_col_names(f, X.shape[-1]))
        non_scaled_X_min.append(np.amin(X, axis=0))
        non_scaled_X_max.append(np.amax(X, axis=0))
        if f in Config.normalized_features:
            X, scaler = normalize_features(X)
            with open(os.path.join(out_path, 'pickled_objects', f'{f}_scaler.pkl'), 'wb') as fout:
                pickle.dump(scaler, fout)
        np.save(os.path.join(out_path, 'features', f'{f}_train.npy'), X)
        Xs.append(X)
    X = np.hstack(Xs)
    print('Before normalization: ',
          list(zip(cols, np.hstack(non_scaled_X_min), np.hstack(non_scaled_X_max))))
    return X


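# Hedged end-to-end sketch for the train side (the paths are hypothetical):
# in_path is expected to contain osm_streets.csv when street-based features
# are included, and out_path must already contain 'features' and
# 'pickled_objects' subdirectories:
#
#     df = load_points_df('data/train.csv')
#     X_train = create_train_features(df, 'data', 'artifacts')

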
def get_feature_col_names(f, arr_size):
    """Builds the column names *f*_1, ..., *f*_arr_size for feature *f*."""
    return [f'{f}_{i + 1}' for i in range(arr_size)]


def create_test_features(df, in_path, scalers_path, out_path, features=None):
    """
    Creates all the included test features arrays and saves them in
    *out_path*.

    Args:
        df (pandas.DataFrame): Contains the test points
        in_path (str): Path to read required items from
        scalers_path (str): Path to load required scalers from
        out_path (str): Path to write to
        features (list, optional): Contains the names of the features to
            extract

    Returns:
        numpy.ndarray: The test features array
    """
    included_features = Config.included_features if features is None else features
    required_args = set(
        arg for f in included_features for arg in features_getter_args_map[f]
    )
    args = prepare_feats_args(df, required_args, in_path)
    Xs = []
    for f in included_features:
        X = getattr(feats, features_getter_map[f])(
            *[args[arg] for arg in features_getter_args_map[f]])
        if f in Config.normalized_features:
            with open(os.path.join(scalers_path, f'{f}_scaler.pkl'), 'rb') as fin:
                scaler = pickle.load(fin)
            X, _ = normalize_features(X, scaler)
        np.save(os.path.join(out_path, 'features', f'{f}_test.npy'), X)
        Xs.append(X)
    X = np.hstack(Xs)
    return X


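# The test side mirrors create_train_features but reloads the pickled
# MinMaxScaler fitted on the train features, so train and test share one
# scaling (paths hypothetical, matching the sketch above):
#
#     test_df = load_points_df('data/test.csv')
#     X_test = create_test_features(test_df, 'data',
#                                   'artifacts/pickled_objects', 'artifacts')

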
def normalize_features(X, scaler=None):
    """
    Normalizes features to [0, 1].

    Args:
        X (numpy.ndarray): Features array to be normalized
        scaler (sklearn.preprocessing.MinMaxScaler, optional): Scaler to be
            utilized

    Returns:
        tuple:
            numpy.ndarray: The normalized features array
            sklearn.preprocessing.MinMaxScaler: The scaler utilized
    """
    if scaler is None:
        scaler = MinMaxScaler()
        # scaler = RobustScaler()
        X_ = scaler.fit_transform(X)
    else:
        X_ = scaler.transform(X)
    return X_, scaler


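# normalize_features follows the usual fit-on-train / transform-on-test
# pattern:
#
#     X_train_, scaler = normalize_features(X_train)   # fits a MinMaxScaler
#     X_test_, _ = normalize_features(X_test, scaler)  # reuses the fit

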
def filter(values):
    """
    Filters *values* by replacing values greater than *Config.distance_thr*
    with *Config.distance_thr*.

    NOTE: shadows the built-in filter() within this module.

    Args:
        values (list): Contains distances created by various features

    Returns:
        list: Contains the filtered distances
    """
    return [
        Config.distance_thr if v > Config.distance_thr else round(v, 2)
        for v in values
    ]


def filter2(values):
    """
    Filters *values* by replacing values greater than *Config.square_thr*
    with *Config.distance_thr*.

    Args:
        values (list): Contains distances created by various features

    Returns:
        list: Contains the filtered distances
    """
    return [
        Config.distance_thr if v > Config.square_thr else round(v, 2)
        for v in values
    ]


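# Quick illustration of the clipping behaviour, assuming (purely for
# illustration) Config.distance_thr == 500.0: values above the threshold are
# clipped to it, the rest are rounded to two decimals.
#
#     filter([10.0, 499.99, 1200.0])  # -> [10.0, 499.99, 500.0]

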
def cart2pol(x, y):
    """Converts cartesian coordinates to polar (rho, theta)."""
    theta = np.arctan2(y, x)
    rho = np.hypot(x, y)
    return rho, theta


def pol2cart(theta, rho):
    """Converts polar coordinates (theta, rho) to cartesian."""
    x = rho * np.cos(theta)
    y = rho * np.sin(theta)
    return x, y


def cart2sph(x, y, z):
    """Converts cartesian coordinates to spherical (azimuth, elevation, radius)."""
    hxy = np.hypot(x, y)
    r = np.hypot(hxy, z)
    el = np.arctan2(z, hxy)
    az = np.arctan2(y, x)
    return az, el, r


def sph2cart(az, el, r):
    """Converts spherical coordinates (azimuth, elevation, radius) to cartesian."""
    rcos_theta = r * np.cos(el)
    x = rcos_theta * np.cos(az)
    y = rcos_theta * np.sin(az)
    z = r * np.sin(el)
    return x, y, z


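# Sanity check for the polar conversions: cart2pol and pol2cart are inverses
# (up to floating-point error):
#
#     rho, theta = cart2pol(3.0, 4.0)   # rho = 5.0, theta = atan2(4, 3)
#     x, y = pol2cart(theta, rho)       # x ~ 3.0, y ~ 4.0

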
def get_points(df):
    """
    Builds an array of all points appearing in *df*. This array has a shape
    of (len(df) * number_of_services, 2).

    Args:
        df (pandas.DataFrame): Contains the data points

    Returns:
        numpy.ndarray
    """
    points = [df[[f'x_{service}', f'y_{service}']].to_numpy()
              for service in Config.services]
    return np.vstack(points)


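# Sketch: assuming Config.services == ['s1', 's2'] and a two-row df,
# get_points stacks the per-service coordinate pairs into a (4, 2) array,
# one service block after the other:
#
#     df = pd.DataFrame({'x_s1': [0, 1], 'y_s1': [0, 1],
#                        'x_s2': [2, 3], 'y_s2': [2, 3]})
#     get_points(df)  # -> rows: (0, 0), (1, 1), (2, 2), (3, 3)

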
def get_required_external_files(df, path, features=None):
    """
    Checks if external files are required and, if so, downloads them using
    the Overpass API.

    Args:
        df (pandas.DataFrame): Contains points in order to define the area
            to query with Overpass API
        path (str): Path to save the downloaded elements
        features (list, optional): Contains the names of the included
            features

    Returns:
        None
    """
    included_features = Config.included_features if features is None else features
    required_args = set(
        arg for f in included_features for arg in features_getter_args_map[f]
    )
    if 'street_gdf' in required_args:
        osm_ut.extract_streets(get_points(df), path)
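

# Usage sketch (hypothetical path): called before feature extraction so that
# prepare_feats_args can later find osm_streets.csv on disk under the same
# path.
#
#     get_required_external_files(df, 'data')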