# Source code for geocoding.features

import numpy as np
from itertools import combinations
from shapely.geometry import Point, asLineString, Polygon
import re

from geocoding import features_utilities as feat_ut
from geocoding.config import Config


def get_normalized_coords(df):
    """ Creates a features array of mean-centered coordinates.

    Each service's longitude (``x_<service>``) and latitude
    (``y_<service>``) column is centered by subtracting that column's
    mean value.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), number_of_services * 2)
    """
    columns = []
    for service in Config.services:
        columns.append(f'x_{service}')
        columns.append(f'y_{service}')
    X = np.zeros((len(df), len(columns)))
    for idx, column in enumerate(columns):
        # Center the column around zero.
        X[:, idx] = df[column] - df[column].mean()
    return X
def get_polar_coords(df):
    """ Creates a features array of polar coordinates.

    For every service, the cartesian pair (``x_<service>``,
    ``y_<service>``) is converted to (radius, angle).

    Args:
        df (pandas.DataFrame): Contains data points for which the features
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features),
            here (len(df), number_of_services * 2)
    """
    n_services = len(Config.services)
    X = np.zeros((len(df), n_services * 2))
    for idx, service in enumerate(Config.services):
        xs = df[f'x_{service}']
        ys = df[f'y_{service}']
        X[:, idx * 2] = np.hypot(xs, ys)        # radius
        X[:, idx * 2 + 1] = np.arctan2(ys, xs)  # angle
    return X
def get_pairwise_coords_distances(df):
    """ Creates a features array. For each address (each row), calculate the \
    pairwise distances among coordinates suggested from different services.

    For each row, the absolute differences between the lon values of every
    pair of services are computed, followed by the same for the lat values.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), number_of_services * (number_of_services-1))
    """
    n_services = len(Config.services)
    # lon pairs + lat pairs: 2 * C(n, 2) = n * (n - 1) features per row.
    X = np.zeros((len(df), n_services * (n_services - 1)))
    # NOTE(review): assumes df has a 0..n-1 RangeIndex, since i.Index is
    # used both as a .loc label and as a positional row of X — confirm.
    for i in df.itertuples():
        distances = []
        for coord in ['lon', 'lat']:
            coords = [
                df.loc[i.Index, f'{coord}_{service}']
                for service in Config.services
            ]
            distances.extend(
                np.abs(a - b) for a, b in combinations(coords, 2)
            )
        X[i.Index] = feat_ut.filter(distances)
    return X
def get_pairwise_points_distances(df):
    """ Creates a features array. For each address (each row), calculate the \
    pairwise distances among points suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), (number_of_services * (number_of_services-1)) / 2)
    """
    n_services = len(Config.services)
    # One euclidean distance per unordered pair of services: C(n, 2).
    X = np.zeros((len(df), (n_services * (n_services - 1)) // 2))
    # NOTE(review): assumes df has a 0..n-1 RangeIndex — see sibling features.
    for i in df.itertuples():
        points = [
            (df.loc[i.Index, f'lon_{service}'], df.loc[i.Index, f'lat_{service}'])
            for service in Config.services
        ]
        distances = [
            Point(a).distance(Point(b)) for a, b in combinations(points, 2)
        ]
        X[i.Index] = feat_ut.filter(distances)
    return X
def get_centroid_coords_distances(df):
    """ Creates a features array. For each address (each row), calculate the \
    distances between the corresponding centroid coords and the coords \
    suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), number_of_services * 2)
    """
    n_services = len(Config.services)
    X = np.zeros((len(df), 2 * n_services))
    for row in df.itertuples():
        feats = []
        for axis in ['lon', 'lat']:
            values = [
                df.loc[row.Index, f'{axis}_{service}']
                for service in Config.services
            ]
            center = np.mean(values)
            # Per-service absolute deviation from the axis centroid.
            feats.extend(np.abs(value - center) for value in values)
        X[row.Index] = feat_ut.filter(feats)
    return X
def get_centroid_points_distances(df):
    """ Creates a features array. For each address (each row), calculate the \
    distances between the corresponding centroid and the points suggested \
    from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), number_of_services)
    """
    X = np.zeros((len(df), len(Config.services)))
    for row in df.itertuples():
        points = [
            (df.loc[row.Index, f'lon_{service}'], df.loc[row.Index, f'lat_{service}'])
            for service in Config.services
        ]
        # Component-wise mean of the service points.
        centroid = Point([sum(axis) / len(axis) for axis in zip(*points)])
        X[row.Index] = feat_ut.filter(
            [Point(p).distance(centroid) for p in points]
        )
    return X
def get_mean_centroids_coords_distances(df):
    """ Creates a features array. For each address (each row), calculate the \
    mean distances between the corresponding centroid coords and the coords \
    suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), 2)
    """
    X = np.zeros((len(df), 2))
    for row in df.itertuples():
        feats = []
        for axis in ['lon', 'lat']:
            values = [
                df.loc[row.Index, f'{axis}_{service}']
                for service in Config.services
            ]
            center = np.mean(values)
            # One feature per axis: mean absolute deviation from the centroid.
            feats.append(np.mean([np.abs(value - center) for value in values]))
        X[row.Index] = feat_ut.filter(feats)
    return X
def get_mean_centroids_points_distances(df):
    """ Creates a features array. For each address (each row), calculate the \
    mean distance between the corresponding centroid and the points \
    suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), 1)
    """
    X = np.zeros((len(df), 1))
    for row in df.itertuples():
        points = [
            (df.loc[row.Index, f'lon_{service}'], df.loc[row.Index, f'lat_{service}'])
            for service in Config.services
        ]
        centroid = Point([sum(axis) / len(axis) for axis in zip(*points)])
        distances = [Point(p).distance(centroid) for p in points]
        # Single feature: mean of the (filtered) point-to-centroid distances.
        X[row.Index] = np.mean(feat_ut.filter(distances))
    return X
def get_nearest_street_distance_per_service(df, street_gdf):
    """ Creates a features array. For each address (each row) and for each \
    service, calculate the distance to the nearest street.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created
        street_gdf (geopandas.GeoDataFrame): Contains all streets extracted \
            from OSM, along with their geometries

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), number_of_services)
    """
    street_index = street_gdf.sindex
    n_services = len(Config.services)
    X = np.zeros((len(df), n_services))
    for i in df.itertuples():
        points = [
            (df.loc[i.Index, f'lon_{service}'], df.loc[i.Index, f'lat_{service}'])
            for service in Config.services
        ]
        # One distance per point. sindex.nearest() is bounding-box based and
        # may return several tied candidates; the original flattened all
        # (point, candidate) pairs into a single list, which can yield more
        # than n_services values (breaking the row assignment) and does not
        # necessarily measure the truly nearest geometry. Take the minimum
        # over the candidates instead.
        distances = [
            min(
                Point(p).distance(street_gdf.iloc[c]['geometry'])
                for c in list(street_index.nearest(p))
            )
            for p in points
        ]
        X[i.Index] = feat_ut.filter(distances)
    return X
def get_common_nearest_street_distance(df, street_gdf, k=3):
    """ Creates a features array. For each address (each row) and for each \
    service, calculate the distance to the nearest street that is common to
    all geocoding sources.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created
        street_gdf (geopandas.GeoDataFrame): Contains all streets extracted \
            from OSM, along with their geometries
        k (int): The number of closest streets to fetch per geocoding source.

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), number_of_services)
    """
    # NOTE(review): shapely.geometry.asLineString is deprecated and removed
    # in Shapely 2.x — this module needs shapely<2 (or a LineString port).
    street_index = street_gdf.sindex
    n_services = len(Config.services)
    X = np.zeros((len(df), n_services))
    # NOTE(review): assumes df has a 0..n-1 RangeIndex, since i.Index is used
    # both as a .loc label and as a positional row of X — confirm at callers.
    for i in df.itertuples():
        points = [
            (df.loc[i.Index, f'lon_{service}'], df.loc[i.Index, f'lat_{service}'])
            for service in Config.services
        ]
        # Per service point, the k nearest street geometries from the
        # spatial index (bounding-box based nearest neighbours).
        lines = [
            [street_gdf.iloc[c]['geometry'] for c in list(street_index.nearest(p, k))]
            for p in points
        ]
        # A street near the FIRST service's point is considered "common" when
        # it intersects/touches a street near any OTHER service's point.
        # The same street may be appended several times (once per match).
        common_lines = []
        for li in range(1, len(lines)):
            for l1, l2 in ((x, y) for x in lines[0] for y in lines[li]):
                if l1.intersects(l2) or l1.touches(l2):
                    common_lines.append(list(l1.coords))
        if len(common_lines):
            # Masked-array convention: mask == 1 drops the entry. Starting
            # all-masked, an entry is UNmasked (kept) when its line
            # intersects/touches a LATER common line, i.e. when it is
            # connected to the rest of the candidate set.
            mask = np.ones(len(common_lines))
            combs = combinations(enumerate(common_lines), 2)
            for c in combs:
                is_intersected = asLineString(c[0][1]).intersects(asLineString(c[1][1]))
                is_touched = asLineString(c[0][1]).touches(asLineString(c[1][1]))
                if is_intersected or is_touched:
                    mask[c[0][0]] = 0
            masked_clines = np.ma.masked_array(np.arange(len(common_lines)), mask=mask)
            if len(masked_clines.compressed()):
                # Among the surviving candidates, pick the line closest to the
                # first service's point ...
                closer_geom = min([
                    (idx, Point(points[0]).distance(asLineString(common_lines[c])))
                    for idx, c in enumerate(masked_clines.compressed())
                ], key=lambda x: x[1])
                # ... and measure every service's point against that line.
                distances = [
                    Point(p).distance(asLineString(common_lines[masked_clines.compressed()[closer_geom[0]]]))
                    for p in points
                ]
            else:
                # No connected candidate survived: fall back to the common
                # line closest to the first service's point.
                closer_geom = min(
                    [(idx, Point(points[0]).distance(asLineString(c))) for idx, c in enumerate(common_lines)],
                    key=lambda x: x[1]
                )
                distances = [Point(p).distance(asLineString(common_lines[closer_geom[0]])) for p in points]
        else:
            # No common street at all: fall back to the street nearest to the
            # first service's point among its own k candidates.
            closer_geom = min(
                [(idx, Point(points[0]).distance(c)) for idx, c in enumerate(lines[0])],
                key=lambda x: x[1]
            )
            distances = [Point(p).distance(lines[0][closer_geom[0]]) for p in points]
        X[i.Index] = feat_ut.filter(distances)
    return X
def get_intersects_on_common_nearest_street(df, street_gdf, k=3):
    """ Creates a features array. For each address (each row) and for each \
    service, identify the nearest street that is common to all geocoding
    sources and return `True` if it intersects or touches it or `False`
    otherwise.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created
        street_gdf (geopandas.GeoDataFrame): Contains all streets extracted \
            from OSM, along with their geometries
        k (int): The number of closest streets to fetch per geocoding source.

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), number_of_services)
    """
    # NOTE(review): shapely.geometry.asLineString is deprecated and removed
    # in Shapely 2.x — this module needs shapely<2 (or a LineString port).
    # Streets are buffered by this amount before the intersects/touches test;
    # presumably in the CRS units of street_gdf — confirm (meters vs degrees).
    buffer_size = 1.5
    street_index = street_gdf.sindex
    n_services = len(Config.services)
    X = np.zeros((len(df), n_services))
    # NOTE(review): assumes df has a 0..n-1 RangeIndex — see sibling features.
    for i in df.itertuples():
        points = [
            (df.loc[i.Index, f'lon_{service}'], df.loc[i.Index, f'lat_{service}'])
            for service in Config.services
        ]
        # Per service point, the k nearest street geometries.
        lines = [
            [street_gdf.iloc[c]['geometry'] for c in list(street_index.nearest(p, k))]
            for p in points
        ]
        # Candidate "common" streets: streets near the first service's point
        # that intersect/touch a street near any other service's point.
        common_lines = []
        for li in range(1, len(lines)):
            for l1, l2 in ((x, y) for x in lines[0] for y in lines[li]):
                if l1.intersects(l2) or l1.touches(l2):
                    common_lines.append(list(l1.coords))
        if len(common_lines):
            # mask == 1 drops the entry; entries are kept when they
            # intersect/touch a LATER common line (connected candidates).
            mask = np.ones(len(common_lines))
            combs = combinations(enumerate(common_lines), 2)
            for c in combs:
                is_intersected = asLineString(c[0][1]).intersects(asLineString(c[1][1]))
                is_touched = asLineString(c[0][1]).touches(asLineString(c[1][1]))
                if is_intersected or is_touched:
                    mask[c[0][0]] = 0
            masked_clines = np.ma.masked_array(np.arange(len(common_lines)), mask=mask)
            if len(masked_clines.compressed()):
                # Pick the surviving line closest to the first service's point.
                closer_geom = min([
                    (idx, Point(points[0]).distance(asLineString(common_lines[c])))
                    for idx, c in enumerate(masked_clines.compressed())
                ], key=lambda x: x[1])
                # Flag each point: does it fall on the buffered chosen street?
                intersection_flags = [
                    np.any([
                        (Point(p).intersects(asLineString(
                            common_lines[masked_clines.compressed()[closer_geom[0]]]).buffer(buffer_size)) or
                         Point(p).touches(asLineString(
                            common_lines[masked_clines.compressed()[closer_geom[0]]]).buffer(buffer_size)))
                    ]) for p in points
                ]
            else:
                # No connected candidate: fall back to the common line closest
                # to the first service's point.
                closer_geom = min(
                    [(idx, Point(points[0]).distance(asLineString(c))) for idx, c in enumerate(common_lines)],
                    key=lambda x: x[1]
                )
                buffered_line = asLineString(common_lines[closer_geom[0]]).buffer(buffer_size)
                intersection_flags = [
                    Point(p).intersects(buffered_line) or Point(p).touches(buffered_line)
                    for p in points
                ]
        else:
            # No common street at all: fall back to the street nearest to the
            # first service's point among its own k candidates.
            closer_geom = min(
                [(idx, Point(points[0]).distance(c)) for idx, c in enumerate(lines[0])],
                key=lambda x: x[1]
            )
            buffered_line = lines[0][closer_geom[0]].buffer(buffer_size)
            intersection_flags = [Point(p).intersects(buffered_line) or Point(p).touches(buffered_line)
                                  for p in points]
        # Booleans are stored as 0.0/1.0 in the float array.
        X[i.Index] = intersection_flags
    return X
def get_points_area(df):
    """ Creates a features array. Calculate a polygon from the coordinates of
    all geocoding sources.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), 1)
    """
    X = np.zeros((len(df), 1))
    for row in df.itertuples():
        # Polygon vertices in service order, one (lon, lat) per service.
        vertices = [
            (df.loc[row.Index, f'lon_{service}'], df.loc[row.Index, f'lat_{service}'])
            for service in Config.services
        ]
        X[row.Index] = feat_ut.filter2([Polygon(vertices).area])
    return X
def get_nearest_street_distance_by_centroid(df, street_gdf):
    """ Creates a features array. For each address (each row), the nearest street \
    to the corresponding centroid is identified at first. Then, distances \
    between this street and points suggested from different services are \
    calculated.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created
        street_gdf (geopandas.GeoDataFrame): Contains all streets extracted \
            from OSM, along with their geometries

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), number_of_services)
    """
    street_index = street_gdf.sindex
    X = np.zeros((len(df), len(Config.services)))
    for row in df.itertuples():
        points = [
            (df.loc[row.Index, f'lon_{service}'], df.loc[row.Index, f'lat_{service}'])
            for service in Config.services
        ]
        centroid = [sum(axis) / len(axis) for axis in zip(*points)]
        center = Point(centroid)
        # Among the index's candidate streets, keep the one truly closest
        # to the centroid.
        candidates = list(street_index.nearest(centroid))
        best = candidates[np.argmin(
            [center.distance(street_gdf.iloc[c]['geometry']) for c in candidates]
        )]
        street_geom = street_gdf.iloc[best]['geometry']
        X[row.Index] = feat_ut.filter(
            [Point(p).distance(street_geom) for p in points]
        )
    return X
def get_zip_codes(df):
    """ Creates a features array. For each address (each row), the first 2 digits \
    of its zip code are extracted. Then for each row *r*, the array will \
    contain 1 (True) in column *c*, if *c* represents the 2 digits that *r*'s \
    zip code starts with.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
            will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(df), 76). This is due to the fact that there are 76 \
            such valid combinations in Greece

    Raises:
        ValueError: If an address contains no 5-digit zip code followed by
            a comma, or its 2-digit prefix falls outside the 10-85 range.
    """
    # Compiled once: a 5-digit run immediately followed by a comma.
    pattern = re.compile(r'\d{5}(?=,)')
    # Valid Greek prefixes are 10..85 -> 76 one-hot columns.
    X = np.zeros((len(df), 85 - 10 + 1))
    for i in df.itertuples():
        match = pattern.search(i.address)
        if match is None:
            # The original .group(0) on None raised a cryptic AttributeError.
            raise ValueError(f'No zip code found in address: {i.address!r}')
        region_code = int(match.group(0)[:2])
        if not 10 <= region_code <= 85:
            # Guard: a prefix < 10 would silently wrap to a NEGATIVE index
            # and set the wrong column; > 85 would raise a bare IndexError.
            raise ValueError(
                f'Zip code prefix {region_code} outside the valid 10-85 range '
                f'in address: {i.address!r}'
            )
        X[i.Index, region_code - 10] = 1
    return X