import numpy as np
from itertools import combinations
from shapely.geometry import Point, asLineString, Polygon
import re
from geocoding import features_utilities as feat_ut
from geocoding.config import Config
def get_normalized_coords(df):
    """
    Creates a features array of mean-centered coordinates.

    Every ``x_<service>``/``y_<service>`` column is centered by
    subtracting that column's own mean value.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), number_of_services * 2)
    """
    feature_cols = []
    for service in Config.services:
        feature_cols.append(f'x_{service}')
        feature_cols.append(f'y_{service}')
    X = np.zeros((len(df), len(feature_cols)))
    for idx, name in enumerate(feature_cols):
        column = df[name]
        X[:, idx] = column - column.mean()
    return X
def get_polar_coords(df):
    """
    Creates a features array of polar coordinates.

    For each service, the cartesian ``x_<service>``/``y_<service>`` columns
    are converted to a (radius, angle) pair.

    Args:
        df (pandas.DataFrame): Contains data points for which the features will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), here (len(df), number_of_services * 2)
    """
    n_services = len(Config.services)
    X = np.zeros((len(df), 2 * n_services))
    for idx, service in enumerate(Config.services):
        xs = df[f'x_{service}']
        ys = df[f'y_{service}']
        X[:, 2 * idx] = np.hypot(xs, ys)        # radius
        X[:, 2 * idx + 1] = np.arctan2(ys, xs)  # angle in radians
    return X
def get_pairwise_coords_distances(df):
    """
    Creates a features array. For each address (each row), calculate the \
    pairwise distances among coordinates suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), number_of_services * (number_of_services-1))
    """
    n_services = len(Config.services)
    # n*(n-1)/2 unordered pairs per axis, two axes (lon, lat).
    X = np.zeros((len(df), n_services * (n_services - 1)))
    for i in df.itertuples():
        distances = []
        for coord in ('lon', 'lat'):
            coords = [
                df.loc[i.Index, f'{coord}_{service}'] for service in Config.services
            ]
            # Absolute difference for every unordered pair of services.
            distances.extend(np.abs(a - b) for a, b in combinations(coords, 2))
        # NOTE(review): assigning by i.Index assumes df has a default
        # RangeIndex (0..len-1) -- confirm against callers.
        X[i.Index] = feat_ut.filter(distances)
    return X
def get_pairwise_points_distances(df):
    """
    Creates a features array. For each address (each row), calculate the \
    pairwise distances among points suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), (number_of_services * (number_of_services-1)) / 2)
    """
    n_services = len(Config.services)
    # One feature per unordered pair of services.
    X = np.zeros((len(df), (n_services * (n_services - 1)) // 2))
    for i in df.itertuples():
        points = [
            (df.loc[i.Index, f'lon_{service}'], df.loc[i.Index, f'lat_{service}'])
            for service in Config.services
        ]
        # Euclidean distance (in coordinate units) for every pair of points.
        distances = [
            Point(a).distance(Point(b)) for a, b in combinations(points, 2)
        ]
        # NOTE(review): assumes df has a default RangeIndex -- confirm.
        X[i.Index] = feat_ut.filter(distances)
    return X
def get_centroid_coords_distances(df):
    """
    Creates a features array. For each address (each row), calculate the \
    distances between the corresponding centroid coords and the coords \
    suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), number_of_services * 2)
    """
    X = np.zeros((len(df), 2 * len(Config.services)))
    for row in df.itertuples():
        feats = []
        for axis in ['lon', 'lat']:
            values = [df.loc[row.Index, f'{axis}_{service}']
                      for service in Config.services]
            center = np.mean(values)
            # Per-service deviation from the row's mean coordinate.
            feats.extend(np.abs(v - center) for v in values)
        X[row.Index] = feat_ut.filter(feats)
    return X
def get_centroid_points_distances(df):
    """
    Creates a features array. For each address (each row), calculate the \
    distances between the corresponding centroid and the points suggested \
    from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), number_of_services)
    """
    X = np.zeros((len(df), len(Config.services)))
    for row in df.itertuples():
        points = []
        for service in Config.services:
            points.append((df.loc[row.Index, f'lon_{service}'],
                           df.loc[row.Index, f'lat_{service}']))
        # Arithmetic mean per axis gives the centroid of the service points.
        centroid = Point([sum(axis) / len(axis) for axis in zip(*points)])
        X[row.Index] = feat_ut.filter(
            [Point(p).distance(centroid) for p in points]
        )
    return X
def get_mean_centroids_coords_distances(df):
    """
    Creates a features array. For each address (each row), calculate the \
    mean distances between the corresponding centroid coords and the coords \
    suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), 2)
    """
    X = np.zeros((len(df), 2))
    for row in df.itertuples():
        feats = []
        for axis in ['lon', 'lat']:
            values = [df.loc[row.Index, f'{axis}_{service}']
                      for service in Config.services]
            center = np.mean(values)
            # One feature per axis: mean deviation from the mean coordinate.
            feats.append(np.mean([np.abs(v - center) for v in values]))
        X[row.Index] = feat_ut.filter(feats)
    return X
def get_mean_centroids_points_distances(df):
    """
    Creates a features array. For each address (each row), calculate the \
    mean distance between the corresponding centroid and the points \
    suggested from different services.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), 1)
    """
    X = np.zeros((len(df), 1))
    for row in df.itertuples():
        points = [(df.loc[row.Index, f'lon_{service}'],
                   df.loc[row.Index, f'lat_{service}'])
                  for service in Config.services]
        # Centroid = per-axis arithmetic mean of the service points.
        centroid = Point([sum(axis) / len(axis) for axis in zip(*points)])
        dists = [Point(p).distance(centroid) for p in points]
        X[row.Index] = np.mean(feat_ut.filter(dists))
    return X
def get_nearest_street_distance_per_service(df, street_gdf):
    """
    Creates a features array. For each address (each row) and for each \
    service, calculate the distance to the nearest street.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created
        street_gdf (geopandas.GeoDataFrame): Contains all streets extracted \
        from OSM, along with their geometries

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), number_of_services)
    """
    street_index = street_gdf.sindex
    X = np.zeros((len(df), len(Config.services)))
    for row in df.itertuples():
        distances = []
        for service in Config.services:
            p = (df.loc[row.Index, f'lon_{service}'],
                 df.loc[row.Index, f'lat_{service}'])
            # Query the spatial index for the nearest street candidate(s)
            # and record the distance to each.
            for candidate in street_index.nearest(p):
                geom = street_gdf.iloc[candidate]['geometry']
                distances.append(Point(p).distance(geom))
        X[row.Index] = feat_ut.filter(distances)
    return X
def get_common_nearest_street_distance(df, street_gdf, k=3):
    """
    Creates a features array. For each address (each row) and for each \
    service, calculate the distance to the nearest street that is common to all geocoding sources.
    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created
        street_gdf (geopandas.GeoDataFrame): Contains all streets extracted \
        from OSM, along with their geometries
        k (int): The number of closest streets to fetch per geocoding source.
    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), number_of_services)
    """
    # NOTE(review): shapely's asLineString adapter was deprecated in 1.8 and
    # removed in 2.0 -- presumably this project pins shapely < 2; confirm.
    street_index = street_gdf.sindex
    n_services = len(Config.services)
    X = np.zeros((len(df), n_services))
    for i in df.itertuples():
        # One (lon, lat) point per geocoding service for this address.
        points = [
            (df.loc[i.Index, f'lon_{service}'], df.loc[i.Index, f'lat_{service}']) for service in Config.services
        ]
        # For each service point, the geometries of its k nearest streets.
        lines = [
            [street_gdf.iloc[c]['geometry'] for c in list(street_index.nearest(p, k))] for p in points
        ]
        # "Common" streets: streets near the FIRST service that intersect or
        # touch a street near any OTHER service. Stored as raw coordinate
        # sequences; one entry per matching pair (duplicates possible).
        common_lines = []
        for li in range(1, len(lines)):
            for l1, l2 in ((x, y) for x in lines[0] for y in lines[li]):
                if l1.intersects(l2) or l1.touches(l2):
                    common_lines.append(list(l1.coords))
        if len(common_lines):
            # mask[j] is cleared to 0 (= unmasked below) when common line j
            # intersects/touches at least one LATER line in common_lines.
            mask = np.ones(len(common_lines))
            combs = combinations(enumerate(common_lines), 2)
            for c in combs:
                is_intersected = asLineString(c[0][1]).intersects(asLineString(c[1][1]))
                is_touched = asLineString(c[0][1]).touches(asLineString(c[1][1]))
                if is_intersected or is_touched:
                    mask[c[0][0]] = 0
            # compressed() yields the indices of those connected lines.
            masked_clines = np.ma.masked_array(np.arange(len(common_lines)), mask=mask)
            if len(masked_clines.compressed()):
                # Among the connected common lines, pick the one closest to
                # the first service's point...
                closer_geom = min([
                    (idx, Point(points[0]).distance(asLineString(common_lines[c])))
                    for idx, c in enumerate(masked_clines.compressed())
                ], key=lambda x: x[1])
                # ...and measure every service point against that one street.
                distances = [
                    Point(p).distance(asLineString(common_lines[masked_clines.compressed()[closer_geom[0]]]))
                    for p in points
                ]
            else:
                # No connected pair among the common lines: fall back to the
                # common line closest to the first service's point.
                closer_geom = min(
                    [(idx, Point(points[0]).distance(asLineString(c))) for idx, c in enumerate(common_lines)],
                    key=lambda x: x[1]
                )
                distances = [Point(p).distance(asLineString(common_lines[closer_geom[0]])) for p in points]
        else:
            # No common street at all: use the first service's nearest street.
            closer_geom = min(
                [(idx, Point(points[0]).distance(c)) for idx, c in enumerate(lines[0])],
                key=lambda x: x[1]
            )
            distances = [Point(p).distance(lines[0][closer_geom[0]]) for p in points]
        X[i.Index] = feat_ut.filter(distances)
    return X
def get_intersects_on_common_nearest_street(df, street_gdf, k=3):
    """
    Creates a features array. For each address (each row) and for each \
    service, identify the nearest street that is common to all geocoding sources and return `True` if it intersects or
    touches it or `False` otherwise.
    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created
        street_gdf (geopandas.GeoDataFrame): Contains all streets extracted \
        from OSM, along with their geometries
        k (int): The number of closest streets to fetch per geocoding source.
    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), number_of_services)
    """
    # Radius used to buffer the street line into an area that service points
    # can fall inside; expressed in the data's coordinate units.
    buffer_size = 1.5
    street_index = street_gdf.sindex
    n_services = len(Config.services)
    X = np.zeros((len(df), n_services))
    for i in df.itertuples():
        # One (lon, lat) point per geocoding service for this address.
        points = [
            (df.loc[i.Index, f'lon_{service}'], df.loc[i.Index, f'lat_{service}']) for service in Config.services
        ]
        # For each service point, the geometries of its k nearest streets.
        lines = [
            [street_gdf.iloc[c]['geometry'] for c in list(street_index.nearest(p, k))] for p in points
        ]
        # Streets near the FIRST service that intersect/touch a street near
        # any OTHER service, as raw coordinate sequences.
        common_lines = []
        for li in range(1, len(lines)):
            for l1, l2 in ((x, y) for x in lines[0] for y in lines[li]):
                if l1.intersects(l2) or l1.touches(l2):
                    common_lines.append(list(l1.coords))
        if len(common_lines):
            # mask[j] is cleared to 0 (= unmasked below) when common line j
            # intersects/touches at least one LATER line in common_lines.
            mask = np.ones(len(common_lines))
            combs = combinations(enumerate(common_lines), 2)
            for c in combs:
                is_intersected = asLineString(c[0][1]).intersects(asLineString(c[1][1]))
                is_touched = asLineString(c[0][1]).touches(asLineString(c[1][1]))
                if is_intersected or is_touched:
                    mask[c[0][0]] = 0
            # compressed() yields the indices of those connected lines.
            masked_clines = np.ma.masked_array(np.arange(len(common_lines)), mask=mask)
            if len(masked_clines.compressed()):
                # Among the connected common lines, pick the one closest to
                # the first service's point...
                closer_geom = min([
                    (idx, Point(points[0]).distance(asLineString(common_lines[c])))
                    for idx, c in enumerate(masked_clines.compressed())
                ], key=lambda x: x[1])
                # ...then flag whether each service point lies on/in that
                # street's buffered area.
                intersection_flags = [
                    np.any([
                        (Point(p).intersects(asLineString(
                            common_lines[masked_clines.compressed()[closer_geom[0]]]).buffer(buffer_size)) or
                         Point(p).touches(asLineString(
                            common_lines[masked_clines.compressed()[closer_geom[0]]]).buffer(buffer_size)))
                    ]) for p in points
                ]
            else:
                # No connected pair among the common lines: fall back to the
                # common line closest to the first service's point.
                closer_geom = min(
                    [(idx, Point(points[0]).distance(asLineString(c))) for idx, c in enumerate(common_lines)],
                    key=lambda x: x[1]
                )
                buffered_line = asLineString(common_lines[closer_geom[0]]).buffer(buffer_size)
                intersection_flags = [
                    Point(p).intersects(buffered_line) or Point(p).touches(buffered_line) for p in points
                ]
        else:
            # No common street at all: use the first service's nearest street.
            closer_geom = min(
                [(idx, Point(points[0]).distance(c)) for idx, c in enumerate(lines[0])],
                key=lambda x: x[1]
            )
            buffered_line = lines[0][closer_geom[0]].buffer(buffer_size)
            intersection_flags = [Point(p).intersects(buffered_line) or Point(p).touches(buffered_line) for p in points]
        # Booleans are stored into the float array as 0.0 / 1.0.
        X[i.Index] = intersection_flags
    return X
def get_points_area(df):
    """
    Creates a features array. Calculate a polygon from the coordinates of all geocoding sources.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), 1)
    """
    X = np.zeros((len(df), 1))
    for row in df.itertuples():
        # Polygon vertices come from the services in Config order.
        vertices = [(df.loc[row.Index, f'lon_{service}'],
                     df.loc[row.Index, f'lat_{service}'])
                    for service in Config.services]
        area = Polygon(vertices).area
        X[row.Index] = feat_ut.filter2([area])
    return X
def get_nearest_street_distance_by_centroid(df, street_gdf):
    """
    Creates a features array. For each address (each row), the nearest street \
    to the corresponding centroid is identified at first. Then, distances \
    between this street and points suggested from different services are \
    calculated.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created
        street_gdf (geopandas.GeoDataFrame): Contains all streets extracted \
        from OSM, along with their geometries

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), number_of_services)
    """
    street_index = street_gdf.sindex
    X = np.zeros((len(df), len(Config.services)))
    for row in df.itertuples():
        points = [(df.loc[row.Index, f'lon_{service}'],
                   df.loc[row.Index, f'lat_{service}'])
                  for service in Config.services]
        # Centroid = per-axis arithmetic mean of the service points.
        centroid = [sum(axis) / len(axis) for axis in zip(*points)]
        centroid_point = Point(centroid)
        # Among the index's candidates, keep the street truly closest to
        # the centroid.
        candidates = list(street_index.nearest(centroid))
        gaps = [centroid_point.distance(street_gdf.iloc[c]['geometry'])
                for c in candidates]
        nearest_geom = street_gdf.iloc[candidates[np.argmin(gaps)]]['geometry']
        X[row.Index] = feat_ut.filter(
            [Point(p).distance(nearest_geom) for p in points]
        )
    return X
def get_zip_codes(df):
    """
    Creates a features array. For each address (each row), the first 2 digits \
    of its zip code are extracted. Then for each row *r*, the array will \
    contain 1 (True) in column *c*, if *c* represents the 2 digits that *r*'s \
    zip code starts with.

    Args:
        df (pandas.DataFrame): Contains data points for which the features \
        will be created; each row's ``address`` must contain a 5-digit zip \
        code followed by a comma.

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
        here (len(df), 76). This is due to the fact that there are 76 \
        such valid combinations in Greece (prefixes 10 through 85)

    Raises:
        ValueError: If a row's address contains no 5-digit zip code \
        followed by a comma.
    """
    # Compiled once outside the loop: a 5-digit run followed by a comma.
    pattern = re.compile(r'\d{5}(?=,)')
    X = np.zeros((len(df), 85 - 10 + 1))
    for i in df.itertuples():
        match = pattern.search(i.address)
        if match is None:
            # Fail with a clear message instead of AttributeError on None.
            raise ValueError(f'No zip code found in address: {i.address!r}')
        region_code = match.group(0)[:2]
        # Column 0 corresponds to prefix 10, column 75 to prefix 85.
        X[i.Index, int(region_code) - 10] = 1
    return X