# Source code for textual_features

import numpy as np
import os
from whoosh.fields import Schema, TEXT, STORED
from whoosh import index, qparser, scoring
from whoosh.analysis import StemmingAnalyzer

import features_utilities as feat_ut
from config import config


def create_textual_index(poi_gdf, path):
    """
    Creates index containing the pois names given.

    Each poi is stored as one document holding its dataframe index value \
    (``idx``, stored only), its name (``name``, analyzed with a stemming \
    analyzer so it is searchable) and its label (``label``, stored only).

    Args:
        poi_gdf (geopandas.GeoDataFrame): Contains pois to be stored in the \
            index. Must provide a ``label`` column and the column named by \
            ``config.name_col``
        path (str): Path to save the index. The directory is created by \
            this function (including missing parent directories)

    Returns:
        None

    Raises:
        OSError: If *path* already exists or cannot be created.
    """
    schema = Schema(idx=STORED,
                    name=TEXT(analyzer=StemmingAnalyzer()),
                    label=STORED)
    # makedirs also creates missing parent directories; like os.mkdir it
    # still raises if the target directory already exists.
    os.makedirs(path)
    ix = index.create_in(path, schema)
    # Used as a context manager the writer commits on a clean exit and
    # cancels (releasing the write lock) if adding a document raises.
    with ix.writer() as writer:
        for poi in poi_gdf.itertuples():
            writer.add_document(idx=poi.Index,
                                name=getattr(poi, config.name_col),
                                label=poi.label)


def get_similarity_per_class(poi_gdf, textual_index_path, nlabels):
    """
    Creates a features array. For each poi *p* (each row) the array will \
    contain a score in column *c*, representing how similar *p*'s name is \
    with each poi category.

    The name of every poi is used as an OR-query against the ``name`` field \
    of the stored index (TF-IDF weighting). For each returned hit, the hit's \
    score is kept in the column of the hit's stored label if it is higher \
    than the score already there, i.e. each cell holds the best score among \
    the returned hits of that label (0 if there is none).

    Args:
        poi_gdf (geopandas.GeoDataFrame): Contains pois for which the \
            features will be created. Rows of the result are addressed by \
            the dataframe index value, so the index is expected to consist \
            of valid row numbers (e.g. a default 0..n-1 index)
        textual_index_path (str): Path to the stored index
        nlabels (int): Number of poi categories. Labels stored in the index \
            are used directly as column numbers

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(poi_gdf), nlabels)
    """
    ix = index.open_dir(textual_index_path)
    X = np.zeros((len(poi_gdf), nlabels))
    # The parser only depends on the schema, so it is built once instead of
    # once per poi.
    parser = qparser.QueryParser('name', ix.schema, group=qparser.OrGroup)
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        for poi in poi_gdf.itertuples():
            query = parser.parse(getattr(poi, config.name_col))
            # NOTE(review): search() runs with its default ``limit``, so only
            # the top-ranked hits contribute -- confirm this is intended.
            for hit in searcher.search(query):
                label = hit['label']
                # Keep the best score seen so far for this label.
                if X[poi.Index][label] < hit.score:
                    X[poi.Index][label] = hit.score
    return X


def get_top_k_terms(poi_gdf, names, k):
    """
    Builds a binary term-presence features array.

    The top *k* % terms among *names* are obtained from \
    ``features_utilities.get_top_k`` (``mode='term'``), giving a collection \
    of terms *T*. Cell (*p*, *c*) of the result is set to 1 when term \
    *T[c]* is contained in the name of poi *p* (checked with the ``in`` \
    operator), otherwise it stays 0.

    Args:
        poi_gdf (geopandas.GeoDataFrame): Contains pois for which the \
            features will be created. Rows of the result are addressed by \
            the dataframe index value
        names (list): Contains the names of train pois
        k (float): Percentage of top terms to be considered

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(poi_gdf), len(*T*))
    """
    vocabulary = feat_ut.get_top_k(names, k, mode='term')
    features = np.zeros((len(poi_gdf), len(vocabulary)))
    for row in poi_gdf.itertuples():
        for col, term in enumerate(vocabulary):
            if term not in getattr(row, config.name_col):
                continue
            # Term found in this poi's name: flag its column.
            features[row.Index][col] = 1
    return features


def get_top_k_trigrams(poi_gdf, names, k):
    """
    Builds a binary trigram-presence features array.

    The top *k* % trigrams among *names* are obtained from \
    ``features_utilities.get_top_k`` (``mode='trigram'``), giving a \
    collection of trigrams *T*. Cell (*p*, *c*) of the result is set to 1 \
    when trigram *T[c]* is contained in the name of poi *p* (checked with \
    the ``in`` operator), otherwise it stays 0.

    Args:
        poi_gdf (geopandas.GeoDataFrame): Contains pois for which the \
            features will be created. Rows of the result are addressed by \
            the dataframe index value
        names (list): Contains the names of train pois
        k (float): Percentage of top trigrams to be considered

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(poi_gdf), len(*T*))
    """
    trigrams = feat_ut.get_top_k(names, k, mode='trigram')
    features = np.zeros((len(poi_gdf), len(trigrams)))
    for row in poi_gdf.itertuples():
        # Columns of the trigrams that occur in this poi's name.
        present = [col for col, gram in enumerate(trigrams)
                   if gram in getattr(row, config.name_col)]
        for col in present:
            features[row.Index][col] = 1
    return features


def get_top_k_fourgrams(poi_gdf, names, k):
    """
    Builds a binary fourgram-presence features array.

    The top *k* % fourgrams among *names* are obtained from \
    ``features_utilities.get_top_k`` (``mode='fourgram'``), giving a \
    collection of fourgrams *T*. Cell (*p*, *c*) of the result is set to 1 \
    when fourgram *T[c]* is contained in the name of poi *p* (checked with \
    the ``in`` operator), otherwise it stays 0.

    Args:
        poi_gdf (geopandas.GeoDataFrame): Contains pois for which the \
            features will be created. Rows of the result are addressed by \
            the dataframe index value
        names (list): Contains the names of train pois
        k (float): Percentage of top fourgrams to be considered

    Returns:
        numpy.ndarray: The features array of shape (n_samples, n_features), \
            here (len(poi_gdf), len(*T*))
    """
    fourgrams = feat_ut.get_top_k(names, k, mode='fourgram')
    shape = (len(poi_gdf), len(fourgrams))
    features = np.zeros(shape)
    for row in poi_gdf.itertuples():
        for col, gram in enumerate(fourgrams):
            found = gram in getattr(row, config.name_col)
            if found:
                features[row.Index][col] = 1
    return features