# Source code for interlinking.pre_process
import csv
from collections import Counter, defaultdict
import itertools
import os
import re
from interlinking import config, helpers
def extract_freqterms(fname, encoding):
    """Extract frequent terms found in the `fname` dataset and save them to a file under
    :data:`~interlinking.config.default_data_path`.

    Each row of the dataset contributes the two strings stored in the columns
    ``config.use_cols['s1']`` and ``config.use_cols['s2']``. Both strings are passed through
    :func:`helpers.transform` and :func:`helpers.normalize_str`; every resulting token that is at
    least 3 characters long and starts with an ASCII letter is counted under its stemmed form.

    For every statistics key (currently only ``gram_token``) a tab-separated file named
    ``<key>s_<encoding>.csv`` is written, with a ``gram<TAB>count`` header followed by one line per
    original term. Stems are emitted from most to least frequent, and each term carries the total
    count of the stem it maps to. Terms sharing a stem are written in sorted order so that the
    output is reproducible across runs.

    :param fname: File name of the dataset, relative to :data:`~interlinking.config.default_data_path`.
    :type fname: str
    :param encoding: Encoding label of the input dataset (*global* | *latin*). In this function it
        is only used to build the name of the output file; it is not passed to :func:`open`.
    :type encoding: str
    """
    # `match` anchors at the start of the string, so this accepts terms beginning with a letter.
    pattern = re.compile("^[a-zA-Z]+")

    # NOTE: only stemmed-token frequencies are collected. Character n-gram (2/3/4) and token
    # n-gram (2/3) statistics used to be sketched here as commented-out code; to reinstate them,
    # add the corresponding Counter entries below and they will be written out by the final loop.
    ngram_stats = {
        'gram_token': Counter(),
    }
    # stem -> set of surface terms that were reduced to that stem
    dstemmed = defaultdict(set)

    # newline='' hands newline handling to the csv module, as its documentation requires;
    # otherwise newlines embedded in quoted fields may be misinterpreted.
    with open(os.path.join(config.default_data_path, fname), newline='') as csv_file:
        reader = csv.DictReader(csv_file, fieldnames=config.fieldnames, delimiter=config.delimiter)
        for row in reader:
            a, b = helpers.transform(row[config.use_cols['s1']], row[config.use_cols['s2']], canonical=True)
            for s in (a, b):
                ngram_tokens, ngram_tokens_stemmed, _ = helpers.normalize_str(s)
                # assumes tokens and stemmed tokens are position-aligned -- TODO confirm in helpers
                for term, stem in zip(ngram_tokens, ngram_tokens_stemmed):
                    if len(term) < 3 or not pattern.match(term):
                        continue
                    ngram_stats['gram_token'][stem] += 1
                    dstemmed[stem].add(term)

    for gram, counter in ngram_stats.items():
        out_path = os.path.join(config.default_data_path, "{0}s_{1}.csv".format(gram, encoding))
        with open(out_path, "w+") as f:
            f.write('gram\tcount\n')
            for value, count in counter.most_common():
                # Sets have no stable iteration order for strings; sort for deterministic output.
                for t in sorted(dstemmed[value]):
                    f.write("{}\t{}\n".format(t, count))