# -*- coding: utf-8 -*-
import pkg_resources
import pandas as pd
import numpy as np
import math
import os
import re
from .process import load_word_list
from .process import load_word_pattern
"""
texturizer.literacy: Literacy feature flags
Simple word matching to generate features for common literacy problems.
This includes typos or spelling mistakes and some simple grammar problems,
for example, not capitalizing the first word of a sentence.
"""
########################################################################################
# Regex source for misspellings, built by load_word_pattern (imported from
# .process) from the 'misspelling.dat' resource.
# NOTE(review): the exact shape of the returned pattern is not visible here;
# it is matched against lower-cased text below, so presumably it is lower case.
misspelling_pat = load_word_pattern('misspelling.dat')
# Compiled once at import time and reused for every row that is scanned.
misspelling_re = re.compile(misspelling_pat)
# Simple grammar-problem detector made of three alternatives:
#   1. "a" directly followed by a word starting with a vowel ("a apple"),
#      when preceded by a space, quote or opening parenthesis;
#   2. a sentence that starts with a lower-case letter after ". ";
#   3. the same word repeated twice in a row ("the the").
# The pattern MUST be a raw string: in a normal string literal "\b" is the
# backspace character and "\1" is chr(1), which silently disables the
# repeated-word alternative (and "\." / "\w" are invalid string escapes).
grammar_pat = r"[ '(\"][aA] [AaEeIiOoUu]|[^.][^A-Z]\. [a-z]|\b(\w+)\b \b\1\b"
grammar_re = re.compile(grammar_pat)
########################################################################################
def add_text_literacy_features(df, columns):
    """
    Add the simple literacy features for several text columns.

    The input dataframe is copied first, so the caller's dataframe is not
    modified. ``add_literacy_features`` is then applied once per column,
    each call adding ``<col>_misspelling`` and ``<col>_grammar_err``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the text columns.
    columns : str or iterable of str
        Name(s) of the columns to process. A single string is treated as
        one column name.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the feature columns added.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    rez = df.copy()
    for col in columns:
        rez = add_literacy_features(rez, col)
    return rez
########################################################################################
def add_literacy_features(df, col):
    """
    Add simple text-match literacy features for one column.

    Two integer columns are added to ``df`` in place:

    * ``<col>_misspelling`` -- number of ``misspelling_re`` matches in the
      lower-cased text;
    * ``<col>_grammar_err`` -- number of ``grammar_re`` matches in the text
      with its original case (the pattern is case sensitive).

    Values that are not strings (NaN, ``None``, numbers, ...) yield zero for
    both counts instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to extend; it is modified in place.
    col : str
        Name of the text column to analyse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two feature columns added.
    """
    def lit_features(value):
        # Only real strings can be scanned; anything else has no errors.
        if not isinstance(value, str):
            return 0, 0
        grammar_err = len(grammar_re.findall(value))
        misspelling = len(misspelling_re.findall(value.lower()))
        return misspelling, grammar_err

    # Iterate the column directly rather than applying row-wise over the
    # whole frame: this also works when the dataframe has no rows.
    counts = [lit_features(value) for value in df[col]]
    # Plain arrays are assigned positionally, so no index alignment occurs,
    # and the explicit dtype keeps the columns integer even when empty.
    df[col + '_misspelling'] = np.array([m for m, _ in counts], dtype=np.int64)
    df[col + '_grammar_err'] = np.array([g for _, g in counts], dtype=np.int64)
    return df