Source code for texturizer.simple

# -*- coding: utf-8 -*-
import pandas as pd 
import numpy as np
import math
import os
import re

from .process import remove_escapes_and_non_printable
from .process import remove_urls_and_tags
from .process import load_word_list

"""
    texturizer.simple: Basic text feature calculation.

    Calculate statistics such as the average length of words, the max word length,
    and the proportion of non stop-words. We also create a clean version of the text
    that can be used by other functions in the library.

    Stop-word list taken from: https://www.textfixer.com/tutorials/common-english-words.txt 

"""

stop_word_list = load_word_list("stop-words.dat") 

########################################################################################
def add_text_summary_features(df, columns):
    """
    Add the simple text summary features for several text columns.

    The input dataframe is copied first, so the caller's frame is left
    untouched; each column named in ``columns`` is then processed in turn
    by ``add_text_features``.

    :param df: pandas dataframe holding the text columns.
    :param columns: iterable of column names to summarise.
    :return: a copy of ``df`` extended with the feature columns.
    """
    enriched = df.copy()
    for column_name in columns:
        # add_text_features returns the frame it was given, now extended.
        enriched = add_text_features(enriched, column_name)
    return enriched
########################################################################################
# Characters counted towards the punctuation density feature.
_PUNCTUATION_CHARS = frozenset(['.', '!', '?', ':', ';', '-', ','])


def _simple_text_stats(value, stop_words):
    """
    Compute the simple summary statistics for a single text value.

    :param value: the cell content; a string, or NaN/None when missing.
    :param stop_words: set of lower-case stop words excluded from the
        content-word ratio.
    :return: tuple ``(word_count, sentence_count, line_count, avg_word_len,
        max_word_len, content_wd, capital_d, punct_d)``. Missing or empty
        values yield zeros throughout.
    """
    # NaN is the only value that compares unequal to itself.
    if value is None or value != value:
        return 0, 0, 0, 0, 0, 0, 0, 0

    chars = len(value)
    if chars == 0:
        # An empty string has nothing to count and would otherwise divide
        # by zero when computing the densities below.
        return 0, 0, 0, 0, 0, 0, 0, 0

    capitals = sum(1 for c in value if c.isupper())
    punct = sum(1 for c in value if c in _PUNCTUATION_CHARS)
    capital_d = capitals / chars
    punct_d = punct / chars

    lowered = value.lower()
    words = lowered.split()
    # Sentences are delimited by '.' and '?' only; empty fragments dropped.
    sentences = [part for part in re.split(r"[.?]", lowered) if part]
    lines = [part for part in re.split(r"[\r\n]+", lowered) if part]

    word_count = len(words)
    if word_count:
        word_lengths = [len(word) for word in words]
        max_word_len = max(word_lengths)
        avg_word_len = sum(word_lengths) / word_count
        # Distinct non-stop words relative to the total number of words.
        content_wd = len(set(words) - stop_words) / word_count
    else:
        # Whitespace-only text: no words, so the word statistics are zero
        # rather than an error from max() / division by zero.
        max_word_len = 0
        avg_word_len = 0
        content_wd = 0

    return (word_count, len(sentences), len(lines), avg_word_len,
            max_word_len, content_wd, capital_d, punct_d)


def add_text_features(df, col):
    """
    Given a pandas dataframe and a column name,
    calculate the simple text summary features and add them.

    The dataframe is modified in place and also returned. Added columns:

    * ``<col>_len`` -- length of the value, computed by ``null_tolerant_len``.
    * the columns named by ``get_simple_col_list(col)``, in order: word
      count, sentence count, line count, average word length, maximum word
      length, ratio of distinct non-stop words to words, density of
      upper-case characters, and density of punctuation characters.

    The statistics are computed on the raw column text. For these
    statistics, missing (NaN/None), empty, and whitespace-only values
    produce zeros for the affected features instead of raising.

    :param df: pandas dataframe containing ``col``.
    :param col: name of the text column to summarise.
    :return: ``df`` with the feature columns added.
    """
    df[col + "_len"] = df[col].apply(null_tolerant_len)

    # Build the stop-word set once per call instead of once per row.
    stop_words = set(stop_word_list)

    df[get_simple_col_list(col)] = df.apply(
        lambda row: _simple_text_stats(row[col], stop_words),
        axis=1,
        result_type="expand",
    )
    return df
########################################################################################
def get_simple_col_list(col):
    """
    Return the names of the simple feature columns derived from ``col``.

    Each name is ``col`` followed by a fixed suffix; the order matches the
    order in which the feature values are produced.

    :param col: name of the source text column.
    :return: list of eight derived column names.
    """
    suffixes = (
        '_wc',
        '_sc',
        '_lc',
        '_avg_wl',
        '_max_wl',
        '_cwd',
        '_caps',
        '_punc',
    )
    return [col + suffix for suffix in suffixes]
########################################################################################
def null_tolerant_len(x):
    """
    Return ``len(x)``, treating a missing value as length zero.

    :param x: a sized value (typically a string), or NaN/None when missing.
    :return: 0 for NaN or None, otherwise ``len(x)``.
    """
    # None is a common missing marker in object columns; NaN is detected by
    # the fact that it is the only value that compares unequal to itself.
    if x is None or x != x:
        return 0
    return len(x)
########################################################################################