# Source code for texturizer.simple

# -*- coding: utf-8 -*-
import pandas as pd 
import numpy as np
import math
import os
import re

from .process import remove_escapes_and_non_printable
from .process import remove_urls_and_tags
from .process import load_word_list

"""
    texturizer.simple: Basic text feature calculation.

    Calculate statistics such as the average word length, the maximum word
    length and the proportion of non-stop-words. We also create a clean version
    of the text that can be used by other functions in the library.

    Stop-word list taken from: https://www.textfixer.com/tutorials/common-english-words.txt 

"""

# Stop words used by add_text_features when computing the content-word ratio.
# Loaded once, when this module is imported.
# NOTE(review): presumably "stop-words.dat" is a data file shipped with the
# package -- confirm against load_word_list.
stop_word_list = load_word_list("stop-words.dat") 

########################################################################################
def add_text_summary_features(df, columns):
    """
        Add the simple text summary features for several columns.

        The input is copied once up front. The copy is then passed through
        add_text_features for every name in `columns`, in order, and the
        final result is returned.
    """
    enriched = df.copy()
    for name in columns:
        enriched = add_text_features(enriched, name)
    return enriched

########################################################################################
def add_text_features(df, col):
    """
        Given a pandas dataframe and a column name.
        calculate the simple text summary features and add them.

        The dataframe is modified in place and is also returned. The columns
        added are `<col>_len` (length of the value) plus the columns named by
        get_simple_col_list(col): word count, sentence count, line count,
        average word length, maximum word length, content-word ratio
        (distinct non stop-words divided by the total number of words),
        capital-letter density and punctuation density.

        Values whose length is zero according to null_tolerant_len (missing
        values and empty strings) get zero for every feature. Text without
        any words (whitespace only) gets zero for the word-based features.
        Neither case divides by zero any more.
    """
    df[col + "_len"] = df[col].apply(null_tolerant_len)

    # Build the stop-word set once instead of once per row.
    stop_words = set(stop_word_list)

    def cal_features(row):
        return _simple_text_features(row[col], stop_words)

    df[get_simple_col_list(col)] = df.apply(cal_features, axis=1, result_type="expand")

    return df


def _simple_text_features(text, stop_words):
    """
        Compute the per-row feature tuple for a single value.

        Returns (word_count, sentence_count, line_count, avg_word_len,
        max_word_len, content_wd, capital_d, punct_d).
    """
    chars = null_tolerant_len(text)
    if chars == 0:
        # Missing value or empty string: there is nothing to measure and
        # every ratio below would divide by zero.
        return 0, 0, 0, 0, 0, 0, 0, 0

    punctuation = ('.', '!', '?', ':', ';', '-', ',')
    capital_d = sum(1 for c in text if c.isupper()) / chars
    punct_d = sum(1 for c in text if c in punctuation) / chars

    lowered = text.lower()
    words = lowered.split()

    # NOTE: only '.' and '?' end a sentence here ('!' does not), and a
    # fragment made of whitespace still counts. This is kept unchanged so
    # that existing feature values stay the same.
    sentence_count = sum(1 for part in re.split(r"[.?]", lowered) if part)
    line_count = sum(1 for part in re.split(r"[\r\n]+", lowered) if part)

    word_count = len(words)
    if word_count == 0:
        # Whitespace-only text: max() of an empty list would raise and the
        # word-based ratios would divide by zero.
        return 0, sentence_count, line_count, 0, 0, 0, capital_d, punct_d

    word_lengths = [len(word) for word in words]
    max_word_len = max(word_lengths)
    avg_word_len = sum(word_lengths) / word_count
    # Distinct non stop-words relative to the total number of words.
    content_wd = len(set(words) - stop_words) / word_count

    return (word_count, sentence_count, line_count, avg_word_len,
            max_word_len, content_wd, capital_d, punct_d)

########################################################################################
def get_simple_col_list(col):
    """
        Return the names of the simple feature columns derived from `col`.

        Each name is `col` followed by one fixed suffix; the order of the
        list is the order of the suffixes below.
    """
    suffixes = ('_wc', '_sc', '_lc', '_avg_wl', '_max_wl', '_cwd', '_caps', '_punc')
    return [col + suffix for suffix in suffixes]

########################################################################################
def null_tolerant_len(x):
    """
        Return len(x), or 0 when x is a missing value.

        A value counts as missing when it is None or when it is not equal to
        itself (which is how a float NaN behaves).
    """
    # `None != None` is False, so None needs its own check; otherwise it
    # would fall through to len(None) and raise TypeError.
    if x is None or x != x:
        return 0
    return len(x)

 
########################################################################################