# Source code for texturizer.traits

# -*- coding: utf-8 -*-
import pkg_resources
import pandas as pd 
import numpy as np
import math
import os
import re

from .process import load_word_pattern
 
"""
    texturizer.traits: Personality trait feature flags

    This module performs word or phrase matching to generate features 
    that can be indicative of personality traits in a writer or speaker.

    Some ideas taken from these articles

    https://www.scientificamerican.com/article/you-are-what-you-say/

    https://hbr.org/2011/12/your-use-of-pronouns-reveals-your-personality
"""

########################################################################################

# Keyword patterns obtained from load_word_pattern for each trait data file.
# Each pattern is also compiled once at import time; note the compiled
# objects use the default flags (case-sensitive).
reasoning_pat = load_word_pattern('reasoning.dat')
reasoning_re = re.compile(reasoning_pat)

nuance_pat = load_word_pattern('nuance.dat')
nuance_re = re.compile(nuance_pat)

explain_pat = load_word_pattern('explain.dat')
explain_re = re.compile(explain_pat)

# First-person singular pronouns, each anchored on word boundaries.
singular_pat = (
    r"\bi\b"
    r"|\bme\b"
    r"|\bmyself\b"
    r"|\bmy\b"
    r"|\bmine\b"
)
singular_re = re.compile(singular_pat)

# First-person plural pronouns, each anchored on word boundaries.
plural_pat = (
    r"\bwe\b"
    r"|\bus\b"
    r"|\bour\b"
    r"|\bourselves\b"
)
plural_re = re.compile(plural_pat)

# A double-quoted run of letters, digits, spaces and basic punctuation
# (the run may be empty, so `""` also matches).
quotation_pat = '"[ a-zA-Z0-9.,?!:;\']*"'

########################################################################################
def add_text_trait_features(df, columns):
    """
        Add personality trait features for several text columns.

        Given a pandas dataframe and an iterable of column names, run
        add_trait_counts for every named column and return the result.
        The work is done on a copy obtained from df.copy(), so the new
        feature columns are added to the copy rather than to the
        dataframe passed in.
    """
    result = df.copy()
    for column_name in columns:
        # Each call appends the trait count columns for one text column.
        result = add_trait_counts(result, column_name)
    return result
########################################################################################
def add_trait_counts(df, col):
    """
        Count the keyword matches for each trait in one text column.

        Given a pandas dataframe and a column name, add one new column per
        trait, named by appending a suffix to the column name, holding the
        number of case-insensitive pattern matches found in each row.

        The dataframe passed in is modified in place and also returned.
    """
    # (column suffix, pattern) pairs, in the order the columns are added.
    trait_patterns = (
        ('_reason', reasoning_pat),
        ('_explain', explain_pat),
        ('_nuance', nuance_pat),
        ('_singular', singular_pat),
        ('_plural', plural_pat),
        ('_quotations', quotation_pat),
    )
    for suffix, pattern in trait_patterns:
        df[col + suffix] = df[col].str.count(pattern, flags=re.IGNORECASE)
    return df