# Source code for texturizer.traits

# -*- coding: utf-8 -*-
import pkg_resources
import pandas as pd 
import numpy as np
import math
import os
import re

from .process import load_word_pattern
 
"""
    texturizer.traits: Personality trait feature flags

    This module performs word or phrase matching to generate features 
    that can be indicative of personality traits in a writer or speaker.

    Some ideas taken from these articles

    https://www.scientificamerican.com/article/you-are-what-you-say/

    https://hbr.org/2011/12/your-use-of-pronouns-reveals-your-personality
"""

########################################################################################

# Pattern strings for the word-list based traits. load_word_pattern comes from
# .process; its return value is used below as a regular expression.
reasoning_pat = load_word_pattern('reasoning.dat')
nuance_pat = load_word_pattern('nuance.dat')
explain_pat = load_word_pattern('explain.dat')

# First-person pronouns, each alternative anchored on word boundaries.
singular_pat = r"\bi\b|\bme\b|\bmyself\b|\bmy\b|\bmine\b"
plural_pat = r"\bwe\b|\bus\b|\bour\b|\bourselves\b"

# A double-quoted run made only of spaces, ASCII letters, digits and the
# punctuation characters listed in the class (an empty "" also matches).
quotation_pat = "\"[ a-zA-Z0-9.,?!:;']*\""

# Pre-compiled forms of the patterns above. They are compiled without flags,
# so unlike the str.count calls in add_trait_counts they are case-sensitive.
reasoning_re = re.compile(reasoning_pat)
nuance_re = re.compile(nuance_pat)
explain_re = re.compile(explain_pat)
singular_re = re.compile(singular_pat)
plural_re = re.compile(plural_pat)

########################################################################################
def add_text_trait_features(df, columns):
    """
        Calculate the personality trait features for several text columns.

        df: the pandas dataframe holding the text columns.
        columns: an iterable of column names to process.

        Returns a copy of df to which add_trait_counts has been applied
        once per entry in columns; the dataframe passed in is left untouched.
    """
    # Work on a copy because add_trait_counts assigns new columns in place.
    rez = df.copy()
    for col in columns:
        rez = add_trait_counts(rez, col)
    return rez

########################################################################################
def add_trait_counts(df, col):
    """
        Count the number of keyword matches for each trait in one column.

        df: the pandas dataframe holding the text column.
        col: the name of the text column to scan.

        For every trait a new column named col + '_' + trait is assigned,
        holding the per-row count of case-insensitive pattern matches.
        The columns are added to df itself (no copy is made) and the same
        dataframe is returned.
    """
    # (column suffix, pattern) pairs, listed in the order the feature
    # columns are appended to the dataframe.
    trait_patterns = (
        ('_reason', reasoning_pat),
        ('_explain', explain_pat),
        ('_nuance', nuance_pat),
        ('_singular', singular_pat),
        ('_plural', plural_pat),
        ('_quotations', quotation_pat),
    )
    for suffix, pattern in trait_patterns:
        df[col + suffix] = df[col].str.count(pattern, flags=re.IGNORECASE)
    return df