Source code for texturizer.sentiment

# -*- coding: utf-8 -*-
from textblob import TextBlob
import numpy as np 
import re

from .process import load_word_pattern

"""
    texturizer.sentiment: Sentiment feature flags

    We use multiple sets of words lists for different sentiment features


    Hu and Liu

    This is a large list of positive and negative words. Taken from:
    http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

    Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews."
       Proceedings of the ACM SIGKDD International Conference on Knowledge
       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle,
       Washington, USA,

    AFINN

    Smaller list created by extracting extreme sentiment words from the AFINN
    lists. Extracted from:
    http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html 

    This derived database of words is copyright protected and distributed under
    "Open Database License (ODbL) v1.0"
    http://www.opendatacommons.org/licenses/odbl/1.0/ 


    TODO

    Investigate how to use (license and processing requirements).

    http://www.wjh.harvard.edu/~inquirer/homecat.htm
"""

########################################################################################

# Word-match patterns, loaded once at import time.
# The "-words" files are presumably the large Hu and Liu lists described in the
# notes in this module -- confirm against the bundled data files.
positive_pat_large, negative_pat_large = (
    load_word_pattern('positive-words.dat'),
    load_word_pattern('negative-words.dat'),
)

# Smaller lists (presumably the AFINN-derived extreme-sentiment words -- confirm).
# These two are the patterns used by add_sentiment_features.
positive_pat, negative_pat = (
    load_word_pattern('positive.dat'),
    load_word_pattern('negative.dat'),
)

########################################################################################
def add_text_sentiment_features(df, columns):
    """
        Given a pandas dataframe and a set of column names.
        Return a copy of the dataframe with the sentiment features added
        for every named column.

        The input dataframe is copied first, so the caller's object is not
        the one that receives the new columns.
    """
    enriched = df.copy()
    for name in columns:
        # Word-list match features first, then the TextBlob scores.
        enriched = add_textblob_features(
            add_sentiment_features(enriched, name),
            name,
        )
    return enriched
########################################################################################
def add_textblob_features(df, col):
    """
        Given a pandas dataframe and a column name.
        add the TextBlob polarity and subjectivity of the text as features.

        Two columns are written into `df` in place: `<col>_tb_polarity` and
        `<col>_tb_subjectivity`. The same dataframe object is returned.

        Cells that do not hold a string (NaN, None, numbers, ...) are scored
        as 0.0 for both features rather than being handed to TextBlob.
    """
    def tb_features(text):
        # Only real text can be analysed; anything else is treated as neutral.
        if not isinstance(text, str):
            return 0.0, 0.0
        sentiment = TextBlob(text).sentiment
        return sentiment.polarity, sentiment.subjectivity

    # Score only the target column. Building plain lists keeps the assignment
    # positional, so it also works for an empty dataframe.
    scores = [tb_features(value) for value in df[col]]
    df[col + '_tb_polarity'] = [polarity for polarity, _ in scores]
    df[col + '_tb_subjectivity'] = [subjectivity for _, subjectivity in scores]
    return df
########################################################################################
def add_sentiment_features(df, col):
    """
        Given a pandas dataframe and a column name.
        add simple text match features for sentiment.

        Three columns are written into `df` in place and the same dataframe
        is returned:
          `<col>_positive`  - number of matches of the positive word pattern
          `<col>_negative`  - number of matches of the negative word pattern
          `<col>_sentiment` - (positive - negative) / word count

        The word count is read from the `<col>_wc` column, which must already
        exist. A KeyError is raised before anything is written if it does not.
        Rows whose word count is zero or missing get a sentiment of 0.0.
    """
    wc_col = col + '_wc'
    # Fail before touching the dataframe so a missing word count column does
    # not leave the caller with a half-updated frame.
    if wc_col not in df.columns:
        raise KeyError(
            "column '%s' is required; compute the word count feature first" % wc_col
        )

    text = df[col]
    # Matching is case-insensitive; cells with no string value count as zero.
    positive = text.str.count(positive_pat, flags=re.IGNORECASE).fillna(0)
    negative = text.str.count(negative_pat, flags=re.IGNORECASE).fillna(0)

    df[col + '_positive'] = positive
    df[col + '_negative'] = negative

    # Mask zero word counts so the division yields NaN instead of 0/0 or inf,
    # then report those rows as neutral.
    word_count = df[wc_col]
    usable_count = word_count.where(word_count != 0)
    df[col + '_sentiment'] = ((positive - negative) / usable_count).fillna(0.0)
    return df