# -*- coding: utf-8 -*-
import pkg_resources
import pandas as pd
import numpy as np
import math
import os
import re
from .process import load_word_list
from .process import load_word_pattern
"""
texturizer.topics: Common Topic Features
Simple pattern matching to generate features for very common topics for text data.
We have focused on topics that are common in both traditional and social media,
as well as commentary and discussion by the general public.
The goal with these features is to provide a count of words which are
unambiguously related to a specific topic.
NOTE: The words in these sets have been deliberately selected so that they
are less likely to produce false positives. In other words, they are
generally only used when talking about that specific topic, or when talking
using metaphor or analogy.
"""
########################################################################################
# Topic patterns.
#
# For every topic two module-level names are defined:
#   <topic>_pat : the regex pattern string returned by load_word_pattern()
#   <topic>_re  : the same pattern compiled with re.compile()
#
# load_word_pattern() is given a word-list file name and, for most topics, an
# extra pattern fragment (a word stem followed by '|').
# NOTE(review): presumably the fragment is prepended to an alternation built
# from the word list -- confirm against .process.load_word_pattern.
religion_pat = load_word_pattern('religion.dat', "\\bchristi|\\bislam|")
religion_re = re.compile(religion_pat)
politics_pat = load_word_pattern('politics.dat', "\\bpoliti|")
politics_re = re.compile(politics_pat)
sex_pat = load_word_pattern('sex.dat', "\\bsex[^t]|")
sex_re = re.compile(sex_pat)
ethno_pat = load_word_pattern('ethnicity.dat', "\\bethn|")
ethno_re = re.compile(ethno_pat)
health_pat = load_word_pattern('health.dat', "\\bhealth|")
health_re = re.compile(health_pat)
econo_pat = load_word_pattern('economics.dat', "\\becono|\\bfinan|")
econo_re = re.compile(econo_pat)
sport_pat = load_word_pattern('sports.dat', "\\bathlet|")
sport_re = re.compile(sport_pat)
arts_pat = load_word_pattern('arts.dat', "\\bartist|")
arts_re = re.compile(arts_pat)
family_pat = load_word_pattern('family.dat', "\\bfamil|")
family_re = re.compile(family_pat)
love_pat = load_word_pattern('love.dat', "\\bromanc|")
love_re = re.compile(love_pat)
crime_pat = load_word_pattern('crime.dat', "\\bcrimina|")
crime_re = re.compile(crime_pat)
travel_pat = load_word_pattern('travel.dat', "\\btravel|")
travel_re = re.compile(travel_pat)
# The food topic is loaded without an extra pattern fragment.
food_pat = load_word_pattern('food.dat')
food_re = re.compile(food_pat)
# The remaining topics previously had only a *_pat string; the compiled form
# is added so that every topic exposes the same pair of names.
technology_pat = load_word_pattern('technology.dat', "\\btechnol[^t]|")
technology_re = re.compile(technology_pat)
fashion_pat = load_word_pattern('fashion.dat', "\\bfashion[^i]|")
fashion_re = re.compile(fashion_pat)
culture_pat = load_word_pattern('culture.dat', "\\bcultur|")
culture_re = re.compile(culture_pat)
education_pat = load_word_pattern('education.dat', "\\beducat|")
education_re = re.compile(education_pat)
science_pat = load_word_pattern('science.dat', "\\bscientifi|")
science_re = re.compile(science_pat)
########################################################################################
def add_text_topics_features(df, columns, type="flag"):
    """
    Add topic features for each of the named text columns.

    The input dataframe is copied first; the copy is passed through the
    per-column helper for every entry in ``columns`` and then returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the text columns.
    columns : iterable of str
        Names of the columns to generate topic features for.
    type : str, default "flag"
        Which kind of feature to generate:

        * ``"count"``     -- ``add_topic_counts(rez, col)``
        * ``"normalize"`` -- ``add_topic_counts(rez, col, normalize=True)``
        * anything else   -- ``add_topic_indicators(rez, col)``

    Returns
    -------
    pandas.DataFrame
        The copy of ``df`` with the topic columns added.
    """
    rez = df.copy()
    for col in columns:
        # The branches must be mutually exclusive. With two independent
        # ``if`` statements, ``type == "count"`` fell through to the ``else``
        # of the second test and the counts were overwritten by indicators.
        if type == "count":
            rez = add_topic_counts(rez, col)
        elif type == "normalize":
            rez = add_topic_counts(rez, col, normalize=True)
        else:
            rez = add_topic_indicators(rez, col)
    return rez
########################################################################################
def add_topic_indicators(df, col):
    """
    Add one 0/1 indicator column per topic for the text column ``col``.

    For every topic a column named ``col + '_<topic>'`` is written to ``df``.
    It holds 1 for rows whose text matches the topic pattern and 0 for all
    other rows, including rows where the text is missing.

    Matching ignores case, in line with ``add_topic_counts``, so that a row
    with a non-zero topic count also gets a topic flag of 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to add the indicator columns to. It is modified in place.
    col : str
        Name of the text column to search.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added.
    """
    # (column suffix, pattern) pairs, in the order the columns are created.
    topic_patterns = (
        ('_religion', religion_pat),
        ('_politics', politics_pat),
        ('_sex', sex_pat),
        ('_ethnicity', ethno_pat),
        ('_economics', econo_pat),
        ('_health', health_pat),
        ('_sport', sport_pat),
        ('_arts', arts_pat),
        ('_family', family_pat),
        ('_love', love_pat),
        ('_crime', crime_pat),
        ('_travel', travel_pat),
        ('_food', food_pat),
        ('_technology', technology_pat),
        ('_fashion', fashion_pat),
        ('_culture', culture_pat),
        ('_education', education_pat),
        ('_science', science_pat),
    )
    text = df[col]
    for suffix, pattern in topic_patterns:
        # na=False makes missing text count as "no match", which replaces the
        # separate notnull() mask; IGNORECASE matches the counting function.
        matched = text.str.contains(pattern, flags=re.IGNORECASE, na=False)
        # Initialise with 0 and then set the matches, so the column keeps an
        # integer dtype.
        df[col + suffix] = 0
        df.loc[matched, col + suffix] = 1
    return df
########################################################################################
def add_topic_counts(df, col, normalize=False):
    """
    Add one keyword-count column per topic for the text column ``col``.

    For every topic a column named ``col + '_<topic>'`` is written to ``df``
    holding the number of case-insensitive matches of the topic pattern in
    the text of each row.

    When ``normalize`` is true, each topic count is then divided by the sum
    of all topic counts for that row plus one. Because of the added one the
    denominator is never zero, and the normalized values of a row add up to
    less than 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to add the count columns to. It is modified in place.
    col : str
        Name of the text column to search.
    normalize : bool, default False
        Whether to divide the counts by the row total plus one.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the topic columns added.
    """
    # (column suffix, pattern) pairs, in the order the columns are created.
    topic_patterns = (
        ('_religion', religion_pat),
        ('_politics', politics_pat),
        ('_sex', sex_pat),
        ('_ethnicity', ethno_pat),
        ('_economics', econo_pat),
        ('_health', health_pat),
        ('_sport', sport_pat),
        ('_arts', arts_pat),
        ('_family', family_pat),
        ('_love', love_pat),
        ('_crime', crime_pat),
        ('_travel', travel_pat),
        ('_food', food_pat),
        ('_technology', technology_pat),
        ('_fashion', fashion_pat),
        ('_culture', culture_pat),
        ('_education', education_pat),
        ('_science', science_pat),
    )

    topic_columns = []
    for suffix, pattern in topic_patterns:
        name = col + suffix
        df[name] = df[col].str.count(pattern, flags=re.IGNORECASE)
        topic_columns.append(name)

    if normalize:
        # Accumulate the row totals left to right over the topic columns and
        # add one at the end.
        totals = df[topic_columns[0]]
        for name in topic_columns[1:]:
            totals = totals + df[name]
        totals = totals + 1
        for name in topic_columns:
            df[name] = df[name] / totals

    return df