# Source code for texturizer.emoticons

# -*- coding: utf-8 -*-
import pandas as pd 
import numpy as np
import codecs
import re

from .process import load_word_list
from .process import load_word_pattern
from .process import remove_urls_and_tags
from .process import remove_escapes_and_non_printable

# Emoticon word lists, one per emotion category, loaded at import time via
# load_word_list. add_emoticon_features intersects the set of regex matches
# with each list to set the corresponding per-category flag.
smiles = load_word_list("emoticons-smile.dat")
laughs = load_word_list("emoticons-laugh.dat")
winks = load_word_list("emoticons-wink.dat")
# NOTE(review): this loads the same file as `winks` above, so the cheeky flag
# will presumably always equal the wink flag. Looks like a copy-paste slip --
# confirm whether a dedicated cheeky data file was intended.
cheekys = load_word_list("emoticons-wink.dat")
kisses = load_word_list("emoticons-kiss.dat")
happycrys = load_word_list("emoticons-happy-cry.dat")
crys = load_word_list("emoticons-cry.dat")
sads = load_word_list("emoticons-sad.dat")
shocks = load_word_list("emoticons-shock.dat")
sceptics = load_word_list("emoticons-sceptical.dat")
 
# Emoticon patterns, compiled once at import time.
#
# Raw string literals are used so the regex escapes (\| \[ \]) reach the regex
# engine as written; in a normal string literal "\[" and "\]" are invalid
# string escapes that Python only tolerates with a warning. The pattern text
# itself is unchanged.
#
# Forward emoticons, read left to right: eyes, an optional quote/tear mark,
# up to two nose characters, then one or two mouth characters -- e.g. :) ;-P :'(
# NOTE(review): inside the character class `\|` is an escaped pipe, so a
# literal backslash (as in ":\") is NOT matched -- confirm that is intended.
# NOTE(review): `8`, `B` and `X` are accepted as eyes, so plain text such as
# "8)" does match this pattern.
fwd_regex = r"[:;8BX]['’`]{0,1}[-=^oc]{0,2}[DPO0J3ox,Þþb@*\|/()<>{}\[\]]{1,2}"
fwd_re = re.compile(fwd_regex)
# Reversed emoticons: one or two mouth characters, up to two nose characters,
# an optional quote/tear mark, then `:` or `;` as the eyes -- e.g. (-:
bck_regex = r"[@*\|/()<>{}\[\]]{1,2}[-=^]{0,2}['’`]{0,1}[:;]"
bck_re = re.compile(bck_regex)

 
"""
    texturizer.emoticons: Emoticon Recognition Text Features
 
    The functions in this library will add columns to a dataframe that indicate
    whether there are emoticons in certain columns of text, and whether those
    emoticons represent one of the more common emotions.

    NOTE: In developing these regexes I have deliberately ignored certain emoticons
     because of the likelihood of false positive matches in text containing brackets.
     For example, the reversed emoticon (B will not be matched.
     (Review: fwd_regex currently accepts 8, B and X as eyes, so 8) IS matched.)

    To avoid matching characters inside document markup language tags, there is a
    rudimentary regex-based tag removal, and an unescaped version of the text that is
    expected to have been generated in the initial simple text function run by the program.
    This will remove URLs and HTML tags before trying to match emoticons.

    Some references used when considering which emoticons to include:

    https://www.unglobalpulse.org/2014/10/emoticon-use-in-arabic-spanish-and-english-tweets/

    https://www.researchgate.net/publication/266269913_From_Emoticon_to_Universal_Symbolic_Signs_Can_Written_Language_Survive_in_Cyberspace

    https://www.sciencedirect.com/science/article/abs/pii/S0950329317300939

    https://www.semanticscholar.org/paper/An-Approach-towards-Text-to-Emoticon-Conversion-and-Jha/3b81505fa7fec81563b2dafae3939fa1b07f3a98

    https://www.qualitative-research.net/index.php/fqs/article/view/175/391

    https://www.researchgate.net/publication/221622114_M_Textual_Affect_Sensing_for_Sociable_and_Expressive_Online_Communication

"""


########################################################################################
def add_text_emoticon_features(df, columns):
    """
        Add emoticon features for several text columns at once.

        Works on a copy of `df`, so the dataframe passed in is left untouched.
        Each name in `columns` is handed in turn to add_emoticon_features,
        and the copy carrying all of the added feature columns is returned.
    """
    result = df.copy()
    for column_name in columns:
        # Thread the growing frame through each call so features accumulate.
        result = add_emoticon_features(result, column_name)
    return result


########################################################################################
def add_emoticon_features(df, col):
    """
        Given a pandas dataframe and a column name.
        Check for emoticons in the column and add a set of features
        that indicate both the presence and emotional flavour of the emoticon.

        The dataframe is modified in place (the new columns are assigned onto
        `df`) and the same object is returned.

        Parameters:
            df:  pandas DataFrame containing the text column.
            col: name of the text column to scan.

        Returns:
            `df` with the columns named by get_emoticon_col_list(col) added:
            the emoticon count, one 0/1 flag per emotion category, the number
            of positive and of negative categories seen, and their difference.

        Values that are not strings (NaN, None, numbers, ...) are treated as
        containing no emoticons, so every feature for that row is 0.
    """
    # Word lists in the same order as the flag columns produced by
    # get_emoticon_col_list: the positive categories first, then the negative.
    positive_lists = (smiles, winks, kisses, happycrys, laughs, cheekys)
    negative_lists = (crys, sads, shocks, sceptics)

    def cal_features(value):
        emos = 0
        pos_flags = [0] * len(positive_lists)
        neg_flags = [0] * len(negative_lists)
        # Only real text is scanned; this also covers NaN / None / pd.NA,
        # which must never reach the text helpers or the regexes.
        if isinstance(value, str):
            text = remove_urls_and_tags(remove_escapes_and_non_printable(value))
            matches = fwd_re.findall(text) + bck_re.findall(text)
            emos = len(matches)
            if matches:
                # Build the set once and reuse it for every category test.
                found = set(matches)
                pos_flags = [int(not found.isdisjoint(words)) for words in positive_lists]
                neg_flags = [int(not found.isdisjoint(words)) for words in negative_lists]
        pos = sum(pos_flags)
        neg = sum(neg_flags)
        return [emos] + pos_flags + neg_flags + [pos, neg, pos - neg]

    names = get_emoticon_col_list(col)
    # Build the feature frame explicitly instead of using a row-wise
    # DataFrame.apply(..., result_type="expand"): for an empty frame that call
    # does not yield the feature columns, which breaks the assignment below.
    rows = [cal_features(value) for value in df[col]]
    features = pd.DataFrame(rows, columns=names, index=df.index, dtype="int64")
    df[names] = features

    return df

########################################################################################
def get_emoticon_col_list(col):
    """
        Given a column name, return the list of emoticon feature column names
        derived from it, in the order the feature values are produced:
        the count, the ten per-category flags, then the positive, negative
        and sentiment totals.
    """
    suffixes = (
        '_emoticons',
        '_emo_smiley',
        '_emo_wink',
        '_emo_kiss',
        '_emo_happycry',
        '_emo_laugh',
        '_emo_cheeky',
        '_emo_cry',
        '_emo_sad',
        '_emo_shock',
        '_emo_sceptic',
        '_emo_pos',
        '_emo_neg',
        '_emo_sentiment',
    )
    return [col + suffix for suffix in suffixes]