# Source code for texturizer.pos

# -*- coding: utf-8 -*-

import spacy
from .process import eprint

# Load the spaCy English pipeline once at import time so that every call to the
# POS feature functions reuses the same model.
try:
    nlp = spacy.load("en_core_web_sm")
except Exception:
    # Best effort: a missing or broken model must not stop the package from
    # importing, so only a warning is emitted. `nlp` is left undefined in that
    # case, which means the POS functions fail as soon as they need the model.
    # `Exception` (rather than a bare except) lets KeyboardInterrupt and
    # SystemExit propagate instead of being reported as a missing model.
    eprint(" * WARNING: POS features require the SpaCY language model : en_core_web_sm")

"""
    texturizer.pos Part of Speech features using spaCy

    Extraction of part of speech tags for a block of text and then generation
    of numerical features that summarise the grammatical structure of that text.

    Notes: I have implemented this inside a single 'apply' function which gives 
    some speed advantage. However, it is still slow. I am still looking for ways
    to speed this up.
"""

########################################################################################
def add_text_pos_features(df, columns):
    """
    Calculate the part of speech features for several text columns.

    The features are added to a copy of the supplied pandas dataframe: each
    name in `columns` is handed in turn to add_pos_features, and the
    resulting dataframe is returned.
    """
    result = df.copy()
    for column_name in columns:
        result = add_pos_features(result, column_name)
    return result
########################################################################################
def add_pos_features(df, col):
    """
    Given a pandas dataframe and a column name, add features for the
    proportion of the dominant parts of speech in that column's text:
    Nouns, Verbs, Adjectives, Adverbs, Pronouns and Adpositions.

    Six columns are written to `df` itself (the dataframe is modified in
    place): col+'_nouns', col+'_verbs', col+'_adj', col+'_adv', col+'_pron'
    and col+'_adp'. Each value is the number of tokens carrying that tag
    divided by the total number of tokens in the text.

    Rows whose text is missing (None or NaN), is an empty string, or yields
    no tokens get 0.0 for every feature instead of raising.

    Returns the same dataframe object that was passed in.
    """
    tags = ("NOUN", "VERB", "ADJ", "ADV", "PRON", "ADP")
    suffixes = ("_nouns", "_verbs", "_adj", "_adv", "_pron", "_adp")

    def pos_features(text):
        # Proportions for one piece of text, in the same order as `tags`.
        # NaN is the only value that is not equal to itself, hence `text != text`.
        if text is None or text != text or text == "":
            return tuple(0.0 for _ in tags)

        counts = dict.fromkeys(tags, 0)
        total = 0
        for token in nlp(text):
            total += 1
            if token.pos_ in counts:
                counts[token.pos_] += 1

        # A document without tokens would otherwise divide by zero.
        if total == 0:
            return tuple(0.0 for _ in tags)
        return tuple(counts[tag] / total for tag in tags)

    # Apply over the single text column rather than over whole rows: this
    # avoids building a Series per row and also works for a dataframe with
    # no rows, where a row-wise expanding apply returns the wrong shape.
    rows = df[col].apply(pos_features).tolist()
    for position, suffix in enumerate(suffixes):
        df[col + suffix] = [row[position] for row in rows]
    return df