Source code for texturizer.texturizer

# -*- coding: utf-8 -*-
 
""" texturizer.texturizer: provides entry point main()."""
 
import pandas as pd
import sys
import os

from .process import load_complete_dataframe
from .process import process_file_in_chunks
from .process import initialise_profile
from .process import print_profiles
from .process import print_output

from .featurize import generate_feature_function

from .config import max_filesize
 
[docs]def main():
    """Main texturizer application entry point.
       parses out CL options and determine the size of the file.
       Then process the file for the requested features
    """
    if len(sys.argv) < 2:
        print("ERROR: MISSING ARGUMENTS")
        print_usage(sys.argv)
        exit(1)
    else:
        params = get_cmd_line_params(sys.argv)

        if not os.path.exists(params["dataset"]):
            print("ERROR: Dataset does not exist")
            print_usage(sys.argv)
            exit(1)

        initialise_profile()
        feature_func = generate_feature_function(params)

        filesize = os.stat(params["dataset"]).st_size
        if filesize<max_filesize:
            df = load_complete_dataframe( params["dataset"] )
            simple = feature_func(df)
            print_output( simple )
        else:
            process_file_in_chunks(params["dataset"], feature_func)

        print_profiles()


#############################################################
[docs]def get_cmd_line_params(argv):
    """ parse out the option from an array of command line arguments """
    data = argv[-1]
    options = argv[1:-1]
    result = {"dataset":data,
              "columns":[], 
              "profanity":False, 
              "sentiment":False, 
              "emoticons":False, 
              "topics":False, 
              "count_matches":False, 
              "traits":False, 
              "reason":False, 
              "rhetoric":False, 
              "pos":False, 
              "literacy":False, 
              "scarcity":False, 
              "comparison":False, 
              "embedding":False,
              "normalize_embedding":False,
              "normalize_topics":False,
    }
    for o in options:
        parts = o.split("=")
        if parts[0] == "-literacy":
            result["literacy"]=True
        if parts[0] == "-profanity":
            result["profanity"]=True
        if parts[0] == "-scarcity":
            result["scarcity"]=True
        if parts[0] == "-sentiment":
            result["sentiment"]=True
        if parts[0] == "-topics":
            result["topics"]=True
            if len(parts)>1:
                if parts[1] == 'count':
                    result["count_matches"]=True
                if parts[1] == 'normalize':
                    result["count_matches"]=True
                    result["normalize_topics"]=True
        if parts[0] == "-traits":
            result["traits"]=True
        if parts[0] == "-reason":
            result["reason"]=True
        if parts[0] == "-rhetoric":
            result["rhetoric"]=True
        if parts[0] == "-pos":
            result["pos"]=True
        if parts[0] == "-emoticons":
            result["emoticons"]=True
        if parts[0] == "-embedding":
            result["embedding"]=True
            if len(parts)>1:
                if parts[1] == 'normalize':
                    result["normalize_embedding"]=True
        if parts[0] == "-comparison":
            result["comparison"]=True
        if parts[0] == "-columns":
            cols = parts[1].split(",")
            result["columns"]=cols

    return result

#############################################################
[docs]def print_usage(args):
    """ Command line application usage instrutions. """
    print("USAGE ")
    print(args[0], " [ARGS] <PATH TO DATASET>")
    print("  <PATH TO DATASET> - Supported file types: csv, tsv, xls, xlsx, odf")
    print(" [ARGS] In most cases these are switches that turn on the feature type")
    print("  -columns=<COMMA SEPARATED LIST>. REQUIRED")
    print("  -topics            Default: False. Indicators for words from common topics.")
    print("  -topics=count                      Count matching words from common topics.")
    print("  -topics=normalize                  Count matching topic key words and normalize over topics.")
    print("  -traits            Default: False. Word usage for personality traits.")
    print("  -reason            Default: False. Word usage for reasoning: premises, conclusions.")
    print("  -rhetoric          Default: False. Word usage for rhetorical devices.")
    print("  -pos               Default: False. Part of speech proportions.")
    print("  -literacy          Default: False. Checks for simple literacy markers.")
    print("  -profanity         Default: False. Profanity check flags.")
    print("  -sentiment         Default: False. Words counts for positive and negative sentiment.")
    print("  -scarcity          Default: False. Word scarcity scores.")
    print("  -emoticons         Default: False. Emoticon check flags.")
    print("  -embedding         Default: False. Aggregate of Word Embedding Vectors.")
    print("  -embedding=normalize               Normalised Aggregate of Word Embedding Vectors.")
    print("  -comparison        Default: False. Cross-column comparisons using edit distances.")
    print("")