Source code for texturizer.comparison

import sys
import numpy as np
import jellyfish
import textdistance
import pandas as pd

#################################################################################
[docs]def add_comparison_features(df,columns):
    """
        This is the entry point to add all the core text similarity features.
        Note: We left out Ratcliff Obershelp from the set of metrics because it
        takes close to an order of magnitude longer to compute.

        Initial version just includes 4 string edit distance metrics. 
    """
    return add_string_match_features(df,columns)

#################################################################################
[docs]def add_string_match_features(df,columns):
    """
    Return a copy of a dataframe with features describing matching
    between the set of named text columns
    """
    def sm_features(x, col1, col2):
        if (x[col1] != x[col1]) or (x[col2] != x[col2]):
            jd = np.nan
            ld = np.nan
            ji = np.nan
            sd = np.nan
        else:
            raw_text1 = x[col1].lower()
            raw_text2 = x[col2].lower()
            jd = jellyfish.jaro_distance(raw_text1,raw_text2)
            ld = jellyfish.levenshtein_distance(raw_text1,raw_text2)
            ji = textdistance.jaccard(raw_text1,raw_text2)
            sd = textdistance.sorensen(raw_text1,raw_text2 )
        return jd, ld, ji, sd

    col_number = len(columns)
    for i in range( col_number-1 ):
        for j in range(i+1,col_number):
            col1 = columns[i]
            col2 = columns[j]
            prefix = col1 + "_vs_" + col2
            df[[prefix+'_jd', prefix+'_ld',prefix+'_ji',prefix+'_sd']] = df.apply(sm_features, col1=col1,col2=col2, axis=1, result_type="expand")
    return df


#################################################################################
[docs]def add_ratcliff_obershelp(df,columns):
    """
    Return a copy of a dataframe with features describing matching
    between the set of named text columns
    """
    def sm_features(x, col1, col2):
        if (x[col1] != x[col1]) or (x[col2] != x[col2]):
            ro = np.nan
        else:
            raw_text1 = x[col1].lower()
            raw_text2 = x[col2].lower()
            ro = textdistance.ratcliff_obershelp(raw_text1,raw_text2)
        return ro

    col_number = len(columns)
    for i in range( col_number-1 ):
        for j in range(i+1,col_number):
            col1 = columns[i]
            col2 = columns[j]
            prefix = col1 + "_vs_" + col2
            df[[prefix+'_rat_obers']] = df.apply(sm_features, col1=col1,col2=col2, axis=1, result_type="expand")
    return df