Source code for texturizer.comparison

import sys
import numpy as np
import jellyfish
import textdistance
import pandas as pd

#################################################################################
def add_comparison_features(df, columns):
    """
    Entry point that adds the core text similarity features.

    Delegates to ``add_string_match_features`` and returns whatever it
    returns.

    Note: Ratcliff Obershelp is left out of this set of metrics because it
    takes close to an order of magnitude longer to compute. The initial
    version just includes 4 string matching metrics.

    :param df: dataframe holding the text columns to compare.
    :param columns: names of the text columns to compare.
    :return: the result of ``add_string_match_features(df, columns)``.
    """
    with_features = add_string_match_features(df, columns)
    return with_features
#################################################################################
def _string_match_scores(text1, text2):
    """Return the ``(jd, ld, ji, sd)`` scores for one pair of cell values."""
    # pd.isna covers None as well as NaN; the previous ``x != x`` test only
    # caught NaN, so a None cell went on to fail on ``.lower()``.
    if pd.isna(text1) or pd.isna(text2):
        return (np.nan, np.nan, np.nan, np.nan)
    lowered1 = text1.lower()
    lowered2 = text2.lower()
    # Newer jellyfish releases expose this metric as ``jaro_similarity``;
    # fall back to the older ``jaro_distance`` name when it is absent.
    jaro = getattr(jellyfish, "jaro_similarity", None)
    if jaro is None:
        jaro = jellyfish.jaro_distance
    return (
        jaro(lowered1, lowered2),
        jellyfish.levenshtein_distance(lowered1, lowered2),
        textdistance.jaccard(lowered1, lowered2),
        textdistance.sorensen(lowered1, lowered2),
    )


def add_string_match_features(df, columns):
    """
    Add string matching features for every pair of the named text columns.

    For each pair ``(col1, col2)`` taken in the order given by ``columns``,
    four float columns are added, named ``<col1>_vs_<col2>`` plus a suffix:

    * ``_jd`` -- Jaro similarity
    * ``_ld`` -- Levenshtein distance
    * ``_ji`` -- Jaccard index
    * ``_sd`` -- Sorensen coefficient

    Text is lower-cased before comparison. A row where either value is
    missing (NaN or None) gets NaN for all four features.

    Note: the columns are added to ``df`` itself (no copy is made) and that
    same object is returned.

    :param df: dataframe holding the text columns.
    :param columns: names of the text columns to compare pairwise.
    :return: ``df`` with the feature columns added.
    """
    from itertools import combinations

    suffixes = ("_jd", "_ld", "_ji", "_sd")
    for col1, col2 in combinations(columns, 2):
        prefix = col1 + "_vs_" + col2
        rows = [
            _string_match_scores(value1, value2)
            for value1, value2 in zip(df[col1], df[col2])
        ]
        # Build the result as a float matrix rather than via
        # ``df.apply(..., result_type="expand")``: on an empty frame apply
        # returns a copy of the frame, which cannot be assigned to the four
        # feature columns. The reshape keeps the (0, 4) shape in that case.
        scores = np.array(rows, dtype=float).reshape(len(rows), len(suffixes))
        for position, suffix in enumerate(suffixes):
            df[prefix + suffix] = scores[:, position]
    return df
#################################################################################
def _ratcliff_obershelp_score(text1, text2):
    """Return the Ratcliff Obershelp score for one pair of cell values."""
    # pd.isna covers None as well as NaN; the previous ``x != x`` test only
    # caught NaN, so a None cell went on to fail on ``.lower()``.
    if pd.isna(text1) or pd.isna(text2):
        return np.nan
    return textdistance.ratcliff_obershelp(text1.lower(), text2.lower())


def add_ratcliff_obershelp(df, columns):
    """
    Add a Ratcliff Obershelp feature for every pair of the named text columns.

    For each pair ``(col1, col2)`` taken in the order given by ``columns``,
    one float column named ``<col1>_vs_<col2>_rat_obers`` is added. Text is
    lower-cased before comparison. A row where either value is missing
    (NaN or None) gets NaN.

    Note: the columns are added to ``df`` itself (no copy is made) and that
    same object is returned.

    :param df: dataframe holding the text columns.
    :param columns: names of the text columns to compare pairwise.
    :return: ``df`` with the feature columns added.
    """
    from itertools import combinations

    for col1, col2 in combinations(columns, 2):
        scores = [
            _ratcliff_obershelp_score(value1, value2)
            for value1, value2 in zip(df[col1], df[col2])
        ]
        # Assign to a single column label. The earlier
        # ``df[[name]] = df.apply(...)`` form handed a Series to a
        # list-of-columns assignment, which pandas rejects with
        # "Columns must be same length as key" for multi-row frames.
        df[col1 + "_vs_" + col2 + "_rat_obers"] = np.array(scores, dtype=float)
    return df