Feature Engineering, Categorical Encoding — Weight of Evidence, Counts & Frequency

Imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from feature_engine import categorical_encoders as ce
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder
from category_encoders.woe import WOEEncoder

Counts and Frequency — Manual and Feature Engine

def train_test(data, cols, target):
    X_train, X_test, y_train, y_test = train_test_split(
        data[cols],      # predictors
        data[target],    # target
        test_size=0.3,   # percentage of obs in test set
        random_state=0)  # seed to ensure reproducibility

    return X_train, X_test, y_train, y_test
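
To make the usage snippets below concrete, assume a small hypothetical frame (the column names are made up for illustration):

# hypothetical toy data used in the usage examples below
data = pd.DataFrame({
    'colour': ['red', 'red', 'blue', 'blue', 'green', 'green'],
    'size': ['S', 'M', 'M', 'M', 'L', 'S'],
    'target': [1, 0, 1, 1, 0, 0],
})

X_train, X_test, y_train, y_test = train_test(data, ['colour', 'size'], 'target')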


def manual_count_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)

    for feature in count_features:
        # map each category to the number of times it appears in the train set
        count_dict = X_train[feature].value_counts().to_dict()
        X_train[feature] = X_train[feature].map(count_dict)
        X_test[feature] = X_test[feature].map(count_dict)

        print(feature, " ==> ", count_dict)

    return X_train, X_test


def manual_frequency_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)

    for feature in count_features:
        # map each category to its share of the train set observations
        frequency_dict = (X_train[feature].value_counts() / len(X_train)).to_dict()
        X_train[feature] = X_train[feature].map(frequency_dict)
        X_test[feature] = X_test[feature].map(frequency_dict)

        print(feature, " ==> ", frequency_dict)

    return X_train, X_test
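
A quick usage sketch on the hypothetical frame from above. One caveat worth knowing: with the manual approach, a category that appears only in the test split is absent from the train-derived dictionary and maps to NaN, so a fillna is often warranted:

X_train_c, X_test_c = manual_count_encode(data, ['colour', 'size'], 'target')
X_train_f, X_test_f = manual_frequency_encode(data, ['colour', 'size'], 'target')

# categories unseen during training become NaN in the test split
X_test_c = X_test_c.fillna(0)
X_test_f = X_test_f.fillna(0)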


def count_feature_engine_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)

    # define the encoder
    count_enc = CountFrequencyCategoricalEncoder(
        encoding_method="count",
        variables=count_features)

    # fit the encoder
    count_enc.fit(X_train)

    # transform
    X_train_FE = count_enc.transform(X_train)
    X_test_FE = count_enc.transform(X_test)

    print(" FE COUNT DICTIONARY ", count_enc.encoder_dict_)
    return X_train_FE, X_test_FE



def frequency_feature_engine_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)

    # define the encoder
    count_enc = CountFrequencyCategoricalEncoder(
        encoding_method="frequency",
        variables=count_features)

    # fit the encoder
    count_enc.fit(X_train)

    # transform
    X_train_FE = count_enc.transform(X_train)
    X_test_FE = count_enc.transform(X_test)

    print(" FE FREQUENCY DICTIONARY ", count_enc.encoder_dict_)
    return X_train_FE, X_test_FE
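
Usage mirrors the manual versions; with the 2020-era feature_engine API used here, the fitted category-to-number mappings are stored on the encoder as encoder_dict_:

X_train_cnt, X_test_cnt = count_feature_engine_encode(data, ['colour', 'size'], 'target')
X_train_frq, X_test_frq = frequency_feature_engine_encode(data, ['colour', 'size'], 'target')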

OrdinalCategoricalEncoder with arbitrary encoding behaves like scikit-learn's LabelEncoder; you can also build the mapping manually, as sketched below.
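
A minimal manual sketch of arbitrary ordinal encoding, reusing the hypothetical split from above: learn a category-to-integer dictionary from the train split only, then map both splits with it.

# arbitrary category -> integer map, learned from the train split only
ordinal_dict = {cat: i for i, cat in enumerate(X_train['colour'].unique())}
X_train['colour'] = X_train['colour'].map(ordinal_dict)
X_test['colour'] = X_test['colour'].map(ordinal_dict)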


# OrdinalCategoricalEncoder with 'arbitrary' works like LabelEncoder from sklearn

def Ordinal_Categorical_Encoder(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)

    # define the encoder
    ordinal_enc = OrdinalCategoricalEncoder(
        encoding_method='arbitrary',
        variables=count_features)

    # fit the encoder
    ordinal_enc.fit(X_train)

    # transform
    X_train_FE = ordinal_enc.transform(X_train)
    X_test_FE = ordinal_enc.transform(X_test)

    print(" ORDINAL ENCODING is same as label encoder ")
    return X_train_FE, X_test_FE

Weight of Evidence Encoding

Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default.

That is, it helps predict how likely it is that money lent to a person or institution will be lost. Thus, Weight of Evidence is a measure of the "strength" of a grouping technique at separating good and bad risk (default).

  • WoE will be 0 if P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.
  • WoE will be < 0 if P(Bads) > P(Goods), since the odds ratio is then < 1.
  • WoE will be > 0 if P(Goods) > P(Bads).
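
In formula form, a category's WoE is ln(P(category | Good) / P(category | Bad)). A minimal manual sketch on hypothetical toy data (all names made up):

# WoE per category = ln( share of goods in category / share of bads in category )
df = pd.DataFrame({
    'colour': ['red', 'red', 'blue', 'blue', 'blue', 'green', 'green'],
    'target': [1, 0, 1, 1, 0, 1, 0],
})
p_goods = df.loc[df['target'] == 1, 'colour'].value_counts() / (df['target'] == 1).sum()
p_bads = df.loc[df['target'] == 0, 'colour'].value_counts() / (df['target'] == 0).sum()
print(np.log(p_goods / p_bads))  # blue > 0 (more goods), red and green < 0

A category that never contains one of the two classes yields an infinite or undefined WoE, which is why the encoders typically raise an error or apply smoothing.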

WoE is well suited for Logistic Regression because the logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). By using WoE-coded predictors in logistic regression, all predictors are prepared and coded to the same scale, so the parameters in the linear logistic regression equation can be compared directly.
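
A small sketch of that point, with made-up WoE values standing in for real encoded columns: since every column is on the same log-odds scale, coefficient magnitudes can be compared directly.

from sklearn.linear_model import LogisticRegression

# hypothetical frame whose columns are already WoE-encoded
X_train_woe = pd.DataFrame({
    'sex_woe': [0.9, 0.9, -1.2, -1.2, 0.9, -1.2],
    'port_woe': [0.2, -0.4, 0.2, -0.4, -0.4, 0.2],
})
y = pd.Series([1, 1, 0, 0, 1, 0])

clf = LogisticRegression().fit(X_train_woe, y)
# same scale on every column, so |coefficient| is directly comparable
print(dict(zip(X_train_woe.columns, clf.coef_[0])))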

The WoE transformation has three advantages:

  • It creates a monotonic relationship between the target and the independent variables.
  • It orders the categories on a "logistic" scale, which is natural for logistic regression.
  • The transformed variables can then be compared because they are on the same scale. Therefore, it is possible to determine which one is more predictive.

The WoE also has a limitation:

  • Prone to cause over-fitting: categories with few observations get extreme, unreliable WoE values.

Weight of Evidence Implementation

def FE_WOE(data_woe, features2, target2):
    # e.g. features2 = ['Cabin', 'Sex', 'Embarked'], target2 = 'Survived'
    X_train, X_test, y_train, y_test = train_test_split(
        data_woe[features2],  # predictors
        data_woe[target2],    # target
        test_size=0.3,        # percentage of obs in test set
        random_state=0)       # seed to ensure reproducibility

    # define the encoder
    woe_enc = WoERatioCategoricalEncoder(
        encoding_method='woe',
        variables=features2)

    # fit the encoder; WoE is supervised, so the target is required
    woe_enc.fit(X_train, y_train)

    # transform
    X_train_FE = woe_enc.transform(X_train)
    X_test_FE = woe_enc.transform(X_test)

    print(" FEATURE ENGINE WEIGHT OF EVIDENCE ")
    print(" Dictionary of Weights ", woe_enc.encoder_dict_)
    print(" Feature variables encoded ", woe_enc.variables)

    return X_train_FE, X_test_FE

# category_encoders' WOEEncoder implements the same idea as the feature_engine
# WoE encoder (it applies regularization by default, so values can differ slightly)
def CAT_ENCODER_WOE(data_woe, features2, target2):
    X_train, X_test, y_train, y_test = train_test_split(
        data_woe[features2],  # predictors
        data_woe[target2],    # target
        test_size=0.3,        # percentage of obs in test set
        random_state=0)       # seed to ensure reproducibility

    # define the encoder
    woe_enc = WOEEncoder(cols=features2)

    # fit the encoder
    woe_enc.fit(X_train, y_train)

    # transform
    X_train_FE = woe_enc.transform(X_train)
    X_test_FE = woe_enc.transform(X_test)

    print(" CATEGORY ENCODER WOE ")
    print(" Dictionary of Weights ", woe_enc.mapping)

    return X_train_FE, X_test_FE
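
A hypothetical call on a Titanic-style frame (the CSV path is made up; both encoders need NaN-free inputs, and very sparse categories such as raw Cabin values can produce division-by-zero WoE, so reducing cardinality first is a common trick):

data_woe = pd.read_csv('titanic.csv')  # hypothetical path
data_woe = data_woe.dropna(subset=['Cabin', 'Sex', 'Embarked'])
data_woe['Cabin'] = data_woe['Cabin'].astype(str).str[0]  # keep only the deck letter

X_train_FE, X_test_FE = FE_WOE(data_woe, ['Cabin', 'Sex', 'Embarked'], 'Survived')
X_train_CE, X_test_CE = CAT_ENCODER_WOE(data_woe, ['Cabin', 'Sex', 'Embarked'], 'Survived')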
