Feature Engineering, Categorical Encoding — Weight of Evidence, Counts & Frequency
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from feature_engine import categorical_encoders as ce
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder
from category_encoders.woe import WOEEncoder
Counts and Frequency — Manual and Feature Engine
def train_test(data, cols, target):
    X_train, X_test, y_train, y_test = train_test_split(
        data[cols],      # predictors
        data[target],    # target
        test_size=0.3,   # percentage of obs in test set
        random_state=0)  # seed to ensure reproducibility
    return X_train, X_test, y_train, y_test
def manual_count_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    for feature in count_features:
        # learn the category -> count mapping from the training set only
        count_dict = X_train[feature].value_counts().to_dict()
        X_train[feature] = X_train[feature].map(count_dict)
        # categories not seen during training are mapped to NaN in the test set
        X_test[feature] = X_test[feature].map(count_dict)
        print(feature, " ==> ", count_dict)
    return X_train, X_test
def manual_frequency_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    for feature in count_features:
        # learn the category -> relative frequency mapping from the training set only
        frequency_dict = (X_train[feature].value_counts() / len(X_train)).to_dict()
        X_train[feature] = X_train[feature].map(frequency_dict)
        X_test[feature] = X_test[feature].map(frequency_dict)
        print(feature, " ==> ", frequency_dict)
    return X_train, X_test
def count_feature_engine_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    # define the encoder
    count_enc = CountFrequencyCategoricalEncoder(
        encoding_method="count",
        variables=count_features)
    # fit the encoder
    count_enc.fit(X_train)
    # transform
    X_train_FE = count_enc.transform(X_train)
    X_test_FE = count_enc.transform(X_test)
    print(" FE COUNT DICTIONARY ", count_enc.encoder_dict_)
    return X_train_FE, X_test_FE
def frequency_feature_engine_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    # define the encoder
    count_enc = CountFrequencyCategoricalEncoder(
        encoding_method="frequency",
        variables=count_features)
    # fit the encoder
    count_enc.fit(X_train)
    # transform
    X_train_FE = count_enc.transform(X_train)
    X_test_FE = count_enc.transform(X_test)
    print(" FE FREQUENCY DICTIONARY ", count_enc.encoder_dict_)
    return X_train_FE, X_test_FE
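For reference, here is a minimal usage sketch of the helpers above, assuming a Titanic-style CSV (the file path and the choice of 'Sex' and 'Embarked' are illustrative; any categorical columns with a binary target work the same way):
# hypothetical path; the same Titanic data is used for the WoE examples further down
data = pd.read_csv("titanic.csv")
# drop rows with a missing 'Embarked' so the feature_engine encoders can be fitted
data = data.dropna(subset=["Embarked"])
# manual versions: print the learned mappings and return the encoded train/test sets
X_train_cnt, X_test_cnt = manual_count_encode(data, ["Sex", "Embarked"], "Survived")
X_train_frq, X_test_frq = manual_frequency_encode(data, ["Sex", "Embarked"], "Survived")
# feature_engine versions: the same mappings, learned and applied by the encoder object
X_train_fe, X_test_fe = count_feature_engine_encode(data, ["Sex", "Embarked"], "Survived")
X_train_fr, X_test_fr = frequency_feature_engine_encode(data, ["Sex", "Embarked"], "Survived")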
Feature-engine's OrdinalCategoricalEncoder with the 'arbitrary' method behaves like scikit-learn's LabelEncoder applied to the features; you can also build this mapping manually, as in the sketch below.
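A minimal sketch of such a manual mapping, written in the same style as the count and frequency helpers above (the split into train and test is assumed to have been done already):
# manual ordinal ("label") encoding: assign an arbitrary integer to each training category
def manual_ordinal_encode(X_train, X_test, feature):
    ordinal_dict = {category: i for i, category in enumerate(X_train[feature].unique())}
    X_train[feature] = X_train[feature].map(ordinal_dict)
    X_test[feature] = X_test[feature].map(ordinal_dict)  # unseen categories become NaN
    print(feature, " ==> ", ordinal_dict)
    return X_train, X_test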
# OrdinalCategoricalEncoder is the same as the LabelEncoder from sklearn
def Ordinal_Categorical_Encoder(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    # define the encoder
    ordinal_enc = OrdinalCategoricalEncoder(
        encoding_method='arbitrary',
        variables=count_features)
    # fit the encoder
    ordinal_enc.fit(X_train)
    # transform
    X_train_FE = ordinal_enc.transform(X_train)
    X_test_FE = ordinal_enc.transform(X_test)
    print(" ORDINAL ENCODING is same as label encoder ")
    return X_train_FE, X_test_FE
Weight of Evidence Encoding
Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default.
That is, to predict how likely the money lent to a person or institution is to be lost. Thus, Weight of Evidence is a measure of the “strength” of a grouping technique to separate good and bad risk (default).
- WoE will be 0 if P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.
- If P(Bads) > P(Goods), the odds ratio is < 1 and WoE will be < 0.
- If P(Goods) > P(Bads), the odds ratio is > 1 and WoE will be > 0.
WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.
The WoE transformation has three advantages:
- It creates a monotonic relationship between the target and the independent variables.
- It orders the categories on a “logistic” scale, which is natural for logistic regression.
- The transformed variables can then be compared because they are on the same scale. Therefore, it is possible to determine which one is more predictive.
The WoE also has a limitation:
- It is prone to causing over-fitting, especially for rare categories.
Weight of Evidence Implementation
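Following the definition above, here is a minimal sketch of computing the weights by hand for a single variable; it also makes the over-fitting risk visible, because a category that contains only goods or only bads receives an infinite weight:
# manual WoE sketch for a single variable; assumes a binary 0/1 target
def manual_woe_encode(X_train, X_test, y_train, feature):
    df = pd.concat([X_train[feature], y_train], axis=1)
    df.columns = [feature, "target"]
    totals = df.groupby(feature)["target"].agg(["sum", "count"])
    p_goods = totals["sum"] / df["target"].sum()                            # share of all goods per category
    p_bads = (totals["count"] - totals["sum"]) / (df["target"] == 0).sum()  # share of all bads per category
    # WoE = ln( P(Goods) / P(Bads) ); a category with only one class gets +/- infinity
    woe_dict = np.log(p_goods / p_bads).to_dict()
    X_train[feature] = X_train[feature].map(woe_dict)
    X_test[feature] = X_test[feature].map(woe_dict)
    print(feature, " ==> ", woe_dict)
    return X_train, X_test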
def FE_WOE(data_woe, features2, target2):
    # note: the predictors and target are hard-coded to the Titanic columns below
    X_train, X_test, y_train, y_test = train_test_split(
        data_woe[['Cabin', 'Sex', 'Embarked']],  # predictors
        data_woe['Survived'],                    # target
        test_size=0.3,                           # percentage of obs in test set
        random_state=0)                          # seed to ensure reproducibility
    # define the encoder
    woe_enc = WoERatioCategoricalEncoder(
        encoding_method='woe',
        variables=['Cabin', 'Sex', 'Embarked'])
    # fit the encoder; WoE needs the target to compute the good/bad proportions
    woe_enc.fit(X_train, y_train)
    # transform
    X_train_FE = woe_enc.transform(X_train)
    X_test_FE = woe_enc.transform(X_test)
    print(" FEATURE ENGINE WEIGHT OF EVIDENCE ")
    print(" Dictionary of Weights ", woe_enc.encoder_dict_)
    print(" Features Variables encoded ", woe_enc.variables)
    return X_train_FE, X_test_FE
# The category_encoders WOEEncoder gives essentially the same result as the feature_engine
# WoE encoder above (it applies a small regularization term by default, so the weights can
# differ slightly for rare categories)
def CAT_ENCODER_WOE(data_woe, features2, target2):
    X_train, X_test, y_train, y_test = train_test_split(
        data_woe[['Cabin', 'Sex', 'Embarked']],  # predictors
        data_woe['Survived'],                    # target
        test_size=0.3,                           # percentage of obs in test set
        random_state=0)                          # seed to ensure reproducibility
    # define the encoder
    woe_enc = WOEEncoder(cols=['Cabin', 'Sex', 'Embarked'])
    # fit the encoder
    woe_enc.fit(X_train, y_train)
    # transform
    X_train_FE = woe_enc.transform(X_train)
    X_test_FE = woe_enc.transform(X_test)
    print(" CATEGORY ENCODER WOE ")
    print(" Dictionary of Weights ", woe_enc.mapping)
    return X_train_FE, X_test_FE
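Finally, a minimal usage sketch of the two WoE helpers. The frame below is synthetic, made up purely for illustration, and built so that every category contains both outcomes, because the 'woe' method cannot handle a category whose proportion of goods or bads is zero; with the real Titanic data, the 'Cabin' column would typically need its missing values filled and its rare labels grouped for the same reason:
# synthetic illustration data; in practice data_woe would be the Titanic dataset
base = {
    "Cabin":    ["A", "A", "B", "B", "A", "B", "A", "B", "A", "B", "A", "B"],
    "Sex":      ["male", "female", "male", "female", "male", "female",
                 "female", "male", "male", "female", "female", "male"],
    "Embarked": ["S", "C", "Q", "S", "C", "Q", "S", "C", "Q", "S", "Q", "C"],
    "Survived": [1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1],
}
data_woe = pd.DataFrame({col: values * 10 for col, values in base.items()})
X_train_fe, X_test_fe = FE_WOE(data_woe, ["Cabin", "Sex", "Embarked"], "Survived")
X_train_ce, X_test_ce = CAT_ENCODER_WOE(data_woe, ["Cabin", "Sex", "Embarked"], "Survived")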