Feature Engineering, Categorical Encoding — Weight of Evidence, Counts & Frequency
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from feature_engine import categorical_encoders as ce
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder
from category_encoders.woe import WOEEncoder
Counts and Frequency — Manual and Feature Engine
def train_test(data, cols, target):
    X_train, X_test, y_train, y_test = train_test_split(
        data[cols],      # predictors
        data[target],    # target
        test_size=0.3,   # percentage of obs in test set
        random_state=0)  # seed to ensure reproducibility
    return X_train, X_test, y_train, y_test
def manual_count_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    for feature in count_features:
        # learn the category -> count mapping from the training set only
        count_dict = X_train[feature].value_counts().to_dict()
        X_train[feature] = X_train[feature].map(count_dict)
        # categories not seen during training are mapped to NaN in the test set
        X_test[feature] = X_test[feature].map(count_dict)
        print(feature, " ==> ", count_dict)
    return X_train, X_test
def manual_frequency_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    for feature in count_features:
        # learn the category -> relative frequency mapping from the training set only
        frequency_dict = (X_train[feature].value_counts() / len(X_train)).to_dict()
        X_train[feature] = X_train[feature].map(frequency_dict)
        X_test[feature] = X_test[feature].map(frequency_dict)
        print(feature, " ==> ", frequency_dict)
    return X_train, X_test
def count_feature_engine_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    # define the encoder
    count_enc = CountFrequencyCategoricalEncoder(
        encoding_method="count",
        variables=count_features)
    # fit the encoder
    count_enc.fit(X_train)
    # transform
    X_train_FE = count_enc.transform(X_train)
    X_test_FE = count_enc.transform(X_test)
    print(" FE COUNT DICTIONARY ", count_enc.encoder_dict_)
    return X_train_FE, X_test_FE
def frequency_feature_engine_encode(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    # define the encoder
    count_enc = CountFrequencyCategoricalEncoder(
        encoding_method="frequency",
        variables=count_features)
    # fit the encoder
    count_enc.fit(X_train)
    # transform
    X_train_FE = count_enc.transform(X_train)
    X_test_FE = count_enc.transform(X_test)
    print(" FE FREQUENCY DICTIONARY ", count_enc.encoder_dict_)
    return X_train_FE, X_test_FE
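For reference, here is a minimal usage sketch of the helpers above, assuming a Titanic-style CSV (the file path and the choice of 'Sex' and 'Embarked' are illustrative; any categorical columns with a binary target work the same way):
# hypothetical path; the same Titanic data is used for the WoE examples further down
data = pd.read_csv("titanic.csv")
# drop rows with a missing 'Embarked' so the feature_engine encoders can be fitted
data = data.dropna(subset=["Embarked"])
# manual versions: print the learned mappings and return the encoded train/test sets
X_train_cnt, X_test_cnt = manual_count_encode(data, ["Sex", "Embarked"], "Survived")
X_train_frq, X_test_frq = manual_frequency_encode(data, ["Sex", "Embarked"], "Survived")
# feature_engine versions: the same mappings, learned and applied by the encoder object
X_train_fe, X_test_fe = count_feature_engine_encode(data, ["Sex", "Embarked"], "Survived")
X_train_fr, X_test_fr = frequency_feature_engine_encode(data, ["Sex", "Embarked"], "Survived")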
Feature-engine's OrdinalCategoricalEncoder with the 'arbitrary' method behaves like scikit-learn's LabelEncoder applied to the features; you can also build this mapping manually, as in the sketch below.
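A minimal sketch of such a manual mapping, written in the same style as the count and frequency helpers above (the split into train and test is assumed to have been done already):
# manual ordinal ("label") encoding: assign an arbitrary integer to each training category
def manual_ordinal_encode(X_train, X_test, feature):
    ordinal_dict = {category: i for i, category in enumerate(X_train[feature].unique())}
    X_train[feature] = X_train[feature].map(ordinal_dict)
    X_test[feature] = X_test[feature].map(ordinal_dict)  # unseen categories become NaN
    print(feature, " ==> ", ordinal_dict)
    return X_train, X_test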
# OrdinalCategoricalEncoder is the same as the LabelEncoder from sklearn
def Ordinal_Categorical_Encoder(data, count_features, target):
    X_train, X_test, y_train, y_test = train_test(data, count_features, target)
    # define the encoder
    ordinal_enc = OrdinalCategoricalEncoder(
        encoding_method='arbitrary',
        variables=count_features)
    # fit the encoder
    ordinal_enc.fit(X_train)
    # transform
    X_train_FE = ordinal_enc.transform(X_train)
    X_test_FE = ordinal_enc.transform(X_test)
    print(" ORDINAL ENCODING is same as label encoder ")
    return X_train_FE, X_test_FE
Weight of Evidence Encoding
Weight of Evidence (WoE) was developed primarily for the credit and financial industries to help build more predictive models to evaluate the risk of loan default.
That is, to predict how likely the money lent to a person or institution is to be lost. Thus, Weight of Evidence is a measure of the “strength” of a grouping technique to separate good and bad risk (default).
- WoE will be 0 if P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.
- If P(Bads) > P(Goods), the odds ratio is < 1 and WoE will be < 0.
- If P(Goods) > P(Bads), the odds ratio is > 1 and WoE will be > 0.
WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.
The WoE transformation has three advantages:
- It creates a monotonic relationship between the target and the independent variables.
- It orders the categories on a “logistic” scale, which is natural for logistic regression.
- The transformed variables can then be compared because they are on the same scale. Therefore, it is possible to determine which one is more predictive.
The WoE also has a limitation:
- It is prone to causing over-fitting, especially for rare categories.
Weight of Evidence Implementation
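Following the definition above, here is a minimal sketch of computing the weights by hand for a single variable; it also makes the over-fitting risk visible, because a category that contains only goods or only bads receives an infinite weight:
# manual WoE sketch for a single variable; assumes a binary 0/1 target
def manual_woe_encode(X_train, X_test, y_train, feature):
    df = pd.concat([X_train[feature], y_train], axis=1)
    df.columns = [feature, "target"]
    totals = df.groupby(feature)["target"].agg(["sum", "count"])
    p_goods = totals["sum"] / df["target"].sum()                            # share of all goods per category
    p_bads = (totals["count"] - totals["sum"]) / (df["target"] == 0).sum()  # share of all bads per category
    # WoE = ln( P(Goods) / P(Bads) ); a category with only one class gets +/- infinity
    woe_dict = np.log(p_goods / p_bads).to_dict()
    X_train[feature] = X_train[feature].map(woe_dict)
    X_test[feature] = X_test[feature].map(woe_dict)
    print(feature, " ==> ", woe_dict)
    return X_train, X_test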
def FE_WOE(data_woe, features2, target2):
    # note: the predictors and target are hard-coded to the Titanic columns below
    X_train, X_test, y_train, y_test = train_test_split(
        data_woe[['Cabin', 'Sex', 'Embarked']],  # predictors
        data_woe['Survived'],                    # target
        test_size=0.3,                           # percentage of obs in test set
        random_state=0)                          # seed to ensure reproducibility
    # define the encoder
    woe_enc = WoERatioCategoricalEncoder(
        encoding_method='woe',
        variables=['Cabin', 'Sex', 'Embarked'])
    # fit the encoder; WoE needs the target to compute the good/bad proportions
    woe_enc.fit(X_train, y_train)
    # transform
    X_train_FE = woe_enc.transform(X_train)
    X_test_FE = woe_enc.transform(X_test)
    print(" FEATURE ENGINE WEIGHT OF EVIDENCE ")
    print(" Dictionary of Weights ", woe_enc.encoder_dict_)
    print(" Features Variables encoded ", woe_enc.variables)
    return X_train_FE, X_test_FE
# The category_encoders WOEEncoder gives essentially the same result as the feature_engine
# WoE encoder above (it applies a small regularization term by default, so the weights can
# differ slightly for rare categories)
def CAT_ENCODER_WOE(data_woe, features2, target2):
    X_train, X_test, y_train, y_test = train_test_split(
        data_woe[['Cabin', 'Sex', 'Embarked']],  # predictors
        data_woe['Survived'],                    # target
        test_size=0.3,                           # percentage of obs in test set
        random_state=0)                          # seed to ensure reproducibility
    # define the encoder
    woe_enc = WOEEncoder(cols=['Cabin', 'Sex', 'Embarked'])
    # fit the encoder
    woe_enc.fit(X_train, y_train)
    # transform
    X_train_FE = woe_enc.transform(X_train)
    X_test_FE = woe_enc.transform(X_test)
    print(" CATEGORY ENCODER WOE ")
    print(" Dictionary of Weights ", woe_enc.mapping)
    return X_train_FE, X_test_FE
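Finally, a minimal usage sketch of the two WoE helpers. The frame below is synthetic, made up purely for illustration, and built so that every category contains both outcomes, because the 'woe' method cannot handle a category whose proportion of goods or bads is zero; with the real Titanic data, the 'Cabin' column would typically need its missing values filled and its rare labels grouped for the same reason:
# synthetic illustration data; in practice data_woe would be the Titanic dataset
base = {
    "Cabin":    ["A", "A", "B", "B", "A", "B", "A", "B", "A", "B", "A", "B"],
    "Sex":      ["male", "female", "male", "female", "male", "female",
                 "female", "male", "male", "female", "female", "male"],
    "Embarked": ["S", "C", "Q", "S", "C", "Q", "S", "C", "Q", "S", "Q", "C"],
    "Survived": [1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1],
}
data_woe = pd.DataFrame({col: values * 10 for col, values in base.items()})
X_train_fe, X_test_fe = FE_WOE(data_woe, ["Cabin", "Sex", "Embarked"], "Survived")
X_train_ce, X_test_ce = CAT_ENCODER_WOE(data_woe, ["Cabin", "Sex", "Embarked"], "Survived")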