Multi-Class Text Classification using CNN and word2vec
Multi-class classification is not limited to just positive or negative sentiment; the output can take any of a range of values [1, 2, 3, 4, 5, 6 … n]. Typical use cases include:
Filtering comments for obscene or abusive language before they are posted
Detecting online bullying
Understanding sentiment
Identifying angry customers and offering timely service
and many more
Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score
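The snippets that follow reference several configuration constants that are defined outside this section. One plausible set of values is shown below; the exact numbers are assumptions for illustration, not the values from the original run.
# Configuration constants used throughout (values are illustrative assumptions)
MAX_SEQUENCE_LENGTH = 100   # truncate/pad every text to this many tokens
MAX_VOCAB_SIZE = 20000      # keep only the most frequent words
EMBEDDING_DIM = 100         # must match the GloVe file used (e.g. glove.6B.100d.txt)
VALIDATION_SPLIT = 0.2      # fraction of the training data held out for validation
BATCH_SIZE = 128
EPOCHS = 10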
Word2vec Dict: Loading the Pre-trained GloVe Vectors from glove.6B
word2vec = {}
with open(os.path.join('glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
    # The GloVe file is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))
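The tokenization step below expects a list of raw texts (sentences), the label column names (possible_labels), and a target matrix (targets), none of which are shown in this section. A minimal sketch, assuming a Jigsaw-style toxic-comment CSV at the hypothetical path 'train.csv', might look like this:
# Load the labeled training data (path and column names are assumptions)
train = pd.read_csv('train.csv')
sentences = train['comment_text'].fillna('DUMMY_VALUE').values
possible_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
targets = train[possible_labels].values   # shape: (num_samples, num_labels)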
Tokenize the Training Set
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word2idx = tokenizer.word_index                              # word -> integer index mapping
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)  # pad/truncate to a fixed length
Create the Embedding Matrix
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index will remain all zeros
            embedding_matrix[i] = embedding_vector
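The model below calls an embedding_layer that is not defined in the snippets above. A reasonable reconstruction is a Keras Embedding layer initialized with the GloVe matrix and kept frozen during training:
# Embedding layer initialized with the pre-trained GloVe weights and frozen
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)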
CNN Model
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# one sigmoid unit per label: each label gets an independent probability
output = Dense(len(possible_labels), activation='sigmoid')(x)
model = Model(input_, output)
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy']
)
print('Training model...')
r = model.fit(
    data,
    targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT
)
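matplotlib is imported above but never used in these snippets; one natural use is to plot the training history returned by model.fit. A sketch, not from the original post:
# Plot training vs. validation loss from the History object returned by fit()
plt.plot(r.history['loss'], label='train loss')
plt.plot(r.history['val_loss'], label='val loss')
plt.legend()
plt.show()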
Predict
I tested the model on Twitter data that was not used during training, and it generalized rather well.
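The prediction code itself is not shown here. A minimal sketch, assuming the unseen tweets are in a hypothetical list new_texts, is to reuse the same fitted tokenizer and padding before calling model.predict; roc_auc_score (imported above) can then score each label on held-out data with known targets:
# Predict on unseen text using the SAME tokenizer fitted on the training set
new_sequences = tokenizer.texts_to_sequences(new_texts)          # new_texts is hypothetical
new_data = pad_sequences(new_sequences, maxlen=MAX_SEQUENCE_LENGTH)
predictions = model.predict(new_data)                            # shape: (len(new_texts), num_labels)

# Per-label ROC AUC, if true labels are available (new_targets is hypothetical)
# aucs = [roc_auc_score(new_targets[:, j], predictions[:, j]) for j in range(len(possible_labels))]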
The notebook is here.