#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import os
import mxnet as mx
import numpy as np
import time
import math
import data_helpers
from collections import namedtuple
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # get a logger so that accuracies are printed
logs = sys.stderr
CNNModel = namedtuple("CNNModel", ['cnn_exec', 'symbol', 'data', 'label', 'param_blocks'])
def make_text_cnn(sentence_size, num_embed, batch_size, vocab_size=None,
                  num_label=2, filter_list=(3, 4, 5), num_filter=100,
                  dropout=0., with_embedding=True):
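    """Build the text-CNN symbol: parallel convolutions over several filter
    widths, max-over-time pooling, optional dropout and a softmax output.

    When with_embedding is False the network learns its own embedding table
    of size vocab_size x num_embed; otherwise the input is expected to
    already contain word vectors.
    """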
input_x = mx.sym.Variable('data') # placeholder for input
input_y = mx.sym.Variable('softmax_label') # placeholder for output
    # embedding layer: when the input is raw word ids (with_embedding=False),
    # learn an embedding table here; otherwise the data already carries
    # pretrained word vectors and feeds the convolutions directly
    if not with_embedding:
        embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size,
                                       output_dim=num_embed, name='vocab_embed')
        conv_input = mx.sym.Reshape(data=embed_layer,
                                    target_shape=(batch_size, 1, sentence_size, num_embed))
    else:
        conv_input = input_x
# create convolution + (max) pooling layer for each filter operation
pooled_outputs = []
    for filter_size in filter_list:
convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
relui = mx.sym.Activation(data=convi, act_type='relu')
pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1,1))
pooled_outputs.append(pooli)
# combine all pooled outputs
total_filters = num_filter * len(filter_list)
concat = mx.sym.Concat(*pooled_outputs, dim=1)
h_pool = mx.sym.Reshape(data=concat, target_shape=(batch_size, total_filters))
# dropout layer
if dropout > 0.0:
h_drop = mx.sym.Dropout(data=h_pool, p=dropout)
else:
h_drop = h_pool
# fully connected
cls_weight = mx.sym.Variable('cls_weight')
cls_bias = mx.sym.Variable('cls_bias')
fc = mx.sym.FullyConnected(data=h_drop, weight=cls_weight, bias=cls_bias, num_hidden=num_label)
# softmax output
sm = mx.sym.SoftmaxOutput(data=fc, label=input_y, name='softmax')
return sm
def setup_cnn_model(ctx, batch_size, sentence_size, num_embed, vocab_size=None,
                    dropout=0.5, initializer=mx.initializer.Uniform(0.1),
                    with_embedding=True):
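    """Bind the CNN symbol to an executor on ctx and initialize its weights.

    Returns a CNNModel with the bound executor, the symbol, handles to the
    data/label input arrays, and (index, weight, grad, name) param blocks.
    """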
cnn = make_text_cnn(sentence_size, num_embed, batch_size=batch_size,
vocab_size=vocab_size, dropout=dropout, with_embedding=with_embedding)
arg_names = cnn.list_arguments()
input_shapes = {}
if with_embedding:
input_shapes['data'] = (batch_size, 1, sentence_size, num_embed)
else:
input_shapes['data'] = (batch_size, sentence_size)
arg_shape, out_shape, aux_shape = cnn.infer_shape(**input_shapes)
arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
args_grad = {}
for shape, name in zip(arg_shape, arg_names):
if name in ['softmax_label', 'data']: # input, output
continue
args_grad[name] = mx.nd.zeros(shape, ctx)
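    # grad_req='add' makes backward() accumulate into args_grad, so the
    # training loop must zero each gradient after applying it (see train_cnn)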
cnn_exec = cnn.bind(ctx=ctx, args=arg_arrays, args_grad=args_grad, grad_req='add')
param_blocks = []
arg_dict = dict(zip(arg_names, cnn_exec.arg_arrays))
for i, name in enumerate(arg_names):
if name in ['softmax_label', 'data']: # input, output
continue
initializer(name, arg_dict[name])
        param_blocks.append((i, arg_dict[name], args_grad[name], name))
data = cnn_exec.arg_dict['data']
label = cnn_exec.arg_dict['softmax_label']
return CNNModel(cnn_exec=cnn_exec, symbol=cnn, data=data, label=label, param_blocks=param_blocks)
def train_cnn(model, X_train_batch, y_train_batch, X_dev_batch, y_dev_batch, batch_size,
optimizer='rmsprop', max_grad_norm=5.0, learning_rate=0.0005, epoch=200):
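    """Train with minibatch updates and global gradient-norm clipping.

    Halves the learning rate every 50 epochs, saves a checkpoint every 10
    epochs, and prints train/dev accuracy after every epoch.
    """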
m = model
# create optimizer
opt = mx.optimizer.create(optimizer)
opt.lr = learning_rate
updater = mx.optimizer.get_updater(opt)
for iteration in range(epoch):
tic = time.time()
num_correct = 0
num_total = 0
for begin in range(0, X_train_batch.shape[0], batch_size):
batchX = X_train_batch[begin:begin+batch_size]
batchY = y_train_batch[begin:begin+batch_size]
            # skip the final incomplete batch: the executor shape is fixed to batch_size
            if batchX.shape[0] != batch_size:
                continue
m.data[:] = batchX
m.label[:] = batchY
# forward
m.cnn_exec.forward(is_train=True)
# backward
m.cnn_exec.backward()
# eval on training data
num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
num_total += len(batchY)
# update weights
            # global gradient clipping: accumulate the squared L2 norms first
            norm = 0
            for idx, weight, grad, name in m.param_blocks:
                grad /= batch_size
                l2_norm = mx.nd.norm(grad).asscalar()
                norm += l2_norm * l2_norm
            norm = math.sqrt(norm)
for idx, weight, grad, name in m.param_blocks:
if norm > max_grad_norm:
grad *= (max_grad_norm / norm)
updater(idx, grad, weight)
# reset gradient to zero
grad[:] = 0.0
# decay learning rate
if iteration % 50 == 0 and iteration > 0:
opt.lr *= 0.5
            print('reset learning rate to %g' % opt.lr, file=logs)
# end of training loop
toc = time.time()
train_time = toc - tic
train_acc = num_correct * 100 / float(num_total)
# saving checkpoint
if (iteration + 1) % 10 == 0:
prefix = 'cnn'
m.symbol.save('checkpoint/%s-symbol.json' % prefix)
save_dict = {('arg:%s' % k) :v for k, v in m.cnn_exec.arg_dict.items()}
save_dict.update({('aux:%s' % k) : v for k, v in m.cnn_exec.aux_dict.items()})
param_name = 'checkpoint/%s-%04d.params' % (prefix, iteration)
mx.nd.save(param_name, save_dict)
            print('Saved checkpoint to %s' % param_name, file=logs)
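            # To restore this checkpoint later, a minimal sketch using the
            # standard MXNet loaders (file layout assumed as written above):
            #   sym = mx.sym.load('checkpoint/cnn-symbol.json')
            #   save_dict = mx.nd.load(param_name)
            #   arg_params = {k[4:]: v for k, v in save_dict.items()
            #                 if k.startswith('arg:')}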
# evaluate on dev set
num_correct = 0
num_total = 0
for begin in range(0, X_dev_batch.shape[0], batch_size):
batchX = X_dev_batch[begin:begin+batch_size]
batchY = y_dev_batch[begin:begin+batch_size]
            # again skip the last incomplete batch
            if batchX.shape[0] != batch_size:
                continue
m.data[:] = batchX
m.cnn_exec.forward(is_train=False)
num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
num_total += len(batchY)
dev_acc = num_correct * 100 / float(num_total)
        print('Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f '
              '--- Dev Accuracy thus far: %.3f'
              % (iteration, train_time, train_acc, dev_acc), file=logs)
def main():
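    """Train on top of pretrained word2vec vectors (no embedding layer)."""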
print('Loading data...')
# word2vec = data_helpers.load_google_word2vec('data/GoogleNews-vectors-negative300.bin')
word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
x, y = data_helpers.load_data_with_word2vec(word2vec)
# randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# split train/dev set
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
print('train shape:', x_train.shape)
print('dev shape:', x_dev.shape)
    # reshape for convolution input: (batch, 1, sentence_size, num_embed)
x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1], x_train.shape[2]))
x_dev = np.reshape(x_dev, (x_dev.shape[0], 1, x_dev.shape[1], x_dev.shape[2]))
num_embed = x_train.shape[-1]
sentence_size = x_train.shape[2]
print('sentence max words', sentence_size)
print('embedding size', num_embed)
    batch_size = 50
    # vocab_size can be omitted: the inputs already carry word2vec vectors
    cnn_model = setup_cnn_model(mx.gpu(1), batch_size, sentence_size, num_embed, dropout=0.5)
train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
def train_without_pretrained_embedding():
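    """Train from scratch, learning word embeddings inside the network."""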
x, y, vocab, vocab_inv = data_helpers.load_data()
vocab_size = len(vocab)
# randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# split train/dev set
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
print('train shape:', x_train.shape)
print('dev shape:', x_dev.shape)
print('vocab_size', vocab_size)
batch_size = 50
num_embed = 300
sentence_size = x_train.shape[1]
print('batch size', batch_size)
print('sentence max words', sentence_size)
print('embedding size', num_embed)
cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed, vocab_size, dropout=0.5, with_embedding=False)
train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
if __name__ == '__main__':
    if not os.path.exists("checkpoint"):
        os.mkdir("checkpoint")
    # train from scratch; call main() instead to start from pretrained word2vec
    train_without_pretrained_embedding()