Seq2Seq with Attention in MXNet


ゼロから作るDeep Learning ❷ ―自然言語処理編

Using the dataset introduced in that book, I will try out the Seq2Seq with Attention model from "Dive into Deep Learning".
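Each line of date.txt pairs a fixed-width question with its answer; the '_' both separates the two and later serves as the decoder's start token. A minimal peek at the format (a sketch; it assumes date.txt sits in the working directory):

with open('date.txt', 'r') as f:
    print(repr(f.readline()))
# e.g. 'september 27, 1994           _1994-09-27\n'
# everything before '_' is the question, padded with spaces to 29 characters;
# '_' plus the 10 characters after it form the answer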


from collections import Counter

def load_data():
    questions_char = []
    answers_char = []

    freq = Counter()

    with open('date.txt', 'r') as f:
        for line in f:
            freq.update(line[:-1])  # count characters; the last one is the newline
            idx = line.find('_')
            questions_char.append(line[:idx].rstrip())  # drop the space padding
            answers_char.append(line[idx:-1])

    vocab = sorted(list(freq.keys()))
    # ids start at 1 so that 0 can serve as the padding id
    id_to_char = {i+1: t for i, t in enumerate(vocab)}
    char_to_id = {t: i+1 for i, t in enumerate(vocab)}
    id_to_char[0] = '<pad>'
    char_to_id['<pad>'] = 0

    questions = []
    questions_valid_len = []
    answers = []
    answers_valid_len = []

    for q in questions_char:
        ids = [char_to_id[c] for c in q]
        questions_valid_len.append(len(ids))
        # pad every question to the fixed source length of 29 characters
        ids = ids + [char_to_id['<pad>']] * (29 - len(ids))
        questions.append(ids)

    for a in answers_char:
        ids = [char_to_id[c] for c in a]
        answers_valid_len.append(len(ids))
        answers.append(ids)

    return questions, questions_valid_len, answers, answers_valid_len, id_to_char, char_to_id
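A quick sanity check of the loader (the 50,000-example count is an assumption based on the standard date.txt that ships with the book's repository):

questions, q_len, answers, a_len, id_to_char, char_to_id = load_data()
print(len(questions), len(questions[0]), len(answers[0]))
# expected: 50000 29 11 -- questions padded to 29 ids, answers are '_' + 10 chars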

Next, I modified the following parts of the d2l library that accompanies "Dive into Deep Learning" (the rest of this post calls them as d2l.Seq2SeqAttentionDecoder and d2l.train_seq2seq).


from mxnet import nd, init, autograd, gluon
from mxnet.gluon import nn, rnn
import numpy as np
import d2l

class Seq2SeqAttentionDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention_cell = d2l.MLPAttention(num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.LSTM(num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Dense(vocab_size, flatten=False)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        outputs, hidden_state = enc_outputs
        # Transpose outputs to (batch_size, seq_len, hidden_size)
        return (outputs.swapaxes(0,1), hidden_state, enc_valid_len)

    def forward(self, X, state):
        enc_outputs, hidden_state, enc_valid_len = state
        X = self.embedding(X).swapaxes(0, 1)
        outputs = []
        for x in X:
            # query shape: (batch_size, 1, hidden_size)
            query = hidden_state[0][-1].expand_dims(axis=1)
            # context has same shape as query
            context = self.attention_cell(
                query, enc_outputs, enc_outputs, enc_valid_len)
            # concatenate on the feature dimension
            x = nd.concat(context, x.expand_dims(axis=1), dim=-1)
            # reshape x to (1, batch_size, embed_size+hidden_size)
            out, hidden_state = self.rnn(x.swapaxes(0, 1), hidden_state)
            outputs.append(out)
        outputs = self.dense(nd.concat(*outputs, dim=0))
        return outputs.swapaxes(0, 1), [enc_outputs, hidden_state,
                                        enc_valid_len]
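As a quick shape check of the decoder (a sketch patterned on the test in the d2l book; it assumes the same d2l version used throughout this post):

encoder = d2l.Seq2SeqEncoder(vocab_size=10, embed_size=8,
                             num_hiddens=16, num_layers=2)
encoder.initialize()
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8,
                                  num_hiddens=16, num_layers=2)
decoder.initialize()
X = nd.zeros((4, 7))  # (batch_size, seq_len)
state = decoder.init_state(encoder(X), None)
out, state = decoder(X, state)
print(out.shape)  # (4, 7, 10): (batch_size, seq_len, vocab_size)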

def train_seq2seq(model, data_iter, lr, num_epochs, ctx):
    model.initialize(init.Xavier(), force_reinit=True, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(),
                            'adam', {'learning_rate': lr})
    loss = d2l.MaskedSoftmaxCELoss()
    for epoch in range(1, num_epochs+1):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # loss_sum, num_tokens
        for batch in data_iter:
            X, X_vlen, Y, Y_vlen = [x.astype(np.float32).as_in_context(ctx) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:,:-1], Y[:,1:], Y_vlen-1
            with autograd.record():
                Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
                l = loss(Y_hat, Y_label, Y_vlen)
            l.backward()
            d2l.grad_clipping(model, 1)
            num_tokens = Y_vlen.sum().asscalar()
            trainer.step(num_tokens)
            metric.add(l.sum().asscalar(), num_tokens)
        print('epoch = %d  loss = %.3f'%(epoch, metric[0]/metric[1]))
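train_seq2seq relies on d2l's MaskedSoftmaxCELoss, which zeroes the loss at padded positions past each sequence's valid length. A small illustration, mirroring the example from the d2l book:

loss = d2l.MaskedSoftmaxCELoss()
# three sequences of length 4 over a vocabulary of 10;
# the second sequence only counts its first 2 steps, the third none
l = loss(nd.ones((3, 4, 10)), nd.ones((3, 4)), nd.array([4, 2, 0]))
print(l)  # per-sequence loss; smaller where fewer steps are valid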



import d2l
import sequence  # the module containing load_data() defined above
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn, rnn

import pickle

batch_size = 128

#src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(batch_size, num_steps)
# a: questions, b: question valid lengths, c: answers, d: answer valid lengths
a, b, c, d, id_to_char, char_to_id = sequence.load_data()

with open('id_to_char.pickle', 'wb') as f:
    pickle.dump(id_to_char, f)

with open('char_to_id.pickle', 'wb') as f:
    pickle.dump(char_to_id, f)
from sklearn.model_selection import train_test_split

train_x, test_x, train_x_len, test_x_len, train_y, test_y, train_y_len, test_y_len \
    = train_test_split(a, b, c, d, test_size=0.1, shuffle=True)

with open('test_x.pickle', 'wb') as f:
    pickle.dump(test_x, f)

with open('test_y.pickle', 'wb') as f:
    pickle.dump(test_y, f)

# reverse each input sequence, as done in ゼロから作るDeep Learning ❷
X = [x[::-1] for x in train_x]

from mxnet.gluon import data
dataset = data.dataset.ArrayDataset(X, train_x_len, train_y, train_y_len)
train_iter = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
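Before training it is worth pulling one batch to confirm the shapes (expected values assume batch_size = 128, 29-id questions, and 11-id answers from load_data):

for X_b, X_b_len, Y_b, Y_b_len in train_iter:
    print(X_b.shape, X_b_len.shape, Y_b.shape, Y_b_len.shape)
    # expected: (128, 29) (128,) (128, 11) (128,)
    break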

vocab_size = len(id_to_char)
embed_size = 16
num_hiddens = 256
num_layers = 1
dropout = 0.0

lr = 0.005
num_epochs = 3
ctx = mx.gpu()

encoder = d2l.Seq2SeqEncoder(
    vocab_size, embed_size, num_hiddens, num_layers, dropout)
decoder = d2l.Seq2SeqAttentionDecoder(
    vocab_size, embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
d2l.train_seq2seq(model, train_iter, lr, num_epochs, ctx)

# save the trained weights so the prediction script below can reload them
# (the file name 'model.params' is arbitrary)
model.save_parameters('model.params')


import d2l
import mxnet as mx
import pickle

with open('id_to_char.pickle', 'rb') as f:
    id_to_char = pickle.load(f)

with open('char_to_id.pickle', 'rb') as f:
    char_to_id = pickle.load(f)

with open('test_x.pickle', 'rb') as f:
    test_x = pickle.load(f)

with open('test_y.pickle', 'rb') as f:
    test_y = pickle.load(f)

vocab_size = len(id_to_char)
embed_size = 16
num_hiddens = 256
num_layers = 1

ctx = mx.gpu()

test_id = 1234

encoder = d2l.Seq2SeqEncoder(
    vocab_size, embed_size, num_hiddens, num_layers)
decoder = d2l.Seq2SeqAttentionDecoder(
    vocab_size, embed_size, num_hiddens, num_layers)
model = d2l.EncoderDecoder(encoder, decoder)
# reload the weights saved after training
model.load_parameters('model.params', ctx=ctx)

# all questions share the same padded length, so use it as the valid length
enc_valid_length = mx.nd.array([len(test_x[test_id])], ctx=ctx)
# the input sequence is reversed, exactly as during training
enc_X = mx.nd.array(test_x[test_id][::-1], ctx=ctx)

enc_outputs = model.encoder(enc_X.expand_dims(axis=0), enc_valid_length)
dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
# '_' marks the start of the answer, so it serves as the decoder's start token
dec_X = mx.nd.array([char_to_id['_']], ctx=ctx).expand_dims(axis=0)

predict_tokens = []

for _ in range(10):
    Y, dec_state = model.decoder(dec_X, dec_state)
    # the token with the highest score is used as the next time step input
    dec_X = Y.argmax(axis=2)
    py = dec_X.squeeze(axis=0).astype('int32').asscalar()
    predict_tokens.append(py)

print(''.join([id_to_char[c] for c in test_x[test_id]
               if c != char_to_id['<pad>']]))              # question
print(''.join([id_to_char[c] for c in predict_tokens]))    # prediction
print(''.join([id_to_char[c] for c in test_y[test_id]]))   # correct answer