Seq2Seq with Attention in MXNet

Introduction

ゼロから作るDeep Learning ❷ ―自然言語処理編

Using the date dataset introduced in that book, I will try out the Seq2Seq with Attention model from "Dive into Deep Learning".

Create a function that loads the data (sequence.py)

from collections import Counter

def load_data():
    questions_char = []
    answers_char = []

    freq = Counter()

    with open('date.txt', 'r') as f:

        for line in f:
            chars = [c for c in line]
            freq.update(set(chars))
            idx = line.find('_')
            questions_char.append(line[:idx].strip())
            answers_char.append(line[idx:-1])  # the last character is a newline
        
    vocab = sorted(freq.keys())
    vocab.remove('\n')
    vocab.append('<pad>')
    # assign IDs starting at 0 so that every ID stays inside [0, vocab_size)
    id_to_char = {i: t for i, t in enumerate(vocab)}
    char_to_id = {t: i for i, t in enumerate(vocab)}

    # convert the character strings into ID sequences
    questions = []
    questions_valid_len = []
    answers = []
    answers_valid_len = []

    for q in questions_char:
        ids = [char_to_id[c] for c in q]
        # pad every question to a fixed length of 29 IDs
        if len(ids) < 29:
            ids = ids + [char_to_id['<pad>']] * (29 - len(ids))
        # deliberately record the valid length after padding (see the conclusion)
        questions_valid_len.append(len(ids))
        questions.append(ids)

    for a in answers_char:
        ids = [char_to_id[c] for c in a]
        answers_valid_len.append(len(ids))
        answers.append(ids)

    return questions, questions_valid_len, answers, answers_valid_len, id_to_char, char_to_id
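
For reference, here is a minimal check of what load_data returns, assuming date.txt sits next to sequence.py and has the format the function expects (a question, an '_' separator, then the normalized date):

import sequence

questions, q_len, answers, a_len, id_to_char, char_to_id = sequence.load_data()

# every question is padded to 29 IDs, and its valid length is recorded as 29
print(len(questions), len(questions[0]), q_len[0])
# the answer keeps the leading '_' marker, e.g. something like _1994-09-27
print(''.join(id_to_char[i] for i in answers[0]))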

Modify d2l.py from "Dive into Deep Learning"

Add the following:

class Seq2SeqAttentionDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention_cell = d2l.MLPAttention(num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.LSTM(num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Dense(vocab_size, flatten=False)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        outputs, hidden_state = enc_outputs
        # Transpose outputs to (batch_size, seq_len, hidden_size)
        return (outputs.swapaxes(0,1), hidden_state, enc_valid_len)

    def forward(self, X, state):
        enc_outputs, hidden_state, enc_valid_len = state
        X = self.embedding(X).swapaxes(0, 1)
        outputs = []
        for x in X:
            # query shape: (batch_size, 1, hidden_size)
            query = hidden_state[0][-1].expand_dims(axis=1)
            # context has same shape as query
            context = self.attention_cell(
                query, enc_outputs, enc_outputs, enc_valid_len)
            # concatenate on the feature dimension
            x = nd.concat(context, x.expand_dims(axis=1), dim=-1)
            # reshape x to (1, batch_size, embed_size+hidden_size)
            out, hidden_state = self.rnn(x.swapaxes(0, 1), hidden_state)
            outputs.append(out)
        outputs = self.dense(nd.concat(*outputs, dim=0))
        return outputs.swapaxes(0, 1), [enc_outputs, hidden_state,
                                        enc_valid_len]

def train_seq2seq(model, data_iter, lr, num_epochs, ctx):
    model.initialize(init.Xavier(), force_reinit=True, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(),
                            'adam', {'learning_rate': lr})
    loss = MaskedSoftmaxCELoss()
    for epoch in range(1, num_epochs+1):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # loss_sum, num_tokens
        for batch in data_iter:
            X, X_vlen, Y, Y_vlen = [x.astype(np.float32).as_in_context(ctx) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:,:-1], Y[:,1:], Y_vlen-1
            with autograd.record():
                Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
                l = loss(Y_hat, Y_label, Y_vlen)
            l.backward()
            d2l.grad_clipping(model, 1)
            num_tokens = Y_vlen.sum().asscalar()
            trainer.step(num_tokens)
            metric.add(l.sum().asscalar(), num_tokens)
        print('epoch = %d  loss = %.3f'%(epoch, metric[0]/metric[1]))

    model.save_parameters('seq2seq_model.params')
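
Before training, the attention decoder can be sanity-checked with dummy inputs, in the same spirit as the shape checks in "Dive into Deep Learning". The sizes below are arbitrary, and the snippet assumes the two blocks above were added to d2l.py:

import d2l
from mxnet import nd

encoder = d2l.Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
encoder.initialize()
decoder = d2l.Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
decoder.initialize()

X = nd.zeros((4, 7))                          # (batch_size, seq_len)
state = decoder.init_state(encoder(X), None)  # no masking for this check
out, state = decoder(X, state)
print(out.shape)                              # expected: (4, 7, 10)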

Train the model

import d2l
import sequence
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn, rnn

import pickle

batch_size = 128

#src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(batch_size, num_steps)
# a: questions, b: question valid lengths, c: answers, d: answer valid lengths
a, b, c, d, id_to_char, char_to_id = sequence.load_data()

with open('id_to_char.pickle', 'wb') as f:
    pickle.dump(id_to_char, f)

with open('char_to_id.pickle', 'wb') as f:
    pickle.dump(char_to_id, f)
   
from sklearn.model_selection import train_test_split
train_x, test_x, train_x_len, test_x_len, train_y, test_y, train_y_len, test_y_len \
    = train_test_split(a, b, c, d, test_size=0.1, shuffle=True)

with open('test_x.pickle', 'wb') as f:
    pickle.dump(test_x, f)

with open('test_y.pickle', 'wb') as f:
    pickle.dump(test_y, f)

# reverse the input (question) sequences
X = [x[::-1] for x in train_x]

from mxnet.gluon import data
dataset = data.dataset.ArrayDataset(X, train_x_len, train_y, train_y_len)
train_iter = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

vocab_size = len(id_to_char)
embed_size = 16
num_hiddens = 256
num_layers = 1
dropout = 0.0

lr = 0.005
num_epochs = 3
ctx = mx.gpu()

encoder = d2l.Seq2SeqEncoder(
    vocab_size, embed_size, num_hiddens, num_layers, dropout)
decoder = d2l.Seq2SeqAttentionDecoder(
    vocab_size, embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
d2l.train_seq2seq(model, train_iter, lr, num_epochs, ctx)
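
As an optional check that is not in the original script, the shape of one mini-batch can be printed; with date.txt, questions come out as 29 IDs each and answers as 11 IDs each ('_' plus the 10-character date):

# assumes the variables from the training script above are still in scope
for X, X_vlen, Y, Y_vlen in train_iter:
    print(X.shape, X_vlen.shape, Y.shape, Y_vlen.shape)
    # e.g. (128, 29) (128,) (128, 11) (128,)
    break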

Check with the test data

import d2l
import mxnet as mx
import pickle

with open('id_to_char.pickle', 'rb') as f:
    id_to_char = pickle.load(f)

with open('char_to_id.pickle', 'rb') as f:
    char_to_id = pickle.load(f)

with open('test_x.pickle', 'rb') as f:
    test_x = pickle.load(f)

with open('test_y.pickle', 'rb') as f:
    test_y = pickle.load(f)

vocab_size = len(id_to_char)
embed_size = 16
num_hiddens = 256
num_layers = 1

ctx = mx.gpu()

test_id = 1234

encoder = d2l.Seq2SeqEncoder(
    vocab_size, embed_size, num_hiddens, num_layers)
decoder = d2l.Seq2SeqAttentionDecoder(
    vocab_size, embed_size, num_hiddens, num_layers)
model = d2l.EncoderDecoder(encoder, decoder)

model.load_parameters('seq2seq_model.params', ctx=ctx)

# all questions are padded to the same length, so use the length of the first one
enc_valid_length = mx.nd.array([len(test_x[0])], ctx=ctx)
enc_X = mx.nd.array(test_x[test_id][::-1], ctx=ctx)

enc_outputs = model.encoder(enc_X.expand_dims(axis=0), enc_valid_length)
dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
dec_X = mx.nd.array([char_to_id['_']], ctx=ctx).expand_dims(axis=0)

predict_tokens = []

for _ in range(10):
    Y, dec_state = model.decoder(dec_X, dec_state)
    # The token with highest score is used as the next time step input.
    dec_X = Y.argmax(axis=2)
    py = dec_X.squeeze(axis=0).astype('int32').asscalar()
    predict_tokens.append(py)
    
print(''.join([id_to_char[c] for c in predict_tokens]))   # predicted answer
print(''.join([id_to_char[c] for c in test_x[test_id]]))  # input question
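
For comparison, the ground-truth answer for the same example can be printed as well (this line is not in the original script):

print(''.join([id_to_char[c] for c in test_y[test_id]]))  # ground-truth answer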

Conclusion

Using masked_softmax without reversing the input data did not give good results.
Reversing the input data and fixing the valid length to the sequence length after padding gave good results.
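
To illustrate how the valid length interacts with masking, here is a small sketch using the masked_softmax helper from the book's d2l.py (assuming it is exposed as d2l.masked_softmax): when the valid length equals the padded length, no position is masked, so the attention weights also cover the padding characters.

import d2l
from mxnet import nd

scores = nd.random.uniform(shape=(2, 2, 4))
# true valid lengths 2 and 3: the weights of the remaining positions become (almost) zero
print(d2l.masked_softmax(scores, nd.array([2, 3])))
# valid length fixed to the padded length 4: no position is masked out
print(d2l.masked_softmax(scores, nd.array([4, 4])))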