はじめに
ゼロから作るDeep Learning ❷ ―自然言語処理編
- 作者: 斎藤康毅
- 出版社/メーカー: オライリージャパン
- 発売日: 2018/07/21
- メディア: 単行本(ソフトカバー)
- この商品を含むブログ (3件) を見る
データをロードする関数を作る(「sequence.py」)
from collections import Counter


def load_data(path='date.txt', max_len=29):
    """Load the date-conversion dataset and encode it as character-ID lists.

    Each line of *path* has the form '<question>_<answer>\n': the text up
    to the first '_' is the question (space-padded in the raw file), and
    the rest of the line — including the '_' itself — is the answer.

    Args:
        path: dataset file to read (default keeps the original 'date.txt').
        max_len: questions are right-padded with '<pad>' up to this length
            (default keeps the original fixed length of 29).  Longer
            questions are kept as-is, matching the original behavior.

    Returns:
        Tuple of (questions, questions_valid_len, answers,
        answers_valid_len, id_to_char, char_to_id).  IDs start at 1 and
        '<pad>' is part of the vocabulary.  Note: questions_valid_len
        deliberately records the length AFTER padding (i.e. max_len) —
        fixing the valid length to the padded length gave better results
        (see the accompanying notes).
    """
    questions_char = []
    answers_char = []
    freq = Counter()
    with open(path, 'r') as f:
        for line in f:
            # Only the set of characters matters (freq is used just for
            # its keys), so count each character once per line.
            freq.update(set(line))
            idx = line.find('_')
            questions_char.append(line[:idx].strip())
            answers_char.append(line[idx:-1])  # last character is the newline
    vocab = sorted(freq)
    if '\n' in freq:
        # The newline is a line terminator, not a vocabulary token.
        vocab.remove('\n')
    vocab.append('<pad>')
    id_to_char = {i + 1: t for i, t in enumerate(vocab)}
    char_to_id = {t: i + 1 for i, t in enumerate(vocab)}

    # Convert the character strings to ID sequences.
    questions = []
    questions_valid_len = []
    answers = []
    answers_valid_len = []
    pad_id = char_to_id['<pad>']
    for q in questions_char:
        ids = [char_to_id[c] for c in q]
        if len(ids) < max_len:
            ids = ids + [pad_id] * (max_len - len(ids))
        # Intentionally the padded length, not the pre-padding one.
        questions_valid_len.append(len(ids))
        questions.append(ids)
    for a in answers_char:
        ids = [char_to_id[c] for c in a]
        answers_valid_len.append(len(ids))
        answers.append(ids)
    return (questions, questions_valid_len, answers, answers_valid_len,
            id_to_char, char_to_id)
「Dive into Deep Learning」の「d2l.py」を変更
以下を追加する
class Seq2SeqAttentionDecoder(d2l.Decoder):
    """Seq2seq decoder that attends over the encoder outputs.

    An MLP (additive) attention cell computes a context vector from the
    encoder outputs at every decoding step; the context is concatenated
    with the embedded input token before the LSTM step.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention_cell = d2l.MLPAttention(num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.LSTM(num_hiddens, num_layers, dropout=dropout)
        # flatten=False keeps the per-timestep (…, vocab_size) output shape.
        self.dense = nn.Dense(vocab_size, flatten=False)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        """Package the encoder results into the decoder's initial state."""
        outputs, hidden_state = enc_outputs
        # Transpose outputs to (batch_size, seq_len, hidden_size)
        return (outputs.swapaxes(0, 1), hidden_state, enc_valid_len)

    def forward(self, X, state):
        """Decode X one timestep at a time, attending over encoder outputs."""
        enc_outputs, hidden_state, enc_valid_len = state
        X = self.embedding(X).swapaxes(0, 1)
        outputs = []
        for x in X:
            # query shape: (batch_size, 1, hidden_size)
            query = hidden_state[0][-1].expand_dims(axis=1)
            # context has same shape as query
            context = self.attention_cell(
                query, enc_outputs, enc_outputs, enc_valid_len)
            # concatenate on the feature dimension
            x = nd.concat(context, x.expand_dims(axis=1), dim=-1)
            # reshape x to (1, batch_size, embed_size+hidden_size)
            out, hidden_state = self.rnn(x.swapaxes(0, 1), hidden_state)
            outputs.append(out)
        outputs = self.dense(nd.concat(*outputs, dim=0))
        return outputs.swapaxes(0, 1), [enc_outputs, hidden_state,
                                        enc_valid_len]


def train_seq2seq(model, data_iter, lr, num_epochs, ctx):
    """Train *model* with Adam and masked softmax cross-entropy.

    Prints the per-token loss after each epoch and writes the final
    parameters to 'seq2seq_model.params'.
    """
    model.initialize(init.Xavier(), force_reinit=True, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': lr})
    loss = MaskedSoftmaxCELoss()
    for epoch in range(1, num_epochs + 1):
        timer = d2l.Timer()  # NOTE(review): created but never read here.
        metric = d2l.Accumulator(2)  # loss_sum, num_tokens
        for batch in data_iter:
            X, X_vlen, Y, Y_vlen = [x.astype(np.float32).as_in_context(ctx)
                                    for x in batch]
            # Teacher forcing: feed Y[:-1] and predict Y[1:] (presumably
            # Y's first token is the '_' start symbol — confirm).
            Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
            with autograd.record():
                Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
                l = loss(Y_hat, Y_label, Y_vlen)
            l.backward()
            d2l.grad_clipping(model, 1)
            # Normalize the gradient step by the number of real tokens.
            num_tokens = Y_vlen.sum().asscalar()
            trainer.step(num_tokens)
            metric.add(l.sum().asscalar(), num_tokens)
        print('epoch = %d loss = %.3f' % (epoch, metric[0] / metric[1]))
    model.save_parameters('seq2seq_model.params')
学習を行う
"""Train the attention seq2seq model on the date dataset."""
import pickle

import d2l
import mxnet as mx
from mxnet import nd
from mxnet.gluon import data, nn, rnn
from sklearn.model_selection import train_test_split

import sequence

batch_size = 128

# Build the dataset and persist the vocabulary for the prediction script.
questions, q_valid_len, answers, a_valid_len, id_to_char, char_to_id = \
    sequence.load_data()
with open('id_to_char.pickle', 'wb') as f:
    pickle.dump(id_to_char, f)
with open('char_to_id.pickle', 'wb') as f:
    pickle.dump(char_to_id, f)

# Hold out 10% of the data; the test split is saved for later evaluation.
(train_x, test_x, train_x_len, test_x_len,
 train_y, test_y, train_y_len, test_y_len) = train_test_split(
    questions, q_valid_len, answers, a_valid_len,
    test_size=0.1, shuffle=True)
with open('test_x.pickle', 'wb') as f:
    pickle.dump(test_x, f)
with open('test_y.pickle', 'wb') as f:
    pickle.dump(test_y, f)

# Reverse every input sequence (this improved results; see the notes).
reversed_train_x = [seq_ids[::-1] for seq_ids in train_x]
train_iter = data.DataLoader(
    data.dataset.ArrayDataset(reversed_train_x, train_x_len,
                              train_y, train_y_len),
    batch_size=batch_size, shuffle=True)

# Model hyperparameters.
vocab_size = len(id_to_char)
embed_size, num_hiddens, num_layers = 16, 256, 1
dropout, lr, num_epochs = 0.0, 0.005, 3
ctx = mx.gpu()

encoder = d2l.Seq2SeqEncoder(vocab_size, embed_size, num_hiddens,
                             num_layers, dropout)
decoder = d2l.Seq2SeqAttentionDecoder(vocab_size, embed_size, num_hiddens,
                                      num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
d2l.train_seq2seq(model, train_iter, lr, num_epochs, ctx)
テストデータで確認
"""Decode one held-out test sample with the trained seq2seq model."""
import d2l
import mxnet as mx
import pickle

# Restore the vocabulary and the held-out test split saved during training.
with open('id_to_char.pickle', 'rb') as f:
    id_to_char = pickle.load(f)
with open('char_to_id.pickle', 'rb') as f:
    char_to_id = pickle.load(f)
with open('test_x.pickle', 'rb') as f:
    test_x = pickle.load(f)
with open('test_y.pickle', 'rb') as f:
    test_y = pickle.load(f)

# Hyperparameters must match the training script.
vocab_size = len(id_to_char)
embed_size = 16
num_hiddens = 256
num_layers = 1
ctx = mx.gpu()
test_id = 1234  # index of the sample to decode

# Rebuild the model and load the trained weights.
encoder = d2l.Seq2SeqEncoder(
    vocab_size, embed_size, num_hiddens, num_layers)
decoder = d2l.Seq2SeqAttentionDecoder(
    vocab_size, embed_size, num_hiddens, num_layers)
model = d2l.EncoderDecoder(encoder, decoder)
model.load_parameters('seq2seq_model.params', ctx=ctx)
ctx = mx.gpu()  # NOTE(review): redundant — ctx is already set above.

# NOTE(review): the valid length comes from test_x[0], not test_x[test_id];
# this works only because every question is padded to the same fixed length
# (and the notes say fixing it to the padded length was deliberate).
enc_valid_length = mx.nd.array(len(test_x[0]), ctx=ctx)
# Reverse the input sequence, matching how the model was trained.
enc_X = mx.nd.array(test_x[test_id][::-1], ctx=ctx)
enc_outputs = model.encoder(enc_X.expand_dims(axis=0), enc_valid_length)
dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
# The decoder is primed with the '_' separator as the start token.
dec_X = mx.nd.array([char_to_id['_']], ctx=ctx).expand_dims(axis=0)

predict_tokens = []
for _ in range(10):  # greedy decoding for a fixed 10 steps
    Y, dec_state = model.decoder(dec_X, dec_state)
    # The token with highest score is used as the next time step input.
    dec_X = Y.argmax(axis=2)
    py = dec_X.squeeze(axis=0).astype('int32').asscalar()
    predict_tokens.append(py)

print(''.join([id_to_char[c] for c in predict_tokens]))   # prediction
print(''.join([id_to_char[c] for c in test_x[test_id]]))  # padded question
最後に
入力データの反転を行わずに「masked_softmax」を使うと良い結果が得られなかった。
入力データの反転を行い、有効文字数をパディングした後の文字数に固定すると良い結果が得られた。