MXNet in R (4)

Let's try Kaggle's "Dogs vs. Cats" competition.
I used this page as a reference.

  • Download train.zip (25,000 JPEG images) and extract it.
  • Sort the cat and dog images into a cat folder and a dog folder, respectively (one way to do this in R is sketched below).
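
A sketch of the sorting step, assuming the standard Kaggle file names (cat.0.jpg, dog.0.jpg, ...) and the folder used below:

# Sort the extracted images into cat/ and dog/ subfolders
src <- "D:/rworks/dogcat/train"
dir.create(file.path(src, "cat"), showWarnings = FALSE)
dir.create(file.path(src, "dog"), showWarnings = FALSE)
cat_files <- list.files(src, pattern = "^cat\\..*\\.jpg$", full.names = TRUE)
dog_files <- list.files(src, pattern = "^dog\\..*\\.jpg$", full.names = TRUE)
file.rename(cat_files, file.path(src, "cat", basename(cat_files)))
file.rename(dog_files, file.path(src, "dog", basename(dog_files)))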

Preparing the data

library(EBImage)

# Convert the JPEG images into an array
x_train <- array(0, c(56, 56, 3, 25000))

path <- "D:/rworks/dogcat/train/cat/"
setwd(path)
files_cat <- list.files()

for (i in 1:12500) {
    img <- readImage(files_cat[i])
    resize <- resize(img, w = 56, h = 56)
    x_train[,,,i] <- resize@.Data
}

path <- "D:/rworks/dogcat/train/dog/"
setwd(path)
files_dog <- list.files()

for (i in 1:12500) {
    img <- readImage(files_dog[i])
    resize <- resize(img, w = 56, h = 56)
    x_train[,,, 12500 + i] <- resize@.Data
}

# Labels (cat: 0, dog: 1)
t_train <- c(rep(0, 12500), rep(1, 12500))

# Use 22,500 of the 25,000 images as training data and the rest as test data
sampling <- sample(25000, 22500)
x_train_mini <- x_train[,,, sampling]
t_train_mini <- t_train[sampling]
x_test_mini <- x_train[,,, - sampling]
t_test_mini <- t_train[-sampling]
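
A quick shape check (an optional sketch) to confirm the split:

dim(x_train_mini)     # 56 56 3 22500
dim(x_test_mini)      # 56 56 3 2500
table(t_test_mini)    # roughly 1250 cats (0) and 1250 dogs (1)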

Building the neural network

library(mxnet)

mydata <- mx.symbol.Variable("data")

conv1 <- mx.symbol.Convolution(
    data = mydata,
    kernel = c(3, 3),
    num_filter = 32)

act1 <- mx.symbol.Activation(data = conv1, act_type = "relu")

pool1 <- mx.symbol.Pooling(
    data = act1,
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))

conv2 <- mx.symbol.Convolution(
    data = pool1,
    kernel = c(3, 3),
    num_filter = 64)

act2 <- mx.symbol.Activation(data = conv2, act_type = "relu")

pool2 <- mx.symbol.Pooling(
    data = act2,
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))

fc1 <- mx.symbol.FullyConnected(data = pool2, num_hidden = 200)

drop1 <- mx.symbol.Dropout(data=fc1)

act3 <- mx.symbol.Activation(data = drop1, act_type = "relu")

fc2 <- mx.symbol.FullyConnected(data = act3, num_hidden = 2)

drop2 <- mx.symbol.Dropout(data = fc2)

softmax <- mx.symbol.SoftmaxOutput(data = drop2)

mx.set.seed(1)

model <- mx.model.FeedForward.create(
    softmax, X = x_train_mini,
    y = t_train_mini,
    ctx = mx.cpu(),
    num.round = 30,
    array.batch.size = 100,
    optimizer = "adam",
    eval.metric = mx.metric.accuracy,
    initializer = mx.init.Xavier(factor_type = "in", rnd_type = "gaussian", magnitude = 2),
    eval.data = list(data = x_test_mini, label = t_test_mini),
    array.layout = "auto")

Results

Start training with 1 devices
[1] Train-accuracy=0.557276785714286
[1] Validation-accuracy=0.6684
[2] Train-accuracy=0.627555555555556
[2] Validation-accuracy=0.6976
[3] Train-accuracy=0.663066666666667
[3] Validation-accuracy=0.7332

・・・

[28] Train-accuracy=0.815111111111111
[28] Validation-accuracy=0.7948
[29] Train-accuracy=0.820444444444445
[29] Validation-accuracy=0.7892
[30] Train-accuracy=0.817377777777778
[30] Validation-accuracy=0.796

Test data accuracy: 79.6%

> pred <- predict(model, x_train_mini)
> pred_train <- apply(pred, 2, which.max) - 1
> sum(pred_train == t_train_mini) / 22500
[1] 0.9488

Training data accuracy: 94.9%
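
The test-set figure can be re-checked the same way (a sketch; it should roughly match the 79.6% above):

pred <- predict(model, x_test_mini)
pred_test <- apply(pred, 2, which.max) - 1
sum(pred_test == t_test_mini) / length(t_test_mini)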

See this page for how to save the trained model.

mx.model.save.RData <- function(model, filename) {
    if (!inherits(model, "MXFeedForwardModel")) stop("Not a MXNet model!")
    model_rdata <- list()
    model_rdata[['symbol_json']] <- model$symbol$as.json()
    model_rdata[['arg.params']] <- lapply(model$arg.params, as.array)
    model_rdata[['aux.params']] <- lapply(model$aux.params, as.array)
    saveRDS(model_rdata, filename)
}

mx.model.load.RData <- function(filename) {
    model_rdata <- readRDS(filename)
    symbol <- mx.symbol.load.json(model_rdata$symbol_json)
    arg.params <- lapply(model_rdata$arg.params, mx.nd.array)
    aux.params <- lapply(model_rdata$aux.params, mx.nd.array)
    model <- list(symbol = symbol, arg.params = arg.params, aux.params = aux.params)
    return(structure(model, class = "MXFeedForwardModel"))
}
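
Usage is then just (the file name here is arbitrary):

mx.model.save.RData(model, "dogcat_model.rds")
model_reloaded <- mx.model.load.RData("dogcat_model.rds")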

(Added June 27, 2017) This approach is recommended instead.

Splitting the first convolutional layer (32 filters) into two convolutional layers of 16 filters each

  • Slight improvement
library(mxnet)

mydata <- mx.symbol.Variable("data")

conv1 <- mx.symbol.Convolution(
    data = mydata,
    kernel = c(3, 3),
    num_filter = 16)

conv1_5 <- mx.symbol.Convolution(
    data = conv1,
    kernel = c(3, 3),
    num_filter = 16)

act1 <- mx.symbol.Activation(data = conv1_5, act_type = "relu")

pool1 <- mx.symbol.Pooling(
    data = act1,
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))

conv2 <- mx.symbol.Convolution(
    data = pool1,
    kernel = c(3, 3),
    num_filter = 64)

act2 <- mx.symbol.Activation(data = conv2, act_type = "relu")

pool2 <- mx.symbol.Pooling(
    data = act2,
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))

fc1 <- mx.symbol.FullyConnected(data = pool2, num_hidden = 200)

drop1 <- mx.symbol.Dropout(data = fc1)

act3 <- mx.symbol.Activation(data = drop1, act_type = "relu")

fc2 <- mx.symbol.FullyConnected(data = act3, num_hidden = 2)

drop2 <- mx.symbol.Dropout(data = fc2)

softmax <- mx.symbol.SoftmaxOutput(data = drop2)

mx.set.seed(1)

model <- mx.model.FeedForward.create(
    softmax, X = x_train_mini,
    y = t_train_mini,
    ctx = mx.cpu(),
    num.round = 30,
    array.batch.size = 100,
    optimizer = "adam",
    eval.metric = mx.metric.accuracy,
    initializer = mx.init.Xavier(factor_type = "in", rnd_type = "gaussian", magnitude = 2),
    eval.data = list(data = x_test_mini, label = t_test_mini),
    array.layout = "auto")

Results

Start training with 1 devices
[1] Train-accuracy=0.585848214285714
[1] Validation-accuracy=0.7144
[2] Train-accuracy=0.644711111111111
[2] Validation-accuracy=0.7272
[3] Train-accuracy=0.675244444444445
[3] Validation-accuracy=0.7536

・・・

[28] Train-accuracy=0.823822222222223
[28] Validation-accuracy=0.79
[29] Train-accuracy=0.815111111111112
[29] Validation-accuracy=0.7984
[30] Train-accuracy=0.826044444444445
[30] Validation-accuracy=0.806

Inserting an activation function between the first and second convolutional layers

  • Almost no change
library(mxnet)

mydata <- mx.symbol.Variable("data")

conv1 <- mx.symbol.Convolution(
    data = mydata,
    kernel = c(3, 3),
    num_filter = 16)

act0 <- mx.symbol.Activation(data = conv1, act_type = "relu")

conv1_5 <- mx.symbol.Convolution(
    data = act0,
    kernel = c(3, 3),
    num_filter = 16)

act1 <- mx.symbol.Activation(data = conv1_5, act_type = "relu")

pool1 <- mx.symbol.Pooling(
    data = act1,
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))

conv2 <- mx.symbol.Convolution(
    data = pool1,
    kernel = c(3, 3),
    num_filter = 64)

act2 <- mx.symbol.Activation(data = conv2, act_type = "relu")

pool2 <- mx.symbol.Pooling(
    data = act2,
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))

fc1 <- mx.symbol.FullyConnected(data = pool2, num_hidden = 200)

drop1 <- mx.symbol.Dropout(data = fc1)

act3 <- mx.symbol.Activation(data = drop1, act_type = "relu")

fc2 <- mx.symbol.FullyConnected(data = act3, num_hidden = 2)

drop2 <- mx.symbol.Dropout(data = fc2)

softmax <- mx.symbol.SoftmaxOutput(data = drop2)

mx.set.seed(1)

model <- mx.model.FeedForward.create(
    softmax, X = x_train_mini,
    y = t_train_mini,
    ctx = mx.cpu(),
    num.round = 30,
    array.batch.size = 100,
    optimizer = "adam",
    eval.metric = mx.metric.accuracy,
    initializer = mx.init.Xavier(factor_type = "in", rnd_type = "gaussian", magnitude = 2),
    eval.data = list(data = x_test_mini, label = t_test_mini),
    array.layout = "auto")

Results

Start training with 1 devices
[1] Train-accuracy=0.558973214285714
[1] Validation-accuracy=0.6536
[2] Train-accuracy=0.642577777777778
[2] Validation-accuracy=0.7352
[3] Train-accuracy=0.681155555555555
[3] Validation-accuracy=0.7532

・・・

[28] Train-accuracy=0.824933333333334
[28] Validation-accuracy=0.8012
[29] Train-accuracy=0.820311111111112
[29] Validation-accuracy=0.8072
[30] Train-accuracy=0.827511111111112
[30] Validation-accuracy=0.798

Increasing the second convolutional layer to 32 filters and the fully connected layer to 300 neurons

  • Starting to feel the limits of the CPU (reduced num.round to 20)
  • No real improvement for the time spent
library(mxnet)

mydata <- mx.symbol.Variable("data")

conv1 <- mx.symbol.Convolution(
    data = mydata,
    kernel = c(3, 3),
    num_filter = 16)

conv1_5 <- mx.symbol.Convolution(
    data = conv1,
    kernel = c(3, 3),
    num_filter = 32)

act1 <- mx.symbol.Activation(data = conv1_5, act_type = "relu")

pool1 <- mx.symbol.Pooling(
    data = act1,
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))

conv2 <- mx.symbol.Convolution(
    data = pool1,
    kernel = c(3, 3),
    num_filter = 64)

act2 <- mx.symbol.Activation(data = conv2, act_type = "relu")

pool2 <- mx.symbol.Pooling(
    data = act2,
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))

fc1 <- mx.symbol.FullyConnected(data = pool2, num_hidden = 300)

drop1 <- mx.symbol.Dropout(data = fc1)

act3 <- mx.symbol.Activation(data = drop1, act_type = "relu")

fc2 <- mx.symbol.FullyConnected(data = act3, num_hidden = 2)

drop2 <- mx.symbol.Dropout(data = fc2)

softmax <- mx.symbol.SoftmaxOutput(data = drop2)

mx.set.seed(1)

model <- mx.model.FeedForward.create(
    softmax, X = x_train_mini,
    y = t_train_mini,
    ctx = mx.cpu(),
    num.round = 20,
    array.batch.size = 100,
    optimizer = "adam",
    eval.metric = mx.metric.accuracy,
    initializer = mx.init.Xavier(factor_type = "in", rnd_type = "gaussian", magnitude = 2),
    eval.data = list(data = x_test_mini, label = t_test_mini),
    array.layout = "auto")

Results

Start training with 1 devices
[1] Train-accuracy=0.5896875
[1] Validation-accuracy=0.7084
[2] Train-accuracy=0.6484
[2] Validation-accuracy=0.7348
[3] Train-accuracy=0.692933333333333
[3] Validation-accuracy=0.74

・・・

[18] Train-accuracy=0.852711111111111
[18] Validation-accuracy=0.7916
[19] Train-accuracy=0.853066666666668
[19] Validation-accuracy=0.7856
[20] Train-accuracy=0.855155555555556
[20] Validation-accuracy=0.7948

Manually cropping roughly 200-300 of the training images to remove the background

  • Cropped to squares so the images are easy to shrink later
  • The result improved slightly
  • It made me realize again how important the training data is
Start training with 1 devices
[1] Train-accuracy=0.581026785714286
[1] Validation-accuracy=0.6712
[2] Train-accuracy=0.654933333333333
[2] Validation-accuracy=0.7428
[3] Train-accuracy=0.689022222222222
[3] Validation-accuracy=0.7696

・・・

[18] Train-accuracy=0.853777777777778
[18] Validation-accuracy=0.8084
[19] Train-accuracy=0.855688888888889
[19] Validation-accuracy=0.8132
[20] Train-accuracy=0.854355555555556
[20] Validation-accuracy=0.8172

MXNet in R (3)

Aiming for 99%+ accuracy on MNIST!
That said, this is just the neural network introduced in the book below, rewritten in MXNet.

Initial setup

library(mxnet)

Loading the data

# Load the training data
x_train <- array(t(readRDS("x_train")),c(28,28,1,60000)) / 255
t_train <- apply(readRDS("t_train"), 1, which.max) - 1

# Load the test data
x_test <- array(t(readRDS("x_test")), c(28, 28, 1, 10000))
t_test <- apply(readRDS("t_test"), 1, which.max) - 1

Building the neural network

mydata <- mx.symbol.Variable("data")

conv1 <- mx.symbol.Convolution(
   data=mydata,
   kernel = c(3,3),
   pad = c(1,1),
   stride = c(1,1),
   num_filter = 16)

act1 <- mx.symbol.Activation(data=conv1, act_type = "relu")

conv2 <- mx.symbol.Convolution(
   data=act1,
   kernel = c(3,3),
   pad = c(1,1),
   stride = c(1,1),
   num_filter = 16)

act2 <- mx.symbol.Activation(data=conv2, act_type = "relu")

pool1 <- mx.symbol.Pooling(
   data=act2,
   pool_type = "max",
   kernel = c(2,2),
   stride = c(2,2))

conv3 <- mx.symbol.Convolution(
   data=pool1,
   kernel = c(3,3),
   pad = c(1,1),
   stride = c(1,1),
   num_filter = 32)

act3 <- mx.symbol.Activation(data=conv3, act_type = "relu")

conv4 <- mx.symbol.Convolution(
   data=act3,
   kernel = c(3,3),
   pad = c(2,2),
   stride = c(1,1),
   num_filter = 32)

act4 <- mx.symbol.Activation(data=conv4, act_type = "relu")

pool2 <- mx.symbol.Pooling(
   data=act4,
   pool_type = "max",
   kernel = c(2,2),
   stride = c(2,2))

conv5 <- mx.symbol.Convolution(
   data=pool2,
   kernel = c(3,3),
   pad = c(1,1),
   stride = c(1,1),
   num_filter = 64)

act5 <- mx.symbol.Activation(data=conv5, act_type = "relu")

conv6 <- mx.symbol.Convolution(
   data=act5,
   kernel = c(3,3),
   pad = c(1,1),
   stride = c(1,1),
   num_filter = 64)

act6 <- mx.symbol.Activation(data=conv6, act_type = "relu")

pool3 <- mx.symbol.Pooling(
   data=act6,
   pool_type = "max",
   kernel = c(2,2),
   stride = c(2,2))

fc1 <- mx.symbol.FullyConnected(data=pool3, num_hidden = 50)

act7 <- mx.symbol.Activation(data=fc1, act_type = "relu")

drop1 <- mx.symbol.Dropout(data=act7)

fc2 <- mx.symbol.FullyConnected(data=drop1, num_hidden = 10)

drop2 <- mx.symbol.Dropout(data=fc2)

softmax <- mx.symbol.SoftmaxOutput(drop2, name = "sm")

mx.set.seed(1)

model <- mx.model.FeedForward.create(
   softmax, X = x_train,
   y = t_train,
   ctx = mx.cpu(),
   num.round = 20,
   array.batch.size = 1000,
   optimizer = "adam",
   eval.metric = mx.metric.accuracy,
   initializer = mx.init.Xavier(factor_type="in",rnd_type="gaussian",magnitude=2),
   eval.data = list(data = x_test, label = t_test),
   array.layout = "auto")

Results

Start training with 1 devices
[1] Train-accuracy=0.233593220338983
[1] Validation-accuracy=0.8341
[2] Train-accuracy=0.354283333333333
[2] Validation-accuracy=0.9439
[3] Train-accuracy=0.411583333333333
[3] Validation-accuracy=0.9697

・・・

[18] Train-accuracy=0.5672
[18] Validation-accuracy=0.9934
[19] Train-accuracy=0.569166666666667
[19] Validation-accuracy=0.9944
[20] Train-accuracy=0.569616666666667
[20] Validation-accuracy=0.9932
  • The Train-accuracy values don't rise the way they should
  • It seems the Dropout layers are not bypassed when Train-accuracy is computed during training
  • Validation-accuracy appears to be computed correctly, with the Dropout layers bypassed

The actual train accuracy

> pred <- predict(model,x_train)
> pred_train <- apply(pred,2,which.max) - 1
> sum(pred_train == t_train)/60000
[1] 0.99485
  • The predict function appears to bypass the Dropout layers and compute the accuracy correctly

MXNet in R (2)

I built a convolutional NN, changing the optimizer.

library(mxnet)

# Load the training data
x_train <- array(t(readRDS("x_train")),c(28,28,1,60000)) / 255
t_train <- apply(readRDS("t_train"), 1, which.max) - 1

# Load the test data
x_test <- array(t(readRDS("x_test")), c(28, 28, 1, 10000))
t_test <- apply(readRDS("t_test"), 1, which.max) - 1

# Convolutional NN
data <- mx.symbol.Variable("data")
conv <- mx.symbol.Convolution(
    data, name = "conv",
    kernel = c(5, 5),
    num_filter = 30)
act1 <- mx.symbol.Activation(conv, name = "relu1", act_type = "relu")
pool <- mx.symbol.Pooling(
    act1, name ="pool",
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))
fc1 <- mx.symbol.FullyConnected(pool, name = "fc1", num_hidden = 100)
act2 <- mx.symbol.Activation(fc1, name = "relu2", act_type = "relu")
fc2 <- mx.symbol.FullyConnected(act2, name = "fc2", num_hidden = 10)
softmax <- mx.symbol.SoftmaxOutput(fc2, name = "sm")

mx.set.seed(400)

model <- mx.model.FeedForward.create(
    softmax, X = x_train,
    y = t_train,
    ctx = mx.cpu(),
    num.round = 15,
    array.batch.size = 1000,
    optimizer = "adam",
    eval.metric = mx.metric.accuracy,
    eval.data = list(data = x_test, label = t_test),
    initializer = mx.init.normal(0.01),
    array.layout = "auto")

Results

Start training with 1 devices
[1] Train-accuracy=0.732254237288135
[1] Validation-accuracy=0.8897
[2] Train-accuracy=0.905133333333333
[2] Validation-accuracy=0.9161
[3] Train-accuracy=0.928166666666666
[3] Validation-accuracy=0.9353
[4] Train-accuracy=0.946616666666667
[4] Validation-accuracy=0.9516

・・・

[13] Train-accuracy=0.987566666666666
[13] Validation-accuracy=0.9833
[14] Train-accuracy=0.988583333333333
[14] Validation-accuracy=0.984
[15] Train-accuracy=0.98945
[15] Validation-accuracy=0.9847

I changed the activation function to tanh and used Xavier initialization.

library(mxnet)

# Load the training data
x_train <- array(t(readRDS("x_train")),c(28,28,1,60000)) / 255
t_train <- apply(readRDS("t_train"), 1, which.max) - 1

# Load the test data
x_test <- array(t(readRDS("x_test")), c(28, 28, 1, 10000))
t_test <- apply(readRDS("t_test"), 1, which.max) - 1

# Convolutional NN
data <- mx.symbol.Variable("data")
conv <- mx.symbol.Convolution(
    data, name = "conv",
    kernel = c(5, 5),
    num_filter = 30)
act1 <- mx.symbol.Activation(conv, name = "tanh1", act_type = "tanh")
pool <- mx.symbol.Pooling(
    act1, name ="pool",
    pool_type = "max",
    kernel = c(2, 2),
    stride = c(2, 2))
fc1 <- mx.symbol.FullyConnected(pool, name = "fc1", num_hidden = 100)
act2 <- mx.symbol.Activation(fc1, name = "tanh2", act_type = "tanh")
fc2 <- mx.symbol.FullyConnected(act2, name = "fc2", num_hidden = 10)
softmax <- mx.symbol.SoftmaxOutput(fc2, name = "sm")

mx.set.seed(400)

model <- mx.model.FeedForward.create(
    softmax, X = x_train,
    y = t_train,
    ctx = mx.cpu(),
    num.round = 15,
    array.batch.size = 1000,
    optimizer = "adam",
    eval.metric = mx.metric.accuracy,
    eval.data = list(data = x_test, label = t_test),
    initializer = mx.init.Xavier(),
    array.layout = "auto")

Results

Start training with 1 devices
[1] Train-accuracy=0.856135593220339
[1] Validation-accuracy=0.9145
[2] Train-accuracy=0.94735
[2] Validation-accuracy=0.9083
[3] Train-accuracy=0.964616666666667
[3] Validation-accuracy=0.8688
[4] Train-accuracy=0.9749
[4] Validation-accuracy=0.8313

・・・

[13] Train-accuracy=0.997116666666667
[13] Validation-accuracy=0.8038
[14] Train-accuracy=0.997566666666666
[14] Validation-accuracy=0.8091
[15] Train-accuracy=0.998033333333334
[15] Validation-accuracy=0.8135

The training accuracy is very high, but the test accuracy is very low.
I think this is overfitting.
Next I'll study Dropout.

Does specifying factor_type = "in" and magnitude = 2 in Xavier give the He initialization?
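
For reference, a minimal sketch of both ideas (the dropout rate is an assumption, not a tuned value): a Dropout layer inserted before the output layer of the network above, and the Xavier settings that should correspond to the He initialization.

# Dropout between the fully connected layers (p = 0.5 assumed)
drop1 <- mx.symbol.Dropout(data = act2, p = 0.5)
fc2 <- mx.symbol.FullyConnected(drop1, name = "fc2", num_hidden = 10)
softmax <- mx.symbol.SoftmaxOutput(fc2, name = "sm")

# He-style initializer expressed through mx.init.Xavier
he_init <- mx.init.Xavier(factor_type = "in", rnd_type = "gaussian", magnitude = 2)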

MXNet in R (1)

I finally took the plunge into deep learning frameworks.
Out of the many options I chose MXNet (mostly because it looked easy to use from R).

Installing the CPU version on Windows is very easy.
Just enter the following at the R prompt.

install.packages("drat")
drat::addRepo("dmlc")
install.packages("mxnet")

Installing the R "mxnet" package also installs MXNet itself,
so you can start using it just by loading the package.

library(mxnet)


(Added July 4, 2017)
If the above doesn't work, try the following.

cran <- getOption("repos")
cran["dmlc"] <- "https://s3-us-west-2.amazonaws.com/apache-mxnet/R/CRAN/"
options(repos = cran)
install.packages("mxnet")

(End of addition)


(Added July 17, 2017)
For the GPU version, see this page.
(End of addition)


"I tried MNIST" articles are all over the web, but detailed Japanese documentation is scarce.
Here I write down what I actually learned from experimenting.

# Load the training data
x_train <- readRDS("x_train") / 255
t_train <- apply(readRDS("t_train"), 1, which.max) - 1

# Load the test data
x_test <- readRDS("x_test")
t_test <- apply(readRDS("t_test"), 1, which.max) - 1
  • Apparently one-hot encoding is not normally used; the labels are passed as class indices

I built a simple two-layer neural network.

# Two-layer NN
data <- mx.symbol.Variable("data")
fc1 <- mx.symbol.FullyConnected(data, name = "fc1", num_hidden = 100)
act1 <- mx.symbol.Activation(fc1, name = "relu1", act_type = "relu")
fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 10)
softmax <- mx.symbol.SoftmaxOutput(fc2, name = "sm")

# Run
mx.set.seed(200)
model <- mx.model.FeedForward.create(
    softmax, X = x_train,
    y = t_train,
    ctx = mx.cpu(),
    num.round = 15,
    array.batch.size = 100,
    learning.rate = 0.1, momentum = 0,
    eval.metric = mx.metric.accuracy,
    eval.data = list(data = x_test, label = t_test),
    initializer = mx.init.normal(0.01),
    array.layout = "rowmajor")
  • num.round apparently refers to the number of epochs
  • The default optimizer is SGD, though strictly speaking it is SGD with momentum

(I believe specifying momentum = 0 gives plain SGD)

  • Use array.layout = "rowmajor" when the rows of the matrix correspond to the samples

Results 1

Start training with 1 devices
[1] Train-accuracy=0.809048414023373
[1] Validation-accuracy=0.9045
[2] Train-accuracy=0.915516666666667
[2] Validation-accuracy=0.9206
[3] Train-accuracy=0.932616666666668
[3] Validation-accuracy=0.9339
[4] Train-accuracy=0.943583333333335
[4] Validation-accuracy=0.9447
[5] Train-accuracy=0.951783333333336
[5] Validation-accuracy=0.9514
[6] Train-accuracy=0.957883333333338
[6] Validation-accuracy=0.9564
[7] Train-accuracy=0.962600000000005
[7] Validation-accuracy=0.9596
[8] Train-accuracy=0.966416666666673
[8] Validation-accuracy=0.9631
[9] Train-accuracy=0.969200000000007
[9] Validation-accuracy=0.9649
[10] Train-accuracy=0.971900000000006
[10] Validation-accuracy=0.966499999999999
[11] Train-accuracy=0.974400000000006
[11] Validation-accuracy=0.9679
[12] Train-accuracy=0.976300000000007
[12] Validation-accuracy=0.969299999999999
[13] Train-accuracy=0.97808333333334
[13] Validation-accuracy=0.97
[14] Train-accuracy=0.979600000000006
[14] Validation-accuracy=0.970699999999999
[15] Train-accuracy=0.980766666666673
[15] Validation-accuracy=0.9715
  • Train-accuracy is printed once per epoch
  • If eval.data is specified, the corresponding Validation-accuracy is also reported once per epoch

Run & Results 2

If you don't need the console output, the results can be logged instead.

# Logger object for storing the results
logger <- mx.metric.logger$new()

mx.set.seed(200)

model <- mx.model.FeedForward.create(
    softmax, X = x_train,
    y = t_train,
    ctx = mx.cpu(),
    num.round = 15,
    array.batch.size = 100,
    learning.rate = 0.1, momentum = 0,
    eval.metric = mx.metric.accuracy,
    eval.data = list(data=x_test,label=t_test),
    initializer = mx.init.normal(0.01),
    array.layout = "rowmajor",
    verbose = F,
    epoch.end.callback = mx.callback.log.train.metric(15,logger))
> logger
Reference class object of class "mx.metric.logger"
Field "train":
 [1] 0.8090484 0.9155167 0.9326167 0.9435833 0.9517833 0.9578833 0.9626000
 [8] 0.9664167 0.9692000 0.9719000 0.9744000 0.9763000 0.9780833 0.9796000
[15] 0.9807667
Field "eval":
 [1] 0.9045 0.9206 0.9339 0.9447 0.9514 0.9564 0.9596 0.9631 0.9649 0.9665
[11] 0.9679 0.9693 0.9700 0.9707 0.9715
  • I don't fully understand the number passed to mx.callback.log.train.metric

(my impression is that changing the number does not change the result)

  • The trained weights and biases can be extracted from model$arg.params
> is.list(model$arg.params)
[1] TRUE
> names(model$arg.params)
[1] "fc1_weight" "fc1_bias"   "fc2_weight" "fc2_bias"

Deep Learning in R (Final Chapter)

Aiming for 98%+ accuracy on MNIST using only R, without a deep learning framework!

Design

Data

  →Conv(filter:30)
    →Relu
      →Pooling
 
  →Affine(Neuron:100)
    →Relu

  →Affine(Neuron:10)
    →Softmax

Optimization with Adam

Only the following two packages are used.

library(dplyr)
library(R6)

Loading the data

60,000 training samples and 10,000 test samples.

# Load the data
	x_train <- array(t(readRDS("x_train")),c(28,28,60000))/255
	t_train <- readRDS("t_train")
	x_test <- array(t(readRDS("x_test")),c(28,28,10000))
	t_test <- readRDS("t_test")

パラメータの初期化

# Initialize the parameters
	# Conv layer (W1: 25 x 30 - each column is a flattened 5x5 filter, 30 filters)
	W1 <- matrix(rnorm(750),nrow=25)*0.01
	b1 <- matrix(0,1,30)

		W1_m <- matrix(0,25,30)
		W1_v <- W1_m
		b1_m <- b1
		b1_v <- b1
	
	# Affine layer 1 (input: 12*12*30 = 4320 units after pooling)
	W2 <- matrix(rnorm(432000),nrow=4320)*(sqrt(2/30))
	b2 <- matrix(0,1,100)
	
		W2_m <- matrix(0,4320,100)
		W2_v <- W2_m
		b2_m <- b2
		b2_v <- b2

	# Affine layer 2
	W3 <- matrix(rnorm(1000),nrow=100)*(sqrt(2/100))
	b3 <- matrix(0,1,10)

		W3_m <- matrix(0,100,10)
		W3_v <- W3_m
		b3_m <- b3
		b3_v <- b3

Defining im2col and col2im

im2col_single <- function(input_data){

	# input_data: 3-dimensional array (single channel)
	# filter: 5x5
	# stride: 1
	# pad: 0

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	N <- dim(input_data)[3]

	out_h <- H - 4
	out_w <- W - 4

	m_row <- N * out_h * out_w
	m <- matrix(0,m_row,25)

	myF <- function(x){
		for(w_index in 1:out_w){
			for(h_index in 1:out_h){
				m[row_number,] <<- as.vector(x[h_index:(h_index+4),w_index:(w_index+4)])
				row_number <<- row_number + 1
				}
			}
		}
	row_number <- 1
	apply(input_data,3,myF)
	
	return(m)
}
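
A quick shape check (a sketch): a single 28x28 image has 24x24 filter positions, so the result should be a 576 x 25 matrix.

tmp <- array(rnorm(28 * 28), c(28, 28, 1))
dim(im2col_single(tmp))   # expected: 576 25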

im2col_pooling <- function(input_data){

	# stride = 2
	# pad = 0
	# filter: 2x2

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	C <- dim(input_data)[3]
	N <- dim(input_data)[4]

	row_num <- ((H*W)/4)*C*N
	col_num <- 4

	t <- array(input_data,c(2,H/2,2,W/2,C,N))
	tt <- aperm(t,c(1,3,2,4,5,6))
	ttt <- matrix(tt,c(row_num,col_num),byrow=T)
	
	return(ttt)
}

col2im_pooling <- function(x,H,W,C,N){

	# stride = 2
	# pad = 0
	# filter: 2x2

	t <- array(t(x),c(2,2,H/2,W/2,C,N))
	tt <- aperm(t,c(1,3,2,4,5,6))
	ttt <- array(tt,c(H,W,C,N))
	return(ttt)
}
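
As a sanity check (a sketch), im2col_pooling followed by col2im_pooling should reproduce the original array, since the two functions only rearrange elements:

x <- array(rnorm(4 * 4 * 3 * 2), c(4, 4, 3, 2))
identical(col2im_pooling(im2col_pooling(x), 4, 4, 3, 2), x)   # expected: TRUE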

Defining each layer

Convolution <- R6Class("Convolution",
	
	# stride = 1
	# pad = 0
	# filter size: 5x5
	# number of filters: 30
	
	public = list(
		W = NULL,
		b = NULL,
		col = NULL,

		dW = NULL,
		db = NULL,
		
		initialize = function(W,b){
			self$W <- W
			self$b <- b
		},

		forward = function(x){
			H <- dim(x)[1]	
			W <- dim(x)[2]
			N <- dim(x)[3]
 
			out_h <- (H - 5) + 1
			out_w <- (W - 5) + 1
			
			col <- im2col_single(x)
			out <- col %*% self$W + matrix(rep(self$b,out_h*out_w*N),nrow=out_h*out_w*N,byrow=T)
			out <- array(out,c(out_h,out_w,N,30)) %>% aperm(.,c(1,2,4,3))
			self$col <- col
			return(out) 
		},
		
		backward = function(dout){
			
			dout <- matrix(aperm(dout,c(1,2,4,3)),ncol=30)
			
			self$db <- apply(dout,2,sum)			
			self$dW <- (self$col %>% t()) %*% dout
			
			# dx is not computed (not needed here, since this convolution is the first layer)
		}
	)
)

Pooling <- R6Class("Pooling",
	
	# stride = 2
	# pad = 0
	# filter size: 2x2
	
	public = list(
		x = NULL,
		arg_max = NULL,

		forward = function(x){

			N <- dim(x)[4]

			col <- im2col_pooling(x)
			out <- matrix(apply(col,1,max),nrow=N,byrow=T)
			 
			self$x <- x
			self$arg_max <- apply(col,1,which.max)

			return(out) 
		},
		
		backward = function(dout){
			
			dout <- as.vector(t(dout))
			dmax <- matrix(0,length(self$arg_max),4)
			for(i in 1:length(self$arg_max)){
				dmax[i,self$arg_max[i]] <- dout[i]
			}
			H <- dim(self$x)[1]	
			W <- dim(self$x)[2]
			C <- dim(self$x)[3]
			N <- dim(self$x)[4]
			dx <- col2im_pooling(dmax,H,W,C,N)

			return(dx)
		}
	)
)

Relu <- R6Class("Relu",

	public = list(
		mask = NULL,
		
		forward = function(x){
			out <- x
			self$mask <- x<=0
			out[self$mask] <-0
			return(out)
		},
		
		backward = function(dout){
			dout[self$mask] <- 0
			return(dout)
		}
	)
)

Affine <- R6Class("Affine",

	public = list(
		W = NULL,
		b = NULL,
		x = NULL,
		dW = NULL,
		db = NULL,
		
		initialize = function(W,b){
			self$W <- W
			self$b <- b
		},

		forward = function(x){
			if(x %>% is.vector()){
				batch <- 1
			}else{
				batch <- nrow(x)
			}
			self$x <- x
			out <- (x %*% self$W)+matrix(rep(self$b,batch),nrow=batch,byrow=T)
			return(out) 
		},
		
		backward = function(dout){
			dx <- dout %*% (self$W %>% t())
			self$dW <- (self$x %>% t()) %*% dout
			self$db <- apply(dout,2,sum)
			return(dx)
		}
	)
)

# softmax function (row-wise, numerically stabilized by subtracting the max)
softmax <- function(x){
	sm_in <- function(a){
		m <- max(a)
		exp_x <- exp(a-m)
		sum_exp_x <- sum(exp_x)
		b <- exp_x/sum_exp_x
		return(b)
	}
	y <- apply(x,1,sm_in) %>% t()
	return(y)
}
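
A quick check (a sketch): each row of the softmax output should sum to 1.

rowSums(softmax(matrix(c(1, 2, 3, 1, 1, 1), nrow = 2, byrow = TRUE)))   # expected: 1 1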

SoftmaxWithLoss <- R6Class("SoftmaxWithLoss",

	public = list(
		loss = NULL,
		y = NULL,
		t = NULL,

		forward = function(x,t){
			# Store the targets and the softmax output; the loss value itself
			# is not computed here because only the gradient is needed for training
			self$t <- t
			self$y <- softmax(x)
		},
		
		backward = function(dout){
			batch_size <- self$t %>% nrow()
			dx <- (self$y - self$t)/batch_size
			return(dx)
		}
	)
)

Arranging the layers and defining the remaining functions

layers <- as.list(NULL)
layers[[1]] <- Convolution$new(W1,b1)
layers[[2]] <- Relu$new()
layers[[3]] <- Pooling$new()
layers[[4]] <- Affine$new(W2,b2)
layers[[5]] <- Relu$new()
layers[[6]] <- Affine$new(W3,b3)
lastlayer <- SoftmaxWithLoss$new()

predict <- function(x){
	for(i in 1:6){
		x <- layers[[i]]$forward(x)
	}
	return(x)
}

loss <- function(x,t){
	y <- predict(x)
	lastlayer$forward(y,t)
}

gradient <- function(x,t){

	#forward
	loss(x,t)

	#backward
	dout <- lastlayer$backward(1)
	for(i in 6:1){
		dout <- layers[[i]]$backward(dout)
	}
}

accuracy <- function(x,t){
	batch <- dim(x)[3]
	y <- predict(x)
	y <- apply(y,1,which.max)
	t <- apply(t,1,which.max)
	accuracy <- sum(t==y)/batch
	return(accuracy)
}

accuracy_test <- function(x,t){
	ac <- 0
	for(i in 0:9){
		x_batch <- x[,,(i*1000+1):(i*1000+1000)]
		t_batch <- t[(i*1000+1):(i*1000+1000),]
		ac <- ac + accuracy(x_batch,t_batch)
	}
	return(ac/10)
}

accuracy_train <- function(x,t){
	ac <- 0
	for(i in 0:59){
		x_batch <- x[,,(i*1000+1):(i*1000+1000)]
		t_batch <- t[(i*1000+1):(i*1000+1000),]
		ac <- ac + accuracy(x_batch,t_batch)
	}
	return(ac/60)
}

Run

# Hyperparameters
iters_num <- 1200
train_size <- 60000
batch_size <- 1000

for(iter in 1:iters_num){

	# Draw a mini-batch
	batch_mask<-sample(train_size,batch_size)
	x_batch <- x_train[,,batch_mask]
	t_batch <- t_train[batch_mask,]

	# Compute the gradients
	gradient(x_batch,t_batch)

	# Update the parameters (Adam: beta1 = 0.9, beta2 = 0.999, bias-corrected step size lr_t)
	lr_t <- 0.001 * sqrt(1-0.999^iter)/(1-0.9^iter)
	W1_m <- (1-0.9)*layers[[1]]$dW + 0.9*W1_m
	W1_v <- (1-0.999)*(layers[[1]]$dW**2) + 0.999*W1_v  
	b1_m <- (1-0.9)*layers[[1]]$db + 0.9*b1_m
	b1_v <- (1-0.999)*(layers[[1]]$db**2) + 0.999*b1_v
	layers[[1]]$W <- layers[[1]]$W - lr_t * W1_m/(sqrt(W1_v)+1e-7)
	layers[[1]]$b <- layers[[1]]$b - lr_t * b1_m/(sqrt(b1_v)+1e-7)

	W2_m <- (1-0.9)*layers[[4]]$dW + 0.9*W2_m
	W2_v <- (1-0.999)*(layers[[4]]$dW**2) + 0.999*W2_v  
	b2_m <- (1-0.9)*layers[[4]]$db + 0.9*b2_m
	b2_v <- (1-0.999)*(layers[[4]]$db**2) + 0.999*b2_v
	layers[[4]]$W <- layers[[4]]$W - lr_t * W2_m/(sqrt(W2_v)+1e-7)
	layers[[4]]$b <- layers[[4]]$b - lr_t * b2_m/(sqrt(b2_v)+1e-7)

	W3_m <- (1-0.9)*layers[[6]]$dW + 0.9*W3_m
	W3_v <- (1-0.999)*(layers[[6]]$dW**2) + 0.999*W3_v  
	b3_m <- (1-0.9)*layers[[6]]$db + 0.9*b3_m
	b3_v <- (1-0.999)*(layers[[6]]$db**2) + 0.999*b3_v
	layers[[6]]$W <- layers[[6]]$W - lr_t * W3_m/(sqrt(W3_v)+1e-7)
	layers[[6]]$b <- layers[[6]]$b - lr_t * b3_m/(sqrt(b3_v)+1e-7)
}

Results

> accuracy_test(x_test,t_test)
[1] 0.986
> accuracy_train(x_train,t_train)
[1] 0.9973667

Test data accuracy: 98.6%
Training data accuracy: 99.7%

A further attempt to push the accuracy higher is here.

Matrix computation in R (matrix addition)

I thought sweep was fast, but it turns out it isn't.
(Environment: Windows 10 64-bit, R 3.3.3)

> system.time(test1 <- out + matrix(rep(b1,576000),nrow=576000,byrow=T))
   user  system elapsed 
   0.13    0.03    0.16 
> system.time(test2 <- sweep(out,2,b1,FUN="+",check.margin=F))
   user  system elapsed 
   0.28    0.00    0.28 
> identical(test1,test2)
[1] TRUE
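
Another base-R idiom worth timing (not measured here) relies on column-major recycling instead of building the full replicated matrix, reusing the same out and b1 as above:

# b1 is 1 x 30; rep(..., each = nrow(out)) lines its values up with the columns of out
test3 <- out + rep(as.vector(b1), each = nrow(out))
identical(test1, test3)   # expected: TRUE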

Deep Learning in R (10)

Implementing the Convolution and Pooling layers

Convolution <- R6Class("Convolution",
	
	# stride = 1
	# pad = 0
	# filter size: 5x5
	# number of filters: 30
	
	public = list(
		W = NULL,
		b = NULL,
		x = NULL,
		col = NULL,

		dW = NULL,
		db = NULL,
		
		initialize = function(W,b){
			self$W <- W
			self$b <- b
		},

		forward = function(x){
			H <- dim(x)[1]	
			W <- dim(x)[2]
			C <- dim(x)[3]
			N <- dim(x)[4] # batch size 100

			out_h <- (H - 5) + 1
			out_w <- (W - 5) + 1
			
			col <- im2col(x,5,5,stride=1,pad=0)
			out <- col %*% self$W + matrix(rep(self$b,out_h*out_w*N),nrow=out_h*out_w*N,byrow=T)
			out <- array(out,c(out_h,out_w,N,30)) %>% aperm(.,c(1,2,4,3))
			self$x <- x
			self$col <- col
			return(out) 
		},
		
		backward = function(dout){
			
			dout <- matrix(aperm(dout,c(1,2,4,3)),ncol=30)
			
			self$db <- apply(dout,2,sum)			
			self$dW <- (self$col %>% t()) %*% dout
			
			# code for dx (omitted)
		}
	)
)
Pooling <- R6Class("Pooling",
	
	# stride = 2
	# pad = 0
	# filter size: 2x2
	
	public = list(
		x = NULL,
		arg_max = NULL,

		forward = function(x){

			N <- dim(x)[4] # batch size 100

			col <- im2col_pooling(x,2,2)
			out <- matrix(apply(col,1,max),nrow=N,byrow=T)
			 
			self$x <- x
			self$arg_max <- apply(col,1,which.max)

			return(out) 
		},
		
		backward = function(dout){
			
			dout <- as.vector(t(dout))
			dmax <- matrix(0,length(self$arg_max),4)
			for(i in 1:length(self$arg_max)){
				dmax[i,self$arg_max[i]] <- dout[i]
			}
			H <- dim(self$x)[1]	
			W <- dim(self$x)[2]
			C <- dim(self$x)[3]
			N <- dim(self$x)[4] #batch数100
			dx <- col2im_pooling(dmax,H,W,C,N,2,2)

			return(dx)
		}
	)
)