Deep Learning in R (Final Chapter)

Aiming for over 98% on MNIST using only R, without any deep learning framework!

Design

Data

  →Conv(filter:30)
    →Relu
      →Pooling
 
  →Affine(Neuron:100)
    →Relu

  →Affine(Neuron:10)
    →Softmax

Optimization with Adam
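
For reference, the shapes follow from the layer settings (5x5 convolution, stride 1, no padding; 2x2 max pooling, stride 2): a 28x28 input becomes 24x24x30 after the convolution and 12x12x30 after pooling, which is flattened to 12*12*30 = 4320 inputs for the first Affine layer, reduced to 100, and finally to 10 class scores.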

Only the following two packages are used:

library(dplyr)
library(R6)

Loading the Data

60,000 training images and 10,000 test images.

#Load the data (pixel values scaled to [0,1])
	x_train <- array(t(readRDS("x_train")),c(28,28,60000))/255
	t_train <- readRDS("t_train")
	x_test <- array(t(readRDS("x_test")),c(28,28,10000))/255
	t_test <- readRDS("t_test")
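
A quick sanity check on the loaded arrays (the shapes follow from the code above; t_train and t_test are one-hot label matrices):

	dim(x_train)	# 28 28 60000
	dim(t_train)	# 60000 10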

Initializing the Parameters

#Initialize the parameters (the *_m and *_v matrices are Adam's first- and second-moment buffers)
	#Conv layer: 30 filters of size 5x5 (25 weights each)
	W1 <- matrix(rnorm(750),nrow=25)*0.01
	b1 <- matrix(0,1,30)

		W1_m <- matrix(0,25,30)
		W1_v <- W1_m
		b1_m <- b1
		b1_v <- b1
	
	#Affine layer 1: 4320 inputs (= 12*12*30 pooled feature map), 100 neurons
	W2 <- matrix(rnorm(432000),nrow=4320)*(sqrt(2/30))
	b2 <- matrix(0,1,100)
	
		W2_m <- matrix(0,4320,100)
		W2_v <- W2_m
		b2_m <- b2
		b2_v <- b2

	#Affine layer 2: 100 inputs, 10 class scores
	W3 <- matrix(rnorm(1000),nrow=100)*(sqrt(2/100))
	b3 <- matrix(0,1,10)

		W3_m <- matrix(0,100,10)
		W3_v <- W3_m
		b3_m <- b3
		b3_v <- b3

Defining im2col and col2im

im2col_single <- function(input_data){

	#input_data: 3D array (height, width, batch), single channel
	#filter: 5x5
	#stride: 1
	#pad: 0
	#Each 5x5 patch of every image becomes one row of the returned matrix.

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	N <- dim(input_data)[3]

	out_h <- H - 4
	out_w <- W - 4

	m_row <- N * out_h * out_w
	m <- matrix(0,m_row,25)

	myF <- function(x){
		for(w_index in 1:out_w){
			for(h_index in 1:out_h){
				m[row_number,] <<- as.vector(x[h_index:(h_index+4),w_index:(w_index+4)])
				row_number <<- row_number + 1
				}
			}
		}
	row_number <- 1
	apply(input_data,3,myF)
	
	return(m)
}
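
A minimal shape check (hypothetical input, not part of the original code): a batch of two 28x28 images yields 2 * 24 * 24 = 1152 patch rows, each holding the 25 pixels of one 5x5 window.

	x <- array(rnorm(28*28*2),c(28,28,2))
	dim(im2col_single(x))	# 1152 25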

im2col_pooling <- function(input_data){

	#stride = 2
	#pad = 0
	#filter: 2x2
	#Each 2x2 pooling window becomes one row of the returned matrix.

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	C <- dim(input_data)[3]
	N <- dim(input_data)[4]

	row_num <- ((H*W)/4)*C*N
	col_num <- 4

	t <- array(input_data,c(2,H/2,2,W/2,C,N))
	tt <- aperm(t,c(1,3,2,4,5,6))
	ttt <- matrix(tt,nrow=row_num,ncol=col_num,byrow=TRUE)
	
	return(ttt)
}

col2im_pooling <- function(x,H,W,C,N){

	#stride = 2
	#pad = 0
	#filter: 2x2
	#Inverse of im2col_pooling: maps the rows back to an (H,W,C,N) array.

	t <- array(t(x),c(2,2,H/2,W/2,C,N))
	tt <- aperm(t,c(1,3,2,4,5,6))
	ttt <- array(tt,c(H,W,C,N))
	return(ttt)
}
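
Since col2im_pooling is the exact inverse of im2col_pooling, a round trip should return the input unchanged (a quick check with hypothetical data):

	a <- array(rnorm(4*4*3*2),c(4,4,3,2))
	all(col2im_pooling(im2col_pooling(a),4,4,3,2) == a)	# TRUE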

Defining the Layers

Convolution <- R6Class("Convolution",
	
	#stride = 1
	#pad = 0
	#filter size = 5x5
	#number of filters = 30
	
	public = list(
		W = NULL,
		b = NULL,
		col = NULL,

		dW = NULL,
		db = NULL,
		
		initialize = function(W,b){
			self$W <- W
			self$b <- b
		},

		forward = function(x){
			H <- dim(x)[1]	
			W <- dim(x)[2]
			N <- dim(x)[3]
 
			out_h <- (H - 5) + 1
			out_w <- (W - 5) + 1
			
			col <- im2col_single(x)
			out <- col %*% self$W + matrix(rep(self$b,out_h*out_w*N),nrow=out_h*out_w*N,byrow=T)
			out <- array(out,c(out_h,out_w,N,30)) %>% aperm(.,c(1,2,4,3))
			self$col <- col
			return(out) 
		},
		
		backward = function(dout){
			
			dout <- matrix(aperm(dout,c(1,2,4,3)),ncol=30)
			
			self$db <- apply(dout,2,sum)			
			self$dW <- (self$col %>% t()) %*% dout
			
			#dx is omitted: Convolution is the first layer, so its input gradient is never used
		}
	)
)
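
As a shape check (reusing the W1 and b1 initialized earlier), the forward pass maps a batch of two 28x28 images to 24x24 feature maps with 30 channels:

	conv <- Convolution$new(W1,b1)
	dim(conv$forward(array(rnorm(28*28*2),c(28,28,2))))	# 24 24 30 2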

Pooling <- R6Class("Pooling",
	
	#stride = 2
	#pad = 0
	#filter size = 2x2
	
	public = list(
		x = NULL,
		arg_max = NULL,

		forward = function(x){

			N <- dim(x)[4]

			col <- im2col_pooling(x)
			out <- matrix(apply(col,1,max),nrow=N,byrow=T)
			 
			self$x <- x
			self$arg_max <- apply(col,1,which.max)

			return(out) 
		},
		
		backward = function(dout){
			
			dout <- as.vector(t(dout))
			dmax <- matrix(0,length(self$arg_max),4)
			#scatter each upstream gradient to the position of its window's maximum
			dmax[cbind(seq_along(self$arg_max),self$arg_max)] <- dout
			H <- dim(self$x)[1]	
			W <- dim(self$x)[2]
			C <- dim(self$x)[3]
			N <- dim(self$x)[4]
			dx <- col2im_pooling(dmax,H,W,C,N)

			return(dx)
		}
	)
)

Relu <- R6Class("Relu",

	public = list(
		mask = NULL,
		
		forward = function(x){
			out <- x
			self$mask <- x<=0
			out[self$mask] <-0
			return(out)
		},
		
		backward = function(dout){
			dout[self$mask] <- 0
			return(dout)
		}
	)
)

Affine <- R6Class("Affine",

	public = list(
		W = NULL,
		b = NULL,
		x = NULL,
		dW = NULL,
		db = NULL,
		
		initialize = function(W,b){
			self$W <- W
			self$b <- b
		},

		forward = function(x){
			if(x %>% is.vector()){
				batch <- 1
			}else{
				batch <- nrow(x)
			}
			self$x <- x
			out <- (x %*% self$W)+matrix(rep(self$b,batch),nrow=batch,byrow=T)
			return(out) 
		},
		
		backward = function(dout){
			dx <- dout %*% (self$W %>% t())
			self$dW <- (self$x %>% t()) %*% dout
			self$db <- apply(dout,2,sum)
			return(dx)
		}
	)
)

#softmax function (row-wise, numerically stabilized)
softmax <- function(x){
	sm_in <- function(a){
		m <- max(a)
		exp_x <- exp(a-m)
		sum_exp_x <- sum(exp_x)
		b <- exp_x/sum_exp_x
		return(b)
	}
	y <- apply(x,1,sm_in) %>% t()
	return(y)
}
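
Each row is exponentiated after subtracting its maximum (to avoid overflow) and normalized to sum to 1. For example:

	softmax(matrix(c(0,0,log(2),0,0,0),nrow=2,byrow=TRUE))
	#row 1: 0.25 0.25 0.50 / row 2: 1/3 1/3 1/3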

SoftmaxWithLoss <- R6Class("SoftmaxWithLoss",

	public = list(
		loss = NULL,
		y = NULL,
		t = NULL,

		forward = function(x,t){
			self$t <- t
			self$y <- softmax(x)
			#cross-entropy loss, averaged over the batch
			self$loss <- -sum(t*log(self$y + 1e-7))/nrow(t)
			return(self$loss)
		},
		
		backward = function(dout){
			batch_size <- self$t %>% nrow()
			dx <- (self$y - self$t)/batch_size
			return(dx)
		}
	)
)
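
Combining softmax with the cross-entropy loss keeps the backward pass simple: the gradient of the loss with respect to the softmax input is (y - t)/batch_size, which is exactly what backward returns.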

Arranging the layers and defining helper functions

layers <- list()
layers[[1]] <- Convolution$new(W1,b1)
layers[[2]] <- Relu$new()
layers[[3]] <- Pooling$new()
layers[[4]] <- Affine$new(W2,b2)
layers[[5]] <- Relu$new()
layers[[6]] <- Affine$new(W3,b3)
lastlayer <- SoftmaxWithLoss$new()

predict <- function(x){
	for(i in 1:6){
		x <- layers[[i]]$forward(x)
	}
	return(x)
}

loss <- function(x,t){
	y <- predict(x)
	lastlayer$forward(y,t)
}

gradient <- function(x,t){

	#forward
	loss(x,t)

	#backward
	dout <- lastlayer$backward(1)
	for(i in 6:1){
		dout <- layers[[i]]$backward(dout)
	}
}
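
gradient() stores the results in each layer's dW and db fields as a side effect. A numerical gradient check (a hypothetical sketch, not part of the original) can confirm the backpropagation, here for b3:

	x_small <- x_train[,,1:10]
	t_small <- t_train[1:10,]
	gradient(x_small,t_small)
	ana_grad <- layers[[6]]$db
	num_grad <- rep(0,10)
	eps <- 1e-4
	for(i in 1:10){
		layers[[6]]$b[1,i] <- layers[[6]]$b[1,i] + eps
		l_plus <- loss(x_small,t_small)
		layers[[6]]$b[1,i] <- layers[[6]]$b[1,i] - 2*eps
		l_minus <- loss(x_small,t_small)
		layers[[6]]$b[1,i] <- layers[[6]]$b[1,i] + eps
		num_grad[i] <- (l_plus - l_minus)/(2*eps)
	}
	max(abs(ana_grad - num_grad))	#expect a very small value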

accuracy <- function(x,t){
	batch <- dim(x)[3]
	y <- predict(x)
	y <- apply(y,1,which.max)
	t <- apply(t,1,which.max)
	accuracy <- sum(t==y)/batch
	return(accuracy)
}

accuracy_test <- function(x,t){
	ac <- 0
	for(i in 0:9){
		x_batch <- x[,,(i*1000+1):(i*1000+1000)]
		t_batch <- t[(i*1000+1):(i*1000+1000),]
		ac <- ac + accuracy(x_batch,t_batch)
	}
	return(ac/10)
}

accuracy_train <- function(x,t){
	ac <- 0
	for(i in 0:59){
		x_batch <- x[,,(i*1000+1):(i*1000+1000)]
		t_batch <- t[(i*1000+1):(i*1000+1000),]
		ac <- ac + accuracy(x_batch,t_batch)
	}
	return(ac/60)
}

Training

#Hyperparameters
iters_num <- 1200
train_size <- 60000
batch_size <- 1000

for(iter in 1:iters_num){

	#Sample a mini-batch
	batch_mask<-sample(train_size,batch_size)
	x_batch <- x_train[,,batch_mask]
	t_batch <- t_train[batch_mask,]

	#Forward and backward pass (gradients end up in each layer's dW/db)
	gradient(x_batch,t_batch)

	#Update the parameters with Adam
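	#lr_t folds Adam's bias correction into the learning rate:
	#lr_t = lr * sqrt(1 - beta2^iter)/(1 - beta1^iter), with lr = 0.001, beta1 = 0.9, beta2 = 0.999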
	lr_t <- 0.001 * sqrt(1-0.999^iter)/(1-0.9^iter)
	W1_m <- (1-0.9)*layers[[1]]$dW + 0.9*W1_m
	W1_v <- (1-0.999)*(layers[[1]]$dW**2) + 0.999*W1_v  
	b1_m <- (1-0.9)*layers[[1]]$db + 0.9*b1_m
	b1_v <- (1-0.999)*(layers[[1]]$db**2) + 0.999*b1_v
	layers[[1]]$W <- layers[[1]]$W - lr_t * W1_m/(sqrt(W1_v)+1e-7)
	layers[[1]]$b <- layers[[1]]$b - lr_t * b1_m/(sqrt(b1_v)+1e-7)

	W2_m <- (1-0.9)*layers[[4]]$dW + 0.9*W2_m
	W2_v <- (1-0.999)*(layers[[4]]$dW**2) + 0.999*W2_v  
	b2_m <- (1-0.9)*layers[[4]]$db + 0.9*b2_m
	b2_v <- (1-0.999)*(layers[[4]]$db**2) + 0.999*b2_v
	layers[[4]]$W <- layers[[4]]$W - lr_t * W2_m/(sqrt(W2_v)+1e-7)
	layers[[4]]$b <- layers[[4]]$b - lr_t * b2_m/(sqrt(b2_v)+1e-7)

	W3_m <- (1-0.9)*layers[[6]]$dW + 0.9*W3_m
	W3_v <- (1-0.999)*(layers[[6]]$dW**2) + 0.999*W3_v  
	b3_m <- (1-0.9)*layers[[6]]$db + 0.9*b3_m
	b3_v <- (1-0.999)*(layers[[6]]$db**2) + 0.999*b3_v
	layers[[6]]$W <- layers[[6]]$W - lr_t * W3_m/(sqrt(W3_v)+1e-7)
	layers[[6]]$b <- layers[[6]]$b - lr_t * b3_m/(sqrt(b3_v)+1e-7)
}
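
The three update blocks above repeat the same Adam formulas. As a sketch (a hypothetical helper, not in the original), they could be factored out:

	adam_step <- function(param,grad,m,v,lr_t,beta1=0.9,beta2=0.999,eps=1e-7){
		m <- (1-beta1)*grad + beta1*m	#first-moment estimate
		v <- (1-beta2)*grad**2 + beta2*v	#second-moment estimate
		param <- param - lr_t*m/(sqrt(v)+eps)
		list(param=param,m=m,v=v)
	}

Each block would then call adam_step once for the weights and once for the biases, e.g. upd <- adam_step(layers[[1]]$W,layers[[1]]$dW,W1_m,W1_v,lr_t), followed by layers[[1]]$W <- upd$param; W1_m <- upd$m; W1_v <- upd$v.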

Results

> accuracy_test(x_test,t_test)
[1] 0.986
> accuracy_train(x_train,t_train)
[1] 0.9973667

Test accuracy: 98.6%
Training accuracy: 99.7%

A follow-up that pushes the accuracy even higher can be found here.