読者です 読者をやめる 読者になる 読者になる

RでDeep Learning(12)

CNNの実装

library(dplyr)
library(R6)

# Parameter initialization
	# Conv layer: 30 filters of 5x5 (25 weights each)
	W1 <- matrix(rnorm(750), nrow = 25) * 0.01
	b1 <- matrix(numeric(30), nrow = 1)

	# Affine layer 1: 4320 inputs (12*12*30 pooled features) -> 100 units
	W2 <- matrix(rnorm(432000), nrow = 4320) * 0.01
	b2 <- matrix(numeric(100), nrow = 1)

	# Affine layer 2: 100 units -> 10 classes
	W3 <- matrix(rnorm(1000), nrow = 100) * 0.01
	b3 <- matrix(numeric(10), nrow = 1)

# Data loading
	x_train <- array(t(readRDS("x_train")), c(28, 28, 60000)) / 255
	t_train <- readRDS("t_train")
	# BUG FIX: the test images were not scaled while the training images
	# were divided by 255; normalize both consistently (assumes both RDS
	# files store raw 0-255 pixel values — confirm against the data files).
	x_test <- array(t(readRDS("x_test")), c(28, 28, 10000)) / 255
	t_test <- readRDS("t_test")

今回のために新たにConv層,Pooling層を定義(汎用性なし)

Convolution <- R6Class("Convolution",

	# Fixed hyper-parameters for this network:
	#   stride = 1, pad = 0, filter size = 5x5, number of filters = 30

	public = list(
		W = NULL,    # filter weights (25 x 30)
		b = NULL,    # bias (1 x 30)
		x = NULL,    # cached forward input
		col = NULL,  # cached im2col matrix, reused in backward

		dW = NULL,   # weight gradient
		db = NULL,   # bias gradient

		initialize = function(W, b) {
			self$W <- W
			self$b <- b
		},

		# x: 3-D array (H, W, N), single channel.
		# Returns a 4-D array (out_h, out_w, 30, N).
		forward = function(x) {
			in_h <- dim(x)[1]
			in_w <- dim(x)[2]
			n_batch <- dim(x)[3]

			out_h <- in_h - 5 + 1
			out_w <- in_w - 5 + 1

			col <- im2col_single(x)
			n_rows <- out_h * out_w * n_batch
			bias <- matrix(rep(self$b, n_rows), nrow = n_rows, byrow = TRUE)
			out <- col %*% self$W + bias
			out <- aperm(array(out, c(out_h, out_w, n_batch, 30)), c(1, 2, 4, 3))

			self$x <- x
			self$col <- col
			return(out)
		},

		# Accumulates dW/db from the upstream gradient.
		backward = function(dout) {
			dout <- matrix(aperm(dout, c(1, 2, 4, 3)), ncol = 30)
			self$db <- apply(dout, 2, sum)
			self$dW <- t(self$col) %*% dout
			# dx is intentionally not computed (this is the first layer).
		}
	)
)

Pooling <- R6Class("Pooling",

	# Fixed hyper-parameters: stride = 2, pad = 0, window = 2x2.

	public = list(
		x = NULL,        # cached forward input (H, W, C, N)
		arg_max = NULL,  # position of the max within each 2x2 window

		# Max-pools x and returns an (N x features) matrix.
		forward = function(x) {
			n_batch <- dim(x)[4]

			col <- im2col_pooling(x, 2, 2)
			pooled <- apply(col, 1, max)
			out <- matrix(pooled, nrow = n_batch, byrow = TRUE)

			self$x <- x
			self$arg_max <- apply(col, 1, which.max)
			return(out)
		},

		# Routes each upstream gradient back to the max position of its
		# window; all other positions receive zero.
		backward = function(dout) {
			flat <- as.vector(t(dout))
			n_win <- length(self$arg_max)
			dmax <- matrix(0, n_win, 4)
			for (k in seq_len(n_win)) {
				dmax[k, self$arg_max[k]] <- flat[k]
			}
			dims <- dim(self$x)
			dx <- col2im_pooling(dmax, dims[1], dims[2], dims[3], dims[4], 2, 2)

			return(dx)
		}
	)
)

その他の層は以前のものを使用

Relu <- R6Class("Relu",

	public = list(
		mask = NULL,  # TRUE where the forward input was <= 0

		# Element-wise ReLU; remembers which entries were clipped.
		forward = function(x) {
			self$mask <- x <= 0
			y <- x
			y[self$mask] <- 0
			return(y)
		},

		# Zeroes the gradient wherever the forward input was non-positive.
		backward = function(dout) {
			dout[self$mask] <- 0
			return(dout)
		}
	)
)

Affine <- R6Class("Affine",

	public = list(
		W = NULL,   # weight matrix (in_features x out_features)
		b = NULL,   # bias row vector
		x = NULL,   # cached forward input
		dW = NULL,  # weight gradient
		db = NULL,  # bias gradient

		initialize = function(W, b) {
			self$W <- W
			self$b <- b
		},

		# Computes x %*% W + b, broadcasting the bias over the batch.
		forward = function(x) {
			batch <- if (is.vector(x)) 1 else nrow(x)
			self$x <- x
			bias <- matrix(rep(self$b, batch), nrow = batch, byrow = TRUE)
			return((x %*% self$W) + bias)
		},

		# Standard affine backward pass; stores dW/db, returns dx.
		backward = function(dout) {
			self$dW <- t(self$x) %*% dout
			self$db <- apply(dout, 2, sum)
			return(dout %*% t(self$W))
		}
	)
)

# Row-wise softmax with the max-subtraction trick for numerical stability.
softmax <- function(x) {
	stable_row_softmax <- function(row) {
		shifted <- exp(row - max(row))
		shifted / sum(shifted)
	}
	t(apply(x, 1, stable_row_softmax))
}

# Mean cross-entropy over a batch.
# y: predicted probabilities (batch x classes); t: one-hot labels.
cross_entropy_error <- function(y, t) {
	n <- nrow(y)
	-sum(t * log(y)) / n
}

SoftmaxWithLoss <- R6Class("SoftmaxWithLoss",

	public = list(
		loss = NULL,  # last computed loss value
		y = NULL,     # softmax output, cached for backward
		t = NULL,     # one-hot targets, cached for backward

		# Softmax followed by cross-entropy; returns the scalar loss.
		forward = function(x, t) {
			self$t <- t
			self$y <- softmax(x)
			self$loss <- cross_entropy_error(self$y, self$t)
			return(self$loss)
		},

		# Gradient of combined softmax + cross-entropy: (y - t) / batch.
		backward = function(dout) {
			n <- nrow(self$t)
			return((self$y - self$t) / n)
		}
	)
)

im2col、col2imを定義

# im2col for single-channel 3-D input (H, W, N), hard-coded for a
# 5x5 filter with stride 1 and no padding.
# Returns an (N * out_h * out_w) x 25 matrix, one patch per row.
im2col_single <- function(input_data) {

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	N <- dim(input_data)[3]

	out_h <- H - 4
	out_w <- W - 4

	m <- matrix(0, N * out_h * out_w, 25)

	# Explicit loops instead of apply() with <<- side effects:
	# same fill order (h fastest, then w, then batch).
	row_number <- 1
	for (n in seq_len(N)) {
		img <- input_data[, , n]
		for (w_index in seq_len(out_w)) {
			for (h_index in seq_len(out_h)) {
				patch <- img[h_index:(h_index + 4), w_index:(w_index + 4)]
				m[row_number, ] <- as.vector(patch)
				row_number <- row_number + 1
			}
		}
	}
	return(m)
}

# im2col for the pooling layer: splits the (H, W, C, N) input into
# non-overlapping filter_h x filter_w windows (stride equals the window
# size, no padding) and returns one window per row.
im2col_pooling <- function(input_data, filter_h, filter_w) {

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	C <- dim(input_data)[3]
	N <- dim(input_data)[4]

	row_num <- ((H * W) / (filter_h * filter_w)) * C * N
	col_num <- filter_h * filter_w

	# Regroup so each window's elements become contiguous, then flatten.
	blocks <- array(input_data, c(filter_h, H / filter_h, filter_w, W / filter_w, C, N))
	windows <- aperm(blocks, c(1, 3, 2, 4, 5, 6))
	# BUG FIX: the original passed c(row_num, col_num) as the `nrow`
	# argument of matrix(); only its first element was used (and newer R
	# versions reject a length-2 nrow). Name both dimensions explicitly.
	out <- matrix(windows, nrow = row_num, ncol = col_num, byrow = TRUE)

	return(out)
}

# Inverse of im2col_pooling: scatters the (windows x elements) matrix x
# back into an (H, W, C, N) array. Stride equals the window size, no pad.
col2im_pooling <- function(x, H, W, C, N, filter_h, filter_w) {

	windows <- array(t(x), c(filter_h, filter_w, H / filter_h, W / filter_w, C, N))
	blocks <- aperm(windows, c(1, 3, 2, 4, 5, 6))
	return(array(blocks, c(H, W, C, N)))
}

層を並べて、その他の関数を定義

# Assemble the network: Conv -> ReLU -> Pool -> Affine -> ReLU -> Affine,
# with softmax + cross-entropy as the final loss layer.
layers <- list(
	Convolution$new(W1, b1),
	Relu$new(),
	Pooling$new(),
	Affine$new(W2, b2),
	Relu$new(),
	Affine$new(W3, b3)
)
lastlayer <- SoftmaxWithLoss$new()

# Runs x forward through every layer (note: masks stats::predict).
predict <- function(x) {
	for (layer in layers) {
		x <- layer$forward(x)
	}
	return(x)
}

# Forward pass followed by the softmax/cross-entropy loss.
loss <- function(x, t) {
	lastlayer$forward(predict(x), t)
}

# One forward/backward pass. Appends the current loss to the global
# loss_vector (side effect) and returns the parameter gradients.
gradient <- function(x, t) {

	# forward (records the loss for later plotting)
	loss_vector <<- c(loss_vector, loss(x, t))

	# backward through the loss layer, then every layer in reverse
	dout <- lastlayer$backward(1)
	for (i in 6:1) {
		dout <- layers[[i]]$backward(dout)
	}

	# Gradients live on the layer objects after backward().
	return(list(
		W1 = layers[[1]]$dW, b1 = layers[[1]]$db,
		W2 = layers[[4]]$dW, b2 = layers[[4]]$db,
		W3 = layers[[6]]$dW, b3 = layers[[6]]$db
	))
}

# Fraction of samples whose arg-max prediction matches the one-hot label.
accuracy <- function(x, t) {
	batch <- dim(x)[3]
	predicted <- apply(predict(x), 1, which.max)
	actual <- apply(t, 1, which.max)
	return(sum(actual == predicted) / batch)
}

# Evaluates accuracy over the whole set in chunks (the full set is too
# large to push through the network at once). Generalized from the
# original hard-coded 10 x 1000 split; the default chunk_size preserves
# the old behaviour for the 10000-sample test set.
accuracy_all <- function(x, t, chunk_size = 1000) {
	n <- dim(x)[3]
	n_chunks <- n %/% chunk_size
	ac <- 0
	for (i in seq_len(n_chunks)) {
		idx <- ((i - 1) * chunk_size + 1):(i * chunk_size)
		ac <- ac + accuracy(x[, , idx], t[idx, ])
	}
	return(ac / n_chunks)
}

最後に実行

# Hyper-parameters
iters_num <- 60        # number of SGD iterations
train_size <- 60000    # total training samples
batch_size <- 1000     # mini-batch size
learning_rate <- 0.1   # SGD step size

loss_vector <- NULL    # appended to by gradient() each iteration

for (iter in seq_len(iters_num)) {

	# Draw a random mini-batch
	batch_mask <- sample(train_size, batch_size)
	x_batch <- x_train[, , batch_mask]
	t_batch <- t_train[batch_mask, ]

	# Gradients via backprop
	grads <- gradient(x_batch, t_batch)

	# Vanilla SGD update
	layers[[1]]$W <- layers[[1]]$W - learning_rate * grads$W1
	layers[[1]]$b <- layers[[1]]$b - learning_rate * grads$b1
	layers[[4]]$W <- layers[[4]]$W - learning_rate * grads$W2
	layers[[4]]$b <- layers[[4]]$b - learning_rate * grads$b2
	layers[[6]]$W <- layers[[6]]$W - learning_rate * grads$W3
	layers[[6]]$b <- layers[[6]]$b - learning_rate * grads$b3
}

Rで行列計算(行列の足し算)

sweepは速いと思っていたが違った。
(環境:Windows 10 64bit、R3.3.3)

> system.time(test1 <- out + matrix(rep(b1,576000),nrow=576000,byrow=T))
   ユーザ   システム       経過  
      0.13       0.03       0.16 
> system.time(test2 <- sweep(out,2,b1,FUN="+",check.margin=F))
   ユーザ   システム       経過  
      0.28       0.00       0.28 
> identical(test1,test2)
[1] TRUE

RでDeep Learning(11)

Convolution層、Pooling層の実装

Convolution <- R6Class("Convolution",

	# Fixed hyper-parameters for this network:
	#   stride = 1, pad = 0, filter size = 5x5, number of filters = 30

	public = list(
		W = NULL,    # filter weights
		b = NULL,    # bias (1 x 30)
		x = NULL,    # cached forward input
		col = NULL,  # cached im2col matrix, reused in backward

		dW = NULL,   # weight gradient
		db = NULL,   # bias gradient

		initialize = function(W, b) {
			self$W <- W
			self$b <- b
		},

		# x: 4-D array (H, W, C, N). Returns (out_h, out_w, 30, N).
		forward = function(x) {
			in_h <- dim(x)[1]
			in_w <- dim(x)[2]
			n_batch <- dim(x)[4]  # batch size (100 in the article)

			out_h <- in_h - 5 + 1
			out_w <- in_w - 5 + 1

			col <- im2col(x, 5, 5, stride = 1, pad = 0)
			n_rows <- out_h * out_w * n_batch
			bias <- matrix(rep(self$b, n_rows), nrow = n_rows, byrow = TRUE)
			out <- col %*% self$W + bias
			out <- aperm(array(out, c(out_h, out_w, n_batch, 30)), c(1, 2, 4, 3))

			self$x <- x
			self$col <- col
			return(out)
		},

		# Accumulates dW/db from the upstream gradient.
		backward = function(dout) {
			dout <- matrix(aperm(dout, c(1, 2, 4, 3)), ncol = 30)
			self$db <- apply(dout, 2, sum)
			self$dW <- t(self$col) %*% dout
			# dx is intentionally not computed (first layer in the net).
		}
	)
)
Pooling <- R6Class("Pooling",

	# Fixed hyper-parameters: stride = 2, pad = 0, window = 2x2.

	public = list(
		x = NULL,        # cached forward input (H, W, C, N)
		arg_max = NULL,  # position of the max within each 2x2 window

		# Max-pools x and returns an (N x features) matrix.
		forward = function(x) {
			n_batch <- dim(x)[4]  # batch size (100 in the article)

			col <- im2col_pooling(x, 2, 2)
			pooled <- apply(col, 1, max)
			out <- matrix(pooled, nrow = n_batch, byrow = TRUE)

			self$x <- x
			self$arg_max <- apply(col, 1, which.max)
			return(out)
		},

		# Routes each upstream gradient back to the max position of its
		# window; all other positions receive zero.
		backward = function(dout) {
			flat <- as.vector(t(dout))
			n_win <- length(self$arg_max)
			dmax <- matrix(0, n_win, 4)
			for (k in seq_len(n_win)) {
				dmax[k, self$arg_max[k]] <- flat[k]
			}
			dims <- dim(self$x)
			dx <- col2im_pooling(dmax, dims[1], dims[2], dims[3], dims[4], 2, 2)

			return(dx)
		}
	)
)

Rで行列計算(行列の積)

RjpWikiによると「crossprod」と「%*%」では前者のほうが計算が速いとのこと。
実際にやってみた。
(環境:Windows 7 64bit、R3.4.0)

> x <- matrix(rnorm(25000000),c(5000,5000))
> y <- matrix(rnorm(25000000),c(5000,5000))
> system.time(a <- t(x) %*% y)
   ユーザ   システム       経過  
     85.10       0.11      85.20 
> system.time(b <- crossprod(x,y))
   ユーザ   システム       経過  
    129.95       0.09     130.16 
> identical(a,b)
[1] TRUE

同じことをMicrosoft R Openでもやってみた。
(環境:Windows 7 64bit、Microsoft R Open3.3.3)

> x <- matrix(rnorm(25000000),c(5000,5000))
> y <- matrix(rnorm(25000000),c(5000,5000))
> system.time(a <- t(x) %*% y)
   ユーザ   システム       経過  
     13.17       0.01       6.82 
> system.time(b <- crossprod(x,y))
   ユーザ   システム       経過  
     14.29       0.03       7.18 
> identical(a,b)
[1] TRUE

とにかくMicrosoft R Openは格段に速い。

RでDeep Learning(10)

Affineレイヤの書き換え

Affine <- R6Class("Affine",

	public = list(
		W = NULL,        # weight matrix
		b = NULL,        # bias row vector
		x = NULL,        # cached, flattened forward input (batch x features)
		x_shape = NULL,  # input dims with H/W swapped, for backward reshape
		dW = NULL,       # weight gradient
		db = NULL,       # bias gradient

		initialize = function(W,b){
			self$W <- W
			self$b <- b
		},

		# Flattens 4-D input (H, W, C, N) to (N x features), then x %*% W + b.
		forward = function(x){

			# Record the input shape with dims 1 and 2 swapped, matching the
			# aperm below; backward reverses it to rebuild the 4-D gradient.
			dummy_shape <- dim(x)
			dummy_shape[1] <- dim(x)[2]
			dummy_shape[2] <- dim(x)[1]
			self$x_shape <- dummy_shape
			batch <- dim(x)[4]
			# BUG FIX: was `seld$x <- ...` — object 'seld' does not exist,
			# so forward() errored and the input was never cached for backward.
			self$x <-  x %>% aperm(.,c(2,1,3,4)) %>% as.vector %>% matrix(.,nrow=batch,byrow=T)
			out <- (self$x %*% self$W)+matrix(rep(self$b,batch),nrow=batch,byrow=T)
			return(out) 
		},
		
		# Standard affine backward; reshapes dx back to the 4-D input shape.
		backward = function(dout){
			dx <- dout %*% (self$W %>% t())
			self$dW <- (self$x %>% t()) %*% dout
			self$db <- apply(dout,2,sum)
			dx <- array(t(dx),self$x_shape) %>% aperm(.,c(2,1,3,4))
			return(dx)
		}
	)
)

RでDeep Learning(9)

Rでim2colの実装(プーリング層はim2col_poolingとして別に実装)
Rのdrop機能にはまった。

# im2col for 4-D input: unrolls every filter-sized patch into one row of
# the returned matrix, so convolution becomes a single matrix product.
# Returns an (N * out_h * out_w) x (filter_h * filter_w * C) matrix.
im2col <- function(input_data,filter_h,filter_w,stride=1,pad=0){
	#input_data: 4-D array (height, width, channels, batch)
	#filter_h: filter height
	#filter_w: filter width

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	C <- dim(input_data)[3]
	N <- dim(input_data)[4]
	
	out_h <- ((H + 2*pad - filter_h) %/% stride) + 1
	out_w <- ((W + 2*pad - filter_w) %/% stride) + 1

	# Zero-pad the spatial dimensions when requested.
	if(pad==0){
		img <- input_data
	}else{
		img <- array(0,c((H+2*pad),(W+2*pad),C,N))
		for(i in 1:C){
			for(v in 1:N){
				img[(1+pad):(H+pad),(1+pad):(W+pad),i,v] <- input_data[,,i,v]
			}
		}
	}
	m_col <- filter_h * filter_w * C
	m_row <- N * out_h * out_w
	m <- matrix(0,m_row,m_col)

	# The two branches differ only in how a patch is extracted: when
	# C == 1, img[,,,dim_4] would drop to a 2-D matrix and the apply()
	# over the channel dimension would fail — hence the special case
	# (the "drop" pitfall mentioned in the article).
	if(C != 1){
		row_number <- 1
		for(dim_4 in 1:N){
			for(out_w_index in 1:out_w){
				w_start <- (out_w_index - 1) * stride + 1
				w_end <- (out_w_index - 1) * stride + filter_w
			
				for(out_h_index in 1:out_h){
					h_start <- (out_h_index - 1) * stride + 1
					h_end <- (out_h_index - 1) * stride + filter_h
					# One row = the same patch from every channel, concatenated.
					y <- apply(img[,,,dim_4],3,function(x)x[h_start:h_end,w_start:w_end])
					m[row_number,] <- as.vector(y)
					row_number <- row_number + 1
				}
			}
		}
	}else{
		row_number <- 1
		for(dim_4 in 1:N){
			for(out_w_index in 1:out_w){
				w_start <- (out_w_index - 1) * stride + 1
				w_end <- (out_w_index - 1) * stride + filter_w
			
				for(out_h_index in 1:out_h){
					h_start <- (out_h_index - 1) * stride + 1
					h_end <- (out_h_index - 1) * stride + filter_h
					# Single channel: index directly, no apply over channels.
					y <- img[h_start:h_end,w_start:w_end,1,dim_4]
					m[row_number,] <- as.vector(y)
					row_number <- row_number + 1
				}
			}
		}
	}
	return(m)
}

MNISTはチャネル数1なので3次元用のim2col_singleを使用。
(コード簡略化と高速化のため)
Convolution層を最初にしか置かないのでこれでOK。

# im2col for single-channel 3-D input (H, W, N), hard-coded for a
# 5x5 filter with stride 1 and no padding.
# Returns an (N * out_h * out_w) x 25 matrix, one patch per row.
im2col_single <- function(input_data) {

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	N <- dim(input_data)[3]

	out_h <- H - 4
	out_w <- W - 4

	m <- matrix(0, N * out_h * out_w, 25)

	# Explicit loops instead of apply() with <<- side effects:
	# same fill order (h fastest, then w, then batch).
	row_number <- 1
	for (n in seq_len(N)) {
		img <- input_data[, , n]
		for (w_index in seq_len(out_w)) {
			for (h_index in seq_len(out_h)) {
				patch <- img[h_index:(h_index + 4), w_index:(w_index + 4)]
				m[row_number, ] <- as.vector(patch)
				row_number <- row_number + 1
			}
		}
	}
	return(m)
}
> dim(x_test)
[1]    28    28     1 10000
> x_test_3D <- x_test[,,1,]
> dim(x_test_3D)
[1]    28    28 10000
> system.time(col <- im2col(x_test,5,5,stride=1,pad=0))
   ユーザ   システム       経過  
     37.84       0.28      38.19 
> system.time(col2 <- im2col_single(x_test_3D))
   ユーザ   システム       経過  
     26.41       0.23      26.72 
> identical(col,col2)
[1] TRUE
# im2col for the pooling layer: splits the (H, W, C, N) input into
# non-overlapping filter_h x filter_w windows (stride equals the window
# size, no padding) and returns one window per row.
im2col_pooling <- function(input_data, filter_h, filter_w) {

	H <- dim(input_data)[1]
	W <- dim(input_data)[2]
	C <- dim(input_data)[3]
	N <- dim(input_data)[4]

	row_num <- ((H * W) / (filter_h * filter_w)) * C * N
	col_num <- filter_h * filter_w

	# Regroup so each window's elements become contiguous, then flatten.
	blocks <- array(input_data, c(filter_h, H / filter_h, filter_w, W / filter_w, C, N))
	windows <- aperm(blocks, c(1, 3, 2, 4, 5, 6))
	# BUG FIX: the original passed c(row_num, col_num) as the `nrow`
	# argument of matrix(); only its first element was used (and newer R
	# versions reject a length-2 nrow). Name both dimensions explicitly.
	out <- matrix(windows, nrow = row_num, ncol = col_num, byrow = TRUE)

	return(out)
}
# Inverse of im2col_pooling: scatters the (windows x elements) matrix x
# back into an (H, W, C, N) array. Stride equals the window size, no pad.
col2im_pooling <- function(x, H, W, C, N, filter_h, filter_w) {

	windows <- array(t(x), c(filter_h, filter_w, H / filter_h, W / filter_w, C, N))
	blocks <- aperm(windows, c(1, 3, 2, 4, 5, 6))
	return(array(blocks, c(H, W, C, N)))
}

RでDeep Learning(8)

前回の続き。

正解率はこの通り。

> test_acc
[1] 0.9713
> train_acc
[1] 0.9892667

正解できなかった問題の一部。

# Collect the test samples the network classified incorrectly.
y <- predict(x_test)
y_1 <- apply(y, 1, which.max)
t <- apply(t_test, 1, which.max)
error <- t != y_1

num_image <- x_test[error, ]
num_t <- t_test[error, ]
num_predict <- y[error, ]

# Displays misclassified sample x: draws the 28x28 image and prints
# the true and predicted digits.
hyozi <- function(x) {
	# BUG FIX: was `num[x, ]`, but no object `num` exists; the
	# misclassified images were stored in `num_image` above.
	a <- num_image[x, ]
	b <- matrix(a, nrow = 28, byrow = TRUE)
	# Rotate/flip so image() renders the digit upright
	d <- t(b[nrow(b):1, ncol(b):1])[ncol(b):1, ]
	image(d, axes = FALSE)

	print(paste("正解:", which.max(num_t[x, ]) - 1))
	print(paste("予測:", which.max(num_predict[x, ]) - 1))
}

正解:9、予測:3
f:id:touch-sp:20170428044716p:plain:w200

正解:5、予測:6
f:id:touch-sp:20170428044813p:plain:w200

正解:0、予測:8
f:id:touch-sp:20170428044939p:plain:w200