GloVe Word Embeddings 「慢性腎臓病」-「慢性」+「急性」=「急性腎障害」

下準備

#ストップワードを決める
library(tm)
new_stopwords <- c(stopwords("en"), "also", "however", "thus", "may")
saveRDS(new_stopwords, "new_stopwords")

データの取得

腎臓関連の文献約10万本のアブストラクトを使用。
touch-sp.hatenablog.com

GloVe Word Embeddingsを使用

#データの読み込み
new_stopwords <- readRDS("new_stopwords")
Absts <- readLines("Absts_train.txt")
#単語の抽出
it <- itoken(Absts, tolower, word_tokenizer)
voc <- create_vocabulary(it, stopwords = new_stopwords)
voc <- prune_vocabulary(voc, doc_count_min = 5L)
voc <- voc[!grepl("^[0-9]*$", voc$term),]
> voc
Number of docs: 105282 
178 stopwords: i, me, my, myself, we, our ... 
ngram_min = 1; ngram_max = 1 
Vocabulary: 
               term term_count doc_count
    1: cryosections          5         5
    2:      invites          5         5
    3:  acidophilic          5         5
    4:         dieu          5         5
    5:   detachable          5         5
   ---                                  
33410:      disease      94930     48540
33411:       kidney     107759     49634
33412:            p     110387     32916
33413:        renal     186330     68010
33414:     patients     276182     66617
#TCM(term-co-occurence matrix)の作成
tcm <- create_tcm(it, vocab_vectorizer(voc), skip_grams_window = 8L)
#ワードベクトルの作成
glove <- GloVe$new(word_vectors_size = 50, vocabulary = voc, x_max = 10)
main <- glove$fit_transform(tcm, n_iter = 20)
context <- glove$components
word_vectors <- main + t(context)

結果の確認

#結果の確認
CKD <- word_vectors["ckd",, drop = FALSE]
chronic <- word_vectors["chronic",, drop = FALSE]
acute <- word_vectors["acute",, drop = FALSE]

answer <- CKD - chronic + acute

cos_sim <- sim2(x = word_vectors, y = answer, method = "cosine", norm = "l2")
head(sort(cos_sim[, 1], decreasing = TRUE), 5)
      aki     acute mortality   failure    injury 
0.9147137 0.8503673 0.7743812 0.7721670 0.7521985 
#結果の確認_2
answer <- word_vectors["development",, drop = FALSE]
cos_sim <- sim2(x = word_vectors, y = answer, method = "cosine", norm = "l2")
sort(cos_sim[, 1], decreasing = TRUE)[2:6]
progression     disease       early  contribute   important 
  0.8232047   0.7976576   0.7907084   0.7826525   0.7817541 
#結果の確認_3
answer <- word_vectors["smoking",, drop = FALSE]
cos_sim <- sim2(x = word_vectors, y = answer, method = "cosine", norm = "l2")
sort(cos_sim[, 1], decreasing = TRUE)[2:6]
cigarette   alcohol   tobacco    gender   history 
0.7918731 0.7415679 0.7381536 0.7137309 0.6848627 

環境

> sessionInfo()
R version 3.4.3 (2017-11-30)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 16299)

Matrix products: default

locale:
[1] LC_COLLATE=Japanese_Japan.932  LC_CTYPE=Japanese_Japan.932    LC_MONETARY=Japanese_Japan.932
[4] LC_NUMERIC=C                   LC_TIME=Japanese_Japan.932    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] RevoUtils_10.0.7     RevoUtilsMath_10.0.1

loaded via a namespace (and not attached):
[1] compiler_3.4.3 rtvs_1.0.0.0
> packageVersion("text2vec")
[1] ‘0.5.0’