下準備
#ストップワードを決める library(tm) new_stopwords <- c(stopwords("en"), "also", "however", "thus", "may") saveRDS(new_stopwords, "new_stopwords")
データの取得
腎臓関連の文献約10万本のアブストラクトを使用。
touch-sp.hatenablog.com
GloVe Word Embeddingsを使用
#データの読み込み new_stopwords <- readRDS("new_stopwords") Absts <- readLines("Absts_train.txt")
#単語の抽出 it <- itoken(Absts, tolower, word_tokenizer) voc <- create_vocabulary(it, stopwords = new_stopwords) voc <- prune_vocabulary(voc, doc_count_min = 5L) voc <- voc[!grepl("^[0-9]*$", voc$term),]
> voc Number of docs: 105282 178 stopwords: i, me, my, myself, we, our ... ngram_min = 1; ngram_max = 1 Vocabulary: term term_count doc_count 1: cryosections 5 5 2: invites 5 5 3: acidophilic 5 5 4: dieu 5 5 5: detachable 5 5 --- 33410: disease 94930 48540 33411: kidney 107759 49634 33412: p 110387 32916 33413: renal 186330 68010 33414: patients 276182 66617
#TCM(term-co-occurence matrix)の作成 tcm <- create_tcm(it, vocab_vectorizer(voc), skip_grams_window = 8L)
#ワードベクトルの作成 glove <- GloVe$new(word_vectors_size = 50, vocabulary = voc, x_max = 10) main <- glove$fit_transform(tcm, n_iter = 20) context <- glove$components word_vectors <- main + t(context)
結果の確認
#結果の確認 CKD <- word_vectors["ckd",, drop = FALSE] chronic <- word_vectors["chronic",, drop = FALSE] acute <- word_vectors["acute",, drop = FALSE] answer <- CKD - chronic + acute cos_sim <- sim2(x = word_vectors, y = answer, method = "cosine", norm = "l2") head(sort(cos_sim[, 1], decreasing = TRUE), 5)
aki acute mortality failure injury 0.9147137 0.8503673 0.7743812 0.7721670 0.7521985
#結果の確認_2 answer <- word_vectors["development",, drop = FALSE] cos_sim <- sim2(x = word_vectors, y = answer, method = "cosine", norm = "l2") sort(cos_sim[, 1], decreasing = TRUE)[2:6]
progression disease early contribute important 0.8232047 0.7976576 0.7907084 0.7826525 0.7817541
#結果の確認_3 answer <- word_vectors["smoking",, drop = FALSE] cos_sim <- sim2(x = word_vectors, y = answer, method = "cosine", norm = "l2") sort(cos_sim[, 1], decreasing = TRUE)[2:6]
cigarette alcohol tobacco gender history 0.7918731 0.7415679 0.7381536 0.7137309 0.6848627
環境
> sessionInfo() R version 3.4.3 (2017-11-30) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 10 x64 (build 16299) Matrix products: default locale: [1] LC_COLLATE=Japanese_Japan.932 LC_CTYPE=Japanese_Japan.932 LC_MONETARY=Japanese_Japan.932 [4] LC_NUMERIC=C LC_TIME=Japanese_Japan.932 attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] RevoUtils_10.0.7 RevoUtilsMath_10.0.1 loaded via a namespace (and not attached): [1] compiler_3.4.3 rtvs_1.0.0.0
> packageVersion("text2vec") [1] ‘0.5.0’