Trying out something like SCDV with R and fastText

First, run fastText on Ubuntu (in place of word2vec):

touch-sp.hatenablog.com

Then pick up the fastText output in R and run the rest of the pipeline:

library(text2vec)

# Read the fastText output (word vectors)
# The first line of a .vec file is "vocab_size dim", so it is skipped;
# the trailing space on each line creates an empty 101st column, which is dropped
word_vec <- read.table("model3.vec", header = F, sep = " ", skip = 1, row.names = 1)
word_vec <- word_vec[, -101]
word_vec <- as.matrix(word_vec)
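
At this point word_vec should be a vocabulary-size-by-100 numeric matrix (this model was trained with 100-dimensional vectors). A quick sanity check (my addition, not in the original run):

# One row per word, 100 dimensions per row
dim(word_vec)
head(rownames(word_vec))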

# Read the training data prepared for text2vec and build a vocabulary table
Absts <- readLines("post_text2vec.txt", warn = FALSE, encoding = "UTF-8")
it <- itoken(Absts, tolower, word_tokenizer)
voc <- create_vocabulary(it)
# Capture the document count now; subsetting voc later drops this attribute
Number_of_docs <- attr(voc, "document_count")
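
If the corpus is noisy, text2vec's prune_vocabulary() can also drop very rare terms before the alignment below; the cutoff of 5 here is an arbitrary illustration, not a value from the original run:

# Optional: discard terms appearing fewer than 5 times in the corpus
voc <- prune_vocabulary(voc, term_count_min = 5)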

# Align the vocabulary table and the word vectors
library(tm)
# Extend tm's English stopword list with single letters and a few filler words
new_stopwords <- c(stopwords("en"), letters, "can", "could", "may", "might", "also", "however")

# Keep only the terms present in both the vocabulary and the fastText model,
# excluding stopwords
both <- intersect(voc$term, rownames(word_vec))
both <- both[!(both %in% new_stopwords)]

# Subset both objects to those terms and sort them identically
voc <- voc[(voc$term %in% both),]
voc <- voc[order(voc$term),]

word_vec <- word_vec[rownames(word_vec) %in% both,]
word_vec <- word_vec[order(rownames(word_vec)),]
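
After sorting both objects by term, the rows should line up one-to-one; a cheap check (my addition) catches any mismatch early:

# The i-th vocabulary entry must match the i-th word vector
stopifnot(all(voc$term == rownames(word_vec)))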

# Compute the smoothed idf for each remaining term
idf <- log((Number_of_docs + 1) / (voc$doc_count + 1)) + 1
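
For intuition, with made-up numbers: in a 1000-document corpus, a term found in 10 documents scores about 5.51, while a term found in every document scores exactly 1:

# Toy numbers, not from the actual corpus
log((1000 + 1) / (10 + 1)) + 1        # rare term -> about 5.51
log((1000 + 1) / (1000 + 1)) + 1      # ubiquitous term -> 1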

# Gaussian Mixture Modelling: soft-cluster the word vectors into 60 clusters
library(ClusterR)
gmm <- GMM(word_vec, 60, "maha_dist", "random_subset", 10, 10)
pr <- predict_GMM(word_vec, gmm$centroids, gmm$covariance_matrices, gmm$weights)
# cluster_proba holds one row per word: its membership probability for each cluster
proba <- pr$cluster_proba
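
Each row of proba is one word's membership distribution over the 60 clusters, so the rows should sum to roughly 1; a quick check (my addition):

# Soft cluster memberships for each word should sum to about 1
summary(rowSums(proba))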

# Build the new word vectors: concatenate 60 copies of each 100-dimensional
# word vector, each copy weighted by that word's probability for one cluster
Number_of_words <- nrow(word_vec)
new_vec <- matrix(0, nrow = Number_of_words, ncol = 100 * 60)
for (i in 0:59) {
    new_vec[, (i * 100 + 1):((i + 1) * 100)] <- word_vec * proba[, i + 1]
}
# Weight each row (word) by its idf: since length(idf) == nrow(new_vec),
# column-major recycling multiplies row i by idf[i]
new_vec <- new_vec * idf
rownames(new_vec) <- rownames(word_vec)
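
This stops at the idf-weighted word-topic vectors. SCDV itself goes one step further: average these vectors over each document's words, then zero out small entries to make the result sparse. A minimal sketch of that step (my addition; the 3% threshold and the simplified global min/max are loosely based on the SCDV paper, not on the original post):

# Document vector = mean of the word-topic vectors of its in-vocabulary words
tokens <- word_tokenizer(tolower(Absts))
doc_vec <- t(sapply(tokens, function(words) {
    words <- words[words %in% rownames(new_vec)]
    if (length(words) == 0) return(numeric(ncol(new_vec)))
    colMeans(new_vec[words, , drop = FALSE])
}))

# Hard-threshold near-zero entries to get sparse composite document vectors
threshold <- 0.03 * (abs(min(doc_vec)) + abs(max(doc_vec))) / 2
doc_vec[abs(doc_vec) < threshold] <- 0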