# Pick up those results in R, then run the following:
library(text2vec)

# Load fastText-style embeddings: the first line of a .vec file is a header
# (vocab size, dimension), hence skip = 1; the first column is the word
# itself and becomes the row names.
word_vec <- read.table("model3.vec", header = FALSE, sep = " ", skip = 1, row.names = 1)
# A trailing space on each .vec line produces a spurious 101st (all-NA)
# column; drop it so only the 100 embedding dimensions remain.
word_vec <- word_vec[, -101]
word_vec <- as.matrix(word_vec)

# Read the preprocessed corpus, one document per line.
# BUG FIX: readLines() only honours "latin1"/"UTF-8" for `encoding`; the
# original "utf8" was silently ignored, leaving the marking as "unknown".
Absts <- readLines("post_text2vec.txt", warn = FALSE, encoding = "UTF-8")
# Tokenize (lowercased, word-level) and build the corpus vocabulary.
it <- itoken(Absts, tolower, word_tokenizer)
voc <- create_vocabulary(it)
library(tm)

# Extended stopword list: English stopwords, single letters, and a few
# hedging words that are frequent in abstracts.
drop_words <- c(stopwords("en"), letters,
                "can", "could", "may", "might", "also", "however")

# Terms present in both the vocabulary and the embedding matrix, with the
# stopwords removed.
keep_terms <- setdiff(intersect(voc$term, rownames(word_vec)), drop_words)

# Restrict vocabulary and embeddings to the shared terms and sort both
# alphabetically so their rows line up one-to-one.
voc <- voc[voc$term %in% keep_terms, ]
voc <- voc[order(voc$term), ]
word_vec <- word_vec[rownames(word_vec) %in% keep_terms, ]
word_vec <- word_vec[order(rownames(word_vec)), ]
# Smoothed inverse document frequency per term (add-one on both counts and
# on the log, sklearn-style), aligned with the sorted vocabulary.
# BUG FIX: the original read `voc_after_after$doc_count`, an undefined
# object; the filtered vocabulary is `voc`.
Number_of_docs <- attr(voc, "document_count")
idf <- log((Number_of_docs + 1) / (voc$doc_count + 1)) + 1
library(ClusterR)

# Soft-cluster the word vectors into 60 Gaussian mixture components
# (Mahalanobis distance, random-subset initialisation, 10 EM / 10 k-means
# iterations).
# BUG FIX: the original fit on `word_vec_`, an undefined object; the
# embedding matrix is `word_vec`.
gmm <- GMM(word_vec, 60, "maha_dist", "random_subset", 10, 10)
pr <- predict_GMM(word_vec, gmm$centroids, gmm$covariance_matrices, gmm$weights)
# words x clusters matrix of membership probabilities.
proba <- pr$cluster_proba
# Build cluster-weighted embeddings: for each cluster k, a copy of every
# word vector scaled by that word's membership probability for k, laid out
# side by side -> a (words x (dim * K)) matrix.
# Generalized: dimensions are derived from the data instead of hard-coding
# 100 (embedding dim) and 60 (cluster count).
Number_of_words <- nrow(word_vec)
vec_dim <- ncol(word_vec)    # embedding dimension (100 here)
n_clusters <- ncol(proba)    # number of GMM components (60 here)

new_vec <- matrix(0, nrow = Number_of_words, ncol = vec_dim * n_clusters)
for (k in seq_len(n_clusters)) {
  cols <- ((k - 1) * vec_dim + 1):(k * vec_dim)
  # `proba[, k]` recycles down each column, scaling every word's vector by
  # its probability of belonging to cluster k.
  new_vec[, cols] <- word_vec * proba[, k]
}

# Weight each word's row by its idf (idf recycles down the columns; rows of
# word_vec and voc were aligned by the earlier alphabetical sort).
new_vec <- new_vec * idf
rownames(new_vec) <- rownames(word_vec)