The R package wordVectors provides an interface to the word2vec algorithm introduced by [Mikolov]. The goal is to train word embeddings for several subcorpora of the Transnorms project. Based on these embeddings, the most similar terms (by cosine distance of the embeddings) are then determined for a defined set of words at different points in time and levels of locality. Comparing these nearest neighbours is meant to shed light on the meaning/translation of the key terms.
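For reference, the comparison measure is the cosine of the angle between two embedding vectors, i.e. their normalized dot product. A minimal sketch in base R (the helper name cosine_sim is ours; wordVectors computes the same measure internally):

# cosine similarity: sim(a, b) = sum(a * b) / (||a|| * ||b||)
cosine_sim <- function(a, b) {
  sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
}
cosine_sim(c(1, 0, 1), c(1, 1, 0))  # 0.5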
library(wordVectors)
load("token.RData")
token<-db_data$token
token<-cbind(paste0(token$id,"_",token$sid,sep=""),token)
colnames(token)[1]<-"sentence_id"
token$sentence_id<-stringr::str_remove_all(string = as.character(token$sentence_id),pattern="Transnorms no Loc_")
unique_sentence_ids<-unique(token$sentence_id)
# number of sentences
length(unique_sentence_ids)
## [1] 77501
# reassemble the original sentences by pasting together the tokens of each sentence id
sentences_orig <- unlist(lapply(X = seq_along(unique_sentence_ids), FUN = function(x) {
  # print progress every 10,000 sentences
  if (x %% 10000 == 0) {
    print(x)
  }
  paste(token$word[which(token$sentence_id == unique_sentence_ids[x])], collapse = " ")
}))
## [1] 10000
## [1] 20000
## [1] 30000
## [1] 40000
## [1] 50000
## [1] 60000
## [1] 70000
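The lapply above rescans the full token table once per sentence, which is quadratic in corpus size. An equivalent but much faster alternative (a sketch, assuming the sentence ids uniquely key the groups) collects all tokens in a single pass:

# group all tokens by sentence id in one pass, then paste each group;
# indexing the groups by unique_sentence_ids preserves the original order
token_groups <- split(token$word, token$sentence_id)
sentences_orig_fast <- unname(sapply(token_groups[unique_sentence_ids], paste, collapse = " "))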
sentences<-sentences_orig
# collapse repeated whitespace
sentences <- stringr::str_replace_all(string = sentences, pattern = " +", replacement = " ")
# replace escaped and literal newlines with spaces
sentences<-stringr::str_replace_all(string = sentences,pattern ="\\\\n" ,replacement = " ")
sentences<-stringr::str_replace_all(string = sentences,pattern ="\\n" ,replacement = " ")
# tolower
sentences<-tolower(x = sentences)
# remove punctuation
sentences <- stringr::str_replace_all(string = sentences,pattern = stringr::regex(" [\\p{P}\\p{S}]+ "),replacement = " ")
sentences <- stringr::str_replace_all(string = sentences,pattern="[[:punct:]]+",replacement = " ")
# replace digit sequences with a placeholder (#)
sentences <- stringr::str_replace_all(string = sentences,pattern = stringr::regex("[0-9]+"),replacement = "#")
sentences<-sentences[-which(nchar(sentences)<6)]
# write one cleaned sentence per line and prepare the word2vec input file
writeLines(text = sentences, con = "inputtext.txt", sep = "\n")
wordVectors::prep_word2vec(origin = "inputtext.txt", destination = "input.txt", lowercase = F, bundle_ngrams = 1, min_count = 3)
## Beginning tokenization to text file at input.txt
## Prepping inputtext.txt
# train 80-dimensional skip-gram embeddings (cbow = 0) with window 5, min count 3, 20 iterations
model = train_word2vec(train_file = "input.txt", output_file = "vectors.bin", vectors = 80, threads = 6, window = 5, cbow = 0, min_count = 3, iter = 20, force = T)
## Starting training using file /home/christian/R/Transnorms/Word2Vec/input.txt
## 100K 200K 300K 400K 500K 600K 700K 800K 900K 1000K 1100K 1200K 1300K
## Vocab size: 11336
## Words in train file: 1381204
## Filename ends with .bin, so reading in binary format
## Reading a word2vec binary file of 11336 rows and 80 columns
##   |=================================================================| 100%
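Since train_word2vec persists the embeddings to vectors.bin (and, as the messages above show, .bin files are read back in binary format), a later session can reload the model without retraining:

# reload the trained embeddings instead of retraining
model <- wordVectors::read.vectors("vectors.bin")
dim(model)  # 11336 words x 80 dimensions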
# the 20 terms closest to "nuclear" by cosine similarity
result <- wordVectors::closest_to(matrix = model, vector = "nuclear", n = 20, fancy_names = F)
result
## word similarity
## 1 nuclear 1.0000000
## 2 power 0.7139748
## 3 generation 0.6783276
## 4 coal 0.6736706
## 5 coalbased 0.6714035
## 6 utilityscale 0.6695388
## 7 reactors 0.6553851
## 8 gigawatts 0.6552829
## 9 hydro 0.6472311
## 10 highefficiency 0.6459507
## 11 geothermal 0.6448424
## 12 hydroelectric 0.6421985
## 13 thermal 0.6394015
## 14 plants 0.6388707
## 15 ultrasupercritical 0.6120512
## 16 station 0.6027416
## 17 hydroelectricity 0.5969970
## 18 wind 0.5957769
## 19 adjarala 0.5955060
## 20 photovoltaic 0.5922985
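Towards the project goal, the same procedure is then repeated per subcorpus and the neighbour sets are compared. A sketch of that comparison, assuming one prepped input file per subcorpus (the file names are hypothetical):

# one model per subcorpus (hypothetical file names)
subcorpora <- c("input_1990s.txt", "input_2000s.txt")
models <- lapply(subcorpora, function(f) {
  train_word2vec(train_file = f, output_file = sub(".txt", ".bin", f, fixed = TRUE),
                 vectors = 80, threads = 6, window = 5, cbow = 0,
                 min_count = 3, iter = 20, force = TRUE)
})
# nearest neighbours of the same keyword in each subcorpus
neighbours <- lapply(models, function(m) {
  wordVectors::closest_to(matrix = m, vector = "nuclear", n = 20, fancy_names = FALSE)
})
# overlap of the two neighbour sets as a crude indicator of meaning shift
length(intersect(neighbours[[1]]$word, neighbours[[2]]$word))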