fork download
  # Build a term-document matrix from the Chinese text files under "docs".
  #
  # Pipeline: read files -> strip punctuation, digits and Latin letters ->
  # segment Chinese text into words -> drop stop words -> count terms.
  #
  # NOTE(review): relies on tm >= 0.6 for content_transformer(); confirm the
  # installed tm version before running.

  # Fail fast with a single message naming every missing package, instead of
  # stopping at the first failing library() call.
  required_pkgs <- c("tm", "tmcn", "Rwordseg")
  is_installed <- vapply(
    required_pkgs, requireNamespace, logical(1), quietly = TRUE
  )
  if (!all(is_installed)) {
    stop(
      "Missing required package(s): ",
      paste(required_pkgs[!is_installed], collapse = ", "),
      call. = FALSE
    )
  }

  library(tm)
  library(tmcn)
  library(Rwordseg)

  # VCorpus keeps one text document per file, so every transformation below
  # is applied per document rather than to all documents at once.
  d.corpus <- VCorpus(DirSource("docs", encoding = "UTF-8"))

  # Remove punctuation and numbers.
  d.corpus <- tm_map(d.corpus, removePunctuation)
  d.corpus <- tm_map(d.corpus, removeNumbers)

  # Remove upper/lower-case English letters and digits. A custom function has
  # to be wrapped in content_transformer() so that tm_map() hands back text
  # documents instead of bare character vectors.
  strip_latin <- content_transformer(function(text) {
    gsub("[A-Za-z0-9]", "", text)
  })
  d.corpus <- tm_map(d.corpus, strip_latin)

  # Segment the lines of one document into words and join them with single
  # spaces, which is the form TermDocumentMatrix() tokenizes.
  # `nature = TRUE` is kept from the original call; the tags it attaches are
  # presumably carried as element names only and are not used here.
  segment_lines <- function(lines) {
    lines <- lines[nzchar(lines)]
    if (length(lines) == 0L) {
      # Nothing left after cleaning: keep the document, but empty.
      return("")
    }
    tokens <- unlist(segmentCN(lines, nature = TRUE), use.names = FALSE)
    paste(tokens, collapse = " ")
  }
  # Because the documents stay text documents, the corpus no longer needs to
  # be rebuilt through Corpus(VectorSource(...)) after segmentation.
  d.corpus <- tm_map(d.corpus, content_transformer(segment_lines))

  # Standard Chinese stop words plus corpus-specific boilerplate terms.
  myStopWords <- c(
    stopwordsCN(),
    "編輯", "時間", "標題", "發信", "實業", "作者", "!"
  )
  d.corpus <- tm_map(d.corpus, removeWords, myStopWords)

  # Keep only terms that are at least two characters long.
  tdm <- TermDocumentMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))

  inspect(tdm)
Runtime error #stdin #stdout #stderr 0.82s 23176KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Error in library(tmcn) : there is no package called 'tmcn'
Execution halted