library(tm)
library(tmcn)
library(Rwordseg)
# Build the corpus from every file found under "docs", read as UTF-8.
d.corpus <- Corpus(DirSource("docs", encoding = "UTF-8"))

# Strip punctuation first, then numbers, using tm's built-in transformations.
d.corpus <- tm_map(d.corpus, removePunctuation)
d.corpus <- tm_map(d.corpus, removeNumbers)

# Delete ASCII letters (both cases) and digits from each document.
# NOTE(review): newer tm releases generally expect custom functions passed to
# tm_map() to be wrapped in content_transformer() -- confirm against the
# installed tm version before changing this call.
d.corpus <- tm_map(
  d.corpus,
  function(doc_text) {
    gsub(pattern = "[A-Za-z0-9]", replacement = "", x = doc_text)
  }
)

# Segment each document into words with segmentCN(); `nature = TRUE` is
# forwarded to segmentCN (presumably enables part-of-speech tagging -- verify
# in the Rwordseg documentation).
d.corpus <- tm_map(d.corpus, segmentCN, nature = TRUE)

# Re-wrap the segmented result as a fresh corpus through VectorSource().
d.corpus <- Corpus(VectorSource(d.corpus))

# Stop-word list: the stopwordsCN() defaults followed by script-specific terms.
myStopWords <- c(
  stopwordsCN(),
  "編輯", "時間", "標題", "發信", "實業", "作者", "!"
)
d.corpus <- tm_map(d.corpus, removeWords, myStopWords)

# Term-document matrix restricted to terms of length 2 or more.
tdm <- TermDocumentMatrix(
  d.corpus,
  control = list(wordLengths = c(2, Inf))
)

# Display the resulting matrix.
inspect(tdm)
bGlicmFyeSh0bSkKbGlicmFyeSh0bWNuKQpsaWJyYXJ5KFJ3b3Jkc2VnKQoKZC5jb3JwdXMgPC0gQ29ycHVzKERpclNvdXJjZSgiZG9jcyIsIGVuY29kaW5nID0gJ1VURi04JykpCgojIOa4hemZpOaomem7nuespuiZnywg5pW45a2XCmQuY29ycHVzIDwtIHRtX21hcChkLmNvcnB1cywgcmVtb3ZlUHVuY3R1YXRpb24pCmQuY29ycHVzIDwtIHRtX21hcChkLmNvcnB1cywgcmVtb3ZlTnVtYmVycykKIyDmuIXpmaTlpKflsI/lr6voi7HmlofoiIfmlbjlrZcKZC5jb3JwdXMgPC0gdG1fbWFwKGQuY29ycHVzLCBmdW5jdGlvbih3b3JkKSB7CiAgZ3N1YigiW0EtWmEtejAtOV0iLCAiIiwgd29yZCkKfSkKCmQuY29ycHVzIDwtIHRtX21hcChkLmNvcnB1cywgc2VnbWVudENOLCBuYXR1cmUgPSBUUlVFKQoKZC5jb3JwdXMgPC0gQ29ycHVzKFZlY3RvclNvdXJjZShkLmNvcnB1cykpCgpteVN0b3BXb3JkcyA8LSBjKHN0b3B3b3Jkc0NOKCksICLnt6jovK8iLCAi5pmC6ZaTIiwgIuaomemhjCIsICLnmbzkv6EiLCAi5a+m5qWtIiwgIuS9nOiAhSIsICLvvIEiKQpkLmNvcnB1cyA8LSB0bV9tYXAoZC5jb3JwdXMsIHJlbW92ZVdvcmRzLCBteVN0b3BXb3JkcykKCnRkbSA8LSBUZXJtRG9jdW1lbnRNYXRyaXgoZC5jb3JwdXMsIGNvbnRyb2wgPSBsaXN0KHdvcmRMZW5ndGhzID0gYygyLCBJbmYpKSkKCmluc3BlY3QodGRtKQ==