##############################################################################################################
## Locale: switch all locale categories to "Chinese"
## (original note: Simplified Chinese; presumably so that the crawled Chinese text is handled correctly)
##############################################################################################################

Sys.setlocale(category = "LC_ALL", locale = "Chinese")

##############################################################################################################
## Stage 1: crawl the URL of every item
##############################################################################################################

##############################################################################
# Set the working directory
# NOTE(review): machine-specific absolute path; any relative file paths used
# later in the script resolve against this directory.
##############################################################################

setwd(dir = "C:/Users/erica_sung/Desktop/RCrawler/Tmall")

##############################################################################
# Load the libraries used by the crawler
##############################################################################

library(xml2) #沒有辦法設定cookie
library(httr) #需要cookie
#library(magrittr) #簡化code
library(jsonlite) #第二階段會抓回JSON
#library(RODBC)

##############################################################################
# Set the main URL so that R can fetch the HTML to crawl
##############################################################################

url <- "https://a...content-available-to-author-only...l.com/i/asynSearch.htm?_ksTS=1524104678076_405&callback=jsonp406&mid=w-14434706031-0&wid=14434706031&path=/search.htm&search=y&spm=a312a.7700824.w5001-14434542985.2.1ee52442XxgtMD&scene=taobao_shop"

# Cookies sent with the listing request, as a named character vector
# (name = cookie name, value = cookie value).
# NOTE(review): several values already look percent-encoded; confirm that
# set_cookies() does not escape them a second time.
session_cookies <- c(
  "hng" = "TW%7Czh-TW%7CTWD%7C158",
  "cna" = "E1wuE2k9ZDsCASTg3lX3Mf+w",
  "enc" = "IJq7NvFdNdQioyHm%2FW7eLj1oTU%2BpLnV0h%2FZe3f5NIrCpiTi5MAwEUKOZkW4EIASKidASmJAfUwhfRaqZH0T%2Bkw%3D%3D",
  "_uab_collina" = "152360084906149649411295",
  "_m_h5_tk" = "fd235beee7c10942953c5c382e988802_1524045248697",
  "_m_h5_tk_enc" = "0ac50107c199def3d50bf0996d5c566b",
  "t" = "a00d9085fb70629a260b253bf769e5a0",
  "cookie2" = "1d1b7bcbbb7b7d259645daeebf945b95",
  "isg" = "BGdnS8nG8aa2w3UiZJxAC_9B9pvxRG1sqe8efznUgfYdKIfqQbzLHqUOTiC2xxNG"
)

# Fetch the listing; `doc` is the httr response parsed further below.
doc <- GET(url, config = set_cookies(.cookies = session_cookies))


## Collect the URL and the other details of every listed item

xpath.url <- '/html/body/div/div[3]/div/dl/dd[2]/a'
xpath.price <- '/html/body/div/div[3]/div/dl/dd[2]/div/div[1]/span[2]'

# Parse the response once and reuse the link nodes for both href and title.
listing <- content(doc)
link_nodes <- xml_find_all(listing, xpath.url)

# Item URLs: take the href, swap the protocol-relative detail prefix for the
# absolute one, and rewrite the abbucket parameter.
tu <- trimws(xml_attr(link_nodes, "href"))
tu <- gsub('\"//detail.tmall.com/item.htm',
           'https://d...content-available-to-author-only...l.com/item.htm',
           tu)
tu <- gsub('&abbucket=12', '&abbucket=1', tu)
# Drop the first character and the last two characters of each URL
# (presumably escape/quote residue around the attribute value -- TODO confirm).
tu <- substr(tu, 2, nchar(tu) - 2)

# Item id: the text between the first "id" and the first "&rn" in the URL.
# +3 skips "id" plus the single character that follows it (presumably "=").
id_first <- regexpr('id', tu) + 3
id_last <- regexpr('&rn', tu) - 1
id <- substr(tu, id_first, id_last)

# Item title: the text of the same link nodes.
ttl <- xml_text(link_nodes)

# Item price.
tp <- trimws(xml_text(xml_find_all(listing, xpath.price)))

# One row per item, four columns: url, id, title, price.
weblist <- data.frame(matrix(cbind(tu, id, ttl, tp), ncol = 4, byrow = FALSE))
names(weblist) <- c('url', 'id', 'title', 'price')

# Optional: save the listing to disk.
#write.csv(weblist, 'weblist.csv', row.names=T, fileEncoding = 'UTF-8') 


## Stage 2: crawl the review pages of every item in `weblist`
##
## Reads : weblist (columns `id` and `title`)
## Writes: final_ratelist -- one row per review, with the item title in
##         `productname`; NULL when nothing was collected.

# Kept under its original name as script-level state. The loop below uses
# seq_len(), so an empty `weblist` runs zero iterations instead of c(1, 0).
nrow <- nrow(weblist)

# Upper bound on the number of review pages requested per item.
max_pages <- 99

# Review fields to keep from each parsed page.
wanted_cols <- c('auctionSku', 'gmtCreateTime', 'goldUser', 'id', 'pics',
                 'position', 'rateContent', 'reply', 'rateDate',
                 'tradeEndTime')

# Fetch one review page and return the text of its <body>.
fetch_rate_body <- function(page_url) {
  response <- GET(page_url, config = set_cookies(
    'JSESSIONID' = '8F0C390B5E686300BC04AA763900EB6D'
  ))
  xml_text(xml_find_all(content(response), '/html/body'))
}

# Collect the pages in a preallocated list and bind them once at the end;
# calling rbind() on a growing data frame copies it on every iteration.
pages <- vector("list", nrow * max_pages)
n_pages <- 0L

for (i in seq_len(nrow)) {
  productname <- weblist$title[i]

  for (j in seq_len(max_pages)) {
    suburl <- paste0('https://r...content-available-to-author-only...l.com/list_detail_rate.htm?itemId=',weblist$id[i],'&sellerId=686773455&order=3&currentPage=',j,'&append=0&content=1')
    json <- fetch_rate_body(suburl)

    # No 'rateList' in the body: wait a minute and retry once
    # (presumably the server is throttling -- TODO confirm).
    # isTRUE() also covers an empty or NA body, where `== -1` would error.
    if (!isTRUE(regexpr('rateList', json)[1] > 0)) {
      Sys.sleep(60)
      json <- fetch_rate_body(suburl)
    }

    # Keep only the rateList array: the text from 10 characters after the
    # start of 'rateList' up to 3 characters before 'searchinfo'.
    a <- regexpr('rateList', json)
    b <- regexpr('searchinfo', json)
    if (!isTRUE(a[1] > 0) || !isTRUE(b[1] > 0)) {
      # Without both markers the substring below is not valid JSON and
      # fromJSON() would abort the whole crawl; skip the rest of this item.
      warning("No rateList found for item ", weblist$id[i], " on page ", j,
              " after one retry; skipping the remaining pages of this item",
              call. = FALSE)
      break
    }
    json <- substr(json, a + 10, b - 3)
    Encoding(json) <- 'UTF-8'

    # Clean-up before parsing, in this order: collapse doubled quotes, restore
    # the empty strings that collapsing reduced to a lone quote (before '}'
    # and before ','), then remove <U+XXXX> escape markers.
    json <- gsub('""', '"', json)
    json <- gsub(':"}', ':""}', json)
    json <- gsub(':",', ':"",', json)
    json <- gsub("<U\\+[0-9A-F]{4}>", '', json)
    ratelist <- fromJSON(json)

    # A page without entries does not parse to a data frame, and the column
    # subsetting below would fail on it; treat it as the end of this item's
    # reviews (assumes later pages are empty as well).
    if (!is.data.frame(ratelist) || NROW(ratelist) == 0) {
      break
    }

    # drop = FALSE keeps a data frame even when only one column matches.
    ratelist <- ratelist[, colnames(ratelist) %in% wanted_cols, drop = FALSE]
    ratelist <- cbind(productname, ratelist)

    n_pages <- n_pages + 1L
    pages[[n_pages]] <- ratelist
  }
}

# rbind() with no arguments returns NULL, matching the original empty result.
final_ratelist <- do.call(rbind, pages[seq_len(n_pages)])