# (paste-site header: "fork download" -- not part of the script)
  1. ##############################################################################################################
  2. ## setting Simplified Chinese
  3. ##############################################################################################################
  4.  
  5. Sys.setlocale("LC_ALL","Chinese")
  6.  
  7. ##############################################################################################################
  8. ## 爬各篇文章的網址
  9. ##############################################################################################################
  10.  
  11. ##############################################################################
  12. # 建立工作空間
  13. ##############################################################################
  14.  
  15. setwd("C:/Users/erica_sung/Desktop/RCrawler/Tmall")
  16.  
  17. ##############################################################################
  18. # 開啟爬蟲的library
  19. ##############################################################################
  20.  
  21. library(xml2) #沒有辦法設定cookie
  22. library(httr) #需要cookie
  23. #library(magrittr) #簡化code
  24. library(jsonlite) #第二階段會抓回JSON
  25. #library(RODBC)
  26.  
  27. ##############################################################################
  28. # 設定主網址, R才能找到html指令做爬蟲
  29. ##############################################################################
  30.  
  31. url <- 'https://a...content-available-to-author-only...l.com/i/asynSearch.htm?_ksTS=1524104678076_405&callback=jsonp406&mid=w-14434706031-0&wid=14434706031&path=/search.htm&search=y&spm=a312a.7700824.w5001-14434542985.2.1ee52442XxgtMD&scene=taobao_shop'
  32.  
  33. doc <- GET(url, config = set_cookies(
  34. 'hng' = 'TW%7Czh-TW%7CTWD%7C158',
  35. 'cna' = 'E1wuE2k9ZDsCASTg3lX3Mf+w',
  36. 'enc' = 'IJq7NvFdNdQioyHm%2FW7eLj1oTU%2BpLnV0h%2FZe3f5NIrCpiTi5MAwEUKOZkW4EIASKidASmJAfUwhfRaqZH0T%2Bkw%3D%3D',
  37. '_uab_collina' = '152360084906149649411295',
  38. '_m_h5_tk' = 'fd235beee7c10942953c5c382e988802_1524045248697',
  39. '_m_h5_tk_enc' = '0ac50107c199def3d50bf0996d5c566b',
  40. 't' = 'a00d9085fb70629a260b253bf769e5a0',
  41. 'cookie2' = '1d1b7bcbbb7b7d259645daeebf945b95',
  42. 'isg' = 'BGdnS8nG8aa2w3UiZJxAC_9B9pvxRG1sqe8efznUgfYdKIfqQbzLHqUOTiC2xxNG'
  43. ))
  44.  
  45.  
  46. ##抓所有文章的url跟其他資訊
  47.  
  48. xpath.url <- '/html/body/div/div[3]/div/dl/dd[2]/a'
  49.  
  50. tu <- trimws(xml_attr(xml_find_all(content(doc), xpath.url), "href"))
  51. tu <- gsub('\"//detail.tmall.com/item.htm','https://d...content-available-to-author-only...l.com/item.htm',tu)
  52. tu <- gsub('&abbucket=12','&abbucket=1',tu)
  53. tu <- substr(tu,2,nchar(tu)-2)
  54.  
  55. a <- regexpr('id',tu)
  56. b <- regexpr('&rn',tu)
  57. id <- substr(tu,a+3,b-1)
  58.  
  59. ttl <- xml_text(xml_find_all(content(doc), xpath.url))
  60.  
  61.  
  62. xpath.price <- '/html/body/div/div[3]/div/dl/dd[2]/div/div[1]/span[2]'
  63. tp <- trimws(xml_text(xml_find_all(content(doc), xpath.price)))
  64.  
  65.  
  66. weblist <- cbind(tu,id,ttl,tp)
  67. weblist <- data.frame(matrix(weblist, ncol=4, byrow=F))
  68. colnames(weblist) <- c('url','id','title','price')
  69.  
  70. #write.csv(weblist, 'weblist.csv', row.names=T, fileEncoding = 'UTF-8')
  71.  
  72.  
  73. ##爬每一個文章
  74.  
  75. nrow <- nrow(weblist)
  76.  
  77. final_ratelist <- c()
  78.  
  79. for ( i in 1:nrow)
  80. {
  81. for (j in 1:99)
  82. {
  83. suburl <- paste0('https://r...content-available-to-author-only...l.com/list_detail_rate.htm?itemId=',weblist$id[i],'&sellerId=686773455&order=3&currentPage=',j,'&append=0&content=1')
  84. subdoc <- GET(suburl, config = set_cookies('JSESSIONID' = '8F0C390B5E686300BC04AA763900EB6D'))
  85.  
  86. json <- xml_text(xml_find_all(content(subdoc), '/html/body'))
  87.  
  88. a <- regexpr('rateList',json)
  89.  
  90. if(a[1]==-1) {
  91. Sys.sleep(60)
  92. subdoc <- GET(suburl, config = set_cookies('JSESSIONID' =
  93. '8F0C390B5E686300BC04AA763900EB6D'))
  94. json <- xml_text(xml_find_all(content(subdoc), '/html/body'))
  95. }
  96.  
  97. #只想要rateList這段
  98. a <- regexpr('rateList',json)
  99. b <- regexpr('searchinfo',json)
  100. json <- substr(json,a+10,b-3)
  101. Encoding(json) <- 'UTF-8'
  102.  
  103. json <- gsub("<U\\+[0-9A-F]{4}>",'',gsub(':",',':"",',gsub(':"}',':""}',gsub('""','"',json))))
  104. ratelist <- fromJSON(json)
  105.  
  106. productname <- weblist$title[i]
  107. ratelist <- ratelist[, which(colnames(ratelist)%in%c('auctionSku','gmtCreateTime','goldUser','id','pics','position','rateContent','reply','rateDate','tradeEndTime'))]
  108. ratelist <- cbind(productname, ratelist)
  109.  
  110. final_ratelist <- rbind(final_ratelist,ratelist)
  111.  
  112. }
  113. }
# --- Captured run log from the online R sandbox (not part of the script) ---
# Success #stdin #stdout #stderr 0.26s 38976KB
# stdin:  (empty)
# stdout: [1] ""
# stderr:
#   Warning message:
#   In Sys.setlocale("LC_ALL", "Chinese") :
#     OS reports request to set locale to "Chinese" cannot be honored
#   Error in setwd("C:/Users/erica_sung/Desktop/RCrawler/Tmall") :
#     cannot change working directory
#   Execution halted