##############################################################################################################
## setting Simplified Chinese
## NOTE: no locale-setting statement (e.g. Sys.setlocale) follows this banner in this file.
##############################################################################################################
##############################################################################################################
## Crawl the URL of each article
##############################################################################################################
##############################################################################
# Set the working directory for the session.
# The path is a hard-coded, machine-specific absolute path; relative file
# names used afterwards (e.g. the commented-out 'weblist.csv' export) resolve
# against it.
##############################################################################
setwd( "C:/Users/erica_sung/Desktop/RCrawler/Tmall" )
##############################################################################
# 開啟爬蟲的library
##############################################################################
library( xml2) #沒有辦法設定cookie
library( httr) #需要cookie
#library(magrittr) #簡化code
library( jsonlite) #第二階段會抓回JSON
#library(RODBC)
##############################################################################
# Listing request: define the main URL and download it, so the returned
# HTML can be searched with XPath afterwards.
##############################################################################
# NOTE(review): the host part of this URL contains the literal text
# "...content-available-to-author-only..." and is not a resolvable hostname;
# the real shop host has to be restored before this request can succeed.
url <- 'https://a...content-available-to-author-only...l.com/i/asynSearch.htm?_ksTS=1524104678076_405&callback=jsonp406&mid=w-14434706031-0&wid=14434706031&path=/search.htm&search=y&spm=a312a.7700824.w5001-14434542985.2.1ee52442XxgtMD&scene=taobao_shop'

# The request carries a fixed set of hard-coded session cookies, handed to
# httr as one named character vector.
# NOTE(review): several values already look percent-encoded, and
# httr::set_cookies() is documented to URL-escape values -- confirm that the
# resulting double encoding is what the server expects.
doc <- GET(
  url,
  config = set_cookies(
    .cookies = c(
      'hng' = 'TW%7Czh-TW%7CTWD%7C158',
      'cna' = 'E1wuE2k9ZDsCASTg3lX3Mf+w',
      'enc' = 'IJq7NvFdNdQioyHm%2FW7eLj1oTU%2BpLnV0h%2FZe3f5NIrCpiTi5MAwEUKOZkW4EIASKidASmJAfUwhfRaqZH0T%2Bkw%3D%3D',
      '_uab_collina' = '152360084906149649411295',
      '_m_h5_tk' = 'fd235beee7c10942953c5c382e988802_1524045248697',
      '_m_h5_tk_enc' = '0ac50107c199def3d50bf0996d5c566b',
      't' = 'a00d9085fb70629a260b253bf769e5a0',
      'cookie2' = '1d1b7bcbbb7b7d259645daeebf945b95',
      'isg' = 'BGdnS8nG8aa2w3UiZJxAC_9B9pvxRG1sqe8efznUgfYdKIfqQbzLHqUOTiC2xxNG'
    )
  )
)
## Collect the URL, id, title and price of every item in the listing response.

# Parse the response body once and reuse the document for every XPath query.
page <- content(doc)

# Anchor elements of the listed items: their href holds the item URL and
# their text holds the item title.
xpath.url <- '/html/body/div/div[3]/div/dl/dd[2]/a'
link_nodes <- xml_find_all(page, xpath.url)

tu <- trimws(xml_attr(link_nodes, "href"))
# Turn the protocol-relative link into an absolute https URL. The pattern is
# a double quote immediately followed by "//detail.tmall.com/item.htm"; a
# stray space between the quote and "//" previously kept it from matching.
# The replacement host is restored to detail.tmall.com, the host named in the
# pattern itself (the previous literal contained obfuscation placeholder text).
tu <- gsub('\"//detail.tmall.com/item.htm', 'https://detail.tmall.com/item.htm', tu, fixed = TRUE)
tu <- gsub('&abbucket=12', '&abbucket=1', tu, fixed = TRUE)
# Drop the first character and the last two characters of each href
# (presumably the remains of the escaped quotes around the value -- confirm
# against a live response).
tu <- substr(tu, 2, nchar(tu) - 2)

# The item id is the text between the first "id" (skipping "id=") and "&rn".
a <- regexpr('id', tu)
b <- regexpr('&rn', tu)
id <- substr(tu, a + 3, b - 1)

ttl <- xml_text(link_nodes)

xpath.price <- '/html/body/div/div[3]/div/dl/dd[2]/div/div[1]/span[2]'
tp <- trimws(xml_text(xml_find_all(page, xpath.price)))

# One row per listed item. Columns are kept as character (not factor) so ids
# and titles can be pasted into URLs and bound to other data unchanged.
weblist <- data.frame(cbind(tu, id, ttl, tp), stringsAsFactors = FALSE)
colnames(weblist) <- c('url', 'id', 'title', 'price')
#write.csv(weblist, 'weblist.csv', row.names=T, fileEncoding = 'UTF-8')
## Crawl the review ("rate") pages of every item in `weblist` and stack the
## selected review fields into `final_ratelist` (NULL when nothing was read).

# Number of listed items. The variable keeps its original name `nrow`.
nrow <- nrow(weblist)

# Upper bound on the number of review pages requested per item.
max_pages <- 99

# Review fields kept from each page.
wanted_cols <- c('auctionSku', 'gmtCreateTime', 'goldUser', 'id', 'pics',
                 'position', 'rateContent', 'reply', 'rateDate', 'tradeEndTime')

# Download one review page and return the text of its <body> element, or ""
# when the parsed response has no such element.
fetch_rate_text <- function(page_url) {
  resp <- GET(page_url, config = set_cookies('JSESSIONID' = '8F0C390B5E686300BC04AA763900EB6D'))
  body_text <- xml_text(xml_find_all(content(resp), '/html/body'))
  if (length(body_text) == 0) "" else body_text[1]
}

# Pages are collected in a list and bound once at the end instead of growing
# a data frame with rbind() on every iteration.
rate_pages <- list()

# seq_len() keeps the loop from running when `weblist` has no rows.
for (i in seq_len(nrow)) {
  for (j in seq_len(max_pages)) {
    # NOTE(review): the host was obfuscated in the previous literal
    # ("r...l.com"); rate.tmall.com is the presumed original -- confirm.
    # The query separator before "currentPage" is restored to "&" (it had
    # been turned into the currency sign by HTML-entity mangling).
    suburl <- paste0('https://rate.tmall.com/list_detail_rate.htm?itemId=', weblist$id[i],
                     '&sellerId=686773455&order=3&currentPage=', j, '&append=0&content=1')
    json <- fetch_rate_text(suburl)

    # No 'rateList' in the response: wait one minute and retry once.
    if (regexpr('rateList', json)[1] == -1) {
      Sys.sleep(60)
      json <- fetch_rate_text(suburl)
    }

    # Only the rateList part is wanted: the text from 10 characters after the
    # start of 'rateList' up to 3 characters before 'searchinfo'.
    # (Assumes a payload shaped like ..."rateList":[...],"searchinfo"... -- confirm.)
    a <- regexpr('rateList', json)
    b <- regexpr('searchinfo', json)
    if (a[1] == -1 || b[1] == -1) {
      warning("No rateList found for item ", weblist$id[i], " page ", j,
              "; skipping the remaining pages of this item", call. = FALSE)
      break
    }
    json <- substr(json, a + 10, b - 3)
    Encoding(json) <- 'UTF-8'

    # Repair the text so it parses as JSON: collapse doubled quotes, then
    # re-create empty strings before '}' and ',', then strip "<U+XXXX>"
    # escape markers (the '+' is matched literally).
    json <- gsub('""', '"', json)
    json <- gsub(':"}', ':""}', json)
    json <- gsub(':",', ':"",', json)
    json <- gsub("<U\\+[0-9A-F]{4}>", '', json)
    ratelist <- fromJSON(json)

    # An empty review list means the item has no further pages.
    if (!is.data.frame(ratelist) || NROW(ratelist) == 0) {
      break
    }

    productname <- weblist$title[i]
    # drop = FALSE keeps a data frame even when a single column matches.
    ratelist <- ratelist[, colnames(ratelist) %in% wanted_cols, drop = FALSE]
    ratelist <- cbind(productname, ratelist)
    rate_pages[[length(rate_pages) + 1]] <- ratelist
  }
}

final_ratelist <- do.call(rbind, rate_pages)
IyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMKIyMgc2V0dGluZyBTaW1wbGlmaWVkIENoaW5lc2UKIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMKClN5cy5zZXRsb2NhbGUoIkxDX0FMTCIsIkNoaW5lc2UiKQoKIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMKIyMg54is5ZCE56+H5paH56ug55qE57ay5Z2ACiMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjCgojIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMKIyDlu7rnq4vlt6XkvZznqbrplpMKIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjCgpzZXR3ZCgiQzovVXNlcnMvZXJpY2Ffc3VuZy9EZXNrdG9wL1JDcmF3bGVyL1RtYWxsIikKCiMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIwojIOmWi+WVn+eIrOifsueahGxpYnJhcnkKIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjCgpsaWJyYXJ5KHhtbDIpICPmspLmnInovqbms5XoqK3lrppjb29raWUKbGlicmFyeShodHRyKSAj6ZyA6KaBY29va2llCiNsaWJyYXJ5KG1hZ3JpdHRyKSAj57Ch5YyWY29kZQpsaWJyYXJ5KGpzb25saXRlKSAj56ys5LqM6ZqO5q615pyD5oqT5ZueSlNPTgojbGlicmFyeShST0RCQykKCiMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIwojIOioreWumuS4u+e2suWdgCwgUuaJjeiDveaJvuWIsGh0bWzmjIfku6TlgZrniKzon7IKIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjCgp1cmwgPC0gJ2h0dHBzOi8vYS4uLmNvbnRlbnQtYXZhaWxhYmxlLXRvLWF1dGhvci1vbmx5Li4ubC5jb20vaS9hc3luU2VhcmNoLmh0bT9fa3NUUz0xNTI0MTA0Njc4MDc2XzQwNSZjYWxsYmFjaz1qc29ucDQwNiZtaWQ9dy0xNDQzNDcwNjAzMS0wJndpZD0xNDQzNDcwNjAzMSZwYXRoPS9zZWFyY2guaHRt
JnNlYXJjaD15JnNwbT1hMzEyYS43NzAwODI0Lnc1MDAxLTE0NDM0NTQyOTg1LjIuMWVlNTI0NDJYeGd0TUQmc2NlbmU9dGFvYmFvX3Nob3AnCgpkb2MgPC0gR0VUKHVybCwgY29uZmlnID0gc2V0X2Nvb2tpZXMoCiAgJ2huZycgPSAnVFclN0N6aC1UVyU3Q1RXRCU3QzE1OCcsIAogICdjbmEnID0gJ0Uxd3VFMms5WkRzQ0FTVGczbFgzTWYrdycsCiAgJ2VuYycgPSAnSUpxN052RmROZFFpb3lIbSUyRlc3ZUxqMW9UVSUyQnBMblYwaCUyRlplM2Y1TklyQ3BpVGk1TUF3RVVLT1prVzRFSUFTS2lkQVNtSkFmVXdoZlJhcVpIMFQlMkJrdyUzRCUzRCcsIAogICdfdWFiX2NvbGxpbmEnID0gJzE1MjM2MDA4NDkwNjE0OTY0OTQxMTI5NScsCiAgJ19tX2g1X3RrJyA9ICdmZDIzNWJlZWU3YzEwOTQyOTUzYzVjMzgyZTk4ODgwMl8xNTI0MDQ1MjQ4Njk3JywKICAnX21faDVfdGtfZW5jJyA9ICcwYWM1MDEwN2MxOTlkZWYzZDUwYmYwOTk2ZDVjNTY2YicsCiAgJ3QnID0gJ2EwMGQ5MDg1ZmI3MDYyOWEyNjBiMjUzYmY3NjllNWEwJywKICAnY29va2llMicgPSAnMWQxYjdiY2JiYjdiN2QyNTk2NDVkYWVlYmY5NDViOTUnLAogICdpc2cnID0gJ0JHZG5TOG5HOGFhMnczVWlaSnhBQ185QjlwdnhSRzFzcWU4ZWZ6blVnZllkS0lmcVFiekxIcVVPVGlDMnh4TkcnCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICApKQoKCiMj5oqT5omA5pyJ5paH56ug55qEdXJs6Lef5YW25LuW6LOH6KiKCgp4cGF0aC51cmwgPC0gJy9odG1sL2JvZHkvZGl2L2RpdlszXS9kaXYvZGwvZGRbMl0vYScKCnR1IDwtIHRyaW13cyh4bWxfYXR0cih4bWxfZmluZF9hbGwoY29udGVudChkb2MpLCB4cGF0aC51cmwpLCAiaHJlZiIpKQp0dSA8LSBnc3ViKCdcIi8vZGV0YWlsLnRtYWxsLmNvbS9pdGVtLmh0bScsJ2h0dHBzOi8vZC4uLmNvbnRlbnQtYXZhaWxhYmxlLXRvLWF1dGhvci1vbmx5Li4ubC5jb20vaXRlbS5odG0nLHR1KQp0dSA8LSBnc3ViKCcmYWJidWNrZXQ9MTInLCcmYWJidWNrZXQ9MScsdHUpCnR1IDwtIHN1YnN0cih0dSwyLG5jaGFyKHR1KS0yKQoKYSA8LSByZWdleHByKCdpZCcsdHUpCmIgPC0gcmVnZXhwcignJnJuJyx0dSkKaWQgPC0gc3Vic3RyKHR1LGErMyxiLTEpCgp0dGwgPC0geG1sX3RleHQoeG1sX2ZpbmRfYWxsKGNvbnRlbnQoZG9jKSwgeHBhdGgudXJsKSkKCgp4cGF0aC5wcmljZSA8LSAnL2h0bWwvYm9keS9kaXYvZGl2WzNdL2Rpdi9kbC9kZFsyXS9kaXYvZGl2WzFdL3NwYW5bMl0nCnRwIDwtIHRyaW13cyh4bWxfdGV4dCh4bWxfZmluZF9hbGwoY29udGVudChkb2MpLCB4cGF0aC5wcmljZSkpKQoKCndlYmxpc3QgPC0gY2JpbmQodHUsaWQsdHRsLHRwKQp3ZWJsaXN0IDwtIGRhdGEuZnJhbWUobWF0cml4KHdlYmxpc3QsIG5jb2w9NCwgYnlyb3c9RikpCmNvbG5hbWVzKHdlYmxpc3QpIDwtIGMoJ3VybCcsJ2lkJywndGl0bGUnLCdwcmljZScpCgojd3JpdGUuY3N2KHdlYmxpc3QsICd3ZWJsaXN0LmNzdicsIHJvdy5uYW1lcz1ULCBmaWxlRW5jb2Rp
bmcgPSAnVVRGLTgnKSAKCgojI+eIrOavj+S4gOWAi+aWh+eroAoKbnJvdyA8LSBucm93KHdlYmxpc3QpCgpmaW5hbF9yYXRlbGlzdCA8LSBjKCkKCmZvciAoIGkgaW4gMTpucm93KQp7CiAgZm9yIChqIGluIDE6OTkpCiAgewogIHN1YnVybCA8LSBwYXN0ZTAoJ2h0dHBzOi8vci4uLmNvbnRlbnQtYXZhaWxhYmxlLXRvLWF1dGhvci1vbmx5Li4ubC5jb20vbGlzdF9kZXRhaWxfcmF0ZS5odG0/aXRlbUlkPScsd2VibGlzdCRpZFtpXSwnJnNlbGxlcklkPTY4Njc3MzQ1NSZvcmRlcj0zJmN1cnJlbnRQYWdlPScsaiwnJmFwcGVuZD0wJmNvbnRlbnQ9MScpCiAgc3ViZG9jIDwtIEdFVChzdWJ1cmwsIGNvbmZpZyA9IHNldF9jb29raWVzKCdKU0VTU0lPTklEJyA9ICc4RjBDMzkwQjVFNjg2MzAwQkMwNEFBNzYzOTAwRUI2RCcpKQogIAogIGpzb24gPC0geG1sX3RleHQoeG1sX2ZpbmRfYWxsKGNvbnRlbnQoc3ViZG9jKSwgJy9odG1sL2JvZHknKSkKCiAgYSA8LSByZWdleHByKCdyYXRlTGlzdCcsanNvbikKICAKICBpZihhWzFdPT0tMSkgewogICAgU3lzLnNsZWVwKDYwKQogICAgc3ViZG9jIDwtIEdFVChzdWJ1cmwsIGNvbmZpZyA9IHNldF9jb29raWVzKCdKU0VTU0lPTklEJyA9CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnOEYwQzM5MEI1RTY4NjMwMEJDMDRBQTc2MzkwMEVCNkQnKSkKICAgIGpzb24gPC0geG1sX3RleHQoeG1sX2ZpbmRfYWxsKGNvbnRlbnQoc3ViZG9jKSwgJy9odG1sL2JvZHknKSkKICB9ICAgICAKICAKICAj5Y+q5oOz6KaBcmF0ZUxpc3TpgJnmrrUKICBhIDwtIHJlZ2V4cHIoJ3JhdGVMaXN0Jyxqc29uKQogIGIgPC0gcmVnZXhwcignc2VhcmNoaW5mbycsanNvbikKICBqc29uIDwtIHN1YnN0cihqc29uLGErMTAsYi0zKQogIEVuY29kaW5nKGpzb24pIDwtICdVVEYtOCcKICAKICBqc29uIDwtIGdzdWIoIjxVXFwrWzAtOUEtRl17NH0+IiwnJyxnc3ViKCc6IiwnLCc6IiIsJyxnc3ViKCc6In0nLCc6IiJ9Jyxnc3ViKCciIicsJyInLGpzb24pKSkpCiAgcmF0ZWxpc3QgPC0gZnJvbUpTT04oanNvbikKICAKICBwcm9kdWN0bmFtZSA8LSB3ZWJsaXN0JHRpdGxlW2ldCiAgcmF0ZWxpc3QgPC0gcmF0ZWxpc3RbLCB3aGljaChjb2xuYW1lcyhyYXRlbGlzdCklaW4lYygnYXVjdGlvblNrdScsJ2dtdENyZWF0ZVRpbWUnLCdnb2xkVXNlcicsJ2lkJywncGljcycsJ3Bvc2l0aW9uJywncmF0ZUNvbnRlbnQnLCdyZXBseScsJ3JhdGVEYXRlJywndHJhZGVFbmRUaW1lJykpXQogIHJhdGVsaXN0IDwtIGNiaW5kKHByb2R1Y3RuYW1lLCByYXRlbGlzdCkKICAKICBmaW5hbF9yYXRlbGlzdCA8LSByYmluZChmaW5hbF9yYXRlbGlzdCxyYXRlbGlzdCkKICAKICB9Cn0=