fork download
  # Q: I want to extract one field (the date) from a single result block of the
  #    page, but the XPath query returned that field from every block instead.
  # A: an XPath expression starting with "//" is evaluated from the document
  #    root even when a sub-node is supplied as the context. Prefix it with "."
  #    (".//...") to search only beneath the context node.

  # crawl library ----
  library(bitops)
  library(XML)
  library(RCurl)

  # only for windows -----
  # Path to the cacert.pem file inside RCurl's CurlSSL directory; it is handed
  # to getURL() below through `cainfo`.
  signatures <- system.file("CurlSSL", "cacert.pem", package = "RCurl")

  # NOTE(review): the host part of this URL looks redacted by the paste site
  # ("content-available-to-author-only") -- restore the real host before running.
  home <- "http://w...content-available-to-author-only...e.com/aussteller/messen/index.php?OK=1&sortierid=0&maxPerPage=20&i_cockpitkeyfindwo=2&i_cockpitkeyfindart=1&currPage=1"
  home <- getURL(home, cainfo = signatures)
  # asText = TRUE: `home` now holds the HTML itself, not a file name or URL.
  home <- htmlParse(home, asText = TRUE)

  # One node per result block (20 blocks on this page).
  block <- getNodeSet(home, "//div[@class='shm']")

  # length(block) # 20 blocks in total

  # Fail with a clear message instead of a subscript error when nothing matched.
  if (length(block) == 0L) {
    stop("no <div class='shm'> blocks found in the page", call. = FALSE)
  }

  doc <- block[[1]] # take the first block

  # doc
  # <div class="shm">
  # <div class="listdates">
  # <div class="date">08Jan-17Jan2016</div>
  # </div>
  # <div class="search_result_list_box">
  # <div class="city">London, United Kingdom</div>
  # <div class="firma"><a href="show.php?id=352&amp;timer=m1452657261&amp;tmid=&amp;currPage=1&amp;maxPerPage=20&amp;params=timer%3Dm1452657261%26amp%3Btimer%3Dm1452657261%26amp%3Bi_cockpitkeyfindwo%3D2%26amp%3Bi_cockpitkeyfindart%3D1%26amp%3Bsortierid%3D0%26amp%3Btimer%3Dm1452657261%26amp%3BmaxPerPage%3D20%26amp%3BshowPrintlist%3D0%26amp%3BmaxPerPage%3D20">London Boat Show</a></div>
  # </div>
  # <div class="search_result_box_right">
  # <div class="branchen"><strong>Business sectors:</strong> Boats</div>
  # </div>
  # <div class="fixfloat"></div>
  # </div>

  # Extract the date of block[[1]] only. The leading "." makes the path
  # relative to `doc`; without it the query runs against the whole document.
  date <- xpathSApply(doc, ".//div[@class='date']", xmlValue)

  # Expected result for the sample block shown above:
  # [1] "08Jan-17Jan2016"
  #
  # With the absolute path "//div[@class='date']" the same call returned every
  # date found on the page:
  # [1] "08Jan-17Jan2016" "08Jan-17Jan2016" "09Jan-17Jan2016" "07Jan-15Jan2017" "06Jan-14Jan2018"
  # [6] "09Jan-17Jan2016" "09Jan-17Jan2016" "09Jan-17Jan2016" "09Jan-17Jan2016" "10Jan-13Jan2016"
  # [11] "10Jan-13Jan2016" "10Jan-13Jan2016" "10Jan-13Jan2016" "11Jan-13Jan2016" "11Jan-13Jan2016"
  # [16] "11Jan-13Jan2016" "11Jan-14Jan2016" "11Jan-14Jan2016" "11Jan-14Jan2016" "11Jan-14Jan2016"
  # [21] "11Jan-14Jan2016" "11Jan-24Jan2016" "09Jan-22Jan2017"
Success #stdin #stdout #stderr 0.45s 79168KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Error in library(bitops) : there is no package called ‘bitops’
Execution halted