fork download
  1. # Import libraries
  2. import requests
  3. import urllib.request
  4. import time
  5. from bs4 import BeautifulSoup
  6. import os
  7.  
  8. files = os.listdir('/home/ghost/11111')
  9. #print(files)
  10.  
  11. # Set the URL you want to webscrape from
  12. #url = 'http://w...content-available-to-author-only...a.info/developers/turnstile.html'
  13. #url = 'http://i...content-available-to-author-only...p.com'
  14. url = 'https://w...content-available-to-author-only...e.com/machine-learning'
  15.  
  16. agent = {"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
  17.  
  18. # Connect to the URL
  19. response = requests.get(url)
  20. #print (BeautifulSoup(response.content, 'lxml'))
  21.  
  22. # Parse HTML and save to BeautifulSoup object¶
  23. soup = BeautifulSoup(response.text, "html.parser")
  24.  
  25.  
  26. print('len of a tag:',len(soup.findAll('a'))+1)
  27.  
  28. # To download the whole data set, let's do a for loop through all a tags
  29. for i in range(36,len(soup.findAll('a'))+1): #'a' tags are for links
  30.  
  31. opener=urllib.request.build_opener()
  32. opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
  33. urllib.request.install_opener(opener)
  34. one_a_tag = soup.findAll('a')[i]
  35. print(one_a_tag)
  36. link = one_a_tag['href']
  37. path = link.split('/')[-1]
  38. if path in files:
  39. print('in', i)
  40. continue
  41. if not (path.endswith("zip") or path.endswith("pdf")):
  42. print('out', i)
  43. continue
  44. try:
  45. urllib.request.urlretrieve(link,'/home/ghost/11111/'+path)
  46. except Exception as e:
  47. print(e)
  48. continue
  49. time.sleep(1) #pause the code for a sec
Not running #stdin #stdout 0s 0KB
stdin
Standard input is empty
stdout
Standard output is empty