# Import libraries
import os
import time
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup
# Download every linked "zip"/"pdf" file from one web page into a local
# folder, skipping files that are already present in that folder.

# Folder the downloads are written to.
DOWNLOAD_DIR = '/home/ghost/11111'
# Index of the first <a> tag to consider; anchors before it are ignored.
# NOTE(review): 36 presumably skips the page's navigation links -- confirm
# against the target page.
FIRST_LINK_INDEX = 36
# A link is fetched only if its last path segment ends with one of these.
WANTED_SUFFIXES = ("zip", "pdf")

files = os.listdir(DOWNLOAD_DIR)
# Set copy of the listing for O(1) membership tests; it is extended as files
# are downloaded so a file linked twice on the page is fetched only once.
already_downloaded = set(files)

# Set the URL you want to webscrape from
# NOTE(review): the host part of this URL looks redacted
# ('...content-available-to-author-only...'); restore the real address
# before running.
url = 'https://w...content-available-to-author-only...e.com/machine-learning'
agent = {"User-Agent":'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

# Connect to the URL, sending the browser-like User-Agent defined above and
# giving up after 30 seconds instead of waiting indefinitely.
response = requests.get(url, headers=agent, timeout=30)

# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")

# Collect the anchors once instead of re-scanning the document per iteration.
anchors = soup.find_all('a')  # 'a' tags are for links
print('len of a tag:', len(anchors))

# urlretrieve() goes through the globally installed opener, so install one
# with a browser-like User-Agent a single time before the loop.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)

# To download the whole data set, loop through the <a> tags from
# FIRST_LINK_INDEX to the last one (inclusive).
for i, one_a_tag in enumerate(anchors[FIRST_LINK_INDEX:],
                              start=FIRST_LINK_INDEX):
    print(one_a_tag)
    link = one_a_tag.get('href')
    if not link:
        # Anchor without a usable href: nothing to download.
        print('out', i)
        continue
    path = link.split('/')[-1]
    if path in already_downloaded:
        print('in', i)
        continue
    if not path.endswith(WANTED_SUFFIXES):
        print('out', i)
        continue
    target = os.path.join(DOWNLOAD_DIR, path)
    # Download under a temporary name and rename on success, so an
    # interrupted transfer is never mistaken for a finished file later.
    partial = target + '.part'
    try:
        # urljoin() leaves absolute links untouched and resolves relative
        # ones against the page URL.
        urllib.request.urlretrieve(urllib.parse.urljoin(url, link), partial)
        os.replace(partial, target)
    except Exception as e:
        # Best effort: report the failure and move on to the next link.
        print(e)
        continue
    already_downloaded.add(path)
    time.sleep(1)  # pause for a second between downloads
IyBJbXBvcnQgbGlicmFyaWVzCmltcG9ydCByZXF1ZXN0cwppbXBvcnQgdXJsbGliLnJlcXVlc3QKaW1wb3J0IHRpbWUKZnJvbSBiczQgaW1wb3J0IEJlYXV0aWZ1bFNvdXAKaW1wb3J0IG9zCgpmaWxlcyA9IG9zLmxpc3RkaXIoJy9ob21lL2dob3N0LzExMTExJykKI3ByaW50KGZpbGVzKQoKIyBTZXQgdGhlIFVSTCB5b3Ugd2FudCB0byB3ZWJzY3JhcGUgZnJvbQojdXJsID0gJ2h0dHA6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi5hLmluZm8vZGV2ZWxvcGVycy90dXJuc3RpbGUuaHRtbCcKI3VybCA9ICdodHRwOi8vaS4uLmNvbnRlbnQtYXZhaWxhYmxlLXRvLWF1dGhvci1vbmx5Li4ucC5jb20nCnVybCA9ICdodHRwczovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLmUuY29tL21hY2hpbmUtbGVhcm5pbmcnCgphZ2VudCA9IHsiVXNlci1BZ2VudCI6J01vemlsbGEvNS4wIChXaW5kb3dzIE5UIDYuMzsgV09XNjQpIEFwcGxlV2ViS2l0LzUzNy4zNiAoS0hUTUwsIGxpa2UgR2Vja28pIENocm9tZS81OS4wLjMwNzEuMTE1IFNhZmFyaS81MzcuMzYnfQoKIyBDb25uZWN0IHRvIHRoZSBVUkwKcmVzcG9uc2UgPSByZXF1ZXN0cy5nZXQodXJsKQojcHJpbnQgKEJlYXV0aWZ1bFNvdXAocmVzcG9uc2UuY29udGVudCwgJ2x4bWwnKSkKCiMgUGFyc2UgSFRNTCBhbmQgc2F2ZSB0byBCZWF1dGlmdWxTb3VwIG9iamVjdMK2CnNvdXAgPSBCZWF1dGlmdWxTb3VwKHJlc3BvbnNlLnRleHQsICJodG1sLnBhcnNlciIpCgoKcHJpbnQoJ2xlbiBvZiBhIHRhZzonLGxlbihzb3VwLmZpbmRBbGwoJ2EnKSkrMSkKCiMgVG8gZG93bmxvYWQgdGhlIHdob2xlIGRhdGEgc2V0LCBsZXQncyBkbyBhIGZvciBsb29wIHRocm91Z2ggYWxsIGEgdGFncwpmb3IgaSBpbiByYW5nZSgzNixsZW4oc291cC5maW5kQWxsKCdhJykpKzEpOiAjJ2EnIHRhZ3MgYXJlIGZvciBsaW5rcwogICAgCiAgICBvcGVuZXI9dXJsbGliLnJlcXVlc3QuYnVpbGRfb3BlbmVyKCkKICAgIG9wZW5lci5hZGRoZWFkZXJzPVsoJ1VzZXItQWdlbnQnLCdNb3ppbGxhLzUuMCAoV2luZG93cyBOVCA2LjE7IFdPVzY0KSBBcHBsZVdlYktpdC81MzcuMzYgKEtIVE1MLCBsaWtlIEdlY2tvKSBDaHJvbWUvMzYuMC4xOTQxLjAgU2FmYXJpLzUzNy4zNicpXQogICAgdXJsbGliLnJlcXVlc3QuaW5zdGFsbF9vcGVuZXIob3BlbmVyKQogICAgb25lX2FfdGFnID0gc291cC5maW5kQWxsKCdhJylbaV0KICAgIHByaW50KG9uZV9hX3RhZykKICAgIGxpbmsgPSBvbmVfYV90YWdbJ2hyZWYnXQogICAgcGF0aCA9IGxpbmsuc3BsaXQoJy8nKVstMV0KICAgIGlmIHBhdGggaW4gZmlsZXM6CiAgICAgICAgcHJpbnQoJ2luJywgaSkKICAgICAgICBjb250aW51ZQogICAgaWYgbm90IChwYXRoLmVuZHN3aXRoKCJ6aXAiKSBvciBwYXRoLmVuZHN3aXRoKCJwZGYiKSk6CiAgICAgICAgcHJpbnQoJ291dCcsIGkpCiAgICAgICAgY29udGludWUKICAgIHRyeToKICAgICAgICB1cmxs
aWIucmVxdWVzdC51cmxyZXRyaWV2ZShsaW5rLCcvaG9tZS9naG9zdC8xMTExMS8nK3BhdGgpCiAgICBleGNlcHQgRXhjZXB0aW9uIGFzIGU6CiAgICAgICAgcHJpbnQoZSkKICAgICAgICBjb250aW51ZQp0aW1lLnNsZWVwKDEpICNwYXVzZSB0aGUgY29kZSBmb3IgYSBzZWM=