from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup


def get_urls(url):
    """Collect the topic links on one listing page and the URL of the next page."""
    topic_urls = []
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # news feed: every topic link on the current page
    links = soup.find_all('a', class_='newsFeed_item_link')
    for link in links:
        topic_urls.append(urljoin(url, link['href']))
    # next page: the pagination item pointing to the following listing page
    next_page = soup.find_all('li', class_='pagination_item pagination_item-next')
    if not next_page:
        next_url = None
    else:
        next_url = urljoin(url, next_page[0].a['href'])
    return {'topic': topic_urls, 'next': next_url}
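
# Note on urljoin: the scraped href values may be relative, so they are resolved
# against the page URL. Hypothetical illustration (example.com is a placeholder,
# not the site scraped above):
#
#     urljoin('https://news.example.com/topics/top-picks', '/pickup/6512345')
#     # -> 'https://news.example.com/pickup/6512345'
#     urljoin('https://news.example.com/topics/top-picks', '?page=2')
#     # -> 'https://news.example.com/topics/top-picks?page=2'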

def main():
    next_url = 'https://n...content-available-to-author-only...o.jp/topics/top-picks'
    topic_urls = []
    # follow the "next" link until the last listing page is reached
    while next_url is not None:
        print(next_url)
        urls = get_urls(next_url)
        topic_urls.extend(urls['topic'])
        next_url = urls['next']
    print(topic_urls)


if __name__ == '__main__':
    main()
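
# Optional hardening sketch (an assumption, not part of the original script):
# some servers reject requests that lack a User-Agent header or stall without a
# timeout. urlopen() also accepts a Request object and a timeout, so the fetch
# inside get_urls() could be written as:
#
#     from urllib.request import Request
#
#     req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
#     html = urlopen(req, timeout=10).read()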