from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import urllib.request
import re
# Scrape the product pages listed in AmiAmi_mini.csv (one URL per line) and
# write one comma-separated row per page to Ami_full.csv.
#
# Fields are written unquoted, so commas inside a value are substituted and
# runs of whitespace (including newlines) are collapsed; that keeps every
# product on exactly one line with the same column count as the header.

NA = "NA"

# Patterns used to pick individual detail strings out of <p class="box_01">.
_SCALE_RE = re.compile('Scale')
_SIZE_RE = re.compile('Size')
_MATERIAL_RE = re.compile('Material')
_JPY_RE = re.compile('JPY')

# <dt> labels looked up inside <dl class="spec_data">, in output column order.
fields = ["JAN Code", "Release Date", "Brand", "Product Line",
          "Series Title", "Character Name", "Sculptor"]


def _csv_field(value, comma_sub="."):
    """Return *value* as one unquoted CSV field, or "NA" when it is empty.

    Whitespace runs are collapsed to single spaces and every comma is
    replaced by *comma_sub* so the value cannot split a row or a column.
    """
    if not value:
        return NA
    flat = " ".join(str(value).split())
    if not flat:
        return NA
    return flat.replace(",", comma_sub)


def _first_text(containers, pattern):
    """Return the first string matching *pattern* in any container, or ""."""
    for container in containers:
        hits = container.findAll(text=pattern)
        if hits:
            return hits[0]
    return ""


def _name_fields(page_soup):
    """Return [english_name, japanese_name] from the first h2.heading_10."""
    heading = page_soup.find("h2", {"class": "heading_10"})
    if heading is None:
        return [NA, NA]

    jap_span = heading.find('span', class_='')
    jap = jap_span.text if jap_span is not None else ""

    # The English name is whatever precedes the last <br/> of the heading;
    # re-parsing the truncated markup lets the parser close the <h2> again.
    markup = str(heading)
    head, sep, _tail = markup.rpartition('<br/>')
    if not sep:
        # No line break in the heading: fall back to the whole heading text.
        head = markup
    eng_h2 = soup(head, 'html.parser').find('h2')
    eng = eng_h2.text if eng_h2 is not None else ""

    return [_csv_field(eng, "|"), _csv_field(jap, "|")]


def _box_fields(page_soup):
    """Return [scale, size, type, materials] from the p.box_01 blocks."""
    boxes = page_soup.findAll("p", {"class": "box_01"})

    scale = _csv_field(_first_text(boxes, _SCALE_RE))
    size = _csv_field(_first_text(boxes, _SIZE_RE))
    material = _csv_field(_first_text(boxes, _MATERIAL_RE))

    # The type is taken from the node that immediately follows the first
    # box's opening tag; only a text node can be used as a value.
    lead = boxes[0].next_element if boxes else None
    kind = _csv_field(lead if isinstance(lead, str) else "")

    print("Scale: " + scale)
    print("Size: " + size)
    print("Type: " + kind)
    print("Materials: " + material)
    return [scale, size, kind, material]


def _price_fields(page_soup):
    """Return [sell_price, price, sale, status] from the price list.

    The price list is the <ul> enclosing the first li.selling_price on the
    page. Selecting it by content avoids emitting one group of columns per
    <ul> and avoids relying on the list's position in the document.
    """
    sell_price = price = sale = status = ""

    anchor = page_soup.find("li", {"class": "selling_price"})
    price_list = anchor.find_parent("ul") if anchor is not None else None

    if price_list is not None:
        selling = price_list.findAll("li", {"class": "selling_price"})
        if selling:
            sell_price = selling[0].text
        # The status is read from the second selling_price item, if present.
        if len(selling) > 1:
            status = selling[1].text

        price_item = price_list.find("li", {"class": "price"})
        if price_item is not None:
            jpy = price_item.findAll(text=_JPY_RE)
            if jpy:
                price = jpy[0]

        # The discount is looked up inside the first <li> of the list only.
        first_item = price_list.li
        if first_item is not None:
            off = first_item.findAll("span", {"class": "off_price"})
            if off:
                sale = off[0].text

    row = [_csv_field(sell_price), _csv_field(price),
           _csv_field(sale), _csv_field(status)]
    print("Sell price: " + row[0])
    print("Price: " + row[1])
    print("Sale: " + row[2])
    print("Status: " + row[3])
    return row


def _spec_fields(page_soup):
    """Return one value per entry of ``fields`` from the dl.spec_data lists."""
    spec_lists = page_soup.findAll("dl", {"class": "spec_data"})
    row = []
    for field in fields:
        value = ""
        for spec in spec_lists:
            label = spec.find('dt', text=field)
            if label is None:
                continue
            detail = label.find_next_sibling("dd")
            if detail is not None:
                value = detail.text
                break
        row.append(_csv_field(value))
    return row


# Read the URL list; blank lines are skipped and a missing trailing newline
# no longer causes the last URL to be dropped.
with open("AmiAmi_mini.csv") as url_file:
    lists = [line.strip() for line in url_file if line.strip()]

filename = "Ami_full.csv"
headers = "fig_name_eng, fig_name_jap, Scale, Size, Type, Materials, Sell_price, Price, Sale, Status, Code, date, brand, line_prod, series, char_name, sculpt\n"

with open(filename, "w", encoding="utf-8") as f:
    f.write(headers)

    for url in lists:
        with uReq(url) as uClient:
            page_html = uClient.read()
        page_soup = soup(page_html, "html.parser")

        row = (_name_fields(page_soup) + _box_fields(page_soup)
               + _price_fields(page_soup) + _spec_fields(page_soup))
        # One line per product, matching the 17 header columns.
        f.write(",".join(row) + "\n")
ZnJvbSBiczQgaW1wb3J0IEJlYXV0aWZ1bFNvdXAgYXMgc291cApmcm9tIHVybGxpYi5yZXF1ZXN0IGltcG9ydCB1cmxvcGVuIGFzIHVSZXEKaW1wb3J0IHVybGxpYi5yZXF1ZXN0CmltcG9ydCByZQoKbGlzdHMgPSBvcGVuKCJBbWlBbWlfbWluaS5jc3YiKS5yZWFkKCkuc3BsaXQoIlxuIilbOi0xXQoKZmlsZW5hbWUgPSAiQW1pX2Z1bGwuY3N2IgpmID0gb3BlbihmaWxlbmFtZSwgInciLCBlbmNvZGluZz0idXRmLTgiKQpoZWFkZXJzID0gImZpZ19uYW1lX2VuZywgZmlnX25hbWVfamFwLCBTY2FsZSwgU2l6ZSwgVHlwZSwgTWF0ZXJpYWxzLCBTZWxsX3ByaWNlLCBQcmljZSwgU2FsZSwgU3RhdHVzLCBDb2RlLCBkYXRlLCBicmFuZCwgbGluZV9wcm9kLCBzZXJpZXMsIGNoYXJfbmFtZSwgc2N1bHB0XG4iCmYud3JpdGUoaGVhZGVycykKZi5jbG9zZQoKZm9yIHVybCBpbiBsaXN0czoKCiAgICB1Q2xpZW50ID0gdVJlcSh1cmwpCiAgICBwYWdlX2h0bWwgPSB1Q2xpZW50LnJlYWQoKQogICAgdUNsaWVudC5jbG9zZSgpCiAgICBwYWdlX3NvdXAgPSBzb3VwKHBhZ2VfaHRtbCwgImh0bWwucGFyc2VyIikKCiAgICBwYWdlX3NvdXAgPSBzb3VwKHBhZ2VfaHRtbCwgImh0bWwucGFyc2VyIikKICAgIHBhZ2Vfc291cC5maW5kQWxsKCJkaXYiLCBzdHlsZT0id2lkdGg6NjAwcHg7IikKCiAgICBmID0gb3BlbihmaWxlbmFtZSwgImEiLCBlbmNvZGluZz0idXRmLTgiKQoKICAgIGluZm80ID0gcGFnZV9zb3VwLmZpbmRBbGwoImgyIiwgeyJjbGFzcyI6ICJoZWFkaW5nXzEwIn0pCiAgICBjb250YWluZXI0ID0gaW5mbzRbMF0KCiAgICBmb3IgY29udGFpbmVyNCBpbiBpbmZvNDoKICAgICAgICBqYXAgPSBjb250YWluZXI0LmZpbmQoJ3NwYW4nLCBjbGFzc189JycpLnRleHQKICAgICAgICBpZiBqYXA6CiAgICAgICAgICAgIEphcCA9IGphcC5zdHJpcCgpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgSmFwID0gIk5BIgogICAgICAgIHIgPSBzdHIoY29udGFpbmVyNCkKICAgICAgICBjb250YWluZXI1ID0gc291cChyWzpyLnJpbmRleCgnPGJyLz4nKV0sICdodG1sLnBhcnNlcicpCiAgICAgICAgaWYgY29udGFpbmVyNToKICAgICAgICAgICAgZW5nID0gY29udGFpbmVyNS5maW5kKCdoMicpLnRleHQuc3RyaXAoKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIGVuZyA9ICJOQSIKCiAgICAgICAgZi53cml0ZShlbmcucmVwbGFjZSgiLCIsICJ8IikgKyAiLCIgKyBqYXAucmVwbGFjZSgiLCIsICJ8IikgKyAiLCIpCgogICAgaW5mbzEgPSBwYWdlX3NvdXAuZmluZEFsbCgicCIsIHsiY2xhc3MiOiAiYm94XzAxIn0pCiAgICBjb250YWluZXIxID0gaW5mbzFbMF0KCiAgICBmb3IgY29udGFpbmVyMSBpbiBpbmZvMToKICAgICAgICBTY2FsZTEgPSBjb250YWluZXIxLmZpbmRBbGwodGV4dD1yZS5jb21waWxlKCdTY2FsZScpKQogICAgICAgIGlmIFNjYWxlMToKICAgICAgICAgICAgU2NhbGUgPSBTY2FsZTFbMF0uc3RyaXAoJyBcdFxuXHInKQogICAgICAgIGVs
c2U6CiAgICAgICAgICAgIFNjYWxlID0gIk5BIgoKICAgICAgICBTaXplMSA9IGNvbnRhaW5lcjEuZmluZEFsbCh0ZXh0PXJlLmNvbXBpbGUoJ1NpemUnKSkKICAgICAgICBpZiBTaXplMToKICAgICAgICAgICAgU2l6ZSA9IFNpemUxWzBdLnN0cmlwKCcgXHRcblxyJykKICAgICAgICBlbHNlOgogICAgICAgICAgICBTaXplID0gIk5BIgoKICAgICAgICBUeXBlMSA9IGluZm8xWzBdLm5leHRfZWxlbWVudC5zdHJpcCgpCiAgICAgICAgaWYgVHlwZTE6CiAgICAgICAgICAgIFR5cGUgPSBUeXBlMQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIFR5cGUgPSAiTkEiCgogICAgICAgIE1hdGVyaWFsMSA9IGNvbnRhaW5lcjEuZmluZEFsbCh0ZXh0PXJlLmNvbXBpbGUoJ01hdGVyaWFsJykpCiAgICAgICAgaWYgTWF0ZXJpYWwxOgogICAgICAgICAgICBNYXRlcmlhbCA9IE1hdGVyaWFsMVswXS5zdHJpcCgnIFx0XG5ccicpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgTWF0ZXJpYWwgPSAiTkEiCgogICAgICAgIHByaW50KCJTY2FsZTogIitTY2FsZSkKICAgICAgICBwcmludCgiU2l6ZTogIitTaXplKQogICAgICAgIHByaW50KCJUeXBlOiAiK1R5cGUpCiAgICAgICAgcHJpbnQoIk1hdGVyaWFsczogIitNYXRlcmlhbCkKICAgICAgICBmLndyaXRlKFNjYWxlLnJlcGxhY2UoIiwiLCAiLiIpICsgIiwiICsgU2l6ZS5yZXBsYWNlKCIsIiwgIi4iKSArICIsIiArIFR5cGUgKyAiLCIgKyBNYXRlcmlhbCArICIsIikKCiAgICBpbmZvMiA9IHBhZ2Vfc291cC5maW5kQWxsKCJ1bCIpCiAgICBjb250YWluZXIyID0gaW5mbzJbNF0KCiAgICBmb3IgY29udGFpbmVyMiBpbiBpbmZvMjoKICAgICAgICBTZWxsX3ByaWNlMSA9IGNvbnRhaW5lcjIuZmluZEFsbCgibGkiLCB7ImNsYXNzIjogInNlbGxpbmdfcHJpY2UifSkKICAgICAgICBpZiBTZWxsX3ByaWNlMToKICAgICAgICAgICAgU2VsbF9wcmljZSA9IFNlbGxfcHJpY2UxWzBdLnRleHQuc3RyaXAoKQogICAgICAgIGVsc2U6CiAgICAgICAgICAgIFNlbGxfcHJpY2UgPSAiTkEiCgogICAgICAgIFByaWNlMSA9IGNvbnRhaW5lcjIuZmluZEFsbCgibGkiLCB7ImNsYXNzIjogInByaWNlIn0pCiAgICAgICAgaWYgUHJpY2UxOgogICAgICAgICAgICBQcmljZSA9IFByaWNlMVswXS5maW5kQWxsKHRleHQ9cmUuY29tcGlsZSgnSlBZJykpWzBdLnN0cmlwKCkKICAgICAgICBlbHNlOgogICAgICAgICAgICBQcmljZSA9ICJOQSIKCiAgICAgICAgc2FsZTEgPSBjb250YWluZXIyLmxpKCJzcGFuIiwgeyJjbGFzcyI6ICJvZmZfcHJpY2UifSkKICAgICAgICBpZiBzYWxlMToKICAgICAgICAgICAgU2FsZSA9IHNhbGUxWzBdLnRleHQKICAgICAgICBlbHNlOgogICAgICAgICAgICBTYWxlID0gIk5BIgoKICAgICAgICBTdGF0dXMgPSBjb250YWluZXIyLmZpbmRBbGwoImxpIiwgeyJjbGFzcyI6ICJzZWxsaW5nX3ByaWNlIn0pCiAgICAgICAgaWYgU3RhdHVzOgogICAgICAgICAgICBTdGF0dXMgPSBTdGF0dXNbMV0udGV4
dC5zdHJpcCgpCiAgICAgICAgZWxzZToKICAgICAgICAgICAgU3RhdHVzID0gIk5BIgoKICAgICAgICBwcmludCgiU2VsbCBwcmljZTogIitTZWxsX3ByaWNlKQogICAgICAgIHByaW50KCJQcmljZTogIisgUHJpY2UpCiAgICAgICAgcHJpbnQoIlNhbGU6ICIrU2FsZSkKICAgICAgICBwcmludCgiU3RhdHVzOiAiK1N0YXR1cykKICAgICAgICBmLndyaXRlKFNlbGxfcHJpY2UucmVwbGFjZSgiLCIsICIuIikgKyAiLCIgKyBQcmljZS5yZXBsYWNlKCIsIiwgIi4iKSArICIsIiArIFNhbGUucmVwbGFjZSgiLCIsICIuIikgKyAiLCIgKyBTdGF0dXMucmVwbGFjZSgiLCIsICIuIikgKyAiLCIpCgogICAgaW5mbzMgPSBwYWdlX3NvdXAuZmluZEFsbCgiZGwiLCB7ImNsYXNzIjogInNwZWNfZGF0YSJ9KQogICAgY29udGFpbmVyMyA9IGluZm8zWzBdCgoJZmllbGRzID0gWyJKQU4gQ29kZSIsICJSZWxlYXNlIERhdGUiLCAiQnJhbmQiLCAiUHJvZHVjdCBMaW5lIiwKCQkJCSJTZXJpZXMgVGl0bGUiLCAiQ2hhcmFjdGVyIE5hbWUiLCAiU2N1bHB0b3IiXQoKICAgIGZvciBjb250YWluZXIzIGluIGluZm8zOgogICAgCWRhdGEgPSBbXQogICAgCWZvciBmaWVsZCBpbiBmaWxlZHM6CiAgICAJCXRhZyA9IGNvbnRhaW5lcjMuZmluZCgnZHQnLCB0ZXh0PWZpZWxkKQogICAgCQlpZiB0YWc6CiAgICAJCQlmaWVsZF90ZXh0ID0gdGFnLmZpbmRfbmV4dF9zaWJsaW5nKCJkZCIpLnRleHQuc3RyaXAoKS5yZXBsYWNlKCIsIiwiLiIpCiAgICAJCWVsc2U6CiAgICAJCQlmaWVsZF90ZXh0ID0gIk5BIgogICAgCQlkYXRhLmFwcGVuZChmaWVsZF90ZXh0KQoKICAgIAlmLndyaXRlKCIsIi5qb2luKGRhdGEpKQoKICAgIGYuY2xvc2UoKQ==