53 lines
1.3 KiB
Python
Executable File
53 lines
1.3 KiB
Python
Executable File
#!/root/.pyenv/versions/dev/bin/python
|
|
|
|
import re
|
|
from datetime import datetime
|
|
|
|
import pandas as pd
|
|
|
|
from .scrape import get_soup, scrape_page, scrape_page_links
|
|
|
|
|
|
def scrape_plato(get=None):
    """Scrape the Plato vinyl-offers listing into a pandas DataFrame.

    The first listing page is fetched and scraped, then every link returned
    by ``scrape_page_links`` for that page is fetched and scraped in
    ascending order of the integer after the last ``=`` in the link.

    Args:
        get: Forwarded unchanged to ``get_soup`` as its ``get`` argument.

    Returns:
        A DataFrame with the columns ``artist``, ``title``, ``url``,
        ``label``, ``release_date``, ``origin``, ``item_number``, ``ean``,
        ``delivery_info`` and ``price`` (missing ones are filled with NaN),
        plus three derived columns:

        * ``_artist``: normalised artist name (see ``clean`` below),
        * ``_price``: the last space-separated token of ``price`` as float,
        * ``_date``: the naive local timestamp of this scrape.

        Missing ``artist`` / ``price`` values stay NaN in the derived
        columns instead of aborting the whole scrape.
    """
    # NOTE(review): `ic` is neither defined nor imported in the visible part
    # of this file; presumably icecream's `ic` is installed globally
    # elsewhere -- confirm.
    ic()
    url = "https://www.platomania.nl/vinyl-aanbiedingen?page=1"

    ic(url)
    soup = get_soup(url=url, get=get)
    articles_info = scrape_page(soup)
    ic(len(articles_info))

    def page_number(link):
        # The sort key assumes every link ends in "=<integer>".
        return int(link.split("=")[-1])

    # De-duplicate the links found on the first page and visit them in
    # ascending page order.
    links = sorted(set(scrape_page_links(soup)), key=page_number)
    for link in links:
        ic(link)
        soup = get_soup(url=link, get=get)
        tmp = scrape_page(soup)
        ic(len(tmp))
        articles_info.extend(tmp)

    def clean(name):
        """Normalise a name: "Beatles, The (UK)" -> "the beatles"."""
        # Reverse the ", "-separated parts so "Last, First" reads
        # "First Last".
        tmp = " ".join(reversed(name.split(", ")))
        tmp = tmp.lower()
        # Drop parenthesised suffixes together with their leading whitespace.
        tmp = re.sub(r"\s+\([^)]*\)", "", tmp)
        return tmp

    def parse_price(text):
        # Keep only the last space-separated token and convert it to float.
        # presumably the text looks like "<currency> <amount>" -- verify
        # against scrape_page.
        return float(text.split(" ")[-1])

    # reindex guarantees a fixed column set; columns absent from the scraped
    # records are created and filled with NaN.
    articles_df = pd.DataFrame(articles_info).reindex(
        columns=[
            "artist",
            "title",
            "url",
            "label",
            "release_date",
            "origin",
            "item_number",
            "ean",
            "delivery_info",
            "price",
        ]
    )
    # na_action="ignore" propagates NaN/None instead of passing it to the
    # string helpers, which would raise AttributeError on a missing value.
    articles_df["_artist"] = articles_df["artist"].map(clean, na_action="ignore")
    articles_df["_price"] = articles_df["price"].map(parse_price, na_action="ignore")
    articles_df["_date"] = datetime.now()

    return articles_df
|