#!/root/.pyenv/versions/dev/bin/python
"""Scrape the Platomania vinyl offers listing into a pandas DataFrame."""
import re
from datetime import datetime

import pandas as pd

from .scrape import get_soup, scrape_page, scrape_page_links

# Columns kept from the scraped article dicts, in output order.
_ARTICLE_COLUMNS = [
    "artist",
    "title",
    "url",
    "label",
    "release_date",
    "origin",
    "item_number",
    "ean",
    "delivery_info",
    "price",
]

# Whitespace followed by a parenthesised remark, e.g. " (2)".
_PARENTHETICAL_RE = re.compile(r"\s+\([^)]*\)")


def _clean_artist(name):
    """Normalise an artist name: reorder "Last, First", lowercase, drop "(...)"."""
    reordered = " ".join(reversed(name.split(", ")))
    return _PARENTHETICAL_RE.sub("", reordered.lower())


def _parse_price(text):
    """Return the last space-separated token of *text* converted to float."""
    return float(text.split(" ")[-1])


def scrape_plato(get=None) -> pd.DataFrame:
    """Scrape all pages of the Platomania vinyl offers into one DataFrame.

    The first listing page is fetched and scraped, then every pagination
    link found on it is fetched and scraped in ascending page-number order.

    Args:
        get: Forwarded unchanged to ``get_soup`` as its ``get`` argument.
            NOTE(review): presumably an optional HTTP getter override;
            confirm against ``get_soup``.

    Returns:
        A DataFrame with the article columns (``artist``, ``title``, ``url``,
        ``label``, ``release_date``, ``origin``, ``item_number``, ``ean``,
        ``delivery_info``, ``price``) plus three derived columns:
        ``_artist`` (normalised artist name), ``_price`` (price as float)
        and ``_date`` (timestamp of this scrape). Missing ``artist`` or
        ``price`` values stay missing in the derived columns.

    Raises:
        ValueError: If a pagination link does not end in ``=<integer>`` or
            a price's last token is not parseable as a float.
    """
    # NOTE(review): `ic` is not imported in this file; presumably it is
    # installed into builtins elsewhere (icecream's `install()`) — confirm.
    ic()
    url = "https://www.platomania.nl/vinyl-aanbiedingen?page=1"
    ic(url)
    soup = get_soup(url=url, get=get)
    articles_info = scrape_page(soup)
    ic(len(articles_info))

    # De-duplicate the pagination links and visit them by page number,
    # taken from the value after the last "=" in each link.
    links = sorted(
        set(scrape_page_links(soup)),
        key=lambda x: int(x.split("=")[-1]),
    )
    for link in links:
        if link == url:
            # The start page was already scraped above; fetching it again
            # would append duplicate rows.
            continue
        ic(link)
        soup = get_soup(url=link, get=get)
        tmp = scrape_page(soup)
        ic(len(tmp))
        articles_info.extend(tmp)

    # reindex guarantees every expected column exists; fields that were not
    # scraped become NaN.
    articles_df = pd.DataFrame(articles_info).reindex(columns=_ARTICLE_COLUMNS)

    # na_action="ignore" propagates missing values instead of passing them to
    # the helpers, which would fail calling str methods on NaN/None.
    articles_df["_artist"] = articles_df["artist"].map(
        _clean_artist, na_action="ignore"
    )
    articles_df["_price"] = articles_df["price"].map(
        _parse_price, na_action="ignore"
    )
    articles_df["_date"] = datetime.now()
    return articles_df