scrape platenzaak

2025-08-22 10:22:16 +02:00
parent e0cda85d20
commit 1d9bd68612
4 changed files with 101 additions and 5 deletions

@@ -0,0 +1,90 @@
from collections.abc import Iterator
import pandas as pd
import requests
from bs4 import BeautifulSoup
from structlog.stdlib import BoundLogger


def parse_price(price_block):
    """
    Convert a price block like:
        <span class="amount theme-money">€ 30<sup>99</sup></span>
    into a float: 30.99
    """
    if not price_block:
        return None

    # Extract the main number (before <sup>) and drop the euro sign
    main = price_block.find(string=True, recursive=False)
    main = main.strip().replace("€", "").replace(",", ".").strip()

    # Extract the <sup> part (cents)
    sup = price_block.find("sup")
    cents = sup.get_text(strip=True) if sup else "00"

    try:
        return float(f"{main}.{cents}")
    except ValueError:
        return None
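
# Illustrative call, mirroring the docstring example (markup is assumed, not fetched):
#   span = BeautifulSoup('<span class="amount theme-money">€ 30<sup>99</sup></span>', "html.parser").span
#   parse_price(span)  # -> 30.99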


def parse_page(html) -> Iterator[dict]:
    """Yield one product dict per product block on a collection page."""
    soup = BeautifulSoup(html, "html.parser")
    for block in soup.select("div.product-block__inner"):
        # Wishlist button holds most metadata
        wishlist = block.select_one("[data-wlh-id]")
        if not wishlist:
            continue

        product = {
            "id": wishlist.get("data-wlh-id"),
            "variant_id": wishlist.get("data-wlh-variantid"),
            "name": wishlist.get("data-wlh-name"),
            "price": wishlist.get("data-wlh-price"),
            "url": wishlist.get("data-wlh-link"),
            "image": wishlist.get("data-wlh-image"),
        }

        # Artist + title (both sit in the title link)
        title_block = block.select_one(".product-block__title-price .title")
        if title_block:
            artist = title_block.find("span")
            if artist:
                product["artist"] = artist.get_text(strip=True)
            # The text after <br> is the album title
            product["album"] = (
                title_block.get_text(separator="|").split("|")[-1].strip()
            )

        # Current price (might include discounts)
        price_block = block.select_one(".price .amount")
        product["current_price"] = parse_price(price_block)

        # Original price if on sale
        old_price_block = block.select_one(".price del .theme-money")
        product["original_price"] = parse_price(old_price_block)

        # Sale label
        sale_label = block.select_one(".product-label--sale")
        product["on_sale"] = bool(sale_label)

        yield product


def scrape(logger: BoundLogger) -> pd.DataFrame:
    """Scrape every page of the Vinyl sale collection into a DataFrame."""
    page = 1
    products = []
    while True:
        response = requests.get(
            f"https://www.platenzaak.nl/collections/sale?filter.p.m.custom.config_group=Vinyl&page={page}"
        )
        response.raise_for_status()

        page_products = list(parse_page(response.text))
        logger.info("Scraped page", page=page, products=len(page_products))

        # An empty page means we have paginated past the last results page
        if not page_products:
            break

        products.extend(page_products)
        page += 1

    return pd.DataFrame(products)
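
A minimal usage sketch (not part of this commit): how scrape() could be driven from a small script. The module name, logger name, and output path are assumptions for illustration, and structlog is expected to be configured elsewhere in the project.

import structlog

from scrape_platenzaak import scrape  # hypothetical module name for the new file

logger = structlog.stdlib.get_logger("platenzaak")  # assumes stdlib-style structlog config
df = scrape(logger)
print(df.head())
df.to_csv("platenzaak_sale.csv", index=False)  # illustrative output path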