scrape platenzaak
This commit is contained in:
@@ -12,17 +12,19 @@ from dagster_polars.patito import patito_model_to_dagster_type
|
|||||||
from jinja2 import Environment, FileSystemLoader
|
from jinja2 import Environment, FileSystemLoader
|
||||||
from models import Deal
|
from models import Deal
|
||||||
from partitions import daily_partitions_def, multi_partitions_def
|
from partitions import daily_partitions_def, multi_partitions_def
|
||||||
|
from platenzaak.scrape import scrape as scrape_platenzaak
|
||||||
from plato.parse import parse as parse_plato
|
from plato.parse import parse as parse_plato
|
||||||
from plato.scrape import scrape as scrape_plato
|
from plato.scrape import scrape as scrape_plato
|
||||||
from shared.utils import get_partition_keys, parse_partition_keys
|
from shared.utils import get_partition_keys, parse_partition_keys
|
||||||
from sounds.parse import parse as parse_sounds
|
from sounds.parse import parse as parse_sounds
|
||||||
from sounds.scrape import scrape as scrape_sounds
|
from sounds.scrape import scrape as scrape_sounds
|
||||||
|
from structlog.stdlib import BoundLogger
|
||||||
from utils.email import EmailService
|
from utils.email import EmailService
|
||||||
|
|
||||||
import dagster as dg
|
import dagster as dg
|
||||||
|
|
||||||
asset = partial(dg.asset, key_prefix=APP)
|
asset = partial(dg.asset, key_prefix=APP)
|
||||||
logger = structlog.get_logger()
|
logger: BoundLogger = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
@asset(
|
@asset(
|
||||||
@@ -75,10 +77,14 @@ def deals(context: dg.AssetExecutionContext) -> pl.DataFrame:
|
|||||||
logger.info("Scraped Sounds", rows=len(df), head=df.head().to_markdown())
|
logger.info("Scraped Sounds", rows=len(df), head=df.head().to_markdown())
|
||||||
ic(df.columns)
|
ic(df.columns)
|
||||||
return pl.from_pandas(df.assign(**partition_key))
|
return pl.from_pandas(df.assign(**partition_key))
|
||||||
|
if source == "platenzaak":
|
||||||
|
logger.info("Scraping Platenzaak")
|
||||||
|
df = scrape_platenzaak(logger=logger)
|
||||||
|
logger.info("Scraped Sounds", rows=len(df), head=df.head().to_markdown())
|
||||||
|
ic(df.columns)
|
||||||
|
return pl.from_pandas(df.assign(**partition_key))
|
||||||
|
|
||||||
return pl.DataFrame(
|
raise NotImplementedError(f"No implementation for source {source}")
|
||||||
[{"date": context.partition_key, "data": f"Data for {context.partition_key}"}]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@asset(
|
@asset(
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import os
|
|||||||
|
|
||||||
import dagster as dg
|
import dagster as dg
|
||||||
|
|
||||||
SOURCES = ["plato", "sounds"]
|
SOURCES = ["plato", "sounds", "platenzaak"]
|
||||||
daily_partitions_def = dg.DailyPartitionsDefinition(
|
daily_partitions_def = dg.DailyPartitionsDefinition(
|
||||||
start_date="2024-09-01", end_offset=1, timezone=os.environ.get("TZ", "UTC")
|
start_date="2024-09-01", end_offset=1, timezone=os.environ.get("TZ", "UTC")
|
||||||
)
|
)
|
||||||
|
|||||||
0
apps/vinyl/src/platenzaak/__init__.py
Normal file
0
apps/vinyl/src/platenzaak/__init__.py
Normal file
90
apps/vinyl/src/platenzaak/scrape.py
Normal file
90
apps/vinyl/src/platenzaak/scrape.py
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
from collections.abc import Iterator
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from structlog.stdlib import BoundLogger
|
||||||
|
|
||||||
|
|
||||||
|
def parse_price(price_block):
    """Convert a price element into a float.

    Handles markup such as::

        <span class="amount theme-money">€ 30<sup>99</sup></span>

    which yields ``30.99``: the element's direct text is the whole-euro part
    and the optional ``<sup>`` child carries the cents.

    Args:
        price_block: Element exposing ``find`` in the BeautifulSoup style,
            or ``None`` / a falsy value when the price is absent.

    Returns:
        The price as a float, or ``None`` when ``price_block`` is falsy, has
        no direct text, or its text cannot be parsed as a number.
    """
    if not price_block:
        return None

    # Direct text child only (recursive=False), so the <sup> cents are not
    # picked up here. find() returns None when there is no direct text.
    main = price_block.find(string=True, recursive=False)
    if main is None:
        return None
    whole = main.strip().replace("€", "").replace(",", ".").strip()

    # Cents live in the optional <sup> child.
    sup = price_block.find("sup")
    if sup:
        text = f"{whole}.{sup.get_text(strip=True)}"
    elif "." in whole:
        # The text already carries its decimals (e.g. "12,50"); appending
        # ".00" would produce an unparsable "12.50.00".
        text = whole
    else:
        text = f"{whole}.00"

    try:
        return float(text)
    except ValueError:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_page(html) -> Iterator[dict]:
    """Yield one product dict per product block found in a listing page.

    Each ``div.product-block__inner`` that contains an element with a
    ``data-wlh-id`` attribute produces a dict with the keys ``id``,
    ``variant_id``, ``name``, ``price``, ``url``, ``image``, ``artist``,
    ``album``, ``current_price``, ``original_price`` and ``on_sale``.
    Blocks without such an element are skipped.

    Args:
        html: HTML source of the page.

    Yields:
        A dict per product. ``artist`` and ``album`` are always present and
        are ``None`` when the title markup is missing, so every row has the
        same keys.
    """
    soup = BeautifulSoup(html, "html.parser")

    for block in soup.select("div.product-block__inner"):
        # The element carrying the data-wlh-* attributes (the wishlist
        # button, per the original author) holds most of the metadata.
        wishlist = block.select_one("[data-wlh-id]")
        if not wishlist:
            continue

        product = {
            "id": wishlist.get("data-wlh-id"),
            "variant_id": wishlist.get("data-wlh-variantid"),
            "name": wishlist.get("data-wlh-name"),
            "price": wishlist.get("data-wlh-price"),
            "url": wishlist.get("data-wlh-link"),
            "image": wishlist.get("data-wlh-image"),
            # Pre-populated so rows keep a uniform set of keys.
            "artist": None,
            "album": None,
        }

        # Artist + title live in the title link.
        title_block = block.select_one(".product-block__title-price .title")
        if title_block:
            artist = title_block.find("span")
            if artist:
                product["artist"] = artist.get_text(strip=True)
            # The last non-empty text node is taken as the album title.
            # Reading the text nodes directly avoids splitting on a
            # separator character that could itself occur in a title.
            strings = list(title_block.stripped_strings)
            if strings:
                product["album"] = strings[-1]

        # Current price (might include discounts).
        product["current_price"] = parse_price(
            block.select_one(".price .amount")
        )

        # Original price, present when the item is on sale.
        product["original_price"] = parse_price(
            block.select_one(".price del .theme-money")
        )

        # A sale label element marks the product as discounted.
        product["on_sale"] = bool(block.select_one(".product-label--sale"))

        yield product
|
||||||
|
|
||||||
|
|
||||||
|
def scrape(logger: BoundLogger, timeout: float = 30.0) -> pd.DataFrame:
    """Scrape every page of the Platenzaak vinyl sale collection.

    Pages are requested one after another, starting at page 1, until a page
    yields no products.

    Args:
        logger: Logger used to report the number of products per page.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A DataFrame with one row per product produced by ``parse_page``.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    url = "https://www.platenzaak.nl/collections/sale"
    products = []
    page = 1

    # One session for the whole crawl so the connection is reused.
    with requests.Session() as session:
        while True:
            response = session.get(
                url,
                params={
                    "filter.p.m.custom.config_group": "Vinyl",
                    "page": page,
                },
                timeout=timeout,
            )
            response.raise_for_status()

            page_products = list(parse_page(response.text))
            logger.info("Scraped page", page=page, products=len(page_products))

            # An empty page marks the end of the collection.
            if not page_products:
                break

            products.extend(page_products)
            page += 1

    return pd.DataFrame(products)
|
||||||
Reference in New Issue
Block a user