parse platenzaak deals
This commit is contained in:
@@ -12,6 +12,7 @@ from dagster_polars.patito import patito_model_to_dagster_type
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from models import Deal
|
||||
from partitions import daily_partitions_def, multi_partitions_def
|
||||
from platenzaak.parse import parse as parse_platenzaak
|
||||
from platenzaak.scrape import scrape as scrape_platenzaak
|
||||
from plato.parse import parse as parse_plato
|
||||
from plato.scrape import scrape as scrape_plato
|
||||
@@ -65,26 +66,24 @@ def deals(context: dg.AssetExecutionContext) -> pl.DataFrame:
|
||||
logger.error("Failed to load CSV file!", error=e)
|
||||
raise dg.Failure(f"Cannot materialize for the past: {date.date()}")
|
||||
|
||||
if source == "plato":
|
||||
logger.info("Scraping Plato")
|
||||
df = scrape_plato()
|
||||
logger.info("Scraped Plato", rows=len(df), head=df.head().to_markdown())
|
||||
ic(df.columns)
|
||||
return pl.from_pandas(df.assign(**partition_key))
|
||||
if source == "sounds":
|
||||
logger.info("Scraping Sounds")
|
||||
df = scrape_sounds()
|
||||
logger.info("Scraped Sounds", rows=len(df), head=df.head().to_markdown())
|
||||
ic(df.columns)
|
||||
return pl.from_pandas(df.assign(**partition_key))
|
||||
if source == "platenzaak":
|
||||
logger.info("Scraping Platenzaak")
|
||||
df = scrape_platenzaak(logger=logger)
|
||||
logger.info("Scraped Sounds", rows=len(df), head=df.head().to_markdown())
|
||||
ic(df.columns)
|
||||
return pl.from_pandas(df.assign(**partition_key))
|
||||
match source:
|
||||
case "plato":
|
||||
logger.info("Scraping Plato")
|
||||
df = scrape_plato()
|
||||
logger.info("Scraped Plato", rows=len(df), head=df.head().to_markdown())
|
||||
case "sounds":
|
||||
logger.info("Scraping Sounds")
|
||||
df = scrape_sounds()
|
||||
logger.info("Scraped Sounds", rows=len(df), head=df.head().to_markdown())
|
||||
case "platenzaak":
|
||||
logger.info("Scraping Platenzaak")
|
||||
df = scrape_platenzaak(logger=logger)
|
||||
logger.info("Scraped Sounds", rows=len(df), head=df.head().to_markdown())
|
||||
case _:
|
||||
raise ValueError(f"Unknown source: {source}!")
|
||||
|
||||
raise NotImplementedError(f"No implementation for source {source}")
|
||||
ic(df.columns)
|
||||
return pl.from_pandas(df.assign(**partition_key))
|
||||
|
||||
|
||||
@asset(
|
||||
@@ -111,9 +110,10 @@ def cleaned_deals(
|
||||
parsed_df = parse_plato(df)
|
||||
case "sounds":
|
||||
parsed_df = parse_sounds(df)
|
||||
case "platenzaak":
|
||||
parsed_df = parse_platenzaak(df)
|
||||
case _:
|
||||
context.log.warning(f"Unknown source: {source}!")
|
||||
return
|
||||
raise ValueError(f"Unknown source: {source}!")
|
||||
|
||||
ic(parsed_df.collect_schema())
|
||||
|
||||
|
||||
13
apps/vinyl/src/platenzaak/parse.py
Normal file
13
apps/vinyl/src/platenzaak/parse.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import polars as pl
|
||||
|
||||
|
||||
def parse(df: pl.LazyFrame) -> pl.LazyFrame:
    """Normalise a raw Platenzaak frame into the shared deal columns.

    The following columns are added to (or overwritten on) ``df``:

    * ``date``    - the ``date`` column cast to ``pl.Date``.
    * ``artist``  - ``artist`` with surrounding characters stripped, lowercased.
    * ``title``   - ``album`` with surrounding characters stripped, lowercased.
    * ``release`` - a null literal; no release value is derived here.
    * ``price``   - ``current_price`` cast to ``pl.Float64``.
    * ``url``     - ``id`` appended to the ``https://platenzaak.nl`` prefix.

    Args:
        df: Lazy frame holding the scraped Platenzaak rows.

    Returns:
        The lazy frame extended with the columns listed above.
    """

    def _stripped_lower(column: str) -> pl.Expr:
        # Same text normalisation is applied to both artist and album.
        return pl.col(column).str.strip_chars().str.to_lowercase()

    # NOTE(review): the URL template has no separator after the host, so
    # `id` presumably already starts with "/" - confirm against the scraper.
    deal_columns = [
        pl.col("date").cast(pl.Date).alias("date"),
        _stripped_lower("artist").alias("artist"),
        _stripped_lower("album").alias("title"),
        pl.lit(None).alias("release"),
        pl.col("current_price").cast(pl.Float64).alias("price"),
        pl.format("https://platenzaak.nl{}", pl.col("id")).alias("url"),
    ]
    return df.with_columns(deal_columns)
|
||||
@@ -3,7 +3,7 @@ from utils.parse import parse_date
|
||||
|
||||
|
||||
def parse(df: pl.LazyFrame) -> pl.LazyFrame:
|
||||
"""Parse the Plato DataFrame."""
|
||||
"""Parse the Sounds DataFrame."""
|
||||
return df.with_columns(
|
||||
date=pl.col("date").cast(pl.Date),
|
||||
artist=pl.coalesce(pl.col("artist"), pl.col("name").str.split("-").list.get(1))
|
||||
|
||||
Reference in New Issue
Block a user