rewrite parsing of deals
@@ -1,11 +1,10 @@
from datetime import datetime
from glob import glob

import duckdb
import polars as pl
import structlog
from duckdb.typing import DATE, VARCHAR
from plato.fetch import scrape_plato
from shared.utils import get_partition_keys, parse_partition_keys
from sounds.fetch import fetch_deals
from utils import parse_date
@@ -15,17 +14,23 @@ SOURCES = ["plato", "sounds"]
logger = structlog.get_logger()

partitions_def = dg.MultiPartitionsDefinition(
daily_partitions_def = dg.DailyPartitionsDefinition(
    start_date="2024-09-01", end_offset=1
)
multi_partitions_def = dg.MultiPartitionsDefinition(
    {
        "date": dg.DailyPartitionsDefinition(start_date="2024-09-01", end_offset=1),
        "date": daily_partitions_def,
        "source": dg.StaticPartitionsDefinition(SOURCES),
    }
)
partitions_mapping = dg.MultiToSingleDimensionPartitionMapping(
    partition_dimension_name="date"
)


@dg.asset(
    io_manager_key="polars_parquet_io_manager",
    partitions_def=partitions_def,
    partitions_def=multi_partitions_def,
    metadata={
        "partition_by": ["date", "source"],
    },
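As a reading aid, not part of the commit: a minimal, hedged sketch of how the new daily_partitions_def, multi_partitions_def and partitions_mapping from this hunk are expected to connect a daily-partitioned downstream asset to the date-by-source multi-partitioned deals asset. The dict[str, pl.DataFrame] input shape mirrors the annotation further down in this diff and depends on the configured IO manager; the dagster import and the deals body are assumptions, since neither is shown in the visible hunks.

import dagster as dg
import polars as pl

SOURCES = ["plato", "sounds"]

daily_partitions_def = dg.DailyPartitionsDefinition(start_date="2024-09-01", end_offset=1)
multi_partitions_def = dg.MultiPartitionsDefinition(
    {
        "date": daily_partitions_def,
        "source": dg.StaticPartitionsDefinition(SOURCES),
    }
)
# Maps one downstream "date" partition to every upstream (date, source)
# partition that shares that date.
partitions_mapping = dg.MultiToSingleDimensionPartitionMapping(
    partition_dimension_name="date"
)


@dg.asset(partitions_def=multi_partitions_def)
def deals(context: dg.OpExecutionContext) -> pl.DataFrame:
    # Placeholder body; the real asset is defined outside the visible hunks.
    return pl.DataFrame({"source": ["plato"], "id": ["123"]})


@dg.asset(
    partitions_def=daily_partitions_def,
    ins={"partitions": dg.AssetIn(key=deals.key, partition_mapping=partitions_mapping)},
)
def new_deals(context: dg.OpExecutionContext, partitions: dict[str, pl.DataFrame]) -> None:
    # Assuming the IO manager loads the mapped upstream partitions as a dict
    # keyed by multi-partition key, there is one entry per source for this date.
    context.log.info(f"loaded {len(partitions)} upstream partitions")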
@@ -83,82 +88,95 @@ def deals(context):
@dg.asset(
    deps=[deals.key],
    ins={"df": dg.AssetIn(key=deals.key)},
    automation_condition=dg.AutomationCondition.eager(),
    io_manager_key="polars_parquet_io_manager",
    partitions_def=daily_partitions_def,
    ins={"partitions": dg.AssetIn(key=deals.key, partition_mapping=partitions_mapping)},
    automation_condition=dg.AutomationCondition.eager(),
)
def new_deals(context: dg.OpExecutionContext) -> pl.DataFrame:
def new_deals(
    context: dg.OpExecutionContext, partitions: dict[str, pl.DataFrame]
) -> None:  # pl.DataFrame:
    """Combine deals from Plato and Sounds into a single DataFrame."""
    ic()
    storage_dir = context.resources.polars_parquet_io_manager.base_dir
    asset_key = "deals"
    partition_keys = parse_partition_keys(context)
    ic(partition_keys)
    return

    # TODO: can we directly query from the deals input?

    with duckdb.connect() as con:
        con.create_function("PARSE_DATE", parse_date, [VARCHAR], DATE)
        return con.execute(
            f"""
            WITH tmp_plato AS (
                SELECT
                    source,
                    CAST(date AS DATE) AS date,
                    ean AS id,
                    _artist AS artist,
                    LOWER(title) AS title,
                    CAST(_date AS DATE) AS release,
                    CAST(_price AS FLOAT) AS price,
                    CONCAT('https://www.platomania.nl', url) AS url
                FROM read_parquet(
                    '{storage_dir}/{asset_key}/*/plato.parquet',
                    union_by_name = true
                )
            ),
            tmp_sounds AS (
                SELECT
                    source,
                    date,
                    id,
                    LOWER(TRIM(COALESCE(artist, SPLIT(name, '-')[1]))) AS artist,
                    LOWER(TRIM(COALESCE(
                        title,
                        ARRAY_TO_STRING(SPLIT(name, '-')[2:], '-')
                    ))) AS title,
                    PARSE_DATE(release) AS release,
                    CAST(price AS FLOAT) AS price,
                    CONCAT('https://www.sounds.nl/detail/', id) AS url
                FROM read_parquet(
                    '{storage_dir}/{asset_key}/*/sounds.parquet',
                    union_by_name = true
                )
            ),
            tmp_both AS (
                SELECT * FROM tmp_plato
                UNION ALL
                SELECT * FROM tmp_sounds
            )
            SELECT
                source,
                date,
                id,
                artist,
                title,
                release,
                price,
                url
            FROM tmp_both
            QUALIFY ROW_NUMBER() OVER (
                PARTITION BY source, id, artist, title, price
                ORDER BY date DESC
            ) = 1
            ORDER BY date ASC
            """
        ).pl()
def parse_plato(df: pl.LazyFrame) -> pl.LazyFrame:
    """Parse the Plato DataFrame."""
    ic()
    return pl.sql(
        """
        SELECT source,
               CAST(date AS DATE) AS date,
               ean AS id,
               _artist AS artist,
               LOWER(title) AS title,
               CAST(_date AS DATE) AS release,
               CAST(_price AS FLOAT) AS price,
               CONCAT('https://www.platomania.nl', url) AS url
        FROM df
        QUALIFY ROW_NUMBER() OVER (PARTITION BY source, id, artist, title, price ORDER BY date DESC) = 1
        ORDER BY date ASC
        """
    )

def parse_sounds(df: pl.LazyFrame) -> pl.LazyFrame:
    """Parse the Sounds DataFrame."""
    return df.with_columns(
        artist=pl.coalesce(pl.col("artist"), pl.col("name").str.split("-").list.get(1))
        .str.strip_chars()
        .str.to_lowercase(),
        title=pl.coalesce(
            pl.col("title"), pl.col("name").str.split("-").list.slice(2).list.join("-")
        )
        .str.strip_chars()
        .str.to_lowercase(),
        release=pl.col("release").map_elements(parse_date, return_dtype=pl.Date),
        price=pl.col("price").cast(pl.Float64),
        url=pl.format("https://www.sounds.nl/detail/{}", pl.col("id")),
    )

@dg.asset(
|
||||
io_manager_key="polars_parquet_io_manager",
|
||||
partitions_def=deals.partitions_def,
|
||||
ins={"df": dg.AssetIn(key=deals.key)},
|
||||
automation_condition=dg.AutomationCondition.on_missing().without(
|
||||
dg.AutomationCondition.in_latest_time_window()
|
||||
),
|
||||
)
|
||||
def cleaned_deals(
|
||||
context: dg.OpExecutionContext, df: pl.LazyFrame
|
||||
) -> pl.DataFrame | None:
|
||||
ic()
|
||||
partition_keys = get_partition_keys(context)
|
||||
ic(partition_keys)
|
||||
|
||||
# Specific parsing for each source
|
||||
match source := partition_keys["source"]:
|
||||
case "plato":
|
||||
parsed_df = parse_plato(df)
|
||||
case "sounds":
|
||||
parsed_df = parse_sounds(df)
|
||||
case _:
|
||||
context.log.warning(f"Unknown source: {source}!")
|
||||
return None
|
||||
|
||||
# Deduplicate and sort the DataFrame
|
||||
columns = ["source", "id", "artist", "title", "price"]
|
||||
return (
|
||||
parsed_df.collect()
|
||||
.sort("date", descending=True)
|
||||
.unique(subset=columns, keep="first")
|
||||
.sort("date", descending=False)
|
||||
.select(*columns, "date", "release", "url")
|
||||
)
|
||||
|
||||
|
||||
@dg.asset(
|
||||
deps=[new_deals.key],
|
||||
ins={"df": dg.AssetIn(key=new_deals.key)},
|
||||
io_manager_key="polars_parquet_io_manager",
|
||||
automation_condition=dg.AutomationCondition.eager(),
|
||||
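For reference, the duckdb-based new_deals in this diff registers parse_date as a scalar UDF and deduplicates with QUALIFY ROW_NUMBER(). Below is a self-contained, hedged sketch of that pattern only; the table name, the sample rows and the parse_date body are invented for illustration, while the real query reads partitioned parquet files and uses the project's utils.parse_date.

from datetime import date, datetime

import duckdb
from duckdb.typing import DATE, VARCHAR


def parse_date(value: str) -> date:
    # Stand-in for the project's utils.parse_date; assumes ISO-formatted strings.
    return datetime.strptime(value, "%Y-%m-%d").date()


with duckdb.connect() as con:
    # Register the Python function so SQL can call it as PARSE_DATE(...).
    con.create_function("PARSE_DATE", parse_date, [VARCHAR], DATE)
    con.execute("CREATE TABLE deals (source VARCHAR, date DATE, id VARCHAR, price FLOAT, release VARCHAR)")
    con.execute(
        """
        INSERT INTO deals VALUES
            ('sounds', DATE '2024-09-01', 'a', 9.99, '2024-08-30'),
            ('sounds', DATE '2024-09-02', 'a', 9.99, '2024-08-30')
        """
    )
    df = con.execute(
        """
        SELECT source, date, id, price, PARSE_DATE(release) AS release
        FROM deals
        QUALIFY ROW_NUMBER() OVER (
            PARTITION BY source, id, price
            ORDER BY date DESC
        ) = 1
        ORDER BY date ASC
        """
    ).pl()
    print(df)  # one row per (source, id, price), keeping the most recent date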