Work towards detecting new deals

This commit is contained in:
2025-07-26 19:09:06 +02:00
parent 8d06b236b7
commit 66bcb3e2d3
4 changed files with 61 additions and 25 deletions

View File

@@ -1,8 +1,10 @@
from collections.abc import Iterator
from datetime import datetime
from glob import glob
import polars as pl
import structlog
from dagster_polars.patito import patito_model_to_dagster_type
from models import Deal
from plato.fetch import scrape_plato
from plato.parse import parse as parse_plato
@@ -33,9 +35,6 @@ partitions_mapping = dg.MultiToSingleDimensionPartitionMapping(
@dg.asset(
io_manager_key="polars_parquet_io_manager",
partitions_def=multi_partitions_def,
metadata={
"partition_by": ["date", "source"],
},
config_schema={"import_dir": dg.Field(str, default_value="/storage/import")},
)
def deals(context: dg.AssetExecutionContext) -> pl.DataFrame:
@@ -99,7 +98,7 @@ def deals(context: dg.AssetExecutionContext) -> pl.DataFrame:
)
def cleaned_deals(
context: dg.AssetExecutionContext, df: pl.LazyFrame
) -> Deal.DataFrame | None:
) -> Deal.DataFrame:
"""Clean and parse deals from the raw source tables."""
ic()
partition_keys = get_partition_keys(context)
@@ -113,7 +112,7 @@ def cleaned_deals(
parsed_df = parse_sounds(df)
case _:
context.log.warning(f"Unknown source: {source}!")
return None
return Deal.DataFrame()
ic(parsed_df.collect_schema())
@@ -144,3 +143,49 @@ def works(context: dg.AssetExecutionContext) -> pl.DataFrame | None:
columns = ["artist", "title", "release"]
return pl.concat(dfs, how="vertical_relaxed").select(columns).unique()
return None
@dg.asset(
    io_manager_key="polars_parquet_io_manager",
    partitions_def=multi_partitions_def,
    ins={
        "partitions": dg.AssetIn(
            key=cleaned_deals.key,
            partition_mapping=dg.MultiPartitionMapping(
                {
                    # Upstream "date" window: start_offset=-10 up to end_offset=0
                    # (the current partition); missing upstream partitions are
                    # tolerated rather than treated as an error.
                    "date": dg.DimensionPartitionMapping(
                        dimension_name="date",
                        partition_mapping=dg.TimeWindowPartitionMapping(
                            start_offset=-10,
                            end_offset=0,
                            allow_nonexistent_upstream_partitions=True,
                        ),
                    ),
                    # The "source" dimension maps one-to-one onto upstream.
                    "source": dg.DimensionPartitionMapping(
                        dimension_name="source",
                        partition_mapping=dg.IdentityPartitionMapping(),
                    ),
                }
            ),
        )
    },
    output_required=False,
    dagster_type=patito_model_to_dagster_type(Deal),
)
def new_deals(
    context: dg.AssetExecutionContext, partitions: dict[str, pl.LazyFrame | None]
) -> Iterator[dg.Output[Deal.DataFrame]]:
    """Yield deals from the most recent loaded ``cleaned_deals`` partition.

    Args:
        context: Dagster execution context; used for logging and to look up
            the partition keys of the current run.
        partitions: Upstream ``cleaned_deals`` frames keyed by partition key.
            A value may be ``None`` (per the annotation) when a partition
            carries no data.

    Yields:
        At most one ``dg.Output`` wrapping a ``Deal.DataFrame``. Nothing is
        yielded (allowed because ``output_required=False``) when fewer than
        two partitions were loaded or when the newest partition is ``None``.
    """
    ic()
    partition_keys = get_partition_keys(context)
    ic(partition_keys)

    # Keep the loaded upstream keys under their own name instead of
    # overwriting the run's partition keys fetched above.
    available_keys = sorted(partitions)
    if len(available_keys) < 2:
        context.log.warning("Not enough partitions to fetch new deals!")
        return

    latest_key = available_keys[-1]
    latest = partitions[latest_key]
    if latest is None:
        # Without this guard the .limit() call below raises AttributeError.
        context.log.warning(f"Latest partition {latest_key!r} has no data!")
        return

    # NOTE(review): limit(5) looks like a work-in-progress placeholder; the
    # comparison against earlier partitions is not implemented yet.
    yield dg.Output(Deal.DataFrame(latest.limit(5).collect()))
# TODO: add a `good_deals` asset (def good_deals(): ...).