handle missing partitions

This commit is contained in:
2025-07-30 21:53:10 +02:00
parent 02db619c6d
commit 1d7df06dcf
6 changed files with 95 additions and 50 deletions

View File

@@ -84,14 +84,16 @@ def deals(context: dg.AssetExecutionContext) -> pl.DataFrame:
io_manager_key="polars_parquet_io_manager",
partitions_def=deals.partitions_def,
ins={"df": dg.AssetIn(key=deals.key)},
automation_condition=dg.AutomationCondition.on_missing().without(
dg.AutomationCondition.in_latest_time_window()
),
automation_condition=dg.AutomationCondition.eager(),
output_required=False,
)
def cleaned_deals(
context: dg.AssetExecutionContext, df: pl.LazyFrame
) -> Deal.DataFrame:
context: dg.AssetExecutionContext, df: pl.LazyFrame | None
) -> Iterator[dg.Output[Deal.DataFrame]]:
"""Clean and parse deals from the raw source tables."""
if df is None:
return
ic()
partition_keys = get_partition_keys(context)
ic(partition_keys)
@@ -104,27 +106,27 @@ def cleaned_deals(
parsed_df = parse_sounds(df)
case _:
context.log.warning(f"Unknown source: {source}!")
return Deal.DataFrame()
return
ic(parsed_df.collect_schema())
# Deduplicate and sort the DataFrame
columns = ["source", "id", "artist", "title", "price"]
return Deal.DataFrame(
parsed_df.sort("date", descending=True)
.unique(subset=columns, keep="first")
.sort("date", descending=False)
.select(*columns, "date", "release", "url")
.collect()
yield dg.Output(
Deal.DataFrame(
parsed_df.sort("date", descending=True)
.unique(subset=columns, keep="first")
.sort("date", descending=False)
.select(*columns, "date", "release", "url")
.collect()
)
)
@asset(
deps=[cleaned_deals],
io_manager_key="polars_parquet_io_manager",
automation_condition=dg.AutomationCondition.on_missing().without(
dg.AutomationCondition.in_latest_time_window()
),
automation_condition=dg.AutomationCondition.eager(),
)
def works(context: dg.AssetExecutionContext) -> pl.DataFrame | None:
"""Aggregate works from cleaned deals."""
@@ -167,15 +169,21 @@ def works(context: dg.AssetExecutionContext) -> pl.DataFrame | None:
automation_condition=dg.AutomationCondition.eager(),
)
def new_deals(
context: dg.AssetExecutionContext, partitions: dict[str, pl.LazyFrame]
context: dg.AssetExecutionContext, partitions: dict[str, pl.LazyFrame | None]
) -> Iterator[dg.Output[Deal.DataFrame]]:
"""Fetch new deals from all sources."""
ic()
partition_keys = get_partition_keys(context)
parsed_partition_keys = parse_partition_keys(context, "partitions")
ic(partitions.keys())
if not (partitions := {k: v for k, v in partitions.items() if v is not None}):
return
ic(partitions.keys())
partition_keys = get_partition_keys(context)
ic(partition_keys)
parsed_partition_keys = parse_partition_keys(context, "partitions")
ic(parsed_partition_keys)
if len(partition_keys := sorted(partitions.keys())) < 2:
context.log.warning("Not enough partitions to fetch new deals!")
@@ -245,7 +253,7 @@ def good_deals(
]
# Render HTML from Jinja template
env = Environment(loader=FileSystemLoader(".."))
env = Environment(loader=FileSystemLoader(f"/apps/{APP}"))
template = env.get_template("email.html")
html_content = template.render(deals=deals)