handle missing partitions

This commit is contained in:
2025-07-30 21:53:10 +02:00
parent 02db619c6d
commit 1d7df06dcf
6 changed files with 95 additions and 50 deletions

View File

@@ -1,7 +1,7 @@
from functools import partial
import assets
import polars as pl
from assets import deals
from config import APP
import dagster as dg
@@ -11,36 +11,50 @@ job = partial(dg.job, **kwargs)
# Asset-job factory with the module's `kwargs` pre-applied
# (`kwargs` is defined outside the lines visible here).
define_asset_job = partial(dg.define_asset_job, **kwargs)

# Job that selects only the `deals` asset and reuses that asset's own
# partitions definition.
deals_job = define_asset_job(
    "deals_job",
    selection=[assets.deals.key],
    partitions_def=assets.deals.partitions_def,
)
@dg.op(required_resource_keys={"polars_parquet_io_manager"})
def check_partitions(context: dg.OpExecutionContext) -> None:
    """Compare recorded partitions with the parquet files found on disk.

    For each of ``assets.deals`` and ``assets.cleaned_deals`` this op:

    1. asks the Dagster instance which partitions are recorded as
       materialized for the asset key;
    2. scans ``<base_dir>/<asset path>/*/*.parquet`` (``base_dir`` comes from
       the ``polars_parquet_io_manager`` resource) and builds one
       ``"date|source"`` key per distinct ``(date, source)`` pair;
    3. logs an ``AssetMaterialization`` event for every key found on disk
       that the instance does not have recorded;
    4. logs (without acting on it) every recorded partition that has no
       matching key on disk.

    Args:
        context: Op execution context; provides the instance, the logger and
            the ``polars_parquet_io_manager`` resource.
    """
    # Same resource for every asset, so look the base directory up once.
    storage_dir = context.resources.polars_parquet_io_manager.base_dir

    for asset in [assets.deals, assets.cleaned_deals]:
        asset_key = asset.key
        # Fetch the materialized partitions for the asset key
        materialized_partitions = context.instance.get_materialized_partitions(
            asset_key
        )
        ic(materialized_partitions)

        asset_path = "/".join(asset_key.path)
        ic(storage_dir, asset_key, asset_path)

        # Partition keys actually present on disk for this asset.
        partitions = set()
        for row in (
            pl.scan_parquet(
                f"{storage_dir}/{asset_path}/*/*.parquet", extra_columns="ignore"
            )
            # TODO: derive the columns from asset.partitions_def.names ?
            .select(["date", "source"])
            .unique()
            # "|".join needs strings, so cast every non-string column.
            .with_columns(pl.exclude(pl.String).cast(str))
            .collect()
            .iter_rows()
        ):
            # NOTE(review): assumes neither column contains nulls (a None
            # would make join raise) and that "date|source" is the key
            # format the asset's partitions definition uses -- confirm.
            partition = "|".join(row)
            if partition not in materialized_partitions:
                context.log.info(f"[{asset_key}] Adding partition: {partition}")
                context.log_event(
                    dg.AssetMaterialization(asset_key=asset_key, partition=partition)
                )
            # Record every on-disk key, already known or not, so the
            # difference below only contains partitions without files.
            partitions.add(partition)

        # Recorded as materialized, but no parquet data found on disk.
        missing = set(materialized_partitions) - partitions
        ic(missing)
        for partition in missing:
            context.log.info(f"[{asset_key}] Should remove partition: {partition}")
@job