implement schema check and use automation instead of sensor
@@ -3,9 +3,10 @@ from glob import glob
 import polars as pl
 import structlog
+from models import Deal
 from plato.fetch import scrape_plato
 from plato.parse import parse as parse_plato
-from shared.utils import get_partition_keys, parse_partition_keys
+from shared.utils import get_partition_keys, load_partitions
 from sounds.fetch import fetch_deals
 from sounds.parse import parse as parse_sounds
 
@@ -37,12 +38,12 @@ partitions_mapping = dg.MultiToSingleDimensionPartitionMapping(
     },
     config_schema={"import_dir": dg.Field(str, default_value="/storage/import")},
 )
-def deals(context):
+def deals(context: dg.AssetExecutionContext) -> pl.DataFrame:
     ic()
     ic(context.partition_key)
     ic(context.op_config)
     import_dir = context.op_config["import_dir"]
-    partition_key = context.partition_key.keys_by_dimension
+    partition_key = get_partition_keys(context)
     date_str = partition_key["date"]
     source = partition_key["source"]
     logger.info("Materializing deals", date=date_str, source=source)
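The `deals` body now goes through a shared `get_partition_keys` helper instead of reaching into `context.partition_key.keys_by_dimension` directly. The helper's body is not part of this diff; a minimal sketch, assuming the asset runs on a `dg.MultiPartitionsDefinition` over `date` and `source`, might look like:

import dagster as dg


def get_partition_keys(context: dg.AssetExecutionContext) -> dict[str, str]:
    """Return the run's partition key as a dict of dimension name -> key."""
    key = context.partition_key
    if isinstance(key, dg.MultiPartitionKey):
        # Multi-partitioned run, e.g. {"date": "2025-01-31", "source": "plato"}
        return key.keys_by_dimension
    # Single-dimension fallback; naming the lone dimension "date" is an assumption.
    return {"date": key}

Centralizing this keeps the typed `AssetExecutionContext` signature honest: the same helper works for both the multi-partitioned `deals` and the daily-partitioned assets below.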
@@ -88,22 +89,6 @@ def deals(context):
     )
 
 
-@dg.asset(
-    io_manager_key="polars_parquet_io_manager",
-    partitions_def=daily_partitions_def,
-    ins={"partitions": dg.AssetIn(key=deals.key, partition_mapping=partitions_mapping)},
-    automation_condition=dg.AutomationCondition.eager(),
-)
-def new_deals(
-    context: dg.OpExecutionContext, partitions: dict[str, pl.DataFrame]
-) -> None:  # pl.DataFrame:
-    """Combine deals from Plato and Sounds into a single DataFrame."""
-    ic()
-    partition_keys = parse_partition_keys(context)
-    ic(partition_keys)
-    return
-
-
 @dg.asset(
     io_manager_key="polars_parquet_io_manager",
     partitions_def=deals.partitions_def,
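The deleted `new_deals` asset illustrates the multi-to-single partition mapping this file relies on: the downstream asset is daily-partitioned, the upstream `deals` asset is partitioned by date x source, and `dict[str, pl.DataFrame]` is how UPathIOManager-based IO managers (the dagster-polars parquet manager included) hand over several upstream partitions at once, keyed by partition key. A sketch of the definitions presumably declared earlier in the file (the start date is an assumption; the source names come from the imports):

import dagster as dg

daily_partitions_def = dg.DailyPartitionsDefinition(start_date="2024-01-01")

# Two-dimensional partitions for the raw deals: one slice per (date, source).
deals_partitions_def = dg.MultiPartitionsDefinition(
    {
        "date": daily_partitions_def,
        "source": dg.StaticPartitionsDefinition(["plato", "sounds"]),
    }
)

# Collapse the date x source grid onto the date axis: one daily downstream
# partition maps to all source partitions sharing that date.
partitions_mapping = dg.MultiToSingleDimensionPartitionMapping(
    partition_dimension_name="date"
)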
@@ -113,8 +98,9 @@ def new_deals(
     ),
 )
 def cleaned_deals(
-    context: dg.OpExecutionContext, df: pl.LazyFrame
-) -> pl.DataFrame | None:
+    context: dg.AssetExecutionContext, df: pl.LazyFrame
+) -> Deal.DataFrame | None:
     """Clean and parse deals from the raw source tables."""
     ic()
     partition_keys = get_partition_keys(context)
     ic(partition_keys)
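The new return annotation `Deal.DataFrame | None` is where the commit's schema check lives: the materialized frame is validated against the `Deal` model rather than returned as a bare `pl.DataFrame`. Assuming `models.Deal` is a patito model (patito binds a polars-backed `DataFrame` subclass to each model; a hand-rolled class would look different), a minimal sketch with the columns selected below:

import datetime

import patito as pt


class Deal(pt.Model):
    """Schema for one cleaned deal row; the field types are assumptions."""

    source: str
    id: str
    artist: str
    title: str
    price: float
    date: datetime.date
    release: str | None
    url: str

With such a model, `Deal.DataFrame(df).validate()` raises on missing columns, wrong dtypes, or nullability violations, so schema drift in a scraper surfaces at materialization time instead of downstream.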
@@ -129,22 +115,32 @@ def cleaned_deals(
         context.log.warning(f"Unknown source: {source}!")
         return None
 
+    ic(parsed_df.collect_schema())
 
     # Deduplicate and sort the DataFrame
     columns = ["source", "id", "artist", "title", "price"]
-    return (
-        parsed_df.collect()
-        .sort("date", descending=True)
+    return Deal.DataFrame(
+        parsed_df.sort("date", descending=True)
         .unique(subset=columns, keep="first")
         .sort("date", descending=False)
+        .select(*columns, "date", "release", "url")
+        .collect()
     )
 
 
 @dg.asset(
-    ins={"df": dg.AssetIn(key=new_deals.key)},
+    deps=[cleaned_deals],
     io_manager_key="polars_parquet_io_manager",
-    automation_condition=dg.AutomationCondition.eager(),
+    automation_condition=dg.AutomationCondition.on_missing().without(
+        dg.AutomationCondition.in_latest_time_window()
+    ),
 )
-def works(df: pl.DataFrame) -> pl.DataFrame:
-    columns = ["artist", "title", "release"]
-    return df[columns].unique()
+def works(context: dg.AssetExecutionContext) -> pl.DataFrame | None:
+    """Aggregate works from cleaned deals."""
+    partitions = context.instance.get_materialized_partitions(cleaned_deals.key)
+    ic(partitions)
+    dfs = list(load_partitions(context, cleaned_deals.key, partitions))
+    if dfs:
+        columns = ["artist", "title", "release"]
+        return pl.concat(dfs, how="vertical_relaxed").select(columns).unique()
+    return None
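The rewritten `works` asset is unpartitioned: it asks the instance which `cleaned_deals` partitions have ever materialized (`DagsterInstance.get_materialized_partitions`) and loads them itself through the shared `load_partitions` helper. That helper is not shown in the diff; a minimal sketch, assuming the parquet IO manager writes one file per partition under a fixed base directory (the path layout and base dir are assumptions):

from collections.abc import Iterable, Iterator
from pathlib import Path

import dagster as dg
import polars as pl

BASE_DIR = Path("/storage/data")  # assumption: the IO manager's base_dir


def load_partitions(
    context: dg.AssetExecutionContext,
    asset_key: dg.AssetKey,
    partitions: Iterable[str],
) -> Iterator[pl.DataFrame]:
    """Yield one DataFrame per materialized partition, skipping missing files."""
    for partition_key in partitions:
        path = BASE_DIR.joinpath(*asset_key.path, f"{partition_key}.parquet")
        if path.exists():
            yield pl.read_parquet(path)

`pl.concat(dfs, how="vertical_relaxed")` then tolerates minor dtype differences between partitions when stacking them into one frame.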
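The automation swap is what the title's "use automation instead of sensor" refers to: instead of bespoke sensor code, the assets declare Dagster's built-in automation conditions, which the framework's default automation condition sensor evaluates once enabled. `eager()` reacts to upstream updates but only within the latest time window; `on_missing().without(in_latest_time_window())` strips that restriction, so materializations are requested for anything still missing, not just the most recent window. Side by side:

import dagster as dg

# Old policy: re-materialize on upstream changes, latest time window only.
reactive = dg.AutomationCondition.eager()

# New policy: fill in anything still missing, historical partitions included.
backfill_missing = dg.AutomationCondition.on_missing().without(
    dg.AutomationCondition.in_latest_time_window()
)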