Rewrite Dagster usage to the `import dagster as dg` alias
This commit is contained in:
@@ -9,35 +9,27 @@ from plato.fetch import scrape_plato
|
||||
from sounds.fetch import fetch_deals
|
||||
from utils import parse_date
|
||||
|
||||
from dagster import (
|
||||
DailyPartitionsDefinition,
|
||||
Failure,
|
||||
Field,
|
||||
MultiPartitionsDefinition,
|
||||
OpExecutionContext,
|
||||
StaticPartitionsDefinition,
|
||||
asset,
|
||||
)
|
||||
import dagster as dg
|
||||
|
||||
SOURCES = ["plato", "sounds"]
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
partitions_def = MultiPartitionsDefinition(
|
||||
partitions_def = dg.MultiPartitionsDefinition(
|
||||
{
|
||||
"date": DailyPartitionsDefinition(start_date="2024-09-01", end_offset=1),
|
||||
"source": StaticPartitionsDefinition(SOURCES),
|
||||
"date": dg.DailyPartitionsDefinition(start_date="2024-09-01", end_offset=1),
|
||||
"source": dg.StaticPartitionsDefinition(SOURCES),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@asset(
|
||||
@dg.asset(
|
||||
io_manager_key="polars_parquet_io_manager",
|
||||
partitions_def=partitions_def,
|
||||
metadata={
|
||||
"partition_by": ["date", "source"],
|
||||
},
|
||||
config_schema={"import_dir": Field(str, default_value="/storage/import")},
|
||||
config_schema={"import_dir": dg.Field(str, default_value="/storage/import")},
|
||||
)
|
||||
def deals(context):
|
||||
ic()
|
||||
@@ -53,7 +45,7 @@ def deals(context):
|
||||
days = (date - datetime.today()).days
|
||||
ic(days)
|
||||
if days > 0:
|
||||
raise Failure(f"Cannot materialize for the future: {date.date()}")
|
||||
raise dg.Failure(f"Cannot materialize for the future: {date.date()}")
|
||||
if days < -1:
|
||||
if source == "sounds":
|
||||
pattern = f"{import_dir}/{date.date()}_*_sounds.csv"
|
||||
@@ -70,7 +62,7 @@ def deals(context):
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Failed to load CSV file!", error=e)
|
||||
raise Failure(f"Cannot materialize for the past: {date.date()}")
|
||||
raise dg.Failure(f"Cannot materialize for the past: {date.date()}")
|
||||
|
||||
if source == "plato":
|
||||
logger.info("Scraping Plato")
|
||||
@@ -90,8 +82,13 @@ def deals(context):
|
||||
)
|
||||
|
||||
|
||||
@asset(deps=[deals], io_manager_key="polars_parquet_io_manager")
|
||||
def new_deals(context: OpExecutionContext) -> pl.DataFrame:
|
||||
@dg.asset(
|
||||
deps=[deals.key],
|
||||
ins={"df": dg.AssetIn(key=deals.key)},
|
||||
automation_condition=dg.AutomationCondition.eager(),
|
||||
io_manager_key="polars_parquet_io_manager",
|
||||
)
|
||||
def new_deals(context: dg.OpExecutionContext) -> pl.DataFrame:
|
||||
"""Combine deals from Plato and Sounds into a single DataFrame."""
|
||||
ic()
|
||||
storage_dir = context.resources.polars_parquet_io_manager.base_dir
|
||||
@@ -160,17 +157,12 @@ def new_deals(context: OpExecutionContext) -> pl.DataFrame:
|
||||
).pl()
|
||||
|
||||
|
||||
@asset(
|
||||
@dg.asset(
|
||||
deps=[new_deals.key],
|
||||
ins={"df": dg.AssetIn(key=new_deals.key)},
|
||||
io_manager_key="polars_parquet_io_manager",
|
||||
automation_condition=dg.AutomationCondition.eager(),
|
||||
)
|
||||
def works(new_deals: pl.DataFrame) -> pl.DataFrame:
|
||||
# Pandas
|
||||
# columns = ["artist", "title"]
|
||||
# return pl.from_pandas(new_deals[columns].to_pandas().drop_duplicates())
|
||||
|
||||
# Polars
|
||||
# return new_deals[columns].unique(subset=columns)
|
||||
|
||||
# DuckDB
|
||||
with duckdb.connect() as con:
|
||||
return con.execute("SELECT DISTINCT artist, title, release FROM new_deals").pl()
|
||||
def works(df: pl.DataFrame) -> pl.DataFrame:
|
||||
columns = ["artist", "title", "release"]
|
||||
return df[columns].unique()
|
||||
|
||||
Reference in New Issue
Block a user