refactor to allow for multiple code locations

2025-07-20 19:49:30 +02:00
parent 9b8cfabee5
commit fd73e1367c
40 changed files with 161 additions and 628 deletions

3
apps/other/__init__.py Normal file
@@ -0,0 +1,3 @@
# install() adds ic() to builtins, so modules in this package can call ic() without importing it
from icecream import install

install()

98
apps/other/src/assets.py Normal file

@@ -0,0 +1,98 @@
import polars as pl
from dagster import (
    AssetIn,
    DailyPartitionsDefinition,
    DimensionPartitionMapping,
    IdentityPartitionMapping,
    MultiPartitionMapping,
    MultiPartitionsDefinition,
    StaticPartitionsDefinition,
    TimeWindowPartitionMapping,
    asset,
)

partitions_def_single = DailyPartitionsDefinition(start_date="2024-09-20")
partitions_def_multi = MultiPartitionsDefinition(
    {
        "date": DailyPartitionsDefinition(start_date="2024-09-20"),
        "source": StaticPartitionsDefinition(["plato", "sounds"]),
    }
)


@asset(
    io_manager_key="polars_parquet_io_manager",
    partitions_def=partitions_def_single,
    metadata={
        "partition_by": ["date"],
    },
)
def asset_single_1(context):
    ic()
    ic(context.partition_key)
    return pl.DataFrame(
        [{"date": context.partition_key, "data": f"Data for {context.partition_key}"}]
    )


@asset(
    io_manager_key="polars_parquet_io_manager",
    partitions_def=partitions_def_multi,
    metadata={
        "partition_by": ["date", "source"],
    },
)
def asset_multi_1(context):
    ic()
    ic(context.partition_key)
    # context.partition_key is a MultiPartitionKey; emit one column per
    # dimension so the schema matches the "partition_by" metadata above
    keys = context.partition_key.keys_by_dimension
    return pl.DataFrame(
        [{"date": keys["date"], "source": keys["source"], "data": f"Data for {context.partition_key}"}]
    )


@asset(
    partitions_def=partitions_def_single,
    ins={
        "asset_single_1": AssetIn(
            partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=0)
        )
    },
)
def asset_single_2(context, asset_single_1):
    ic()
    ic(context.partition_key)
    # with the offset window, asset_single_1 arrives as a dict keyed by
    # upstream partition key (the previous and the current day)
    ic(asset_single_1.keys())
    partition_key = context.asset_partition_key_for_output()
    return f"Processed data for {partition_key}"


partition_mapping = MultiPartitionMapping(
    {
        "date": DimensionPartitionMapping(
            dimension_name="date",
            partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=0),
        ),
        "source": DimensionPartitionMapping(
            dimension_name="source",
            partition_mapping=IdentityPartitionMapping(),
        ),
    }
)


@asset(
    partitions_def=partitions_def_multi,
    ins={"asset_multi_1": AssetIn(partition_mapping=partition_mapping)},
)
def asset_multi_2(context, asset_multi_1):
    ic()
    ic(context.partition_key)
    ic(context.partition_key.keys_by_dimension)
    ic(asset_multi_1)
    partition_key = context.asset_partition_key_for_output()
    ic(partition_key)
    return f"Processed data for {partition_key}"

13
apps/other/src/definitions.py Normal file

@@ -0,0 +1,13 @@
from assets import asset_multi_1, asset_multi_2, asset_single_1, asset_single_2
from dagster_polars import PolarsParquetIOManager
from dagster import Definitions, define_asset_job

# Define a job that materializes the two multi-partitioned assets together
daily_job = define_asset_job("daily_job", selection=[asset_multi_1, asset_multi_2])

definitions = Definitions(
    assets=[asset_single_1, asset_multi_1, asset_single_2, asset_multi_2],
    resources={"polars_parquet_io_manager": PolarsParquetIOManager()},
    jobs=[daily_job],
)
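
Each code location exposes a Definitions object like this one. As a hedged sketch of the "multiple code locations" refactor this commit names, a second location would be its own module with its own Definitions; the module path and asset below are hypothetical, and both locations can then be loaded side by side (for example via dagster dev -m with each module, or a workspace file listing both):

# hypothetical second code location, e.g. apps/main/src/definitions.py
from dagster import Definitions, asset


@asset
def main_asset():
    return "data owned by the main code location"


definitions = Definitions(assets=[main_asset])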

108
apps/other/src/mapping.py Normal file

@@ -0,0 +1,108 @@
from datetime import datetime, timedelta
from typing import Optional

from dagster import MultiPartitionKey, PartitionMapping, PartitionsDefinition
from dagster._core.definitions.partition import PartitionsSubset
from dagster._core.definitions.partition_mapping import (
    MultiPartitionMapping,
    UpstreamPartitionsResult,
)
from dagster._core.instance import DynamicPartitionsStore
from dagster._serdes import whitelist_for_serdes


# @whitelist_for_serdes
class LatestTwoPartitionsMapping(PartitionMapping):
    def get_upstream_mapped_partitions_result_for_partitions(
        self,
        downstream_partitions_subset: Optional[PartitionsSubset],
        downstream_partitions_def: Optional[PartitionsDefinition],
        upstream_partitions_def: PartitionsDefinition,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> UpstreamPartitionsResult:
        ic()
        # ignore the downstream subset and always select from all upstream partitions
        all_partitions = upstream_partitions_def.get_partition_keys()
        ic(all_partitions)
        if len(all_partitions) < 2:
            raise ValueError("Not enough partitions to proceed.")
        # select the last two partitions
        partition_keys = [all_partitions[-2], all_partitions[-1]]
        return UpstreamPartitionsResult(
            upstream_partitions_def.subset_with_partition_keys(partition_keys), []
        )

    def get_downstream_partitions_for_partitions(
        self,
        upstream_partitions_subset: PartitionsSubset,
        downstream_partitions_def,
        upstream_partitions_def,
    ) -> PartitionsSubset:
        ic()
        # map the latest upstream partition to the corresponding downstream partition
        downstream_partition_key = upstream_partitions_subset.get_partition_keys()[-1]
        return downstream_partitions_def.subset_with_partition_keys(
            [downstream_partition_key]
        )

    @property
    def description(self):
        return "Maps to the latest two upstream partitions."


@whitelist_for_serdes
class X(MultiPartitionMapping):
    def get_upstream_partitions_for_partition_range(
        self,
        downstream_partition_range,
        upstream_partitions_def,
        downstream_partitions_def,
    ) -> UpstreamPartitionsResult:
        ic()
        # extract the downstream partition keys in the range
        downstream_keys = downstream_partition_range.get_partition_keys()
        # collect the upstream partition keys
        upstream_keys = []
        for downstream_key in downstream_keys:
            # parse the MultiPartitionKey
            downstream_mpk = MultiPartitionKey.from_str(downstream_key)
            for i in [1, 2]:
                # shift the daily partition back by one and two days
                shifted_date = datetime.strptime(
                    downstream_mpk.keys_by_dimension["date"], "%Y-%m-%d"
                ) - timedelta(days=i)
                # recreate the MultiPartitionKey with the shifted date dimension
                upstream_mpk = MultiPartitionKey(
                    {
                        "source": downstream_mpk.keys_by_dimension["source"],
                        "date": shifted_date.strftime("%Y-%m-%d"),
                    }
                )
                # MultiPartitionKey subclasses str, so the key itself is the
                # serialized "date|source" form
                upstream_keys.append(upstream_mpk)
        return UpstreamPartitionsResult(
            upstream_partitions_def.subset_with_partition_keys(upstream_keys), []
        )

    def get_downstream_partitions_for_partition_range(
        self,
        upstream_partition_range,
        downstream_partitions_def,
        upstream_partitions_def,
    ) -> PartitionsSubset:
        # assume the mapping is symmetric for simplicity; note this returns an
        # UpstreamPartitionsResult rather than the annotated PartitionsSubset
        return self.get_upstream_partitions_for_partition_range(
            upstream_partition_range, upstream_partitions_def, downstream_partitions_def
        )
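
Neither mapping is referenced elsewhere in this commit. A minimal sketch of how LatestTwoPartitionsMapping could be wired into an asset graph; the asset names and partition definition below are illustrative, not part of the commit:

from dagster import AssetIn, DailyPartitionsDefinition, asset
from mapping import LatestTwoPartitionsMapping

daily = DailyPartitionsDefinition(start_date="2024-09-20")


@asset(partitions_def=daily)
def upstream_daily():
    return "raw data"


@asset(
    partitions_def=daily,
    ins={"upstream_daily": AssetIn(partition_mapping=LatestTwoPartitionsMapping())},
)
def downstream_latest_two(upstream_daily):
    # the input resolves to the latest two partitions of upstream_daily,
    # whichever downstream partition is being materialized
    return upstream_daily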

48
apps/other/test.py Normal file

@@ -0,0 +1,48 @@
from assets import asset_multi_1, asset_multi_2, asset_single_1, asset_single_2
from dagster_polars import PolarsParquetIOManager
from dagster import materialize
from icecream import install

# ic() is normally installed into builtins by apps/other/__init__.py; install
# it here as well so this file runs standalone
install()

resources = {
    "polars_parquet_io_manager": PolarsParquetIOManager(base_dir="/opt/dagster/storage")
}


def test_single():
    result = materialize(
        [asset_single_1, asset_single_2],
        partition_key="2024-10-02",
        resources=resources,
    )
    assert result.success
    # asset_value is a method on the result and takes an asset key
    ic(result.asset_value("asset_single_2"))


def test_multi():
    # asset_multi_1 can be seeded for the preceding partitions first, since
    # asset_multi_2's mapping also reads the previous day, e.g.:
    #
    #     result = materialize(
    #         [asset_multi_1], partition_key="2024-10-01|plato", resources=resources
    #     )
    #
    # and likewise for "2024-10-02|plato".
    result = materialize(
        [asset_multi_1, asset_multi_2],
        partition_key="2024-10-02|plato",
        resources=resources,
    )
    assert result.success
    ic(result.asset_value("asset_multi_2"))


if __name__ == "__main__":
    # test_single()
    test_multi()
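
The commented-out runs in test_multi can be folded into a loop; a small sketch (reusing the resources dict and assets imported above) that seeds asset_multi_1 for both days so asset_multi_2's TimeWindowPartitionMapping with start_offset=-1 finds each upstream partition:

# seed asset_multi_1 for yesterday and today before materializing asset_multi_2
for day in ["2024-10-01", "2024-10-02"]:
    result = materialize([asset_multi_1], partition_key=f"{day}|plato", resources=resources)
    assert result.success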