refactor to allow for multiple code locations

apps/other/__init__.py (new file, empty)

apps/other/src/__init__.py (new file)

from icecream import install

install()
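
Note: icecream's install() puts ic into Python's builtins, which is why the
modules below call ic() without importing it. A minimal sketch of what
install() does, per icecream's documented behavior:

    import builtins
    from icecream import ic

    builtins.ic = ic  # every module can now call ic() without an import

This only takes effect when the package __init__ actually runs, so the modules
below also import ic explicitly for the cases where they are loaded directly.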

apps/other/src/assets.py (new file)

from icecream import ic  # explicit import; src/__init__.py also installs ic as a builtin
import polars as pl

from dagster import (
    AssetIn,
    DailyPartitionsDefinition,
    DimensionPartitionMapping,
    IdentityPartitionMapping,
    MultiPartitionMapping,
    MultiPartitionsDefinition,
    StaticPartitionsDefinition,
    TimeWindowPartitionMapping,
    asset,
)

partitions_def_single = DailyPartitionsDefinition(start_date="2024-09-20")

partitions_def_multi = MultiPartitionsDefinition(
    {
        "date": DailyPartitionsDefinition(start_date="2024-09-20"),
        "source": StaticPartitionsDefinition(["plato", "sounds"]),
    }
)
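
# Keys of partitions_def_multi are MultiPartitionKeys whose string form joins
# the dimension values with "|", e.g. "2024-10-02|plato" (see test.py below).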


@asset(
    io_manager_key="polars_parquet_io_manager",
    partitions_def=partitions_def_single,
    metadata={
        "partition_by": ["date"],
    },
)
def asset_single_1(context):
    ic()
    ic(context.partition_key)
    return pl.DataFrame(
        [{"date": context.partition_key, "data": f"Data for {context.partition_key}"}]
    )


@asset(
    io_manager_key="polars_parquet_io_manager",
    partitions_def=partitions_def_multi,
    metadata={
        "partition_by": ["date", "source"],
    },
)
def asset_multi_1(context):
    ic()
    ic(context.partition_key)

    return pl.DataFrame(
        [{"date": context.partition_key, "data": f"Data for {context.partition_key}"}]
    )
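
# Note: in a multi-partitioned run, context.partition_key is a
# MultiPartitionKey (a str subclass such as "2024-10-02|plato"), so the "date"
# column above stores the combined key; the plain date is available as
# context.partition_key.keys_by_dimension["date"].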


@asset(
    partitions_def=partitions_def_single,
    ins={
        "asset_single_1": AssetIn(
            # Depend on the previous day's partition as well as the current one
            partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=0)
        )
    },
)
def asset_single_2(context, asset_single_1):
    ic()
    ic(context.partition_key)
    # The input spans two upstream partitions, so it arrives as a dict keyed
    # by upstream partition key
    ic(asset_single_1.keys())
    partition_key = context.asset_partition_key_for_output()
    return f"Processed data for {partition_key}"
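

# For a downstream partition such as "2024-10-02|plato", the mapping below
# requests upstream "2024-10-01|plato" and "2024-10-02|plato": the date
# dimension widens one day backwards while the source dimension maps
# one-to-one.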
partition_mapping = MultiPartitionMapping(
    {
        "date": DimensionPartitionMapping(
            dimension_name="date",
            partition_mapping=TimeWindowPartitionMapping(start_offset=-1, end_offset=0),
        ),
        "source": DimensionPartitionMapping(
            dimension_name="source",
            partition_mapping=IdentityPartitionMapping(),
        ),
    }
)


@asset(
    partitions_def=partitions_def_multi,
    ins={"asset_multi_1": AssetIn(partition_mapping=partition_mapping)},
)
def asset_multi_2(context, asset_multi_1):
    ic()
    ic(context.partition_key)
    ic(context.partition_key.keys_by_dimension)
    ic(asset_multi_1)

    partition_key = context.asset_partition_key_for_output()
    ic(partition_key)
    return f"Processed data for {partition_key}"

apps/other/src/definitions.py (new file)

from assets import asset_multi_1, asset_multi_2, asset_single_1, asset_single_2
from dagster_polars import PolarsParquetIOManager

from dagster import Definitions, define_asset_job

# Define a job that materializes both multi-partitioned assets
daily_job = define_asset_job("daily_job", selection=[asset_multi_1, asset_multi_2])

definitions = Definitions(
    assets=[asset_single_1, asset_multi_1, asset_single_2, asset_multi_2],
    resources={"polars_parquet_io_manager": PolarsParquetIOManager()},
    jobs=[daily_job],
)
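
In line with the commit's goal of multiple code locations, this module can be
served on its own (e.g. dagster dev -f apps/other/src/definitions.py) or
registered next to other locations in workspace.yaml. A minimal sketch,
assuming the standard workspace.yaml format; the second entry is hypothetical
and stands in for whichever other code location exists:

    load_from:
      - python_file:
          relative_path: apps/other/src/definitions.py
          working_directory: apps/other/src
      - python_file:
          relative_path: apps/main/src/definitions.py
          working_directory: apps/main/src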

apps/other/src/mapping.py (new file)

from datetime import datetime, timedelta
from typing import Optional

from icecream import ic  # explicit import; src/__init__.py also installs ic as a builtin

from dagster import MultiPartitionKey, PartitionMapping, PartitionsDefinition
from dagster._core.definitions.partition import PartitionsSubset
from dagster._core.definitions.partition_mapping import (
    MultiPartitionMapping,
    UpstreamPartitionsResult,
)
from dagster._core.instance import DynamicPartitionsStore
from dagster._serdes import whitelist_for_serdes


# @whitelist_for_serdes
class LatestTwoPartitionsMapping(PartitionMapping):
    def get_upstream_mapped_partitions_result_for_partitions(
        self,
        downstream_partitions_subset: Optional[PartitionsSubset],
        downstream_partitions_def: Optional[PartitionsDefinition],
        upstream_partitions_def: PartitionsDefinition,
        current_time: Optional[datetime] = None,
        dynamic_partitions_store: Optional[DynamicPartitionsStore] = None,
    ) -> UpstreamPartitionsResult:
        ic()

        # All partition keys of the upstream definition, regardless of the
        # requested downstream subset
        all_partitions = upstream_partitions_def.get_partition_keys()
        ic(all_partitions)

        if len(all_partitions) < 2:
            raise ValueError("Not enough partitions to proceed.")

        # Select the last two partitions
        partition_keys = [all_partitions[-2], all_partitions[-1]]
        return UpstreamPartitionsResult(
            upstream_partitions_def.subset_with_partition_keys(partition_keys), []
        )

    def get_downstream_partitions_for_partitions(
        self,
        upstream_partitions_subset: PartitionsSubset,
        downstream_partitions_def,
        upstream_partitions_def,
    ) -> PartitionsSubset:
        ic()
        # Map the latest upstream partition to the corresponding downstream partition
        downstream_partition_key = upstream_partitions_subset.get_partition_keys()[-1]
        return downstream_partitions_def.subset_with_partition_keys(
            [downstream_partition_key]
        )

    @property
    def description(self):
        return "Maps to the latest two upstream partitions."
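
# Hypothetical wiring for LatestTwoPartitionsMapping (not used in assets.py),
# assuming a daily-partitioned upstream asset named "upstream_asset":
#
#   @asset(ins={"upstream_asset": AssetIn(partition_mapping=LatestTwoPartitionsMapping())})
#   def downstream_asset(upstream_asset): ...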


@whitelist_for_serdes
class X(MultiPartitionMapping):
    def get_upstream_partitions_for_partition_range(
        self,
        downstream_partition_range,
        upstream_partitions_def,
        downstream_partitions_def,
    ) -> UpstreamPartitionsResult:
        ic()

        # Extract the downstream partition keys in the requested range
        downstream_keys = downstream_partition_range.get_partition_keys()

        # Collect the corresponding upstream partition keys
        upstream_keys = []

        for downstream_key in downstream_keys:
            # Parse the MultiPartitionKey
            downstream_mpk = MultiPartitionKey.from_str(downstream_key)

            # Depend on the partitions one and two days earlier, same source
            for i in [1, 2]:
                shifted_date = datetime.strptime(
                    downstream_mpk.keys_by_dimension["date"], "%Y-%m-%d"
                ) - timedelta(days=i)

                # Recreate the MultiPartitionKey with the shifted date dimension
                upstream_mpk = MultiPartitionKey(
                    {
                        "source": downstream_mpk.keys_by_dimension["source"],
                        "date": shifted_date.strftime("%Y-%m-%d"),
                    }
                )

                upstream_keys.append(upstream_mpk.to_string())

        return UpstreamPartitionsResult(
            upstream_partitions_def.subset_with_partition_keys(upstream_keys), []
        )

    def get_downstream_partitions_for_partition_range(
        self,
        upstream_partition_range,
        downstream_partitions_def,
        upstream_partitions_def,
    ) -> PartitionsSubset:
        # This would map upstream partitions back to downstream; for
        # simplicity, assume the mapping is symmetric.
        return self.get_upstream_partitions_for_partition_range(
            upstream_partition_range, upstream_partitions_def, downstream_partitions_def
        )
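
# For a downstream partition "2024-10-02|plato", X resolves to the upstream
# partitions "2024-10-01|plato" and "2024-09-30|plato". assets.py expresses the
# related "previous day, same source" dependency declaratively with
# MultiPartitionMapping plus DimensionPartitionMapping instead.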

apps/other/test.py (new file)

from assets import asset_multi_1, asset_multi_2, asset_single_1, asset_single_2
from dagster_polars import PolarsParquetIOManager
from icecream import ic

from dagster import materialize

resources = {
    "polars_parquet_io_manager": PolarsParquetIOManager(base_dir="/opt/dagster/storage")
}


def test_single():
    result = materialize(
        [asset_single_1, asset_single_2],
        partition_key="2024-10-02",
        resources=resources,
    )
    assert result.success
    ic(result.asset_value("asset_single_2"))


def test_multi():
    # result = materialize(
    #     [asset_multi_1],
    #     partition_key="2024-10-01|plato",
    #     resources=resources,
    # )
    # assert result.success
    # ic(result.asset_value("asset_multi_1"))
    #
    # result = materialize(
    #     [asset_multi_1],
    #     partition_key="2024-10-02|plato",
    #     resources=resources,
    # )
    # assert result.success
    # ic(result.asset_value("asset_multi_1"))

    result = materialize(
        [asset_multi_1, asset_multi_2],
        partition_key="2024-10-02|plato",
        resources=resources,
    )
    assert result.success
    ic(result.asset_value("asset_multi_2"))


if __name__ == "__main__":
    # test_single()
    test_multi()
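
A note on running these: both functions assume `from assets import ...` can be
resolved, which holds if apps/other/src is on the path; one way (an assumption
about the local layout, not something this commit configures):

    PYTHONPATH=apps/other/src python apps/other/test.py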