demo of delta lake with forked io manager

This commit is contained in:
2025-07-29 15:37:37 +02:00
parent f59b30b9ea
commit 4c59dcb8ac
6 changed files with 64 additions and 6 deletions

View File

@@ -11,6 +11,8 @@ anyio==4.9.0
# gql
# starlette
# watchfiles
arro3-core==0.5.1
# via deltalake
asttokens==3.0.0
# via icecream
backoff==2.2.1
@@ -51,6 +53,7 @@ dagster==1.11.3
# via
# dev (pyproject.toml)
# dagster-aws
# dagster-delta
# dagster-docker
# dagster-duckdb
# dagster-duckdb-pandas
@@ -60,6 +63,8 @@ dagster==1.11.3
# dagster-webserver
dagster-aws==0.27.3
# via dev (pyproject.toml)
dagster-delta @ git+https://github.com/ASML-Labs/dagster-delta.git@d28de7a7c13b7071f42231234eb9231269c7c1bf#subdirectory=libraries/dagster-delta
# via dev (pyproject.toml)
dagster-docker==0.27.3
# via dev (pyproject.toml)
dagster-duckdb==0.27.3
@@ -82,6 +87,10 @@ dagster-shared==1.11.3
# via dagster
dagster-webserver==1.11.3
# via dagit
deltalake==1.1.3
# via dagster-delta
deprecated==1.2.18
# via deltalake
dnspython==2.7.0
# via email-validator
docker==7.1.0
@@ -195,6 +204,8 @@ patito==0.8.3
# via
# dev (pyproject.toml)
# dagster-polars
pendulum==3.1.0
# via dagster-delta
pillow==11.3.0
# via matplotlib
polars==1.31.0
@@ -237,6 +248,7 @@ python-dateutil==2.9.0.post0
# graphene
# matplotlib
# pandas
# pendulum
python-dotenv==1.1.1
# via
# dagster
@@ -310,6 +322,7 @@ typing-extensions==4.14.1
# via
# alembic
# anyio
# arro3-core
# beautifulsoup4
# dagster-polars
# dagster-shared
@@ -325,7 +338,9 @@ typing-inspection==0.4.1
# pydantic
# pydantic-settings
tzdata==2025.2
# via pandas
# via
# pandas
# pendulum
universal-pathlib==0.2.6
# via
# dagster
@@ -345,6 +360,8 @@ watchfiles==1.1.0
# via uvicorn
websockets==15.0.1
# via uvicorn
wrapt==1.17.2
# via deprecated
xlsxwriter==3.2.5
# via dev (pyproject.toml)
yarl==1.20.1

View File

@@ -2,11 +2,15 @@ import sys
from functools import partial
from logging import getLogger
import pandas as pd
import pyarrow as pa
from config import APP
import dagster as dg
asset = partial(dg.asset, key_prefix=APP)
TAGS = {"app": APP}
asset = partial(dg.asset, key_prefix=APP, tags=TAGS)
@asset()
@@ -18,3 +22,27 @@ def logging(context):
sys.__stderr__.write("This goes to stderr!\n")
getLogger("mylogger").info("This is an info message from mylogger")
@asset(io_manager_key="delta_io_manager")
def iris_dataset() -> pa.Table:
df = pd.read_csv(
"https://docs.dagster.io/assets/iris.csv",
names=[
"sepal_length_cm",
"sepal_width_cm",
"petal_length_cm",
"petal_width_cm",
"species",
],
)
return pa.Table.from_pandas(df)
@asset(
io_manager_key="delta_io_manager", ins={"table": dg.AssetIn(key=iris_dataset.key)}
)
def iris_cleaned(table: pa.Table) -> pa.Table:
df = table.to_pandas()
result_df = df.dropna().drop_duplicates()
return pa.Table.from_pandas(result_df)

View File

@@ -1,4 +1,5 @@
import assets
from dagster_delta import DeltaLakePyarrowIOManager, LocalConfig, WriteMode
from dagster_polars import PolarsParquetIOManager
from icecream import install
from shared.config import APP, STORAGE_DIR
@@ -11,11 +12,16 @@ definitions = dg.Definitions(
assets=[
asset.with_attributes(
group_names_by_key={asset.key: APP},
tags_by_key={asset.key: {"app": APP}},
)
for asset in dg.load_assets_from_modules([assets])
],
resources={
"polars_parquet_io_manager": PolarsParquetIOManager(base_dir=STORAGE_DIR)
"polars_parquet_io_manager": PolarsParquetIOManager(base_dir=STORAGE_DIR),
"delta_io_manager": DeltaLakePyarrowIOManager(
root_uri=STORAGE_DIR,
storage_options=LocalConfig(),
mode=WriteMode.overwrite,
parquet_read_options={"coerce_int96_timestamp_unit": "us"},
),
},
)