demo of delta lake with forked io manager

This commit is contained in:
2025-07-29 15:37:37 +02:00
parent f59b30b9ea
commit 4c59dcb8ac
6 changed files with 64 additions and 6 deletions

View File

@@ -1,5 +1,7 @@
FROM python:3.12-slim FROM python:3.12-slim
RUN apt update && apt install --no-install-recommends --yes git
# Checkout and install dagster libraries needed to run the gRPC server # Checkout and install dagster libraries needed to run the gRPC server
# exposing your repository to dagit and dagster-daemon, and to load the DagsterInstance # exposing your repository to dagit and dagster-daemon, and to load the DagsterInstance

View File

@@ -11,6 +11,8 @@ anyio==4.9.0
# gql # gql
# starlette # starlette
# watchfiles # watchfiles
arro3-core==0.5.1
# via deltalake
asttokens==3.0.0 asttokens==3.0.0
# via icecream # via icecream
backoff==2.2.1 backoff==2.2.1
@@ -51,6 +53,7 @@ dagster==1.11.3
# via # via
# dev (pyproject.toml) # dev (pyproject.toml)
# dagster-aws # dagster-aws
# dagster-delta
# dagster-docker # dagster-docker
# dagster-duckdb # dagster-duckdb
# dagster-duckdb-pandas # dagster-duckdb-pandas
@@ -60,6 +63,8 @@ dagster==1.11.3
# dagster-webserver # dagster-webserver
dagster-aws==0.27.3 dagster-aws==0.27.3
# via dev (pyproject.toml) # via dev (pyproject.toml)
dagster-delta @ git+https://github.com/ASML-Labs/dagster-delta.git@d28de7a7c13b7071f42231234eb9231269c7c1bf#subdirectory=libraries/dagster-delta
# via dev (pyproject.toml)
dagster-docker==0.27.3 dagster-docker==0.27.3
# via dev (pyproject.toml) # via dev (pyproject.toml)
dagster-duckdb==0.27.3 dagster-duckdb==0.27.3
@@ -82,6 +87,10 @@ dagster-shared==1.11.3
# via dagster # via dagster
dagster-webserver==1.11.3 dagster-webserver==1.11.3
# via dagit # via dagit
deltalake==1.1.3
# via dagster-delta
deprecated==1.2.18
# via deltalake
dnspython==2.7.0 dnspython==2.7.0
# via email-validator # via email-validator
docker==7.1.0 docker==7.1.0
@@ -195,6 +204,8 @@ patito==0.8.3
# via # via
# dev (pyproject.toml) # dev (pyproject.toml)
# dagster-polars # dagster-polars
pendulum==3.1.0
# via dagster-delta
pillow==11.3.0 pillow==11.3.0
# via matplotlib # via matplotlib
polars==1.31.0 polars==1.31.0
@@ -237,6 +248,7 @@ python-dateutil==2.9.0.post0
# graphene # graphene
# matplotlib # matplotlib
# pandas # pandas
# pendulum
python-dotenv==1.1.1 python-dotenv==1.1.1
# via # via
# dagster # dagster
@@ -310,6 +322,7 @@ typing-extensions==4.14.1
# via # via
# alembic # alembic
# anyio # anyio
# arro3-core
# beautifulsoup4 # beautifulsoup4
# dagster-polars # dagster-polars
# dagster-shared # dagster-shared
@@ -325,7 +338,9 @@ typing-inspection==0.4.1
# pydantic # pydantic
# pydantic-settings # pydantic-settings
tzdata==2025.2 tzdata==2025.2
# via pandas # via
# pandas
# pendulum
universal-pathlib==0.2.6 universal-pathlib==0.2.6
# via # via
# dagster # dagster
@@ -345,6 +360,8 @@ watchfiles==1.1.0
# via uvicorn # via uvicorn
websockets==15.0.1 websockets==15.0.1
# via uvicorn # via uvicorn
wrapt==1.17.2
# via deprecated
xlsxwriter==3.2.5 xlsxwriter==3.2.5
# via dev (pyproject.toml) # via dev (pyproject.toml)
yarl==1.20.1 yarl==1.20.1

View File

@@ -2,11 +2,15 @@ import sys
from functools import partial from functools import partial
from logging import getLogger from logging import getLogger
import pandas as pd
import pyarrow as pa
from config import APP from config import APP
import dagster as dg import dagster as dg
asset = partial(dg.asset, key_prefix=APP) TAGS = {"app": APP}
asset = partial(dg.asset, key_prefix=APP, tags=TAGS)
@asset() @asset()
@@ -18,3 +22,27 @@ def logging(context):
sys.__stderr__.write("This goes to stderr!\n") sys.__stderr__.write("This goes to stderr!\n")
getLogger("mylogger").info("This is an info message from mylogger") getLogger("mylogger").info("This is an info message from mylogger")
@asset(io_manager_key="delta_io_manager")
def iris_dataset() -> pa.Table:
df = pd.read_csv(
"https://docs.dagster.io/assets/iris.csv",
names=[
"sepal_length_cm",
"sepal_width_cm",
"petal_length_cm",
"petal_width_cm",
"species",
],
)
return pa.Table.from_pandas(df)
@asset(
io_manager_key="delta_io_manager", ins={"table": dg.AssetIn(key=iris_dataset.key)}
)
def iris_cleaned(table: pa.Table) -> pa.Table:
df = table.to_pandas()
result_df = df.dropna().drop_duplicates()
return pa.Table.from_pandas(result_df)

View File

@@ -1,4 +1,5 @@
import assets import assets
from dagster_delta import DeltaLakePyarrowIOManager, LocalConfig, WriteMode
from dagster_polars import PolarsParquetIOManager from dagster_polars import PolarsParquetIOManager
from icecream import install from icecream import install
from shared.config import APP, STORAGE_DIR from shared.config import APP, STORAGE_DIR
@@ -11,11 +12,16 @@ definitions = dg.Definitions(
assets=[ assets=[
asset.with_attributes( asset.with_attributes(
group_names_by_key={asset.key: APP}, group_names_by_key={asset.key: APP},
tags_by_key={asset.key: {"app": APP}},
) )
for asset in dg.load_assets_from_modules([assets]) for asset in dg.load_assets_from_modules([assets])
], ],
resources={ resources={
"polars_parquet_io_manager": PolarsParquetIOManager(base_dir=STORAGE_DIR) "polars_parquet_io_manager": PolarsParquetIOManager(base_dir=STORAGE_DIR),
"delta_io_manager": DeltaLakePyarrowIOManager(
root_uri=STORAGE_DIR,
storage_options=LocalConfig(),
mode=WriteMode.overwrite,
parquet_read_options={"coerce_int96_timestamp_unit": "us"},
),
}, },
) )

View File

@@ -66,7 +66,12 @@ weather = [
"requests_cache", "requests_cache",
"retry_requests" "retry_requests"
] ]
other = [] other = [
# "deltalake>=1.0.0",
# "dagster-deltalake-pandas",
# "dagster-deltalake-polars",
"dagster-delta @ git+https://github.com/ASML-Labs/dagster-delta.git@dagster_delta-0.5.1#subdirectory=libraries/dagster-delta"
]
unknown = [ unknown = [
"fastapi", "fastapi",
"geopandas", "geopandas",

View File

@@ -1,4 +1,4 @@
import os import os
APP = os.environ["APP"] APP = os.environ["APP"]
STORAGE_DIR = os.environ.get("STORAGE_DIR", "/storage") + f"/{APP}" STORAGE_DIR = os.environ.get("STORAGE_DIR", "/storage")