use polars and duckdb for lazy processing
This commit is contained in:
@@ -2,7 +2,6 @@ from datetime import datetime
|
||||
from glob import glob
|
||||
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import structlog
|
||||
from duckdb.typing import DATE, VARCHAR
|
||||
@@ -110,17 +109,13 @@ def deals(context):
|
||||
)
|
||||
|
||||
|
||||
@asset(deps=[deals], io_manager_key="duckdb_io_manager")
|
||||
def new_deals(
|
||||
context: OpExecutionContext,
|
||||
# duckdb: DuckDBResource
|
||||
) -> pd.DataFrame:
|
||||
@asset(deps=[deals], io_manager_key="polars_parquet_io_manager")
|
||||
def new_deals(context: OpExecutionContext) -> pl.DataFrame:
|
||||
ic()
|
||||
storage_dir = context.instance.storage_directory()
|
||||
asset_key = "deals"
|
||||
|
||||
with duckdb.connect() as con:
|
||||
# with duckdb.get_connection() as con:
|
||||
con.create_function("PARSE_DATE", parse_date, [VARCHAR], DATE)
|
||||
return con.execute(
|
||||
f"""
|
||||
@@ -167,16 +162,20 @@ def new_deals(
|
||||
WHERE rn = 1
|
||||
ORDER BY date ASC
|
||||
"""
|
||||
).df()
|
||||
).pl()
|
||||
|
||||
|
||||
@asset(
|
||||
# deps=[new_deals],
|
||||
io_manager_key="duckdb_io_manager"
|
||||
io_manager_key="polars_parquet_io_manager",
|
||||
)
|
||||
def works(new_deals: pd.DataFrame) -> pd.DataFrame:
|
||||
return new_deals[["artist", "title"]].drop_duplicates()
|
||||
# with duckdb.get_connection() as con:
|
||||
# return con.execute(
|
||||
# "SELECT DISTINCT artist, title, release FROM vinyl.public.new_deals"
|
||||
# ).df()
|
||||
def works(new_deals: pl.DataFrame) -> pl.DataFrame:
|
||||
# Pandas
|
||||
# columns = ["artist", "title"]
|
||||
# return pl.from_pandas(new_deals[columns].to_pandas().drop_duplicates())
|
||||
|
||||
# Polars
|
||||
# return new_deals[columns].unique(subset=columns)
|
||||
|
||||
# DuckDB
|
||||
with duckdb.connect() as con:
|
||||
return con.execute("SELECT DISTINCT artist, title, release FROM new_deals").pl()
|
||||
|
||||
@@ -23,8 +23,7 @@ vinyl = Definitions(
|
||||
assets=[deals, new_deals, works],
|
||||
resources={
|
||||
"polars_parquet_io_manager": PolarsParquetIOManager(),
|
||||
"duckdb_io_manager": PandasDuckDBIOManager(database="vinyl.duckdb"),
|
||||
# "duckdb": DuckDBResource(database="vinyl.duckdb")
|
||||
"duckdb_io_manager": PandasDuckDBIOManager(database="vinyl"),
|
||||
},
|
||||
jobs=[deals_job, check_partititions_job, musicbrainz_lookup_job],
|
||||
schedules=[deals_schedule],
|
||||
|
||||
Reference in New Issue
Block a user