improve sql
This commit is contained in:
@@ -109,6 +109,7 @@ def deals(context):
|
|||||||
|
|
||||||
@asset(deps=[deals], io_manager_key="polars_parquet_io_manager")
|
@asset(deps=[deals], io_manager_key="polars_parquet_io_manager")
|
||||||
def new_deals(context: OpExecutionContext) -> pl.DataFrame:
|
def new_deals(context: OpExecutionContext) -> pl.DataFrame:
|
||||||
|
"""Combine deals from Plato and Sounds into a single DataFrame."""
|
||||||
ic()
|
ic()
|
||||||
storage_dir = context.resources.polars_parquet_io_manager.base_dir
|
storage_dir = context.resources.polars_parquet_io_manager.base_dir
|
||||||
asset_key = "deals"
|
asset_key = "deals"
|
||||||
@@ -126,26 +127,34 @@ def new_deals(context: OpExecutionContext) -> pl.DataFrame:
|
|||||||
LOWER(title) AS title,
|
LOWER(title) AS title,
|
||||||
CAST(_date AS DATE) AS release,
|
CAST(_date AS DATE) AS release,
|
||||||
CAST(_price AS FLOAT) AS price,
|
CAST(_price AS FLOAT) AS price,
|
||||||
CONCAT('https://www.platomania.nl', url) AS url,
|
CONCAT('https://www.platomania.nl', url) AS url
|
||||||
FROM read_parquet('{storage_dir}/{asset_key}/*/plato.parquet', union_by_name = true)
|
FROM read_parquet(
|
||||||
), tmp_sounds AS (
|
'{storage_dir}/{asset_key}/*/plato.parquet',
|
||||||
|
union_by_name = true
|
||||||
|
)
|
||||||
|
),
|
||||||
|
tmp_sounds AS (
|
||||||
SELECT
|
SELECT
|
||||||
source,
|
source,
|
||||||
date,
|
date,
|
||||||
id,
|
id,
|
||||||
LOWER(TRIM(COALESCE(artist, SPLIT(name, '-')[1]))) AS artist,
|
LOWER(TRIM(COALESCE(artist, SPLIT(name, '-')[1]))) AS artist,
|
||||||
LOWER(TRIM(COALESCE(title, ARRAY_TO_STRING(split(name, '-')[2:], '-')))) AS title,
|
LOWER(TRIM(COALESCE(
|
||||||
|
title,
|
||||||
|
ARRAY_TO_STRING(SPLIT(name, '-')[2:], '-')
|
||||||
|
))) AS title,
|
||||||
PARSE_DATE(release) AS release,
|
PARSE_DATE(release) AS release,
|
||||||
CAST(price AS FLOAT) AS price,
|
CAST(price AS FLOAT) AS price,
|
||||||
CONCAT('https://www.sounds.nl/detail/', id) AS url
|
CONCAT('https://www.sounds.nl/detail/', id) AS url
|
||||||
FROM read_parquet('{storage_dir}/{asset_key}/*/sounds.parquet', union_by_name = true)
|
FROM read_parquet(
|
||||||
), tmp_both AS (
|
'{storage_dir}/{asset_key}/*/sounds.parquet',
|
||||||
SELECT * FROM tmp_plato UNION ALL SELECT * FROM tmp_sounds
|
union_by_name = true
|
||||||
), tmp_rn AS (
|
)
|
||||||
SELECT
|
),
|
||||||
*,
|
tmp_both AS (
|
||||||
ROW_NUMBER() OVER(PARTITION BY source, id, artist, title, price ORDER BY date DESC) as rn
|
SELECT * FROM tmp_plato
|
||||||
FROM tmp_both
|
UNION ALL
|
||||||
|
SELECT * FROM tmp_sounds
|
||||||
)
|
)
|
||||||
SELECT
|
SELECT
|
||||||
source,
|
source,
|
||||||
@@ -156,8 +165,11 @@ def new_deals(context: OpExecutionContext) -> pl.DataFrame:
|
|||||||
release,
|
release,
|
||||||
price,
|
price,
|
||||||
url
|
url
|
||||||
FROM tmp_rn
|
FROM tmp_both
|
||||||
WHERE rn = 1
|
QUALIFY ROW_NUMBER() OVER (
|
||||||
|
PARTITION BY source, id, artist, title, price
|
||||||
|
ORDER BY date DESC
|
||||||
|
) = 1
|
||||||
ORDER BY date ASC
|
ORDER BY date ASC
|
||||||
"""
|
"""
|
||||||
).pl()
|
).pl()
|
||||||
|
|||||||
Reference in New Issue
Block a user