move parsing logic
This commit is contained in:
@@ -4,9 +4,10 @@ from glob import glob
|
|||||||
import polars as pl
|
import polars as pl
|
||||||
import structlog
|
import structlog
|
||||||
from plato.fetch import scrape_plato
|
from plato.fetch import scrape_plato
|
||||||
|
from plato.parse import parse as parse_plato
|
||||||
from shared.utils import get_partition_keys, parse_partition_keys
|
from shared.utils import get_partition_keys, parse_partition_keys
|
||||||
from sounds.fetch import fetch_deals
|
from sounds.fetch import fetch_deals
|
||||||
from utils import parse_date
|
from sounds.parse import parse as parse_sounds
|
||||||
|
|
||||||
import dagster as dg
|
import dagster as dg
|
||||||
|
|
||||||
@@ -103,43 +104,6 @@ def new_deals(
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def parse_plato(df: pl.LazyFrame) -> pl.LazyFrame:
    """Parse the Plato DataFrame.

    Runs a Polars SQL query over ``df`` that:

    * casts ``date`` and ``_date`` to dates (``_date`` becomes ``release``),
    * exposes ``ean`` as ``id`` and ``_artist`` as ``artist``,
    * lower-cases ``title`` and casts ``_price`` to a float ``price``,
    * prefixes ``url`` with ``https://www.platomania.nl``,
    * keeps one row per (source, id, artist, title, price), the one with the
      most recent ``date``, and orders the result by ``date`` ascending.

    Args:
        df: Frame providing the columns ``source``, ``date``, ``ean``,
            ``_artist``, ``title``, ``_date``, ``_price`` and ``url``.

    Returns:
        The result of the SQL query as produced by ``pl.sql``.
    """
    # NOTE: the query names its table ``df``; ``pl.sql`` resolves that name
    # from the variables in scope here, so the parameter must stay ``df``.
    return pl.sql(
        """
        SELECT source,
        CAST(date AS DATE) AS date,
        ean AS id,
        _artist AS artist,
        LOWER(title) AS title,
        CAST(_date AS DATE) AS release,
        CAST(_price AS FLOAT) AS price,
        CONCAT('https://www.platomania.nl', url) AS url
        FROM df
        QUALIFY ROW_NUMBER() OVER (PARTITION BY source, id, artist, title, price ORDER BY date DESC) = 1
        ORDER BY date ASC
        """
    )
|
|
||||||
|
|
||||||
|
|
||||||
def parse_sounds(df: pl.LazyFrame) -> pl.LazyFrame:
    """Parse the Sounds DataFrame.

    Replaces/adds the following columns on ``df``:

    * ``artist`` - the existing ``artist`` value, or, when that is null, the
      second ``-``-separated piece of ``name``; stripped and lower-cased.
    * ``title`` - the existing ``title`` value, or, when that is null, the
      pieces of ``name`` from the third onward re-joined with ``-``;
      stripped and lower-cased.
    * ``release`` - ``release`` passed element-wise through ``parse_date``
      and typed as a date.
    * ``price`` - ``price`` cast to ``Float64``.
    * ``url`` - the Sounds detail page URL built from ``id``.

    Args:
        df: Frame with ``artist``, ``title``, ``name``, ``release``,
            ``price`` and ``id`` columns.

    Returns:
        ``df`` with the columns above added or overwritten.
    """
    # ``name`` split on "-" is the fallback source for both artist and title.
    name_parts = pl.col("name").str.split("-")

    artist = (
        pl.coalesce(pl.col("artist"), name_parts.list.get(1))
        .str.strip_chars()
        .str.to_lowercase()
    )
    title = (
        pl.coalesce(pl.col("title"), name_parts.list.slice(2).list.join("-"))
        .str.strip_chars()
        .str.to_lowercase()
    )
    release = pl.col("release").map_elements(parse_date, return_dtype=pl.Date)
    price = pl.col("price").cast(pl.Float64)
    url = pl.format("https://www.sounds.nl/detail/{}", pl.col("id"))

    return df.with_columns(
        artist=artist,
        title=title,
        release=release,
        price=price,
        url=url,
    )
|
|
||||||
|
|
||||||
|
|
||||||
@dg.asset(
|
@dg.asset(
|
||||||
io_manager_key="polars_parquet_io_manager",
|
io_manager_key="polars_parquet_io_manager",
|
||||||
partitions_def=deals.partitions_def,
|
partitions_def=deals.partitions_def,
|
||||||
|
|||||||
21
apps/vinyl/src/plato/parse.py
Normal file
21
apps/vinyl/src/plato/parse.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import polars as pl
|
||||||
|
|
||||||
|
|
||||||
|
def parse(df: pl.LazyFrame) -> pl.LazyFrame:
    """Parse the Plato DataFrame.

    Runs a Polars SQL query over ``df`` that:

    * casts ``date`` and ``_date`` to dates (``_date`` becomes ``release``),
    * exposes ``ean`` as ``id`` and ``_artist`` as ``artist``,
    * lower-cases ``title`` and casts ``_price`` to a float ``price``,
    * prefixes ``url`` with ``https://www.platomania.nl``,
    * keeps one row per (source, id, artist, title, price), the one with the
      most recent ``date``, and orders the result by ``date`` ascending.

    Args:
        df: Frame providing the columns ``source``, ``date``, ``ean``,
            ``_artist``, ``title``, ``_date``, ``_price`` and ``url``.

    Returns:
        The result of the SQL query as produced by ``pl.sql``.
    """
    # NOTE: the query names its table ``df``; ``pl.sql`` resolves that name
    # from the variables in scope here, so the parameter must stay ``df``.
    return pl.sql(
        """
        SELECT source,
        CAST(date AS DATE) AS date,
        ean AS id,
        _artist AS artist,
        LOWER(title) AS title,
        CAST(_date AS DATE) AS release,
        CAST(_price AS FLOAT) AS price,
        CONCAT('https://www.platomania.nl', url) AS url
        FROM df
        QUALIFY ROW_NUMBER() OVER (PARTITION BY source, id, artist, title, price ORDER BY date DESC) = 1
        ORDER BY date ASC
        """
    )
|
||||||
19
apps/vinyl/src/sounds/parse.py
Normal file
19
apps/vinyl/src/sounds/parse.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
import polars as pl
|
||||||
|
from utils import parse_date
|
||||||
|
|
||||||
|
|
||||||
|
def parse(df: pl.LazyFrame) -> pl.LazyFrame:
    """Parse the Sounds DataFrame.

    Replaces/adds the following columns on ``df``:

    * ``artist`` - the existing ``artist`` value, or, when that is null, the
      second ``-``-separated piece of ``name``; stripped and lower-cased.
    * ``title`` - the existing ``title`` value, or, when that is null, the
      pieces of ``name`` from the third onward re-joined with ``-``;
      stripped and lower-cased.
    * ``release`` - ``release`` passed element-wise through ``parse_date``
      and typed as a date.
    * ``price`` - ``price`` cast to ``Float64``.
    * ``url`` - the Sounds detail page URL built from ``id``.

    Args:
        df: Frame with ``artist``, ``title``, ``name``, ``release``,
            ``price`` and ``id`` columns.

    Returns:
        ``df`` with the columns above added or overwritten.
    """
    # ``name`` split on "-" is the fallback source for both artist and title.
    name_parts = pl.col("name").str.split("-")

    artist = (
        pl.coalesce(pl.col("artist"), name_parts.list.get(1))
        .str.strip_chars()
        .str.to_lowercase()
    )
    title = (
        pl.coalesce(pl.col("title"), name_parts.list.slice(2).list.join("-"))
        .str.strip_chars()
        .str.to_lowercase()
    )
    release = pl.col("release").map_elements(parse_date, return_dtype=pl.Date)
    price = pl.col("price").cast(pl.Float64)
    url = pl.format("https://www.sounds.nl/detail/{}", pl.col("id"))

    return df.with_columns(
        artist=artist,
        title=title,
        release=release,
        price=price,
        url=url,
    )
|
||||||
Reference in New Issue
Block a user