chore: refactor ingestion
This commit is contained in:
@@ -4,3 +4,12 @@ storage:
|
|||||||
postgres:
|
postgres:
|
||||||
postgres_url:
|
postgres_url:
|
||||||
env: DAGSTER_POSTGRES_URL
|
env: DAGSTER_POSTGRES_URL
|
||||||
|
pool_size: 5
|
||||||
|
max_overflow: 5
|
||||||
|
|
||||||
|
# Limit concurrent runs to avoid overwhelming the VM and database.
|
||||||
|
concurrency:
|
||||||
|
default_op_concurrency_limit: 1
|
||||||
|
|
||||||
|
run_queue:
|
||||||
|
max_concurrent_runs: 1
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""Funda real-estate ingestion assets."""
|
"""Funda real-estate ingestion assets."""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from dagster import (
|
from dagster import (
|
||||||
@@ -10,6 +11,7 @@ from dagster import (
|
|||||||
MetadataValue,
|
MetadataValue,
|
||||||
asset,
|
asset,
|
||||||
)
|
)
|
||||||
|
from funda import Listing
|
||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
|
|
||||||
from data_platform.helpers import (
|
from data_platform.helpers import (
|
||||||
@@ -94,14 +96,14 @@ def raw_funda_search_results(
|
|||||||
|
|
||||||
all_listings = []
|
all_listings = []
|
||||||
for page in range(config.max_pages):
|
for page in range(config.max_pages):
|
||||||
context.log.info(f"Fetching search page {page + 1}/{config.max_pages} …")
|
context.log.info(f"Fetching search page {page + 1}/{config.max_pages}...")
|
||||||
kwargs["page"] = page
|
kwargs["page"] = page
|
||||||
results = client.search_listing(**kwargs)
|
results = client.search_listing(**kwargs)
|
||||||
if not results:
|
if not results:
|
||||||
context.log.info("No more results.")
|
context.log.info("No more results.")
|
||||||
break
|
break
|
||||||
all_listings.extend(results)
|
all_listings.extend(results)
|
||||||
context.log.info(f" got {len(results)} listings (total: {len(all_listings)})")
|
context.log.info(f"Got {len(results)} listings (total: {len(all_listings)}).")
|
||||||
|
|
||||||
if not all_listings:
|
if not all_listings:
|
||||||
context.log.warning("Search returned zero results.")
|
context.log.warning("Search returned zero results.")
|
||||||
@@ -109,7 +111,9 @@ def raw_funda_search_results(
|
|||||||
|
|
||||||
engine = postgres.get_engine()
|
engine = postgres.get_engine()
|
||||||
with engine.begin() as conn:
|
with engine.begin() as conn:
|
||||||
conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {_SCHEMA}"))
|
conn.execute(
|
||||||
|
text(render_sql(_SQL_DIR, "ddl/create_schema.sql", schema=_SCHEMA))
|
||||||
|
)
|
||||||
conn.execute(
|
conn.execute(
|
||||||
text(render_sql(_SQL_DIR, "ddl/create_search_results.sql", schema=_SCHEMA))
|
text(render_sql(_SQL_DIR, "ddl/create_search_results.sql", schema=_SCHEMA))
|
||||||
)
|
)
|
||||||
@@ -150,10 +154,11 @@ def raw_funda_search_results(
|
|||||||
with engine.begin() as conn:
|
with engine.begin() as conn:
|
||||||
result = conn.execute(
|
result = conn.execute(
|
||||||
text(
|
text(
|
||||||
f"UPDATE {_SCHEMA}.search_results"
|
render_sql(
|
||||||
f" SET is_active = FALSE"
|
_SQL_DIR,
|
||||||
f" WHERE last_seen_at < now() - INTERVAL '7 days'"
|
"dml/mark_inactive_search_results.sql",
|
||||||
f" RETURNING global_id"
|
schema=_SCHEMA,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
newly_inactive = result.rowcount
|
newly_inactive = result.rowcount
|
||||||
@@ -202,23 +207,21 @@ def raw_funda_listing_details(
|
|||||||
engine = postgres.get_engine()
|
engine = postgres.get_engine()
|
||||||
|
|
||||||
with engine.begin() as conn:
|
with engine.begin() as conn:
|
||||||
conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {_SCHEMA}"))
|
conn.execute(
|
||||||
|
text(render_sql(_SQL_DIR, "ddl/create_schema.sql", schema=_SCHEMA))
|
||||||
|
)
|
||||||
conn.execute(
|
conn.execute(
|
||||||
text(render_sql(_SQL_DIR, "ddl/create_listing_details.sql", schema=_SCHEMA))
|
text(render_sql(_SQL_DIR, "ddl/create_listing_details.sql", schema=_SCHEMA))
|
||||||
)
|
)
|
||||||
|
|
||||||
with engine.connect() as conn:
|
with engine.connect() as conn:
|
||||||
if config.fetch_all:
|
if config.fetch_all:
|
||||||
query = text(f"SELECT DISTINCT global_id FROM {_SCHEMA}.search_results")
|
query = text(
|
||||||
|
render_sql(_SQL_DIR, "dml/select_all_detail_ids.sql", schema=_SCHEMA)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
query = text(
|
query = text(
|
||||||
f"""
|
render_sql(_SQL_DIR, "dml/select_new_detail_ids.sql", schema=_SCHEMA)
|
||||||
SELECT DISTINCT s.global_id
|
|
||||||
FROM {_SCHEMA}.search_results s
|
|
||||||
LEFT JOIN {_SCHEMA}.listing_details d ON s.global_id = d.global_id
|
|
||||||
WHERE s.is_active = TRUE
|
|
||||||
AND (d.global_id IS NULL OR d.is_stale = TRUE)
|
|
||||||
"""
|
|
||||||
)
|
)
|
||||||
result = conn.execute(query)
|
result = conn.execute(query)
|
||||||
ids = [row[0] for row in result if row[0]]
|
ids = [row[0] for row in result if row[0]]
|
||||||
@@ -227,7 +230,7 @@ def raw_funda_listing_details(
|
|||||||
context.log.warning("No search results found – run funda_search_results first.")
|
context.log.warning("No search results found – run funda_search_results first.")
|
||||||
return MaterializeResult(metadata={"count": 0})
|
return MaterializeResult(metadata={"count": 0})
|
||||||
|
|
||||||
context.log.info(f"Fetching details for {len(ids)} listings …")
|
context.log.info(f"Fetching details for {len(ids)} listings...")
|
||||||
|
|
||||||
rows = []
|
rows = []
|
||||||
errors = 0
|
errors = 0
|
||||||
@@ -282,7 +285,9 @@ def raw_funda_listing_details(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if (i + 1) % 10 == 0:
|
if (i + 1) % 10 == 0:
|
||||||
context.log.info(f" fetched {i + 1}/{len(ids)} …")
|
context.log.info(f"Fetched {i + 1}/{len(ids)} listings.")
|
||||||
|
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
if rows:
|
if rows:
|
||||||
postgres.execute_many(
|
postgres.execute_many(
|
||||||
@@ -293,14 +298,11 @@ def raw_funda_listing_details(
|
|||||||
with engine.begin() as conn:
|
with engine.begin() as conn:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
text(
|
text(
|
||||||
f"""
|
render_sql(
|
||||||
UPDATE {_SCHEMA}.listing_details d
|
_SQL_DIR,
|
||||||
SET is_stale = TRUE
|
"dml/mark_stale_listing_details.sql",
|
||||||
FROM {_SCHEMA}.search_results s
|
schema=_SCHEMA,
|
||||||
WHERE d.global_id = s.global_id
|
)
|
||||||
AND s.is_active = FALSE
|
|
||||||
AND d.is_stale = FALSE
|
|
||||||
"""
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -345,47 +347,51 @@ def raw_funda_price_history(
|
|||||||
engine = postgres.get_engine()
|
engine = postgres.get_engine()
|
||||||
|
|
||||||
with engine.begin() as conn:
|
with engine.begin() as conn:
|
||||||
conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {_SCHEMA}"))
|
conn.execute(
|
||||||
|
text(render_sql(_SQL_DIR, "ddl/create_schema.sql", schema=_SCHEMA))
|
||||||
|
)
|
||||||
conn.execute(
|
conn.execute(
|
||||||
text(render_sql(_SQL_DIR, "ddl/create_price_history.sql", schema=_SCHEMA))
|
text(render_sql(_SQL_DIR, "ddl/create_price_history.sql", schema=_SCHEMA))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Fetch listing metadata (url, title, postcode) from the DB so we can call
|
||||||
|
# get_price_history without re-fetching each listing from the Funda API.
|
||||||
with engine.connect() as conn:
|
with engine.connect() as conn:
|
||||||
if config.fetch_all:
|
if config.fetch_all:
|
||||||
query = text(f"SELECT DISTINCT global_id FROM {_SCHEMA}.listing_details")
|
query = text(
|
||||||
|
render_sql(
|
||||||
|
_SQL_DIR,
|
||||||
|
"dml/select_all_price_history_listings.sql",
|
||||||
|
schema=_SCHEMA,
|
||||||
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
query = text(
|
query = text(
|
||||||
f"""
|
render_sql(
|
||||||
SELECT DISTINCT d.global_id
|
_SQL_DIR,
|
||||||
FROM {_SCHEMA}.listing_details d
|
"dml/select_new_price_history_listings.sql",
|
||||||
JOIN {_SCHEMA}.search_results s ON d.global_id = s.global_id
|
schema=_SCHEMA,
|
||||||
WHERE s.is_active = TRUE
|
)
|
||||||
UNION
|
|
||||||
SELECT DISTINCT d.global_id
|
|
||||||
FROM {_SCHEMA}.listing_details d
|
|
||||||
LEFT JOIN {_SCHEMA}.price_history p ON d.global_id = p.global_id
|
|
||||||
WHERE p.global_id IS NULL
|
|
||||||
"""
|
|
||||||
)
|
)
|
||||||
result = conn.execute(query)
|
result = conn.execute(query)
|
||||||
ids = [row[0] for row in result if row[0]]
|
listings = [(row[0], row[1], row[2], row[3]) for row in result if row[0]]
|
||||||
|
|
||||||
if not ids:
|
if not listings:
|
||||||
context.log.warning(
|
context.log.warning(
|
||||||
"No listing details found – run funda_listing_details first."
|
"No listing details found – run funda_listing_details first."
|
||||||
)
|
)
|
||||||
return MaterializeResult(metadata={"count": 0})
|
return MaterializeResult(metadata={"count": 0})
|
||||||
|
|
||||||
context.log.info(f"Fetching price history for {len(ids)} listings …")
|
context.log.info(f"Fetching price history for {len(listings)} listings...")
|
||||||
|
|
||||||
rows = []
|
batch_size = 25
|
||||||
|
total_rows = 0
|
||||||
errors = 0
|
errors = 0
|
||||||
for i, gid in enumerate(ids):
|
for i, (gid, url, title, postcode) in enumerate(listings):
|
||||||
try:
|
try:
|
||||||
listing = client.get_listing(int(gid))
|
stub = Listing(data={"url": url, "title": title, "postcode": postcode})
|
||||||
history = client.get_price_history(listing)
|
history = client.get_price_history(stub)
|
||||||
for entry in history:
|
rows = [
|
||||||
rows.append(
|
|
||||||
{
|
{
|
||||||
"global_id": gid,
|
"global_id": gid,
|
||||||
"price": safe_int(entry.get("price")),
|
"price": safe_int(entry.get("price")),
|
||||||
@@ -395,28 +401,35 @@ def raw_funda_price_history(
|
|||||||
"source": entry.get("source"),
|
"source": entry.get("source"),
|
||||||
"status": entry.get("status"),
|
"status": entry.get("status"),
|
||||||
}
|
}
|
||||||
|
for entry in history
|
||||||
|
]
|
||||||
|
if rows:
|
||||||
|
postgres.execute_many(
|
||||||
|
render_sql(
|
||||||
|
_SQL_DIR, "dml/insert_price_history.sql", schema=_SCHEMA
|
||||||
|
),
|
||||||
|
rows,
|
||||||
)
|
)
|
||||||
|
total_rows += len(rows)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
errors += 1
|
errors += 1
|
||||||
context.log.warning(f"Failed to fetch price history for {gid}: {e}")
|
context.log.warning(f"Failed to fetch price history for {gid}: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (i + 1) % 10 == 0:
|
if (i + 1) % batch_size == 0:
|
||||||
context.log.info(f" fetched {i + 1}/{len(ids)} …")
|
context.log.info(f"Fetched {i + 1}/{len(listings)} price histories.")
|
||||||
|
|
||||||
if rows:
|
time.sleep(1)
|
||||||
postgres.execute_many(
|
|
||||||
render_sql(_SQL_DIR, "dml/insert_price_history.sql", schema=_SCHEMA), rows
|
|
||||||
)
|
|
||||||
|
|
||||||
context.log.info(
|
context.log.info(
|
||||||
f"Inserted {len(rows)} price history records ({errors} errors) into {_SCHEMA}.price_history"
|
f"Inserted {total_rows} price history records ({errors} errors) "
|
||||||
|
f"into {_SCHEMA}.price_history"
|
||||||
)
|
)
|
||||||
|
|
||||||
return MaterializeResult(
|
return MaterializeResult(
|
||||||
metadata={
|
metadata={
|
||||||
"count": len(rows),
|
"count": total_rows,
|
||||||
"errors": errors,
|
"errors": errors,
|
||||||
"listings_processed": len(ids) - errors,
|
"listings_processed": len(listings) - errors,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
create schema if not exists {{ schema }};
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
update {{ schema }}.search_results
|
||||||
|
set is_active = false
|
||||||
|
where last_seen_at < now() - interval '7 days'
|
||||||
|
returning global_id
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
update {{ schema }}.listing_details d
|
||||||
|
set is_stale = true
|
||||||
|
from {{ schema }}.search_results as s
|
||||||
|
where
|
||||||
|
d.global_id = s.global_id
|
||||||
|
and s.is_active = false
|
||||||
|
and d.is_stale = false
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
select distinct global_id
|
||||||
|
from {{ schema }}.search_results
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
select distinct
|
||||||
|
d.global_id,
|
||||||
|
d.url,
|
||||||
|
d.title,
|
||||||
|
d.postcode
|
||||||
|
from {{ schema }}.listing_details as d
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
select distinct s.global_id
|
||||||
|
from {{ schema }}.search_results as s
|
||||||
|
left join {{ schema }}.listing_details as d on s.global_id = d.global_id
|
||||||
|
where
|
||||||
|
s.is_active = true
|
||||||
|
and (d.global_id is null or d.is_stale = true)
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
select distinct
|
||||||
|
d.global_id,
|
||||||
|
d.url,
|
||||||
|
d.title,
|
||||||
|
d.postcode
|
||||||
|
from {{ schema }}.listing_details as d
|
||||||
|
inner join {{ schema }}.search_results as s on d.global_id = s.global_id
|
||||||
|
where s.is_active = true
|
||||||
|
union
|
||||||
|
select distinct
|
||||||
|
d.global_id,
|
||||||
|
d.url,
|
||||||
|
d.title,
|
||||||
|
d.postcode
|
||||||
|
from {{ schema }}.listing_details as d
|
||||||
|
left join {{ schema }}.price_history as p on d.global_id = p.global_id
|
||||||
|
where p.global_id is null
|
||||||
@@ -1,4 +1,8 @@
|
|||||||
from dagster import Definitions
|
from dagster import (
|
||||||
|
AutomationConditionSensorDefinition,
|
||||||
|
DefaultSensorStatus,
|
||||||
|
Definitions,
|
||||||
|
)
|
||||||
from dagster_dbt import DbtCliResource
|
from dagster_dbt import DbtCliResource
|
||||||
|
|
||||||
from data_platform.assets.dbt import DBT_PROJECT_DIR, dbt_project_assets
|
from data_platform.assets.dbt import DBT_PROJECT_DIR, dbt_project_assets
|
||||||
@@ -30,6 +34,13 @@ defs = Definitions(
|
|||||||
]
|
]
|
||||||
),
|
),
|
||||||
jobs=[funda_ingestion_job, funda_raw_quality_job, elementary_refresh_job],
|
jobs=[funda_ingestion_job, funda_raw_quality_job, elementary_refresh_job],
|
||||||
|
sensors=[
|
||||||
|
AutomationConditionSensorDefinition(
|
||||||
|
name="automation_condition_sensor",
|
||||||
|
target="*",
|
||||||
|
default_status=DefaultSensorStatus.RUNNING,
|
||||||
|
),
|
||||||
|
],
|
||||||
schedules=[
|
schedules=[
|
||||||
funda_ingestion_schedule,
|
funda_ingestion_schedule,
|
||||||
funda_raw_quality_schedule,
|
funda_raw_quality_schedule,
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
from dagster import ConfigurableResource, EnvVar
|
from dagster import ConfigurableResource, EnvVar
|
||||||
from funda import Funda
|
from funda import Funda
|
||||||
from sqlalchemy import create_engine, text
|
from sqlalchemy import create_engine, text
|
||||||
|
from sqlalchemy.pool import NullPool
|
||||||
|
|
||||||
|
|
||||||
class FundaResource(ConfigurableResource):
|
class FundaResource(ConfigurableResource):
|
||||||
@@ -25,7 +26,7 @@ class PostgresResource(ConfigurableResource):
|
|||||||
|
|
||||||
def get_engine(self):
|
def get_engine(self):
|
||||||
url = f"postgresql://{self.user}:{self.password}@{self.host}:{self.port}/{self.dbname}"
|
url = f"postgresql://{self.user}:{self.password}@{self.host}:{self.port}/{self.dbname}"
|
||||||
return create_engine(url)
|
return create_engine(url, poolclass=NullPool)
|
||||||
|
|
||||||
def execute(self, statement: str, params: dict | None = None):
|
def execute(self, statement: str, params: dict | None = None):
|
||||||
engine = self.get_engine()
|
engine = self.get_engine()
|
||||||
|
|||||||
@@ -89,7 +89,7 @@ models:
|
|||||||
data_type: text
|
data_type: text
|
||||||
tests:
|
tests:
|
||||||
- accepted_values:
|
- accepted_values:
|
||||||
values: ["A3", "A2", "A1", "A", "B", "C", "D", "E", "F", "G"]
|
values: ["A4", "A3", "A2", "A1", "A", "B", "C", "D", "E", "F", "G"]
|
||||||
where: "energy_label is not null"
|
where: "energy_label is not null"
|
||||||
- name: living_area
|
- name: living_area
|
||||||
description: Interior floor area in m².
|
description: Interior floor area in m².
|
||||||
|
|||||||
@@ -260,9 +260,17 @@ class TestFundaPriceHistory:
|
|||||||
assert mat[0].metadata["count"].value == 0
|
assert mat[0].metadata["count"].value == 0
|
||||||
|
|
||||||
def test_price_history_inserted(self):
|
def test_price_history_inserted(self):
|
||||||
engine, _, _ = make_mock_engine(select_rows=[("1234567",)])
|
engine, _, _ = make_mock_engine(
|
||||||
|
select_rows=[
|
||||||
|
(
|
||||||
|
"1234567",
|
||||||
|
"https://www.funda.nl/detail/koop/amsterdam/app/87654321/",
|
||||||
|
"Teststraat 1",
|
||||||
|
"1234AB",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
client = MagicMock()
|
client = MagicMock()
|
||||||
client.get_listing.return_value = make_mock_listing(_DETAIL_LISTING_DATA)
|
|
||||||
client.get_price_history.return_value = [
|
client.get_price_history.return_value = [
|
||||||
{
|
{
|
||||||
"price": 350000,
|
"price": 350000,
|
||||||
|
|||||||
Reference in New Issue
Block a user