feat: add cleanup to elementary op
This commit is contained in:
@@ -10,7 +10,17 @@ from sqlalchemy import create_engine, text
|
|||||||
_DBT_DIR = Path(__file__).parents[2] / "dbt"
|
_DBT_DIR = Path(__file__).parents[2] / "dbt"
|
||||||
|
|
||||||
|
|
||||||
def _elementary_schema_exists() -> bool:
|
_DAYS_BACK = 3
|
||||||
|
|
||||||
|
_CLEANUP_TABLES = [
|
||||||
|
"elementary_test_results",
|
||||||
|
"dbt_run_results",
|
||||||
|
"dbt_invocations",
|
||||||
|
"dbt_source_freshness_results",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_engine():
|
||||||
url = "postgresql://{user}:{password}@{host}:{port}/{dbname}".format(
|
url = "postgresql://{user}:{password}@{host}:{port}/{dbname}".format(
|
||||||
user=os.environ["POSTGRES_USER"],
|
user=os.environ["POSTGRES_USER"],
|
||||||
password=os.environ["POSTGRES_PASSWORD"],
|
password=os.environ["POSTGRES_PASSWORD"],
|
||||||
@@ -18,12 +28,16 @@ def _elementary_schema_exists() -> bool:
|
|||||||
port=os.environ.get("POSTGRES_PORT", "5432"),
|
port=os.environ.get("POSTGRES_PORT", "5432"),
|
||||||
dbname=os.environ["POSTGRES_DB"],
|
dbname=os.environ["POSTGRES_DB"],
|
||||||
)
|
)
|
||||||
engine = create_engine(
|
return create_engine(
|
||||||
url,
|
url,
|
||||||
pool_pre_ping=True,
|
pool_pre_ping=True,
|
||||||
connect_args={"connect_timeout": 10},
|
connect_args={"connect_timeout": 10},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _elementary_schema_exists() -> bool:
|
||||||
|
engine = _get_engine()
|
||||||
|
|
||||||
from data_platform.resources import _retry_on_operational_error
|
from data_platform.resources import _retry_on_operational_error
|
||||||
|
|
||||||
def _query():
|
def _query():
|
||||||
@@ -69,9 +83,34 @@ def elementary_run_models(context: OpExecutionContext) -> None:
|
|||||||
raise Exception(f"dbt run elementary failed with exit code {returncode}")
|
raise Exception(f"dbt run elementary failed with exit code {returncode}")
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_old_elementary_data(context: OpExecutionContext) -> None:
    """Delete elementary rows older than ``_DAYS_BACK`` days.

    Every table listed in ``_CLEANUP_TABLES`` (in the ``elementary`` schema) is
    trimmed inside a single transaction, so a failure on any table rolls back
    the whole cleanup. The intent is to keep the history small enough that
    report generation does not run out of memory.

    Args:
        context: Op execution context; only ``context.log`` is used.

    Raises:
        Exception: Any error raised by the database driver / SQLAlchemy while
            connecting or deleting is propagated unchanged.
    """
    engine = _get_engine()
    deleted_by_table = []
    try:
        with engine.begin() as conn:
            for table in _CLEANUP_TABLES:
                # Identifiers cannot be bound parameters, so the table name is
                # interpolated; it only ever comes from the module constant.
                # NOTE(review): assumes ``created_at`` is a timestamp-typed
                # column on every listed table -- confirm against the schema.
                statement = text(
                    f"DELETE FROM elementary.{table} "  # noqa: S608
                    "WHERE created_at < now() - make_interval(days => :days)"
                )
                deleted = conn.execute(statement, {"days": _DAYS_BACK}).rowcount
                # Drivers may report -1 (or None) when the count is unknown;
                # only count rows that were positively reported as deleted.
                if deleted and deleted > 0:
                    deleted_by_table.append((table, deleted))
    finally:
        # _get_engine() builds a fresh engine on every call; release its pool
        # so repeated op runs do not accumulate idle connections.
        engine.dispose()

    # Log only after the transaction has committed, so the messages never
    # describe deletions that were rolled back.
    total = 0
    for table, deleted in deleted_by_table:
        context.log.info(f"Cleaned up {deleted} old rows from elementary.{table}")
        total += deleted

    if total:
        context.log.info(f"Total rows cleaned: {total}")
    else:
        context.log.info("No old elementary data to clean up.")
|
||||||
|
|
||||||
|
|
||||||
@op(ins={"after": In(Nothing)})
|
@op(ins={"after": In(Nothing)})
|
||||||
def elementary_generate_report(context: OpExecutionContext) -> None:
|
def elementary_generate_report(context: OpExecutionContext) -> None:
|
||||||
"""Run edr report to regenerate the Elementary HTML report."""
|
"""Run edr report to regenerate the Elementary HTML report."""
|
||||||
|
_cleanup_old_elementary_data(context)
|
||||||
|
|
||||||
report_path = (
|
report_path = (
|
||||||
Path(__file__).parents[2] / "dbt" / "edr_target" / "elementary_report.html"
|
Path(__file__).parents[2] / "dbt" / "edr_target" / "elementary_report.html"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from data_platform.ops.check_source_freshness import (
|
|||||||
SourceFreshnessConfig,
|
SourceFreshnessConfig,
|
||||||
)
|
)
|
||||||
from data_platform.ops.elementary import (
|
from data_platform.ops.elementary import (
|
||||||
|
_cleanup_old_elementary_data,
|
||||||
_elementary_schema_exists,
|
_elementary_schema_exists,
|
||||||
elementary_generate_report,
|
elementary_generate_report,
|
||||||
elementary_run_models,
|
elementary_run_models,
|
||||||
@@ -108,21 +109,62 @@ class TestElementaryRunModels:
|
|||||||
# elementary_generate_report
|
# elementary_generate_report
|
||||||
|
|
||||||
|
|
||||||
|
class TestCleanupOldElementaryData:
    """Unit tests for ``_cleanup_old_elementary_data`` using a mocked engine."""

    # Tables the cleanup is expected to trim (mirrors the op module's list).
    _TABLES = (
        "elementary_test_results",
        "dbt_run_results",
        "dbt_invocations",
        "dbt_source_freshness_results",
    )

    @staticmethod
    def _mock_engine(rowcount):
        """Return ``(engine, conn)`` mocks whose every DELETE reports *rowcount*."""
        from unittest.mock import MagicMock

        conn = MagicMock()
        conn.execute.return_value.rowcount = rowcount
        engine = MagicMock()
        # ``with engine.begin() as conn`` must yield the mocked connection and
        # must not swallow exceptions raised inside the block.
        engine.begin.return_value.__enter__.return_value = conn
        engine.begin.return_value.__exit__.return_value = False
        return engine, conn

    @patch("data_platform.ops.elementary._get_engine")
    def test_deletes_old_rows(self, mock_get_engine):
        """One DELETE is issued per cleanup table."""
        engine, conn = self._mock_engine(rowcount=5)
        mock_get_engine.return_value = engine

        context = build_op_context()
        _cleanup_old_elementary_data(context)

        statements = [str(call[0][0]) for call in conn.execute.call_args_list]
        assert len(statements) == len(self._TABLES)
        for table in self._TABLES:
            # Trailing space guards against one table name prefixing another.
            assert any(
                f"DELETE FROM elementary.{table} " in sql for sql in statements
            )

    @patch("data_platform.ops.elementary._get_engine")
    def test_logs_total_when_rows_deleted(self, mock_get_engine):
        """The summed row count across all tables is logged."""
        from unittest.mock import MagicMock

        engine, _ = self._mock_engine(rowcount=5)
        mock_get_engine.return_value = engine

        # A mock context lets the log calls be inspected directly.
        context = MagicMock()
        _cleanup_old_elementary_data(context)

        context.log.info.assert_any_call(
            f"Total rows cleaned: {5 * len(self._TABLES)}"
        )

    @patch("data_platform.ops.elementary._get_engine")
    def test_logs_when_no_rows_deleted(self, mock_get_engine):
        """With nothing deleted, only the 'nothing to clean' message is logged."""
        from unittest.mock import MagicMock

        engine, conn = self._mock_engine(rowcount=0)
        mock_get_engine.return_value = engine

        context = MagicMock()
        _cleanup_old_elementary_data(context)

        assert conn.execute.call_count == len(self._TABLES)
        context.log.info.assert_called_once_with(
            "No old elementary data to clean up."
        )
|
||||||
|
|
||||||
|
|
||||||
|
# elementary_generate_report
|
||||||
|
|
||||||
|
|
||||||
|
@patch("data_platform.ops.elementary._cleanup_old_elementary_data")
|
||||||
class TestElementaryGenerateReport:
|
class TestElementaryGenerateReport:
|
||||||
@patch("data_platform.ops.elementary.subprocess.Popen")
|
@patch("data_platform.ops.elementary.subprocess.Popen")
|
||||||
def test_calls_edr_report(self, mock_popen):
|
def test_calls_edr_report(self, mock_popen, mock_cleanup):
|
||||||
mock_popen.return_value = _mock_popen(
|
mock_popen.return_value = _mock_popen(
|
||||||
returncode=0, stdout_lines=["report generated\n"]
|
returncode=0, stdout_lines=["report generated\n"]
|
||||||
)
|
)
|
||||||
context = build_op_context()
|
context = build_op_context()
|
||||||
elementary_generate_report(context)
|
elementary_generate_report(context)
|
||||||
|
mock_cleanup.assert_called_once()
|
||||||
mock_popen.assert_called_once()
|
mock_popen.assert_called_once()
|
||||||
args = mock_popen.call_args[0][0]
|
args = mock_popen.call_args[0][0]
|
||||||
assert "edr" in args
|
assert "edr" in args
|
||||||
assert "report" in args
|
assert "report" in args
|
||||||
|
|
||||||
@patch("data_platform.ops.elementary.subprocess.Popen")
|
@patch("data_platform.ops.elementary.subprocess.Popen")
|
||||||
def test_raises_on_failure(self, mock_popen):
|
def test_raises_on_failure(self, mock_popen, mock_cleanup):
|
||||||
mock_popen.return_value = _mock_popen(
|
mock_popen.return_value = _mock_popen(
|
||||||
returncode=1, stdout_lines=["fatal error\n"]
|
returncode=1, stdout_lines=["fatal error\n"]
|
||||||
)
|
)
|
||||||
@@ -131,7 +173,7 @@ class TestElementaryGenerateReport:
|
|||||||
elementary_generate_report(context)
|
elementary_generate_report(context)
|
||||||
|
|
||||||
@patch("data_platform.ops.elementary.subprocess.Popen")
|
@patch("data_platform.ops.elementary.subprocess.Popen")
|
||||||
def test_success_returns_none(self, mock_popen):
|
def test_success_returns_none(self, mock_popen, mock_cleanup):
|
||||||
mock_popen.return_value = _mock_popen(returncode=0, stdout_lines=["done\n"])
|
mock_popen.return_value = _mock_popen(returncode=0, stdout_lines=["done\n"])
|
||||||
context = build_op_context()
|
context = build_op_context()
|
||||||
result = elementary_generate_report(context)
|
result = elementary_generate_report(context)
|
||||||
|
|||||||
Reference in New Issue
Block a user