diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..9c62f82
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,13 @@
+# ── PostgreSQL ───────────────────────────────────────────────────────────────
+POSTGRES_HOST=postgres
+POSTGRES_PORT=5432
+POSTGRES_USER=dagster
+POSTGRES_PASSWORD=changeme
+POSTGRES_DB=dagster
+
+# ── Dagster metadata storage (uses the same postgres instance) ───────────────
+# Credentials are repeated in this URL: keep it in sync with the POSTGRES_* values above.
+DAGSTER_POSTGRES_URL=postgresql://dagster:changeme@postgres:5432/dagster
+
+# ── dbt profile target (overrides profiles.yml env_var defaults) ─────────────
+DBT_TARGET=dev
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8fcc4cc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Environment / secrets
+.env
+
+# Python
+__pycache__/
+*.py[cod]
+.venv/
+*.egg-info/
+dist/
+build/
+
+# uv: uv.lock is intentionally NOT ignored; commit it so that the image build
+# can install the exact locked versions with `uv sync --frozen`.
+
+# dbt
+dbt/target/
+dbt/dbt_packages/
+dbt/logs/
+
+# Dagster
+dagster_home/storage/
+dagster_home/logs/
+dagster_home/schedule_logs/
+dagster_home/compute_logs/
+
+# Docker
+*.log
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..dc5f65e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,29 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+# Install uv
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+
+# Install third-party dependencies before copying the full source (layer
+# caching). --no-install-project is required here because the data_platform
+# package has not been copied yet. The lockfile is used when it exists, and
+# install errors are surfaced instead of being masked by a silent fallback.
+COPY pyproject.toml uv.lock* ./
+RUN if [ -f uv.lock ]; then \
+        uv sync --frozen --no-dev --no-install-project; \
+    else \
+        uv sync --no-dev --no-install-project; \
+    fi
+
+# Copy application source
+# NOTE: without a .dockerignore this also copies a host .venv/ and .env
+# into the image; consider excluding them.
+COPY . .
+
+# Install the project itself now that its source is available
+RUN if [ -f uv.lock ]; then uv sync --frozen --no-dev; else uv sync --no-dev; fi
+
+# Make the venv's binaries available on PATH
+ENV PATH="/app/.venv/bin:$PATH"
+ENV DAGSTER_HOME=/app/dagster_home
diff --git a/README.md b/README.md
index c55c57e..73a2a1c 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,63 @@
-# data-platform
\ No newline at end of file
+# data-platform
+
+A [Dagster](https://dagster.io/) + [dbt](https://www.getdbt.com/) data platform,
+managed with [uv](https://github.com/astral-sh/uv) and deployed via Docker Compose.
+
+## Stack
+
+| Layer | Tool |
+|---|---|
+| Orchestration | Dagster (webserver + daemon) |
+| Transformation | dbt-core + dbt-postgres |
+| Storage | PostgreSQL 16 |
+| Package/venv | uv |
+| Secrets | `.env` file |
+
+## Project layout
+
+```
+data_platform/        # Dagster Python package (assets, definitions)
+dbt/                  # dbt project (models, seeds, tests)
+  profiles.yml        # reads credentials from env vars
+dagster_home/         # dagster.yaml + workspace.yaml
+Dockerfile            # single image used by both dagster services
+docker-compose.yaml   # postgres + dagster-webserver + dagster-daemon
+.env.example          # copy to .env and fill in credentials
+pyproject.toml        # uv-managed dependencies
+```
+
+## Getting started
+
+```bash
+# 1. Install uv (if not already installed)
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# 2. Clone and enter the project
+git clone <repository-url> data-platform && cd data-platform
+
+# 3. Create your credentials file
+cp .env.example .env
+# Edit .env and replace the placeholder passwords
+
+# 4. Install dependencies into a local venv
+uv sync
+
+# 5. Generate the dbt manifest (needed before first run; profiles.yml reads the .env values)
+uv run --env-file .env dbt parse --profiles-dir dbt --project-dir dbt
+
+# 6. Start all services
+docker compose up -d --build
+
+# 7. Open the Dagster UI
+# http://localhost:3000
+```
+
+## Local development (without Docker)
+
+```bash
+uv sync
+source .venv/bin/activate
+
+# Run the Dagster UI locally (workspace.yaml lives in dagster_home/, so pass it explicitly)
+DAGSTER_HOME=$PWD/dagster_home dagster dev -w dagster_home/workspace.yaml
+```
diff --git a/dagster_home/dagster.yaml b/dagster_home/dagster.yaml
new file mode 100644
index 0000000..9e2ce04
--- /dev/null
+++ b/dagster_home/dagster.yaml
@@ -0,0 +1,6 @@
+# Dagster stores run history, event logs, and schedules in PostgreSQL.
+# The connection URL is read from the DAGSTER_POSTGRES_URL environment variable.
+storage:
+  postgres:
+    postgres_url:
+      env: DAGSTER_POSTGRES_URL
diff --git a/dagster_home/workspace.yaml b/dagster_home/workspace.yaml
new file mode 100644
index 0000000..f5ea6fc
--- /dev/null
+++ b/dagster_home/workspace.yaml
@@ -0,0 +1,5 @@
+# Single code location: the `defs` attribute of the data_platform package.
+load_from:
+  - python_package:
+      package_name: data_platform
+      attribute: defs
diff --git a/data_platform/__init__.py b/data_platform/__init__.py
new file mode 100644
index 0000000..592d06a
--- /dev/null
+++ b/data_platform/__init__.py
@@ -0,0 +1,4 @@
+# Re-export `defs` so it is reachable as `data_platform.defs`.
+from data_platform.definitions import defs
+
+__all__ = ["defs"]
diff --git a/data_platform/definitions.py b/data_platform/definitions.py
new file mode 100644
index 0000000..3eb2300
--- /dev/null
+++ b/data_platform/definitions.py
@@ -0,0 +1,40 @@
+"""Dagster definitions that expose the dbt project in ``dbt/`` as assets."""
+
+from pathlib import Path
+
+from dagster import Definitions
+from dagster_dbt import DbtCliResource, DbtProject, dbt_assets
+
+# ---------------------------------------------------------------------------
+# dbt project
+# ---------------------------------------------------------------------------
+
+# <repo root>/dbt, resolved relative to this file rather than the working directory.
+DBT_PROJECT_DIR = Path(__file__).parent.parent / "dbt"
+
+dbt_project = DbtProject(project_dir=str(DBT_PROJECT_DIR))
+
+# Generate/refresh the dbt manifest automatically during local development.
+# NOTE(review): per the dagster-dbt docs this only takes effect under `dagster dev`; other runs presumably need an existing manifest — confirm.
+dbt_project.prepare_if_dev()
+
+
+# ---------------------------------------------------------------------------
+# dbt assets – every dbt model/test/snapshot becomes a Dagster asset
+# ---------------------------------------------------------------------------
+
+@dbt_assets(manifest=dbt_project.manifest_path)
+def dbt_project_assets(context, dbt: DbtCliResource):
+    yield from dbt.cli(["build"], context=context).stream()  # run `dbt build` and re-yield the events it streams
+
+
+# ---------------------------------------------------------------------------
+# Definitions
+# ---------------------------------------------------------------------------
+
+defs = Definitions(
+    assets=[dbt_project_assets],
+    resources={
+        "dbt": DbtCliResource(project_dir=str(DBT_PROJECT_DIR)),  # key matches the `dbt` parameter of dbt_project_assets
+    },
+)
diff --git a/dbt/dbt_project.yml b/dbt/dbt_project.yml
new file mode 100644
index 0000000..08a2da4
--- /dev/null
+++ b/dbt/dbt_project.yml
@@ -0,0 +1,22 @@
+name: data_platform
+version: "1.0.0"
+profile: data_platform  # must match the top-level key in dbt/profiles.yml
+
+model-paths: ["models"]
+seed-paths: ["seeds"]
+test-paths: ["tests"]
+analysis-paths: ["analyses"]
+macro-paths: ["macros"]
+snapshot-paths: ["snapshots"]
+
+target-path: "target"
+clean-targets:
+  - "target"
+  - "dbt_packages"
+
+models:
+  data_platform:
+    staging:  # applies to models under models/staging/
+      +materialized: view
+    marts:  # applies to models under models/marts/
+      +materialized: table
diff --git a/dbt/models/staging/schema.yml b/dbt/models/staging/schema.yml
new file mode 100644
index 0000000..47169dc
--- /dev/null
+++ b/dbt/models/staging/schema.yml
@@ -0,0 +1,12 @@
+version: 2
+
+models:
+  - name: stg_example
+    description: >
+      A placeholder staging model. Replace with your actual source tables.
+    columns:
+      - name: id
+        description: Primary key.
+        tests:  # NOTE(review): newer dbt releases prefer the `data_tests:` key — confirm against the installed dbt version
+          - unique
+          - not_null
diff --git a/dbt/models/staging/stg_example.sql b/dbt/models/staging/stg_example.sql
new file mode 100644
index 0000000..265e72c
--- /dev/null
+++ b/dbt/models/staging/stg_example.sql
@@ -0,0 +1,5 @@
+-- Placeholder staging model.
+{# Replace this with your actual source query, e.g.:
+   select * from {{ source('my_source', 'my_table') }}
+   Kept in a Jinja comment because dbt still renders Jinja inside "--" SQL comments. #}
+select 1 as id, 'example' as name
diff --git a/dbt/profiles.yml b/dbt/profiles.yml
new file mode 100644
index 0000000..88c988d
--- /dev/null
+++ b/dbt/profiles.yml
@@ -0,0 +1,12 @@
+data_platform:
+  target: "{{ env_var('DBT_TARGET', 'dev') }}"
+  outputs:
+    dev:
+      type: postgres
+      host: "{{ env_var('POSTGRES_HOST', 'localhost') }}"
+      port: "{{ env_var('POSTGRES_PORT', '5432') | int }}"
+      user: "{{ env_var('POSTGRES_USER') }}"
+      password: "{{ env_var('POSTGRES_PASSWORD') }}"
+      dbname: "{{ env_var('POSTGRES_DB') }}"
+      schema: staging
+      threads: 4
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..b6f598b
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,56 @@
+# No top-level `version:` key: it is obsolete in the Compose Specification.
+
+# Shared config for all dagster services
+x-dagster: &dagster-common
+  build:
+    context: .
+    dockerfile: Dockerfile
+  env_file: .env
+  depends_on:
+    postgres:
+      condition: service_healthy
+  restart: unless-stopped
+
+services:
+
+  # Metadata storage and dbt target
+  postgres:
+    image: postgres:16
+    container_name: postgres
+    restart: unless-stopped
+    env_file: .env
+    environment:
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      POSTGRES_DB: ${POSTGRES_DB}
+    volumes:
+      - postgres-data:/var/lib/postgresql/data
+    ports:
+      # Host address to publish on; override POSTGRES_BIND_ADDRESS per machine.
+      - "${POSTGRES_BIND_ADDRESS:-10.0.0.108}:5432:5432"
+    healthcheck:
+      # -d: probe the configured database instead of one named after the user
+      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  # Web UI
+  dagster-webserver:
+    <<: *dagster-common
+    container_name: dagster-webserver
+    # workspace.yaml lives in dagster_home/, not in the working directory (/app),
+    # so it has to be passed explicitly with -w.
+    command: ["dagster-webserver", "-h", "0.0.0.0", "-p", "3000", "-w", "/app/dagster_home/workspace.yaml"]
+    ports:
+      - "3000:3000"
+
+  # Schedules, sensors and run queuing
+  dagster-daemon:
+    <<: *dagster-common
+    container_name: dagster-daemon
+    # Same explicit workspace file as the webserver
+    command: ["dagster-daemon", "run", "-w", "/app/dagster_home/workspace.yaml"]
+
+volumes:
+  postgres-data:
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..03732d3
--- /dev/null
+++
b/pyproject.toml
@@ -0,0 +1,31 @@
+[project]
+name = "data-platform"
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+    "dagster",
+    "dagster-webserver",
+    "dagster-postgres",
+    "dagster-dbt",
+    "dbt-core",
+    "dbt-postgres",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["data_platform"]
+
+# Lets `dagster dev`, `dagster-webserver` and `dagster-daemon` find the code
+# location when started from the project root without an explicit -w/-m flag.
+[tool.dagster]
+module_name = "data_platform"
+
+# Development-only tools (PEP 735 group; supersedes the legacy tool.uv.dev-dependencies).
+# dagster-webserver is already a runtime dependency, so it is not repeated here.
+[dependency-groups]
+dev = [
+    "pytest",
+]