use playwright to scrape stocks page

2025-07-29 11:31:26 +02:00
parent 8aa943b0bf
commit 586a4ae904
4 changed files with 94 additions and 56 deletions

assets.py

@@ -1,72 +1,74 @@
-import time
+import asyncio
 from functools import partial
 from config import APP, URL
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
+from playwright.async_api import async_playwright
 import dagster as dg
 
-asset = partial(dg.asset, key_prefix=APP)
+TAGS = {"app": APP}
+asset = partial(dg.asset, key_prefix=APP, tags=TAGS)
 
-@asset(
-    io_manager_key="html_io_manager",
-    name="raw",
-    tags={"dagster/image": "dagster-code-stocks:playwright"},
-)
-def raw_html(context: dg.AssetExecutionContext) -> None:
-    # Start a headless browser
-    options = webdriver.ChromeOptions()
-    options.add_argument("--headless")
-    driver = webdriver.Chrome(options=options)
-    driver.set_window_size(1000, 2000)
-    # Load the page
-    driver.get(URL)
-    # Wait until at least one toggle button is present
-    wait = WebDriverWait(driver, 20)
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))
-    driver.execute_script("document.body.style.zoom='50%'")
-    # Find all toggle buttons (wait ensures DOM is ready)
-    toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
-    context.log.info(f"Found {len(toggle_buttons)} toggle buttons")
-    # Click each toggle button if it's visible and enabled
-    for i, btn in enumerate(toggle_buttons):
-        try:
-            # Wait until clickable and click
-            WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
-            btn.click()
-            time.sleep(1)  # short pause for content to load
-            if i == len(toggle_buttons) - 1:
-                break
-            scroll_step = 500
-            total_height = driver.execute_script("return document.body.scrollHeight")
-            current_position = 0
-            while current_position < total_height:
-                driver.execute_script(f"window.scrollTo(0, {current_position});")
-                time.sleep(0.1)
-                current_position += scroll_step
-                total_height = driver.execute_script(
-                    "return document.body.scrollHeight"
-                )
-        except Exception as e:
-            context.log.info(f"Skipped button due to error: {e}")
-    # Continue with scraping after all sections are expanded...
-    html = driver.page_source
-    # Close browser when done
-    driver.quit()
-    return html
+async def main() -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
+        page = await context.new_page()
+        await page.goto(URL, timeout=60000)
+        # Wait until at least one toggle button is present
+        await page.wait_for_selector(".toggle-btn", timeout=20000)
+        # Set zoom
+        await page.evaluate("document.body.style.zoom='50%'")
+        # Find all toggle buttons
+        toggle_buttons = await page.query_selector_all(".toggle-btn")
+        print(f"Found {len(toggle_buttons)} toggle buttons")
+        for i, btn in enumerate(toggle_buttons):
+            try:
+                # Ensure it's visible and enabled
+                if await btn.is_visible() and await btn.is_enabled():
+                    await btn.click()
+                    await page.wait_for_timeout(1000)
+                if i == len(toggle_buttons) - 1:
+                    break
+                # Scroll down gradually
+                scroll_step = 500
+                total_height = await page.evaluate("() => document.body.scrollHeight")
+                current_position = 0
+                while current_position < total_height:
+                    await page.evaluate(f"window.scrollTo(0, {current_position});")
+                    await page.wait_for_timeout(100)
+                    current_position += scroll_step
+                    total_height = await page.evaluate(
+                        "() => document.body.scrollHeight"
+                    )
+            except Exception as e:
+                print(f"Skipped button due to error: {e}")
+        # Get the page content
+        page_source = await page.content()
+        # Close the browser
+        await browser.close()
+        # Continue scraping logic here...
+        print("Scraping done")
+        # Save the page content to a file
+        with open("scraped_page.html", "w") as f:
+            f.write(page_source)
+        return page_source
+
+
+@asset(io_manager_key="html_io_manager", name="raw")
+def raw_html() -> str:
+    return asyncio.run(main())

config.py

@@ -2,3 +2,4 @@ import os
 from pathlib import Path
 
 APP = os.environ.get("APP", Path(__file__).parent.parent.name)
+URL = "https://www.hellostocks.ai/superinvestor/strategies"

definitions.py

@@ -4,6 +4,7 @@ import assets
 import sensors
 from dagster_polars import PolarsParquetIOManager
 from icecream import install
+from resources import HtmlIOManager
 
 import dagster as dg
 from dagster import load_assets_from_modules
@@ -12,15 +13,17 @@ install()
 APP = os.environ["APP"]
+storage_dir = os.environ.get("STORAGE_DIR", "/storage")
 
 definitions = dg.Definitions(
     assets=[
         asset.with_attributes(
             group_names_by_key={asset.key: APP},
-            tags_by_key={asset.key: {"app": APP}},
         )
         for asset in load_assets_from_modules([assets])
     ],
     resources={
+        "html_io_manager": HtmlIOManager(base_dir=storage_dir),
         "polars_parquet_io_manager": PolarsParquetIOManager(
             base_dir=os.environ.get("STORAGE_DIR", "/storage") + f"/{APP}"
         ),
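
HtmlIOManager is imported from resources, but its implementation is not part of this diff. A minimal sketch of what such an IO manager could look like, assuming a ConfigurableIOManager that stores string outputs as .html files under base_dir:

from pathlib import Path

import dagster as dg


class HtmlIOManager(dg.ConfigurableIOManager):
    # Hypothetical implementation; the real resources.HtmlIOManager is not shown in this commit.
    base_dir: str

    def _path(self, context) -> Path:
        # e.g. /storage/stocks/raw.html for the asset key [APP, "raw"]
        return Path(self.base_dir, *context.asset_key.path).with_suffix(".html")

    def handle_output(self, context: dg.OutputContext, obj: str) -> None:
        path = self._path(context)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(obj)

    def load_input(self, context: dg.InputContext) -> str:
        return self._path(context).read_text()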

sensors.py

@@ -1,7 +1,11 @@
+import re
 from collections.abc import Iterator
-from datetime import date
+from datetime import datetime
 
 import jobs
+import requests
+from bs4 import BeautifulSoup
+from config import URL
 
 import dagster as dg
@@ -10,6 +14,34 @@ import dagster as dg
 def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
     ic(context.cursor)
-    yield dg.RunRequest()
-    context.update_cursor(date.today().strftime("%Y-%m-%d"))
+    response = requests.get(URL)
+    response.raise_for_status()
+    try:
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(response.text, "html.parser")
+        # Find the first <div> after </header>
+        if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
+            # Extract date part using regex
+            match = re.search(
+                r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
+            )
+            if match:
+                day, _, month, year = match.groups()
+                date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
+                date_str = date.strftime("%Y-%m-%d")
+                context.log.info(f"Found date: {date_str}")
+                if date_str > (context.cursor or ""):
+                    context.update_cursor(date_str)
+                    yield dg.RunRequest()
+                return
+    except Exception as e:
+        context.log.error(f"Parsing error: {e}")
+    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    file = f"{now_str} stocks.html"
+    context.log.info(f"Saving file: {file}")
+    with open(f"/cache/{file}", "w") as fp:
+        fp.write(response.text)
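
The sensor's date extraction is easy to verify in isolation. A worked example of the regex and strptime call above, using a hypothetical snippet of div.text:

import re
from datetime import datetime

text = "Portfolio updated 29th July 2025"  # hypothetical page snippet
match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", text)
assert match is not None
day, _, month, year = match.groups()  # ("29", "th", "July", "2025")
date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
print(date.strftime("%Y-%m-%d"))  # 2025-07-29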