diff --git a/apps/stocks/src/assets.py b/apps/stocks/src/assets.py
index fad1ff3..b97d856 100644
--- a/apps/stocks/src/assets.py
+++ b/apps/stocks/src/assets.py
@@ -1,72 +1,74 @@
-import time
+import asyncio
 from functools import partial
 
 from config import APP, URL
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
+from playwright.async_api import async_playwright
 
 import dagster as dg
 
-asset = partial(dg.asset, key_prefix=APP)
+TAGS = {"app": APP}
+asset = partial(dg.asset, key_prefix=APP, tags=TAGS)
 
 
-@asset(
-    io_manager_key="html_io_manager",
-    name="raw",
-    tags={"dagster/image": "dagster-code-stocks:playwright"},
-)
-def raw_html(context: dg.AssetExecutionContext) -> None:
-    # Start a headless browser
-    options = webdriver.ChromeOptions()
-    options.add_argument("--headless")
-    driver = webdriver.Chrome(options=options)
-    driver.set_window_size(1000, 2000)
+async def main() -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
+        page = await context.new_page()
 
-    # Load the page
-    driver.get(URL)
+        await page.goto(URL, timeout=60000)
 
-    # Wait until at least one toggle button is present
-    wait = WebDriverWait(driver, 20)
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))
+        # Wait until at least one toggle button is present
+        await page.wait_for_selector(".toggle-btn", timeout=20000)
 
-    driver.execute_script("document.body.style.zoom='50%'")
+        # Set zoom
+        await page.evaluate("document.body.style.zoom='50%'")
 
-    # Find all toggle buttons (wait ensures DOM is ready)
-    toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
-    context.log.info(f"Found {len(toggle_buttons)} toggle buttons")
+        # Find all toggle buttons
+        toggle_buttons = await page.query_selector_all(".toggle-btn")
+        print(f"Found {len(toggle_buttons)} toggle buttons")
 
-    # Click each toggle button if it's visible and enabled
-    for i, btn in enumerate(toggle_buttons):
-        try:
-            # Wait until clickable and click
-            WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
-            btn.click()
-            time.sleep(1)  # short pause for content to load
+        for i, btn in enumerate(toggle_buttons):
+            try:
+                # Ensure it's visible and enabled
+                if await btn.is_visible() and await btn.is_enabled():
+                    await btn.click()
+                    await page.wait_for_timeout(1000)
 
-            if i == len(toggle_buttons) - 1:
-                break
+                if i == len(toggle_buttons) - 1:
+                    break
 
-            scroll_step = 500
-            total_height = driver.execute_script("return document.body.scrollHeight")
-            current_position = 0
+                # Scroll down gradually
+                scroll_step = 500
+                total_height = await page.evaluate("() => document.body.scrollHeight")
+                current_position = 0
 
-            while current_position < total_height:
-                driver.execute_script(f"window.scrollTo(0, {current_position});")
-                time.sleep(0.1)
-                current_position += scroll_step
-                total_height = driver.execute_script(
-                    "return document.body.scrollHeight"
-                )
+                while current_position < total_height:
+                    await page.evaluate(f"window.scrollTo(0, {current_position});")
+                    await page.wait_for_timeout(100)
+                    current_position += scroll_step
+                    total_height = await page.evaluate(
+                        "() => document.body.scrollHeight"
+                    )
 
-        except Exception as e:
-            context.log.info(f"Skipped button due to error: {e}")
+            except Exception as e:
+                print(f"Skipped button due to error: {e}")
 
-    # Continue with scraping after all sections are expanded...
-    html = driver.page_source
+        # Get the page content
+        page_source = await page.content()
 
-    # Close browser when done
-    driver.quit()
+        # Close the browser
+        await browser.close()
 
-    return html
+    # Continue scraping logic here...
+    print("Scraping done")
+
+    # Save the page content to a file
+    with open("scraped_page.html", "w") as f:
+        f.write(page_source)
+    return page_source
+
+
+@asset(io_manager_key="html_io_manager", name="raw")
+def raw_html() -> str:
+    return asyncio.run(main())
diff --git a/apps/stocks/src/config.py b/apps/stocks/src/config.py
index fefeadf..18d528e 100644
--- a/apps/stocks/src/config.py
+++ b/apps/stocks/src/config.py
@@ -2,3 +2,4 @@ import os
 from pathlib import Path
 
 APP = os.environ.get("APP", Path(__file__).parent.parent.name)
+URL = "https://www.hellostocks.ai/superinvestor/strategies"
diff --git a/apps/stocks/src/definitions.py b/apps/stocks/src/definitions.py
index db3528d..9a57ae3 100644
--- a/apps/stocks/src/definitions.py
+++ b/apps/stocks/src/definitions.py
@@ -4,6 +4,7 @@ import assets
 import sensors
 from dagster_polars import PolarsParquetIOManager
 from icecream import install
+from resources import HtmlIOManager
 
 import dagster as dg
 from dagster import load_assets_from_modules
@@ -12,15 +13,17 @@ install()
 
 APP = os.environ["APP"]
 
+storage_dir = os.environ.get("STORAGE_DIR", "/storage")
+
 definitions = dg.Definitions(
     assets=[
         asset.with_attributes(
             group_names_by_key={asset.key: APP},
-            tags_by_key={asset.key: {"app": APP}},
         )
         for asset in load_assets_from_modules([assets])
     ],
     resources={
+        "html_io_manager": HtmlIOManager(base_dir=storage_dir),
         "polars_parquet_io_manager": PolarsParquetIOManager(
             base_dir=os.environ.get("STORAGE_DIR", "/storage") + f"/{APP}"
         ),
diff --git a/apps/stocks/src/sensors.py b/apps/stocks/src/sensors.py
index bed19ab..e819db7 100644
--- a/apps/stocks/src/sensors.py
+++ b/apps/stocks/src/sensors.py
@@ -1,7 +1,11 @@
+import re
 from collections.abc import Iterator
-from datetime import date
+from datetime import datetime
 
 import jobs
+import requests
+from bs4 import BeautifulSoup
+from config import URL
 
 import dagster as dg
 
@@ -10,6 +14,34 @@ import dagster as dg
 def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
     ic(context.cursor)
 
-    yield dg.RunRequest()
+    response = requests.get(URL)
+    response.raise_for_status()
 
-    context.update_cursor(date.today().strftime("%Y-%m-%d"))
+    try:
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Find the first
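
Note: the definitions.py hunk wires up "html_io_manager" with HtmlIOManager(base_dir=storage_dir) imported from resources, but resources.py itself is not part of this diff. As a point of reference only, here is a minimal sketch of what such an IO manager could look like, assuming Dagster's ConfigurableIOManager base class. The class name and the base_dir parameter come from the hunk above; the one-.html-file-per-asset-key layout and the _path helper are assumptions for illustration, not the actual implementation.

# resources.py (hypothetical sketch, not included in this diff)
import os

from dagster import ConfigurableIOManager, InputContext, OutputContext


class HtmlIOManager(ConfigurableIOManager):
    """Stores string outputs as <base_dir>/<asset key path>.html and reads them back."""

    base_dir: str

    def _path(self, context: InputContext | OutputContext) -> str:
        # e.g. /storage/stocks/raw.html for asset key ["stocks", "raw"] (assumed layout)
        return os.path.join(self.base_dir, *context.asset_key.path) + ".html"

    def handle_output(self, context: OutputContext, obj: str) -> None:
        path = self._path(context)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            f.write(obj)

    def load_input(self, context: InputContext) -> str:
        with open(self._path(context)) as f:
            return f.read()

With wiring like this, the string returned by the raw asset is persisted by handle_output, independently of the local scraped_page.html file that main() also writes.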