use playwright to scrape stocks page

commit 586a4ae904
parent 8aa943b0bf
2025-07-29 11:31:26 +02:00
4 changed files with 94 additions and 56 deletions

assets.py

@@ -1,72 +1,74 @@
-import time
+import asyncio
 from functools import partial
 from config import APP, URL
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
+from playwright.async_api import async_playwright
 import dagster as dg
-asset = partial(dg.asset, key_prefix=APP)
+TAGS = {"app": APP}
+asset = partial(dg.asset, key_prefix=APP, tags=TAGS)
-@asset(
-    io_manager_key="html_io_manager",
-    name="raw",
-    tags={"dagster/image": "dagster-code-stocks:playwright"},
-)
-def raw_html(context: dg.AssetExecutionContext) -> None:
-    # Start a headless browser
-    options = webdriver.ChromeOptions()
-    options.add_argument("--headless")
-    driver = webdriver.Chrome(options=options)
-    driver.set_window_size(1000, 2000)
+async def main() -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
+        page = await context.new_page()
-    # Load the page
-    driver.get(URL)
+        await page.goto(URL, timeout=60000)
-    # Wait until at least one toggle button is present
-    wait = WebDriverWait(driver, 20)
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))
+        # Wait until at least one toggle button is present
+        await page.wait_for_selector(".toggle-btn", timeout=20000)
-    driver.execute_script("document.body.style.zoom='50%'")
+        # Set zoom
+        await page.evaluate("document.body.style.zoom='50%'")
-    # Find all toggle buttons (wait ensures DOM is ready)
-    toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
-    context.log.info(f"Found {len(toggle_buttons)} toggle buttons")
+        # Find all toggle buttons
+        toggle_buttons = await page.query_selector_all(".toggle-btn")
+        print(f"Found {len(toggle_buttons)} toggle buttons")
-    # Click each toggle button if it's visible and enabled
-    for i, btn in enumerate(toggle_buttons):
-        try:
-            # Wait until clickable and click
-            WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
-            btn.click()
-            time.sleep(1)  # short pause for content to load
+        for i, btn in enumerate(toggle_buttons):
+            try:
+                # Ensure it's visible and enabled
+                if await btn.is_visible() and await btn.is_enabled():
+                    await btn.click()
+                    await page.wait_for_timeout(1000)
-            if i == len(toggle_buttons) - 1:
-                break
+                if i == len(toggle_buttons) - 1:
+                    break
-            scroll_step = 500
-            total_height = driver.execute_script("return document.body.scrollHeight")
-            current_position = 0
+                # Scroll down gradually
+                scroll_step = 500
+                total_height = await page.evaluate("() => document.body.scrollHeight")
+                current_position = 0
-            while current_position < total_height:
-                driver.execute_script(f"window.scrollTo(0, {current_position});")
-                time.sleep(0.1)
-                current_position += scroll_step
-                total_height = driver.execute_script(
-                    "return document.body.scrollHeight"
-                )
+                while current_position < total_height:
+                    await page.evaluate(f"window.scrollTo(0, {current_position});")
+                    await page.wait_for_timeout(100)
+                    current_position += scroll_step
+                    total_height = await page.evaluate(
+                        "() => document.body.scrollHeight"
+                    )
-        except Exception as e:
-            context.log.info(f"Skipped button due to error: {e}")
+            except Exception as e:
+                print(f"Skipped button due to error: {e}")
-    # Continue with scraping after all sections are expanded...
-    html = driver.page_source
+        # Get the page content
+        page_source = await page.content()
-    # Close browser when done
-    driver.quit()
+        # Close the browser
+        await browser.close()
-    return html
+        # Continue scraping logic here...
+        print("Scraping done")
+        # Save the page content to a file
+        with open("scraped_page.html", "w") as f:
+            f.write(page_source)
+        return page_source
+@asset(io_manager_key="html_io_manager", name="raw")
+def raw_html() -> str:
+    return asyncio.run(main())
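
The new layout separates browser automation (async main()) from the Dagster asset, which stays synchronous and simply drives the coroutine with asyncio.run(). A minimal sketch of that bridge, with illustrative names not taken from the commit:

    import asyncio
    from playwright.async_api import async_playwright

    async def fetch_html(url: str) -> str:
        # All browser work happens on Playwright's async API.
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            await page.goto(url, timeout=60000)
            html = await page.content()
            await browser.close()
            return html

    def fetch_html_sync(url: str) -> str:
        # Dagster assets are plain callables, so run the coroutine at the edge.
        return asyncio.run(fetch_html(url))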

config.py

@@ -2,3 +2,4 @@ import os
 from pathlib import Path
 APP = os.environ.get("APP", Path(__file__).parent.parent.name)
+URL = "https://www.hellostocks.ai/superinvestor/strategies"

definitions.py

@@ -4,6 +4,7 @@ import assets
 import sensors
 from dagster_polars import PolarsParquetIOManager
 from icecream import install
+from resources import HtmlIOManager
 import dagster as dg
 from dagster import load_assets_from_modules
@@ -12,15 +13,17 @@ install()
 APP = os.environ["APP"]
+storage_dir = os.environ.get("STORAGE_DIR", "/storage")
 definitions = dg.Definitions(
     assets=[
         asset.with_attributes(
             group_names_by_key={asset.key: APP},
             tags_by_key={asset.key: {"app": APP}},
         )
         for asset in load_assets_from_modules([assets])
     ],
     resources={
+        "html_io_manager": HtmlIOManager(base_dir=storage_dir),
         "polars_parquet_io_manager": PolarsParquetIOManager(
             base_dir=os.environ.get("STORAGE_DIR", "/storage") + f"/{APP}"
         ),
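
The HtmlIOManager wired in here lives in resources.py, which is not part of this diff. Assuming it follows dagster's ConfigurableIOManager pattern, a hypothetical sketch (names and paths are guesses, not the committed code):

    import os
    import dagster as dg

    class HtmlIOManager(dg.ConfigurableIOManager):
        # Hypothetical reconstruction; the real resources.py is not shown.
        base_dir: str

        def _path(self, context) -> str:
            # One .html file per asset key under base_dir.
            return os.path.join(self.base_dir, *context.asset_key.path) + ".html"

        def handle_output(self, context: dg.OutputContext, html: str) -> None:
            os.makedirs(os.path.dirname(self._path(context)), exist_ok=True)
            with open(self._path(context), "w") as f:
                f.write(html)

        def load_input(self, context: dg.InputContext) -> str:
            with open(self._path(context)) as f:
                return f.read()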

sensors.py

@@ -1,7 +1,11 @@
+import re
 from collections.abc import Iterator
-from datetime import date
+from datetime import datetime
 import jobs
+import requests
+from bs4 import BeautifulSoup
+from config import URL
 import dagster as dg
@@ -10,6 +14,34 @@ import dagster as dg
 def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
     ic(context.cursor)
-    yield dg.RunRequest()
+    response = requests.get(URL)
+    response.raise_for_status()
-    context.update_cursor(date.today().strftime("%Y-%m-%d"))
+    try:
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(response.text, "html.parser")
+        # Find the first <div> after </header>
+        if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
+            # Extract date part using regex
+            match = re.search(
+                r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
+            )
+            if match:
+                day, _, month, year = match.groups()
+                date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
+                date_str = date.strftime("%Y-%m-%d")
+                context.log.info(f"Found date: {date_str}")
+                if date_str > (context.cursor or ""):
+                    context.update_cursor(date_str)
+                    yield dg.RunRequest()
+                return
+    except Exception as e:
+        context.log.error(f"Parsing error: {e}")
+    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    file = f"{now_str} stocks.html"
+    context.log.info(f"Saving file: {file}")
+    with open(f"/cache/{file}", "w") as fp:
+        fp.write(response.text)
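
The sensor's regex converts the page's human-readable date into a sortable cursor string. A quick demonstration with a made-up snippet (the real page markup is not shown in this diff):

    import re
    from datetime import datetime

    text = "Last updated: 29th July 2025"  # hypothetical sample
    match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", text)
    if match:
        day, _, month, year = match.groups()
        parsed = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
        print(parsed.strftime("%Y-%m-%d"))  # 2025-07-29

Because "2025-07-29" sorts lexicographically in date order, a plain string comparison against the stored cursor is enough to detect a newer page.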