move utils

This commit is contained in:
2025-08-04 18:02:00 +02:00
parent a3d931c1b3
commit f7318e85cd
5 changed files with 126 additions and 19 deletions

View File

@@ -1,19 +0,0 @@
import re
from collections.abc import Iterator
from datetime import date, datetime
from bs4 import BeautifulSoup
def extract_date(page_source: str) -> Iterator[date]:
    """Yield the first date found in the first <div> after the <header>.

    The div text is expected to contain a date like "3rd March 2021";
    an ordinal suffix (st/nd/rd/th) is tolerated and discarded.

    Args:
        page_source: Raw HTML of the page.

    Yields:
        The parsed calendar date (at most one).
    """
    soup = BeautifulSoup(page_source, "html.parser")
    # Find the first <div> sibling after </header>.
    if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
        # Capture day / optional ordinal suffix / month name / year.
        match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text)
        if match:
            day, _, month, year = match.groups()
            # strptime returns a datetime; yield only its date part so the
            # value matches the declared Iterator[date] — a datetime never
            # compares equal to a plain date with the same fields.
            parsed = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
            yield parsed.date()

View File

View File

@@ -0,0 +1,55 @@
import re
from collections.abc import Iterator
from datetime import date, datetime
import pandas as pd
from bs4 import BeautifulSoup
from pandas import DataFrame
def extract_date(page_source: str) -> Iterator[date]:
    """Yield the first date found in the first <div> after the <header>.

    The div text is expected to contain a date like "3rd March 2021";
    an ordinal suffix (st/nd/rd/th) is tolerated and discarded.

    Args:
        page_source: Raw HTML of the page.

    Yields:
        The parsed calendar date (at most one).
    """
    soup = BeautifulSoup(page_source, "html.parser")
    # Find the first <div> sibling after </header>.
    if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
        # Capture day / optional ordinal suffix / month name / year.
        match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text)
        if match:
            day, _, month, year = match.groups()
            # strptime returns a datetime; yield only its date part so the
            # value matches the declared Iterator[date] — a datetime never
            # compares equal to a plain date with the same fields.
            parsed = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
            yield parsed.date()
def extract_tables(
    page_source: str,
) -> Iterator[tuple[str | None, str | None, DataFrame]]:
    """Yield (title, description, table) triples for each accordion item.

    For every <div class="accordion-item"> in the page: the title is the
    h2 text of its "accordion-header" div (None when absent), the
    description is the first <p> of its "accordion-description" div (None
    when absent), and the table is the item's first <table> converted to
    a DataFrame with the first row used as column names. Items without a
    table, or whose table has no rows, are skipped.
    """
    soup = BeautifulSoup(page_source, "html.parser")
    for item in soup.find_all("div", class_="accordion-item"):
        # Title from the accordion header, when one exists.
        header = item.find("div", class_="accordion-header")
        title = None
        if header:
            title = header.find("h2").get_text(strip=True)
        # Description paragraph, when one exists.
        desc_block = item.find("div", class_="accordion-description")
        description = None
        if desc_block:
            description = desc_block.find("p").get_text(strip=True)
        table = item.find("table")
        if not table:
            continue
        # Collect cell text row by row (both header and data cells).
        rows = [
            [cell.get_text(strip=True) for cell in tr.find_all(["th", "td"])]
            for tr in table.find_all("tr")
        ]
        if rows:
            header_row, *body_rows = rows
            yield title, description, pd.DataFrame(body_rows, columns=header_row)

View File

@@ -0,0 +1,60 @@
async def scrape(url: str, output_path: str = "/cache/scraped_page.html") -> str:
    """Render *url* in headless Chromium, expand every toggle, return the HTML.

    Clicks each visible/enabled ".toggle-btn" on the page, scrolling down
    between clicks so lazily-loaded content appears, then captures the fully
    rendered page source, writes it to *output_path*, and returns it.

    Args:
        url: Page to scrape.
        output_path: Where the rendered HTML is saved (default keeps the
            previously hard-coded cache location).

    Returns:
        The rendered page source as a string.
    """
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
        page = await context.new_page()
        await page.goto(url, timeout=60000)
        # Wait until at least one toggle button is present.
        await page.wait_for_selector(".toggle-btn", timeout=20000)
        # Zoom out so more of the page fits in the viewport.
        await page.evaluate("document.body.style.zoom='50%'")
        toggle_buttons = await page.query_selector_all(".toggle-btn")
        print(f"Found {len(toggle_buttons)} toggle buttons")
        for i, btn in enumerate(toggle_buttons):
            try:
                # Only interact with buttons a user could actually click.
                if await btn.is_visible() and await btn.is_enabled():
                    await btn.click()
                    await page.wait_for_timeout(1000)
                if i == len(toggle_buttons) - 1:
                    # Nothing follows the last button; skip the scroll pass.
                    break
                # Scroll down gradually; re-read the height each step because
                # expanding accordion items grows the document.
                scroll_step = 500
                total_height = await page.evaluate("() => document.body.scrollHeight")
                current_position = 0
                while current_position < total_height:
                    await page.evaluate(f"window.scrollTo(0, {current_position});")
                    await page.wait_for_timeout(100)
                    current_position += scroll_step
                    total_height = await page.evaluate(
                        "() => document.body.scrollHeight"
                    )
            except Exception as e:
                # Best-effort: one failing button must not abort the scrape.
                print(f"Skipped button due to error: {e}")
        # Get the page content, then release the browser.
        page_source = await page.content()
        await browser.close()
    print("Scraping done")
    # utf-8 explicitly: page content is arbitrary Unicode and the platform
    # default encoding (e.g. cp1252 on Windows) could raise on write.
    with open(output_path, "w", encoding="utf-8") as fp:
        fp.write(page_source)
    return page_source

View File

@@ -0,0 +1,11 @@
import re
import unicodedata
def slugify(text: str) -> str:
    """Convert *text* to a URL-safe slug.

    Accented characters are transliterated to ASCII, everything that is not
    a word character, whitespace, or hyphen is dropped, and runs of
    whitespace/hyphens collapse to a single hyphen. Leading and trailing
    separators are stripped so a slug never starts or ends with "-" or "_"
    (the previous version could return e.g. "abc-" for input "abc-").

    >>> slugify("Héllo,  Wörld!")
    'hello-world'
    """
    # Decompose accented characters, then drop the non-ASCII combining marks.
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    # Keep only word chars, whitespace, and hyphens; normalize case.
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    # Collapse separator runs to one hyphen; strip edge separators.
    return re.sub(r"[-\s]+", "-", text).strip("-_")