refactor to allow for multiple code locations
This commit is contained in:
110
apps/vinyl/src/sounds/fetch.py
Normal file
110
apps/vinyl/src/sounds/fetch.py
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
|
||||
|
||||
|
||||
def get_page_count(html_content):
    """Return the highest page number in a listing page's pagination bar.

    Args:
        html_content: Raw HTML of a sounds.nl listing page.

    Returns:
        The largest integer found among ``ul.pagination li a`` link texts,
        or 1 when no numeric pagination links exist (single-page result).
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all pagination links
    page_links = soup.select("ul.pagination li a")

    # Extract the numbers from the link texts; non-numeric links
    # ("next", "previous", ...) are skipped by the isdigit() filter.
    page_numbers = [
        int(link.get_text()) for link in page_links if link.get_text().isdigit()
    ]

    # max() on an empty list raises ValueError; a page with no numeric
    # pagination links is a single-page result, so default to 1.
    return max(page_numbers, default=1)
|
||||
|
||||
|
||||
def parse_page(html_content):
    """Parse one search-results page into a DataFrame of product entries.

    Args:
        html_content: Raw HTML of a sounds.nl search-results page.

    Returns:
        A DataFrame with the fixed column order
        ``id, name, artist, title, price, supply, release, genre, detail``;
        columns with no scraped data are filled with NaN by reindex.
    """
    entries = []
    soup = BeautifulSoup(html_content, "html.parser")
    for product in soup.find_all("div", {"class": "search-product"}):
        # The product's anchor carries the item id in its rel attribute.
        item_id = product.find("a", rel=True)["rel"][0]
        name = product.find("h5").text.strip()
        # Split on the FIRST "-" only: split("-") crashed with IndexError
        # when the name had no dash, and truncated titles that themselves
        # contain dashes. partition keeps everything after the separator
        # and yields an empty title when no separator is present.
        artist, _, title = name.partition("-")
        # Price is displayed as e.g. "€ 9,95"; strip the currency symbol.
        price = (
            product.find("span", class_="product-price")
            .text.strip()
            .replace("€", "")
            .strip()
        )

        entry = {
            "id": item_id,
            "name": name,
            "artist": artist.strip(),
            "title": title.strip(),
            "price": price,
        }
        # Optional fields: only present on some products.
        if detail := product.find("h6", {"class": "hide-for-small"}):
            entry["detail"] = detail.text
        if supply := product.find("div", {"class": "product-voorraad"}):
            entry["supply"] = supply.text

        # "Genre: X" / "Releasedatum: Y" rows (Dutch site labels).
        for info in product.find_all("div", {"class": "product-info"}):
            info = info.text.split(":")
            if "Genre" in info[0]:
                entry["genre"] = info[1].strip()
            if "Releasedatum" in info[0]:
                entry["release"] = info[1].strip()
        entries.append(entry)

    return pd.DataFrame(entries).reindex(
        columns=[
            "id",
            "name",
            "artist",
            "title",
            "price",
            "supply",
            "release",
            "genre",
            "detail",
        ]
    )
|
||||
|
||||
|
||||
def fetch_deals():
    """Scrape all sale pages from sounds.nl and return one combined DataFrame.

    Returns:
        The concatenation of every parsed page, or an empty DataFrame with
        a minimal stable schema when nothing was scraped.
    """
    # Get page count from the first listing page (page numbers are 1-based).
    page_count = get_page_count(
        requests.get("https://www.sounds.nl/uitverkoop/1/lp/all/art").text
    )
    time.sleep(1)
    print(f"Number of pages: {page_count}")

    # Parse all pages. Iterate 1..page_count inclusive: the previous
    # range(page_count) fetched a non-existent page 0 and skipped the
    # final page of results.
    base_url = "https://www.sounds.nl/uitverkoop/{page_number}/lp/all"
    dfs = []
    for page in tqdm(range(1, page_count + 1)):
        df = parse_page(requests.get(base_url.format(page_number=page)).text)
        dfs.append(df)
        time.sleep(2)  # throttle requests to be polite to the server

    # Combine dfs; keep a stable schema when no pages yielded data.
    return pd.concat(dfs) if dfs else pd.DataFrame(columns=["id", "name", "price"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    df = fetch_deals()
    print(f"Found {len(df)} deals")

    # Show the ten cheapest current deals.
    print(df.sort_values(by="price").head(10))

    # Write to a timestamped CSV next to this script, so the tool works
    # from any checkout location instead of a hard-coded home directory.
    now = datetime.now()
    # NOTE(review): ":" in the timestamp is invalid in Windows filenames;
    # kept for backward compatibility with existing output files.
    prefix = now.strftime("%Y-%m-%d_%H:%M:%S")
    directory = Path(__file__).resolve().parent
    filepath = directory / f"{prefix}_sounds.csv"
    print(f"Writing data to {filepath}")
    df.to_csv(filepath)
|
||||
Reference in New Issue
Block a user