From a7d8dfdbd33869523b5f463af39ae6922f906ac6 Mon Sep 17 00:00:00 2001 From: Rik Veenboer Date: Mon, 14 Oct 2024 11:24:03 +0200 Subject: [PATCH] update sounds fetch --- src/app/vinyl/sounds/fetch.py | 64 ++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/src/app/vinyl/sounds/fetch.py b/src/app/vinyl/sounds/fetch.py index 7be3185..219d1cb 100644 --- a/src/app/vinyl/sounds/fetch.py +++ b/src/app/vinyl/sounds/fetch.py @@ -24,29 +24,55 @@ def get_page_count(html_content): def parse_page(html_content): + entries = [] soup = BeautifulSoup(html_content, "html.parser") - - # Extract the name (artist - album) from the h5 tag - names = list(map(lambda x: x.get_text(strip=True), soup.find_all("h5"))) - - # Remove 'Telefoon', 'E-mail', 'Facebook' - names = list(filter(lambda x: " -" in x, names)) - - # Extract the numerical id from the a tag - ids = list(map(lambda x: x["rel"][0], soup.find_all("a", rel=True))) - - # Extract the price - prices = list( - map( - lambda x: float(x.get_text(strip=True).split()[1]), - soup.find_all("span", class_="product-price"), + for product in soup.find_all("div", {"class": "search-product"}): + item_id = product.find("a", rel=True)["rel"][0] + name = product.find("h5").text.strip() + artist_title = name.split("-") + artist = artist_title[0].strip() + title = artist_title[1].strip() + price = ( + product.find("span", class_="product-price") + .text.strip() + .replace("€", "") + .strip() ) + + entry = { + "id": item_id, + "name": name, + "artist": artist, + "title": title, + "price": price, + } + if detail := product.find("h6", {"class": "hide-for-small"}): + entry["detail"] = detail.text + if supply := product.find("div", {"class": "product-voorraad"}): + entry["supply"] = supply.text + + for info in product.find_all("div", {"class": "product-info"}): + info = info.text.split(":") + if "Genre" in info[0]: + entry["genre"] = info[1].strip() + if "Releasedatum" in info[0]: + entry["release"] = info[1].strip() + entries.append(entry) + + return pd.DataFrame(entries).reindex( + columns=[ + "id", + "name", + "artist", + "title", + "price", + "supply", + "release", + "genre", + "detail", + ] ) - df = pd.DataFrame({"id": ids, "name": names, "price": prices}) - - return df - def fetch_deals(): # Get page count