2024-10-14 12:49:32 +02:00
parent a7d8dfdbd3
commit ebef914be6
11 changed files with 138 additions and 142 deletions


@@ -14,14 +14,14 @@ from .scrape import *
 def scrape_plato(get=None):
     ic()
-    url = 'https://www.platomania.nl/vinyl-aanbiedingen?page=1'
+    url = "https://www.platomania.nl/vinyl-aanbiedingen?page=1"
     ic(url)
     soup = get_soup(url=url, get=get)
     articles_info = scrape_page(soup)
     ic(len(articles_info))
-    links = sorted(set(scrape_page_links(soup)), key=lambda x: int(x.split('=')[-1]))
+    links = sorted(set(scrape_page_links(soup)), key=lambda x: int(x.split("=")[-1]))
     for link in links:
         ic(link)
         soup = get_soup(url=link, get=get)
@@ -31,20 +31,20 @@ def scrape_plato(get=None):
         # break
     def clean(name):
-        tmp = ' '.join(reversed(name.split(', ')))
+        tmp = " ".join(reversed(name.split(", ")))
         tmp = tmp.lower()
-        tmp = re.sub(r'\s+\([^\)]*\)', '', tmp)
+        tmp = re.sub(r"\s+\([^\)]*\)", "", tmp)
         return tmp
     articles_df = pd.DataFrame(articles_info)
-    articles_df['_artist'] = articles_df['artist'].map(clean)
-    articles_df['_price'] = articles_df['price'].map(lambda x: float(x.split(' ')[-1]))
-    articles_df['_date'] = datetime.now()
+    articles_df["_artist"] = articles_df["artist"].map(clean)
+    articles_df["_price"] = articles_df["price"].map(lambda x: float(x.split(" ")[-1]))
+    articles_df["_date"] = datetime.now()
     return articles_df
-def update_database(articles_df=None, database_file='/home/user/plato.parquet'):
+def update_database(articles_df=None, database_file="/home/user/plato.parquet"):
     if os.path.exists(database_file):
         database_df = pd.read_parquet(database_file)
     else:
@@ -57,18 +57,22 @@ def update_database(articles_df=None, database_file='/home/user/plato.parquet'):
         articles_df.to_parquet(database_file)
         return articles_df, articles_df
-    compare = ['ean', '_price']
+    compare = ["ean", "_price"]
     check_df = pd.merge(
-        database_df[compare],
-        articles_df[compare],
-        how='right',
-        indicator=True
+        database_df[compare], articles_df[compare], how="right", indicator=True
     )
-    new_df = check_df[check_df['_merge'] == 'right_only'].drop(columns='_merge').merge(articles_df)
-    database_df = pd.concat([
-        database_df,
-        new_df
-    ]).sort_values('_date').groupby('ean').last().reset_index()
+    new_df = (
+        check_df[check_df["_merge"] == "right_only"]
+        .drop(columns="_merge")
+        .merge(articles_df)
+    )
+    database_df = (
+        pd.concat([database_df, new_df])
+        .sort_values("_date")
+        .groupby("ean")
+        .last()
+        .reset_index()
+    )
     database_df.to_parquet(database_file)
     return database_df, new_df
@@ -84,7 +88,7 @@ def send_email(lines):
     BODY_TEXT = ""
     # The HTML body of the email
-    tmp = '\n'.join(lines)
+    tmp = "\n".join(lines)
     BODY_HTML = f"""<html>
 <head></head>
 <body>
@@ -97,29 +101,31 @@ def send_email(lines):
     # Try to send the email
     try:
-        client = boto3.client('ses', region_name='eu-west-1') # Change the region as needed
+        client = boto3.client(
+            "ses", region_name="eu-west-1"
+        ) # Change the region as needed
         # Provide the contents of the email
         response = client.send_email(
             Destination={
-                'ToAddresses': [
+                "ToAddresses": [
                     RECIPIENT,
                 ],
             },
             Message={
-                'Body': {
-                    'Html': {
-                        'Charset': CHARSET,
-                        'Data': BODY_HTML,
+                "Body": {
+                    "Html": {
+                        "Charset": CHARSET,
+                        "Data": BODY_HTML,
                     },
-                    'Text': {
-                        'Charset': CHARSET,
-                        'Data': BODY_TEXT,
+                    "Text": {
+                        "Charset": CHARSET,
+                        "Data": BODY_TEXT,
                     },
                 },
-                'Subject': {
-                    'Charset': CHARSET,
-                    'Data': SUBJECT,
+                "Subject": {
+                    "Charset": CHARSET,
+                    "Data": SUBJECT,
                 },
             },
             Source=SENDER,
@@ -133,12 +139,12 @@ def send_email(lines):
         print(f"Error: {e}")
     else:
         print("Email sent! Message ID:"),
-        print(response['MessageId'])
+        print(response["MessageId"])
 def get(url, proxy=True):
     if proxy:
-        tmp = 'socks5://localhost:1080'
+        tmp = "socks5://localhost:1080"
         kwargs = dict(proxies=dict(http=tmp, https=tmp))
     else:
         kwargs = {}
@@ -146,16 +152,16 @@ def get(url, proxy=True):
 def main(dry=False):
-    load_dotenv('/opt/.env')
+    load_dotenv("/opt/.env")
-    local_ip = get('http://ifconfig.me', False).text
-    get_ip = get('http://ifconfig.me').text
-    print(f'Local IP = {local_ip}')
-    print(f'Request IP = {get_ip}')
+    local_ip = get("http://ifconfig.me", False).text
+    get_ip = get("http://ifconfig.me").text
+    print(f"Local IP = {local_ip}")
+    print(f"Request IP = {get_ip}")
     assert local_ip != get_ip
-    artists = open('/home/user/artists.txt').read().strip().splitlines()
-    print(f'Number of known artists = {len(artists)}')
+    artists = open("/home/user/artists.txt").read().strip().splitlines()
+    print(f"Number of known artists = {len(artists)}")
     if dry:
         articles_df = None
@@ -166,26 +172,28 @@ def main(dry=False):
     if dry:
         new_df = database_df.sample(20)
-    print(f'Database size = {len(database_df)}')
-    print(f'New = {len(new_df)}')
+    print(f"Database size = {len(database_df)}")
+    print(f"New = {len(new_df)}")
     # new_df = new_df[new_df['_artist'].isin(artists)].query('_price <= 25')
     new_df = new_df.query('_price <= 25 and ean != ""')
-    print(f'Interesting = {len(new_df)}')
+    print(f"Interesting = {len(new_df)}")
     if new_df is not None and len(new_df):
         message = []
         for _, row in new_df.head(10).iterrows():
-            message.append(f'<a href="https://www.platomania.nl{row.url}"><h1>NEW</h1></a>')
-            message.append('<ul>')
-            message.append(f'<li>[artist] {row.artist}</li>')
-            message.append(f'<li>[title] {row.title}</li>')
-            message.append(f'<li>[price] {row.price}</li>')
-            message.append(f'<li>[release] {row.release_date}</li>')
-            message.append('</ul>')
+            message.append(
+                f'<a href="https://www.platomania.nl{row.url}"><h1>NEW</h1></a>'
+            )
+            message.append("<ul>")
+            message.append(f"<li>[artist] {row.artist}</li>")
+            message.append(f"<li>[title] {row.title}</li>")
+            message.append(f"<li>[price] {row.price}</li>")
+            message.append(f"<li>[release] {row.release_date}</li>")
+            message.append("</ul>")
         send_email(message)
-if __name__ == '__main__':
+if __name__ == "__main__":
     cwd = os.path.dirname(__file__)
     main(dry=False)


@@ -2,8 +2,7 @@ import requests
 from bs4 import BeautifulSoup
-def get_soup(url, get = None):
+def get_soup(url, get=None):
     # Send a GET request to the specified URL
     if get is None:
         get = requests.get
@@ -12,21 +11,23 @@ def get_soup(url, get = None):
     # Check if the request was successful
     if response.status_code == 200:
         # Parse the HTML content of the page
-        return BeautifulSoup(response.content, 'html.parser')
+        return BeautifulSoup(response.content, "html.parser")
     else:
-        raise ValueError(f"Failed to retrieve the page. Status code: {response.status_code}")
+        raise ValueError(
+            f"Failed to retrieve the page. Status code: {response.status_code}"
+        )
 def scrape_page_links(soup):
     # Find all <li> elements with class "page-item"
-    page_items = soup.find_all('li', class_='page-item')
+    page_items = soup.find_all("li", class_="page-item")
     # Extract the href attribute of <a> tags within these <li> elements
     links = []
     for item in page_items:
-        a_tag = item.find('a', class_='page-link')
-        if a_tag and 'href' in a_tag.attrs:
-            links.append(a_tag['href'])
+        a_tag = item.find("a", class_="page-link")
+        if a_tag and "href" in a_tag.attrs:
+            links.append(a_tag["href"])
     return links
@@ -35,43 +36,44 @@ def extract_article_info(article):
     info = {}
     # Extract the artist name
-    artist_tag = article.find('h1', class_='product-card__artist')
-    info['artist'] = artist_tag.text.strip() if artist_tag else None
+    artist_tag = article.find("h1", class_="product-card__artist")
+    info["artist"] = artist_tag.text.strip() if artist_tag else None
     # Extract the title and URL
-    title_tag = article.find('h2', class_='product-card__title')
-    info['title'] = title_tag.text.strip() if title_tag else None
-    url_tag = title_tag.find_parent('a') if title_tag else None
-    info['url'] = url_tag['href'] if url_tag else None
+    title_tag = article.find("h2", class_="product-card__title")
+    info["title"] = title_tag.text.strip() if title_tag else None
+    url_tag = title_tag.find_parent("a") if title_tag else None
+    info["url"] = url_tag["href"] if url_tag else None
     # Extract additional details
-    details = article.find_all('div', class_='article-details__text')
+    details = article.find_all("div", class_="article-details__text")
     for detail in details:
         text = detail.text.strip()
-        if 'Label:' in text:
-            info['label'] = text.replace('Label: ', '').strip()
-        elif 'Releasedatum:' in text:
-            info['release_date'] = text.replace('Releasedatum: ', '').strip()
-        elif 'Herkomst:' in text:
-            info['origin'] = text.replace('Herkomst: ', '').strip()
-        elif 'Item-nr:' in text:
-            info['item_number'] = text.replace('Item-nr: ', '').strip()
-        elif 'EAN:' in text:
-            info['ean'] = text.replace('EAN:', '').strip()
+        if "Label:" in text:
+            info["label"] = text.replace("Label: ", "").strip()
+        elif "Releasedatum:" in text:
+            info["release_date"] = text.replace("Releasedatum: ", "").strip()
+        elif "Herkomst:" in text:
+            info["origin"] = text.replace("Herkomst: ", "").strip()
+        elif "Item-nr:" in text:
+            info["item_number"] = text.replace("Item-nr: ", "").strip()
+        elif "EAN:" in text:
+            info["ean"] = text.replace("EAN:", "").strip()
     # Extract delivery information
-    delivery_tag = article.find('div', class_='article-details__delivery-text')
-    info['delivery_info'] = delivery_tag.text.strip() if delivery_tag else None
+    delivery_tag = article.find("div", class_="article-details__delivery-text")
+    info["delivery_info"] = delivery_tag.text.strip() if delivery_tag else None
     # Extract price
-    price_tag = article.find('div', class_='article__price')
-    info['price'] = price_tag.text.strip() if price_tag else None
+    price_tag = article.find("div", class_="article__price")
+    info["price"] = price_tag.text.strip() if price_tag else None
     return info
 def scrape_page(soup):
     # Find all article blocks
-    article_blocks = soup.find_all('article', class_='article LP')
+    article_blocks = soup.find_all("article", class_="article LP")
     # Extract information from each article block
     return [extract_article_info(article) for article in article_blocks]