Made a micro scraper.

This commit is contained in:
吳元皓 2025-05-10 23:40:19 +08:00
parent bf38065e52
commit 089c90f79b
2 changed files with 83 additions and 3 deletions


@@ -1,10 +1,12 @@
# Going to love the stupid tab tab tab tab system, instead of using braces.
import time
import re # Regular expressions
from urllib.request import urlopen # URL request lib.
from bs4 import BeautifulSoup # BeautifulSoup lib.
import json
headers = {
    'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)'
}
#url = "https://tw.news.yahoo.com/"
@@ -14,5 +16,32 @@ page = urlopen(url)
html_bytes = page.read()
html = html_bytes.decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
hotarticles = soup.find_all("article")
news_data = []
for article in hotarticles:
    try:
        # Pull the headline, source name, and article link out of each card.
        title_elem = article.find('a', class_='gPFEn')
        title = title_elem.text.strip() if title_elem else ''
        source_elem = article.find('div', class_='vr1PYe')
        source = source_elem.text.strip() if source_elem else ''
        link_elem = article.find('a', class_='WwrzSb')
        orglink = link_elem['href'] if link_elem else ''
        # The hrefs come back relative ("./read/..."); rewrite them as absolute URLs.
        link = re.sub(r'./read/', 'https://news.google.com/read/', orglink)
        article_data = {
            'title': title,
            'source': source,
            'link': link,
        }
        news_data.append(article_data)
    except Exception as e:
        print(f"Error processing article: {e}")
        continue
# Dump everything that was collected to news.json.
with open('news.json', 'w', encoding='utf-8') as f:
    json.dump(news_data, f, ensure_ascii=False, indent=2)
print(soup.find_all("article"))
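For reference, a minimal sketch (not part of this commit) of how the news.json written above could be read back downstream; the file name and the title/source/link keys come from the scraper itself, everything else here is illustrative:

import json

# Load the articles the scraper wrote to news.json (UTF-8, pretty-printed).
with open('news.json', encoding='utf-8') as f:
    articles = json.load(f)

# Each entry carries the keys built in the scrape loop: title, source, link.
for item in articles[:5]:
    print(f"{item['source']}: {item['title']} -> {item['link']}")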

File diff suppressed because one or more lines are too long