Made a micro scraper.

This commit is contained in:
吳元皓 2025-05-10 23:40:19 +08:00
parent bf38065e52
commit 089c90f79b
2 changed files with 83 additions and 3 deletions


@@ -1,10 +1,12 @@
# Going to love the stupid tab tab tab tab system, instead of using braces.
import time
import re # Regular expressions
from urllib.request import urlopen # URL request lib.
from bs4 import BeautifulSoup # BeautifulSoup lib.
import json
headers = {
    'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)'
}
#url = "https://tw.news.yahoo.com/"
@@ -14,5 +16,32 @@ page = urlopen(url)
html_bytes = page.read()
html = html_bytes.decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
hotarticles = soup.find_all("article")
news_data = []
for article in hotarticles:
    try:
        # Pull the headline, source name, and article link out of each card.
        title_elem = article.find('a', class_='gPFEn')
        title = title_elem.text.strip() if title_elem else ''
        source_elem = article.find('div', class_='vr1PYe')
        source = source_elem.text.strip() if source_elem else ''
        link_elem = article.find('a', class_='WwrzSb')
        orglink = link_elem['href'] if link_elem else ''
        # The hrefs come back relative ("./read/..."); rewrite them as absolute URLs.
        link = re.sub(r'./read/', 'https://news.google.com/read/', orglink)
        article_data = {
            'title': title,
            'source': source,
            'link': link,
        }
        news_data.append(article_data)
    except Exception as e:
        print(f"Error processing article: {e}")
        continue
# Dump everything that was collected to news.json.
with open('news.json', 'w', encoding='utf-8') as f:
    json.dump(news_data, f, ensure_ascii=False, indent=2)
print(soup.find_all("article"))
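For reference, a minimal sketch (not part of this commit) of how the news.json written above could be read back downstream; the file name and the title/source/link keys come from the scraper itself, everything else here is illustrative:

import json

# Load the articles the scraper wrote to news.json (UTF-8, pretty-printed).
with open('news.json', encoding='utf-8') as f:
    articles = json.load(f)

# Each entry carries the keys built in the scrape loop: title, source, link.
for item in articles[:5]:
    print(f"{item['source']}: {item['title']} -> {item['link']}")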

File diff suppressed because one or more lines are too long