Made a micro scraper.

2025-06-24 00:01:03 +08:00 · 2025-05-10 23:40:19 +08:00 · 2025-05-10 23:40:19 +08:00 · 089c90f79b
commit 089c90f79b
parent bf38065e52
2 changed files with 83 additions and 3 deletions
--- a/scraping/main.py
+++ b/scraping/main.py
@ -1,10 +1,12 @@
+# Going to love the stupid tab tab tab tab system, instead of using brances.
 import time
 import re # Regular expressions
 from urllib.request import urlopen # URL request lib.
 from bs4 import BeautifulSoup # BeautifulSoup lib.
+import json

 headers = {
-    'User-Agent': 'NewsSceraperBot/1.0 (news.yuanhau.com)'
+    'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)'
 }

 #url = "https://tw.news.yahoo.com/"
@ -14,5 +16,32 @@ page = urlopen(url)
 html_bytes = page.read()
 html = html_bytes.decode("utf-8")
 soup = BeautifulSoup(html, "html.parser")
+hotarticles = soup.find_all("article")
+news_data = []
+for article in hotarticles:
+    try:
+        title_elem = article.find('a', class_='gPFEn')
+        title = title_elem.text.strip() if title_elem else ''
+        
+        source_elem = article.find('div', class_='vr1PYe')
+        source = source_elem.text.strip() if source_elem else ''
+        
+        link_elem = article.find('a', class_='WwrzSb')
+        orglink = link_elem['href'] if link_elem else ''
+        link = re.sub(r'./read/', 'https://news.google.com/read/', orglink)
+        
+        article_data = {
+            'title': title,
+            'source': source,
+            'link': link,
+        }
+        
+        news_data.append(article_data)
+        
+    except Exception as e:
+        print(f"Error processing article: {e}")
+        continue
+
+with open('news.json', 'w', encoding='utf-8') as f:
+    json.dump(news_data, f, ensure_ascii=False, indent=2)

-print(soup.find_all("article"))
--- a/scraping/news.json
+++ b/scraping/news.json