From bf38065e52de302dc65301f6493ae1bfe6e5e2d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B3=E5=85=83=E7=9A=93?= Date: Sat, 10 May 2025 23:21:36 +0800 Subject: [PATCH] Implement structural updates and optimizations across multiple modules --- README.md | 2 ++ scraping/main.py | 17 ++++++++++++++--- scraping/news.json | 1 + scraping/requirements.txt | 3 ++- 4 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 scraping/news.json diff --git a/README.md b/README.md index a5ea031..6b67d65 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # 新聞解析 / News Analyze +![](https://hackatime-badge.hackclub.com/U087ATD163V/news-analyize) + ## Why? 我們使用這個新聞來舉例: diff --git a/scraping/main.py b/scraping/main.py index ac3f97e..8cec59f 100644 --- a/scraping/main.py +++ b/scraping/main.py @@ -1,7 +1,18 @@ -from urllib.request import urlopen +import time +import re # Regular expressions +from urllib.request import urlopen # URL request lib. +from bs4 import BeautifulSoup # BeautifulSoup lib. -url = "https://tw.news.yahoo.com/" +headers = { + 'User-Agent': 'NewsSceraperBot/1.0 (news.yuanhau.com)' +} + +#url = "https://tw.news.yahoo.com/" +url = "https://news.google.com/home?hl=zh-TW&gl=TW&ceid=TW:zh-Hant" page = urlopen(url) +html_bytes = page.read() +html = html_bytes.decode("utf-8") +soup = BeautifulSoup(html, "html.parser") -page \ No newline at end of file +print(soup.find_all("article")) diff --git a/scraping/news.json b/scraping/news.json new file mode 100644 index 0000000..bf4e9b1 --- /dev/null +++ b/scraping/news.json @@ -0,0 +1 @@ +[
聯合新聞網
2個恐怖組織成印巴衝突導火線 虔誠軍、穆罕默德軍是何來頭?
,
奇摩新聞
印度與巴基斯坦達成全面停火 巴國當局重新開放領空
,
聯合新聞網
演哪齣?印巴互射一輪飛彈互攻 彼此突喊話「收手讓情勢降溫」
,
chinatimes.com
驚傳印軍「飆風戰機飛行員」被俘虜 媒體人訝異1事
,
奇摩股市
中美關稅談判在即…川普突發文:對中關稅80%合理 台股周一開盤恐遭波及漲勢疲軟
,
聯合新聞網
結束了?美中瑞士經貿會談 路透:雙方代表團離場
,
奇摩新聞
美中貿易戰僵局終於破冰 雙方代表在日內瓦展開關稅會談
,
中央社 CNA
川普提對中關稅降至80% 中國學者:不可接受| 兩岸
,
,
] diff --git a/scraping/requirements.txt b/scraping/requirements.txt index 6530294..3bc2c93 100644 --- a/scraping/requirements.txt +++ b/scraping/requirements.txt @@ -1 +1,2 @@ -urlopen \ No newline at end of file +urlopen +beautifulsoup4 \ No newline at end of file