diff --git a/scraping/main.py b/scraping/main.py index 8cec59f..adb1805 100644 --- a/scraping/main.py +++ b/scraping/main.py @@ -1,10 +1,12 @@ +# Going to love the stupid tab tab tab tab system, instead of using braces. import time import re # Regular expressions from urllib.request import urlopen # URL request lib. from bs4 import BeautifulSoup # BeautifulSoup lib. +import json headers = { - 'User-Agent': 'NewsSceraperBot/1.0 (news.yuanhau.com)' + 'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)' } #url = "https://tw.news.yahoo.com/" @@ -14,5 +16,32 @@ page = urlopen(url) html_bytes = page.read() html = html_bytes.decode("utf-8") soup = BeautifulSoup(html, "html.parser") +hotarticles = soup.find_all("article") +news_data = [] +for article in hotarticles: + try: + title_elem = article.find('a', class_='gPFEn') + title = title_elem.text.strip() if title_elem else '' + + source_elem = article.find('div', class_='vr1PYe') + source = source_elem.text.strip() if source_elem else '' + + link_elem = article.find('a', class_='WwrzSb') + orglink = link_elem['href'] if link_elem else '' + link = re.sub(r'./read/', 'https://news.google.com/read/', orglink) + + article_data = { + 'title': title, + 'source': source, + 'link': link, + } + + news_data.append(article_data) + + except Exception as e: + print(f"Error processing article: {e}") + continue + +with open('news.json', 'w', encoding='utf-8') as f: + json.dump(news_data, f, ensure_ascii=False, indent=2) -print(soup.find_all("article")) diff --git a/scraping/news.json b/scraping/news.json index bf4e9b1..23bcd40 100644 --- a/scraping/news.json +++ b/scraping/news.json @@ -1 +1,52 @@ -[
聯合新聞網
2個恐怖組織成印巴衝突導火線 虔誠軍、穆罕默德軍是何來頭?
,
奇摩新聞
印度與巴基斯坦達成全面停火 巴國當局重新開放領空
,
聯合新聞網
演哪齣?印巴互射一輪飛彈互攻 彼此突喊話「收手讓情勢降溫」
,
chinatimes.com
驚傳印軍「飆風戰機飛行員」被俘虜 媒體人訝異1事
,
奇摩股市
中美關稅談判在即…川普突發文:對中關稅80%合理 台股周一開盤恐遭波及漲勢疲軟
,
聯合新聞網
結束了?美中瑞士經貿會談 路透:雙方代表團離場
,
奇摩新聞
美中貿易戰僵局終於破冰 雙方代表在日內瓦展開關稅會談
,
中央社 CNA
川普提對中關稅降至80% 中國學者:不可接受| 兩岸
,
,
] +[ + { + "title": "印度巴基斯坦同意立即全面停火川普稱美國居中調解| 國際", + "source": "中央社 CNA", + "link": "https://news.google.com/read/CBMiX0FVX3lxTE8zendQUzhpWHUyR2ZiVUZVRHFOd2x6eVB2ZEI3NHRONW80ZGlvX2c1Q3ktbWdDTldjeVR4R2VsS2VVNDNtRUFuMlhSZzNUWTMtaWlIcUpMRHhMdUxBU1Rj?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "演哪齣?印巴互射一輪飛彈互攻 彼此突喊話「收手讓情勢降溫」", + "source": "聯合新聞網", + "link": "https://news.google.com/read/CBMiU0FVX3lxTE8xNUNDQ0VobEhfdlBkbGZWcDhSTjhIYWlLWW84VVMwZjk3UDFEa0tibjRuV0xEcExsR05sTDI0V2ZjbVFpTDRPbEtlTVhyUjk2RzZR0gFYQVVfeXFMTVpXaWJQdVIyMXNvdEpVajJQZ0l5eDZEU2NPVzhudFFWY3RjcnNpZEROYXlTM3FUc3duY0lfQzhGRkxCUG4zb21rZWM5RlNvNkxtbzdVS1BkdA?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "霹靂15飛彈擊落飆風戰機 蘭寧利:解放軍的沉默令人憂心", + "source": "奇摩新聞", + "link": "https://news.google.com/read/CBMi7AJBVV95cUxNUjlsTFlSelh2OEJaZlpHM2JiLVZEcHZSdDVLc1RGalI2SmVKdXpuR2dUTW5obm83QTdvdzNfUms5NEpyUENJYzNJV3dXV1BjUDZWNDVlanpjcERFa2NDSFQ1RVUzbTJUN2NZTGtmbUtFZE5KUjhsMlA5MGs4LXFoVVdPaGRiQ1N5ZnZQQ1c1TEplYmdpeElkd1BkeE9vVHkxa01lcVlOR3VEeFJULXpqLVhld1JOdFBGY2doNm1YQjdHN3NDb1FZLS1DZE1CMWlSVVAxc2Uzb1N0MmpfREJyQlp4U3NOQVNCZ1ZDUXdRS2lSVjBmek1CZEdhYlJ5b0FLeGMzWFFyU3JQa3BUVHFmVnJOVmtROVJ6SmRqMTNRVVJaNWpfRHZ3bktVV0JXVzkxYmlhT2hJSzk1bkYxVnluN25XMWZrSXVkZFJzakF2NjhCOHRIVElEYzE3VjNQeno1OXNscUFGemNEYzFY?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "驚傳印軍「飆風戰機飛行員」被俘虜 媒體人訝異1事", + "source": "chinatimes.com", + "link": "https://news.google.com/read/CBMibkFVX3lxTFB5VmVtdDA3dXVUNG1yWXdGWjVDUnRzTTBNR3dNVng0UVhVSElTTW9FcU0xdXk0aEowUEtnSlZ5Q245ZGhWNEVPeUlDei1IMzRaTXdHWHU1UkwwTkxKVlhaWDNoY1E2YWNZU0R2LVdn?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "中美關稅談判在即…川普突發文:對中關稅80%合理 台股周一開盤恐遭波及漲勢疲軟", + "source": "奇摩股市", + "link": 
"https://news.google.com/read/CBMi6wJBVV95cUxNVElQS2Y3a0JJYlNPZ0hxd3FxYnVna2J6Vm9fTFo5RHQ0M21veFpMR3NKRlQycHBrS1VQckVZcE54cTRtMjBVVUtSQy05dEcxYnVzNW41OTRnVFBGZHpvWjkxRjdSNFJsOWJETmRMMkJsNFRlZ3d3QTRYY2JNTDM3am1JcWcwdjhVTGRUeHQxNWZSdHVYQ21Vd09hUUpVeHBpVll5allVTWVlUkZRV3dXMGZDQ29VVnQ4VWhGTWxCT3JNTHY1QnRTZjdza3g4aFJpLWdZR1RsNTlYT0xWSHJJei1sci1vdURERGJhWGdFQjFwbHFZSmk4Nll2WVhBWlJPUm9oQ21POWVmcEhBSFRvRjE2cTFNR09oNjJHd3YzOXgzd0I3ZFM0VkMxbjdmTkE1REhMdmt0VE84TXUxMmJKa0NzMDhybWhzcHFVcTE3aVhFZjRFV01jU2JMZk44S0d0YnppYXJfQVVyZzg?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "結束了?美中瑞士經貿會談 路透:雙方代表團離場", + "source": "聯合新聞網", + "link": "https://news.google.com/read/CBMiU0FVX3lxTE5lQzVwWFlqRHZCQ0xoeHZaR1RQSWhPR3lVYkg0UHpMT1NUcUZlZVMyV3hrbTNZLTFrOEdmNXBpc0hoMVZ0dlF5T2IyV0ZyNlNBSURN0gFYQVVfeXFMUDltV2ZsSTh0Z185VEhXSmpTZ1dBR3VTZ0N0VlljMGR6MnMzV1VhalRWWDJQR3o0aGhuRlJobmd3OWo1VHJjcFQ2VUZEMlNLV1dCNjFSY01VQw?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "美中貿易戰僵局終於破冰 雙方代表在日內瓦展開關稅會談", + "source": "奇摩新聞", + "link": "https://news.google.com/read/CBMigANBVV95cUxPZ2dwZUV4c3JjdUpZNE1XVDhFTlFqeGtwRHBLMy1maTYyMmdVUjFDRENpc2pWYlBaeC02R3E5bDNXaG5kXzBiZWRHQUxmV19LOXlxbGhSYmhYUVVwM25SbFhNcUk1MFRzM3lvb2lsU1BjMVNPOF9PN2RVaFJVVUx5cnpGby1xeFllSmlIQkZSNDE1X09TeEMwdTExWTRaakYtSHltM25IUm9jQ2ZBWXpfVWNYOVFoSVZTUVA0WTV0cllBWUFDVU1qZ29PbVlXQzIxUUpDR2RkR0w4Qm1vYzV1bjFyOExSczdlOWJRTkFYSlZ0YjB2NFFPNWJuTWswR2Z2NmJQZy1tLWhyb1gyWV9DbU9tNmRETkwyTVdoa3BBTUw3U3ZOeXdJSzYtQXhzWlcyNjdaWDVZQVJscmN1a2JVUE0wNlZpbUVjbUI1V244QXBBTTJ4TkpaSXMxWUtKal9BSldtUklvNm92WGtCM0VXcmp5UnhiZERmQ0NIZUZYN0Y?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "中美日內瓦談判已開始!專家:短期內難達成實質協議", + "source": "news.cnyes.com", + "link": "https://news.google.com/read/CBMiT0FVX3lxTE8tVTFQb0hzR01MR2tvckkyQktkdnJDeGpHdnlmT2lpSTAxNmlpMU0tNi01M0pGVk5SbkRkcTI1X1h4ZFNVZmo1R25ZWFFyelk?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "", + "source": "中央社 CNA", + "link": 
"https://news.google.com/read/CBMiX0FVX3lxTE5SZlVlM1VDdHpRTGUwSy1ReEwzTzFlTFVOUVp0NXRodG92UkthdF9wUTZucVMwUERaR0p0NW8yY3hmS3BtOXRyM0c4c1VwQTZhMEJsaDZ4Nm1UTllzR0gw?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + }, + { + "title": "", + "source": "自由時報", + "link": "https://news.google.com/read/CBMiZEFVX3lxTE92QnhFdzlJQklENWJfV1lrczhVajVfZUJ1elZuZFpBaDlDMUNoc2d2aGo1WkZsbmVMaWJaSXlfdFhFeVBGLXA5Z29aZ2FOV2RVYng1c3hUc0xuLTQ2ZUwzQy1iT0XSAWpBVV95cUxNOE1VSndXbkNLTWxoYlo1WWJ3SDVxZkRHTF9uOFhqUEl6MldDSkY0aFNLcy1ScFdrZUlUb2hkREFTandRSThWQ3FEdFRvTm9TT0xEQ0RKeUtSRW1NSnN6VmxIZWNfdVhoS2d3?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant" + } +] \ No newline at end of file