From 456b2ad33ec4d022be9593fd76c80ba4bab182e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=B3=E5=85=83=E7=9A=93?=
Date: Wed, 14 May 2025 00:17:26 +0800
Subject: [PATCH] Add Yahoo News article scraper; MSN scraper remains broken

---
 scraping/hot_articles.py |  2 --
 scraping/msn.py          | 57 ++++++++++++++++++++++++++++++++++++++++
 scraping/yahoo.py        | 57 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 scraping/msn.py
 create mode 100644 scraping/yahoo.py

diff --git a/scraping/hot_articles.py b/scraping/hot_articles.py
index 58ff574..3113b11 100644
--- a/scraping/hot_articles.py
+++ b/scraping/hot_articles.py
@@ -25,8 +25,6 @@ headers = {
     'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)'
 }
 
-#url = "https://tw.news.yahoo.com/"
-#url = "https://news.google.com/home?hl=zh-TW&gl=TW&ceid=TW:zh-Hant"
 url = "https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRFptTXpJU0JYcG9MVlJYS0FBUAE?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant"
 
 topiccwiz_css = "PO9Zff Ccj79 kUVvS"
diff --git a/scraping/msn.py b/scraping/msn.py
new file mode 100644
index 0000000..c8c0b0a
--- /dev/null
+++ b/scraping/msn.py
@@ -0,0 +1,57 @@
+# BROKEN
+import re
+from urllib.request import urlopen, Request
+import chardet
+from bs4 import BeautifulSoup
+import json
+import psycopg2
+import pandas as pd
+import dotenv
+import os
+import gzip
+import io
+
+# Load environment variables from .env file
+dotenv.load_dotenv()
+
+headers = {
+    #'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+    'Accept': '*/*',
+    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
+    'Accept-Encoding': 'gzip, deflate',
+    'Connection': 'keep-alive',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'same-origin',
+    'Cache-Control': 'max-age=0',
+}
+
+
+url = "https://www.msn.com/zh-tw/news/living/%E7%99%BD%E5%A4%A9-%E6%99%9A%E4%B8%8A%E9%81%8B%E5%8B%95%E5%93%AA%E5%80%8B%E5%A5%BD-%E9%86%AB%E6%8F%AD-1%E9%97%9C%E9%8D%B5-%E6%AF%94%E6%8C%91%E6%99%82%E9%96%93%E6%9B%B4%E9%87%8D%E8%A6%81/ar-AA1D4zTQ"
+
+try:
+    req = Request(url, headers=headers)
+    response = urlopen(req)
+    if response.info().get('Content-Encoding') == 'gzip':
+        gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
+        html = gzip_file.read().decode('utf-8')
+    else:
+        html = response.read().decode('utf-8')
+
+
+    soupsoup = BeautifulSoup(html, "html.parser")
+    soup = soupsoup.find('views-header-wc')
+    # Extract content
+    title = soup.find('h1', class_='viewsHeaderText')
+    title_text = title.text.strip() if title else "No title found"
+
+    article = soup.find('body', class_="article-body")
+    paragraph = article.text.strip() if article else ""
+
+    # Print results
+    print(f"Title: {title_text}")
+    print(f"Content: {paragraph}")
+
+except Exception as e:
+    print(f"Error: {str(e)}")
diff --git a/scraping/yahoo.py b/scraping/yahoo.py
new file mode 100644
index 0000000..1b4f5d3
--- /dev/null
+++ b/scraping/yahoo.py
@@ -0,0 +1,57 @@
+# BROKEN
+import re
+from urllib.request import urlopen, Request
+import chardet
+from bs4 import BeautifulSoup
+import json
+import psycopg2
+import pandas as pd
+import dotenv
+import os
+import gzip
+import io
+
+# Load environment variables from .env file
+dotenv.load_dotenv()
+
+headers = {
+    #'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+    'Accept': '*/*',
+    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
+    'Accept-Encoding': 'gzip, deflate',
+    'Connection': 'keep-alive',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'same-origin',
+    'Cache-Control': 'max-age=0',
+}
+
+
+url = "https://tw.news.yahoo.com/%E5%93%BD%E5%92%BD%E7%A8%B1%E8%80%81%E5%90%8C%E5%AD%B8%E6%9D%8E%E6%96%87%E5%AE%97-%E8%A2%AB%E5%86%A4%E7%8D%84-%E6%9F%AF%E6%96%87%E5%93%B2-%E4%BD%A0%E5%80%91%E5%8F%AA%E6%98%AF%E8%A6%81%E6%8A%BC%E6%88%91-%E5%85%B6%E4%BB%96%E5%85%88%E6%94%BE%E8%B5%B0-122535612.html"
+
+try:
+    req = Request(url, headers=headers)
+    response = urlopen(req)
+    if response.info().get('Content-Encoding') == 'gzip':
+        gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
+        html = gzip_file.read().decode('utf-8')
+    else:
+        html = response.read().decode('utf-8')
+
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Extract content
+    title = soup.find('h1', attrs={"data-test-locator": "headline"})
+    title_text = title.text.strip() if title else "No title found"
+
+    article = soup.find('div', class_="caas-body")
+    paragraph = article.text.strip() if article else ""
+
+    # Print results
+    print(f"Title: {title_text}")
+    print(f"Content: {paragraph}")
+
+except Exception as e:
+    print(f"Error: {str(e)}")