Mirror of https://github.com/hpware/news-analyze.git, synced 2025-06-24 00:01:03 +08:00
Made a micro scraper.

parent bf38065e52
commit 089c90f79b
@@ -1,10 +1,12 @@
 # Going to love the stupid tab tab tab tab system, instead of using braces.
 import time
 import re # Regular expressions
 from urllib.request import urlopen # URL request lib.
 from bs4 import BeautifulSoup # BeautifulSoup lib.
+import json
+
 headers = {
-    'User-Agent': 'NewsSceraperBot/1.0 (news.yuanhau.com)'
+    'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)'
 }
 
 #url = "https://tw.news.yahoo.com/"
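Note on the headers dict: the context line heading the next hunk shows page = urlopen(url), which never attaches these headers, so the request goes out with urllib's default User-Agent. A minimal sketch of actually sending the custom User-Agent via urllib.request.Request (the url value is an assumption; the script's real url assignment is not visible in this diff):

from urllib.request import Request, urlopen

headers = {
    'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)'
}

url = "https://news.google.com/"  # assumed target, inferred from the Google News selectors below

# Wrapping the URL in a Request attaches the custom headers;
# plain urlopen(url) ignores the headers dict entirely.
req = Request(url, headers=headers)
page = urlopen(req)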
@@ -14,5 +16,32 @@ page = urlopen(url)
 html_bytes = page.read()
 html = html_bytes.decode("utf-8")
 soup = BeautifulSoup(html, "html.parser")
+hotarticles = soup.find_all("article")
+news_data = []
+for article in hotarticles:
+    try:
+        title_elem = article.find('a', class_='gPFEn')
+        title = title_elem.text.strip() if title_elem else ''
+
+        source_elem = article.find('div', class_='vr1PYe')
+        source = source_elem.text.strip() if source_elem else ''
+
+        link_elem = article.find('a', class_='WwrzSb')
+        orglink = link_elem['href'] if link_elem else ''
+        link = re.sub(r'./read/', 'https://news.google.com/read/', orglink)
+
+        article_data = {
+            'title': title,
+            'source': source,
+            'link': link,
+        }
+
+        news_data.append(article_data)
+
+    except Exception as e:
+        print(f"Error processing article: {e}")
+        continue
+
+with open('news.json', 'w', encoding='utf-8') as f:
+    json.dump(news_data, f, ensure_ascii=False, indent=2)
+
 print(soup.find_all("article"))
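Two details in that hunk are worth flagging. The selectors gPFEn, vr1PYe, and WwrzSb are Google News's machine-generated class names, so the scraper will silently collect empty strings whenever Google reships its front end. And the re.sub pattern r'./read/' starts with an unescaped regex dot that matches any character, not a literal dot; it still works for the relative './read/...' hrefs that Google News article anchors carry. A quick check of the rewrite (the article token is made up for illustration):

import re

orglink = "./read/CBMiExampleToken"  # hypothetical href; real tokens are long opaque strings
link = re.sub(r'./read/', 'https://news.google.com/read/', orglink)
print(link)  # https://news.google.com/read/CBMiExampleToken

A stricter equivalent is orglink.replace('./read/', 'https://news.google.com/read/'), which avoids the regex wildcard altogether.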
File diff suppressed because one or more lines are too long
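To sanity-check what the scraper wrote, news.json can be loaded straight back (a sketch assuming a successful run against matching markup):

import json

with open('news.json', encoding='utf-8') as f:
    news = json.load(f)

print(f"{len(news)} articles scraped")
for item in news[:5]:  # peek at the first few entries
    print(f"{item['source']}: {item['title']}\n    {item['link']}")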