吳元皓 81012f5061 Made the line_today.py kinda work ig. But I have no idea how can I run
this without issues in prod tho. and the "BlurPageBeforeLogin" thing
works just file, oh and checkCookie is now working (but without the
database part just yet)
2025-05-17 23:31:55 +08:00

66 lines
2.0 KiB
Python

# THIS WORKS :D
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io
import argparse
parser = argparse.ArgumentParser(description="A LINE Today Scraper.")
parser.add_argument("-s", "--slug", type=str, help="The article URL like: oqmazXP")
args = parser.parse_args()
if not args.slug:
print("No Slug entered, please use -s oqmazXP as a demo.")
exit(1)
# Load environment variables from .env file
dotenv.load_dotenv()
headers = {
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze) (A little note: If you see this, It means that your website is being scraped by other people, not the user hpware. Please keep that in mind at don't spam issues, I can't fix it.)',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': '*',
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Cache-Control': 'max-age=0',
}
try:
url = "https://today.line.me/tw/v2/article/" + args.slug
req = Request(url, headers=headers)
response = urlopen(req)
if response.info().get('Content-Encoding') == 'gzip':
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
html = gzip_file.read().decode('utf-8')
else:
html = response.read().decode('utf-8')
soup = BeautifulSoup(html, "html.parser")
# Extract content
title = soup.find('h1', class_="entityTitle")
title_text = title.text.strip() if title else "No title found"
article = soup.find('article', class_="news-content")
paragraph = article.text.strip() if article else ""
# Print results
print(f"Title: {title_text}")
print(f"Content: {paragraph}")
except Exception as e:
print(f"Error: {str(e)}")