// news-analyze/server/scrape/line_today.ts
//
// 106 lines
// 3.2 KiB
// TypeScript

import * as cheerio from "cheerio";
/**
 * Converts a relative-time label found in `timeText` into an absolute date.
 *
 * Three labels are recognised, tried in this order (the first hit wins):
 * "<n>小時前" (n hours ago), "<n>天前" (n days ago), "<n>分鐘前" (n minutes ago).
 *
 * @param timeText - Text that may contain one of the labels above.
 * @returns The current time minus the matched amount, or `null` when the text
 *          contains none of the labels.
 */
function findTime(timeText: string) {
  const reference = Date.now();

  // Ordered [pattern, milliseconds-per-unit] pairs; order defines priority.
  const units: Array<[RegExp, number]> = [
    [/(\d+)小時前/, 60 * 60 * 1000],
    [/(\d+)天前/, 24 * 60 * 60 * 1000],
    [/(\d+)分鐘前/, 60 * 1000],
  ];

  for (const [pattern, unitMs] of units) {
    const found = timeText.match(pattern);
    if (found) {
      return new Date(reference - parseInt(found[1]) * unitMs);
    }
  }

  return null;
}
/** Article fields scraped from a LINE Today article page by `lineToday`. */
interface LineTodayArticle {
  title: string;
  paragraph: string[];
  origin: string;
  author: string;
  images: string[];
  updateat: Date | null;
  publishedat: Date | null;
}

/**
 * Downloads a LINE Today article page and scrapes its content with cheerio.
 *
 * @param slug - Article identifier. It is appended verbatim to
 *               `https://today.line.me/tw/v2/article/`, so it must already be
 *               safe to use as a URL path segment.
 * @returns The title, body paragraphs, publisher (`origin`), author, image
 *          URLs and the update/publish times. A time is `null` when the page
 *          carries no relative-time label that `findTime` recognises.
 * @throws Error when the server answers with a non-2xx status; `fetch` itself
 *         rejects on network failures.
 */
async function lineToday(slug: string): Promise<LineTodayArticle> {
  const url = "https://today.line.me/tw/v2/article/" + slug;
  const response = await fetch(url, {
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      // "*/*" is the valid "any media type" range; a bare "*" is not.
      Accept: "*/*",
      "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
      "Accept-Encoding": "gzip, deflate, br",
      Connection: "keep-alive",
      "Sec-Fetch-Dest": "document",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Site": "same-origin",
      "Cache-Control": "max-age=0",
    },
  });

  // Fail loudly instead of scraping an error page into an empty article.
  if (!response.ok) {
    throw new Error(
      `LINE Today request for "${slug}" failed: HTTP ${response.status} ${response.statusText}`,
    );
  }

  // text() returns a promise and must be awaited before parsing.
  const $ = cheerio.load(await response.text());

  // Drop line breaks, then trim the padding around the heading. Inner spaces
  // are kept so multi-word titles stay readable.
  const title = $("h1.entityTitle").text().replace(/\n/g, "").trim();

  // Walk the article's direct children in document order: <figure> elements
  // contribute image URLs, <p> elements contribute non-empty paragraphs.
  const paragraph: string[] = [];
  const images: string[] = [];
  $("article.news-content")
    .contents()
    .each((_index, element) => {
      if (element.type !== "tag") {
        return;
      }
      if (element.tagName === "figure") {
        const imgSrc = $(element).find("img").attr("src");
        if (imgSrc) {
          images.push(imgSrc);
        }
      } else if (element.tagName === "p") {
        const text = $(element).text().trim();
        if (text) {
          paragraph.push(text);
        }
      }
    });

  const newsOrgdir = $("h4.entityPublishInfo-publisher")
    .text()
    .replace(/[\n ]/g, "");

  // NOTE(review): the patterns below imply meta text shaped like
  // "更新於 <time> • 發布於 <time> • <author>" — confirm against a live page.
  const metaText = $("span.entityPublishInfo-meta-info").text();

  // Remove newlines and spaces BEFORE stripping the date prefix: "." does not
  // match line breaks, so stripping first would miss a prefix spread over
  // several lines.
  const compactMeta = metaText.replace(/[\n ]/g, "");
  const authorInfo = compactMeta.replace(/更新.*發布.*•/g, "");
  // If the date labels are still present there was no "• <author>" suffix to
  // keep, so report the author as "未知" (unknown).
  const author = /更新.*發布.*/.test(authorInfo) ? "未知" : authorInfo;

  // Each time label runs up to the next "•" separator. "[^•]" also matches
  // line breaks, so multi-line meta text is handled and the trailing author
  // text is never fed into the publish-time lookup.
  const updateMatch = metaText.match(/更新於\s*([^•]+)/);
  const publishMatch = metaText.match(/發布於\s*([^•]+)/);
  const updatedAt: Date | null = updateMatch
    ? findTime(updateMatch[1].trim())
    : null;
  const publishedAt: Date | null = publishMatch
    ? findTime(publishMatch[1].trim())
    : null;

  return {
    title: title,
    paragraph: paragraph,
    origin: newsOrgdir,
    author: author,
    images: images,
    updateat: updatedAt,
    publishedat: publishedAt,
  };
}
export default lineToday;