feat: update README with folder structure and execution instructions; add scraping scripts for news articles

This commit is contained in:
吳元皓 2025-05-11 17:31:37 +08:00
parent c68606ffbe
commit f8fa412de9
7 changed files with 188 additions and 41 deletions

View File

@ -34,3 +34,48 @@ App Design: [Freeform](https://www.icloud.com/freeform/026AxB798cViZ9jJ2DkNsXUCQ
- Groq - Groq
- Clerk - Clerk
- Custom Infra - Custom Infra
## Folder Structure
```
├── .github/
│ └── workflows/
├── .nuxt/
├── .output/
├── components/
│ ├── app/
│ │ └── newsOrgAbout/
│ └── ui/
├── i18n/
├── layouts/
├── lib/
├── pages/
│ └── app/
├── public/
├── scraping/
├── server/
│ ├── api/
│ │ └── objectstorage/
│ ├── components/
│ └── routes/
├── styles/
├── app.vue
├── createDatabase.ts
├── nuxt.config.ts
├── package.json
├── tailwind.config.js
└── tsconfig.json
```
## 如何執行
1. First, rename `.env.example` to `.env` and fill in the required values.
2. Run `bun install` to install dependencies.
3. Run `bun run createDatabase` to create the database.
4. Run `ps1 clone-env.ps1` or `bash clone-env.sh` to clone the `.env` file to the `scraping` folder.
5. Run `bun run build` to build the project.
6. Run `bun run preview` to start the preview server.
7. Open `http://localhost:3000` in your browser.
### For scraping
First, run `ps1 clone-env.ps1` or `bash clone-env.sh` to copy the `.env` file into the `scraping` folder, then `cd` into the `scraping` folder and run `python main.py` to start scraping Google News.

View File

@ -10,20 +10,30 @@ gsap.registerPlugin(TextPlugin);
// Import Windows // Import Windows
import SignIn from "~/components/app/windows/login.vue"; import SignIn from "~/components/app/windows/login.vue";
// Import Shadcn/UI components
import AlertComponent from "~/components/ui/alert/Alert.vue";
import ButtonComponent from "~/components/ui/button/Button.vue";
import DialogComponent from "~/components/ui/dialog/Dialog.vue";
import ProgressComponent from "~/components/ui/progress/Progress.vue";
import HoverCardComponent from "~/components/ui/hover-card/HoverCard.vue";
// Icons // Icons
import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon} from "@heroicons/vue/24/outline"; import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon } from "@heroicons/vue/24/outline";
// i18n // i18n
const { t, locale, locales } = useI18n(); const { t, locale, locales } = useI18n();
const switchLocalePath = useSwitchLocalePath(); const switchLocalePath = useSwitchLocalePath();
const localePath = useLocalePath(); const localePath = useLocalePath();
// Router // Router
const router = useRouter(); const router = useRouter();
// values // values
const popMessage = ref(null); const popMessage = ref(null);
const menuOpen = ref(false); const menuOpen = ref(false);
const langMenuOpen = ref(false); const langMenuOpen = ref(false);
const lang = ref(locale.value); const lang = ref(locale.value);
// Date // Date
const currentDate = ref( const currentDate = ref(
new Date().toLocaleDateString("zh-TW", { new Date().toLocaleDateString("zh-TW", {
@ -51,35 +61,6 @@ onMounted(() => {
}); });
// functions // functions
const showLogin = () => {
const desktopEl = document.getElementById('desktop');
if (!desktopEl) return;
const loginWindow = document.createElement("div");
loginWindow.className = "login-window absolute top-1/2 left-1/2 transform -translate-x-1/2 -translate-y-1/2";
desktopEl.appendChild(loginWindow);
const app = createApp(SignIn);
app.mount(loginWindow);
setTimeout(() => {
gsap.fromTo(
loginWindow,
{ opacity: 0, scale: 0.5 },
{ opacity: 1, scale: 1, duration: 0.5 },
);
}, 100);
setTimeout(() => {
gsap.to(loginWindow, {
opacity: 0,
scale: 0.5,
duration: 0.5,
onComplete: () => {
desktopEl.removeChild(loginWindow);
},
});
}, 5000);
}
const openWindow = (windowName?: string) => { const openWindow = (windowName?: string) => {
console.log(windowName); console.log(windowName);
menuOpen.value = false; menuOpen.value = false;
@ -103,15 +84,15 @@ const toggleLangMenu = () => {
</script> </script>
<template> <template>
<div <div
class="absolute inset-x-0 flex flex-row px-2 py-1 bg-white dark:bg-gray-900 justify-between align-center text-center z-50" class="absolute inset-x-0 flex flex-row px-2 py-1 bg-[#7D7C7C]/70 text-white justify-between align-center text-center z-50"
> >
<div class="flex flex-row g-2 text-gray-400 dark:text-gray-500"> <div class="flex flex-row g-2 text-gray-400 text-white ">
<button @click="toggleMenu" class="w-8 h-8 text-gray-400 dark:text-gray-500 hover:text-blue-500 transition-all duration-100 flex flex-row"> <button @click="toggleMenu" class="w-8 h-8 text-white hover:text-blue-500 transition-all duration-100 flex flex-row">
<ComputerDesktopIcon/> <ComputerDesktopIcon/>
</button> </button>
<span class="ml-1 mr-2 text-[20px]">|</span> <span class="ml-1 mr-2 text-[20px]">|</span>
</div> </div>
<div class="text-gray-400">{{ currentDate }}</div> <div class="text-center align-middle justify-center text-white">{{ currentDate }}</div>
</div> </div>
<div class="w-full h-[2.5em]"></div> <div class="w-full h-[2.5em]"></div>
<Transition <Transition
@ -156,8 +137,8 @@ const toggleLangMenu = () => {
<!--Clerk--> <!--Clerk-->
<SignedOut> <SignedOut>
<SignInButton> <SignInButton>
<button @click="showLogin" class="w-8 h-8 text-gray-400 dark:text-gray-500 flex flex-row"> <button @click="openWindow('login')" class="w-8 h-8 text-gray-400 flex flex-row">
<UserIcon class="w-8 h-8 text-gray-400 dark:text-gray-500 hover:text-blue-500 transition-all duration-100" /> <UserIcon class="w-8 h-8 text-gray-400 hover:text-blue-500 transition-all duration-100" />
</button> </button>
</SignInButton> </SignInButton>
</SignedOut> </SignedOut>

View File

@ -0,0 +1,7 @@
# Status
## cna.py
Not working
## setn.py
Working

56
scraping/findText/cna.py Normal file
View File

@ -0,0 +1,56 @@
# BROKEN -- see the status notes in this folder: the CNA page markup may not
# match the selectors below; kept while debugging.
#
# Fetches a single CNA (中央社) article page and prints its title and body text.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io

# Load environment variables from .env file (DB credentials etc. for later use).
dotenv.load_dotenv()

# Browser-like headers so the site does not reject the request as a bot.
headers = {
    #'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*/*' is the valid wildcard form; a bare '*' is not a legal Accept value.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Only advertise encodings the decode path below can actually handle.
    # The original sent 'gzip, deflate, br', but a Brotli or raw-deflate
    # response would crash the gzip-only decoding branch.
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.cna.com.tw/news/aspt/202505110112.aspx"
# CSS class of the <div> that wraps the article body on cna.com.tw.
paragraph_css = "paragraph"

try:
    req = Request(url, headers=headers)
    # Use a context manager so the socket is closed even if decoding fails.
    with urlopen(req) as response:
        raw = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            html = gzip.GzipFile(fileobj=io.BytesIO(raw)).read().decode('utf-8')
        else:
            html = raw.decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    # Extract content: <h1> holds the headline, the paragraph div the body.
    title = soup.find('h1').text.strip() if soup.find('h1') else ""
    article = soup.find('div', class_=paragraph_css)
    paragraph = article.text.strip() if article else ""
    # Print results
    print(f"Title: {title}")
    print(f"Content: {paragraph}")
except Exception as e:
    print(f"Error: {str(e)}")

58
scraping/findText/setn.py Normal file
View File

@ -0,0 +1,58 @@
# Fetches a single SETN (三立新聞) article page and prints its title and body
# text; on failure it dumps the classes present in the fetched HTML to help
# locate the correct selectors.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io

# Load environment variables from .env file (DB credentials etc. for later use).
dotenv.load_dotenv()

# Browser-like headers so the site does not reject the request as a bot.
headers = {
    #'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*/*' is the valid wildcard form; a bare '*' is not a legal Accept value.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Only advertise encodings the decode path below can actually handle.
    # The original sent 'gzip, deflate, br', but a Brotli or raw-deflate
    # response would crash the gzip-only decoding branch.
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.setn.com/News.aspx?NewsID=1654352"
# Tag name that wraps the article body on setn.com.
paragraph_css = "article"

try:
    req = Request(url, headers=headers)
    # Use a context manager so the socket is closed even if decoding fails.
    with urlopen(req) as response:
        raw = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            html = gzip.GzipFile(fileobj=io.BytesIO(raw)).read().decode('utf-8')
        else:
            html = raw.decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    # 'news-title-3' is the headline class on SETN article pages.
    title = soup.find('h1', class_='news-title-3')
    title_text = title.text.strip() if title else "No title found"
    article = soup.find('article')
    content = article.text.strip() if article else "No content found"
    # Print results
    print(f"Title: {title_text}")
    print(f"Content: {content}")
except Exception as e:
    print(f"Error: {str(e)}")
    # Debug aid: list the classes present in whatever HTML was parsed so the
    # selectors above can be corrected when the site layout changes.
    if 'soup' in locals():
        print("\nAvailable classes in HTML:")
        for tag in soup.find_all(class_=True):
            print(f"Tag: {tag.name}, Class: {tag['class']}")

View File

@ -1,5 +1,4 @@
# Going to love the stupid tab tab tab tab system, instead of using brances. import re
import re # Regular expressions
from urllib.request import urlopen # URL request lib. from urllib.request import urlopen # URL request lib.
from bs4 import BeautifulSoup # BeautifulSoup lib. from bs4 import BeautifulSoup # BeautifulSoup lib.
import json import json

View File

@ -3,3 +3,4 @@ beautifulsoup4
psycopg2-binary psycopg2-binary
pandas pandas
dotenv dotenv
chardet