mirror of
https://github.com/hpware/news-analyze.git
synced 2025-06-24 00:01:03 +08:00
feat: update README with folder structure and execution instructions; add scraping scripts for news articles
This commit is contained in:
parent
c68606ffbe
commit
f8fa412de9
45
README.md
45
README.md
@ -34,3 +34,48 @@ App Design: [Freeform](https://www.icloud.com/freeform/026AxB798cViZ9jJ2DkNsXUCQ
|
|||||||
- Groq
|
- Groq
|
||||||
- Clerk
|
- Clerk
|
||||||
- Custom Infra
|
- Custom Infra
|
||||||
|
|
||||||
|
## Folder Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
├── .github/
|
||||||
|
│ └── workflows/
|
||||||
|
├── .nuxt/
|
||||||
|
├── .output/
|
||||||
|
├── components/
|
||||||
|
│ ├── app/
|
||||||
|
│ │ └── newsOrgAbout/
|
||||||
|
│ └── ui/
|
||||||
|
├── i18n/
|
||||||
|
├── layouts/
|
||||||
|
├── lib/
|
||||||
|
├── pages/
|
||||||
|
│ └── app/
|
||||||
|
├── public/
|
||||||
|
├── scraping/
|
||||||
|
├── server/
|
||||||
|
│ ├── api/
|
||||||
|
│ │ └── objectstorage/
|
||||||
|
│ ├── components/
|
||||||
|
│ └── routes/
|
||||||
|
├── styles/
|
||||||
|
├── app.vue
|
||||||
|
├── createDatabase.ts
|
||||||
|
├── nuxt.config.ts
|
||||||
|
├── package.json
|
||||||
|
├── tailwind.config.js
|
||||||
|
└── tsconfig.json
|
||||||
|
```
|
||||||
|
|
||||||
|
## 如何執行
|
||||||
|
|
||||||
|
1. First, rename `.env.example` to `.env` and fill in the blanks.
|
||||||
|
2. Run `bun install` to install dependencies.
|
||||||
|
3. Run `bun run createDatabase` to create the database.
|
||||||
|
4. Run `ps1 clone-env.ps1` or `bash clone-env.sh` to clone the `.env` file to the `scraping` folder.
|
||||||
|
5. Run `bun run build` to build the project.
|
||||||
|
6. Run `bun run preview` to start the preview server.
|
||||||
|
7. Open `http://localhost:3000` in your browser.
|
||||||
|
|
||||||
|
### For scraping
|
||||||
|
First, run `ps1 clone-env.ps1` or `bash clone-env.sh` to clone the `.env` file into the `scraping` folder, then cd into the `scraping` folder and run `python main.py` to start scraping Google News.
|
@ -10,6 +10,13 @@ gsap.registerPlugin(TextPlugin);
|
|||||||
// Import Windows
|
// Import Windows
|
||||||
import SignIn from "~/components/app/windows/login.vue";
|
import SignIn from "~/components/app/windows/login.vue";
|
||||||
|
|
||||||
|
// Import Shadcn/UI components
|
||||||
|
import AlertComponent from "~/components/ui/alert/Alert.vue";
|
||||||
|
import ButtonComponent from "~/components/ui/button/Button.vue";
|
||||||
|
import DialogComponent from "~/components/ui/dialog/Dialog.vue";
|
||||||
|
import ProgressComponent from "~/components/ui/progress/Progress.vue";
|
||||||
|
import HoverCardComponent from "~/components/ui/hover-card/HoverCard.vue";
|
||||||
|
|
||||||
// Icons
|
// Icons
|
||||||
import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon } from "@heroicons/vue/24/outline";
|
import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon } from "@heroicons/vue/24/outline";
|
||||||
|
|
||||||
@ -17,13 +24,16 @@ import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon} from "@h
|
|||||||
const { t, locale, locales } = useI18n();
|
const { t, locale, locales } = useI18n();
|
||||||
const switchLocalePath = useSwitchLocalePath();
|
const switchLocalePath = useSwitchLocalePath();
|
||||||
const localePath = useLocalePath();
|
const localePath = useLocalePath();
|
||||||
|
|
||||||
// Router
|
// Router
|
||||||
const router = useRouter();
|
const router = useRouter();
|
||||||
|
|
||||||
// values
|
// values
|
||||||
const popMessage = ref(null);
|
const popMessage = ref(null);
|
||||||
const menuOpen = ref(false);
|
const menuOpen = ref(false);
|
||||||
const langMenuOpen = ref(false);
|
const langMenuOpen = ref(false);
|
||||||
const lang = ref(locale.value);
|
const lang = ref(locale.value);
|
||||||
|
|
||||||
// Date
|
// Date
|
||||||
const currentDate = ref(
|
const currentDate = ref(
|
||||||
new Date().toLocaleDateString("zh-TW", {
|
new Date().toLocaleDateString("zh-TW", {
|
||||||
@ -51,35 +61,6 @@ onMounted(() => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// functions
|
// functions
|
||||||
const showLogin = () => {
|
|
||||||
const desktopEl = document.getElementById('desktop');
|
|
||||||
if (!desktopEl) return;
|
|
||||||
|
|
||||||
const loginWindow = document.createElement("div");
|
|
||||||
loginWindow.className = "login-window absolute top-1/2 left-1/2 transform -translate-x-1/2 -translate-y-1/2";
|
|
||||||
desktopEl.appendChild(loginWindow);
|
|
||||||
const app = createApp(SignIn);
|
|
||||||
app.mount(loginWindow);
|
|
||||||
|
|
||||||
setTimeout(() => {
|
|
||||||
gsap.fromTo(
|
|
||||||
loginWindow,
|
|
||||||
{ opacity: 0, scale: 0.5 },
|
|
||||||
{ opacity: 1, scale: 1, duration: 0.5 },
|
|
||||||
);
|
|
||||||
}, 100);
|
|
||||||
|
|
||||||
setTimeout(() => {
|
|
||||||
gsap.to(loginWindow, {
|
|
||||||
opacity: 0,
|
|
||||||
scale: 0.5,
|
|
||||||
duration: 0.5,
|
|
||||||
onComplete: () => {
|
|
||||||
desktopEl.removeChild(loginWindow);
|
|
||||||
},
|
|
||||||
});
|
|
||||||
}, 5000);
|
|
||||||
}
|
|
||||||
const openWindow = (windowName?: string) => {
|
const openWindow = (windowName?: string) => {
|
||||||
console.log(windowName);
|
console.log(windowName);
|
||||||
menuOpen.value = false;
|
menuOpen.value = false;
|
||||||
@ -103,15 +84,15 @@ const toggleLangMenu = () => {
|
|||||||
</script>
|
</script>
|
||||||
<template>
|
<template>
|
||||||
<div
|
<div
|
||||||
class="absolute inset-x-0 flex flex-row px-2 py-1 bg-white dark:bg-gray-900 justify-between align-center text-center z-50"
|
class="absolute inset-x-0 flex flex-row px-2 py-1 bg-[#7D7C7C]/70 text-white justify-between align-center text-center z-50"
|
||||||
>
|
>
|
||||||
<div class="flex flex-row g-2 text-gray-400 dark:text-gray-500">
|
<div class="flex flex-row g-2 text-gray-400 text-white ">
|
||||||
<button @click="toggleMenu" class="w-8 h-8 text-gray-400 dark:text-gray-500 hover:text-blue-500 transition-all duration-100 flex flex-row">
|
<button @click="toggleMenu" class="w-8 h-8 text-white hover:text-blue-500 transition-all duration-100 flex flex-row">
|
||||||
<ComputerDesktopIcon/>
|
<ComputerDesktopIcon/>
|
||||||
</button>
|
</button>
|
||||||
<span class="ml-1 mr-2 text-[20px]">|</span>
|
<span class="ml-1 mr-2 text-[20px]">|</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="text-gray-400">{{ currentDate }}</div>
|
<div class="text-center align-middle justify-center text-white">{{ currentDate }}</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="w-full h-[2.5em]"></div>
|
<div class="w-full h-[2.5em]"></div>
|
||||||
<Transition
|
<Transition
|
||||||
@ -156,8 +137,8 @@ const toggleLangMenu = () => {
|
|||||||
<!--Clerk-->
|
<!--Clerk-->
|
||||||
<SignedOut>
|
<SignedOut>
|
||||||
<SignInButton>
|
<SignInButton>
|
||||||
<button @click="showLogin" class="w-8 h-8 text-gray-400 dark:text-gray-500 flex flex-row">
|
<button @click="openWindow('login')" class="w-8 h-8 text-gray-400 flex flex-row">
|
||||||
<UserIcon class="w-8 h-8 text-gray-400 dark:text-gray-500 hover:text-blue-500 transition-all duration-100" />
|
<UserIcon class="w-8 h-8 text-gray-400 hover:text-blue-500 transition-all duration-100" />
|
||||||
</button>
|
</button>
|
||||||
</SignInButton>
|
</SignInButton>
|
||||||
</SignedOut>
|
</SignedOut>
|
||||||
|
7
scraping/findText/README.md
Normal file
7
scraping/findText/README.md
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# Status
|
||||||
|
|
||||||
|
## cna.py
|
||||||
|
Not working
|
||||||
|
|
||||||
|
## setn.py
|
||||||
|
Working
|
56
scraping/findText/cna.py
Normal file
56
scraping/findText/cna.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
# BROKEN
|
||||||
|
import re
|
||||||
|
from urllib.request import urlopen, Request
|
||||||
|
import chardet
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import psycopg2
|
||||||
|
import pandas as pd
|
||||||
|
import dotenv
|
||||||
|
import os
|
||||||
|
import gzip
|
||||||
|
import io
|
||||||
|
|
||||||
|
# Load environment variables from .env file
|
||||||
|
dotenv.load_dotenv()
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
#'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||||
|
'Accept': '*',
|
||||||
|
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
|
'Connection': 'keep-alive',
|
||||||
|
'Sec-Fetch-Dest': 'document',
|
||||||
|
'Sec-Fetch-Mode': 'navigate',
|
||||||
|
'Sec-Fetch-Site': 'same-origin',
|
||||||
|
'Cache-Control': 'max-age=0',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
url = "https://www.cna.com.tw/news/aspt/202505110112.aspx"
|
||||||
|
paragraph_css = "paragraph"
|
||||||
|
|
||||||
|
try:
|
||||||
|
req = Request(url, headers=headers)
|
||||||
|
response = urlopen(req)
|
||||||
|
if response.info().get('Content-Encoding') == 'gzip':
|
||||||
|
gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
|
||||||
|
html = gzip_file.read().decode('utf-8')
|
||||||
|
else:
|
||||||
|
html = response.read().decode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
|
# Extract content
|
||||||
|
title = soup.find('h1').text.strip() if soup.find('h1') else ""
|
||||||
|
article = soup.find('div', class_=paragraph_css)
|
||||||
|
paragraph = article.text.strip() if article else ""
|
||||||
|
|
||||||
|
# Print results
|
||||||
|
print(f"Title: {title}")
|
||||||
|
print(f"Content: {paragraph}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error: {str(e)}")
|
58
scraping/findText/setn.py
Normal file
58
scraping/findText/setn.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
# Scrapes a single SETN (三立新聞網) article and prints its headline and
# body text to stdout. Marked "Working" in scraping/findText/README.md.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io

# Load environment variables from .env file (DB credentials, etc.).
dotenv.load_dotenv()

headers = {
    # Honest bot UA kept for reference; a browser UA is used instead
    # because the site rejects unknown clients.
    #'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*' alone is not a valid Accept value; '*/*' is the wildcard form.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Only advertise encodings the decode path below can handle (gzip or
    # identity); the original requested 'br'/'deflate' it could not decode.
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.setn.com/News.aspx?NewsID=1654352"
# NOTE(review): unused — the body is located via the <article> tag below.
paragraph_css = "article"


try:
    req = Request(url, headers=headers)
    # Context manager guarantees the connection is closed even on error.
    with urlopen(req) as response:
        raw = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
    try:
        html = raw.decode('utf-8')
    except UnicodeDecodeError:
        # Fall back to charset detection instead of crashing on
        # non-UTF-8 pages.
        guess = chardet.detect(raw).get('encoding') or 'utf-8'
        html = raw.decode(guess, errors='replace')

    soup = BeautifulSoup(html, "html.parser")

    # Headline: SETN renders it as <h1 class="news-title-3">.
    title = soup.find('h1', class_='news-title-3')
    title_text = title.text.strip() if title else "No title found"

    # Body: the article text sits inside the page's <article> element.
    article = soup.find('article')
    content = article.text.strip() if article else "No content found"

    # Print results
    print(f"Title: {title_text}")
    print(f"Content: {content}")

except Exception as e:
    print(f"Error: {str(e)}")
    # Debug aid: dump every classed tag so the right selectors can be
    # re-discovered when SETN changes its markup.
    if 'soup' in locals():
        print("\nAvailable classes in HTML:")
        for tag in soup.find_all(class_=True):
            print(f"Tag: {tag.name}, Class: {tag['class']}")
|
@ -1,5 +1,4 @@
|
|||||||
# Going to love the stupid tab tab tab tab system, instead of using braces.
|
import re
|
||||||
import re # Regular expressions
|
|
||||||
from urllib.request import urlopen # URL request lib.
|
from urllib.request import urlopen # URL request lib.
|
||||||
from bs4 import BeautifulSoup # BeautifulSoup lib.
|
from bs4 import BeautifulSoup # BeautifulSoup lib.
|
||||||
import json
|
import json
|
@ -3,3 +3,4 @@ beautifulsoup4
|
|||||||
psycopg2-binary
|
psycopg2-binary
|
||||||
pandas
|
pandas
|
||||||
dotenv
|
dotenv
|
||||||
|
chardet
|
Loading…
x
Reference in New Issue
Block a user