Mirror of https://github.com/hpware/news-analyze.git (synced 2025-06-23 15:51:01 +08:00)
feat: update README with folder structure and execution instructions; add scraping scripts for news articles
This commit is contained in:
parent c68606ffbe
commit f8fa412de9
README.md (47 changed lines)
@@ -33,4 +33,49 @@ App Design: [Freeform](https://www.icloud.com/freeform/026AxB798cViZ9jJ2DkNsXUCQ
- BunJS
- Groq
- Clerk
- Custom Infra

## Folder Structure

```
├── .github/
│   └── workflows/
├── .nuxt/
├── .output/
├── components/
│   ├── app/
│   │   └── newsOrgAbout/
│   └── ui/
├── i18n/
├── layouts/
├── lib/
├── pages/
│   └── app/
├── public/
├── scraping/
├── server/
│   ├── api/
│   │   └── objectstorage/
│   ├── components/
│   └── routes/
├── styles/
├── app.vue
├── createDatabase.ts
├── nuxt.config.ts
├── package.json
├── tailwind.config.js
└── tsconfig.json
```

## How to Run

1. First, rename `.env.example` to `.env` and fill in the blanks.
2. Run `bun install` to install dependencies.
3. Run `bun run createDatabase` to create the database.
4. Run `ps1 clone-env.ps1` or `bash clone-env.sh` to copy the `.env` file into the `scraping` folder (a minimal equivalent is sketched after this list).
5. Run `bun run build` to build the project.
6. Run `bun run preview` to start the preview server.
7. Open `http://localhost:3000` in your browser.

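The clone-env scripts referenced in step 4 are not part of this diff; by the step's description they only need to place a copy of `.env` inside `scraping/`. The snippet below is a hypothetical Python equivalent for illustration, not one of the repository's scripts:

```python
# Hypothetical stand-in for clone-env.ps1 / clone-env.sh: copy .env into scraping/.
import shutil
from pathlib import Path

src = Path(".env")
dst = Path("scraping") / ".env"
dst.parent.mkdir(exist_ok=True)   # make sure scraping/ exists
shutil.copyfile(src, dst)         # overwrite any stale copy
print(f"copied {src} -> {dst}")
```

Run it from the project root, the same place the shell and PowerShell variants would presumably be invoked from.
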
### For scraping

First, run `ps1 clone-env.ps1` or `bash clone-env.sh` to copy the `.env` file into the `scraping` folder, then cd into the `scraping` folder and run `python main.py` to start scraping Google News.
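`main.py` itself is not shown in this diff, so the snippet below is only a rough sketch of the kind of request that step performs: pulling headlines from a Google News RSS feed with the standard library. The feed URL, locale parameters, and field names are assumptions for illustration, not code from the repository.

```python
# Hypothetical sketch of a Google News fetch; not the repository's main.py.
import xml.etree.ElementTree as ET
from urllib.request import Request, urlopen

# Assumed RSS endpoint for Traditional Chinese (Taiwan) headlines.
RSS_URL = "https://news.google.com/rss?hl=zh-TW&gl=TW&ceid=TW:zh-Hant"

req = Request(RSS_URL, headers={"User-Agent": "Mozilla/5.0"})
with urlopen(req) as resp:
    root = ET.fromstring(resp.read())

# Each <item> in the feed carries a headline and a link to the source article.
for item in list(root.iter("item"))[:10]:
    print(item.findtext("title"), "-", item.findtext("link"))
```
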
@@ -10,20 +10,30 @@ gsap.registerPlugin(TextPlugin);
// Import Windows
import SignIn from "~/components/app/windows/login.vue";

// Import Shadcn/UI components
import AlertComponent from "~/components/ui/alert/Alert.vue";
import ButtonComponent from "~/components/ui/button/Button.vue";
import DialogComponent from "~/components/ui/dialog/Dialog.vue";
import ProgressComponent from "~/components/ui/progress/Progress.vue";
import HoverCardComponent from "~/components/ui/hover-card/HoverCard.vue";

// Icons
import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon} from "@heroicons/vue/24/outline";
import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon } from "@heroicons/vue/24/outline";

// i18n
const { t, locale, locales } = useI18n();
const switchLocalePath = useSwitchLocalePath();
const localePath = useLocalePath();

// Router
const router = useRouter();

// values
const popMessage = ref(null);
const menuOpen = ref(false);
const langMenuOpen = ref(false);
const lang = ref(locale.value);

// Date
const currentDate = ref(
  new Date().toLocaleDateString("zh-TW", {
@@ -51,35 +61,6 @@ onMounted(() => {
});

// functions
const showLogin = () => {
  const desktopEl = document.getElementById('desktop');
  if (!desktopEl) return;

  const loginWindow = document.createElement("div");
  loginWindow.className = "login-window absolute top-1/2 left-1/2 transform -translate-x-1/2 -translate-y-1/2";
  desktopEl.appendChild(loginWindow);
  const app = createApp(SignIn);
  app.mount(loginWindow);

  setTimeout(() => {
    gsap.fromTo(
      loginWindow,
      { opacity: 0, scale: 0.5 },
      { opacity: 1, scale: 1, duration: 0.5 },
    );
  }, 100);

  setTimeout(() => {
    gsap.to(loginWindow, {
      opacity: 0,
      scale: 0.5,
      duration: 0.5,
      onComplete: () => {
        desktopEl.removeChild(loginWindow);
      },
    });
  }, 5000);
}
const openWindow = (windowName?: string) => {
  console.log(windowName);
  menuOpen.value = false;
@@ -103,15 +84,15 @@ const toggleLangMenu = () => {
</script>
<template>
  <div
    class="absolute inset-x-0 flex flex-row px-2 py-1 bg-white dark:bg-gray-900 justify-between align-center text-center z-50"
    class="absolute inset-x-0 flex flex-row px-2 py-1 bg-[#7D7C7C]/70 text-white justify-between align-center text-center z-50"
  >
    <div class="flex flex-row g-2 text-gray-400 dark:text-gray-500">
    <button @click="toggleMenu" class="w-8 h-8 text-gray-400 dark:text-gray-500 hover:text-blue-500 transition-all duration-100 flex flex-row">
    <div class="flex flex-row g-2 text-gray-400 text-white ">
      <button @click="toggleMenu" class="w-8 h-8 text-white hover:text-blue-500 transition-all duration-100 flex flex-row">
        <ComputerDesktopIcon/>
      </button>
      <span class="ml-1 mr-2 text-[20px]">|</span>
    </div>
    <div class="text-gray-400">{{ currentDate }}</div>
    <div class="text-center align-middle justify-center text-white">{{ currentDate }}</div>
  </div>
  <div class="w-full h-[2.5em]"></div>
  <Transition
@@ -131,7 +112,7 @@ const toggleLangMenu = () => {
    class="flex flex-col justify-center align-center text-center absolute w-full h-screen inset-x-0 inset-y-0 z-[-1]"
    id="desktop"
  >


  </div>
  <div
    class="absolute w-[calc(100% - 5px)] inset-x-0 bottom-0 mx-[1.5px] p-3 justify-between align-center flex flex-row"
@@ -156,8 +137,8 @@ const toggleLangMenu = () => {
  <!--Clerk-->
  <SignedOut>
    <SignInButton>
      <button @click="showLogin" class="w-8 h-8 text-gray-400 dark:text-gray-500 flex flex-row">
        <UserIcon class="w-8 h-8 text-gray-400 dark:text-gray-500 hover:text-blue-500 transition-all duration-100" />
      <button @click="openWindow('login')" class="w-8 h-8 text-gray-400 flex flex-row">
        <UserIcon class="w-8 h-8 text-gray-400 hover:text-blue-500 transition-all duration-100" />
      </button>
    </SignInButton>
  </SignedOut>
scraping/findText/README.md (new file, 7 lines)
@@ -0,0 +1,7 @@
# Status

## cna.py
Not working

## setn.py
Working
scraping/findText/cna.py (new file, 56 lines)
@@ -0,0 +1,56 @@
# BROKEN
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io

# Load environment variables from .env file
dotenv.load_dotenv()

headers = {
    #'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': '*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Note: 'br' (brotli) is advertised here, but only gzip is decoded below.
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}


url = "https://www.cna.com.tw/news/aspt/202505110112.aspx"
paragraph_css = "paragraph"

try:
    req = Request(url, headers=headers)
    response = urlopen(req)
    # urlopen does not decompress automatically, so gzip responses are unpacked by hand.
    if response.info().get('Content-Encoding') == 'gzip':
        gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
        html = gzip_file.read().decode('utf-8')
    else:
        html = response.read().decode('utf-8')

    soup = BeautifulSoup(html, "html.parser")

    # Extract content
    title = soup.find('h1').text.strip() if soup.find('h1') else ""
    article = soup.find('div', class_=paragraph_css)
    paragraph = article.text.strip() if article else ""

    # Print results
    print(f"Title: {title}")
    print(f"Content: {paragraph}")

except Exception as e:
    print(f"Error: {str(e)}")
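The status note above flags cna.py as not working, but the diff does not say why. As an assumption, two common failure modes for this kind of fetch are the server answering with a Content-Encoding the script does not decode (it advertises `br` but only unpacks gzip) and the article body not living under a `div.paragraph` element. A small diagnostic along these lines, reusing the same URL, can narrow that down:

```python
# Hypothetical diagnostic, not part of the commit: check what cna.com.tw actually returns.
from urllib.request import Request, urlopen

url = "https://www.cna.com.tw/news/aspt/202505110112.aspx"
req = Request(url, headers={
    "User-Agent": "Mozilla/5.0",
    # Only advertise encodings we can decode without extra dependencies.
    "Accept-Encoding": "gzip, deflate",
})
with urlopen(req) as resp:
    print("status:", resp.status)
    print("content-encoding:", resp.headers.get("Content-Encoding"))
    print("content-type:", resp.headers.get("Content-Type"))
```
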
scraping/findText/setn.py (new file, 58 lines)
@@ -0,0 +1,58 @@
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io

dotenv.load_dotenv()

headers = {
    #'User-Agent': 'NewsSceraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': '*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.setn.com/News.aspx?NewsID=1654352"
paragraph_css = "article"


try:
    req = Request(url, headers=headers)
    response = urlopen(req)
    if response.info().get('Content-Encoding') == 'gzip':
        gzip_file = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
        html = gzip_file.read().decode('utf-8')
    else:
        html = response.read().decode('utf-8')

    soup = BeautifulSoup(html, "html.parser")

    title = soup.find('h1', class_='news-title-3')
    title_text = title.text.strip() if title else "No title found"

    article = soup.find('article')
    content = article.text.strip() if article else "No content found"

    # Print results
    print(f"Title: {title_text}")
    print(f"Content: {content}")

except Exception as e:
    print(f"Error: {str(e)}")
    if 'soup' in locals():
        print("\nAvailable classes in HTML:")
        for tag in soup.find_all(class_=True):
            print(f"Tag: {tag.name}, Class: {tag['class']}")
@@ -1,5 +1,4 @@
# Going to love the stupid tab tab tab tab system, instead of using braces.
import re # Regular expressions
import re
from urllib.request import urlopen # URL request lib.
from bs4 import BeautifulSoup # BeautifulSoup lib.
import json
@@ -2,4 +2,5 @@ urlopen
beautifulsoup4
psycopg2-binary
pandas
dotenv
chardet