feat: update README with folder structure and execution instructions; add scraping scripts for news articles

This commit is contained in:
吳元皓 2025-05-11 17:31:37 +08:00
parent c68606ffbe
commit f8fa412de9
7 changed files with 188 additions and 41 deletions

View File

@ -34,3 +34,48 @@ App Design: [Freeform](https://www.icloud.com/freeform/026AxB798cViZ9jJ2DkNsXUCQ
- Groq - Groq
- Clerk - Clerk
- Custom Infra - Custom Infra
## Folder Structure
```
├── .github/
│ └── workflows/
├── .nuxt/
├── .output/
├── components/
│ ├── app/
│ │ └── newsOrgAbout/
│ └── ui/
├── i18n/
├── layouts/
├── lib/
├── pages/
│ └── app/
├── public/
├── scraping/
├── server/
│ ├── api/
│ │ └── objectstorage/
│ ├── components/
│ └── routes/
├── styles/
├── app.vue
├── createDatabase.ts
├── nuxt.config.ts
├── package.json
├── tailwind.config.js
└── tsconfig.json
```
## 如何執行
1. First, rename `.env.example` to `.env` and fill in the required values.
2. Run `bun install` to install dependencies.
3. Run `bun run createDatabase` to create the database.
4. Run `ps1 clone-env.ps1` or `bash clone-env.sh` to clone the `.env` file to the `scraping` folder.
5. Run `bun run build` to build the project.
6. Run `bun run preview` to start the preview server.
7. Open `http://localhost:3000` in your browser.
### For scraping
First, run `ps1 clone-env.ps1` or `bash clone-env.sh` to copy the `.env` file into the `scraping` folder, then `cd` into the `scraping` folder and run `python main.py` to start scraping Google News.

View File

@ -10,20 +10,30 @@ gsap.registerPlugin(TextPlugin);
// Import Windows // Import Windows
import SignIn from "~/components/app/windows/login.vue"; import SignIn from "~/components/app/windows/login.vue";
// Import Shadcn/UI components
import AlertComponent from "~/components/ui/alert/Alert.vue";
import ButtonComponent from "~/components/ui/button/Button.vue";
import DialogComponent from "~/components/ui/dialog/Dialog.vue";
import ProgressComponent from "~/components/ui/progress/Progress.vue";
import HoverCardComponent from "~/components/ui/hover-card/HoverCard.vue";
// Icons // Icons
import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon} from "@heroicons/vue/24/outline"; import { ComputerDesktopIcon, UserIcon, LanguageIcon, ChevronRightIcon } from "@heroicons/vue/24/outline";
// i18n // i18n
const { t, locale, locales } = useI18n(); const { t, locale, locales } = useI18n();
const switchLocalePath = useSwitchLocalePath(); const switchLocalePath = useSwitchLocalePath();
const localePath = useLocalePath(); const localePath = useLocalePath();
// Router // Router
const router = useRouter(); const router = useRouter();
// values // values
const popMessage = ref(null); const popMessage = ref(null);
const menuOpen = ref(false); const menuOpen = ref(false);
const langMenuOpen = ref(false); const langMenuOpen = ref(false);
const lang = ref(locale.value); const lang = ref(locale.value);
// Date // Date
const currentDate = ref( const currentDate = ref(
new Date().toLocaleDateString("zh-TW", { new Date().toLocaleDateString("zh-TW", {
@ -51,35 +61,6 @@ onMounted(() => {
}); });
// functions // functions
const showLogin = () => {
const desktopEl = document.getElementById('desktop');
if (!desktopEl) return;
const loginWindow = document.createElement("div");
loginWindow.className = "login-window absolute top-1/2 left-1/2 transform -translate-x-1/2 -translate-y-1/2";
desktopEl.appendChild(loginWindow);
const app = createApp(SignIn);
app.mount(loginWindow);
setTimeout(() => {
gsap.fromTo(
loginWindow,
{ opacity: 0, scale: 0.5 },
{ opacity: 1, scale: 1, duration: 0.5 },
);
}, 100);
setTimeout(() => {
gsap.to(loginWindow, {
opacity: 0,
scale: 0.5,
duration: 0.5,
onComplete: () => {
desktopEl.removeChild(loginWindow);
},
});
}, 5000);
}
const openWindow = (windowName?: string) => { const openWindow = (windowName?: string) => {
console.log(windowName); console.log(windowName);
menuOpen.value = false; menuOpen.value = false;
@ -103,15 +84,15 @@ const toggleLangMenu = () => {
</script> </script>
<template> <template>
<div <div
class="absolute inset-x-0 flex flex-row px-2 py-1 bg-white dark:bg-gray-900 justify-between align-center text-center z-50" class="absolute inset-x-0 flex flex-row px-2 py-1 bg-[#7D7C7C]/70 text-white justify-between align-center text-center z-50"
> >
<div class="flex flex-row g-2 text-gray-400 dark:text-gray-500"> <div class="flex flex-row g-2 text-gray-400 text-white ">
<button @click="toggleMenu" class="w-8 h-8 text-gray-400 dark:text-gray-500 hover:text-blue-500 transition-all duration-100 flex flex-row"> <button @click="toggleMenu" class="w-8 h-8 text-white hover:text-blue-500 transition-all duration-100 flex flex-row">
<ComputerDesktopIcon/> <ComputerDesktopIcon/>
</button> </button>
<span class="ml-1 mr-2 text-[20px]">|</span> <span class="ml-1 mr-2 text-[20px]">|</span>
</div> </div>
<div class="text-gray-400">{{ currentDate }}</div> <div class="text-center align-middle justify-center text-white">{{ currentDate }}</div>
</div> </div>
<div class="w-full h-[2.5em]"></div> <div class="w-full h-[2.5em]"></div>
<Transition <Transition
@ -156,8 +137,8 @@ const toggleLangMenu = () => {
<!--Clerk--> <!--Clerk-->
<SignedOut> <SignedOut>
<SignInButton> <SignInButton>
<button @click="showLogin" class="w-8 h-8 text-gray-400 dark:text-gray-500 flex flex-row"> <button @click="openWindow('login')" class="w-8 h-8 text-gray-400 flex flex-row">
<UserIcon class="w-8 h-8 text-gray-400 dark:text-gray-500 hover:text-blue-500 transition-all duration-100" /> <UserIcon class="w-8 h-8 text-gray-400 hover:text-blue-500 transition-all duration-100" />
</button> </button>
</SignInButton> </SignInButton>
</SignedOut> </SignedOut>

View File

@ -0,0 +1,7 @@
# Status
## cna.py
Not working
## setn.py
Working

56
scraping/findText/cna.py Normal file
View File

@ -0,0 +1,56 @@
# BROKEN -- see the status notes in this folder: the CNA page markup may not
# match the selectors below; kept while debugging.
#
# Fetches a single CNA (中央社) article page and prints its title and body text.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io

# Load environment variables from .env file (DB credentials etc. for later use).
dotenv.load_dotenv()

# Browser-like headers so the site does not reject the request as a bot.
headers = {
    #'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*/*' is the valid wildcard form; a bare '*' is not a legal Accept value.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Only advertise encodings the decode path below can actually handle.
    # The original sent 'gzip, deflate, br', but a Brotli or raw-deflate
    # response would crash the gzip-only decoding branch.
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.cna.com.tw/news/aspt/202505110112.aspx"
# CSS class of the <div> that wraps the article body on cna.com.tw.
paragraph_css = "paragraph"

try:
    req = Request(url, headers=headers)
    # Use a context manager so the socket is closed even if decoding fails.
    with urlopen(req) as response:
        raw = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            html = gzip.GzipFile(fileobj=io.BytesIO(raw)).read().decode('utf-8')
        else:
            html = raw.decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    # Extract content: <h1> holds the headline, the paragraph div the body.
    title = soup.find('h1').text.strip() if soup.find('h1') else ""
    article = soup.find('div', class_=paragraph_css)
    paragraph = article.text.strip() if article else ""
    # Print results
    print(f"Title: {title}")
    print(f"Content: {paragraph}")
except Exception as e:
    print(f"Error: {str(e)}")

58
scraping/findText/setn.py Normal file
View File

@ -0,0 +1,58 @@
# Fetches a single SETN (三立新聞) article page and prints its title and body
# text; on failure it dumps the classes present in the fetched HTML to help
# locate the correct selectors.
import re
from urllib.request import urlopen, Request
import chardet
from bs4 import BeautifulSoup
import json
import psycopg2
import pandas as pd
import dotenv
import os
import gzip
import io

# Load environment variables from .env file (DB credentials etc. for later use).
dotenv.load_dotenv()

# Browser-like headers so the site does not reject the request as a bot.
headers = {
    #'User-Agent': 'NewsScraperBot/1.0 (https://github.com/hpware/news-analyze)',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # '*/*' is the valid wildcard form; a bare '*' is not a legal Accept value.
    'Accept': '*/*',
    'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    # Only advertise encodings the decode path below can actually handle.
    # The original sent 'gzip, deflate, br', but a Brotli or raw-deflate
    # response would crash the gzip-only decoding branch.
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Cache-Control': 'max-age=0',
}

url = "https://www.setn.com/News.aspx?NewsID=1654352"
# Tag name that wraps the article body on setn.com.
paragraph_css = "article"

try:
    req = Request(url, headers=headers)
    # Use a context manager so the socket is closed even if decoding fails.
    with urlopen(req) as response:
        raw = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            html = gzip.GzipFile(fileobj=io.BytesIO(raw)).read().decode('utf-8')
        else:
            html = raw.decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    # 'news-title-3' is the headline class on SETN article pages.
    title = soup.find('h1', class_='news-title-3')
    title_text = title.text.strip() if title else "No title found"
    article = soup.find('article')
    content = article.text.strip() if article else "No content found"
    # Print results
    print(f"Title: {title_text}")
    print(f"Content: {content}")
except Exception as e:
    print(f"Error: {str(e)}")
    # Debug aid: list the classes present in whatever HTML was parsed so the
    # selectors above can be corrected when the site layout changes.
    if 'soup' in locals():
        print("\nAvailable classes in HTML:")
        for tag in soup.find_all(class_=True):
            print(f"Tag: {tag.name}, Class: {tag['class']}")

View File

@ -1,5 +1,4 @@
# Going to love the stupid tab tab tab tab system, instead of using brances. import re
import re # Regular expressions
from urllib.request import urlopen # URL request lib. from urllib.request import urlopen # URL request lib.
from bs4 import BeautifulSoup # BeautifulSoup lib. from bs4 import BeautifulSoup # BeautifulSoup lib.
import json import json

View File

@ -3,3 +3,4 @@ beautifulsoup4
psycopg2-binary psycopg2-binary
pandas pandas
dotenv dotenv
chardet