From f8879b307c6847fd005d535ea4bbbc0e7a043773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B3=E5=85=83=E7=9A=93?= Date: Wed, 21 May 2025 10:20:33 +0800 Subject: [PATCH] Switch to a different algorithm. --- about/scraping_line_today_home.md | 6 ++++- bun.lock | 3 +++ .../checks/checkKidUnfriendlyContent.ts | 16 +++++++------ database/kidunfriendlycontent.json | 3 ++- package.json | 1 + pages/test.vue | 24 +++++++++++++++++++ pages/tools/checkweirdkeywords.vue | 10 ++++---- .../api/contentcheck/kidunfriendlycontent.ts | 7 ++++++ server/api/home/uuid_lt/action.ts | 18 +++++++++++++- 9 files changed, 72 insertions(+), 16 deletions(-) create mode 100644 pages/test.vue diff --git a/about/scraping_line_today_home.md b/about/scraping_line_today_home.md index f0523ff..f6d3872 100644 --- a/about/scraping_line_today_home.md +++ b/about/scraping_line_today_home.md @@ -106,4 +106,8 @@ Apperently, there is something called a "hybrid listing" which is s simple recom ``` This is 100% easier to work with, and with a another extra, I can easily search shitty news terms. Also there is as category type??? What? -Also the id can just work with the following pattern in regex: `news_cat:[a-zA-Z0-9]{24}` +Also the id can just work with the following pattern in regex: `news_cat:[a-zA-Z0-9]{24}`, there is also `top_foryou:[a-zA-Z0-9]{24}` + +### Hybrid listings? 
+- news_cat +- top_foryou diff --git a/bun.lock b/bun.lock index 34fe786..d0be1e6 100644 --- a/bun.lock +++ b/bun.lock @@ -17,6 +17,7 @@ "@tailwindcss/vite": "^4.1.5", "@uploadthing/nuxt": "^7.1.7", "@vueuse/core": "^13.1.0", + "ahocorasick": "^1.0.2", "animate.css": "^4.1.1", "argon2": "^0.43.0", "axios": "^1.9.0", @@ -884,6 +885,8 @@ "agentkeepalive": ["agentkeepalive@4.6.0", "", { "dependencies": { "humanize-ms": "^1.2.1" } }, "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ=="], + "ahocorasick": ["ahocorasick@1.0.2", "", {}, "sha512-hCOfMzbFx5IDutmWLAt6MZwOUjIfSM9G9FyVxytmE4Rs/5YDPWQrD/+IR1w+FweD9H2oOZEnv36TmkjhNURBVA=="], + "ajv": ["ajv@6.12.6", "", { "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", "json-schema-traverse": "^0.4.1", "uri-js": "^4.2.2" } }, "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g=="], "algoliasearch": ["algoliasearch@5.25.0", "", { "dependencies": { "@algolia/client-abtesting": "5.25.0", "@algolia/client-analytics": "5.25.0", "@algolia/client-common": "5.25.0", "@algolia/client-insights": "5.25.0", "@algolia/client-personalization": "5.25.0", "@algolia/client-query-suggestions": "5.25.0", "@algolia/client-search": "5.25.0", "@algolia/ingestion": "1.25.0", "@algolia/monitoring": "1.25.0", "@algolia/recommend": "5.25.0", "@algolia/requester-browser-xhr": "5.25.0", "@algolia/requester-fetch": "5.25.0", "@algolia/requester-node-http": "5.25.0" } }, "sha512-n73BVorL4HIwKlfJKb4SEzAYkR3Buwfwbh+MYxg2mloFph2fFGV58E90QTzdbfzWrLn4HE5Czx/WTjI8fcHaMg=="], diff --git a/components/checks/checkKidUnfriendlyContent.ts b/components/checks/checkKidUnfriendlyContent.ts index a5405fa..75350c2 100644 --- a/components/checks/checkKidUnfriendlyContent.ts +++ b/components/checks/checkKidUnfriendlyContent.ts @@ -1,14 +1,16 @@ -import NewsAnalyzer from "~/components/newsAnalyzer"; -const newsAnalyzer = new NewsAnalyzer(); +// Trying 
out the ahocorasick algorithm +// Recommended by: https://www.threads.com/@hsinspeng/post/DJ3yVGQxBg7 +import AhoCorasick from "ahocorasick"; + async function checkUnsafeContent(title: string) { try { const req = await fetch("/api/contentcheck/kidunfriendlycontent"); const res = await req.json(); - const patterns = res.words.map((word) => new RegExp(word, "i")); - console.log(patterns); - newsAnalyzer.setSensitivePatterns(patterns); - const kidfriendly = newsAnalyzer.isKidFriendly(title); - return !kidfriendly; + console.log(res.words); + const ac = new AhoCorasick(res.words); + const kidfriendly = ac.search(title); + console.log(kidfriendly); + return kidfriendly; } catch (e) { console.log(e); } diff --git a/database/kidunfriendlycontent.json b/database/kidunfriendlycontent.json index b4ad51a..276fd66 100644 --- a/database/kidunfriendlycontent.json +++ b/database/kidunfriendlycontent.json @@ -20,6 +20,7 @@ "大露", "色誘", "死亡", - "撩妹" + "撩妹", + "裸上身" ] } diff --git a/package.json b/package.json index 1b4a8de..1614a9e 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "@tailwindcss/vite": "^4.1.5", "@uploadthing/nuxt": "^7.1.7", "@vueuse/core": "^13.1.0", + "ahocorasick": "^1.0.2", "animate.css": "^4.1.1", "argon2": "^0.43.0", "axios": "^1.9.0", diff --git a/pages/test.vue b/pages/test.vue new file mode 100644 index 0000000..aa6d46b --- /dev/null +++ b/pages/test.vue @@ -0,0 +1,24 @@ + + diff --git a/pages/tools/checkweirdkeywords.vue b/pages/tools/checkweirdkeywords.vue index 4670572..98fee1d 100644 --- a/pages/tools/checkweirdkeywords.vue +++ b/pages/tools/checkweirdkeywords.vue @@ -2,9 +2,10 @@ import CheckKidUnfriendlyContent from "~/components/checks/checkKidUnfriendlyContent"; const title = ref(""); const system = ref(false); +const testingReturn = ref(""); const checkTitle = async () => { if (!title.value) return; - system.value = await CheckKidUnfriendlyContent(title.value); + testingReturn.value = await CheckKidUnfriendlyContent(title.value); }; 
useSeoMeta({ title: "這個文章是不是使用偏色情的標體?", @@ -16,11 +17,7 @@ useSeoMeta({ >

這個文章是不是使用偏色情的標體?

- + @@ -28,5 +25,6 @@ useSeoMeta({ 不是
+
{{ testingReturn }}
diff --git a/server/api/contentcheck/kidunfriendlycontent.ts b/server/api/contentcheck/kidunfriendlycontent.ts index 9425b6e..7647ce8 100644 --- a/server/api/contentcheck/kidunfriendlycontent.ts +++ b/server/api/contentcheck/kidunfriendlycontent.ts @@ -17,6 +17,13 @@ export default defineEventHandler(async (event) => { "裸照", "性感", "找妹", + "肉蹼", + "超兇北半球", + "大露", + "色誘", + "死亡", + "撩妹", + "裸上身", ], }; }); diff --git a/server/api/home/uuid_lt/action.ts b/server/api/home/uuid_lt/action.ts index d4f055c..f5a38bd 100644 --- a/server/api/home/uuid_lt/action.ts +++ b/server/api/home/uuid_lt/action.ts @@ -73,7 +73,23 @@ export default defineEventHandler(async (event) => { if (noDup.includes(key)) { return; } else { - noDup.push(key); + noDup.push({ + type: "nuuid", + content: key, + }); + } + }); + const nonUUIDbutValidLinks = data.filter((id) => + /.*:[a-zA-Z0-9]{24}/g.test(id), + ); + nonUUIDbutValidLinks.forEach((key) => { + if (noDup.includes(key)) { + return; + } else { + noDup.push({ + type: "vUUID", + content: key, + }); } }); return {