diff --git a/packages/db/src/generating-decks.ts b/packages/db/src/generating-decks.ts index 10f84b2..a056780 100644 --- a/packages/db/src/generating-decks.ts +++ b/packages/db/src/generating-decks.ts @@ -2,6 +2,7 @@ - [x] Setup — hardcoded path, name, description, source language, POS - [x] Read wordlist — load the 1000 nouns - [x] Query terms — match to database, find which ones have translations +- [ ] Write missing words — save unmatched words to a text file for future use - [ ] Validation — determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both) - [ ] Check idempotency — skip if deck exists - [ ] Create deck — insert with discovered validated_for_languages @@ -9,6 +10,11 @@ - [ ] Report — summary */ +// TODO: Wordlist contains 1000 lines but only 999 unique words (965 found + 34 missing = 999). +// Likely cause: a duplicate entry in the top1000englishnouns file. +// Investigate with: const unique = new Set(words); console.log(words.length - unique.size); +// Fix either by deduplicating in code ([...new Set(words)]) or by cleaning the source file. + import fs from "node:fs/promises"; import { db } from "@glossa/db"; import { translations, terms } from "@glossa/db/schema"; @@ -64,12 +70,14 @@ const main = async () => { console.log("📖 Reading word list..."); const sourceWords = await readingFromWordlist(); console.log(` ${sourceWords.length} words loaded\n`); + // Check whether the source words exist in the database console.log("🔍 Checking against database..."); const { termIds, missingWords } = await checkingSourceWordsAgainstDB(sourceWords); console.log("words found in db: ", termIds.length); console.log("words NOT found in db: ", missingWords.length); + // Write the missing words to a file await writeMissingWordsToFile(missingWords); };