From 7fdcedd1dd853304d3e35ab57b99bd34dcd7bb9b Mon Sep 17 00:00:00 2001 From: lila Date: Wed, 1 Apr 2026 02:43:55 +0200 Subject: [PATCH] wip --- packages/db/src/db/schema.ts | 5 +- packages/db/src/generating-decks.ts | 136 +++++++++++++++++++++------- 2 files changed, 106 insertions(+), 35 deletions(-) diff --git a/packages/db/src/db/schema.ts b/packages/db/src/db/schema.ts index 4babe8a..90604af 100644 --- a/packages/db/src/db/schema.ts +++ b/packages/db/src/db/schema.ts @@ -133,10 +133,7 @@ export const decks = pgTable( name: text().notNull(), description: text(), source_language: varchar({ length: 10 }).notNull(), - validated_for_languages: varchar({ length: 10 }) - .array() - .notNull() - .default([]), + validated_languages: varchar({ length: 10 }).array().notNull().default([]), is_public: boolean().default(false).notNull(), created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), }, diff --git a/packages/db/src/generating-decks.ts b/packages/db/src/generating-decks.ts index a056780..42e14dc 100644 --- a/packages/db/src/generating-decks.ts +++ b/packages/db/src/generating-decks.ts @@ -1,24 +1,42 @@ /* -- [x] Setup — hardcoded path, name, description, source language, POS -- [x] Read wordlist — load the 1000 nouns -- [x] Query terms — match to database, find which ones have translations -- [ ] writing missing words to textfile for future use -- [ ] Validation — determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both) -- [ ] Check idempotency — skip if deck exists -- [ ] Create deck — insert with discovered validated_for_languages -- [ ] Link terms — insert deck_terms -- [ ] Report — summary -*/ + * + * Builds the "top English nouns" deck from a curated wordlist of the 1000 most + * frequently used English nouns. The deck has English as its source language — + * meaning it was curated from an English-centric frequency list, and a separate + * deck would be needed for other source languages. 
For each word in the list, all
+ * matching term IDs are looked up in the database via the translations table
+ * (language: "en", POS: "noun") — homonyms are intentionally included as separate
+ * cards since the quiz UI displays a gloss alongside each word. Words from the
+ * list that have no DB match are skipped and written to a file for future
+ * reference. The script is idempotent: if the deck already exists, only terms
+ * present in the source but missing from the deck are added; terms already in the
+ * deck are left untouched; terms in the deck but absent from the source are never
+ * removed. After resolving all matched terms, the script determines
+ * validated_languages by checking which languages — excluding the source
+ * language — have full translation coverage across all matched terms, and updates
+ * the array on every run.
+ */
-// TODO: Wordlist contains 1000 lines but only 999 unique words (965 found + 34 missing = 999).
-// Likely cause: duplicate entry in top1000englishnouns file.
-// Investigate with: const unique = new Set(words); console.log(words.length - unique.size);
-// Fix either by deduplicating in code ([...new Set(words)]) or cleaning the source file.
+/*
+ * roadmap
+ *
+ * [x] Setup — hardcoded path, name, description, source language, POS
+ * [x] Read wordlist — load and deduplicate the 1000 nouns
+ * [x] Query terms — match to database, collect all term IDs per word (including homonyms)
+ * [x] Write missing words to file for future reference
+ * [x] Determine validated_languages — find languages (excluding source) with full coverage across all matched terms
+ * [ ] Check idempotency — if deck exists, diff matched terms against existing deck_terms
+ * [ ] Create deck if it doesn't exist — insert with name, source_language, validated_languages
+ * [ ] Add new terms — insert only term IDs present in source but missing from deck
+ * [ ] Update validated_languages — recalculate and update on every run
+ * [ ] Report — summary of words found, missing, added, and validated languages
+ */
 import fs from "node:fs/promises";
 import { db } from "@glossa/db";
-import { translations, terms } from "@glossa/db/schema";
+import { translations, terms, decks } from "@glossa/db/schema";
 import { inArray, and, eq } from "drizzle-orm";
+import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
 const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
 const nameOfDeck = "top english nouns";
@@ -27,12 +45,20 @@ const descriptionOfDeck =
 const sourceLanguage = "en";
 const sourcePOS = "noun";
+// new Set() automatically discards duplicate values,
+// and spreading it back with ... converts it to a plain array again.
+// So if "bank" appears twice in the file,
+// the resulting array will only contain it once.
const readingFromWordlist = async () => { const raw = await fs.readFile(pathToWordlist, "utf8"); - const words = raw - .split("\n") - .map((w) => w.trim().toLowerCase()) - .filter(Boolean); + const words = [ + ...new Set( + raw + .split("\n") + .map((w) => w.trim().toLowerCase()) + .filter(Boolean), + ), + ]; return words; }; @@ -49,37 +75,85 @@ const checkingSourceWordsAgainstDB = async (words: string[]) => { ), ); - // map word text to term_id - const wordToTermId = new Map(); + const wordToTermIds = new Map(); for (const row of rows) { const word = row.text.toLowerCase(); - if (!wordToTermId.has(word)) { - wordToTermId.set(word, row.termId); - } + const existing = wordToTermIds.get(word) ?? []; + wordToTermIds.set(word, [...existing, row.termId]); } - const termIds = Array.from(wordToTermId.values()); - const missingWords = words.filter((w) => !wordToTermId.has(w)); + const termIds = Array.from(wordToTermIds.values()).flat(); + const missingWords = words.filter((w) => !wordToTermIds.has(w)); return { termIds, missingWords }; }; -const writeMissingWordsToFile = async (words: string[]) => {}; +const writeMissingWordsToFile = async (missingWords: string[]) => { + const outputPath = `${pathToWordlist}-missing`; + await fs.writeFile(outputPath, missingWords.join("\n"), "utf8"); +}; + +const validateLanguages = async (sourceLanguage: string, termIds: string[]) => { + // create array of language code from the supported languages + // remove source language from it + const languages = SUPPORTED_LANGUAGE_CODES.filter( + (language) => language !== sourceLanguage, + ); + const validatedLanguages: string[] = []; + // For each remaining language, count how many of the termIds have a translation in that language + for (const language of languages) { + const rows = await db + .select({ termId: translations.term_id }) + .from(translations) + .where( + and( + inArray(translations.term_id, termIds), + eq(translations.language_code, language), + ), + ); + if (rows.length === 
termIds.length) {
+      validatedLanguages.push(language);
+    }
+  }
+
+  // If the count equals termIds.length → full coverage → include in result
+  // Return the array of fully covered languages
+  return validatedLanguages;
+};
+
+const findExistingDeck = async () => {
+  const existing = await db
+    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
+    .from(decks)
+    .where(
+      and(
+        eq(decks.name, nameOfDeck),
+        eq(decks.source_language, sourceLanguage),
+      ),
+    );
+  return existing[0] ?? null;
+};
 
 const main = async () => {
-  // Read and normalise the word list
+  // reading from source file
   console.log("📖 Reading word list...");
   const sourceWords = await readingFromWordlist();
   console.log(`   ${sourceWords.length} words loaded\n`);
 
-  // check if sourceWords exist in database
+  // checking if sourceWords exist in database
   console.log("🔍 Checking against database...");
   const { termIds, missingWords } =
     await checkingSourceWordsAgainstDB(sourceWords);
   console.log("words found in db: ", termIds.length);
-  console.log("words NOT found in db: ", missingWords.length);
+  console.log("words NOT found in db: ", missingWords.length, "\n");
 
-  // write missing words to file
+  // writing missing words to file
+  console.log("writing missing words to file...\n");
   await writeMissingWordsToFile(missingWords);
+
+  // validating languages
+  console.log("validating languages...");
+  const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
+  console.log("validated these languages: ", validatedLanguages, "\n");
 };
 
 main().catch((error) => {