feat(db): complete deck generation script for top english nouns

- add deck_terms to schema imports
- add addTermsToDeck — diffs source term IDs against existing deck_terms,
  inserts only new ones, returns count of inserted terms
- add updateValidatedLanguages — recalculates and persists validated_languages
  on every run so coverage stays accurate as translation data grows
- wire both functions into main with isNewDeck guard to avoid redundant
  validated_languages update on deck creation
- add final summary report
- fix possible undefined on result[0] in createDeck
- tick off remaining roadmap items
This commit is contained in:
lila 2026-04-01 17:56:31 +02:00
parent 7fdcedd1dd
commit 3bb8bfdb39
12 changed files with 442 additions and 875 deletions

View file

@ -0,0 +1,34 @@
a
other
us
may
st
paul
new
software
oxford
english
mary
japan
while
pp
membership
manchester
tony
alan
jones
un
northern
simon
behalf
co
graham
joe
guy
lewis
jane
taylor
co-operation
travel
self
thatcher

View file

@ -144,11 +144,11 @@ export const decks = pgTable(
),
check(
"validated_languages_check",
sql`validated_for_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
sql`validated_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
),
check(
"validated_languages_excludes_source",
sql`NOT (${table.source_language} = ANY(validated_for_languages))`,
sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`,
),
unique("unique_deck_name").on(table.name, table.source_language),
],

View file

@ -0,0 +1,302 @@
/*
*
* Builds the "top English nouns" deck from a curated wordlist of the 1000 most
* frequently used English nouns. The deck has English as its source language,
* meaning it was curated from an English-centric frequency list, and a separate
* deck would be needed for other source languages. For each word in the list, all
* matching term IDs are looked up in the database via the translations table
* (language: "en", POS: "noun"); homonyms are intentionally included as separate
* cards since the quiz UI displays a gloss alongside each word. Words from the
* list that have no DB match are skipped and written to a file for future
* reference. The script is idempotent: if the deck already exists, only terms
* present in the source but missing from the deck are added; terms already in the
* deck are left untouched; terms in the deck but absent from the source are never
* removed. After resolving all matched terms, the script determines
* validated_languages by checking which languages — excluding the source
* language — have full translation coverage across all matched terms, and updates
* the array on every run.
*/
/*
* roadmap
*
* [x] Setup - hardcoded path, name, description, source language, POS
* [x] Read wordlist - load and deduplicate the 1000 nouns
* [x] Query terms - match to database, collect all term IDs per word (including homonyms)
* [x] Write missing words to file for future reference
* [x] Determine validated_languages - find languages (excluding source) with full coverage across all matched terms
* [x] Check idempotency - if deck exists, diff matched terms against existing deck_terms
* [x] Create deck if it doesn't exist - insert with name, source_language, validated_languages
* [x] Add new terms - insert only term IDs present in source but missing from deck
* [x] Update validated_languages - recalculate and update on every run
* [x] Report - summary of words found, missing, added, and validated languages
*/
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations, terms, decks, deck_terms } from "@glossa/db/schema";
import { inArray, and, eq } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
// Location of the newline-delimited source wordlist, relative to the package root.
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
// Deck identity: (name, source_language) is the unique key used by findExistingDeck.
const nameOfDeck = "top english nouns";
const descriptionOfDeck =
"Most frequently used English nouns for vocabulary practice";
// Only translations with this language code and this POS are matched against the wordlist.
const sourceLanguage = "en";
const sourcePOS = "noun";
/**
 * Loads the wordlist file and returns its entries as a deduplicated array.
 * Each line is trimmed and lowercased and blank lines are dropped, so a word
 * like "bank" listed twice in the file appears only once in the result.
 * Insertion order of first occurrence is preserved.
 */
const readingFromWordlist = async () => {
  const contents = await fs.readFile(pathToWordlist, "utf8");
  const uniqueWords = new Set<string>();
  for (const line of contents.split("\n")) {
    const word = line.trim().toLowerCase();
    if (word) uniqueWords.add(word);
  }
  return [...uniqueWords];
};
/**
 * Resolves each source word to its term IDs via the translations table,
 * filtered to the source language and POS (through the terms join). A word
 * can map to several term IDs — homonyms are deliberately kept as separate
 * entries. Returns the deduplicated matched term IDs plus the words that
 * had no match at all.
 */
const checkingSourceWordsAgainstDB = async (words: string[]) => {
  const matches = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, sourceLanguage),
        eq(terms.pos, sourcePOS),
      ),
    );
  const wordToTermIds = new Map<string, string[]>();
  for (const { text, termId } of matches) {
    const key = text.toLowerCase();
    const ids = wordToTermIds.get(key);
    if (ids) {
      ids.push(termId);
    } else {
      wordToTermIds.set(key, [termId]);
    }
  }
  const termIds = [...new Set([...wordToTermIds.values()].flat())];
  const missingWords = words.filter((word) => !wordToTermIds.has(word));
  return { termIds, missingWords };
};
/**
 * Persists the unmatched words next to the source wordlist (same path with a
 * "-missing" suffix) so they can be reviewed and backfilled later.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const destination = `${pathToWordlist}-missing`;
  const contents = missingWords.join("\n");
  await fs.writeFile(destination, contents, "utf8");
};
/**
 * Determines which supported languages (excluding the source language) have a
 * translation for every matched term.
 *
 * For each candidate language, counts the distinct term IDs that have at
 * least one translation in that language; only full coverage
 * (count === termIds.length) qualifies. The per-language counts are
 * independent, so the queries run in parallel via Promise.all instead of the
 * previous one-at-a-time loop.
 *
 * @param sourceLanguage language code excluded from the candidates
 * @param termIds deduplicated term IDs the deck will contain
 * @returns language codes with full translation coverage
 */
const validateLanguages = async (
  sourceLanguage: string,
  termIds: string[],
): Promise<string[]> => {
  const candidates = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );
  const coverage = await Promise.all(
    candidates.map(async (language) => {
      const rows = await db
        .selectDistinct({ termId: translations.term_id })
        .from(translations)
        .where(
          and(
            inArray(translations.term_id, termIds),
            eq(translations.language_code, language),
          ),
        );
      return { language, covered: rows.length };
    }),
  );
  return coverage
    .filter(({ covered }) => covered === termIds.length)
    .map(({ language }) => language);
};
// Check idempotency — if deck exists, diff matched terms against existing deck_terms
const findExistingDeck = async () => {
const existing = await db
.select({ id: decks.id, validatedForLanguages: decks.validated_languages })
.from(decks)
.where(
and(
eq(decks.name, nameOfDeck),
eq(decks.source_language, sourceLanguage),
),
);
return existing[0] ?? null;
};
// Logs translation coverage per target language across all matched terms:
// how many terms are covered, and for the gaps, which English words lack a
// translation in that language. Purely diagnostic — performs no writes.
const logLanguageCoverage = async (termIds: string[]) => {
  const languages = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );
  // Sequential on purpose so each language's log lines stay grouped together.
  for (const language of languages) {
    const rows = await db
      .selectDistinct({ termId: translations.term_id })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, termIds),
          eq(translations.language_code, language),
        ),
      );
    console.log(
      ` ${language}: ${rows.length} / ${termIds.length} terms covered`,
    );
    const coveredIds = new Set(rows.map((r) => r.termId));
    const missingTermIds = termIds.filter((id) => !coveredIds.has(id));
    console.log(` missing term IDs count:`, missingTermIds.length);
    // Guard: skip the lookup when coverage is complete — avoids a pointless
    // round trip and an `inArray` on an empty ID list (which renders an
    // invalid empty IN () clause / throws in some drizzle versions).
    if (missingTermIds.length === 0) {
      console.log(` missing words in ${language}:`, [], "\n");
      continue;
    }
    const missingEnglish = await db
      .selectDistinct({ text: translations.text })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, missingTermIds),
          eq(translations.language_code, "en"),
        ),
      );
    console.log(
      ` missing words in ${language}:`,
      missingEnglish.map((r) => r.text),
      "\n",
    );
  }
};
// Inserts the deck row (private by default) and returns its generated ID.
// Throws if the insert unexpectedly returns no row.
const createDeck = async (validatedLanguages: string[]) => {
  const [inserted] = await db
    .insert(decks)
    .values({
      name: nameOfDeck,
      description: descriptionOfDeck,
      source_language: sourceLanguage,
      validated_languages: validatedLanguages,
      is_public: false,
    })
    .returning({ id: decks.id });
  if (!inserted) throw new Error("Failed to create deck: no row returned");
  return inserted.id;
};
/**
 * Diffs termIds against the existing deck_terms rows for this deck and
 * inserts only the IDs not already present. Existing rows are never touched
 * or removed.
 *
 * @returns the number of newly inserted terms (0 when nothing was missing)
 */
const addTermsToDeck = async (
  deckId: string,
  termIds: string[],
): Promise<number> => {
  const currentRows = await db
    .select({ termId: deck_terms.term_id })
    .from(deck_terms)
    .where(eq(deck_terms.deck_id, deckId));
  const alreadyInDeck = new Set(currentRows.map((row) => row.termId));
  const toInsert = termIds.filter((id) => !alreadyInDeck.has(id));
  if (toInsert.length === 0) return 0;
  const newRows = toInsert.map((termId) => ({
    deck_id: deckId,
    term_id: termId,
  }));
  await db.insert(deck_terms).values(newRows);
  return toInsert.length;
};
/**
 * Overwrites validated_languages on the deck row so the stored value tracks
 * the freshly recalculated coverage as translation data grows over time.
 */
const updateValidatedLanguages = async (
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  const targetDeck = eq(decks.id, deckId);
  await db
    .update(decks)
    .set({ validated_languages: validatedLanguages })
    .where(targetDeck);
};
// Orchestrates the full pipeline: read wordlist → resolve terms → report
// coverage → create-or-reuse deck → top up missing terms → refresh
// validated_languages → print a summary.
const main = async () => {
  console.log("📖 Reading word list...");
  const sourceWords = await readingFromWordlist();
  console.log(` ${sourceWords.length} words loaded\n`);

  console.log("🔍 Checking against database...");
  const { termIds, missingWords } =
    await checkingSourceWordsAgainstDB(sourceWords);
  console.log(` ${termIds.length} terms found`);
  console.log(` ${missingWords.length} words not found in DB\n`);

  console.log("🖊️ Writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  console.log("✅ Validating languages...");
  const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
  console.log(
    ` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
  );

  console.log("🔬 Language coverage breakdown...");
  await logLanguageCoverage(termIds);

  console.log("🃏 Looking for existing deck...");
  const existingDeck = await findExistingDeck();
  const isNewDeck = !existingDeck;
  let deckId: string;
  if (existingDeck) {
    console.log(` Found existing deck with id: ${existingDeck.id}\n`);
    deckId = existingDeck.id;
  } else {
    console.log(" No existing deck found, will create one\n");
    console.log("🆕 Creating deck...");
    deckId = await createDeck(validatedLanguages);
    console.log(` Deck created with id: ${deckId}\n`);
  }

  console.log(" Adding terms to deck...");
  const addedCount = await addTermsToDeck(deckId, termIds);
  const alreadyPresentCount = termIds.length - addedCount;
  console.log(` ${addedCount} terms added`);
  console.log(` ${alreadyPresentCount} terms already in deck\n`);

  // A freshly created deck already carries the current validated_languages,
  // so only pre-existing decks need the refresh.
  if (!isNewDeck) {
    console.log("🔄 Updating validated languages...");
    await updateValidatedLanguages(deckId, validatedLanguages);
    console.log(` Updated to: ${JSON.stringify(validatedLanguages)}\n`);
  }

  const rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
  console.log(rule);
  console.log("📊 Summary");
  console.log(rule);
  console.log(` Words loaded from wordlist : ${sourceWords.length}`);
  console.log(
    ` Words matched in DB : ${sourceWords.length - missingWords.length}`,
  );
  console.log(` Words not found in DB : ${missingWords.length}`);
  console.log(` Term IDs resolved : ${termIds.length}`);
  console.log(` Terms added to deck : ${addedCount}`);
  console.log(` Terms already in deck : ${alreadyPresentCount}`);
  console.log(
    ` Validated languages : ${validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none"}`,
  );
  console.log(rule);
};
// Entry point: log any failure and exit non-zero so schedulers/CI notice.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});

View file

@ -1,162 +0,0 @@
/*
*
* Builds the "top English nouns" deck from a curated wordlist of the 1000 most
* frequently used English nouns. The deck has English as its source language
* meaning it was curated from an English-centric frequency list, and a separate
* deck would be needed for other source languages. For each word in the list, all
* matching term IDs are looked up in the database via the translations table
* (language: "en", POS: "noun") homonyms are intentionally included as separate
* cards since the quiz UI displays a gloss alongside each word. Words from the
* list that have no DB match are skipped and written to a file for future
* reference. The script is idempotent: if the deck already exists, only terms
* present in the source but missing from the deck are added; terms already in the
* deck are left untouched; terms in the deck but absent from the source are never
* removed. After resolving all matched terms, the script determines
* validated_for_languages by checking which languages excluding the source
* language have full translation coverage across all matched terms, and updates
* the array on every run.
*/
/*
* roadmap
*
* [x] Setup hardcoded path, name, description, source language, POS
* [x] Read wordlist load and deduplicate the 1000 nouns
* [x] Query terms match to database, collect all term IDs per word (including homonyms)
* [x] Write missing words to file for future reference
* [x] Determine validated_for_languages find languages (excluding source) with full coverage across all matched terms
* [ ] Check idempotency if deck exists, diff matched terms against existing deck_terms
* [ ] Create deck if it doesn't exist insert with name, source_language, validated_for_languages
* [ ] Add new terms insert only term IDs present in source but missing from deck
* [ ] Update validated_for_languages recalculate and update on every run
* [ ] Report summary of words found, missing, added, and validated languages
*/
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations, terms, decks } from "@glossa/db/schema";
import { inArray, and, eq } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
const nameOfDeck = "top english nouns";
const descriptionOfDeck =
"Most frequently used English nouns for vocabulary practice";
const sourceLanguage = "en";
const sourcePOS = "noun";
// new Set() automatically discards duplicate values,
// and spreading it back with ... converts it to a plain array again.
// So if "bank" appears twice in the file,
// the resulting array will only contain it once.
const readingFromWordlist = async () => {
const raw = await fs.readFile(pathToWordlist, "utf8");
const words = [
...new Set(
raw
.split("\n")
.map((w) => w.trim().toLowerCase())
.filter(Boolean),
),
];
return words;
};
const checkingSourceWordsAgainstDB = async (words: string[]) => {
const rows = await db
.select({ text: translations.text, termId: translations.term_id })
.from(translations)
.innerJoin(terms, eq(translations.term_id, terms.id))
.where(
and(
inArray(translations.text, words),
eq(translations.language_code, sourceLanguage),
eq(terms.pos, sourcePOS),
),
);
const wordToTermIds = new Map<string, string[]>();
for (const row of rows) {
const word = row.text.toLowerCase();
const existing = wordToTermIds.get(word) ?? [];
wordToTermIds.set(word, [...existing, row.termId]);
}
const termIds = Array.from(wordToTermIds.values()).flat();
const missingWords = words.filter((w) => !wordToTermIds.has(w));
return { termIds, missingWords };
};
const writeMissingWordsToFile = async (missingWords: string[]) => {
const outputPath = `${pathToWordlist}-missing`;
await fs.writeFile(outputPath, missingWords.join("\n"), "utf8");
};
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
// create array of language code from the supported languages
// remove source language from it
const languages = SUPPORTED_LANGUAGE_CODES.filter(
(language) => language !== sourceLanguage,
);
const validatedLanguages: string[] = [];
// For each remaining language, count how many of the termIds have a translation in that language
for (const language of languages) {
const rows = await db
.select({ termId: translations.term_id })
.from(translations)
.where(
and(
inArray(translations.term_id, termIds),
eq(translations.language_code, language),
),
);
if (rows.length === termIds.length) {
validatedLanguages.push(language);
}
}
// If the count equals termIds.length → full coverage → include in result
// Return the array of fully covered languages
return validatedLanguages;
};
const findExistingDeck = async () => {
const existing = await db
.select({ id: decks.id, validatedForLanguages: decks.validated_languages })
.from(decks)
.where(
and(
eq(decks.name, nameOfDeck),
eq(decks.source_language, sourceLanguage),
),
);
return existing[0] ?? null;
};
const main = async () => {
// reading from source file
console.log("📖 Reading word list...");
const sourceWords = await readingFromWordlist();
console.log(` ${sourceWords.length} words loaded\n`);
// checking if sourceWords exist in database
console.log("🔍 Checking against database...");
const { termIds, missingWords } =
await checkingSourceWordsAgainstDB(sourceWords);
console.log("words found in db: ", termIds.length);
console.log("words NOT found in db: ", missingWords.length, "\n");
// writing missing words to file
console.log("writing missing words to file...\n");
await writeMissingWordsToFile(missingWords);
// validating languages
console.log("validation languages...");
const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
console.log("validated these languages: ", validatedLanguages, "\n");
};
main().catch((error) => {
console.error(error);
process.exit(1);
});