From 7fdcedd1dd853304d3e35ab57b99bd34dcd7bb9b Mon Sep 17 00:00:00 2001 From: lila Date: Wed, 1 Apr 2026 02:43:55 +0200 Subject: [PATCH] wip --- packages/db/src/db/schema.ts | 5 +- packages/db/src/generating-decks.ts | 136 +++++++++++++++++++++------- 2 files changed, 106 insertions(+), 35 deletions(-) diff --git a/packages/db/src/db/schema.ts b/packages/db/src/db/schema.ts index 4babe8a..90604af 100644 --- a/packages/db/src/db/schema.ts +++ b/packages/db/src/db/schema.ts @@ -133,10 +133,7 @@ export const decks = pgTable( name: text().notNull(), description: text(), source_language: varchar({ length: 10 }).notNull(), - validated_for_languages: varchar({ length: 10 }) - .array() - .notNull() - .default([]), + validated_languages: varchar({ length: 10 }).array().notNull().default([]), is_public: boolean().default(false).notNull(), created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), }, diff --git a/packages/db/src/generating-decks.ts b/packages/db/src/generating-decks.ts index a056780..42e14dc 100644 --- a/packages/db/src/generating-decks.ts +++ b/packages/db/src/generating-decks.ts @@ -1,24 +1,42 @@ /* -- [x] Setup — hardcoded path, name, description, source language, POS -- [x] Read wordlist — load the 1000 nouns -- [x] Query terms — match to database, find which ones have translations -- [ ] writing missing words to textfile for future use -- [ ] Validation — determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both) -- [ ] Check idempotency — skip if deck exists -- [ ] Create deck — insert with discovered validated_for_languages -- [ ] Link terms — insert deck_terms -- [ ] Report — summary -*/ + * + * Builds the "top English nouns" deck from a curated wordlist of the 1000 most + * frequently used English nouns. The deck has English as its source language — + * meaning it was curated from an English-centric frequency list, and a separate + * deck would be needed for other source languages. 
For each word in the list, all
+ * matching term IDs are looked up in the database via the translations table
+ * (language: "en", POS: "noun") — homonyms are intentionally included as separate
+ * cards since the quiz UI displays a gloss alongside each word. Words from the
+ * list that have no DB match are skipped and written to a file for future
+ * reference. The script is idempotent: if the deck already exists, only terms
+ * present in the source but missing from the deck are added; terms already in the
+ * deck are left untouched; terms in the deck but absent from the source are never
+ * removed. After resolving all matched terms, the script determines
+ * validated_languages by checking which languages — excluding the source
+ * language — have full translation coverage across all matched terms, and updates
+ * the array on every run.
+ */
-// TODO: Wordlist contains 1000 lines but only 999 unique words (965 found + 34 missing = 999).
-// Likely cause: duplicate entry in top1000englishnouns file.
-// Investigate with: const unique = new Set(words); console.log(words.length - unique.size);
-// Fix either by deduplicating in code ([...new Set(words)]) or cleaning the source file.
+/*
+ * roadmap
+ *
+ * [x] Setup — hardcoded path, name, description, source language, POS
+ * [x] Read wordlist — load and deduplicate the 1000 nouns
+ * [x] Query terms — match to database, collect all term IDs per word (including homonyms)
+ * [x] Write missing words to file for future reference
+ * [x] Determine validated_languages — find languages (excluding source) with full coverage across all matched terms
+ * [ ] Check idempotency — if deck exists, diff matched terms against existing deck_terms
+ * [ ] Create deck if it doesn't exist — insert with name, source_language, validated_languages
+ * [ ] Add new terms — insert only term IDs present in source but missing from deck
+ * [ ] Update validated_languages — recalculate and update on every run
+ * [ ] Report — summary of words found, missing, added, and validated languages
+ */
 import fs from "node:fs/promises";
 import { db } from "@glossa/db";
-import { translations, terms } from "@glossa/db/schema";
+import { translations, terms, decks } from "@glossa/db/schema";
 import { inArray, and, eq } from "drizzle-orm";
+import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
 const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
 const nameOfDeck = "top english nouns";
@@ -27,12 +45,20 @@ const descriptionOfDeck =
 const sourceLanguage = "en";
 const sourcePOS = "noun";
+// new Set() automatically discards duplicate values,
+// and spreading it back with ... converts it to a plain array again.
+// So if "bank" appears twice in the file,
+// the resulting array will only contain it once.
const readingFromWordlist = async () => { const raw = await fs.readFile(pathToWordlist, "utf8"); - const words = raw - .split("\n") - .map((w) => w.trim().toLowerCase()) - .filter(Boolean); + const words = [ + ...new Set( + raw + .split("\n") + .map((w) => w.trim().toLowerCase()) + .filter(Boolean), + ), + ]; return words; }; @@ -49,37 +75,85 @@ const checkingSourceWordsAgainstDB = async (words: string[]) => { ), ); - // map word text to term_id - const wordToTermId = new Map(); + const wordToTermIds = new Map(); for (const row of rows) { const word = row.text.toLowerCase(); - if (!wordToTermId.has(word)) { - wordToTermId.set(word, row.termId); - } + const existing = wordToTermIds.get(word) ?? []; + wordToTermIds.set(word, [...existing, row.termId]); } - const termIds = Array.from(wordToTermId.values()); - const missingWords = words.filter((w) => !wordToTermId.has(w)); + const termIds = Array.from(wordToTermIds.values()).flat(); + const missingWords = words.filter((w) => !wordToTermIds.has(w)); return { termIds, missingWords }; }; -const writeMissingWordsToFile = async (words: string[]) => {}; +const writeMissingWordsToFile = async (missingWords: string[]) => { + const outputPath = `${pathToWordlist}-missing`; + await fs.writeFile(outputPath, missingWords.join("\n"), "utf8"); +}; + +const validateLanguages = async (sourceLanguage: string, termIds: string[]) => { + // create array of language code from the supported languages + // remove source language from it + const languages = SUPPORTED_LANGUAGE_CODES.filter( + (language) => language !== sourceLanguage, + ); + const validatedLanguages: string[] = []; + // For each remaining language, count how many of the termIds have a translation in that language + for (const language of languages) { + const rows = await db + .select({ termId: translations.term_id }) + .from(translations) + .where( + and( + inArray(translations.term_id, termIds), + eq(translations.language_code, language), + ), + ); + if (rows.length === 
termIds.length) {
+      validatedLanguages.push(language);
+    }
+  }
+
+  // If the count equals termIds.length → full coverage → include in result
+  // Return the array of fully covered languages
+  return validatedLanguages;
+};
+
+const findExistingDeck = async () => {
+  const existing = await db
+    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
+    .from(decks)
+    .where(
+      and(
+        eq(decks.name, nameOfDeck),
+        eq(decks.source_language, sourceLanguage),
+      ),
+    );
+  return existing[0] ?? null;
+};
 
 const main = async () => {
-  // Read and normalise the word list
+  // reading from source file
   console.log("📖 Reading word list...");
   const sourceWords = await readingFromWordlist();
   console.log(`   ${sourceWords.length} words loaded\n`);
 
-  // check if sourceWords exist in database
+  // checking if sourceWords exist in database
   console.log("🔍 Checking against database...");
   const { termIds, missingWords } =
     await checkingSourceWordsAgainstDB(sourceWords);
   console.log("words found in db: ", termIds.length);
-  console.log("words NOT found in db: ", missingWords.length);
+  console.log("words NOT found in db: ", missingWords.length, "\n");
 
-  // write missing words to file
+  // writing missing words to file
+  console.log("writing missing words to file...\n");
   await writeMissingWordsToFile(missingWords);
+
+  // validating languages
+  console.log("validating languages...");
+  const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
+  console.log("validated these languages: ", validatedLanguages, "\n");
 };
 
 main().catch((error) => {