wip

2026-04-01 02:43:55 +02:00 · 2026-04-01 02:43:55 +02:00 · 7fdcedd1dd
commit 7fdcedd1dd
parent a49bce4a5a
2 changed files with 106 additions and 35 deletions
--- a/packages/db/src/db/schema.ts
+++ b/packages/db/src/db/schema.ts
@ -133,10 +133,7 @@ export const decks = pgTable(
    name: text().notNull(),
    description: text(),
    source_language: varchar({ length: 10 }).notNull(),
-    validated_for_languages: varchar({ length: 10 })
-      .array()
-      .notNull()
-      .default([]),
+    validated_languages: varchar({ length: 10 }).array().notNull().default([]),
    is_public: boolean().default(false).notNull(),
    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
  },
--- a/packages/db/src/generating-decks.ts
+++ b/packages/db/src/generating-decks.ts
@ -1,24 +1,42 @@
 /*
- [x] Setup — hardcoded path, name, description, source language, POS
- [x] Read wordlist — load the 1000 nouns
- [x] Query terms — match to database, find which ones have translations
- [ ] writing missing words to textfile for future use
- [ ] Validation — determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both)
- [ ] Check idempotency — skip if deck exists
- [ ] Create deck — insert with discovered validated_for_languages
- [ ] Link terms — insert deck_terms
- [ ] Report — summary
-*/
+ *
+ * Builds the "top English nouns" deck from a curated wordlist of the 1000 most
+ * frequently used English nouns. The deck has English as its source language —
+ * meaning it was curated from an English-centric frequency list, and a separate
+ * deck would be needed for other source languages. For each word in the list, all
+ * matching term IDs are looked up in the database via the translations table
+ * (language: "en", POS: "noun") — homonyms are intentionally included as separate
+ * cards since the quiz UI displays a gloss alongside each word. Words from the
+ * list that have no DB match are skipped and written to a file for future
+ * reference. The script is idempotent: if the deck already exists, only terms
+ * present in the source but missing from the deck are added; terms already in the
+ * deck are left untouched; terms in the deck but absent from the source are never
+ * removed. After resolving all matched terms, the script determines
+ * validated_languages by checking which languages — excluding the source
+ * language — have full translation coverage across all matched terms, and updates
+ * the array on every run.
+ */

-// TODO: Wordlist contains 1000 lines but only 999 unique words (965 found + 34 missing = 999).
-// Likely cause: duplicate entry in top1000englishnouns file.
-// Investigate with: const unique = new Set(words); console.log(words.length - unique.size);
-// Fix either by deduplicating in code ([...new Set(words)]) or cleaning the source file.
+/*
+ * roadmap
+ *
+ * [x] Setup — hardcoded path, name, description, source language, POS
+ * [x] Read wordlist — load and deduplicate the 1000 nouns
+ * [x] Query terms — match to database, collect all term IDs per word (including homonyms)
+ * [x] Write missing words to file for future reference
+ * [x] Determine validated_languages — find languages (excluding source) with full coverage across all matched terms
+ * [ ] Check idempotency — if deck exists, diff matched terms against existing deck_terms
+ * [ ] Create deck if it doesn't exist — insert with name, source_language, validated_languages
+ * [ ] Add new terms — insert only term IDs present in source but missing from deck
+ * [ ] Update validated_languages — recalculate and update on every run
+ * [ ] Report — summary of words found, missing, added, and validated languages
+ */

 import fs from "node:fs/promises";
 import { db } from "@glossa/db";
-import { translations, terms } from "@glossa/db/schema";
+import { translations, terms, decks } from "@glossa/db/schema";
 import { inArray, and, eq } from "drizzle-orm";
+import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";

 const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
 const nameOfDeck = "top english nouns";
@ -27,12 +45,20 @@ const descriptionOfDeck =
 const sourceLanguage = "en";
 const sourcePOS = "noun";

+// new Set() automatically discards duplicate values,
+// and spreading it back with ... converts it to a plain array again.
+// So if "bank" appears twice in the file,
+// the resulting array will only contain it once.
 const readingFromWordlist = async () => {
  const raw = await fs.readFile(pathToWordlist, "utf8");
-  const words = raw
-    .split("\n")
-    .map((w) => w.trim().toLowerCase())
-    .filter(Boolean);
+  const words = [
+    ...new Set(
+      raw
+        .split("\n")
+        .map((w) => w.trim().toLowerCase())
+        .filter(Boolean),
+    ),
+  ];
  return words;
 };

@ -49,37 +75,85 @@ const checkingSourceWordsAgainstDB = async (words: string[]) => {
      ),
    );

-  // map word text to term_id
-  const wordToTermId = new Map<string, string>();
+  const wordToTermIds = new Map<string, string[]>();
  for (const row of rows) {
    const word = row.text.toLowerCase();
-    if (!wordToTermId.has(word)) {
-      wordToTermId.set(word, row.termId);
-    }
+    const existing = wordToTermIds.get(word) ?? [];
+    wordToTermIds.set(word, [...existing, row.termId]);
  }
-  const termIds = Array.from(wordToTermId.values());
-  const missingWords = words.filter((w) => !wordToTermId.has(w));
+  const termIds = Array.from(wordToTermIds.values()).flat();
+  const missingWords = words.filter((w) => !wordToTermIds.has(w));

  return { termIds, missingWords };
 };

-const writeMissingWordsToFile = async (words: string[]) => {};
+// Writes the unmatched words, one per line, to "<pathToWordlist>-missing".
+const writeMissingWordsToFile = async (missingWords: string[]) => {
+  await fs.writeFile(pathToWordlist + "-missing", missingWords.join("\n"), "utf8");
+};
+
+// Returns the supported languages, other than sourceLanguage, in which every
+// matched term has at least one translation (none when termIds is empty).
+const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
+  // Repeated ids in termIds must only be counted once.
+  const uniqueTermIds = [...new Set(termIds)];
+  const validatedLanguages: string[] = [];
+  // Without matched terms nothing can be validated; 0 === 0 would pass every language.
+  if (uniqueTermIds.length === 0) return validatedLanguages;
+  const languages = SUPPORTED_LANGUAGE_CODES.filter(
+    (language) => language !== sourceLanguage,
+  );
+  for (const language of languages) {
+    const rows = await db
+      .select({ termId: translations.term_id })
+      .from(translations)
+      .where(
+        and(
+          inArray(translations.term_id, uniqueTermIds),
+          eq(translations.language_code, language),
+        ),
+      );
+    // Compare distinct terms, not rows: one term may have several translations here.
+    const coveredTermIds = new Set(rows.map((row) => row.termId));
+    if (coveredTermIds.size === uniqueTermIds.length) validatedLanguages.push(language);
+  }
+  return validatedLanguages;
+};
+
+// Finds the deck by name and source language; resolves to null when absent.
+const findExistingDeck = async () => {
+  const nameAndLanguageMatch = and(
+    eq(decks.name, nameOfDeck),
+    eq(decks.source_language, sourceLanguage),
+  );
+  const [firstMatch] = await db
+    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
+    .from(decks)
+    .where(nameAndLanguageMatch);
+  return firstMatch ?? null;
+};

 const main = async () => {
-  // Read and normalise the word list
+  // reading from source file
  console.log("📖 Reading word list...");
  const sourceWords = await readingFromWordlist();
  console.log(`   ${sourceWords.length} words loaded\n`);

-  // check if sourceWords exist in database
+  // checking if sourceWords exist in database
  console.log("🔍 Checking against database...");
  const { termIds, missingWords } =
    await checkingSourceWordsAgainstDB(sourceWords);
  console.log("words found in db: ", termIds.length);
-  console.log("words NOT found in db: ", missingWords.length);
+  console.log("words NOT found in db: ", missingWords.length, "\n");

-  // write missing words to file
+  // writing missing words to file
+  console.log("writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);
+
+  // validating languages
+  console.log("validation languages...");
+  const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
+  console.log("validated these languages: ", validatedLanguages, "\n");
 };

 main().catch((error) => {