This commit is contained in:
lila 2026-04-01 02:43:55 +02:00
parent a49bce4a5a
commit 7fdcedd1dd
2 changed files with 106 additions and 35 deletions

View file

@ -133,10 +133,7 @@ export const decks = pgTable(
name: text().notNull(), name: text().notNull(),
description: text(), description: text(),
source_language: varchar({ length: 10 }).notNull(), source_language: varchar({ length: 10 }).notNull(),
validated_for_languages: varchar({ length: 10 }) validated_languages: varchar({ length: 10 }).array().notNull().default([]),
.array()
.notNull()
.default([]),
is_public: boolean().default(false).notNull(), is_public: boolean().default(false).notNull(),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
}, },

View file

@ -1,24 +1,42 @@
/*
 * Builds the "top English nouns" deck from a curated wordlist of the 1000 most
 * frequently used English nouns. The deck has English as its source language —
 * meaning it was curated from an English-centric frequency list — and a separate
 * deck would be needed for other source languages. For each word in the list, all
 * matching term IDs are looked up in the database via the translations table
 * (language: "en", POS: "noun"); homonyms are intentionally included as separate
 * cards since the quiz UI displays a gloss alongside each word. Words from the
 * list that have no DB match are skipped and written to a file for future
 * reference. The script is idempotent: if the deck already exists, only terms
 * present in the source but missing from the deck are added; terms already in the
 * deck are left untouched; terms in the deck but absent from the source are never
 * removed. After resolving all matched terms, the script determines
 * validated_for_languages by checking which languages — excluding the source
 * language — have full translation coverage across all matched terms, and updates
 * the array on every run.
 */

/*
 * roadmap
 *
 * [x] Setup — hardcoded path, name, description, source language, POS
 * [x] Read wordlist — load and deduplicate the 1000 nouns
 * [x] Query terms — match to database, collect all term IDs per word (including homonyms)
 * [x] Write missing words to file for future reference
 * [x] Determine validated_for_languages — find languages (excluding source) with full coverage across all matched terms
 * [ ] Check idempotency — if deck exists, diff matched terms against existing deck_terms
 * [ ] Create deck if it doesn't exist — insert with name, source_language, validated_for_languages
 * [ ] Add new terms — insert only term IDs present in source but missing from deck
 * [ ] Update validated_for_languages — recalculate and update on every run
 * [ ] Report summary of words found, missing, added, and validated languages
 */
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import { db } from "@glossa/db"; import { db } from "@glossa/db";
import { translations, terms } from "@glossa/db/schema"; import { translations, terms, decks } from "@glossa/db/schema";
import { inArray, and, eq } from "drizzle-orm"; import { inArray, and, eq } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
const pathToWordlist = "./src/data/wordlists/top1000englishnouns"; const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
const nameOfDeck = "top english nouns"; const nameOfDeck = "top english nouns";
@ -27,12 +45,20 @@ const descriptionOfDeck =
const sourceLanguage = "en"; const sourceLanguage = "en";
const sourcePOS = "noun"; const sourcePOS = "noun";
// new Set() automatically discards duplicate values,
// and spreading it back with ... converts it to a plain array again.
// So if "bank" appears twice in the file,
// the resulting array will only contain it once.
const readingFromWordlist = async () => { const readingFromWordlist = async () => {
const raw = await fs.readFile(pathToWordlist, "utf8"); const raw = await fs.readFile(pathToWordlist, "utf8");
const words = raw const words = [
.split("\n") ...new Set(
.map((w) => w.trim().toLowerCase()) raw
.filter(Boolean); .split("\n")
.map((w) => w.trim().toLowerCase())
.filter(Boolean),
),
];
return words; return words;
}; };
@ -49,37 +75,85 @@ const checkingSourceWordsAgainstDB = async (words: string[]) => {
), ),
); );
// map word text to term_id const wordToTermIds = new Map<string, string[]>();
const wordToTermId = new Map<string, string>();
for (const row of rows) { for (const row of rows) {
const word = row.text.toLowerCase(); const word = row.text.toLowerCase();
if (!wordToTermId.has(word)) { const existing = wordToTermIds.get(word) ?? [];
wordToTermId.set(word, row.termId); wordToTermIds.set(word, [...existing, row.termId]);
}
} }
const termIds = Array.from(wordToTermId.values()); const termIds = Array.from(wordToTermIds.values()).flat();
const missingWords = words.filter((w) => !wordToTermId.has(w)); const missingWords = words.filter((w) => !wordToTermIds.has(w));
return { termIds, missingWords }; return { termIds, missingWords };
}; };
/**
 * Persists the words that had no database match to a sibling file
 * (`<wordlist>-missing`), one word per line, for future curation.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  await fs.writeFile(`${pathToWordlist}-missing`, missingWords.join("\n"), "utf8");
};
/**
 * Determines which supported languages — excluding the source language — have a
 * translation for every one of the deck's matched terms.
 *
 * @param sourceLanguage - language code the deck was curated from; never included in the result
 * @param termIds - IDs of all terms matched for the deck
 * @returns language codes with full translation coverage across all termIds
 */
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  // Guard the empty deck: without this, `coveredTerms.size === termIds.length`
  // is 0 === 0 for every language, and ALL languages would be reported as
  // fully validated.
  if (termIds.length === 0) return [];

  // Candidate languages: everything supported except the source language.
  const candidateLanguages = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );

  const validatedLanguages: string[] = [];
  for (const language of candidateLanguages) {
    // All translation rows in this language restricted to the deck's terms.
    const rows = await db
      .select({ termId: translations.term_id })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, termIds),
          eq(translations.language_code, language),
        ),
      );
    // Count DISTINCT terms rather than raw rows so duplicate translation rows
    // for the same (term, language) pair cannot fake full coverage.
    const coveredTerms = new Set(rows.map((row) => row.termId));
    if (coveredTerms.size === termIds.length) {
      validatedLanguages.push(language);
    }
  }
  return validatedLanguages;
};
const findExistingDeck = async () => {
const existing = await db
.select({ id: decks.id, validatedForLanguages: decks.validated_languages })
.from(decks)
.where(
and(
eq(decks.name, nameOfDeck),
eq(decks.source_language, sourceLanguage),
),
);
return existing[0] ?? null;
};
const main = async () => { const main = async () => {
// Read and normalise the word list // reading from source file
console.log("📖 Reading word list..."); console.log("📖 Reading word list...");
const sourceWords = await readingFromWordlist(); const sourceWords = await readingFromWordlist();
console.log(` ${sourceWords.length} words loaded\n`); console.log(` ${sourceWords.length} words loaded\n`);
// check if sourceWords exist in database // checking if sourceWords exist in database
console.log("🔍 Checking against database..."); console.log("🔍 Checking against database...");
const { termIds, missingWords } = const { termIds, missingWords } =
await checkingSourceWordsAgainstDB(sourceWords); await checkingSourceWordsAgainstDB(sourceWords);
console.log("words found in db: ", termIds.length); console.log("words found in db: ", termIds.length);
console.log("words NOT found in db: ", missingWords.length); console.log("words NOT found in db: ", missingWords.length, "\n");
// write missing words to file // writing missing words to file
console.log("writing missing words to file...\n");
await writeMissingWordsToFile(missingWords); await writeMissingWordsToFile(missingWords);
// validating languages
console.log("validation languages...");
const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
console.log("validated these languages: ", validatedLanguages, "\n");
}; };
main().catch((error) => { main().catch((error) => {