feat(db): complete deck generation script for top english nouns

- add deck_terms to schema imports
- add addTermsToDeck — diffs source term IDs against existing deck_terms,
  inserts only new ones, returns count of inserted terms
- add updateValidatedLanguages — recalculates and persists validated_languages
  on every run so coverage stays accurate as translation data grows
- wire both functions into main with isNewDeck guard to avoid redundant
  validated_languages update on deck creation
- add final summary report
- fix possible undefined on result[0] in createDeck
- tick off remaining roadmap items
This commit is contained in:
lila 2026-04-01 17:56:31 +02:00
parent 7fdcedd1dd
commit 3bb8bfdb39
12 changed files with 442 additions and 875 deletions

View file

@ -0,0 +1,34 @@
a
other
us
may
st
paul
new
software
oxford
english
mary
japan
while
pp
membership
manchester
tony
alan
jones
un
northern
simon
behalf
co
graham
joe
guy
lewis
jane
taylor
co-operation
travel
self
thatcher

View file

@ -144,11 +144,11 @@ export const decks = pgTable(
),
check(
"validated_languages_check",
sql`validated_for_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
sql`validated_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
),
check(
"validated_languages_excludes_source",
sql`NOT (${table.source_language} = ANY(validated_for_languages))`,
sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`,
),
unique("unique_deck_name").on(table.name, table.source_language),
],

View file

@ -0,0 +1,302 @@
/*
*
* Builds the "top English nouns" deck from a curated wordlist of the 1000 most
* frequently used English nouns. The deck has English as its source language,
* meaning it was curated from an English-centric frequency list, and a separate
* deck would be needed for other source languages. For each word in the list, all
* matching term IDs are looked up in the database via the translations table
* (language: "en", POS: "noun"); homonyms are intentionally included as separate
* cards since the quiz UI displays a gloss alongside each word. Words from the
* list that have no DB match are skipped and written to a file for future
* reference. The script is idempotent: if the deck already exists, only terms
* present in the source but missing from the deck are added; terms already in the
* deck are left untouched; terms in the deck but absent from the source are never
* removed. After resolving all matched terms, the script determines
* validated_languages by checking which languages — excluding the source
* language — have full translation coverage across all matched terms, and updates
* the array on every run.
*/
/*
* roadmap
*
* [x] Setup - hardcoded path, name, description, source language, POS
* [x] Read wordlist - load and deduplicate the 1000 nouns
* [x] Query terms - match to database, collect all term IDs per word (including homonyms)
* [x] Write missing words to file for future reference
* [x] Determine validated_languages - find languages (excluding source) with full coverage across all matched terms
* [x] Check idempotency - if deck exists, diff matched terms against existing deck_terms
* [x] Create deck if it doesn't exist - insert with name, source_language, validated_languages
* [x] Add new terms - insert only term IDs present in source but missing from deck
* [x] Update validated_languages - recalculate and update on every run
* [x] Report - summary of words found, missing, added, and validated languages
*/
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations, terms, decks, deck_terms } from "@glossa/db/schema";
import { inArray, and, eq } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
// Location of the newline-delimited source wordlist, relative to the package root.
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
// Deck identity: (name, source_language) is the unique key used by findExistingDeck.
const nameOfDeck = "top english nouns";
const descriptionOfDeck =
"Most frequently used English nouns for vocabulary practice";
// Only translations with this language code and this POS are matched against the wordlist.
const sourceLanguage = "en";
const sourcePOS = "noun";
/**
 * Loads the wordlist file and returns its entries as a deduplicated array.
 * Each line is trimmed and lowercased and blank lines are dropped, so a word
 * like "bank" listed twice in the file appears only once in the result.
 * Insertion order of first occurrence is preserved.
 */
const readingFromWordlist = async () => {
  const contents = await fs.readFile(pathToWordlist, "utf8");
  const uniqueWords = new Set<string>();
  for (const line of contents.split("\n")) {
    const word = line.trim().toLowerCase();
    if (word) uniqueWords.add(word);
  }
  return [...uniqueWords];
};
/**
 * Resolves each source word to its term IDs via the translations table,
 * filtered to the source language and POS (through the terms join). A word
 * can map to several term IDs — homonyms are deliberately kept as separate
 * entries. Returns the deduplicated matched term IDs plus the words that
 * had no match at all.
 */
const checkingSourceWordsAgainstDB = async (words: string[]) => {
  const matches = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, sourceLanguage),
        eq(terms.pos, sourcePOS),
      ),
    );
  const wordToTermIds = new Map<string, string[]>();
  for (const { text, termId } of matches) {
    const key = text.toLowerCase();
    const ids = wordToTermIds.get(key);
    if (ids) {
      ids.push(termId);
    } else {
      wordToTermIds.set(key, [termId]);
    }
  }
  const termIds = [...new Set([...wordToTermIds.values()].flat())];
  const missingWords = words.filter((word) => !wordToTermIds.has(word));
  return { termIds, missingWords };
};
/**
 * Persists the unmatched words next to the source wordlist (same path with a
 * "-missing" suffix) so they can be reviewed and backfilled later.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const destination = `${pathToWordlist}-missing`;
  const contents = missingWords.join("\n");
  await fs.writeFile(destination, contents, "utf8");
};
/**
 * Determines which supported languages (excluding the source language) have a
 * translation for every matched term.
 *
 * For each candidate language, counts the distinct term IDs that have at
 * least one translation in that language; only full coverage
 * (count === termIds.length) qualifies. The per-language counts are
 * independent, so the queries run in parallel via Promise.all instead of the
 * previous one-at-a-time loop.
 *
 * @param sourceLanguage language code excluded from the candidates
 * @param termIds deduplicated term IDs the deck will contain
 * @returns language codes with full translation coverage
 */
const validateLanguages = async (
  sourceLanguage: string,
  termIds: string[],
): Promise<string[]> => {
  const candidates = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );
  const coverage = await Promise.all(
    candidates.map(async (language) => {
      const rows = await db
        .selectDistinct({ termId: translations.term_id })
        .from(translations)
        .where(
          and(
            inArray(translations.term_id, termIds),
            eq(translations.language_code, language),
          ),
        );
      return { language, covered: rows.length };
    }),
  );
  return coverage
    .filter(({ covered }) => covered === termIds.length)
    .map(({ language }) => language);
};
// Check idempotency — if deck exists, diff matched terms against existing deck_terms
const findExistingDeck = async () => {
const existing = await db
.select({ id: decks.id, validatedForLanguages: decks.validated_languages })
.from(decks)
.where(
and(
eq(decks.name, nameOfDeck),
eq(decks.source_language, sourceLanguage),
),
);
return existing[0] ?? null;
};
// Logs translation coverage per target language across all matched terms:
// how many terms are covered, and for the gaps, which English words lack a
// translation in that language. Purely diagnostic — performs no writes.
const logLanguageCoverage = async (termIds: string[]) => {
  const languages = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );
  // Sequential on purpose so each language's log lines stay grouped together.
  for (const language of languages) {
    const rows = await db
      .selectDistinct({ termId: translations.term_id })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, termIds),
          eq(translations.language_code, language),
        ),
      );
    console.log(
      ` ${language}: ${rows.length} / ${termIds.length} terms covered`,
    );
    const coveredIds = new Set(rows.map((r) => r.termId));
    const missingTermIds = termIds.filter((id) => !coveredIds.has(id));
    console.log(` missing term IDs count:`, missingTermIds.length);
    // Guard: skip the lookup when coverage is complete — avoids a pointless
    // round trip and an `inArray` on an empty ID list (which renders an
    // invalid empty IN () clause / throws in some drizzle versions).
    if (missingTermIds.length === 0) {
      console.log(` missing words in ${language}:`, [], "\n");
      continue;
    }
    const missingEnglish = await db
      .selectDistinct({ text: translations.text })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, missingTermIds),
          eq(translations.language_code, "en"),
        ),
      );
    console.log(
      ` missing words in ${language}:`,
      missingEnglish.map((r) => r.text),
      "\n",
    );
  }
};
// Inserts the deck row (private by default) and returns its generated ID.
// Throws if the insert unexpectedly returns no row.
const createDeck = async (validatedLanguages: string[]) => {
  const [inserted] = await db
    .insert(decks)
    .values({
      name: nameOfDeck,
      description: descriptionOfDeck,
      source_language: sourceLanguage,
      validated_languages: validatedLanguages,
      is_public: false,
    })
    .returning({ id: decks.id });
  if (!inserted) throw new Error("Failed to create deck: no row returned");
  return inserted.id;
};
/**
 * Diffs termIds against the existing deck_terms rows for this deck and
 * inserts only the IDs not already present. Existing rows are never touched
 * or removed.
 *
 * @returns the number of newly inserted terms (0 when nothing was missing)
 */
const addTermsToDeck = async (
  deckId: string,
  termIds: string[],
): Promise<number> => {
  const currentRows = await db
    .select({ termId: deck_terms.term_id })
    .from(deck_terms)
    .where(eq(deck_terms.deck_id, deckId));
  const alreadyInDeck = new Set(currentRows.map((row) => row.termId));
  const toInsert = termIds.filter((id) => !alreadyInDeck.has(id));
  if (toInsert.length === 0) return 0;
  const newRows = toInsert.map((termId) => ({
    deck_id: deckId,
    term_id: termId,
  }));
  await db.insert(deck_terms).values(newRows);
  return toInsert.length;
};
/**
 * Overwrites validated_languages on the deck row so the stored value tracks
 * the freshly recalculated coverage as translation data grows over time.
 */
const updateValidatedLanguages = async (
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  const targetDeck = eq(decks.id, deckId);
  await db
    .update(decks)
    .set({ validated_languages: validatedLanguages })
    .where(targetDeck);
};
// Orchestrates the full pipeline: read wordlist → resolve terms → report
// coverage → create-or-reuse deck → top up missing terms → refresh
// validated_languages → print a summary.
const main = async () => {
  console.log("📖 Reading word list...");
  const sourceWords = await readingFromWordlist();
  console.log(` ${sourceWords.length} words loaded\n`);

  console.log("🔍 Checking against database...");
  const { termIds, missingWords } =
    await checkingSourceWordsAgainstDB(sourceWords);
  console.log(` ${termIds.length} terms found`);
  console.log(` ${missingWords.length} words not found in DB\n`);

  console.log("🖊️ Writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  console.log("✅ Validating languages...");
  const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
  console.log(
    ` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
  );

  console.log("🔬 Language coverage breakdown...");
  await logLanguageCoverage(termIds);

  console.log("🃏 Looking for existing deck...");
  const existingDeck = await findExistingDeck();
  const isNewDeck = !existingDeck;
  let deckId: string;
  if (existingDeck) {
    console.log(` Found existing deck with id: ${existingDeck.id}\n`);
    deckId = existingDeck.id;
  } else {
    console.log(" No existing deck found, will create one\n");
    console.log("🆕 Creating deck...");
    deckId = await createDeck(validatedLanguages);
    console.log(` Deck created with id: ${deckId}\n`);
  }

  console.log(" Adding terms to deck...");
  const addedCount = await addTermsToDeck(deckId, termIds);
  const alreadyPresentCount = termIds.length - addedCount;
  console.log(` ${addedCount} terms added`);
  console.log(` ${alreadyPresentCount} terms already in deck\n`);

  // A freshly created deck already carries the current validated_languages,
  // so only pre-existing decks need the refresh.
  if (!isNewDeck) {
    console.log("🔄 Updating validated languages...");
    await updateValidatedLanguages(deckId, validatedLanguages);
    console.log(` Updated to: ${JSON.stringify(validatedLanguages)}\n`);
  }

  const rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
  console.log(rule);
  console.log("📊 Summary");
  console.log(rule);
  console.log(` Words loaded from wordlist : ${sourceWords.length}`);
  console.log(
    ` Words matched in DB : ${sourceWords.length - missingWords.length}`,
  );
  console.log(` Words not found in DB : ${missingWords.length}`);
  console.log(` Term IDs resolved : ${termIds.length}`);
  console.log(` Terms added to deck : ${addedCount}`);
  console.log(` Terms already in deck : ${alreadyPresentCount}`);
  console.log(
    ` Validated languages : ${validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none"}`,
  );
  console.log(rule);
};
// Entry point: log any failure and exit non-zero so schedulers/CI notice.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});

View file

@ -1,162 +0,0 @@
/*
*
* Builds the "top English nouns" deck from a curated wordlist of the 1000 most
* frequently used English nouns. The deck has English as its source language
* meaning it was curated from an English-centric frequency list, and a separate
* deck would be needed for other source languages. For each word in the list, all
* matching term IDs are looked up in the database via the translations table
* (language: "en", POS: "noun") homonyms are intentionally included as separate
* cards since the quiz UI displays a gloss alongside each word. Words from the
* list that have no DB match are skipped and written to a file for future
* reference. The script is idempotent: if the deck already exists, only terms
* present in the source but missing from the deck are added; terms already in the
* deck are left untouched; terms in the deck but absent from the source are never
* removed. After resolving all matched terms, the script determines
* validated_for_languages by checking which languages excluding the source
* language have full translation coverage across all matched terms, and updates
* the array on every run.
*/
/*
* roadmap
*
* [x] Setup hardcoded path, name, description, source language, POS
* [x] Read wordlist load and deduplicate the 1000 nouns
* [x] Query terms match to database, collect all term IDs per word (including homonyms)
* [x] Write missing words to file for future reference
* [x] Determine validated_for_languages find languages (excluding source) with full coverage across all matched terms
* [ ] Check idempotency if deck exists, diff matched terms against existing deck_terms
* [ ] Create deck if it doesn't exist insert with name, source_language, validated_for_languages
* [ ] Add new terms insert only term IDs present in source but missing from deck
* [ ] Update validated_for_languages recalculate and update on every run
* [ ] Report summary of words found, missing, added, and validated languages
*/
import fs from "node:fs/promises";
import { db } from "@glossa/db";
import { translations, terms, decks } from "@glossa/db/schema";
import { inArray, and, eq } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
const nameOfDeck = "top english nouns";
const descriptionOfDeck =
"Most frequently used English nouns for vocabulary practice";
const sourceLanguage = "en";
const sourcePOS = "noun";
// new Set() automatically discards duplicate values,
// and spreading it back with ... converts it to a plain array again.
// So if "bank" appears twice in the file,
// the resulting array will only contain it once.
const readingFromWordlist = async () => {
const raw = await fs.readFile(pathToWordlist, "utf8");
const words = [
...new Set(
raw
.split("\n")
.map((w) => w.trim().toLowerCase())
.filter(Boolean),
),
];
return words;
};
const checkingSourceWordsAgainstDB = async (words: string[]) => {
const rows = await db
.select({ text: translations.text, termId: translations.term_id })
.from(translations)
.innerJoin(terms, eq(translations.term_id, terms.id))
.where(
and(
inArray(translations.text, words),
eq(translations.language_code, sourceLanguage),
eq(terms.pos, sourcePOS),
),
);
const wordToTermIds = new Map<string, string[]>();
for (const row of rows) {
const word = row.text.toLowerCase();
const existing = wordToTermIds.get(word) ?? [];
wordToTermIds.set(word, [...existing, row.termId]);
}
const termIds = Array.from(wordToTermIds.values()).flat();
const missingWords = words.filter((w) => !wordToTermIds.has(w));
return { termIds, missingWords };
};
const writeMissingWordsToFile = async (missingWords: string[]) => {
const outputPath = `${pathToWordlist}-missing`;
await fs.writeFile(outputPath, missingWords.join("\n"), "utf8");
};
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
// create array of language code from the supported languages
// remove source language from it
const languages = SUPPORTED_LANGUAGE_CODES.filter(
(language) => language !== sourceLanguage,
);
const validatedLanguages: string[] = [];
// For each remaining language, count how many of the termIds have a translation in that language
for (const language of languages) {
const rows = await db
.select({ termId: translations.term_id })
.from(translations)
.where(
and(
inArray(translations.term_id, termIds),
eq(translations.language_code, language),
),
);
if (rows.length === termIds.length) {
validatedLanguages.push(language);
}
}
// If the count equals termIds.length → full coverage → include in result
// Return the array of fully covered languages
return validatedLanguages;
};
const findExistingDeck = async () => {
const existing = await db
.select({ id: decks.id, validatedForLanguages: decks.validated_languages })
.from(decks)
.where(
and(
eq(decks.name, nameOfDeck),
eq(decks.source_language, sourceLanguage),
),
);
return existing[0] ?? null;
};
const main = async () => {
// reading from source file
console.log("📖 Reading word list...");
const sourceWords = await readingFromWordlist();
console.log(` ${sourceWords.length} words loaded\n`);
// checking if sourceWords exist in database
console.log("🔍 Checking against database...");
const { termIds, missingWords } =
await checkingSourceWordsAgainstDB(sourceWords);
console.log("words found in db: ", termIds.length);
console.log("words NOT found in db: ", missingWords.length, "\n");
// writing missing words to file
console.log("writing missing words to file...\n");
await writeMissingWordsToFile(missingWords);
// validating languages
console.log("validation languages...");
const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
console.log("validated these languages: ", validatedLanguages, "\n");
};
main().catch((error) => {
console.error(error);
process.exit(1);
});