lila/packages/db/src/generating-deck.ts
lila 3f7bc4111e chore: rename project from glossa to lila
- Update all package names from @glossa/* to @lila/*
- Update all imports, container names, volume names
- Update documentation references
- Recreate database with new credentials
2026-04-13 10:00:52 +02:00

211 lines
6.6 KiB
TypeScript

import fs from "node:fs/promises";
import { db } from "@lila/db";
import { translations, terms, decks, deck_terms } from "@lila/db/schema";
import { inArray, and, eq, ne, countDistinct } from "drizzle-orm";
/**
 * Type of the handle that `db.transaction` passes to its callback, taken from
 * the first parameter of that callback. The deck helpers in this file use it
 * as the type of their `tx` parameter.
 *
 * NOTE(review): the name suggests the root `db` object is accepted as well;
 * nothing in this file establishes that — confirm before passing `db` directly.
 */
type DbOrTx = Parameters<Parameters<typeof db.transaction>[0]>[0];
/** Settings for the single deck this script builds. */
const config = {
// Wordlist file read by `readWordList` (split on newlines). Unmatched words
// are written to the same path with a "-missing" suffix.
pathToWordlist: "./src/data/wordlists/top1000englishnouns",
// Together with `sourceLanguage`, used to look up an already existing deck.
deckName: "top english nouns",
// Stored as the deck description when the deck is created.
deckDescription: "Most frequently used English nouns for vocabulary practice",
// Language code the wordlist entries are matched under; also stored as the
// deck's `source_language`.
sourceLanguage: "en",
// Only terms whose `pos` equals this value are matched.
sourcePOS: "noun",
} as const;
/**
 * Loads the wordlist at `config.pathToWordlist`.
 *
 * Each line is trimmed and lowercased; blank lines are dropped and repeated
 * words are kept only once, in order of first appearance.
 *
 * @returns The normalized, de-duplicated words.
 */
const readWordList = async () => {
  const fileContents = await fs.readFile(config.pathToWordlist, "utf8");
  const uniqueWords = new Set<string>();
  for (const line of fileContents.split("\n")) {
    const normalized = line.trim().toLowerCase();
    if (normalized) {
      uniqueWords.add(normalized);
    }
  }
  return [...uniqueWords];
};
/**
 * Looks up the term IDs behind a list of words.
 *
 * A word matches when a translation row has exactly that text, its language
 * code equals `config.sourceLanguage`, and the joined term's `pos` equals
 * `config.sourcePOS`.
 *
 * @param words - Words to resolve. The "missing" check compares them against
 *   the lowercased text of the matched rows, so they are expected to be
 *   lowercase already (as `readWordList` produces them).
 * @returns `termIds`: the distinct term IDs of all matches; `missingWords`:
 *   the input words for which no row matched.
 */
const resolveSourceTerms = async (
  words: string[],
): Promise<{ termIds: string[]; missingWords: string[] }> => {
  // Nothing to look up: skip the query instead of sending an empty IN list,
  // which some drizzle-orm versions reject.
  if (words.length === 0) {
    return { termIds: [], missingWords: [] };
  }

  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, config.sourceLanguage),
        eq(terms.pos, config.sourcePOS),
      ),
    );

  const wordToTermIds = new Map<string, string[]>();
  for (const row of rows) {
    const word = row.text.toLowerCase();
    const idsForWord = wordToTermIds.get(word);
    if (idsForWord) {
      idsForWord.push(row.termId);
    } else {
      wordToTermIds.set(word, [row.termId]);
    }
  }

  // Deduplicate: multiple words can map to the same term ID (e.g. via synonyms)
  const termIds = [...new Set([...wordToTermIds.values()].flat())];
  const missingWords = words.filter((w) => !wordToTermIds.has(w));
  return { termIds, missingWords };
};
/**
 * Writes the unmatched words, one per line, to a file located at the wordlist
 * path with a "-missing" suffix. The file is written even when the list is
 * empty.
 *
 * @param missingWords - Words that could not be resolved to a term.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const destination = config.pathToWordlist + "-missing";
  const contents = missingWords.join("\n");
  await fs.writeFile(destination, contents, "utf8");
};
/**
 * Determines which languages have a translation for every given term.
 *
 * @param sourceLanguage - Language code excluded from the coverage check.
 * @param termIds - Terms that must all be covered. Duplicate IDs are counted
 *   once, so they cannot prevent a language from validating.
 * @returns `coverage`: per language, the number of distinct terms that have a
 *   translation in it; `validatedLanguages`: the languages whose count equals
 *   the number of distinct term IDs. Both are empty when `termIds` is empty.
 */
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  // The query counts DISTINCT term IDs, so compare against distinct input IDs.
  const uniqueTermIds = [...new Set(termIds)];

  // No terms means nothing to validate; skip the query rather than sending an
  // empty IN list, which some drizzle-orm versions reject.
  const coverage =
    uniqueTermIds.length === 0
      ? []
      : await db
          .select({
            language: translations.language_code,
            coveredCount: countDistinct(translations.term_id),
          })
          .from(translations)
          .where(
            and(
              inArray(translations.term_id, uniqueTermIds),
              ne(translations.language_code, sourceLanguage),
            ),
          )
          .groupBy(translations.language_code);

  const validatedLanguages = coverage
    // Number(): the aggregate may come back from the driver as a string.
    .filter((row) => Number(row.coveredCount) === uniqueTermIds.length)
    .map((row) => row.language);
  return { coverage, validatedLanguages };
};
/**
 * Looks for a deck whose name and source language match `config`.
 *
 * @param tx - Handle the query is run on.
 * @returns The first matching deck's `id` and stored validated languages, or
 *   `null` when no deck matches.
 */
const findExistingDeck = async (tx: DbOrTx) => {
  const isConfiguredDeck = and(
    eq(decks.name, config.deckName),
    eq(decks.source_language, config.sourceLanguage),
  );
  const [firstMatch] = await tx
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(isConfiguredDeck);
  return firstMatch ?? null;
};
/**
 * Inserts a new deck of type "core" using the name, description and source
 * language from `config`.
 *
 * @param tx - Handle the insert is run on.
 * @param validatedLanguages - Languages stored on the new deck.
 * @returns The id of the inserted deck.
 * @throws Error when the insert returns no row.
 */
const createDeck = async (tx: DbOrTx, validatedLanguages: string[]) => {
  const [inserted] = await tx
    .insert(decks)
    .values({
      name: config.deckName,
      description: config.deckDescription,
      source_language: config.sourceLanguage,
      validated_languages: validatedLanguages,
      type: "core",
    })
    .returning({ id: decks.id });
  if (inserted) {
    return inserted.id;
  }
  throw new Error("Failed to create deck: no row returned");
};
/**
 * Links terms to a deck, leaving links that already exist untouched.
 *
 * @param tx - Handle the insert is run on.
 * @param deckId - Deck to add the terms to.
 * @param termIds - Terms to link; an empty list performs no query.
 * @returns The number of links actually inserted. Rows skipped by the
 *   conflict clause are not counted, so callers can derive how many terms
 *   were already present.
 */
const addTermsToDeck = async (
  tx: DbOrTx,
  deckId: string,
  termIds: string[],
): Promise<number> => {
  if (termIds.length === 0) return 0;

  // RETURNING yields only the rows that were really inserted; rows dropped by
  // ON CONFLICT DO NOTHING are absent from the result.
  // NOTE(review): skipping duplicates presumes a unique/primary key over
  // (deck_id, term_id) — confirm against the schema.
  const insertedLinks = await tx
    .insert(deck_terms)
    .values(termIds.map((termId) => ({ deck_id: deckId, term_id: termId })))
    .onConflictDoNothing()
    .returning({ termId: deck_terms.term_id });
  return insertedLinks.length;
};
/**
 * Overwrites the validated languages stored on one deck.
 *
 * @param tx - Handle the update is run on.
 * @param deckId - Id of the deck to update.
 * @param validatedLanguages - New value for the deck's `validated_languages`.
 */
const updateValidatedLanguages = async (
  tx: DbOrTx,
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  const targetDeck = eq(decks.id, deckId);
  await tx
    .update(decks)
    .set({ validated_languages: validatedLanguages })
    .where(targetDeck);
};
/**
 * Runs the deck generation end to end:
 *
 * 1. read the wordlist,
 * 2. resolve the words to term IDs and write the unmatched words to a file,
 * 3. work out which languages cover every resolved term,
 * 4. inside one transaction, reuse or create the deck, link the terms and
 *    bring the deck's validated languages up to date,
 * 5. print a summary.
 *
 * Errors are not caught here; they reject the returned promise.
 */
const main = async () => {
  console.log("📖 Reading word list...");
  const sourceWords = await readWordList();
  console.log(` ${sourceWords.length} words loaded\n`);

  console.log("🔍 Checking against database...");
  const { termIds, missingWords } = await resolveSourceTerms(sourceWords);
  console.log(` ${termIds.length} terms found`);
  console.log(` ${missingWords.length} words not found in DB\n`);

  console.log("🖊️ Writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  console.log("✅ Validating languages...");
  const { coverage, validatedLanguages } = await validateLanguages(
    config.sourceLanguage,
    termIds,
  );
  console.log(
    ` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
  );

  console.log("🔬 Language coverage breakdown...");
  for (const row of coverage) {
    console.log(
      ` ${row.language}: ${row.coveredCount} / ${termIds.length} terms covered`,
    );
  }

  console.log("🃏 Looking for existing deck...");
  const addedCount = await db.transaction(async (tx) => {
    const existingDeck = await findExistingDeck(tx);
    const deckId = existingDeck
      ? existingDeck.id
      : await createDeck(tx, validatedLanguages);

    const insertedCount = await addTermsToDeck(tx, deckId, termIds);

    // A deck created just above already stores `validatedLanguages`, so only
    // a pre-existing deck can be out of date and need an UPDATE.
    if (existingDeck) {
      const storedLanguages = existingDeck.validatedForLanguages ?? [];
      // Compare as sorted copies so that ordering alone is not a change.
      const hasChanged =
        JSON.stringify([...storedLanguages].sort()) !==
        JSON.stringify([...validatedLanguages].sort());
      if (hasChanged) {
        await updateValidatedLanguages(tx, deckId, validatedLanguages);
      }
    }

    return insertedCount;
  });

  // NOTE(review): only meaningful if addTermsToDeck reports the rows that were
  // really inserted rather than the number of rows attempted — confirm.
  const alreadyPresentCount = termIds.length - addedCount;

  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
  console.log("📊 Summary");
  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
  console.log(` Words loaded from wordlist : ${sourceWords.length}`);
  console.log(
    ` Words matched in DB : ${sourceWords.length - missingWords.length}`,
  );
  console.log(` Words not found in DB : ${missingWords.length}`);
  console.log(` Term IDs resolved : ${termIds.length}`);
  console.log(` Terms added to deck : ${addedCount}`);
  console.log(` Terms already in deck : ${alreadyPresentCount}`);
  console.log(
    ` Validated languages : ${validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none"}`,
  );
  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
};
// Script entry point: run `main`; if it rejects, print the failure and end
// the process with exit code 1.
const reportAndExit = (failure: unknown): never => {
  console.error(failure);
  process.exit(1);
};
main().catch(reportAndExit);