import fs from "node:fs/promises";
import { db } from "@lila/db";
import { translations, terms, decks, deck_terms } from "@lila/db/schema";
import { inArray, and, eq, ne, countDistinct } from "drizzle-orm";

/**
 * Handle type received by a `db.transaction` callback. The helpers below take
 * it so that all deck writes run inside the same transaction.
 */
type DbOrTx = Parameters<Parameters<typeof db.transaction>[0]>[0];

/** Static settings for this seeding run. */
const config = {
  pathToWordlist: "./src/data/wordlists/top1000englishnouns",
  deckName: "top english nouns",
  deckDescription: "Most frequently used English nouns for vocabulary practice",
  sourceLanguage: "en",
  sourcePOS: "noun",
} as const;

/**
 * Reads the word list file and returns its entries trimmed, lower-cased,
 * with blank lines dropped and duplicates removed (first occurrence wins).
 */
const readWordList = async () => {
  const raw = await fs.readFile(config.pathToWordlist, "utf8");
  const words = [
    ...new Set(
      raw
        .split("\n")
        .map((w) => w.trim().toLowerCase())
        .filter(Boolean),
    ),
  ];
  return words;
};

/**
 * Looks up the term IDs whose source-language translation text matches one of
 * `words` and whose term has the configured part of speech.
 *
 * @param words Normalised (lower-cased) words to resolve.
 * @returns `termIds` — the distinct matching term IDs; `missingWords` — the
 *   input words for which no row was found.
 */
const resolveSourceTerms = async (
  words: string[],
): Promise<{ termIds: string[]; missingWords: string[] }> => {
  // Nothing to look up; also avoids handing an empty list to `inArray`.
  if (words.length === 0) {
    return { termIds: [], missingWords: [] };
  }

  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, config.sourceLanguage),
        eq(terms.pos, config.sourcePOS),
      ),
    );

  const wordToTermIds = new Map<string, string[]>();
  for (const row of rows) {
    const word = row.text.toLowerCase();
    const bucket = wordToTermIds.get(word);
    if (bucket) {
      bucket.push(row.termId);
    } else {
      wordToTermIds.set(word, [row.termId]);
    }
  }

  // Deduplicate: multiple words can map to the same term ID (e.g. via synonyms)
  const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
  const missingWords = words.filter((w) => !wordToTermIds.has(w));
  return { termIds, missingWords };
};

/**
 * Writes the unmatched words, one per line, to `<pathToWordlist>-missing`.
 * The file is overwritten on every run (it ends up empty when nothing is
 * missing).
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const outputPath = `${config.pathToWordlist}-missing`;
  await fs.writeFile(outputPath, missingWords.join("\n"), "utf8");
};

/**
 * Computes, for every language other than `sourceLanguage`, how many of the
 * given terms have at least one translation in that language.
 *
 * @param sourceLanguage Language code to exclude from the breakdown.
 * @param termIds Distinct term IDs to check.
 * @returns `coverage` — one row per language with its distinct covered-term
 *   count; `validatedLanguages` — the languages that cover every term.
 */
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  // With no terms there is nothing to cover; skip the query rather than
  // passing an empty list to `inArray`.
  if (termIds.length === 0) {
    return { coverage: [], validatedLanguages: [] };
  }

  const coverage = await db
    .select({
      language: translations.language_code,
      coveredCount: countDistinct(translations.term_id),
    })
    .from(translations)
    .where(
      and(
        inArray(translations.term_id, termIds),
        ne(translations.language_code, sourceLanguage),
      ),
    )
    .groupBy(translations.language_code);

  const validatedLanguages = coverage
    .filter((row) => Number(row.coveredCount) === termIds.length)
    .map((row) => row.language);
  return { coverage, validatedLanguages };
};

/**
 * Finds the deck with the configured name and source language.
 *
 * @returns The first matching deck's id and validated languages, or `null`
 *   when no deck matches.
 */
const findExistingDeck = async (tx: DbOrTx) => {
  const existing = await tx
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(
      and(
        eq(decks.name, config.deckName),
        eq(decks.source_language, config.sourceLanguage),
      ),
    );
  return existing[0] ?? null;
};

/**
 * Inserts the configured deck with type "core" and the given validated
 * languages.
 *
 * @returns The id of the created deck.
 * @throws Error when the insert returns no row.
 */
const createDeck = async (tx: DbOrTx, validatedLanguages: string[]) => {
  const result = await tx
    .insert(decks)
    .values({
      name: config.deckName,
      description: config.deckDescription,
      source_language: config.sourceLanguage,
      validated_languages: validatedLanguages,
      type: "core",
    })
    .returning({ id: decks.id });
  const created = result[0];
  if (!created) throw new Error("Failed to create deck: no row returned");
  return created.id;
};

/**
 * Links the given terms to the deck, silently skipping links that conflict
 * with existing rows.
 *
 * @returns The number of rows actually inserted. Conflicting rows are skipped
 *   by `onConflictDoNothing` and are not counted, so callers can derive how
 *   many terms were already in the deck.
 */
const addTermsToDeck = async (
  tx: DbOrTx,
  deckId: string,
  termIds: string[],
): Promise<number> => {
  if (termIds.length === 0) return 0;

  // RETURNING yields only the rows that were written, which is what lets us
  // tell new links apart from pre-existing ones.
  const inserted = await tx
    .insert(deck_terms)
    .values(termIds.map((termId) => ({ deck_id: deckId, term_id: termId })))
    .onConflictDoNothing()
    .returning({ termId: deck_terms.term_id });
  return inserted.length;
};

/** Overwrites the deck's validated-languages list. */
const updateValidatedLanguages = async (
  tx: DbOrTx,
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  await tx
    .update(decks)
    .set({ validated_languages: validatedLanguages })
    .where(eq(decks.id, deckId));
};

/**
 * Script entry point: loads the word list, resolves it to term IDs, records
 * the unmatched words, computes per-language coverage, then creates or
 * updates the deck inside a single transaction and prints a summary.
 */
const main = async () => {
  console.log("📖 Reading word list...");
  const sourceWords = await readWordList();
  console.log(` ${sourceWords.length} words loaded\n`);

  console.log("🔍 Checking against database...");
  const { termIds, missingWords } = await resolveSourceTerms(sourceWords);
  console.log(` ${termIds.length} terms found`);
  console.log(` ${missingWords.length} words not found in DB\n`);

  console.log("🖊️ Writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  console.log("✅ Validating languages...");
  const { coverage, validatedLanguages } = await validateLanguages(
    config.sourceLanguage,
    termIds,
  );
  console.log(
    ` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
  );

  console.log("🔬 Language coverage breakdown...");
  for (const row of coverage) {
    console.log(
      ` ${row.language}: ${row.coveredCount} / ${termIds.length} terms covered`,
    );
  }

  console.log("🃏 Looking for existing deck...");
  const addedCount = await db.transaction(async (tx) => {
    const existingDeck = await findExistingDeck(tx);
    const deckId = existingDeck
      ? existingDeck.id
      : await createDeck(tx, validatedLanguages);

    const inserted = await addTermsToDeck(tx, deckId, termIds);

    // A deck created above already carries `validatedLanguages`, so only a
    // pre-existing deck can be out of date. Compare order-insensitively.
    if (existingDeck) {
      const currentLanguages = existingDeck.validatedForLanguages ?? [];
      const hasChanged =
        JSON.stringify([...currentLanguages].sort()) !==
        JSON.stringify([...validatedLanguages].sort());
      if (hasChanged) {
        await updateValidatedLanguages(tx, deckId, validatedLanguages);
      }
    }

    return inserted;
  });

  const alreadyPresentCount = termIds.length - addedCount;

  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
  console.log("📊 Summary");
  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
  console.log(` Words loaded from wordlist : ${sourceWords.length}`);
  console.log(
    ` Words matched in DB : ${sourceWords.length - missingWords.length}`,
  );
  console.log(` Words not found in DB : ${missingWords.length}`);
  console.log(` Term IDs resolved : ${termIds.length}`);
  console.log(` Terms added to deck : ${addedCount}`);
  console.log(` Terms already in deck : ${alreadyPresentCount}`);
  console.log(
    ` Validated languages : ${validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none"}`,
  );
  console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
};

main().catch((error) => {
  console.error(error);
  process.exit(1);
});