211 lines
6.6 KiB
TypeScript
211 lines
6.6 KiB
TypeScript
import fs from "node:fs/promises";
|
|
import { db } from "@glossa/db";
|
|
import { translations, terms, decks, deck_terms } from "@glossa/db/schema";
|
|
import { inArray, and, eq, ne, countDistinct } from "drizzle-orm";
|
|
|
|
/** Callback type accepted by `db.transaction`. */
type TransactionCallback = Parameters<typeof db.transaction>[0];

/**
 * Type of the first argument that `db.transaction` hands to its callback.
 * The deck helpers take a value of this type so their statements run through
 * the handle supplied by the caller.
 */
type DbOrTx = Parameters<TransactionCallback>[0];
|
|
|
|
/**
 * Static settings for this script. Declared `as const` so every value keeps
 * its literal type.
 */
const config = {
  /** Word list file read by `readWordList`; also the base of the "-missing" output path. */
  pathToWordlist: "./src/data/wordlists/top1000englishnouns",
  /** Deck name used both to look up an existing deck and to create a new one. */
  deckName: "top english nouns",
  /** Description stored on the deck when it is created. */
  deckDescription: "Most frequently used English nouns for vocabulary practice",
  /** Language code the word list is matched against and the deck is stored under. */
  sourceLanguage: "en",
  /** Part-of-speech value a term must have to be matched. */
  sourcePOS: "noun",
} as const;
|
|
|
|
/**
 * Reads the file at `config.pathToWordlist` as UTF-8 and turns it into a
 * word list.
 *
 * @returns Each non-empty line, trimmed and lower-cased, with duplicates
 *          removed; the first occurrence determines the position.
 */
const readWordList = async () => {
  const contents = await fs.readFile(config.pathToWordlist, "utf8");

  // A Set keeps insertion order, so deduplication preserves file order.
  const unique = new Set<string>();
  for (const line of contents.split("\n")) {
    const normalized = line.trim().toLowerCase();
    if (normalized) {
      unique.add(normalized);
    }
  }

  return [...unique];
};
|
|
|
|
/**
 * Resolves words to term IDs by matching them against translations.
 *
 * A translation matches when its text is one of `words`, its language code is
 * `config.sourceLanguage`, and the joined term's part of speech is
 * `config.sourcePOS`.
 *
 * @param words Words to look up. They are compared against the lower-cased
 *              translation text, so callers should pass lower-cased words.
 * @returns `termIds`: the distinct IDs of all matched terms;
 *          `missingWords`: the input words that produced no match.
 */
const resolveSourceTerms = async (
  words: string[],
): Promise<{ termIds: string[]; missingWords: string[] }> => {
  // Nothing to look up: skip the query so `inArray` never receives an empty list.
  if (words.length === 0) {
    return { termIds: [], missingWords: [] };
  }

  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, config.sourceLanguage),
        eq(terms.pos, config.sourcePOS),
      ),
    );

  const wordToTermIds = new Map<string, string[]>();
  for (const row of rows) {
    const word = row.text.toLowerCase();
    const ids = wordToTermIds.get(word);
    if (ids) {
      ids.push(row.termId);
    } else {
      wordToTermIds.set(word, [row.termId]);
    }
  }

  // Deduplicate: multiple words can map to the same term ID (e.g. via synonyms)
  const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
  const missingWords = words.filter((w) => !wordToTermIds.has(w));

  return { termIds, missingWords };
};
|
|
|
|
/**
 * Writes the given words, one per line, to a UTF-8 file whose path is
 * `config.pathToWordlist` with "-missing" appended.
 *
 * @param missingWords Words to write; an empty array produces an empty file.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const contents = missingWords.join("\n");
  await fs.writeFile(`${config.pathToWordlist}-missing`, contents, "utf8");
};
|
|
|
|
/**
 * Determines which languages have a translation for every given term.
 *
 * For each language code other than `sourceLanguage`, counts how many distinct
 * terms from `termIds` have a translation; a language is "validated" when that
 * count equals the number of distinct term IDs.
 *
 * @param sourceLanguage Language code excluded from the coverage query.
 * @param termIds Term IDs that must all be covered.
 * @returns `coverage`: one row per language with its distinct covered-term
 *          count; `validatedLanguages`: the languages with full coverage.
 *          Both are empty when `termIds` is empty.
 */
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  const fetchCoverage = () =>
    db
      .select({
        language: translations.language_code,
        coveredCount: countDistinct(translations.term_id),
      })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, termIds),
          ne(translations.language_code, sourceLanguage),
        ),
      )
      .groupBy(translations.language_code);
  type CoverageRow = Awaited<ReturnType<typeof fetchCoverage>>[number];

  // With no terms there is nothing to cover; skip the query so `inArray`
  // never receives an empty list.
  const coverage: CoverageRow[] =
    termIds.length > 0 ? await fetchCoverage() : [];

  // The query counts distinct term IDs, so compare against the distinct
  // count in case the caller passed duplicates.
  const expectedCount = new Set(termIds).size;
  const validatedLanguages = coverage
    .filter((row) => Number(row.coveredCount) === expectedCount)
    .map((row) => row.language);

  return { coverage, validatedLanguages };
};
|
|
|
|
/**
 * Looks up the deck whose name is `config.deckName` and whose source language
 * is `config.sourceLanguage`.
 *
 * @param tx Handle the query is issued through.
 * @returns The first matching row (`id` and `validatedForLanguages`), or
 *          `null` when no deck matches.
 */
const findExistingDeck = async (tx: DbOrTx) => {
  const matchesConfiguredDeck = and(
    eq(decks.name, config.deckName),
    eq(decks.source_language, config.sourceLanguage),
  );

  const [firstMatch] = await tx
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(matchesConfiguredDeck);

  return firstMatch ?? null;
};
|
|
|
|
/**
 * Inserts a new, non-public deck using the name, description and source
 * language from `config`.
 *
 * @param tx Handle the insert is issued through.
 * @param validatedLanguages Stored as the deck's `validated_languages`.
 * @returns The id of the inserted deck.
 * @throws Error when the insert returns no row.
 */
const createDeck = async (tx: DbOrTx, validatedLanguages: string[]) => {
  const [created] = await tx
    .insert(decks)
    .values({
      name: config.deckName,
      description: config.deckDescription,
      source_language: config.sourceLanguage,
      validated_languages: validatedLanguages,
      is_public: false,
    })
    .returning({ id: decks.id });

  if (!created) {
    throw new Error("Failed to create deck: no row returned");
  }

  return created.id;
};
|
|
|
|
/**
 * Links terms to a deck, ignoring links that already exist.
 *
 * @param tx Handle the insert is issued through.
 * @param deckId Deck to add the terms to.
 * @param termIds Terms to link; an empty array results in no query.
 * @returns The number of rows actually inserted. Rows skipped by the
 *          conflict clause are not counted.
 */
const addTermsToDeck = async (
  tx: DbOrTx,
  deckId: string,
  termIds: string[],
): Promise<number> => {
  if (termIds.length === 0) return 0;

  // `returning` only reports rows that were really inserted, so its length is
  // the true "added" count even when `onConflictDoNothing` skips some rows.
  const inserted = await tx
    .insert(deck_terms)
    .values(termIds.map((termId) => ({ deck_id: deckId, term_id: termId })))
    .onConflictDoNothing()
    .returning({ termId: deck_terms.term_id });

  return inserted.length;
};
|
|
|
|
/**
 * Overwrites the `validated_languages` value of a single deck.
 *
 * @param tx Handle the update is issued through.
 * @param deckId Id of the deck to update.
 * @param validatedLanguages New value for `validated_languages`.
 */
const updateValidatedLanguages = async (
  tx: DbOrTx,
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  const targetDeck = eq(decks.id, deckId);
  const patch = { validated_languages: validatedLanguages };

  await tx.update(decks).set(patch).where(targetDeck);
};
|
|
|
|
/**
 * Runs the whole script:
 *
 * 1. reads the word list,
 * 2. resolves the words to term IDs and writes the unmatched words to a file,
 * 3. computes per-language coverage of the resolved terms,
 * 4. inside one transaction, finds or creates the deck, adds the terms and
 *    refreshes the deck's validated languages when they differ,
 * 5. prints a summary.
 */
const main = async () => {
  console.log("📖 Reading word list...");
  const sourceWords = await readWordList();
  console.log(` ${sourceWords.length} words loaded\n`);

  console.log("🔍 Checking against database...");
  const { termIds, missingWords } = await resolveSourceTerms(sourceWords);
  console.log(` ${termIds.length} terms found`);
  console.log(` ${missingWords.length} words not found in DB\n`);

  console.log("🖊️ Writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  console.log("✅ Validating languages...");
  const { coverage, validatedLanguages } = await validateLanguages(
    config.sourceLanguage,
    termIds,
  );
  console.log(
    ` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
  );

  console.log("🔬 Language coverage breakdown...");
  for (const row of coverage) {
    console.log(
      ` ${row.language}: ${row.coveredCount} / ${termIds.length} terms covered`,
    );
  }

  console.log("🃏 Looking for existing deck...");
  const addedCount = await db.transaction(async (tx) => {
    const existingDeck = await findExistingDeck(tx);
    const deckId = existingDeck
      ? existingDeck.id
      : await createDeck(tx, validatedLanguages);

    const insertedCount = await addTermsToDeck(tx, deckId, termIds);

    // A deck created above already stores `validatedLanguages`, so only a
    // pre-existing deck can be out of date and need an update.
    if (existingDeck) {
      const currentLanguages = existingDeck.validatedForLanguages ?? [];
      // Compare sorted copies so that ordering differences alone do not count
      // as a change.
      const hasChanged =
        JSON.stringify([...currentLanguages].sort()) !==
        JSON.stringify([...validatedLanguages].sort());

      if (hasChanged) {
        await updateValidatedLanguages(tx, deckId, validatedLanguages);
      }
    }

    return insertedCount;
  });
  const alreadyPresentCount = termIds.length - addedCount;

  const divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
  console.log(divider);
  console.log("📊 Summary");
  console.log(divider);
  console.log(` Words loaded from wordlist : ${sourceWords.length}`);
  console.log(
    ` Words matched in DB : ${sourceWords.length - missingWords.length}`,
  );
  console.log(` Words not found in DB : ${missingWords.length}`);
  console.log(` Term IDs resolved : ${termIds.length}`);
  console.log(` Terms added to deck : ${addedCount}`);
  console.log(` Terms already in deck : ${alreadyPresentCount}`);
  console.log(
    ` Validated languages : ${validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none"}`,
  );
  console.log(divider);
};
|
|
|
|
/** Logs a failure raised by `main` and terminates the process with exit code 1. */
const reportFailureAndExit = (error: unknown): never => {
  console.error(error);
  process.exit(1);
};

// Script entry point.
main().catch(reportFailureAndExit);
|