refactoring
This commit is contained in:
parent
b0c0baf9ab
commit
cdedbc44cd
2 changed files with 416 additions and 175 deletions
|
|
@ -1,56 +1,20 @@
|
|||
/*
|
||||
*
|
||||
* Builds the "top English nouns" deck from a curated wordlist of the 1000 most
|
||||
* frequently used English nouns. The deck has English as its source language —
|
||||
* meaning it was curated from an English-centric frequency list, and a separate
|
||||
* deck would be needed for other source languages. For each word in the list, all
|
||||
* matching term IDs are looked up in the database via the translations table
|
||||
* (language: "en", POS: "noun") — homonyms are intentionally included as separate
|
||||
* cards since the quiz UI displays a gloss alongside each word. Words from the
|
||||
* list that have no DB match are skipped and written to a file for future
|
||||
* reference. The script is idempotent: if the deck already exists, only terms
|
||||
* present in the source but missing from the deck are added; terms already in the
|
||||
* deck are left untouched; terms in the deck but absent from the source are never
|
||||
* removed. After resolving all matched terms, the script determines
|
||||
 * validated_languages by checking which languages — excluding the source
|
||||
* language — have full translation coverage across all matched terms, and updates
|
||||
* the array on every run.
|
||||
*/
|
||||
|
||||
/*
|
||||
* roadmap
|
||||
*
|
||||
* [x] Setup - hardcoded path, name, description, source language, POS
|
||||
* [x] Read wordlist - load and deduplicate the 1000 nouns
|
||||
* [x] Query terms - match to database, collect all term IDs per word (including homonyms)
|
||||
* [x] Write missing words to file for future reference
|
||||
* [x] Determine validated_languages - find languages (excluding source) with full coverage across all matched terms
|
||||
* [x] Check idempotency - if deck exists, diff matched terms against existing deck_terms
|
||||
* [x] Create deck if it doesn't exist - insert with name, source_language, validated_languages
|
||||
* [x] Add new terms - insert only term IDs present in source but missing from deck
|
||||
* [x] Update validated_languages - recalculate and update on every run
|
||||
* [x] Report - summary of words found, missing, added, and validated languages
|
||||
*/
|
||||
|
||||
import fs from "node:fs/promises";
|
||||
import { db } from "@glossa/db";
|
||||
import { translations, terms, decks, deck_terms } from "@glossa/db/schema";
|
||||
import { inArray, and, eq } from "drizzle-orm";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
|
||||
import { inArray, and, eq, count, ne } from "drizzle-orm";
|
||||
|
||||
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
|
||||
const nameOfDeck = "top english nouns";
|
||||
const descriptionOfDeck =
|
||||
"Most frequently used English nouns for vocabulary practice";
|
||||
const sourceLanguage = "en";
|
||||
const sourcePOS = "noun";
|
||||
type DbOrTx = Parameters<Parameters<typeof db.transaction>[0]>[0];
|
||||
|
||||
// new Set() automatically discards duplicate values,
|
||||
// and spreading it back with ... converts it to a plain array again.
|
||||
// So if "bank" appears twice in the file,
|
||||
// the resulting array will only contain it once.
|
||||
const readingFromWordlist = async () => {
|
||||
const raw = await fs.readFile(pathToWordlist, "utf8");
|
||||
const config = {
|
||||
pathToWordlist: "./src/data/wordlists/top1000englishnouns",
|
||||
deckName: "top english nouns",
|
||||
deckDescription: "Most frequently used English nouns for vocabulary practice",
|
||||
sourceLanguage: "en",
|
||||
sourcePOS: "noun",
|
||||
} as const;
|
||||
|
||||
const readWordList = async () => {
|
||||
const raw = await fs.readFile(config.pathToWordlist, "utf8");
|
||||
const words = [
|
||||
...new Set(
|
||||
raw
|
||||
|
|
@ -62,7 +26,7 @@ const readingFromWordlist = async () => {
|
|||
return words;
|
||||
};
|
||||
|
||||
const checkingSourceWordsAgainstDB = async (words: string[]) => {
|
||||
const resolveSourceTerms = async (words: string[]) => {
|
||||
const rows = await db
|
||||
.select({ text: translations.text, termId: translations.term_id })
|
||||
.from(translations)
|
||||
|
|
@ -70,17 +34,21 @@ const checkingSourceWordsAgainstDB = async (words: string[]) => {
|
|||
.where(
|
||||
and(
|
||||
inArray(translations.text, words),
|
||||
eq(translations.language_code, sourceLanguage),
|
||||
eq(terms.pos, sourcePOS),
|
||||
eq(translations.language_code, config.sourceLanguage),
|
||||
eq(terms.pos, config.sourcePOS),
|
||||
),
|
||||
);
|
||||
|
||||
const wordToTermIds = new Map<string, string[]>();
|
||||
for (const row of rows) {
|
||||
const word = row.text.toLowerCase();
|
||||
const existing = wordToTermIds.get(word) ?? [];
|
||||
wordToTermIds.set(word, [...existing, row.termId]);
|
||||
|
||||
if (!wordToTermIds.has(word)) {
|
||||
wordToTermIds.set(word, []);
|
||||
}
|
||||
wordToTermIds.get(word)!.push(row.termId);
|
||||
}
|
||||
// Deduplicate: multiple words can map to the same term ID (e.g. via synonyms)
|
||||
const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
|
||||
const missingWords = words.filter((w) => !wordToTermIds.has(w));
|
||||
|
||||
|
|
@ -88,100 +56,52 @@ const checkingSourceWordsAgainstDB = async (words: string[]) => {
|
|||
};
|
||||
|
||||
const writeMissingWordsToFile = async (missingWords: string[]) => {
|
||||
const outputPath = `${pathToWordlist}-missing`;
|
||||
const outputPath = `${config.pathToWordlist}-missing`;
|
||||
await fs.writeFile(outputPath, missingWords.join("\n"), "utf8");
|
||||
};
|
||||
|
||||
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
|
||||
  // create an array of language codes from the supported languages
|
||||
// remove source language from it
|
||||
const languages = SUPPORTED_LANGUAGE_CODES.filter(
|
||||
(language) => language !== sourceLanguage,
|
||||
);
|
||||
const validatedLanguages: string[] = [];
|
||||
// For each remaining language, count how many of the termIds have a translation in that language
|
||||
for (const language of languages) {
|
||||
const rows = await db
|
||||
.selectDistinct({ termId: translations.term_id })
|
||||
.from(translations)
|
||||
.where(
|
||||
and(
|
||||
inArray(translations.term_id, termIds),
|
||||
eq(translations.language_code, language),
|
||||
),
|
||||
);
|
||||
if (rows.length === termIds.length) {
|
||||
validatedLanguages.push(language);
|
||||
}
|
||||
}
|
||||
const coverage = await db
|
||||
.select({
|
||||
language: translations.language_code,
|
||||
coveredCount: count(translations.term_id),
|
||||
})
|
||||
.from(translations)
|
||||
.where(
|
||||
and(
|
||||
inArray(translations.term_id, termIds),
|
||||
ne(translations.language_code, sourceLanguage),
|
||||
),
|
||||
)
|
||||
.groupBy(translations.language_code);
|
||||
|
||||
// If the count equals termIds.length → full coverage → include in result
|
||||
// Return the array of fully covered languages
|
||||
return validatedLanguages;
|
||||
const validatedLanguages = coverage
|
||||
.filter((row) => Number(row.coveredCount) === termIds.length)
|
||||
.map((row) => row.language);
|
||||
|
||||
return { coverage, validatedLanguages };
|
||||
};
|
||||
|
||||
// Check idempotency — if deck exists, diff matched terms against existing deck_terms
|
||||
const findExistingDeck = async () => {
|
||||
const existing = await db
|
||||
const findExistingDeck = async (tx: DbOrTx) => {
|
||||
const existing = await tx
|
||||
.select({ id: decks.id, validatedForLanguages: decks.validated_languages })
|
||||
.from(decks)
|
||||
.where(
|
||||
and(
|
||||
eq(decks.name, nameOfDeck),
|
||||
eq(decks.source_language, sourceLanguage),
|
||||
eq(decks.name, config.deckName),
|
||||
eq(decks.source_language, config.sourceLanguage),
|
||||
),
|
||||
);
|
||||
return existing[0] ?? null;
|
||||
};
|
||||
|
||||
// logging translation coverage per language across all matched terms
|
||||
const logLanguageCoverage = async (termIds: string[]) => {
|
||||
const languages = SUPPORTED_LANGUAGE_CODES.filter(
|
||||
(language) => language !== sourceLanguage,
|
||||
);
|
||||
for (const language of languages) {
|
||||
const rows = await db
|
||||
.selectDistinct({ termId: translations.term_id })
|
||||
.from(translations)
|
||||
.where(
|
||||
and(
|
||||
inArray(translations.term_id, termIds),
|
||||
eq(translations.language_code, language),
|
||||
),
|
||||
);
|
||||
console.log(
|
||||
` ${language}: ${rows.length} / ${termIds.length} terms covered`,
|
||||
);
|
||||
|
||||
const coveredIds = new Set(rows.map((r) => r.termId));
|
||||
const missingTermIds = termIds.filter((id) => !coveredIds.has(id));
|
||||
console.log(` missing term IDs count:`, missingTermIds.length);
|
||||
|
||||
const missingEnglish = await db
|
||||
.selectDistinct({ text: translations.text })
|
||||
.from(translations)
|
||||
.where(
|
||||
and(
|
||||
inArray(translations.term_id, missingTermIds),
|
||||
eq(translations.language_code, "en"),
|
||||
),
|
||||
);
|
||||
console.log(
|
||||
` missing words in ${language}:`,
|
||||
missingEnglish.map((r) => r.text),
|
||||
"\n",
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
// Creates the deck row (name, description, source language, validated languages) and returns its id
|
||||
const createDeck = async (validatedLanguages: string[]) => {
|
||||
const result = await db
|
||||
const createDeck = async (tx: DbOrTx, validatedLanguages: string[]) => {
|
||||
const result = await tx
|
||||
.insert(decks)
|
||||
.values({
|
||||
name: nameOfDeck,
|
||||
description: descriptionOfDeck,
|
||||
source_language: sourceLanguage,
|
||||
name: config.deckName,
|
||||
description: config.deckDescription,
|
||||
source_language: config.sourceLanguage,
|
||||
validated_languages: validatedLanguages,
|
||||
is_public: false,
|
||||
})
|
||||
|
|
@ -191,36 +111,27 @@ const createDeck = async (validatedLanguages: string[]) => {
|
|||
return created.id;
|
||||
};
|
||||
|
||||
// Diffs termIds against the existing deck_terms for this deck and inserts only
|
||||
// the ones not already present. Returns the count of newly inserted terms.
|
||||
const addTermsToDeck = async (
|
||||
tx: DbOrTx,
|
||||
deckId: string,
|
||||
termIds: string[],
|
||||
): Promise<number> => {
|
||||
const existingRows = await db
|
||||
.select({ termId: deck_terms.term_id })
|
||||
.from(deck_terms)
|
||||
.where(eq(deck_terms.deck_id, deckId));
|
||||
if (termIds.length === 0) return 0;
|
||||
|
||||
const existingTermIds = new Set(existingRows.map((r) => r.termId));
|
||||
const newTermIds = termIds.filter((id) => !existingTermIds.has(id));
|
||||
|
||||
if (newTermIds.length === 0) return 0;
|
||||
|
||||
await db
|
||||
await tx
|
||||
.insert(deck_terms)
|
||||
.values(newTermIds.map((termId) => ({ deck_id: deckId, term_id: termId })));
|
||||
.values(termIds.map((termId) => ({ deck_id: deckId, term_id: termId })))
|
||||
.onConflictDoNothing();
|
||||
|
||||
return newTermIds.length;
|
||||
return termIds.length;
|
||||
};
|
||||
|
||||
// Recalculates and persists validated_languages on every run so the field stays
|
||||
// accurate as translation coverage grows over time.
|
||||
const updateValidatedLanguages = async (
|
||||
tx: DbOrTx,
|
||||
deckId: string,
|
||||
validatedLanguages: string[],
|
||||
): Promise<void> => {
|
||||
await db
|
||||
await tx
|
||||
.update(decks)
|
||||
.set({ validated_languages: validatedLanguages })
|
||||
.where(eq(decks.id, deckId));
|
||||
|
|
@ -228,12 +139,11 @@ const updateValidatedLanguages = async (
|
|||
|
||||
const main = async () => {
|
||||
console.log("📖 Reading word list...");
|
||||
const sourceWords = await readingFromWordlist();
|
||||
const sourceWords = await readWordList();
|
||||
console.log(` ${sourceWords.length} words loaded\n`);
|
||||
|
||||
console.log("🔍 Checking against database...");
|
||||
const { termIds, missingWords } =
|
||||
await checkingSourceWordsAgainstDB(sourceWords);
|
||||
const { termIds, missingWords } = await resolveSourceTerms(sourceWords);
|
||||
console.log(` ${termIds.length} terms found`);
|
||||
console.log(` ${missingWords.length} words not found in DB\n`);
|
||||
|
||||
|
|
@ -241,43 +151,42 @@ const main = async () => {
|
|||
await writeMissingWordsToFile(missingWords);
|
||||
|
||||
console.log("✅ Validating languages...");
|
||||
const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
|
||||
const { coverage, validatedLanguages } = await validateLanguages(
|
||||
config.sourceLanguage,
|
||||
termIds,
|
||||
);
|
||||
console.log(
|
||||
` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
|
||||
);
|
||||
|
||||
console.log("🔬 Language coverage breakdown...");
|
||||
await logLanguageCoverage(termIds);
|
||||
for (const row of coverage) {
|
||||
console.log(
|
||||
` ${row.language}: ${row.coveredCount} / ${termIds.length} terms covered`,
|
||||
);
|
||||
}
|
||||
|
||||
console.log("🃏 Looking for existing deck...");
|
||||
const existingDeck = await findExistingDeck();
|
||||
const addedCount = await db.transaction(async (tx) => {
|
||||
const existingDeck = await findExistingDeck(tx);
|
||||
const deckId = existingDeck
|
||||
? existingDeck.id
|
||||
: await createDeck(tx, validatedLanguages);
|
||||
|
||||
let deckId: string;
|
||||
let isNewDeck: boolean;
|
||||
const addedCount = await addTermsToDeck(tx, deckId, termIds);
|
||||
|
||||
if (!existingDeck) {
|
||||
console.log(" No existing deck found, will create one\n");
|
||||
console.log("🆕 Creating deck...");
|
||||
deckId = await createDeck(validatedLanguages);
|
||||
console.log(` Deck created with id: ${deckId}\n`);
|
||||
isNewDeck = true;
|
||||
} else {
|
||||
console.log(` Found existing deck with id: ${existingDeck.id}\n`);
|
||||
deckId = existingDeck.id;
|
||||
isNewDeck = false;
|
||||
}
|
||||
const currentLanguages = existingDeck?.validatedForLanguages ?? [];
|
||||
const hasChanged =
|
||||
JSON.stringify([...currentLanguages].sort()) !==
|
||||
JSON.stringify([...validatedLanguages].sort());
|
||||
|
||||
console.log("➕ Adding terms to deck...");
|
||||
const addedCount = await addTermsToDeck(deckId, termIds);
|
||||
if (hasChanged) {
|
||||
await updateValidatedLanguages(tx, deckId, validatedLanguages);
|
||||
}
|
||||
|
||||
return addedCount;
|
||||
});
|
||||
const alreadyPresentCount = termIds.length - addedCount;
|
||||
console.log(` ${addedCount} terms added`);
|
||||
console.log(` ${alreadyPresentCount} terms already in deck\n`);
|
||||
|
||||
if (!isNewDeck) {
|
||||
console.log("🔄 Updating validated languages...");
|
||||
await updateValidatedLanguages(deckId, validatedLanguages);
|
||||
console.log(` Updated to: ${JSON.stringify(validatedLanguages)}\n`);
|
||||
}
|
||||
|
||||
console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
console.log("📊 Summary");
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue