This commit is contained in:
lila 2026-04-01 02:43:55 +02:00
parent a49bce4a5a
commit 7fdcedd1dd
2 changed files with 106 additions and 35 deletions

View file

@ -133,10 +133,7 @@ export const decks = pgTable(
name: text().notNull(), name: text().notNull(),
description: text(), description: text(),
source_language: varchar({ length: 10 }).notNull(), source_language: varchar({ length: 10 }).notNull(),
validated_for_languages: varchar({ length: 10 }) validated_languages: varchar({ length: 10 }).array().notNull().default([]),
.array()
.notNull()
.default([]),
is_public: boolean().default(false).notNull(), is_public: boolean().default(false).notNull(),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(), created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
}, },

View file

@ -1,24 +1,42 @@
/*
 * Builds the "top English nouns" deck from a curated wordlist of the 1000 most
 * frequently used English nouns. The deck has English as its source language —
 * meaning it was curated from an English-centric frequency list — and a separate
 * deck would be needed for other source languages. For each word in the list, all
 * matching term IDs are looked up in the database via the translations table
 * (language: "en", POS: "noun"); homonyms are intentionally included as separate
 * cards since the quiz UI displays a gloss alongside each word. Words from the
 * list that have no DB match are skipped and written to a file for future
 * reference. The script is idempotent: if the deck already exists, only terms
 * present in the source but missing from the deck are added; terms already in the
 * deck are left untouched; terms in the deck but absent from the source are never
 * removed. After resolving all matched terms, the script determines
 * validated_for_languages by checking which languages — excluding the source
 * language — have full translation coverage across all matched terms, and updates
 * the array on every run.
 */

/*
 * roadmap
 *
 * [x] Setup — hardcoded path, name, description, source language, POS
 * [x] Read wordlist — load and deduplicate the 1000 nouns
 * [x] Query terms — match to database, collect all term IDs per word (including homonyms)
 * [x] Write missing words to file for future reference
 * [x] Determine validated_for_languages — find languages (excluding source) with full coverage across all matched terms
 * [ ] Check idempotency — if deck exists, diff matched terms against existing deck_terms
 * [ ] Create deck if it doesn't exist — insert with name, source_language, validated_for_languages
 * [ ] Add new terms — insert only term IDs present in source but missing from deck
 * [ ] Update validated_for_languages — recalculate and update on every run
 * [ ] Report summary of words found, missing, added, and validated languages
 */
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import { db } from "@glossa/db"; import { db } from "@glossa/db";
import { translations, terms } from "@glossa/db/schema"; import { translations, terms, decks } from "@glossa/db/schema";
import { inArray, and, eq } from "drizzle-orm"; import { inArray, and, eq } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES } from "@glossa/shared";
const pathToWordlist = "./src/data/wordlists/top1000englishnouns"; const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
const nameOfDeck = "top english nouns"; const nameOfDeck = "top english nouns";
@ -27,12 +45,20 @@ const descriptionOfDeck =
const sourceLanguage = "en"; const sourceLanguage = "en";
const sourcePOS = "noun"; const sourcePOS = "noun";
// new Set() automatically discards duplicate values,
// and spreading it back with ... converts it to a plain array again.
// So if "bank" appears twice in the file,
// the resulting array will only contain it once.
const readingFromWordlist = async () => { const readingFromWordlist = async () => {
const raw = await fs.readFile(pathToWordlist, "utf8"); const raw = await fs.readFile(pathToWordlist, "utf8");
const words = raw const words = [
.split("\n") ...new Set(
.map((w) => w.trim().toLowerCase()) raw
.filter(Boolean); .split("\n")
.map((w) => w.trim().toLowerCase())
.filter(Boolean),
),
];
return words; return words;
}; };
@ -49,37 +75,85 @@ const checkingSourceWordsAgainstDB = async (words: string[]) => {
), ),
); );
// map word text to term_id const wordToTermIds = new Map<string, string[]>();
const wordToTermId = new Map<string, string>();
for (const row of rows) { for (const row of rows) {
const word = row.text.toLowerCase(); const word = row.text.toLowerCase();
if (!wordToTermId.has(word)) { const existing = wordToTermIds.get(word) ?? [];
wordToTermId.set(word, row.termId); wordToTermIds.set(word, [...existing, row.termId]);
}
} }
const termIds = Array.from(wordToTermId.values()); const termIds = Array.from(wordToTermIds.values()).flat();
const missingWords = words.filter((w) => !wordToTermId.has(w)); const missingWords = words.filter((w) => !wordToTermIds.has(w));
return { termIds, missingWords }; return { termIds, missingWords };
}; };
/**
 * Persists the words that had no database match to a sibling file
 * (`<wordlist>-missing`), one word per line, for future curation.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  await fs.writeFile(`${pathToWordlist}-missing`, missingWords.join("\n"), "utf8");
};
/**
 * Determines which supported languages — excluding the source language — have a
 * translation for every one of the deck's matched terms.
 *
 * @param sourceLanguage - language code the deck was curated from; never included in the result
 * @param termIds - IDs of all terms matched for the deck
 * @returns language codes with full translation coverage across all termIds
 */
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  // Guard the empty deck: without this, `coveredTerms.size === termIds.length`
  // is 0 === 0 for every language, and ALL languages would be reported as
  // fully validated.
  if (termIds.length === 0) return [];

  // Candidate languages: everything supported except the source language.
  const candidateLanguages = SUPPORTED_LANGUAGE_CODES.filter(
    (language) => language !== sourceLanguage,
  );

  const validatedLanguages: string[] = [];
  for (const language of candidateLanguages) {
    // All translation rows in this language restricted to the deck's terms.
    const rows = await db
      .select({ termId: translations.term_id })
      .from(translations)
      .where(
        and(
          inArray(translations.term_id, termIds),
          eq(translations.language_code, language),
        ),
      );
    // Count DISTINCT terms rather than raw rows so duplicate translation rows
    // for the same (term, language) pair cannot fake full coverage.
    const coveredTerms = new Set(rows.map((row) => row.termId));
    if (coveredTerms.size === termIds.length) {
      validatedLanguages.push(language);
    }
  }
  return validatedLanguages;
};
const findExistingDeck = async () => {
const existing = await db
.select({ id: decks.id, validatedForLanguages: decks.validated_languages })
.from(decks)
.where(
and(
eq(decks.name, nameOfDeck),
eq(decks.source_language, sourceLanguage),
),
);
return existing[0] ?? null;
};
const main = async () => { const main = async () => {
// Read and normalise the word list // reading from source file
console.log("📖 Reading word list..."); console.log("📖 Reading word list...");
const sourceWords = await readingFromWordlist(); const sourceWords = await readingFromWordlist();
console.log(` ${sourceWords.length} words loaded\n`); console.log(` ${sourceWords.length} words loaded\n`);
// check if sourceWords exist in database // checking if sourceWords exist in database
console.log("🔍 Checking against database..."); console.log("🔍 Checking against database...");
const { termIds, missingWords } = const { termIds, missingWords } =
await checkingSourceWordsAgainstDB(sourceWords); await checkingSourceWordsAgainstDB(sourceWords);
console.log("words found in db: ", termIds.length); console.log("words found in db: ", termIds.length);
console.log("words NOT found in db: ", missingWords.length); console.log("words NOT found in db: ", missingWords.length, "\n");
// write missing words to file // writing missing words to file
console.log("writing missing words to file...\n");
await writeMissingWordsToFile(missingWords); await writeMissingWordsToFile(missingWords);
// validating languages
console.log("validation languages...");
const validatedLanguages = await validateLanguages(sourceLanguage, termIds);
console.log("validated these languages: ", validatedLanguages, "\n");
}; };
main().catch((error) => { main().catch((error) => {