import fs from "node:fs/promises";
import { eq, inArray } from "drizzle-orm";
import {
  SUPPORTED_LANGUAGE_CODES,
  SUPPORTED_POS,
  CEFR_LEVELS,
  DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { translations, terms } from "@lila/db/schema";

type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type CEFRLevel = (typeof CEFR_LEVELS)[number];
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];

/** One entry of a `<language>-merged.json` data file, as this script reads it. */
type MergedRecord = {
  word: string;
  pos: POS;
  cefr: CEFRLevel;
  difficulty: Difficulty;
  sources: string[];
};

/** Directory (relative to the working directory) holding the merged JSON files. */
const dataDir = "./src/data/";

/** Maximum number of translation IDs sent in a single UPDATE ... WHERE id IN (...). */
const BATCH_SIZE = 500;

/** How many unmatched words are echoed in the per-language summary. */
const UNMATCHED_SAMPLE_SIZE = 20;

// ────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────

/**
 * Splits `arr` into consecutive slices of at most `size` elements.
 *
 * @param arr - Items to split; not mutated.
 * @param size - Maximum slice length; must be a positive integer.
 * @returns The slices in original order (empty when `arr` is empty).
 * @throws RangeError when `size` is not a positive integer, since the loop
 *   below would otherwise never terminate (size <= 0) or produce
 *   overlapping slices (fractional size).
 */
function chunk<T>(arr: readonly T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) {
    out.push(arr.slice(i, i + size));
  }
  return out;
}

/** Formats a number with en-US digit grouping for log output. */
function fmt(n: number): string {
  return n.toLocaleString("en-US");
}

// ────────────────────────────────────────────────────────────
// Enrichment per language
// ────────────────────────────────────────────────────────────

/**
 * Applies the CEFR level and difficulty from `<language>-merged.json` to the
 * matching rows of the `translations` table.
 *
 * Records are matched to translations on (lower-cased text, part of speech).
 * If the data file cannot be read or parsed, a warning is logged and the
 * language is skipped without throwing. Database errors are not caught here.
 *
 * @param language - Language code whose merged file and translations are processed.
 */
async function enrichLanguage(language: LanguageCode): Promise<void> {
  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📝 Enriching ${filename}...`);

  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      // A non-array payload would otherwise crash later in the summary/loop.
      throw new Error("expected a JSON array of records");
    }
    // NOTE(review): individual entries are trusted to match MergedRecord.
    records = parsed as MergedRecord[];
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    console.warn(` ⚠️ Could not read file: ${message}`);
    return;
  }

  console.log(` Loaded ${fmt(records.length)} entries`);

  // 1. Bulk fetch existing translations for this language
  console.log(` 🔍 Fetching existing translations from DB...`);
  const existingTranslations = await db
    .select({ id: translations.id, text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // Derive the ID type from the query result instead of guessing the column type.
  type TranslationId = (typeof existingTranslations)[number]["id"];
  type UpdateGroup = {
    cefr: CEFRLevel;
    difficulty: Difficulty;
    ids: Set<TranslationId>; // Set deduplicates synonyms/duplicates
  };

  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
  const translationMap = new Map<string, TranslationId[]>();
  for (const t of existingTranslations) {
    const key = `${t.text.toLowerCase()}|${t.pos}`;
    let ids = translationMap.get(key);
    if (ids === undefined) {
      ids = [];
      translationMap.set(key, ids);
    }
    ids.push(t.id);
  }

  // 3. Match records to DB IDs and group by target (cefr, difficulty)
  const updatesByValue = new Map<string, UpdateGroup>();
  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];

  for (const rec of records) {
    const key = `${rec.word.toLowerCase()}|${rec.pos}`;
    const ids = translationMap.get(key);

    if (ids === undefined || ids.length === 0) {
      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
      continue;
    }

    const valueKey = `${rec.cefr}|${rec.difficulty}`;
    let group = updatesByValue.get(valueKey);
    if (group === undefined) {
      // Keep the typed values in the group so the key never has to be parsed back.
      group = { cefr: rec.cefr, difficulty: rec.difficulty, ids: new Set() };
      updatesByValue.set(valueKey, group);
    }
    for (const id of ids) group.ids.add(id);
  }

  // 4. Batch updates grouped by (cefr, difficulty)
  // NOTE: an ID that appears in several groups (same word/pos listed with
  // different values) is updated once per group — the last group processed
  // wins — and is counted once per group in the total below.
  let totalUpdated = 0;
  for (const { cefr, difficulty, ids } of updatesByValue.values()) {
    for (const idBatch of chunk([...ids], BATCH_SIZE)) {
      await db
        .update(translations)
        .set({ cefr_level: cefr, difficulty })
        .where(inArray(translations.id, idBatch));
      totalUpdated += idBatch.length;
    }
  }

  // 5. Summary
  console.log(`\n ✅ Updated ${fmt(totalUpdated)} translations`);
  console.log(` ⚠️ Unmatched: ${fmt(unmatchedWords.length)}`);

  if (unmatchedWords.length > 0) {
    console.log(`\n Sample unmatched words (first ${UNMATCHED_SAMPLE_SIZE}):`);
    for (const { word, pos, cefr } of unmatchedWords.slice(0, UNMATCHED_SAMPLE_SIZE)) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (unmatchedWords.length > UNMATCHED_SAMPLE_SIZE) {
      console.log(` ... and ${fmt(unmatchedWords.length - UNMATCHED_SAMPLE_SIZE)} more`);
    }
  }
}

// ────────────────────────────────────────────────────────────
// Main
// ────────────────────────────────────────────────────────────

/** Runs the enrichment for every supported language, one after another. */
const main = async (): Promise<void> => {
  console.log("##########################################");
  console.log("lila — CEFR Enrichment");
  console.log("##########################################\n");

  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    await enrichLanguage(lang);
  }

  console.log("\n##########################################");
  console.log("Done");
  console.log("##########################################");
};

// Any uncaught failure (e.g. a database error) is logged and exits non-zero.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});