/*
 * CEFR coverage check.
 *
 * This script performs a cross-reference check between two specific data sets:
 *
 * - The "Target" list: it reads the {language}-merged.json file
 *   (e.g., en-merged.json). This represents the vocabulary you want to have
 *   CEFR levels for.
 * - The "Source of Truth": it queries your database (translations table).
 *   This represents the vocabulary you currently have in your app.
 *
 * What it calculates:
 *   "Of all the words in my merged JSON file, how many actually exist in my
 *   database?"
 *
 * - Matched:   the word from the JSON file was found in the DB
 *              (ready for enrichment).
 * - Unmatched: the word from the JSON file was not found in the DB
 *              (these will be skipped during enrichment).
 */

import fs from "node:fs/promises";
import { eq } from "drizzle-orm";
import {
  SUPPORTED_LANGUAGE_CODES,
  SUPPORTED_POS,
  CEFR_LEVELS,
  DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { terms, translations } from "@lila/db/schema";

type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type CEFRLevel = (typeof CEFR_LEVELS)[number];
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];

type MergedRecord = {
  word: string;
  pos: POS;
  cefr: CEFRLevel;
  difficulty: Difficulty;
  sources: string[];
};

/** The subset of a merged record that this script actually reads. */
type CheckedRecord = Pick<MergedRecord, "word" | "pos" | "cefr" | "difficulty">;

/** Per-bucket counters: entries seen in the JSON file vs. entries found in the DB. */
type Bucket = { total: number; matched: number };

type CoverageStats = {
  total: number;
  matched: number;
  unmatched: number;
  byCefr: Record<CEFRLevel, Bucket>;
  byDifficulty: Record<Difficulty, Bucket>;
  unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
};

const dataDir = "./src/data/";

/** Maximum number of unmatched entries kept (and printed) as a sample. */
const MAX_UNMATCHED_SAMPLES = 20;

// Lookup sets used to validate parsed JSON before it is used to index the
// per-level / per-difficulty buckets.
const KNOWN_CEFR: ReadonlySet<unknown> = new Set<unknown>(CEFR_LEVELS);
const KNOWN_DIFFICULTY: ReadonlySet<unknown> = new Set<unknown>(
  DIFFICULTY_LEVELS,
);

/**
 * Runtime check for one parsed JSON entry.
 *
 * Only the fields this script reads are verified. `pos` is only required to be
 * a string: a value outside SUPPORTED_POS can never match a DB row, so such an
 * entry is simply reported as unmatched instead of being dropped.
 */
function isCheckedRecord(value: unknown): value is CheckedRecord {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.word === "string" &&
    typeof candidate.pos === "string" &&
    KNOWN_CEFR.has(candidate.cefr) &&
    KNOWN_DIFFICULTY.has(candidate.difficulty)
  );
}

/** Builds a zeroed counter bucket for every key in `keys`. */
function emptyBuckets<K extends PropertyKey>(
  keys: readonly K[],
): Record<K, Bucket> {
  const buckets = {} as Record<K, Bucket>;
  for (const key of keys) buckets[key] = { total: 0, matched: 0 };
  return buckets;
}

/**
 * Formats `part / whole` as a percentage with one decimal place.
 * Returns "0.0" when `whole` is 0 so an empty data set never prints "NaN".
 */
function formatPercent(part: number, whole: number): string {
  return whole === 0 ? "0.0" : ((part / whole) * 100).toFixed(1);
}

/** Builds the case-insensitive "word|pos" key used to compare JSON entries with DB rows. */
function lookupKey(word: string, pos: string): string {
  return `${word.toLowerCase()}|${pos}`;
}

/**
 * Reports how many entries of `{language}-merged.json` already exist in the
 * database as a translation with the same (case-insensitive) text and the
 * same part of speech.
 *
 * A missing, unreadable or malformed file is reported as a warning and the
 * language is skipped. Database errors are not caught here and propagate to
 * the caller.
 *
 * @param language Language code whose merged file and translations are compared.
 */
async function checkCoverage(language: LanguageCode): Promise<void> {
  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📄 Checking ${filename}...`);

  // Load merged data
  let parsed: unknown;
  try {
    const raw = await fs.readFile(filepath, "utf8");
    parsed = JSON.parse(raw);
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(` ⚠️ Could not read file: ${reason}`);
    return;
  }

  if (!Array.isArray(parsed)) {
    console.warn(` ⚠️ Could not read file: expected a JSON array of entries`);
    return;
  }

  console.log(` Loaded ${parsed.length.toLocaleString("en-US")} entries`);

  // Drop entries that would otherwise crash the bucket lookups below
  // (unknown CEFR level / difficulty, or a non-string word / pos).
  const records: CheckedRecord[] = parsed.filter(isCheckedRecord);
  const skipped = parsed.length - records.length;
  if (skipped > 0) {
    console.warn(
      ` ⚠️ Skipped ${skipped.toLocaleString("en-US")} malformed entries`,
    );
  }

  // Initialize stats
  const stats: CoverageStats = {
    total: records.length,
    matched: 0,
    unmatched: 0,
    byCefr: emptyBuckets<CEFRLevel>(CEFR_LEVELS),
    byDifficulty: emptyBuckets<Difficulty>(DIFFICULTY_LEVELS),
    unmatchedWords: [],
  };

  // ── BATCHED LOOKUP: Build a Set of existing (word, pos) pairs in DB ──
  console.log(` 🔍 Querying database for existing translations...`);

  // One query for every translation of this language, joined to its term to
  // obtain the part of speech.
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // Set of "word|pos" keys for O(1) membership checks
  const existingSet = new Set(
    existingRows.map((row) => lookupKey(row.text, row.pos)),
  );

  // ── Process records against the in-memory Set ──
  for (const record of records) {
    const cefrBucket = stats.byCefr[record.cefr];
    const difficultyBucket = stats.byDifficulty[record.difficulty];
    cefrBucket.total++;
    difficultyBucket.total++;

    if (existingSet.has(lookupKey(record.word, record.pos))) {
      stats.matched++;
      cefrBucket.matched++;
      difficultyBucket.matched++;
      continue;
    }

    stats.unmatched++;
    // Keep only a bounded sample so the report stays readable.
    if (stats.unmatchedWords.length < MAX_UNMATCHED_SAMPLES) {
      stats.unmatchedWords.push({
        word: record.word,
        pos: record.pos,
        cefr: record.cefr,
      });
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(` Total entries: ${stats.total.toLocaleString("en-US")}`);
  console.log(
    ` Matched in DB: ${stats.matched.toLocaleString("en-US")} (${formatPercent(stats.matched, stats.total)}%)`,
  );
  console.log(
    ` Unmatched: ${stats.unmatched.toLocaleString("en-US")} (${formatPercent(stats.unmatched, stats.total)}%)`,
  );

  console.log(`\n By CEFR level:`);
  for (const level of CEFR_LEVELS) {
    const { total, matched } = stats.byCefr[level];
    if (total > 0) {
      console.log(
        ` ${level}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${formatPercent(matched, total)}%)`,
      );
    }
  }

  console.log(`\n By difficulty:`);
  for (const diff of DIFFICULTY_LEVELS) {
    const { total, matched } = stats.byDifficulty[diff];
    if (total > 0) {
      console.log(
        ` ${diff}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${formatPercent(matched, total)}%)`,
      );
    }
  }

  if (stats.unmatchedWords.length > 0) {
    console.log(
      `\n⚠️ Sample unmatched words (first ${MAX_UNMATCHED_SAMPLES}):`,
    );
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    const remaining = stats.unmatched - stats.unmatchedWords.length;
    if (remaining > 0) {
      console.log(` ... and ${remaining} more`);
    }
  }
}

/** Runs the coverage check for every supported language and prints a banner around the run. */
const main = async (): Promise<void> => {
  console.log("##########################################");
  console.log("lila — CEFR Coverage Check");
  console.log("##########################################");

  // Languages are processed one at a time so each report is printed as a
  // contiguous block.
  for (const language of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(language);
  }

  console.log("\n##########################################");
  console.log("Done");
  console.log("##########################################");
};

main().catch((err) => {
  console.error(err);
  process.exit(1);
});