lila/packages/db/src/checking-cefr-coverage.ts
lila 3f7bc4111e chore: rename project from glossa to lila
- Update all package names from @glossa/* to @lila/*
- Update all imports, container names, volume names
- Update documentation references
- Recreate database with new credentials
2026-04-13 10:00:52 +02:00

183 lines
5.8 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
This script performs a cross-reference check between two specific data sets:
- The "Target" List: It reads the {language}-merged.json file (e.g., en-merged.json). This represents the vocabulary you want to have CEFR levels for.
- The "Source of Truth": It queries your Database (translations table). This represents the vocabulary you currently have in your app.
What it calculates:
It tells you: "Of all the words in my merged JSON file, how many actually exist in my database?"
Matched: The (word, part-of-speech) pair from the JSON file was found in the DB; the word is compared case-insensitively. (Ready for enrichment).
Unmatched: The (word, part-of-speech) pair from the JSON file was not found in the DB. (These will be skipped during enrichment).
*/
import fs from "node:fs/promises";
import { eq } from "drizzle-orm";
import {
SUPPORTED_LANGUAGE_CODES,
SUPPORTED_POS,
CEFR_LEVELS,
DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { terms, translations } from "@lila/db/schema";
// Literal-union types derived from the constant lists exported by @lila/shared.
/** Part-of-speech tag: any element of SUPPORTED_POS. */
type POS = (typeof SUPPORTED_POS)[number];
/** Language code: any element of SUPPORTED_LANGUAGE_CODES. */
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
/** CEFR level: any element of CEFR_LEVELS. */
type CEFRLevel = (typeof CEFR_LEVELS)[number];
/** Difficulty label: any element of DIFFICULTY_LEVELS. */
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
/**
 * Shape the parsed `{language}-merged.json` entries are cast to.
 * NOTE(review): the cast is not validated at runtime — confirm the files really follow this shape.
 */
type MergedRecord = {
word: string;
pos: POS;
cefr: CEFRLevel;
difficulty: Difficulty;
// Not read by this script. Presumably the datasets the entry came from — TODO confirm.
sources: string[];
};
/** Counters accumulated by checkCoverage for a single language. */
type CoverageStats = {
// Number of entries loaded from the merged file.
total: number;
// Entries whose lower-cased word + POS was found among the DB translations.
matched: number;
// Entries that were not found.
unmatched: number;
// Per-CEFR-level totals and matches.
byCefr: Record<CEFRLevel, { total: number; matched: number }>;
// Per-difficulty totals and matches.
byDifficulty: Record<Difficulty, { total: number; matched: number }>;
// Sample of unmatched entries for display; checkCoverage stops adding after 20.
unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
};
// Directory holding the merged JSON files. It is a relative path, so it resolves
// against the process working directory when the file is read.
const dataDir = "./src/data/";
/**
 * Formats `part / whole` as a percentage with one decimal place.
 * Returns "0.0" when `whole` is 0 so an empty data set prints "0.0%" rather than "NaN%".
 */
function formatPercent(part: number, whole: number): string {
  return whole === 0 ? "0.0" : ((part / whole) * 100).toFixed(1);
}

/**
 * True when `buckets` has its OWN entry for `key`.
 * Inherited properties (e.g. "constructor") do not count, so arbitrary JSON
 * values cannot be mistaken for a known CEFR/difficulty level.
 */
function hasBucket(buckets: object, key: unknown): boolean {
  return Object.prototype.hasOwnProperty.call(buckets, key as PropertyKey);
}

/**
 * Reports how many entries of `{language}-merged.json` already exist in the
 * database, overall and broken down by CEFR level and difficulty.
 *
 * An entry counts as matched when its lower-cased word together with its POS
 * equals the lower-cased text + POS of a translation stored for `language`.
 * Results are printed to the console; nothing is written to the DB or to disk.
 *
 * A missing/unparseable file, or one that is not a JSON array, is reported
 * with a warning and the function returns without querying the database.
 * Entries that cannot be tallied (non-string `word`, unknown `cefr` or
 * `difficulty`) are skipped with a warning instead of aborting the run.
 * Database errors are not caught here and reject the returned promise.
 *
 * @param language - Language whose merged file and translations are compared.
 */
async function checkCoverage(language: LanguageCode): Promise<void> {
  // Maximum number of unmatched entries kept for the sample listing.
  const sampleLimit = 20;

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;
  console.log(`\n📄 Checking ${filename}...`);

  // Load merged data; both read failures and JSON syntax errors end up here.
  let parsed: unknown;
  try {
    const raw = await fs.readFile(filepath, "utf8");
    parsed = JSON.parse(raw);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(` ⚠️ Could not read file: ${reason}`);
    return;
  }
  if (!Array.isArray(parsed)) {
    console.warn(` ⚠️ Could not read file: expected a JSON array of records`);
    return;
  }
  console.log(` Loaded ${parsed.length.toLocaleString("en-US")} entries`);

  // One zeroed bucket per known level, so every level can be reported on.
  const byCefr = {} as Record<CEFRLevel, { total: number; matched: number }>;
  for (const level of CEFR_LEVELS) byCefr[level] = { total: 0, matched: 0 };
  const byDifficulty = {} as Record<
    Difficulty,
    { total: number; matched: number }
  >;
  for (const diff of DIFFICULTY_LEVELS)
    byDifficulty[diff] = { total: 0, matched: 0 };

  // The file content is untrusted: keep only entries that can be tallied
  // without throwing (string word, level and difficulty with a bucket).
  const records = (parsed as MergedRecord[]).filter(
    (record) =>
      typeof record === "object" &&
      record !== null &&
      typeof record.word === "string" &&
      hasBucket(byCefr, record.cefr) &&
      hasBucket(byDifficulty, record.difficulty),
  );
  const skipped = parsed.length - records.length;
  if (skipped > 0) {
    console.warn(
      ` ⚠️ Skipped ${skipped.toLocaleString("en-US")} malformed entries (non-string word or unknown CEFR/difficulty)`,
    );
  }

  const stats: CoverageStats = {
    total: records.length,
    matched: 0,
    unmatched: 0,
    byCefr,
    byDifficulty,
    unmatchedWords: [],
  };

  // ── BATCHED LOOKUP: Build a Set of existing (word, pos) pairs in DB ──
  console.log(` 🔍 Querying database for existing translations...`);
  // Single query: every translation stored for this language, with its term's POS.
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));
  // Set of "word|pos" keys for O(1) membership checks.
  const existingSet = new Set(
    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
  );

  // ── Process records against the in-memory Set ──
  for (const record of records) {
    const cefrBucket = stats.byCefr[record.cefr];
    const difficultyBucket = stats.byDifficulty[record.difficulty];
    cefrBucket.total++;
    difficultyBucket.total++;

    const key = `${record.word.toLowerCase()}|${record.pos}`;
    if (existingSet.has(key)) {
      stats.matched++;
      cefrBucket.matched++;
      difficultyBucket.matched++;
      continue;
    }

    stats.unmatched++;
    if (stats.unmatchedWords.length < sampleLimit) {
      stats.unmatchedWords.push({
        word: record.word,
        pos: record.pos,
        cefr: record.cefr,
      });
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(` Total entries: ${stats.total.toLocaleString("en-US")}`);
  console.log(
    ` Matched in DB: ${stats.matched.toLocaleString("en-US")} (${formatPercent(stats.matched, stats.total)}%)`,
  );
  console.log(
    ` Unmatched: ${stats.unmatched.toLocaleString("en-US")} (${formatPercent(stats.unmatched, stats.total)}%)`,
  );

  console.log(`\n By CEFR level:`);
  for (const level of CEFR_LEVELS) {
    const { total, matched } = stats.byCefr[level];
    // Levels without any entry are omitted from the report.
    if (total > 0) {
      console.log(
        ` ${level}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${formatPercent(matched, total)}%)`,
      );
    }
  }

  console.log(`\n By difficulty:`);
  for (const diff of DIFFICULTY_LEVELS) {
    const { total, matched } = stats.byDifficulty[diff];
    if (total > 0) {
      console.log(
        ` ${diff}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${formatPercent(matched, total)}%)`,
      );
    }
  }

  if (stats.unmatchedWords.length > 0) {
    console.log(`\n⚠ Sample unmatched words (first ${sampleLimit}):`);
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (stats.unmatched > sampleLimit) {
      console.log(` ... and ${stats.unmatched - sampleLimit} more`);
    }
  }
}
/**
 * Script entry point: prints a title banner, runs the coverage check for each
 * supported language (one at a time, each awaited before the next starts),
 * then prints a closing banner.
 */
const main = async (): Promise<void> => {
  const rule = "##########################################";

  const opening = [rule, "lila — CEFR Coverage Check", rule];
  opening.forEach((line) => console.log(line));

  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(code);
  }

  // The closing banner is preceded by a blank line.
  const closing = [`\n${rule}`, "Done", rule];
  closing.forEach((line) => console.log(line));
};
// Run the script. Any rejection from main() is logged and the process
// terminates with exit status 1.
const exitOnFailure = (failure: unknown): never => {
  console.error(failure);
  process.exit(1);
};
main().catch(exitOnFailure);