Add script to check CEFR coverage between the JSON files and the database; add script to write CEFR levels from JSON to the DB

This commit is contained in:
lila 2026-04-09 10:25:20 +02:00
parent 3374bd8b20
commit 13cc709b09
7 changed files with 279296 additions and 1 deletions

View file

@ -0,0 +1,183 @@
/*
 * CEFR coverage check.
 *
 * This script cross-references two data sets:
 *   - The "target" list: the {language}-merged.json file (e.g. en-merged.json).
 *     This is the vocabulary that should receive CEFR levels.
 *   - The "source of truth": the database (translations table).
 *     This is the vocabulary that currently exists in the app.
 *
 * What it calculates:
 *   "Of all the words in the merged JSON file, how many actually exist in the
 *   database?" A JSON entry and a database row are considered the same word
 *   when their lowercased text and their part of speech are both equal.
 *   - Matched: the word from the JSON file was found in the DB
 *     (ready for enrichment).
 *   - Unmatched: the word from the JSON file was not found in the DB
 *     (these will be skipped during enrichment).
 */
import fs from "node:fs/promises";
import { eq } from "drizzle-orm";
import {
SUPPORTED_LANGUAGE_CODES,
SUPPORTED_POS,
CEFR_LEVELS,
DIFFICULTY_LEVELS,
} from "@glossa/shared";
import { db } from "@glossa/db";
import { terms, translations } from "@glossa/db/schema";
// Element types of the constant lists imported from @glossa/shared
// (presumably `as const` tuples, which would make these literal unions — confirm).
type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type CEFRLevel = (typeof CEFR_LEVELS)[number];
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];

/** One entry of a `{language}-merged.json` file, as this script expects it. */
type MergedRecord = {
  word: string;
  pos: POS;
  cefr: CEFRLevel;
  difficulty: Difficulty;
  sources: string[];
};

/** Counter pair: how many entries fall in a bucket and how many of those were found in the DB. */
type MatchTally = { total: number; matched: number };

/** Aggregated result of checking one language's merged file against the database. */
type CoverageStats = {
  total: number;
  matched: number;
  unmatched: number;
  /** Tallies keyed by CEFR level. */
  byCefr: Record<CEFRLevel, MatchTally>;
  /** Tallies keyed by difficulty level. */
  byDifficulty: Record<Difficulty, MatchTally>;
  /** Sample of entries that had no match in the DB (the producer caps its length). */
  unmatchedWords: Pick<MergedRecord, "word" | "pos" | "cefr">[];
};
/** Directory (relative to the current working directory) holding the `{language}-merged.json` files. */
const dataDir = "./src/data/";

/** Maximum number of unmatched entries kept and printed as a sample. */
const UNMATCHED_SAMPLE_LIMIT = 20;

/**
 * Formats `part / total` as a percentage with one decimal place.
 * Returns "0.0" when `total` is 0 so an empty data set never prints "NaN".
 */
function formatPercent(part: number, total: number): string {
  return total === 0 ? "0.0" : ((part / total) * 100).toFixed(1);
}

/**
 * Runtime check that a parsed JSON entry carries the fields the tallies rely on:
 * a string `word`, a `cefr` listed in CEFR_LEVELS and a `difficulty` listed in
 * DIFFICULTY_LEVELS. `pos` is deliberately not checked: an unexpected value
 * simply fails to match any database row and is counted as unmatched.
 */
function hasUsableFields(entry: unknown): boolean {
  if (typeof entry !== "object" || entry === null) return false;
  const { word, cefr, difficulty } = entry as Record<string, unknown>;
  return (
    typeof word === "string" &&
    (CEFR_LEVELS as readonly unknown[]).includes(cefr) &&
    (DIFFICULTY_LEVELS as readonly unknown[]).includes(difficulty)
  );
}

/**
 * Reports how many entries of `{language}-merged.json` have a translation in
 * the database for the same language.
 *
 * Steps:
 *  1. Read and parse the merged file; if it cannot be read, is not valid JSON
 *     or is not an array, a warning is printed and the language is skipped.
 *  2. Drop entries that fail `hasUsableFields` (reported in a warning) so one
 *     bad record cannot abort the whole run.
 *  3. Load every translation of this language (joined with its term to get
 *     the part of speech) in a single query and index it in a Set.
 *  4. Tally matched/unmatched entries overall, per CEFR level and per
 *     difficulty, then print the report.
 *
 * @param language Language code selecting both the JSON file and the DB rows.
 * @returns Resolves when the report has been printed. Database errors are not
 *          caught here and reject the returned promise.
 */
async function checkCoverage(language: LanguageCode): Promise<void> {
  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;
  console.log(`\n📄 Checking ${filename}...`);

  // Load merged data
  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      throw new Error("expected a JSON array of records");
    }
    // Elements are still unverified here; they are filtered right below.
    records = parsed as MergedRecord[];
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(` ⚠️ Could not read file: ${reason}`);
    return;
  }
  console.log(` Loaded ${records.length.toLocaleString("en-US")} entries`);

  // Keep only entries whose fields can be tallied safely.
  const usableRecords = records.filter(hasUsableFields);
  const skipped = records.length - usableRecords.length;
  if (skipped > 0) {
    console.warn(
      ` ⚠️ Skipped ${skipped.toLocaleString("en-US")} malformed entries (not an object, non-string word, or unknown CEFR/difficulty level)`,
    );
  }

  // Initialize stats with a zeroed tally for every known level.
  const stats: CoverageStats = {
    total: usableRecords.length,
    matched: 0,
    unmatched: 0,
    byCefr: {} as Record<CEFRLevel, { total: number; matched: number }>,
    byDifficulty: {} as Record<Difficulty, { total: number; matched: number }>,
    unmatchedWords: [],
  };
  for (const level of CEFR_LEVELS) {
    stats.byCefr[level] = { total: 0, matched: 0 };
  }
  for (const diff of DIFFICULTY_LEVELS) {
    stats.byDifficulty[diff] = { total: 0, matched: 0 };
  }

  // ── Batched lookup: one query for all translations of this language ──
  console.log(` 🔍 Querying database for existing translations...`);
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // Set of "lowercased word|pos" keys for constant-time membership tests.
  const existingSet = new Set(
    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
  );

  // ── Process records against the in-memory Set ──
  for (const record of usableRecords) {
    stats.byCefr[record.cefr].total++;
    stats.byDifficulty[record.difficulty].total++;

    const key = `${record.word.toLowerCase()}|${record.pos}`;
    if (existingSet.has(key)) {
      stats.matched++;
      stats.byCefr[record.cefr].matched++;
      stats.byDifficulty[record.difficulty].matched++;
    } else {
      stats.unmatched++;
      // Only a bounded sample is kept; the counter above holds the full number.
      if (stats.unmatchedWords.length < UNMATCHED_SAMPLE_LIMIT) {
        stats.unmatchedWords.push({
          word: record.word,
          pos: record.pos,
          cefr: record.cefr,
        });
      }
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(` Total entries: ${stats.total.toLocaleString("en-US")}`);
  console.log(
    ` Matched in DB: ${stats.matched.toLocaleString("en-US")} (${formatPercent(stats.matched, stats.total)}%)`,
  );
  console.log(
    ` Unmatched: ${stats.unmatched.toLocaleString("en-US")} (${formatPercent(stats.unmatched, stats.total)}%)`,
  );

  console.log(`\n By CEFR level:`);
  for (const level of CEFR_LEVELS) {
    const { total, matched } = stats.byCefr[level];
    // Levels with no entries are omitted from the report.
    if (total > 0) {
      console.log(
        ` ${level}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${formatPercent(matched, total)}%)`,
      );
    }
  }

  console.log(`\n By difficulty:`);
  for (const diff of DIFFICULTY_LEVELS) {
    const { total, matched } = stats.byDifficulty[diff];
    if (total > 0) {
      console.log(
        ` ${diff}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${formatPercent(matched, total)}%)`,
      );
    }
  }

  if (stats.unmatchedWords.length > 0) {
    console.log(`\n⚠ Sample unmatched words (first ${UNMATCHED_SAMPLE_LIMIT}):`);
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (stats.unmatched > UNMATCHED_SAMPLE_LIMIT) {
      console.log(` ... and ${stats.unmatched - UNMATCHED_SAMPLE_LIMIT} more`);
    }
  }
}
/**
 * Script entry point: prints an opening banner, runs the coverage check for
 * every supported language one after another, then prints a closing banner.
 */
const main = async () => {
  const rule = "##########################################";

  // Prints a title framed by two rule lines; `lead` is prepended to the first rule.
  const printBanner = (title: string, lead = ""): void => {
    console.log(lead + rule);
    console.log(title);
    console.log(rule);
  };

  printBanner("Glossa — CEFR Coverage Check");

  // Awaited inside the loop so each language's report is printed in full
  // before the next one starts.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(code);
  }

  printBanner("Done", "\n");
};

// Any rejection escaping main() is logged and turned into a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});