Add script to check CEFR coverage between JSON files and the database; add script to write CEFR levels from JSON to the DB

2026-04-09 10:25:20 +02:00 · 2026-04-09 10:25:20 +02:00 · 13cc709b09
commit 13cc709b09
parent 3374bd8b20
7 changed files with 279296 additions and 1 deletions
--- a/packages/db/src/checking-cefr-coverage.ts
+++ b/packages/db/src/checking-cefr-coverage.ts
@ -0,0 +1,183 @@
+/*
+
+This script performs a cross-reference check between two specific data sets:
+
+    - The "Target" List: It reads the {language}-merged.json file (e.g., en-merged.json). This represents the vocabulary you want to have CEFR levels for.
+    - The "Source of Truth": It queries your Database (translations table). This represents the vocabulary you currently have in your app.
+
+What it calculates:
+It tells you: "Of all the words in my merged JSON file, how many actually exist in my database?"
+Entries are compared as (word, part of speech) pairs, and the word is matched case-insensitively.
+
+    Matched: The (word, POS) pair from the JSON file was found in the DB (ready for enrichment).
+    Unmatched: The (word, POS) pair from the JSON file was not found in the DB (it will be skipped during enrichment).
+
+*/
+
+import fs from "node:fs/promises";
+import { eq } from "drizzle-orm";
+
+import {
+  SUPPORTED_LANGUAGE_CODES,
+  SUPPORTED_POS,
+  CEFR_LEVELS,
+  DIFFICULTY_LEVELS,
+} from "@glossa/shared";
+import { db } from "@glossa/db";
+import { terms, translations } from "@glossa/db/schema";
+
+// Element types of the shared constant lists, so the aliases below follow
+// whatever values @glossa/shared declares.
+type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
+type POS = (typeof SUPPORTED_POS)[number];
+type CEFRLevel = (typeof CEFR_LEVELS)[number];
+type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
+
+/** Shape each entry of a `{language}-merged.json` file is read as. */
+interface MergedRecord {
+  word: string;
+  pos: POS;
+  cefr: CEFRLevel;
+  difficulty: Difficulty;
+  sources: string[];
+}
+
+/** Pair of counters kept per CEFR level and per difficulty level. */
+interface LevelTally {
+  total: number;
+  matched: number;
+}
+
+/** Totals accumulated by one coverage run over a single merged file. */
+interface CoverageStats {
+  total: number;
+  matched: number;
+  unmatched: number;
+  byCefr: Record<CEFRLevel, LevelTally>;
+  byDifficulty: Record<Difficulty, LevelTally>;
+  // Sample of entries that were not found in the database.
+  unmatchedWords: Pick<MergedRecord, "word" | "pos" | "cefr">[];
+}
+
+// Directory containing the merged JSON files. The path is relative, so it is
+// resolved against the directory the script is started from.
+const dataDir = "./src/data/";
+
+/**
+ * Reports how many entries of `{language}-merged.json` have a matching
+ * translation in the database.
+ *
+ * An entry matches when its lower-cased word together with its part of speech
+ * equals the lower-cased `translations.text` / `terms.pos` of some row stored
+ * for `language`. Results are printed to the console; nothing is returned and
+ * nothing is written to the database.
+ *
+ * If the file cannot be read, is not valid JSON, or does not contain an array,
+ * a warning is logged and the language is skipped.
+ *
+ * @param language - Language code selecting both the merged file and the
+ *   translations to compare against.
+ */
+async function checkCoverage(language: LanguageCode): Promise<void> {
+  type Tally = { total: number; matched: number };
+
+  // Upper bound on how many unmatched entries are kept and printed as a sample.
+  const maxSamples = 20;
+  const formatCount = (n: number): string => n.toLocaleString("en-US");
+  // Guard against dividing by zero so an empty file prints 0.0% instead of NaN%.
+  const formatPercent = (part: number, total: number): string =>
+    total > 0 ? ((part / total) * 100).toFixed(1) : "0.0";
+
+  const filename = `${language}-merged.json`;
+  const filepath = dataDir + filename;
+
+  console.log(`\n📄 Checking ${filename}...`);
+
+  // Load merged data
+  let records: MergedRecord[];
+  try {
+    const raw = await fs.readFile(filepath, "utf8");
+    const parsed: unknown = JSON.parse(raw);
+    // The rest of the function iterates the result, so reject anything that
+    // is not an array here instead of failing later outside the try block.
+    if (!Array.isArray(parsed)) {
+      throw new Error("expected a JSON array of records");
+    }
+    records = parsed as MergedRecord[];
+  } catch (e) {
+    const reason = e instanceof Error ? e.message : String(e);
+    console.warn(`   ⚠️  Could not read file: ${reason}`);
+    return;
+  }
+
+  console.log(`   Loaded ${formatCount(records.length)} entries`);
+
+  // Initialize stats with one zeroed tally per known level
+  const stats: CoverageStats = {
+    total: records.length,
+    matched: 0,
+    unmatched: 0,
+    byCefr: {} as Record<CEFRLevel, Tally>,
+    byDifficulty: {} as Record<Difficulty, Tally>,
+    unmatchedWords: [],
+  };
+
+  for (const level of CEFR_LEVELS)
+    stats.byCefr[level] = { total: 0, matched: 0 };
+  for (const diff of DIFFICULTY_LEVELS)
+    stats.byDifficulty[diff] = { total: 0, matched: 0 };
+
+  // ── BATCHED LOOKUP: Build a Set of existing (word, pos) pairs in DB ──
+  console.log(`   🔍 Querying database for existing translations...`);
+
+  // One query fetches every translation of this language together with the
+  // part of speech of the term it belongs to.
+  const existingRows = await db
+    .select({ text: translations.text, pos: terms.pos })
+    .from(translations)
+    .innerJoin(terms, eq(translations.term_id, terms.id))
+    .where(eq(translations.language_code, language));
+
+  // Keys have the form "word|pos" so each record needs a single Set lookup.
+  const existingSet = new Set(
+    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
+  );
+
+  // ── Process records against the in-memory Set ──
+  // Number of records whose cefr or difficulty value has no tally bucket.
+  let unknownLevelCount = 0;
+
+  for (const record of records) {
+    // The file content is not validated, so a level outside the shared lists
+    // has no bucket; tolerate that instead of throwing on `undefined.total`.
+    const cefrTally: Tally | undefined = stats.byCefr[record.cefr];
+    const difficultyTally: Tally | undefined =
+      stats.byDifficulty[record.difficulty];
+
+    if (!cefrTally || !difficultyTally) unknownLevelCount++;
+    if (cefrTally) cefrTally.total++;
+    if (difficultyTally) difficultyTally.total++;
+
+    const key = `${record.word.toLowerCase()}|${record.pos}`;
+
+    if (existingSet.has(key)) {
+      stats.matched++;
+      if (cefrTally) cefrTally.matched++;
+      if (difficultyTally) difficultyTally.matched++;
+    } else {
+      stats.unmatched++;
+      if (stats.unmatchedWords.length < maxSamples) {
+        stats.unmatchedWords.push({
+          word: record.word,
+          pos: record.pos,
+          cefr: record.cefr,
+        });
+      }
+    }
+  }
+
+  // ── Print results ──
+  console.log(`\n📊 Coverage for ${language}:`);
+  console.log(`   Total entries:    ${formatCount(stats.total)}`);
+  console.log(
+    `   Matched in DB:    ${formatCount(stats.matched)} (${formatPercent(stats.matched, stats.total)}%)`,
+  );
+  console.log(
+    `   Unmatched:        ${formatCount(stats.unmatched)} (${formatPercent(stats.unmatched, stats.total)}%)`,
+  );
+
+  if (unknownLevelCount > 0) {
+    console.warn(
+      `   ⚠️  ${formatCount(unknownLevelCount)} entries have an unknown CEFR or difficulty level and are missing from the breakdowns below`,
+    );
+  }
+
+  console.log(`\n   By CEFR level:`);
+  for (const level of CEFR_LEVELS) {
+    const { total, matched } = stats.byCefr[level];
+    if (total > 0) {
+      console.log(
+        `      ${level}: ${formatCount(matched)}/${formatCount(total)} (${formatPercent(matched, total)}%)`,
+      );
+    }
+  }
+
+  console.log(`\n   By difficulty:`);
+  for (const diff of DIFFICULTY_LEVELS) {
+    const { total, matched } = stats.byDifficulty[diff];
+    if (total > 0) {
+      console.log(
+        `      ${diff}: ${formatCount(matched)}/${formatCount(total)} (${formatPercent(matched, total)}%)`,
+      );
+    }
+  }
+
+  if (stats.unmatchedWords.length > 0) {
+    console.log(`\n⚠️  Sample unmatched words (first ${maxSamples}):`);
+    for (const { word, pos, cefr } of stats.unmatchedWords) {
+      console.log(`      "${word}" (${pos}, ${cefr})`);
+    }
+    if (stats.unmatched > maxSamples) {
+      console.log(`      ... and ${stats.unmatched - maxSamples} more`);
+    }
+  }
+}
+
+/**
+ * Script entry point: prints a banner, runs the coverage check for every
+ * supported language one after another, then prints a closing banner.
+ */
+async function main(): Promise<void> {
+  const rule = "##########################################";
+
+  console.log(rule);
+  console.log("Glossa — CEFR Coverage Check");
+  console.log(rule);
+
+  // Languages are awaited in sequence so each report is printed as one
+  // uninterrupted section.
+  for (const code of SUPPORTED_LANGUAGE_CODES) {
+    await checkCoverage(code);
+  }
+
+  console.log(`\n${rule}`);
+  console.log("Done");
+  console.log(rule);
+}
+
+// Any rejection that escapes main is logged and turned into a non-zero exit code.
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});