adding script to check cefr coverage between json files and database, adding script to write cefr levels from json to db

2026-04-09 10:25:20 +02:00 · 2026-04-09 10:25:20 +02:00 · 13cc709b09
commit 13cc709b09
parent 3374bd8b20
7 changed files with 279296 additions and 1 deletions
--- a/packages/db/src/checking-cefr-coverage.ts
+++ b/packages/db/src/checking-cefr-coverage.ts
@ -0,0 +1,183 @@
+/*
+
+This script performs a cross-reference check between two specific data sets:
+
+    - The "Target" List: It reads the {language}-merged.json file (e.g., en-merged.json). This represents the vocabulary you want to have CEFR levels for.
+    - The "Source of Truth": It queries your Database (translations table). This represents the vocabulary you currently have in your app.
+
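+What it calculates:
+It tells you: "Of all the words in my merged JSON file, how many actually exist in my database?"
+A word counts as existing when its lowercased text and its part of speech both match a translation row for the same language.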
+
+    Matched: The word from the JSON file was found in the DB. (Ready for enrichment).
+    Unmatched: The word from the JSON file was not found in the DB. (These will be skipped during enrichment).
+
+*/
+
+import fs from "node:fs/promises";
+import { eq } from "drizzle-orm";
+
+import {
+  SUPPORTED_LANGUAGE_CODES,
+  SUPPORTED_POS,
+  CEFR_LEVELS,
+  DIFFICULTY_LEVELS,
+} from "@glossa/shared";
+import { db } from "@glossa/db";
+import { terms, translations } from "@glossa/db/schema";
+
+// Element types of the array-like constants imported from @glossa/shared.
+type POS = (typeof SUPPORTED_POS)[number];
+type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
+type CEFRLevel = (typeof CEFR_LEVELS)[number];
+type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
+
+// Shape expected of each entry in a `<language>-merged.json` file.
+type MergedRecord = {
+  word: string;
+  pos: POS;
+  cefr: CEFRLevel;
+  difficulty: Difficulty;
+  sources: string[];
+};
+
+// Counters accumulated while comparing one merged file against the database.
+type CoverageStats = {
+  // Number of entries counted for the report.
+  total: number;
+  // Entries whose (word, pos) pair was found in the database.
+  matched: number;
+  // Entries whose (word, pos) pair was not found in the database.
+  unmatched: number;
+  // Per-CEFR-level totals and matches.
+  byCefr: Record<CEFRLevel, { total: number; matched: number }>;
+  // Per-difficulty totals and matches.
+  byDifficulty: Record<Difficulty, { total: number; matched: number }>;
+  // Sample of unmatched entries kept for the printed report.
+  unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
+};
+
+// Directory prefix of the merged JSON files. The path is relative, so it is
+// resolved against the working directory the script is started from.
+const dataDir = "./src/data/";
+
+/**
+ * Prints a coverage report for one language: how many entries of
+ * `<language>-merged.json` have a matching translation row in the database.
+ *
+ * An entry matches when its lowercased `word` and its `pos` equal the
+ * lowercased text and term POS of a translation row for `language`.
+ * The function only reads from the database.
+ *
+ * Entries that are malformed (missing `word`, or a `cefr`/`difficulty` value
+ * that is not one of the known levels) are skipped and reported separately
+ * instead of aborting the whole run.
+ *
+ * @param language Language code used for both the file name and the DB filter.
+ * @returns Resolves once the report is printed, or early (after a warning)
+ *          when the file cannot be read or does not contain a JSON array.
+ */
+async function checkCoverage(language: LanguageCode): Promise<void> {
+  const SAMPLE_LIMIT = 20;
+  const fmtInt = (n: number): string => n.toLocaleString("en-US");
+  // Guard the denominator: an empty file must print 0.0%, not NaN%.
+  const pct = (part: number, whole: number): string =>
+    (whole === 0 ? 0 : (part / whole) * 100).toFixed(1);
+
+  const filename = `${language}-merged.json`;
+  const filepath = dataDir + filename;
+
+  console.log(`\n📄 Checking ${filename}...`);
+
+  // Load merged data
+  let records: MergedRecord[];
+  try {
+    const raw = await fs.readFile(filepath, "utf8");
+    const parsed: unknown = JSON.parse(raw);
+    if (!Array.isArray(parsed)) {
+      throw new Error(`${filename} does not contain a JSON array`);
+    }
+    records = parsed as MergedRecord[];
+  } catch (e) {
+    const reason = e instanceof Error ? e.message : String(e);
+    console.warn(`   ⚠️  Could not read file: ${reason}`);
+    return;
+  }
+
+  console.log(`   Loaded ${fmtInt(records.length)} entries`);
+
+  // Initialize stats; `total` counts only the well-formed entries.
+  const stats: CoverageStats = {
+    total: 0,
+    matched: 0,
+    unmatched: 0,
+    byCefr: {} as Record<CEFRLevel, { total: number; matched: number }>,
+    byDifficulty: {} as Record<Difficulty, { total: number; matched: number }>,
+    unmatchedWords: [],
+  };
+
+  for (const level of CEFR_LEVELS)
+    stats.byCefr[level] = { total: 0, matched: 0 };
+  for (const diff of DIFFICULTY_LEVELS)
+    stats.byDifficulty[diff] = { total: 0, matched: 0 };
+
+  // The JSON file is external input, so its enum values are checked at runtime.
+  const knownCefr = new Set<string>(CEFR_LEVELS);
+  const knownDifficulty = new Set<string>(DIFFICULTY_LEVELS);
+  let skipped = 0;
+
+  // ── BATCHED LOOKUP: Build a Set of existing (word, pos) pairs in DB ──
+  console.log(`   🔍 Querying database for existing translations...`);
+
+  // One query for every translation of this language, joined with the POS of
+  // its term (the query filters by language only, not by POS).
+  const existingRows = await db
+    .select({ text: translations.text, pos: terms.pos })
+    .from(translations)
+    .innerJoin(terms, eq(translations.term_id, terms.id))
+    .where(eq(translations.language_code, language));
+
+  // Set of "lowercased word|pos" keys for constant-time membership checks.
+  const existingSet = new Set(
+    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
+  );
+
+  // ── Process records against the in-memory Set ──
+  for (const record of records) {
+    if (
+      record == null ||
+      typeof record.word !== "string" ||
+      !knownCefr.has(record.cefr) ||
+      !knownDifficulty.has(record.difficulty)
+    ) {
+      skipped++;
+      continue;
+    }
+
+    const cefrBucket = stats.byCefr[record.cefr];
+    const difficultyBucket = stats.byDifficulty[record.difficulty];
+
+    stats.total++;
+    cefrBucket.total++;
+    difficultyBucket.total++;
+
+    const key = `${record.word.toLowerCase()}|${record.pos}`;
+
+    if (existingSet.has(key)) {
+      stats.matched++;
+      cefrBucket.matched++;
+      difficultyBucket.matched++;
+    } else {
+      stats.unmatched++;
+      // Keep only a small sample for the report.
+      if (stats.unmatchedWords.length < SAMPLE_LIMIT) {
+        stats.unmatchedWords.push({
+          word: record.word,
+          pos: record.pos,
+          cefr: record.cefr,
+        });
+      }
+    }
+  }
+
+  // ── Print results ──
+  console.log(`\n📊 Coverage for ${language}:`);
+  console.log(`   Total entries:    ${fmtInt(stats.total)}`);
+  console.log(
+    `   Matched in DB:    ${fmtInt(stats.matched)} (${pct(stats.matched, stats.total)}%)`,
+  );
+  console.log(
+    `   Unmatched:        ${fmtInt(stats.unmatched)} (${pct(stats.unmatched, stats.total)}%)`,
+  );
+  if (skipped > 0) {
+    console.warn(
+      `   ⚠️  Skipped ${fmtInt(skipped)} malformed entries (missing word or unknown CEFR/difficulty)`,
+    );
+  }
+
+  console.log(`\n   By CEFR level:`);
+  for (const level of CEFR_LEVELS) {
+    const { total, matched } = stats.byCefr[level];
+    if (total > 0) {
+      console.log(
+        `      ${level}: ${fmtInt(matched)}/${fmtInt(total)} (${pct(matched, total)}%)`,
+      );
+    }
+  }
+
+  console.log(`\n   By difficulty:`);
+  for (const diff of DIFFICULTY_LEVELS) {
+    const { total, matched } = stats.byDifficulty[diff];
+    if (total > 0) {
+      console.log(
+        `      ${diff}: ${fmtInt(matched)}/${fmtInt(total)} (${pct(matched, total)}%)`,
+      );
+    }
+  }
+
+  if (stats.unmatchedWords.length > 0) {
+    console.log(`\n⚠️  Sample unmatched words (first ${SAMPLE_LIMIT}):`);
+    for (const { word, pos, cefr } of stats.unmatchedWords) {
+      console.log(`      "${word}" (${pos}, ${cefr})`);
+    }
+    if (stats.unmatched > SAMPLE_LIMIT) {
+      console.log(`      ... and ${stats.unmatched - SAMPLE_LIMIT} more`);
+    }
+  }
+}
+
+/**
+ * Script entry point: prints a banner, runs the coverage check for every
+ * supported language one after another, then prints a closing banner.
+ */
+const main = async () => {
+  const rule = "##########################################";
+
+  for (const line of [rule, "Glossa — CEFR Coverage Check", rule]) {
+    console.log(line);
+  }
+
+  // Each language is awaited before the next one starts.
+  for (const code of SUPPORTED_LANGUAGE_CODES) {
+    await checkCoverage(code);
+  }
+
+  for (const line of [`\n${rule}`, "Done", rule]) {
+    console.log(line);
+  }
+};
+
+// A rejected run is logged and turned into a non-zero exit code.
+main().then(undefined, (failure) => {
+  console.error(failure);
+  process.exit(1);
+});
--- a/packages/db/src/data/en-merged.json
+++ b/packages/db/src/data/en-merged.json
--- a/packages/db/src/data/it-merged.json
+++ b/packages/db/src/data/it-merged.json
--- a/packages/db/src/data/datafiles/omw-noun.json
+++ b/packages/db/src/data/datafiles/omw-noun.json
--- a/packages/db/src/data/datafiles/omw-verb.json
+++ b/packages/db/src/data/datafiles/omw-verb.json
--- a/packages/db/src/seeding-cefr-levels.ts
+++ b/packages/db/src/seeding-cefr-levels.ts
@ -0,0 +1,148 @@
+import fs from "node:fs/promises";
+import { eq, inArray } from "drizzle-orm";
+
+import {
+  SUPPORTED_LANGUAGE_CODES,
+  SUPPORTED_POS,
+  CEFR_LEVELS,
+  DIFFICULTY_LEVELS,
+} from "@glossa/shared";
+import { db } from "@glossa/db";
+import { translations, terms } from "@glossa/db/schema";
+
+// Element types of the array-like constants imported from @glossa/shared.
+type POS = (typeof SUPPORTED_POS)[number];
+type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
+type CEFRLevel = (typeof CEFR_LEVELS)[number];
+type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
+
+// Shape expected of each entry in a `<language>-merged.json` file.
+type MergedRecord = {
+  word: string;
+  pos: POS;
+  cefr: CEFRLevel;
+  difficulty: Difficulty;
+  sources: string[];
+};
+
+// Directory prefix of the merged JSON files. The path is relative, so it is
+// resolved against the working directory the script is started from.
+const dataDir = "./src/data/";
+// Maximum number of translation ids sent in a single UPDATE statement.
+const BATCH_SIZE = 500;
+
+// ────────────────────────────────────────────────────────────
+// Helpers
+// ────────────────────────────────────────────────────────────
+
+/**
+ * Splits `arr` into consecutive slices of at most `size` elements.
+ *
+ * @param arr  Items to split; the array itself is not modified.
+ * @param size Maximum slice length; must be a positive integer.
+ * @returns The slices in original order (an empty array for empty input).
+ * @throws RangeError when `size` is not a positive integer. A size of 0
+ *         would otherwise never advance the index and loop forever.
+ */
+function chunk<T>(arr: T[], size: number): T[][] {
+  if (!Number.isInteger(size) || size < 1) {
+    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
+  }
+  const out: T[][] = [];
+  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
+  return out;
+}
+
+/** Renders a number using en-US locale conventions. */
+function fmt(n: number): string {
+  const usFormat = new Intl.NumberFormat("en-US");
+  return usFormat.format(n);
+}
+
+// ────────────────────────────────────────────────────────────
+// Enrichment per language
+// ────────────────────────────────────────────────────────────
+
+/**
+ * Writes the CEFR level and difficulty from `<language>-merged.json` onto the
+ * matching translation rows of that language.
+ *
+ * A record matches every translation whose lowercased text and term POS equal
+ * the record's lowercased `word` and `pos`. Each translation id is assigned
+ * exactly one target value: the first record in file order wins, and later
+ * records that disagree are counted as conflicts. This keeps the result
+ * independent of update order and makes the "updated" count exact.
+ *
+ * Malformed records (missing `word`, or a `cefr`/`difficulty` value that is
+ * not one of the known levels) are skipped so that an invalid value is never
+ * sent to the database.
+ *
+ * NOTE(review): the batched updates are not wrapped in a transaction, so a
+ * failure partway through leaves earlier batches applied.
+ *
+ * @param language Language code used for both the file name and the DB filter.
+ * @returns Resolves once all batches are written and the summary is printed,
+ *          or early (after a warning) when the file cannot be read or does
+ *          not contain a JSON array.
+ */
+async function enrichLanguage(language: LanguageCode): Promise<void> {
+  type Target = { cefr: CEFRLevel; difficulty: Difficulty };
+
+  const filename = `${language}-merged.json`;
+  const filepath = dataDir + filename;
+
+  console.log(`\n📝 Enriching ${filename}...`);
+
+  let records: MergedRecord[];
+  try {
+    const raw = await fs.readFile(filepath, "utf8");
+    const parsed: unknown = JSON.parse(raw);
+    if (!Array.isArray(parsed)) {
+      throw new Error(`${filename} does not contain a JSON array`);
+    }
+    records = parsed as MergedRecord[];
+  } catch (e) {
+    const reason = e instanceof Error ? e.message : String(e);
+    console.warn(`   ⚠️  Could not read file: ${reason}`);
+    return;
+  }
+
+  console.log(`   Loaded ${fmt(records.length)} entries`);
+
+  // 1. Bulk fetch existing translations for this language
+  console.log(`   🔍 Fetching existing translations from DB...`);
+  const existingTranslations = await db
+    .select({ id: translations.id, text: translations.text, pos: terms.pos })
+    .from(translations)
+    .innerJoin(terms, eq(translations.term_id, terms.id))
+    .where(eq(translations.language_code, language));
+
+  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
+  const translationMap = new Map<string, string[]>();
+  for (const t of existingTranslations) {
+    const key = `${t.text.toLowerCase()}|${t.pos}`;
+    const bucket = translationMap.get(key);
+    if (bucket) bucket.push(t.id);
+    else translationMap.set(key, [t.id]);
+  }
+
+  // 3. Assign each matched translation ID exactly one (cefr, difficulty)
+  const knownCefr = new Set<string>(CEFR_LEVELS);
+  const knownDifficulty = new Set<string>(DIFFICULTY_LEVELS);
+  const targetById = new Map<string, Target>();
+  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
+  let skipped = 0;
+  let conflicts = 0;
+
+  for (const rec of records) {
+    if (
+      rec == null ||
+      typeof rec.word !== "string" ||
+      !knownCefr.has(rec.cefr) ||
+      !knownDifficulty.has(rec.difficulty)
+    ) {
+      skipped++;
+      continue;
+    }
+
+    const ids = translationMap.get(`${rec.word.toLowerCase()}|${rec.pos}`);
+    if (ids === undefined) {
+      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
+      continue;
+    }
+
+    for (const id of ids) {
+      const prior = targetById.get(id);
+      if (prior === undefined) {
+        targetById.set(id, { cefr: rec.cefr, difficulty: rec.difficulty });
+      } else if (
+        prior.cefr !== rec.cefr ||
+        prior.difficulty !== rec.difficulty
+      ) {
+        // First record wins; the disagreement is only counted.
+        conflicts++;
+      }
+    }
+  }
+
+  // Group IDs by target value so each group needs only one SET clause.
+  const groups = new Map<string, Target & { ids: string[] }>();
+  for (const [id, target] of targetById) {
+    const groupKey = `${target.cefr}|${target.difficulty}`;
+    const group = groups.get(groupKey);
+    if (group) group.ids.push(id);
+    else groups.set(groupKey, { ...target, ids: [id] });
+  }
+
+  // 4. Batch updates grouped by (cefr, difficulty)
+  let totalUpdated = 0;
+  for (const { cefr, difficulty, ids } of groups.values()) {
+    for (const idBatch of chunk(ids, BATCH_SIZE)) {
+      await db
+        .update(translations)
+        .set({ cefr_level: cefr, difficulty })
+        .where(inArray(translations.id, idBatch));
+      totalUpdated += idBatch.length;
+    }
+  }
+
+  // 5. Summary
+  console.log(`\n   ✅ Updated ${fmt(totalUpdated)} translations`);
+  console.log(`   ⚠️  Unmatched: ${fmt(unmatchedWords.length)}`);
+  if (conflicts > 0) {
+    console.warn(
+      `   ⚠️  Conflicting levels ignored (first record kept): ${fmt(conflicts)}`,
+    );
+  }
+  if (skipped > 0) {
+    console.warn(
+      `   ⚠️  Skipped ${fmt(skipped)} malformed entries (missing word or unknown CEFR/difficulty)`,
+    );
+  }
+
+  if (unmatchedWords.length > 0) {
+    console.log(`\n   Sample unmatched words (first 20):`);
+    for (const { word, pos, cefr } of unmatchedWords.slice(0, 20)) {
+      console.log(`      "${word}" (${pos}, ${cefr})`);
+    }
+    if (unmatchedWords.length > 20) {
+      console.log(`      ... and ${fmt(unmatchedWords.length - 20)} more`);
+    }
+  }
+}
+
+// ────────────────────────────────────────────────────────────
+// Main
+// ────────────────────────────────────────────────────────────
+
+/**
+ * Script entry point: prints a banner, enriches every supported language one
+ * after another, then prints a closing banner.
+ */
+const main = async () => {
+  const rule = "##########################################";
+
+  for (const line of [rule, "Glossa — CEFR Enrichment", `${rule}\n`]) {
+    console.log(line);
+  }
+
+  // Each language is awaited before the next one starts.
+  for (const code of SUPPORTED_LANGUAGE_CODES) {
+    await enrichLanguage(code);
+  }
+
+  for (const line of [`\n${rule}`, "Done", rule]) {
+    console.log(line);
+  }
+};
+
+// A rejected run is logged and turned into a non-zero exit code.
+main().then(undefined, (failure) => {
+  console.error(failure);
+  process.exit(1);
+});
--- a/packages/db/src/seeding-datafiles.ts
+++ b/packages/db/src/seeding-datafiles.ts
@ -15,7 +15,7 @@ type SynsetRecord = {
  glosses: Partial<Record<LanguageCode, string[]>>;
 };

-const dataDir = "./src/data/datafiles/";
+const dataDir = "./src/data/";
 const BATCH_SIZE = 500;

 // ────────────────────────────────────────────────────────────