Add script to check CEFR coverage between the JSON files and the database; add script to write CEFR levels from the JSON files to the database

2026-04-09 10:25:20 +02:00 · 2026-04-09 10:25:20 +02:00 · 13cc709b09
commit 13cc709b09
parent 3374bd8b20
7 changed files with 279296 additions and 1 deletions
--- a/packages/db/src/checking-cefr-coverage.ts
+++ b/packages/db/src/checking-cefr-coverage.ts
@ -0,0 +1,183 @@
 /*
 This script cross-references two data sets:
    - The "Target" list: the {language}-merged.json file (e.g., en-merged.json). This is the vocabulary you want to have CEFR levels for.
    - The "Source of Truth": the database (the translations table, joined with terms for the part of speech). This is the vocabulary the app currently contains.
 What it calculates:
 It answers the question: "Of all the words in the merged JSON file, how many actually exist in the database?"
 A word counts as found when its lowercased text and its part of speech both match a translation for that language.
    Matched: The word from the JSON file was found in the DB. (Ready for enrichment.)
    Unmatched: The word from the JSON file was not found in the DB. (These will be skipped during enrichment.)
 */
 import fs from "node:fs/promises";
 import { eq } from "drizzle-orm";
 import {
  SUPPORTED_LANGUAGE_CODES,
  SUPPORTED_POS,
  CEFR_LEVELS,
  DIFFICULTY_LEVELS,
 } from "@glossa/shared";
 import { db } from "@glossa/db";
 import { terms, translations } from "@glossa/db/schema";
 /** Part of speech: one element of the shared `SUPPORTED_POS` list. */
 type POS = (typeof SUPPORTED_POS)[number];
 /** Language code: one element of the shared `SUPPORTED_LANGUAGE_CODES` list. */
 type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
 /** CEFR level: one element of the shared `CEFR_LEVELS` list. */
 type CEFRLevel = (typeof CEFR_LEVELS)[number];
 /** Difficulty value: one element of the shared `DIFFICULTY_LEVELS` list. */
 type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
 /**
  * Shape the entries of a `{language}-merged.json` file are assumed to have.
  * The parsed JSON is cast to this type without any runtime validation.
  */
 type MergedRecord = {
  /** Word as written in the JSON file; it is lowercased before being compared with the DB. */
  word: string;
  /** Part of speech; combined with the word to form the lookup key. */
  pos: POS;
  /** CEFR level assigned to the word in the merged file. */
  cefr: CEFRLevel;
  /** Difficulty assigned to the word in the merged file. */
  difficulty: Difficulty;
  /** Not read by this script — presumably the data sets the entry was merged from (TODO confirm). */
  sources: string[];
 };
 /** Counters collected while comparing one merged file against the database. */
 type CoverageStats = {
  /** Number of entries in the merged file. */
  total: number;
  /** Entries whose (lowercased word, pos) pair exists in the DB. */
  matched: number;
  /** Entries whose (lowercased word, pos) pair does not exist in the DB. */
  unmatched: number;
  /** Per CEFR level: entries seen and entries matched. */
  byCefr: Record<CEFRLevel, { total: number; matched: number }>;
  /** Per difficulty: entries seen and entries matched. */
  byDifficulty: Record<Difficulty, { total: number; matched: number }>;
  /** Sample of unmatched entries for the report; filled up to 20 entries only. */
  unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
 };
 // Directory the `{language}-merged.json` files are read from. It is a relative
 // path, so presumably it resolves against the directory the script is started
 // from — confirm how the script is launched.
 const dataDir = "./src/data/";
 /**
  * Reports how many entries of `{language}-merged.json` have a matching
  * translation in the database and prints the result to the console.
  *
  * An entry matches when a translation row for `language` exists whose
  * lowercased text, together with the part of speech of its term, equals the
  * entry's lowercased `word` and its `pos`.
  *
  * The file is read from `dataDir`. If it cannot be read, is not valid JSON, or
  * does not hold an array, a warning is logged and the language is skipped.
  * Entries whose `cefr` or `difficulty` is not one of the supported values still
  * count towards the overall totals but are left out of the per-level breakdown
  * (and reported separately) instead of aborting the run.
  *
  * @param language - Language code whose merged file and translations are compared.
  * @returns A promise that resolves once the report has been printed.
  */
 async function checkCoverage(language: LanguageCode): Promise<void> {
  type LevelCount = { total: number; matched: number };
  // Maximum number of unmatched entries kept for the sample printout.
  const SAMPLE_LIMIT = 20;
  const count = (n: number): string => n.toLocaleString("en-US");
  // Percentage with one decimal. An empty denominator yields "0.0" rather than "NaN".
  const percent = (part: number, whole: number): string =>
    whole === 0 ? "0.0" : ((part / whole) * 100).toFixed(1);

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;
  console.log(`\n📄 Checking ${filename}...`);

  // Load merged data. Read errors, JSON syntax errors and a non-array payload
  // are all reported the same way and skip this language.
  let records: MergedRecord[];
  try {
    const parsed: unknown = JSON.parse(await fs.readFile(filepath, "utf8"));
    if (!Array.isArray(parsed)) {
      throw new TypeError("expected a JSON array of records");
    }
    records = parsed as MergedRecord[];
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(`   ⚠️  Could not read file: ${reason}`);
    return;
  }
  console.log(`   Loaded ${count(records.length)} entries`);

  // Initialize stats with a zeroed bucket for every supported level.
  const stats: CoverageStats = {
    total: records.length,
    matched: 0,
    unmatched: 0,
    byCefr: {} as Record<CEFRLevel, LevelCount>,
    byDifficulty: {} as Record<Difficulty, LevelCount>,
    unmatchedWords: [],
  };
  for (const level of CEFR_LEVELS) {
    stats.byCefr[level] = { total: 0, matched: 0 };
  }
  for (const diff of DIFFICULTY_LEVELS) {
    stats.byDifficulty[diff] = { total: 0, matched: 0 };
  }

  // ── BATCHED LOOKUP: Build a Set of existing (word, pos) pairs in DB ──
  console.log(`   🔍 Querying database for existing translations...`);
  // One query for all translations of this language, joined with their term
  // to obtain the part of speech.
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));
  // Keys have the form "lowercased word|pos".
  const existingSet = new Set(
    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
  );

  // ── Process records against the in-memory Set ──
  let unknownLevels = 0;
  for (const record of records) {
    // The JSON is not validated, so a level outside the supported lists has no
    // bucket; the explicit `| undefined` makes the compiler enforce the guard.
    const cefrBucket: LevelCount | undefined = stats.byCefr[record.cefr];
    const difficultyBucket: LevelCount | undefined =
      stats.byDifficulty[record.difficulty];
    if (!cefrBucket || !difficultyBucket) unknownLevels++;
    if (cefrBucket) cefrBucket.total++;
    if (difficultyBucket) difficultyBucket.total++;

    const key = `${record.word.toLowerCase()}|${record.pos}`;
    if (existingSet.has(key)) {
      stats.matched++;
      if (cefrBucket) cefrBucket.matched++;
      if (difficultyBucket) difficultyBucket.matched++;
    } else {
      stats.unmatched++;
      if (stats.unmatchedWords.length < SAMPLE_LIMIT) {
        stats.unmatchedWords.push({
          word: record.word,
          pos: record.pos,
          cefr: record.cefr,
        });
      }
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(`   Total entries:    ${count(stats.total)}`);
  console.log(
    `   Matched in DB:    ${count(stats.matched)} (${percent(stats.matched, stats.total)}%)`,
  );
  console.log(
    `   Unmatched:        ${count(stats.unmatched)} (${percent(stats.unmatched, stats.total)}%)`,
  );
  console.log(`\n   By CEFR level:`);
  for (const level of CEFR_LEVELS) {
    const { total, matched } = stats.byCefr[level];
    if (total > 0) {
      console.log(
        `      ${level}: ${count(matched)}/${count(total)} (${percent(matched, total)}%)`,
      );
    }
  }
  console.log(`\n   By difficulty:`);
  for (const diff of DIFFICULTY_LEVELS) {
    const { total, matched } = stats.byDifficulty[diff];
    if (total > 0) {
      console.log(
        `      ${diff}: ${count(matched)}/${count(total)} (${percent(matched, total)}%)`,
      );
    }
  }
  if (unknownLevels > 0) {
    console.warn(
      `\n   ⚠️  ${count(unknownLevels)} entries have an unsupported CEFR level or difficulty and are missing from the breakdown above`,
    );
  }
  if (stats.unmatchedWords.length > 0) {
    console.log(`\n⚠️  Sample unmatched words (first ${SAMPLE_LIMIT}):`);
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(`      "${word}" (${pos}, ${cefr})`);
    }
    if (stats.unmatched > SAMPLE_LIMIT) {
      console.log(`      ... and ${count(stats.unmatched - SAMPLE_LIMIT)} more`);
    }
  }
 }
 /**
  * Entry point: prints a banner, runs the coverage check for every supported
  * language one after another, then prints a closing banner.
  */
 async function main(): Promise<void> {
  const rule = "##########################################";

  console.log(rule);
  console.log("Glossa — CEFR Coverage Check");
  console.log(rule);

  // Languages are processed sequentially so their reports do not interleave.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(code);
  }

  console.log(`\n${rule}`);
  console.log("Done");
  console.log(rule);
 }

 // Any uncaught failure is logged and turned into a non-zero exit code.
 main().catch((failure) => {
  console.error(failure);
  process.exit(1);
 });
--- a/packages/db/src/data/en-merged.json
+++ b/packages/db/src/data/en-merged.json
--- a/packages/db/src/data/it-merged.json
+++ b/packages/db/src/data/it-merged.json
--- a/packages/db/src/data/datafiles/omw-noun.json
+++ b/packages/db/src/data/datafiles/omw-noun.json
--- a/packages/db/src/data/datafiles/omw-verb.json
+++ b/packages/db/src/data/datafiles/omw-verb.json
--- a/packages/db/src/seeding-cefr-levels.ts
+++ b/packages/db/src/seeding-cefr-levels.ts
@ -0,0 +1,148 @@
 import fs from "node:fs/promises";
 import { eq, inArray } from "drizzle-orm";
 import {
  SUPPORTED_LANGUAGE_CODES,
  SUPPORTED_POS,
  CEFR_LEVELS,
  DIFFICULTY_LEVELS,
 } from "@glossa/shared";
 import { db } from "@glossa/db";
 import { translations, terms } from "@glossa/db/schema";
 /** Part of speech: one element of the shared `SUPPORTED_POS` list. */
 type POS = (typeof SUPPORTED_POS)[number];
 /** Language code: one element of the shared `SUPPORTED_LANGUAGE_CODES` list. */
 type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
 /** CEFR level: one element of the shared `CEFR_LEVELS` list. */
 type CEFRLevel = (typeof CEFR_LEVELS)[number];
 /** Difficulty value: one element of the shared `DIFFICULTY_LEVELS` list. */
 type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
 /**
  * Shape the entries of a `{language}-merged.json` file are assumed to have.
  * The parsed JSON is cast to this type without any runtime validation.
  */
 type MergedRecord = {
  /** Word as written in the JSON file; it is lowercased before being matched against the DB. */
  word: string;
  /** Part of speech; combined with the word to form the lookup key. */
  pos: POS;
  /** CEFR level written to the matching translations. */
  cefr: CEFRLevel;
  /** Difficulty written to the matching translations. */
  difficulty: Difficulty;
  /** Not read by this script — presumably the data sets the entry was merged from (TODO confirm). */
  sources: string[];
 };
 // Directory the `{language}-merged.json` files are read from. It is a relative
 // path, so presumably it resolves against the directory the script is started
 // from — confirm how the script is launched.
 const dataDir = "./src/data/";
 // Maximum number of translation IDs sent in a single UPDATE statement.
 const BATCH_SIZE = 500;
 // ────────────────────────────────────────────────────────────
 // Helpers
 // ────────────────────────────────────────────────────────────
 /**
  * Splits `arr` into consecutive slices of at most `size` elements, preserving
  * order. The input array is not modified; an empty input yields an empty result.
  *
  * @param arr - Items to split.
  * @param size - Maximum number of items per slice; must be at least 1.
  * @returns The slices, in input order. Only the last one may be shorter than `size`.
  * @throws RangeError If `size` is `NaN` or smaller than 1. Such a step would
  *   never advance through the array (or would yield empty slices).
  */
 function chunk<T>(arr: readonly T[], size: number): T[][] {
  // `!(size >= 1)` also rejects NaN, which a plain `size < 1` would let through.
  if (!(size >= 1)) {
    throw new RangeError(`chunk size must be at least 1, got ${size}`);
  }
  const out: T[][] = [];
  for (let start = 0; start < arr.length; start += size) {
    out.push(arr.slice(start, start + size));
  }
  return out;
 }
 /**
  * Renders a number with en-US formatting (comma thousands separators) for
  * the console output of this script.
  *
  * @param n - Number to format.
  * @returns The formatted string.
  */
 function fmt(n: number): string {
  const usFormat = new Intl.NumberFormat("en-US");
  return usFormat.format(n);
 }
 // ────────────────────────────────────────────────────────────
 // Enrichment per language
 // ────────────────────────────────────────────────────────────
 /**
  * Writes the CEFR level and difficulty from `{language}-merged.json` onto the
  * matching rows of the `translations` table and prints a summary.
  *
  * A record matches every translation of `language` whose lowercased text,
  * together with the part of speech of its term, equals the record's lowercased
  * `word` and its `pos`.
  *
  * Each translation receives exactly one target value:
  * - if several records match the same translation with different levels, the
  *   first record in file order wins and the others are counted as conflicts;
  * - records whose `cefr` or `difficulty` is not one of the supported values
  *   are never written and are counted separately.
  *
  * If the file cannot be read, is not valid JSON, or does not hold an array, a
  * warning is logged and the language is skipped.
  *
  * NOTE(review): the updates are issued batch by batch without a surrounding
  * transaction, so a failure part-way leaves the earlier batches applied.
  *
  * @param language - Language code whose merged file is applied to the database.
  * @returns A promise that resolves once all updates were issued and the summary printed.
  */
 async function enrichLanguage(language: LanguageCode): Promise<void> {
  type Target = { cefr: CEFRLevel; difficulty: Difficulty };

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;
  console.log(`\n📝 Enriching ${filename}...`);

  let records: MergedRecord[];
  try {
    const parsed: unknown = JSON.parse(await fs.readFile(filepath, "utf8"));
    if (!Array.isArray(parsed)) {
      throw new TypeError("expected a JSON array of records");
    }
    records = parsed as MergedRecord[];
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(`   ⚠️  Could not read file: ${reason}`);
    return;
  }
  console.log(`   Loaded ${fmt(records.length)} entries`);

  // 1. Bulk fetch existing translations for this language
  console.log(`   🔍 Fetching existing translations from DB...`);
  const existingTranslations = await db
    .select({ id: translations.id, text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
  const translationMap = new Map<string, string[]>();
  for (const t of existingTranslations) {
    const key = `${t.text.toLowerCase()}|${t.pos}`;
    const bucket = translationMap.get(key);
    if (bucket) bucket.push(t.id);
    else translationMap.set(key, [t.id]);
  }

  // 3. Resolve exactly one target per translation ID
  const knownCefr = new Set<string>(CEFR_LEVELS);
  const knownDifficulty = new Set<string>(DIFFICULTY_LEVELS);
  const targetById = new Map<string, Target>();
  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
  let invalidLevels = 0;
  let conflicts = 0;
  for (const rec of records) {
    const ids = translationMap.get(`${rec.word.toLowerCase()}|${rec.pos}`);
    if (!ids) {
      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
      continue;
    }
    // The JSON is not validated, so keep unsupported values out of the DB.
    if (!knownCefr.has(rec.cefr) || !knownDifficulty.has(rec.difficulty)) {
      invalidLevels++;
      continue;
    }
    for (const id of ids) {
      const previous = targetById.get(id);
      if (previous === undefined) {
        targetById.set(id, { cefr: rec.cefr, difficulty: rec.difficulty });
      } else if (
        previous.cefr !== rec.cefr ||
        previous.difficulty !== rec.difficulty
      ) {
        // First record wins; a later contradicting record is only counted.
        conflicts++;
      }
    }
  }

  // 4. Group IDs by (cefr, difficulty) so each group is written with one
  //    UPDATE per batch of at most BATCH_SIZE IDs
  const groups = new Map<string, Target & { ids: string[] }>();
  for (const [id, target] of targetById) {
    const groupKey = `${target.cefr}|${target.difficulty}`;
    const group = groups.get(groupKey);
    if (group) group.ids.push(id);
    else groups.set(groupKey, { ...target, ids: [id] });
  }
  // Counts the IDs submitted for update (each exactly once), not rows
  // reported back by the database.
  let totalUpdated = 0;
  for (const { cefr, difficulty, ids } of groups.values()) {
    for (const idBatch of chunk(ids, BATCH_SIZE)) {
      await db
        .update(translations)
        .set({ cefr_level: cefr, difficulty })
        .where(inArray(translations.id, idBatch));
      totalUpdated += idBatch.length;
    }
  }

  // 5. Summary
  console.log(`\n   ✅ Updated ${fmt(totalUpdated)} translations`);
  console.log(`   ⚠️  Unmatched: ${fmt(unmatchedWords.length)}`);
  if (invalidLevels > 0) {
    console.warn(
      `   ⚠️  Skipped (unsupported CEFR level or difficulty): ${fmt(invalidLevels)}`,
    );
  }
  if (conflicts > 0) {
    console.warn(
      `   ⚠️  Conflicting levels for an already assigned translation (first record kept): ${fmt(conflicts)}`,
    );
  }
  if (unmatchedWords.length > 0) {
    console.log(`\n   Sample unmatched words (first 20):`);
    for (const { word, pos, cefr } of unmatchedWords.slice(0, 20)) {
      console.log(`      "${word}" (${pos}, ${cefr})`);
    }
    if (unmatchedWords.length > 20) {
      console.log(`      ... and ${fmt(unmatchedWords.length - 20)} more`);
    }
  }
 }
 // ────────────────────────────────────────────────────────────
 // Main
 // ────────────────────────────────────────────────────────────
 /**
  * Entry point: prints a banner, enriches every supported language one after
  * another, then prints a closing banner.
  */
 async function main(): Promise<void> {
  const rule = "##########################################";

  console.log(rule);
  console.log("Glossa — CEFR Enrichment");
  console.log(`${rule}\n`);

  // Languages are processed sequentially so their output does not interleave.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await enrichLanguage(code);
  }

  console.log(`\n${rule}`);
  console.log("Done");
  console.log(rule);
 }

 // Any uncaught failure is logged and turned into a non-zero exit code.
 main().catch((failure) => {
  console.error(failure);
  process.exit(1);
 });
--- a/packages/db/src/seeding-datafiles.ts
+++ b/packages/db/src/seeding-datafiles.ts
@ -15,7 +15,7 @@ type SynsetRecord = {
  glosses: Partial<Record<LanguageCode, string[]>>;
 };
-const dataDir = "./src/data/datafiles/";
+const dataDir = "./src/data/";
 const BATCH_SIZE = 500;
 // ────────────────────────────────────────────────────────────