// lila/packages/db/src/checking-cefr-coverage.ts

/*

This script performs a cross-reference check between two specific data sets:

    - The "Target" List: It reads the {language}-merged.json file (e.g., en-merged.json). This represents the vocabulary you want to have CEFR levels for.
    - The "Source of Truth": It queries your Database (translations table). This represents the vocabulary you currently have in your app.

What it calculates:
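It tells you: "Of all the words in my merged JSON file, how many actually exist in my database?"
A word counts as existing when the database has a translation in the same language with the same part of speech whose text equals the word, compared case-insensitively.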

    Matched: The word from the JSON file was found in the DB. (Ready for enrichment).
    Unmatched: The word from the JSON file was not found in the DB. (These will be skipped during enrichment).

*/

import fs from "node:fs/promises";
import { eq } from "drizzle-orm";

import {
  SUPPORTED_LANGUAGE_CODES,
  SUPPORTED_POS,
  CEFR_LEVELS,
  DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { terms, translations } from "@lila/db/schema";

/** Part-of-speech tag: any element of `SUPPORTED_POS`. */
type POS = (typeof SUPPORTED_POS)[number];
/** Language code: any element of `SUPPORTED_LANGUAGE_CODES`. */
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
/** CEFR level: any element of `CEFR_LEVELS`. */
type CEFRLevel = (typeof CEFR_LEVELS)[number];
/** Difficulty bucket: any element of `DIFFICULTY_LEVELS`. */
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];

/** Shape of one entry in a `{language}-merged.json` file as read by `checkCoverage`. */
type MergedRecord = {
  /** Word text; lowercased before being compared with `translations.text`. */
  word: string;
  /** Part of speech; forms the lookup key together with `word`. */
  pos: POS;
  /** CEFR level used to bucket the entry in the per-level breakdown. */
  cefr: CEFRLevel;
  /** Difficulty used to bucket the entry in the per-difficulty breakdown. */
  difficulty: Difficulty;
  /** Not read by this script — presumably the source lists the entry was merged from; confirm. */
  sources: string[];
};

/** Running counters for one bucket: entries seen and entries found in the DB. */
type CoverageTally = { total: number; matched: number };

/** Aggregated result of comparing one merged file against the database. */
type CoverageStats = {
  /** Number of merged entries included in the report. */
  total: number;
  /** Entries whose (word, pos) pair was found in the database. */
  matched: number;
  /** Entries whose (word, pos) pair was not found in the database. */
  unmatched: number;
  /** Per-CEFR-level counters. */
  byCefr: Record<CEFRLevel, CoverageTally>;
  /** Per-difficulty counters. */
  byDifficulty: Record<Difficulty, CoverageTally>;
  /** Capped sample of unmatched entries (the first 20 encountered). */
  unmatchedWords: { word: string; pos: POS; cefr: CEFRLevel }[];
};

// Directory holding the `{language}-merged.json` files. The path is relative,
// so it is resolved against the current working directory at read time.
// NOTE(review): presumably the script is meant to be run from packages/db — confirm.
const dataDir = "./src/data/";

/** Maximum number of unmatched entries kept (and printed) as a sample. */
const UNMATCHED_SAMPLE_LIMIT = 20;

/**
 * Formats `part / whole` as a percentage with one decimal place.
 * Returns "0.0" when `whole` is 0 so the report never shows "NaN".
 */
function formatPercent(part: number, whole: number): string {
  return (whole === 0 ? 0 : (part / whole) * 100).toFixed(1);
}

/** Prints a heading followed by one "matched/total (pct%)" line per non-empty bucket. */
function printBreakdown<K extends string>(
  heading: string,
  keys: Iterable<K>,
  tallies: Record<K, { total: number; matched: number }>,
): void {
  console.log(heading);
  for (const key of keys) {
    const { total, matched } = tallies[key];
    if (total > 0) {
      console.log(
        `      ${key}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${formatPercent(matched, total)}%)`,
      );
    }
  }
}

/**
 * Reports how many entries of `{language}-merged.json` already exist in the
 * `translations` table for that language.
 *
 * An entry is "matched" when a translation with the same part of speech and
 * the same text (compared case-insensitively) exists. Results are printed to
 * the console: overall totals, a breakdown per CEFR level and per difficulty,
 * and a capped sample of unmatched entries.
 *
 * A missing, unreadable or non-array file is reported with a warning and the
 * language is skipped. Entries that lack a string `word` or carry an unknown
 * CEFR/difficulty level are skipped with a warning and excluded from totals.
 *
 * @param language - Language code selecting both the merged file and the
 *   `translations.language_code` filter.
 * @returns Resolves once the report has been printed (or the file skipped).
 * @throws Propagates any error raised by the database query.
 */
async function checkCoverage(language: LanguageCode): Promise<void> {
  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📄 Checking ${filename}...`);

  // ── Load merged data; the parsed value stays `unknown` until validated ──
  let parsed: unknown;
  try {
    parsed = JSON.parse(await fs.readFile(filepath, "utf8"));
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.warn(`   ⚠️  Could not read file: ${reason}`);
    return;
  }
  if (!Array.isArray(parsed)) {
    console.warn(`   ⚠️  Could not read file: expected a JSON array of records`);
    return;
  }
  const entries: unknown[] = parsed;

  console.log(`   Loaded ${entries.length.toLocaleString("en-US")} entries`);

  // ── Drop entries that would otherwise crash the tally below ──
  const knownCefr = new Set<string>(CEFR_LEVELS);
  const knownDifficulty = new Set<string>(DIFFICULTY_LEVELS);
  const isUsable = (entry: unknown): entry is MergedRecord => {
    if (typeof entry !== "object" || entry === null) return false;
    const { word, cefr, difficulty } = entry as Record<string, unknown>;
    return (
      typeof word === "string" &&
      typeof cefr === "string" &&
      knownCefr.has(cefr) &&
      typeof difficulty === "string" &&
      knownDifficulty.has(difficulty)
    );
  };
  const records = entries.filter(isUsable);
  const skipped = entries.length - records.length;
  if (skipped > 0) {
    console.warn(
      `   ⚠️  Skipped ${skipped.toLocaleString("en-US")} malformed entries (missing word or unknown CEFR/difficulty level)`,
    );
  }

  // ── Initialize stats with a zeroed bucket for every known level ──
  const stats: CoverageStats = {
    total: records.length,
    matched: 0,
    unmatched: 0,
    byCefr: {} as Record<CEFRLevel, { total: number; matched: number }>,
    byDifficulty: {} as Record<Difficulty, { total: number; matched: number }>,
    unmatchedWords: [],
  };
  for (const level of CEFR_LEVELS) {
    stats.byCefr[level] = { total: 0, matched: 0 };
  }
  for (const diff of DIFFICULTY_LEVELS) {
    stats.byDifficulty[diff] = { total: 0, matched: 0 };
  }

  // ── Batched lookup: one query, then in-memory membership checks ──
  console.log(`   🔍 Querying database for existing translations...`);

  // Every translation of this language, paired with the POS of its term.
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // Keys have the form "lowercased word|pos".
  const existingSet = new Set(
    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
  );

  // ── Tally every record against the in-memory Set ──
  for (const record of records) {
    const cefrTally = stats.byCefr[record.cefr];
    const difficultyTally = stats.byDifficulty[record.difficulty];
    cefrTally.total++;
    difficultyTally.total++;

    const key = `${record.word.toLowerCase()}|${record.pos}`;

    if (existingSet.has(key)) {
      stats.matched++;
      cefrTally.matched++;
      difficultyTally.matched++;
      continue;
    }

    stats.unmatched++;
    // Only the first few unmatched entries are kept, to bound memory and output.
    if (stats.unmatchedWords.length < UNMATCHED_SAMPLE_LIMIT) {
      stats.unmatchedWords.push({
        word: record.word,
        pos: record.pos,
        cefr: record.cefr,
      });
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(`   Total entries:    ${stats.total.toLocaleString("en-US")}`);
  console.log(
    `   Matched in DB:    ${stats.matched.toLocaleString("en-US")} (${formatPercent(stats.matched, stats.total)}%)`,
  );
  console.log(
    `   Unmatched:        ${stats.unmatched.toLocaleString("en-US")} (${formatPercent(stats.unmatched, stats.total)}%)`,
  );

  printBreakdown(`\n   By CEFR level:`, CEFR_LEVELS, stats.byCefr);
  printBreakdown(`\n   By difficulty:`, DIFFICULTY_LEVELS, stats.byDifficulty);

  if (stats.unmatchedWords.length > 0) {
    console.log(
      `\n⚠️  Sample unmatched words (first ${UNMATCHED_SAMPLE_LIMIT}):`,
    );
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(`      "${word}" (${pos}, ${cefr})`);
    }
    const remaining = stats.unmatched - stats.unmatchedWords.length;
    if (remaining > 0) {
      console.log(`      ... and ${remaining.toLocaleString("en-US")} more`);
    }
  }
}

/**
 * Script entry point: prints a banner, runs the coverage check for every
 * supported language, then prints a closing banner.
 */
async function main(): Promise<void> {
  const rule = "##########################################";

  console.log(rule);
  console.log("lila — CEFR Coverage Check");
  console.log(rule);

  // Awaiting inside the loop keeps each language's report contiguous.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(code);
  }

  console.log(`\n${rule}`);
  console.log("Done");
  console.log(rule);
}

/** Logs whatever `main` rejected with and terminates with exit code 1. */
const exitWithFailure = (reason: unknown): never => {
  console.error(reason);
  process.exit(1);
};

// Run the script; any unhandled rejection ends the process as a failure.
main().catch(exitWithFailure);