// lila/packages/db/src/seeding-cefr-levels.ts

import fs from "node:fs/promises";
import { eq, inArray } from "drizzle-orm";

import {
  SUPPORTED_LANGUAGE_CODES,
  SUPPORTED_POS,
  CEFR_LEVELS,
  DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { translations, terms } from "@lila/db/schema";

// Element types derived from the constant lists exported by @lila/shared.
type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type CEFRLevel = (typeof CEFR_LEVELS)[number];
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];

/**
 * Shape that each entry of a `<language>-merged.json` file is treated as
 * after parsing.
 */
type MergedRecord = {
  /** Word text; compared case-insensitively against translation text. */
  word: string;
  /** Part of speech; together with `word` it forms the lookup key. */
  pos: POS;
  /** CEFR level written to the matching translations. */
  cefr: CEFRLevel;
  /** Difficulty written to the matching translations. */
  difficulty: Difficulty;
  /** Not read by this script. Presumably the origin datasets — TODO confirm. */
  sources: string[];
};

// Directory prefix of the merged JSON files. It is a relative path, so it is
// resolved against the working directory the script is started from.
const dataDir = "./src/data/";
// Maximum number of translation ids sent in a single UPDATE statement.
const BATCH_SIZE = 500;

// ────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────

/**
 * Splits `arr` into consecutive slices of at most `size` elements, preserving
 * order. The last slice may be shorter; an empty input yields an empty result.
 *
 * @param arr  Items to split; the array is not modified.
 * @param size Maximum slice length; must be a positive integer.
 * @returns A new array of new sub-arrays.
 * @throws RangeError if `size` is not a positive integer. A size of zero or
 *   below would never advance the loop, and a fractional size would produce
 *   overlapping slices.
 */
function chunk<T>(arr: readonly T[], size: number): T[][] {
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
  return out;
}

/**
 * Renders a number using en-US conventions (comma thousands separators) for
 * the console summaries.
 */
function fmt(n: number): string {
  const usFormatter = new Intl.NumberFormat("en-US");
  return usFormatter.format(n);
}

// ────────────────────────────────────────────────────────────
// Enrichment per language
// ────────────────────────────────────────────────────────────

/** The pair of values written onto a translation row. */
type EnrichmentTarget = { cefr: CEFRLevel; difficulty: Difficulty };

/**
 * Structural check for one parsed JSON entry. Only verifies that the fields
 * this script reads are strings; it does not check that `pos`, `cefr` or
 * `difficulty` belong to the supported value lists.
 */
function isMergedRecord(value: unknown): value is MergedRecord {
  if (typeof value !== "object" || value === null) return false;
  const rec = value as Record<string, unknown>;
  return (
    typeof rec.word === "string" &&
    typeof rec.pos === "string" &&
    typeof rec.cefr === "string" &&
    typeof rec.difficulty === "string"
  );
}

/**
 * Applies CEFR level and difficulty from `<language>-merged.json` to the
 * translations of that language already stored in the database.
 *
 * Records are matched to translations by lowercased text plus the part of
 * speech of the translation's term. Every matched translation is updated
 * exactly once; if several records target the same translation with different
 * values, the record that appears last in the file wins. Records without a
 * match are only reported, never inserted.
 *
 * A missing, unreadable or malformed file is logged as a warning and the
 * language is skipped. Database errors are not caught and reject the promise.
 *
 * @param language Language code selecting both the input file and the
 *   translations to update.
 */
async function enrichLanguage(language: LanguageCode): Promise<void> {
  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📝 Enriching ${filename}...`);

  let entries: unknown[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      throw new Error("expected a JSON array of records");
    }
    entries = parsed;
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    console.warn(`   ⚠️  Could not read file: ${message}`);
    return;
  }

  // Drop entries that lack the string fields read below, so one bad entry
  // cannot abort the whole run with a TypeError.
  const records = entries.filter(isMergedRecord);
  const skipped = entries.length - records.length;
  if (skipped > 0) {
    console.warn(`   ⚠️  Skipped ${fmt(skipped)} malformed entries`);
  }

  console.log(`   Loaded ${fmt(records.length)} entries`);

  // 1. Bulk fetch existing translations for this language
  console.log(`   🔍 Fetching existing translations from DB...`);
  const existingTranslations = await db
    .select({ id: translations.id, text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
  const translationMap = new Map<string, string[]>();
  for (const t of existingTranslations) {
    const key = `${t.text.toLowerCase()}|${t.pos}`;
    const bucket = translationMap.get(key);
    if (bucket) bucket.push(t.id);
    else translationMap.set(key, [t.id]);
  }

  // 3. Resolve one target per translation ID. Keying by ID (instead of
  //    collecting IDs per value) guarantees a row is never placed in two
  //    update groups, which would make the final value depend on group order
  //    and inflate the update count.
  const targetById = new Map<string, EnrichmentTarget>();
  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
  let conflicts = 0;

  for (const rec of records) {
    const key = `${rec.word.toLowerCase()}|${rec.pos}`;
    const ids = translationMap.get(key);

    if (!ids || ids.length === 0) {
      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
      continue;
    }

    for (const id of ids) {
      const previous = targetById.get(id);
      if (
        previous &&
        (previous.cefr !== rec.cefr || previous.difficulty !== rec.difficulty)
      ) {
        conflicts += 1;
      }
      targetById.set(id, { cefr: rec.cefr, difficulty: rec.difficulty });
    }
  }

  // 4. Group IDs by (cefr, difficulty) so each distinct value pair needs only
  //    a handful of UPDATE statements, then write them in batches.
  const groups = new Map<string, { target: EnrichmentTarget; ids: string[] }>();
  for (const [id, target] of targetById) {
    const valueKey = `${target.cefr}|${target.difficulty}`;
    const group = groups.get(valueKey);
    if (group) group.ids.push(id);
    else groups.set(valueKey, { target, ids: [id] });
  }

  let totalUpdated = 0;
  for (const { target, ids } of groups.values()) {
    for (const idBatch of chunk(ids, BATCH_SIZE)) {
      await db
        .update(translations)
        .set({ cefr_level: target.cefr, difficulty: target.difficulty })
        .where(inArray(translations.id, idBatch));
      totalUpdated += idBatch.length;
    }
  }

  // 5. Summary
  console.log(`\n   ✅ Updated ${fmt(totalUpdated)} translations`);
  console.log(`   ⚠️  Unmatched: ${fmt(unmatchedWords.length)}`);
  if (conflicts > 0) {
    console.log(
      `   ⚠️  Conflicting values for the same translation: ${fmt(conflicts)} (last record wins)`,
    );
  }

  if (unmatchedWords.length > 0) {
    console.log(`\n   Sample unmatched words (first 20):`);
    for (const { word, pos, cefr } of unmatchedWords.slice(0, 20)) {
      console.log(`      "${word}" (${pos}, ${cefr})`);
    }
    if (unmatchedWords.length > 20) {
      console.log(`      ... and ${fmt(unmatchedWords.length - 20)} more`);
    }
  }
}

// ────────────────────────────────────────────────────────────
// Main
// ────────────────────────────────────────────────────────────

/**
 * Prints a banner, enriches every supported language one after another
 * (never in parallel), then prints a closing banner.
 */
const main = async () => {
  const rule = "##########################################";

  console.log(rule);
  console.log("lila — CEFR Enrichment");
  console.log(`${rule}\n`);

  // Sequential on purpose: each language finishes before the next one starts.
  for (const languageCode of SUPPORTED_LANGUAGE_CODES) {
    await enrichLanguage(languageCode);
  }

  console.log(`\n${rule}`);
  console.log("Done");
  console.log(rule);
};

// Script entry point: run the enrichment and, on any unhandled failure,
// print the error and exit with a non-zero status.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});