// lila/data-pipeline/db/import.ts

import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
import { openDb } from "./index.js";

// ── Types ─────────────────────────────────────────────────────────────────────

/** Origin tag carried by every example sentence in the annotated files. */
type ExampleSource = "omw" | "cefr";

/** One example sentence together with its origin tag. */
type Example = { text: string; source: ExampleSource };

/** Sparse per-language map: any supported language code may be absent. */
type ByLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/** Vote payload stored under each key of a language's vote map. */
type CefrVote = { cefr_source: string };

/**
 * Shape of one entry in a stage 2 annotated JSON file.
 *
 * `source_id` identifies the synset and is the key used to merge records
 * across language files. Every other field except `pos` is keyed by
 * language code.
 */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: ByLanguage<string[]>;
  glosses: ByLanguage<string[]>;
  examples: ByLanguage<Example[]>;
  // Inner keys are looked up as translation words when votes are imported.
  votes: ByLanguage<Record<string, CefrVote>>;
};

// ── Paths ─────────────────────────────────────────────────────────────────────

// Directory that holds this module, recovered from the module's file URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Filesystem locations this script reads from, resolved relative to this file.
const PATHS = {
  annotatedDir: path.resolve(__dirname, "..", "stage-2-annotate", "output"),
};

// ── Loading ───────────────────────────────────────────────────────────────────

/**
 * Loads the stage 2 annotated files and folds them into one record per synset.
 *
 * `en.json` is the base — it has the most complete glosses and examples — and
 * its records are kept as-is. For every other code in
 * `SUPPORTED_LANGUAGE_CODES`, `<code>.json` is read and, for each record whose
 * `source_id` also exists in the base, its votes and its CEFR-sourced examples
 * are merged into the base record. Records that do not appear in `en.json`
 * are ignored.
 *
 * A CEFR example is only added when the base record does not already hold a
 * CEFR example with the same text for that language, so re-merging the same
 * sentence never produces duplicate rows downstream.
 *
 * @returns One merged record per distinct `source_id` found in `en.json`.
 * @throws If a language file cannot be read or does not contain valid JSON.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // NOTE(review): parsed JSON is trusted to match AnnotatedRecord; there is
  // no schema validation here.
  const readRecords = async (code: string): Promise<AnnotatedRecord[]> => {
    const raw = await fs.readFile(
      path.join(PATHS.annotatedDir, `${code}.json`),
      "utf-8",
    );
    return JSON.parse(raw) as AnnotatedRecord[];
  };

  const byId = new Map<string, AnnotatedRecord>();
  for (const record of await readRecords("en")) {
    byId.set(record.source_id, record);
  }

  // Files are read one at a time, in SUPPORTED_LANGUAGE_CODES order, so that
  // when two files vote on the same word the later file wins.
  for (const fileLang of SUPPORTED_LANGUAGE_CODES) {
    if (fileLang === "en") continue;

    for (const record of await readRecords(fileLang)) {
      const target = byId.get(record.source_id);
      if (!target) continue;

      // Merge votes: per language, incoming words overwrite existing ones.
      for (const [code, langVotes] of Object.entries(record.votes)) {
        const voteLang = code as SupportedLanguageCode;
        const mergedVotes = target.votes[voteLang] ?? {};
        Object.assign(mergedVotes, langVotes);
        target.votes[voteLang] = mergedVotes;
      }

      // Merge CEFR examples not already in base.
      for (const [code, incoming] of Object.entries(record.examples)) {
        const exampleLang = code as SupportedLanguageCode;
        const existing = target.examples[exampleLang] ?? [];
        const seenCefrTexts = new Set(
          existing.filter((e) => e.source === "cefr").map((e) => e.text),
        );

        let added = false;
        for (const example of incoming) {
          if (example.source !== "cefr") continue;
          if (seenCefrTexts.has(example.text)) continue;
          seenCefrTexts.add(example.text);
          existing.push(example);
          added = true;
        }

        // Only attach the list when something was added, so a language with
        // no CEFR examples does not gain an empty entry.
        if (added) target.examples[exampleLang] = existing;
      }
    }
  }

  return [...byId.values()];
}

// ── Import ────────────────────────────────────────────────────────────────────

/**
 * Prepares the insert statements and writes every record inside a single
 * transaction.
 *
 * NOTE(review): assumes `db` exposes a better-sqlite3-style API
 * (`prepare().run()`, `transaction(fn)`) — confirm against `openDb`.
 *
 * @param db Open database handle; the caller owns it and closes it.
 * @param records Merged annotated records to insert.
 * @returns How many rows of each kind were submitted for insertion.
 */
function insertRecords(
  db: ReturnType<typeof openDb>,
  records: AnnotatedRecord[],
) {
  const insertSynset = db.prepare(
    `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
  );

  const insertTranslation = db.prepare(
    `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
  );

  const insertGloss = db.prepare(
    `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
  );

  const insertExample = db.prepare(
    `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
  );

  // The translation row is looked up by (source_id, language, word).
  // NOTE(review): when no translation matches, the scalar subquery yields
  // NULL; whether that is rejected depends on the schema, which is not
  // visible here.
  const insertCefrVote = db.prepare(`
    INSERT INTO cefr_source_votes (translation_id, cefr_level)
    VALUES (
      (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
      ?
    )
  `);

  console.log("\nImporting into pipeline.db...");

  const importAll = db.transaction(() => {
    let synsets = 0;
    let translations = 0;
    let glosses = 0;
    let examples = 0;
    let cefrVotes = 0;

    for (const record of records) {
      insertSynset.run(record.source_id, record.pos);
      synsets++;

      // Translations — de-duplicated per language so the vote lookup above
      // matches at most one row.
      for (const [lang, words] of Object.entries(record.translations)) {
        for (const word of new Set(words)) {
          insertTranslation.run(record.source_id, lang, word);
          translations++;
        }
      }

      // Glosses
      for (const [lang, glossList] of Object.entries(record.glosses)) {
        for (const text of glossList) {
          insertGloss.run(record.source_id, lang, text);
          glosses++;
        }
      }

      // Examples
      for (const [lang, exList] of Object.entries(record.examples)) {
        for (const example of exList) {
          insertExample.run(
            record.source_id,
            lang,
            example.text,
            example.source,
          );
          examples++;
        }
      }

      // CEFR source votes — must run after this record's translations are
      // inserted, because the statement resolves translation_id by lookup.
      for (const [lang, langVotes] of Object.entries(record.votes)) {
        for (const [word, vote] of Object.entries(
          langVotes as Record<string, { cefr_source: string }>,
        )) {
          insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
          cefrVotes++;
        }
      }
    }

    return { synsets, translations, glosses, examples, cefrVotes };
  });

  return importAll();
}

/**
 * Loads the merged stage 2 records and inserts them into the pipeline
 * database, logging per-table counts.
 *
 * The database handle is closed even when a statement fails to prepare or
 * the transaction throws, so a failed import does not leave the file open.
 *
 * @throws Whatever `loadAnnotated`, `openDb`, or the database calls throw.
 */
export async function importStage2(): Promise<void> {
  console.log("Loading stage 2 annotated files...");
  const records = await loadAnnotated();
  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);

  const db = openDb();
  try {
    const counts = insertRecords(db, records);

    console.log(`  synsets:      ${counts.synsets.toLocaleString()}`);
    console.log(`  translations: ${counts.translations.toLocaleString()}`);
    console.log(`  glosses:      ${counts.glosses.toLocaleString()}`);
    console.log(`  examples:     ${counts.examples.toLocaleString()}`);
    console.log(`  cefr votes:   ${counts.cefrVotes.toLocaleString()}`);
  } finally {
    db.close();
  }

  console.log("\nImport complete.");
}

// ── Check if already imported ─────────────────────────────────────────────────

/**
 * Counts the rows currently in the `synsets` table.
 *
 * Opens its own database handle and always closes it, including when the
 * query throws.
 */
function countSynsets(): number {
  const db = openDb();
  try {
    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
      count: number;
    };
    return row.count;
  } finally {
    db.close();
  }
}

/**
 * Reports whether the import has already run.
 *
 * @returns `true` when the `synsets` table holds at least one row.
 */
export function isImported(): boolean {
  return countSynsets() > 0;
}

// ── Main ─────────────────────────────────────────────────────────────────────

/**
 * Script entry point: runs the import unless the database already holds
 * synsets, in which case it prints a notice and exits with status 0.
 */
async function main(): Promise<void> {
  const existing = countSynsets();

  if (existing > 0) {
    console.log(
      `pipeline.db already contains ${existing.toLocaleString()} synsets — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }

  await importStage2();
}

// Run main() only when this file is the process entry point, not when it is
// imported. Filesystem paths are compared rather than a hand-built file://
// URL, because import.meta.url is percent-encoded (spaces, non-ASCII) and
// uses a different shape on Windows, so the string comparison would miss.
if (
  process.argv[1] !== undefined &&
  fileURLToPath(import.meta.url) === path.resolve(process.argv[1])
) {
  main().catch((err) => {
    console.error(err);
    process.exit(1);
  });
}