feat: add db import script, fix duplicate translations in extract, add annotate script

2026-05-03 22:05:10 +02:00 · 2026-05-03 22:05:10 +02:00 · f59399be02
commit f59399be02
parent 4a842140b9
7 changed files with 274 additions and 62 deletions
--- a/data-pipeline/db/import.ts
+++ b/data-pipeline/db/import.ts
@ -0,0 +1,222 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+import { openDb } from "./index.js";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/** A single usage example; `source` records which of the two known origins it came from. */
+type Example = { text: string; source: "omw" | "cefr" };
+
+/**
+ * One record of a stage-2 annotated JSON file, in the shape this script reads.
+ * Every per-language map is partial, so any language key may be absent.
+ */
+type AnnotatedRecord = {
+  // Key used to match records across language files and written to every table.
+  source_id: string;
+  pos: SupportedPos;
+  // language -> words (de-duplicated at import time)
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  // language -> gloss texts
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  // language -> examples
+  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
+  // language -> word -> vote; `cefr_source` is the value stored as `cefr_level`
+  votes: Partial<
+    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
+  >;
+};
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+// Absolute directory of this module file, derived from `import.meta.url`.
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+// Filesystem locations used by this script, resolved relative to this file.
+const PATHS = {
+  // Directory holding the per-language annotated JSON files (`<lang>.json`).
+  annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
+};
+
+// ── Loading ───────────────────────────────────────────────────────────────────
+
+/**
+ * Loads every stage-2 annotated file and merges them into one record list.
+ *
+ * `en.json` is the base — it has the most complete glosses and examples.
+ * For each other supported language file, votes and CEFR examples are merged
+ * into the base record with the same `source_id`. Records whose `source_id`
+ * is not present in `en.json` are ignored.
+ *
+ * Files are read one at a time, in `SUPPORTED_LANGUAGE_CODES` order, so that
+ * later files deterministically override earlier votes for the same word.
+ *
+ * @returns the merged base records, in `en.json` insertion order
+ * @throws if a language file cannot be read or does not contain valid JSON
+ */
+async function loadAnnotated(): Promise<AnnotatedRecord[]> {
+  // Reads and parses `<lang>.json` from the annotated output directory.
+  // The parsed JSON is trusted to match AnnotatedRecord; it is not validated.
+  const readRecords = async (
+    fileLang: SupportedLanguageCode,
+  ): Promise<AnnotatedRecord[]> => {
+    const raw = await fs.readFile(
+      path.join(PATHS.annotatedDir, `${fileLang}.json`),
+      "utf-8",
+    );
+    return JSON.parse(raw) as AnnotatedRecord[];
+  };
+
+  const byId = new Map<string, AnnotatedRecord>();
+  for (const record of await readRecords("en")) {
+    byId.set(record.source_id, record);
+  }
+
+  for (const fileLang of SUPPORTED_LANGUAGE_CODES) {
+    if (fileLang === "en") continue;
+
+    for (const record of await readRecords(fileLang)) {
+      const target = byId.get(record.source_id);
+      if (!target) continue;
+
+      // Merge votes: entries from this file win over existing ones per word.
+      for (const [l, langVotes] of Object.entries(record.votes)) {
+        const voteLang = l as SupportedLanguageCode;
+        const merged = (target.votes[voteLang] ??= {});
+        Object.assign(merged, langVotes);
+      }
+
+      // Merge CEFR examples not already in base.
+      for (const [l, examples] of Object.entries(record.examples)) {
+        const exampleLang = l as SupportedLanguageCode;
+        const cefrExamples = examples.filter((e) => e.source === "cefr");
+        if (cefrExamples.length === 0) continue;
+
+        const existing = (target.examples[exampleLang] ??= []);
+        // Only CEFR examples are candidates, so only existing CEFR texts can
+        // collide; an "omw" example with the same text is a distinct row.
+        const seenTexts = new Set(
+          existing.filter((e) => e.source === "cefr").map((e) => e.text),
+        );
+        for (const example of cefrExamples) {
+          if (seenTexts.has(example.text)) continue;
+          seenTexts.add(example.text);
+          existing.push(example);
+        }
+      }
+    }
+  }
+
+  return [...byId.values()];
+}
+
+// ── Import ────────────────────────────────────────────────────────────────────
+
+/**
+ * Loads the merged stage-2 records and inserts them into the pipeline
+ * database: one `synsets` row per record, plus its translations, glosses,
+ * examples and CEFR source votes.
+ *
+ * All inserts run inside a single `db.transaction` callback. The database
+ * handle is always closed, including when a statement fails.
+ *
+ * @throws if the annotated files cannot be loaded or any statement fails
+ */
+export async function importStage2(): Promise<void> {
+  console.log("Loading stage 2 annotated files...");
+  const records = await loadAnnotated();
+  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);
+
+  const db = openDb();
+  try {
+    const insertSynset = db.prepare(
+      `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
+    );
+
+    const insertTranslation = db.prepare(
+      `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
+    );
+
+    const insertGloss = db.prepare(
+      `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
+    );
+
+    const insertExample = db.prepare(
+      `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
+    );
+
+    // The vote row references its translation via a lookup on
+    // (source_id, language, word), so translations must be inserted first.
+    // NOTE(review): a vote for a word with no translation row makes the
+    // subquery yield NULL — confirm the schema rejects or tolerates that.
+    const insertCefrVote = db.prepare(`
+    INSERT INTO cefr_source_votes (translation_id, cefr_level)
+    VALUES (
+      (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
+      ?
+    )
+  `);
+
+    console.log("\nImporting into pipeline.db...");
+
+    const importAll = db.transaction(() => {
+      let synsets = 0;
+      let translations = 0;
+      let glosses = 0;
+      let examples = 0;
+      let cefrVotes = 0;
+
+      for (const record of records) {
+        insertSynset.run(record.source_id, record.pos);
+        synsets++;
+
+        // Translations (de-duplicated per language before insert)
+        for (const [lang, words] of Object.entries(record.translations)) {
+          const unique = [...new Set(words)];
+          for (const word of unique) {
+            insertTranslation.run(record.source_id, lang, word);
+            translations++;
+          }
+        }
+
+        // Glosses
+        for (const [lang, glossList] of Object.entries(record.glosses)) {
+          for (const text of glossList) {
+            insertGloss.run(record.source_id, lang, text);
+            glosses++;
+          }
+        }
+
+        // Examples
+        for (const [lang, exList] of Object.entries(record.examples)) {
+          for (const example of exList) {
+            insertExample.run(
+              record.source_id,
+              lang,
+              example.text,
+              example.source,
+            );
+            examples++;
+          }
+        }
+
+        // CEFR source votes
+        for (const [lang, langVotes] of Object.entries(record.votes)) {
+          for (const [word, vote] of Object.entries(
+            langVotes as Record<string, { cefr_source: string }>,
+          )) {
+            insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
+            cefrVotes++;
+          }
+        }
+      }
+
+      return { synsets, translations, glosses, examples, cefrVotes };
+    });
+
+    const counts = importAll();
+
+    console.log(`  synsets:      ${counts.synsets.toLocaleString()}`);
+    console.log(`  translations: ${counts.translations.toLocaleString()}`);
+    console.log(`  glosses:      ${counts.glosses.toLocaleString()}`);
+    console.log(`  examples:     ${counts.examples.toLocaleString()}`);
+    console.log(`  cefr votes:   ${counts.cefrVotes.toLocaleString()}`);
+  } finally {
+    // Release the handle even when preparing or running a statement throws.
+    db.close();
+  }
+
+  console.log("\nImport complete.");
+}
+
+// ── Check if already imported ─────────────────────────────────────────────────
+
+/**
+ * Reports whether the pipeline database already holds imported data.
+ *
+ * Opens its own database handle, counts the rows in `synsets`, and closes
+ * the handle again — also when the query throws.
+ *
+ * @returns `true` when `synsets` contains at least one row
+ */
+export function isImported(): boolean {
+  const db = openDb();
+  try {
+    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+      count: number;
+    };
+    return row.count > 0;
+  } finally {
+    db.close();
+  }
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Script entry point: imports stage-2 data unless `synsets` already has rows.
+ *
+ * When data is present it prints the existing row count and exits with
+ * status 0 without touching the database; otherwise it runs `importStage2`.
+ */
+async function main(): Promise<void> {
+  const db = openDb();
+  let existingSynsets: number;
+  try {
+    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+      count: number;
+    };
+    existingSynsets = row.count;
+  } finally {
+    // Close the pre-check handle even if the query throws; the import opens
+    // its own handle.
+    db.close();
+  }
+
+  if (existingSynsets > 0) {
+    console.log(
+      `pipeline.db already contains ${existingSynsets.toLocaleString()} synsets — skipping import.`,
+    );
+    console.log("Delete pipeline.db and re-run db:init to start fresh.");
+    process.exit(0);
+  }
+
+  await importStage2();
+}
+
+// Run the script. Any rejection from main() is printed and mapped to exit
+// status 1.
+// NOTE(review): this call is unconditional, so importing this module for its
+// exported functions also runs main() — confirm that is intended.
+void main().catch((reason: unknown) => {
+  console.error(reason);
+  process.exit(1);
+});