feat: add Kaikki extraction and import scripts for stage 1

- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, skips abbreviations and senses with no translations in supported languages
- Rewrite db/import.ts for Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for Kaikki model — entries, translations, LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode hardcoded to 500 entries for development
2026-05-05 18:11:53 +02:00 · 2026-05-05 18:11:53 +02:00 · 209d52f54b
commit 209d52f54b
parent 963bff4eb8
17 changed files with 346 additions and 1055737 deletions
--- a/data-pipeline/db/import.ts
+++ b/data-pipeline/db/import.ts
@@ -1,185 +1,98 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
-import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 import { openDb } from "./index.js";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
+import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";

 // ── Paths ─────────────────────────────────────────────────────────────────────

 const __dirname = path.dirname(fileURLToPath(import.meta.url));

 const PATHS = {
-  annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
+  extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
 };

-// ── Loading ───────────────────────────────────────────────────────────────────
-
-async function loadAnnotated(): Promise<AnnotatedRecord[]> {
-  // Use en.json as the base — it has the most complete glosses and examples.
-  // Merge votes and CEFR examples from the other language files.
-  const baseRaw = await fs.readFile(
-    path.join(PATHS.annotatedDir, "en.json"),
-    "utf-8",
-  );
-  const base = JSON.parse(baseRaw) as AnnotatedRecord[];
-
-  const byId = new Map<string, AnnotatedRecord>();
-  for (const record of base) {
-    byId.set(record.source_id, record);
-  }
-
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    if (lang === "en") continue;
-
-    const raw = await fs.readFile(
-      path.join(PATHS.annotatedDir, `${lang}.json`),
-      "utf-8",
-    );
-    const records = JSON.parse(raw) as AnnotatedRecord[];
-
-    for (const record of records) {
-      const base = byId.get(record.source_id);
-      if (!base) continue;
-
-      // Merge votes
-      for (const [l, langVotes] of Object.entries(record.votes)) {
-        if (!base.votes[l as SupportedLanguageCode]) {
-          base.votes[l as SupportedLanguageCode] = {};
-        }
-        Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
-      }
-
-      // Merge CEFR examples not already in base
-      for (const [l, examples] of Object.entries(record.examples)) {
-        const lang = l as SupportedLanguageCode;
-        const cefrExamples = examples.filter((e) => e.source === "cefr");
-        if (cefrExamples.length === 0) continue;
-
-        if (!base.examples[lang]) {
-          base.examples[lang] = cefrExamples;
-        } else {
-          base.examples[lang].push(...cefrExamples);
-        }
-      }
-    }
-  }
-
-  return [...byId.values()];
-}
-
 // ── Import ────────────────────────────────────────────────────────────────────

-export async function importStage2(): Promise<void> {
-  console.log("Loading stage 2 annotated files...");
-  const records = await loadAnnotated();
-  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);
+export async function importKaikki(): Promise<void> {
+  console.log("Loading extracted Kaikki data...");
+  const raw = await fs.readFile(PATHS.extracted, "utf-8");
+  const senses = JSON.parse(raw) as ExtractedSense[];
+  console.log(`  Loaded ${senses.length.toLocaleString()} senses`);

  const db = openDb();

-  const insertSynset = db.prepare(
-    `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
-  );
-
-  const insertTranslation = db.prepare(
-    `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
-  );
-
-  const insertGloss = db.prepare(
-    `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
-  );
-
-  const insertExample = db.prepare(
-    `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
-  );
-
-  const insertCefrVote = db.prepare(`
-    INSERT INTO cefr_source_votes (translation_id, cefr_level)
-    VALUES (
-      (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
-      ?
-    )
+  const insertEntry = db.prepare(`
+    INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
+    VALUES (?, ?, ?, ?, ?, ?)
+    ON CONFLICT (headword, language, pos, sense_index)
+    DO UPDATE SET
+      gloss    = excluded.gloss,
+      examples = excluded.examples
+    RETURNING id
  `);

+  const insertTranslation = db.prepare(`
+    INSERT INTO translations (entry_id, target_lang, word, sense_hint)
+    VALUES (?, ?, ?, ?)
+    ON CONFLICT (entry_id, target_lang, word) DO NOTHING
+  `);
+
+  // Track next available sense_index per (headword, pos) to handle
+  // the same word appearing in multiple JSONL entries with the same POS.
+  const senseIndexMap = new Map<string, number>();
+
  console.log("\nImporting into pipeline.db...");

  const importAll = db.transaction(() => {
-    let synsets = 0;
+    let entries = 0;
    let translations = 0;
-    let glosses = 0;
-    let examples = 0;
-    let cefrVotes = 0;
+    let skipped = 0;

-    for (const record of records) {
-      insertSynset.run(record.source_id, record.pos);
-      synsets++;
+    for (const sense of senses) {
+      const key = `${sense.headword}|${sense.pos}`;
+      const nextIndex = senseIndexMap.get(key) ?? 0;

-      // Translations
-      for (const [lang, words] of Object.entries(record.translations)) {
-        const unique = [...new Set(words)];
-        for (const word of unique) {
-          insertTranslation.run(record.source_id, lang, word);
-          translations++;
-        }
+      // Use the offset sense_index to avoid collisions when the same word
+      // appears in multiple JSONL entries with the same POS.
+      const senseIndex = nextIndex;
+      senseIndexMap.set(key, nextIndex + 1);
+
+      const row = insertEntry.get(
+        sense.headword,
+        "en",
+        sense.pos,
+        senseIndex,
+        sense.gloss ?? null,
+        JSON.stringify(sense.examples),
+      ) as { id: number } | undefined;
+
+      if (!row) {
+        skipped++;
+        continue;
      }

-      // Glosses
-      for (const [lang, glossList] of Object.entries(record.glosses)) {
-        for (const text of glossList) {
-          insertGloss.run(record.source_id, lang, text);
-          glosses++;
-        }
-      }
+      entries++;

-      // Examples
-      for (const [lang, exList] of Object.entries(record.examples)) {
-        for (const example of exList) {
-          insertExample.run(
-            record.source_id,
-            lang,
-            example.text,
-            example.source,
-          );
-          examples++;
-        }
-      }
-
-      // CEFR source votes
-      for (const [lang, langVotes] of Object.entries(record.votes)) {
-        for (const [word, vote] of Object.entries(
-          langVotes as Record<string, { cefr_source: string }>,
-        )) {
-          insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
-          cefrVotes++;
-        }
+      for (const t of sense.translations) {
+        insertTranslation.run(
+          row.id,
+          t.target_lang,
+          t.word,
+          t.sense_hint ?? null,
+        );
+        translations++;
      }
    }

-    return { synsets, translations, glosses, examples, cefrVotes };
+    return { entries, translations, skipped };
  });

  const counts = importAll();

-  console.log(`  synsets:      ${counts.synsets.toLocaleString()}`);
+  console.log(`  entries:      ${counts.entries.toLocaleString()}`);
  console.log(`  translations: ${counts.translations.toLocaleString()}`);
-  console.log(`  glosses:      ${counts.glosses.toLocaleString()}`);
-  console.log(`  examples:     ${counts.examples.toLocaleString()}`);
-  console.log(`  cefr votes:   ${counts.cefrVotes.toLocaleString()}`);
+  console.log(`  skipped:      ${counts.skipped.toLocaleString()}`);

  db.close();
  console.log("\nImport complete.");
@@ -189,7 +102,7 @@ export async function importStage2(): Promise<void> {

 export function isImported(): boolean {
  const db = openDb();
-  const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+  const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
    count: number;
  };
  db.close();
@@ -200,20 +113,20 @@ export function isImported(): boolean {

 async function main(): Promise<void> {
  const db = openDb();
-  const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+  const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
    count: number;
  };
  db.close();

  if (row.count > 0) {
    console.log(
-      `pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`,
+      `pipeline.db already contains ${row.count.toLocaleString()} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }

-  await importStage2();
+  await importKaikki();
 }

 if (import.meta.url === `file://${process.argv[1]}`) {
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@@ -1,62 +1,58 @@
 -- ── Base data ─────────────────────────────────────────────────────────────────
--- Imported from stage 2 JSON on first run. Never mutated after import.
+-- Imported from Kaikki on first run. Never mutated after import.

-CREATE TABLE IF NOT EXISTS synsets (
-  source_id TEXT PRIMARY KEY,
-  pos       TEXT NOT NULL
+CREATE TABLE IF NOT EXISTS entries (
+  id          INTEGER PRIMARY KEY,
+  headword    TEXT    NOT NULL,
+  language    TEXT    NOT NULL,
+  pos         TEXT    NOT NULL,
+  sense_index INTEGER NOT NULL DEFAULT 0,
+  gloss       TEXT,
+  examples    TEXT    NOT NULL DEFAULT '[]', -- JSON array of strings
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
+  UNIQUE (headword, language, pos, sense_index)
 );

 CREATE TABLE IF NOT EXISTS translations (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  word      TEXT    NOT NULL,
-  UNIQUE (source_id, language, word)
-);
-
-CREATE TABLE IF NOT EXISTS glosses (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS examples (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS cefr_source_votes (
-  id             INTEGER PRIMARY KEY,
-  translation_id INTEGER NOT NULL REFERENCES translations(id),
-  cefr_level     TEXT    NOT NULL,
-  UNIQUE (translation_id)
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  sense_hint  TEXT,
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
+  UNIQUE (entry_id, target_lang, word)
 );

 -- ── Status tracking ───────────────────────────────────────────────────────────
--- One row per synset per model per stage. Drives resumability.
+-- One row per entry per model per stage. Drives resumability.
+-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
 -- stage:  round1 | round2 | tiebreak
 -- status: pending | complete | needs_review | flagged

 CREATE TABLE IF NOT EXISTS run_status (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL,
+  entry_id   INTEGER NOT NULL,
  model_name TEXT    NOT NULL,
  stage      TEXT    NOT NULL,
  status     TEXT    NOT NULL,
  created_at TEXT    NOT NULL DEFAULT (datetime('now')),
  updated_at TEXT    NOT NULL DEFAULT (datetime('now')),
-  UNIQUE (source_id, model_name, stage)
+  UNIQUE (entry_id, model_name, stage)
 );

 -- ── Round 1 output ────────────────────────────────────────────────────────────
--- One row per translation/language per model. Written atomically per record.
+-- Written atomically per entry per model.
 -- Unique constraints enforce one model one vote.

-CREATE TABLE IF NOT EXISTS model_cefr_votes (
+CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  model_name TEXT    NOT NULL,
+  cefr_level TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  model_name     TEXT    NOT NULL,
@@ -64,38 +60,29 @@ CREATE TABLE IF NOT EXISTS model_cefr_votes (
  UNIQUE (translation_id, model_name)
 );

-CREATE TABLE IF NOT EXISTS model_translation_rejections (
-  id             INTEGER PRIMARY KEY,
-  translation_id INTEGER NOT NULL REFERENCES translations(id),
-  model_name     TEXT    NOT NULL,
-  UNIQUE (translation_id, model_name)
-);
-
 CREATE TABLE IF NOT EXISTS generated_glosses (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name)
 );

 CREATE TABLE IF NOT EXISTS generated_examples (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name)
 );

-CREATE TABLE IF NOT EXISTS generated_descriptions (
-  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
-  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
-  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+CREATE TABLE IF NOT EXISTS generated_translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  model_name  TEXT    NOT NULL,
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name, target_lang)
 );

 -- ── Round 2 output ────────────────────────────────────────────────────────────
@@ -116,20 +103,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
  UNIQUE (example_id, model_name)
 );

-CREATE TABLE IF NOT EXISTS description_candidate_votes (
+CREATE TABLE IF NOT EXISTS translation_candidate_votes (
  id             INTEGER PRIMARY KEY,
-  description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
+  translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
  model_name     TEXT    NOT NULL,
-  UNIQUE (description_id, model_name)
+  UNIQUE (translation_id, model_name)
 );

 -- ── Resolved output ───────────────────────────────────────────────────────────
 -- Written by merge. Never updated after writing.
--- Only fully resolved records are written here — no nulls, no flags.
+-- Only fully resolved records are written here — no nulls.
 -- Absence of a row means unresolved. Flagged status tracked in run_status.
--- source: omw | cefr | model_name
+-- source: kaikki | model_name

-CREATE TABLE IF NOT EXISTS resolved_translations (
+CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  cefr_level TEXT    NOT NULL,
+  difficulty TEXT    NOT NULL,
+  UNIQUE (entry_id)
+);
+
+CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  cefr_level     TEXT    NOT NULL,
@@ -138,27 +133,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
 );

 CREATE TABLE IF NOT EXISTS resolved_glosses (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL,
-  UNIQUE (source_id, language)
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  text       TEXT    NOT NULL,
+  source     TEXT    NOT NULL,
+  UNIQUE (entry_id)
 );

 CREATE TABLE IF NOT EXISTS resolved_examples (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  text       TEXT    NOT NULL,
+  source     TEXT    NOT NULL
 );

-CREATE TABLE IF NOT EXISTS resolved_descriptions (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL,
-  UNIQUE (source_id, language)
+CREATE TABLE IF NOT EXISTS resolved_generated_translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  source      TEXT    NOT NULL,
+  UNIQUE (entry_id, target_lang)
 );