WIP: checkpoint before stage-3 sub-stage rewrite

feat: enrich script working, redesigning to sub-stage architecture
- Enrich script functional with timeout, progress tracking, rejection mechanism - Identified ordering issue: CEFR voting needs validated translations first - Redesign: round1_gloss → round1_example → round1_translations → round1_cefr - Update data-pipeline.md with new sub-stage design and roadmap - Qwen3.5-4B confirmed working with thinking disabled
2026-05-12 22:13:14 +02:00 · 2026-05-07 13:09:43 +02:00 · 2026-05-05 19:30:18 +02:00 · 2026-05-05 19:28:38 +02:00 · 2026-05-05 19:10:19 +02:00 · 2026-05-05 19:04:28 +02:00
45 changed files with 4465 additions and 1054955 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,6 +12,10 @@ __pycache__/

 data-pipeline/archive/
 data-pipeline/stage-1-extract/output/
+data-pipeline/stage-1-extract/sources/
 data-pipeline/stage-2-annotate/output/
 data-pipeline/stage-3-enrich/output/
 data-pipeline/stage-4-merge/output/
+data-pipeline/db/pipeline.db
+data-pipeline/reports/
+data-pipeline/.env
--- a/apps/api/src/controllers/gameController.test.ts
+++ b/apps/api/src/controllers/gameController.test.ts
@ -64,9 +64,14 @@ const validBody = {
 };

 const fakeTerms = [
-  { termId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
-  { termId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
-  { termId: "t3", sourceText: "house", targetText: "casa", sourceGloss: null },
+  { entryId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
+  { entryId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
+  {
+    entryId: "t3",
+    sourceText: "house",
+    targetText: "casa",
+    sourceGloss: "a building for living in",
+  },
 ];

 beforeEach(() => {
--- a/apps/api/src/services/gameService.test.ts
+++ b/apps/api/src/services/gameService.test.ts
@ -19,10 +19,10 @@ const validRequest: GameRequest = {
 };

 const fakeTerms = [
-  { termId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
-  { termId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
+  { entryId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
+  { entryId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
  {
-    termId: "t3",
+    entryId: "t3",
    sourceText: "house",
    targetText: "casa",
    sourceGloss: "a building for living in",
--- a/apps/api/src/services/gameService.ts
+++ b/apps/api/src/services/gameService.ts
@ -38,8 +38,9 @@ export const createGameSession = async (
  const questions: GameQuestion[] = await Promise.all(
    terms.map(async (term) => {
      const distractorTexts = await getDistractors(
-        term.termId,
+        term.entryId,
        term.targetText,
+        request.source_language,
        request.target_language,
        request.pos,
        request.difficulty,
--- a/apps/api/src/services/multiplayerGameService.test.ts
+++ b/apps/api/src/services/multiplayerGameService.test.ts
@ -9,10 +9,10 @@ const mockGetGameTerms = vi.mocked(getGameTerms);
 const mockGetDistractors = vi.mocked(getDistractors);

 const fakeTerms = [
-  { termId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
-  { termId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
+  { entryId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
+  { entryId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
  {
-    termId: "t3",
+    entryId: "t3",
    sourceText: "house",
    targetText: "casa",
    sourceGloss: "a building for living in",
--- a/apps/api/src/services/multiplayerGameService.ts
+++ b/apps/api/src/services/multiplayerGameService.ts
@ -44,8 +44,9 @@ export const generateMultiplayerQuestions = async (): Promise<
  const questions: MultiplayerQuestion[] = await Promise.all(
    correctAnswers.map(async (correctAnswer) => {
      const distractorTexts = await getDistractors(
-        correctAnswer.termId,
+        correctAnswer.entryId,
        correctAnswer.targetText,
+        MULTIPLAYER_DEFAULTS.sourceLanguage,
        MULTIPLAYER_DEFAULTS.targetLanguage,
        MULTIPLAYER_DEFAULTS.pos,
        MULTIPLAYER_DEFAULTS.difficulty,
--- a/data-pipeline/.env.example
+++ b/data-pipeline/.env.example
@ -0,0 +1,7 @@
+# OpenRouter API key — required for OpenRouter providers
+# Get one at https://openrouter.ai/keys
+OPENROUTER_API_KEY=
+
+# Anthropic API key — required for Anthropic provider (reference baseline only)
+# Get one at https://console.anthropic.com/
+ANTHROPIC_API_KEY=
--- a/data-pipeline/audit.ts
+++ b/data-pipeline/audit.ts
@ -0,0 +1,87 @@
+import Database from "better-sqlite3";
+import path from "node:path";
+import fs from "node:fs";
+import { fileURLToPath } from "node:url";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const DB_PATH = path.join(__dirname, "db/pipeline.db");
+
+const db = new Database(DB_PATH, { readonly: true });
+
+// Sample 50 random synsets that have at least one German translation.
+// The sample is NOT balanced per part of speech: ORDER BY RANDOM() draws
+// uniformly from every qualifying synset.
+const synsets = db
+  .prepare(
+    `
+    SELECT DISTINCT s.source_id, s.pos
+    FROM synsets s
+    JOIN translations t ON t.source_id = s.source_id
+    WHERE t.language = 'de'
+    ORDER BY RANDOM()
+    LIMIT 50
+  `,
+  )
+  .all() as { source_id: string; pos: string }[];
+
+// Prepared once and reused for every synset; preparing inside the loop
+// recompiled the same SQL on each iteration.
+const glossStmt = db.prepare(
+  "SELECT language, text FROM glosses WHERE source_id = ?",
+);
+const wordStmt = db.prepare(
+  "SELECT word FROM translations WHERE source_id = ? AND language = ?",
+);
+
+/** Returns every translated word stored for one synset in one language. */
+function wordsFor(sourceId: string, language: string): string[] {
+  const rows = wordStmt.all(sourceId, language) as { word: string }[];
+  return rows.map((row) => row.word);
+}
+
+// One formatted audit entry per sampled synset, numbered from 1.
+const results: string[] = synsets.map((synset, i) => {
+  const glosses = glossStmt.all(synset.source_id) as {
+    language: string;
+    text: string;
+  }[];
+
+  // Fall back to an em dash when no gloss exists for that language.
+  const enGloss = glosses.find((g) => g.language === "en")?.text ?? "—";
+  const deGloss = glosses.find((g) => g.language === "de")?.text ?? "—";
+
+  const enWords = wordsFor(synset.source_id, "en");
+  const deWords = wordsFor(synset.source_id, "de");
+
+  return [
+    `${String(i + 1).padStart(2, " ")}. [${synset.pos}] ${synset.source_id}`,
+    `    EN gloss: ${enGloss}`,
+    `    DE gloss: ${deGloss}`,
+    `    EN words: ${enWords.join(", ")}`,
+    `    DE words: ${deWords.join(", ")}`,
+    `    QUALITY:  ___`,
+    ``,
+  ].join("\n");
+});
+
+// Markdown document: reviewer instructions followed by the sampled entries.
+const output = [
+  "# OMW German Translation Quality Audit",
+  "",
+  "Instructions: for each entry, check if the German translations",
+  "match the meaning described by the English gloss.",
+  "",
+  "Mark QUALITY as:",
+  "  OK    — all German translations fit the meaning",
+  "  PARTIAL — some fit, some don't",
+  "  BAD   — none of the German translations fit",
+  "  USELESS — translations are correct but useless for learners",
+  "",
+  "---",
+  "",
+  ...results,
+].join("\n");
+
+const outPath = path.join(__dirname, "audit.md");
+fs.writeFileSync(outPath, output, "utf-8");
+console.log(`Wrote ${synsets.length} entries → ${outPath}`);
+
+db.close();
--- a/data-pipeline/db/import.ts
+++ b/data-pipeline/db/import.ts
@ -0,0 +1,154 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import { openDb } from "./index.js";
+import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");
+
+// ── Import ────────────────────────────────────────────────────────────────────
+
+/**
+ * Imports the stage-1 extraction output into pipeline.db.
+ *
+ * For every code in SUPPORTED_LANGUAGE_CODES the file `<lang>.json` in
+ * OUTPUT_DIR is read and each sense is upserted into `entries`; its
+ * translations are inserted into `translations` (duplicates are ignored).
+ * Each language is written inside a single transaction. A language whose
+ * file is missing or unreadable is skipped with a warning.
+ *
+ * @returns Resolves once all languages are processed and totals are logged.
+ */
+export async function importKaikki(): Promise<void> {
+  const db = openDb();
+
+  let totalEntries = 0;
+  let totalTranslations = 0;
+  let totalSkipped = 0;
+
+  // try/finally guarantees the handle is released even when a statement
+  // or transaction throws part-way through the import.
+  try {
+    const insertEntry = db.prepare(`
+    INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
+    VALUES (?, ?, ?, ?, ?, ?)
+    ON CONFLICT (headword, language, pos, sense_index)
+    DO UPDATE SET
+      gloss    = excluded.gloss,
+      examples = excluded.examples
+    RETURNING id
+  `);
+
+    const insertTranslation = db.prepare(`
+    INSERT INTO translations (entry_id, target_lang, word, sense_hint)
+    VALUES (?, ?, ?, ?)
+    ON CONFLICT (entry_id, target_lang, word) DO NOTHING
+  `);
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
+
+      let senses: ExtractedSense[];
+      try {
+        const raw = await fs.readFile(filePath, "utf-8");
+        senses = JSON.parse(raw) as ExtractedSense[];
+      } catch (err) {
+        // Only a missing file means "no output"; report anything else
+        // (malformed JSON, permissions, ...) with its real cause.
+        const isMissing =
+          err instanceof Error &&
+          (err as NodeJS.ErrnoException).code === "ENOENT";
+        if (isMissing) {
+          console.warn(`  Warning: no output file found for ${lang}, skipping`);
+        } else {
+          const reason = err instanceof Error ? err.message : String(err);
+          console.warn(
+            `  Warning: could not load ${filePath} (${reason}), skipping`,
+          );
+        }
+        continue;
+      }
+
+      console.log(
+        `  Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
+      );
+
+      // Track next available sense_index per (headword, pos) to handle
+      // the same word appearing in multiple JSONL entries with the same POS.
+      const senseIndexMap = new Map<string, number>();
+
+      const importLang = db.transaction(() => {
+        let entries = 0;
+        let translations = 0;
+        let skipped = 0;
+
+        for (const sense of senses) {
+          const key = `${sense.headword}|${sense.pos}`;
+          const nextIndex = senseIndexMap.get(key) ?? 0;
+          senseIndexMap.set(key, nextIndex + 1);
+
+          const row = insertEntry.get(
+            sense.headword,
+            sense.language,
+            sense.pos,
+            nextIndex,
+            sense.gloss ?? null,
+            JSON.stringify(sense.examples),
+          ) as { id: number } | undefined;
+
+          // Defensive: count the sense as skipped if the upsert returned no id.
+          if (!row) {
+            skipped++;
+            continue;
+          }
+
+          entries++;
+
+          for (const t of sense.translations) {
+            insertTranslation.run(
+              row.id,
+              t.target_lang,
+              t.word,
+              t.sense_hint ?? null,
+            );
+            // Counts attempted inserts; duplicates hit DO NOTHING silently.
+            translations++;
+          }
+        }
+
+        return { entries, translations, skipped };
+      });
+
+      const counts = importLang();
+      totalEntries += counts.entries;
+      totalTranslations += counts.translations;
+      totalSkipped += counts.skipped;
+
+      console.log(
+        `    entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
+      );
+    }
+  } finally {
+    db.close();
+  }
+
+  console.log(`\nImport complete:`);
+  console.log(`  Total entries:      ${totalEntries.toLocaleString()}`);
+  console.log(`  Total translations: ${totalTranslations.toLocaleString()}`);
+  console.log(`  Total skipped:      ${totalSkipped.toLocaleString()}`);
+}
+
+// ── Check if already imported ─────────────────────────────────────────────────
+
+/**
+ * Reports whether the `entries` table already holds at least one row.
+ * Opens its own connection and closes it before returning.
+ */
+export function isImported(): boolean {
+  const db = openDb();
+  const countStmt = db.prepare("SELECT COUNT(*) as count FROM entries");
+  const { count } = countStmt.get() as { count: number };
+  db.close();
+  return count > 0;
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * CLI entry point: imports the Kaikki data unless `entries` already holds
+ * rows, in which case the row count is reported and the process exits 0.
+ */
+async function main(): Promise<void> {
+  const db = openDb();
+  const { count } = db
+    .prepare("SELECT COUNT(*) as count FROM entries")
+    .get() as { count: number };
+  db.close();
+
+  const alreadyImported = count > 0;
+  if (!alreadyImported) {
+    console.log("Importing Kaikki data into pipeline.db...");
+    await importKaikki();
+    return;
+  }
+
+  console.log(
+    `pipeline.db already contains ${count.toLocaleString()} entries — skipping import.`,
+  );
+  console.log("Delete pipeline.db and re-run db:init to start fresh.");
+  process.exit(0);
+}
+
+// Run main() only when this module is the script being executed,
+// not when it is imported by another module.
+const invokedDirectly = import.meta.url === `file://${process.argv[1]}`;
+if (invokedDirectly) {
+  main().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}
--- a/data-pipeline/db/index.ts
+++ b/data-pipeline/db/index.ts
@ -0,0 +1,24 @@
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import Database from "better-sqlite3";
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const DB_PATH = path.join(__dirname, "pipeline.db");
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+export type Db = InstanceType<typeof Database>;
+
+// ── Open ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Opens pipeline.db and applies the connection pragmas used by the
+ * pipeline: WAL journal mode and foreign-key enforcement.
+ *
+ * @returns The open connection; the caller is responsible for closing it.
+ */
+export function openDb(): Db {
+  const connection = new Database(DB_PATH);
+
+  for (const pragma of ["journal_mode = WAL", "foreign_keys = ON"]) {
+    connection.pragma(pragma);
+  }
+
+  return connection;
+}
--- a/data-pipeline/db/init.ts
+++ b/data-pipeline/db/init.ts
@ -0,0 +1,42 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import Database from "better-sqlite3";
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const PATHS = {
+  schema: path.join(__dirname, "schema.sql"),
+  db: path.join(__dirname, "pipeline.db"),
+};
+
+// ── Init ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Creates (or opens) pipeline.db and executes schema.sql against it.
+ * Every statement in the schema uses CREATE TABLE IF NOT EXISTS, so running
+ * this against an existing database leaves existing tables in place.
+ *
+ * @throws If schema.sql cannot be read or the schema fails to execute.
+ */
+export async function initDb(): Promise<void> {
+  const schema = await fs.readFile(PATHS.schema, "utf-8");
+  const db = new Database(PATHS.db);
+
+  // Close the handle even when a pragma or the schema itself throws.
+  try {
+    db.pragma("journal_mode = WAL");
+    db.pragma("foreign_keys = ON");
+    db.exec(schema);
+  } finally {
+    db.close();
+  }
+
+  console.log(`  pipeline.db initialised → ${PATHS.db}`);
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/** CLI entry point: announces the step, then initialises pipeline.db. */
+async function main(): Promise<void> {
+  console.log("Initialising pipeline.db...");
+  await initDb();
+}
+
+// Run main() only when this module is the script being executed,
+// not when it is imported by another module.
+const invokedDirectly = import.meta.url === `file://${process.argv[1]}`;
+if (invokedDirectly) {
+  main().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}
--- a/data-pipeline/db/pipeline.db-shm
+++ b/data-pipeline/db/pipeline.db-shm
--- a/data-pipeline/db/pipeline.db-wal
+++ b/data-pipeline/db/pipeline.db-wal
--- a/data-pipeline/db/reset.ts
+++ b/data-pipeline/db/reset.ts
@ -0,0 +1,41 @@
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import Database from "better-sqlite3";
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const DB_PATH = path.join(__dirname, "pipeline.db");
+
+// ── Main ──────────────────────────────────────────────────────────────────────
+
+/**
+ * CLI entry point for resetting run_status.
+ *
+ * Reads the mode from the first CLI argument:
+ *   round1 — deletes rows whose stage starts with "round1"
+ *   all    — deletes every row except the reverse_link stage
+ * Any other value prints usage and exits with code 1.
+ */
+function main(): void {
+  const mode = process.argv[2];
+  const isKnownMode = mode === "round1" || mode === "all";
+
+  if (!isKnownMode) {
+    console.error("Usage: pnpm db:reset round1 | all");
+    console.error("  round1 — delete all round1 sub-stage rows");
+    console.error("  all    — delete all run_status rows except reverse_link");
+    process.exit(1);
+  }
+
+  const db = new Database(DB_PATH);
+
+  const sql =
+    mode === "round1"
+      ? "DELETE FROM run_status WHERE stage LIKE 'round1%'"
+      : "DELETE FROM run_status WHERE stage NOT IN ('reverse_link')";
+
+  const { changes } = db.prepare(sql).run() as { changes: number };
+
+  console.log(
+    mode === "round1"
+      ? `Deleted ${changes} round1 rows from run_status`
+      : `Deleted ${changes} rows from run_status`,
+  );
+
+  db.close();
+}
+
+main();
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@ -0,0 +1,164 @@
+-- ── Base data ─────────────────────────────────────────────────────────────────
+-- Imported from Kaikki on first run. Never mutated after import.
+
+CREATE TABLE IF NOT EXISTS entries (
+  id          INTEGER PRIMARY KEY,
+  headword    TEXT    NOT NULL,
+  language    TEXT    NOT NULL,
+  pos         TEXT    NOT NULL,
+  sense_index INTEGER NOT NULL DEFAULT 0,
+  gloss       TEXT,
+  examples    TEXT    NOT NULL DEFAULT '[]', -- JSON array of strings
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
+  UNIQUE (headword, language, pos, sense_index)
+);
+
+CREATE TABLE IF NOT EXISTS translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  sense_hint  TEXT,
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
+  UNIQUE (entry_id, target_lang, word)
+);
+
+-- ── Status tracking ───────────────────────────────────────────────────────────
+-- One row per entry per model per stage. Drives resumability.
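+-- Sentinel rows use entry_id = 0 and model_name = 'system' for one-time pipeline steps.
+-- stage:  round1_* sub-stages (round1_gloss, round1_example, round1_translations, round1_cefr) | round2 | tiebreak | merge
+--         sentinel stages: reverse_link | compile_candidates | compile_votes | merge | compare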
+-- status: pending | complete | needs_review | flagged
+
+CREATE TABLE IF NOT EXISTS run_status (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL,
+  model_name TEXT    NOT NULL,
+  stage      TEXT    NOT NULL,
+  status     TEXT    NOT NULL,
+  created_at TEXT    NOT NULL DEFAULT (datetime('now')),
+  updated_at TEXT    NOT NULL DEFAULT (datetime('now')),
+  UNIQUE (entry_id, model_name, stage)
+);
+
+-- ── Round 1 output ────────────────────────────────────────────────────────────
+-- Written atomically per entry per model.
+-- Unique constraints enforce one model one vote.
+
+CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  model_name TEXT    NOT NULL,
+  cefr_level TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
+  id             INTEGER PRIMARY KEY,
+  translation_id INTEGER NOT NULL REFERENCES translations(id),
+  model_name     TEXT    NOT NULL,
+  cefr_level     TEXT    NOT NULL,
+  UNIQUE (translation_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS model_translation_rejections (
+  id             INTEGER PRIMARY KEY,
+  translation_id INTEGER NOT NULL REFERENCES translations(id),
+  model_name     TEXT    NOT NULL,
+  UNIQUE (translation_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS generated_glosses (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  model_name TEXT    NOT NULL,
+  text       TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS generated_examples (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  model_name TEXT    NOT NULL,
+  text       TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS generated_translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  model_name  TEXT    NOT NULL,
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name, target_lang)
+);
+
+-- ── Round 2 output ────────────────────────────────────────────────────────────
+-- Each row represents one model voting for one candidate.
+-- The candidate with the most votes wins in merge.
+
+CREATE TABLE IF NOT EXISTS gloss_candidate_votes (
+  id         INTEGER PRIMARY KEY,
+  gloss_id   INTEGER NOT NULL REFERENCES generated_glosses(id),
+  model_name TEXT    NOT NULL,
+  UNIQUE (gloss_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS example_candidate_votes (
+  id         INTEGER PRIMARY KEY,
+  example_id INTEGER NOT NULL REFERENCES generated_examples(id),
+  model_name TEXT    NOT NULL,
+  UNIQUE (example_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS translation_candidate_votes (
+  id             INTEGER PRIMARY KEY,
+  translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
+  model_name     TEXT    NOT NULL,
+  UNIQUE (translation_id, model_name)
+);
+
+-- ── Resolved output ───────────────────────────────────────────────────────────
+-- Written by merge. Never updated after writing.
+-- Only fully resolved records are written here — no nulls.
+-- Absence of a row means unresolved. Flagged status tracked in run_status.
+-- source: kaikki | model_name
+
+CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  cefr_level TEXT    NOT NULL,
+  difficulty TEXT    NOT NULL,
+  UNIQUE (entry_id)
+);
+
+CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
+  id             INTEGER PRIMARY KEY,
+  translation_id INTEGER NOT NULL REFERENCES translations(id),
+  cefr_level     TEXT    NOT NULL,
+  difficulty     TEXT    NOT NULL,
+  UNIQUE (translation_id)
+);
+
+CREATE TABLE IF NOT EXISTS resolved_glosses (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  text       TEXT    NOT NULL,
+  source     TEXT    NOT NULL,
+  UNIQUE (entry_id)
+);
+
+CREATE TABLE IF NOT EXISTS resolved_examples (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id), -- NOTE(review): no UNIQUE (entry_id) here, unlike resolved_glosses; presumably several examples per entry are allowed; confirm
+  text       TEXT    NOT NULL,
+  source     TEXT    NOT NULL -- kaikki | model_name, per the section header
+);
+
+CREATE TABLE IF NOT EXISTS resolved_generated_translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  source      TEXT    NOT NULL,
+  UNIQUE (entry_id, target_lang)
+);
--- a/data-pipeline/package.json
+++ b/data-pipeline/package.json
@ -3,7 +3,16 @@
  "version": "1.0.0",
  "private": true,
  "type": "module",
-  "scripts": {},
+  "scripts": {
+    "db:reset": "tsx db/reset.ts",
+    "extract": "tsx stage-1-extract/scripts/extract.ts",
+    "reverse-link": "tsx stage-2-reverse-link/scripts/reverse-link.ts",
+    "db:import": "tsx db/import.ts",
+    "db:init": "tsx db/init.ts",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "pipeline:run": "tsx --env-file .env pipeline.ts"
+  },
  "dependencies": {
    "@lila/shared": "workspace:*",
    "better-sqlite3": "^12.9.0"
@ -12,6 +21,7 @@
    "@types/better-sqlite3": "^7.6.13",
    "@types/node": "^24.12.0",
    "tsx": "^4.21.0",
-    "typescript": "^5.9.3"
+    "typescript": "^5.9.3",
+    "vitest": "^4.1.0"
  }
 }
--- a/data-pipeline/pipeline.ts
+++ b/data-pipeline/pipeline.ts
@ -0,0 +1,616 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { initDb } from "./db/init.js";
+import { isImported, importKaikki } from "./db/import.js";
+import { openDb } from "./db/index.js";
+import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js";
+import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js";
+import type { ProviderConfig } from "./stage-3-enrich/config.js";
+import { enrich } from "./stage-3-enrich/scripts/enrich.js";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+type RunStage =
+  | "round1"
+  | "compile_candidates"
+  | "round2"
+  | "compile_votes"
+  | "merge"
+  | "tiebreak"
+  | "compare";
+
+type StageStatus = "complete" | "pending" | "in_progress";
+
+type RunStats = {
+  startedAt: Date;
+  stoppedAt: Date | null;
+  recordsProcessed: number;
+  recordsSkipped: number;
+  needsReview: number;
+  modelsRun: string[];
+  currentStage: RunStage | null;
+};
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const PATHS = {
+  extractedEn: path.join(__dirname, "stage-1-extract/output/en.json"),
+  db: path.join(__dirname, "db/pipeline.db"),
+  reports: path.join(__dirname, "reports"),
+  llamaHealth: "http://127.0.0.1:8080/health",
+};
+
+const SENTINEL = { entryId: 0, modelName: "system" };
+
+// ── Startup checks ────────────────────────────────────────────────────────────
+
+/**
+ * Startup guard: confirms the stage-1 English extraction output is
+ * accessible. When it is not, prints how to produce it and terminates the
+ * process with exit code 1.
+ */
+async function checkExtractedFilesExist(): Promise<void> {
+  let accessible = true;
+  try {
+    await fs.access(PATHS.extractedEn);
+  } catch {
+    accessible = false;
+  }
+
+  if (accessible) return;
+
+  console.error("\n  ERROR: stage-1-extract/output/en.json not found.");
+  console.error("  Run the stage 1 extraction script first:");
+  console.error("    pnpm extract\n");
+  process.exit(1);
+}
+
+/**
+ * Startup guard: initialises pipeline.db when the database file cannot be
+ * accessed; does nothing when it is already there.
+ */
+async function checkAndInitDb(): Promise<void> {
+  let dbFileAccessible = true;
+  try {
+    await fs.access(PATHS.db);
+  } catch {
+    dbFileAccessible = false;
+  }
+
+  if (dbFileAccessible) return;
+
+  console.log("  pipeline.db not found — initialising...");
+  await initDb();
+}
+
+/**
+ * Startup guard: runs the Kaikki import when isImported() reports that the
+ * base tables are still empty.
+ */
+async function checkAndImportDb(): Promise<void> {
+  if (isImported()) return;
+
+  console.log("  Base tables empty — importing Kaikki data...");
+  await importKaikki();
+}
+
+/**
+ * Probes the local llama.cpp health endpoint.
+ *
+ * @returns true when the request succeeds with an OK status; false when the
+ *          status is not OK or the request itself fails.
+ */
+async function checkLlamaServer(): Promise<boolean> {
+  try {
+    const { ok } = await fetch(PATHS.llamaHealth);
+    return ok;
+  } catch {
+    // Connection failures are reported as "not running" rather than thrown.
+    return false;
+  }
+}
+
+/** A provider counts as local when its apiKey is the literal "none". */
+function isLocalProvider(provider: ProviderConfig): boolean {
+  const { apiKey } = provider;
+  return apiKey === "none";
+}
+
+/**
+ * Verifies that a provider can be used before the pipeline starts.
+ *
+ * Remote providers are delegated to validateProviderKey. Local providers
+ * need the llama.cpp server to answer its health check; otherwise start-up
+ * instructions are printed and the process exits with code 1.
+ */
+async function checkProviderReady(provider: ProviderConfig): Promise<void> {
+  if (!isLocalProvider(provider)) {
+    validateProviderKey(provider);
+    return;
+  }
+
+  const serverUp = await checkLlamaServer();
+  if (serverUp) return;
+
+  console.error("\n  ERROR: llama.cpp server is not running.");
+  console.error("  Start the server before running the pipeline:");
+  console.error(
+    "    ./build/bin/llama-server --model models/<model>.gguf \\",
+  );
+  console.error("      --port 8080 --host 127.0.0.1");
+  console.error("  See llm-setup.md for full instructions.\n");
+  process.exit(1);
+}
+
+// ── Run name generation ───────────────────────────────────────────────────────
+
+/**
+ * Builds the name for the current run: `<YYYY-MM-DD>_run-<n>`.
+ *
+ * The date is the UTC date taken from the ISO timestamp. `n` is one more
+ * than the number of `.json` reports in the reports directory whose file
+ * name starts with that date. The reports directory is created if missing.
+ *
+ * @returns The generated run name.
+ */
+async function generateRunName(): Promise<string> {
+  await fs.mkdir(PATHS.reports, { recursive: true });
+
+  // First 10 characters of an ISO-8601 timestamp are the YYYY-MM-DD date.
+  const date = new Date().toISOString().slice(0, 10);
+  const files = await fs.readdir(PATHS.reports);
+  const todaysRuns = files.filter(
+    (f) => f.startsWith(date) && f.endsWith(".json"),
+  ).length;
+
+  return `${date}_run-${todaysRuns + 1}`;
+}
+
+// ── Shutdown handler ──────────────────────────────────────────────────────────
+
+// Set once the first SIGINT/SIGTERM arrives; later signals are ignored.
+let shutdownRequested = false;
+
+/**
+ * Installs SIGINT and SIGTERM listeners that record a shutdown request.
+ * The first signal sets the flag, stamps `stats.stoppedAt` and logs a
+ * notice; the listener does not terminate the process itself.
+ */
+function registerShutdownHandler(stats: RunStats): void {
+  const onSignal = (): void => {
+    if (!shutdownRequested) {
+      shutdownRequested = true;
+      stats.stoppedAt = new Date();
+      console.log("\n\n  Shutdown requested — finishing current record...");
+    }
+  };
+
+  for (const signal of ["SIGINT", "SIGTERM"] as const) {
+    process.on(signal, onSignal);
+  }
+}
+
+// ── Stage status helpers ──────────────────────────────────────────────────────
+
+/**
+ * Looks up the sentinel run_status row for a one-time stage.
+ *
+ * @returns "complete" when the row exists with status 'complete';
+ *          "pending" for any other status or when no row exists.
+ */
+function getSentinelStatus(stage: RunStage): StageStatus {
+  const db = openDb();
+  const lookup = db.prepare(
+    `SELECT status FROM run_status
+       WHERE entry_id = ? AND model_name = ? AND stage = ?`,
+  );
+  const found = lookup.get(SENTINEL.entryId, SENTINEL.modelName, stage) as
+    | { status: string }
+    | undefined;
+  db.close();
+
+  if (found !== undefined && found.status === "complete") {
+    return "complete";
+  }
+  return "pending";
+}
+
+/**
+ * Upserts the sentinel run_status row for a one-time stage with status
+ * 'complete', refreshing updated_at when the row already exists.
+ */
+function markSentinelComplete(stage: RunStage): void {
+  const db = openDb();
+  const upsert = db.prepare(
+    `INSERT INTO run_status (entry_id, model_name, stage, status)
+     VALUES (?, ?, ?, 'complete')
+     ON CONFLICT (entry_id, model_name, stage)
+     DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
+  );
+  const { entryId, modelName } = SENTINEL;
+  upsert.run(entryId, modelName, stage);
+  db.close();
+}
+
+/**
+ * Shared implementation for the per-model stage status helpers.
+ *
+ * Compares the number of run_status rows marked 'complete' for the given
+ * model and stage with the number of English entries.
+ *
+ * @returns "pending" when no row is complete, "complete" when the complete
+ *          count reaches the English entry count, otherwise "in_progress".
+ */
+function getModelStageStatus(modelName: string, stage: string): StageStatus {
+  const db = openDb();
+
+  // Close the connection even when one of the count queries throws.
+  try {
+    const total = (
+      db
+        .prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'")
+        .get() as { count: number }
+    ).count;
+
+    const complete = (
+      db
+        .prepare(
+          `SELECT COUNT(*) as count FROM run_status
+           WHERE model_name = ? AND stage = ? AND status = 'complete'`,
+        )
+        .get(modelName, stage) as { count: number }
+    ).count;
+
+    if (complete === 0) return "pending";
+    if (complete >= total) return "complete";
+    return "in_progress";
+  } finally {
+    db.close();
+  }
+}
+
+/**
+ * Round 1 progress of one model.
+ *
+ * NOTE(review): only the 'round1_gloss' sub-stage is counted; confirm whether
+ * the other round1 sub-stages should also be considered.
+ */
+function getModelRound1Status(modelName: string): StageStatus {
+  return getModelStageStatus(modelName, "round1_gloss");
+}
+
+/** Round 2 progress of one model, based on the 'round2' stage rows. */
+function getModelRound2Status(modelName: string): StageStatus {
+  return getModelStageStatus(modelName, "round2");
+}
+
+/**
+ * Reports whether the sentinel run_status row for the 'reverse_link' stage
+ * exists with status 'complete'.
+ */
+function isReverseLinkDone(): boolean {
+  const db = openDb();
+  const lookup = db.prepare(
+    `SELECT status FROM run_status
+       WHERE entry_id = ? AND model_name = ? AND stage = 'reverse_link'`,
+  );
+  const found = lookup.get(SENTINEL.entryId, SENTINEL.modelName) as
+    | { status: string }
+    | undefined;
+  db.close();
+
+  if (found === undefined) return false;
+  return found.status === "complete";
+}
+
+/**
+ * Upserts the sentinel run_status row for the 'reverse_link' stage with
+ * status 'complete', refreshing updated_at when the row already exists.
+ */
+function markReverseLinkComplete(): void {
+  const db = openDb();
+  const upsert = db.prepare(
+    `INSERT INTO run_status (entry_id, model_name, stage, status)
+     VALUES (?, ?, 'reverse_link', 'complete')
+     ON CONFLICT (entry_id, model_name, stage)
+     DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
+  );
+  const { entryId, modelName } = SENTINEL;
+  upsert.run(entryId, modelName);
+  db.close();
+}
+
+// ── Stage runners ─────────────────────────────────────────────────────────────
+
+/**
+ * Runs the reverse-link step once: when its sentinel is not yet complete,
+ * calls reverseLink() and records completion; otherwise only logs a skip.
+ */
+function runReverseLinkStage(): void {
+  if (!isReverseLinkDone()) {
+    console.log("\n  [reverse link] Syncing reverse translation links...");
+    reverseLink();
+    markReverseLinkComplete();
+    return;
+  }
+
+  console.log("\n  [reverse link] Already complete, skipping");
+}
+
+/**
+ * Runs round 1 enrichment for one provider and folds the counts returned
+ * by enrich() into the run statistics.
+ *
+ * @param provider Provider to run.
+ * @param stats    Run statistics, mutated in place.
+ */
+async function runRound1(
+  provider: ProviderConfig,
+  stats: RunStats,
+): Promise<void> {
+  console.log(`\n  [round 1] Running ${provider.name}...`);
+
+  const { processed, skipped, needsReview } = await enrich(provider);
+
+  stats.recordsProcessed += processed;
+  stats.recordsSkipped += skipped;
+  stats.needsReview += needsReview;
+  stats.modelsRun.push(provider.name);
+}
+
+/**
+ * Placeholder for the compile-candidates stage: logs that it is not yet
+ * implemented and records the stage sentinel as complete.
+ */
+function compileCandidates(): void {
+  const stage: RunStage = "compile_candidates";
+  console.log("\n  [compile candidates] Compiling round 1 output...");
+  // TODO: implement compile candidates script
+  console.log("  [compile candidates] not yet implemented");
+  markSentinelComplete(stage);
+}
+
+/**
+ * Placeholder for round 2: logs that it is not yet implemented and records
+ * the provider name in `stats.modelsRun`.
+ */
+function runRound2(provider: ProviderConfig, stats: RunStats): void {
+  const { name } = provider;
+  console.log(`\n  [round 2] Running ${name}...`);
+  // TODO: implement round 2 enrich script
+  console.log(`  [round 2] ${name} — not yet implemented`);
+  stats.modelsRun.push(name);
+}
+
+/**
+ * Placeholder for the compile-votes stage: logs that it is not yet
+ * implemented and records the stage sentinel as complete.
+ */
+function compileVotes(): void {
+  const stage: RunStage = "compile_votes";
+  console.log("\n  [compile votes] Compiling round 2 votes...");
+  // TODO: implement compile votes script
+  console.log("  [compile votes] not yet implemented");
+  markSentinelComplete(stage);
+}
+
+/**
+ * Placeholder for the merge stage: logs that it is not yet implemented and
+ * records the stage sentinel as complete.
+ */
+function runMerge(): void {
+  const stage: RunStage = "merge";
+  console.log("\n  [merge] Resolving votes...");
+  // TODO: implement merge script
+  console.log("  [merge] not yet implemented");
+  markSentinelComplete(stage);
+}
+
+/**
+ * Placeholder for the tiebreak stage: logs that it is not yet implemented
+ * and sets `stats.currentStage` to "tiebreak".
+ */
+function runTiebreak(stats: RunStats): void {
+  const stage: RunStage = "tiebreak";
+  console.log("\n  [tiebreak] Resolving flagged entries...");
+  // TODO: implement tiebreak logic
+  console.log("  [tiebreak] not yet implemented");
+  stats.currentStage = stage;
+}
+
+/**
+ * Placeholder for the compare stage: logs that it is not yet implemented
+ * and records the stage sentinel as complete.
+ */
+function runCompare(): void {
+  const stage: RunStage = "compare";
+  console.log("\n  [compare] Generating COVERAGE.md...");
+  // TODO: implement compare script
+  console.log("  [compare] not yet implemented");
+  markSentinelComplete(stage);
+}
+
+// ── Report generation ─────────────────────────────────────────────────────────
+
+/**
+ * Write the progress report for a run as `<runName>.json` and `<runName>.md`
+ * inside `PATHS.reports` (the directory is created when missing).
+ *
+ * Entry counts are read from the pipeline database; stage status comes from
+ * the sentinel and per-model status helpers. The report is marked final only
+ * when the "compare" sentinel is complete and no merge rows are flagged.
+ *
+ * @param runName - Base name (without extension) of the two report files.
+ * @param stats - Counters for the current session; when `stoppedAt` is not
+ *   set, the current time is used as the stop time.
+ */
+async function generateReport(runName: string, stats: RunStats): Promise<void> {
+  const db = openDb();
+
+  // Every metric below is a single-row `COUNT(*) as count` query.
+  const countRows = (sql: string): number =>
+    (db.prepare(sql).get() as { count: number }).count;
+
+  let totalEntries: number;
+  let resolvedEntries: number;
+  let flaggedEntries: number;
+  let needsReview: number;
+
+  try {
+    totalEntries = countRows("SELECT COUNT(*) as count FROM entries");
+    resolvedEntries = countRows(
+      "SELECT COUNT(*) as count FROM resolved_entry_cefr",
+    );
+    flaggedEntries = countRows(
+      `SELECT COUNT(*) as count FROM run_status
+         WHERE stage = 'merge' AND status = 'flagged'`,
+    );
+    needsReview = countRows(
+      `SELECT COUNT(*) as count FROM run_status
+         WHERE status = 'needs_review'`,
+    );
+  } finally {
+    // Release the handle even when one of the queries throws.
+    db.close();
+  }
+
+  const stoppedAt = stats.stoppedAt ?? new Date();
+  const durationMs = stoppedAt.getTime() - stats.startedAt.getTime();
+  const durationMin = Math.round(durationMs / 60_000);
+
+  const isFinal =
+    getSentinelStatus("compare") === "complete" && flaggedEntries === 0;
+
+  const report = {
+    runName,
+    generatedAt: stoppedAt.toISOString(),
+    durationMinutes: durationMin,
+    isFinal,
+    progress: {
+      totalEntries,
+      resolvedEntries,
+      flaggedEntries,
+      needsReview,
+      recordsProcessedThisRun: stats.recordsProcessed,
+      recordsSkippedThisRun: stats.recordsSkipped,
+    },
+    modelsRun: stats.modelsRun,
+    stages: {
+      reverseLink: isReverseLinkDone() ? "complete" : "pending",
+      round1: ALL_PROVIDERS.map((p) => ({
+        model: p.name,
+        status: getModelRound1Status(p.name),
+      })),
+      compileCandidates: getSentinelStatus("compile_candidates"),
+      round2: ALL_PROVIDERS.map((p) => ({
+        model: p.name,
+        status: getModelRound2Status(p.name),
+      })),
+      compileVotes: getSentinelStatus("compile_votes"),
+      merge: getSentinelStatus("merge"),
+      compare: getSentinelStatus("compare"),
+    },
+  };
+
+  await fs.mkdir(PATHS.reports, { recursive: true });
+
+  const jsonPath = path.join(PATHS.reports, `${runName}.json`);
+  const mdPath = path.join(PATHS.reports, `${runName}.md`);
+
+  await fs.writeFile(jsonPath, JSON.stringify(report, null, 2), "utf-8");
+
+  // Shared by the round 1 and round 2 model lists.
+  const statusIcon = (status: unknown): string =>
+    status === "complete" ? "✅" : status === "in_progress" ? "🔄" : "🔲";
+  const modelLines = (
+    rows: readonly { model: string; status: unknown }[],
+  ): string[] => rows.map((row) => `- ${statusIcon(row.status)} ${row.model}`);
+
+  const md = [
+    `# Pipeline run: ${runName}`,
+    ``,
+    `Generated: ${stoppedAt.toISOString()}`,
+    `Duration: ${durationMin} minutes`,
+    isFinal
+      ? `**Status: FINAL — pipeline complete**`
+      : `**Status: In progress**`,
+    ``,
+    `## Progress`,
+    ``,
+    `| Metric | Value |`,
+    `| ------ | ----- |`,
+    `| Total entries | ${totalEntries.toLocaleString()} |`,
+    `| Resolved entries | ${resolvedEntries.toLocaleString()} |`,
+    `| Flagged entries | ${flaggedEntries.toLocaleString()} |`,
+    `| Needs review | ${needsReview.toLocaleString()} |`,
+    `| Records processed this run | ${stats.recordsProcessed.toLocaleString()} |`,
+    `| Records skipped this run | ${stats.recordsSkipped.toLocaleString()} |`,
+    ``,
+    `## Stage status`,
+    ``,
+    `### Reverse link: ${report.stages.reverseLink}`,
+    ``,
+    `### Round 1`,
+    ``,
+    ...modelLines(report.stages.round1),
+    ``,
+    `### Compile candidates: ${report.stages.compileCandidates}`,
+    ``,
+    `### Round 2`,
+    ``,
+    ...modelLines(report.stages.round2),
+    ``,
+    `### Compile votes: ${report.stages.compileVotes}`,
+    `### Merge: ${report.stages.merge}`,
+    `### Compare: ${report.stages.compare}`,
+    ``,
+    `## Models run this session`,
+    ``,
+    stats.modelsRun.length > 0
+      ? stats.modelsRun.map((m) => `- ${m}`).join("\n")
+      : "_none_",
+  ].join("\n");
+
+  await fs.writeFile(mdPath, md, "utf-8");
+
+  console.log(`\n  Report written → ${jsonPath}`);
+  console.log(`  Report written → ${mdPath}`);
+}
+
+// ── Main ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Pipeline entry point: checks prerequisites, then runs the stages in order
+ * (reverse link, round 1, compile candidates, round 2, compile votes, merge,
+ * tiebreak, compare).
+ *
+ * Stages whose model status or sentinel is already "complete" are skipped.
+ * After each stage, a requested shutdown writes the run report and exits the
+ * process with code 0.
+ */
+async function main(): Promise<void> {
+  console.log("lila data pipeline\n");
+
+  // ── Startup checks
+  console.log("Checking prerequisites...");
+  await checkExtractedFilesExist();
+  await checkAndInitDb();
+  await checkAndImportDb();
+  console.log("  Prerequisites OK");
+
+  // ── Run name
+  const runName = await generateRunName();
+  console.log(`\n  Run: ${runName}`);
+
+  // ── Stats
+  const stats: RunStats = {
+    startedAt: new Date(),
+    stoppedAt: null,
+    recordsProcessed: 0,
+    recordsSkipped: 0,
+    needsReview: 0,
+    modelsRun: [],
+    currentStage: null,
+  };
+
+  registerShutdownHandler(stats);
+
+  // Shared exit path for a requested shutdown: write the report, then stop.
+  const reportAndExit = async (): Promise<void> => {
+    await generateReport(runName, stats);
+    process.exit(0);
+  };
+
+  // ── Stage 2 — Reverse link
+  runReverseLinkStage();
+  if (shutdownRequested) await reportAndExit();
+
+  // ── Round 1
+  console.log("\nRound 1 — generation");
+  for (const provider of ALL_PROVIDERS) {
+    if (shutdownRequested) break;
+
+    const round1Status = getModelRound1Status(provider.name);
+    if (round1Status === "complete") {
+      console.log(`  [round 1] ${provider.name} — already complete, skipping`);
+      continue;
+    }
+
+    await checkProviderReady(provider);
+    stats.currentStage = "round1";
+
+    if (round1Status === "in_progress") {
+      console.log(`  [round 1] ${provider.name} — resuming...`);
+    }
+
+    await runRound1(provider, stats);
+  }
+  if (shutdownRequested) await reportAndExit();
+
+  // ── Compile candidates
+  if (getSentinelStatus("compile_candidates") !== "complete") {
+    stats.currentStage = "compile_candidates";
+    compileCandidates();
+  } else {
+    console.log("\n  [compile candidates] Already complete, skipping");
+  }
+  if (shutdownRequested) await reportAndExit();
+
+  // ── Round 2
+  console.log("\nRound 2 — voting");
+  for (const provider of ALL_PROVIDERS) {
+    if (shutdownRequested) break;
+
+    const round2Status = getModelRound2Status(provider.name);
+    if (round2Status === "complete") {
+      console.log(`  [round 2] ${provider.name} — already complete, skipping`);
+      continue;
+    }
+
+    await checkProviderReady(provider);
+    stats.currentStage = "round2";
+
+    if (round2Status === "in_progress") {
+      console.log(`  [round 2] ${provider.name} — resuming...`);
+    }
+
+    runRound2(provider, stats);
+  }
+  if (shutdownRequested) await reportAndExit();
+
+  // ── Compile votes
+  if (getSentinelStatus("compile_votes") !== "complete") {
+    stats.currentStage = "compile_votes";
+    compileVotes();
+  } else {
+    console.log("\n  [compile votes] Already complete, skipping");
+  }
+  if (shutdownRequested) await reportAndExit();
+
+  // ── Merge
+  if (getSentinelStatus("merge") !== "complete") {
+    stats.currentStage = "merge";
+    runMerge();
+  } else {
+    console.log("\n  [merge] Already complete, skipping");
+  }
+  if (shutdownRequested) await reportAndExit();
+
+  // ── Tiebreak (only when the merge stage left flagged rows)
+  const db = openDb();
+  const { count: flaggedCount } = db
+    .prepare(
+      `SELECT COUNT(*) as count FROM run_status
+         WHERE stage = 'merge' AND status = 'flagged'`,
+    )
+    .get() as { count: number };
+  db.close();
+
+  if (flaggedCount > 0) {
+    stats.currentStage = "tiebreak";
+    runTiebreak(stats);
+  }
+  if (shutdownRequested) await reportAndExit();
+
+  // ── Compare
+  if (getSentinelStatus("compare") !== "complete") {
+    stats.currentStage = "compare";
+    runCompare();
+  } else {
+    console.log("\n  [compare] Already complete, skipping");
+  }
+
+  // ── Report (disabled until full pipeline is implemented)
+  // stats.stoppedAt = new Date();
+  // await generateReport(runName, stats);
+
+  console.log("\nPipeline complete.");
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
--- a/data-pipeline/sample/output/sample.json
+++ b/data-pipeline/sample/output/sample.json
--- a/data-pipeline/sample/scripts/sample.ts
+++ b/data-pipeline/sample/scripts/sample.ts
@ -154,7 +154,7 @@ async function loadAnnotated(): Promise<AnnotatedRecord[]> {
      for (const [l, examples] of Object.entries(record.examples)) {
        const lang = l as SupportedLanguageCode;
        if (!base.examples[lang]) {
-          base.examples[lang] = examples as Example[];
+          base.examples[lang] = examples;
        }
      }
    }
--- a/data-pipeline/stage-1-extract/scripts/extract.py
+++ b/data-pipeline/stage-1-extract/scripts/extract.py
@ -1,204 +0,0 @@
-"""
-data-pipeline/stage-1-extract/scripts/extract.py
-
-Extract all synsets from the Open Multilingual Wordnet (OMW) for all
-supported languages and parts of speech.
-
-Output: one JSON file per language, written to stage-1-extract/output/
-  en.json, it.json, es.json, de.json, fr.json
-
-Each file is a JSON array of synset records:
-  {
-    "source_id": "ili:i12345",
-    "pos": "noun",
-    "translations": { "en": ["dog", "canine"], "it": ["cane"] },
-    "glosses":      { "en": ["a domesticated animal..."] },
-    "examples":     { "en": ["the dog barked at the stranger"] }
-  }
-
-Usage:
-  python stage-1-extract/scripts/extract.py
-  python stage-1-extract/scripts/extract.py --sample
-
-Prerequisites:
-  pip install wn
-  python -m wn download omw-en:1.4
-  python -m wn download omw-it:1.4
-  python -m wn download omw-de:1.4
-  python -m wn download omw-es:1.4
-  python -m wn download omw-fr:1.4
-"""
-
-import json
-import sys
-from pathlib import Path
-
-import wn
-
-SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]
-POS_MAP: dict[str, str] = {
-    "n": "noun",
-    "v": "verb",
-    "a": "adjective",
-    "s": "adjective",  # adjective satellite — collapsed into adjective
-    "r": "adverb",
-}
-
-
-def extract_all(
-    output_dir: str = "stage-1-extract/output", sample: bool = False
-) -> None:
-    out = Path(output_dir)
-    out.mkdir(parents=True, exist_ok=True)
-
-    sample_size = 100 if sample else None
-
-    # Load one Wordnet object per language up front.
-    print("Loading wordnets...")
-    wordnets: dict[str, wn.Wordnet] = {}
-    for lang in SUPPORTED_LANGUAGE_CODES:
-        try:
-            wordnets[lang] = wn.Wordnet(lang=lang)
-            synset_count = len(wordnets[lang].synsets())
-            print(f"  {lang}: {synset_count:,} total synsets")
-        except wn.Error as e:
-            print(f"  ERROR loading {lang}: {e}")
-            print(f"  Run: python -m wn download omw-{lang}:1.4")
-            sys.exit(1)
-
-    # Collect per-ILI data across all languages and POS.
-    print("\nExtracting synsets...")
-    by_ili: dict[str, dict] = {}
-
-    for lang, wnet in wordnets.items():
-        for omw_pos, pos_label in POS_MAP.items():
-            synsets = wnet.synsets(pos=omw_pos)
-            covered = 0
-            for synset in synsets:
-                ili = synset.ili
-                if not ili:
-                    continue
-                covered += 1
-
-                lemmas = [str(lemma) for lemma in synset.lemmas()]
-                defns = [d for d in synset.definitions() if d]
-                examples = [e for e in synset.examples() if e]
-
-                if ili not in by_ili:
-                    by_ili[ili] = {"pos": pos_label}
-
-                if lang not in by_ili[ili]:
-                    by_ili[ili][lang] = {
-                        "lemmas": lemmas,
-                        "glosses": defns,
-                        "examples": examples,
-                    }
-                else:
-                    # ILI already exists for this language — merge data.
-                    # Happens when 'a' and 's' both map to adjective for the
-                    # same ILI. Deduplicate to avoid repeated entries.
-                    existing = by_ili[ili][lang]
-                    existing["lemmas"] = list(
-                        dict.fromkeys(existing["lemmas"] + lemmas)
-                    )
-                    existing["glosses"] = list(
-                        dict.fromkeys(existing["glosses"] + defns)
-                    )
-                    existing["examples"] = list(
-                        dict.fromkeys(existing["examples"] + examples)
-                    )
-
-            print(f"  {lang} {pos_label}: {covered:,} synsets with ILI")
-
-    # Build records and write single combined output file.
-    print("\nBuilding records...")
-    ilis = sorted(by_ili.keys())
-    if sample_size:
-        ilis = ilis[:sample_size]
-
-    records: list[dict] = []
-    for ili in ilis:
-        data = by_ili[ili]
-        record: dict = {
-            "source_id": f"ili:{ili}",
-            "pos": data["pos"],
-            "translations": {},
-            "glosses": {},
-            "examples": {},
-        }
-
-        for key, value in data.items():
-            if key == "pos":
-                continue
-            lang = key
-            if value["lemmas"]:
-                record["translations"][lang] = value["lemmas"]
-            if value["glosses"]:
-                record["glosses"][lang] = value["glosses"]
-            if value["examples"]:
-                record["examples"][lang] = value["examples"]
-
-        records.append(record)
-
-    output_file = out / "omw.json"
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(records, f, indent=2, ensure_ascii=False)
-
-    print(f"\nWrote {len(records):,} synsets → {output_file}")
-    _print_coverage(records)
-
-
-def _print_coverage(records: list[dict]) -> None:
-    """Print per-language translation, gloss, and example counts."""
-    lang_stats: dict[str, dict[str, int]] = {}
-    for lang in SUPPORTED_LANGUAGE_CODES:
-        lang_stats[lang] = {"translations": 0, "glosses": 0, "examples": 0}
-
-    pos_stats: dict[str, int] = {}
-
-    for r in records:
-        pos = r["pos"]
-        pos_stats[pos] = pos_stats.get(pos, 0) + 1
-
-        for lang, lemmas in r["translations"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["translations"] += len(lemmas)
-        for lang, gloss_list in r["glosses"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["glosses"] += len(gloss_list)
-        for lang, example_list in r["examples"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["examples"] += len(example_list)
-
-    print("\nPOS breakdown:")
-    for pos, count in sorted(pos_stats.items()):
-        print(f"  {pos}: {count:,}")
-
-    print("\nCoverage per language:")
-    for lang, counts in lang_stats.items():
-        t = counts["translations"]
-        g = counts["glosses"]
-        e = counts["examples"]
-        total = len(records)
-        print(
-            f"  {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {(t / total):.1f} translations/synset)"
-        )
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
-    parser.add_argument(
-        "--output-dir",
-        default="stage-1-extract/output",
-        help="Output directory for JSON files",
-    )
-    parser.add_argument(
-        "--sample",
-        action="store_true",
-        help="Extract only 100 synsets per language for inspection",
-    )
-    args = parser.parse_args()
-
-    extract_all(output_dir=args.output_dir, sample=args.sample)
--- a/data-pipeline/stage-1-extract/scripts/extract.ts
+++ b/data-pipeline/stage-1-extract/scripts/extract.ts
@ -0,0 +1,257 @@
+import fs from "node:fs";
+import path from "node:path";
+import readline from "node:readline";
+import { fileURLToPath } from "node:url";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+type KaikkiTranslation = { // one translation record attached to a Kaikki sense
+  code?: string; // language code; preferred over lang_code when both are present
+  lang_code?: string; // fallback language code, read when code is absent
+  word?: string; // translated word; records with a blank word are skipped
+  sense?: string; // trimmed and stored as the extracted sense_hint
+};
+
+type KaikkiSense = { // one sense of a Kaikki dictionary entry
+  glosses?: string[]; // only the first gloss is used by processEntry
+  examples?: { text?: string }[]; // example sentences; blank texts are dropped
+  translations?: KaikkiTranslation[]; // read for English source entries only
+};
+
+type KaikkiEntry = { // one parsed line of a Kaikki JSONL dump
+  word?: string; // headword; entries with a blank word are skipped
+  pos?: string; // Kaikki part-of-speech tag, mapped through POS_MAP
+  lang_code?: string; // must equal the source language for non-English files
+  senses?: KaikkiSense[]; // each kept sense becomes one ExtractedSense
+};
+
+export type ExtractedSense = { // one output record written by extract()
+  headword: string; // trimmed entry word
+  language: SupportedLanguageCode; // language of the source file being extracted
+  pos: SupportedPos;
+  sense_index: number; // zero-based running counter per headword + pos within one extract() call
+  gloss: string | null; // first gloss of the sense (trimmed), or null when the sense has none
+  examples: string[]; // trimmed, non-empty example texts
+  translations: { // always empty for non-English source languages
+    target_lang: SupportedLanguageCode;
+    word: string;
+    sense_hint: string | null; // taken from the Kaikki translation's `sense` field
+  }[];
+};
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const SOURCES_DIR = path.resolve(__dirname, "../sources");
+const OUTPUT_DIR = path.resolve(__dirname, "../output");
+
+const LANG_TO_FILE: Record<SupportedLanguageCode, string> = { // dump file name per language, resolved against SOURCES_DIR
+  en: "kaikki.org-dictionary-English.jsonl",
+  de: "kaikki.org-dictionary-German.jsonl",
+  it: "kaikki.org-dictionary-Italian.jsonl",
+  fr: "kaikki.org-dictionary-French.jsonl",
+  es: "kaikki.org-dictionary-Spanish.jsonl",
+};
+
+const POS_MAP: Record<string, SupportedPos> = { // Kaikki POS tag → supported POS; entries with any other tag are skipped
+  noun: "noun",
+  verb: "verb",
+  adj: "adjective",
+  adv: "adverb",
+};
+
+const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/**
+ * Map a Kaikki part-of-speech tag to a supported POS.
+ *
+ * @param kaikkiPos - Raw `pos` value from a Kaikki entry.
+ * @returns The mapped POS, or `null` when the tag is not listed in `POS_MAP`.
+ */
+function mapPos(kaikkiPos: string): SupportedPos | null {
+  // Own-property check: a plain object lookup would otherwise resolve tags
+  // such as "constructor" or "toString" to inherited prototype members.
+  if (!Object.prototype.hasOwnProperty.call(POS_MAP, kaikkiPos)) return null;
+  return POS_MAP[kaikkiPos] ?? null;
+}
+
+/**
+ * Tell whether a gloss begins with "abbreviation of" (case-insensitive).
+ */
+function isAbbreviation(gloss: string): boolean {
+  const lowered = gloss.toLowerCase();
+  return lowered.startsWith("abbreviation of");
+}
+
+/**
+ * Collect a sense's translations into the supported target languages.
+ *
+ * Skips translations whose language code is missing, unsupported, or equal to
+ * `sourceLang`, and translations with a blank word. Duplicates of the same
+ * language + word pair are emitted once (first occurrence wins).
+ *
+ * @param sense - Kaikki sense whose `translations` are read.
+ * @param sourceLang - Language of the entry being processed.
+ * @returns Translations in input order; `sense_hint` is `null` when the
+ *   source hint is missing or blank.
+ */
+function extractTranslations(
+  sense: KaikkiSense,
+  sourceLang: SupportedLanguageCode,
+): ExtractedSense["translations"] {
+  const seen = new Set<string>();
+  const result: ExtractedSense["translations"] = [];
+
+  for (const t of sense.translations ?? []) {
+    const code = t.code ?? t.lang_code;
+    if (!code || !SUPPORTED_LANG_SET.has(code)) continue;
+    if (code === sourceLang) continue; // skip same-language translations
+
+    const word = t.word?.trim();
+    if (!word) continue;
+
+    const key = `${code}:${word}`;
+    if (seen.has(key)) continue;
+    seen.add(key);
+
+    result.push({
+      // Safe: membership in SUPPORTED_LANG_SET was checked above.
+      target_lang: code as SupportedLanguageCode,
+      word,
+      // `||` (not `??`) so a whitespace-only hint becomes null, not "".
+      sense_hint: t.sense?.trim() || null,
+    });
+  }
+
+  return result;
+}
+
+/**
+ * Return the trimmed, non-empty example texts of a sense, in input order.
+ */
+function extractExamples(sense: KaikkiSense): string[] {
+  const texts: string[] = [];
+  for (const example of sense.examples ?? []) {
+    const text = example.text?.trim();
+    if (text) texts.push(text);
+  }
+  return texts;
+}
+
+/**
+ * Turn one Kaikki entry into zero or more extracted senses (without
+ * `sense_index`, which the caller assigns).
+ *
+ * An entry yields nothing when its POS is unmapped, its word is blank, or —
+ * for non-English source files — its `lang_code` differs from `sourceLang`.
+ * Senses whose gloss starts with "abbreviation of" are dropped. English
+ * senses are kept only when they have at least one supported translation;
+ * non-English senses are kept with an empty translation list.
+ *
+ * @param entry - Parsed Kaikki JSONL record.
+ * @param sourceLang - Language of the file the entry came from.
+ * @returns The kept senses, in the entry's sense order.
+ */
+function processEntry(
+  entry: KaikkiEntry,
+  sourceLang: SupportedLanguageCode,
+): Omit<ExtractedSense, "sense_index">[] {
+  const pos = mapPos(entry.pos ?? "");
+  if (!pos) return [];
+
+  const headword = entry.word?.trim();
+  if (!headword) return [];
+
+  const isEnglish = sourceLang === "en";
+
+  // For non-English files, only process entries in the target language
+  if (!isEnglish && entry.lang_code !== sourceLang) return [];
+
+  const results: Omit<ExtractedSense, "sense_index">[] = [];
+
+  for (const sense of entry.senses ?? []) {
+    // `||` (not `??`) so a whitespace-only gloss becomes null, not "".
+    const gloss = sense.glosses?.[0]?.trim() || null;
+    if (gloss && isAbbreviation(gloss)) continue;
+
+    // English: require translations in supported languages.
+    // Non-English: just extract the entry, no translations needed.
+    const translations: ExtractedSense["translations"] = isEnglish
+      ? extractTranslations(sense, sourceLang)
+      : [];
+    if (isEnglish && translations.length === 0) continue;
+
+    results.push({
+      headword,
+      language: sourceLang,
+      pos,
+      gloss,
+      examples: extractExamples(sense),
+      translations,
+    });
+  }
+
+  return results;
+}
+
+// ── Extract ───────────────────────────────────────────────────────────────────
+
+/**
+ * Stream one language's Kaikki JSONL dump and write the extracted senses to
+ * `<OUTPUT_DIR>/<lang>.json`.
+ *
+ * Blank lines are ignored and unparseable lines are skipped with a warning.
+ * Each kept sense receives a zero-based `sense_index` that counts senses per
+ * headword + POS across the whole file.
+ *
+ * @param lang - Language whose dump (see `LANG_TO_FILE`) is read.
+ * @param sampleLimit - Optional cap: stop once this many entries have yielded
+ *   at least one sense. Omitted (or 0) means no cap.
+ * @throws Error when the source file is missing or unreadable.
+ */
+export async function extract(
+  lang: SupportedLanguageCode,
+  sampleLimit?: number,
+): Promise<void> {
+  const filename = LANG_TO_FILE[lang];
+  const sourcePath = path.join(SOURCES_DIR, filename);
+  const outputPath = path.join(OUTPUT_DIR, `${lang}.json`);
+
+  console.log(`\nExtracting ${lang}...`);
+  console.log(`  Source: ${sourcePath}`);
+  if (sampleLimit) console.log(`  Sample mode: ${sampleLimit} entries`);
+
+  // Fail fast with a clear, awaitable error when the dump is missing;
+  // otherwise the failure only surfaces later from the read stream.
+  try {
+    await fs.promises.access(sourcePath, fs.constants.R_OK);
+  } catch {
+    throw new Error(`Source file not found or unreadable: ${sourcePath}`);
+  }
+
+  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
+
+  const fileStream = fs.createReadStream(sourcePath);
+  const rl = readline.createInterface({
+    input: fileStream,
+    crlfDelay: Infinity,
+  });
+
+  const senses: ExtractedSense[] = [];
+  const senseIndexMap = new Map<string, number>();
+  let linesRead = 0;
+  let entriesProcessed = 0;
+  let entriesSkipped = 0;
+
+  try {
+    for await (const line of rl) {
+      if (!line.trim()) continue;
+      if (sampleLimit && entriesProcessed >= sampleLimit) break;
+
+      linesRead++;
+
+      let entry: KaikkiEntry;
+      try {
+        entry = JSON.parse(line) as KaikkiEntry;
+      } catch {
+        console.warn(`  Warning: failed to parse line ${linesRead}, skipping`);
+        continue;
+      }
+
+      const extracted = processEntry(entry, lang);
+
+      if (extracted.length === 0) {
+        entriesSkipped++;
+        continue;
+      }
+
+      for (const sense of extracted) {
+        // Number senses per headword + POS, continuing across entries.
+        const key = `${sense.headword}|${sense.pos}`;
+        const senseIndex = senseIndexMap.get(key) ?? 0;
+        senseIndexMap.set(key, senseIndex + 1);
+        senses.push({ ...sense, sense_index: senseIndex });
+      }
+
+      entriesProcessed++;
+
+      if (entriesProcessed % 10_000 === 0) {
+        console.log(
+          `  Processed ${entriesProcessed.toLocaleString()} entries...`,
+        );
+      }
+    }
+  } finally {
+    // Release the file handle on early break (sample mode) and on errors.
+    rl.close();
+    fileStream.destroy();
+  }
+
+  await fs.promises.writeFile(
+    outputPath,
+    JSON.stringify(senses, null, 2),
+    "utf-8",
+  );
+
+  console.log(`  Lines read:        ${linesRead.toLocaleString()}`);
+  console.log(`  Entries processed: ${entriesProcessed.toLocaleString()}`);
+  console.log(`  Entries skipped:   ${entriesSkipped.toLocaleString()}`);
+  console.log(`  Senses extracted:  ${senses.length.toLocaleString()}`);
+  console.log(`  Output:            ${outputPath}`);
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Extract every supported language in sequence, then log completion.
+ */
+async function main(): Promise<void> {
+  // Hardcoded sample limit for development — remove for full extraction
+  const sampleLimit = 500;
+
+  for (const languageCode of SUPPORTED_LANGUAGE_CODES) {
+    await extract(languageCode, sampleLimit);
+  }
+
+  console.log("\nExtraction complete.");
+}
+
+// Run main() only when this file is executed directly, not when imported.
+// Compare file-system paths rather than URL strings: import.meta.url is a
+// percent-encoded file URL, so a hand-built `file://${argv[1]}` never matches
+// for paths containing spaces or other encoded characters, nor on Windows.
+const entryScript = process.argv[1];
+if (
+  entryScript !== undefined &&
+  fileURLToPath(import.meta.url) === path.resolve(entryScript)
+) {
+  main().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}
--- a/data-pipeline/stage-2-annotate/scripts/annotate.ts
+++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts
@ -1,227 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
-
-// ── Types ────────────────────────────────────────────────────────────────────
-
-type OmwExample = { text: string; source: "omw" };
-
-type CefrExample = { text: string; source: "cefr" };
-
-type Example = OmwExample | CefrExample;
-
-type OmwRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, string[]>>;
-};
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
-
-type CefrSourceEntry = {
-  word: string;
-  pos: string;
-  cefr_level: string;
-  example_sentence_native?: string;
-};
-
-type ConflictEntry = {
-  word: string;
-  pos: string;
-  language: SupportedLanguageCode;
-  levels: string[];
-};
-
-// ── Constants ─────────────────────────────────────────────────────────────────
-
-const POS_NORMALIZE: Record<string, SupportedPos> = {
-  noun: "noun",
-  n: "noun",
-  nom: "noun", // French
-  verb: "verb",
-  verbs: "verb",
-  v: "verb",
-  v1: "verb",
-  adjective: "adjective",
-  adjektiv: "adjective", // German
-  adj: "adjective",
-  adverb: "adverb",
-  adverbs: "adverb",
-  adv: "adverb",
-};
-
-const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
-
-const PATHS = {
-  omw: "stage-1-extract/output/omw.json",
-  cefrDir: "stage-2-annotate/sources/cefr",
-  outputDir: "stage-2-annotate/output",
-};
-
-// ── CEFR source loading ───────────────────────────────────────────────────────
-
-type CefrIndex = Map<string, { level: string; example?: string }>;
-
-async function loadCefrSource(
-  lang: SupportedLanguageCode,
-): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
-  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
-  const raw = await fs.readFile(filepath, "utf-8");
-  const entries = JSON.parse(raw) as CefrSourceEntry[];
-
-  // First pass — detect conflicts.
-  // Structure: "word|pos" -> Set of CEFR levels seen
-  const seen = new Map<string, Set<string>>();
-
-  for (const entry of entries) {
-    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
-    if (!pos) continue;
-    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
-
-    const key = `${entry.word.toLowerCase().trim()}|${pos}`;
-    if (!seen.has(key)) seen.set(key, new Set());
-    seen.get(key)!.add(entry.cefr_level);
-  }
-
-  const conflicts: ConflictEntry[] = [];
-  for (const [key, levels] of seen.entries()) {
-    if (levels.size > 1) {
-      const [word, pos] = key.split("|") as [string, string];
-      conflicts.push({ word, pos, language: lang, levels: [...levels] });
-    }
-  }
-
-  // Second pass — build index, skip conflicting entries.
-  const conflictKeys = new Set(conflicts.map((c) => `${c.word}|${c.pos}`));
-
-  const index: CefrIndex = new Map();
-  for (const entry of entries) {
-    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
-    if (!pos) continue;
-    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
-
-    const key = `${entry.word.toLowerCase().trim()}|${pos}`;
-    if (conflictKeys.has(key)) continue;
-
-    index.set(key, {
-      level: entry.cefr_level,
-      ...(entry.example_sentence_native
-        ? { example: entry.example_sentence_native }
-        : {}),
-    });
-  }
-
-  return { index, conflicts };
-}
-
-// ── Annotation ────────────────────────────────────────────────────────────────
-
-async function annotate(): Promise<void> {
-  // Load OMW records
-  console.log("Reading OMW extract...");
-  const raw = await fs.readFile(PATHS.omw, "utf-8");
-  const omwRecords = JSON.parse(raw) as OmwRecord[];
-  console.log(`  Loaded ${omwRecords.length.toLocaleString()} synsets`);
-
-  // Load CEFR sources for all languages
-  console.log("\nLoading CEFR source files...");
-  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
-  const allConflicts: ConflictEntry[] = [];
-
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    const { index, conflicts } = await loadCefrSource(lang);
-    cefrIndexes.set(lang, index);
-    allConflicts.push(...conflicts);
-    console.log(
-      `  ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`,
-    );
-  }
-
-  // Write conflicts file
-  await fs.mkdir(PATHS.outputDir, { recursive: true });
-  await fs.writeFile(
-    path.join(PATHS.outputDir, "conflicts.json"),
-    JSON.stringify(allConflicts, null, 2),
-    "utf-8",
-  );
-  console.log(
-    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
-  );
-
-  // Annotate and write one file per language
-  console.log("\nAnnotating...");
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    const index = cefrIndexes.get(lang)!;
-    const records: AnnotatedRecord[] = [];
-    let matched = 0;
-
-    for (const record of omwRecords) {
-      const annotated: AnnotatedRecord = {
-        source_id: record.source_id,
-        pos: record.pos,
-        translations: record.translations,
-        glosses: record.glosses,
-        examples: {},
-        votes: {},
-      };
-
-      // Convert OMW examples to typed format
-      for (const [l, exList] of Object.entries(record.examples)) {
-        annotated.examples[l as SupportedLanguageCode] = exList.map((text) => ({
-          text,
-          source: "omw" as const,
-        }));
-      }
-
-      // Match translations for this language against CEFR index
-      const langTranslations = record.translations[lang] ?? [];
-      for (const word of langTranslations) {
-        const key = `${word.toLowerCase().trim()}|${record.pos}`;
-        const cefrEntry = index.get(key);
-        if (!cefrEntry) continue;
-
-        matched++;
-
-        // Add CEFR vote
-        if (!annotated.votes[lang]) annotated.votes[lang] = {};
-        annotated.votes[lang]![word] = { cefr_source: cefrEntry.level };
-
-        // Add native example if present
-        if (cefrEntry.example) {
-          if (!annotated.examples[lang]) annotated.examples[lang] = [];
-          annotated.examples[lang]!.push({
-            text: cefrEntry.example,
-            source: "cefr" as const,
-          });
-        }
-      }
-
-      records.push(annotated);
-    }
-
-    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
-    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
-    console.log(
-      `  ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
-    );
-  }
-}
-
-// ── Main ─────────────────────────────────────────────────────────────────────
-
-annotate().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
--- a/data-pipeline/stage-2-annotate/sources/cefr/de.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/de.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/en.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/en.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/es.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/es.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/fr.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/fr.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/it.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/it.json
--- a/data-pipeline/stage-2-reverse-link/scripts/reverse-link.ts
+++ b/data-pipeline/stage-2-reverse-link/scripts/reverse-link.ts
@ -0,0 +1,109 @@
+import { openDb } from "../../db/index.js";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+type TranslationRow = {
+  translation_id: number;
+  entry_id: number;
+  entry_language: string;
+  entry_headword: string;
+  target_lang: string;
+  word: string;
+  sense_hint: string | null;
+};
+
+type EntryRow = { id: number };
+
+// ── Sync ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Adds the inverse of every stored translation.
+ *
+ * For each row in `translations`, looks for an entry in the translation's
+ * target language whose headword equals the translated word. When one exists,
+ * a translation row is inserted on that entry pointing back at the original
+ * headword in the original entry's language, tagged `source = 'reverse_link'`.
+ * Rows that already exist are left untouched (`ON CONFLICT ... DO NOTHING`).
+ *
+ * All inserts run inside a single transaction. The database handle is closed
+ * even when the query or the transaction throws.
+ */
+export function reverseLink(): void {
+  const db = openDb();
+  let counts: { inserted: number; skipped: number; noEntry: number };
+
+  try {
+    // Find all translations and their source entry details
+    const translations = db
+      .prepare(
+        `SELECT
+        t.id          AS translation_id,
+        t.entry_id,
+        e.language    AS entry_language,
+        e.headword    AS entry_headword,
+        t.target_lang,
+        t.word,
+        t.sense_hint
+       FROM translations t
+       JOIN entries e ON e.id = t.entry_id`,
+      )
+      .all() as TranslationRow[];
+
+    console.log(
+      `  Found ${translations.length.toLocaleString()} translations to check`,
+    );
+
+    // LIMIT 1 without ORDER BY: when several entries share a headword and
+    // language, only one of them (whichever SQLite returns) gets the link.
+    const findEntry = db.prepare(
+      `SELECT id FROM entries WHERE headword = ? AND language = ? LIMIT 1`,
+    );
+
+    const insertReverseLink = db.prepare(
+      `INSERT INTO translations (entry_id, target_lang, word, sense_hint, source)
+     VALUES (?, ?, ?, ?, 'reverse_link')
+     ON CONFLICT (entry_id, target_lang, word) DO NOTHING`,
+    );
+
+    const sync = db.transaction(() => {
+      let inserted = 0;
+      let skipped = 0;
+      let noEntry = 0;
+
+      for (const t of translations) {
+        // Look for an entry in the target language with the translation word as headword
+        const targetEntry = findEntry.get(t.word, t.target_lang) as
+          | EntryRow
+          | undefined;
+
+        if (!targetEntry) {
+          noEntry++;
+          continue;
+        }
+
+        // Insert reverse link: target entry → source language → source headword
+        const result = insertReverseLink.run(
+          targetEntry.id,
+          t.entry_language,
+          t.entry_headword,
+          t.sense_hint ?? null,
+        );
+
+        // `changes` is 0 when the ON CONFLICT clause suppressed the insert
+        if (result.changes > 0) {
+          inserted++;
+        } else {
+          skipped++;
+        }
+      }
+
+      return { inserted, skipped, noEntry };
+    });
+
+    counts = sync();
+  } finally {
+    db.close();
+  }
+
+  console.log(`  Inserted: ${counts.inserted.toLocaleString()} reverse links`);
+  console.log(
+    `  Skipped:  ${counts.skipped.toLocaleString()} (already existed)`,
+  );
+  console.log(
+    `  No entry: ${counts.noEntry.toLocaleString()} (target word not in entries)`,
+  );
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/** Runs the reverse-link sync once, logging a banner line before and after. */
+function main(): void {
+  console.log("Running reverse link sync...");
+  reverseLink();
+  console.log("\nReverse link sync complete.");
+}
+
+// Run main() only when this module's URL equals a `file://` URL built from the
+// script path in process.argv[1]; importing the module elsewhere does nothing.
+// NOTE(review): the comparison uses a hand-built string, so it presumably fails
+// for paths that need URL-encoding (spaces, non-ASCII) or Windows drive paths —
+// confirm, and consider `pathToFileURL` from `node:url`.
+if (import.meta.url === `file://${process.argv[1]}`) {
+  main();
+}
--- a/data-pipeline/stage-3-enrich/config.ts
+++ b/data-pipeline/stage-3-enrich/config.ts
@ -0,0 +1,123 @@
+// ── Provider configuration ────────────────────────────────────────────────────
+//
+// Each provider + model combination counts as one vote in the final majority.
+// Running the same model twice is not supported — one model, one vote.
+// The `name` field is used as the model identifier in pipeline.db and must
+// be unique across all runs.
+//
+// The pipeline iterates through ALL_PROVIDERS in order, skipping models that
+// have already completed a full run and resuming models with partial progress.
+//
+// See llm-setup.md for full setup instructions and model recommendations.
+
+export type ProviderConfig = {
+  name: string; // unique model identifier — stored in pipeline.db
+  baseURL: string;
+  apiKey: string;
+  model: string;
+  maxTokens: number;
+};
+
+// ── Local llama.cpp ───────────────────────────────────────────────────────────
+
+export const LOCAL_QWEN35_4B: ProviderConfig = {
+  name: "local-qwen3.5-4b",
+  baseURL: "http://127.0.0.1:8080/v1",
+  apiKey: "none",
+  model: "qwen3.5-4b",
+  maxTokens: 1024, // no reasoning overhead so 1024 is enough
+};
+
+export const LOCAL_GEMMA4: ProviderConfig = {
+  name: "local-gemma4-e4b",
+  baseURL: "http://127.0.0.1:8080/v1",
+  apiKey: "none", // llama.cpp ignores this
+  model: "gemma4-e4b", // llama.cpp ignores model name, uses loaded model
+  maxTokens: 2048,
+};
+
+export const LOCAL_QWEN7B: ProviderConfig = {
+  name: "local-qwen2.5-7b",
+  baseURL: "http://127.0.0.1:8080/v1",
+  apiKey: "none",
+  model: "qwen2.5-7b",
+  maxTokens: 512,
+};
+
+// ── OpenRouter — free tier ────────────────────────────────────────────────────
+
+export const OR_QWEN3_480B: ProviderConfig = {
+  name: "or-qwen3-480b",
+  baseURL: "https://openrouter.ai/api/v1",
+  apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
+  model: "qwen/qwen3-coder:free",
+  maxTokens: 512,
+};
+
+export const OR_GEMMA4_31B: ProviderConfig = {
+  name: "or-gemma4-31b",
+  baseURL: "https://openrouter.ai/api/v1",
+  apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
+  model: "google/gemma-4-31b-it:free",
+  maxTokens: 512,
+};
+
+export const OR_QWEN3_80B: ProviderConfig = {
+  name: "or-qwen3-80b",
+  baseURL: "https://openrouter.ai/api/v1",
+  apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
+  model: "qwen/qwen3-next-80b-a3b-instruct:free",
+  maxTokens: 512,
+};
+
+export const OR_NEMOTRON: ProviderConfig = {
+  name: "or-nemotron-120b",
+  baseURL: "https://openrouter.ai/api/v1",
+  apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
+  model: "nvidia/nemotron-3-super-120b-a12b:free",
+  maxTokens: 512,
+};
+
+// ── Anthropic — reference baseline ───────────────────────────────────────────
+// Note: Anthropic uses a different API format. An adapter is required.
+// See llm-setup.md for details.
+
+export const ANTHROPIC_SONNET: ProviderConfig = {
+  name: "anthropic-sonnet-4",
+  baseURL: "https://api.anthropic.com/v1",
+  apiKey: process.env["ANTHROPIC_API_KEY"] ?? "",
+  model: "claude-sonnet-4-6",
+  maxTokens: 512,
+};
+
+// ── All configured providers ──────────────────────────────────────────────────
+// The pipeline runs through these in order — local models first, then cloud.
+// Add new providers here to include them in the voting pool.
+
+export const ALL_PROVIDERS: ProviderConfig[] = [
+  LOCAL_QWEN35_4B,
+  // LOCAL_GEMMA4,
+  // LOCAL_QWEN7B,
+  // OR_QWEN3_480B,
+  // OR_GEMMA4_31B,
+  // OR_QWEN3_80B,
+  // OR_NEMOTRON,
+  // ANTHROPIC_SONNET,
+];
+
+// ── Key validation ────────────────────────────────────────────────────────────
+
+// API-key placeholder values that mark a provider as needing no real key.
+const LOCAL_PROVIDERS = new Set(["none"]);
+
+/**
+ * Aborts the process when a provider that needs an API key has none.
+ *
+ * Providers whose `apiKey` is one of the local placeholders, or any non-empty
+ * key, pass silently. Otherwise the missing environment variable is reported
+ * on stderr (ANTHROPIC_API_KEY for providers whose name starts with
+ * "anthropic", OPENROUTER_API_KEY for everything else) and the process exits
+ * with code 1.
+ */
+export function validateProviderKey(provider: ProviderConfig): void {
+  const { apiKey, name } = provider;
+
+  const keyIsUsable = LOCAL_PROVIDERS.has(apiKey) || Boolean(apiKey);
+  if (keyIsUsable) return;
+
+  let missingVar = "OPENROUTER_API_KEY";
+  if (name.startsWith("anthropic")) {
+    missingVar = "ANTHROPIC_API_KEY";
+  }
+
+  console.error(`\n  ERROR: ${missingVar} is not set in .env`);
+  console.error(`  Provider "${name}" requires this key to run.\n`);
+  process.exit(1);
+}
--- a/data-pipeline/stage-3-enrich/scripts/enrich.ts
+++ b/data-pipeline/stage-3-enrich/scripts/enrich.ts
@ -0,0 +1,877 @@
+import { openDb } from "../../db/index.js";
+import type { ProviderConfig } from "../config.js";
+import { CEFR_LEVELS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/** Shape of a row read from the `entries` table by the enrich stage. */
+type EntryRow = {
+  id: number;
+  headword: string;
+  language: SupportedLanguageCode;
+  pos: string;
+  // Interpolated into the gloss prompt as "(sense N)". Assumed to be the numeric
+  // `sense_index` column returned by `SELECT * FROM entries` — TODO confirm
+  // against the schema.
+  sense_index: number;
+  gloss: string | null;
+  examples: string; // JSON array string
+};
+
+type TranslationRow = {
+  id: number;
+  target_lang: SupportedLanguageCode;
+  word: string;
+};
+
+type GlossResult = { status: "ok" } | { status: "improved"; gloss: string };
+
+type ExampleResult = { status: "ok" } | { status: "improved"; example: string };
+
+type TranslationResult = {
+  translations: Partial<
+    Record<SupportedLanguageCode, Record<string, "ok" | "reject">>
+  >;
+  generated?: Partial<Record<SupportedLanguageCode, string>>;
+};
+
+type CefrResult = {
+  headword_cefr: string;
+  translation_cefr: Partial<
+    Record<SupportedLanguageCode, Record<string, string>>
+  >;
+};
+
+type SubStage =
+  | "round1_gloss"
+  | "round1_example"
+  | "round1_translations"
+  | "round1_cefr";
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
+const CEFR_SET = new Set<string>(CEFR_LEVELS);
+
+// ── Shutdown ──────────────────────────────────────────────────────────────────
+
+// Set once by the signal handler; the enrich loop checks it between entries.
+let shutdownRequested = false;
+// Controller of the LLM request currently in flight, if any.
+let currentCallController: AbortController | null = null;
+// Guards against installing the signal handlers more than once per process.
+let shutdownHandlersRegistered = false;
+
+/**
+ * Installs SIGINT/SIGTERM handlers that request a graceful stop.
+ *
+ * The first signal sets `shutdownRequested` and aborts the in-flight LLM call;
+ * later signals are ignored. Calling this function repeatedly is safe: the
+ * handlers are installed only on the first call, so running `enrich()` for
+ * several providers does not stack up listeners.
+ */
+export function registerEnrichShutdown(): void {
+  if (shutdownHandlersRegistered) return;
+  shutdownHandlersRegistered = true;
+
+  const handler = (): void => {
+    if (shutdownRequested) return;
+    shutdownRequested = true;
+    console.log("\n\n  Shutdown requested — aborting current LLM call...");
+    currentCallController?.abort();
+  };
+  process.on("SIGINT", handler);
+  process.on("SIGTERM", handler);
+}
+
+// ── Prompt builders ───────────────────────────────────────────────────────────
+
+/**
+ * Builds the prompt asking the model to approve or rewrite an entry's gloss.
+ *
+ * `entry.examples` is parsed as a JSON array and rendered as a bullet list
+ * ("  none" when empty); a missing gloss is shown as "none". The model is told
+ * to answer with `{"status": "ok"}` or `{"status": "improved", "gloss": ...}`.
+ */
+function buildGlossPrompt(entry: EntryRow): string {
+  const parsedExamples = JSON.parse(entry.examples) as string[];
+
+  let examplesText = "  none";
+  if (parsedExamples.length > 0) {
+    const bullets: string[] = [];
+    for (const sentence of parsedExamples) {
+      bullets.push(`  - ${sentence}`);
+    }
+    examplesText = bullets.join("\n");
+  }
+
+  const glossText = entry.gloss === null || entry.gloss === undefined ? "none" : entry.gloss;
+
+  return `You are a language learning expert.
+
+Review this gloss for the ${entry.pos} "${entry.headword}" (sense ${entry.sense_index}).
+Gloss: "${glossText}"
+Examples of this specific sense:
+${examplesText}
+
+Is this gloss clear, accurate for this specific sense, and suitable for a language learner?
+- If yes, respond with: {"status": "ok"}
+- If no or if gloss is "none", respond with: {"status": "improved", "gloss": "your improved gloss here"}
+
+IMPORTANT: Your improved gloss must describe THIS SPECIFIC SENSE shown by the examples above,
+not a more common or general meaning of the word.
+
+Respond ONLY with valid JSON and nothing else.`;
+}
+
+/**
+ * Builds the prompt asking the model to vote "ok"/"reject" on each existing
+ * translation of an entry, given the already verified gloss.
+ *
+ * Translations are grouped per target language. Supported languages other than
+ * the entry's own that have no translation at all are listed as "missing", and
+ * the model is additionally asked to generate one translation for each.
+ */
+function buildTranslationsPrompt(
+  entry: EntryRow,
+  translations: TranslationRow[],
+  verifiedGloss: string,
+): string {
+  // Group words by target language, keeping first-seen language order.
+  const wordsByLang = new Map<SupportedLanguageCode, string[]>();
+  for (const row of translations) {
+    let bucket = wordsByLang.get(row.target_lang);
+    if (bucket === undefined) {
+      bucket = [];
+      wordsByLang.set(row.target_lang, bucket);
+    }
+    bucket.push(row.word);
+  }
+
+  const missingLangs = SUPPORTED_LANGUAGE_CODES.filter(
+    (code) => code !== entry.language && !wordsByLang.has(code),
+  );
+  const hasMissing = missingLangs.length > 0;
+
+  let translationsText = "  none";
+  if (wordsByLang.size > 0) {
+    const rows: string[] = [];
+    for (const [lang, words] of wordsByLang) {
+      rows.push(`  ${lang}: ${words.join(", ")}`);
+    }
+    translationsText = rows.join("\n");
+  }
+
+  // Sample answer shown to the model; "generated" only appears when needed.
+  const exampleResponse: Record<string, unknown> = {
+    translations: {
+      de: { frei: "ok", "-frei": "reject" },
+      it: { libero: "ok", free: "reject" },
+    },
+    ...(hasMissing ? { generated: { es: "libre", fr: "libre" } } : {}),
+  };
+
+  const generateLine = hasMissing
+    ? `Also generate the single best translation for these missing languages: ${missingLangs.join(", ")}`
+    : "";
+
+  return `You are a language learning expert.
+
+For the ${entry.language} ${entry.pos} "${entry.headword}" (meaning: "${verifiedGloss}"), review these translations:
+${translationsText}
+
+For each translation:
+- Write "ok" if it is a valid translation for this specific meaning
+- Write "reject" if it is wrong, a suffix (starts with -), garbled text, or the wrong language
+
+Examples of correct behaviour:
+- "free" listed as Italian → "reject" (it is English, not Italian)
+- "-frei" listed as German → "reject" (it is a suffix, not a standalone word)
+- "libre" listed as Spanish → "ok" (it is a valid Spanish word)
+
+${generateLine}
+
+Respond ONLY with valid JSON and nothing else:
+${JSON.stringify(exampleResponse, null, 2)}`;
+}
+
+/**
+ * Builds the prompt asking the model for CEFR levels of the headword and of
+ * each validated translation.
+ *
+ * Translations are listed one language per line ("  none" when the map is
+ * empty); the expected JSON answer shape is shown at the end of the prompt.
+ */
+function buildCefrPrompt(
+  entry: EntryRow,
+  verifiedGloss: string,
+  validatedTranslations: Map<SupportedLanguageCode, string[]>,
+): string {
+  const perLanguage: string[] = [];
+  validatedTranslations.forEach((words, lang) => {
+    perLanguage.push(`  ${lang}: ${words.join(", ")}`);
+  });
+  const translationsText =
+    perLanguage.length === 0 ? "  none" : perLanguage.join("\n");
+
+  return `You are a language learning expert.
+
+Assign CEFR levels (A1, A2, B1, B2, C1, or C2) to this word and its validated translations.
+Base your levels on how commonly a language learner at that level would encounter this specific sense.
+Consider register — slang, technical, and archaic words should be rated higher.
+
+WORD: ${entry.headword} (${entry.pos})
+MEANING: ${verifiedGloss}
+VALIDATED TRANSLATIONS:
+${translationsText}
+
+Respond ONLY with valid JSON and nothing else:
+{
+  "headword_cefr": "B1",
+  "translation_cefr": {
+    "de": { "frei": "A2" },
+    "it": { "libero": "A2" }
+  }
+}`;
+}
+
+// ── Validation ────────────────────────────────────────────────────────────────
+
+/**
+ * Parses a model reply to the gloss prompt.
+ *
+ * Returns `{ status: "ok" }`, or `{ status: "improved", gloss }` with the gloss
+ * trimmed. Anything else — unparseable JSON, an unknown status, a missing or
+ * blank gloss — yields `null`.
+ */
+function validateGloss(raw: string): GlossResult | null {
+  try {
+    const { status, gloss } = JSON.parse(raw) as Record<string, unknown>;
+
+    switch (status) {
+      case "ok":
+        return { status: "ok" };
+      case "improved": {
+        if (typeof gloss !== "string") return null;
+        const cleaned = gloss.trim();
+        return cleaned ? { status: "improved", gloss: cleaned } : null;
+      }
+      default:
+        return null;
+    }
+  } catch {
+    // Malformed JSON, or a JSON `null` that cannot be destructured
+    return null;
+  }
+}
+
+/**
+ * Parses a model reply to the example prompt.
+ *
+ * Returns `{ status: "ok" }`, or `{ status: "improved", example }` with the
+ * example trimmed. Anything else — unparseable JSON, an unknown status, a
+ * missing or blank example — yields `null`.
+ */
+function validateExample(raw: string): ExampleResult | null {
+  try {
+    const { status, example } = JSON.parse(raw) as Record<string, unknown>;
+
+    switch (status) {
+      case "ok":
+        return { status: "ok" };
+      case "improved": {
+        if (typeof example !== "string") return null;
+        const cleaned = example.trim();
+        return cleaned ? { status: "improved", example: cleaned } : null;
+      }
+      default:
+        return null;
+    }
+  } catch {
+    // Malformed JSON, or a JSON `null` that cannot be destructured
+    return null;
+  }
+}
+
+/**
+ * Parses and checks a model reply to the translations prompt.
+ *
+ * - `translations` must be an object; per language, only "ok"/"reject" votes
+ *   for supported language codes are kept.
+ * - `generated`, when present, must be an object; only non-blank strings for
+ *   supported language codes are kept (trimmed).
+ * - Every row in `translations` (the function argument) must have received a
+ *   vote, otherwise the reply is rejected.
+ *
+ * Vote maps are created without a prototype so that a translated word which
+ * happens to be named like an `Object.prototype` member (for example
+ * "constructor") is only considered voted-on when the model actually voted.
+ *
+ * @returns the sanitized result, or `null` when the reply is unusable.
+ */
+function validateTranslations(
+  raw: string,
+  translations: TranslationRow[],
+): TranslationResult | null {
+  try {
+    const obj = JSON.parse(raw) as Record<string, unknown>;
+    if (typeof obj["translations"] !== "object" || obj["translations"] === null)
+      return null;
+
+    const result: TranslationResult = { translations: {} };
+    const translationsObj = obj["translations"] as Record<string, unknown>;
+
+    // Validate each language's votes
+    for (const [lang, votes] of Object.entries(translationsObj)) {
+      if (!SUPPORTED_LANG_SET.has(lang)) continue;
+      if (typeof votes !== "object" || votes === null) continue;
+
+      // Null-prototype map: lookups never fall through to inherited members
+      const langVotes = Object.create(null) as Record<string, "ok" | "reject">;
+      for (const [word, status] of Object.entries(
+        votes as Record<string, unknown>,
+      )) {
+        if (status === "ok" || status === "reject") {
+          langVotes[word] = status;
+        }
+      }
+      result.translations[lang as SupportedLanguageCode] = langVotes;
+    }
+
+    // Validate generated translations
+    if (obj["generated"] !== undefined && obj["generated"] !== null) {
+      if (typeof obj["generated"] !== "object") return null;
+      result.generated = {};
+      for (const [lang, word] of Object.entries(
+        obj["generated"] as Record<string, unknown>,
+      )) {
+        if (!SUPPORTED_LANG_SET.has(lang)) continue;
+        if (typeof word === "string" && word.trim()) {
+          result.generated[lang as SupportedLanguageCode] = word.trim();
+        }
+      }
+    }
+
+    // Check all translations got a vote
+    for (const t of translations) {
+      const votes = result.translations[t.target_lang];
+      if (!votes) return null;
+      if (votes[t.word] === undefined) return null;
+    }
+
+    return result;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Parses and checks a model reply to the CEFR prompt.
+ *
+ * The reply must carry a `headword_cefr` that is a known CEFR level and a
+ * `translation_cefr` object that gives a known level for every word in
+ * `validatedTranslations`. Extra languages or words in the reply are passed
+ * through unchecked.
+ *
+ * @returns the parsed result, or `null` when the reply is unusable.
+ */
+function validateCefr(
+  raw: string,
+  validatedTranslations: Map<SupportedLanguageCode, string[]>,
+): CefrResult | null {
+  try {
+    const { headword_cefr: headwordLevel, translation_cefr: perTranslation } =
+      JSON.parse(raw) as Record<string, unknown>;
+
+    if (typeof headwordLevel !== "string" || !CEFR_SET.has(headwordLevel)) {
+      return null;
+    }
+    if (perTranslation === null || typeof perTranslation !== "object") {
+      return null;
+    }
+
+    const levelsByLang = perTranslation as Record<string, unknown>;
+
+    // Verify all validated translations have a CEFR vote
+    const fullyCovered = [...validatedTranslations].every(([lang, words]) => {
+      const levels = levelsByLang[lang] as Record<string, string> | undefined;
+      if (!levels) return false;
+      return words.every((w) => Boolean(levels[w]) && CEFR_SET.has(levels[w]));
+    });
+    if (!fullyCovered) return null;
+
+    return {
+      headword_cefr: headwordLevel,
+      translation_cefr: levelsByLang as Partial<
+        Record<SupportedLanguageCode, Record<string, string>>
+      >,
+    };
+  } catch {
+    return null;
+  }
+}
+
+// ── LLM call ──────────────────────────────────────────────────────────────────
+
+/**
+ * Sends a single-message chat completion request to the provider's
+ * `/chat/completions` endpoint and returns the reply text with Markdown code
+ * fences stripped.
+ *
+ * The request is aborted after 120 seconds or when a shutdown signal arrives.
+ * Both the timeout and the abort handle stay armed until the response body has
+ * been read, so a stalled body cannot hang the pipeline.
+ *
+ * @throws Error on a non-2xx status or an empty reply; the fetch abort error
+ *   when the call times out or is cancelled.
+ */
+async function callLlm(
+  prompt: string,
+  provider: ProviderConfig,
+): Promise<string> {
+  // Keep a local reference so the timeout always aborts THIS call
+  const controller = new AbortController();
+  currentCallController = controller;
+  const timeout = setTimeout(() => controller.abort(), 120_000);
+
+  try {
+    const response = await fetch(`${provider.baseURL}/chat/completions`, {
+      method: "POST",
+      signal: controller.signal,
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${provider.apiKey}`,
+      },
+      body: JSON.stringify({
+        model: provider.model,
+        max_tokens: provider.maxTokens,
+        messages: [{ role: "user", content: prompt }],
+        temperature: 0.1,
+      }),
+    });
+
+    if (!response.ok) {
+      throw new Error(
+        `LLM API error: ${response.status} ${response.statusText}`,
+      );
+    }
+
+    const data = (await response.json()) as {
+      choices?: { message?: { content?: string } }[];
+    };
+
+    const content = data.choices?.[0]?.message?.content;
+    if (!content) throw new Error("LLM returned empty response");
+
+    // Models often wrap JSON in ```json fences; drop them before parsing
+    return content
+      .replace(/```json\n?/g, "")
+      .replace(/```\n?/g, "")
+      .trim();
+  } finally {
+    clearTimeout(timeout);
+    if (currentCallController === controller) currentCallController = null;
+  }
+}
+
+// ── Status helpers ────────────────────────────────────────────────────────────
+
+/**
+ * Reads the recorded status of one sub-stage for an entry/model pair from
+ * `run_status`.
+ *
+ * @returns "complete" or "needs_review" when that is the stored status;
+ *   "pending" when no row exists or the stored status is anything else.
+ */
+function getSubStageStatus(
+  entryId: number,
+  modelName: string,
+  stage: SubStage,
+): "complete" | "needs_review" | "pending" {
+  const db = openDb();
+  try {
+    const row = db
+      .prepare(
+        `SELECT status FROM run_status
+       WHERE entry_id = ? AND model_name = ? AND stage = ?`,
+      )
+      .get(entryId, modelName, stage) as { status: string } | undefined;
+
+    if (row?.status === "complete") return "complete";
+    if (row?.status === "needs_review") return "needs_review";
+    return "pending";
+  } finally {
+    // Close the handle even when the query throws
+    db.close();
+  }
+}
+
+/**
+ * Records the status of one sub-stage for an entry/model pair in `run_status`.
+ * An existing row for the same (entry, model, stage) is overwritten and its
+ * `updated_at` refreshed.
+ */
+function markSubStage(
+  entryId: number,
+  modelName: string,
+  stage: SubStage,
+  status: "complete" | "needs_review",
+): void {
+  const db = openDb();
+  try {
+    db.prepare(
+      `INSERT INTO run_status (entry_id, model_name, stage, status)
+     VALUES (?, ?, ?, ?)
+     ON CONFLICT (entry_id, model_name, stage)
+     DO UPDATE SET status = ?, updated_at = datetime('now')`,
+    ).run(entryId, modelName, stage, status, status);
+  } finally {
+    // Close the handle even when the statement throws
+    db.close();
+  }
+}
+
+// ── Write helpers ─────────────────────────────────────────────────────────────
+
+/**
+ * Stores a model's improved gloss in `generated_glosses`.
+ * Does nothing for an "ok" result; an existing row for the same entry/model is
+ * kept as is.
+ */
+function writeGloss(
+  entryId: number,
+  modelName: string,
+  result: GlossResult,
+): void {
+  if (result.status !== "improved") return;
+
+  const db = openDb();
+  try {
+    db.prepare(
+      `INSERT INTO generated_glosses (entry_id, model_name, text)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    ).run(entryId, modelName, result.gloss);
+  } finally {
+    // Close the handle even when the statement throws
+    db.close();
+  }
+}
+
+/**
+ * Stores a model's improved example in `generated_examples`.
+ * Does nothing for an "ok" result; an existing row for the same entry/model is
+ * kept as is.
+ */
+function writeExample(
+  entryId: number,
+  modelName: string,
+  result: ExampleResult,
+): void {
+  if (result.status !== "improved") return;
+
+  const db = openDb();
+  try {
+    db.prepare(
+      `INSERT INTO generated_examples (entry_id, model_name, text)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    ).run(entryId, modelName, result.example);
+  } finally {
+    // Close the handle even when the statement throws
+    db.close();
+  }
+}
+
+/**
+ * Persists a model's translation review in one transaction:
+ * a row in `model_translation_rejections` for every translation voted
+ * "reject", and a row in `generated_translations` for every generated word.
+ * Existing rows are kept as is.
+ *
+ * Statements are prepared once, before the transaction, and the database
+ * handle is closed even when a statement throws.
+ */
+function writeTranslations(
+  entryId: number,
+  modelName: string,
+  result: TranslationResult,
+  translations: TranslationRow[],
+): void {
+  const db = openDb();
+
+  try {
+    const insertRejection = db.prepare(
+      `INSERT INTO model_translation_rejections (translation_id, model_name)
+           VALUES (?, ?)
+           ON CONFLICT (translation_id, model_name) DO NOTHING`,
+    );
+    const insertGenerated = db.prepare(
+      `INSERT INTO generated_translations (entry_id, model_name, target_lang, word)
+           VALUES (?, ?, ?, ?)
+           ON CONFLICT (entry_id, model_name, target_lang) DO NOTHING`,
+    );
+
+    db.transaction(() => {
+      // Write rejections
+      for (const t of translations) {
+        const vote = result.translations[t.target_lang]?.[t.word];
+        if (vote === "reject") {
+          insertRejection.run(t.id, modelName);
+        }
+      }
+
+      // Write generated translations
+      for (const [lang, word] of Object.entries(result.generated ?? {})) {
+        insertGenerated.run(entryId, modelName, lang, word);
+      }
+    })();
+  } finally {
+    db.close();
+  }
+}
+
+/**
+ * Persists a model's CEFR votes in one transaction: the headword level in
+ * `model_entry_cefr_votes`, and one row in `model_translation_cefr_votes` for
+ * each given translation that received a known CEFR level. Existing rows are
+ * kept as is.
+ *
+ * Statements are prepared once, before the transaction, and the database
+ * handle is closed even when a statement throws.
+ */
+function writeCefr(
+  entryId: number,
+  modelName: string,
+  result: CefrResult,
+  translations: TranslationRow[],
+): void {
+  const db = openDb();
+
+  try {
+    const insertEntryVote = db.prepare(
+      `INSERT INTO model_entry_cefr_votes (entry_id, model_name, cefr_level)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    );
+    const insertTranslationVote = db.prepare(
+      `INSERT INTO model_translation_cefr_votes (translation_id, model_name, cefr_level)
+           VALUES (?, ?, ?)
+           ON CONFLICT (translation_id, model_name) DO NOTHING`,
+    );
+
+    db.transaction(() => {
+      // Headword CEFR
+      insertEntryVote.run(entryId, modelName, result.headword_cefr);
+
+      // Translation CEFR votes
+      for (const t of translations) {
+        const level = result.translation_cefr[t.target_lang]?.[t.word];
+        if (level && CEFR_SET.has(level)) {
+          insertTranslationVote.run(t.id, modelName, level);
+        }
+      }
+    })();
+  } finally {
+    db.close();
+  }
+}
+
+// ── Progress ──────────────────────────────────────────────────────────────────
+
+/**
+ * Rewrites the single-line progress indicator on stdout.
+ *
+ * @param processed   entries finished successfully so far
+ * @param needsReview entries flagged for review so far
+ * @param total       entries in this run
+ * @param llmMs       duration of the most recent LLM call, in milliseconds
+ * @param startTime   `Date.now()` value taken when the run started
+ *
+ * The ETA reads "calculating..." only while no throughput is known yet; once
+ * everything is handled it shows "0s". Elapsed time is rounded to whole
+ * seconds before being split into minutes and seconds, so it never renders as
+ * "1m 60s". A `total` of 0 is shown as 0.0% instead of NaN.
+ */
+function updateProgress(
+  processed: number,
+  needsReview: number,
+  total: number,
+  llmMs: number,
+  startTime: number,
+): void {
+  const totalProcessed = processed + needsReview;
+  const pct = total > 0 ? ((totalProcessed / total) * 100).toFixed(1) : "0.0";
+  const elapsed = (Date.now() - startTime) / 1000;
+  const rate = elapsed > 0 ? totalProcessed / elapsed : 0;
+
+  let eta: string;
+  if (rate === 0) {
+    // Nothing handled yet (or no time has passed): no basis for an estimate
+    eta = "calculating...";
+  } else {
+    const remaining = Math.max(total - totalProcessed, 0) / rate;
+    eta =
+      remaining < 60
+        ? `${Math.round(remaining)}s`
+        : `${Math.round(remaining / 60)}m`;
+  }
+
+  const elapsedWholeSec = Math.round(elapsed);
+  const totalElapsedStr =
+    elapsedWholeSec < 60
+      ? `${elapsedWholeSec}s`
+      : `${Math.floor(elapsedWholeSec / 60)}m ${elapsedWholeSec % 60}s`;
+
+  // Leading \r returns to column 0 so each update overwrites the previous one
+  process.stdout.write(
+    `\r    ${totalProcessed}/${total} (${pct}%) — entry: ${(llmMs / 1000).toFixed(1)}s — total: ${totalElapsedStr} — ETA: ${eta}    `,
+  );
+}
+
+// ── Main enrich function ──────────────────────────────────────────────────────
+
+/**
+ * Runs the currently active enrichment sub-stage (gloss review) for one
+ * provider over English entries.
+ *
+ * Entries whose `round1_gloss` status is already "complete" for this provider
+ * are skipped; at most `maxEntries` of the remaining ones are processed per
+ * call. For each entry the model reply is validated, an improved gloss is
+ * stored, and the sub-stage is marked "complete" — or "needs_review" when the
+ * reply is invalid or the call fails.
+ *
+ * When a shutdown signal interrupts an LLM call, the loop stops without
+ * touching that entry's status, so the entry is simply picked up again on the
+ * next run instead of being flagged for review.
+ *
+ * @param provider   provider/model to query; `provider.name` keys all stored rows
+ * @param maxEntries cap on entries handled in this call (default 50); pass
+ *   `Infinity` to process everything that is pending
+ * @returns counts for this call: `processed` (completed), `needsReview`
+ *   (flagged), and `skipped` (entries that were already complete beforehand)
+ */
+export async function enrich(
+  provider: ProviderConfig,
+  maxEntries = 50,
+): Promise<{ processed: number; skipped: number; needsReview: number }> {
+  registerEnrichShutdown();
+
+  let allEntries: EntryRow[];
+  let completeIds: Set<number>;
+
+  const db = openDb();
+  try {
+    allEntries = db
+      .prepare(`SELECT * FROM entries WHERE language = 'en'`)
+      .all() as EntryRow[];
+
+    // Only round1_gloss is consulted: it is the only sub-stage that is active
+    // right now. Once sub-stages 2–4 are enabled, an entry should count as
+    // complete only when all four sub-stages are complete.
+    const completeEntries = db
+      .prepare(
+        `SELECT entry_id FROM run_status
+       WHERE model_name = ? AND stage = 'round1_gloss'
+       AND status = 'complete'`,
+      )
+      .all(provider.name) as { entry_id: number }[];
+
+    completeIds = new Set(completeEntries.map((r) => r.entry_id));
+  } finally {
+    db.close();
+  }
+
+  const pending = allEntries
+    .filter((e) => !completeIds.has(e.id))
+    .slice(0, maxEntries);
+
+  console.log(`\n  Model: ${provider.name}`);
+  console.log(`  Total entries: ${allEntries.length.toLocaleString()}`);
+  console.log(`  Already complete: ${completeIds.size.toLocaleString()}`);
+  console.log(`  Pending: ${pending.length.toLocaleString()}`);
+
+  if (pending.length === 0) {
+    console.log("  Nothing to process.");
+    return { processed: 0, skipped: completeIds.size, needsReview: 0 };
+  }
+
+  let processedCount = 0;
+  let needsReviewCount = 0;
+  let llmMs = 0;
+  const startTime = Date.now();
+
+  for (const entry of pending) {
+    if (shutdownRequested) break;
+
+    // Source translations of this entry; consumed by sub-stages 3 and 4 below
+    let translations: TranslationRow[];
+    const db2 = openDb();
+    try {
+      translations = db2
+        .prepare(
+          `SELECT id, target_lang, word FROM translations WHERE entry_id = ? AND source = 'kaikki'`,
+        )
+        .all(entry.id) as TranslationRow[];
+    } finally {
+      db2.close();
+    }
+
+    let entryFailed = false;
+
+    // ── Sub-stage 1: Gloss ────────────────────────────────────────────────────
+
+    let verifiedGloss = entry.gloss ?? "";
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_gloss") !== "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(buildGlossPrompt(entry), provider);
+        llmMs = Date.now() - llmStart;
+
+        const result = validateGloss(raw);
+        if (!result) {
+          markSubStage(entry.id, provider.name, "round1_gloss", "needs_review");
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_gloss — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeGloss(entry.id, provider.name, result);
+          if (result.status === "improved") verifiedGloss = result.gloss;
+          markSubStage(entry.id, provider.name, "round1_gloss", "complete");
+        }
+      } catch (err) {
+        llmMs = 0;
+        // The call was interrupted by a shutdown signal, not by a bad reply:
+        // leave the entry untouched so the next run retries it.
+        if (shutdownRequested) break;
+
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_gloss", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_gloss — ${message}`,
+        );
+        entryFailed = true;
+      }
+    }
+
+    if (entryFailed) {
+      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
+      continue;
+    }
+
+    /*
+    // ── Sub-stages 2, 3, 4 — not yet active ──────────────────────────────────
+    // ── Sub-stage 2: Example ──────────────────────────────────────────────────
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_example") !==
+      "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildExamplePrompt(entry, verifiedGloss),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateExample(raw);
+        if (!result) {
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_example",
+            "needs_review",
+          );
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_example — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeExample(entry.id, provider.name, result);
+          markSubStage(entry.id, provider.name, "round1_example", "complete");
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_example", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_example — ${message}`,
+        );
+        entryFailed = true;
+      }
+    }
+
+    if (entryFailed) {
+      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
+      continue;
+    }
+
+    // ── Sub-stage 3: Translations ─────────────────────────────────────────────
+
+    const validatedTranslations = new Map<SupportedLanguageCode, string[]>();
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_translations") !==
+      "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildTranslationsPrompt(entry, translations, verifiedGloss),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateTranslations(raw, translations);
+        if (!result) {
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_translations",
+            "needs_review",
+          );
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_translations — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeTranslations(entry.id, provider.name, result, translations);
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_translations",
+            "complete",
+          );
+
+          // Build validated translations map for CEFR sub-stage
+          // Include kaikki translations that were ok'd + generated translations
+          for (const t of translations) {
+            const vote = result.translations[t.target_lang]?.[t.word];
+            if (vote === "ok") {
+              if (!validatedTranslations.has(t.target_lang)) {
+                validatedTranslations.set(t.target_lang, []);
+              }
+              validatedTranslations.get(t.target_lang)!.push(t.word);
+            }
+          }
+          if (result.generated) {
+            for (const [lang, word] of Object.entries(result.generated)) {
+              const l = lang as SupportedLanguageCode;
+              if (!validatedTranslations.has(l))
+                validatedTranslations.set(l, []);
+              validatedTranslations.get(l)!.push(word);
+            }
+          }
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(
+          entry.id,
+          provider.name,
+          "round1_translations",
+          "needs_review",
+        );
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_translations — ${message}`,
+        );
+        entryFailed = true;
+      }
+    } else {
+      // Already complete — rebuild validated translations from db
+      const db3 = openDb();
+      const rejections = new Set(
+        (
+          db3
+            .prepare(
+              `SELECT translation_id FROM model_translation_rejections WHERE model_name = ?`,
+            )
+            .all(provider.name) as { translation_id: number }[]
+        ).map((r) => r.translation_id),
+      );
+      for (const t of translations) {
+        if (!rejections.has(t.id)) {
+          if (!validatedTranslations.has(t.target_lang)) {
+            validatedTranslations.set(t.target_lang, []);
+          }
+          validatedTranslations.get(t.target_lang)!.push(t.word);
+        }
+      }
+      const generated = db3
+        .prepare(
+          `SELECT target_lang, word FROM generated_translations WHERE entry_id = ? AND model_name = ?`,
+        )
+        .all(entry.id, provider.name) as {
+        target_lang: SupportedLanguageCode;
+        word: string;
+      }[];
+      for (const g of generated) {
+        if (!validatedTranslations.has(g.target_lang))
+          validatedTranslations.set(g.target_lang, []);
+        validatedTranslations.get(g.target_lang)!.push(g.word);
+      }
+      db3.close();
+    }
+
+    if (entryFailed) {
+      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
+      continue;
+    }
+
+    // ── Sub-stage 4: CEFR ─────────────────────────────────────────────────────
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_cefr") !== "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildCefrPrompt(entry, verifiedGloss, validatedTranslations),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateCefr(raw, validatedTranslations);
+        if (!result) {
+          markSubStage(entry.id, provider.name, "round1_cefr", "needs_review");
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_cefr — invalid response`,
+          );
+          needsReviewCount++;
+        } else {
+          // Get translation rows for validated words only
+          const validatedRows = translations.filter((t) => {
+            return validatedTranslations.get(t.target_lang)?.includes(t.word);
+          });
+          writeCefr(entry.id, provider.name, result, validatedRows);
+          markSubStage(entry.id, provider.name, "round1_cefr", "complete");
+          processedCount++;
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_cefr", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_cefr — ${message}`,
+        );
+        needsReviewCount++;
+      }
+    } else {
+      processedCount++;
+    }
+
+    */
+
+    processedCount++;
+    updateProgress(
+      processedCount,
+      needsReviewCount,
+      pending.length,
+      llmMs,
+      startTime,
+    );
+  }
+
+  process.stdout.write("\n");
+  const totalMs = Date.now() - startTime;
+  const totalMin = Math.floor(totalMs / 60_000);
+  const totalSec = Math.round((totalMs % 60_000) / 1000);
+  console.log(`  Total time: ${totalMin}m ${totalSec}s`);
+  console.log(
+    `  Avg per entry: ${(totalMs / Math.max(processedCount + needsReviewCount, 1) / 1000).toFixed(1)}s`,
+  );
+  console.log(`  Processed: ${processedCount.toLocaleString()}`);
+  console.log(`  Needs review: ${needsReviewCount.toLocaleString()}`);
+
+  return {
+    processed: processedCount,
+    skipped: completeIds.size,
+    needsReview: needsReviewCount,
+  };
+}
--- a/data-pipeline/tests/validation/db-import.validation.test.ts
+++ b/data-pipeline/tests/validation/db-import.validation.test.ts
@ -0,0 +1,230 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { describe, it, expect, beforeAll } from "vitest";
+import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+type ExtractedSense = {
+  headword: string;
+  language: SupportedLanguageCode;
+  pos: SupportedPos;
+  sense_index: number;
+  gloss: string | null;
+  examples: string[];
+  translations: {
+    target_lang: SupportedLanguageCode;
+    word: string;
+    sense_hint: string | null;
+  }[];
+};
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const DB_PATH = path.resolve("db/pipeline.db");
+const OUTPUT_DIR = path.resolve("stage-1-extract/output");
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/**
+ * Reports whether the pipeline database file at `DB_PATH` can be accessed.
+ *
+ * @returns `true` when `fs.access` succeeds, `false` when it rejects for any
+ *   reason; this function itself never rejects.
+ */
+async function dbExists(): Promise<boolean> {
+  return fs.access(DB_PATH).then(
+    () => true,
+    () => false,
+  );
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("pipeline.db — import validation", () => {
+  let db: import("better-sqlite3").Database;
+  let expectedEntriesByLang: Map<SupportedLanguageCode, number>;
+  let expectedTotalTranslations: number;
+
+  beforeAll(async () => {
+    if (!(await dbExists())) return;
+
+    const Database = (await import("better-sqlite3")).default;
+    db = new Database(DB_PATH, { readonly: true });
+    db.pragma("foreign_keys = ON");
+
+    expectedEntriesByLang = new Map();
+    expectedTotalTranslations = 0;
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      try {
+        const raw = await fs.readFile(
+          path.join(OUTPUT_DIR, `${lang}.json`),
+          "utf-8",
+        );
+        const senses = JSON.parse(raw) as ExtractedSense[];
+        expectedEntriesByLang.set(lang, senses.length);
+        if (lang === "en") {
+          for (const sense of senses) {
+            expectedTotalTranslations += sense.translations.length;
+          }
+        }
+      } catch {
+        expectedEntriesByLang.set(lang, 0);
+      }
+    }
+  }, 30_000);
+
+  it("pipeline.db exists — skipping all tests if not", async () => {
+    const exists = await dbExists();
+    if (!exists) {
+      console.warn(
+        "\n  pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
+      );
+    }
+    expect(exists).toBe(true);
+  });
+
+  it("entry count per language matches source files", () => {
+    if (!db) return;
+    const errors: string[] = [];
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const expected = expectedEntriesByLang.get(lang) ?? 0;
+      const row = db
+        .prepare("SELECT COUNT(*) as count FROM entries WHERE language = ?")
+        .get(lang) as { count: number };
+
+      if (row.count !== expected) {
+        errors.push(`${lang}: expected ${expected} entries, got ${row.count}`);
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("translation count matches source files plus reverse links", () => {
+    if (!db) return;
+    const row = db
+      .prepare("SELECT COUNT(*) as count FROM translations")
+      .get() as { count: number };
+    const reverseLinks = db
+      .prepare(
+        "SELECT COUNT(*) as count FROM translations WHERE source = 'reverse_link'",
+      )
+      .get() as { count: number };
+    expect(row.count).toBe(expectedTotalTranslations + reverseLinks.count);
+  });
+
+  it("every translation references a valid entry", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `SELECT t.id, t.entry_id
+         FROM translations t
+         LEFT JOIN entries e ON e.id = t.entry_id
+         WHERE e.id IS NULL`,
+      )
+      .all() as { id: number; entry_id: number }[];
+
+    const errors = rows.map(
+      (r) => `translation ${r.id}: references missing entry ${r.entry_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every entry has a valid language code", () => {
+    if (!db) return;
+    const validLangs = SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", ");
+    const rows = db
+      .prepare(
+        `SELECT id, headword, language FROM entries
+         WHERE language NOT IN (${validLangs})`,
+      )
+      .all() as { id: number; headword: string; language: string }[];
+
+    const errors = rows.map(
+      (r) => `entry ${r.id} "${r.headword}": invalid language "${r.language}"`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every entry has a valid pos", () => {
+    if (!db) return;
+    // Build the allowed list from the shared constant (same approach as the
+    // language-code check) so this test cannot drift from the parts of speech
+    // the rest of the pipeline accepts.
+    const validPos = [...SUPPORTED_POS].map((p) => `'${p}'`).join(", ");
+    const rows = db
+      .prepare(
+        // `NOT IN` evaluates to NULL (not true) for a NULL pos, so NULLs are
+        // matched explicitly to keep them from slipping through.
+        `SELECT id, headword, pos FROM entries
+         WHERE pos IS NULL OR pos NOT IN (${validPos})`,
+      )
+      .all() as { id: number; headword: string; pos: string | null }[];
+
+    const errors = rows.map(
+      (r) => `entry ${r.id} "${r.headword}": invalid pos "${r.pos}"`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("sense_index is unique per headword, language, pos", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `SELECT headword, language, pos, sense_index, COUNT(*) as c
+         FROM entries
+         GROUP BY headword, language, pos, sense_index
+         HAVING c > 1`,
+      )
+      .all() as {
+      headword: string;
+      language: string;
+      pos: string;
+      sense_index: number;
+      c: number;
+    }[];
+
+    const errors = rows.map(
+      (r) =>
+        `"${r.headword}" (${r.language} ${r.pos}): duplicate sense_index ${r.sense_index} (${r.c} rows)`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("non-English entries have no Kaikki translations", () => {
+    if (!db) return;
+    const nonEnLangs = SUPPORTED_LANGUAGE_CODES.filter((l) => l !== "en")
+      .map((l) => `'${l}'`)
+      .join(", ");
+
+    const rows = db
+      .prepare(
+        `SELECT e.headword, e.language, COUNT(t.id) as c
+         FROM entries e
+         JOIN translations t ON t.entry_id = e.id
+         WHERE e.language IN (${nonEnLangs})
+         AND t.source = 'kaikki'
+         GROUP BY e.id`,
+      )
+      .all() as { headword: string; language: string; c: number }[];
+
+    const errors = rows.map(
+      (r) =>
+        `"${r.headword}" (${r.language}): unexpected ${r.c} Kaikki translations`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("all Kaikki translation target languages are supported and not English", () => {
+    if (!db) return;
+    const validLangs = SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", ");
+
+    const rows = db
+      .prepare(
+        `SELECT t.id, t.target_lang
+         FROM translations t
+         WHERE t.source = 'kaikki'
+         AND (t.target_lang NOT IN (${validLangs}) OR t.target_lang = 'en')`,
+      )
+      .all() as { id: number; target_lang: string }[];
+
+    const errors = rows.map(
+      (r) => `translation ${r.id}: invalid target_lang "${r.target_lang}"`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+});
--- a/data-pipeline/tests/validation/stage-1.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@ -0,0 +1,192 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { describe, it, expect, beforeAll } from "vitest";
+import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+type ExtractedSense = {
+  headword: string;
+  language: SupportedLanguageCode;
+  pos: SupportedPos;
+  sense_index: number;
+  gloss: string | null;
+  examples: string[];
+  translations: {
+    target_lang: SupportedLanguageCode;
+    word: string;
+    sense_hint: string | null;
+  }[];
+};
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const OUTPUT_DIR = path.resolve("stage-1-extract/output");
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("stage 1 — Kaikki extraction output validation", () => {
+  const sensesByLang = new Map<SupportedLanguageCode, ExtractedSense[]>();
+
+  // Loads every language's stage-1 output once for the whole suite.
+  // A missing file is recorded as an empty list instead of throwing, so the
+  // dedicated "output files exist" and "non-empty array" tests report it with
+  // a readable message rather than the hook failing every test with ENOENT.
+  // Any other failure (permissions, malformed JSON) still aborts the suite.
+  beforeAll(async () => {
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
+      let raw: string;
+      try {
+        raw = await fs.readFile(filePath, "utf-8");
+      } catch (err: unknown) {
+        const code =
+          err instanceof Error ? (err as NodeJS.ErrnoException).code : undefined;
+        if (code !== "ENOENT") throw err;
+        sensesByLang.set(lang, []);
+        continue;
+      }
+      sensesByLang.set(lang, JSON.parse(raw) as ExtractedSense[]);
+    }
+  }, 30_000);
+
+  it("all five language output files exist", async () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      try {
+        await fs.access(path.join(OUTPUT_DIR, `${lang}.json`));
+      } catch {
+        errors.push(`missing: ${lang}.json`);
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every language file is a non-empty array", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const senses = sensesByLang.get(lang)!;
+      if (!Array.isArray(senses)) errors.push(`${lang}: not an array`);
+      else if (senses.length === 0) errors.push(`${lang}: empty array`);
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every sense has required fields", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesByLang.get(lang)!) {
+        if (!sense.headword) errors.push(`${lang}: sense missing headword`);
+        if (!sense.language)
+          errors.push(`${lang} ${sense.headword}: missing language`);
+        if (!sense.pos) errors.push(`${lang} ${sense.headword}: missing pos`);
+        if (sense.sense_index === undefined)
+          errors.push(`${lang} ${sense.headword}: missing sense_index`);
+        if (!Array.isArray(sense.examples))
+          errors.push(`${lang} ${sense.headword}: examples not an array`);
+        if (!Array.isArray(sense.translations))
+          errors.push(`${lang} ${sense.headword}: translations not an array`);
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every sense has a valid pos", () => {
+    const errors: string[] = [];
+    const validPos = new Set(SUPPORTED_POS);
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesByLang.get(lang)!) {
+        if (!validPos.has(sense.pos)) {
+          errors.push(`${lang} ${sense.headword}: invalid pos "${sense.pos}"`);
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every sense language code matches its file", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesByLang.get(lang)!) {
+        if (sense.language !== lang) {
+          errors.push(
+            `${lang} ${sense.headword}: language field "${sense.language}" does not match file`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("no abbreviation senses in output", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesByLang.get(lang)!) {
+        if (sense.gloss?.toLowerCase().startsWith("abbreviation of")) {
+          errors.push(
+            `${lang} ${sense.headword}: abbreviation sense not filtered`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("English senses all have at least one translation", () => {
+    const errors: string[] = [];
+    for (const sense of sensesByLang.get("en")!) {
+      if (sense.translations.length === 0) {
+        errors.push(
+          `en ${sense.headword} (sense ${sense.sense_index}): no translations`,
+        );
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("non-English senses have no translations", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      if (lang === "en") continue;
+      for (const sense of sensesByLang.get(lang)!) {
+        if (sense.translations.length > 0) {
+          errors.push(
+            `${lang} ${sense.headword}: unexpected translations in non-English file`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("all translation target languages are supported and not English", () => {
+    const errors: string[] = [];
+    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
+    for (const sense of sensesByLang.get("en")!) {
+      for (const t of sense.translations) {
+        if (!validLangs.has(t.target_lang)) {
+          errors.push(
+            `en ${sense.headword}: unsupported translation language "${t.target_lang}"`,
+          );
+        }
+        if (t.target_lang === "en") {
+          errors.push(
+            `en ${sense.headword}: translation to same language "en"`,
+          );
+        }
+        if (!t.word?.trim()) {
+          errors.push(
+            `en ${sense.headword}: empty translation word for ${t.target_lang}`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("sense_index is unique per headword and pos within each language", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const seen = new Map<string, Set<number>>();
+      for (const sense of sensesByLang.get(lang)!) {
+        const key = `${sense.headword}|${sense.pos}`;
+        if (!seen.has(key)) seen.set(key, new Set());
+        const indexes = seen.get(key)!;
+        if (indexes.has(sense.sense_index)) {
+          errors.push(
+            `${lang} ${sense.headword} (${sense.pos}): duplicate sense_index ${sense.sense_index}`,
+          );
+        }
+        indexes.add(sense.sense_index);
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+});
--- a/data-pipeline/tsconfig.json
+++ b/data-pipeline/tsconfig.json
@ -8,5 +8,5 @@
    "types": ["node"]
  },
  "references": [{ "path": "../packages/shared" }],
-  "include": ["./**/*"]
+  "include": ["./**/*", "vitest.config.ts"]
 }
--- a/data-pipeline/vitest.config.ts
+++ b/data-pipeline/vitest.config.ts
@ -0,0 +1,11 @@
+import { defineConfig } from "vitest/config";
+
+export default defineConfig({
+  test: {
+    environment: "node",
+    globals: true,
+    include: ["tests/**/*.test.ts"],
+    exclude: ["**/dist/**", "**/node_modules/**"],
+    testTimeout: 60_000,
+  },
+});
--- a/documentation/data-pipeline.md
+++ b/documentation/data-pipeline.md
@ -1,335 +1,302 @@
 # lila data pipeline

-> **NOTE: BEFORE RUNNING THE PIPELINE, CONSIDER IMPROVING THE CEFR SOURCE
-> FILES IN `stage-2-annotate/sources/cefr/`. BETTER SOURCE COVERAGE MEANS
-> FEWER WORDS FOR THE LLM TO ANNOTATE FROM SCRATCH, FASTER OVERNIGHT RUNS,
-> AND HIGHER CONFIDENCE IN THE FINAL OUTPUT. SEE UNIVERSALCEFR
-> (huggingface.co/UniversalCEFR) AND CEFR-J
-> (github.com/openlanguageprofiles/olp-en-cefrj) AS STARTING POINTS.**
-
-This pipeline extracts vocabulary data from the Open Multilingual Wordnet (OMW), annotates it with CEFR levels from curated source files, verifies and enriches annotations using local LLMs, and produces authoritative JSON files per language. These files are consumed by the seeder in `packages/db` to populate the database with terms, translations, glosses, CEFR levels, difficulty ratings, and LLM-generated descriptions.
+This pipeline extracts vocabulary data from Wiktionary via the Kaikki dataset, enriches it with CEFR levels and fills content gaps using local LLMs, and produces authoritative output in `pipeline.db`. This database is consumed by the sync script to populate the production database with vocabulary entries, translations, glosses, CEFR levels, and difficulty ratings.

 ## Overview

 ```mermaid
 flowchart LR
-    omw[(OMW SQLite DBs)]
-    cefr[(CEFR JSON files)]
+    kaikki[(Kaikki JSONL)]
    extract[Extract]
-    annotate[Annotate]
+    reverselink[Reverse Link Sync]
    enrich[Enrich]
+    pipelinedb[(pipeline.db)]
    merge[Merge]
-    final[(final/lang.json)]
-    flagged[(flagged/lang.json)]
-    seeder[packages/db seeder]
-    db[(Database)]
+    tiebreak[Tiebreak]
+    compare[Compare]
+    sync[Sync]
+    db[(PostgreSQL)]

-    omw --> extract
-    cefr --> annotate
-    extract --> annotate
-    annotate --> enrich
-    enrich --> merge
-    merge --> final
-    merge --> flagged
-    final --> seeder
-    seeder --> db
+    kaikki --> extract
+    extract --> pipelinedb
+    pipelinedb --> reverselink
+    reverselink --> pipelinedb
+    pipelinedb --> enrich
+    enrich --> pipelinedb
+    pipelinedb --> merge
+    merge --> pipelinedb
+    pipelinedb --> tiebreak
+    tiebreak --> pipelinedb
+    pipelinedb --> compare
+    pipelinedb --> sync
+    sync --> db
 ```

-Each stage is a standalone script that reads from the previous stage's output and produces one JSON file per language. Stages can be re-run independently without affecting earlier or later stages.
+Each stage is a standalone script that reads from and writes to `pipeline.db`. The pipeline is fully resumable — interrupted overnight runs pick up from the last processed record without losing work.

-The enrich stage is the exception — it produces one checkpoint file per model run per language, plus a compiled votes file once all runs are complete. It is designed to run overnight, one model at a time, and is fully resumable if interrupted.
+Stage 1 is a manual prerequisite and is not run by the pipeline orchestrator. See **Stage 1 — Extract** for instructions.

-Only fully annotated output in `stage-4-merge/output/final/` reaches the database. Words where LLMs could not reach a majority vote land in `stage-4-merge/output/flagged/` and wait for manual review before seeding.
+The enrich stage is designed to run overnight, one model at a time. Each model processes every entry and writes results to `pipeline.db` atomically per record.

-## Data sources
+Only fully resolved records reach the production database. Records where LLMs could not reach a majority vote are handled automatically by the tiebreaker stage before syncing.

-### OMW / WordNet
+## pipeline.db

-The Open Multilingual Wordnet (OMW) is the base vocabulary source. It provides synsets — groups of synonymous words — with translations and glosses across multiple languages. One SQLite database per language is downloaded and placed in `sources/omw/`. These files are not committed to git.
+All pipeline state is stored in `pipeline.db` — a SQLite database in `data-pipeline/db/`. It is created automatically on first run and is not committed to git.

-All four parts of speech are extracted: noun, verb, adjective, adverb. WordNet's adjective satellites are collapsed into adjective — this is a WordNet-internal distinction that has no relevance for language learning. Alongside translations and glosses, usage examples are extracted where available and stored in the database as term_examples.
+The database serves three purposes:

-See **Setup** for download instructions.
+- **Resumability** — every record is written atomically with a status. Interrupted overnight runs resume from the last pending record without losing work.
+- **Vote tracking** — all model votes for CEFR levels and generated content are stored per model per record, giving full auditability of how every decision was reached.
+- **Resolved output** — the final resolved records live here and are read by the sync script to seed the production database.

-### CEFR source files
+The schema is defined in `data-pipeline/db/schema.sql`. Never edit `pipeline.db` directly — all writes go through the pipeline scripts.

-Per-language JSON files in `sources/cefr/` provide the initial CEFR level annotations. These files do not cover the full vocabulary extracted from OMW — coverage varies by language. Gaps and disagreements are handled by the enrich stage.
+On first run the orchestrator initialises `pipeline.db` automatically and imports the stage 1 output into the base tables. This happens once — subsequent runs skip the import if the base tables are already populated.

-| Language | File                   |
-| -------- | ---------------------- |
-| English  | `sources/cefr/en.json` |
-| Italian  | `sources/cefr/it.json` |
-| Spanish  | `sources/cefr/es.json` |
-| German   | `sources/cefr/de.json` |
-| French   | `sources/cefr/fr.json` |
+## Common commands

-These files are committed to git. For per-language coverage detail see `COVERAGE.md`.
+### Starting llama.cpp

-### CEFR annotation and verification
+```bash
+cd ~/Downloads/llama.cpp
+./build/bin/llama-server \
+  --model models/qwen3.5-4b-q4_k_m.gguf \
+  --port 8080 \
+  --ctx-size 4096 \
+  --n-gpu-layers 999 \
+  --host 127.0.0.1 \
+  --chat-template-kwargs '{"enable_thinking":false}' \
+  --reasoning-budget 0
+```

-CEFR levels are determined by a majority vote combining all available sources:
+Verify the server is running:

- The CEFR source file counts as one vote (if it has an entry for the word)
- Each LLM model run counts as one vote
+```bash
+curl http://127.0.0.1:8080/health
+```

-The LLMs verify existing annotations as well as filling gaps — a source file entry does not automatically win. Majority vote across all sources determines the final level.
+### Running the pipeline

-If no majority is reached, the word is flagged for manual review and excluded from the database until resolved.
+```bash
+pnpm --filter @lila/pipeline pipeline:run
+```
+
+The pipeline auto-generates a run name from the date and a counter. It picks up where it left off — completed stages are skipped automatically.
+
+### Stage 1 — Extract
+
+```bash
+pnpm --filter @lila/pipeline extract
+```
+
+Runs in sample mode (500 entries per language) by default. Remove the hardcoded limit in `stage-1-extract/scripts/extract.ts` for a full run.
+
+### Stage 2 — Reverse link sync
+
+```bash
+pnpm --filter @lila/pipeline reverse-link
+```
+
+### Initialising and importing the database
+
+```bash
+# Initialise pipeline.db from schema
+pnpm --filter @lila/pipeline db:init
+
+# Import stage 1 output into pipeline.db
+pnpm --filter @lila/pipeline db:import
+```
+
+### Resetting the database
+
+```bash
+# Full reset — delete and reinitialise
+rm data-pipeline/db/pipeline.db
+pnpm --filter @lila/pipeline db:init
+pnpm --filter @lila/pipeline db:import
+pnpm --filter @lila/pipeline reverse-link
+```
+
+### Resetting enrich stage progress
+
+```bash
+# Reset round 1 only
+pnpm --filter @lila/pipeline db:reset round1
+
+# Reset all stages except reverse link
+pnpm --filter @lila/pipeline db:reset all
+```
+
+### Checking pipeline progress
+
+```bash
+node -e "
+const Database = require('better-sqlite3');
+const db = new Database('data-pipeline/db/pipeline.db', { readonly: true });
+const total = db.prepare('SELECT COUNT(*) as c FROM entries WHERE language = \\'en\\'').get().c;
+const complete = db.prepare(\"SELECT COUNT(*) as c FROM run_status WHERE stage = 'round1' AND status = 'complete'\").get().c;
+const needsReview = db.prepare(\"SELECT COUNT(*) as c FROM run_status WHERE stage = 'round1' AND status = 'needs_review'\").get().c;
+console.log('Total English entries:', total);
+console.log('Round 1 complete:', complete);
+console.log('Needs review:', needsReview);
+console.log('Pending:', total - complete - needsReview);
+db.close();
+"
+```
+
+## Data source
+
+### Kaikki (Wiktionary)
+
+The pipeline uses pre-extracted Wiktionary data from [kaikki.org](https://kaikki.org), built with the [wiktextract](https://github.com/tatuylonen/wiktextract) tool. This data is updated weekly from the English Wiktionary dump and is freely available under the same license as Wiktionary (CC-BY-SA).
+
+**Why Kaikki instead of OMW:**
+Kaikki is structured per word sense. Each headword has multiple senses, and translations are linked to a specific sense rather than a general concept. This prevents the sense disambiguation problems found in OMW, where a single concept entry could contain translations from entirely different meanings of a word.
+
+Each Kaikki entry provides:
+
+- A headword in the entry language
+- One or more senses, each with a gloss and examples
+- Per-sense translations to other languages with sense hints
+- IPA pronunciations and audio file references (deferred — see **Further extensions**)
+- Inflected forms (deferred — see **Further extensions**)
+
+The pipeline uses the English Wiktionary edition (`enwiktionary`), which contains entries for all five supported languages with glosses in English.
+
+### CEFR levels
+
+CEFR levels are assigned entirely by LLM majority vote. Each model receives the headword, gloss, and an example sentence and votes on the appropriate level (A1–C2). There are no curated source files — the LLMs are the sole source of CEFR annotations.
+
+If no majority is reached after all model runs, the entry is handled automatically by the tiebreaker stage.

 ## Setup

-### OMW databases
+### Kaikki data files

-Download the OMW SQLite database for each language using the `wn` Python
-library:
+Download the pre-extracted Kaikki JSONL files for each language. These files are large, so keep them in `stage-1-extract/sources/`, which is excluded from git. Run the commands below from the `data-pipeline/` directory.

 ```bash
-python -m wn download omw-en:1.4
-python -m wn download omw-it:1.4
-python -m wn download omw-de:1.4
-python -m wn download omw-es:1.4
-python -m wn download omw-fr:1.4
-```
+mkdir -p stage-1-extract/sources
+cd stage-1-extract/sources

-The data is stored automatically at `~/.wn_data/wn.db` and is not committed
-to git.
+# English entries (contains translations to all other languages)
+wget https://kaikki.org/dictionary/English/kaikki.org-dictionary-English.jsonl.gz
+
+# Per-language files (for entries written in those languages)
+wget https://kaikki.org/dictionary/German/kaikki.org-dictionary-German.jsonl.gz
+wget https://kaikki.org/dictionary/Italian/kaikki.org-dictionary-Italian.jsonl.gz
+wget https://kaikki.org/dictionary/French/kaikki.org-dictionary-French.jsonl.gz
+wget https://kaikki.org/dictionary/Spanish/kaikki.org-dictionary-Spanish.jsonl.gz
+
+# Decompress
+gunzip *.gz
+```

 ### LLM setup

-See `LLM-SETUP.md`.
+See `llm-setup.md`.

 ## Pipeline stages

-The pipeline runs in five stages. Each stage is independent and can be re-run without affecting the others.
-
 | Stage           | What it does                                                             |
-| ----------- | -------------------------------------------------------------------- |
-| 1. Extract  | Reads OMW SQLite database, outputs normalized JSON per language      |
-| 2. Annotate | Merges CEFR source files into extracted data, adds source file votes |
-| 3. Enrich   | Runs local LLMs in two rounds — generation then voting               |
-| 4. Merge    | Resolves votes, derives difficulty, splits into final and flagged    |
-| 5. Compare  | Generates COVERAGE.md with detailed quality report                   |
+| --------------- | ------------------------------------------------------------------------ |
+| 1. Extract      | Parses Kaikki JSONL, imports entries into `pipeline.db`                  |
+| 2. Reverse link | Inserts missing reverse translations between language pairs              |
+| 3. Enrich       | LLMs fill translation gaps, improve glosses/examples, assign CEFR levels |
+| 4. Merge        | Resolves LLM votes into final values                                     |
+| 4b. Tiebreak    | Runs unused models on flagged entries until majority is reached          |
+| 5. Compare / QA | Generates `COVERAGE.md` with detailed quality report                     |
+| 6. Sync         | Upserts resolved records into production PostgreSQL                      |

 ### 1. Extract

-Reads the OMW SQLite database (`~/.wn_data/wn.db`) and produces a single normalized JSON file containing all synsets with their translations, glosses, and usage examples across all five languages and all parts of speech. Adjective satellites are collapsed into adjective at this stage.
+Parses the Kaikki JSONL files for all five languages and writes one normalized JSON file per language to `stage-1-extract/output/`. Filters to the four supported parts of speech: noun, verb, adjective, adverb. Once the output is imported into `pipeline.db` (via `db:import`, or automatically by the orchestrator on first run), each Kaikki sense becomes one row in `entries`, and its translations are stored in `translations` with their sense hints.

-**Input:** `~/.wn_data/wn.db`
-**Output:** `stage-1-extract/output/omw.json`
+**Input:** `stage-1-extract/sources/*.jsonl`
+**Output:** `stage-1-extract/output/{lang}.json` — one file per language, loaded into the `entries` and `translations` tables of `pipeline.db` by `db:import`

 ```bash
-python stage-1-extract/scripts/extract.py
+pnpm --filter @lila/pipeline extract
 ```

-Add `--sample` to extract 100 synsets for inspection before running the full
-extraction.
+Add `--sample 100` to import only 100 entries per language for inspection before running the full import.

-Each record in the output looks like this:
+Each entry in `pipeline.db` looks like this:

 ```json
 {
-  "source_id": "ili:i1",
-  "pos": "adjective",
-  "translations": {
-    "en": ["able"],
-    "it": ["abile", "intelligente", "valente", "capace"],
-    "es": ["capaz"],
-    "fr": ["comptable"]
+  "headword": "thrill",
+  "language": "en",
+  "pos": "verb",
+  "sense_index": 0,
+  "gloss": "To suddenly excite someone, or to give them great pleasure.",
+  "examples": ["The movie thrilled the audience."],
+  "translations": [
+    { "language": "de", "word": "begeistern", "sense_hint": "suddenly excite" },
+    {
+      "language": "fr",
+      "word": "enthousiasmer",
+      "sense_hint": "suddenly excite"
    },
-  "glosses": {
-    "en": [
-      "(usually followed by 'to') having the necessary means or skill or know-how or authority to do something"
+    { "language": "it", "word": "entusiasmare" },
+    { "language": "es", "word": "emocionar" }
  ]
-  },
-  "examples": { "en": ["able to swim", "she was able to program her computer"] }
 }
 ```

-Note: glosses and examples are not available for all languages. French and Spanish have no glosses or examples in the current OMW database — these will be generated by the LLM in the enrich stage. Coverage detail is in `COVERAGE.md`.
+> **Note:** Stage 1 is a manual prerequisite. It is not run by the pipeline orchestrator (`pipeline.ts`). Run it once before running the orchestrator for the first time, and re-run it manually if the Kaikki source files are updated.

-### 2. Annotate
+### 2. Reverse link sync

-Reads the combined OMW extract and merges CEFR source data into it. Each translation in each language is matched against the corresponding CEFR source
-file by word text and part of speech. Matched translations receive a `cefr_source` vote which carries into the enrich stage. Unmatched translations proceed without a vote.
+A pure script stage — no LLMs. For each translation pair in `translations`, checks whether the reverse link exists. If English _thrill → begeistern_ exists and the German entry _begeistern_ exists in `entries` but lacks the English back-link, the back-link is inserted automatically and tagged with `source = 'reverse_link'`.

-This stage also extracts native example sentences from the CEFR source files and adds them to the record alongside OMW examples, with `source: "cefr"` to distinguish them.
+This runs before the enrich stage so that LLMs only generate translations that are genuinely missing — not translations that would be found by a simple reverse lookup.

-Words appearing in the CEFR source file multiple times with different CEFR levels are written to `conflicts.json` for manual review and excluded from voting until resolved.
-
-**Input:** `stage-1-extract/output/omw.json` + `stage-2-annotate/sources/cefr/{lang}.json`
-**Output:**
-
- `stage-2-annotate/output/{lang}.json` — one per language
- `stage-2-annotate/output/conflicts.json` — cross-language conflicts for review
+**Input:** `pipeline.db` — populated `vocabulary_entries` and `entry_translations`
+**Output:** `pipeline.db` — missing reverse links inserted into `entry_translations`

 ```bash
-pnpm --filter @lila/pipeline annotate
+pnpm --filter @lila/pipeline reverse-link
 ```

-Each record in the output extends the OMW record with a `votes` field and any additional examples from the CEFR source file:
-
-```json
-{
-  "source_id": "ili:i1",
-  "pos": "adjective",
-  "translations": {
-    "en": ["able"],
-    "it": ["abile", "intelligente", "valente", "capace"],
-    "es": ["capaz"],
-    "fr": ["comptable"]
-  },
-  "glosses": { "en": ["having the necessary means or skill to do something"] },
-  "examples": {
-    "en": [
-      { "text": "able to swim", "source": "omw" },
-      { "text": "She was able to finish the task.", "source": "cefr" }
-    ]
-  },
-  "votes": { "en": { "able": { "cefr_source": "B1" } } }
-}
-```
-
-Words not present in the CEFR source file will have an empty `votes` object.
-
 ### 3. Enrich

-The enrich stage runs in two rounds, both designed to execute overnight one model at a time. The llama.cpp server must be running locally before starting either round. See `LLM-SETUP.md` for setup instructions.
+> **Note:** Before running this stage, ensure the llama.cpp server is running
+> locally. The orchestrator checks for a running server at
+> `http://127.0.0.1:8080/health` and exits with instructions if it is not
+> reachable. See `llm-setup.md` for setup instructions.

-**Round 1 — generation**
+The enrich stage runs in four ordered sub-stages per entry, designed to build context progressively. All output is written to `pipeline.db` atomically per sub-stage — runs are fully resumable if interrupted. Each model is run once — one model produces one vote per sub-stage.

-Each model processes every word in every language one term at a time and
-generates:
+**Sub-stage order:**

- A CEFR level vote for each translation
- A description for each language
- A translation for each language, only if OMW provides none
- A gloss for each language, only if OMW provides none
- Usage examples for each language, only if OMW provides none
+1. **`round1_gloss`** — the LLM reviews the existing gloss. If it is clear and learner-friendly, it confirms it. If not, it generates a better one.

-OMW data is never duplicated — the script checks what OMW already provides before building the prompt. For translations, glosses and examples, if OMW data exists for that language the LLM skips generation entirely. This significantly reduces compute time for languages with good OMW coverage such as English.
+2. **`round1_example`** — the LLM reviews the existing examples. If they are natural and suitable, it confirms them. If not, it generates one better example sentence in the entry language.

-All model-generated content is stored with an anonymised source (`model_1`, `model_2` etc.) so models cannot be biased by knowing who generated what in round 2.
+3. **`round1_translations`** — using the verified gloss as context, the LLM reviews each existing translation. Valid translations are confirmed. Invalid ones (wrong language, suffixes, garbled text, wrong sense) are explicitly rejected. Missing languages get a generated translation.

-**Input:** `stage-2-annotate/output/{lang}.json`
-**Output:** `stage-3-enrich/output/round1/{lang}_{model}.json` per run
+4. **`round1_cefr`** — using only the validated translations from the previous sub-stage, the LLM votes on the CEFR level for the headword and for each confirmed translation. Rejected translations never reach this sub-stage.

-```bash
-pnpm --filter @lila/pipeline enrich --round 1 --model {model}
-```
+This ordering ensures the CEFR voting sub-stage only sees clean, verified data.

-**Compiling candidates**
+All output is written to `pipeline.db` atomically per sub-stage per entry. Interrupted runs resume from the last incomplete sub-stage without losing work. Each model is run once — one model, one vote per sub-stage.

-Once all round 1 runs are complete, compile all generated candidates into a single structured file per language. This is the input to round 2.
+**Input:** `pipeline.db` — entries after reverse link sync
+**Output:** `pipeline.db` — gloss votes, example votes, translation votes, CEFR votes per entry per model

-**Input:** `stage-3-enrich/output/round1/{lang}_{model}.json`
-**Output:** `stage-3-enrich/output/candidates/{lang}_candidates.json`
-
-```bash
-pnpm --filter @lila/pipeline enrich --compile-candidates
-```
-
-**Round 2 — voting**
-
-Each model receives the compiled candidate list for every word and votes on:
-
- The best gloss candidate (if multiple exist)
- The best description candidate (if multiple exist)
- The best usage examples candidate (if multiple exist)
- A CEFR level vote for each translation
-
-OMW data is not put to a vote — it automatically wins over any LLM-generated candidate. Round 2 only resolves conflicts between model-generated candidates. The prompt is kept small — one word at a time, a clean numbered candidate list — to fit within a limited context window.
-
-**Input:** `stage-3-enrich/output/candidates/{lang}_candidates.json`
-**Output:** `stage-3-enrich/output/round2/{lang}_{model}.json` per run
-
-```bash
-pnpm --filter @lila/pipeline enrich --round 2 --model {model}
-```
-
-**Compiling votes**
-
-Once all round 2 runs are complete, compile all votes into a single file per language. This is the input to the merge stage.
-
-**Input:** `stage-3-enrich/output/round2/{lang}_{model}.json`
-**Output:** `stage-3-enrich/output/votes/{lang}_votes.json`
-
-```bash
-pnpm --filter @lila/pipeline enrich --compile-votes
-```
-
-Each record in the votes file looks like this:
-
-```json
-{
-  "source_id": "omw-en-12345",
-  "pos": "noun",
-  "translations": {
-    "en": [
-      {
-        "text": "dog",
-        "votes": { "cefr_source": "A1", "model_1": "A1", "model_2": "A1" }
-      },
-      {
-        "text": "canine",
-        "votes": { "cefr_source": "B2", "model_1": "B2", "model_2": "B1" }
-      }
-    ],
-    "it": [
-      {
-        "text": "cane",
-        "votes": { "cefr_source": "A1", "model_1": "A1", "model_2": "A1" }
-      }
-    ]
-  },
-  "glosses": {
-    "en": { "text": "a domesticated carnivorous mammal", "source": "omw" },
-    "fr": {
-      "candidates": [
-        { "text": "un mammifère carnivore domestiqué", "source": "model_1" },
-        { "text": "un animal domestique carnivore", "source": "model_2" }
-      ],
-      "votes": { "model_1": 1, "model_2": 1 }
-    }
-  },
-  "examples": {
-    "en": [{ "text": "the dog barked at the stranger", "source": "omw" }],
-    "fr": {
-      "candidates": [
-        { "text": "le chien a aboyé", "source": "model_1" },
-        { "text": "le chien gardait la maison", "source": "model_2" }
-      ],
-      "votes": { "model_1": 2, "model_2": 1 }
-    }
-  },
-  "descriptions": {
-    "en": {
-      "candidates": [
-        {
-          "text": "a common household pet known for loyalty",
-          "source": "model_1"
-        },
-        {
-          "text": "a domesticated animal and loyal companion",
-          "source": "model_2"
-        }
-      ],
-      "votes": { "model_1": 2, "model_2": 1 }
-    }
-  }
-}
-```
+> **Note:** The tiebreaker is not a standalone script. It runs automatically as part of the pipeline orchestrator after merge completes.

 ### 4. Merge

-Reads the votes file per language and resolves the final value for every field. Produces two output files per language — fully resolved records ready for seeding, and flagged records that need manual review.
+Reads all LLM votes from `pipeline.db` and resolves the final value for every field. Writes resolved entries back to `pipeline.db`.

 **Merge rules:**

- OMW data wins automatically and is never overridden
- For CEFR levels: the level with the most votes wins. If no majority is reached, that translation is flagged
- For LLM-generated text fields (gloss, examples, descriptions): the candidate with the most votes wins
-
-<!-- TODO: decide fallback strategy when no majority is reached for text fields -->
+- Kaikki source data wins automatically and is never overridden by LLM output
+- For CEFR levels: the level with the most votes wins. If no majority is reached, the entry is flagged for the tiebreaker
+- For LLM-generated text fields: the candidate with the most votes wins. If no majority is reached, the tiebreaker runs

 **Difficulty mapping:**

@ -339,93 +306,85 @@ Reads the votes file per language and resolves the final value for every field.
 | B1, B2 | intermediate |
 | C1, C2 | hard         |

-**Input:** `stage-3-enrich/output/votes/{lang}_votes.json`
-**Output:**
+**Input:** `pipeline.db` — LLM votes
+**Output:** `pipeline.db` — entries updated with resolved values or flagged status

- `stage-4-merge/output/final/{lang}.json` — fully resolved, ready for seeding
- `stage-4-merge/output/flagged/{lang}.json` — CEFR majority not reached, needs manual review before seeding
+### 4b. Tiebreak

-```bash
-pnpm --filter @lila/pipeline merge
-```
+Runs automatically after merge if any entries remain flagged. The script queries `pipeline.db` for flagged entries, identifies which configured models have not yet voted on each entry, and runs those models on the flagged subset only. Merge is re-run after each tiebreaker pass. This repeats until all flagged entries are resolved or no unused models remain.

-Each record in `final/{lang}.json` looks like this:
+If unused models are exhausted and flagged entries remain, the script logs a detailed report showing the exact vote split for each unresolved entry and lists available models from OpenRouter that have not been used. Syncing is blocked until all entries are resolved. To continue, add one or more models to the config and re-run the pipeline — the tiebreaker will pick up automatically.

-```json
-{
-  "source_id": "omw-en-12345",
-  "pos": "noun",
-  "translations": {
-    "en": [
-      { "text": "dog", "cefr_level": "A1", "difficulty": "easy" },
-      { "text": "canine", "cefr_level": "B2", "difficulty": "intermediate" }
-    ],
-    "it": [{ "text": "cane", "cefr_level": "A1", "difficulty": "easy" }]
-  },
-  "glosses": {
-    "en": { "text": "a domesticated carnivorous mammal", "source": "omw" },
-    "fr": { "text": "un mammifère carnivore domestiqué", "source": "model_1" }
-  },
-  "examples": {
-    "en": [{ "text": "the dog barked at the stranger", "source": "omw" }],
-    "fr": [{ "text": "le chien a aboyé", "source": "model_1" }]
-  },
-  "descriptions": {
-    "en": {
-      "text": "a common household pet known for loyalty and companionship",
-      "source": "model_1"
-    },
-    "it": {
-      "text": "un animale domestico comune noto per la sua fedeltà",
-      "source": "model_2"
-    }
-  }
-}
-```
-
-**Resolving flagged words:**
-
-Open `stage-4-merge/output/flagged/{lang}.json`, manually set the correct `cefr_level` and `difficulty` for each flagged translation, then move the resolved entries into `stage-4-merge/output/final/{lang}.json`. Re-run the seeder after resolving.
+> **Note:** The tiebreaker is not a standalone script. It runs automatically as part of the pipeline orchestrator after merge completes.

 ### 5. Compare / QA

-Read-only. Generates `COVERAGE.md` with a full breakdown of the pipeline
-output quality per language. Run this after merge to verify output before
-seeding the database.
-
-**Input:**
-
- `stage-4-merge/output/final/{lang}.json`
- `stage-4-merge/output/flagged/{lang}.json`
+Read-only. Generates `COVERAGE.md` with a full breakdown of pipeline output quality per language. Run this after merge to verify output before syncing to the database.

+**Input:** `pipeline.db` — entries with status `final`
 **Output:** `COVERAGE.md`

-```bash
-pnpm --filter @lila/pipeline compare
-```
-
 `COVERAGE.md` reports the following per language:

- Total synsets extracted
- Total translations per language
- POS breakdown per language — word counts for noun, verb, adjective, adverb
- CEFR coverage per language — how many translations have a resolved CEFR level, broken down by level (A1, A2, B1, B2, C1, C2)
- Difficulty breakdown per language — word counts for easy, intermediate, hard
- Flagged count per language — how many translations are awaiting manual review
- Gloss coverage per language — total glosses, broken down by source (omw vs LLM-generated) and which languages have no glosses at all
- Example coverage per language — same breakdown as glosses
- Description coverage per language — how many translations have a description, broken down by source
- CEFR source file coverage per language — how many words from the source file were matched against OMW translations
+- Total entries extracted
+- POS breakdown — entry counts for noun, verb, adjective, adverb
+- Translation coverage — how many entries have translations in each other language
+- CEFR coverage — how many entries have a resolved CEFR level, broken down by level
+- Difficulty breakdown — entry counts for easy, intermediate, hard
+- Gloss coverage — how many entries have a gloss, broken down by source (Kaikki vs LLM-generated)
+- Example coverage — same breakdown as glosses
 - LLM model contribution — how many CEFR votes and text candidates each anonymised model contributed

+## Sync
+
+The sync script transfers all entries with status `final` in `pipeline.db` to the production PostgreSQL database. It is upsert-based and never wipes existing data. For each entry it checks whether a matching record already exists in the target database:
+
+- **Missing** → insert
+- **Present but changed** → update
+- **Present and unchanged** → skip
+
+Run this after all entries are resolved and Compare / QA has been reviewed.
+
+```bash
+pnpm --filter @lila/pipeline sync
+```
+
+The sync script requires a connection string to the target database. Set `DATABASE_URL` in your `.env` file before running.
+
+## Reports
+
+The pipeline generates a report at the end of every run. Reports are written to `data-pipeline/reports/` as a JSON file and a markdown file with the same name. The markdown is generated from the JSON and contains identical data.
+
+```
+data-pipeline/reports/
+  2026-05-03_run-1.json
+  2026-05-03_run-1.md
+```
+
+The run name is auto-generated from the date and a counter. Reports are not committed to git.
+
+**Nightly report** contains:
+
+- Entries processed this run vs total
+- Entries remaining per stage
+- Average processing speed and estimated nights remaining
+- `needs_review` count — entries that failed structural validation
+- Per-model progress breakdown
+
+**Final report** (generated when all entries are processed) additionally contains:
+
+- Full vote breakdown per model
+- Flagged entries with exact vote splits
+- Available unused models from OpenRouter for tiebreaking
+- Per-model quality metrics — CEFR agreement rate, field coverage, JSON parse rate
+
 ## Adding a new language

 1. Add the language code to `SUPPORTED_LANGUAGE_CODES` in `packages/shared/src/constants.ts`
 2. Build shared: `pnpm --filter @lila/shared build`
 3. Generate and run a DB migration: `pnpm --filter @lila/db generate` then `pnpm --filter @lila/db migrate`
-4. Download the OMW lexicon for the language using the `wn` Python library
-5. Add a CEFR source file at `stage-2-annotate/sources/cefr/{lang}.json`
-6. Run the full pipeline
+4. Download the Kaikki JSONL file for the language from kaikki.org
+5. Re-run stage 1 extraction manually (the orchestrator does not run it), then re-run the full pipeline

 ## Constants and constraints

@ -442,27 +401,89 @@ Adding a new value to any of these requires a constants update and a database mi

 ## Further extensions

-These are not part of the current pipeline but are worth considering as the
-dataset matures:
+These are not part of the current pipeline but are worth considering as the dataset matures:

- **Grammatical gender and articles** — Wiktionary dumps contain gender and
-  article data for nouns across all supported languages. Could be extracted
-  and stored as a new `translation_forms` table.
- **Conjugations** — Wiktionary also carries verb conjugation tables. Useful
-  for a future grammar-focused quiz mode.
- **IPA pronunciations** — Wiktionary and Forvo are potential sources for
-  phonetic transcriptions per language.
- **TTS audio files** — Generate pronunciation audio for each translation
-  using a local or cloud TTS engine. Stored as static files, served alongside
-  the quiz UI.
- **Images** — Associate an image with each synset to support visual
-  vocabulary learning. Could be sourced from open image datasets like
-  ImageNet or WikiMedia Commons.
- **Frequency data** — Word frequency rankings per language from sources like
-  the Google Ngram dataset. Useful for smarter difficulty calibration beyond
-  CEFR levels alone.
- **Improved CEFR source files** — See note at the top of this document.
-  UniversalCEFR and CEFR-J are good starting points.
- **Additional languages** — The pipeline is language-agnostic. Adding a new
-  language requires an OMW lexicon, a CEFR source file, and a constants
-  update. See **Adding a new language**.
+- **IPA pronunciations** — Kaikki includes IPA transcriptions for most entries. Could be extracted and stored in an `entry_pronunciations` table and displayed in the quiz UI.
+- **Audio files** — kaikki.org provides bulk audio file downloads (~20GB) for pronunciations. Could be stored as static files and served alongside the quiz UI.
+- **Inflected forms** — Kaikki provides conjugation and declension tables in a `forms` array. Useful for a future grammar-focused quiz mode.
+- **Grammatical gender** — Kaikki includes grammatical gender for nouns. Could be stored per entry and used as an additional quiz mechanic.
+- **Frequency data** — Word frequency rankings per language from sources like the Google Ngram dataset. Useful for smarter difficulty calibration beyond CEFR levels alone.
+- **Additional languages** — The pipeline is language-agnostic. Adding a new language requires downloading its Kaikki JSONL file, a constants update, and a database migration. See **Adding a new language**.
+
+## Roadmap
+
+**Current state:** Stage 1 extraction and stage 2 reverse link sync complete and verified on sample data. Stage 3 enrich script written and tested — redesigning to sub-stage architecture for better data quality. llama.cpp running with Qwen3.5-4B.
+
+**Next action:** Rewrite enrich script for sub-stage design.
+
+| Stage           | Status         |
+| --------------- | -------------- |
+| 1. Extract      | 🔄 in progress |
+| 2. Reverse link | 🔄 in progress |
+| 3. Enrich       | 🔄 in progress |
+| 4. Merge        | 🔲 not started |
+| 4b. Tiebreak    | 🔲 not started |
+| 5. Compare / QA | 🔲 not started |
+| 6. Sync         | 🔲 not started |
+
+### Stage 1 — Extract `🔄 in progress`
+
+- [x] Download Kaikki JSONL files for all 5 languages
+- [x] Write extraction script
+- [x] Write stage 1 validation tests
+- [x] Write db schema, init, and import scripts
+- [x] Write db import validation tests
+- [x] Run sample extraction → `stage-1-extract/output/{lang}.json`
+- [ ] Remove sample limit and run full extraction
+- [ ] Re-run full import → `pipeline.db`
+
+### Stage 2 — Reverse link sync `🔄 in progress`
+
+- [x] Write reverse link sync script
+- [x] Run reverse link sync on sample data → 141 links inserted
+- [ ] Run reverse link sync on full data after full extraction
+
+### Stage 3 — Enrich `🔄 in progress`
+
+**Next action:** Rewrite enrich script for sub-stage design.
+
+- [x] Write initial enrich script (single-prompt design)
+- [x] Install llama.cpp and verify server
+- [x] Smoke test with sample entries
+- [ ] Rewrite enrich script for sub-stage design (round1_gloss, round1_example, round1_translations, round1_cefr)
+- [ ] Write tests for enrich sub-stages
+- [ ] Run full sample, collect metrics
+- [ ] Compare providers (local vs OpenRouter free models)
+- [ ] Production run — all entries, all models
+
+### Stage 4 — Merge `🔲 not started`
+
+- [ ] Write merge script
+- [ ] Write tests
+- [ ] Run merge → `pipeline.db`
+- [ ] Confirm tiebreaker resolves all flagged entries
+
+### Stage 4b — Tiebreak `🔲 not started`
+
+- [ ] Write tiebreak logic
+- [ ] Run tiebreaker for all flagged entries
+- [ ] Confirm no flagged entries remain before syncing
+
+### Stage 5 — Compare / QA `🔲 not started`
+
+- [ ] Write compare script
+- [ ] Write tests
+- [ ] Run compare → `COVERAGE.md`
+- [ ] Review output quality before syncing
+
+### Stage 6 — Sync `🔲 not started`
+
+- [ ] Write sync script
+- [ ] Write tests
+- [ ] Configure `DATABASE_URL` in `.env`
+- [ ] Run sync → production PostgreSQL
+- [ ] Verify seeded data in production
+
+### Utilities
+
+**`sample/`** — Runs the pipeline against a small sample to produce human-readable output for a quick sanity check before committing to a full run. Run this after any script change before running the full pipeline.
--- a/documentation/llm-setup.md
+++ b/documentation/llm-setup.md
@ -1,9 +1,12 @@
 # LLM Setup — lila pipeline

-This document covers the LLM infrastructure for stage 3 (enrich) of the lila
-data pipeline. It documents the hardware constraints, supported providers,
-model recommendations, and how to configure and swap providers in the test
-and production scripts.
+This document covers the LLM infrastructure for stage 3 (enrich) of the lila data pipeline. It documents the hardware constraints, supported providers, model recommendations, and how to configure and swap providers in the test and production scripts.
+
+---
+
+## Provider model
+
+Each provider + model combination counts as one vote in the final majority. Running the same model twice is not supported — one model, one vote. To increase vote confidence, add more models rather than re-running existing ones.

 ---

@ -16,17 +19,13 @@ and production scripts.
 | GPU       | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) |
 | OS        | Debian GNU/Linux 13 (trixie) x86_64                             |

-**Local inference verdict:** viable for small/quantized models, not for
-production runs. See the [Local inference](#local-inference-llamacpp) section
-for details.
+**Local inference verdict:** viable for small/quantized models, not for production runs. See the [Local inference](#local-inference-llamacpp) section for details.

 ---

 ## Provider overview

-The enrich script uses a single, swappable provider config. All providers
-except Anthropic expose an OpenAI-compatible API, so the same client code
-works across all of them — only `baseURL`, `apiKey`, and `model` change.
+The enrich script uses a single, swappable provider config. All providers except Anthropic expose an OpenAI-compatible API, so the same client code works across all of them — only `baseURL`, `apiKey`, and `model` change.

 | Provider               | Use case                                      | Cost               | Rate limits            |
 | ---------------------- | --------------------------------------------- | ------------------ | ---------------------- |
@ -41,20 +40,13 @@ works across all of them — only `baseURL`, `apiKey`, and `model` change.

 ### Why local inference is worth testing

-Time is not a constraint — the pipeline scripts are fully resumable. The
-laptop can run overnight for multiple nights. The only question is output
-quality, which the test script evaluates empirically.
+Time is not a constraint — the pipeline scripts are fully resumable. The laptop can run overnight for multiple nights. The only question is output quality, which the test script evaluates empirically.

 ### Hardware constraints

-The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0).
-llama.cpp supports Maxwell via CUDA backend but newer builds may require
-the `--cuda-no-kv-offload` flag depending on the version.
+The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0). llama.cpp supports Maxwell via CUDA backend but newer builds may require the `--cuda-no-kv-offload` flag depending on the version.

-llama.cpp splits model layers between GPU and CPU automatically via
-`--n-gpu-layers`. You set how many layers go on the GPU; the rest run on
-CPU/RAM. This means a model larger than VRAM is not a dead end — it runs
-in hybrid mode, slower than full-GPU but much faster than pure CPU.
+llama.cpp splits model layers between GPU and CPU automatically via `--n-gpu-layers`. You set how many layers go on the GPU; the rest run on CPU/RAM. This means a model larger than VRAM is not a dead end — it runs in hybrid mode, slower than full-GPU but much faster than pure CPU.

 Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):

@ -67,24 +59,19 @@ Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):

 ### Recommended local models

-Two candidates worth testing, covering different points on the size/quality
-tradeoff:
+Two candidates worth testing, covering different points on the size/quality tradeoff:

 **Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)**

 - GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB)
 - Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF
- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+
-  language support including all five pipeline languages. First candidate
-  to test.
+- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+ language support including all five pipeline languages. First candidate to test.

 **Qwen2.5 7B Instruct (Q4_K_M)**

 - GGUF file: `Qwen2.5-7B-Instruct-Q4_K_M.gguf` (~4.5 GB)
 - Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF
- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s.
-  Stronger multilingual generation than any 3–4B model. Second candidate,
-  for comparison against the smaller Gemma 4 E4B.
+- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s. Stronger multilingual generation than any 3–4B model. Second candidate, for comparison against the smaller Gemma 4 E4B.

 ### Installation

@ -190,16 +177,17 @@ Set `Authorization: Bearer <OPENROUTER_API_KEY>` in the request headers.

 ---

-## Provider configuration in the test script
+## Provider configuration in the enrich script

-The enrich test script reads a single config object. To switch providers,
-change this object and re-run.
+The enrich script reads a single config object. To switch providers,
+change this object and re-run. The `name` field is used as the model
+identifier in `pipeline.db` — it must be unique for each provider + model combination.

 ```typescript
 // config.ts

 export type ProviderConfig = {
-  name: string; // used for output folder naming
+  name: string; // used as model identifier in pipeline.db — must be unique
  baseURL: string;
  apiKey: string;
  model: string;
@ -243,14 +231,9 @@ export const ANTHROPIC_SONNET: ProviderConfig = {
 };
 ```

-Output from each run lands in:
-
-```
-stage-3-enrich/test/output/{provider.name}/results.json
-stage-3-enrich/test/output/{provider.name}/metrics.json
-```
-
-The evaluate script compares all `metrics.json` files side by side.
+All output is written to `pipeline.db`. Each record is stored with the
+model name as identifier so results from different providers can be
+compared and compiled into votes.

 ---

@ -297,5 +280,6 @@ The test script measures the following per provider run:
   production. If not, use the cloud model that passed.

 5. **Production run**
-   Full 117k records. Resume-safe — the script checkpoints after each
-   record so overnight runs can be stopped and continued.
+   Full 117k records. Resume-safe — each record is written to `pipeline.db`
+   atomically as it is processed. Overnight runs can be stopped and
+   continued at any time without losing work.
--- a/documentation/model-strategy.md
+++ b/documentation/model-strategy.md
@ -0,0 +1,173 @@
+# Model Strategy
+
+## The problem
+
+The pipeline requires LLMs to perform four tasks per vocabulary entry:
+
+1. **Gloss review** — confirm or improve the existing gloss
+2. **Example review** — confirm or improve existing examples
+3. **Translation validation** — confirm valid translations, reject bad data, generate missing ones
+4. **CEFR assignment** — assign A1-C2 to the headword and each translation
+
+The core challenge is that vocabulary entries have **multiple senses**. The word "cat" appears five times in the database — as an animal, as slang for "guy", as a nautical term, as a verb meaning "to vomit", and as a verb meaning "to hoist an anchor". Each sense requires a different CEFR level and different translations. A model that only knows "cat" is A1 gets four out of five wrong.
+
+This makes CEFR assignment fundamentally a **sense-disambiguation problem**, not just a vocabulary lookup. Specialized CEFR classifiers (like `cefrpy` or `dksysd/cefr-classifier`) operate at the word or sentence level and cannot distinguish between senses of the same word. General LLMs handle sense disambiguation well but introduce quality and reliability problems that depend heavily on model size.
+
+The secondary challenge is **hardware constraints**. The available local hardware (GTX 950M, 4GB VRAM) can only run models up to approximately 4B parameters fully in GPU memory. Larger models run in hybrid CPU/GPU mode which is significantly slower. Free cloud API tiers are generous enough for the sample dataset but have daily limits that make processing 100k+ entries across multiple sub-stages a multi-day or multi-week operation.
+
+## What we tried and why it failed or worked
+
+### Single-prompt design (abandoned)
+
+The first enrich script sent one large prompt per entry covering all four tasks at once — CEFR voting, gloss improvement, example improvement, and translation validation (including generation of missing translations). This produced the following problems:
+
+- The model skipped translations it considered invalid rather than explicitly rejecting them, causing validation failures
+- Bad data in the translation table (`it:free`, `de:-frei`, `es:de fai`) caused consistent validation failures because the model refused to vote on them even when explicitly instructed
+- The combined prompt was large enough to trigger reasoning mode on Gemma 4 E4B, consuming all available tokens on thinking before producing output
+- 20% of entries required manual review
+
+### Sub-stage design (current)
+
+Splitting into four ordered sub-stages fixed the reasoning and validation problems:
+
+1. `round1_gloss` — LLM reviews the gloss in isolation
+2. `round1_example` — LLM reviews examples with verified gloss as context
+3. `round1_translations` — LLM validates translations with verified gloss as context
+4. `round1_cefr` — LLM assigns CEFR levels only to validated translations
+
+This ordering ensures the CEFR sub-stage never sees bad data. The smaller, focused prompts eliminated reasoning mode triggering and reduced per-entry time from ~120 seconds to ~25 seconds.
+
+### Gloss quality (ongoing)
+
+Testing on 50 entries with Qwen3.5-4B showed ~80% good quality. The 20% failures fall into three categories:
+
+- **Category header glosses** — Kaikki occasionally uses "Terms relating to people." or "Terms relating to things." as a gloss instead of a real definition. No model handles these correctly because there is no real meaning to improve.
+- **Rare/obscure senses** — slang, archaic, and theological senses that a 4B model does not have enough knowledge to handle (e.g. "cat" meaning "to vomit", "word" meaning "Logos, Christ").
+- **Short ambiguous glosses** — one or two word glosses with no example context cause hallucination.
+
+### Gemma 4 E4B (rejected)
+
+Gemma 4 E4B is a hybrid reasoning model. Disabling thinking via `--reasoning-budget 0` or `--chat-template-kwargs '{"enable_thinking":false}'` does not work reliably in llama.cpp for the E4B variant — the model either puts reasoning into the content field as plain text or returns empty content with reasoning in `reasoning_content`. Per-entry time exceeded 100 seconds making it impractical.
+
+### Qwen3.5-4B (current local model)
+
+Non-thinking by default for the small series. Runs fully in 4GB VRAM at ~5 seconds per sub-stage. Acceptable quality for common vocabulary (A1-B2) but struggles with rare and specialized senses. Currently used as the primary local voter; the final approach below plans to use Qwen3.5-9B in this role once it has been tested.
+
+### Specialized CEFR classifiers (rejected for primary use)
+
+HuggingFace hosts several CEFR text classifiers (`dksysd/cefr-classifier`, `AbdulSami/bert-base-cased-cefr`) and the `cefrpy` Python library maps individual words to CEFR levels. These operate at the word or sentence level and cannot distinguish between senses. "cat" would always be assigned A1 regardless of whether the sense is the animal or obscure nautical slang. Useful only as a sanity check signal, not as a primary voter.
+
+## Available free resources
+
+| Resource                     | Type               | Requests/day      | Quality   | Notes                                                                  |
+| ---------------------------- | ------------------ | ----------------- | --------- | ---------------------------------------------------------------------- |
+| Local Qwen3.5-4B Q4_K_M      | Local model        | Unlimited         | Decent    | Non-thinking by default, fits in 4GB VRAM, ~5s per sub-stage           |
+| Local Qwen3.5-9B Q4_K_M      | Local model        | Unlimited         | Good      | Hybrid CPU/GPU mode on 4GB VRAM, slower but better quality             |
+| Local Llama 3.1 8B Q4_K_M    | Local model        | Unlimited         | Decent    | ~4.3GB, fits in VRAM or light hybrid, different architecture from Qwen |
+| Groq — Llama 3.3 70B         | Cloud API          | 1,000             | Excellent | Best free quality available, 5-10x with batching                       |
+| Groq — Llama 3.1 8B          | Cloud API          | 14,400            | Decent    | High volume, similar quality to local 4B                               |
+| Google Gemini AI Studio      | Cloud API          | 1,500             | Very good | Google account required, 5-10x with batching                           |
+| OpenRouter free rotation     | Cloud API          | 50–1,000          | Varies    | Rotates between free models automatically via `openrouter/free`        |
+| Wiktionary API               | Context enrichment | Unlimited         | N/A       | Structured vocabulary data, directly related to Kaikki source          |
+| `cefrpy` Python library      | Word lookup        | Unlimited         | Limited   | Deterministic English word CEFR lookup, no sense disambiguation        |
+| HuggingFace CEFR classifiers | Text classifier    | Unlimited (local) | Limited   | Sentence-level difficulty, not sense-aware                             |
+
+### Batching
+
+All cloud APIs support sending multiple entries in a single request. Sending 5 entries per request multiplies effective daily capacity by 5:
+
+- Groq Llama 3.3 70B: 1,000 requests → ~5,000 entries/day
+- Gemini: 1,500 requests → ~7,500 entries/day
+
+### Multiple accounts
+
+Prohibited by the terms of service of all providers listed above.
+
+## Final approach per sub-stage
+
+The pipeline runs multiple models as independent voters. Each model processes every entry once and writes its votes to `pipeline.db`. The merge stage resolves disagreements by majority vote. A tiebreaker runs additional models on flagged entries where no majority was reached.
+
+### round1_gloss and round1_example
+
+These sub-stages require a model that understands sense context from examples. Specialized classifiers cannot help here — only general LLMs can evaluate whether a gloss correctly describes a specific sense.
+
+**Primary voter:** Local Qwen3.5-9B Q4_K_M — runs overnight, unlimited, expected to handle common vocabulary well (not yet tested — see Open questions).
+
+**Secondary voter:** Groq Llama 3.3 70B with 5-entry batching — higher quality, catches errors the local model makes on rare or specialized senses.
+
+**Tertiary voter:** Gemini AI Studio with 5-entry batching — third independent opinion, different training data from both Groq and local model.
+
+**Context enrichment via Wiktionary API:** Before calling any model for the gloss or example sub-stage, the pipeline queries the Wiktionary API for the headword. The API returns the full Wiktionary entry including all senses, usage notes, and examples. This structured data is added to the prompt as additional context, giving the model a much clearer picture of which specific sense it is working with.
+
+This directly fixes the two hardest failure cases:
+- **Category header glosses** ("Terms relating to people.") — the Wiktionary entry contains the real definition which the model can use to generate a proper gloss
+- **Short ambiguous glosses** — the additional sense context prevents the model from guessing the wrong meaning
+
+The Wiktionary API is free, has no rate limits for reasonable use, and is directly related to the Kaikki data source since Kaikki extracts from Wiktionary.
+
+### round1_translations
+
+Same voter stack as gloss/example. The few-shot examples in the prompt (showing that `it:free` → reject and `de:-frei` → reject) handle the bad data cases that caused validation failures in the single-prompt design.
+
+### round1_cefr
+
+This sub-stage only receives translations that survived the validation step. All bad data is already excluded.
+
+**Primary voter:** Local Qwen3.5-9B Q4_K_M.
+
+**Secondary voter:** Groq Llama 3.3 70B with 5-entry batching.
+
+**Tertiary voter:** Gemini AI Studio with 5-entry batching.
+
+**Sanity check:** `cefrpy` provides a deterministic English word CEFR level as a reference signal. If the majority LLM vote disagrees significantly (e.g. LLMs vote C2 for "cat" the animal), the entry is flagged for human review. `cefrpy` does not vote — it only triggers review flags.
+
+### Voter summary
+
+| Sub-stage           | Voter 1            | Voter 2            | Voter 3 |
+| ------------------- | ------------------ | ------------------ | ------- |
+| round1_gloss        | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_example      | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_translations | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_cefr         | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+
+With three voters, a majority requires at least two models to agree. Even if the local model gets a difficult sense wrong, the two cloud models will likely agree on the correct answer and outvote it.
+
+## Open questions
+
+### Wiktionary API context extraction
+The Wiktionary API returns the full entry for a word including all senses. For a word like "free" with 8+ senses, dumping the entire entry into the prompt wastes tokens and may confuse the model. The open question is how to extract only the relevant sense — options include matching by sense_index, fuzzy-matching the Kaikki gloss against Wiktionary glosses, or letting the model see all senses and identify the correct one itself.
+
+### Batching prompt design
+Batching 5-10 entries per API call multiplies effective daily capacity significantly. However, the prompt and validation logic for batched requests are more complex than for single-entry requests — the model must return a structured JSON object keyed by entry ID, and partial failures (one entry in a batch fails validation) need careful handling. Not yet designed or tested.
+
+### Groq and Gemini API integration
+Neither Groq nor Gemini is integrated into the pipeline yet. Both use OpenAI-compatible APIs so integration is straightforward — add provider configs to `stage-3-enrich/config.ts` and set API keys in `.env`. The batching prompt design needs to be finalised first.
+
+### OpenRouter free model rotation
+OpenRouter's `openrouter/free` router selects a model at random from available free models. This means output style and quality vary between requests, which complicates round 2 voting where models review each other's candidates. May need to pin specific free models rather than using the router.
+
+### Qwen3.5-9B performance on hard cases
+The 9B model has not yet been tested. It is expected to handle rare and specialized senses better than the 4B model but this has not been verified. Needs a test run against the same 50 entries used to evaluate the 4B model.
+
+### Llama.cpp Gemma 4 bug
+The llama.cpp chat template bug preventing reliable JSON output from Gemma 4 E4B may be fixed in a future release. The model fits in 4GB VRAM and would be a useful additional local voter if the bug is resolved. Worth checking periodically.
+
+### Full dataset scale
+The current pipeline runs on a 500-entry sample per language. The full Kaikki English file contains approximately 1.3 million entries, of which a fraction will pass the POS and translation filters. The exact count and the time required to run all sub-stages across all models at full scale are not yet known.
+
+### Category header glosses
+Kaikki occasionally uses category headers ("Terms relating to people.", "Terms relating to things.") as glosses. These are not real definitions and no model produces useful output for them. Options include pre-filtering them before the gloss sub-stage and generating a gloss purely from examples, or flagging them as a special case for human review.
+
+## Model download commands
+
+```bash
+# Llama 3.1 8B Instruct Q4_K_M
+wget -O models/llama-3.1-8b-instruct-q4_k_m.gguf \
+  "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
+# Qwen3.5-9B Q4_K_M (5.68GB — hybrid mode, better quality)
+wget -O models/qwen3.5-9b-q4_k_m.gguf \
+  "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf"
+# Qwen3.5-9B Q3_K_S (4.32GB — might fit fully in VRAM)
+wget -O models/qwen3.5-9b-q3_k_s.gguf \
+  "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q3_K_S.gguf"
+```
--- a/eslint.config.mjs
+++ b/eslint.config.mjs
@ -12,7 +12,6 @@ export default defineConfig([
    "node_modules/",
    "routeTree.gen.ts",
    "scripts/**",
-    "data-pipeline/**/*",
  ]),

  eslint.configs.recommended,
--- a/package.json
+++ b/package.json
@ -23,7 +23,7 @@
      "prettier --write"
    ]
  },
-  "packageManager": "pnpm@10.33.1",
+  "packageManager": "pnpm@10.33.2",
  "devDependencies": {
    "@eslint/js": "^10.0.1",
    "@tanstack/eslint-plugin-router": "^1.161.6",
--- a/packages/db/drizzle/0011_nice_spyke.sql
+++ b/packages/db/drizzle/0011_nice_spyke.sql
@ -0,0 +1,46 @@
+CREATE TABLE "entry_translations" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"entry_id" uuid NOT NULL,
+	"target_language_code" varchar(10) NOT NULL,
+	"translation" text NOT NULL,
+	"sense_hint" text,
+	"cefr_level" varchar(2),
+	"difficulty" varchar(20),
+	"source" varchar(50) DEFAULT 'kaikki' NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "unique_translation" UNIQUE("entry_id","target_language_code","translation"),
+	CONSTRAINT "target_language_code_check" CHECK ("entry_translations"."target_language_code" IN ('en', 'it', 'de', 'fr', 'es')),
+	CONSTRAINT "cefr_check" CHECK ("entry_translations"."cefr_level" IS NULL OR "entry_translations"."cefr_level" IN ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')),
+	CONSTRAINT "difficulty_check" CHECK ("entry_translations"."difficulty" IS NULL OR "entry_translations"."difficulty" IN ('easy', 'intermediate', 'hard'))
+);
+--> statement-breakpoint
+CREATE TABLE "vocabulary_entries" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"headword" text NOT NULL,
+	"language_code" varchar(10) NOT NULL,
+	"pos" varchar(20) NOT NULL,
+	"sense_index" smallint DEFAULT 0 NOT NULL,
+	"gloss" text,
+	"examples" text[] DEFAULT '{}' NOT NULL,
+	"cefr_level" varchar(2),
+	"difficulty" varchar(20),
+	"source" varchar(50) DEFAULT 'kaikki' NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "unique_entry" UNIQUE("headword","language_code","pos","sense_index"),
+	CONSTRAINT "language_code_check" CHECK ("vocabulary_entries"."language_code" IN ('en', 'it', 'de', 'fr', 'es')),
+	CONSTRAINT "pos_check" CHECK ("vocabulary_entries"."pos" IN ('noun', 'verb', 'adjective', 'adverb')),
+	CONSTRAINT "cefr_check" CHECK ("vocabulary_entries"."cefr_level" IS NULL OR "vocabulary_entries"."cefr_level" IN ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')),
+	CONSTRAINT "difficulty_check" CHECK ("vocabulary_entries"."difficulty" IS NULL OR "vocabulary_entries"."difficulty" IN ('easy', 'intermediate', 'hard'))
+);
+--> statement-breakpoint
+DROP TABLE "deck_terms" CASCADE;--> statement-breakpoint
+DROP TABLE "decks" CASCADE;--> statement-breakpoint
+DROP TABLE "term_examples" CASCADE;--> statement-breakpoint
+DROP TABLE "term_glosses" CASCADE;--> statement-breakpoint
+DROP TABLE "term_topics" CASCADE;--> statement-breakpoint
+DROP TABLE "terms" CASCADE;--> statement-breakpoint
+DROP TABLE "topics" CASCADE;--> statement-breakpoint
+DROP TABLE "translations" CASCADE;--> statement-breakpoint
+ALTER TABLE "entry_translations" ADD CONSTRAINT "entry_translations_entry_id_vocabulary_entries_id_fk" FOREIGN KEY ("entry_id") REFERENCES "public"."vocabulary_entries"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
+CREATE INDEX "idx_translations_target_lang" ON "entry_translations" USING btree ("target_language_code","difficulty","entry_id");--> statement-breakpoint
+CREATE INDEX "idx_entries_lang_pos" ON "vocabulary_entries" USING btree ("language_code","pos","difficulty");
--- a/packages/db/drizzle/meta/0011_snapshot.json
+++ b/packages/db/drizzle/meta/0011_snapshot.json
@ -0,0 +1,750 @@
+{
+  "id": "6f1811a6-8573-4d43-912a-ceb5191341cc",
+  "prevId": "6c1cb049-807d-43d0-b83e-d3575b80de33",
+  "version": "7",
+  "dialect": "postgresql",
+  "tables": {
+    "public.account": {
+      "name": "account",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "account_id": {
+          "name": "account_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "provider_id": {
+          "name": "provider_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "access_token": {
+          "name": "access_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "refresh_token": {
+          "name": "refresh_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "id_token": {
+          "name": "id_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "access_token_expires_at": {
+          "name": "access_token_expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "refresh_token_expires_at": {
+          "name": "refresh_token_expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "scope": {
+          "name": "scope",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "password": {
+          "name": "password",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        }
+      },
+      "indexes": {
+        "account_userId_idx": {
+          "name": "account_userId_idx",
+          "columns": [
+            {
+              "expression": "user_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "account_user_id_user_id_fk": {
+          "name": "account_user_id_user_id_fk",
+          "tableFrom": "account",
+          "tableTo": "user",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.entry_translations": {
+      "name": "entry_translations",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "entry_id": {
+          "name": "entry_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "target_language_code": {
+          "name": "target_language_code",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "translation": {
+          "name": "translation",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "sense_hint": {
+          "name": "sense_hint",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "cefr_level": {
+          "name": "cefr_level",
+          "type": "varchar(2)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "difficulty": {
+          "name": "difficulty",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "source": {
+          "name": "source",
+          "type": "varchar(50)",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'kaikki'"
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_translations_target_lang": {
+          "name": "idx_translations_target_lang",
+          "columns": [
+            {
+              "expression": "target_language_code",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "difficulty",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "entry_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "entry_translations_entry_id_vocabulary_entries_id_fk": {
+          "name": "entry_translations_entry_id_vocabulary_entries_id_fk",
+          "tableFrom": "entry_translations",
+          "tableTo": "vocabulary_entries",
+          "columnsFrom": ["entry_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "unique_translation": {
+          "name": "unique_translation",
+          "nullsNotDistinct": false,
+          "columns": ["entry_id", "target_language_code", "translation"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {
+        "target_language_code_check": {
+          "name": "target_language_code_check",
+          "value": "\"entry_translations\".\"target_language_code\" IN ('en', 'it', 'de', 'fr', 'es')"
+        },
+        "cefr_check": {
+          "name": "cefr_check",
+          "value": "\"entry_translations\".\"cefr_level\" IS NULL OR \"entry_translations\".\"cefr_level\" IN ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')"
+        },
+        "difficulty_check": {
+          "name": "difficulty_check",
+          "value": "\"entry_translations\".\"difficulty\" IS NULL OR \"entry_translations\".\"difficulty\" IN ('easy', 'intermediate', 'hard')"
+        }
+      },
+      "isRLSEnabled": false
+    },
+    "public.lobbies": {
+      "name": "lobbies",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "code": {
+          "name": "code",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "host_user_id": {
+          "name": "host_user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "status": {
+          "name": "status",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'waiting'"
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "lobbies_host_user_id_user_id_fk": {
+          "name": "lobbies_host_user_id_user_id_fk",
+          "tableFrom": "lobbies",
+          "tableTo": "user",
+          "columnsFrom": ["host_user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "lobbies_code_unique": {
+          "name": "lobbies_code_unique",
+          "nullsNotDistinct": false,
+          "columns": ["code"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {
+        "lobby_status_check": {
+          "name": "lobby_status_check",
+          "value": "\"lobbies\".\"status\" IN ('waiting', 'in_progress', 'finished')"
+        }
+      },
+      "isRLSEnabled": false
+    },
+    "public.lobby_players": {
+      "name": "lobby_players",
+      "schema": "",
+      "columns": {
+        "lobby_id": {
+          "name": "lobby_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "score": {
+          "name": "score",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "default": 0
+        },
+        "joined_at": {
+          "name": "joined_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "lobby_players_lobby_id_lobbies_id_fk": {
+          "name": "lobby_players_lobby_id_lobbies_id_fk",
+          "tableFrom": "lobby_players",
+          "tableTo": "lobbies",
+          "columnsFrom": ["lobby_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "lobby_players_user_id_user_id_fk": {
+          "name": "lobby_players_user_id_user_id_fk",
+          "tableFrom": "lobby_players",
+          "tableTo": "user",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {
+        "lobby_players_lobby_id_user_id_pk": {
+          "name": "lobby_players_lobby_id_user_id_pk",
+          "columns": ["lobby_id", "user_id"]
+        }
+      },
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.session": {
+      "name": "session",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "expires_at": {
+          "name": "expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "token": {
+          "name": "token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "ip_address": {
+          "name": "ip_address",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_agent": {
+          "name": "user_agent",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        }
+      },
+      "indexes": {
+        "session_userId_idx": {
+          "name": "session_userId_idx",
+          "columns": [
+            {
+              "expression": "user_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "session_user_id_user_id_fk": {
+          "name": "session_user_id_user_id_fk",
+          "tableFrom": "session",
+          "tableTo": "user",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "session_token_unique": {
+          "name": "session_token_unique",
+          "nullsNotDistinct": false,
+          "columns": ["token"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.user": {
+      "name": "user",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "email": {
+          "name": "email",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "email_verified": {
+          "name": "email_verified",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "image": {
+          "name": "image",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "user_email_unique": {
+          "name": "user_email_unique",
+          "nullsNotDistinct": false,
+          "columns": ["email"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.verification": {
+      "name": "verification",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "identifier": {
+          "name": "identifier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "value": {
+          "name": "value",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "expires_at": {
+          "name": "expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "verification_identifier_idx": {
+          "name": "verification_identifier_idx",
+          "columns": [
+            {
+              "expression": "identifier",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.vocabulary_entries": {
+      "name": "vocabulary_entries",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "headword": {
+          "name": "headword",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "language_code": {
+          "name": "language_code",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "pos": {
+          "name": "pos",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "sense_index": {
+          "name": "sense_index",
+          "type": "smallint",
+          "primaryKey": false,
+          "notNull": true,
+          "default": 0
+        },
+        "gloss": {
+          "name": "gloss",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "examples": {
+          "name": "examples",
+          "type": "text[]",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'{}'"
+        },
+        "cefr_level": {
+          "name": "cefr_level",
+          "type": "varchar(2)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "difficulty": {
+          "name": "difficulty",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "source": {
+          "name": "source",
+          "type": "varchar(50)",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'kaikki'"
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_entries_lang_pos": {
+          "name": "idx_entries_lang_pos",
+          "columns": [
+            {
+              "expression": "language_code",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "pos",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "difficulty",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "unique_entry": {
+          "name": "unique_entry",
+          "nullsNotDistinct": false,
+          "columns": ["headword", "language_code", "pos", "sense_index"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {
+        "language_code_check": {
+          "name": "language_code_check",
+          "value": "\"vocabulary_entries\".\"language_code\" IN ('en', 'it', 'de', 'fr', 'es')"
+        },
+        "pos_check": {
+          "name": "pos_check",
+          "value": "\"vocabulary_entries\".\"pos\" IN ('noun', 'verb', 'adjective', 'adverb')"
+        },
+        "cefr_check": {
+          "name": "cefr_check",
+          "value": "\"vocabulary_entries\".\"cefr_level\" IS NULL OR \"vocabulary_entries\".\"cefr_level\" IN ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')"
+        },
+        "difficulty_check": {
+          "name": "difficulty_check",
+          "value": "\"vocabulary_entries\".\"difficulty\" IS NULL OR \"vocabulary_entries\".\"difficulty\" IN ('easy', 'intermediate', 'hard')"
+        }
+      },
+      "isRLSEnabled": false
+    }
+  },
+  "enums": {},
+  "schemas": {},
+  "sequences": {},
+  "roles": {},
+  "policies": {},
+  "views": {},
+  "_meta": { "columns": {}, "schemas": {}, "tables": {} }
+}
--- a/packages/db/drizzle/meta/_journal.json
+++ b/packages/db/drizzle/meta/_journal.json
@ -78,6 +78,13 @@
      "when": 1776929932845,
      "tag": "0010_thankful_reaper",
      "breakpoints": true
+    },
+    {
+      "idx": 11,
+      "version": "7",
+      "when": 1777994750330,
+      "tag": "0011_nice_spyke",
+      "breakpoints": true
    }
  ]
 }
--- a/packages/db/src/db/schema.ts
+++ b/packages/db/src/db/schema.ts
@ -10,6 +10,7 @@ import {
  index,
  boolean,
  integer,
+  smallint,
 } from "drizzle-orm/pg-core";

 import { sql, relations } from "drizzle-orm";
@ -18,182 +19,100 @@ import {
  SUPPORTED_POS,
  SUPPORTED_LANGUAGE_CODES,
  CEFR_LEVELS,
-  SUPPORTED_DECK_TYPES,
  DIFFICULTY_LEVELS,
  LOBBY_STATUSES,
 } from "@lila/shared";

-export const terms = pgTable(
-  "terms",
+// ── Vocabulary ────────────────────────────────────────────────────────────────
+
+export const vocabulary_entries = pgTable(
+  "vocabulary_entries",
  {
    id: uuid().primaryKey().defaultRandom(),
-    source: varchar({ length: 50 }), // 'omw', 'wiktionary', null for manual
-    source_id: text(), // synset_id value for omw, wiktionary QID, etc.
+    headword: text().notNull(),
+    language_code: varchar({ length: 10 }).notNull(),
    pos: varchar({ length: 20 }).notNull(),
+    sense_index: smallint().notNull().default(0),
+    gloss: text(),
+    examples: text().array().notNull().default([]),
+    cefr_level: varchar({ length: 2 }),
+    difficulty: varchar({ length: 20 }),
+    source: varchar({ length: 50 }).notNull().default("kaikki"),
    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
  },
  (table) => [
+    unique("unique_entry").on(
+      table.headword,
+      table.language_code,
+      table.pos,
+      table.sense_index,
+    ),
+    check(
+      "language_code_check",
+      sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
+    ),
    check(
      "pos_check",
      sql`${table.pos} IN (${sql.raw(SUPPORTED_POS.map((p) => `'${p}'`).join(", "))})`,
    ),
-    unique("unique_source_id").on(table.source, table.source_id),
-    index("idx_terms_source_pos").on(table.source, table.pos),
-  ],
-);
-
-export const term_glosses = pgTable(
-  "term_glosses",
-  {
-    id: uuid().primaryKey().defaultRandom(),
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-    language_code: varchar({ length: 10 }).notNull(),
-    text: text().notNull(),
-    description: text(),
-    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-  },
-  (table) => [
-    unique("unique_term_gloss").on(table.term_id, table.language_code),
-    check(
-      "language_code_check",
-      sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
-    ),
-  ],
-);
-
-export const term_examples = pgTable(
-  "term_examples",
-  {
-    id: uuid().primaryKey().defaultRandom(),
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-    language_code: varchar({ length: 10 }).notNull(),
-    text: text().notNull(),
-    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-  },
-  (table) => [
-    unique("unique_term_example").on(
-      table.term_id,
-      table.language_code,
-      table.text,
-    ),
-    check(
-      "language_code_check",
-      sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
-    ),
-    index("idx_term_examples_term_id").on(table.term_id, table.language_code),
-  ],
-);
-
-export const translations = pgTable(
-  "translations",
-  {
-    id: uuid().primaryKey().defaultRandom(),
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-    language_code: varchar({ length: 10 }).notNull(),
-    text: text().notNull(),
-    cefr_level: varchar({ length: 2 }),
-    difficulty: varchar({ length: 20 }),
-    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-  },
-  (table) => [
-    unique("unique_translations").on(
-      table.term_id,
-      table.language_code,
-      table.text,
-    ),
-    check(
-      "language_code_check",
-      sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
-    ),
    check(
      "cefr_check",
-      sql`${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
+      sql`${table.cefr_level} IS NULL OR ${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
    ),
    check(
      "difficulty_check",
-      sql`${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
+      sql`${table.difficulty} IS NULL OR ${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
    ),
-    index("idx_translations_lang").on(
+    index("idx_entries_lang_pos").on(
      table.language_code,
+      table.pos,
      table.difficulty,
-      table.cefr_level,
-      table.term_id,
    ),
  ],
 );

-export const decks = pgTable(
-  "decks",
+export const entry_translations = pgTable(
+  "entry_translations",
  {
    id: uuid().primaryKey().defaultRandom(),
-    name: text().notNull(),
-    description: text(),
-    source_language: varchar({ length: 10 }).notNull(),
-    validated_languages: varchar({ length: 10 }).array().notNull().default([]),
-    type: varchar({ length: 20 }).notNull(),
+    entry_id: uuid()
+      .notNull()
+      .references(() => vocabulary_entries.id, { onDelete: "cascade" }),
+    target_language_code: varchar({ length: 10 }).notNull(),
+    translation: text().notNull(),
+    sense_hint: text(),
+    cefr_level: varchar({ length: 2 }),
+    difficulty: varchar({ length: 20 }),
+    source: varchar({ length: 50 }).notNull().default("kaikki"),
    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
  },
  (table) => [
-    check(
-      "source_language_check",
-      sql`${table.source_language} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
+    unique("unique_translation").on(
+      table.entry_id,
+      table.target_language_code,
+      table.translation,
    ),
    check(
-      "validated_languages_check",
-      sql`validated_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
+      "target_language_code_check",
+      sql`${table.target_language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
    ),
    check(
-      "validated_languages_excludes_source",
-      sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`,
+      "cefr_check",
+      sql`${table.cefr_level} IS NULL OR ${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
    ),
    check(
-      "deck_type_check",
-      sql`${table.type} IN (${sql.raw(SUPPORTED_DECK_TYPES.map((t) => `'${t}'`).join(", "))})`,
+      "difficulty_check",
+      sql`${table.difficulty} IS NULL OR ${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
+    ),
+    index("idx_translations_target_lang").on(
+      table.target_language_code,
+      table.difficulty,
+      table.entry_id,
    ),
-    unique("unique_deck_name").on(table.name, table.source_language),
-    index("idx_decks_type").on(table.type, table.source_language),
  ],
 );

-export const deck_terms = pgTable(
-  "deck_terms",
-  {
-    deck_id: uuid()
-      .notNull()
-      .references(() => decks.id, { onDelete: "cascade" }),
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-  },
-  (table) => [primaryKey({ columns: [table.deck_id, table.term_id] })],
-);
-
-export const topics = pgTable("topics", {
-  id: uuid().primaryKey().defaultRandom(),
-  slug: varchar({ length: 50 }).notNull().unique(),
-  label: text().notNull(),
-  description: text(),
-  created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-});
-
-export const term_topics = pgTable(
-  "term_topics",
-  {
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-    topic_id: uuid()
-      .notNull()
-      .references(() => topics.id, { onDelete: "cascade" }),
-  },
-  (table) => [primaryKey({ columns: [table.term_id, table.topic_id] })],
-);
+// ── Auth (managed by Better Auth) ─────────────────────────────────────────────

 export const user = pgTable("user", {
  id: text("id").primaryKey(),
@ -204,7 +123,7 @@ export const user = pgTable("user", {
  createdAt: timestamp("created_at").defaultNow().notNull(),
  updatedAt: timestamp("updated_at")
    .defaultNow()
-    .$onUpdate(() => /* @__PURE__ */ new Date())
+    .$onUpdate(() => new Date())
    .notNull(),
 });

@ -216,7 +135,7 @@ export const session = pgTable(
    token: text("token").notNull().unique(),
    createdAt: timestamp("created_at").defaultNow().notNull(),
    updatedAt: timestamp("updated_at")
-      .$onUpdate(() => /* @__PURE__ */ new Date())
+      .$onUpdate(() => new Date())
      .notNull(),
    ipAddress: text("ip_address"),
    userAgent: text("user_agent"),
@ -245,7 +164,7 @@ export const account = pgTable(
    password: text("password"),
    createdAt: timestamp("created_at").defaultNow().notNull(),
    updatedAt: timestamp("updated_at")
-      .$onUpdate(() => /* @__PURE__ */ new Date())
+      .$onUpdate(() => new Date())
      .notNull(),
  },
  (table) => [index("account_userId_idx").on(table.userId)],
@ -261,24 +180,13 @@ export const verification = pgTable(
    createdAt: timestamp("created_at").defaultNow().notNull(),
    updatedAt: timestamp("updated_at")
      .defaultNow()
-      .$onUpdate(() => /* @__PURE__ */ new Date())
+      .$onUpdate(() => new Date())
      .notNull(),
  },
  (table) => [index("verification_identifier_idx").on(table.identifier)],
 );

-export const userRelations = relations(user, ({ many }) => ({
-  sessions: many(session),
-  accounts: many(account),
-}));
-
-export const sessionRelations = relations(session, ({ one }) => ({
-  user: one(user, { fields: [session.userId], references: [user.id] }),
-}));
-
-export const accountRelations = relations(account, ({ one }) => ({
-  user: one(user, { fields: [account.userId], references: [user.id] }),
-}));
+// ── Lobbies ───────────────────────────────────────────────────────────────────

 export const lobbies = pgTable(
  "lobbies",
@ -318,6 +226,36 @@ export const lobby_players = pgTable(
  (table) => [primaryKey({ columns: [table.lobbyId, table.userId] })],
 );

+// ── Relations ─────────────────────────────────────────────────────────────────
+
+export const vocabularyEntryRelations = relations(
+  vocabulary_entries,
+  ({ many }) => ({ translations: many(entry_translations) }),
+);
+
+export const entryTranslationRelations = relations(
+  entry_translations,
+  ({ one }) => ({
+    entry: one(vocabulary_entries, {
+      fields: [entry_translations.entry_id],
+      references: [vocabulary_entries.id],
+    }),
+  }),
+);
+
+export const userRelations = relations(user, ({ many }) => ({
+  sessions: many(session),
+  accounts: many(account),
+}));
+
+export const sessionRelations = relations(session, ({ one }) => ({
+  user: one(user, { fields: [session.userId], references: [user.id] }),
+}));
+
+export const accountRelations = relations(account, ({ one }) => ({
+  user: one(user, { fields: [account.userId], references: [user.id] }),
+}));
+
 export const lobbyRelations = relations(lobbies, ({ one, many }) => ({
  host: one(user, { fields: [lobbies.hostUserId], references: [user.id] }),
  players: many(lobby_players),
--- a/packages/db/src/models/termModel.ts
+++ b/packages/db/src/models/termModel.ts
@ -1,25 +1,27 @@
 import { db } from "@lila/db";
-import { eq, and, isNotNull, sql, ne } from "drizzle-orm";
-import { terms, translations, term_glosses } from "@lila/db/schema";
+import { eq, and, ne, sql, isNotNull } from "drizzle-orm";
+import { vocabulary_entries, entry_translations } from "@lila/db/schema";
 import { alias } from "drizzle-orm/pg-core";
-
 import type {
  SupportedLanguageCode,
  SupportedPos,
  DifficultyLevel,
 } from "@lila/shared";

+// ── Types ─────────────────────────────────────────────────────────────────────
+
 export type TranslationPairRow = {
-  termId: string;
+  entryId: string;
  sourceText: string;
  targetText: string;
  sourceGloss: string | null;
 };

-// Note: difficulty filter is intentionally asymmetric. We filter on the target
-// (answer) side only — a word can be A2 in Italian but B1 in English, and what
-// matters for the learner is the difficulty of the word they're being taught.
+// ── Queries ───────────────────────────────────────────────────────────────────

+// Note: difficulty filter is intentionally on the target (translation) side.
+// A word can be A2 in one language but B1 in another — what matters for the
+// learner is the difficulty of the word they are being tested on.
 export const getGameTerms = async (
  sourceLanguage: SupportedLanguageCode,
  targetLanguage: SupportedLanguageCode,
@ -27,53 +29,36 @@ export const getGameTerms = async (
  difficulty: DifficultyLevel,
  rounds: number,
 ): Promise<TranslationPairRow[]> => {
-  const sourceTranslations = alias(translations, "source_translations");
-  const targetTranslations = alias(translations, "target_translations");
+  const sourceEntries = alias(vocabulary_entries, "source_entries");
+  const targetTranslations = alias(entry_translations, "target_translations");

  const rows = await db
    .select({
-      termId: terms.id,
-      sourceText: sourceTranslations.text,
-      targetText: targetTranslations.text,
-      sourceGloss: term_glosses.text,
+      entryId: sourceEntries.id,
+      sourceText: sourceEntries.headword,
+      targetText: targetTranslations.translation,
+      sourceGloss: sourceEntries.gloss,
    })
-    .from(terms)
-    .innerJoin(
-      sourceTranslations,
-      and(
-        eq(sourceTranslations.term_id, terms.id),
-        eq(sourceTranslations.language_code, sourceLanguage), // Filter here!
-      ),
-    )
+    .from(sourceEntries)
    .innerJoin(
      targetTranslations,
      and(
-        eq(targetTranslations.term_id, terms.id),
-        eq(targetTranslations.language_code, targetLanguage), // Filter here!
-      ),
-    )
-    .leftJoin(
-      term_glosses,
-      and(
-        eq(term_glosses.term_id, terms.id),
-        eq(term_glosses.language_code, sourceLanguage),
+        eq(targetTranslations.entry_id, sourceEntries.id),
+        eq(targetTranslations.target_language_code, targetLanguage),
+        eq(targetTranslations.difficulty, difficulty),
+        isNotNull(targetTranslations.translation),
      ),
    )
    .where(
      and(
-        eq(terms.pos, pos),
-        eq(targetTranslations.difficulty, difficulty),
-        isNotNull(sourceTranslations.difficulty), // Good data quality check!
+        eq(sourceEntries.language_code, sourceLanguage),
+        eq(sourceEntries.pos, pos),
+        isNotNull(sourceEntries.difficulty),
      ),
    )
-    // TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set before
-    // applying LIMIT, which is fine at current data volumes (low thousands of rows
-    // after POS + difficulty filters) but degrades as the terms table grows. Once
-    // the database is fully populated and tagged, replace with one of:
-    //   - TABLESAMPLE BERNOULLI(n) for approximate sampling on large tables
-    //   - Random offset: SELECT ... OFFSET floor(random() * (SELECT count(*) ...))
-    //   - Pre-computed random column with a btree index, reshuffled periodically
-    // Benchmark first — don't optimise until it actually hurts.
+    // TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set
+    // before LIMIT; fine at current volumes but degrades as the table grows.
+    // Options: TABLESAMPLE BERNOULLI, random OFFSET, or indexed random column.
    .orderBy(sql`RANDOM()`)
    .limit(rounds);

@ -81,32 +66,33 @@ export const getGameTerms = async (
 };

 export const getDistractors = async (
-  excludeTermId: string,
+  excludeEntryId: string,
  excludeText: string,
+  sourceLanguage: SupportedLanguageCode,
  targetLanguage: SupportedLanguageCode,
  pos: SupportedPos,
  difficulty: DifficultyLevel,
  count: number,
 ): Promise<string[]> => {
  const rows = await db
-    .select({ text: translations.text })
-    .from(terms)
+    .select({ text: entry_translations.translation })
+    .from(vocabulary_entries)
    .innerJoin(
-      translations,
+      entry_translations,
      and(
-        eq(translations.term_id, terms.id),
-        eq(translations.language_code, targetLanguage),
+        eq(entry_translations.entry_id, vocabulary_entries.id),
+        eq(entry_translations.target_language_code, targetLanguage),
+        eq(entry_translations.difficulty, difficulty),
      ),
    )
    .where(
      and(
-        eq(terms.pos, pos),
-        eq(translations.difficulty, difficulty),
-        ne(terms.id, excludeTermId),
-        ne(translations.text, excludeText),
+        eq(vocabulary_entries.language_code, sourceLanguage),
+        eq(vocabulary_entries.pos, pos),
+        ne(vocabulary_entries.id, excludeEntryId),
+        ne(entry_translations.translation, excludeText),
      ),
    )
-    // TODO(post-mvp): same ORDER BY RANDOM() concern as getGameTerms — see comment there.
    .orderBy(sql`RANDOM()`)
    .limit(count);

--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@ -173,6 +173,9 @@ importers:
      typescript:
        specifier: ^5.9.3
        version: 5.9.3
+      vitest:
+        specifier: ^4.1.0
+        version: 4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))

  packages/db:
    dependencies:
@ -4391,7 +4394,6 @@ snapshots:
      magic-string: 0.30.21
    optionalDependencies:
      vite: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)
-    optional: true

  '@vitest/mocker@4.1.0(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))':
    dependencies:
@ -6136,7 +6138,6 @@ snapshots:
      jsdom: 29.0.1(@noble/hashes@2.2.0)
    transitivePeerDependencies:
      - msw
-    optional: true

  vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)):
    dependencies:
Author	SHA1	Message	Date
lila	04a581efe1	WIP: checkpoint before stage-3 sub-stage rewrite	2026-05-12 22:13:14 +02:00
lila	73fb12ac35	feat: enrich script working, redesigning to sub-stage architecture - Enrich script functional with timeout, progress tracking, rejection mechanism - Identified ordering issue: CEFR voting needs validated translations first - Redesign: round1_gloss → round1_example → round1_translations → round1_cefr - Update data-pipeline.md with new sub-stage design and roadmap - Qwen3.5-4B confirmed working with thinking disabled	2026-05-07 13:09:43 +02:00
lila	7f10c35e03	docs: update roadmap — stage 3 enrich script written, llama.cpp next	2026-05-05 19:30:18 +02:00
lila	9642daf6dd	feat: add stage 3 round 1 enrich script and wire into orchestrator	2026-05-05 19:28:38 +02:00
lila	76af2ab093	fix: update db import validation tests to account for reverse links - Translation count test now adds reverse link count to expected total - Non-English translations test now filters to kaikki source only - Target language test now filters to kaikki source only — reverse links to English are valid and expected	2026-05-05 19:10:19 +02:00
lila	1c44ef989b	feat: update pipeline orchestrator for Kaikki — wire up stages 1 and 2 - Replace checkOmwExists with checkExtractedFilesExist - Wire up importKaikki and reverseLink as real stage implementations - Track reverse link completion via sentinel row in run_status - Update report to use resolved_entry_cefr and entry counts - Stages 3 onwards remain as stubs	2026-05-05 19:04:28 +02:00
lila	6f9a42c707	feat: add stage 2 reverse link sync script	2026-05-05 18:57:55 +02:00
lila	b5a76ee178	docs: update roadmap — stage 1 in progress, sample extraction complete	2026-05-05 18:52:10 +02:00
lila	ba2635e3f7	feat: add stage 1 and db import validation tests for Kaikki schema	2026-05-05 18:51:11 +02:00
lila	0cc643e308	feat: update extractor for all 5 languages, update import for multi-language - Extract.ts now processes all 5 language files, filters non-English entries by lang_code, skips translation extraction for non-English (no translations in source files) - Import.ts now imports all 5 language output files, uses language field from ExtractedSense instead of hardcoding en - Sample limit hardcoded to 500 entries per language for development	2026-05-05 18:46:32 +02:00
lila	209d52f54b	feat: add Kaikki extraction and import scripts for stage 1 - Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, skips abbreviations and senses with no translations in supported languages - Rewrite db/import.ts for Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries - Rewrite db/schema.sql for Kaikki model — entries, translations, LLM vote tables, resolved tables - Add extract and db:import scripts to package.json - Sample mode hardcoded to 500 entries for development	2026-05-05 18:11:53 +02:00
lila	963bff4eb8	feat: migrate production schema from OMW to Kaikki flat vocabulary model - Replace terms/translations/term_glosses/term_examples with vocabulary_entries and entry_translations - Remove decks, topics and related tables (deferred) - Add cefr_level and difficulty to entry_translations for game query filtering - Update termModel.ts for new schema — getDistractors now takes sourceLanguage - Update gameService.ts and multiplayerGameService.ts for entryId rename - Update all test fixtures from termId to entryId - Generate and apply migration 0011	2026-05-05 17:39:25 +02:00
lila	38d8b85228	docs: rewrite data-pipeline.md for Kaikki migration	2026-05-05 17:14:48 +02:00
lila	87aeb072c5	feat: add pipeline orchestrator skeleton with startup checks, stage runners, shutdown handler, and report generation	2026-05-03 23:01:29 +02:00
lila	080fad1998	feat: enrich stage foundation — provider config, env setup, schema fix - Remove foreign key on run_status.source_id to support sentinel rows for tracking one-time pipeline steps (compile_candidates, compile_votes, merge, compare) - Add stage-3-enrich/config.ts with all provider configurations, ALL_PROVIDERS ordered local-first, and validateProviderKey() for startup key checks - Add .env.example with required API keys for OpenRouter and Anthropic - Add pipeline:run script to package.json using --env-file .env - Add .env to root .gitignore coverage for data-pipeline/.env	2026-05-03 22:44:14 +02:00
lila	4d42fe4397	removing db from git tracking, adding it to gitignore, add db import validation tests	2026-05-03 22:16:43 +02:00
lila	f59399be02	feat: add db import script, fix duplicate translations in extract, add annotate script	2026-05-03 22:05:10 +02:00
lila	4a842140b9	feat: add stage 1 and 2 validation tests	2026-05-03 21:36:56 +02:00
lila	4fa3073412	feat: add db schema, init, and vitest config	2026-05-03 17:56:29 +02:00
lila	74cfc82bdd	docs: finalise data-pipeline.md with tiebreak, pipeline.db, reports, sync	2026-05-03 17:21:02 +02:00
lila	6007fe1e38	docs: update data-pipeline.md and llm-setup.md to reflect sqlite architecture	2026-05-02 20:13:05 +02:00