diff --git a/data-pipeline/db/pipeline.db-shm b/data-pipeline/db/pipeline.db-shm
new file mode 100644
index 0000000..fe9ac28
Binary files /dev/null and b/data-pipeline/db/pipeline.db-shm differ
diff --git a/data-pipeline/db/pipeline.db-wal b/data-pipeline/db/pipeline.db-wal
new file mode 100644
index 0000000..e69de29
diff --git a/data-pipeline/db/reset.ts b/data-pipeline/db/reset.ts
new file mode 100644
index 0000000..f1643ba
--- /dev/null
+++ b/data-pipeline/db/reset.ts
@@ -0,0 +1,41 @@
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import Database from "better-sqlite3";
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const DB_PATH = path.join(__dirname, "pipeline.db");
+
+// ── Main ──────────────────────────────────────────────────────────────────────
+
+function main(): void {
+  // Deletes run_status rows according to the mode given as first CLI argument.
+  const mode = process.argv[2];
+  if (!mode || (mode !== "round1" && mode !== "all")) {
+    console.error("Usage: pnpm db:reset round1 | all");
+    console.error("  round1 — delete all round1 sub-stage rows");
+    console.error("  all    — delete all run_status rows except reverse_link");
+    process.exit(1);
+  }
+
+  const db = new Database(DB_PATH);
+  // try/finally: the handle is closed even when prepare()/run() throws.
+  try {
+    if (mode === "round1") {
+      const result = db
+        .prepare("DELETE FROM run_status WHERE stage LIKE 'round1%'")
+        .run();
+      console.log(`Deleted ${result.changes} round1 rows from run_status`);
+    } else {
+      const result = db
+        .prepare("DELETE FROM run_status WHERE stage NOT IN ('reverse_link')")
+        .run();
+      console.log(`Deleted ${result.changes} rows from run_status`);
+    }
+  } finally {
+    db.close();
+  }
+}
+
+main();
diff --git a/data-pipeline/package.json b/data-pipeline/package.json
index e47936a..b8004c3 100644
--- a/data-pipeline/package.json
+++ b/data-pipeline/package.json
@@ -4,11 +4,11 @@
   "private": true,
   "type": "module",
   "scripts": {
+    "db:reset": "tsx db/reset.ts",
     "extract": "tsx stage-1-extract/scripts/extract.ts",
     "reverse-link": "tsx stage-2-reverse-link/scripts/reverse-link.ts",
     "db:import": "tsx db/import.ts",
     "db:init": "tsx db/init.ts",
-    "annotate": "tsx stage-2-annotate/scripts/annotate.ts",
     "test": "vitest run",
     "test:watch": "vitest",
     "pipeline:run": "tsx --env-file .env pipeline.ts"
diff --git a/data-pipeline/pipeline.ts b/data-pipeline/pipeline.ts
index 8652817..230c18a 100644
--- a/data-pipeline/pipeline.ts
+++ b/data-pipeline/pipeline.ts
@@ -110,7 +110,7 @@ async function checkProviderReady(provider: ProviderConfig): Promise<void> {
 async function generateRunName(): Promise<string> {
   await fs.mkdir(PATHS.reports, { recursive: true });
 
-  const date = new Date().toISOString().slice(0, 10);
+  const date = new Date().toISOString().slice(0, 10); // YYYY-MM-DD (UTC)
   const files = await fs.readdir(PATHS.reports);
   const todaysRuns = files.filter(
     (f) => f.startsWith(date) && f.endsWith(".json"),
@@ -175,7 +175,8 @@ function getModelRound1Status(modelName: string): StageStatus {
     db
       .prepare(
         `SELECT COUNT(*) as count FROM run_status
-         WHERE model_name = ? AND stage = 'round1' AND status = 'complete'`,
+         WHERE model_name = ? AND stage = 'round1_gloss'
+         AND status = 'complete'`,
       )
       .get(modelName) as { count: number }
   ).count;
@@ -602,9 +603,9 @@ async function main(): Promise<void> {
     runCompare();
   }
 
-  // ── Report
-  stats.stoppedAt = new Date();
-  await generateReport(runName, stats);
+  // ── Report (disabled until full pipeline is implemented)
+  // stats.stoppedAt = new Date();
+  // await generateReport(runName, stats);
 
   console.log("\nPipeline complete.");
 }
diff --git a/data-pipeline/stage-3-enrich/scripts/enrich.ts b/data-pipeline/stage-3-enrich/scripts/enrich.ts
index a4d9989..e732e0a 100644
--- a/data-pipeline/stage-3-enrich/scripts/enrich.ts
+++ b/data-pipeline/stage-3-enrich/scripts/enrich.ts
@@ -20,253 +20,297 @@ type TranslationRow = {
   word: string;
 };
 
-type LlmResponse = {
+type GlossResult = { status: "ok" } | { status: "improved"; gloss: string };
+
+type ExampleResult = { status: "ok" } | { status: "improved"; example: string };
+
+type TranslationResult = {
+  translations: Partial<
+    Record<SupportedLanguageCode, Record<string, "ok" | "reject">>
+  >;
+  generated?: Partial<Record<SupportedLanguageCode, string>>;
+};
+
+type CefrResult = {
   headword_cefr: string;
   translation_cefr: Partial<
     Record<SupportedLanguageCode, Record<string, string>>
   >;
-  generated_translations?: Partial<Record<SupportedLanguageCode, string>>;
-  generated_gloss?: string;
-  generated_example?: string;
 };
 
-type ValidationResult =
-  | { valid: true; data: LlmResponse }
-  | { valid: false; reason: string };
+type SubStage =
+  | "round1_gloss"
+  | "round1_example"
+  | "round1_translations"
+  | "round1_cefr";
 
 // ── Constants ─────────────────────────────────────────────────────────────────
 
 const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
 const CEFR_SET = new Set<string>(CEFR_LEVELS);
 
-// ── Prompt builder ────────────────────────────────────────────────────────────
+// ── Shutdown ──────────────────────────────────────────────────────────────────
 
-function buildPrompt(entry: EntryRow, translations: TranslationRow[]): string {
+let shutdownRequested = false;
+let currentCallController: AbortController | null = null;
+
+export function registerEnrichShutdown(): void { // installs SIGINT/SIGTERM handlers
+  const handler = (): void => {
+    if (shutdownRequested) return; // shutdown already requested; ignore repeated signals
+    shutdownRequested = true; // enrich() checks this flag before starting each entry
+    console.log("\n\n  Shutdown requested — aborting current LLM call...");
+    currentCallController?.abort(); // controller is null when no LLM request is in flight
+  };
+  process.on("SIGINT", handler); // NOTE(review): every call adds another listener pair
+  process.on("SIGTERM", handler);
+}
+
+// ── Prompt builders ───────────────────────────────────────────────────────────
+
+function buildGlossPrompt(entry: EntryRow): string {
+  const glossText = entry.gloss ?? "none";
   const examples: string[] = JSON.parse(entry.examples) as string[];
+  const examplesText =
+    examples.length > 0 ? examples.map((e) => `  - ${e}`).join("\n") : "  none";
 
-  // Group translations by language
+  return `You are a language learning expert.
+
+Review this gloss for the ${entry.pos} "${entry.headword}" (sense ${entry.sense_index}).
+Gloss: "${glossText}"
+Examples of this specific sense:
+${examplesText}
+
+Is this gloss clear, accurate for this specific sense, and suitable for a language learner?
+- If yes, respond with: {"status": "ok"}
+- If no or if gloss is "none", respond with: {"status": "improved", "gloss": "your improved gloss here"}
+
+IMPORTANT: Your improved gloss must describe THIS SPECIFIC SENSE shown by the examples above,
+not a more common or general meaning of the word.
+
+Respond ONLY with valid JSON and nothing else.`;
+}
+
+function buildTranslationsPrompt(
+  entry: EntryRow,
+  translations: TranslationRow[],
+  verifiedGloss: string,
+): string {
   const byLang = new Map<SupportedLanguageCode, string[]>();
   for (const t of translations) {
     if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, []);
     byLang.get(t.target_lang)!.push(t.word);
   }
 
-  // Find missing languages
   const coveredLangs = new Set(byLang.keys());
   const missingLangs = SUPPORTED_LANGUAGE_CODES.filter(
     (l) => l !== entry.language && !coveredLangs.has(l),
   );
 
-  const existingTranslationsText =
+  const translationsText =
     byLang.size > 0
       ? [...byLang.entries()]
           .map(([lang, words]) => `  ${lang}: ${words.join(", ")}`)
           .join("\n")
       : "  none";
 
-  const missingTranslationsText =
+  const missingText =
     missingLangs.length > 0 ? missingLangs.join(", ") : "none";
 
-  const examplesText =
-    examples.length > 0 ? examples.map((e) => `  - ${e}`).join("\n") : "  none";
+  const exampleResponse: Record<string, unknown> = {
+    translations: {
+      de: { frei: "ok", "-frei": "reject" },
+      it: { libero: "ok", free: "reject" },
+    },
+  };
+  if (missingLangs.length > 0) {
+    exampleResponse["generated"] = { es: "libre", fr: "libre" };
+  }
 
-  const glossText = entry.gloss ?? "none";
+  return `You are a language learning expert.
 
-  return `You are a language learning expert building a multilingual vocabulary database.
+For the ${entry.language} ${entry.pos} "${entry.headword}" (meaning: "${verifiedGloss}"), review these translations:
+${translationsText}
 
-Given an English word sense, your tasks are:
-1. Assign a CEFR level (A1, A2, B1, B2, C1, or C2) to the English headword for this specific sense
-2. Assign a CEFR level to each existing translation listed
-3. If MISSING TRANSLATIONS lists any languages, generate the single best translation for each
-4. If the existing gloss is missing or unsuitable for a language learner, generate a better one
-5. If the existing examples are missing or unsuitable for a language learner, generate one natural sentence in English
+For each translation:
+- Write "ok" if it is a valid translation for this specific meaning
+- Write "reject" if it is wrong, a suffix (starts with -), garbled text, or the wrong language
 
-Base CEFR levels on how commonly a language learner at that level would encounter this specific sense, not the word in general. Consider register — slang, technical, and archaic words should be rated higher.
+Examples of correct behaviour:
+- "free" listed as Italian → "reject" (it is English, not Italian)
+- "-frei" listed as German → "reject" (it is a suffix, not a standalone word)
+- "libre" listed as Spanish → "ok" (it is a valid Spanish word)
 
-WORD: ${entry.headword}
-PART OF SPEECH: ${entry.pos}
-GLOSS: ${glossText}
-EXAMPLES:
-${examplesText}
-EXISTING TRANSLATIONS:
-${existingTranslationsText}
-MISSING TRANSLATIONS: ${missingTranslationsText}
+${missingLangs.length > 0 ? `Also generate the single best translation for these missing languages: ${missingText}` : ""}
 
-Respond ONLY with valid JSON and nothing else — no explanation, no markdown:
+Respond ONLY with valid JSON and nothing else:
+${JSON.stringify(exampleResponse, null, 2)}`;
+}
+
+function buildCefrPrompt(
+  entry: EntryRow,
+  verifiedGloss: string,
+  validatedTranslations: Map<SupportedLanguageCode, string[]>,
+): string {
+  const translationsText =
+    validatedTranslations.size > 0
+      ? [...validatedTranslations.entries()]
+          .map(([lang, words]) => `  ${lang}: ${words.join(", ")}`)
+          .join("\n")
+      : "  none";
+
+  return `You are a language learning expert.
+
+Assign CEFR levels (A1, A2, B1, B2, C1, or C2) to this word and its validated translations.
+Base your levels on how commonly a language learner at that level would encounter this specific sense.
+Consider register — slang, technical, and archaic words should be rated higher.
+
+WORD: ${entry.headword} (${entry.pos})
+MEANING: ${verifiedGloss}
+VALIDATED TRANSLATIONS:
+${translationsText}
+
+Respond ONLY with valid JSON and nothing else:
 {
   "headword_cefr": "B1",
   "translation_cefr": {
     "de": { "frei": "A2" },
-    "es": { "libre": "A2" },
-    "fr": { "libre": "A2" },
     "it": { "libero": "A2" }
-  },
-  "generated_translations": { "missing_lang": "word" },
-  "generated_gloss": "A clearer definition for learners.",
-  "generated_example": "A natural example sentence."
-}
-
-EXAMPLE OF CORRECT BEHAVIOUR:
-If you receive:
-  WORD: cat
-  EXISTING TRANSLATIONS:
-    it: gatto, cat
-
-The correct response includes "reject" for "cat" because it is an English word, not Italian:
-  "translation_cefr": {
-    "it": { "gatto": "A1", "cat": "reject" }
   }
-
-Similarly, if you receive:
-  EXISTING TRANSLATIONS:
-    de: frei, -frei
-
-The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
-  "translation_cefr": {
-    "de": { "frei": "A2", "-frei": "reject" }
-  }
-
-  EXAMPLE OF CORRECT BEHAVIOUR:
-  If you receive:
-    WORD: cat
-    EXISTING TRANSLATIONS:
-      it: gatto, cat
-
-  The correct response includes "reject" for "cat" because it is an English word, not Italian:
-    "translation_cefr": {
-      "it": { "gatto": "A1", "cat": "reject" }
-    }
-
-  Similarly, if you receive:
-    EXISTING TRANSLATIONS:
-      de: frei, -frei
-
-  The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
-    "translation_cefr": {
-      "de": { "frei": "A2", "-frei": "reject" }
-    }
-
-IMPORTANT:
-- You MUST include EVERY translation listed in EXISTING TRANSLATIONS in your response — no exceptions
-- Use the CEFR level (A1-C2) if the translation is valid for this sense
-- Use "reject" if the translation does not fit this specific sense, is not a real word in that language, or is clearly bad data
-- Never silently omit a translation — every word must get either a CEFR level or "reject"
-- translation_cefr must map each language to an object of word:level pairs
-- Only include "generated_translations" if MISSING TRANSLATIONS lists languages
-- Only include "generated_gloss" if you judge the existing gloss unsuitable
-- Only include "generated_example" if you judge the existing examples unsuitable
-`;
+}`;
 }
 
 // ── Validation ────────────────────────────────────────────────────────────────
 
-function validateResponse(
+function validateGloss(raw: string): GlossResult | null { // null = unusable reply
+  try {
+    const obj = JSON.parse(raw) as Record<string, unknown>;
+    if (obj["status"] === "ok") return { status: "ok" }; // existing gloss accepted as-is
+    if (
+      obj["status"] === "improved" &&
+      typeof obj["gloss"] === "string" &&
+      obj["gloss"].trim() // empty or whitespace-only replacement is not accepted
+    ) {
+      return { status: "improved", gloss: obj["gloss"].trim() };
+    }
+    return null; // unknown status, or "improved" without a usable gloss
+  } catch {
+    return null; // JSON.parse (or property access on a non-object value) threw
+  }
+}
+
+function validateExample(raw: string): ExampleResult | null { // null = unusable reply
+  try {
+    const obj = JSON.parse(raw) as Record<string, unknown>;
+    if (obj["status"] === "ok") return { status: "ok" }; // existing example accepted as-is
+    if (
+      obj["status"] === "improved" &&
+      typeof obj["example"] === "string" &&
+      obj["example"].trim() // empty or whitespace-only replacement is not accepted
+    ) {
+      return { status: "improved", example: obj["example"].trim() };
+    }
+    return null; // unknown status, or "improved" without a usable example
+  } catch {
+    return null; // JSON.parse (or property access on a non-object value) threw
+  }
+}
+
+function validateTranslations(
   raw: string,
   translations: TranslationRow[],
-): ValidationResult {
-  let parsed: unknown;
+): TranslationResult | null { // null = malformed reply or a translation without a vote
   try {
-    parsed = JSON.parse(raw);
+    const obj = JSON.parse(raw) as Record<string, unknown>;
+    if (typeof obj["translations"] !== "object" || obj["translations"] === null)
+      return null;
+    // Copy only recognised data: unsupported languages and non ok/reject values are skipped.
+    const result: TranslationResult = { translations: {} };
+    const translationsObj = obj["translations"] as Record<string, unknown>;
+
+    // Validate each language's votes
+    for (const [lang, votes] of Object.entries(translationsObj)) {
+      if (!SUPPORTED_LANG_SET.has(lang)) continue;
+      if (typeof votes !== "object" || votes === null) continue;
+
+      result.translations[lang as SupportedLanguageCode] = {};
+      for (const [word, status] of Object.entries(
+        votes as Record<string, unknown>,
+      )) {
+        if (status === "ok" || status === "reject") {
+          result.translations[lang as SupportedLanguageCode]![word] = status;
+        }
+      }
+    }
+
+    // Validate generated translations
+    if (obj["generated"] !== undefined && obj["generated"] !== null) {
+      if (typeof obj["generated"] !== "object") return null;
+      result.generated = {};
+      for (const [lang, word] of Object.entries(
+        obj["generated"] as Record<string, unknown>,
+      )) {
+        if (!SUPPORTED_LANG_SET.has(lang)) continue;
+        if (typeof word === "string" && word.trim()) {
+          result.generated[lang as SupportedLanguageCode] = word.trim();
+        }
+      }
+    }
+
+    // Check all translations got a vote
+    const byLang = new Map<string, Set<string>>();
+    for (const t of translations) {
+      if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, new Set());
+      byLang.get(t.target_lang)!.add(t.word);
+    }
+    // Own-property test: a word such as "constructor" must not match Object.prototype members.
+    for (const [lang, words] of byLang.entries()) {
+      const votes = result.translations[lang as SupportedLanguageCode];
+      if (!votes) return null;
+      for (const word of words) {
+        if (!Object.prototype.hasOwnProperty.call(votes, word)) return null;
+      }
+    }
+
+    return result;
   } catch {
-    return { valid: false, reason: "invalid JSON" };
+    return null;
   }
+}
 
-  if (typeof parsed !== "object" || parsed === null) {
-    return { valid: false, reason: "response is not an object" };
-  }
+function validateCefr(
+  raw: string,
+  validatedTranslations: Map<SupportedLanguageCode, string[]>,
+): CefrResult | null {
+  try {
+    const obj = JSON.parse(raw) as Record<string, unknown>;
+    if (typeof obj["headword_cefr"] !== "string") return null;
+    if (!CEFR_SET.has(obj["headword_cefr"])) return null;
+    if (
+      typeof obj["translation_cefr"] !== "object" ||
+      obj["translation_cefr"] === null
+    )
+      return null;
 
-  const obj = parsed as Record<string, unknown>;
+    const translationCefr = obj["translation_cefr"] as Record<string, unknown>;
+
+    // Verify all validated translations have a CEFR vote
+    for (const [lang, words] of validatedTranslations.entries()) {
+      const votes = translationCefr[lang] as Record<string, string> | undefined;
+      if (!votes) return null;
+      for (const word of words) {
+        if (!votes[word] || !CEFR_SET.has(votes[word])) return null;
+      }
+    }
 
-  // headword_cefr required
-  if (typeof obj["headword_cefr"] !== "string") {
-    return { valid: false, reason: "missing headword_cefr" };
-  }
-  if (!CEFR_SET.has(obj["headword_cefr"])) {
     return {
-      valid: false,
-      reason: `invalid headword_cefr: ${obj["headword_cefr"]}`,
+      headword_cefr: obj["headword_cefr"],
+      translation_cefr: translationCefr as Partial<
+        Record<SupportedLanguageCode, Record<string, string>>
+      >,
     };
+  } catch {
+    return null;
   }
-
-  // translation_cefr required
-  if (
-    typeof obj["translation_cefr"] !== "object" ||
-    obj["translation_cefr"] === null
-  ) {
-    return { valid: false, reason: "missing translation_cefr" };
-  }
-
-  const translationCefr = obj["translation_cefr"] as Record<string, unknown>;
-
-  // Verify all existing translations have a CEFR vote
-  const byLang = new Map<string, Set<string>>();
-  for (const t of translations) {
-    if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, new Set());
-    byLang.get(t.target_lang)!.add(t.word);
-  }
-
-  for (const [lang, words] of byLang.entries()) {
-    const votes = translationCefr[lang] as Record<string, string> | undefined;
-    if (!votes) {
-      return {
-        valid: false,
-        reason: `missing translation_cefr for language: ${lang}`,
-      };
-    }
-    for (const word of words) {
-      if (!votes[word]) {
-        return {
-          valid: false,
-          reason: `missing CEFR vote for ${lang}:${word}`,
-        };
-      }
-    }
-  }
-
-  // Optional fields
-  if (
-    obj["generated_translations"] !== undefined &&
-    obj["generated_translations"] !== null
-  ) {
-    if (typeof obj["generated_translations"] !== "object") {
-      return {
-        valid: false,
-        reason: "generated_translations is not an object",
-      };
-    }
-    for (const [lang, word] of Object.entries(
-      obj["generated_translations"] as Record<string, unknown>,
-    )) {
-      if (!SUPPORTED_LANG_SET.has(lang)) {
-        return {
-          valid: false,
-          reason: `unsupported language in generated_translations: ${lang}`,
-        };
-      }
-      if (typeof word !== "string" || !word.trim()) {
-        return {
-          valid: false,
-          reason: `empty generated translation for ${lang}`,
-        };
-      }
-    }
-  }
-
-  if (
-    obj["generated_gloss"] !== undefined &&
-    typeof obj["generated_gloss"] !== "string"
-  ) {
-    return { valid: false, reason: "generated_gloss is not a string" };
-  }
-
-  if (
-    obj["generated_example"] !== undefined &&
-    typeof obj["generated_example"] !== "string"
-  ) {
-    return { valid: false, reason: "generated_example is not a string" };
-  }
-
-  return { valid: true, data: obj as unknown as LlmResponse };
 }
 
 // ── LLM call ──────────────────────────────────────────────────────────────────
@@ -275,14 +319,14 @@ async function callLlm(
   prompt: string,
   provider: ProviderConfig,
 ): Promise<string> {
-  const controller = new AbortController();
-  const timeout = setTimeout(() => controller.abort(), 120_000); // 2 minutes
+  currentCallController = new AbortController();
+  const timeout = setTimeout(() => currentCallController?.abort(), 120_000);
 
   let response: Response;
   try {
     response = await fetch(`${provider.baseURL}/chat/completions`, {
       method: "POST",
-      signal: controller.signal,
+      signal: currentCallController.signal,
       headers: {
         "Content-Type": "application/json",
         Authorization: `Bearer ${provider.apiKey}`,
@@ -296,6 +340,7 @@ async function callLlm(
     });
   } finally {
     clearTimeout(timeout);
+    currentCallController = null;
   }
 
   if (!response.ok) {
@@ -306,15 +351,7 @@ async function callLlm(
     choices?: { message?: { content?: string } }[];
   };
 
-  const content =
-    data.choices?.[0]?.message?.content ||
-    ((data.choices?.[0]?.message as Record<string, unknown>)?.[
-      "reasoning_content"
-    ] as string | undefined);
-  console.log(
-    "\n  DEBUG response:",
-    JSON.stringify(data.choices?.[0]?.message),
-  );
+  const content = data.choices?.[0]?.message?.content;
   if (!content) throw new Error("LLM returned empty response");
 
   return content
@@ -323,124 +360,147 @@ async function callLlm(
     .trim();
 }
 
-// ── Write results ─────────────────────────────────────────────────────────────
+// ── Status helpers ────────────────────────────────────────────────────────────
 
-function writeResults(
+function getSubStageStatus( // looks up the stored status of one entry/model/sub-stage
   entryId: number,
   modelName: string,
-  data: LlmResponse,
+  stage: SubStage,
+): "complete" | "needs_review" | "pending" {
+  const db = openDb(); // opened and closed within this call
+  const row = db
+    .prepare(
+      `SELECT status FROM run_status
+       WHERE entry_id = ? AND model_name = ? AND stage = ?`,
+    )
+    .get(entryId, modelName, stage) as { status: string } | undefined;
+  db.close();
+  if (!row) return "pending"; // no run_status row for this combination
+  if (row.status === "complete") return "complete";
+  if (row.status === "needs_review") return "needs_review";
+  return "pending"; // any other stored value is reported as pending
+}
+
+function markSubStage( // inserts or updates the run_status row for one entry/model/sub-stage
+  entryId: number,
+  modelName: string,
+  stage: SubStage,
+  status: "complete" | "needs_review",
+): void {
+  const db = openDb(); // opened and closed within this call
+  db.prepare(
+    `INSERT INTO run_status (entry_id, model_name, stage, status)
+     VALUES (?, ?, ?, ?)
+     ON CONFLICT (entry_id, model_name, stage)
+     DO UPDATE SET status = ?, updated_at = datetime('now')`,
+  ).run(entryId, modelName, stage, status, status); // status bound twice: VALUES and SET
+  db.close();
+}
+
+// ── Write helpers ─────────────────────────────────────────────────────────────
+
+function writeGloss( // stores a model's replacement gloss; does nothing for status "ok"
+  entryId: number,
+  modelName: string,
+  result: GlossResult,
+): void {
+  if (result.status === "improved") { // only "improved" results carry text to store
+    const db = openDb();
+    db.prepare(
+      `INSERT INTO generated_glosses (entry_id, model_name, text)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    ).run(entryId, modelName, result.gloss); // an existing (entry, model) row is left untouched
+    db.close();
+  }
+}
+
+function writeExample( // stores a model's replacement example; does nothing for status "ok"
+  entryId: number,
+  modelName: string,
+  result: ExampleResult,
+): void {
+  if (result.status === "improved") { // only "improved" results carry text to store
+    const db = openDb();
+    db.prepare(
+      `INSERT INTO generated_examples (entry_id, model_name, text)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    ).run(entryId, modelName, result.example); // an existing (entry, model) row is left untouched
+    db.close();
+  }
+}
+
+function writeTranslations(
+  entryId: number,
+  modelName: string,
+  result: TranslationResult,
   translations: TranslationRow[],
 ): void {
   const db = openDb();
 
-  const insertEntryCefr = db.prepare(`
-    INSERT INTO model_entry_cefr_votes (entry_id, model_name, cefr_level)
-    VALUES (?, ?, ?)
-    ON CONFLICT (entry_id, model_name) DO NOTHING
-  `);
-
-  const insertTranslationCefr = db.prepare(`
-    INSERT INTO model_translation_cefr_votes (translation_id, model_name, cefr_level)
-    VALUES (?, ?, ?)
-    ON CONFLICT (translation_id, model_name) DO NOTHING
-  `);
-
-  const insertGeneratedTranslation = db.prepare(`
-    INSERT INTO generated_translations (entry_id, model_name, target_lang, word)
-    VALUES (?, ?, ?, ?)
-    ON CONFLICT (entry_id, model_name, target_lang) DO NOTHING
-  `);
-
-  const insertGeneratedGloss = db.prepare(`
-    INSERT INTO generated_glosses (entry_id, model_name, text)
-    VALUES (?, ?, ?)
-    ON CONFLICT (entry_id, model_name) DO NOTHING
-  `);
-
-  const insertGeneratedExample = db.prepare(`
-    INSERT INTO generated_examples (entry_id, model_name, text)
-    VALUES (?, ?, ?)
-    ON CONFLICT (entry_id, model_name) DO NOTHING
-  `);
-
-  const updateRunStatus = db.prepare(`
-    INSERT INTO run_status (entry_id, model_name, stage, status)
-    VALUES (?, ?, 'round1', 'complete')
-    ON CONFLICT (entry_id, model_name, stage)
-    DO UPDATE SET status = 'complete', updated_at = datetime('now')
-  `);
-
   db.transaction(() => {
-    // CEFR vote for headword
-    insertEntryCefr.run(entryId, modelName, data.headword_cefr);
-
-    // CEFR votes and rejections for translations
+    // Write rejections
     for (const t of translations) {
-      const level = data.translation_cefr[t.target_lang]?.[t.word];
-
-      if (!level) continue;
-      if (level === "reject") {
-        // Explicit rejection or silently skipped — both treated as rejection
+      const vote = result.translations[t.target_lang]?.[t.word];
+      if (vote === "reject") {
         db.prepare(
-          `
-          INSERT INTO model_translation_rejections (translation_id, model_name)
-          VALUES (?, ?)
-          ON CONFLICT (translation_id, model_name) DO NOTHING
-        `,
+          `INSERT INTO model_translation_rejections (translation_id, model_name)
+           VALUES (?, ?)
+           ON CONFLICT (translation_id, model_name) DO NOTHING`,
         ).run(t.id, modelName);
-      } else {
-        insertTranslationCefr.run(t.id, modelName, level);
       }
     }
 
-    // Generated translations
-    if (data.generated_translations) {
-      for (const [lang, word] of Object.entries(data.generated_translations)) {
-        if (word.trim()) {
-          insertGeneratedTranslation.run(entryId, modelName, lang, word.trim());
-        }
+    // Write generated translations
+    if (result.generated) {
+      for (const [lang, word] of Object.entries(result.generated)) {
+        db.prepare(
+          `INSERT INTO generated_translations (entry_id, model_name, target_lang, word)
+           VALUES (?, ?, ?, ?)
+           ON CONFLICT (entry_id, model_name, target_lang) DO NOTHING`,
+        ).run(entryId, modelName, lang, word);
       }
     }
-
-    // Generated gloss
-    if (data.generated_gloss?.trim()) {
-      insertGeneratedGloss.run(entryId, modelName, data.generated_gloss.trim());
-    }
-
-    // Generated example
-    if (data.generated_example?.trim()) {
-      insertGeneratedExample.run(
-        entryId,
-        modelName,
-        data.generated_example.trim(),
-      );
-    }
-
-    // Mark complete
-    updateRunStatus.run(entryId, modelName);
   })();
 
   db.close();
 }
 
-function markNeedsReview(
+function writeCefr(
   entryId: number,
   modelName: string,
-  reason: string,
+  result: CefrResult,
+  translations: TranslationRow[],
 ): void {
   const db = openDb();
-  db.prepare(
-    `
-    INSERT INTO run_status (entry_id, model_name, stage, status)
-    VALUES (?, ?, 'round1', 'needs_review')
-    ON CONFLICT (entry_id, model_name, stage)
-    DO UPDATE SET status = 'needs_review', updated_at = datetime('now')
-  `,
-  ).run(entryId, modelName);
+
+  db.transaction(() => {
+    // Headword CEFR
+    db.prepare(
+      `INSERT INTO model_entry_cefr_votes (entry_id, model_name, cefr_level)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    ).run(entryId, modelName, result.headword_cefr);
+
+    // Translation CEFR votes
+    for (const t of translations) {
+      const level = result.translation_cefr[t.target_lang]?.[t.word];
+      if (level && CEFR_SET.has(level)) {
+        db.prepare(
+          `INSERT INTO model_translation_cefr_votes (translation_id, model_name, cefr_level)
+           VALUES (?, ?, ?)
+           ON CONFLICT (translation_id, model_name) DO NOTHING`,
+        ).run(t.id, modelName, level);
+      }
+    }
+  })();
+
   db.close();
-  console.warn(`    needs_review: entry ${entryId} — ${reason}`);
 }
 
+// ── Progress ──────────────────────────────────────────────────────────────────
+
 function updateProgress(
   processed: number,
   needsReview: number,
@@ -474,66 +534,91 @@ function updateProgress(
 export async function enrich(
   provider: ProviderConfig,
 ): Promise<{ processed: number; skipped: number; needsReview: number }> {
+  registerEnrichShutdown();
   const db = openDb();
 
-  // Load all English entries
   const allEntries = db
     .prepare(`SELECT * FROM entries WHERE language = 'en'`)
     .all() as EntryRow[];
 
-  // Find already processed entries for this model
-  const processed = db
+  // Only round1_gloss is checked for now; sub-stages 2-4 are not active yet
+  const completeEntries = db
     .prepare(
       `SELECT entry_id FROM run_status
-       WHERE model_name = ? AND stage = 'round1'
-       AND status IN ('complete', 'needs_review')`,
+       WHERE model_name = ? AND stage = 'round1_gloss'
+       AND status = 'complete'`,
     )
     .all(provider.name) as { entry_id: number }[];
 
-  const processedIds = new Set(processed.map((r) => r.entry_id));
-  const pending = allEntries
-    .filter((e) => !processedIds.has(e.id))
-    .slice(0, 10);
+  const completeIds = new Set(completeEntries.map((r) => r.entry_id));
+  const pending = allEntries.filter((e) => !completeIds.has(e.id)).slice(0, 50);
 
   db.close();
 
   console.log(`\n  Model: ${provider.name}`);
   console.log(`  Total entries: ${allEntries.length.toLocaleString()}`);
-  console.log(`  Already processed: ${processedIds.size.toLocaleString()}`);
+  console.log(`  Already complete: ${completeIds.size.toLocaleString()}`);
   console.log(`  Pending: ${pending.length.toLocaleString()}`);
 
   if (pending.length === 0) {
     console.log("  Nothing to process.");
-    return { processed: 0, skipped: allEntries.length, needsReview: 0 };
+    return { processed: 0, skipped: completeIds.size, needsReview: 0 };
   }
 
   let processedCount = 0;
   let needsReviewCount = 0;
   let llmMs = 0;
-
   const startTime = Date.now();
 
   for (const entry of pending) {
+    if (shutdownRequested) break;
+
     const db2 = openDb();
     const translations = db2
       .prepare(
-        `SELECT id, target_lang, word FROM translations
-         WHERE entry_id = ? AND source = 'kaikki'`,
+        `SELECT id, target_lang, word FROM translations WHERE entry_id = ? AND source = 'kaikki'`,
       )
       .all(entry.id) as TranslationRow[];
     db2.close();
 
-    const prompt = buildPrompt(entry, translations);
+    let entryFailed = false;
 
-    let raw: string;
+    // ── Sub-stage 1: Gloss ────────────────────────────────────────────────────
 
-    try {
-      const llmStart = Date.now();
-      raw = await callLlm(prompt, provider);
-      llmMs = Date.now() - llmStart;
-    } catch (err) {
-      const message = err instanceof Error ? err.message : String(err);
-      markNeedsReview(entry.id, provider.name, `LLM call failed: ${message}`);
+    let verifiedGloss = entry.gloss ?? "";
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_gloss") !== "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(buildGlossPrompt(entry), provider);
+        llmMs = Date.now() - llmStart;
+
+        const result = validateGloss(raw);
+        if (!result) {
+          markSubStage(entry.id, provider.name, "round1_gloss", "needs_review");
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_gloss — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeGloss(entry.id, provider.name, result);
+          if (result.status === "improved") verifiedGloss = result.gloss;
+          markSubStage(entry.id, provider.name, "round1_gloss", "complete");
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_gloss", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_gloss — ${message}`,
+        );
+        entryFailed = true;
+      }
+    }
+
+    if (entryFailed) {
       needsReviewCount++;
       updateProgress(
         processedCount,
@@ -545,13 +630,50 @@ export async function enrich(
       continue;
     }
 
-    const validation = validateResponse(raw, translations);
-    if (!validation.valid) {
-      markNeedsReview(
-        entry.id,
-        provider.name,
-        `validation failed: ${validation.reason}`,
-      );
+    /*
+    // ── Sub-stages 2, 3, 4 — not yet active ──────────────────────────────────
+    // ── Sub-stage 2: Example ──────────────────────────────────────────────────
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_example") !==
+      "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildExamplePrompt(entry, verifiedGloss),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateExample(raw);
+        if (!result) {
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_example",
+            "needs_review",
+          );
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_example — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeExample(entry.id, provider.name, result);
+          markSubStage(entry.id, provider.name, "round1_example", "complete");
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_example", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_example — ${message}`,
+        );
+        entryFailed = true;
+      }
+    }
+
+    if (entryFailed) {
       needsReviewCount++;
       updateProgress(
         processedCount,
@@ -563,7 +685,169 @@ export async function enrich(
       continue;
     }
 
-    writeResults(entry.id, provider.name, validation.data, translations);
+    // ── Sub-stage 3: Translations ─────────────────────────────────────────────
+
+    const validatedTranslations = new Map<SupportedLanguageCode, string[]>();
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_translations") !==
+      "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildTranslationsPrompt(entry, translations, verifiedGloss),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateTranslations(raw, translations);
+        if (!result) {
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_translations",
+            "needs_review",
+          );
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_translations — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeTranslations(entry.id, provider.name, result, translations);
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_translations",
+            "complete",
+          );
+
+          // Build validated translations map for CEFR sub-stage
+          // Include kaikki translations that were ok'd + generated translations
+          for (const t of translations) {
+            const vote = result.translations[t.target_lang]?.[t.word];
+            if (vote === "ok") {
+              if (!validatedTranslations.has(t.target_lang)) {
+                validatedTranslations.set(t.target_lang, []);
+              }
+              validatedTranslations.get(t.target_lang)!.push(t.word);
+            }
+          }
+          if (result.generated) {
+            for (const [lang, word] of Object.entries(result.generated)) {
+              const l = lang as SupportedLanguageCode;
+              if (!validatedTranslations.has(l))
+                validatedTranslations.set(l, []);
+              validatedTranslations.get(l)!.push(word);
+            }
+          }
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(
+          entry.id,
+          provider.name,
+          "round1_translations",
+          "needs_review",
+        );
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_translations — ${message}`,
+        );
+        entryFailed = true;
+      }
+    } else {
+      // Already complete — rebuild validated translations from db
+      const db3 = openDb();
+      const rejections = new Set(
+        (
+          db3
+            .prepare(
+              `SELECT translation_id FROM model_translation_rejections WHERE model_name = ?`,
+            )
+            .all(provider.name) as { translation_id: number }[]
+        ).map((r) => r.translation_id),
+      );
+      for (const t of translations) {
+        if (!rejections.has(t.id)) {
+          if (!validatedTranslations.has(t.target_lang)) {
+            validatedTranslations.set(t.target_lang, []);
+          }
+          validatedTranslations.get(t.target_lang)!.push(t.word);
+        }
+      }
+      const generated = db3
+        .prepare(
+          `SELECT target_lang, word FROM generated_translations WHERE entry_id = ? AND model_name = ?`,
+        )
+        .all(entry.id, provider.name) as {
+        target_lang: SupportedLanguageCode;
+        word: string;
+      }[];
+      for (const g of generated) {
+        if (!validatedTranslations.has(g.target_lang))
+          validatedTranslations.set(g.target_lang, []);
+        validatedTranslations.get(g.target_lang)!.push(g.word);
+      }
+      db3.close();
+    }
+
+    if (entryFailed) {
+      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
+      continue;
+    }
+
+    // ── Sub-stage 4: CEFR ─────────────────────────────────────────────────────
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_cefr") !== "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildCefrPrompt(entry, verifiedGloss, validatedTranslations),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateCefr(raw, validatedTranslations);
+        if (!result) {
+          markSubStage(entry.id, provider.name, "round1_cefr", "needs_review");
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_cefr — invalid response`,
+          );
+          needsReviewCount++;
+        } else {
+          // Get translation rows for validated words only
+          const validatedRows = translations.filter((t) => {
+            return validatedTranslations.get(t.target_lang)?.includes(t.word);
+          });
+          writeCefr(entry.id, provider.name, result, validatedRows);
+          markSubStage(entry.id, provider.name, "round1_cefr", "complete");
+          processedCount++;
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_cefr", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_cefr — ${message}`,
+        );
+        needsReviewCount++;
+      }
+    } else {
+      processedCount++;
+    }
+
+    */
+
     processedCount++;
     updateProgress(
       processedCount,
@@ -582,13 +866,12 @@ export async function enrich(
   console.log(
     `  Avg per entry: ${(totalMs / Math.max(processedCount + needsReviewCount, 1) / 1000).toFixed(1)}s`,
   );
-
   console.log(`  Processed: ${processedCount.toLocaleString()}`);
   console.log(`  Needs review: ${needsReviewCount.toLocaleString()}`);
 
   return {
     processed: processedCount,
-    skipped: processedIds.size,
+    skipped: completeIds.size,
     needsReview: needsReviewCount,
   };
 }
diff --git a/documentation/data-pipeline.md b/documentation/data-pipeline.md
index 48ba7b2..e8899dc 100644
--- a/documentation/data-pipeline.md
+++ b/documentation/data-pipeline.md
@@ -121,23 +121,11 @@ pnpm --filter @lila/pipeline reverse-link
 ### Resetting enrich stage progress
 
 ```bash
-# Reset round 1 only (retry failed or incomplete run)
-node -e "
-const Database = require('better-sqlite3');
-const db = new Database('/db/pipeline.db');
-const result = db.prepare(\"DELETE FROM run_status WHERE stage = 'round1'\").run();
-console.log('Deleted', result.changes, 'rows');
-db.close();
-"
+# Reset round 1 only
+pnpm --filter @lila/pipeline db:reset round1
 
-# Reset all enrich progress (round 1 and round 2)
-node -e "
-const Database = require('better-sqlite3');
-const db = new Database('data-pipeline/db/pipeline.db');
-const result = db.prepare(\"DELETE FROM run_status WHERE stage IN ('round1', 'round2')\").run();
-console.log('Deleted', result.changes, 'rows');
-db.close();
-"
+# Reset all stages except reverse link
+pnpm --filter @lila/pipeline db:reset all
 ```
 
 ### Checking pipeline progress
diff --git a/documentation/model-strategy.md b/documentation/model-strategy.md
new file mode 100644
index 0000000..bc36327
--- /dev/null
+++ b/documentation/model-strategy.md
@@ -0,0 +1,173 @@
+# Model Strategy
+
+## The problem
+
+The pipeline requires LLMs to perform four tasks per vocabulary entry:
+
+1. **Gloss review** — confirm or improve the existing gloss
+2. **Example review** — confirm or improve existing examples
+3. **Translation validation** — confirm valid translations, reject bad data, generate missing ones
+4. **CEFR assignment** — assign A1-C2 to the headword and each translation
+
+The core challenge is that vocabulary entries have **multiple senses**. The word "cat" appears five times in the database — as an animal, as slang for "guy", as a nautical term, as a verb meaning "to vomit", and as a verb meaning "to hoist an anchor". Each sense requires a different CEFR level and different translations. A model that only knows "cat" is A1 gets four out of five wrong.
+
+This makes CEFR assignment fundamentally a **sense-disambiguation problem**, not just a vocabulary lookup. Specialized CEFR classifiers (like `cefrpy` or `dksysd/cefr-classifier`) operate at the word or sentence level and cannot distinguish between senses of the same word. General LLMs handle sense disambiguation well but introduce quality and reliability problems that depend heavily on model size.
+
+The secondary challenge is **hardware constraints**. The available local hardware (GTX 950M, 4GB VRAM) can only run models up to approximately 4B parameters fully in GPU memory. Larger models run in hybrid CPU/GPU mode which is significantly slower. Free cloud API tiers are generous enough for the sample dataset but have daily limits that make processing 100k+ entries across multiple sub-stages a multi-day or multi-week operation.
+
+## What we tried and why it failed or worked
+
+### Single-prompt design (abandoned)
+
+The first enrich script sent one large prompt per entry covering all four tasks at once — gloss improvement, example improvement, translation validation (including generating missing translations), and CEFR voting. This produced the following problems:
+
+- The model skipped translations it considered invalid rather than explicitly rejecting them, causing validation failures
+- Bad data in the translation table (`it:free`, `de:-frei`, `es:de fai`) caused consistent validation failures because the model refused to vote on them even when explicitly instructed
+- The combined prompt was large enough to trigger reasoning mode on Gemma 4 E4B, consuming all available tokens on thinking before producing output
+- 20% of entries required manual review
+
+### Sub-stage design (current)
+
+Splitting into four ordered sub-stages fixed the reasoning and validation problems:
+
+1. `round1_gloss` — LLM reviews the gloss in isolation
+2. `round1_example` — LLM reviews examples with verified gloss as context
+3. `round1_translations` — LLM validates translations with verified gloss as context
+4. `round1_cefr` — LLM assigns CEFR levels only to validated translations
+
+This ordering ensures the CEFR sub-stage never sees bad data. The smaller, focused prompts no longer trigger reasoning mode, and they reduced per-entry time from ~120 seconds to ~25 seconds.
+
+### Gloss quality (ongoing)
+
+Testing on 50 entries with Qwen3.5-4B produced good-quality glosses for roughly 80% of entries. The remaining ~20% failed, and the failures fall into three categories:
+
+- **Category header glosses** — Kaikki occasionally uses "Terms relating to people." or "Terms relating to things." as a gloss instead of a real definition. No model handles these correctly because there is no real meaning to improve.
+- **Rare/obscure senses** — slang, archaic, and theological senses that a 4B model does not have enough knowledge to handle (e.g. "cat" meaning "to vomit", "word" meaning "Logos, Christ").
+- **Short ambiguous glosses** — one or two word glosses with no example context cause hallucination.
+
+### Gemma 4 E4B (rejected)
+
+Gemma 4 E4B is a hybrid reasoning model. Disabling thinking via `--reasoning-budget 0` or `--chat-template-kwargs '{"enable_thinking":false}'` does not work reliably in llama.cpp for the E4B variant — the model either puts reasoning into the content field as plain text or returns empty content with reasoning in `reasoning_content`. Per-entry time exceeded 100 seconds, making it impractical.
+
+### Qwen3.5-4B (current local model)
+
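+The small models in the Qwen3.5 series are non-thinking by default. The 4B model runs fully in 4GB VRAM at ~5 seconds per sub-stage. Quality is acceptable for common vocabulary (A1-B2), but the model struggles with rare and specialized senses. It is currently used as the primary local voter; the final approach below plans to use Qwen3.5-9B in that role once it has been tested.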
+
+### Specialized CEFR classifiers (rejected for primary use)
+
+HuggingFace hosts several CEFR text classifiers (`dksysd/cefr-classifier`, `AbdulSami/bert-base-cased-cefr`) and the `cefrpy` Python library maps individual words to CEFR levels. These operate at the word or sentence level and cannot distinguish between senses. "cat" would always be assigned A1 regardless of whether the sense is the animal or obscure nautical slang. Useful only as a sanity check signal, not as a primary voter.
+
+## Available free resources
+
+| Resource                     | Type               | Requests/day      | Quality   | Notes                                                                  |
+| ---------------------------- | ------------------ | ----------------- | --------- | ---------------------------------------------------------------------- |
+| Local Qwen3.5-4B Q4_K_M      | Local model        | Unlimited         | Decent    | Non-thinking by default, fits in 4GB VRAM, ~5s per sub-stage           |
+| Local Qwen3.5-9B Q4_K_M      | Local model        | Unlimited         | Good      | Hybrid CPU/GPU mode on 4GB VRAM, slower but better quality             |
+| Local Llama 3.1 8B Q4_K_M    | Local model        | Unlimited         | Decent    | ~4.3GB, fits in VRAM or light hybrid, different architecture from Qwen |
+| Groq — Llama 3.3 70B         | Cloud API          | 1,000             | Excellent | Best free quality available, 5-10x with batching                       |
+| Groq — Llama 3.1 8B          | Cloud API          | 14,400            | Decent    | High volume, similar quality to local 4B                               |
+| Google Gemini AI Studio      | Cloud API          | 1,500             | Very good | Google account required, 5-10x with batching                           |
+| OpenRouter free rotation     | Cloud API          | 50–1,000          | Varies    | Rotates between free models automatically via `openrouter/free`        |
+| Wiktionary API               | Context enrichment | Unlimited         | N/A       | Structured vocabulary data, directly related to Kaikki source          |
+| `cefrpy` Python library      | Word lookup        | Unlimited         | Limited   | Deterministic English word CEFR lookup, no sense disambiguation        |
+| HuggingFace CEFR classifiers | Text classifier    | Unlimited (local) | Limited   | Sentence-level difficulty, not sense-aware                             |
+
+### Batching
+
+All cloud APIs support sending multiple entries in a single request. Sending 5 entries per request multiplies effective daily capacity by 5:
+
+- Groq Llama 3.3 70B: 1,000 requests → ~5,000 entries/day
+- Gemini: 1,500 requests → ~7,500 entries/day
+
+### Multiple accounts
+
+Prohibited by the terms of service of all providers listed above.
+
+## Final approach per sub-stage
+
+The pipeline runs multiple models as independent voters. Each model processes every entry once and writes its votes to `pipeline.db`. The merge stage resolves disagreements by majority vote. A tiebreaker runs additional models on flagged entries where no majority was reached.
+
+### round1_gloss and round1_example
+
+These sub-stages require a model that understands sense context from examples. Specialized classifiers cannot help here — only general LLMs can evaluate whether a gloss correctly describes a specific sense.
+
+**Primary voter:** Local Qwen3.5-9B Q4_K_M — runs overnight, unlimited, handles common vocabulary well.
+
+**Secondary voter:** Groq Llama 3.3 70B with 5-entry batching — higher quality, catches errors the local model makes on rare or specialized senses.
+
+**Tertiary voter:** Gemini AI Studio with 5-entry batching — third independent opinion, different training data from both Groq and local model.
+
+**Context enrichment via Wiktionary API:** Before calling any model for the gloss or example sub-stage, the pipeline queries the Wiktionary API for the headword. The API returns the full Wiktionary entry including all senses, usage notes, and examples. This structured data is added to the prompt as additional context, giving the model a much clearer picture of which specific sense it is working with.
+
+This directly fixes the two hardest failure cases:
+- **Category header glosses** ("Terms relating to people.") — the Wiktionary entry contains the real definition which the model can use to generate a proper gloss
+- **Short ambiguous glosses** — the additional sense context prevents the model from guessing the wrong meaning
+
+The Wiktionary API is free, has no rate limits for reasonable use, and is directly related to the Kaikki data source since Kaikki extracts from Wiktionary.
+
+### round1_translations
+
+Same voter stack as gloss/example. The few-shot examples in the prompt (showing that `it:free` → reject and `de:-frei` → reject) handle the bad data cases that caused validation failures in the single-prompt design.
+
+### round1_cefr
+
+This sub-stage only receives translations that survived the validation step. All bad data is already excluded.
+
+**Primary voter:** Local Qwen3.5-9B Q4_K_M.
+
+**Secondary voter:** Groq Llama 3.3 70B with 5-entry batching.
+
+**Tertiary voter:** Gemini AI Studio with 5-entry batching.
+
+**Sanity check:** `cefrpy` provides a deterministic English word CEFR level as a reference signal. If the majority LLM vote disagrees significantly (e.g. LLMs vote C2 for "cat" the animal), the entry is flagged for human review. `cefrpy` does not vote — it only triggers review flags.
+
+### Voter summary
+
+| Sub-stage           | Voter 1            | Voter 2            | Voter 3 |
+| ------------------- | ------------------ | ------------------ | ------- |
+| round1_gloss        | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_example      | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_translations | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_cefr         | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+
+With three voters, a majority requires at least two models to agree. Even if the local model gets a difficult sense wrong, the two cloud models will likely agree on the correct answer and outvote it.
+
+## Open questions
+
+### Wiktionary API context extraction
+The Wiktionary API returns the full entry for a word including all senses. For a word like "free" with 8+ senses, dumping the entire entry into the prompt wastes tokens and may confuse the model. The open question is how to extract only the relevant sense — options include matching by sense_index, fuzzy-matching the Kaikki gloss against Wiktionary glosses, or letting the model see all senses and identify the correct one itself.
+
+### Batching prompt design
+Batching 5-10 entries per API call multiplies effective daily capacity significantly. The prompt and validation logic for batched requests is more complex — the model must return a structured JSON object keyed by entry ID, and partial failures (one entry in a batch fails validation) need careful handling. Not yet designed or tested.
+
+### Groq and Gemini API integration
+Neither Groq nor Gemini is integrated into the pipeline yet. Both use OpenAI-compatible APIs so integration is straightforward — add provider configs to `stage-3-enrich/config.ts` and set API keys in `.env`. The batching prompt design needs to be finalised first.
+
+### OpenRouter free model rotation
+OpenRouter's `openrouter/free` router selects a model at random from available free models. This means output style and quality vary between requests, which complicates round 2 voting where models review each other's candidates. May need to pin specific free models rather than using the router.
+
+### Qwen3.5-9B performance on hard cases
+The 9B model has not yet been tested. It is expected to handle rare and specialized senses better than the 4B model but this has not been verified. Needs a test run against the same 50 entries used to evaluate the 4B model.
+
+### Llama.cpp Gemma 4 bug
+The llama.cpp chat template bug preventing reliable JSON output from Gemma 4 E4B may be fixed in a future release. The model fits in 4GB VRAM and would be a useful additional local voter if the bug is resolved. Worth checking periodically.
+
+### Full dataset scale
+The current pipeline runs on a 500-entry sample per language. The full Kaikki English file contains approximately 1.3 million entries, of which a fraction will pass the POS and translation filters. The exact count and the time required to run all sub-stages across all models at full scale are not yet known.
+
+### Category header glosses
+Kaikki occasionally uses category headers ("Terms relating to people.", "Terms relating to things.") as glosses. These are not real definitions and no model produces useful output for them. Options include pre-filtering them before the gloss sub-stage and generating a gloss purely from examples, or flagging them as a special case for human review.
+
+## Model download commands
+```bash
+wget -O models/llama-3.1-8b-instruct-q4_k_m.gguf \
+  "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
+
+# Q4_K_M (5.68GB — hybrid mode, better quality)
+wget -O models/qwen3.5-9b-q4_k_m.gguf \
+  "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf"
+
+# Q3_K_S (4.32GB — might fit fully in VRAM)
+wget -O models/qwen3.5-9b-q3_k_s.gguf \
+  "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q3_K_S.gguf"
+```