feat: enrich script working, redesigning to sub-stage architecture

- Enrich script functional with timeout, progress tracking, rejection mechanism
- Identified ordering issue: CEFR voting needs validated translations first
- Redesign: round1_gloss → round1_example → round1_translations → round1_cefr
- Update data-pipeline.md with new sub-stage design and roadmap
- Qwen3.5-4B confirmed working with thinking disabled
2026-05-07 13:09:43 +02:00 · 2026-05-07 13:09:43 +02:00 · 73fb12ac35
commit 73fb12ac35
parent 7f10c35e03
7 changed files with 337 additions and 122 deletions
--- a/data-pipeline/db/pipeline.db-shm
+++ b/data-pipeline/db/pipeline.db-shm
--- a/data-pipeline/db/pipeline.db-wal
+++ b/data-pipeline/db/pipeline.db-wal
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@ -60,6 +60,13 @@ CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
  UNIQUE (translation_id, model_name)
 );
 -- Records that a model rejected a translation. The enrich script inserts a
 -- row when a model answers "reject" for a translation instead of a CEFR level.
 -- The UNIQUE constraint allows at most one row per (translation, model) pair.
 CREATE TABLE IF NOT EXISTS model_translation_rejections (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  model_name     TEXT    NOT NULL,
  UNIQUE (translation_id, model_name)
 );
 CREATE TABLE IF NOT EXISTS generated_glosses (
  id         INTEGER PRIMARY KEY,
  entry_id   INTEGER NOT NULL REFERENCES entries(id),
--- a/data-pipeline/stage-3-enrich/config.ts
+++ b/data-pipeline/stage-3-enrich/config.ts
@ -20,12 +20,20 @@ export type ProviderConfig = {
 // ── Local llama.cpp ───────────────────────────────────────────────────────────
 /**
  * Provider config for Qwen3.5-4B served by the local llama.cpp server.
  * Points at the same local endpoint as the other local llama.cpp providers.
  */
 export const LOCAL_QWEN35_4B: ProviderConfig = {
  name: "local-qwen3.5-4b",
  baseURL: "http://127.0.0.1:8080/v1",
  // NOTE(review): presumably ignored by llama.cpp, as noted on LOCAL_GEMMA4 — confirm
  apiKey: "none",
  model: "qwen3.5-4b",
  maxTokens: 1024, // no reasoning overhead so 1024 is enough
 };
 export const LOCAL_GEMMA4: ProviderConfig = {
  name: "local-gemma4-e4b",
  baseURL: "http://127.0.0.1:8080/v1",
  apiKey: "none", // llama.cpp ignores this
  model: "gemma4-e4b", // llama.cpp ignores model name, uses loaded model
-  maxTokens: 512,
+  maxTokens: 2048,
 };
 export const LOCAL_QWEN7B: ProviderConfig = {
@ -87,13 +95,14 @@ export const ANTHROPIC_SONNET: ProviderConfig = {
 // Add new providers here to include them in the voting pool.
 export const ALL_PROVIDERS: ProviderConfig[] = [
-  LOCAL_GEMMA4,
+  LOCAL_QWEN35_4B,
-  LOCAL_QWEN7B,
+  // LOCAL_GEMMA4,
-  OR_QWEN3_480B,
+  // LOCAL_QWEN7B,
-  OR_GEMMA4_31B,
+  // OR_QWEN3_480B,
-  OR_QWEN3_80B,
+  // OR_GEMMA4_31B,
-  OR_NEMOTRON,
+  // OR_QWEN3_80B,
-  ANTHROPIC_SONNET,
+  // OR_NEMOTRON,
  // ANTHROPIC_SONNET,
 ];
 // ── Key validation ────────────────────────────────────────────────────────────
--- a/data-pipeline/stage-3-enrich/scripts/enrich.ts
+++ b/data-pipeline/stage-3-enrich/scripts/enrich.ts
@ -94,19 +94,68 @@ MISSING TRANSLATIONS: ${missingTranslationsText}
 Respond ONLY with valid JSON and nothing else — no explanation, no markdown:
 {
-  "headword_cefr": "<level>",
+  "headword_cefr": "B1",
  "translation_cefr": {
-    "<lang>": { "<word>": "<level>", ... },
+    "de": { "frei": "A2" },
-    ...
+    "es": { "libre": "A2" },
    "fr": { "libre": "A2" },
    "it": { "libero": "A2" }
  },
-  "generated_translations": { "<lang>": "<word>", ... },
+  "generated_translations": { "missing_lang": "word" },
-  "generated_gloss": "<gloss if needed, omit if existing is fine>",
+  "generated_gloss": "A clearer definition for learners.",
-  "generated_example": "<example sentence in English if needed, omit if existing is fine>"
+  "generated_example": "A natural example sentence."
 }
-Only include "generated_translations" if there are missing languages.
+EXAMPLE OF CORRECT BEHAVIOUR:
-Only include "generated_gloss" if you judge the existing gloss unsuitable.
+If you receive:
-Only include "generated_example" if you judge the existing examples unsuitable.`;
+  WORD: cat
  EXISTING TRANSLATIONS:
    it: gatto, cat
 The correct response includes "reject" for "cat" because it is an English word, not Italian:
  "translation_cefr": {
    "it": { "gatto": "A1", "cat": "reject" }
  }
 Similarly, if you receive:
  EXISTING TRANSLATIONS:
    de: frei, -frei
 The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
  "translation_cefr": {
    "de": { "frei": "A2", "-frei": "reject" }
  }
  EXAMPLE OF CORRECT BEHAVIOUR:
  If you receive:
    WORD: cat
    EXISTING TRANSLATIONS:
      it: gatto, cat
  The correct response includes "reject" for "cat" because it is an English word, not Italian:
    "translation_cefr": {
      "it": { "gatto": "A1", "cat": "reject" }
    }
  Similarly, if you receive:
    EXISTING TRANSLATIONS:
      de: frei, -frei
  The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
    "translation_cefr": {
      "de": { "frei": "A2", "-frei": "reject" }
    }
 IMPORTANT:
 - You MUST include EVERY translation listed in EXISTING TRANSLATIONS in your response — no exceptions
 - Use the CEFR level (A1-C2) if the translation is valid for this sense
 - Use "reject" if the translation does not fit this specific sense, is not a real word in that language, or is clearly bad data
 - Never silently omit a translation — every word must get either a CEFR level or "reject"
 - translation_cefr must map each language to an object of word:level pairs
 - Only include "generated_translations" if MISSING TRANSLATIONS lists languages
 - Only include "generated_gloss" if you judge the existing gloss unsuitable
 - Only include "generated_example" if you judge the existing examples unsuitable
 `;
 }
 // ── Validation ────────────────────────────────────────────────────────────────
@ -148,30 +197,6 @@ function validateResponse(
  }
  const translationCefr = obj["translation_cefr"] as Record<string, unknown>;
  for (const [lang, votes] of Object.entries(translationCefr)) {
    if (!SUPPORTED_LANG_SET.has(lang)) {
      return {
        valid: false,
        reason: `unsupported language in translation_cefr: ${lang}`,
      };
    }
    if (typeof votes !== "object" || votes === null) {
      return {
        valid: false,
        reason: `translation_cefr.${lang} is not an object`,
      };
    }
    for (const [word, level] of Object.entries(
      votes as Record<string, unknown>,
    )) {
      if (typeof level !== "string" || !CEFR_SET.has(level)) {
        return {
          valid: false,
          reason: `invalid CEFR level for ${lang}.${word}: ${String(level)}`,
        };
      }
    }
  }
  // Verify all existing translations have a CEFR vote
  const byLang = new Map<string, Set<string>>();
@ -199,11 +224,11 @@ function validateResponse(
  }
  // Optional fields
-  if (obj["generated_translations"] !== undefined) {
+  if (
-    if (
+    obj["generated_translations"] !== undefined &&
-      typeof obj["generated_translations"] !== "object" ||
+    obj["generated_translations"] !== null
-      obj["generated_translations"] === null
+  ) {
-    ) {
+    if (typeof obj["generated_translations"] !== "object") {
      return {
        valid: false,
        reason: "generated_translations is not an object",
@ -250,19 +275,28 @@ async function callLlm(
  prompt: string,
  provider: ProviderConfig,
 ): Promise<string> {
-  const response = await fetch(`${provider.baseURL}/chat/completions`, {
+  const controller = new AbortController();
-    method: "POST",
+  const timeout = setTimeout(() => controller.abort(), 120_000); // 2 minutes
-    headers: {
+
-      "Content-Type": "application/json",
+  let response: Response;
-      Authorization: `Bearer ${provider.apiKey}`,
+  try {
-    },
+    response = await fetch(`${provider.baseURL}/chat/completions`, {
-    body: JSON.stringify({
+      method: "POST",
-      model: provider.model,
+      signal: controller.signal,
-      max_tokens: provider.maxTokens,
+      headers: {
-      messages: [{ role: "user", content: prompt }],
+        "Content-Type": "application/json",
-      temperature: 0.1, // low temperature for consistent structured output
+        Authorization: `Bearer ${provider.apiKey}`,
-    }),
+      },
-  });
+      body: JSON.stringify({
        model: provider.model,
        max_tokens: provider.maxTokens,
        messages: [{ role: "user", content: prompt }],
        temperature: 0.1,
      }),
    });
  } finally {
    clearTimeout(timeout);
  }
  if (!response.ok) {
    throw new Error(`LLM API error: ${response.status} ${response.statusText}`);
@ -272,10 +306,17 @@ async function callLlm(
    choices?: { message?: { content?: string } }[];
  };
-  const content = data.choices?.[0]?.message?.content;
+  const content =
    data.choices?.[0]?.message?.content ||
    ((data.choices?.[0]?.message as Record<string, unknown>)?.[
      "reasoning_content"
    ] as string | undefined);
  console.log(
    "\n  DEBUG response:",
    JSON.stringify(data.choices?.[0]?.message),
  );
  if (!content) throw new Error("LLM returned empty response");
  // Strip markdown code fences if present
  return content
    .replace(/```json\n?/g, "")
    .replace(/```\n?/g, "")
@ -333,10 +374,21 @@ function writeResults(
    // CEFR vote for headword
    insertEntryCefr.run(entryId, modelName, data.headword_cefr);
-    // CEFR votes for translations
+    // CEFR votes and rejections for translations
    for (const t of translations) {
      const level = data.translation_cefr[t.target_lang]?.[t.word];
-      if (level) {
+
      if (!level) continue;
      if (level === "reject") {
        // Explicit rejection by the model; translations with no vote were skipped above
        db.prepare(
          `
          INSERT INTO model_translation_rejections (translation_id, model_name)
          VALUES (?, ?)
          ON CONFLICT (translation_id, model_name) DO NOTHING
        `,
        ).run(t.id, modelName);
      } else {
        insertTranslationCefr.run(t.id, modelName, level);
      }
    }
@ -389,6 +441,34 @@ function markNeedsReview(
  console.warn(`    needs_review: entry ${entryId} — ${reason}`);
 }
 /**
  * Redraws the single-line progress indicator on stdout.
  *
  * The line begins with a carriage return, so each call overwrites the
  * previously printed line instead of scrolling the terminal.
  *
  * @param processed   Entries written successfully so far.
  * @param needsReview Entries flagged for review so far; counted as handled.
  * @param total       Number of entries in this run.
  * @param llmMs       Duration of the most recent LLM call, in milliseconds.
  * @param startTime   Run start as a `Date.now()` timestamp.
  */
 function updateProgress(
  processed: number,
  needsReview: number,
  total: number,
  llmMs: number,
  startTime: number,
 ): void {
  const totalProcessed = processed + needsReview;
  // Guard the division so an empty run prints 0.0% rather than NaN%.
  const pct = total > 0 ? ((totalProcessed / total) * 100).toFixed(1) : "0.0";
  const elapsed = (Date.now() - startTime) / 1000;
  const rate = elapsed > 0 ? totalProcessed / elapsed : 0;
  const entriesLeft = Math.max(total - totalProcessed, 0);

  // Distinguish "nothing left" from "no rate yet": both used to collapse into
  // a remaining time of 0 and were reported as "calculating...".
  let eta: string;
  if (entriesLeft === 0) {
    eta = "done";
  } else if (rate <= 0) {
    eta = "calculating...";
  } else {
    const remainingSec = Math.round(entriesLeft / rate);
    eta =
      remainingSec < 60
        ? `${remainingSec}s`
        : `${Math.round(remainingSec / 60)}m`;
  }

  // Round to whole seconds before splitting into minutes and seconds so the
  // seconds part can never be rendered as 60 (e.g. "1m 60s").
  const elapsedSec = Math.round(elapsed);
  const totalElapsedStr =
    elapsedSec < 60
      ? `${elapsedSec}s`
      : `${Math.floor(elapsedSec / 60)}m ${elapsedSec % 60}s`;

  process.stdout.write(
    `\r    ${totalProcessed}/${total} (${pct}%) — entry: ${(llmMs / 1000).toFixed(1)}s — total: ${totalElapsedStr} — ETA: ${eta}    `,
  );
 }
 // ── Main enrich function ──────────────────────────────────────────────────────
 export async function enrich(
@ -411,7 +491,9 @@ export async function enrich(
    .all(provider.name) as { entry_id: number }[];
  const processedIds = new Set(processed.map((r) => r.entry_id));
-  const pending = allEntries.filter((e) => !processedIds.has(e.id));
+  const pending = allEntries
    .filter((e) => !processedIds.has(e.id))
    .slice(0, 10);
  db.close();
@ -427,6 +509,9 @@ export async function enrich(
  let processedCount = 0;
  let needsReviewCount = 0;
  let llmMs = 0;
  const startTime = Date.now();
  for (const entry of pending) {
    const db2 = openDb();
@ -441,17 +526,26 @@ export async function enrich(
    const prompt = buildPrompt(entry, translations);
    let raw: string;
    try {
      const llmStart = Date.now();
      raw = await callLlm(prompt, provider);
      llmMs = Date.now() - llmStart;
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      markNeedsReview(entry.id, provider.name, `LLM call failed: ${message}`);
      needsReviewCount++;
      updateProgress(
        processedCount,
        needsReviewCount,
        pending.length,
        llmMs,
        startTime,
      );
      continue;
    }
    const validation = validateResponse(raw, translations);
    if (!validation.valid) {
      markNeedsReview(
        entry.id,
@ -459,19 +553,36 @@ export async function enrich(
        `validation failed: ${validation.reason}`,
      );
      needsReviewCount++;
      updateProgress(
        processedCount,
        needsReviewCount,
        pending.length,
        llmMs,
        startTime,
      );
      continue;
    }
    writeResults(entry.id, provider.name, validation.data, translations);
    processedCount++;
-
+    updateProgress(
-    if (processedCount % 100 === 0) {
+      processedCount,
-      console.log(
+      needsReviewCount,
-        `    Processed ${processedCount.toLocaleString()} entries...`,
+      pending.length,
-      );
+      llmMs,
-    }
+      startTime,
    );
  }
  process.stdout.write("\n");
  const totalMs = Date.now() - startTime;
  const totalMin = Math.floor(totalMs / 60_000);
  const totalSec = Math.round((totalMs % 60_000) / 1000);
  console.log(`  Total time: ${totalMin}m ${totalSec}s`);
  console.log(
    `  Avg per entry: ${(totalMs / Math.max(processedCount + needsReviewCount, 1) / 1000).toFixed(1)}s`,
  );
  console.log(`  Processed: ${processedCount.toLocaleString()}`);
  console.log(`  Needs review: ${needsReviewCount.toLocaleString()}`);
--- a/documentation/data-pipeline.md
+++ b/documentation/data-pipeline.md
@ -54,6 +54,109 @@ The schema is defined in `data-pipeline/db/schema.sql`. Never edit `pipeline.db`
 On first run the orchestrator initialises `pipeline.db` automatically and imports the stage 1 output into the base tables. This happens once — subsequent runs skip the import if the base tables are already populated.
 ## Common commands
 ### Starting llama.cpp
 ```bash
 cd ~/Downloads/llama.cpp
 ./build/bin/llama-server \
  --model models/qwen3.5-4b-q4_k_m.gguf \
  --port 8080 \
  --ctx-size 4096 \
  --n-gpu-layers 999 \
  --host 127.0.0.1 \
  --chat-template-kwargs '{"enable_thinking":false}' \
  --reasoning-budget 0
 ```
 Verify the server is running:
 ```bash
 curl http://127.0.0.1:8080/health
 ```
 ### Running the pipeline
 ```bash
 pnpm --filter @lila/pipeline pipeline:run
 ```
 The pipeline auto-generates a run name from the date and a counter. It picks up where it left off — completed stages are skipped automatically.
 ### Stage 1 — Extract
 ```bash
 pnpm --filter @lila/pipeline extract
 ```
 Runs in sample mode (500 entries per language) by default. Remove the hardcoded limit in `stage-1-extract/scripts/extract.ts` for a full run.
 ### Stage 2 — Reverse link sync
 ```bash
 pnpm --filter @lila/pipeline reverse-link
 ```
 ### Initialising and importing the database
 ```bash
 # Initialise pipeline.db from schema
 pnpm --filter @lila/pipeline db:init
 # Import stage 1 output into pipeline.db
 pnpm --filter @lila/pipeline db:import
 ```
 ### Resetting the database
 ```bash
 # Full reset — delete and reinitialise
 rm data-pipeline/db/pipeline.db
 pnpm --filter @lila/pipeline db:init
 pnpm --filter @lila/pipeline db:import
 pnpm --filter @lila/pipeline reverse-link
 ```
 ### Resetting enrich stage progress
 ```bash
 # Reset round 1 only (retry failed or incomplete run)
 node -e "
 const Database = require('better-sqlite3');
 const db = new Database('data-pipeline/db/pipeline.db');
 const result = db.prepare(\"DELETE FROM run_status WHERE stage = 'round1'\").run();
 console.log('Deleted', result.changes, 'rows');
 db.close();
 "
 # Reset all enrich progress (round 1 and round 2)
 node -e "
 const Database = require('better-sqlite3');
 const db = new Database('data-pipeline/db/pipeline.db');
 const result = db.prepare(\"DELETE FROM run_status WHERE stage IN ('round1', 'round2')\").run();
 console.log('Deleted', result.changes, 'rows');
 db.close();
 "
 ```
 ### Checking pipeline progress
 ```bash
 node -e "
 const Database = require('better-sqlite3');
 const db = new Database('data-pipeline/db/pipeline.db', { readonly: true });
 const total = db.prepare('SELECT COUNT(*) as c FROM entries WHERE language = \\'en\\'').get().c;
 const complete = db.prepare(\"SELECT COUNT(*) as c FROM run_status WHERE stage = 'round1' AND status = 'complete'\").get().c;
 const needsReview = db.prepare(\"SELECT COUNT(*) as c FROM run_status WHERE stage = 'round1' AND status = 'needs_review'\").get().c;
 console.log('Total English entries:', total);
 console.log('Round 1 complete:', complete);
 console.log('Needs review:', needsReview);
 console.log('Pending:', total - complete - needsReview);
 db.close();
 "
 ```
 ## Data source
 ### Kaikki (Wiktionary)
@ -171,24 +274,31 @@ pnpm --filter @lila/pipeline reverse-link
 ### 3. Enrich
-The enrich stage runs LLMs to fill four types of gaps, in this order:
+> **Note:** Before running this stage, ensure the llama.cpp server is running
 > locally. The orchestrator checks for a running server at
 > `http://127.0.0.1:8080/health` and exits with instructions if it is not
 > reachable. See `llm-setup.md` for setup instructions.
-**A — Missing translations:** for each entry that has no translation in one or more supported languages after reverse link sync, the LLM generates the best translation for that language given the entry's headword, gloss, and examples.
+The enrich stage runs in four ordered sub-stages per entry, designed to build context progressively. All output is written to `pipeline.db` atomically per sub-stage — runs are fully resumable if interrupted. Each model is run once — one model produces one vote per sub-stage.
-**B — Weak glosses and examples:** for each entry where the gloss is missing or the examples are missing, the LLM generates a natural, learner-friendly gloss and one usage example in the entry's language.
+**Sub-stage order:**
-**C — CEFR levels:** for every entry, the LLM assigns a CEFR level (A1–C2) based on the headword, gloss, and examples. This runs for all entries regardless of whether other enrichment was needed.
+1. **`round1_gloss`** — the LLM reviews the existing gloss. If it is clear and learner-friendly, it confirms it. If not, it generates a better one.
-All output is written to `pipeline.db` atomically per entry — runs are fully resumable if interrupted. Each model is run once — one model produces one vote.
+2. **`round1_example`** — the LLM reviews the existing examples. If they are natural and suitable, it confirms them. If not, it generates one better example sentence in the entry language.
-> **Note:** Before running this stage, ensure the llama.cpp server is running locally. The orchestrator checks for a running server at `http://127.0.0.1:8080/health` and exits with instructions if it is not reachable. See `llm-setup.md` for setup instructions.
+3. **`round1_translations`** — using the verified gloss as context, the LLM reviews each existing translation. Valid translations are confirmed. Invalid ones (wrong language, suffixes, garbled text, wrong sense) are explicitly rejected. Missing languages get a generated translation.
 4. **`round1_cefr`** — using only the validated translations from the previous sub-stage, the LLM votes on the CEFR level for the headword and for each confirmed translation. Rejected translations never reach this sub-stage.
 This ordering ensures the CEFR voting sub-stage only sees clean, verified data.
 All output is written to `pipeline.db` atomically per sub-stage per entry. Interrupted runs resume from the last incomplete sub-stage without losing work. Each model is run once — one model, one vote per sub-stage.
 **Input:** `pipeline.db` — entries after reverse link sync
-**Output:** `pipeline.db` — LLM-generated translations, glosses, examples, and CEFR votes
+**Output:** `pipeline.db` — gloss votes, example votes, translation votes, CEFR votes per entry per model
-```bash
+> **Note:** The tiebreaker is not a standalone script. It runs automatically as part of the pipeline orchestrator after merge completes.
 pnpm --filter @lila/pipeline run --name "night-1"
 ```
 ### 4. Merge
@ -314,11 +424,9 @@ These are not part of the current pipeline but are worth considering as the data
 ## Roadmap
-**Current state:** Stages 1 and 2 complete and verified on sample data.
+**Current state:** Stage 1 extraction and stage 2 reverse link sync complete and verified on sample data. Stage 3 enrich script written and tested — redesigning to sub-stage architecture for better data quality. llama.cpp running with Qwen3.5-4B.
 Stage 3 round 1 enrich script written. llama.cpp not yet installed.
 pipeline.db contains 4,156 entries and 4,287 translations across 5 languages.
-**Next action:** Install llama.cpp, run smoke test with sample data.
+**Next action:** Rewrite enrich script for sub-stage design.
 | Stage           | Status         |
 | --------------- | -------------- |
@ -347,14 +455,15 @@ pipeline.db contains 4,156 entries and 4,287 translations across 5 languages.
 - [x] Run reverse link sync on sample data → 141 links inserted
 - [ ] Run reverse link sync on full data after full extraction
-### Stage 3 — Enrich `🔲 not started`
+### Stage 3 — Enrich `🔄 in progress`
-**Next action:** Write the enrich script after production schema is complete.
+**Next action:** Rewrite enrich script for sub-stage design.
- [x] Write enrich script (missing translations, glosses, examples, CEFR votes)
+- [x] Write initial enrich script (single-prompt design)
- [ ] Write tests
+- [x] Install llama.cpp and verify server
- [ ] Install llama.cpp and verify server
+- [x] Smoke test with sample entries
- [ ] Smoke test with sample entries
+- [ ] Rewrite enrich script for sub-stage design (round1_gloss, round1_example, round1_translations, round1_cefr)
 - [ ] Write tests for enrich sub-stages
 - [ ] Run full sample, collect metrics
 - [ ] Compare providers (local vs OpenRouter free models)
 - [ ] Production run — all entries, all models
--- a/documentation/llm-setup.md
+++ b/documentation/llm-setup.md
@ -1,17 +1,12 @@
 # LLM Setup — lila pipeline
-This document covers the LLM infrastructure for stage 3 (enrich) of the lila
+This document covers the LLM infrastructure for stage 3 (enrich) of the lila data pipeline. It documents the hardware constraints, supported providers, model recommendations, and how to configure and swap providers in the test and production scripts.
 data pipeline. It documents the hardware constraints, supported providers,
 model recommendations, and how to configure and swap providers in the test
 and production scripts.
 ---
 ## Provider model
-Each provider + model combination counts as one vote in the final majority.
+Each provider + model combination counts as one vote in the final majority. Running the same model twice is not supported — one model, one vote. To increase vote confidence, add more models rather than re-running existing ones.
 Running the same model twice is not supported — one model, one vote. To
 increase vote confidence, add more models rather than re-running existing ones.
 ---
@ -24,17 +19,13 @@ increase vote confidence, add more models rather than re-running existing ones.
 | GPU       | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) |
 | OS        | Debian GNU/Linux 13 (trixie) x86_64                             |
-**Local inference verdict:** viable for small/quantized models, not for
+**Local inference verdict:** viable for small/quantized models, not for production runs. See the [Local inference](#local-inference-llamacpp) section for details.
 production runs. See the [Local inference](#local-inference-llamacpp) section
 for details.
 ---
 ## Provider overview
-The enrich script uses a single, swappable provider config. All providers
+The enrich script uses a single, swappable provider config. All providers except Anthropic expose an OpenAI-compatible API, so the same client code works across all of them — only `baseURL`, `apiKey`, and `model` change.
 except Anthropic expose an OpenAI-compatible API, so the same client code
 works across all of them — only `baseURL`, `apiKey`, and `model` change.
 | Provider               | Use case                                      | Cost               | Rate limits            |
 | ---------------------- | --------------------------------------------- | ------------------ | ---------------------- |
@ -49,20 +40,13 @@ works across all of them — only `baseURL`, `apiKey`, and `model` change.
 ### Why local inference is worth testing
-Time is not a constraint — the pipeline scripts are fully resumable. The
+Time is not a constraint — the pipeline scripts are fully resumable. The laptop can run overnight for multiple nights. The only question is output quality, which the test script evaluates empirically.
 laptop can run overnight for multiple nights. The only question is output
 quality, which the test script evaluates empirically.
 ### Hardware constraints
-The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0).
+The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0). llama.cpp supports Maxwell via CUDA backend but newer builds may require the `--cuda-no-kv-offload` flag depending on the version.
 llama.cpp supports Maxwell via CUDA backend but newer builds may require
 the `--cuda-no-kv-offload` flag depending on the version.
-llama.cpp splits model layers between GPU and CPU automatically via
+llama.cpp splits model layers between GPU and CPU automatically via `--n-gpu-layers`. You set how many layers go on the GPU; the rest run on CPU/RAM. This means a model larger than VRAM is not a dead end — it runs in hybrid mode, slower than full-GPU but much faster than pure CPU.
 `--n-gpu-layers`. You set how many layers go on the GPU; the rest run on
 CPU/RAM. This means a model larger than VRAM is not a dead end — it runs
 in hybrid mode, slower than full-GPU but much faster than pure CPU.
 Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):
@ -75,24 +59,19 @@ Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):
 ### Recommended local models
-Two candidates worth testing, covering different points on the size/quality
+Two candidates worth testing, covering different points on the size/quality tradeoff:
 tradeoff:
 **Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)**
 - GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB)
 - Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF
- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+
+- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+ language support including all five pipeline languages. First candidate to test.
  language support including all five pipeline languages. First candidate
  to test.
 **Qwen2.5 7B Instruct (Q4_K_M)**
 - GGUF file: `Qwen2.5-7B-Instruct-Q4_K_M.gguf` (~4.5 GB)
 - Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF
- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s.
+- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s. Stronger multilingual generation than any 3–4B model. Second candidate, for comparison against the smaller Gemma 4 E4B.
  Stronger multilingual generation than any 3–4B model. Second candidate,
  for comparison against the smaller Gemma 4 E4B.
 ### Installation