feat: enrich script working, redesigning to sub-stage architecture

- Enrich script is functional, with a request timeout, progress tracking, and a rejection mechanism
- Identified an ordering issue: CEFR voting needs validated translations first
- Redesign: round1_gloss → round1_example → round1_translations → round1_cefr
- Update data-pipeline.md with the new sub-stage design and roadmap
- Qwen3.5-4B confirmed working with thinking disabled
2026-05-07 13:09:43 +02:00 · 2026-05-07 13:09:43 +02:00 · 73fb12ac35
commit 73fb12ac35
parent 7f10c35e03
7 changed files with 337 additions and 122 deletions
--- a/data-pipeline/db/pipeline.db-shm
+++ b/data-pipeline/db/pipeline.db-shm
--- a/data-pipeline/db/pipeline.db-wal
+++ b/data-pipeline/db/pipeline.db-wal
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@ -60,6 +60,13 @@ CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
  UNIQUE (translation_id, model_name)
 );

+CREATE TABLE IF NOT EXISTS model_translation_rejections (
+  id             INTEGER PRIMARY KEY,
+  translation_id INTEGER NOT NULL REFERENCES translations(id),
+  model_name     TEXT    NOT NULL,
+  UNIQUE (translation_id, model_name)
+);
+
 CREATE TABLE IF NOT EXISTS generated_glosses (
  id         INTEGER PRIMARY KEY,
  entry_id   INTEGER NOT NULL REFERENCES entries(id),
--- a/data-pipeline/stage-3-enrich/config.ts
+++ b/data-pipeline/stage-3-enrich/config.ts
@ -20,12 +20,20 @@ export type ProviderConfig = {

 // ── Local llama.cpp ───────────────────────────────────────────────────────────

+export const LOCAL_QWEN35_4B: ProviderConfig = {
+  name: "local-qwen3.5-4b",
+  baseURL: "http://127.0.0.1:8080/v1",
+  apiKey: "none",
+  model: "qwen3.5-4b",
+  maxTokens: 1024, // no reasoning overhead so 1024 is enough
+};
+
 export const LOCAL_GEMMA4: ProviderConfig = {
  name: "local-gemma4-e4b",
  baseURL: "http://127.0.0.1:8080/v1",
  apiKey: "none", // llama.cpp ignores this
  model: "gemma4-e4b", // llama.cpp ignores model name, uses loaded model
-  maxTokens: 512,
+  maxTokens: 2048,
 };

 export const LOCAL_QWEN7B: ProviderConfig = {
@ -87,13 +95,14 @@ export const ANTHROPIC_SONNET: ProviderConfig = {
 // Add new providers here to include them in the voting pool.

 export const ALL_PROVIDERS: ProviderConfig[] = [
-  LOCAL_GEMMA4,
-  LOCAL_QWEN7B,
-  OR_QWEN3_480B,
-  OR_GEMMA4_31B,
-  OR_QWEN3_80B,
-  OR_NEMOTRON,
-  ANTHROPIC_SONNET,
+  LOCAL_QWEN35_4B,
+  // LOCAL_GEMMA4,
+  // LOCAL_QWEN7B,
+  // OR_QWEN3_480B,
+  // OR_GEMMA4_31B,
+  // OR_QWEN3_80B,
+  // OR_NEMOTRON,
+  // ANTHROPIC_SONNET,
 ];

 // ── Key validation ────────────────────────────────────────────────────────────
--- a/data-pipeline/stage-3-enrich/scripts/enrich.ts
+++ b/data-pipeline/stage-3-enrich/scripts/enrich.ts
@ -94,19 +94,68 @@ MISSING TRANSLATIONS: ${missingTranslationsText}

 Respond ONLY with valid JSON and nothing else — no explanation, no markdown:
 {
-  "headword_cefr": "<level>",
+  "headword_cefr": "B1",
  "translation_cefr": {
-    "<lang>": { "<word>": "<level>", ... },
-    ...
+    "de": { "frei": "A2" },
+    "es": { "libre": "A2" },
+    "fr": { "libre": "A2" },
+    "it": { "libero": "A2" }
  },
-  "generated_translations": { "<lang>": "<word>", ... },
-  "generated_gloss": "<gloss if needed, omit if existing is fine>",
-  "generated_example": "<example sentence in English if needed, omit if existing is fine>"
+  "generated_translations": { "missing_lang": "word" },
+  "generated_gloss": "A clearer definition for learners.",
+  "generated_example": "A natural example sentence."
 }

-Only include "generated_translations" if there are missing languages.
-Only include "generated_gloss" if you judge the existing gloss unsuitable.
-Only include "generated_example" if you judge the existing examples unsuitable.`;
+EXAMPLE OF CORRECT BEHAVIOUR:
+If you receive:
+  WORD: cat
+  EXISTING TRANSLATIONS:
+    it: gatto, cat
+
+The correct response includes "reject" for "cat" because it is an English word, not Italian:
+  "translation_cefr": {
+    "it": { "gatto": "A1", "cat": "reject" }
+  }
+
+Similarly, if you receive:
+  EXISTING TRANSLATIONS:
+    de: frei, -frei
+
+The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
+  "translation_cefr": {
+    "de": { "frei": "A2", "-frei": "reject" }
+  }
+
+  EXAMPLE OF CORRECT BEHAVIOUR:
+  If you receive:
+    WORD: cat
+    EXISTING TRANSLATIONS:
+      it: gatto, cat
+
+  The correct response includes "reject" for "cat" because it is an English word, not Italian:
+    "translation_cefr": {
+      "it": { "gatto": "A1", "cat": "reject" }
+    }
+
+  Similarly, if you receive:
+    EXISTING TRANSLATIONS:
+      de: frei, -frei
+
+  The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
+    "translation_cefr": {
+      "de": { "frei": "A2", "-frei": "reject" }
+    }
+
+IMPORTANT:
+- You MUST include EVERY translation listed in EXISTING TRANSLATIONS in your response — no exceptions
+- Use the CEFR level (A1-C2) if the translation is valid for this sense
+- Use "reject" if the translation does not fit this specific sense, is not a real word in that language, or is clearly bad data
+- Never silently omit a translation — every word must get either a CEFR level or "reject"
+- translation_cefr must map each language to an object of word:level pairs
+- Only include "generated_translations" if MISSING TRANSLATIONS lists languages
+- Only include "generated_gloss" if you judge the existing gloss unsuitable
+- Only include "generated_example" if you judge the existing examples unsuitable
+`;
 }

 // ── Validation ────────────────────────────────────────────────────────────────
@ -148,30 +197,6 @@ function validateResponse(
  }

  const translationCefr = obj["translation_cefr"] as Record<string, unknown>;
-  for (const [lang, votes] of Object.entries(translationCefr)) {
-    if (!SUPPORTED_LANG_SET.has(lang)) {
-      return {
-        valid: false,
-        reason: `unsupported language in translation_cefr: ${lang}`,
-      };
-    }
-    if (typeof votes !== "object" || votes === null) {
-      return {
-        valid: false,
-        reason: `translation_cefr.${lang} is not an object`,
-      };
-    }
-    for (const [word, level] of Object.entries(
-      votes as Record<string, unknown>,
-    )) {
-      if (typeof level !== "string" || !CEFR_SET.has(level)) {
-        return {
-          valid: false,
-          reason: `invalid CEFR level for ${lang}.${word}: ${String(level)}`,
-        };
-      }
-    }
-  }

  // Verify all existing translations have a CEFR vote
  const byLang = new Map<string, Set<string>>();
@ -199,11 +224,11 @@ function validateResponse(
  }

  // Optional fields
-  if (obj["generated_translations"] !== undefined) {
-    if (
-      typeof obj["generated_translations"] !== "object" ||
-      obj["generated_translations"] === null
-    ) {
+  if (
+    obj["generated_translations"] !== undefined &&
+    obj["generated_translations"] !== null
+  ) {
+    if (typeof obj["generated_translations"] !== "object") {
      return {
        valid: false,
        reason: "generated_translations is not an object",
@ -250,19 +275,28 @@ async function callLlm(
  prompt: string,
  provider: ProviderConfig,
 ): Promise<string> {
-  const response = await fetch(`${provider.baseURL}/chat/completions`, {
-    method: "POST",
-    headers: {
-      "Content-Type": "application/json",
-      Authorization: `Bearer ${provider.apiKey}`,
-    },
-    body: JSON.stringify({
-      model: provider.model,
-      max_tokens: provider.maxTokens,
-      messages: [{ role: "user", content: prompt }],
-      temperature: 0.1, // low temperature for consistent structured output
-    }),
-  });
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), 120_000); // 2 minutes
+
+  let response: Response;
+  try {
+    response = await fetch(`${provider.baseURL}/chat/completions`, {
+      method: "POST",
+      signal: controller.signal,
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${provider.apiKey}`,
+      },
+      body: JSON.stringify({
+        model: provider.model,
+        max_tokens: provider.maxTokens,
+        messages: [{ role: "user", content: prompt }],
+        temperature: 0.1,
+      }),
+    });
+  } finally {
+    clearTimeout(timeout);
+  }

  if (!response.ok) {
    throw new Error(`LLM API error: ${response.status} ${response.statusText}`);
@ -272,10 +306,17 @@ async function callLlm(
    choices?: { message?: { content?: string } }[];
  };

-  const content = data.choices?.[0]?.message?.content;
+  const content =
+    data.choices?.[0]?.message?.content ||
+    ((data.choices?.[0]?.message as Record<string, unknown>)?.[
+      "reasoning_content"
+    ] as string | undefined);
+  console.log(
+    "\n  DEBUG response:",
+    JSON.stringify(data.choices?.[0]?.message),
+  );
  if (!content) throw new Error("LLM returned empty response");

-  // Strip markdown code fences if present
  return content
    .replace(/```json\n?/g, "")
    .replace(/```\n?/g, "")
@ -333,10 +374,21 @@ function writeResults(
    // CEFR vote for headword
    insertEntryCefr.run(entryId, modelName, data.headword_cefr);

-    // CEFR votes for translations
+    // CEFR votes and rejections for translations
    for (const t of translations) {
      const level = data.translation_cefr[t.target_lang]?.[t.word];
-      if (level) {
+
+      if (!level) continue;
+      if (level === "reject") {
+        // Explicit rejection or silently skipped — both treated as rejection
+        db.prepare(
+          `
+          INSERT INTO model_translation_rejections (translation_id, model_name)
+          VALUES (?, ?)
+          ON CONFLICT (translation_id, model_name) DO NOTHING
+        `,
+        ).run(t.id, modelName);
+      } else {
        insertTranslationCefr.run(t.id, modelName, level);
      }
    }
@ -389,6 +441,34 @@ function markNeedsReview(
  console.warn(`    needs_review: entry ${entryId} — ${reason}`);
 }

+function updateProgress(
+  processed: number,
+  needsReview: number,
+  total: number,
+  llmMs: number,
+  startTime: number,
+): void {
+  const totalProcessed = processed + needsReview;
+  const pct = ((totalProcessed / total) * 100).toFixed(1);
+  const elapsed = (Date.now() - startTime) / 1000;
+  const rate = elapsed > 0 ? totalProcessed / elapsed : 0;
+  const remaining = rate > 0 ? (total - totalProcessed) / rate : 0;
+  const eta =
+    remaining === 0
+      ? "calculating..."
+      : remaining < 60
+        ? `${Math.round(remaining)}s`
+        : `${Math.round(remaining / 60)}m`;
+  const totalElapsedStr =
+    elapsed < 60
+      ? `${Math.round(elapsed)}s`
+      : `${Math.floor(elapsed / 60)}m ${Math.round(elapsed % 60)}s`;
+
+  process.stdout.write(
+    `\r    ${totalProcessed}/${total} (${pct}%) — entry: ${(llmMs / 1000).toFixed(1)}s — total: ${totalElapsedStr} — ETA: ${eta}    `,
+  );
+}
+
 // ── Main enrich function ──────────────────────────────────────────────────────

 export async function enrich(
@ -411,7 +491,9 @@ export async function enrich(
    .all(provider.name) as { entry_id: number }[];

  const processedIds = new Set(processed.map((r) => r.entry_id));
-  const pending = allEntries.filter((e) => !processedIds.has(e.id));
+  const pending = allEntries
+    .filter((e) => !processedIds.has(e.id))
+    .slice(0, 10);

  db.close();

@ -427,6 +509,9 @@ export async function enrich(

  let processedCount = 0;
  let needsReviewCount = 0;
+  let llmMs = 0;
+
+  const startTime = Date.now();

  for (const entry of pending) {
    const db2 = openDb();
@ -441,17 +526,26 @@ export async function enrich(
    const prompt = buildPrompt(entry, translations);

    let raw: string;
+
    try {
+      const llmStart = Date.now();
      raw = await callLlm(prompt, provider);
+      llmMs = Date.now() - llmStart;
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      markNeedsReview(entry.id, provider.name, `LLM call failed: ${message}`);
      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
      continue;
    }

    const validation = validateResponse(raw, translations);
-
    if (!validation.valid) {
      markNeedsReview(
        entry.id,
@ -459,19 +553,36 @@ export async function enrich(
        `validation failed: ${validation.reason}`,
      );
      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
      continue;
    }

    writeResults(entry.id, provider.name, validation.data, translations);
    processedCount++;
-
-    if (processedCount % 100 === 0) {
-      console.log(
-        `    Processed ${processedCount.toLocaleString()} entries...`,
-      );
-    }
+    updateProgress(
+      processedCount,
+      needsReviewCount,
+      pending.length,
+      llmMs,
+      startTime,
+    );
  }

+  process.stdout.write("\n");
+  const totalMs = Date.now() - startTime;
+  const totalMin = Math.floor(totalMs / 60_000);
+  const totalSec = Math.round((totalMs % 60_000) / 1000);
+  console.log(`  Total time: ${totalMin}m ${totalSec}s`);
+  console.log(
+    `  Avg per entry: ${(totalMs / Math.max(processedCount + needsReviewCount, 1) / 1000).toFixed(1)}s`,
+  );
+
  console.log(`  Processed: ${processedCount.toLocaleString()}`);
  console.log(`  Needs review: ${needsReviewCount.toLocaleString()}`);