feat: add stage 3 round 1 enrich script and wire into orchestrator

2026-05-05 19:28:38 +02:00 · 2026-05-05 19:28:38 +02:00 · 9642daf6dd
commit 9642daf6dd
parent 76af2ab093
2 changed files with 493 additions and 4 deletions
--- a/data-pipeline/pipeline.ts
+++ b/data-pipeline/pipeline.ts
@ -7,6 +7,7 @@ import { openDb } from "./db/index.js";
 import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js";
 import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js";
 import type { ProviderConfig } from "./stage-3-enrich/config.js";
+import { enrich } from "./stage-3-enrich/scripts/enrich.js";

 // ── Types ─────────────────────────────────────────────────────────────────────

@ -248,10 +249,15 @@ function runReverseLinkStage(): void {
  markReverseLinkComplete();
 }

-function runRound1(provider: ProviderConfig, stats: RunStats): void {
+async function runRound1(
+  provider: ProviderConfig,
+  stats: RunStats,
+): Promise<void> {
  console.log(`\n  [round 1] Running ${provider.name}...`);
-  // TODO: implement round 1 enrich script
-  console.log(`  [round 1] ${provider.name} — not yet implemented`);
+  const counts = await enrich(provider);
+  stats.recordsProcessed += counts.processed;
+  stats.recordsSkipped += counts.skipped;
+  stats.needsReview += counts.needsReview;
  stats.modelsRun.push(provider.name);
 }

@ -492,7 +498,7 @@ async function main(): Promise<void> {
      console.log(`  [round 1] ${provider.name} — resuming...`);
    }

-    runRound1(provider, stats);
+    await runRound1(provider, stats);
  }

  if (shutdownRequested) {
--- a/data-pipeline/stage-3-enrich/scripts/enrich.ts
+++ b/data-pipeline/stage-3-enrich/scripts/enrich.ts
@ -0,0 +1,483 @@
+import { openDb } from "../../db/index.js";
+import type { ProviderConfig } from "../config.js";
+import { CEFR_LEVELS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+type EntryRow = {
+  id: number;
+  headword: string;
+  language: SupportedLanguageCode;
+  pos: string;
+  gloss: string | null;
+  examples: string; // JSON array string
+};
+
+type TranslationRow = {
+  id: number;
+  target_lang: SupportedLanguageCode;
+  word: string;
+};
+
+type LlmResponse = {
+  headword_cefr: string;
+  translation_cefr: Partial<
+    Record<SupportedLanguageCode, Record<string, string>>
+  >;
+  generated_translations?: Partial<Record<SupportedLanguageCode, string>>;
+  generated_gloss?: string;
+  generated_example?: string;
+};
+
+type ValidationResult =
+  | { valid: true; data: LlmResponse }
+  | { valid: false; reason: string };
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
+const CEFR_SET = new Set<string>(CEFR_LEVELS);
+
+// ── Prompt builder ────────────────────────────────────────────────────────────
+
+function buildPrompt(entry: EntryRow, translations: TranslationRow[]): string {
+  const examples: string[] = JSON.parse(entry.examples) as string[];
+
+  // Group translations by language
+  const byLang = new Map<SupportedLanguageCode, string[]>();
+  for (const t of translations) {
+    if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, []);
+    byLang.get(t.target_lang)!.push(t.word);
+  }
+
+  // Find missing languages
+  const coveredLangs = new Set(byLang.keys());
+  const missingLangs = SUPPORTED_LANGUAGE_CODES.filter(
+    (l) => l !== entry.language && !coveredLangs.has(l),
+  );
+
+  const existingTranslationsText =
+    byLang.size > 0
+      ? [...byLang.entries()]
+          .map(([lang, words]) => `  ${lang}: ${words.join(", ")}`)
+          .join("\n")
+      : "  none";
+
+  const missingTranslationsText =
+    missingLangs.length > 0 ? missingLangs.join(", ") : "none";
+
+  const examplesText =
+    examples.length > 0 ? examples.map((e) => `  - ${e}`).join("\n") : "  none";
+
+  const glossText = entry.gloss ?? "none";
+
+  return `You are a language learning expert building a multilingual vocabulary database.
+
+Given an English word sense, your tasks are:
+1. Assign a CEFR level (A1, A2, B1, B2, C1, or C2) to the English headword for this specific sense
+2. Assign a CEFR level to each existing translation listed
+3. If MISSING TRANSLATIONS lists any languages, generate the single best translation for each
+4. If the existing gloss is missing or unsuitable for a language learner, generate a better one
+5. If the existing examples are missing or unsuitable for a language learner, generate one natural sentence in English
+
+Base CEFR levels on how commonly a language learner at that level would encounter this specific sense, not the word in general. Consider register — slang, technical, and archaic words should be rated higher.
+
+WORD: ${entry.headword}
+PART OF SPEECH: ${entry.pos}
+GLOSS: ${glossText}
+EXAMPLES:
+${examplesText}
+EXISTING TRANSLATIONS:
+${existingTranslationsText}
+MISSING TRANSLATIONS: ${missingTranslationsText}
+
+Respond ONLY with valid JSON and nothing else — no explanation, no markdown:
+{
+  "headword_cefr": "<level>",
+  "translation_cefr": {
+    "<lang>": { "<word>": "<level>", ... },
+    ...
+  },
+  "generated_translations": { "<lang>": "<word>", ... },
+  "generated_gloss": "<gloss if needed, omit if existing is fine>",
+  "generated_example": "<example sentence in English if needed, omit if existing is fine>"
+}
+
+Only include "generated_translations" if there are missing languages.
+Only include "generated_gloss" if you judge the existing gloss unsuitable.
+Only include "generated_example" if you judge the existing examples unsuitable.`;
+}
+
+// ── Validation ────────────────────────────────────────────────────────────────
+
+function validateResponse(
+  raw: string,
+  translations: TranslationRow[],
+): ValidationResult {
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    return { valid: false, reason: "invalid JSON" };
+  }
+
+  if (typeof parsed !== "object" || parsed === null) {
+    return { valid: false, reason: "response is not an object" };
+  }
+
+  const obj = parsed as Record<string, unknown>;
+
+  // headword_cefr required
+  if (typeof obj["headword_cefr"] !== "string") {
+    return { valid: false, reason: "missing headword_cefr" };
+  }
+  if (!CEFR_SET.has(obj["headword_cefr"])) {
+    return {
+      valid: false,
+      reason: `invalid headword_cefr: ${obj["headword_cefr"]}`,
+    };
+  }
+
+  // translation_cefr required
+  if (
+    typeof obj["translation_cefr"] !== "object" ||
+    obj["translation_cefr"] === null
+  ) {
+    return { valid: false, reason: "missing translation_cefr" };
+  }
+
+  const translationCefr = obj["translation_cefr"] as Record<string, unknown>;
+  for (const [lang, votes] of Object.entries(translationCefr)) {
+    if (!SUPPORTED_LANG_SET.has(lang)) {
+      return {
+        valid: false,
+        reason: `unsupported language in translation_cefr: ${lang}`,
+      };
+    }
+    if (typeof votes !== "object" || votes === null) {
+      return {
+        valid: false,
+        reason: `translation_cefr.${lang} is not an object`,
+      };
+    }
+    for (const [word, level] of Object.entries(
+      votes as Record<string, unknown>,
+    )) {
+      if (typeof level !== "string" || !CEFR_SET.has(level)) {
+        return {
+          valid: false,
+          reason: `invalid CEFR level for ${lang}.${word}: ${String(level)}`,
+        };
+      }
+    }
+  }
+
+  // Verify all existing translations have a CEFR vote
+  const byLang = new Map<string, Set<string>>();
+  for (const t of translations) {
+    if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, new Set());
+    byLang.get(t.target_lang)!.add(t.word);
+  }
+
+  for (const [lang, words] of byLang.entries()) {
+    const votes = translationCefr[lang] as Record<string, string> | undefined;
+    if (!votes) {
+      return {
+        valid: false,
+        reason: `missing translation_cefr for language: ${lang}`,
+      };
+    }
+    for (const word of words) {
+      if (!votes[word]) {
+        return {
+          valid: false,
+          reason: `missing CEFR vote for ${lang}:${word}`,
+        };
+      }
+    }
+  }
+
+  // Optional fields
+  if (obj["generated_translations"] !== undefined) {
+    if (
+      typeof obj["generated_translations"] !== "object" ||
+      obj["generated_translations"] === null
+    ) {
+      return {
+        valid: false,
+        reason: "generated_translations is not an object",
+      };
+    }
+    for (const [lang, word] of Object.entries(
+      obj["generated_translations"] as Record<string, unknown>,
+    )) {
+      if (!SUPPORTED_LANG_SET.has(lang)) {
+        return {
+          valid: false,
+          reason: `unsupported language in generated_translations: ${lang}`,
+        };
+      }
+      if (typeof word !== "string" || !word.trim()) {
+        return {
+          valid: false,
+          reason: `empty generated translation for ${lang}`,
+        };
+      }
+    }
+  }
+
+  if (
+    obj["generated_gloss"] !== undefined &&
+    typeof obj["generated_gloss"] !== "string"
+  ) {
+    return { valid: false, reason: "generated_gloss is not a string" };
+  }
+
+  if (
+    obj["generated_example"] !== undefined &&
+    typeof obj["generated_example"] !== "string"
+  ) {
+    return { valid: false, reason: "generated_example is not a string" };
+  }
+
+  return { valid: true, data: obj as unknown as LlmResponse };
+}
+
+// ── LLM call ──────────────────────────────────────────────────────────────────
+
+async function callLlm(
+  prompt: string,
+  provider: ProviderConfig,
+): Promise<string> {
+  const response = await fetch(`${provider.baseURL}/chat/completions`, {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${provider.apiKey}`,
+    },
+    body: JSON.stringify({
+      model: provider.model,
+      max_tokens: provider.maxTokens,
+      messages: [{ role: "user", content: prompt }],
+      temperature: 0.1, // low temperature for consistent structured output
+    }),
+  });
+
+  if (!response.ok) {
+    throw new Error(`LLM API error: ${response.status} ${response.statusText}`);
+  }
+
+  const data = (await response.json()) as {
+    choices?: { message?: { content?: string } }[];
+  };
+
+  const content = data.choices?.[0]?.message?.content;
+  if (!content) throw new Error("LLM returned empty response");
+
+  // Strip markdown code fences if present
+  return content
+    .replace(/```json\n?/g, "")
+    .replace(/```\n?/g, "")
+    .trim();
+}
+
+// ── Write results ─────────────────────────────────────────────────────────────
+
+function writeResults(
+  entryId: number,
+  modelName: string,
+  data: LlmResponse,
+  translations: TranslationRow[],
+): void {
+  const db = openDb();
+
+  const insertEntryCefr = db.prepare(`
+    INSERT INTO model_entry_cefr_votes (entry_id, model_name, cefr_level)
+    VALUES (?, ?, ?)
+    ON CONFLICT (entry_id, model_name) DO NOTHING
+  `);
+
+  const insertTranslationCefr = db.prepare(`
+    INSERT INTO model_translation_cefr_votes (translation_id, model_name, cefr_level)
+    VALUES (?, ?, ?)
+    ON CONFLICT (translation_id, model_name) DO NOTHING
+  `);
+
+  const insertGeneratedTranslation = db.prepare(`
+    INSERT INTO generated_translations (entry_id, model_name, target_lang, word)
+    VALUES (?, ?, ?, ?)
+    ON CONFLICT (entry_id, model_name, target_lang) DO NOTHING
+  `);
+
+  const insertGeneratedGloss = db.prepare(`
+    INSERT INTO generated_glosses (entry_id, model_name, text)
+    VALUES (?, ?, ?)
+    ON CONFLICT (entry_id, model_name) DO NOTHING
+  `);
+
+  const insertGeneratedExample = db.prepare(`
+    INSERT INTO generated_examples (entry_id, model_name, text)
+    VALUES (?, ?, ?)
+    ON CONFLICT (entry_id, model_name) DO NOTHING
+  `);
+
+  const updateRunStatus = db.prepare(`
+    INSERT INTO run_status (entry_id, model_name, stage, status)
+    VALUES (?, ?, 'round1', 'complete')
+    ON CONFLICT (entry_id, model_name, stage)
+    DO UPDATE SET status = 'complete', updated_at = datetime('now')
+  `);
+
+  db.transaction(() => {
+    // CEFR vote for headword
+    insertEntryCefr.run(entryId, modelName, data.headword_cefr);
+
+    // CEFR votes for translations
+    for (const t of translations) {
+      const level = data.translation_cefr[t.target_lang]?.[t.word];
+      if (level) {
+        insertTranslationCefr.run(t.id, modelName, level);
+      }
+    }
+
+    // Generated translations
+    if (data.generated_translations) {
+      for (const [lang, word] of Object.entries(data.generated_translations)) {
+        if (word.trim()) {
+          insertGeneratedTranslation.run(entryId, modelName, lang, word.trim());
+        }
+      }
+    }
+
+    // Generated gloss
+    if (data.generated_gloss?.trim()) {
+      insertGeneratedGloss.run(entryId, modelName, data.generated_gloss.trim());
+    }
+
+    // Generated example
+    if (data.generated_example?.trim()) {
+      insertGeneratedExample.run(
+        entryId,
+        modelName,
+        data.generated_example.trim(),
+      );
+    }
+
+    // Mark complete
+    updateRunStatus.run(entryId, modelName);
+  })();
+
+  db.close();
+}
+
+function markNeedsReview(
+  entryId: number,
+  modelName: string,
+  reason: string,
+): void {
+  const db = openDb();
+  db.prepare(
+    `
+    INSERT INTO run_status (entry_id, model_name, stage, status)
+    VALUES (?, ?, 'round1', 'needs_review')
+    ON CONFLICT (entry_id, model_name, stage)
+    DO UPDATE SET status = 'needs_review', updated_at = datetime('now')
+  `,
+  ).run(entryId, modelName);
+  db.close();
+  console.warn(`    needs_review: entry ${entryId} — ${reason}`);
+}
+
+// ── Main enrich function ──────────────────────────────────────────────────────
+
+export async function enrich(
+  provider: ProviderConfig,
+): Promise<{ processed: number; skipped: number; needsReview: number }> {
+  const db = openDb();
+
+  // Load all English entries
+  const allEntries = db
+    .prepare(`SELECT * FROM entries WHERE language = 'en'`)
+    .all() as EntryRow[];
+
+  // Find already processed entries for this model
+  const processed = db
+    .prepare(
+      `SELECT entry_id FROM run_status
+       WHERE model_name = ? AND stage = 'round1'
+       AND status IN ('complete', 'needs_review')`,
+    )
+    .all(provider.name) as { entry_id: number }[];
+
+  const processedIds = new Set(processed.map((r) => r.entry_id));
+  const pending = allEntries.filter((e) => !processedIds.has(e.id));
+
+  db.close();
+
+  console.log(`\n  Model: ${provider.name}`);
+  console.log(`  Total entries: ${allEntries.length.toLocaleString()}`);
+  console.log(`  Already processed: ${processedIds.size.toLocaleString()}`);
+  console.log(`  Pending: ${pending.length.toLocaleString()}`);
+
+  if (pending.length === 0) {
+    console.log("  Nothing to process.");
+    return { processed: 0, skipped: allEntries.length, needsReview: 0 };
+  }
+
+  let processedCount = 0;
+  let needsReviewCount = 0;
+
+  for (const entry of pending) {
+    const db2 = openDb();
+    const translations = db2
+      .prepare(
+        `SELECT id, target_lang, word FROM translations
+         WHERE entry_id = ? AND source = 'kaikki'`,
+      )
+      .all(entry.id) as TranslationRow[];
+    db2.close();
+
+    const prompt = buildPrompt(entry, translations);
+
+    let raw: string;
+    try {
+      raw = await callLlm(prompt, provider);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      markNeedsReview(entry.id, provider.name, `LLM call failed: ${message}`);
+      needsReviewCount++;
+      continue;
+    }
+
+    const validation = validateResponse(raw, translations);
+
+    if (!validation.valid) {
+      markNeedsReview(
+        entry.id,
+        provider.name,
+        `validation failed: ${validation.reason}`,
+      );
+      needsReviewCount++;
+      continue;
+    }
+
+    writeResults(entry.id, provider.name, validation.data, translations);
+    processedCount++;
+
+    if (processedCount % 100 === 0) {
+      console.log(
+        `    Processed ${processedCount.toLocaleString()} entries...`,
+      );
+    }
+  }
+
+  console.log(`  Processed: ${processedCount.toLocaleString()}`);
+  console.log(`  Needs review: ${needsReviewCount.toLocaleString()}`);
+
+  return {
+    processed: processedCount,
+    skipped: processedIds.size,
+    needsReview: needsReviewCount,
+  };
+}