From 9642daf6dde86f74a7d830e4292bcf74f7710fa9 Mon Sep 17 00:00:00 2001 From: lila Date: Tue, 5 May 2026 19:28:38 +0200 Subject: [PATCH] feat: add stage 3 round 1 enrich script and wire into orchestrator --- data-pipeline/pipeline.ts | 14 +- .../stage-3-enrich/scripts/enrich.ts | 483 ++++++++++++++++++ 2 files changed, 493 insertions(+), 4 deletions(-) create mode 100644 data-pipeline/stage-3-enrich/scripts/enrich.ts diff --git a/data-pipeline/pipeline.ts b/data-pipeline/pipeline.ts index 5be9660..8652817 100644 --- a/data-pipeline/pipeline.ts +++ b/data-pipeline/pipeline.ts @@ -7,6 +7,7 @@ import { openDb } from "./db/index.js"; import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js"; import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js"; import type { ProviderConfig } from "./stage-3-enrich/config.js"; +import { enrich } from "./stage-3-enrich/scripts/enrich.js"; // ── Types ───────────────────────────────────────────────────────────────────── @@ -248,10 +249,15 @@ function runReverseLinkStage(): void { markReverseLinkComplete(); } -function runRound1(provider: ProviderConfig, stats: RunStats): void { +async function runRound1( + provider: ProviderConfig, + stats: RunStats, +): Promise { console.log(`\n [round 1] Running ${provider.name}...`); - // TODO: implement round 1 enrich script - console.log(` [round 1] ${provider.name} — not yet implemented`); + const counts = await enrich(provider); + stats.recordsProcessed += counts.processed; + stats.recordsSkipped += counts.skipped; + stats.needsReview += counts.needsReview; stats.modelsRun.push(provider.name); } @@ -492,7 +498,7 @@ async function main(): Promise { console.log(` [round 1] ${provider.name} — resuming...`); } - runRound1(provider, stats); + await runRound1(provider, stats); } if (shutdownRequested) { diff --git a/data-pipeline/stage-3-enrich/scripts/enrich.ts b/data-pipeline/stage-3-enrich/scripts/enrich.ts new file mode 100644 index 0000000..d5d0fe8 --- /dev/null +++ b/data-pipeline/stage-3-enrich/scripts/enrich.ts @@ -0,0 +1,483 @@ +import { openDb } from "../../db/index.js"; +import type { ProviderConfig } from "../config.js"; +import { CEFR_LEVELS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared"; +import type { SupportedLanguageCode } from "@lila/shared"; + +// ── Types ───────────────────────────────────────────────────────────────────── + +type EntryRow = { + id: number; + headword: string; + language: SupportedLanguageCode; + pos: string; + gloss: string | null; + examples: string; // JSON array string +}; + +type TranslationRow = { + id: number; + target_lang: SupportedLanguageCode; + word: string; +}; + +type LlmResponse = { + headword_cefr: string; + translation_cefr: Partial< + Record> + >; + generated_translations?: Partial>; + generated_gloss?: string; + generated_example?: string; +}; + +type ValidationResult = + | { valid: true; data: LlmResponse } + | { valid: false; reason: string }; + +// ── Constants ───────────────────────────────────────────────────────────────── + +const SUPPORTED_LANG_SET = new Set(SUPPORTED_LANGUAGE_CODES); +const CEFR_SET = new Set(CEFR_LEVELS); + +// ── Prompt builder ──────────────────────────────────────────────────────────── + +function buildPrompt(entry: EntryRow, translations: TranslationRow[]): string { + const examples: string[] = JSON.parse(entry.examples) as string[]; + + // Group translations by language + const byLang = new Map(); + for (const t of translations) { + if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, []); + byLang.get(t.target_lang)!.push(t.word); + } + + // Find missing languages + const coveredLangs = new Set(byLang.keys()); + const missingLangs = SUPPORTED_LANGUAGE_CODES.filter( + (l) => l !== entry.language && !coveredLangs.has(l), + ); + + const existingTranslationsText = + byLang.size > 0 + ? [...byLang.entries()] + .map(([lang, words]) => ` ${lang}: ${words.join(", ")}`) + .join("\n") + : " none"; + + const missingTranslationsText = + missingLangs.length > 0 ? missingLangs.join(", ") : "none"; + + const examplesText = + examples.length > 0 ? examples.map((e) => ` - ${e}`).join("\n") : " none"; + + const glossText = entry.gloss ?? "none"; + + return `You are a language learning expert building a multilingual vocabulary database. + +Given an English word sense, your tasks are: +1. Assign a CEFR level (A1, A2, B1, B2, C1, or C2) to the English headword for this specific sense +2. Assign a CEFR level to each existing translation listed +3. If MISSING TRANSLATIONS lists any languages, generate the single best translation for each +4. If the existing gloss is missing or unsuitable for a language learner, generate a better one +5. If the existing examples are missing or unsuitable for a language learner, generate one natural sentence in English + +Base CEFR levels on how commonly a language learner at that level would encounter this specific sense, not the word in general. Consider register — slang, technical, and archaic words should be rated higher. + +WORD: ${entry.headword} +PART OF SPEECH: ${entry.pos} +GLOSS: ${glossText} +EXAMPLES: +${examplesText} +EXISTING TRANSLATIONS: +${existingTranslationsText} +MISSING TRANSLATIONS: ${missingTranslationsText} + +Respond ONLY with valid JSON and nothing else — no explanation, no markdown: +{ + "headword_cefr": "", + "translation_cefr": { + "": { "": "", ... }, + ... + }, + "generated_translations": { "": "", ... }, + "generated_gloss": "", + "generated_example": "" +} + +Only include "generated_translations" if there are missing languages. +Only include "generated_gloss" if you judge the existing gloss unsuitable. +Only include "generated_example" if you judge the existing examples unsuitable.`; +} + +// ── Validation ──────────────────────────────────────────────────────────────── + +function validateResponse( + raw: string, + translations: TranslationRow[], +): ValidationResult { + let parsed: unknown; + try { + parsed = JSON.parse(raw); + } catch { + return { valid: false, reason: "invalid JSON" }; + } + + if (typeof parsed !== "object" || parsed === null) { + return { valid: false, reason: "response is not an object" }; + } + + const obj = parsed as Record; + + // headword_cefr required + if (typeof obj["headword_cefr"] !== "string") { + return { valid: false, reason: "missing headword_cefr" }; + } + if (!CEFR_SET.has(obj["headword_cefr"])) { + return { + valid: false, + reason: `invalid headword_cefr: ${obj["headword_cefr"]}`, + }; + } + + // translation_cefr required + if ( + typeof obj["translation_cefr"] !== "object" || + obj["translation_cefr"] === null + ) { + return { valid: false, reason: "missing translation_cefr" }; + } + + const translationCefr = obj["translation_cefr"] as Record; + for (const [lang, votes] of Object.entries(translationCefr)) { + if (!SUPPORTED_LANG_SET.has(lang)) { + return { + valid: false, + reason: `unsupported language in translation_cefr: ${lang}`, + }; + } + if (typeof votes !== "object" || votes === null) { + return { + valid: false, + reason: `translation_cefr.${lang} is not an object`, + }; + } + for (const [word, level] of Object.entries( + votes as Record, + )) { + if (typeof level !== "string" || !CEFR_SET.has(level)) { + return { + valid: false, + reason: `invalid CEFR level for ${lang}.${word}: ${String(level)}`, + }; + } + } + } + + // Verify all existing translations have a CEFR vote + const byLang = new Map>(); + for (const t of translations) { + if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, new Set()); + byLang.get(t.target_lang)!.add(t.word); + } + + for (const [lang, words] of byLang.entries()) { + const votes = translationCefr[lang] as Record | undefined; + if (!votes) { + return { + valid: false, + reason: `missing translation_cefr for language: ${lang}`, + }; + } + for (const word of words) { + if (!votes[word]) { + return { + valid: false, + reason: `missing CEFR vote for ${lang}:${word}`, + }; + } + } + } + + // Optional fields + if (obj["generated_translations"] !== undefined) { + if ( + typeof obj["generated_translations"] !== "object" || + obj["generated_translations"] === null + ) { + return { + valid: false, + reason: "generated_translations is not an object", + }; + } + for (const [lang, word] of Object.entries( + obj["generated_translations"] as Record, + )) { + if (!SUPPORTED_LANG_SET.has(lang)) { + return { + valid: false, + reason: `unsupported language in generated_translations: ${lang}`, + }; + } + if (typeof word !== "string" || !word.trim()) { + return { + valid: false, + reason: `empty generated translation for ${lang}`, + }; + } + } + } + + if ( + obj["generated_gloss"] !== undefined && + typeof obj["generated_gloss"] !== "string" + ) { + return { valid: false, reason: "generated_gloss is not a string" }; + } + + if ( + obj["generated_example"] !== undefined && + typeof obj["generated_example"] !== "string" + ) { + return { valid: false, reason: "generated_example is not a string" }; + } + + return { valid: true, data: obj as unknown as LlmResponse }; +} + +// ── LLM call ────────────────────────────────────────────────────────────────── + +async function callLlm( + prompt: string, + provider: ProviderConfig, +): Promise { + const response = await fetch(`${provider.baseURL}/chat/completions`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${provider.apiKey}`, + }, + body: JSON.stringify({ + model: provider.model, + max_tokens: provider.maxTokens, + messages: [{ role: "user", content: prompt }], + temperature: 0.1, // low temperature for consistent structured output + }), + }); + + if (!response.ok) { + throw new Error(`LLM API error: ${response.status} ${response.statusText}`); + } + + const data = (await response.json()) as { + choices?: { message?: { content?: string } }[]; + }; + + const content = data.choices?.[0]?.message?.content; + if (!content) throw new Error("LLM returned empty response"); + + // Strip markdown code fences if present + return content + .replace(/```json\n?/g, "") + .replace(/```\n?/g, "") + .trim(); +} + +// ── Write results ───────────────────────────────────────────────────────────── + +function writeResults( + entryId: number, + modelName: string, + data: LlmResponse, + translations: TranslationRow[], +): void { + const db = openDb(); + + const insertEntryCefr = db.prepare(` + INSERT INTO model_entry_cefr_votes (entry_id, model_name, cefr_level) + VALUES (?, ?, ?) + ON CONFLICT (entry_id, model_name) DO NOTHING + `); + + const insertTranslationCefr = db.prepare(` + INSERT INTO model_translation_cefr_votes (translation_id, model_name, cefr_level) + VALUES (?, ?, ?) + ON CONFLICT (translation_id, model_name) DO NOTHING + `); + + const insertGeneratedTranslation = db.prepare(` + INSERT INTO generated_translations (entry_id, model_name, target_lang, word) + VALUES (?, ?, ?, ?) + ON CONFLICT (entry_id, model_name, target_lang) DO NOTHING + `); + + const insertGeneratedGloss = db.prepare(` + INSERT INTO generated_glosses (entry_id, model_name, text) + VALUES (?, ?, ?) + ON CONFLICT (entry_id, model_name) DO NOTHING + `); + + const insertGeneratedExample = db.prepare(` + INSERT INTO generated_examples (entry_id, model_name, text) + VALUES (?, ?, ?) + ON CONFLICT (entry_id, model_name) DO NOTHING + `); + + const updateRunStatus = db.prepare(` + INSERT INTO run_status (entry_id, model_name, stage, status) + VALUES (?, ?, 'round1', 'complete') + ON CONFLICT (entry_id, model_name, stage) + DO UPDATE SET status = 'complete', updated_at = datetime('now') + `); + + db.transaction(() => { + // CEFR vote for headword + insertEntryCefr.run(entryId, modelName, data.headword_cefr); + + // CEFR votes for translations + for (const t of translations) { + const level = data.translation_cefr[t.target_lang]?.[t.word]; + if (level) { + insertTranslationCefr.run(t.id, modelName, level); + } + } + + // Generated translations + if (data.generated_translations) { + for (const [lang, word] of Object.entries(data.generated_translations)) { + if (word.trim()) { + insertGeneratedTranslation.run(entryId, modelName, lang, word.trim()); + } + } + } + + // Generated gloss + if (data.generated_gloss?.trim()) { + insertGeneratedGloss.run(entryId, modelName, data.generated_gloss.trim()); + } + + // Generated example + if (data.generated_example?.trim()) { + insertGeneratedExample.run( + entryId, + modelName, + data.generated_example.trim(), + ); + } + + // Mark complete + updateRunStatus.run(entryId, modelName); + })(); + + db.close(); +} + +function markNeedsReview( + entryId: number, + modelName: string, + reason: string, +): void { + const db = openDb(); + db.prepare( + ` + INSERT INTO run_status (entry_id, model_name, stage, status) + VALUES (?, ?, 'round1', 'needs_review') + ON CONFLICT (entry_id, model_name, stage) + DO UPDATE SET status = 'needs_review', updated_at = datetime('now') + `, + ).run(entryId, modelName); + db.close(); + console.warn(` needs_review: entry ${entryId} — ${reason}`); +} + +// ── Main enrich function ────────────────────────────────────────────────────── + +export async function enrich( + provider: ProviderConfig, +): Promise<{ processed: number; skipped: number; needsReview: number }> { + const db = openDb(); + + // Load all English entries + const allEntries = db + .prepare(`SELECT * FROM entries WHERE language = 'en'`) + .all() as EntryRow[]; + + // Find already processed entries for this model + const processed = db + .prepare( + `SELECT entry_id FROM run_status + WHERE model_name = ? AND stage = 'round1' + AND status IN ('complete', 'needs_review')`, + ) + .all(provider.name) as { entry_id: number }[]; + + const processedIds = new Set(processed.map((r) => r.entry_id)); + const pending = allEntries.filter((e) => !processedIds.has(e.id)); + + db.close(); + + console.log(`\n Model: ${provider.name}`); + console.log(` Total entries: ${allEntries.length.toLocaleString()}`); + console.log(` Already processed: ${processedIds.size.toLocaleString()}`); + console.log(` Pending: ${pending.length.toLocaleString()}`); + + if (pending.length === 0) { + console.log(" Nothing to process."); + return { processed: 0, skipped: allEntries.length, needsReview: 0 }; + } + + let processedCount = 0; + let needsReviewCount = 0; + + for (const entry of pending) { + const db2 = openDb(); + const translations = db2 + .prepare( + `SELECT id, target_lang, word FROM translations + WHERE entry_id = ? AND source = 'kaikki'`, + ) + .all(entry.id) as TranslationRow[]; + db2.close(); + + const prompt = buildPrompt(entry, translations); + + let raw: string; + try { + raw = await callLlm(prompt, provider); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + markNeedsReview(entry.id, provider.name, `LLM call failed: ${message}`); + needsReviewCount++; + continue; + } + + const validation = validateResponse(raw, translations); + + if (!validation.valid) { + markNeedsReview( + entry.id, + provider.name, + `validation failed: ${validation.reason}`, + ); + needsReviewCount++; + continue; + } + + writeResults(entry.id, provider.name, validation.data, translations); + processedCount++; + + if (processedCount % 100 === 0) { + console.log( + ` Processed ${processedCount.toLocaleString()} entries...`, + ); + } + } + + console.log(` Processed: ${processedCount.toLocaleString()}`); + console.log(` Needs review: ${needsReviewCount.toLocaleString()}`); + + return { + processed: processedCount, + skipped: processedIds.size, + needsReview: needsReviewCount, + }; +}