feat: add stage 3 round 1 enrich script and wire into orchestrator

This commit is contained in:
lila 2026-05-05 19:28:38 +02:00
parent 76af2ab093
commit 9642daf6dd
2 changed files with 493 additions and 4 deletions

View file

@ -7,6 +7,7 @@ import { openDb } from "./db/index.js";
import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js"; import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js";
import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js"; import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js";
import type { ProviderConfig } from "./stage-3-enrich/config.js"; import type { ProviderConfig } from "./stage-3-enrich/config.js";
import { enrich } from "./stage-3-enrich/scripts/enrich.js";
// ── Types ───────────────────────────────────────────────────────────────────── // ── Types ─────────────────────────────────────────────────────────────────────
@ -248,10 +249,15 @@ function runReverseLinkStage(): void {
markReverseLinkComplete(); markReverseLinkComplete();
} }
function runRound1(provider: ProviderConfig, stats: RunStats): void { async function runRound1(
provider: ProviderConfig,
stats: RunStats,
): Promise<void> {
console.log(`\n [round 1] Running ${provider.name}...`); console.log(`\n [round 1] Running ${provider.name}...`);
// TODO: implement round 1 enrich script const counts = await enrich(provider);
console.log(` [round 1] ${provider.name} — not yet implemented`); stats.recordsProcessed += counts.processed;
stats.recordsSkipped += counts.skipped;
stats.needsReview += counts.needsReview;
stats.modelsRun.push(provider.name); stats.modelsRun.push(provider.name);
} }
@ -492,7 +498,7 @@ async function main(): Promise<void> {
console.log(` [round 1] ${provider.name} — resuming...`); console.log(` [round 1] ${provider.name} — resuming...`);
} }
runRound1(provider, stats); await runRound1(provider, stats);
} }
if (shutdownRequested) { if (shutdownRequested) {

View file

@ -0,0 +1,483 @@
import { openDb } from "../../db/index.js";
import type { ProviderConfig } from "../config.js";
import { CEFR_LEVELS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/**
 * Shape that rows from the `entries` table are cast to in `enrich`
 * (`SELECT * FROM entries WHERE language = 'en'`). The cast is unchecked,
 * so the table may carry further columns that are not listed here.
 */
type EntryRow = {
id: number;
headword: string;
language: SupportedLanguageCode;
pos: string;
// `buildPrompt` prints "none" in the prompt when this is null.
gloss: string | null;
examples: string; // JSON array string
};
/**
 * Projection of a `translations` row (`id`, `target_lang`, `word`) as
 * selected in `enrich` for a single entry.
 */
type TranslationRow = {
// Used as `translation_id` when a CEFR vote for this translation is stored.
id: number;
target_lang: SupportedLanguageCode;
word: string;
};
/**
 * JSON payload the model is asked to return by `buildPrompt`, after it has
 * passed `validateResponse`.
 */
type LlmResponse = {
// CEFR level for the headword; checked against CEFR_LEVELS during validation.
headword_cefr: string;
// language code -> translation word -> CEFR level.
translation_cefr: Partial<
Record<SupportedLanguageCode, Record<string, string>>
>;
// language code -> one generated translation; the prompt asks for it only when languages are missing.
generated_translations?: Partial<Record<SupportedLanguageCode, string>>;
// Present only when the model judged the existing gloss unsuitable.
generated_gloss?: string;
// Present only when the model judged the existing examples unsuitable.
generated_example?: string;
};
/**
 * Outcome of `validateResponse`, discriminated on `valid`: either the
 * validated payload or a human-readable rejection reason.
 */
type ValidationResult =
| { valid: true; data: LlmResponse }
| { valid: false; reason: string };
// ── Constants ─────────────────────────────────────────────────────────────────
// Set view of the supported language codes; `validateResponse` uses it to
// check language keys returned by the model.
const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
// Set view of the CEFR levels; `validateResponse` uses it to check every
// level string returned by the model.
const CEFR_SET = new Set<string>(CEFR_LEVELS);
// ── Prompt builder ────────────────────────────────────────────────────────────
/**
 * Builds the round-1 enrichment prompt for one entry.
 *
 * The prompt lists the headword, part of speech, gloss, examples, the
 * existing translations grouped by language, and the supported languages
 * (other than the entry's own) that have no translation yet.
 *
 * @param entry - Entry row; `examples` is expected to hold a JSON array string.
 * @param translations - Existing translations for the entry.
 * @returns The full prompt text to send as the single user message.
 */
function buildPrompt(entry: EntryRow, translations: TranslationRow[]): string {
  // Parse defensively: a malformed or non-array `examples` value is treated
  // as "no examples" instead of throwing and aborting the caller's run.
  let examples: string[] = [];
  try {
    const parsedExamples: unknown = JSON.parse(entry.examples);
    if (Array.isArray(parsedExamples)) {
      examples = parsedExamples.filter(
        (e): e is string => typeof e === "string",
      );
    }
  } catch {
    // Leave `examples` empty; the prompt will show "none".
  }

  // Group translations by language
  const byLang = new Map<SupportedLanguageCode, string[]>();
  for (const t of translations) {
    const words = byLang.get(t.target_lang);
    if (words) {
      words.push(t.word);
    } else {
      byLang.set(t.target_lang, [t.word]);
    }
  }

  // Find missing languages: supported, not the entry's own, not yet covered
  const missingLangs = SUPPORTED_LANGUAGE_CODES.filter(
    (l) => l !== entry.language && !byLang.has(l),
  );

  const existingTranslationsText =
    byLang.size > 0
      ? [...byLang.entries()]
          .map(([lang, words]) => ` ${lang}: ${words.join(", ")}`)
          .join("\n")
      : " none";
  const missingTranslationsText =
    missingLangs.length > 0 ? missingLangs.join(", ") : "none";
  const examplesText =
    examples.length > 0 ? examples.map((e) => ` - ${e}`).join("\n") : " none";
  const glossText = entry.gloss ?? "none";

  // The template lines below are runtime text and intentionally start at
  // column 0.
  return `You are a language learning expert building a multilingual vocabulary database.
Given an English word sense, your tasks are:
1. Assign a CEFR level (A1, A2, B1, B2, C1, or C2) to the English headword for this specific sense
2. Assign a CEFR level to each existing translation listed
3. If MISSING TRANSLATIONS lists any languages, generate the single best translation for each
4. If the existing gloss is missing or unsuitable for a language learner, generate a better one
5. If the existing examples are missing or unsuitable for a language learner, generate one natural sentence in English
Base CEFR levels on how commonly a language learner at that level would encounter this specific sense, not the word in general. Consider register — slang, technical, and archaic words should be rated higher.
WORD: ${entry.headword}
PART OF SPEECH: ${entry.pos}
GLOSS: ${glossText}
EXAMPLES:
${examplesText}
EXISTING TRANSLATIONS:
${existingTranslationsText}
MISSING TRANSLATIONS: ${missingTranslationsText}
Respond ONLY with valid JSON and nothing else — no explanation, no markdown:
{
"headword_cefr": "<level>",
"translation_cefr": {
"<lang>": { "<word>": "<level>", ... },
...
},
"generated_translations": { "<lang>": "<word>", ... },
"generated_gloss": "<gloss if needed, omit if existing is fine>",
"generated_example": "<example sentence in English if needed, omit if existing is fine>"
}
Only include "generated_translations" if there are missing languages.
Only include "generated_gloss" if you judge the existing gloss unsuitable.
Only include "generated_example" if you judge the existing examples unsuitable.`;
}
// ── Validation ────────────────────────────────────────────────────────────────
/**
 * Parses and validates the raw model output against the `LlmResponse` shape.
 *
 * Checks, in order: the text is JSON and a plain object; `headword_cefr` is a
 * known CEFR level; `translation_cefr` maps supported languages to objects of
 * word -> known CEFR level; every existing translation has a vote; and the
 * optional `generated_*` fields, when present, have the right types.
 *
 * @param raw - Model output text (code fences already stripped by the caller).
 * @param translations - Existing translations that must each receive a vote.
 * @returns `{ valid: true, data }` on success, otherwise `{ valid: false, reason }`
 *          describing the first failed check.
 */
function validateResponse(
  raw: string,
  translations: TranslationRow[],
): ValidationResult {
  // Plain JSON object: excludes null and arrays (typeof [] is "object").
  const isJsonObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);
  // Own-property check so inherited keys such as "constructor" or "toString"
  // are never mistaken for data supplied by the model.
  const hasOwn = (target: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(target, key);

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return { valid: false, reason: "invalid JSON" };
  }
  if (!isJsonObject(parsed)) {
    return { valid: false, reason: "response is not an object" };
  }
  const obj = parsed;

  // headword_cefr required
  const headwordCefr = obj["headword_cefr"];
  if (typeof headwordCefr !== "string") {
    return { valid: false, reason: "missing headword_cefr" };
  }
  if (!CEFR_SET.has(headwordCefr)) {
    return {
      valid: false,
      reason: `invalid headword_cefr: ${headwordCefr}`,
    };
  }

  // translation_cefr required
  const translationCefr = obj["translation_cefr"];
  if (!isJsonObject(translationCefr)) {
    return { valid: false, reason: "missing translation_cefr" };
  }
  for (const [lang, votes] of Object.entries(translationCefr)) {
    if (!SUPPORTED_LANG_SET.has(lang)) {
      return {
        valid: false,
        reason: `unsupported language in translation_cefr: ${lang}`,
      };
    }
    if (!isJsonObject(votes)) {
      return {
        valid: false,
        reason: `translation_cefr.${lang} is not an object`,
      };
    }
    for (const [word, level] of Object.entries(votes)) {
      if (typeof level !== "string" || !CEFR_SET.has(level)) {
        return {
          valid: false,
          reason: `invalid CEFR level for ${lang}.${word}: ${String(level)}`,
        };
      }
    }
  }

  // Verify all existing translations have a CEFR vote
  const wordsByLang = new Map<string, Set<string>>();
  for (const t of translations) {
    const words = wordsByLang.get(t.target_lang);
    if (words) {
      words.add(t.word);
    } else {
      wordsByLang.set(t.target_lang, new Set([t.word]));
    }
  }
  for (const [lang, words] of wordsByLang.entries()) {
    const votes = hasOwn(translationCefr, lang)
      ? translationCefr[lang]
      : undefined;
    if (!isJsonObject(votes)) {
      return {
        valid: false,
        reason: `missing translation_cefr for language: ${lang}`,
      };
    }
    for (const word of words) {
      // Every own value was already checked to be a valid level above, so
      // presence as an own key is sufficient here.
      if (!hasOwn(votes, word)) {
        return {
          valid: false,
          reason: `missing CEFR vote for ${lang}:${word}`,
        };
      }
    }
  }

  // Optional fields
  const generatedTranslations = obj["generated_translations"];
  if (generatedTranslations !== undefined) {
    if (!isJsonObject(generatedTranslations)) {
      return {
        valid: false,
        reason: "generated_translations is not an object",
      };
    }
    for (const [lang, word] of Object.entries(generatedTranslations)) {
      if (!SUPPORTED_LANG_SET.has(lang)) {
        return {
          valid: false,
          reason: `unsupported language in generated_translations: ${lang}`,
        };
      }
      if (typeof word !== "string" || !word.trim()) {
        return {
          valid: false,
          reason: `empty generated translation for ${lang}`,
        };
      }
    }
  }
  if (
    obj["generated_gloss"] !== undefined &&
    typeof obj["generated_gloss"] !== "string"
  ) {
    return { valid: false, reason: "generated_gloss is not a string" };
  }
  if (
    obj["generated_example"] !== undefined &&
    typeof obj["generated_example"] !== "string"
  ) {
    return { valid: false, reason: "generated_example is not a string" };
  }

  // Every field of LlmResponse has been checked at runtime above; the cast
  // only tells the compiler what those checks established.
  return { valid: true, data: obj as unknown as LlmResponse };
}
// ── LLM call ──────────────────────────────────────────────────────────────────
/**
 * Sends `prompt` as a single user message to the provider's
 * `/chat/completions` endpoint and returns the first choice's text with
 * markdown code fences removed and surrounding whitespace trimmed.
 *
 * @param prompt - Full prompt text.
 * @param provider - Supplies `baseURL`, `apiKey`, `model` and `maxTokens`.
 * @returns The cleaned message content.
 * @throws Error when the HTTP status is not OK or the content is empty/missing.
 */
async function callLlm(
  prompt: string,
  provider: ProviderConfig,
): Promise<string> {
  const endpoint = `${provider.baseURL}/chat/completions`;
  const payload = {
    model: provider.model,
    max_tokens: provider.maxTokens,
    messages: [{ role: "user", content: prompt }],
    // low temperature for consistent structured output
    temperature: 0.1,
  };

  const res = await fetch(endpoint, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${provider.apiKey}`,
    },
    body: JSON.stringify(payload),
  });
  if (!res.ok) {
    throw new Error(`LLM API error: ${res.status} ${res.statusText}`);
  }

  const json = (await res.json()) as {
    choices?: { message?: { content?: string } }[];
  };
  const text = json.choices?.[0]?.message?.content;
  if (!text) {
    throw new Error("LLM returned empty response");
  }

  // Strip markdown code fences if present: first the ```json openers, then
  // any remaining bare fences.
  const withoutJsonFences = text.replace(/```json\n?/g, "");
  const withoutFences = withoutJsonFences.replace(/```\n?/g, "");
  return withoutFences.trim();
}
// ── Write results ─────────────────────────────────────────────────────────────
/**
 * Persists one validated model response for an entry.
 *
 * Inside a single `db.transaction(...)` call it stores the headword CEFR
 * vote, one CEFR vote per existing translation, any generated translations,
 * gloss and example, and finally upserts the `run_status` row for stage
 * `round1` to `complete`. Vote and generated-content inserts use
 * `ON CONFLICT ... DO NOTHING`, so existing rows are left as they are.
 *
 * @param entryId - Entry the response belongs to.
 * @param modelName - Model name stored alongside every row.
 * @param data - Validated model response.
 * @param translations - Existing translations; their ids key the translation votes.
 */
function writeResults(
  entryId: number,
  modelName: string,
  data: LlmResponse,
  translations: TranslationRow[],
): void {
  const db = openDb();
  // Close the connection even when preparing or executing a statement throws.
  try {
    const insertEntryCefr = db.prepare(`
INSERT INTO model_entry_cefr_votes (entry_id, model_name, cefr_level)
VALUES (?, ?, ?)
ON CONFLICT (entry_id, model_name) DO NOTHING
`);
    const insertTranslationCefr = db.prepare(`
INSERT INTO model_translation_cefr_votes (translation_id, model_name, cefr_level)
VALUES (?, ?, ?)
ON CONFLICT (translation_id, model_name) DO NOTHING
`);
    const insertGeneratedTranslation = db.prepare(`
INSERT INTO generated_translations (entry_id, model_name, target_lang, word)
VALUES (?, ?, ?, ?)
ON CONFLICT (entry_id, model_name, target_lang) DO NOTHING
`);
    const insertGeneratedGloss = db.prepare(`
INSERT INTO generated_glosses (entry_id, model_name, text)
VALUES (?, ?, ?)
ON CONFLICT (entry_id, model_name) DO NOTHING
`);
    const insertGeneratedExample = db.prepare(`
INSERT INTO generated_examples (entry_id, model_name, text)
VALUES (?, ?, ?)
ON CONFLICT (entry_id, model_name) DO NOTHING
`);
    const updateRunStatus = db.prepare(`
INSERT INTO run_status (entry_id, model_name, stage, status)
VALUES (?, ?, 'round1', 'complete')
ON CONFLICT (entry_id, model_name, stage)
DO UPDATE SET status = 'complete', updated_at = datetime('now')
`);

    db.transaction(() => {
      // CEFR vote for headword
      insertEntryCefr.run(entryId, modelName, data.headword_cefr);

      // CEFR votes for translations
      for (const t of translations) {
        const level: unknown = data.translation_cefr[t.target_lang]?.[t.word];
        // Require a non-empty string so a prototype-inherited value (e.g. for
        // a word spelled "constructor") is never bound as a CEFR level.
        if (typeof level === "string" && level) {
          insertTranslationCefr.run(t.id, modelName, level);
        }
      }

      // Generated translations
      if (data.generated_translations) {
        for (const [lang, word] of Object.entries(
          data.generated_translations,
        )) {
          const trimmedWord = word?.trim();
          if (trimmedWord) {
            insertGeneratedTranslation.run(
              entryId,
              modelName,
              lang,
              trimmedWord,
            );
          }
        }
      }

      // Generated gloss
      const gloss = data.generated_gloss?.trim();
      if (gloss) {
        insertGeneratedGloss.run(entryId, modelName, gloss);
      }

      // Generated example
      const example = data.generated_example?.trim();
      if (example) {
        insertGeneratedExample.run(entryId, modelName, example);
      }

      // Mark complete
      updateRunStatus.run(entryId, modelName);
    })();
  } finally {
    db.close();
  }
}
/**
 * Upserts the `run_status` row for (`entryId`, `modelName`, stage `round1`)
 * to `needs_review` and logs a warning.
 *
 * @param entryId - Entry that could not be processed.
 * @param modelName - Model the failure belongs to.
 * @param reason - Why the entry needs review; it is only logged, the SQL
 *                 statement does not store it.
 */
function markNeedsReview(
  entryId: number,
  modelName: string,
  reason: string,
): void {
  const db = openDb();
  // Close the connection even when the upsert throws.
  try {
    db.prepare(
      `
INSERT INTO run_status (entry_id, model_name, stage, status)
VALUES (?, ?, 'round1', 'needs_review')
ON CONFLICT (entry_id, model_name, stage)
DO UPDATE SET status = 'needs_review', updated_at = datetime('now')
`,
    ).run(entryId, modelName);
  } finally {
    db.close();
  }
  // Separate the id from the reason; without a separator the two run
  // together (e.g. "entry 42validation failed: ...").
  console.warn(` needs_review: entry ${entryId} — ${reason}`);
}
// ── Main enrich function ──────────────────────────────────────────────────────
/**
 * Runs round-1 enrichment for one provider.
 *
 * Loads all English entries, skips those that already have a `round1`
 * `run_status` of `complete` or `needs_review` for this model, and for each
 * remaining entry: loads its `kaikki` translations, builds the prompt, calls
 * the model, validates the response, and either writes the results or marks
 * the entry `needs_review`. Entries are processed one at a time.
 *
 * NOTE(review): an entry marked `needs_review` is excluded from later runs by
 * the status filter below, so a transient API failure (e.g. rate limiting) is
 * not retried automatically — confirm this is intended.
 *
 * @param provider - Provider configuration; `provider.name` is the model name
 *                   recorded with every row.
 * @returns Counts of entries written (`processed`), entries skipped because
 *          they were already handled (`skipped`), and entries marked
 *          `needs_review` during this call (`needsReview`).
 */
export async function enrich(
  provider: ProviderConfig,
): Promise<{ processed: number; skipped: number; needsReview: number }> {
  const toMessage = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  // One read connection for the whole run, released in `finally` so it is
  // closed even when something below throws.
  const db = openDb();
  try {
    // Load all English entries
    const allEntries = db
      .prepare(`SELECT * FROM entries WHERE language = 'en'`)
      .all() as EntryRow[];

    // Find already processed entries for this model
    const processed = db
      .prepare(
        `SELECT entry_id FROM run_status
WHERE model_name = ? AND stage = 'round1'
AND status IN ('complete', 'needs_review')`,
      )
      .all(provider.name) as { entry_id: number }[];
    const processedIds = new Set(processed.map((r) => r.entry_id));
    const pending = allEntries.filter((e) => !processedIds.has(e.id));
    // Derived from the entry list so both return paths report the same
    // quantity, even if run_status holds rows for entries not loaded above.
    const skippedCount = allEntries.length - pending.length;

    console.log(`\n Model: ${provider.name}`);
    console.log(` Total entries: ${allEntries.length.toLocaleString()}`);
    console.log(` Already processed: ${skippedCount.toLocaleString()}`);
    console.log(` Pending: ${pending.length.toLocaleString()}`);

    if (pending.length === 0) {
      console.log(" Nothing to process.");
      return { processed: 0, skipped: skippedCount, needsReview: 0 };
    }

    // Prepared once and reused for every entry instead of opening a new
    // connection and statement per iteration.
    const selectTranslations = db.prepare(
      `SELECT id, target_lang, word FROM translations
WHERE entry_id = ? AND source = 'kaikki'`,
    );

    let processedCount = 0;
    let needsReviewCount = 0;

    for (const entry of pending) {
      const translations = selectTranslations.all(
        entry.id,
      ) as TranslationRow[];

      // A bad entry should be flagged for review, not abort the whole run.
      let prompt: string;
      try {
        prompt = buildPrompt(entry, translations);
      } catch (err) {
        markNeedsReview(
          entry.id,
          provider.name,
          `prompt build failed: ${toMessage(err)}`,
        );
        needsReviewCount++;
        continue;
      }

      let raw: string;
      try {
        raw = await callLlm(prompt, provider);
      } catch (err) {
        markNeedsReview(
          entry.id,
          provider.name,
          `LLM call failed: ${toMessage(err)}`,
        );
        needsReviewCount++;
        continue;
      }

      const validation = validateResponse(raw, translations);
      if (!validation.valid) {
        markNeedsReview(
          entry.id,
          provider.name,
          `validation failed: ${validation.reason}`,
        );
        needsReviewCount++;
        continue;
      }

      writeResults(entry.id, provider.name, validation.data, translations);
      processedCount++;
      if (processedCount % 100 === 0) {
        console.log(
          ` Processed ${processedCount.toLocaleString()} entries...`,
        );
      }
    }

    console.log(` Processed: ${processedCount.toLocaleString()}`);
    console.log(` Needs review: ${needsReviewCount.toLocaleString()}`);
    return {
      processed: processedCount,
      skipped: skippedCount,
      needsReview: needsReviewCount,
    };
  } finally {
    db.close();
  }
}