feat: add stage 3 round 1 enrich script and wire into orchestrator
This commit is contained in:
parent
76af2ab093
commit
9642daf6dd
2 changed files with 493 additions and 4 deletions
|
|
@ -7,6 +7,7 @@ import { openDb } from "./db/index.js";
|
|||
import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js";
|
||||
import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js";
|
||||
import type { ProviderConfig } from "./stage-3-enrich/config.js";
|
||||
import { enrich } from "./stage-3-enrich/scripts/enrich.js";
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -248,10 +249,15 @@ function runReverseLinkStage(): void {
|
|||
markReverseLinkComplete();
|
||||
}
|
||||
|
||||
function runRound1(provider: ProviderConfig, stats: RunStats): void {
|
||||
async function runRound1(
|
||||
provider: ProviderConfig,
|
||||
stats: RunStats,
|
||||
): Promise<void> {
|
||||
console.log(`\n [round 1] Running ${provider.name}...`);
|
||||
// TODO: implement round 1 enrich script
|
||||
console.log(` [round 1] ${provider.name} — not yet implemented`);
|
||||
const counts = await enrich(provider);
|
||||
stats.recordsProcessed += counts.processed;
|
||||
stats.recordsSkipped += counts.skipped;
|
||||
stats.needsReview += counts.needsReview;
|
||||
stats.modelsRun.push(provider.name);
|
||||
}
|
||||
|
||||
|
|
@ -492,7 +498,7 @@ async function main(): Promise<void> {
|
|||
console.log(` [round 1] ${provider.name} — resuming...`);
|
||||
}
|
||||
|
||||
runRound1(provider, stats);
|
||||
await runRound1(provider, stats);
|
||||
}
|
||||
|
||||
if (shutdownRequested) {
|
||||
|
|
|
|||
483
data-pipeline/stage-3-enrich/scripts/enrich.ts
Normal file
483
data-pipeline/stage-3-enrich/scripts/enrich.ts
Normal file
|
|
@ -0,0 +1,483 @@
|
|||
import { openDb } from "../../db/index.js";
|
||||
import type { ProviderConfig } from "../config.js";
|
||||
import { CEFR_LEVELS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import type { SupportedLanguageCode } from "@lila/shared";
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
type EntryRow = {
|
||||
id: number;
|
||||
headword: string;
|
||||
language: SupportedLanguageCode;
|
||||
pos: string;
|
||||
gloss: string | null;
|
||||
examples: string; // JSON array string
|
||||
};
|
||||
|
||||
type TranslationRow = {
|
||||
id: number;
|
||||
target_lang: SupportedLanguageCode;
|
||||
word: string;
|
||||
};
|
||||
|
||||
type LlmResponse = {
|
||||
headword_cefr: string;
|
||||
translation_cefr: Partial<
|
||||
Record<SupportedLanguageCode, Record<string, string>>
|
||||
>;
|
||||
generated_translations?: Partial<Record<SupportedLanguageCode, string>>;
|
||||
generated_gloss?: string;
|
||||
generated_example?: string;
|
||||
};
|
||||
|
||||
type ValidationResult =
|
||||
| { valid: true; data: LlmResponse }
|
||||
| { valid: false; reason: string };
|
||||
|
||||
// ── Constants ─────────────────────────────────────────────────────────────────
|
||||
|
||||
const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
|
||||
const CEFR_SET = new Set<string>(CEFR_LEVELS);
|
||||
|
||||
// ── Prompt builder ────────────────────────────────────────────────────────────
|
||||
|
||||
function buildPrompt(entry: EntryRow, translations: TranslationRow[]): string {
|
||||
const examples: string[] = JSON.parse(entry.examples) as string[];
|
||||
|
||||
// Group translations by language
|
||||
const byLang = new Map<SupportedLanguageCode, string[]>();
|
||||
for (const t of translations) {
|
||||
if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, []);
|
||||
byLang.get(t.target_lang)!.push(t.word);
|
||||
}
|
||||
|
||||
// Find missing languages
|
||||
const coveredLangs = new Set(byLang.keys());
|
||||
const missingLangs = SUPPORTED_LANGUAGE_CODES.filter(
|
||||
(l) => l !== entry.language && !coveredLangs.has(l),
|
||||
);
|
||||
|
||||
const existingTranslationsText =
|
||||
byLang.size > 0
|
||||
? [...byLang.entries()]
|
||||
.map(([lang, words]) => ` ${lang}: ${words.join(", ")}`)
|
||||
.join("\n")
|
||||
: " none";
|
||||
|
||||
const missingTranslationsText =
|
||||
missingLangs.length > 0 ? missingLangs.join(", ") : "none";
|
||||
|
||||
const examplesText =
|
||||
examples.length > 0 ? examples.map((e) => ` - ${e}`).join("\n") : " none";
|
||||
|
||||
const glossText = entry.gloss ?? "none";
|
||||
|
||||
return `You are a language learning expert building a multilingual vocabulary database.
|
||||
|
||||
Given an English word sense, your tasks are:
|
||||
1. Assign a CEFR level (A1, A2, B1, B2, C1, or C2) to the English headword for this specific sense
|
||||
2. Assign a CEFR level to each existing translation listed
|
||||
3. If MISSING TRANSLATIONS lists any languages, generate the single best translation for each
|
||||
4. If the existing gloss is missing or unsuitable for a language learner, generate a better one
|
||||
5. If the existing examples are missing or unsuitable for a language learner, generate one natural sentence in English
|
||||
|
||||
Base CEFR levels on how commonly a language learner at that level would encounter this specific sense, not the word in general. Consider register — slang, technical, and archaic words should be rated higher.
|
||||
|
||||
WORD: ${entry.headword}
|
||||
PART OF SPEECH: ${entry.pos}
|
||||
GLOSS: ${glossText}
|
||||
EXAMPLES:
|
||||
${examplesText}
|
||||
EXISTING TRANSLATIONS:
|
||||
${existingTranslationsText}
|
||||
MISSING TRANSLATIONS: ${missingTranslationsText}
|
||||
|
||||
Respond ONLY with valid JSON and nothing else — no explanation, no markdown:
|
||||
{
|
||||
"headword_cefr": "<level>",
|
||||
"translation_cefr": {
|
||||
"<lang>": { "<word>": "<level>", ... },
|
||||
...
|
||||
},
|
||||
"generated_translations": { "<lang>": "<word>", ... },
|
||||
"generated_gloss": "<gloss if needed, omit if existing is fine>",
|
||||
"generated_example": "<example sentence in English if needed, omit if existing is fine>"
|
||||
}
|
||||
|
||||
Only include "generated_translations" if there are missing languages.
|
||||
Only include "generated_gloss" if you judge the existing gloss unsuitable.
|
||||
Only include "generated_example" if you judge the existing examples unsuitable.`;
|
||||
}
|
||||
|
||||
// ── Validation ────────────────────────────────────────────────────────────────
|
||||
|
||||
function validateResponse(
|
||||
raw: string,
|
||||
translations: TranslationRow[],
|
||||
): ValidationResult {
|
||||
let parsed: unknown;
|
||||
try {
|
||||
parsed = JSON.parse(raw);
|
||||
} catch {
|
||||
return { valid: false, reason: "invalid JSON" };
|
||||
}
|
||||
|
||||
if (typeof parsed !== "object" || parsed === null) {
|
||||
return { valid: false, reason: "response is not an object" };
|
||||
}
|
||||
|
||||
const obj = parsed as Record<string, unknown>;
|
||||
|
||||
// headword_cefr required
|
||||
if (typeof obj["headword_cefr"] !== "string") {
|
||||
return { valid: false, reason: "missing headword_cefr" };
|
||||
}
|
||||
if (!CEFR_SET.has(obj["headword_cefr"])) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `invalid headword_cefr: ${obj["headword_cefr"]}`,
|
||||
};
|
||||
}
|
||||
|
||||
// translation_cefr required
|
||||
if (
|
||||
typeof obj["translation_cefr"] !== "object" ||
|
||||
obj["translation_cefr"] === null
|
||||
) {
|
||||
return { valid: false, reason: "missing translation_cefr" };
|
||||
}
|
||||
|
||||
const translationCefr = obj["translation_cefr"] as Record<string, unknown>;
|
||||
for (const [lang, votes] of Object.entries(translationCefr)) {
|
||||
if (!SUPPORTED_LANG_SET.has(lang)) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `unsupported language in translation_cefr: ${lang}`,
|
||||
};
|
||||
}
|
||||
if (typeof votes !== "object" || votes === null) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `translation_cefr.${lang} is not an object`,
|
||||
};
|
||||
}
|
||||
for (const [word, level] of Object.entries(
|
||||
votes as Record<string, unknown>,
|
||||
)) {
|
||||
if (typeof level !== "string" || !CEFR_SET.has(level)) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `invalid CEFR level for ${lang}.${word}: ${String(level)}`,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Verify all existing translations have a CEFR vote
|
||||
const byLang = new Map<string, Set<string>>();
|
||||
for (const t of translations) {
|
||||
if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, new Set());
|
||||
byLang.get(t.target_lang)!.add(t.word);
|
||||
}
|
||||
|
||||
for (const [lang, words] of byLang.entries()) {
|
||||
const votes = translationCefr[lang] as Record<string, string> | undefined;
|
||||
if (!votes) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `missing translation_cefr for language: ${lang}`,
|
||||
};
|
||||
}
|
||||
for (const word of words) {
|
||||
if (!votes[word]) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `missing CEFR vote for ${lang}:${word}`,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Optional fields
|
||||
if (obj["generated_translations"] !== undefined) {
|
||||
if (
|
||||
typeof obj["generated_translations"] !== "object" ||
|
||||
obj["generated_translations"] === null
|
||||
) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: "generated_translations is not an object",
|
||||
};
|
||||
}
|
||||
for (const [lang, word] of Object.entries(
|
||||
obj["generated_translations"] as Record<string, unknown>,
|
||||
)) {
|
||||
if (!SUPPORTED_LANG_SET.has(lang)) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `unsupported language in generated_translations: ${lang}`,
|
||||
};
|
||||
}
|
||||
if (typeof word !== "string" || !word.trim()) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `empty generated translation for ${lang}`,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (
|
||||
obj["generated_gloss"] !== undefined &&
|
||||
typeof obj["generated_gloss"] !== "string"
|
||||
) {
|
||||
return { valid: false, reason: "generated_gloss is not a string" };
|
||||
}
|
||||
|
||||
if (
|
||||
obj["generated_example"] !== undefined &&
|
||||
typeof obj["generated_example"] !== "string"
|
||||
) {
|
||||
return { valid: false, reason: "generated_example is not a string" };
|
||||
}
|
||||
|
||||
return { valid: true, data: obj as unknown as LlmResponse };
|
||||
}
|
||||
|
||||
// ── LLM call ──────────────────────────────────────────────────────────────────
|
||||
|
||||
async function callLlm(
|
||||
prompt: string,
|
||||
provider: ProviderConfig,
|
||||
): Promise<string> {
|
||||
const response = await fetch(`${provider.baseURL}/chat/completions`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${provider.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: provider.model,
|
||||
max_tokens: provider.maxTokens,
|
||||
messages: [{ role: "user", content: prompt }],
|
||||
temperature: 0.1, // low temperature for consistent structured output
|
||||
}),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`LLM API error: ${response.status} ${response.statusText}`);
|
||||
}
|
||||
|
||||
const data = (await response.json()) as {
|
||||
choices?: { message?: { content?: string } }[];
|
||||
};
|
||||
|
||||
const content = data.choices?.[0]?.message?.content;
|
||||
if (!content) throw new Error("LLM returned empty response");
|
||||
|
||||
// Strip markdown code fences if present
|
||||
return content
|
||||
.replace(/```json\n?/g, "")
|
||||
.replace(/```\n?/g, "")
|
||||
.trim();
|
||||
}
|
||||
|
||||
// ── Write results ─────────────────────────────────────────────────────────────
|
||||
|
||||
function writeResults(
|
||||
entryId: number,
|
||||
modelName: string,
|
||||
data: LlmResponse,
|
||||
translations: TranslationRow[],
|
||||
): void {
|
||||
const db = openDb();
|
||||
|
||||
const insertEntryCefr = db.prepare(`
|
||||
INSERT INTO model_entry_cefr_votes (entry_id, model_name, cefr_level)
|
||||
VALUES (?, ?, ?)
|
||||
ON CONFLICT (entry_id, model_name) DO NOTHING
|
||||
`);
|
||||
|
||||
const insertTranslationCefr = db.prepare(`
|
||||
INSERT INTO model_translation_cefr_votes (translation_id, model_name, cefr_level)
|
||||
VALUES (?, ?, ?)
|
||||
ON CONFLICT (translation_id, model_name) DO NOTHING
|
||||
`);
|
||||
|
||||
const insertGeneratedTranslation = db.prepare(`
|
||||
INSERT INTO generated_translations (entry_id, model_name, target_lang, word)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT (entry_id, model_name, target_lang) DO NOTHING
|
||||
`);
|
||||
|
||||
const insertGeneratedGloss = db.prepare(`
|
||||
INSERT INTO generated_glosses (entry_id, model_name, text)
|
||||
VALUES (?, ?, ?)
|
||||
ON CONFLICT (entry_id, model_name) DO NOTHING
|
||||
`);
|
||||
|
||||
const insertGeneratedExample = db.prepare(`
|
||||
INSERT INTO generated_examples (entry_id, model_name, text)
|
||||
VALUES (?, ?, ?)
|
||||
ON CONFLICT (entry_id, model_name) DO NOTHING
|
||||
`);
|
||||
|
||||
const updateRunStatus = db.prepare(`
|
||||
INSERT INTO run_status (entry_id, model_name, stage, status)
|
||||
VALUES (?, ?, 'round1', 'complete')
|
||||
ON CONFLICT (entry_id, model_name, stage)
|
||||
DO UPDATE SET status = 'complete', updated_at = datetime('now')
|
||||
`);
|
||||
|
||||
db.transaction(() => {
|
||||
// CEFR vote for headword
|
||||
insertEntryCefr.run(entryId, modelName, data.headword_cefr);
|
||||
|
||||
// CEFR votes for translations
|
||||
for (const t of translations) {
|
||||
const level = data.translation_cefr[t.target_lang]?.[t.word];
|
||||
if (level) {
|
||||
insertTranslationCefr.run(t.id, modelName, level);
|
||||
}
|
||||
}
|
||||
|
||||
// Generated translations
|
||||
if (data.generated_translations) {
|
||||
for (const [lang, word] of Object.entries(data.generated_translations)) {
|
||||
if (word.trim()) {
|
||||
insertGeneratedTranslation.run(entryId, modelName, lang, word.trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Generated gloss
|
||||
if (data.generated_gloss?.trim()) {
|
||||
insertGeneratedGloss.run(entryId, modelName, data.generated_gloss.trim());
|
||||
}
|
||||
|
||||
// Generated example
|
||||
if (data.generated_example?.trim()) {
|
||||
insertGeneratedExample.run(
|
||||
entryId,
|
||||
modelName,
|
||||
data.generated_example.trim(),
|
||||
);
|
||||
}
|
||||
|
||||
// Mark complete
|
||||
updateRunStatus.run(entryId, modelName);
|
||||
})();
|
||||
|
||||
db.close();
|
||||
}
|
||||
|
||||
function markNeedsReview(
|
||||
entryId: number,
|
||||
modelName: string,
|
||||
reason: string,
|
||||
): void {
|
||||
const db = openDb();
|
||||
db.prepare(
|
||||
`
|
||||
INSERT INTO run_status (entry_id, model_name, stage, status)
|
||||
VALUES (?, ?, 'round1', 'needs_review')
|
||||
ON CONFLICT (entry_id, model_name, stage)
|
||||
DO UPDATE SET status = 'needs_review', updated_at = datetime('now')
|
||||
`,
|
||||
).run(entryId, modelName);
|
||||
db.close();
|
||||
console.warn(` needs_review: entry ${entryId} — ${reason}`);
|
||||
}
|
||||
|
||||
// ── Main enrich function ──────────────────────────────────────────────────────
|
||||
|
||||
export async function enrich(
|
||||
provider: ProviderConfig,
|
||||
): Promise<{ processed: number; skipped: number; needsReview: number }> {
|
||||
const db = openDb();
|
||||
|
||||
// Load all English entries
|
||||
const allEntries = db
|
||||
.prepare(`SELECT * FROM entries WHERE language = 'en'`)
|
||||
.all() as EntryRow[];
|
||||
|
||||
// Find already processed entries for this model
|
||||
const processed = db
|
||||
.prepare(
|
||||
`SELECT entry_id FROM run_status
|
||||
WHERE model_name = ? AND stage = 'round1'
|
||||
AND status IN ('complete', 'needs_review')`,
|
||||
)
|
||||
.all(provider.name) as { entry_id: number }[];
|
||||
|
||||
const processedIds = new Set(processed.map((r) => r.entry_id));
|
||||
const pending = allEntries.filter((e) => !processedIds.has(e.id));
|
||||
|
||||
db.close();
|
||||
|
||||
console.log(`\n Model: ${provider.name}`);
|
||||
console.log(` Total entries: ${allEntries.length.toLocaleString()}`);
|
||||
console.log(` Already processed: ${processedIds.size.toLocaleString()}`);
|
||||
console.log(` Pending: ${pending.length.toLocaleString()}`);
|
||||
|
||||
if (pending.length === 0) {
|
||||
console.log(" Nothing to process.");
|
||||
return { processed: 0, skipped: allEntries.length, needsReview: 0 };
|
||||
}
|
||||
|
||||
let processedCount = 0;
|
||||
let needsReviewCount = 0;
|
||||
|
||||
for (const entry of pending) {
|
||||
const db2 = openDb();
|
||||
const translations = db2
|
||||
.prepare(
|
||||
`SELECT id, target_lang, word FROM translations
|
||||
WHERE entry_id = ? AND source = 'kaikki'`,
|
||||
)
|
||||
.all(entry.id) as TranslationRow[];
|
||||
db2.close();
|
||||
|
||||
const prompt = buildPrompt(entry, translations);
|
||||
|
||||
let raw: string;
|
||||
try {
|
||||
raw = await callLlm(prompt, provider);
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
markNeedsReview(entry.id, provider.name, `LLM call failed: ${message}`);
|
||||
needsReviewCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
const validation = validateResponse(raw, translations);
|
||||
|
||||
if (!validation.valid) {
|
||||
markNeedsReview(
|
||||
entry.id,
|
||||
provider.name,
|
||||
`validation failed: ${validation.reason}`,
|
||||
);
|
||||
needsReviewCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
writeResults(entry.id, provider.name, validation.data, translations);
|
||||
processedCount++;
|
||||
|
||||
if (processedCount % 100 === 0) {
|
||||
console.log(
|
||||
` Processed ${processedCount.toLocaleString()} entries...`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(` Processed: ${processedCount.toLocaleString()}`);
|
||||
console.log(` Needs review: ${needsReviewCount.toLocaleString()}`);
|
||||
|
||||
return {
|
||||
processed: processedCount,
|
||||
skipped: processedIds.size,
|
||||
needsReview: needsReviewCount,
|
||||
};
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue