feat: enrich script working, redesigning to sub-stage architecture

- Enrich script functional with timeout, progress tracking, rejection mechanism
- Identified ordering issue: CEFR voting needs validated translations first
- Redesign: round1_gloss → round1_example → round1_translations → round1_cefr
- Update data-pipeline.md with new sub-stage design and roadmap
- Qwen3.5-4B confirmed working with thinking disabled
This commit is contained in:
lila 2026-05-07 13:09:43 +02:00
parent 7f10c35e03
commit 73fb12ac35
7 changed files with 337 additions and 122 deletions

Binary file not shown.

View file

@ -60,6 +60,13 @@ CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
UNIQUE (translation_id, model_name)
);
-- One row per (translation, model) pair; the UNIQUE constraint below prevents
-- the same model from being recorded twice for the same translation.
-- NOTE(review): presumably a row means the model answered "reject" for that
-- translation instead of giving a CEFR level — confirm against the writer.
CREATE TABLE IF NOT EXISTS model_translation_rejections (
id INTEGER PRIMARY KEY,
-- The translation this row refers to (foreign key into translations).
translation_id INTEGER NOT NULL REFERENCES translations(id),
-- Name of the model the row is recorded for.
model_name TEXT NOT NULL,
UNIQUE (translation_id, model_name)
);
CREATE TABLE IF NOT EXISTS generated_glosses (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),

View file

@ -20,12 +20,20 @@ export type ProviderConfig = {
// ── Local llama.cpp ───────────────────────────────────────────────────────────
/**
 * Provider entry named "local-qwen3.5-4b", pointing at an OpenAI-style `/v1`
 * endpoint on 127.0.0.1:8080 (listed under the "Local llama.cpp" section).
 *
 * NOTE(review): this is the same baseURL that LOCAL_GEMMA4 uses, so which
 * model actually answers presumably depends on what the local server has
 * loaded — confirm before enabling both at once.
 */
export const LOCAL_QWEN35_4B: ProviderConfig = {
name: "local-qwen3.5-4b",
baseURL: "http://127.0.0.1:8080/v1",
// Placeholder value; presumably not checked by the local server — confirm.
apiKey: "none",
model: "qwen3.5-4b",
maxTokens: 1024, // no reasoning overhead so 1024 is enough
};
/**
 * Provider entry named "local-gemma4-e4b" for the local llama.cpp endpoint on
 * 127.0.0.1:8080.
 *
 * `maxTokens` is declared exactly once. The literal previously listed it twice
 * (512, then 2048); the later value is the one that took effect at runtime, so
 * 2048 is kept and the dead 512 entry is removed.
 */
export const LOCAL_GEMMA4: ProviderConfig = {
  name: "local-gemma4-e4b",
  baseURL: "http://127.0.0.1:8080/v1",
  apiKey: "none", // llama.cpp ignores this
  model: "gemma4-e4b", // llama.cpp ignores model name, uses loaded model
  maxTokens: 2048,
};
export const LOCAL_QWEN7B: ProviderConfig = {
@ -87,13 +95,14 @@ export const ANTHROPIC_SONNET: ProviderConfig = {
// Add new providers here to include them in the voting pool.
// NOTE(review): every provider in the active part of this list also appears
// commented out further down; only LOCAL_QWEN35_4B is listed once. This looks
// like two revisions of the list interleaved — confirm which set is meant to
// be active.
// NOTE(review): LOCAL_GEMMA4 and LOCAL_QWEN35_4B point at the same baseURL, and
// the LOCAL_GEMMA4 entry notes that llama.cpp ignores the model name, so having
// both active would presumably record two votes from one loaded model — verify.
export const ALL_PROVIDERS: ProviderConfig[] = [
LOCAL_GEMMA4,
LOCAL_QWEN7B,
OR_QWEN3_480B,
OR_GEMMA4_31B,
OR_QWEN3_80B,
OR_NEMOTRON,
ANTHROPIC_SONNET,
LOCAL_QWEN35_4B,
// LOCAL_GEMMA4,
// LOCAL_QWEN7B,
// OR_QWEN3_480B,
// OR_GEMMA4_31B,
// OR_QWEN3_80B,
// OR_NEMOTRON,
// ANTHROPIC_SONNET,
];
// ── Key validation ────────────────────────────────────────────────────────────

View file

@ -94,19 +94,68 @@ MISSING TRANSLATIONS: ${missingTranslationsText}
Respond ONLY with valid JSON and nothing else no explanation, no markdown:
{
"headword_cefr": "<level>",
"headword_cefr": "B1",
"translation_cefr": {
"<lang>": { "<word>": "<level>", ... },
...
"de": { "frei": "A2" },
"es": { "libre": "A2" },
"fr": { "libre": "A2" },
"it": { "libero": "A2" }
},
"generated_translations": { "<lang>": "<word>", ... },
"generated_gloss": "<gloss if needed, omit if existing is fine>",
"generated_example": "<example sentence in English if needed, omit if existing is fine>"
"generated_translations": { "missing_lang": "word" },
"generated_gloss": "A clearer definition for learners.",
"generated_example": "A natural example sentence."
}
Only include "generated_translations" if there are missing languages.
Only include "generated_gloss" if you judge the existing gloss unsuitable.
Only include "generated_example" if you judge the existing examples unsuitable.`;
EXAMPLE OF CORRECT BEHAVIOUR:
If you receive:
WORD: cat
EXISTING TRANSLATIONS:
it: gatto, cat
The correct response includes "reject" for "cat" because it is an English word, not Italian:
"translation_cefr": {
"it": { "gatto": "A1", "cat": "reject" }
}
Similarly, if you receive:
EXISTING TRANSLATIONS:
de: frei, -frei
The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
"translation_cefr": {
"de": { "frei": "A2", "-frei": "reject" }
}
EXAMPLE OF CORRECT BEHAVIOUR:
If you receive:
WORD: cat
EXISTING TRANSLATIONS:
it: gatto, cat
The correct response includes "reject" for "cat" because it is an English word, not Italian:
"translation_cefr": {
"it": { "gatto": "A1", "cat": "reject" }
}
Similarly, if you receive:
EXISTING TRANSLATIONS:
de: frei, -frei
The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
"translation_cefr": {
"de": { "frei": "A2", "-frei": "reject" }
}
IMPORTANT:
- You MUST include EVERY translation listed in EXISTING TRANSLATIONS in your response no exceptions
- Use the CEFR level (A1-C2) if the translation is valid for this sense
- Use "reject" if the translation does not fit this specific sense, is not a real word in that language, or is clearly bad data
- Never silently omit a translation every word must get either a CEFR level or "reject"
- translation_cefr must map each language to an object of word:level pairs
- Only include "generated_translations" if MISSING TRANSLATIONS lists languages
- Only include "generated_gloss" if you judge the existing gloss unsuitable
- Only include "generated_example" if you judge the existing examples unsuitable
`;
}
// ── Validation ────────────────────────────────────────────────────────────────
@ -148,30 +197,6 @@ function validateResponse(
}
const translationCefr = obj["translation_cefr"] as Record<string, unknown>;
for (const [lang, votes] of Object.entries(translationCefr)) {
if (!SUPPORTED_LANG_SET.has(lang)) {
return {
valid: false,
reason: `unsupported language in translation_cefr: ${lang}`,
};
}
if (typeof votes !== "object" || votes === null) {
return {
valid: false,
reason: `translation_cefr.${lang} is not an object`,
};
}
for (const [word, level] of Object.entries(
votes as Record<string, unknown>,
)) {
if (typeof level !== "string" || !CEFR_SET.has(level)) {
return {
valid: false,
reason: `invalid CEFR level for ${lang}.${word}: ${String(level)}`,
};
}
}
}
// Verify all existing translations have a CEFR vote
const byLang = new Map<string, Set<string>>();
@ -199,11 +224,11 @@ function validateResponse(
}
// Optional fields
if (obj["generated_translations"] !== undefined) {
if (
typeof obj["generated_translations"] !== "object" ||
obj["generated_translations"] === null
) {
if (
obj["generated_translations"] !== undefined &&
obj["generated_translations"] !== null
) {
if (typeof obj["generated_translations"] !== "object") {
return {
valid: false,
reason: "generated_translations is not an object",
@ -250,19 +275,28 @@ async function callLlm(
prompt: string,
provider: ProviderConfig,
): Promise<string> {
const response = await fetch(`${provider.baseURL}/chat/completions`, {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${provider.apiKey}`,
},
body: JSON.stringify({
model: provider.model,
max_tokens: provider.maxTokens,
messages: [{ role: "user", content: prompt }],
temperature: 0.1, // low temperature for consistent structured output
}),
});
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), 120_000); // 2 minutes
let response: Response;
try {
response = await fetch(`${provider.baseURL}/chat/completions`, {
method: "POST",
signal: controller.signal,
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${provider.apiKey}`,
},
body: JSON.stringify({
model: provider.model,
max_tokens: provider.maxTokens,
messages: [{ role: "user", content: prompt }],
temperature: 0.1,
}),
});
} finally {
clearTimeout(timeout);
}
if (!response.ok) {
throw new Error(`LLM API error: ${response.status} ${response.statusText}`);
@ -272,10 +306,17 @@ async function callLlm(
choices?: { message?: { content?: string } }[];
};
const content = data.choices?.[0]?.message?.content;
const content =
data.choices?.[0]?.message?.content ||
((data.choices?.[0]?.message as Record<string, unknown>)?.[
"reasoning_content"
] as string | undefined);
console.log(
"\n DEBUG response:",
JSON.stringify(data.choices?.[0]?.message),
);
if (!content) throw new Error("LLM returned empty response");
// Strip markdown code fences if present
return content
.replace(/```json\n?/g, "")
.replace(/```\n?/g, "")
@ -333,10 +374,21 @@ function writeResults(
// CEFR vote for headword
insertEntryCefr.run(entryId, modelName, data.headword_cefr);
// CEFR votes for translations
// CEFR votes and rejections for translations
for (const t of translations) {
const level = data.translation_cefr[t.target_lang]?.[t.word];
if (level) {
if (!level) continue;
if (level === "reject") {
// Explicit rejection or silently skipped — both treated as rejection
db.prepare(
`
INSERT INTO model_translation_rejections (translation_id, model_name)
VALUES (?, ?)
ON CONFLICT (translation_id, model_name) DO NOTHING
`,
).run(t.id, modelName);
} else {
insertTranslationCefr.run(t.id, modelName, level);
}
}
@ -389,6 +441,34 @@ function markNeedsReview(
console.warn(` needs_review: entry ${entryId}${reason}`);
}
/**
 * Redraws the single-line progress indicator on stdout.
 *
 * The line starts with a carriage return so each call overwrites the previous
 * one instead of scrolling the terminal.
 *
 * @param processed   Number of entries handled successfully so far.
 * @param needsReview Number of entries flagged for review so far.
 * @param total       Number of entries in this run.
 * @param llmMs       Duration of the most recent LLM call, in milliseconds.
 * @param startTime   Run start as a `Date.now()` timestamp (milliseconds).
 */
function updateProgress(
  processed: number,
  needsReview: number,
  total: number,
  llmMs: number,
  startTime: number,
): void {
  const handled = processed + needsReview;

  // Guard the empty run: 0 / 0 would otherwise be rendered as "NaN%".
  const pct = total > 0 ? ((handled / total) * 100).toFixed(1) : "0.0";

  const elapsed = (Date.now() - startTime) / 1000;
  const rate = elapsed > 0 ? handled / elapsed : 0;

  // Durations are rounded to whole seconds BEFORE being split into minutes and
  // seconds; rounding the seconds part on its own can yield "1m 60s".
  const elapsedSec = Math.max(0, Math.round(elapsed));

  let eta: string;
  if (handled >= total) {
    // Nothing left to do — a finished run must not claim to be "calculating".
    eta = "0s";
  } else if (rate <= 0) {
    // No throughput measured yet, so no estimate is possible.
    eta = "calculating...";
  } else {
    const remainingSec = Math.round((total - handled) / rate);
    eta =
      remainingSec < 60
        ? `${remainingSec}s`
        : `${Math.round(remainingSec / 60)}m`;
  }

  const totalElapsedStr =
    elapsedSec < 60
      ? `${elapsedSec}s`
      : `${Math.floor(elapsedSec / 60)}m ${elapsedSec % 60}s`;

  process.stdout.write(
    `\r ${handled}/${total} (${pct}%) — entry: ${(llmMs / 1000).toFixed(1)}s — total: ${totalElapsedStr} — ETA: ${eta} `,
  );
}
// ── Main enrich function ──────────────────────────────────────────────────────
export async function enrich(
@ -411,7 +491,9 @@ export async function enrich(
.all(provider.name) as { entry_id: number }[];
const processedIds = new Set(processed.map((r) => r.entry_id));
const pending = allEntries.filter((e) => !processedIds.has(e.id));
const pending = allEntries
.filter((e) => !processedIds.has(e.id))
.slice(0, 10);
db.close();
@ -427,6 +509,9 @@ export async function enrich(
let processedCount = 0;
let needsReviewCount = 0;
let llmMs = 0;
const startTime = Date.now();
for (const entry of pending) {
const db2 = openDb();
@ -441,17 +526,26 @@ export async function enrich(
const prompt = buildPrompt(entry, translations);
let raw: string;
try {
const llmStart = Date.now();
raw = await callLlm(prompt, provider);
llmMs = Date.now() - llmStart;
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
markNeedsReview(entry.id, provider.name, `LLM call failed: ${message}`);
needsReviewCount++;
updateProgress(
processedCount,
needsReviewCount,
pending.length,
llmMs,
startTime,
);
continue;
}
const validation = validateResponse(raw, translations);
if (!validation.valid) {
markNeedsReview(
entry.id,
@ -459,19 +553,36 @@ export async function enrich(
`validation failed: ${validation.reason}`,
);
needsReviewCount++;
updateProgress(
processedCount,
needsReviewCount,
pending.length,
llmMs,
startTime,
);
continue;
}
writeResults(entry.id, provider.name, validation.data, translations);
processedCount++;
if (processedCount % 100 === 0) {
console.log(
` Processed ${processedCount.toLocaleString()} entries...`,
);
}
updateProgress(
processedCount,
needsReviewCount,
pending.length,
llmMs,
startTime,
);
}
process.stdout.write("\n");
const totalMs = Date.now() - startTime;
const totalMin = Math.floor(totalMs / 60_000);
const totalSec = Math.round((totalMs % 60_000) / 1000);
console.log(` Total time: ${totalMin}m ${totalSec}s`);
console.log(
` Avg per entry: ${(totalMs / Math.max(processedCount + needsReviewCount, 1) / 1000).toFixed(1)}s`,
);
console.log(` Processed: ${processedCount.toLocaleString()}`);
console.log(` Needs review: ${needsReviewCount.toLocaleString()}`);