feat: enrich script working, redesigning to sub-stage architecture
- Enrich script is functional, with a request timeout, progress tracking, and a translation-rejection mechanism
- Identified an ordering issue: CEFR voting needs validated translations first
- Redesign into sub-stages: round1_gloss → round1_example → round1_translations → round1_cefr
- Update data-pipeline.md with the new sub-stage design and roadmap
- Qwen3.5-4B confirmed working with thinking disabled
This commit is contained in:
parent
7f10c35e03
commit
73fb12ac35
7 changed files with 337 additions and 122 deletions
Binary file not shown.
|
|
@ -60,6 +60,13 @@ CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
|
|||
UNIQUE (translation_id, model_name)
|
||||
);
|
||||
|
||||
-- Records that a given model rejected a given translation.
-- The enrich script inserts a row here when a model answers "reject"
-- instead of a CEFR level for a translation.
CREATE TABLE IF NOT EXISTS model_translation_rejections (
|
||||
id INTEGER PRIMARY KEY,
|
||||
translation_id INTEGER NOT NULL REFERENCES translations(id), -- the rejected translation
|
||||
model_name TEXT NOT NULL, -- name of the model that rejected it
|
||||
UNIQUE (translation_id, model_name) -- at most one rejection per translation per model
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS generated_glosses (
|
||||
id INTEGER PRIMARY KEY,
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
|
|
|
|||
|
|
@ -20,12 +20,20 @@ export type ProviderConfig = {
|
|||
|
||||
// ── Local llama.cpp ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Provider config for Qwen3.5-4B served by a local llama.cpp instance.
 *
 * NOTE(review): shares its baseURL with LOCAL_GEMMA4, whose config notes that
 * llama.cpp ignores the model name and uses the loaded model — presumably
 * the server must have this model loaded when the provider is used; confirm.
 */
export const LOCAL_QWEN35_4B: ProviderConfig = {
|
||||
name: "local-qwen3.5-4b",
|
||||
baseURL: "http://127.0.0.1:8080/v1",
|
||||
apiKey: "none", // placeholder; LOCAL_GEMMA4 notes that llama.cpp ignores this value
|
||||
model: "qwen3.5-4b",
|
||||
maxTokens: 1024, // no reasoning overhead so 1024 is enough
|
||||
};
|
||||
|
||||
export const LOCAL_GEMMA4: ProviderConfig = {
|
||||
name: "local-gemma4-e4b",
|
||||
baseURL: "http://127.0.0.1:8080/v1",
|
||||
apiKey: "none", // llama.cpp ignores this
|
||||
model: "gemma4-e4b", // llama.cpp ignores model name, uses loaded model
|
||||
maxTokens: 512,
|
||||
maxTokens: 2048,
|
||||
};
|
||||
|
||||
export const LOCAL_QWEN7B: ProviderConfig = {
|
||||
|
|
@ -87,13 +95,14 @@ export const ANTHROPIC_SONNET: ProviderConfig = {
|
|||
// Add new providers here to include them in the voting pool.
|
||||
|
||||
export const ALL_PROVIDERS: ProviderConfig[] = [
|
||||
LOCAL_GEMMA4,
|
||||
LOCAL_QWEN7B,
|
||||
OR_QWEN3_480B,
|
||||
OR_GEMMA4_31B,
|
||||
OR_QWEN3_80B,
|
||||
OR_NEMOTRON,
|
||||
ANTHROPIC_SONNET,
|
||||
LOCAL_QWEN35_4B,
|
||||
// LOCAL_GEMMA4,
|
||||
// LOCAL_QWEN7B,
|
||||
// OR_QWEN3_480B,
|
||||
// OR_GEMMA4_31B,
|
||||
// OR_QWEN3_80B,
|
||||
// OR_NEMOTRON,
|
||||
// ANTHROPIC_SONNET,
|
||||
];
|
||||
|
||||
// ── Key validation ────────────────────────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -94,19 +94,68 @@ MISSING TRANSLATIONS: ${missingTranslationsText}
|
|||
|
||||
Respond ONLY with valid JSON and nothing else — no explanation, no markdown:
|
||||
{
|
||||
"headword_cefr": "<level>",
|
||||
"headword_cefr": "B1",
|
||||
"translation_cefr": {
|
||||
"<lang>": { "<word>": "<level>", ... },
|
||||
...
|
||||
"de": { "frei": "A2" },
|
||||
"es": { "libre": "A2" },
|
||||
"fr": { "libre": "A2" },
|
||||
"it": { "libero": "A2" }
|
||||
},
|
||||
"generated_translations": { "<lang>": "<word>", ... },
|
||||
"generated_gloss": "<gloss if needed, omit if existing is fine>",
|
||||
"generated_example": "<example sentence in English if needed, omit if existing is fine>"
|
||||
"generated_translations": { "missing_lang": "word" },
|
||||
"generated_gloss": "A clearer definition for learners.",
|
||||
"generated_example": "A natural example sentence."
|
||||
}
|
||||
|
||||
Only include "generated_translations" if there are missing languages.
|
||||
Only include "generated_gloss" if you judge the existing gloss unsuitable.
|
||||
Only include "generated_example" if you judge the existing examples unsuitable.`;
|
||||
EXAMPLE OF CORRECT BEHAVIOUR:
|
||||
If you receive:
|
||||
WORD: cat
|
||||
EXISTING TRANSLATIONS:
|
||||
it: gatto, cat
|
||||
|
||||
The correct response includes "reject" for "cat" because it is an English word, not Italian:
|
||||
"translation_cefr": {
|
||||
"it": { "gatto": "A1", "cat": "reject" }
|
||||
}
|
||||
|
||||
Similarly, if you receive:
|
||||
EXISTING TRANSLATIONS:
|
||||
de: frei, -frei
|
||||
|
||||
The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
|
||||
"translation_cefr": {
|
||||
"de": { "frei": "A2", "-frei": "reject" }
|
||||
}
|
||||
|
||||
EXAMPLE OF CORRECT BEHAVIOUR:
|
||||
If you receive:
|
||||
WORD: cat
|
||||
EXISTING TRANSLATIONS:
|
||||
it: gatto, cat
|
||||
|
||||
The correct response includes "reject" for "cat" because it is an English word, not Italian:
|
||||
"translation_cefr": {
|
||||
"it": { "gatto": "A1", "cat": "reject" }
|
||||
}
|
||||
|
||||
Similarly, if you receive:
|
||||
EXISTING TRANSLATIONS:
|
||||
de: frei, -frei
|
||||
|
||||
The correct response includes "reject" for "-frei" because it is a suffix, not a standalone word:
|
||||
"translation_cefr": {
|
||||
"de": { "frei": "A2", "-frei": "reject" }
|
||||
}
|
||||
|
||||
IMPORTANT:
|
||||
- You MUST include EVERY translation listed in EXISTING TRANSLATIONS in your response — no exceptions
|
||||
- Use the CEFR level (A1-C2) if the translation is valid for this sense
|
||||
- Use "reject" if the translation does not fit this specific sense, is not a real word in that language, or is clearly bad data
|
||||
- Never silently omit a translation — every word must get either a CEFR level or "reject"
|
||||
- translation_cefr must map each language to an object of word:level pairs
|
||||
- Only include "generated_translations" if MISSING TRANSLATIONS lists languages
|
||||
- Only include "generated_gloss" if you judge the existing gloss unsuitable
|
||||
- Only include "generated_example" if you judge the existing examples unsuitable
|
||||
`;
|
||||
}
|
||||
|
||||
// ── Validation ────────────────────────────────────────────────────────────────
|
||||
|
|
@ -148,30 +197,6 @@ function validateResponse(
|
|||
}
|
||||
|
||||
const translationCefr = obj["translation_cefr"] as Record<string, unknown>;
|
||||
for (const [lang, votes] of Object.entries(translationCefr)) {
|
||||
if (!SUPPORTED_LANG_SET.has(lang)) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `unsupported language in translation_cefr: ${lang}`,
|
||||
};
|
||||
}
|
||||
if (typeof votes !== "object" || votes === null) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `translation_cefr.${lang} is not an object`,
|
||||
};
|
||||
}
|
||||
for (const [word, level] of Object.entries(
|
||||
votes as Record<string, unknown>,
|
||||
)) {
|
||||
if (typeof level !== "string" || !CEFR_SET.has(level)) {
|
||||
return {
|
||||
valid: false,
|
||||
reason: `invalid CEFR level for ${lang}.${word}: ${String(level)}`,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Verify all existing translations have a CEFR vote
|
||||
const byLang = new Map<string, Set<string>>();
|
||||
|
|
@ -199,11 +224,11 @@ function validateResponse(
|
|||
}
|
||||
|
||||
// Optional fields
|
||||
if (obj["generated_translations"] !== undefined) {
|
||||
if (
|
||||
typeof obj["generated_translations"] !== "object" ||
|
||||
obj["generated_translations"] === null
|
||||
) {
|
||||
if (
|
||||
obj["generated_translations"] !== undefined &&
|
||||
obj["generated_translations"] !== null
|
||||
) {
|
||||
if (typeof obj["generated_translations"] !== "object") {
|
||||
return {
|
||||
valid: false,
|
||||
reason: "generated_translations is not an object",
|
||||
|
|
@ -250,19 +275,28 @@ async function callLlm(
|
|||
prompt: string,
|
||||
provider: ProviderConfig,
|
||||
): Promise<string> {
|
||||
const response = await fetch(`${provider.baseURL}/chat/completions`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${provider.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: provider.model,
|
||||
max_tokens: provider.maxTokens,
|
||||
messages: [{ role: "user", content: prompt }],
|
||||
temperature: 0.1, // low temperature for consistent structured output
|
||||
}),
|
||||
});
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), 120_000); // 2 minutes
|
||||
|
||||
let response: Response;
|
||||
try {
|
||||
response = await fetch(`${provider.baseURL}/chat/completions`, {
|
||||
method: "POST",
|
||||
signal: controller.signal,
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${provider.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: provider.model,
|
||||
max_tokens: provider.maxTokens,
|
||||
messages: [{ role: "user", content: prompt }],
|
||||
temperature: 0.1,
|
||||
}),
|
||||
});
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`LLM API error: ${response.status} ${response.statusText}`);
|
||||
|
|
@ -272,10 +306,17 @@ async function callLlm(
|
|||
choices?: { message?: { content?: string } }[];
|
||||
};
|
||||
|
||||
const content = data.choices?.[0]?.message?.content;
|
||||
const content =
|
||||
data.choices?.[0]?.message?.content ||
|
||||
((data.choices?.[0]?.message as Record<string, unknown>)?.[
|
||||
"reasoning_content"
|
||||
] as string | undefined);
|
||||
console.log(
|
||||
"\n DEBUG response:",
|
||||
JSON.stringify(data.choices?.[0]?.message),
|
||||
);
|
||||
if (!content) throw new Error("LLM returned empty response");
|
||||
|
||||
// Strip markdown code fences if present
|
||||
return content
|
||||
.replace(/```json\n?/g, "")
|
||||
.replace(/```\n?/g, "")
|
||||
|
|
@ -333,10 +374,21 @@ function writeResults(
|
|||
// CEFR vote for headword
|
||||
insertEntryCefr.run(entryId, modelName, data.headword_cefr);
|
||||
|
||||
// CEFR votes for translations
|
||||
// CEFR votes and rejections for translations
|
||||
for (const t of translations) {
|
||||
const level = data.translation_cefr[t.target_lang]?.[t.word];
|
||||
if (level) {
|
||||
|
||||
if (!level) continue;
|
||||
if (level === "reject") {
|
||||
// Explicit rejection or silently skipped — both treated as rejection
|
||||
db.prepare(
|
||||
`
|
||||
INSERT INTO model_translation_rejections (translation_id, model_name)
|
||||
VALUES (?, ?)
|
||||
ON CONFLICT (translation_id, model_name) DO NOTHING
|
||||
`,
|
||||
).run(t.id, modelName);
|
||||
} else {
|
||||
insertTranslationCefr.run(t.id, modelName, level);
|
||||
}
|
||||
}
|
||||
|
|
@ -389,6 +441,34 @@ function markNeedsReview(
|
|||
console.warn(` needs_review: entry ${entryId} — ${reason}`);
|
||||
}
|
||||
|
||||
/**
 * Redraws the single-line progress indicator on stdout.
 *
 * The line starts with a carriage return so each call overwrites the previous
 * one. It shows the number of finished entries, the percentage, the duration of
 * the most recent LLM call, the total elapsed time and an ETA.
 *
 * @param processed   Entries written successfully so far.
 * @param needsReview Entries flagged for review so far (also counted as done).
 * @param total       Number of entries in this run.
 * @param llmMs       Duration of the most recent LLM call, in milliseconds.
 * @param startTime   Run start as a `Date.now()` timestamp (milliseconds).
 */
function updateProgress(
  processed: number,
  needsReview: number,
  total: number,
  llmMs: number,
  startTime: number,
): void {
  const totalProcessed = processed + needsReview;

  // Avoid 0/0 → "NaN%" when there is nothing to process.
  const pct = total > 0 ? ((totalProcessed / total) * 100).toFixed(1) : "0.0";

  const elapsed = (Date.now() - startTime) / 1000;
  const rate = elapsed > 0 ? totalProcessed / elapsed : 0;
  const entriesLeft = Math.max(total - totalProcessed, 0);

  // Distinguish "finished" from "no rate yet": previously both produced a
  // remaining time of 0 and the final line read "ETA: calculating...".
  let eta: string;
  if (entriesLeft === 0) {
    eta = "done";
  } else if (rate === 0) {
    eta = "calculating...";
  } else {
    // Round to whole seconds first so 59.6s becomes "1m", not "60s".
    const remainingSec = Math.round(entriesLeft / rate);
    eta =
      remainingSec < 60
        ? `${remainingSec}s`
        : `${Math.round(remainingSec / 60)}m`;
  }

  // Round once, then split: flooring minutes and rounding seconds separately
  // could print "1m 60s".
  const elapsedSec = Math.round(elapsed);
  const totalElapsedStr =
    elapsedSec < 60
      ? `${elapsedSec}s`
      : `${Math.floor(elapsedSec / 60)}m ${elapsedSec % 60}s`;

  process.stdout.write(
    `\r ${totalProcessed}/${total} (${pct}%) — entry: ${(llmMs / 1000).toFixed(1)}s — total: ${totalElapsedStr} — ETA: ${eta} `,
  );
}
|
||||
|
||||
// ── Main enrich function ──────────────────────────────────────────────────────
|
||||
|
||||
export async function enrich(
|
||||
|
|
@ -411,7 +491,9 @@ export async function enrich(
|
|||
.all(provider.name) as { entry_id: number }[];
|
||||
|
||||
const processedIds = new Set(processed.map((r) => r.entry_id));
|
||||
const pending = allEntries.filter((e) => !processedIds.has(e.id));
|
||||
const pending = allEntries
|
||||
.filter((e) => !processedIds.has(e.id))
|
||||
.slice(0, 10);
|
||||
|
||||
db.close();
|
||||
|
||||
|
|
@ -427,6 +509,9 @@ export async function enrich(
|
|||
|
||||
let processedCount = 0;
|
||||
let needsReviewCount = 0;
|
||||
let llmMs = 0;
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
for (const entry of pending) {
|
||||
const db2 = openDb();
|
||||
|
|
@ -441,17 +526,26 @@ export async function enrich(
|
|||
const prompt = buildPrompt(entry, translations);
|
||||
|
||||
let raw: string;
|
||||
|
||||
try {
|
||||
const llmStart = Date.now();
|
||||
raw = await callLlm(prompt, provider);
|
||||
llmMs = Date.now() - llmStart;
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
markNeedsReview(entry.id, provider.name, `LLM call failed: ${message}`);
|
||||
needsReviewCount++;
|
||||
updateProgress(
|
||||
processedCount,
|
||||
needsReviewCount,
|
||||
pending.length,
|
||||
llmMs,
|
||||
startTime,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
const validation = validateResponse(raw, translations);
|
||||
|
||||
if (!validation.valid) {
|
||||
markNeedsReview(
|
||||
entry.id,
|
||||
|
|
@ -459,19 +553,36 @@ export async function enrich(
|
|||
`validation failed: ${validation.reason}`,
|
||||
);
|
||||
needsReviewCount++;
|
||||
updateProgress(
|
||||
processedCount,
|
||||
needsReviewCount,
|
||||
pending.length,
|
||||
llmMs,
|
||||
startTime,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
writeResults(entry.id, provider.name, validation.data, translations);
|
||||
processedCount++;
|
||||
|
||||
if (processedCount % 100 === 0) {
|
||||
console.log(
|
||||
` Processed ${processedCount.toLocaleString()} entries...`,
|
||||
);
|
||||
}
|
||||
updateProgress(
|
||||
processedCount,
|
||||
needsReviewCount,
|
||||
pending.length,
|
||||
llmMs,
|
||||
startTime,
|
||||
);
|
||||
}
|
||||
|
||||
process.stdout.write("\n");
|
||||
const totalMs = Date.now() - startTime;
|
||||
const totalMin = Math.floor(totalMs / 60_000);
|
||||
const totalSec = Math.round((totalMs % 60_000) / 1000);
|
||||
console.log(` Total time: ${totalMin}m ${totalSec}s`);
|
||||
console.log(
|
||||
` Avg per entry: ${(totalMs / Math.max(processedCount + needsReviewCount, 1) / 1000).toFixed(1)}s`,
|
||||
);
|
||||
|
||||
console.log(` Processed: ${processedCount.toLocaleString()}`);
|
||||
console.log(` Needs review: ${needsReviewCount.toLocaleString()}`);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue