feat: update pipeline orchestrator for Kaikki — wire up stages 1 and 2

- Replace checkOmwExists with checkExtractedFilesExist
- Wire up importKaikki and reverseLink as real stage implementations
- Track reverse link completion via sentinel row in run_status
- Update report to use resolved_entry_cefr and entry counts
- Stages 3 onwards remain as stubs
This commit is contained in:
lila 2026-05-05 19:04:28 +02:00
parent 6f9a42c707
commit 1c44ef989b
2 changed files with 92 additions and 41 deletions

View file

@ -2,8 +2,9 @@ import fs from "node:fs/promises";
import path from "node:path"; import path from "node:path";
import { fileURLToPath } from "node:url"; import { fileURLToPath } from "node:url";
import { initDb } from "./db/init.js"; import { initDb } from "./db/init.js";
import { isImported, importStage2 } from "./db/import.js"; import { isImported, importKaikki } from "./db/import.js";
import { openDb } from "./db/index.js"; import { openDb } from "./db/index.js";
import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js";
import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js"; import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js";
import type { ProviderConfig } from "./stage-3-enrich/config.js"; import type { ProviderConfig } from "./stage-3-enrich/config.js";
@ -35,23 +36,23 @@ type RunStats = {
const __dirname = path.dirname(fileURLToPath(import.meta.url)); const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PATHS = { const PATHS = {
omw: path.join(__dirname, "stage-1-extract/output/omw.json"), extractedEn: path.join(__dirname, "stage-1-extract/output/en.json"),
db: path.join(__dirname, "db/pipeline.db"), db: path.join(__dirname, "db/pipeline.db"),
reports: path.join(__dirname, "reports"), reports: path.join(__dirname, "reports"),
llamaHealth: "http://127.0.0.1:8080/health", llamaHealth: "http://127.0.0.1:8080/health",
}; };
const SENTINEL = { sourceId: "system", modelName: "system" }; const SENTINEL = { entryId: 0, modelName: "system" };
// ── Startup checks ──────────────────────────────────────────────────────────── // ── Startup checks ────────────────────────────────────────────────────────────
async function checkOmwExists(): Promise<void> { async function checkExtractedFilesExist(): Promise<void> {
try { try {
await fs.access(PATHS.omw); await fs.access(PATHS.extractedEn);
} catch { } catch {
console.error("\n ERROR: stage-1-extract/output/omw.json not found."); console.error("\n ERROR: stage-1-extract/output/en.json not found.");
console.error(" Run the stage 1 extraction script first:"); console.error(" Run the stage 1 extraction script first:");
console.error(" python stage-1-extract/scripts/extract.py\n"); console.error(" pnpm extract\n");
process.exit(1); process.exit(1);
} }
} }
@ -67,8 +68,8 @@ async function checkAndInitDb(): Promise<void> {
async function checkAndImportDb(): Promise<void> { async function checkAndImportDb(): Promise<void> {
if (!isImported()) { if (!isImported()) {
console.log(" Base tables empty — importing stage 2 data..."); console.log(" Base tables empty — importing Kaikki data...");
await importStage2(); await importKaikki();
} }
} }
@ -132,6 +133,7 @@ function registerShutdownHandler(stats: RunStats): void {
process.on("SIGINT", handler); process.on("SIGINT", handler);
process.on("SIGTERM", handler); process.on("SIGTERM", handler);
} }
// ── Stage status helpers ────────────────────────────────────────────────────── // ── Stage status helpers ──────────────────────────────────────────────────────
function getSentinelStatus(stage: RunStage): StageStatus { function getSentinelStatus(stage: RunStage): StageStatus {
@ -139,9 +141,9 @@ function getSentinelStatus(stage: RunStage): StageStatus {
const row = db const row = db
.prepare( .prepare(
`SELECT status FROM run_status `SELECT status FROM run_status
WHERE source_id = ? AND model_name = ? AND stage = ?`, WHERE entry_id = ? AND model_name = ? AND stage = ?`,
) )
.get(SENTINEL.sourceId, SENTINEL.modelName, stage) as .get(SENTINEL.entryId, SENTINEL.modelName, stage) as
| { status: string } | { status: string }
| undefined; | undefined;
db.close(); db.close();
@ -151,11 +153,11 @@ function getSentinelStatus(stage: RunStage): StageStatus {
function markSentinelComplete(stage: RunStage): void { function markSentinelComplete(stage: RunStage): void {
const db = openDb(); const db = openDb();
db.prepare( db.prepare(
`INSERT INTO run_status (source_id, model_name, stage, status) `INSERT INTO run_status (entry_id, model_name, stage, status)
VALUES (?, ?, ?, 'complete') VALUES (?, ?, ?, 'complete')
ON CONFLICT (source_id, model_name, stage) ON CONFLICT (entry_id, model_name, stage)
DO UPDATE SET status = 'complete', updated_at = datetime('now')`, DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
).run(SENTINEL.sourceId, SENTINEL.modelName, stage); ).run(SENTINEL.entryId, SENTINEL.modelName, stage);
db.close(); db.close();
} }
@ -163,9 +165,9 @@ function getModelRound1Status(modelName: string): StageStatus {
const db = openDb(); const db = openDb();
const total = ( const total = (
db.prepare("SELECT COUNT(*) as count FROM synsets").get() as { db
count: number; .prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'")
} .get() as { count: number }
).count; ).count;
const complete = ( const complete = (
@ -188,9 +190,9 @@ function getModelRound2Status(modelName: string): StageStatus {
const db = openDb(); const db = openDb();
const total = ( const total = (
db.prepare("SELECT COUNT(*) as count FROM synsets").get() as { db
count: number; .prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'")
} .get() as { count: number }
).count; ).count;
const complete = ( const complete = (
@ -209,7 +211,42 @@ function getModelRound2Status(modelName: string): StageStatus {
return "in_progress"; return "in_progress";
} }
// ── Stage runners (stubs) ───────────────────────────────────────────────────── function isReverseLinkDone(): boolean {
const db = openDb();
const row = db
.prepare(
`SELECT status FROM run_status
WHERE entry_id = ? AND model_name = ? AND stage = 'reverse_link'`,
)
.get(SENTINEL.entryId, SENTINEL.modelName) as
| { status: string }
| undefined;
db.close();
return row?.status === "complete";
}
/**
 * Records that the reverse-link stage has finished.
 *
 * Upserts a row into `run_status` for the sentinel entry/model pair with
 * stage `'reverse_link'` and status `'complete'`. If a row with the same
 * (entry_id, model_name, stage) already exists, its status is set to
 * `'complete'` and `updated_at` is refreshed.
 *
 * The database handle is closed in a `finally` block so it is released even
 * when preparing or running the statement throws; the error itself still
 * propagates to the caller.
 */
function markReverseLinkComplete(): void {
  const db = openDb();
  try {
    db.prepare(
      `INSERT INTO run_status (entry_id, model_name, stage, status)
VALUES (?, ?, 'reverse_link', 'complete')
ON CONFLICT (entry_id, model_name, stage)
DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
    ).run(SENTINEL.entryId, SENTINEL.modelName);
  } finally {
    // Always release the connection, including on a failed write.
    db.close();
  }
}
// ── Stage runners ─────────────────────────────────────────────────────────────
/**
 * Runs the reverse-link stage once.
 *
 * When the completion marker is already present the stage only logs that it
 * is being skipped. Otherwise it logs a start message, invokes
 * `reverseLink()`, and then writes the completion marker.
 *
 * NOTE(review): `reverseLink()` is called without `await`; this assumes it is
 * synchronous. If it returns a promise, the marker would be written before
 * the work finishes — confirm against its implementation.
 */
function runReverseLinkStage(): void {
  const alreadyLinked = isReverseLinkDone();

  if (!alreadyLinked) {
    console.log("\n [reverse link] Syncing reverse translation links...");
    reverseLink();
    // Only reached when reverseLink() did not throw.
    markReverseLinkComplete();
    return;
  }

  console.log("\n [reverse link] Already complete, skipping");
}
function runRound1(provider: ProviderConfig, stats: RunStats): void { function runRound1(provider: ProviderConfig, stats: RunStats): void {
console.log(`\n [round 1] Running ${provider.name}...`); console.log(`\n [round 1] Running ${provider.name}...`);
@ -247,7 +284,7 @@ function runMerge(): void {
} }
function runTiebreak(stats: RunStats): void { function runTiebreak(stats: RunStats): void {
console.log("\n [tiebreak] Resolving flagged translations..."); console.log("\n [tiebreak] Resolving flagged entries...");
// TODO: implement tiebreak logic // TODO: implement tiebreak logic
console.log(" [tiebreak] not yet implemented"); console.log(" [tiebreak] not yet implemented");
stats.currentStage = "tiebreak"; stats.currentStage = "tiebreak";
@ -265,19 +302,19 @@ function runCompare(): void {
async function generateReport(runName: string, stats: RunStats): Promise<void> { async function generateReport(runName: string, stats: RunStats): Promise<void> {
const db = openDb(); const db = openDb();
const totalSynsets = ( const totalEntries = (
db.prepare("SELECT COUNT(*) as count FROM synsets").get() as { db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
count: number; count: number;
} }
).count; ).count;
const resolvedTranslations = ( const resolvedEntries = (
db.prepare("SELECT COUNT(*) as count FROM resolved_translations").get() as { db.prepare("SELECT COUNT(*) as count FROM resolved_entry_cefr").get() as {
count: number; count: number;
} }
).count; ).count;
const flaggedTranslations = ( const flaggedEntries = (
db db
.prepare( .prepare(
`SELECT COUNT(*) as count FROM run_status `SELECT COUNT(*) as count FROM run_status
@ -302,7 +339,7 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
const durationMin = Math.round(durationMs / 60_000); const durationMin = Math.round(durationMs / 60_000);
const isFinal = const isFinal =
getSentinelStatus("compare") === "complete" && flaggedTranslations === 0; getSentinelStatus("compare") === "complete" && flaggedEntries === 0;
const report = { const report = {
runName, runName,
@ -310,15 +347,16 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
durationMinutes: durationMin, durationMinutes: durationMin,
isFinal, isFinal,
progress: { progress: {
totalSynsets, totalEntries,
resolvedTranslations, resolvedEntries,
flaggedTranslations, flaggedEntries,
needsReview, needsReview,
recordsProcessedThisRun: stats.recordsProcessed, recordsProcessedThisRun: stats.recordsProcessed,
recordsSkippedThisRun: stats.recordsSkipped, recordsSkippedThisRun: stats.recordsSkipped,
}, },
modelsRun: stats.modelsRun, modelsRun: stats.modelsRun,
stages: { stages: {
reverseLink: isReverseLinkDone() ? "complete" : "pending",
round1: ALL_PROVIDERS.map((p) => ({ round1: ALL_PROVIDERS.map((p) => ({
model: p.name, model: p.name,
status: getModelRound1Status(p.name), status: getModelRound1Status(p.name),
@ -354,15 +392,17 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
``, ``,
`| Metric | Value |`, `| Metric | Value |`,
`| ------ | ----- |`, `| ------ | ----- |`,
`| Total synsets | ${totalSynsets.toLocaleString()} |`, `| Total entries | ${totalEntries.toLocaleString()} |`,
`| Resolved translations | ${resolvedTranslations.toLocaleString()} |`, `| Resolved entries | ${resolvedEntries.toLocaleString()} |`,
`| Flagged translations | ${flaggedTranslations.toLocaleString()} |`, `| Flagged entries | ${flaggedEntries.toLocaleString()} |`,
`| Needs review | ${needsReview.toLocaleString()} |`, `| Needs review | ${needsReview.toLocaleString()} |`,
`| Records processed this run | ${stats.recordsProcessed.toLocaleString()} |`, `| Records processed this run | ${stats.recordsProcessed.toLocaleString()} |`,
`| Records skipped this run | ${stats.recordsSkipped.toLocaleString()} |`, `| Records skipped this run | ${stats.recordsSkipped.toLocaleString()} |`,
``, ``,
`## Stage status`, `## Stage status`,
``, ``,
`### Reverse link: ${report.stages.reverseLink}`,
``,
`### Round 1`, `### Round 1`,
``, ``,
...report.stages.round1.map( ...report.stages.round1.map(
@ -403,7 +443,7 @@ async function main(): Promise<void> {
// ── Startup checks // ── Startup checks
console.log("Checking prerequisites..."); console.log("Checking prerequisites...");
await checkOmwExists(); await checkExtractedFilesExist();
await checkAndInitDb(); await checkAndInitDb();
await checkAndImportDb(); await checkAndImportDb();
console.log(" Prerequisites OK"); console.log(" Prerequisites OK");
@ -425,6 +465,14 @@ async function main(): Promise<void> {
registerShutdownHandler(stats); registerShutdownHandler(stats);
// ── Stage 2 — Reverse link
runReverseLinkStage();
if (shutdownRequested) {
await generateReport(runName, stats);
process.exit(0);
}
// ── Round 1 // ── Round 1
console.log("\nRound 1 — generation"); console.log("\nRound 1 — generation");
for (const provider of ALL_PROVIDERS) { for (const provider of ALL_PROVIDERS) {

View file

@ -314,9 +314,12 @@ These are not part of the current pipeline but are worth considering as the data
## Roadmap ## Roadmap
**Current state:** Production schema migrated to Kaikki flat model. Stage 1 extraction scripts written and sample run complete (500 entries per language). pipeline.db initialised and imported with sample data. Stage 2 reverse link sync not yet written. llama.cpp not installed. **Current state:** Stage 1 extraction and stage 2 reverse link sync scripts
written and verified on sample data. pipeline.db contains 4,156 entries and
4,287 translations across 5 languages. Stage 3 enrich scripts not yet written.
llama.cpp not installed.
**Next action:** Write the stage 2 reverse link sync script. **Next action:** Write the stage 3 enrich script.
| Stage | Status | | Stage | Status |
| --------------- | -------------- | | --------------- | -------------- |
@ -339,11 +342,11 @@ These are not part of the current pipeline but are worth considering as the data
- [ ] Remove sample limit and run full extraction - [ ] Remove sample limit and run full extraction
- [ ] Re-run full import → `pipeline.db` - [ ] Re-run full import → `pipeline.db`
### Stage 2 — Reverse link sync `🔲 not started` ### Stage 2 — Reverse link sync `🔄 in progress`
- [ ] Write reverse link sync script - [x] Write reverse link sync script
- [ ] Write tests - [x] Run reverse link sync on sample data → 141 links inserted
- [ ] Run reverse link sync `pipeline.db` - [ ] Run reverse link sync on full data after full extraction
### Stage 3 — Enrich `🔲 not started` ### Stage 3 — Enrich `🔲 not started`