From 1c44ef989bb943dcc75a1ca5b3e3354b3a9d4181 Mon Sep 17 00:00:00 2001 From: lila Date: Tue, 5 May 2026 19:04:28 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20update=20pipeline=20orchestrator=20for?= =?UTF-8?q?=20Kaikki=20=E2=80=94=20wire=20up=20stages=201=20and=202?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace checkOmwExists with checkExtractedFilesExist - Wire up importKaikki and reverseLink as real stage implementations - Track reverse link completion via sentinel row in run_status - Update report to use resolved_entry_cefr and entry counts - Stages 3 onwards remain as stubs --- data-pipeline/pipeline.ts | 118 +++++++++++++++++++++++---------- documentation/data-pipeline.md | 15 +++-- 2 files changed, 92 insertions(+), 41 deletions(-) diff --git a/data-pipeline/pipeline.ts b/data-pipeline/pipeline.ts index 43e33c1..5be9660 100644 --- a/data-pipeline/pipeline.ts +++ b/data-pipeline/pipeline.ts @@ -2,8 +2,9 @@ import fs from "node:fs/promises"; import path from "node:path"; import { fileURLToPath } from "node:url"; import { initDb } from "./db/init.js"; -import { isImported, importStage2 } from "./db/import.js"; +import { isImported, importKaikki } from "./db/import.js"; import { openDb } from "./db/index.js"; +import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js"; import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js"; import type { ProviderConfig } from "./stage-3-enrich/config.js"; @@ -35,23 +36,23 @@ type RunStats = { const __dirname = path.dirname(fileURLToPath(import.meta.url)); const PATHS = { - omw: path.join(__dirname, "stage-1-extract/output/omw.json"), + extractedEn: path.join(__dirname, "stage-1-extract/output/en.json"), db: path.join(__dirname, "db/pipeline.db"), reports: path.join(__dirname, "reports"), llamaHealth: "http://127.0.0.1:8080/health", }; -const SENTINEL = { sourceId: "system", modelName: "system" }; +const SENTINEL = { entryId: 0, modelName: "system" }; // ── Startup checks ──────────────────────────────────────────────────────────── -async function checkOmwExists(): Promise { +async function checkExtractedFilesExist(): Promise { try { - await fs.access(PATHS.omw); + await fs.access(PATHS.extractedEn); } catch { - console.error("\n ERROR: stage-1-extract/output/omw.json not found."); + console.error("\n ERROR: stage-1-extract/output/en.json not found."); console.error(" Run the stage 1 extraction script first:"); - console.error(" python stage-1-extract/scripts/extract.py\n"); + console.error(" pnpm extract\n"); process.exit(1); } } @@ -67,8 +68,8 @@ async function checkAndInitDb(): Promise { async function checkAndImportDb(): Promise { if (!isImported()) { - console.log(" Base tables empty — importing stage 2 data..."); - await importStage2(); + console.log(" Base tables empty — importing Kaikki data..."); + await importKaikki(); } } @@ -132,6 +133,7 @@ function registerShutdownHandler(stats: RunStats): void { process.on("SIGINT", handler); process.on("SIGTERM", handler); } + // ── Stage status helpers ────────────────────────────────────────────────────── function getSentinelStatus(stage: RunStage): StageStatus { @@ -139,9 +141,9 @@ function getSentinelStatus(stage: RunStage): StageStatus { const row = db .prepare( `SELECT status FROM run_status - WHERE source_id = ? AND model_name = ? AND stage = ?`, + WHERE entry_id = ? AND model_name = ? AND stage = ?`, ) - .get(SENTINEL.sourceId, SENTINEL.modelName, stage) as + .get(SENTINEL.entryId, SENTINEL.modelName, stage) as | { status: string } | undefined; db.close(); @@ -151,11 +153,11 @@ function getSentinelStatus(stage: RunStage): StageStatus { function markSentinelComplete(stage: RunStage): void { const db = openDb(); db.prepare( - `INSERT INTO run_status (source_id, model_name, stage, status) + `INSERT INTO run_status (entry_id, model_name, stage, status) VALUES (?, ?, ?, 'complete') - ON CONFLICT (source_id, model_name, stage) + ON CONFLICT (entry_id, model_name, stage) DO UPDATE SET status = 'complete', updated_at = datetime('now')`, - ).run(SENTINEL.sourceId, SENTINEL.modelName, stage); + ).run(SENTINEL.entryId, SENTINEL.modelName, stage); db.close(); } @@ -163,9 +165,9 @@ function getModelRound1Status(modelName: string): StageStatus { const db = openDb(); const total = ( - db.prepare("SELECT COUNT(*) as count FROM synsets").get() as { - count: number; - } + db + .prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'") + .get() as { count: number } ).count; const complete = ( @@ -188,9 +190,9 @@ function getModelRound2Status(modelName: string): StageStatus { const db = openDb(); const total = ( - db.prepare("SELECT COUNT(*) as count FROM synsets").get() as { - count: number; - } + db + .prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'") + .get() as { count: number } ).count; const complete = ( @@ -209,7 +211,42 @@ function getModelRound2Status(modelName: string): StageStatus { return "in_progress"; } -// ── Stage runners (stubs) ───────────────────────────────────────────────────── +function isReverseLinkDone(): boolean { + const db = openDb(); + const row = db + .prepare( + `SELECT status FROM run_status + WHERE entry_id = ? AND model_name = ? AND stage = 'reverse_link'`, + ) + .get(SENTINEL.entryId, SENTINEL.modelName) as + | { status: string } + | undefined; + db.close(); + return row?.status === "complete"; +} + +function markReverseLinkComplete(): void { + const db = openDb(); + db.prepare( + `INSERT INTO run_status (entry_id, model_name, stage, status) + VALUES (?, ?, 'reverse_link', 'complete') + ON CONFLICT (entry_id, model_name, stage) + DO UPDATE SET status = 'complete', updated_at = datetime('now')`, + ).run(SENTINEL.entryId, SENTINEL.modelName); + db.close(); +} + +// ── Stage runners ───────────────────────────────────────────────────────────── + +function runReverseLinkStage(): void { + if (isReverseLinkDone()) { + console.log("\n [reverse link] Already complete, skipping"); + return; + } + console.log("\n [reverse link] Syncing reverse translation links..."); + reverseLink(); + markReverseLinkComplete(); +} function runRound1(provider: ProviderConfig, stats: RunStats): void { console.log(`\n [round 1] Running ${provider.name}...`); @@ -247,7 +284,7 @@ function runMerge(): void { } function runTiebreak(stats: RunStats): void { - console.log("\n [tiebreak] Resolving flagged translations..."); + console.log("\n [tiebreak] Resolving flagged entries..."); // TODO: implement tiebreak logic console.log(" [tiebreak] not yet implemented"); stats.currentStage = "tiebreak"; @@ -265,19 +302,19 @@ function runCompare(): void { async function generateReport(runName: string, stats: RunStats): Promise { const db = openDb(); - const totalSynsets = ( - db.prepare("SELECT COUNT(*) as count FROM synsets").get() as { + const totalEntries = ( + db.prepare("SELECT COUNT(*) as count FROM entries").get() as { count: number; } ).count; - const resolvedTranslations = ( - db.prepare("SELECT COUNT(*) as count FROM resolved_translations").get() as { + const resolvedEntries = ( + db.prepare("SELECT COUNT(*) as count FROM resolved_entry_cefr").get() as { count: number; } ).count; - const flaggedTranslations = ( + const flaggedEntries = ( db .prepare( `SELECT COUNT(*) as count FROM run_status @@ -302,7 +339,7 @@ async function generateReport(runName: string, stats: RunStats): Promise { const durationMin = Math.round(durationMs / 60_000); const isFinal = - getSentinelStatus("compare") === "complete" && flaggedTranslations === 0; + getSentinelStatus("compare") === "complete" && flaggedEntries === 0; const report = { runName, @@ -310,15 +347,16 @@ async function generateReport(runName: string, stats: RunStats): Promise { durationMinutes: durationMin, isFinal, progress: { - totalSynsets, - resolvedTranslations, - flaggedTranslations, + totalEntries, + resolvedEntries, + flaggedEntries, needsReview, recordsProcessedThisRun: stats.recordsProcessed, recordsSkippedThisRun: stats.recordsSkipped, }, modelsRun: stats.modelsRun, stages: { + reverseLink: isReverseLinkDone() ? "complete" : "pending", round1: ALL_PROVIDERS.map((p) => ({ model: p.name, status: getModelRound1Status(p.name), @@ -354,15 +392,17 @@ async function generateReport(runName: string, stats: RunStats): Promise { ``, `| Metric | Value |`, `| ------ | ----- |`, - `| Total synsets | ${totalSynsets.toLocaleString()} |`, - `| Resolved translations | ${resolvedTranslations.toLocaleString()} |`, - `| Flagged translations | ${flaggedTranslations.toLocaleString()} |`, + `| Total entries | ${totalEntries.toLocaleString()} |`, + `| Resolved entries | ${resolvedEntries.toLocaleString()} |`, + `| Flagged entries | ${flaggedEntries.toLocaleString()} |`, `| Needs review | ${needsReview.toLocaleString()} |`, `| Records processed this run | ${stats.recordsProcessed.toLocaleString()} |`, `| Records skipped this run | ${stats.recordsSkipped.toLocaleString()} |`, ``, `## Stage status`, ``, + `### Reverse link: ${report.stages.reverseLink}`, + ``, `### Round 1`, ``, ...report.stages.round1.map( @@ -403,7 +443,7 @@ async function main(): Promise { // ── Startup checks console.log("Checking prerequisites..."); - await checkOmwExists(); + await checkExtractedFilesExist(); await checkAndInitDb(); await checkAndImportDb(); console.log(" Prerequisites OK"); @@ -425,6 +465,14 @@ async function main(): Promise { registerShutdownHandler(stats); + // ── Stage 2 — Reverse link + runReverseLinkStage(); + + if (shutdownRequested) { + await generateReport(runName, stats); + process.exit(0); + } + // ── Round 1 console.log("\nRound 1 — generation"); for (const provider of ALL_PROVIDERS) { diff --git a/documentation/data-pipeline.md b/documentation/data-pipeline.md index 9543d7f..88fc779 100644 --- a/documentation/data-pipeline.md +++ b/documentation/data-pipeline.md @@ -314,9 +314,12 @@ These are not part of the current pipeline but are worth considering as the data ## Roadmap -**Current state:** Production schema migrated to Kaikki flat model. Stage 1 extraction scripts written and sample run complete (500 entries per language). pipeline.db initialised and imported with sample data. Stage 2 reverse link sync not yet written. llama.cpp not installed. +**Current state:** Stage 1 extraction and stage 2 reverse link sync scripts +written and verified on sample data. pipeline.db contains 4,156 entries and +4,287 translations across 5 languages. Stage 3 enrich scripts not yet written. +llama.cpp not installed. -**Next action:** Write the stage 2 reverse link sync script. +**Next action:** Write the stage 3 enrich script. | Stage | Status | | --------------- | -------------- | @@ -339,11 +342,11 @@ These are not part of the current pipeline but are worth considering as the data - [ ] Remove sample limit and run full extraction - [ ] Re-run full import → `pipeline.db` -### Stage 2 — Reverse link sync `🔲 not started` +### Stage 2 — Reverse link sync `🔄 in progress` -- [ ] Write reverse link sync script -- [ ] Write tests -- [ ] Run reverse link sync → `pipeline.db` +- [x] Write reverse link sync script +- [x] Run reverse link sync on sample data → 141 links inserted +- [ ] Run reverse link sync on full data after full extraction ### Stage 3 — Enrich `🔲 not started`