feat: update pipeline orchestrator for Kaikki — wire up stages 1 and 2
- Replace checkOmwExists with checkExtractedFilesExist
- Wire up importKaikki and reverseLink as real stage implementations
- Track reverse link completion via sentinel row in run_status
- Update report to use resolved_entry_cefr and entry counts
- Stages 3 onwards remain as stubs
This commit is contained in:
parent
6f9a42c707
commit
1c44ef989b
2 changed files with 92 additions and 41 deletions
|
|
@ -2,8 +2,9 @@ import fs from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import { fileURLToPath } from "node:url";
|
import { fileURLToPath } from "node:url";
|
||||||
import { initDb } from "./db/init.js";
|
import { initDb } from "./db/init.js";
|
||||||
import { isImported, importStage2 } from "./db/import.js";
|
import { isImported, importKaikki } from "./db/import.js";
|
||||||
import { openDb } from "./db/index.js";
|
import { openDb } from "./db/index.js";
|
||||||
|
import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js";
|
||||||
import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js";
|
import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js";
|
||||||
import type { ProviderConfig } from "./stage-3-enrich/config.js";
|
import type { ProviderConfig } from "./stage-3-enrich/config.js";
|
||||||
|
|
||||||
|
|
@ -35,23 +36,23 @@ type RunStats = {
|
||||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
const PATHS = {
|
const PATHS = {
|
||||||
omw: path.join(__dirname, "stage-1-extract/output/omw.json"),
|
extractedEn: path.join(__dirname, "stage-1-extract/output/en.json"),
|
||||||
db: path.join(__dirname, "db/pipeline.db"),
|
db: path.join(__dirname, "db/pipeline.db"),
|
||||||
reports: path.join(__dirname, "reports"),
|
reports: path.join(__dirname, "reports"),
|
||||||
llamaHealth: "http://127.0.0.1:8080/health",
|
llamaHealth: "http://127.0.0.1:8080/health",
|
||||||
};
|
};
|
||||||
|
|
||||||
const SENTINEL = { sourceId: "system", modelName: "system" };
|
const SENTINEL = { entryId: 0, modelName: "system" };
|
||||||
|
|
||||||
// ── Startup checks ────────────────────────────────────────────────────────────
|
// ── Startup checks ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
async function checkOmwExists(): Promise<void> {
|
async function checkExtractedFilesExist(): Promise<void> {
|
||||||
try {
|
try {
|
||||||
await fs.access(PATHS.omw);
|
await fs.access(PATHS.extractedEn);
|
||||||
} catch {
|
} catch {
|
||||||
console.error("\n ERROR: stage-1-extract/output/omw.json not found.");
|
console.error("\n ERROR: stage-1-extract/output/en.json not found.");
|
||||||
console.error(" Run the stage 1 extraction script first:");
|
console.error(" Run the stage 1 extraction script first:");
|
||||||
console.error(" python stage-1-extract/scripts/extract.py\n");
|
console.error(" pnpm extract\n");
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -67,8 +68,8 @@ async function checkAndInitDb(): Promise<void> {
|
||||||
|
|
||||||
async function checkAndImportDb(): Promise<void> {
|
async function checkAndImportDb(): Promise<void> {
|
||||||
if (!isImported()) {
|
if (!isImported()) {
|
||||||
console.log(" Base tables empty — importing stage 2 data...");
|
console.log(" Base tables empty — importing Kaikki data...");
|
||||||
await importStage2();
|
await importKaikki();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -132,6 +133,7 @@ function registerShutdownHandler(stats: RunStats): void {
|
||||||
process.on("SIGINT", handler);
|
process.on("SIGINT", handler);
|
||||||
process.on("SIGTERM", handler);
|
process.on("SIGTERM", handler);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Stage status helpers ──────────────────────────────────────────────────────
|
// ── Stage status helpers ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
function getSentinelStatus(stage: RunStage): StageStatus {
|
function getSentinelStatus(stage: RunStage): StageStatus {
|
||||||
|
|
@ -139,9 +141,9 @@ function getSentinelStatus(stage: RunStage): StageStatus {
|
||||||
const row = db
|
const row = db
|
||||||
.prepare(
|
.prepare(
|
||||||
`SELECT status FROM run_status
|
`SELECT status FROM run_status
|
||||||
WHERE source_id = ? AND model_name = ? AND stage = ?`,
|
WHERE entry_id = ? AND model_name = ? AND stage = ?`,
|
||||||
)
|
)
|
||||||
.get(SENTINEL.sourceId, SENTINEL.modelName, stage) as
|
.get(SENTINEL.entryId, SENTINEL.modelName, stage) as
|
||||||
| { status: string }
|
| { status: string }
|
||||||
| undefined;
|
| undefined;
|
||||||
db.close();
|
db.close();
|
||||||
|
|
@ -151,11 +153,11 @@ function getSentinelStatus(stage: RunStage): StageStatus {
|
||||||
function markSentinelComplete(stage: RunStage): void {
|
function markSentinelComplete(stage: RunStage): void {
|
||||||
const db = openDb();
|
const db = openDb();
|
||||||
db.prepare(
|
db.prepare(
|
||||||
`INSERT INTO run_status (source_id, model_name, stage, status)
|
`INSERT INTO run_status (entry_id, model_name, stage, status)
|
||||||
VALUES (?, ?, ?, 'complete')
|
VALUES (?, ?, ?, 'complete')
|
||||||
ON CONFLICT (source_id, model_name, stage)
|
ON CONFLICT (entry_id, model_name, stage)
|
||||||
DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
|
DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
|
||||||
).run(SENTINEL.sourceId, SENTINEL.modelName, stage);
|
).run(SENTINEL.entryId, SENTINEL.modelName, stage);
|
||||||
db.close();
|
db.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -163,9 +165,9 @@ function getModelRound1Status(modelName: string): StageStatus {
|
||||||
const db = openDb();
|
const db = openDb();
|
||||||
|
|
||||||
const total = (
|
const total = (
|
||||||
db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
|
db
|
||||||
count: number;
|
.prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'")
|
||||||
}
|
.get() as { count: number }
|
||||||
).count;
|
).count;
|
||||||
|
|
||||||
const complete = (
|
const complete = (
|
||||||
|
|
@ -188,9 +190,9 @@ function getModelRound2Status(modelName: string): StageStatus {
|
||||||
const db = openDb();
|
const db = openDb();
|
||||||
|
|
||||||
const total = (
|
const total = (
|
||||||
db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
|
db
|
||||||
count: number;
|
.prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'")
|
||||||
}
|
.get() as { count: number }
|
||||||
).count;
|
).count;
|
||||||
|
|
||||||
const complete = (
|
const complete = (
|
||||||
|
|
@ -209,7 +211,42 @@ function getModelRound2Status(modelName: string): StageStatus {
|
||||||
return "in_progress";
|
return "in_progress";
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Stage runners (stubs) ─────────────────────────────────────────────────────
|
function isReverseLinkDone(): boolean {
|
||||||
|
const db = openDb();
|
||||||
|
const row = db
|
||||||
|
.prepare(
|
||||||
|
`SELECT status FROM run_status
|
||||||
|
WHERE entry_id = ? AND model_name = ? AND stage = 'reverse_link'`,
|
||||||
|
)
|
||||||
|
.get(SENTINEL.entryId, SENTINEL.modelName) as
|
||||||
|
| { status: string }
|
||||||
|
| undefined;
|
||||||
|
db.close();
|
||||||
|
return row?.status === "complete";
|
||||||
|
}
|
||||||
|
|
||||||
|
function markReverseLinkComplete(): void {
|
||||||
|
const db = openDb();
|
||||||
|
db.prepare(
|
||||||
|
`INSERT INTO run_status (entry_id, model_name, stage, status)
|
||||||
|
VALUES (?, ?, 'reverse_link', 'complete')
|
||||||
|
ON CONFLICT (entry_id, model_name, stage)
|
||||||
|
DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
|
||||||
|
).run(SENTINEL.entryId, SENTINEL.modelName);
|
||||||
|
db.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Stage runners ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
function runReverseLinkStage(): void {
|
||||||
|
if (isReverseLinkDone()) {
|
||||||
|
console.log("\n [reverse link] Already complete, skipping");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
console.log("\n [reverse link] Syncing reverse translation links...");
|
||||||
|
reverseLink();
|
||||||
|
markReverseLinkComplete();
|
||||||
|
}
|
||||||
|
|
||||||
function runRound1(provider: ProviderConfig, stats: RunStats): void {
|
function runRound1(provider: ProviderConfig, stats: RunStats): void {
|
||||||
console.log(`\n [round 1] Running ${provider.name}...`);
|
console.log(`\n [round 1] Running ${provider.name}...`);
|
||||||
|
|
@ -247,7 +284,7 @@ function runMerge(): void {
|
||||||
}
|
}
|
||||||
|
|
||||||
function runTiebreak(stats: RunStats): void {
|
function runTiebreak(stats: RunStats): void {
|
||||||
console.log("\n [tiebreak] Resolving flagged translations...");
|
console.log("\n [tiebreak] Resolving flagged entries...");
|
||||||
// TODO: implement tiebreak logic
|
// TODO: implement tiebreak logic
|
||||||
console.log(" [tiebreak] not yet implemented");
|
console.log(" [tiebreak] not yet implemented");
|
||||||
stats.currentStage = "tiebreak";
|
stats.currentStage = "tiebreak";
|
||||||
|
|
@ -265,19 +302,19 @@ function runCompare(): void {
|
||||||
async function generateReport(runName: string, stats: RunStats): Promise<void> {
|
async function generateReport(runName: string, stats: RunStats): Promise<void> {
|
||||||
const db = openDb();
|
const db = openDb();
|
||||||
|
|
||||||
const totalSynsets = (
|
const totalEntries = (
|
||||||
db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
|
db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
|
||||||
count: number;
|
count: number;
|
||||||
}
|
}
|
||||||
).count;
|
).count;
|
||||||
|
|
||||||
const resolvedTranslations = (
|
const resolvedEntries = (
|
||||||
db.prepare("SELECT COUNT(*) as count FROM resolved_translations").get() as {
|
db.prepare("SELECT COUNT(*) as count FROM resolved_entry_cefr").get() as {
|
||||||
count: number;
|
count: number;
|
||||||
}
|
}
|
||||||
).count;
|
).count;
|
||||||
|
|
||||||
const flaggedTranslations = (
|
const flaggedEntries = (
|
||||||
db
|
db
|
||||||
.prepare(
|
.prepare(
|
||||||
`SELECT COUNT(*) as count FROM run_status
|
`SELECT COUNT(*) as count FROM run_status
|
||||||
|
|
@ -302,7 +339,7 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
|
||||||
const durationMin = Math.round(durationMs / 60_000);
|
const durationMin = Math.round(durationMs / 60_000);
|
||||||
|
|
||||||
const isFinal =
|
const isFinal =
|
||||||
getSentinelStatus("compare") === "complete" && flaggedTranslations === 0;
|
getSentinelStatus("compare") === "complete" && flaggedEntries === 0;
|
||||||
|
|
||||||
const report = {
|
const report = {
|
||||||
runName,
|
runName,
|
||||||
|
|
@ -310,15 +347,16 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
|
||||||
durationMinutes: durationMin,
|
durationMinutes: durationMin,
|
||||||
isFinal,
|
isFinal,
|
||||||
progress: {
|
progress: {
|
||||||
totalSynsets,
|
totalEntries,
|
||||||
resolvedTranslations,
|
resolvedEntries,
|
||||||
flaggedTranslations,
|
flaggedEntries,
|
||||||
needsReview,
|
needsReview,
|
||||||
recordsProcessedThisRun: stats.recordsProcessed,
|
recordsProcessedThisRun: stats.recordsProcessed,
|
||||||
recordsSkippedThisRun: stats.recordsSkipped,
|
recordsSkippedThisRun: stats.recordsSkipped,
|
||||||
},
|
},
|
||||||
modelsRun: stats.modelsRun,
|
modelsRun: stats.modelsRun,
|
||||||
stages: {
|
stages: {
|
||||||
|
reverseLink: isReverseLinkDone() ? "complete" : "pending",
|
||||||
round1: ALL_PROVIDERS.map((p) => ({
|
round1: ALL_PROVIDERS.map((p) => ({
|
||||||
model: p.name,
|
model: p.name,
|
||||||
status: getModelRound1Status(p.name),
|
status: getModelRound1Status(p.name),
|
||||||
|
|
@ -354,15 +392,17 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
|
||||||
``,
|
``,
|
||||||
`| Metric | Value |`,
|
`| Metric | Value |`,
|
||||||
`| ------ | ----- |`,
|
`| ------ | ----- |`,
|
||||||
`| Total synsets | ${totalSynsets.toLocaleString()} |`,
|
`| Total entries | ${totalEntries.toLocaleString()} |`,
|
||||||
`| Resolved translations | ${resolvedTranslations.toLocaleString()} |`,
|
`| Resolved entries | ${resolvedEntries.toLocaleString()} |`,
|
||||||
`| Flagged translations | ${flaggedTranslations.toLocaleString()} |`,
|
`| Flagged entries | ${flaggedEntries.toLocaleString()} |`,
|
||||||
`| Needs review | ${needsReview.toLocaleString()} |`,
|
`| Needs review | ${needsReview.toLocaleString()} |`,
|
||||||
`| Records processed this run | ${stats.recordsProcessed.toLocaleString()} |`,
|
`| Records processed this run | ${stats.recordsProcessed.toLocaleString()} |`,
|
||||||
`| Records skipped this run | ${stats.recordsSkipped.toLocaleString()} |`,
|
`| Records skipped this run | ${stats.recordsSkipped.toLocaleString()} |`,
|
||||||
``,
|
``,
|
||||||
`## Stage status`,
|
`## Stage status`,
|
||||||
``,
|
``,
|
||||||
|
`### Reverse link: ${report.stages.reverseLink}`,
|
||||||
|
``,
|
||||||
`### Round 1`,
|
`### Round 1`,
|
||||||
``,
|
``,
|
||||||
...report.stages.round1.map(
|
...report.stages.round1.map(
|
||||||
|
|
@ -403,7 +443,7 @@ async function main(): Promise<void> {
|
||||||
|
|
||||||
// ── Startup checks
|
// ── Startup checks
|
||||||
console.log("Checking prerequisites...");
|
console.log("Checking prerequisites...");
|
||||||
await checkOmwExists();
|
await checkExtractedFilesExist();
|
||||||
await checkAndInitDb();
|
await checkAndInitDb();
|
||||||
await checkAndImportDb();
|
await checkAndImportDb();
|
||||||
console.log(" Prerequisites OK");
|
console.log(" Prerequisites OK");
|
||||||
|
|
@ -425,6 +465,14 @@ async function main(): Promise<void> {
|
||||||
|
|
||||||
registerShutdownHandler(stats);
|
registerShutdownHandler(stats);
|
||||||
|
|
||||||
|
// ── Stage 2 — Reverse link
|
||||||
|
runReverseLinkStage();
|
||||||
|
|
||||||
|
if (shutdownRequested) {
|
||||||
|
await generateReport(runName, stats);
|
||||||
|
process.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
// ── Round 1
|
// ── Round 1
|
||||||
console.log("\nRound 1 — generation");
|
console.log("\nRound 1 — generation");
|
||||||
for (const provider of ALL_PROVIDERS) {
|
for (const provider of ALL_PROVIDERS) {
|
||||||
|
|
|
||||||
|
|
@ -314,9 +314,12 @@ These are not part of the current pipeline but are worth considering as the data
|
||||||
|
|
||||||
## Roadmap
|
## Roadmap
|
||||||
|
|
||||||
**Current state:** Production schema migrated to Kaikki flat model. Stage 1 extraction scripts written and sample run complete (500 entries per language). pipeline.db initialised and imported with sample data. Stage 2 reverse link sync not yet written. llama.cpp not installed.
|
**Current state:** Stage 1 extraction and stage 2 reverse link sync scripts
|
||||||
|
written and verified on sample data. pipeline.db contains 4,156 entries and
|
||||||
|
4,287 translations across 5 languages. Stage 3 enrich scripts not yet written.
|
||||||
|
llama.cpp not installed.
|
||||||
|
|
||||||
**Next action:** Write the stage 2 reverse link sync script.
|
**Next action:** Write the stage 3 enrich script.
|
||||||
|
|
||||||
| Stage | Status |
|
| Stage | Status |
|
||||||
| --------------- | -------------- |
|
| --------------- | -------------- |
|
||||||
|
|
@ -339,11 +342,11 @@ These are not part of the current pipeline but are worth considering as the data
|
||||||
- [ ] Remove sample limit and run full extraction
|
- [ ] Remove sample limit and run full extraction
|
||||||
- [ ] Re-run full import → `pipeline.db`
|
- [ ] Re-run full import → `pipeline.db`
|
||||||
|
|
||||||
### Stage 2 — Reverse link sync `🔲 not started`
|
### Stage 2 — Reverse link sync `🔄 in progress`
|
||||||
|
|
||||||
- [ ] Write reverse link sync script
|
- [x] Write reverse link sync script
|
||||||
- [ ] Write tests
|
- [x] Run reverse link sync on sample data → 141 links inserted
|
||||||
- [ ] Run reverse link sync → `pipeline.db`
|
- [ ] Run reverse link sync on full data after full extraction
|
||||||
|
|
||||||
### Stage 3 — Enrich `🔲 not started`
|
### Stage 3 — Enrich `🔲 not started`
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue