feat: enrich stage foundation — provider config, env setup, schema fix

- Remove foreign key on run_status.source_id (this commit drops the run_status CREATE TABLE block from db/schema.sql) to support sentinel rows for tracking one-time pipeline steps (compile_candidates, compile_votes, merge, compare)
- Add stage-3-enrich/config.ts with all provider configurations, ALL_PROVIDERS ordered local-first, and validateProviderKey() for startup key checks
- Add .env.example with required API keys for OpenRouter and Anthropic
- Add pipeline:run script to package.json using --env-file .env
- Add data-pipeline/.env to the root .gitignore
2026-05-03 22:44:14 +02:00 · 2026-05-03 22:44:14 +02:00 · 080fad1998
commit 080fad1998
parent 4d42fe4397
5 changed files with 125 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@ -17,4 +17,4 @@ data-pipeline/stage-3-enrich/output/
 data-pipeline/stage-4-merge/output/
 data-pipeline/db/pipeline.db
 data-pipeline/reports/
-
+data-pipeline/.env
--- a/data-pipeline/.env.example
+++ b/data-pipeline/.env.example
@ -0,0 +1,7 @@
+# OpenRouter API key — required for OpenRouter providers
+# Get one at https://openrouter.ai/keys
+OPENROUTER_API_KEY=
+
+# Anthropic API key — required for Anthropic provider (reference baseline only)
+# Get one at https://console.anthropic.com/
+ANTHROPIC_API_KEY=
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@ -41,16 +41,7 @@ CREATE TABLE IF NOT EXISTS cefr_source_votes (
 -- stage:  round1 | round2 | tiebreak
 -- status: pending | complete | needs_review | flagged

-CREATE TABLE IF NOT EXISTS run_status (
-  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
-  model_name TEXT    NOT NULL,
-  stage      TEXT    NOT NULL,
-  status     TEXT    NOT NULL,
-  created_at TEXT    NOT NULL DEFAULT (datetime('now')),
-  updated_at TEXT    NOT NULL DEFAULT (datetime('now')),
-  UNIQUE (source_id, model_name, stage)
-);
+

 -- ── Round 1 output ────────────────────────────────────────────────────────────
 -- One row per translation/language per model. Written atomically per record.
--- a/data-pipeline/package.json
+++ b/data-pipeline/package.json
@ -8,7 +8,8 @@
    "db:init": "tsx db/init.ts",
    "annotate": "tsx stage-2-annotate/scripts/annotate.ts",
    "test": "vitest run",
-    "test:watch": "vitest"
+    "test:watch": "vitest",
+    "pipeline:run": "tsx --env-file .env pipeline.ts"
  },
  "dependencies": {
    "@lila/shared": "workspace:*",
--- a/data-pipeline/stage-3-enrich/config.ts
+++ b/data-pipeline/stage-3-enrich/config.ts
@ -0,0 +1,114 @@
+// ── Provider configuration ────────────────────────────────────────────────────
+//
+// Each provider + model combination counts as one vote in the final majority.
+// Running the same model twice is not supported — one model, one vote.
+// The `name` field is used as the model identifier in pipeline.db and must
+// be unique across all runs.
+//
+// The pipeline iterates through ALL_PROVIDERS in order, skipping models that
+// have already completed a full run and resuming models with partial progress.
+//
+// See llm-setup.md for full setup instructions and model recommendations.
+
+export type ProviderConfig = { // shape shared by every provider entry declared in this file
+  name: string; // unique model identifier — stored in pipeline.db
+  baseURL: string; // root URL of the provider's API; every entry in this file ends in "/v1"
+  apiKey: string; // "none" for the local entries; env-derived otherwise, "" when the env var is unset
+  model: string; // provider-side model id — presumably sent with each request; confirm against the caller
+  maxTokens: number; // presumably the per-response token cap — TODO confirm against the caller
+};
+
+// ── Local llama.cpp ───────────────────────────────────────────────────────────
+
+export const LOCAL_GEMMA4: ProviderConfig = {
+  name: "local-gemma4-e4b",
+  baseURL: "http://127.0.0.1:8080/v1", // loopback address, port 8080
+  apiKey: "none", // llama.cpp ignores this; "none" is also the value validateProviderKey() returns early on
+  model: "gemma4-e4b", // llama.cpp ignores model name, uses loaded model
+  maxTokens: 512,
+};
+
+export const LOCAL_QWEN7B: ProviderConfig = {
+  name: "local-qwen2.5-7b",
+  baseURL: "http://127.0.0.1:8080/v1", // NOTE(review): same endpoint as LOCAL_GEMMA4 — presumably the server must be restarted with this model loaded before its run; confirm
+  apiKey: "none", // sentinel value: validateProviderKey() returns early for it
+  model: "qwen2.5-7b", // presumably ignored by llama.cpp, as noted for LOCAL_GEMMA4's model name — confirm
+  maxTokens: 512,
+};
+
+// ── OpenRouter — free tier ────────────────────────────────────────────────────
+
+export const OR_QWEN3_480B: ProviderConfig = {
+  name: "or-qwen3-480b", // NOTE(review): "480b" appears only here, not in the model slug below — confirm they refer to the same model
+  baseURL: "https://openrouter.ai/api/v1",
+  apiKey: process.env["OPENROUTER_API_KEY"] ?? "", // read once when this module is evaluated; "" when the variable is unset
+  model: "qwen/qwen3-coder:free",
+  maxTokens: 512,
+};
+
+export const OR_GEMMA4_31B: ProviderConfig = {
+  name: "or-gemma4-31b",
+  baseURL: "https://openrouter.ai/api/v1",
+  apiKey: process.env["OPENROUTER_API_KEY"] ?? "", // read once when this module is evaluated; "" when the variable is unset
+  model: "google/gemma-4-31b-it:free", // ":free" suffix — listed under the free-tier section of this file
+  maxTokens: 512,
+};
+
+export const OR_QWEN3_80B: ProviderConfig = {
+  name: "or-qwen3-80b",
+  baseURL: "https://openrouter.ai/api/v1",
+  apiKey: process.env["OPENROUTER_API_KEY"] ?? "", // read once when this module is evaluated; "" when the variable is unset
+  model: "qwen/qwen3-next-80b-a3b-instruct:free", // ":free" suffix — listed under the free-tier section of this file
+  maxTokens: 512,
+};
+
+export const OR_NEMOTRON: ProviderConfig = {
+  name: "or-nemotron-120b",
+  baseURL: "https://openrouter.ai/api/v1",
+  apiKey: process.env["OPENROUTER_API_KEY"] ?? "", // read once when this module is evaluated; "" when the variable is unset
+  model: "nvidia/nemotron-3-super-120b-a12b:free", // ":free" suffix — listed under the free-tier section of this file
+  maxTokens: 512,
+};
+
+// ── Anthropic — reference baseline ───────────────────────────────────────────
+// Note: Anthropic uses a different API format. An adapter is required.
+// See llm-setup.md for details.
+
+export const ANTHROPIC_SONNET: ProviderConfig = {
+  name: "anthropic-sonnet-4", // the "anthropic" prefix is what validateProviderKey() checks to report ANTHROPIC_API_KEY as the missing key
+  baseURL: "https://api.anthropic.com/v1",
+  apiKey: process.env["ANTHROPIC_API_KEY"] ?? "", // read once when this module is evaluated; "" when the variable is unset
+  model: "claude-sonnet-4-6", // NOTE(review): name says "sonnet-4" while the model id ends in "4-6" — confirm the stored identifier is intended
+  maxTokens: 512,
+};
+
+// ── All configured providers ──────────────────────────────────────────────────
+// The pipeline runs through these in order — local models first, then cloud.
+// Add new providers here to include them in the voting pool.
+
+export const ALL_PROVIDERS: ProviderConfig[] = [ // order is significant: the file header says the pipeline iterates this list in order
+  LOCAL_GEMMA4, // local llama.cpp
+  LOCAL_QWEN7B, // local llama.cpp
+  OR_QWEN3_480B, // OpenRouter
+  OR_GEMMA4_31B, // OpenRouter
+  OR_QWEN3_80B, // OpenRouter
+  OR_NEMOTRON, // OpenRouter
+  ANTHROPIC_SONNET, // Anthropic — reference baseline
+];
+
+// ── Key validation ────────────────────────────────────────────────────────────
+
+const LOCAL_PROVIDERS = new Set(["none"]); // NOTE: despite the name, this holds sentinel apiKey values, not provider names
+/** Returns when the provider's apiKey is a local sentinel or non-empty; otherwise logs the missing env var and exits with code 1. */
+export function validateProviderKey(provider: ProviderConfig): void {
+  if (LOCAL_PROVIDERS.has(provider.apiKey)) return; // apiKey is the "none" sentinel — nothing to validate
+  // An empty apiKey is what the env-backed configs in this file fall back to when the variable is unset.
+  if (!provider.apiKey) {
+    const keyName = provider.name.startsWith("anthropic") // env var name is inferred from the provider name prefix
+      ? "ANTHROPIC_API_KEY"
+      : "OPENROUTER_API_KEY"; // any name not starting with "anthropic" is reported as an OpenRouter key
+    console.error(`\n  ERROR: ${keyName} is not set in .env`);
+    console.error(`  Provider "${provider.name}" requires this key to run.\n`);
+    process.exit(1); // terminates the process with exit code 1
+  }
+}