feat: enrich stage foundation — provider config, env setup, schema fix

- Remove foreign key on run_status.source_id to support sentinel rows for tracking one-time pipeline steps (compile_candidates, compile_votes, merge, compare)
- Add stage-3-enrich/config.ts with all provider configurations, ALL_PROVIDERS ordered local-first, and validateProviderKey() for startup key checks
- Add .env.example with required API keys for OpenRouter and Anthropic
- Add pipeline:run script to package.json using --env-file .env
- Add data-pipeline/.env to the root .gitignore
2026-05-03 22:44:14 +02:00 · 2026-05-03 22:44:14 +02:00 · 080fad1998
commit 080fad1998
parent 4d42fe4397
5 changed files with 125 additions and 12 deletions
--- a/.gitignore
+++ b/.gitignore
@ -17,4 +17,4 @@ data-pipeline/stage-3-enrich/output/
 data-pipeline/stage-4-merge/output/
 data-pipeline/db/pipeline.db
 data-pipeline/reports/
-
+data-pipeline/.env
--- a/data-pipeline/.env.example
+++ b/data-pipeline/.env.example
@ -0,0 +1,7 @@
 # OpenRouter API key — required for OpenRouter providers
 # Get one at https://openrouter.ai/keys
 OPENROUTER_API_KEY=
 # Anthropic API key — required for Anthropic provider (reference baseline only)
 # Get one at https://console.anthropic.com/
 ANTHROPIC_API_KEY=
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@ -41,16 +41,7 @@ CREATE TABLE IF NOT EXISTS cefr_source_votes (
 -- stage:  round1 | round2 | tiebreak
 -- status: pending | complete | needs_review | flagged
-CREATE TABLE IF NOT EXISTS run_status (
+
  id         INTEGER PRIMARY KEY,
  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
  model_name TEXT    NOT NULL,
  stage      TEXT    NOT NULL,
  status     TEXT    NOT NULL,
  created_at TEXT    NOT NULL DEFAULT (datetime('now')),
  updated_at TEXT    NOT NULL DEFAULT (datetime('now')),
  UNIQUE (source_id, model_name, stage)
 );
 -- ── Round 1 output ────────────────────────────────────────────────────────────
 -- One row per translation/language per model. Written atomically per record.
--- a/data-pipeline/package.json
+++ b/data-pipeline/package.json
@ -8,7 +8,8 @@
    "db:init": "tsx db/init.ts",
    "annotate": "tsx stage-2-annotate/scripts/annotate.ts",
    "test": "vitest run",
-    "test:watch": "vitest"
+    "test:watch": "vitest",
    "pipeline:run": "tsx --env-file .env pipeline.ts"
  },
  "dependencies": {
    "@lila/shared": "workspace:*",
--- a/data-pipeline/stage-3-enrich/config.ts
+++ b/data-pipeline/stage-3-enrich/config.ts
@ -0,0 +1,114 @@
 // ── Provider configuration ────────────────────────────────────────────────────
 //
 // Each provider + model combination counts as one vote in the final majority.
 // Running the same model twice is not supported — one model, one vote.
 // The `name` field is used as the model identifier in pipeline.db and must
 // be unique across all runs.
 //
 // The pipeline iterates through ALL_PROVIDERS in order, skipping models that
 // have already completed a full run and resuming models with partial progress.
 //
 // See llm-setup.md for full setup instructions and model recommendations.
 /**
  * Connection settings for one provider + model combination.
  * Every provider constant in this file is declared with this shape.
  */
 export type ProviderConfig = {
  /** Unique model identifier — stored in pipeline.db. */
  name: string;
  /** Root URL of the provider's API; every config in this file ends it with `/v1`. */
  baseURL: string;
  /**
   * API key for the provider. Local llama.cpp configs use the placeholder
   * "none"; cloud configs read the key from the environment and fall back
   * to "" when it is unset.
   */
  apiKey: string;
  /** Provider-side model id (llama.cpp ignores it and uses the loaded model). */
  model: string;
  /** NOTE(review): presumably the per-request completion token cap — confirm against the request code. */
  maxTokens: number;
 };
 // ── Local llama.cpp ───────────────────────────────────────────────────────────
 // Both local configs target the same llama.cpp server on the loopback interface.
 const LLAMA_CPP_BASE_URL = "http://127.0.0.1:8080/v1";
 // Placeholder key — llama.cpp ignores the value, but the field must be a string.
 const LLAMA_CPP_API_KEY = "none";
 // NOTE(review): the two configs below share one endpoint and llama.cpp answers
 // with whichever model is loaded, regardless of `model`. Presumably the server
 // is restarted with the matching model between runs — confirm.
 /** Local Gemma 4 (E4B) served by llama.cpp. */
 export const LOCAL_GEMMA4: ProviderConfig = {
  name: "local-gemma4-e4b",
  baseURL: LLAMA_CPP_BASE_URL,
  apiKey: LLAMA_CPP_API_KEY,
  model: "gemma4-e4b", // informational only: llama.cpp uses the loaded model
  maxTokens: 512,
 };
 /** Local Qwen 2.5 7B served by llama.cpp. */
 export const LOCAL_QWEN7B: ProviderConfig = {
  name: "local-qwen2.5-7b",
  baseURL: LLAMA_CPP_BASE_URL,
  apiKey: LLAMA_CPP_API_KEY,
  model: "qwen2.5-7b",
  maxTokens: 512,
 };
 // ── OpenRouter — free tier ────────────────────────────────────────────────────
 // Shared by every OpenRouter config below.
 const OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1";
 // Read once at module load; "" when the variable is unset so that
 // validateProviderKey() can report the missing key at startup.
 const OPENROUTER_API_KEY = process.env["OPENROUTER_API_KEY"] ?? "";
 /** OpenRouter: qwen/qwen3-coder (free tier). */
 export const OR_QWEN3_480B: ProviderConfig = {
  name: "or-qwen3-480b",
  baseURL: OPENROUTER_BASE_URL,
  apiKey: OPENROUTER_API_KEY,
  model: "qwen/qwen3-coder:free",
  maxTokens: 512,
 };
 /** OpenRouter: google/gemma-4-31b-it (free tier). */
 export const OR_GEMMA4_31B: ProviderConfig = {
  name: "or-gemma4-31b",
  baseURL: OPENROUTER_BASE_URL,
  apiKey: OPENROUTER_API_KEY,
  model: "google/gemma-4-31b-it:free",
  maxTokens: 512,
 };
 /** OpenRouter: qwen/qwen3-next-80b-a3b-instruct (free tier). */
 export const OR_QWEN3_80B: ProviderConfig = {
  name: "or-qwen3-80b",
  baseURL: OPENROUTER_BASE_URL,
  apiKey: OPENROUTER_API_KEY,
  model: "qwen/qwen3-next-80b-a3b-instruct:free",
  maxTokens: 512,
 };
 /** OpenRouter: nvidia/nemotron-3-super-120b-a12b (free tier). */
 export const OR_NEMOTRON: ProviderConfig = {
  name: "or-nemotron-120b",
  baseURL: OPENROUTER_BASE_URL,
  apiKey: OPENROUTER_API_KEY,
  model: "nvidia/nemotron-3-super-120b-a12b:free",
  maxTokens: 512,
 };
 // ── Anthropic — reference baseline ───────────────────────────────────────────
 // Note: Anthropic uses a different API format. An adapter is required.
 // See llm-setup.md for details.
 /**
  * Anthropic Claude Sonnet config, used as the reference baseline.
  *
  * Anthropic uses a different API format from the other providers, so
  * requests built from this config need an adapter (see llm-setup.md).
  *
  * NOTE(review): `name` says "sonnet-4" while `model` is "claude-sonnet-4-6" —
  * confirm the stored identifier is intentionally less specific than the model id.
  */
 export const ANTHROPIC_SONNET: ProviderConfig = {
  // validateProviderKey() picks ANTHROPIC_API_KEY for its error message only
  // when the name starts with "anthropic" — keep that prefix.
  name: "anthropic-sonnet-4",
  baseURL: "https://api.anthropic.com/v1",
  // Falls back to "" when the variable is unset so startup validation can flag it.
  apiKey: process.env["ANTHROPIC_API_KEY"] ?? "",
  model: "claude-sonnet-4-6",
  maxTokens: 512,
 };
 // ── All configured providers ──────────────────────────────────────────────────
 // The pipeline runs through these in order — local models first, then cloud.
 // Add new providers here to include them in the voting pool.
 /**
  * Every provider in the voting pool, in the order the pipeline runs them:
  * the two local llama.cpp configs, then the four OpenRouter models, then
  * Anthropic. Array order is the execution order, so insert new providers at
  * the position where they should run.
  */
 export const ALL_PROVIDERS: ProviderConfig[] = [
  // Local llama.cpp
  LOCAL_GEMMA4,
  LOCAL_QWEN7B,
  // OpenRouter — free tier
  OR_QWEN3_480B,
  OR_GEMMA4_31B,
  OR_QWEN3_80B,
  OR_NEMOTRON,
  // Anthropic — reference baseline
  ANTHROPIC_SONNET,
 ];
 // ── Key validation ────────────────────────────────────────────────────────────
 // Sentinel apiKey values that mark a provider as needing no real key.
 // Despite the name, this set holds apiKey placeholders, not provider names:
 // the local llama.cpp configs use "none" because the server ignores the key.
 const LOCAL_PROVIDERS = new Set(["none"]);
 /**
  * Startup guard that aborts the process when a provider has no usable API key.
  *
  * Returns normally when the provider's apiKey is the keyless sentinel "none"
  * or contains at least one non-whitespace character. Otherwise prints which
  * environment variable is missing and terminates with exit code 1.
  *
  * The variable named in the message is inferred from the provider name:
  * names starting with "anthropic" map to ANTHROPIC_API_KEY, everything else
  * is assumed to be an OpenRouter provider.
  *
  * @param provider - Provider configuration whose `apiKey` is checked.
  */
 export function validateProviderKey(provider: ProviderConfig): void {
  if (LOCAL_PROVIDERS.has(provider.apiKey)) return;
  // A whitespace-only value (e.g. `OPENROUTER_API_KEY=" "` in .env) is truthy
  // but unusable; treat it as unset so the failure surfaces here instead of
  // as an authentication error on the first request.
  if (provider.apiKey && provider.apiKey.trim() !== "") return;
  const keyName = provider.name.startsWith("anthropic")
    ? "ANTHROPIC_API_KEY"
    : "OPENROUTER_API_KEY";
  console.error(`\n  ERROR: ${keyName} is not set in .env`);
  console.error(`  Provider "${provider.name}" requires this key to run.\n`);
  process.exit(1);
 }