feat: enrich stage foundation — provider config, env setup, schema fix

- Remove foreign key on run_status.source_id to support sentinel rows
  for tracking one-time pipeline steps (compile_candidates, compile_votes,
  merge, compare)
- Add stage-3-enrich/config.ts with all provider configurations,
  ALL_PROVIDERS ordered local-first, and validateProviderKey() for
  startup key checks
- Add .env.example with required API keys for OpenRouter and Anthropic
- Add pipeline:run script to package.json using --env-file .env
- Ignore data-pipeline/.env in the root .gitignore so API keys are not committed
This commit is contained in:
lila 2026-05-03 22:44:14 +02:00
parent 4d42fe4397
commit 080fad1998
5 changed files with 125 additions and 12 deletions

2
.gitignore vendored
View file

@ -17,4 +17,4 @@ data-pipeline/stage-3-enrich/output/
data-pipeline/stage-4-merge/output/
data-pipeline/db/pipeline.db
data-pipeline/reports/
data-pipeline/.env

View file

@ -0,0 +1,7 @@
# OpenRouter API key — required for OpenRouter providers
# Get one at https://openrouter.ai/keys
OPENROUTER_API_KEY=
# Anthropic API key — required for Anthropic provider (reference baseline only)
# Get one at https://console.anthropic.com/
ANTHROPIC_API_KEY=

View file

@ -41,16 +41,7 @@ CREATE TABLE IF NOT EXISTS cefr_source_votes (
-- stage: round1 | round2 | tiebreak
-- status: pending | complete | needs_review | flagged
-- Tracks one status value per (source_id, model_name, stage) combination;
-- the UNIQUE constraint at the end of the table enforces that.
CREATE TABLE IF NOT EXISTS run_status (
id INTEGER PRIMARY KEY,
-- NOTE(review): the commit message says the foreign key on source_id was
-- removed to allow sentinel rows for one-time pipeline steps; this line may
-- be the pre-change side of the diff — confirm against the actual schema.
source_id TEXT NOT NULL REFERENCES synsets(source_id),
model_name TEXT NOT NULL,
stage TEXT NOT NULL,
status TEXT NOT NULL,
-- Both timestamps default to the current time at insert.
created_at TEXT NOT NULL DEFAULT (datetime('now')),
-- Nothing in this statement refreshes updated_at on UPDATE; presumably the
-- writer sets it explicitly — TODO confirm.
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE (source_id, model_name, stage)
);
-- ── Round 1 output ────────────────────────────────────────────────────────────
-- One row per translation/language per model. Written atomically per record.

View file

@ -8,7 +8,8 @@
"db:init": "tsx db/init.ts",
"annotate": "tsx stage-2-annotate/scripts/annotate.ts",
"test": "vitest run",
"test:watch": "vitest"
"test:watch": "vitest",
"pipeline:run": "tsx --env-file .env pipeline.ts"
},
"dependencies": {
"@lila/shared": "workspace:*",

View file

@ -0,0 +1,114 @@
// ── Provider configuration ────────────────────────────────────────────────────
//
// Each provider + model combination counts as one vote in the final majority.
// Running the same model twice is not supported — one model, one vote.
// The `name` field is used as the model identifier in pipeline.db and must
// be unique across all runs.
//
// The pipeline iterates through ALL_PROVIDERS in order, skipping models that
// have already completed a full run and resuming models with partial progress.
//
// See llm-setup.md for full setup instructions and model recommendations.
/**
 * Connection settings for one provider + model combination.
 */
export type ProviderConfig = {
  /** Unique model identifier — stored in pipeline.db. */
  name: string;

  /** Root URL of the provider's HTTP API. */
  baseURL: string;

  /**
   * API key for the provider. Local llama.cpp entries in this file use the
   * placeholder `"none"`; cloud entries read theirs from the environment.
   */
  apiKey: string;

  /** Model name passed to the provider. */
  model: string;

  /** Token limit for this provider (presumably per response — confirm against the caller). */
  maxTokens: number;
};
// ── Local llama.cpp ───────────────────────────────────────────────────────────
/**
 * Gemma 4 (E4B) served by a local llama.cpp instance on 127.0.0.1:8080.
 *
 * llama.cpp ignores both `apiKey` and `model`: it answers with whichever
 * model is currently loaded, so those two values are placeholders.
 */
export const LOCAL_GEMMA4: ProviderConfig = {
  name: "local-gemma4-e4b",
  baseURL: "http://127.0.0.1:8080/v1",
  // Placeholder — no key is needed locally.
  apiKey: "none",
  // Placeholder — the loaded model is used regardless of this name.
  model: "gemma4-e4b",
  maxTokens: 512,
};
/**
 * Qwen 2.5 7B served by a local llama.cpp instance on 127.0.0.1:8080.
 *
 * Uses the `"none"` placeholder key, like the other local provider.
 */
export const LOCAL_QWEN7B: ProviderConfig = {
  name: "local-qwen2.5-7b",
  baseURL: "http://127.0.0.1:8080/v1",
  // Placeholder — no key is needed locally.
  apiKey: "none",
  model: "qwen2.5-7b",
  maxTokens: 512,
};
// ── OpenRouter — free tier ────────────────────────────────────────────────────
/**
 * Qwen3 Coder on OpenRouter's free tier.
 *
 * The key is read from `OPENROUTER_API_KEY` once, when this module loads;
 * an unset variable becomes the empty string.
 */
export const OR_QWEN3_480B: ProviderConfig = {
  name: "or-qwen3-480b",
  baseURL: "https://openrouter.ai/api/v1",
  // Empty string when the variable is unset.
  apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
  model: "qwen/qwen3-coder:free",
  maxTokens: 512,
};
/**
 * Gemma 4 31B (instruction-tuned) on OpenRouter's free tier.
 *
 * The key is read from `OPENROUTER_API_KEY` once, when this module loads;
 * an unset variable becomes the empty string.
 */
export const OR_GEMMA4_31B: ProviderConfig = {
  name: "or-gemma4-31b",
  baseURL: "https://openrouter.ai/api/v1",
  // Empty string when the variable is unset.
  apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
  model: "google/gemma-4-31b-it:free",
  maxTokens: 512,
};
/**
 * Qwen3 Next 80B (A3B, instruct) on OpenRouter's free tier.
 *
 * The key is read from `OPENROUTER_API_KEY` once, when this module loads;
 * an unset variable becomes the empty string.
 */
export const OR_QWEN3_80B: ProviderConfig = {
  name: "or-qwen3-80b",
  baseURL: "https://openrouter.ai/api/v1",
  // Empty string when the variable is unset.
  apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
  model: "qwen/qwen3-next-80b-a3b-instruct:free",
  maxTokens: 512,
};
/**
 * NVIDIA Nemotron 3 Super 120B (A12B) on OpenRouter's free tier.
 *
 * The key is read from `OPENROUTER_API_KEY` once, when this module loads;
 * an unset variable becomes the empty string.
 */
export const OR_NEMOTRON: ProviderConfig = {
  name: "or-nemotron-120b",
  baseURL: "https://openrouter.ai/api/v1",
  // Empty string when the variable is unset.
  apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
  model: "nvidia/nemotron-3-super-120b-a12b:free",
  maxTokens: 512,
};
// ── Anthropic — reference baseline ───────────────────────────────────────────
// Note: Anthropic uses a different API format. An adapter is required.
// See llm-setup.md for details.
/**
 * Claude Sonnet via the Anthropic API.
 *
 * The key is read from `ANTHROPIC_API_KEY` once, when this module loads;
 * an unset variable becomes the empty string.
 */
export const ANTHROPIC_SONNET: ProviderConfig = {
  name: "anthropic-sonnet-4",
  baseURL: "https://api.anthropic.com/v1",
  // Empty string when the variable is unset.
  apiKey: process.env["ANTHROPIC_API_KEY"] ?? "",
  model: "claude-sonnet-4-6",
  maxTokens: 512,
};
// ── All configured providers ──────────────────────────────────────────────────
// The pipeline runs through these in order — local models first, then cloud.
// Add new providers here to include them in the voting pool.
/**
 * Every configured provider, in the order they are listed for the pipeline:
 * the two local llama.cpp models, then the OpenRouter models, then Anthropic.
 */
export const ALL_PROVIDERS: ProviderConfig[] = [
  // Local llama.cpp
  LOCAL_GEMMA4,
  LOCAL_QWEN7B,

  // OpenRouter
  OR_QWEN3_480B,
  OR_GEMMA4_31B,
  OR_QWEN3_80B,
  OR_NEMOTRON,

  // Anthropic
  ANTHROPIC_SONNET,
];
// ── Key validation ────────────────────────────────────────────────────────────
/**
 * `apiKey` placeholders that mean "this provider needs no real key".
 * The local llama.cpp configs in this file use `"none"`.
 */
const KEYLESS_API_KEYS: ReadonlySet<string> = new Set(["none"]);

/**
 * Startup check that a provider has a usable API key.
 *
 * Returns normally when the provider uses a keyless placeholder or has a
 * non-blank key. Otherwise it prints which environment variable is missing
 * to stderr and terminates the process with exit code 1.
 *
 * @param provider - The provider configuration to check.
 */
export function validateProviderKey(provider: ProviderConfig): void {
  // Trim so a whitespace-only value counts as missing rather than as a key.
  const apiKey = provider.apiKey.trim();

  if (KEYLESS_API_KEYS.has(apiKey)) return;
  if (apiKey !== "") return;

  // The endpoint decides which key is needed; the name prefix is kept as a
  // fallback so existing "anthropic-*" entries report exactly as before.
  const usesAnthropic =
    provider.name.startsWith("anthropic") ||
    provider.baseURL.includes("anthropic.com");
  const keyName = usesAnthropic ? "ANTHROPIC_API_KEY" : "OPENROUTER_API_KEY";

  console.error(`\n ERROR: ${keyName} is not set in .env`);
  console.error(` Provider "${provider.name}" requires this key to run.\n`);
  process.exit(1);
}