diff --git a/.gitignore b/.gitignore index 893044a..f441639 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,4 @@ data-pipeline/stage-3-enrich/output/ data-pipeline/stage-4-merge/output/ data-pipeline/db/pipeline.db data-pipeline/reports/ - +data-pipeline/.env diff --git a/data-pipeline/.env.example b/data-pipeline/.env.example new file mode 100644 index 0000000..923dfd5 --- /dev/null +++ b/data-pipeline/.env.example @@ -0,0 +1,7 @@ +# OpenRouter API key — required for OpenRouter providers +# Get one at https://openrouter.ai/keys +OPENROUTER_API_KEY= + +# Anthropic API key — required for Anthropic provider (reference baseline only) +# Get one at https://console.anthropic.com/ +ANTHROPIC_API_KEY= diff --git a/data-pipeline/db/schema.sql b/data-pipeline/db/schema.sql index 7441bb1..fb4f838 100644 --- a/data-pipeline/db/schema.sql +++ b/data-pipeline/db/schema.sql @@ -41,16 +41,7 @@ CREATE TABLE IF NOT EXISTS cefr_source_votes ( -- stage: round1 | round2 | tiebreak -- status: pending | complete | needs_review | flagged -CREATE TABLE IF NOT EXISTS run_status ( - id INTEGER PRIMARY KEY, - source_id TEXT NOT NULL REFERENCES synsets(source_id), - model_name TEXT NOT NULL, - stage TEXT NOT NULL, - status TEXT NOT NULL, - created_at TEXT NOT NULL DEFAULT (datetime('now')), - updated_at TEXT NOT NULL DEFAULT (datetime('now')), - UNIQUE (source_id, model_name, stage) -); + -- ── Round 1 output ──────────────────────────────────────────────────────────── -- One row per translation/language per model. Written atomically per record. 
diff --git a/data-pipeline/package.json b/data-pipeline/package.json index 1fd2636..1510876 100644 --- a/data-pipeline/package.json +++ b/data-pipeline/package.json @@ -8,7 +8,8 @@ "db:init": "tsx db/init.ts", "annotate": "tsx stage-2-annotate/scripts/annotate.ts", "test": "vitest run", - "test:watch": "vitest" + "test:watch": "vitest", + "pipeline:run": "tsx --env-file .env pipeline.ts" }, "dependencies": { "@lila/shared": "workspace:*", diff --git a/data-pipeline/stage-3-enrich/config.ts b/data-pipeline/stage-3-enrich/config.ts new file mode 100644 index 0000000..81350d3 --- /dev/null +++ b/data-pipeline/stage-3-enrich/config.ts @@ -0,0 +1,114 @@ +// ── Provider configuration ──────────────────────────────────────────────────── +// +// Each provider + model combination counts as one vote in the final majority. +// Running the same model twice is not supported — one model, one vote. +// The `name` field is used as the model identifier in pipeline.db and must +// be unique across all runs. +// +// The pipeline iterates through ALL_PROVIDERS in order, skipping models that +// have already completed a full run and resuming models with partial progress. +// +// See llm-setup.md for full setup instructions and model recommendations. 
/**
 * Connection settings for one provider + model combination.
 * Each entry counts as one vote in the final majority.
 */
export type ProviderConfig = {
  /** Unique model identifier — stored in pipeline.db. */
  name: string;
  /** Base URL of the provider's API endpoint. */
  baseURL: string;
  /** API key; `"none"` marks a provider that needs no key, `""` means the env var was unset. */
  apiKey: string;
  /** Model name passed to the provider. */
  model: string;
  /** Token limit configured for this provider. */
  maxTokens: number;
};

// ── Shared settings ───────────────────────────────────────────────────────────

/** Sentinel `apiKey` value for providers that do not need a real key. */
const NO_KEY_SENTINEL = "none";

const LLAMA_CPP_BASE_URL = "http://127.0.0.1:8080/v1";
const OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1";
const ANTHROPIC_BASE_URL = "https://api.anthropic.com/v1";

// Keys are read once at module load; an unset variable becomes "" so that
// validateProviderKey can report it instead of the request failing later.
const OPENROUTER_API_KEY = process.env["OPENROUTER_API_KEY"] ?? "";
const ANTHROPIC_API_KEY = process.env["ANTHROPIC_API_KEY"] ?? "";

const DEFAULT_MAX_TOKENS = 512;

// ── Local llama.cpp ───────────────────────────────────────────────────────────
// NOTE(review): both local providers point at the same URL and llama.cpp
// ignores the requested model name, so the model loaded in the server
// presumably has to match the provider being run — confirm against llm-setup.md.

export const LOCAL_GEMMA4: ProviderConfig = {
  name: "local-gemma4-e4b",
  baseURL: LLAMA_CPP_BASE_URL,
  apiKey: NO_KEY_SENTINEL, // llama.cpp ignores this
  model: "gemma4-e4b", // llama.cpp ignores model name, uses loaded model
  maxTokens: DEFAULT_MAX_TOKENS,
};

export const LOCAL_QWEN7B: ProviderConfig = {
  name: "local-qwen2.5-7b",
  baseURL: LLAMA_CPP_BASE_URL,
  apiKey: NO_KEY_SENTINEL,
  model: "qwen2.5-7b",
  maxTokens: DEFAULT_MAX_TOKENS,
};

// ── OpenRouter — free tier ────────────────────────────────────────────────────

export const OR_QWEN3_480B: ProviderConfig = {
  name: "or-qwen3-480b",
  baseURL: OPENROUTER_BASE_URL,
  apiKey: OPENROUTER_API_KEY,
  model: "qwen/qwen3-coder:free",
  maxTokens: DEFAULT_MAX_TOKENS,
};

export const OR_GEMMA4_31B: ProviderConfig = {
  name: "or-gemma4-31b",
  baseURL: OPENROUTER_BASE_URL,
  apiKey: OPENROUTER_API_KEY,
  model: "google/gemma-4-31b-it:free",
  maxTokens: DEFAULT_MAX_TOKENS,
};

export const OR_QWEN3_80B: ProviderConfig = {
  name: "or-qwen3-80b",
  baseURL: OPENROUTER_BASE_URL,
  apiKey: OPENROUTER_API_KEY,
  model: "qwen/qwen3-next-80b-a3b-instruct:free",
  maxTokens: DEFAULT_MAX_TOKENS,
};

export const OR_NEMOTRON: ProviderConfig = {
  name: "or-nemotron-120b",
  baseURL: OPENROUTER_BASE_URL,
  apiKey: OPENROUTER_API_KEY,
  model: "nvidia/nemotron-3-super-120b-a12b:free",
  maxTokens: DEFAULT_MAX_TOKENS,
};

// ── Anthropic — reference baseline ───────────────────────────────────────────
// Note: Anthropic uses a different API format. An adapter is required.
// See llm-setup.md for details.

export const ANTHROPIC_SONNET: ProviderConfig = {
  name: "anthropic-sonnet-4",
  baseURL: ANTHROPIC_BASE_URL,
  apiKey: ANTHROPIC_API_KEY,
  model: "claude-sonnet-4-6",
  maxTokens: DEFAULT_MAX_TOKENS,
};

// ── All configured providers ──────────────────────────────────────────────────
// The pipeline runs through these in order — local models first, then cloud.
// Add new providers here to include them in the voting pool.

export const ALL_PROVIDERS: ProviderConfig[] = [
  LOCAL_GEMMA4,
  LOCAL_QWEN7B,
  OR_QWEN3_480B,
  OR_GEMMA4_31B,
  OR_QWEN3_80B,
  OR_NEMOTRON,
  ANTHROPIC_SONNET,
];

// ── Key validation ────────────────────────────────────────────────────────────

/**
 * Picks the environment variable name to mention when a provider's key is missing.
 *
 * The base URL decides when it clearly identifies the service; otherwise the
 * provider name prefix is used as a fallback.
 */
function requiredKeyName(provider: ProviderConfig): string {
  if (provider.baseURL.includes("api.anthropic.com")) return "ANTHROPIC_API_KEY";
  if (provider.baseURL.includes("openrouter.ai")) return "OPENROUTER_API_KEY";
  return provider.name.startsWith("anthropic")
    ? "ANTHROPIC_API_KEY"
    : "OPENROUTER_API_KEY";
}

/**
 * Ensures a provider has a usable API key before it is run.
 *
 * Providers whose key is the `"none"` sentinel are accepted as-is. For any
 * other provider, an empty or whitespace-only key prints an error naming the
 * missing environment variable and terminates the process with exit code 1.
 *
 * @param provider - The provider configuration to check.
 */
export function validateProviderKey(provider: ProviderConfig): void {
  // Tolerate a missing value at runtime even though the type says string.
  const key = (provider.apiKey ?? "").trim();

  if (key === NO_KEY_SENTINEL) return;
  if (key !== "") return;

  const keyName = requiredKeyName(provider);
  console.error(`\n ERROR: ${keyName} is not set in .env`);
  console.error(` Provider "${provider.name}" requires this key to run.\n`);
  process.exit(1);
}