feat: enrich stage foundation — provider config, env setup, schema fix
- Remove foreign key on run_status.source_id to support sentinel rows for tracking one-time pipeline steps (compile_candidates, compile_votes, merge, compare)
- Add stage-3-enrich/config.ts with all provider configurations, ALL_PROVIDERS ordered local-first, and validateProviderKey() for startup key checks
- Add .env.example with the required API keys for OpenRouter and Anthropic
- Add pipeline:run script to package.json using --env-file .env
- Add data-pipeline/.env to the root .gitignore
This commit is contained in:
parent
4d42fe4397
commit
080fad1998
5 changed files with 125 additions and 12 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -17,4 +17,4 @@ data-pipeline/stage-3-enrich/output/
|
||||||
data-pipeline/stage-4-merge/output/
|
data-pipeline/stage-4-merge/output/
|
||||||
data-pipeline/db/pipeline.db
|
data-pipeline/db/pipeline.db
|
||||||
data-pipeline/reports/
|
data-pipeline/reports/
|
||||||
|
data-pipeline/.env
|
||||||
|
|
|
||||||
7
data-pipeline/.env.example
Normal file
7
data-pipeline/.env.example
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
# OpenRouter API key — required for OpenRouter providers
|
||||||
|
# Get one at https://openrouter.ai/keys
|
||||||
|
OPENROUTER_API_KEY=
|
||||||
|
|
||||||
|
# Anthropic API key — required for Anthropic provider (reference baseline only)
|
||||||
|
# Get one at https://console.anthropic.com/
|
||||||
|
ANTHROPIC_API_KEY=
|
||||||
|
|
@ -41,16 +41,7 @@ CREATE TABLE IF NOT EXISTS cefr_source_votes (
|
||||||
-- stage: round1 | round2 | tiebreak
|
-- stage: round1 | round2 | tiebreak
|
||||||
-- status: pending | complete | needs_review | flagged
|
-- status: pending | complete | needs_review | flagged
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS run_status (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
|
||||||
model_name TEXT NOT NULL,
|
|
||||||
stage TEXT NOT NULL,
|
|
||||||
status TEXT NOT NULL,
|
|
||||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
||||||
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
||||||
UNIQUE (source_id, model_name, stage)
|
|
||||||
);
|
|
||||||
|
|
||||||
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
||||||
-- One row per translation/language per model. Written atomically per record.
|
-- One row per translation/language per model. Written atomically per record.
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,8 @@
|
||||||
"db:init": "tsx db/init.ts",
|
"db:init": "tsx db/init.ts",
|
||||||
"annotate": "tsx stage-2-annotate/scripts/annotate.ts",
|
"annotate": "tsx stage-2-annotate/scripts/annotate.ts",
|
||||||
"test": "vitest run",
|
"test": "vitest run",
|
||||||
"test:watch": "vitest"
|
"test:watch": "vitest",
|
||||||
|
"pipeline:run": "tsx --env-file .env pipeline.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@lila/shared": "workspace:*",
|
"@lila/shared": "workspace:*",
|
||||||
|
|
|
||||||
114
data-pipeline/stage-3-enrich/config.ts
Normal file
114
data-pipeline/stage-3-enrich/config.ts
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
// ── Provider configuration ────────────────────────────────────────────────────
|
||||||
|
//
|
||||||
|
// Each provider + model combination counts as one vote in the final majority.
|
||||||
|
// Running the same model twice is not supported — one model, one vote.
|
||||||
|
// The `name` field is used as the model identifier in pipeline.db and must
|
||||||
|
// be unique across all runs.
|
||||||
|
//
|
||||||
|
// The pipeline iterates through ALL_PROVIDERS in order, skipping models that
|
||||||
|
// have already completed a full run and resuming models with partial progress.
|
||||||
|
//
|
||||||
|
// See llm-setup.md for full setup instructions and model recommendations.
|
||||||
|
|
||||||
|
/**
 * Settings for one provider + model combination.
 *
 * Every exported provider constant in this file is an instance of this shape.
 */
export type ProviderConfig = {
  /** Unique model identifier — stored in pipeline.db. */
  name: string;
  /** Root URL of the provider's API (the values in this file all end in `/v1`). */
  baseURL: string;
  /** Credential for the provider; local providers in this file use the placeholder "none". */
  apiKey: string;
  /** Model name passed to the provider. */
  model: string;
  /** Token limit for this provider — presumably a per-response cap; confirm against the client code. */
  maxTokens: number;
};
|
||||||
|
|
||||||
|
// ── Local llama.cpp ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Gemma 4 E4B served by a local llama.cpp instance at 127.0.0.1:8080.
 *
 * No real credential is needed: `apiKey` is the placeholder "none".
 */
export const LOCAL_GEMMA4: ProviderConfig = {
  name: "local-gemma4-e4b",
  baseURL: "http://127.0.0.1:8080/v1",
  // llama.cpp ignores the API key.
  apiKey: "none",
  // llama.cpp ignores the model name and uses whichever model is loaded.
  model: "gemma4-e4b",
  maxTokens: 512,
};
|
||||||
|
|
||||||
|
/**
 * Qwen 2.5 7B served by a local llama.cpp instance at 127.0.0.1:8080.
 *
 * NOTE(review): this is the same endpoint as LOCAL_GEMMA4 — presumably the
 * operator swaps the loaded model between runs; confirm, otherwise both
 * entries would query the same model.
 */
export const LOCAL_QWEN7B: ProviderConfig = {
  name: "local-qwen2.5-7b",
  baseURL: "http://127.0.0.1:8080/v1",
  // Placeholder value; no real key is configured for the local server.
  apiKey: "none",
  model: "qwen2.5-7b",
  maxTokens: 512,
};
|
||||||
|
|
||||||
|
// ── OpenRouter — free tier ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** API root shared by every OpenRouter provider below. */
const OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1";

/**
 * Builds the config for one OpenRouter-hosted model.
 *
 * The key is read from the OPENROUTER_API_KEY environment variable at the
 * moment of the call (i.e. at module load) and falls back to "" when unset.
 *
 * @param name  Unique model identifier stored in pipeline.db.
 * @param model OpenRouter model slug.
 * @returns A provider config with the shared OpenRouter URL and a 512-token limit.
 */
function openRouterProvider(name: string, model: string): ProviderConfig {
  return {
    name,
    baseURL: OPENROUTER_BASE_URL,
    apiKey: process.env["OPENROUTER_API_KEY"] ?? "",
    model,
    maxTokens: 512,
  };
}

/** OpenRouter: `qwen/qwen3-coder:free`. */
export const OR_QWEN3_480B: ProviderConfig = openRouterProvider(
  "or-qwen3-480b",
  "qwen/qwen3-coder:free",
);

/** OpenRouter: `google/gemma-4-31b-it:free`. */
export const OR_GEMMA4_31B: ProviderConfig = openRouterProvider(
  "or-gemma4-31b",
  "google/gemma-4-31b-it:free",
);

/** OpenRouter: `qwen/qwen3-next-80b-a3b-instruct:free`. */
export const OR_QWEN3_80B: ProviderConfig = openRouterProvider(
  "or-qwen3-80b",
  "qwen/qwen3-next-80b-a3b-instruct:free",
);

/** OpenRouter: `nvidia/nemotron-3-super-120b-a12b:free`. */
export const OR_NEMOTRON: ProviderConfig = openRouterProvider(
  "or-nemotron-120b",
  "nvidia/nemotron-3-super-120b-a12b:free",
);
|
||||||
|
|
||||||
|
// ── Anthropic — reference baseline ───────────────────────────────────────────
|
||||||
|
// Note: Anthropic uses a different API format. An adapter is required.
|
||||||
|
// See llm-setup.md for details.
|
||||||
|
|
||||||
|
/**
 * Claude Sonnet via Anthropic's own API, used as the reference baseline.
 *
 * Anthropic uses a different API format from the other providers, so an
 * adapter is required; see llm-setup.md.
 */
export const ANTHROPIC_SONNET: ProviderConfig = {
  name: "anthropic-sonnet-4",
  baseURL: "https://api.anthropic.com/v1",
  // Read once at module load; "" when ANTHROPIC_API_KEY is unset.
  apiKey: process.env["ANTHROPIC_API_KEY"] ?? "",
  model: "claude-sonnet-4-6",
  maxTokens: 512,
};
|
||||||
|
|
||||||
|
// ── All configured providers ──────────────────────────────────────────────────
|
||||||
|
// The pipeline runs through these in order — local models first, then cloud.
|
||||||
|
// Add new providers here to include them in the voting pool.
|
||||||
|
|
||||||
|
/**
 * Every provider in the voting pool, in the order the pipeline runs them:
 * the two local llama.cpp models, then the OpenRouter models, then Anthropic.
 */
export const ALL_PROVIDERS: ProviderConfig[] = [
  // Local llama.cpp
  LOCAL_GEMMA4,
  LOCAL_QWEN7B,
  // OpenRouter — free tier
  OR_QWEN3_480B,
  OR_GEMMA4_31B,
  OR_QWEN3_80B,
  OR_NEMOTRON,
  // Anthropic — reference baseline
  ANTHROPIC_SONNET,
];
|
||||||
|
|
||||||
|
// ── Key validation ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Placeholder `apiKey` values that mark a provider as needing no real key.
 *
 * Despite the name, this set holds key placeholders (e.g. "none" as used by
 * the local llama.cpp configs), not provider names.
 */
const LOCAL_PROVIDERS = new Set(["none"]);

/**
 * Startup check that a provider has a usable API key.
 *
 * Returns silently when the key is a local placeholder or any non-blank
 * string. When the key is missing — empty or whitespace-only — it prints the
 * name of the environment variable to set and terminates the process with
 * exit code 1.
 *
 * @param provider The provider configuration to check.
 */
export function validateProviderKey(provider: ProviderConfig): void {
  // Trim first so a stray space in .env (e.g. `KEY= `) counts as unset
  // instead of slipping through as a truthy string.
  const apiKey = (provider.apiKey ?? "").trim();

  if (LOCAL_PROVIDERS.has(apiKey)) return;
  if (apiKey !== "") return;

  // Pick the variable to report. The name prefix is the existing convention;
  // the base URL is checked as well so an Anthropic-hosted provider with a
  // differently-styled name is not told to set the OpenRouter key.
  const isAnthropic =
    provider.name.startsWith("anthropic") ||
    provider.baseURL.includes("anthropic.com");
  const keyName = isAnthropic ? "ANTHROPIC_API_KEY" : "OPENROUTER_API_KEY";

  console.error(`\n  ERROR: ${keyName} is not set in .env`);
  console.error(`  Provider "${provider.name}" requires this key to run.\n`);
  process.exit(1);
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue