-- Change notes (commit message captured with this file):
--   - Remove foreign key on run_status.source_id to support sentinel rows for
--     tracking one-time pipeline steps (compile_candidates, compile_votes,
--     merge, compare).
--   - Add stage-3-enrich/config.ts with all provider configurations,
--     ALL_PROVIDERS ordered local-first, and validateProviderKey() for
--     startup key checks.
--   - Add .env.example with required API keys for OpenRouter and Anthropic.
--   - Add pipeline:run script to package.json using --env-file .env.
--   - Add .env to root .gitignore coverage for data-pipeline/.env.
-- (Viewer metadata at capture time: 148 lines, 5.4 KiB, SQL.)
-- ── Base data ─────────────────────────────────────────────────────────────────
-- Imported from stage 2 JSON on first run. Never mutated after import.
|
|
|
|
-- synsets: one row per synset, keyed by source_id. The source_id columns of
-- the other tables in this schema reference this key.
-- pos is a required text tag (presumably part of speech — confirm against
-- the stage 2 importer).
CREATE TABLE IF NOT EXISTS synsets (
    source_id TEXT,
    pos       TEXT NOT NULL,
    PRIMARY KEY (source_id)
);
|
|
|
|
-- translations: one word, in one language, attached to a synset.
-- A given (source_id, language, word) combination can be stored only once.
CREATE TABLE IF NOT EXISTS translations (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL,
    language  TEXT    NOT NULL,
    word      TEXT    NOT NULL,
    UNIQUE (source_id, language, word),
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
|
|
|
-- glosses: gloss text for a synset in a given language.
-- No uniqueness rule is declared here, so the schema itself permits several
-- gloss rows for the same synset and language.
CREATE TABLE IF NOT EXISTS glosses (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL,
    language  TEXT    NOT NULL,
    text      TEXT    NOT NULL,
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);

-- Tables that declare UNIQUE (source_id, ...) get an implicit index that leads
-- with the foreign-key column; this table declares none, so lookups by synset
-- and foreign-key checks against synsets would otherwise scan the whole table.
CREATE INDEX IF NOT EXISTS glosses_source_id_idx
    ON glosses (source_id);
|
|
|
|
-- examples: example text for a synset in a given language.
-- source is a required text label; its allowed values are not defined in this
-- table (presumably the origin of the example — confirm against the importer).
-- No uniqueness rule is declared, so the schema permits several rows per
-- synset and language.
CREATE TABLE IF NOT EXISTS examples (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL,
    language  TEXT    NOT NULL,
    text      TEXT    NOT NULL,
    source    TEXT    NOT NULL,
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);

-- No UNIQUE constraint leads with source_id here, so the foreign-key column
-- has no implicit index; add one so per-synset lookups and foreign-key checks
-- do not scan the whole table.
CREATE INDEX IF NOT EXISTS examples_source_id_idx
    ON examples (source_id);
|
|
|
|
-- cefr_source_votes: a cefr_level recorded against a translation.
-- translation_id is unique, so each translation has at most one row here.
CREATE TABLE IF NOT EXISTS cefr_source_votes (
    id             INTEGER PRIMARY KEY,
    translation_id INTEGER NOT NULL UNIQUE,
    cefr_level     TEXT    NOT NULL,
    FOREIGN KEY (translation_id) REFERENCES translations (id)
);
|
|
|
|
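-- ── Status tracking ───────────────────────────────────────────────────────────
-- One row per synset per model per stage. Drives resumability.
-- stage: round1 | round2 | tiebreak
-- status: pending | complete | needs_review | flagged
-- NOTE(review): no CREATE TABLE statement follows this comment in this copy of
-- the file. The run_status table referred to under "Resolved output" is
-- presumably defined here — confirm it was not dropped.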
|
|
|
|
|
|
|
|
-- ── Round 1 output ────────────────────────────────────────────────────────────
-- One row per translation/language per model. Written atomically per record.
-- Unique constraints enforce one vote per model.
|
|
|
|
-- model_cefr_votes: the cefr_level a given model assigned to a translation.
-- Each (translation_id, model_name) pair may appear only once.
CREATE TABLE IF NOT EXISTS model_cefr_votes (
    id             INTEGER PRIMARY KEY,
    translation_id INTEGER NOT NULL,
    model_name     TEXT    NOT NULL,
    cefr_level     TEXT    NOT NULL,
    UNIQUE (translation_id, model_name),
    FOREIGN KEY (translation_id) REFERENCES translations (id)
);
|
|
|
|
-- generated_glosses: gloss text stored per synset, per model, per language.
-- Each (source_id, model_name, language) combination may appear only once.
CREATE TABLE IF NOT EXISTS generated_glosses (
    id         INTEGER PRIMARY KEY,
    source_id  TEXT    NOT NULL,
    model_name TEXT    NOT NULL,
    language   TEXT    NOT NULL,
    text       TEXT    NOT NULL,
    UNIQUE (source_id, model_name, language),
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
|
|
|
-- generated_examples: example text stored per synset, per model, per language.
-- Each (source_id, model_name, language) combination may appear only once.
CREATE TABLE IF NOT EXISTS generated_examples (
    id         INTEGER PRIMARY KEY,
    source_id  TEXT    NOT NULL,
    model_name TEXT    NOT NULL,
    language   TEXT    NOT NULL,
    text       TEXT    NOT NULL,
    UNIQUE (source_id, model_name, language),
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
|
|
|
-- generated_descriptions: description text stored per synset, per model,
-- per language.
-- Each (source_id, model_name, language) combination may appear only once.
CREATE TABLE IF NOT EXISTS generated_descriptions (
    id         INTEGER PRIMARY KEY,
    source_id  TEXT    NOT NULL,
    model_name TEXT    NOT NULL,
    language   TEXT    NOT NULL,
    text       TEXT    NOT NULL,
    UNIQUE (source_id, model_name, language),
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
|
|
|
-- ── Round 2 output ────────────────────────────────────────────────────────────
-- Each row represents one model voting for one candidate.
-- The candidate with the most votes wins in merge.
|
|
|
|
-- gloss_candidate_votes: one row per vote cast by model_name for a row of
-- generated_glosses. A model can vote for the same gloss only once.
CREATE TABLE IF NOT EXISTS gloss_candidate_votes (
    id         INTEGER PRIMARY KEY,
    gloss_id   INTEGER NOT NULL,
    model_name TEXT    NOT NULL,
    UNIQUE (gloss_id, model_name),
    FOREIGN KEY (gloss_id) REFERENCES generated_glosses (id)
);
|
|
|
|
-- example_candidate_votes: one row per vote cast by model_name for a row of
-- generated_examples. A model can vote for the same example only once.
CREATE TABLE IF NOT EXISTS example_candidate_votes (
    id         INTEGER PRIMARY KEY,
    example_id INTEGER NOT NULL,
    model_name TEXT    NOT NULL,
    UNIQUE (example_id, model_name),
    FOREIGN KEY (example_id) REFERENCES generated_examples (id)
);
|
|
|
|
-- description_candidate_votes: one row per vote cast by model_name for a row
-- of generated_descriptions. A model can vote for the same description only
-- once.
CREATE TABLE IF NOT EXISTS description_candidate_votes (
    id             INTEGER PRIMARY KEY,
    description_id INTEGER NOT NULL,
    model_name     TEXT    NOT NULL,
    UNIQUE (description_id, model_name),
    FOREIGN KEY (description_id) REFERENCES generated_descriptions (id)
);
|
|
|
|
-- ── Resolved output ───────────────────────────────────────────────────────────
-- Written by merge. Never updated after writing.
-- Only fully resolved records are written here — no nulls, no flags.
-- Absence of a row means unresolved. Flagged status is tracked in run_status.
-- source: omw | cefr | model_name
|
|
|
|
-- resolved_translations: the cefr_level and difficulty stored for a
-- translation. translation_id is unique, so each translation has at most one
-- row; both value columns are required.
CREATE TABLE IF NOT EXISTS resolved_translations (
    id             INTEGER PRIMARY KEY,
    translation_id INTEGER NOT NULL UNIQUE,
    cefr_level     TEXT    NOT NULL,
    difficulty     TEXT    NOT NULL,
    FOREIGN KEY (translation_id) REFERENCES translations (id)
);
|
|
|
|
-- resolved_glosses: the gloss text stored for a synset in a given language,
-- together with a required source label.
-- At most one row per (source_id, language).
CREATE TABLE IF NOT EXISTS resolved_glosses (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL,
    language  TEXT    NOT NULL,
    text      TEXT    NOT NULL,
    source    TEXT    NOT NULL,
    UNIQUE (source_id, language),
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|
|
|
|
-- resolved_examples: example text stored for a synset in a given language,
-- together with a required source label.
-- NOTE(review): unlike resolved_glosses and resolved_descriptions, this table
-- declares no UNIQUE (source_id, language) — presumably several examples per
-- synset and language are intended; confirm before adding one.
CREATE TABLE IF NOT EXISTS resolved_examples (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL,
    language  TEXT    NOT NULL,
    text      TEXT    NOT NULL,
    source    TEXT    NOT NULL,
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);

-- With no UNIQUE constraint leading with source_id, the foreign-key column has
-- no implicit index; add one so per-synset lookups and foreign-key checks do
-- not scan the whole table.
CREATE INDEX IF NOT EXISTS resolved_examples_source_id_idx
    ON resolved_examples (source_id);
|
|
|
|
-- resolved_descriptions: the description text stored for a synset in a given
-- language, together with a required source label.
-- At most one row per (source_id, language).
CREATE TABLE IF NOT EXISTS resolved_descriptions (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL,
    language  TEXT    NOT NULL,
    text      TEXT    NOT NULL,
    source    TEXT    NOT NULL,
    UNIQUE (source_id, language),
    FOREIGN KEY (source_id) REFERENCES synsets (source_id)
);
|