- Enrich script functional with timeout, progress tracking, rejection mechanism - Identified ordering issue: CEFR voting needs validated translations first - Redesign: round1_gloss → round1_example → round1_translations → round1_cefr - Update data-pipeline.md with new sub-stage design and roadmap - Qwen3.5-4B confirmed working with thinking disabled
164 lines
6 KiB
SQL
164 lines
6 KiB
SQL
-- ── Base data ─────────────────────────────────────────────────────────────────
|
|
-- Imported from Kaikki on first run. Never mutated after import.
|
|
|
|
CREATE TABLE IF NOT EXISTS entries (
|
|
id INTEGER PRIMARY KEY,
|
|
headword TEXT NOT NULL,
|
|
language TEXT NOT NULL,
|
|
pos TEXT NOT NULL,
|
|
sense_index INTEGER NOT NULL DEFAULT 0,
|
|
gloss TEXT,
|
|
examples TEXT NOT NULL DEFAULT '[]', -- JSON array of strings
|
|
source TEXT NOT NULL DEFAULT 'kaikki',
|
|
UNIQUE (headword, language, pos, sense_index)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS translations (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
target_lang TEXT NOT NULL,
|
|
word TEXT NOT NULL,
|
|
sense_hint TEXT,
|
|
source TEXT NOT NULL DEFAULT 'kaikki',
|
|
UNIQUE (entry_id, target_lang, word)
|
|
);
|
|
|
|
-- ── Status tracking ───────────────────────────────────────────────────────────
|
|
-- One row per entry per model per stage. Drives resumability.
|
|
-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
|
|
-- stage: round1 | round2 | tiebreak
|
|
-- status: pending | complete | needs_review | flagged
|
|
|
|
CREATE TABLE IF NOT EXISTS run_status (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL,
|
|
model_name TEXT NOT NULL,
|
|
stage TEXT NOT NULL,
|
|
status TEXT NOT NULL,
|
|
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
UNIQUE (entry_id, model_name, stage)
|
|
);
|
|
|
|
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
|
-- Written atomically per entry per model.
|
|
-- Unique constraints enforce one model one vote.
|
|
|
|
CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
model_name TEXT NOT NULL,
|
|
cefr_level TEXT NOT NULL,
|
|
UNIQUE (entry_id, model_name)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
|
|
id INTEGER PRIMARY KEY,
|
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
|
model_name TEXT NOT NULL,
|
|
cefr_level TEXT NOT NULL,
|
|
UNIQUE (translation_id, model_name)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS model_translation_rejections (
|
|
id INTEGER PRIMARY KEY,
|
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
|
model_name TEXT NOT NULL,
|
|
UNIQUE (translation_id, model_name)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS generated_glosses (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
model_name TEXT NOT NULL,
|
|
text TEXT NOT NULL,
|
|
UNIQUE (entry_id, model_name)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS generated_examples (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
model_name TEXT NOT NULL,
|
|
text TEXT NOT NULL,
|
|
UNIQUE (entry_id, model_name)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS generated_translations (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
model_name TEXT NOT NULL,
|
|
target_lang TEXT NOT NULL,
|
|
word TEXT NOT NULL,
|
|
UNIQUE (entry_id, model_name, target_lang)
|
|
);
|
|
|
|
-- ── Round 2 output ────────────────────────────────────────────────────────────
|
|
-- Each row represents one model voting for one candidate.
|
|
-- The candidate with the most votes wins in merge.
|
|
|
|
CREATE TABLE IF NOT EXISTS gloss_candidate_votes (
|
|
id INTEGER PRIMARY KEY,
|
|
gloss_id INTEGER NOT NULL REFERENCES generated_glosses(id),
|
|
model_name TEXT NOT NULL,
|
|
UNIQUE (gloss_id, model_name)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS example_candidate_votes (
|
|
id INTEGER PRIMARY KEY,
|
|
example_id INTEGER NOT NULL REFERENCES generated_examples(id),
|
|
model_name TEXT NOT NULL,
|
|
UNIQUE (example_id, model_name)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS translation_candidate_votes (
|
|
id INTEGER PRIMARY KEY,
|
|
translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
|
|
model_name TEXT NOT NULL,
|
|
UNIQUE (translation_id, model_name)
|
|
);
|
|
|
|
-- ── Resolved output ───────────────────────────────────────────────────────────
|
|
-- Written by merge. Never updated after writing.
|
|
-- Only fully resolved records are written here — no nulls.
|
|
-- Absence of a row means unresolved. Flagged status tracked in run_status.
|
|
-- source: kaikki | model_name
|
|
|
|
CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
cefr_level TEXT NOT NULL,
|
|
difficulty TEXT NOT NULL,
|
|
UNIQUE (entry_id)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
|
|
id INTEGER PRIMARY KEY,
|
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
|
cefr_level TEXT NOT NULL,
|
|
difficulty TEXT NOT NULL,
|
|
UNIQUE (translation_id)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS resolved_glosses (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
text TEXT NOT NULL,
|
|
source TEXT NOT NULL,
|
|
UNIQUE (entry_id)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS resolved_examples (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
text TEXT NOT NULL,
|
|
source TEXT NOT NULL
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS resolved_generated_translations (
|
|
id INTEGER PRIMARY KEY,
|
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
|
target_lang TEXT NOT NULL,
|
|
word TEXT NOT NULL,
|
|
source TEXT NOT NULL,
|
|
UNIQUE (entry_id, target_lang)
|
|
);
|