feat: add Kaikki extraction and import scripts for stage 1

- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL,
  filters to supported POS and languages, skips abbreviations and
  senses with no translations in supported languages
- Rewrite db/import.ts for Kaikki flat model — tracks sense_index
  offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for Kaikki model — entries, translations,
  LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode hardcoded to 500 entries for development
This commit is contained in:
lila 2026-05-05 18:11:53 +02:00
parent 963bff4eb8
commit 209d52f54b
17 changed files with 346 additions and 1055737 deletions

View file

@ -1,62 +1,58 @@
-- ── Base data ─────────────────────────────────────────────────────────────────
-- Imported from stage 2 JSON on first run. Never mutated after import.
-- Imported from Kaikki on first run. Never mutated after import.
CREATE TABLE IF NOT EXISTS synsets (
source_id TEXT PRIMARY KEY,
pos TEXT NOT NULL
-- At most one row per (headword, language, pos, sense_index).
CREATE TABLE IF NOT EXISTS entries (
    id          INTEGER PRIMARY KEY,
    headword    TEXT    NOT NULL,
    language    TEXT    NOT NULL,
    pos         TEXT    NOT NULL,
    sense_index INTEGER NOT NULL DEFAULT 0,
    -- Nullable: a row may be stored without a gloss.
    gloss       TEXT,
    -- JSON array of strings stored as text; defaults to an empty array.
    examples    TEXT    NOT NULL DEFAULT '[]',
    source      TEXT    NOT NULL DEFAULT 'kaikki',
    UNIQUE (headword, language, pos, sense_index)
);
-- At most one row per (source_id, language, word); source_id points at synsets.
CREATE TABLE IF NOT EXISTS translations (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL REFERENCES synsets(source_id),
    language  TEXT    NOT NULL,
    word      TEXT    NOT NULL,
    UNIQUE (source_id, language, word)
);
-- Gloss text for a synsets row, tagged with its language.
-- No uniqueness rule is declared beyond the primary key.
CREATE TABLE IF NOT EXISTS glosses (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL REFERENCES synsets(source_id),
    language  TEXT    NOT NULL,
    text      TEXT    NOT NULL
);
-- Example text for a synsets row, tagged with its language and a source label.
-- No uniqueness rule is declared beyond the primary key.
CREATE TABLE IF NOT EXISTS examples (
    id        INTEGER PRIMARY KEY,
    source_id TEXT    NOT NULL REFERENCES synsets(source_id),
    language  TEXT    NOT NULL,
    text      TEXT    NOT NULL,
    source    TEXT    NOT NULL
);
-- NOTE(review): this span declares `id` twice and has two UNIQUE clauses with no
-- comma between the first one and the following column, so it is not a valid
-- single statement as written. The first lines read as a cefr_source_votes table
-- keyed by translation_id; the remaining columns (entry_id, target_lang, word,
-- sense_hint, source) describe a translation row keyed to entries(id).
-- Presumably the removed and added sides of a diff are shown together — confirm
-- which lines are current before executing.
CREATE TABLE IF NOT EXISTS cefr_source_votes (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
cefr_level TEXT NOT NULL,
UNIQUE (translation_id)
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
target_lang TEXT NOT NULL,
word TEXT NOT NULL,
-- Nullable: no NOT NULL constraint is declared on sense_hint.
sense_hint TEXT,
source TEXT NOT NULL DEFAULT 'kaikki',
UNIQUE (entry_id, target_lang, word)
);
-- ── Status tracking ───────────────────────────────────────────────────────────
-- One row per synset per model per stage. Drives resumability.
-- One row per entry per model per stage. Drives resumability.
-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
-- stage: round1 | round2 | tiebreak
-- status: pending | complete | needs_review | flagged
-- Progress bookkeeping: the UNIQUE key allows at most one row per
-- (key, model_name, stage). Neither key column declares a foreign key.
-- NOTE(review): both `source_id TEXT` and `entry_id INTEGER` are listed, and the
-- two UNIQUE clauses have no comma between them, so the statement does not parse
-- as written. This looks like the removed and added sides of a diff shown
-- together — confirm which key column is current before executing.
CREATE TABLE IF NOT EXISTS run_status (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL,
entry_id INTEGER NOT NULL,
model_name TEXT NOT NULL,
stage TEXT NOT NULL,
status TEXT NOT NULL,
-- Both timestamps default to the insert time. No trigger that refreshes
-- updated_at is visible in this chunk, so writers presumably set it themselves.
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE (source_id, model_name, stage)
UNIQUE (entry_id, model_name, stage)
);
-- ── Round 1 output ────────────────────────────────────────────────────────────
-- One row per translation/language per model. Written atomically per record.
-- Written atomically per entry per model.
-- Unique constraints enforce one model one vote.
CREATE TABLE IF NOT EXISTS model_cefr_votes (
-- A model's CEFR level for an entry; at most one row per (entry_id, model_name).
CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
    id         INTEGER PRIMARY KEY,
    entry_id   INTEGER NOT NULL REFERENCES entries(id),
    model_name TEXT    NOT NULL,
    cefr_level TEXT    NOT NULL,
    UNIQUE (entry_id, model_name)
);
CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
model_name TEXT NOT NULL,
@ -64,38 +60,29 @@ CREATE TABLE IF NOT EXISTS model_cefr_votes (
UNIQUE (translation_id, model_name)
);
-- Marks a (translation_id, model_name) pair, at most once per pair. The table
-- carries no payload columns; judging by its name, a row's presence means the
-- model rejected that translation.
CREATE TABLE IF NOT EXISTS model_translation_rejections (
    id             INTEGER PRIMARY KEY,
    translation_id INTEGER NOT NULL REFERENCES translations(id),
    model_name     TEXT    NOT NULL,
    UNIQUE (translation_id, model_name)
);
-- Gloss text stored per model_name (presumably written by that model).
-- NOTE(review): both a synsets(source_id) key and an entries(id) key are listed,
-- and the two UNIQUE clauses have no comma between them, so the statement does
-- not parse as written. This looks like the removed and added sides of a diff
-- shown together — confirm which lines are current, including whether
-- `language` is still a column, before executing.
CREATE TABLE IF NOT EXISTS generated_glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
UNIQUE (entry_id, model_name)
);
-- Example text stored per model_name (presumably written by that model).
-- NOTE(review): both a synsets(source_id) key and an entries(id) key are listed,
-- and the two UNIQUE clauses have no comma between them, so the statement does
-- not parse as written. This looks like the removed and added sides of a diff
-- shown together — confirm which lines are current, including whether
-- `language` is still a column, before executing.
CREATE TABLE IF NOT EXISTS generated_examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
UNIQUE (entry_id, model_name)
);
CREATE TABLE IF NOT EXISTS generated_descriptions (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
-- A word stored per model for an entry in a target language; at most one row
-- per (entry_id, model_name, target_lang).
CREATE TABLE IF NOT EXISTS generated_translations (
    id          INTEGER PRIMARY KEY,
    entry_id    INTEGER NOT NULL REFERENCES entries(id),
    model_name  TEXT    NOT NULL,
    target_lang TEXT    NOT NULL,
    word        TEXT    NOT NULL,
    UNIQUE (entry_id, model_name, target_lang)
);
-- ── Round 2 output ────────────────────────────────────────────────────────────
@ -116,20 +103,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
UNIQUE (example_id, model_name)
);
-- One vote row per (candidate, model_name) pair, enforced by the UNIQUE key.
-- NOTE(review): two CREATE TABLE headers, two candidate key columns
-- (description_id, translation_id) and two UNIQUE clauses appear in one span, so
-- it does not parse as written. This looks like the removed and added sides of a
-- diff shown together — confirm which table name and key are current before
-- executing.
CREATE TABLE IF NOT EXISTS description_candidate_votes (
CREATE TABLE IF NOT EXISTS translation_candidate_votes (
id INTEGER PRIMARY KEY,
description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
model_name TEXT NOT NULL,
UNIQUE (description_id, model_name)
UNIQUE (translation_id, model_name)
);
-- ── Resolved output ───────────────────────────────────────────────────────────
-- Written by merge. Never updated after writing.
-- Only fully resolved records are written here — no nulls, no flags.
-- Only fully resolved records are written here — no nulls.
-- Absence of a row means unresolved. Flagged status tracked in run_status.
-- source: omw | cefr | model_name
-- source: kaikki | model_name
CREATE TABLE IF NOT EXISTS resolved_translations (
-- Final CEFR level and difficulty for an entry; at most one row per entry_id,
-- and both values are required.
CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
    id         INTEGER PRIMARY KEY,
    entry_id   INTEGER NOT NULL REFERENCES entries(id),
    cefr_level TEXT    NOT NULL,
    difficulty TEXT    NOT NULL,
    UNIQUE (entry_id)
);
CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
cefr_level TEXT NOT NULL,
@ -138,27 +133,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
);
-- Resolved gloss text together with a source label.
-- NOTE(review): `id`, `text` and `source` are each declared twice, both a
-- synsets(source_id) key and an entries(id) key are listed, and the first UNIQUE
-- clause is followed by a column without a comma, so the statement does not
-- parse as written. This looks like the removed and added sides of a diff shown
-- together — confirm which lines are current before executing.
CREATE TABLE IF NOT EXISTS resolved_glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (source_id, language)
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (entry_id)
);
-- Resolved example text together with a source label. No UNIQUE clause is
-- declared in either column set, so several rows may share the same key.
-- NOTE(review): `id`, `text` and `source` are each declared twice, and the first
-- `source` column is followed by another column without a comma, so the
-- statement does not parse as written. This looks like the removed and added
-- sides of a diff shown together — confirm which lines are current before
-- executing.
CREATE TABLE IF NOT EXISTS resolved_examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
text TEXT NOT NULL,
source TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS resolved_descriptions (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (source_id, language)
-- Final word for an entry in a target language, with a source label; at most
-- one row per (entry_id, target_lang).
CREATE TABLE IF NOT EXISTS resolved_generated_translations (
    id          INTEGER PRIMARY KEY,
    entry_id    INTEGER NOT NULL REFERENCES entries(id),
    target_lang TEXT    NOT NULL,
    word        TEXT    NOT NULL,
    source      TEXT    NOT NULL,
    UNIQUE (entry_id, target_lang)
);