feat: add Kaikki extraction and import scripts for stage 1

- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL,
  filters to supported POS and languages, skips abbreviations and
  senses with no translations in supported languages
- Rewrite db/import.ts for Kaikki flat model — tracks sense_index
  offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for Kaikki model — entries, translations,
  LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode hardcoded to 500 entries for development
This commit is contained in:
lila 2026-05-05 18:11:53 +02:00
parent 963bff4eb8
commit 209d52f54b
17 changed files with 346 additions and 1055737 deletions

View file

@ -1,185 +1,98 @@
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
import { openDb } from "./index.js";
// ── Types ─────────────────────────────────────────────────────────────────────
/** An example sentence together with the origin tag it was loaded with. */
type Example = { text: string; source: "omw" | "cefr" };
/**
 * Shape that the stage-2 annotated JSON files are parsed into.
 * All per-language maps are partial: a language key may be absent.
 */
type AnnotatedRecord = {
// Identifier used to match records across the per-language files.
source_id: string;
pos: SupportedPos;
// language → translated words
translations: Partial<Record<SupportedLanguageCode, string[]>>;
// language → gloss texts
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
// language → example sentences
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
// language → key → vote. The stage-2 importer uses the inner key as the
// translation word and writes `cefr_source` as the CEFR level.
votes: Partial<
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
>;
};
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
// ── Paths ─────────────────────────────────────────────────────────────────────
// Directory containing this module, derived from its file URL
// (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Pipeline input locations, resolved relative to this module's directory.
const PATHS = {
annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
};
// ── Loading ───────────────────────────────────────────────────────────────────
/**
 * Builds the merged stage-2 record set.
 *
 * `en.json` supplies the base records, keyed by `source_id` (the original
 * author chose it because it has the most complete glosses and examples).
 * Every other supported language file is then read in turn; for each record
 * whose `source_id` exists in the base set, its votes and its CEFR-tagged
 * examples are folded into the base record. Records with an unknown
 * `source_id` are ignored.
 *
 * @returns The base records, mutated in place with the merged data.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // Reads and parses one annotated file from the stage-2 output directory.
  const readRecords = async (fileName: string): Promise<AnnotatedRecord[]> => {
    const text = await fs.readFile(
      path.join(PATHS.annotatedDir, fileName),
      "utf-8",
    );
    return JSON.parse(text) as AnnotatedRecord[];
  };

  const merged = new Map<string, AnnotatedRecord>();
  for (const english of await readRecords("en.json")) {
    merged.set(english.source_id, english);
  }

  for (const code of SUPPORTED_LANGUAGE_CODES) {
    if (code === "en") continue;

    for (const incoming of await readRecords(`${code}.json`)) {
      const target = merged.get(incoming.source_id);
      if (!target) continue;

      // Votes: entries from later files overwrite earlier ones with the same key.
      for (const [voteLang, langVotes] of Object.entries(incoming.votes)) {
        const key = voteLang as SupportedLanguageCode;
        const existing = target.votes[key];
        const bucket = existing ? existing : (target.votes[key] = {});
        Object.assign(bucket, langVotes);
      }

      // Examples: only those tagged "cefr" are carried over. They are appended
      // as-is; no de-duplication against the base record is performed.
      for (const [exampleLang, list] of Object.entries(incoming.examples)) {
        const cefrOnly = list.filter((ex) => ex.source === "cefr");
        if (cefrOnly.length === 0) continue;

        const key = exampleLang as SupportedLanguageCode;
        const current = target.examples[key];
        if (current) {
          current.push(...cefrOnly);
        } else {
          target.examples[key] = cefrOnly;
        }
      }
    }
  }

  return Array.from(merged.values());
}
// ── Import ────────────────────────────────────────────────────────────────────
export async function importStage2(): Promise<void> {
console.log("Loading stage 2 annotated files...");
const records = await loadAnnotated();
console.log(` Loaded ${records.length.toLocaleString()} synsets`);
export async function importKaikki(): Promise<void> {
console.log("Loading extracted Kaikki data...");
const raw = await fs.readFile(PATHS.extracted, "utf-8");
const senses = JSON.parse(raw) as ExtractedSense[];
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
const db = openDb();
const insertSynset = db.prepare(
`INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
);
const insertTranslation = db.prepare(
`INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
);
const insertGloss = db.prepare(
`INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
);
const insertExample = db.prepare(
`INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
);
const insertCefrVote = db.prepare(`
INSERT INTO cefr_source_votes (translation_id, cefr_level)
VALUES (
(SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
?
)
const insertEntry = db.prepare(`
INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT (headword, language, pos, sense_index)
DO UPDATE SET
gloss = excluded.gloss,
examples = excluded.examples
RETURNING id
`);
const insertTranslation = db.prepare(`
INSERT INTO translations (entry_id, target_lang, word, sense_hint)
VALUES (?, ?, ?, ?)
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
`);
// Track next available sense_index per (headword, pos) to handle
// the same word appearing in multiple JSONL entries with the same POS.
const senseIndexMap = new Map<string, number>();
console.log("\nImporting into pipeline.db...");
const importAll = db.transaction(() => {
let synsets = 0;
let entries = 0;
let translations = 0;
let glosses = 0;
let examples = 0;
let cefrVotes = 0;
let skipped = 0;
for (const record of records) {
insertSynset.run(record.source_id, record.pos);
synsets++;
for (const sense of senses) {
const key = `${sense.headword}|${sense.pos}`;
const nextIndex = senseIndexMap.get(key) ?? 0;
// Translations
for (const [lang, words] of Object.entries(record.translations)) {
const unique = [...new Set(words)];
for (const word of unique) {
insertTranslation.run(record.source_id, lang, word);
translations++;
}
// Use the offset sense_index to avoid collisions when the same word
// appears in multiple JSONL entries with the same POS.
const senseIndex = nextIndex;
senseIndexMap.set(key, nextIndex + 1);
const row = insertEntry.get(
sense.headword,
"en",
sense.pos,
senseIndex,
sense.gloss ?? null,
JSON.stringify(sense.examples),
) as { id: number } | undefined;
if (!row) {
skipped++;
continue;
}
// Glosses
for (const [lang, glossList] of Object.entries(record.glosses)) {
for (const text of glossList) {
insertGloss.run(record.source_id, lang, text);
glosses++;
}
}
entries++;
// Examples
for (const [lang, exList] of Object.entries(record.examples)) {
for (const example of exList) {
insertExample.run(
record.source_id,
lang,
example.text,
example.source,
);
examples++;
}
}
// CEFR source votes
for (const [lang, langVotes] of Object.entries(record.votes)) {
for (const [word, vote] of Object.entries(
langVotes as Record<string, { cefr_source: string }>,
)) {
insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
cefrVotes++;
}
for (const t of sense.translations) {
insertTranslation.run(
row.id,
t.target_lang,
t.word,
t.sense_hint ?? null,
);
translations++;
}
}
return { synsets, translations, glosses, examples, cefrVotes };
return { entries, translations, skipped };
});
const counts = importAll();
console.log(` synsets: ${counts.synsets.toLocaleString()}`);
console.log(` entries: ${counts.entries.toLocaleString()}`);
console.log(` translations: ${counts.translations.toLocaleString()}`);
console.log(` glosses: ${counts.glosses.toLocaleString()}`);
console.log(` examples: ${counts.examples.toLocaleString()}`);
console.log(` cefr votes: ${counts.cefrVotes.toLocaleString()}`);
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
db.close();
console.log("\nImport complete.");
@ -189,7 +102,7 @@ export async function importStage2(): Promise<void> {
export function isImported(): boolean {
const db = openDb();
const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
count: number;
};
db.close();
@ -200,20 +113,20 @@ export function isImported(): boolean {
async function main(): Promise<void> {
const db = openDb();
const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
count: number;
};
db.close();
if (row.count > 0) {
console.log(
`pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`,
`pipeline.db already contains ${row.count.toLocaleString()} entries — skipping import.`,
);
console.log("Delete pipeline.db and re-run db:init to start fresh.");
process.exit(0);
}
await importStage2();
await importKaikki();
}
if (import.meta.url === `file://${process.argv[1]}`) {

View file

@ -1,62 +1,58 @@
-- ── Base data ─────────────────────────────────────────────────────────────────
-- Imported from stage 2 JSON on first run. Never mutated after import.
-- Imported from Kaikki on first run. Never mutated after import.
CREATE TABLE IF NOT EXISTS synsets (
source_id TEXT PRIMARY KEY,
pos TEXT NOT NULL
-- One row per imported sense. (headword, language, pos, sense_index) uniquely
-- identifies a row and is the conflict target of the importer's upsert.
CREATE TABLE IF NOT EXISTS entries (
id INTEGER PRIMARY KEY,
headword TEXT NOT NULL,
language TEXT NOT NULL,
pos TEXT NOT NULL,
sense_index INTEGER NOT NULL DEFAULT 0,
gloss TEXT,
examples TEXT NOT NULL DEFAULT '[]', -- JSON array of strings
source TEXT NOT NULL DEFAULT 'kaikki',
UNIQUE (headword, language, pos, sense_index)
);
CREATE TABLE IF NOT EXISTS translations (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
word TEXT NOT NULL,
UNIQUE (source_id, language, word)
);
CREATE TABLE IF NOT EXISTS glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS cefr_source_votes (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
cefr_level TEXT NOT NULL,
UNIQUE (translation_id)
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
target_lang TEXT NOT NULL,
word TEXT NOT NULL,
sense_hint TEXT,
source TEXT NOT NULL DEFAULT 'kaikki',
UNIQUE (entry_id, target_lang, word)
);
-- ── Status tracking ───────────────────────────────────────────────────────────
-- One row per synset per model per stage. Drives resumability.
-- One row per entry per model per stage. Drives resumability.
-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
-- stage: round1 | round2 | tiebreak
-- status: pending | complete | needs_review | flagged
CREATE TABLE IF NOT EXISTS run_status (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL,
entry_id INTEGER NOT NULL,
model_name TEXT NOT NULL,
stage TEXT NOT NULL,
status TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE (source_id, model_name, stage)
UNIQUE (entry_id, model_name, stage)
);
-- ── Round 1 output ────────────────────────────────────────────────────────────
-- One row per translation/language per model. Written atomically per record.
-- Written atomically per entry per model.
-- Unique constraints enforce at most one vote per model per entry or translation.
CREATE TABLE IF NOT EXISTS model_cefr_votes (
-- At most one CEFR vote per (entry, model), enforced by the UNIQUE constraint.
CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
cefr_level TEXT NOT NULL,
UNIQUE (entry_id, model_name)
);
CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
model_name TEXT NOT NULL,
@ -64,38 +60,29 @@ CREATE TABLE IF NOT EXISTS model_cefr_votes (
UNIQUE (translation_id, model_name)
);
-- At most one row per (translation, model), enforced by the UNIQUE constraint.
-- NOTE(review): presumably a row means the model rejected that translation;
-- confirm against the code that writes this table.
CREATE TABLE IF NOT EXISTS model_translation_rejections (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
model_name TEXT NOT NULL,
UNIQUE (translation_id, model_name)
);
CREATE TABLE IF NOT EXISTS generated_glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
UNIQUE (entry_id, model_name)
);
CREATE TABLE IF NOT EXISTS generated_examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
UNIQUE (entry_id, model_name)
);
CREATE TABLE IF NOT EXISTS generated_descriptions (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
-- At most one generated word per (entry, model, target language),
-- enforced by the UNIQUE constraint.
CREATE TABLE IF NOT EXISTS generated_translations (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
target_lang TEXT NOT NULL,
word TEXT NOT NULL,
UNIQUE (entry_id, model_name, target_lang)
);
-- ── Round 2 output ────────────────────────────────────────────────────────────
@ -116,20 +103,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
UNIQUE (example_id, model_name)
);
CREATE TABLE IF NOT EXISTS description_candidate_votes (
CREATE TABLE IF NOT EXISTS translation_candidate_votes (
id INTEGER PRIMARY KEY,
description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
model_name TEXT NOT NULL,
UNIQUE (description_id, model_name)
UNIQUE (translation_id, model_name)
);
-- ── Resolved output ───────────────────────────────────────────────────────────
-- Written by merge. Never updated after writing.
-- Only fully resolved records are written here — no nulls, no flags.
-- Only fully resolved records are written here — no nulls.
-- Absence of a row means unresolved. Flagged status tracked in run_status.
-- source: omw | cefr | model_name
-- source: kaikki | model_name
CREATE TABLE IF NOT EXISTS resolved_translations (
-- At most one resolved row per entry; both cefr_level and difficulty are required.
CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
cefr_level TEXT NOT NULL,
difficulty TEXT NOT NULL,
UNIQUE (entry_id)
);
CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
cefr_level TEXT NOT NULL,
@ -138,27 +133,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
);
CREATE TABLE IF NOT EXISTS resolved_glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (source_id, language)
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (entry_id)
);
CREATE TABLE IF NOT EXISTS resolved_examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
text TEXT NOT NULL,
source TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS resolved_descriptions (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (source_id, language)
-- At most one resolved word per (entry, target language); source is required.
CREATE TABLE IF NOT EXISTS resolved_generated_translations (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
target_lang TEXT NOT NULL,
word TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (entry_id, target_lang)
);