feat: add Kaikki extraction and import scripts for stage 1

- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL,
  filters to supported POS and languages, skips abbreviations and
  senses with no translations in supported languages
- Rewrite db/import.ts for Kaikki flat model — tracks sense_index
  offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for Kaikki model — entries, translations,
  LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode hardcoded to 500 entries for development
This commit is contained in:
lila 2026-05-05 18:11:53 +02:00
parent 963bff4eb8
commit 209d52f54b
17 changed files with 346 additions and 1055737 deletions

View file

@ -1,170 +0,0 @@
[
{
"_fixture": "noun_with_cefr_vote",
"source_id": "ili:i100955",
"pos": "noun",
"translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
"glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
"examples": {
"en": [
{ "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
]
},
"votes": { "en": { "grain": { "cefr_source": "B1" } } }
},
{
"_fixture": "verb_no_votes_no_translations",
"source_id": "ili:i21779",
"pos": "verb",
"translations": { "en": ["respire"] },
"glosses": {
"en": [
"undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
]
},
"examples": {},
"votes": {}
},
{
"_fixture": "verb_with_cefr_vote_all_languages",
"source_id": "ili:i21778",
"pos": "verb",
"translations": {
"en": ["breathe", "take a breath", "respire", "suspire"],
"it": ["respirare"],
"es": ["aspirar", "respirar"],
"de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
"fr": ["inspirer", "respirer"]
},
"glosses": {
"en": ["draw air into, and expel out of, the lungs"],
"de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
},
"examples": {
"en": [
{
"text": "I can breathe better when the air is clean",
"source": "omw"
},
{ "text": "The patient is respiring", "source": "omw" }
]
},
"votes": { "en": { "breathe": { "cefr_source": "A1" } } }
},
{
"_fixture": "adjective_all_languages_multiple_translations",
"source_id": "ili:i10007",
"pos": "adjective",
"translations": {
"en": ["possible"],
"it": [
"attuabile",
"effettuabile",
"eseguibile",
"fattibile",
"operabile",
"possibile",
"producibile",
"realizzabile"
],
"es": ["posible"],
"de": [
"möglich",
"denkbar",
"eventuell",
"möglicherweise",
"allfällig",
"etwaig",
"gegebenenfalls",
"eventuell"
],
"fr": ["possible", "éventuel"]
},
"glosses": {
"en": ["capable of happening or existing"],
"de": ["in der Lage, zu geschehen oder zu existieren"]
},
"examples": {
"en": [
{ "text": "a breakthrough may be possible next year", "source": "omw" },
{ "text": "anything is possible", "source": "omw" },
{ "text": "warned of possible consequences", "source": "omw" }
]
},
"votes": { "en": { "possible": { "cefr_source": "A2" } } }
},
{
"_fixture": "adjective_multiple_de_votes_cefr_examples",
"source_id": "ili:i10000",
"pos": "adjective",
"translations": {
"en": ["negative"],
"de": [
"dürftig",
"zu wünschen übrig lassen",
"schlecht",
"widrig",
"ungut",
"lausig",
"negativ",
"von Nachteil",
"schädlich",
"nachteilig",
"ungünstig"
],
"fr": ["négatif", "strictement négatif"]
},
"glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
"examples": {
"en": [{ "text": "a negative number", "source": "omw" }],
"de": [
{ "text": "Die Beweise waren dürftig.", "source": "cefr" },
{ "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
{
"text": "Trotz widriger Umstände haben sie es geschafft.",
"source": "cefr"
},
{
"text": "Er hatte ein ungutes Gefühl bei der Sache.",
"source": "cefr"
},
{ "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
{
"text": "Rauchen ist schädlich für die Gesundheit.",
"source": "cefr"
},
{
"text": "Diese Entscheidung könnte nachteilig sein.",
"source": "cefr"
},
{
"text": "Das Wetter ist heute ungünstig für einen Ausflug.",
"source": "cefr"
}
]
},
"votes": {
"de": {
"dürftig": { "cefr_source": "C1" },
"schlecht": { "cefr_source": "A1" },
"widrig": { "cefr_source": "C1" },
"ungut": { "cefr_source": "B2" },
"negativ": { "cefr_source": "A2" },
"schädlich": { "cefr_source": "B1" },
"nachteilig": { "cefr_source": "B1" },
"ungünstig": { "cefr_source": "B2" }
}
}
},
{
"_fixture": "adverb_no_votes",
"source_id": "ili:i18157",
"pos": "adverb",
"translations": { "en": ["a cappella"], "es": ["a capella"] },
"glosses": { "en": ["without musical accompaniment"] },
"examples": {
"en": [{ "text": "they performed a cappella", "source": "omw" }]
},
"votes": {}
}
]

View file

@ -1,4 +0,0 @@
[
{ "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
{ "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
]

View file

@ -1,237 +0,0 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect, beforeAll } from "vitest";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/** A single example sentence together with the tag naming where it came from. */
type Example = {
  text: string;
  source: "omw" | "cefr";
};

/** Map keyed by supported language code in which every language is optional. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/** Shape assumed for one record parsed from the stage 2 annotated JSON files. */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  // language → word → vote
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
};
// ── Paths ─────────────────────────────────────────────────────────────────────
// `path.resolve` with a relative argument resolves against the current working
// directory, so these locations depend on where the test runner is launched.
// SQLite database file opened by this suite.
const DB_PATH = path.resolve("db/pipeline.db");
// Stage 1 output; its array length is used as the expected synset count.
const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
// Directory containing the per-language `<lang>.json` stage 2 files.
const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Reports whether the database file at `DB_PATH` can be accessed.
 *
 * @returns `true` when `fs.access` resolves, `false` when it rejects for any reason.
 */
async function dbExists(): Promise<boolean> {
  return fs.access(DB_PATH).then(
    () => true,
    () => false,
  );
}
// ── Tests ─────────────────────────────────────────────────────────────────────
/**
 * Integrity checks for the imported SQLite database at `DB_PATH`.
 *
 * `beforeAll` opens the database read-only and derives the expected row counts
 * from the stage 1 and stage 2 output files. When the database file is missing,
 * `db` stays undefined: the first test fails and every other test returns early
 * without asserting anything.
 */
describe("pipeline.db — import validation", () => {
  let db: import("better-sqlite3").Database | undefined;
  let expectedSynsetCount: number;
  let expectedCefrVoteCount: number;

  beforeAll(async () => {
    if (!(await dbExists())) return;

    const Database = (await import("better-sqlite3")).default;
    const handle = new Database(DB_PATH, { readonly: true });

    try {
      handle.pragma("foreign_keys = ON");

      // Count expected synsets from omw.json
      const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
      const omwRecords = JSON.parse(omwRaw) as unknown[];
      expectedSynsetCount = omwRecords.length;

      // Count expected CEFR votes from stage 2 annotated files.
      // Merge all language files the same way the import script does —
      // use en.json as base and merge votes from the other language files.
      const byId = new Map<string, AnnotatedRecord>();
      const baseRaw = await fs.readFile(
        path.join(ANNOTATED_DIR, "en.json"),
        "utf-8",
      );
      for (const record of JSON.parse(baseRaw) as AnnotatedRecord[]) {
        byId.set(record.source_id, record);
      }

      for (const lang of SUPPORTED_LANGUAGE_CODES) {
        if (lang === "en") continue;
        const raw = await fs.readFile(
          path.join(ANNOTATED_DIR, `${lang}.json`),
          "utf-8",
        );
        const records = JSON.parse(raw) as AnnotatedRecord[];
        for (const record of records) {
          // Records absent from en.json contribute no votes.
          const target = byId.get(record.source_id);
          if (!target) continue;
          for (const [l, langVotes] of Object.entries(record.votes)) {
            const code = l as SupportedLanguageCode;
            let merged = target.votes[code];
            if (!merged) {
              merged = {};
              target.votes[code] = merged;
            }
            // Same word in the same language overwrites rather than adds.
            Object.assign(merged, langVotes);
          }
        }
      }

      expectedCefrVoteCount = 0;
      for (const record of byId.values()) {
        for (const langVotes of Object.values(record.votes)) {
          expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
        }
      }
    } catch (error) {
      // Do not leak the handle when fixture loading fails after the open.
      handle.close();
      throw error;
    }

    db = handle;

    // Vitest runs a function returned from beforeAll as the suite teardown.
    return () => {
      handle.close();
    };
  }, 120_000);

  it("pipeline.db exists — skipping all tests if not", async () => {
    const exists = await dbExists();
    if (!exists) {
      console.warn(
        "\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
      );
    }
    expect(exists).toBe(true);
  });

  it("synsets count matches omw.json", () => {
    if (!db) return;
    const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
      count: number;
    };
    expect(row.count).toBe(expectedSynsetCount);
  });

  it("every synset has at least one translation", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
SELECT s.source_id
FROM synsets s
LEFT JOIN translations t ON t.source_id = s.source_id
WHERE t.id IS NULL
`,
      )
      .all() as { source_id: string }[];
    const errors = rows.map((r) => `${r.source_id}: no translations`);
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every translation belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
SELECT t.id, t.source_id
FROM translations t
LEFT JOIN synsets s ON s.source_id = t.source_id
WHERE s.source_id IS NULL
`,
      )
      .all() as { id: number; source_id: string }[];
    const errors = rows.map(
      (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every cefr_source_vote references a valid translation", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
SELECT v.id, v.translation_id
FROM cefr_source_votes v
LEFT JOIN translations t ON t.id = v.translation_id
WHERE t.id IS NULL
`,
      )
      .all() as { id: number; translation_id: number }[];
    const errors = rows.map(
      (r) =>
        `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("cefr_source_votes count matches stage 2 annotated output", () => {
    if (!db) return;
    const row = db
      .prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
      .get() as { count: number };
    expect(row.count).toBe(expectedCefrVoteCount);
  });

  it("every example has a valid source", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
SELECT source_id, language, source
FROM examples
WHERE source NOT IN ('omw', 'cefr')
`,
      )
      .all() as { source_id: string; language: string; source: string }[];
    const errors = rows.map(
      (r) =>
        `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every example belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
SELECT e.id, e.source_id
FROM examples e
LEFT JOIN synsets s ON s.source_id = e.source_id
WHERE s.source_id IS NULL
`,
      )
      .all() as { id: number; source_id: string }[];
    const errors = rows.map(
      (r) => `example ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every gloss belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
SELECT g.id, g.source_id
FROM glosses g
LEFT JOIN synsets s ON s.source_id = g.source_id
WHERE s.source_id IS NULL
`,
      )
      .all() as { id: number; source_id: string }[];
    const errors = rows.map(
      (r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});

View file

@ -1,166 +0,0 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect } from "vitest";
import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/** Map keyed by supported language code in which every language is optional. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/**
 * Shape assumed for one record parsed from the stage 1 `omw.json` output.
 * Examples are typed as plain strings here.
 */
type OmwRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<string[]>;
};
// ── Paths ─────────────────────────────────────────────────────────────────────
// Stage 1 output file validated by this suite. The relative path is resolved
// against the current working directory of the test run.
const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Checks that an identifier is exactly `ili:i` followed by one or more digits.
 *
 * @param id - Candidate source identifier.
 * @returns `true` when the whole string matches, otherwise `false`.
 */
function isValidSourceId(id: string): boolean {
  const iliPattern = /^ili:i\d+$/;
  return iliPattern.exec(id) !== null;
}
// ── Tests ─────────────────────────────────────────────────────────────────────
/**
 * Structural validation of the stage 1 output file at `OMW_PATH`.
 *
 * The file is read and parsed once; every test awaits the same cached promise,
 * so a missing or malformed file fails each test with the same error.
 */
describe("stage 1 — omw.json validation", () => {
  let cachedRecords: Promise<OmwRecord[]> | undefined;

  /** Reads and parses omw.json on first use and reuses the result afterwards. */
  function loadRecords(): Promise<OmwRecord[]> {
    if (!cachedRecords) {
      cachedRecords = fs
        .readFile(OMW_PATH, "utf-8")
        .then((raw) => JSON.parse(raw) as OmwRecord[]);
    }
    return cachedRecords;
  }

  it("file exists and is valid JSON", async () => {
    const records = await loadRecords();
    expect(records).toBeDefined();
  });

  it("is a non-empty array", async () => {
    const records = await loadRecords();
    expect(Array.isArray(records)).toBe(true);
    expect(records.length).toBeGreaterThan(0);
  });

  it("every record has required fields", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!record.source_id) {
        // Without an id the remaining messages could not name the record.
        errors.push(`missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${record.source_id}: missing translations`);
      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
      if (!record.examples)
        errors.push(`${record.source_id}: missing examples`);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every source_id matches ili:i{number} pattern", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!isValidSourceId(record.source_id)) {
        errors.push(`invalid source_id: ${record.source_id}`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every source_id is unique", async () => {
    const records = await loadRecords();
    const seen = new Set<string>();
    const errors: string[] = [];
    for (const record of records) {
      if (seen.has(record.source_id)) {
        errors.push(`duplicate source_id: ${record.source_id}`);
      }
      seen.add(record.source_id);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every pos is a valid supported value", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const record of records) {
      if (!validPos.has(record.pos)) {
        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every record has at least one translation in at least one language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const record of records) {
      // A record with no translations object is reported, not crashed on.
      const translations: OmwRecord["translations"] =
        record.translations ?? {};
      const langs = Object.keys(translations) as SupportedLanguageCode[];
      if (langs.length === 0) {
        errors.push(`${record.source_id}: no translations`);
        continue;
      }
      for (const lang of langs) {
        if (!validLangs.has(lang)) {
          errors.push(`${record.source_id}: unsupported language "${lang}"`);
        }
        const words = translations[lang] ?? [];
        if (words.length === 0) {
          errors.push(`${record.source_id}: empty translations for "${lang}"`);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("no duplicate translations within a single synset and language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      // Missing translations are reported by the required-fields test.
      const translations: OmwRecord["translations"] =
        record.translations ?? {};
      for (const [lang, words] of Object.entries(translations)) {
        const seen = new Set<string>();
        for (const word of words ?? []) {
          if (seen.has(word)) {
            errors.push(
              `${record.source_id} (${lang}): duplicate translation "${word}"`,
            );
          }
          seen.add(word);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});

View file

@ -1,218 +0,0 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect, beforeAll } from "vitest";
import {
SUPPORTED_POS,
SUPPORTED_LANGUAGE_CODES,
CEFR_LEVELS,
} from "@lila/shared";
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/** A single example sentence together with the tag naming where it came from. */
type Example = {
  text: string;
  source: "omw" | "cefr";
};

/** Map keyed by supported language code in which every language is optional. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/** Shape assumed for one record parsed from a stage 2 `<lang>.json` file. */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  // language → word → vote
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
};

/** Shape assumed for one entry of `conflicts.json`. */
type ConflictEntry = {
  word: string;
  pos: string;
  language: SupportedLanguageCode;
  levels: string[];
};
// ── Paths ─────────────────────────────────────────────────────────────────────
// Directory holding the per-language `<lang>.json` files and `conflicts.json`.
// The relative path is resolved against the current working directory.
const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
// ── Tests ─────────────────────────────────────────────────────────────────────
/**
 * Structural validation of the stage 2 output in `OUTPUT_DIR`.
 *
 * `beforeAll` parses one `<lang>.json` file per supported language plus
 * `conflicts.json`. The content tests skip data that is structurally missing
 * (non-array file, absent `examples` / `votes`) instead of throwing, because
 * the dedicated array and required-field tests already report those cases.
 */
describe("stage 2 — annotated output validation", () => {
  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
  let conflicts: ConflictEntry[] = [];

  /** Records parsed for `lang`, or an empty list when that file was not an array. */
  function recordsFor(lang: SupportedLanguageCode): AnnotatedRecord[] {
    const loaded: unknown = recordsByLang.get(lang);
    return Array.isArray(loaded) ? (loaded as AnnotatedRecord[]) : [];
  }

  beforeAll(async () => {
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const raw = await fs.readFile(
        path.join(OUTPUT_DIR, `${lang}.json`),
        "utf-8",
      );
      recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]);
    }
    const raw = await fs.readFile(
      path.join(OUTPUT_DIR, "conflicts.json"),
      "utf-8",
    );
    conflicts = JSON.parse(raw) as ConflictEntry[];
  }, 60_000);

  it("all five language files exist", async () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
      try {
        await fs.access(filePath);
      } catch {
        errors.push(`missing file: ${lang}.json`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("conflicts.json exists", async () => {
    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
    await expect(fs.access(filePath)).resolves.toBeUndefined();
  });

  it("every language file is a non-empty array", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      // Inspect the raw parsed value here; recordsFor() would hide a non-array.
      const records: unknown = recordsByLang.get(lang);
      if (!Array.isArray(records)) {
        errors.push(`${lang}.json: not an array`);
      } else if (records.length === 0) {
        errors.push(`${lang}.json: empty array`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every record has required fields", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        if (!record.source_id) {
          // Without an id the remaining messages could not name the record.
          errors.push(`${lang}: record missing source_id`);
          continue;
        }
        if (!record.pos)
          errors.push(`${lang} ${record.source_id}: missing pos`);
        if (!record.translations)
          errors.push(`${lang} ${record.source_id}: missing translations`);
        if (!record.glosses)
          errors.push(`${lang} ${record.source_id}: missing glosses`);
        if (record.examples === undefined)
          errors.push(`${lang} ${record.source_id}: missing examples`);
        if (record.votes === undefined)
          errors.push(`${lang} ${record.source_id}: missing votes`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every pos is a valid supported value", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        if (!validPos.has(record.pos)) {
          errors.push(
            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
          );
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every example has text and a valid source", () => {
    const errors: string[] = [];
    const validSources = new Set(["omw", "cefr"]);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        // Absent examples are reported by the required-fields test.
        const examplesByLang: AnnotatedRecord["examples"] =
          record.examples ?? {};
        for (const [l, examples] of Object.entries(examplesByLang)) {
          for (const example of examples ?? []) {
            if (!example.text) {
              errors.push(
                `${lang} ${record.source_id} (${l}): example missing text`,
              );
            }
            if (!validSources.has(example.source)) {
              errors.push(
                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
              );
            }
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every cefr_source vote is a valid CEFR level", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        // Absent votes are reported by the required-fields test.
        const votesByLang: AnnotatedRecord["votes"] = record.votes ?? {};
        for (const [l, langVotes] of Object.entries(votesByLang)) {
          for (const [word, vote] of Object.entries(langVotes ?? {})) {
            if (
              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
            ) {
              errors.push(
                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
              );
            }
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("conflicts.json entries have required fields and valid CEFR levels", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const entry of conflicts) {
      if (!entry.word) errors.push(`conflict missing word`);
      if (!entry.pos) errors.push(`conflict missing pos`);
      if (!entry.language) {
        errors.push(`conflict missing language`);
      } else if (!validLangs.has(entry.language)) {
        errors.push(`conflict invalid language "${entry.language}"`);
      }
      // An entry needs at least two levels to count as a conflict.
      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
        errors.push(`${entry.word}: levels must have at least 2 entries`);
      } else {
        for (const level of entry.levels) {
          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
            errors.push(`${entry.word}: invalid level "${level}"`);
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});