diff --git a/data-pipeline/db/import.ts b/data-pipeline/db/import.ts new file mode 100644 index 0000000..276536f --- /dev/null +++ b/data-pipeline/db/import.ts @@ -0,0 +1,222 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared"; +import type { SupportedLanguageCode, SupportedPos } from "@lila/shared"; +import { openDb } from "./index.js"; + +// ── Types ───────────────────────────────────────────────────────────────────── + +type Example = { text: string; source: "omw" | "cefr" }; + +type AnnotatedRecord = { + source_id: string; + pos: SupportedPos; + translations: Partial>; + glosses: Partial>; + examples: Partial>; + votes: Partial< + Record> + >; +}; + +// ── Paths ───────────────────────────────────────────────────────────────────── + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +const PATHS = { + annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"), +}; + +// ── Loading ─────────────────────────────────────────────────────────────────── + +async function loadAnnotated(): Promise { + // Use en.json as the base — it has the most complete glosses and examples. + // Merge votes and CEFR examples from the other language files. + const baseRaw = await fs.readFile( + path.join(PATHS.annotatedDir, "en.json"), + "utf-8", + ); + const base = JSON.parse(baseRaw) as AnnotatedRecord[]; + + const byId = new Map(); + for (const record of base) { + byId.set(record.source_id, record); + } + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + if (lang === "en") continue; + + const raw = await fs.readFile( + path.join(PATHS.annotatedDir, `${lang}.json`), + "utf-8", + ); + const records = JSON.parse(raw) as AnnotatedRecord[]; + + for (const record of records) { + const base = byId.get(record.source_id); + if (!base) continue; + + // Merge votes + for (const [l, langVotes] of Object.entries(record.votes)) { + if (!base.votes[l as SupportedLanguageCode]) { + base.votes[l as SupportedLanguageCode] = {}; + } + Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes); + } + + // Merge CEFR examples not already in base + for (const [l, examples] of Object.entries(record.examples)) { + const lang = l as SupportedLanguageCode; + const cefrExamples = examples.filter((e) => e.source === "cefr"); + if (cefrExamples.length === 0) continue; + + if (!base.examples[lang]) { + base.examples[lang] = cefrExamples; + } else { + base.examples[lang].push(...cefrExamples); + } + } + } + } + + return [...byId.values()]; +} + +// ── Import ──────────────────────────────────────────────────────────────────── + +export async function importStage2(): Promise { + console.log("Loading stage 2 annotated files..."); + const records = await loadAnnotated(); + console.log(` Loaded ${records.length.toLocaleString()} synsets`); + + const db = openDb(); + + const insertSynset = db.prepare( + `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`, + ); + + const insertTranslation = db.prepare( + `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`, + ); + + const insertGloss = db.prepare( + `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`, + ); + + const insertExample = db.prepare( + `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`, + ); + + const insertCefrVote = db.prepare(` + INSERT INTO cefr_source_votes (translation_id, cefr_level) + VALUES ( + (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?), + ? + ) + `); + + console.log("\nImporting into pipeline.db..."); + + const importAll = db.transaction(() => { + let synsets = 0; + let translations = 0; + let glosses = 0; + let examples = 0; + let cefrVotes = 0; + + for (const record of records) { + insertSynset.run(record.source_id, record.pos); + synsets++; + + // Translations + for (const [lang, words] of Object.entries(record.translations)) { + const unique = [...new Set(words)]; + for (const word of unique) { + insertTranslation.run(record.source_id, lang, word); + translations++; + } + } + + // Glosses + for (const [lang, glossList] of Object.entries(record.glosses)) { + for (const text of glossList) { + insertGloss.run(record.source_id, lang, text); + glosses++; + } + } + + // Examples + for (const [lang, exList] of Object.entries(record.examples)) { + for (const example of exList) { + insertExample.run( + record.source_id, + lang, + example.text, + example.source, + ); + examples++; + } + } + + // CEFR source votes + for (const [lang, langVotes] of Object.entries(record.votes)) { + for (const [word, vote] of Object.entries( + langVotes as Record, + )) { + insertCefrVote.run(record.source_id, lang, word, vote.cefr_source); + cefrVotes++; + } + } + } + + return { synsets, translations, glosses, examples, cefrVotes }; + }); + + const counts = importAll(); + + console.log(` synsets: ${counts.synsets.toLocaleString()}`); + console.log(` translations: ${counts.translations.toLocaleString()}`); + console.log(` glosses: ${counts.glosses.toLocaleString()}`); + console.log(` examples: ${counts.examples.toLocaleString()}`); + console.log(` cefr votes: ${counts.cefrVotes.toLocaleString()}`); + + db.close(); + console.log("\nImport complete."); +} + +// ── Check if already imported ───────────────────────────────────────────────── + +export function isImported(): boolean { + const db = openDb(); + const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as { + count: number; + }; + db.close(); + return row.count > 0; +} + +// ── Main ───────────────────────────────────────────────────────────────────── + +async function main(): Promise { + const db = openDb(); + const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as { + count: number; + }; + db.close(); + + if (row.count > 0) { + console.log( + `pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`, + ); + console.log("Delete pipeline.db and re-run db:init to start fresh."); + process.exit(0); + } + + await importStage2(); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/data-pipeline/db/pipeline.db b/data-pipeline/db/pipeline.db index f8dd7fc..e7c3bbe 100644 Binary files a/data-pipeline/db/pipeline.db and b/data-pipeline/db/pipeline.db differ diff --git a/data-pipeline/package.json b/data-pipeline/package.json index b985f86..1fd2636 100644 --- a/data-pipeline/package.json +++ b/data-pipeline/package.json @@ -4,7 +4,9 @@ "private": true, "type": "module", "scripts": { + "db:import": "tsx db/import.ts", "db:init": "tsx db/init.ts", + "annotate": "tsx stage-2-annotate/scripts/annotate.ts", "test": "vitest run", "test:watch": "vitest" }, diff --git a/data-pipeline/stage-1-extract/scripts/extract.py b/data-pipeline/stage-1-extract/scripts/extract.py index 5f0d879..7e39b9b 100644 --- a/data-pipeline/stage-1-extract/scripts/extract.py +++ b/data-pipeline/stage-1-extract/scripts/extract.py @@ -80,7 +80,7 @@ def extract_all( continue covered += 1 - lemmas = [str(lemma) for lemma in synset.lemmas()] + lemmas = list(dict.fromkeys(str(lemma) for lemma in synset.lemmas())) defns = [d for d in synset.definitions() if d] examples = [e for e in synset.examples() if e] diff --git a/data-pipeline/stage-2-annotate/scripts/annotate.ts b/data-pipeline/stage-2-annotate/scripts/annotate.ts index bb71f60..0c8c135 100644 --- a/data-pipeline/stage-2-annotate/scripts/annotate.ts +++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts @@ -196,12 +196,12 @@ async function annotate(): Promise { // Add CEFR vote if (!annotated.votes[lang]) annotated.votes[lang] = {}; - annotated.votes[lang]![word] = { cefr_source: cefrEntry.level }; + annotated.votes[lang][word] = { cefr_source: cefrEntry.level }; // Add native example if present if (cefrEntry.example) { if (!annotated.examples[lang]) annotated.examples[lang] = []; - annotated.examples[lang]!.push({ + annotated.examples[lang].push({ text: cefrEntry.example, source: "cefr" as const, }); diff --git a/data-pipeline/tests/fixtures/annotated.fixture.json b/data-pipeline/tests/fixtures/annotated.fixture.json new file mode 100644 index 0000000..f941bd0 --- /dev/null +++ b/data-pipeline/tests/fixtures/annotated.fixture.json @@ -0,0 +1,170 @@ +[ + { + "_fixture": "noun_with_cefr_vote", + "source_id": "ili:i100955", + "pos": "noun", + "translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] }, + "glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] }, + "examples": { + "en": [ + { "text": "wheat is a grain that is grown in Kansas", "source": "omw" } + ] + }, + "votes": { "en": { "grain": { "cefr_source": "B1" } } } + }, + { + "_fixture": "verb_no_votes_no_translations", + "source_id": "ili:i21779", + "pos": "verb", + "translations": { "en": ["respire"] }, + "glosses": { + "en": [ + "undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide" + ] + }, + "examples": {}, + "votes": {} + }, + { + "_fixture": "verb_with_cefr_vote_all_languages", + "source_id": "ili:i21778", + "pos": "verb", + "translations": { + "en": ["breathe", "take a breath", "respire", "suspire"], + "it": ["respirare"], + "es": ["aspirar", "respirar"], + "de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"], + "fr": ["inspirer", "respirer"] + }, + "glosses": { + "en": ["draw air into, and expel out of, the lungs"], + "de": ["Luft in die Lunge saugen und aus ihr ausstoßen"] + }, + "examples": { + "en": [ + { + "text": "I can breathe better when the air is clean", + "source": "omw" + }, + { "text": "The patient is respiring", "source": "omw" } + ] + }, + "votes": { "en": { "breathe": { "cefr_source": "A1" } } } + }, + { + "_fixture": "adjective_all_languages_multiple_translations", + "source_id": "ili:i10007", + "pos": "adjective", + "translations": { + "en": ["possible"], + "it": [ + "attuabile", + "effettuabile", + "eseguibile", + "fattibile", + "operabile", + "possibile", + "producibile", + "realizzabile" + ], + "es": ["posible"], + "de": [ + "möglich", + "denkbar", + "eventuell", + "möglicherweise", + "allfällig", + "etwaig", + "gegebenenfalls", + "eventuell" + ], + "fr": ["possible", "éventuel"] + }, + "glosses": { + "en": ["capable of happening or existing"], + "de": ["in der Lage, zu geschehen oder zu existieren"] + }, + "examples": { + "en": [ + { "text": "a breakthrough may be possible next year", "source": "omw" }, + { "text": "anything is possible", "source": "omw" }, + { "text": "warned of possible consequences", "source": "omw" } + ] + }, + "votes": { "en": { "possible": { "cefr_source": "A2" } } } + }, + { + "_fixture": "adjective_multiple_de_votes_cefr_examples", + "source_id": "ili:i10000", + "pos": "adjective", + "translations": { + "en": ["negative"], + "de": [ + "dürftig", + "zu wünschen übrig lassen", + "schlecht", + "widrig", + "ungut", + "lausig", + "negativ", + "von Nachteil", + "schädlich", + "nachteilig", + "ungünstig" + ], + "fr": ["négatif", "strictement négatif"] + }, + "glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] }, + "examples": { + "en": [{ "text": "a negative number", "source": "omw" }], + "de": [ + { "text": "Die Beweise waren dürftig.", "source": "cefr" }, + { "text": "Das Wetter ist heute schlecht.", "source": "cefr" }, + { + "text": "Trotz widriger Umstände haben sie es geschafft.", + "source": "cefr" + }, + { + "text": "Er hatte ein ungutes Gefühl bei der Sache.", + "source": "cefr" + }, + { "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" }, + { + "text": "Rauchen ist schädlich für die Gesundheit.", + "source": "cefr" + }, + { + "text": "Diese Entscheidung könnte nachteilig sein.", + "source": "cefr" + }, + { + "text": "Das Wetter ist heute ungünstig für einen Ausflug.", + "source": "cefr" + } + ] + }, + "votes": { + "de": { + "dürftig": { "cefr_source": "C1" }, + "schlecht": { "cefr_source": "A1" }, + "widrig": { "cefr_source": "C1" }, + "ungut": { "cefr_source": "B2" }, + "negativ": { "cefr_source": "A2" }, + "schädlich": { "cefr_source": "B1" }, + "nachteilig": { "cefr_source": "B1" }, + "ungünstig": { "cefr_source": "B2" } + } + } + }, + { + "_fixture": "adverb_no_votes", + "source_id": "ili:i18157", + "pos": "adverb", + "translations": { "en": ["a cappella"], "es": ["a capella"] }, + "glosses": { "en": ["without musical accompaniment"] }, + "examples": { + "en": [{ "text": "they performed a cappella", "source": "omw" }] + }, + "votes": {} + } +] diff --git a/data-pipeline/tests/fixtures/conflicts.fixture.json b/data-pipeline/tests/fixtures/conflicts.fixture.json new file mode 100644 index 0000000..37f5111 --- /dev/null +++ b/data-pipeline/tests/fixtures/conflicts.fixture.json @@ -0,0 +1,4 @@ +[ + { "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] }, + { "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] } +] diff --git a/data-pipeline/tests/validation/stage-1.validation.test.ts b/data-pipeline/tests/validation/stage-1.validation.test.ts new file mode 100644 index 0000000..047f2e6 --- /dev/null +++ b/data-pipeline/tests/validation/stage-1.validation.test.ts @@ -0,0 +1,166 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { describe, it, expect } from "vitest"; +import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared"; +import type { SupportedPos, SupportedLanguageCode } from "@lila/shared"; + +// ── Types ───────────────────────────────────────────────────────────────────── + +type OmwRecord = { + source_id: string; + pos: SupportedPos; + translations: Partial>; + glosses: Partial>; + examples: Partial>; +}; + +// ── Paths ───────────────────────────────────────────────────────────────────── + +const OMW_PATH = path.resolve("stage-1-extract/output/omw.json"); + +// ── Helpers ─────────────────────────────────────────────────────────────────── + +function isValidSourceId(id: string): boolean { + return /^ili:i\d+$/.test(id); +} + +// ── Tests ───────────────────────────────────────────────────────────────────── + +describe("stage 1 — omw.json validation", () => { + let records: OmwRecord[]; + + it("file exists and is valid JSON", async () => { + const raw = await fs.readFile(OMW_PATH, "utf-8"); + records = JSON.parse(raw) as OmwRecord[]; + expect(records).toBeDefined(); + }); + + it("is a non-empty array", async () => { + const raw = await fs.readFile(OMW_PATH, "utf-8"); + records = JSON.parse(raw) as OmwRecord[]; + expect(Array.isArray(records)).toBe(true); + expect(records.length).toBeGreaterThan(0); + }); + + it("every record has required fields", async () => { + const raw = await fs.readFile(OMW_PATH, "utf-8"); + records = JSON.parse(raw) as OmwRecord[]; + + const errors: string[] = []; + + for (const record of records) { + if (!record.source_id) { + errors.push(`missing source_id`); + continue; + } + if (!record.pos) errors.push(`${record.source_id}: missing pos`); + if (!record.translations) + errors.push(`${record.source_id}: missing translations`); + if (!record.glosses) errors.push(`${record.source_id}: missing glosses`); + if (!record.examples) + errors.push(`${record.source_id}: missing examples`); + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("every source_id matches ili:i{number} pattern", async () => { + const raw = await fs.readFile(OMW_PATH, "utf-8"); + records = JSON.parse(raw) as OmwRecord[]; + + const errors: string[] = []; + + for (const record of records) { + if (!isValidSourceId(record.source_id)) { + errors.push(`invalid source_id: ${record.source_id}`); + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("every source_id is unique", async () => { + const raw = await fs.readFile(OMW_PATH, "utf-8"); + records = JSON.parse(raw) as OmwRecord[]; + + const seen = new Set(); + const errors: string[] = []; + + for (const record of records) { + if (seen.has(record.source_id)) { + errors.push(`duplicate source_id: ${record.source_id}`); + } + seen.add(record.source_id); + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("every pos is a valid supported value", async () => { + const raw = await fs.readFile(OMW_PATH, "utf-8"); + records = JSON.parse(raw) as OmwRecord[]; + + const errors: string[] = []; + const validPos = new Set(SUPPORTED_POS); + + for (const record of records) { + if (!validPos.has(record.pos)) { + errors.push(`${record.source_id}: invalid pos "${record.pos}"`); + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("every record has at least one translation in at least one language", async () => { + const raw = await fs.readFile(OMW_PATH, "utf-8"); + records = JSON.parse(raw) as OmwRecord[]; + + const errors: string[] = []; + const validLangs = new Set(SUPPORTED_LANGUAGE_CODES); + + for (const record of records) { + const langs = Object.keys(record.translations) as SupportedLanguageCode[]; + + if (langs.length === 0) { + errors.push(`${record.source_id}: no translations`); + continue; + } + + for (const lang of langs) { + if (!validLangs.has(lang)) { + errors.push(`${record.source_id}: unsupported language "${lang}"`); + } + const words = record.translations[lang] ?? []; + if (words.length === 0) { + errors.push(`${record.source_id}: empty translations for "${lang}"`); + } + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("no duplicate translations within a single synset and language", async () => { + const raw = await fs.readFile(OMW_PATH, "utf-8"); + const records = JSON.parse(raw) as OmwRecord[]; + + const errors: string[] = []; + + for (const record of records) { + for (const [lang, words] of Object.entries(record.translations)) { + const seen = new Set(); + for (const word of words) { + if (seen.has(word)) { + errors.push( + `${record.source_id} (${lang}): duplicate translation "${word}"`, + ); + } + seen.add(word); + } + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); +}); diff --git a/data-pipeline/tests/validation/stage-2.validation.test.ts b/data-pipeline/tests/validation/stage-2.validation.test.ts new file mode 100644 index 0000000..b50fcf5 --- /dev/null +++ b/data-pipeline/tests/validation/stage-2.validation.test.ts @@ -0,0 +1,218 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { describe, it, expect, beforeAll } from "vitest"; +import { + SUPPORTED_POS, + SUPPORTED_LANGUAGE_CODES, + CEFR_LEVELS, +} from "@lila/shared"; +import type { SupportedPos, SupportedLanguageCode } from "@lila/shared"; + +// ── Types ───────────────────────────────────────────────────────────────────── + +type Example = { text: string; source: "omw" | "cefr" }; + +type AnnotatedRecord = { + source_id: string; + pos: SupportedPos; + translations: Partial>; + glosses: Partial>; + examples: Partial>; + votes: Partial< + Record> + >; +}; + +type ConflictEntry = { + word: string; + pos: string; + language: SupportedLanguageCode; + levels: string[]; +}; + +// ── Paths ───────────────────────────────────────────────────────────────────── + +const OUTPUT_DIR = path.resolve("stage-2-annotate/output"); + +// ── Tests ───────────────────────────────────────────────────────────────────── + +describe("stage 2 — annotated output validation", () => { + const recordsByLang = new Map(); + let conflicts: ConflictEntry[] = []; + + beforeAll(async () => { + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const raw = await fs.readFile( + path.join(OUTPUT_DIR, `${lang}.json`), + "utf-8", + ); + recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]); + } + const raw = await fs.readFile( + path.join(OUTPUT_DIR, "conflicts.json"), + "utf-8", + ); + conflicts = JSON.parse(raw) as ConflictEntry[]; + }, 60_000); + + it("all five language files exist", async () => { + const errors: string[] = []; + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const filePath = path.join(OUTPUT_DIR, `${lang}.json`); + try { + await fs.access(filePath); + } catch { + errors.push(`missing file: ${lang}.json`); + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("conflicts.json exists", async () => { + const filePath = path.join(OUTPUT_DIR, "conflicts.json"); + await expect(fs.access(filePath)).resolves.toBeUndefined(); + }); + + it("every language file is a non-empty array", () => { + const errors: string[] = []; + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const records = recordsByLang.get(lang)!; + if (!Array.isArray(records)) { + errors.push(`${lang}.json: not an array`); + } else if (records.length === 0) { + errors.push(`${lang}.json: empty array`); + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("every record has required fields", () => { + const errors: string[] = []; + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const records = recordsByLang.get(lang)!; + + for (const record of records) { + if (!record.source_id) { + errors.push(`${lang}: record missing source_id`); + continue; + } + if (!record.pos) + errors.push(`${lang} ${record.source_id}: missing pos`); + if (!record.translations) + errors.push(`${lang} ${record.source_id}: missing translations`); + if (!record.glosses) + errors.push(`${lang} ${record.source_id}: missing glosses`); + if (record.examples === undefined) + errors.push(`${lang} ${record.source_id}: missing examples`); + if (record.votes === undefined) + errors.push(`${lang} ${record.source_id}: missing votes`); + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("every pos is a valid supported value", () => { + const errors: string[] = []; + const validPos = new Set(SUPPORTED_POS); + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const records = recordsByLang.get(lang)!; + + for (const record of records) { + if (!validPos.has(record.pos)) { + errors.push( + `${lang} ${record.source_id}: invalid pos "${record.pos}"`, + ); + } + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("every example has text and a valid source", () => { + const errors: string[] = []; + const validSources = new Set(["omw", "cefr"]); + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const records = recordsByLang.get(lang)!; + + for (const record of records) { + for (const [l, examples] of Object.entries(record.examples)) { + for (const example of examples) { + if (!example.text) { + errors.push( + `${lang} ${record.source_id} (${l}): example missing text`, + ); + } + if (!validSources.has(example.source)) { + errors.push( + `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`, + ); + } + } + } + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("every cefr_source vote is a valid CEFR level", () => { + const errors: string[] = []; + const validLevels = new Set(CEFR_LEVELS); + + for (const lang of SUPPORTED_LANGUAGE_CODES) { + const records = recordsByLang.get(lang)!; + + for (const record of records) { + for (const [l, langVotes] of Object.entries(record.votes)) { + for (const [word, vote] of Object.entries(langVotes ?? {})) { + if ( + !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number]) + ) { + errors.push( + `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`, + ); + } + } + } + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); + + it("conflicts.json entries have required fields and valid CEFR levels", () => { + const errors: string[] = []; + const validLevels = new Set(CEFR_LEVELS); + const validLangs = new Set(SUPPORTED_LANGUAGE_CODES); + + for (const entry of conflicts) { + if (!entry.word) errors.push(`conflict missing word`); + if (!entry.pos) errors.push(`conflict missing pos`); + if (!entry.language) { + errors.push(`conflict missing language`); + } else if (!validLangs.has(entry.language)) { + errors.push(`conflict invalid language "${entry.language}"`); + } + if (!Array.isArray(entry.levels) || entry.levels.length < 2) { + errors.push(`${entry.word}: levels must have at least 2 entries`); + } else { + for (const level of entry.levels) { + if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) { + errors.push(`${entry.word}: invalid level "${level}"`); + } + } + } + } + + expect(errors, `\n${errors.join("\n")}`).toHaveLength(0); + }); +}); diff --git a/data-pipeline/vitest.config.ts b/data-pipeline/vitest.config.ts index bdc68ba..cafc658 100644 --- a/data-pipeline/vitest.config.ts +++ b/data-pipeline/vitest.config.ts @@ -6,5 +6,6 @@ export default defineConfig({ globals: true, include: ["tests/**/*.test.ts"], exclude: ["**/dist/**", "**/node_modules/**"], + testTimeout: 60_000, }, }); diff --git a/documentation/data-pipeline.md b/documentation/data-pipeline.md index 543bca7..0d3ed01 100644 --- a/documentation/data-pipeline.md +++ b/documentation/data-pipeline.md @@ -63,8 +63,9 @@ The database serves three purposes: - **Resolved output** — the final resolved records live here and are read by the sync script to seed the production database. -The schema is defined in `data-pipeline/db/schema.sql`. Never edit `pipeline.db` -directly — all writes go through the pipeline scripts. +The schema is defined in `data-pipeline/db/schema.sql`. Never edit `pipeline.db` directly — all writes go through the pipeline scripts. + +On first run the orchestrator initialises `pipeline.db` automatically and imports the stage 2 output into the base tables. This happens once — subsequent runs skip the import if the base tables are already populated. ## Data sources @@ -230,15 +231,11 @@ Words not present in the CEFR source file will have an empty `votes` object. > `http://127.0.0.1:8080/health` and exits with instructions if it is not > reachable. See `llm-setup.md` for setup instructions. -The enrich stage runs in two rounds, both designed to execute overnight one -model at a time. All output is written to `pipeline.db` atomically per record -— runs are fully resumable if interrupted. Each model is run once — one model -produces one vote. +The enrich stage runs in two rounds, both designed to execute overnight one model at a time. All output is written to `pipeline.db` atomically per record — runs are fully resumable if interrupted. Each model is run once — one model produces one vote. **Round 1 — generation** -Each model processes every word in every language one term at a time and -generates: +Each model processes every word in every language one term at a time and generates: - A CEFR level vote for each translation - A description for each language @@ -246,20 +243,11 @@ generates: - A gloss for each language, only if OMW provides none - Usage examples for each language, only if OMW provides none -OMW data is never duplicated — the script checks what OMW already provides -before building the prompt. For translations, glosses and examples, if OMW -data exists for that language the LLM skips generation entirely. This -significantly reduces compute time for languages with good OMW coverage such -as English. +OMW data is never duplicated — the script checks what OMW already provides before building the prompt. For translations, glosses and examples, if OMW data exists for that language the LLM skips generation entirely. This significantly reduces compute time for languages with good OMW coverage such as English. -All model-generated content is stored with an anonymised source (`model_1`, -`model_2` etc.) so models cannot be biased by knowing who generated what in -round 2. +All model-generated content is stored with an anonymised source (`model_1`, `model_2` etc.) so models cannot be biased by knowing who generated what in round 2. -Each record is written to `pipeline.db` with status `complete` or -`needs_review` immediately after processing. If a record fails structural -validation (invalid JSON, missing required fields, invalid CEFR value) it is -marked `needs_review` and skipped — the run continues without interruption. +Each record is written to `pipeline.db` with status `complete` or `needs_review` immediately after processing. If a record fails structural validation (invalid JSON, missing required fields, invalid CEFR value) it is marked `needs_review` and skipped — the run continues without interruption. **Input:** `stage-2-annotate/output/{lang}.json` **Output:** `pipeline.db` — round 1 results per record per model @@ -270,9 +258,7 @@ pnpm --filter @lila/pipeline enrich --round 1 --model {model} **Compiling candidates** -Once all round 1 runs are complete, compile all generated candidates into a -single structured record per term in `pipeline.db`. This is the input to -round 2. +Once all round 1 runs are complete, compile all generated candidates into a single structured record per term in `pipeline.db`. This is the input to round 2. ```bash pnpm --filter @lila/pipeline enrich --compile-candidates @@ -287,10 +273,7 @@ Each model receives the compiled candidate list for every word and votes on: - The best usage examples candidate (if multiple exist) - A CEFR level vote for each translation -OMW data is not put to a vote — it automatically wins over any LLM-generated -candidate. Round 2 only resolves conflicts between model-generated candidates. -The prompt is kept small — one word at a time, a clean numbered candidate -list — to fit within a limited context window. +OMW data is not put to a vote — it automatically wins over any LLM-generated candidate. Round 2 only resolves conflicts between model-generated candidates. The prompt is kept small — one word at a time, a clean numbered candidate list — to fit within a limited context window. **Input:** `pipeline.db` — compiled candidates **Output:** `pipeline.db` — round 2 votes per record per model @@ -301,8 +284,7 @@ pnpm --filter @lila/pipeline enrich --round 2 --model {model} **Compiling votes** -Once all round 2 runs are complete, compile all votes into a final votes -record per term in `pipeline.db`. This is the input to the merge stage. +Once all round 2 runs are complete, compile all votes into a final votes record per term in `pipeline.db`. This is the input to the merge stage. ```bash pnpm --filter @lila/pipeline enrich --compile-votes @@ -310,9 +292,7 @@ pnpm --filter @lila/pipeline enrich --compile-votes ### 4. Merge -Reads compiled votes from `pipeline.db` and resolves the final value for -every field. Updates each record in `pipeline.db` with status `final` or -`flagged`. +Reads compiled votes from `pipeline.db` and resolves the final value for every field. Updates each record in `pipeline.db` with status `final` or `flagged`. **Merge rules:** @@ -340,18 +320,9 @@ pnpm --filter @lila/pipeline merge ### 4b. Tiebreak -Runs automatically after merge if any translations remain flagged. The script -queries `pipeline.db` for flagged translations, identifies which configured -models have not yet voted on each word, and runs those models on the flagged -subset only. Merge is re-run after each tiebreaker pass. This repeats until -all flagged translations are resolved or no unused models remain. +Runs automatically after merge if any translations remain flagged. The script queries `pipeline.db` for flagged translations, identifies which configured models have not yet voted on each word, and runs those models on the flagged subset only. Merge is re-run after each tiebreaker pass. This repeats until all flagged translations are resolved or no unused models remain. -If unused models are exhausted and flagged translations remain, the script -logs a detailed report showing the exact vote split for each unresolved word -and lists available models from OpenRouter that have not been used. Seeding -is blocked until all translations are resolved. To continue, add one or more -models to the config and re-run the pipeline — the tiebreaker will pick up -automatically. +If unused models are exhausted and flagged translations remain, the script logs a detailed report showing the exact vote split for each unresolved word and lists available models from OpenRouter that have not been used. Seeding is blocked until all translations are resolved. To continue, add one or more models to the config and re-run the pipeline — the tiebreaker will pick up automatically. **Input:** `pipeline.db` — flagged translations from merge **Output:** `pipeline.db` — flagged translations resolved to `final` @@ -361,9 +332,7 @@ automatically. ### 5. Compare / QA -Read-only. Generates `COVERAGE.md` with a full breakdown of the pipeline -output quality per language. Run this after merge to verify output before -syncing to the database. +Read-only. Generates `COVERAGE.md` with a full breakdown of the pipeline output quality per language. Run this after merge to verify output before syncing to the database. **Input:** `pipeline.db` — records with status `final` **Output:** `COVERAGE.md` @@ -393,10 +362,7 @@ pnpm --filter @lila/pipeline compare ## Sync -The sync script transfers all records with status `final` in `pipeline.db` to -the production PostgreSQL database. It is upsert-based and never wipes -existing data. For each record it checks whether a matching `source_id` -already exists in the target database: +The sync script transfers all records with status `final` in `pipeline.db` to the production PostgreSQL database. It is upsert-based and never wipes existing data. For each record it checks whether a matching `source_id` already exists in the target database: - **Missing** → insert - **Present but changed** → update @@ -408,14 +374,11 @@ Run this after all records are resolved and Compare / QA has been reviewed. pnpm --filter @lila/pipeline sync ``` -The sync script requires a connection string to the target database. Set -`DATABASE_URL` in your `.env` file before running. +The sync script requires a connection string to the target database. Set `DATABASE_URL` in your `.env` file before running. ## Reports -The pipeline generates a report at the end of every run. Reports are written -to `data-pipeline/reports/` as a JSON file and a markdown file with the same -name. The markdown is generated from the JSON and contains identical data. +The pipeline generates a report at the end of every run. Reports are written to `data-pipeline/reports/` as a JSON file and a markdown file with the same name. The markdown is generated from the JSON and contains identical data. ``` data-pipeline/reports/ @@ -497,10 +460,7 @@ dataset matures: ## Roadmap -**Current state:** Stages 1 and 2 are complete and output has been reviewed -for all five languages. Architecture for stages 3–6, the tiebreaker, and the -report system are finalised. Stage 3 scripts have not been written yet and -llama.cpp is not installed. +**Current state:** Stages 1 and 2 are complete, validated, and imported into `pipeline.db`. Schema, init, import scripts, validation tests, and fixtures are all in place. Stage 3 scripts have not been written yet and llama.cpp is not installed. **Next action:** Write the stage 3 round 1 script. @@ -523,6 +483,11 @@ llama.cpp is not installed. - [x] Write annotation script - [x] Run annotation → per-language JSON + `conflicts.json` +- [x] Add annotate script to package.json +- [x] Fix duplicate translations in extract.py +- [x] Write stage 1 and 2 validation tests +- [x] Write db schema, init, and import scripts +- [x] Write test fixtures ### Stage 3 — Enrich `🔲 not started`