diff --git a/data-pipeline/tests/fixtures/annotated.fixture.json b/data-pipeline/tests/fixtures/annotated.fixture.json
new file mode 100644
index 0000000..f941bd0
--- /dev/null
+++ b/data-pipeline/tests/fixtures/annotated.fixture.json
@@ -0,0 +1,170 @@
+[
+  {
+    "_fixture": "noun_with_cefr_vote",
+    "source_id": "ili:i100955",
+    "pos": "noun",
+    "translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
+    "glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
+    "examples": {
+      "en": [
+        { "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
+      ]
+    },
+    "votes": { "en": { "grain": { "cefr_source": "B1" } } }
+  },
+  {
+    "_fixture": "verb_no_votes_no_translations",
+    "source_id": "ili:i21779",
+    "pos": "verb",
+    "translations": { "en": ["respire"] },
+    "glosses": {
+      "en": [
+        "undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
+      ]
+    },
+    "examples": {},
+    "votes": {}
+  },
+  {
+    "_fixture": "verb_with_cefr_vote_all_languages",
+    "source_id": "ili:i21778",
+    "pos": "verb",
+    "translations": {
+      "en": ["breathe", "take a breath", "respire", "suspire"],
+      "it": ["respirare"],
+      "es": ["aspirar", "respirar"],
+      "de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
+      "fr": ["inspirer", "respirer"]
+    },
+    "glosses": {
+      "en": ["draw air into, and expel out of, the lungs"],
+      "de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
+    },
+    "examples": {
+      "en": [
+        {
+          "text": "I can breathe better when the air is clean",
+          "source": "omw"
+        },
+        { "text": "The patient is respiring", "source": "omw" }
+      ]
+    },
+    "votes": { "en": { "breathe": { "cefr_source": "A1" } } }
+  },
+  {
+    "_fixture": "adjective_all_languages_multiple_translations",
+    "source_id": "ili:i10007",
+    "pos": "adjective",
+    "translations": {
+      "en": ["possible"],
+      "it": [
+        "attuabile",
+        "effettuabile",
+        "eseguibile",
+        "fattibile",
+        "operabile",
+        "possibile",
+        "producibile",
+        "realizzabile"
+      ],
+      "es": ["posible"],
+      "de": [
+        "möglich",
+        "denkbar",
+        "eventuell",
+        "möglicherweise",
+        "allfällig",
+        "etwaig",
+        "gegebenenfalls",
+        "eventuell"
+      ],
+      "fr": ["possible", "éventuel"]
+    },
+    "glosses": {
+      "en": ["capable of happening or existing"],
+      "de": ["in der Lage, zu geschehen oder zu existieren"]
+    },
+    "examples": {
+      "en": [
+        { "text": "a breakthrough may be possible next year", "source": "omw" },
+        { "text": "anything is possible", "source": "omw" },
+        { "text": "warned of possible consequences", "source": "omw" }
+      ]
+    },
+    "votes": { "en": { "possible": { "cefr_source": "A2" } } }
+  },
+  {
+    "_fixture": "adjective_multiple_de_votes_cefr_examples",
+    "source_id": "ili:i10000",
+    "pos": "adjective",
+    "translations": {
+      "en": ["negative"],
+      "de": [
+        "dürftig",
+        "zu wünschen übrig lassen",
+        "schlecht",
+        "widrig",
+        "ungut",
+        "lausig",
+        "negativ",
+        "von Nachteil",
+        "schädlich",
+        "nachteilig",
+        "ungünstig"
+      ],
+      "fr": ["négatif", "strictement négatif"]
+    },
+    "glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
+    "examples": {
+      "en": [{ "text": "a negative number", "source": "omw" }],
+      "de": [
+        { "text": "Die Beweise waren dürftig.", "source": "cefr" },
+        { "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
+        {
+          "text": "Trotz widriger Umstände haben sie es geschafft.",
+          "source": "cefr"
+        },
+        {
+          "text": "Er hatte ein ungutes Gefühl bei der Sache.",
+          "source": "cefr"
+        },
+        { "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
+        {
+          "text": "Rauchen ist schädlich für die Gesundheit.",
+          "source": "cefr"
+        },
+        {
+          "text": "Diese Entscheidung könnte nachteilig sein.",
+          "source": "cefr"
+        },
+        {
+          "text": "Das Wetter ist heute ungünstig für einen Ausflug.",
+          "source": "cefr"
+        }
+      ]
+    },
+    "votes": {
+      "de": {
+        "dürftig": { "cefr_source": "C1" },
+        "schlecht": { "cefr_source": "A1" },
+        "widrig": { "cefr_source": "C1" },
+        "ungut": { "cefr_source": "B2" },
+        "negativ": { "cefr_source": "A2" },
+        "schädlich": { "cefr_source": "B1" },
+        "nachteilig": { "cefr_source": "B1" },
+        "ungünstig": { "cefr_source": "B2" }
+      }
+    }
+  },
+  {
+    "_fixture": "adverb_no_votes",
+    "source_id": "ili:i18157",
+    "pos": "adverb",
+    "translations": { "en": ["a cappella"], "es": ["a capella"] },
+    "glosses": { "en": ["without musical accompaniment"] },
+    "examples": {
+      "en": [{ "text": "they performed a cappella", "source": "omw" }]
+    },
+    "votes": {}
+  }
+]
diff --git a/data-pipeline/tests/fixtures/conflicts.fixture.json b/data-pipeline/tests/fixtures/conflicts.fixture.json
new file mode 100644
index 0000000..37f5111
--- /dev/null
+++ b/data-pipeline/tests/fixtures/conflicts.fixture.json
@@ -0,0 +1,4 @@
+[
+  { "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
+  { "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
+]
diff --git a/data-pipeline/tests/validation/stage-1.validation.test.ts b/data-pipeline/tests/validation/stage-1.validation.test.ts
new file mode 100644
index 0000000..287be9e
--- /dev/null
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@@ -0,0 +1,154 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { describe, it, expect } from "vitest";
+import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Shape of one record in omw.json as these tests read it. The file is parsed
+ * without schema validation, so the tests check these fields at runtime
+ * rather than trusting the type.
+ */
+type OmwRecord = {
+  source_id: string;
+  pos: SupportedPos;
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  // NOTE(review): no test here inspects example elements — confirm the element
+  // type against the stage 1 extractor before narrowing it.
+  examples: Partial<Record<SupportedLanguageCode, unknown[]>>;
+};
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/** Checks that `id` is `ili:i` followed by one or more digits. */
+function isValidSourceId(id: string): boolean {
+  return /^ili:i\d+$/.test(id);
+}
+
+let cachedRecords: Promise<OmwRecord[]> | undefined;
+
+/**
+ * Reads and parses omw.json once and shares the result across tests. If the
+ * read or parse fails, the cached promise rejects and every test awaiting it
+ * fails with that error.
+ */
+function loadRecords(): Promise<OmwRecord[]> {
+  cachedRecords ??= fs
+    .readFile(OMW_PATH, "utf-8")
+    .then((raw) => JSON.parse(raw) as OmwRecord[]);
+  return cachedRecords;
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("stage 1 — omw.json validation", () => {
+  it("file exists and is valid JSON", async () => {
+    const records = await loadRecords();
+    expect(records).toBeDefined();
+  });
+
+  it("is a non-empty array", async () => {
+    const records = await loadRecords();
+    expect(Array.isArray(records)).toBe(true);
+    expect(records.length).toBeGreaterThan(0);
+  });
+
+  it("every record has required fields", async () => {
+    const records = await loadRecords();
+    const errors: string[] = [];
+
+    for (const record of records) {
+      if (!record.source_id) {
+        errors.push(`missing source_id`);
+        continue;
+      }
+      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
+      if (!record.translations)
+        errors.push(`${record.source_id}: missing translations`);
+      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
+      if (!record.examples)
+        errors.push(`${record.source_id}: missing examples`);
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every source_id matches ili:i{number} pattern", async () => {
+    const records = await loadRecords();
+    const errors: string[] = [];
+
+    for (const record of records) {
+      if (!isValidSourceId(record.source_id)) {
+        errors.push(`invalid source_id: ${record.source_id}`);
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every source_id is unique", async () => {
+    const records = await loadRecords();
+    const seen = new Set<string>();
+    const errors: string[] = [];
+
+    for (const record of records) {
+      if (seen.has(record.source_id)) {
+        errors.push(`duplicate source_id: ${record.source_id}`);
+      }
+      seen.add(record.source_id);
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every pos is a valid supported value", async () => {
+    const records = await loadRecords();
+    const errors: string[] = [];
+    const validPos = new Set(SUPPORTED_POS);
+
+    for (const record of records) {
+      if (!validPos.has(record.pos)) {
+        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every record has at least one translation in at least one language", async () => {
+    const records = await loadRecords();
+    const errors: string[] = [];
+    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
+
+    for (const record of records) {
+      // A record without `translations` is reported as "no translations"
+      // instead of aborting the test with a TypeError from Object.keys.
+      const translations = record.translations ?? {};
+      const langs = Object.keys(translations) as SupportedLanguageCode[];
+
+      if (langs.length === 0) {
+        errors.push(`${record.source_id}: no translations`);
+        continue;
+      }
+
+      for (const lang of langs) {
+        if (!validLangs.has(lang)) {
+          errors.push(`${record.source_id}: unsupported language "${lang}"`);
+        }
+        const words = translations[lang] ?? [];
+        if (words.length === 0) {
+          errors.push(`${record.source_id}: empty translations for "${lang}"`);
+        }
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+});
diff --git a/data-pipeline/tests/validation/stage-2.validation.test.ts b/data-pipeline/tests/validation/stage-2.validation.test.ts
new file mode 100644
index 0000000..b50fcf5
--- /dev/null
+++ b/data-pipeline/tests/validation/stage-2.validation.test.ts
@@ -0,0 +1,229 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { describe, it, expect, beforeAll } from "vitest";
+import {
+  SUPPORTED_POS,
+  SUPPORTED_LANGUAGE_CODES,
+  CEFR_LEVELS,
+} from "@lila/shared";
+import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/** Example sentence attached to a record, tagged with its source. */
+type Example = { text: string; source: "omw" | "cefr" };
+
+/**
+ * Shape of one record in a per-language output file as these tests read it.
+ * Files are parsed without schema validation, so fields are checked at runtime.
+ */
+type AnnotatedRecord = {
+  source_id: string;
+  pos: SupportedPos;
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
+  votes: Partial<
+    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
+  >;
+};
+
+/** One entry of conflicts.json as these tests read it. */
+type ConflictEntry = {
+  word: string;
+  pos: string;
+  language: SupportedLanguageCode;
+  levels: string[];
+};
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("stage 2 — annotated output validation", () => {
+  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
+  let conflicts: ConflictEntry[] = [];
+
+  // Load every output file once. A missing or malformed file rejects here,
+  // before any test body runs.
+  beforeAll(async () => {
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const raw = await fs.readFile(
+        path.join(OUTPUT_DIR, `${lang}.json`),
+        "utf-8",
+      );
+      recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]);
+    }
+    const raw = await fs.readFile(
+      path.join(OUTPUT_DIR, "conflicts.json"),
+      "utf-8",
+    );
+    conflicts = JSON.parse(raw) as ConflictEntry[];
+  }, 60_000);
+
+  it("all five language files exist", async () => {
+    const errors: string[] = [];
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
+      try {
+        await fs.access(filePath);
+      } catch {
+        errors.push(`missing file: ${lang}.json`);
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("conflicts.json exists", async () => {
+    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
+    await expect(fs.access(filePath)).resolves.toBeUndefined();
+  });
+
+  it("every language file is a non-empty array", () => {
+    const errors: string[] = [];
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const records = recordsByLang.get(lang)!;
+      if (!Array.isArray(records)) {
+        errors.push(`${lang}.json: not an array`);
+      } else if (records.length === 0) {
+        errors.push(`${lang}.json: empty array`);
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every record has required fields", () => {
+    const errors: string[] = [];
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const records = recordsByLang.get(lang)!;
+
+      for (const record of records) {
+        if (!record.source_id) {
+          errors.push(`${lang}: record missing source_id`);
+          continue;
+        }
+        if (!record.pos)
+          errors.push(`${lang} ${record.source_id}: missing pos`);
+        if (!record.translations)
+          errors.push(`${lang} ${record.source_id}: missing translations`);
+        if (!record.glosses)
+          errors.push(`${lang} ${record.source_id}: missing glosses`);
+        if (record.examples === undefined)
+          errors.push(`${lang} ${record.source_id}: missing examples`);
+        if (record.votes === undefined)
+          errors.push(`${lang} ${record.source_id}: missing votes`);
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every pos is a valid supported value", () => {
+    const errors: string[] = [];
+    const validPos = new Set(SUPPORTED_POS);
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const records = recordsByLang.get(lang)!;
+
+      for (const record of records) {
+        if (!validPos.has(record.pos)) {
+          errors.push(
+            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
+          );
+        }
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every example has text and a valid source", () => {
+    const errors: string[] = [];
+    const validSources = new Set(["omw", "cefr"]);
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const records = recordsByLang.get(lang)!;
+
+      for (const record of records) {
+        // `?? {}` / `?? []`: absent examples are reported by the required-fields
+        // test; here they must not abort the loop with a TypeError.
+        for (const [l, examples] of Object.entries(record.examples ?? {})) {
+          for (const example of examples ?? []) {
+            if (!example.text) {
+              errors.push(
+                `${lang} ${record.source_id} (${l}): example missing text`,
+              );
+            }
+            if (!validSources.has(example.source)) {
+              errors.push(
+                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
+              );
+            }
+          }
+        }
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every cefr_source vote is a valid CEFR level", () => {
+    const errors: string[] = [];
+    const validLevels = new Set(CEFR_LEVELS);
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const records = recordsByLang.get(lang)!;
+
+      for (const record of records) {
+        // Same guard for a record with no `votes` object.
+        for (const [l, langVotes] of Object.entries(record.votes ?? {})) {
+          for (const [word, vote] of Object.entries(langVotes ?? {})) {
+            if (
+              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
+            ) {
+              errors.push(
+                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
+              );
+            }
+          }
+        }
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("conflicts.json entries have required fields and valid CEFR levels", () => {
+    const errors: string[] = [];
+    const validLevels = new Set(CEFR_LEVELS);
+    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
+
+    for (const entry of conflicts) {
+      if (!entry.word) errors.push(`conflict missing word`);
+      if (!entry.pos) errors.push(`conflict missing pos`);
+      if (!entry.language) {
+        errors.push(`conflict missing language`);
+      } else if (!validLangs.has(entry.language)) {
+        errors.push(`conflict invalid language "${entry.language}"`);
+      }
+      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
+        errors.push(`${entry.word}: levels must have at least 2 entries`);
+      } else {
+        for (const level of entry.levels) {
+          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
+            errors.push(`${entry.word}: invalid level "${level}"`);
+          }
+        }
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+});
diff --git a/data-pipeline/vitest.config.ts b/data-pipeline/vitest.config.ts
index bdc68ba..cafc658 100644
--- a/data-pipeline/vitest.config.ts
+++ b/data-pipeline/vitest.config.ts
@@ -6,5 +6,6 @@ export default defineConfig({
     globals: true,
     include: ["tests/**/*.test.ts"],
     exclude: ["**/dist/**", "**/node_modules/**"],
+    testTimeout: 60_000,
   },
 });