feat: add stage 1 and 2 validation tests

2026-05-03 21:36:56 +02:00 · 2026-05-03 21:36:56 +02:00 · 4a842140b9
commit 4a842140b9
parent 4fa3073412
5 changed files with 536 additions and 0 deletions
--- a/data-pipeline/tests/fixtures/annotated.fixture.json
+++ b/data-pipeline/tests/fixtures/annotated.fixture.json
@ -0,0 +1,170 @@
 [
  {
    "_fixture": "noun_with_cefr_vote",
    "source_id": "ili:i100955",
    "pos": "noun",
    "translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
    "glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
    "examples": {
      "en": [
        { "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
      ]
    },
    "votes": { "en": { "grain": { "cefr_source": "B1" } } }
  },
  {
    "_fixture": "verb_no_votes_no_translations",
    "source_id": "ili:i21779",
    "pos": "verb",
    "translations": { "en": ["respire"] },
    "glosses": {
      "en": [
        "undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
      ]
    },
    "examples": {},
    "votes": {}
  },
  {
    "_fixture": "verb_with_cefr_vote_all_languages",
    "source_id": "ili:i21778",
    "pos": "verb",
    "translations": {
      "en": ["breathe", "take a breath", "respire", "suspire"],
      "it": ["respirare"],
      "es": ["aspirar", "respirar"],
      "de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
      "fr": ["inspirer", "respirer"]
    },
    "glosses": {
      "en": ["draw air into, and expel out of, the lungs"],
      "de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
    },
    "examples": {
      "en": [
        {
          "text": "I can breathe better when the air is clean",
          "source": "omw"
        },
        { "text": "The patient is respiring", "source": "omw" }
      ]
    },
    "votes": { "en": { "breathe": { "cefr_source": "A1" } } }
  },
  {
    "_fixture": "adjective_all_languages_multiple_translations",
    "source_id": "ili:i10007",
    "pos": "adjective",
    "translations": {
      "en": ["possible"],
      "it": [
        "attuabile",
        "effettuabile",
        "eseguibile",
        "fattibile",
        "operabile",
        "possibile",
        "producibile",
        "realizzabile"
      ],
      "es": ["posible"],
      "de": [
        "möglich",
        "denkbar",
        "eventuell",
        "möglicherweise",
        "allfällig",
        "etwaig",
        "gegebenenfalls",
        "eventuell"
      ],
      "fr": ["possible", "éventuel"]
    },
    "glosses": {
      "en": ["capable of happening or existing"],
      "de": ["in der Lage, zu geschehen oder zu existieren"]
    },
    "examples": {
      "en": [
        { "text": "a breakthrough may be possible next year", "source": "omw" },
        { "text": "anything is possible", "source": "omw" },
        { "text": "warned of possible consequences", "source": "omw" }
      ]
    },
    "votes": { "en": { "possible": { "cefr_source": "A2" } } }
  },
  {
    "_fixture": "adjective_multiple_de_votes_cefr_examples",
    "source_id": "ili:i10000",
    "pos": "adjective",
    "translations": {
      "en": ["negative"],
      "de": [
        "dürftig",
        "zu wünschen übrig lassen",
        "schlecht",
        "widrig",
        "ungut",
        "lausig",
        "negativ",
        "von Nachteil",
        "schädlich",
        "nachteilig",
        "ungünstig"
      ],
      "fr": ["négatif", "strictement négatif"]
    },
    "glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
    "examples": {
      "en": [{ "text": "a negative number", "source": "omw" }],
      "de": [
        { "text": "Die Beweise waren dürftig.", "source": "cefr" },
        { "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
        {
          "text": "Trotz widriger Umstände haben sie es geschafft.",
          "source": "cefr"
        },
        {
          "text": "Er hatte ein ungutes Gefühl bei der Sache.",
          "source": "cefr"
        },
        { "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
        {
          "text": "Rauchen ist schädlich für die Gesundheit.",
          "source": "cefr"
        },
        {
          "text": "Diese Entscheidung könnte nachteilig sein.",
          "source": "cefr"
        },
        {
          "text": "Das Wetter ist heute ungünstig für einen Ausflug.",
          "source": "cefr"
        }
      ]
    },
    "votes": {
      "de": {
        "dürftig": { "cefr_source": "C1" },
        "schlecht": { "cefr_source": "A1" },
        "widrig": { "cefr_source": "C1" },
        "ungut": { "cefr_source": "B2" },
        "negativ": { "cefr_source": "A2" },
        "schädlich": { "cefr_source": "B1" },
        "nachteilig": { "cefr_source": "B1" },
        "ungünstig": { "cefr_source": "B2" }
      }
    }
  },
  {
    "_fixture": "adverb_no_votes",
    "source_id": "ili:i18157",
    "pos": "adverb",
    "translations": { "en": ["a cappella"], "es": ["a capella"] },
    "glosses": { "en": ["without musical accompaniment"] },
    "examples": {
      "en": [{ "text": "they performed a cappella", "source": "omw" }]
    },
    "votes": {}
  }
 ]
--- a/data-pipeline/tests/fixtures/conflicts.fixture.json
+++ b/data-pipeline/tests/fixtures/conflicts.fixture.json
@ -0,0 +1,4 @@
 [
  { "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
  { "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
 ]
--- a/data-pipeline/tests/validation/stage-1.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@ -0,0 +1,143 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { describe, it, expect } from "vitest";
 import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 /** Lists of strings keyed by language code; any language may be absent. */
 type StringsByLanguage = Partial<Record<SupportedLanguageCode, string[]>>;
 /**
  * Shape this suite assumes for one entry of the parsed omw.json array.
  * The type is only asserted onto the `JSON.parse` result, so the tests below
  * are what actually verify it at runtime.
  */
 type OmwRecord = {
  /** Identifier; the tests require the form `ili:i<digits>` and uniqueness. */
  source_id: string;
  /** Part of speech; the tests require a member of SUPPORTED_POS. */
  pos: SupportedPos;
  translations: StringsByLanguage;
  glosses: StringsByLanguage;
  examples: StringsByLanguage;
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 // Absolute path of the file under test. `path.resolve` with a single relative
 // segment resolves against the current working directory, so the result
 // depends on where the test runner is started.
 // NOTE(review): presumably the runner is launched from data-pipeline/ — confirm.
 const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
 // ── Helpers ───────────────────────────────────────────────────────────────────
 // The literal prefix "ili:i" followed by one or more digits, anchored at both ends.
 const ILI_SOURCE_ID_PATTERN = /^ili:i\d+$/;
 /**
  * Reports whether `id` has the `ili:i<digits>` form.
  *
  * @param id - candidate source identifier
  * @returns true only when the whole string matches the pattern
  */
 function isValidSourceId(id: string): boolean {
  return ILI_SOURCE_ID_PATTERN.test(id);
 }
 // ── Tests ─────────────────────────────────────────────────────────────────────
 /**
  * Structural checks on the stage 1 output file at OMW_PATH: it must parse as
  * a non-empty JSON array whose records carry the required fields, a unique
  * well-formed source_id, a supported pos and at least one non-empty
  * translation list in a supported language.
  *
  * Each test collects every violation into `errors` and asserts the list is
  * empty, so a single run reports all offending records rather than the first.
  */
 describe("stage 1 — omw.json validation", () => {
  // Cached read + parse of omw.json. The file is loaded on first use and the
  // same promise is handed to every test, so it is read and parsed once
  // instead of once per test.
  let recordsPromise: Promise<OmwRecord[]> | undefined;

  /**
   * Returns the parsed contents of omw.json, loading the file on first call.
   *
   * A read or parse failure rejects the cached promise; every test awaits it,
   * so each test still fails with the underlying error.
   */
  function loadRecords(): Promise<OmwRecord[]> {
    if (recordsPromise === undefined) {
      recordsPromise = fs
        .readFile(OMW_PATH, "utf-8")
        .then((raw) => JSON.parse(raw) as OmwRecord[]);
    }
    return recordsPromise;
  }

  it("file exists and is valid JSON", async () => {
    const records = await loadRecords();
    expect(records).toBeDefined();
  });
  it("is a non-empty array", async () => {
    const records = await loadRecords();
    expect(Array.isArray(records)).toBe(true);
    expect(records.length).toBeGreaterThan(0);
  });
  it("every record has required fields", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      // Without a source_id the remaining messages could not name the record.
      if (!record.source_id) {
        errors.push(`missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${record.source_id}: missing translations`);
      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
      if (!record.examples)
        errors.push(`${record.source_id}: missing examples`);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every source_id matches ili:i{number} pattern", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!isValidSourceId(record.source_id)) {
        errors.push(`invalid source_id: ${record.source_id}`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every source_id is unique", async () => {
    const records = await loadRecords();
    const seen = new Set<string>();
    const errors: string[] = [];
    for (const record of records) {
      if (seen.has(record.source_id)) {
        errors.push(`duplicate source_id: ${record.source_id}`);
      }
      seen.add(record.source_id);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every pos is a valid supported value", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const record of records) {
      if (!validPos.has(record.pos)) {
        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every record has at least one translation in at least one language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const record of records) {
      // A record with no `translations` object is reported as having no
      // translations instead of aborting the test with a TypeError.
      const translations: OmwRecord["translations"] =
        record.translations ?? {};
      const langs = Object.keys(translations) as SupportedLanguageCode[];
      if (langs.length === 0) {
        errors.push(`${record.source_id}: no translations`);
        continue;
      }
      for (const lang of langs) {
        if (!validLangs.has(lang)) {
          errors.push(`${record.source_id}: unsupported language "${lang}"`);
        }
        const words = translations[lang] ?? [];
        if (words.length === 0) {
          errors.push(`${record.source_id}: empty translations for "${lang}"`);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
 });
--- a/data-pipeline/tests/validation/stage-2.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-2.validation.test.ts
@ -0,0 +1,218 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { describe, it, expect, beforeAll } from "vitest";
 import {
  SUPPORTED_POS,
  SUPPORTED_LANGUAGE_CODES,
  CEFR_LEVELS,
 } from "@lila/shared";
 import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 /** Origin tags accepted for an example sentence. */
 type ExampleSource = "omw" | "cefr";
 /** One example sentence together with the tag naming where it came from. */
 type Example = { text: string; source: ExampleSource };
 /** A value per language code; any language may be absent. */
 type ByLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;
 /** A single vote; the tests require `cefr_source` to be one of CEFR_LEVELS. */
 type CefrVote = { cefr_source: string };
 /**
  * Shape this suite assumes for one entry of a parsed `<lang>.json` array.
  * The type is only asserted onto the `JSON.parse` result; the tests below
  * verify it at runtime.
  */
 type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: ByLanguage<string[]>;
  glosses: ByLanguage<string[]>;
  examples: ByLanguage<Example[]>;
  /** Votes keyed by language code, then by word. */
  votes: ByLanguage<Record<string, CefrVote>>;
 };
 /**
  * Shape this suite assumes for one entry of conflicts.json. The tests require
  * `levels` to hold at least two values, each a member of CEFR_LEVELS.
  */
 type ConflictEntry = {
  word: string;
  pos: string;
  language: SupportedLanguageCode;
  levels: string[];
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 // Absolute path of the directory holding the `<lang>.json` files and
 // conflicts.json read by this suite. `path.resolve` with a single relative
 // segment resolves against the current working directory, so the result
 // depends on where the test runner is started.
 // NOTE(review): presumably the runner is launched from data-pipeline/ — confirm.
 const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
 // ── Tests ─────────────────────────────────────────────────────────────────────
 /**
  * Structural checks on the stage 2 output in OUTPUT_DIR: one `<lang>.json`
  * file per entry of SUPPORTED_LANGUAGE_CODES plus conflicts.json.
  *
  * All files are read and parsed once in `beforeAll`. Each test collects every
  * violation into `errors` and asserts the list is empty, so a single run
  * reports all offending records rather than the first.
  */
 describe("stage 2 — annotated output validation", () => {
  // Parsed `<lang>.json` contents, filled by beforeAll.
  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
  let conflicts: ConflictEntry[] = [];

  /** Reads and parses one JSON file from OUTPUT_DIR; rejects if it is missing or malformed. */
  async function readOutputJson(fileName: string): Promise<unknown> {
    const raw = await fs.readFile(path.join(OUTPUT_DIR, fileName), "utf-8");
    return JSON.parse(raw);
  }

  /**
   * Returns the records loaded for `lang`.
   *
   * @throws Error when beforeAll did not store an entry for `lang`, so a test
   *   fails with a clear message instead of dereferencing `undefined`.
   */
  function recordsFor(lang: SupportedLanguageCode): AnnotatedRecord[] {
    const records = recordsByLang.get(lang);
    if (records === undefined) {
      throw new Error(`${lang}.json was not loaded`);
    }
    return records;
  }

  beforeAll(async () => {
    // The files are independent of each other, so they are read concurrently.
    // Any missing or malformed file rejects the hook.
    const [perLanguage, parsedConflicts] = await Promise.all([
      Promise.all(
        [...SUPPORTED_LANGUAGE_CODES].map(async (lang) => {
          const parsed = await readOutputJson(`${lang}.json`);
          return [lang, parsed as AnnotatedRecord[]] as const;
        }),
      ),
      readOutputJson("conflicts.json"),
    ]);
    // Promise.all keeps input order, so entries are stored in
    // SUPPORTED_LANGUAGE_CODES order.
    for (const [lang, records] of perLanguage) {
      recordsByLang.set(lang, records);
    }
    conflicts = parsedConflicts as ConflictEntry[];
  }, 60_000);
  it("all five language files exist", async () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
      try {
        await fs.access(filePath);
      } catch {
        errors.push(`missing file: ${lang}.json`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("conflicts.json exists", async () => {
    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
    await expect(fs.access(filePath)).resolves.toBeUndefined();
  });
  it("every language file is a non-empty array", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsFor(lang);
      if (!Array.isArray(records)) {
        errors.push(`${lang}.json: not an array`);
      } else if (records.length === 0) {
        errors.push(`${lang}.json: empty array`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every record has required fields", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        // Without a source_id the remaining messages could not name the record.
        if (!record.source_id) {
          errors.push(`${lang}: record missing source_id`);
          continue;
        }
        if (!record.pos)
          errors.push(`${lang} ${record.source_id}: missing pos`);
        if (!record.translations)
          errors.push(`${lang} ${record.source_id}: missing translations`);
        if (!record.glosses)
          errors.push(`${lang} ${record.source_id}: missing glosses`);
        // examples and votes are only required to be present.
        if (record.examples === undefined)
          errors.push(`${lang} ${record.source_id}: missing examples`);
        if (record.votes === undefined)
          errors.push(`${lang} ${record.source_id}: missing votes`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every pos is a valid supported value", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        if (!validPos.has(record.pos)) {
          errors.push(
            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
          );
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every example has text and a valid source", () => {
    const errors: string[] = [];
    const validSources = new Set(["omw", "cefr"]);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        // A missing `examples` object is the required-fields test's concern;
        // here it simply contributes no examples.
        const examplesByLang: AnnotatedRecord["examples"] =
          record.examples ?? {};
        for (const [l, examples] of Object.entries(examplesByLang)) {
          // Values of a Partial record may be undefined.
          for (const example of examples ?? []) {
            if (!example.text) {
              errors.push(
                `${lang} ${record.source_id} (${l}): example missing text`,
              );
            }
            if (!validSources.has(example.source)) {
              errors.push(
                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
              );
            }
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every cefr_source vote is a valid CEFR level", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        // A missing `votes` object is the required-fields test's concern;
        // here it simply contributes no votes.
        const votesByLang: AnnotatedRecord["votes"] = record.votes ?? {};
        for (const [l, langVotes] of Object.entries(votesByLang)) {
          for (const [word, vote] of Object.entries(langVotes ?? {})) {
            if (
              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
            ) {
              errors.push(
                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
              );
            }
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("conflicts.json entries have required fields and valid CEFR levels", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const [index, entry] of conflicts.entries()) {
      // Fall back to the entry's position so the level messages never read
      // "undefined: ..." when the word itself is missing.
      const label = entry.word ? entry.word : `conflict #${index}`;
      if (!entry.word) errors.push(`conflict missing word`);
      if (!entry.pos) errors.push(`conflict missing pos`);
      if (!entry.language) {
        errors.push(`conflict missing language`);
      } else if (!validLangs.has(entry.language)) {
        errors.push(`conflict invalid language "${entry.language}"`);
      }
      // A conflict is only meaningful with at least two disagreeing levels.
      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
        errors.push(`${label}: levels must have at least 2 entries`);
      } else {
        for (const level of entry.levels) {
          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
            errors.push(`${label}: invalid level "${level}"`);
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
 });
--- a/data-pipeline/vitest.config.ts
+++ b/data-pipeline/vitest.config.ts
@ -6,5 +6,6 @@ export default defineConfig({
    globals: true,
    include: ["tests/**/*.test.ts"],
    exclude: ["**/dist/**", "**/node_modules/**"],
    testTimeout: 60_000,
  },
 });