feat: add stage 1 and 2 validation tests

This commit is contained in:
lila 2026-05-03 21:36:56 +02:00
parent 4fa3073412
commit 4a842140b9
5 changed files with 536 additions and 0 deletions

View file

@ -0,0 +1,143 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect } from "vitest";
import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/**
 * Shape that each entry of the stage 1 output is cast to by the tests in this
 * file. The cast is unchecked; the tests themselves verify the fields.
 */
interface OmwRecord {
  /** Record identifier; the tests require the form accepted by isValidSourceId. */
  source_id: string;
  /** Part of speech; the tests require a member of SUPPORTED_POS. */
  pos: SupportedPos;
  /** Translated words keyed by language code; any language may be absent. */
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Gloss strings keyed by language code; any language may be absent. */
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Example strings keyed by language code; any language may be absent. */
  examples: Partial<Record<SupportedLanguageCode, string[]>>;
}
// ── Paths ─────────────────────────────────────────────────────────────────────
// Absolute location of the stage 1 output. The relative part is anchored at the
// current working directory, which is what path.resolve does implicitly when
// given only a relative segment.
const OMW_PATH: string = path.resolve(
  process.cwd(),
  "stage-1-extract/output/omw.json",
);
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Reports whether `id` consists of the literal prefix "ili:i" followed by one
 * or more digits, with nothing before or after.
 *
 * @param id - Candidate source identifier.
 * @returns `true` when the whole string matches, `false` otherwise.
 */
function isValidSourceId(id: string): boolean {
  const sourceIdPattern = /^ili:i\d+$/;
  const matches = sourceIdPattern.test(id);
  return matches;
}
// ── Tests ─────────────────────────────────────────────────────────────────────
/**
 * Validates the stage 1 extraction output stored at OMW_PATH.
 *
 * Every test gathers all offending records into an `errors` list and asserts
 * that the list is empty, so a single run reports every problem rather than
 * stopping at the first one.
 */
describe("stage 1 — omw.json validation", () => {
  // Memoised read + parse: the file is loaded once for the whole suite instead
  // of once per test. A failed load yields a rejected promise that every test
  // awaits, so each test still fails with the underlying read/parse error.
  let pending: Promise<OmwRecord[]> | undefined;

  /** Returns the parsed contents of OMW_PATH, reading the file on first use only. */
  function loadRecords(): Promise<OmwRecord[]> {
    if (pending === undefined) {
      pending = fs
        .readFile(OMW_PATH, "utf-8")
        .then((raw) => JSON.parse(raw) as OmwRecord[]);
    }
    return pending;
  }

  it("file exists and is valid JSON", async () => {
    // readFile rejects when the file is unreadable; JSON.parse throws on bad JSON.
    const records = await loadRecords();
    expect(records).toBeDefined();
  });

  it("is a non-empty array", async () => {
    const records = await loadRecords();
    expect(Array.isArray(records)).toBe(true);
    expect(records.length).toBeGreaterThan(0);
  });

  it("every record has required fields", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!record.source_id) {
        // Without an id the remaining messages could not name the record.
        errors.push(`missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${record.source_id}: missing translations`);
      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
      if (!record.examples)
        errors.push(`${record.source_id}: missing examples`);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every source_id matches ili:i{number} pattern", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!isValidSourceId(record.source_id)) {
        errors.push(`invalid source_id: ${record.source_id}`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every source_id is unique", async () => {
    const records = await loadRecords();
    const seen = new Set<string>();
    const errors: string[] = [];
    for (const record of records) {
      if (seen.has(record.source_id)) {
        errors.push(`duplicate source_id: ${record.source_id}`);
      }
      seen.add(record.source_id);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every pos is a valid supported value", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const record of records) {
      if (!validPos.has(record.pos)) {
        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every record has at least one translation in at least one language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const record of records) {
      // A record with no `translations` field is reported as "no translations"
      // instead of making Object.keys throw and abort the whole test.
      const translations = record.translations ?? {};
      const langs = Object.keys(translations) as SupportedLanguageCode[];
      if (langs.length === 0) {
        errors.push(`${record.source_id}: no translations`);
        continue;
      }
      for (const lang of langs) {
        if (!validLangs.has(lang)) {
          errors.push(`${record.source_id}: unsupported language "${lang}"`);
        }
        const words = translations[lang] ?? [];
        if (words.length === 0) {
          errors.push(`${record.source_id}: empty translations for "${lang}"`);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});

View file

@ -0,0 +1,218 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect, beforeAll } from "vitest";
import {
SUPPORTED_POS,
SUPPORTED_LANGUAGE_CODES,
CEFR_LEVELS,
} from "@lila/shared";
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/** The two values accepted in an example's `source` field. */
type ExampleSource = "omw" | "cefr";

/** One example entry: its text plus the tag naming where it came from. */
interface Example {
  text: string;
  source: ExampleSource;
}
/** Lookup keyed by language code in which any language may be absent. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/**
 * Shape that each entry of a stage 2 per-language output file is cast to by
 * the tests in this file. The cast is unchecked; the tests verify the fields.
 */
interface AnnotatedRecord {
  /** Record identifier, used in every error message to name the record. */
  source_id: string;
  /** Part of speech; the tests require a member of SUPPORTED_POS. */
  pos: SupportedPos;
  /** Translated words per language. */
  translations: PerLanguage<string[]>;
  /** Gloss strings per language. */
  glosses: PerLanguage<string[]>;
  /** Example entries per language, each carrying its text and source tag. */
  examples: PerLanguage<Example[]>;
  /**
   * Per language, a map from a string key (the tests call it `word`) to a vote
   * whose `cefr_source` the tests require to be a member of CEFR_LEVELS.
   */
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
}
/**
 * Shape that each entry of conflicts.json is cast to by the tests in this
 * file. The cast is unchecked; the tests verify the fields.
 */
interface ConflictEntry {
  /** The word the conflict is about; must be non-empty. */
  word: string;
  /** Part of speech, kept as a plain string here; must be non-empty. */
  pos: string;
  /** Language of the word; the tests require a supported language code. */
  language: SupportedLanguageCode;
  /** The tests require at least two entries, each a member of CEFR_LEVELS. */
  levels: string[];
}
// ── Paths ─────────────────────────────────────────────────────────────────────
// Absolute location of the stage 2 output directory. The relative part is
// anchored at the current working directory, which is what path.resolve does
// implicitly when given only a relative segment.
const OUTPUT_DIR: string = path.resolve(
  process.cwd(),
  "stage-2-annotate/output",
);
// ── Tests ─────────────────────────────────────────────────────────────────────
/**
 * Validates the stage 2 output found in OUTPUT_DIR: one `<lang>.json` file per
 * entry of SUPPORTED_LANGUAGE_CODES, plus `conflicts.json`.
 *
 * All files are parsed once in `beforeAll`. Every test gathers all offending
 * entries into an `errors` list and asserts that the list is empty, so a single
 * run reports every problem rather than stopping at the first one.
 */
describe("stage 2 — annotated output validation", () => {
  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
  let conflicts: ConflictEntry[] = [];

  /** Reads `fileName` from OUTPUT_DIR and parses it as JSON (unchecked cast to T). */
  async function readOutputJson<T>(fileName: string): Promise<T> {
    const raw = await fs.readFile(path.join(OUTPUT_DIR, fileName), "utf-8");
    return JSON.parse(raw) as T;
  }

  /**
   * Yields every loaded record together with the language file it came from.
   * A language whose file was not loaded, or whose content is not an array, is
   * skipped here; the "non-empty array" test is the one that reports it.
   */
  function* eachRecord(): Generator<[SupportedLanguageCode, AnnotatedRecord]> {
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsByLang.get(lang);
      if (!Array.isArray(records)) continue;
      for (const record of records) {
        yield [lang, record];
      }
    }
  }

  beforeAll(async () => {
    // The files are independent, so they are read concurrently.
    const [perLanguage, parsedConflicts] = await Promise.all([
      Promise.all(
        [...SUPPORTED_LANGUAGE_CODES].map(async (lang) => {
          const records = await readOutputJson<AnnotatedRecord[]>(
            `${lang}.json`,
          );
          return [lang, records] as const;
        }),
      ),
      readOutputJson<ConflictEntry[]>("conflicts.json"),
    ]);
    for (const [lang, records] of perLanguage) {
      recordsByLang.set(lang, records);
    }
    conflicts = parsedConflicts;
  }, 60_000);

  // The number of files checked is driven by SUPPORTED_LANGUAGE_CODES; the
  // title is left unchanged so existing test filters and reports still match.
  it("all five language files exist", async () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
      try {
        await fs.access(filePath);
      } catch {
        errors.push(`missing file: ${lang}.json`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("conflicts.json exists", async () => {
    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
    await expect(fs.access(filePath)).resolves.toBeUndefined();
  });

  it("every language file is a non-empty array", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsByLang.get(lang);
      if (!Array.isArray(records)) {
        errors.push(`${lang}.json: not an array`);
      } else if (records.length === 0) {
        errors.push(`${lang}.json: empty array`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every record has required fields", () => {
    const errors: string[] = [];
    for (const [lang, record] of eachRecord()) {
      if (!record.source_id) {
        // Without an id the remaining messages could not name the record.
        errors.push(`${lang}: record missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${lang} ${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${lang} ${record.source_id}: missing translations`);
      if (!record.glosses)
        errors.push(`${lang} ${record.source_id}: missing glosses`);
      if (record.examples === undefined)
        errors.push(`${lang} ${record.source_id}: missing examples`);
      if (record.votes === undefined)
        errors.push(`${lang} ${record.source_id}: missing votes`);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every pos is a valid supported value", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const [lang, record] of eachRecord()) {
      if (!validPos.has(record.pos)) {
        errors.push(`${lang} ${record.source_id}: invalid pos "${record.pos}"`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every example has text and a valid source", () => {
    const errors: string[] = [];
    const validSources = new Set(["omw", "cefr"]);
    for (const [lang, record] of eachRecord()) {
      // Nullish `examples` (or a nullish per-language list) is treated as empty
      // so one malformed record cannot abort the scan with a TypeError.
      for (const [l, examples] of Object.entries(record.examples ?? {})) {
        for (const example of examples ?? []) {
          if (!example.text) {
            errors.push(
              `${lang} ${record.source_id} (${l}): example missing text`,
            );
          }
          if (!validSources.has(example.source)) {
            errors.push(
              `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
            );
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every cefr_source vote is a valid CEFR level", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    for (const [lang, record] of eachRecord()) {
      // Same nullish guard as for examples; a nullish vote is reported as an
      // invalid cefr_source rather than throwing on property access.
      for (const [l, langVotes] of Object.entries(record.votes ?? {})) {
        for (const [word, vote] of Object.entries(langVotes ?? {})) {
          const level = vote?.cefr_source;
          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
            errors.push(
              `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${level}"`,
            );
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("conflicts.json entries have required fields and valid CEFR levels", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const entry of conflicts) {
      if (!entry.word) errors.push(`conflict missing word`);
      if (!entry.pos) errors.push(`conflict missing pos`);
      if (!entry.language) {
        errors.push(`conflict missing language`);
      } else if (!validLangs.has(entry.language)) {
        errors.push(`conflict invalid language "${entry.language}"`);
      }
      // A conflict needs at least two levels to disagree.
      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
        errors.push(`${entry.word}: levels must have at least 2 entries`);
      } else {
        for (const level of entry.levels) {
          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
            errors.push(`${entry.word}: invalid level "${level}"`);
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});