feat: add stage 1 and 2 validation tests
This commit is contained in:
parent
4fa3073412
commit
4a842140b9
5 changed files with 536 additions and 0 deletions
170
data-pipeline/tests/fixtures/annotated.fixture.json
vendored
Normal file
170
data-pipeline/tests/fixtures/annotated.fixture.json
vendored
Normal file
|
|
@ -0,0 +1,170 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"_fixture": "noun_with_cefr_vote",
|
||||||
|
"source_id": "ili:i100955",
|
||||||
|
"pos": "noun",
|
||||||
|
"translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
|
||||||
|
"glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
|
||||||
|
"examples": {
|
||||||
|
"en": [
|
||||||
|
{ "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"votes": { "en": { "grain": { "cefr_source": "B1" } } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"_fixture": "verb_no_votes_no_translations",
|
||||||
|
"source_id": "ili:i21779",
|
||||||
|
"pos": "verb",
|
||||||
|
"translations": { "en": ["respire"] },
|
||||||
|
"glosses": {
|
||||||
|
"en": [
|
||||||
|
"undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"examples": {},
|
||||||
|
"votes": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"_fixture": "verb_with_cefr_vote_all_languages",
|
||||||
|
"source_id": "ili:i21778",
|
||||||
|
"pos": "verb",
|
||||||
|
"translations": {
|
||||||
|
"en": ["breathe", "take a breath", "respire", "suspire"],
|
||||||
|
"it": ["respirare"],
|
||||||
|
"es": ["aspirar", "respirar"],
|
||||||
|
"de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
|
||||||
|
"fr": ["inspirer", "respirer"]
|
||||||
|
},
|
||||||
|
"glosses": {
|
||||||
|
"en": ["draw air into, and expel out of, the lungs"],
|
||||||
|
"de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
|
||||||
|
},
|
||||||
|
"examples": {
|
||||||
|
"en": [
|
||||||
|
{
|
||||||
|
"text": "I can breathe better when the air is clean",
|
||||||
|
"source": "omw"
|
||||||
|
},
|
||||||
|
{ "text": "The patient is respiring", "source": "omw" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"votes": { "en": { "breathe": { "cefr_source": "A1" } } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"_fixture": "adjective_all_languages_multiple_translations",
|
||||||
|
"source_id": "ili:i10007",
|
||||||
|
"pos": "adjective",
|
||||||
|
"translations": {
|
||||||
|
"en": ["possible"],
|
||||||
|
"it": [
|
||||||
|
"attuabile",
|
||||||
|
"effettuabile",
|
||||||
|
"eseguibile",
|
||||||
|
"fattibile",
|
||||||
|
"operabile",
|
||||||
|
"possibile",
|
||||||
|
"producibile",
|
||||||
|
"realizzabile"
|
||||||
|
],
|
||||||
|
"es": ["posible"],
|
||||||
|
"de": [
|
||||||
|
"möglich",
|
||||||
|
"denkbar",
|
||||||
|
"eventuell",
|
||||||
|
"möglicherweise",
|
||||||
|
"allfällig",
|
||||||
|
"etwaig",
|
||||||
|
"gegebenenfalls",
|
||||||
|
"eventuell"
|
||||||
|
],
|
||||||
|
"fr": ["possible", "éventuel"]
|
||||||
|
},
|
||||||
|
"glosses": {
|
||||||
|
"en": ["capable of happening or existing"],
|
||||||
|
"de": ["in der Lage, zu geschehen oder zu existieren"]
|
||||||
|
},
|
||||||
|
"examples": {
|
||||||
|
"en": [
|
||||||
|
{ "text": "a breakthrough may be possible next year", "source": "omw" },
|
||||||
|
{ "text": "anything is possible", "source": "omw" },
|
||||||
|
{ "text": "warned of possible consequences", "source": "omw" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"votes": { "en": { "possible": { "cefr_source": "A2" } } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"_fixture": "adjective_multiple_de_votes_cefr_examples",
|
||||||
|
"source_id": "ili:i10000",
|
||||||
|
"pos": "adjective",
|
||||||
|
"translations": {
|
||||||
|
"en": ["negative"],
|
||||||
|
"de": [
|
||||||
|
"dürftig",
|
||||||
|
"zu wünschen übrig lassen",
|
||||||
|
"schlecht",
|
||||||
|
"widrig",
|
||||||
|
"ungut",
|
||||||
|
"lausig",
|
||||||
|
"negativ",
|
||||||
|
"von Nachteil",
|
||||||
|
"schädlich",
|
||||||
|
"nachteilig",
|
||||||
|
"ungünstig"
|
||||||
|
],
|
||||||
|
"fr": ["négatif", "strictement négatif"]
|
||||||
|
},
|
||||||
|
"glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
|
||||||
|
"examples": {
|
||||||
|
"en": [{ "text": "a negative number", "source": "omw" }],
|
||||||
|
"de": [
|
||||||
|
{ "text": "Die Beweise waren dürftig.", "source": "cefr" },
|
||||||
|
{ "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
|
||||||
|
{
|
||||||
|
"text": "Trotz widriger Umstände haben sie es geschafft.",
|
||||||
|
"source": "cefr"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "Er hatte ein ungutes Gefühl bei der Sache.",
|
||||||
|
"source": "cefr"
|
||||||
|
},
|
||||||
|
{ "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
|
||||||
|
{
|
||||||
|
"text": "Rauchen ist schädlich für die Gesundheit.",
|
||||||
|
"source": "cefr"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "Diese Entscheidung könnte nachteilig sein.",
|
||||||
|
"source": "cefr"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "Das Wetter ist heute ungünstig für einen Ausflug.",
|
||||||
|
"source": "cefr"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"votes": {
|
||||||
|
"de": {
|
||||||
|
"dürftig": { "cefr_source": "C1" },
|
||||||
|
"schlecht": { "cefr_source": "A1" },
|
||||||
|
"widrig": { "cefr_source": "C1" },
|
||||||
|
"ungut": { "cefr_source": "B2" },
|
||||||
|
"negativ": { "cefr_source": "A2" },
|
||||||
|
"schädlich": { "cefr_source": "B1" },
|
||||||
|
"nachteilig": { "cefr_source": "B1" },
|
||||||
|
"ungünstig": { "cefr_source": "B2" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"_fixture": "adverb_no_votes",
|
||||||
|
"source_id": "ili:i18157",
|
||||||
|
"pos": "adverb",
|
||||||
|
"translations": { "en": ["a cappella"], "es": ["a capella"] },
|
||||||
|
"glosses": { "en": ["without musical accompaniment"] },
|
||||||
|
"examples": {
|
||||||
|
"en": [{ "text": "they performed a cappella", "source": "omw" }]
|
||||||
|
},
|
||||||
|
"votes": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
4
data-pipeline/tests/fixtures/conflicts.fixture.json
vendored
Normal file
4
data-pipeline/tests/fixtures/conflicts.fixture.json
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
[
|
||||||
|
{ "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
|
||||||
|
{ "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
|
||||||
|
]
|
||||||
143
data-pipeline/tests/validation/stage-1.validation.test.ts
Normal file
143
data-pipeline/tests/validation/stage-1.validation.test.ts
Normal file
|
|
@ -0,0 +1,143 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { describe, it, expect } from "vitest";
|
||||||
|
import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||||
|
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Shape the tests below expect for each entry of the parsed omw.json array.
 *
 * The per-language maps are `Partial` because a record need not carry an
 * entry for every supported language code.
 */
type OmwRecord = {
  /** Record identifier; the tests require the `ili:i<digits>` form. */
  source_id: string;
  /** Part of speech; the tests require a member of SUPPORTED_POS. */
  pos: SupportedPos;
  /** Translated words, keyed by language code. */
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Gloss strings, keyed by language code. */
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Example strings, keyed by language code. */
  examples: Partial<Record<SupportedLanguageCode, string[]>>;
};
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Absolute path of the omw.json file read by the tests below. The relative
// segment is resolved against the current working directory of the test run.
const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
|
||||||
|
|
||||||
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Tells whether `id` is exactly `ili:i` followed by one or more decimal digits.
 *
 * The pattern is anchored at both ends, so surrounding whitespace, a trailing
 * newline, or any extra characters make the id invalid.
 *
 * @param id - Candidate source identifier.
 * @returns `true` when the whole string matches, `false` otherwise.
 */
function isValidSourceId(id: string): boolean {
  return /^ili:i\d+$/.test(id);
}
|
||||||
|
|
||||||
|
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Structural checks on the JSON file at OMW_PATH.
 *
 * Each test collects every violation into `errors` and asserts the list is
 * empty, so a single failing run reports all offending records at once.
 */
describe("stage 1 — omw.json validation", () => {
  // The file is read and parsed at most once; every test awaits the same
  // promise. If loading fails, each test sees the same rejection.
  let pendingRecords: Promise<OmwRecord[]> | undefined;

  async function readRecords(): Promise<OmwRecord[]> {
    const raw = await fs.readFile(OMW_PATH, "utf-8");
    return JSON.parse(raw) as OmwRecord[];
  }

  function loadRecords(): Promise<OmwRecord[]> {
    if (pendingRecords === undefined) {
      pendingRecords = readRecords();
    }
    return pendingRecords;
  }

  it("file exists and is valid JSON", async () => {
    const records = await loadRecords();
    expect(records).toBeDefined();
  });

  it("is a non-empty array", async () => {
    const records = await loadRecords();
    expect(Array.isArray(records)).toBe(true);
    expect(records.length).toBeGreaterThan(0);
  });

  it("every record has required fields", async () => {
    const records = await loadRecords();
    const errors: string[] = [];

    for (const record of records) {
      if (!record.source_id) {
        // Without an id the remaining messages could not name the record.
        errors.push(`missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${record.source_id}: missing translations`);
      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
      if (!record.examples)
        errors.push(`${record.source_id}: missing examples`);
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every source_id matches ili:i{number} pattern", async () => {
    const records = await loadRecords();
    const errors: string[] = [];

    for (const record of records) {
      if (!isValidSourceId(record.source_id)) {
        errors.push(`invalid source_id: ${record.source_id}`);
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every source_id is unique", async () => {
    const records = await loadRecords();
    const seen = new Set<string>();
    const errors: string[] = [];

    for (const record of records) {
      if (seen.has(record.source_id)) {
        errors.push(`duplicate source_id: ${record.source_id}`);
      }
      seen.add(record.source_id);
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every pos is a valid supported value", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);

    for (const record of records) {
      if (!validPos.has(record.pos)) {
        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every record has at least one translation in at least one language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);

    for (const record of records) {
      // The data comes from JSON.parse, so the field may be absent at runtime
      // despite the declared type; treat that the same as an empty map.
      const langs = Object.keys(
        record.translations ?? {},
      ) as SupportedLanguageCode[];

      if (langs.length === 0) {
        errors.push(`${record.source_id}: no translations`);
        continue;
      }

      for (const lang of langs) {
        if (!validLangs.has(lang)) {
          errors.push(`${record.source_id}: unsupported language "${lang}"`);
        }
        const words = record.translations[lang] ?? [];
        if (words.length === 0) {
          errors.push(`${record.source_id}: empty translations for "${lang}"`);
        }
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});
|
||||||
218
data-pipeline/tests/validation/stage-2.validation.test.ts
Normal file
218
data-pipeline/tests/validation/stage-2.validation.test.ts
Normal file
|
|
@ -0,0 +1,218 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { describe, it, expect, beforeAll } from "vitest";
|
||||||
|
import {
|
||||||
|
SUPPORTED_POS,
|
||||||
|
SUPPORTED_LANGUAGE_CODES,
|
||||||
|
CEFR_LEVELS,
|
||||||
|
} from "@lila/shared";
|
||||||
|
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// One example entry: its text plus a `source` tag limited to "omw" or "cefr".
type Example = { text: string; source: "omw" | "cefr" };
|
||||||
|
|
||||||
|
/**
 * Shape the tests below expect for each entry of a per-language output file.
 *
 * The per-language maps are `Partial` because a record need not carry an
 * entry for every supported language code.
 */
type AnnotatedRecord = {
  /** Record identifier. */
  source_id: string;
  /** Part of speech; the tests require a member of SUPPORTED_POS. */
  pos: SupportedPos;
  /** Translated words, keyed by language code. */
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Gloss strings, keyed by language code. */
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  /** Example entries, keyed by language code. */
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  /**
   * Votes keyed by language code, then by word. The tests require each
   * `cefr_source` value to be a member of CEFR_LEVELS.
   */
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
};
|
||||||
|
|
||||||
|
/**
 * Shape the tests below expect for each entry of conflicts.json.
 */
type ConflictEntry = {
  /** The word the entry refers to. */
  word: string;
  /** Part of speech, kept as a plain string (the tests only check presence). */
  pos: string;
  /** Language code; the tests require a member of SUPPORTED_LANGUAGE_CODES. */
  language: SupportedLanguageCode;
  /** The tests require at least two entries, each a member of CEFR_LEVELS. */
  levels: string[];
};
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Absolute path of the directory holding the `<lang>.json` files and
// conflicts.json read below. The relative segment is resolved against the
// current working directory of the test run.
const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
|
||||||
|
|
||||||
|
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Structural checks on the files in OUTPUT_DIR: one `<lang>.json` per
 * supported language code, plus conflicts.json.
 *
 * All files are parsed once in `beforeAll`. Each test collects every
 * violation into `errors` and asserts the list is empty, so a single failing
 * run reports all offending records at once.
 */
describe("stage 2 — annotated output validation", () => {
  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
  let conflicts: ConflictEntry[] = [];

  /**
   * Returns the parsed records for `lang`, failing with a descriptive error
   * when the file did not load as an array, instead of relying on a non-null
   * assertion and a later "not iterable" TypeError.
   */
  function recordsFor(lang: SupportedLanguageCode): AnnotatedRecord[] {
    const records = recordsByLang.get(lang);
    if (!Array.isArray(records)) {
      throw new Error(`${lang}.json: records were not loaded as an array`);
    }
    return records;
  }

  beforeAll(async () => {
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const raw = await fs.readFile(
        path.join(OUTPUT_DIR, `${lang}.json`),
        "utf-8",
      );
      recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]);
    }
    const raw = await fs.readFile(
      path.join(OUTPUT_DIR, "conflicts.json"),
      "utf-8",
    );
    conflicts = JSON.parse(raw) as ConflictEntry[];
  }, 60_000);

  it("all five language files exist", async () => {
    const errors: string[] = [];

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
      try {
        await fs.access(filePath);
      } catch {
        errors.push(`missing file: ${lang}.json`);
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("conflicts.json exists", async () => {
    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
    await expect(fs.access(filePath)).resolves.toBeUndefined();
  });

  it("every language file is a non-empty array", () => {
    const errors: string[] = [];

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      // Read the map directly here: this test is the one that reports a
      // non-array payload, so it must not go through the throwing accessor.
      const records = recordsByLang.get(lang);
      if (!Array.isArray(records)) {
        errors.push(`${lang}.json: not an array`);
      } else if (records.length === 0) {
        errors.push(`${lang}.json: empty array`);
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every record has required fields", () => {
    const errors: string[] = [];

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        if (!record.source_id) {
          // Without an id the remaining messages could not name the record.
          errors.push(`${lang}: record missing source_id`);
          continue;
        }
        if (!record.pos)
          errors.push(`${lang} ${record.source_id}: missing pos`);
        if (!record.translations)
          errors.push(`${lang} ${record.source_id}: missing translations`);
        if (!record.glosses)
          errors.push(`${lang} ${record.source_id}: missing glosses`);
        if (record.examples === undefined)
          errors.push(`${lang} ${record.source_id}: missing examples`);
        if (record.votes === undefined)
          errors.push(`${lang} ${record.source_id}: missing votes`);
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every pos is a valid supported value", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        if (!validPos.has(record.pos)) {
          errors.push(
            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
          );
        }
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every example has text and a valid source", () => {
    const errors: string[] = [];
    const validSources = new Set(["omw", "cefr"]);

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        for (const [l, examples] of Object.entries(record.examples)) {
          // `examples` is a Partial map, so a key may hold no list; mirror
          // the `?? {}` guard used for votes.
          for (const example of examples ?? []) {
            if (!example.text) {
              errors.push(
                `${lang} ${record.source_id} (${l}): example missing text`,
              );
            }
            if (!validSources.has(example.source)) {
              errors.push(
                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
              );
            }
          }
        }
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every cefr_source vote is a valid CEFR level", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        for (const [l, langVotes] of Object.entries(record.votes)) {
          for (const [word, vote] of Object.entries(langVotes ?? {})) {
            if (
              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
            ) {
              errors.push(
                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
              );
            }
          }
        }
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("conflicts.json entries have required fields and valid CEFR levels", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);

    for (const entry of conflicts) {
      if (!entry.word) errors.push(`conflict missing word`);
      if (!entry.pos) errors.push(`conflict missing pos`);
      if (!entry.language) {
        errors.push(`conflict missing language`);
      } else if (!validLangs.has(entry.language)) {
        errors.push(`conflict invalid language "${entry.language}"`);
      }
      // A conflict needs at least two levels to be a conflict at all.
      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
        errors.push(`${entry.word}: levels must have at least 2 entries`);
      } else {
        for (const level of entry.levels) {
          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
            errors.push(`${entry.word}: invalid level "${level}"`);
          }
        }
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});
|
||||||
|
|
@ -6,5 +6,6 @@ export default defineConfig({
|
||||||
globals: true,
|
globals: true,
|
||||||
include: ["tests/**/*.test.ts"],
|
include: ["tests/**/*.test.ts"],
|
||||||
exclude: ["**/dist/**", "**/node_modules/**"],
|
exclude: ["**/dist/**", "**/node_modules/**"],
|
||||||
|
testTimeout: 60_000,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue