feat: add stage 1 and db import validation tests for Kaikki schema
This commit is contained in:
parent
0cc643e308
commit
ba2635e3f7
2 changed files with 414 additions and 0 deletions
222
data-pipeline/tests/validation/db-import.validation.test.ts
Normal file
222
data-pipeline/tests/validation/db-import.validation.test.ts
Normal file
|
|
@ -0,0 +1,222 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { describe, it, expect, beforeAll } from "vitest";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||||
|
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** A single translation attached to an extracted sense. */
type ExtractedTranslation = {
  target_lang: SupportedLanguageCode;
  word: string;
  sense_hint: string | null;
};

/**
 * Shape this suite assumes for each element of a stage 1 `<lang>.json`
 * output file (the parsed JSON is cast to `ExtractedSense[]`, not validated).
 */
type ExtractedSense = {
  headword: string;
  language: SupportedLanguageCode;
  pos: SupportedPos;
  sense_index: number;
  gloss: string | null;
  examples: string[];
  translations: ExtractedTranslation[];
};
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Both locations are relative paths anchored at the current working
// directory, so the suite has to be launched from the directory that
// contains `db/` and `stage-1-extract/`.
const DB_PATH = path.resolve(process.cwd(), "db/pipeline.db");
const OUTPUT_DIR = path.resolve(process.cwd(), "stage-1-extract/output");
|
||||||
|
|
||||||
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Checks whether the pipeline database file can be accessed.
 *
 * @returns `true` when `fs.access(DB_PATH)` succeeds, `false` when it rejects
 *   for any reason (the rejection itself is discarded).
 */
async function dbExists(): Promise<boolean> {
  return fs.access(DB_PATH).then(
    () => true,
    () => false,
  );
}
|
||||||
|
|
||||||
|
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Reports whether `err` carries the Node.js `ENOENT` code, i.e. the file
 * being read does not exist.
 */
function isMissingFileError(err: unknown): boolean {
  return (
    typeof err === "object" &&
    err !== null &&
    (err as { code?: unknown }).code === "ENOENT"
  );
}

/** Returns `count` positional SQL placeholders joined as `?, ?, ...`. */
function sqlPlaceholders(count: number): string {
  return Array.from({ length: count }, () => "?").join(", ");
}

/**
 * Validates the contents of pipeline.db against the stage 1 JSON output:
 * row counts per language, the total translation count, referential
 * integrity, allowed language / pos values, and sense_index uniqueness.
 *
 * When pipeline.db is missing, only the first test fails (with a hint on how
 * to create it); the remaining tests return early without querying.
 */
describe("pipeline.db — import validation", () => {
  // Stays undefined when pipeline.db is missing or setup failed.
  let db: import("better-sqlite3").Database | undefined;
  // Number of senses found in each `<lang>.json`; 0 when the file is absent.
  const expectedEntriesByLang = new Map<SupportedLanguageCode, number>();
  // Sum of `translations.length` over the English senses only.
  let expectedTotalTranslations = 0;

  beforeAll(async () => {
    if (!(await dbExists())) return;

    const Database = (await import("better-sqlite3")).default;
    const handle = new Database(DB_PATH, { readonly: true });

    try {
      handle.pragma("foreign_keys = ON");

      for (const lang of SUPPORTED_LANGUAGE_CODES) {
        let raw: string;
        try {
          raw = await fs.readFile(
            path.join(OUTPUT_DIR, `${lang}.json`),
            "utf-8",
          );
        } catch (err: unknown) {
          // Only a missing source file means "expect zero rows". Any other
          // read failure must surface instead of silently skewing the counts.
          if (!isMissingFileError(err)) throw err;
          expectedEntriesByLang.set(lang, 0);
          continue;
        }

        const parsed: unknown = JSON.parse(raw);
        if (!Array.isArray(parsed)) {
          throw new Error(`${lang}.json: expected a JSON array of senses`);
        }
        const senses = parsed as ExtractedSense[];

        expectedEntriesByLang.set(lang, senses.length);
        if (lang === "en") {
          for (const sense of senses) {
            expectedTotalTranslations += sense.translations.length;
          }
        }
      }
    } catch (err: unknown) {
      // Do not leak the open handle when loading the expectations fails.
      handle.close();
      throw err;
    }

    db = handle;

    // Vitest runs a function returned from beforeAll as the suite teardown.
    return () => {
      handle.close();
      db = undefined;
    };
  }, 30_000);

  it("pipeline.db exists — skipping all tests if not", async () => {
    const exists = await dbExists();
    if (!exists) {
      console.warn(
        "\n  pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
      );
    }
    expect(exists).toBe(true);
  });

  it("entry count per language matches source files", () => {
    if (!db) return;
    const errors: string[] = [];
    // Prepared once and reused for every language.
    const countForLanguage = db.prepare(
      "SELECT COUNT(*) as count FROM entries WHERE language = ?",
    );

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const expected = expectedEntriesByLang.get(lang) ?? 0;
      const row = countForLanguage.get(lang) as { count: number };

      if (row.count !== expected) {
        errors.push(`${lang}: expected ${expected} entries, got ${row.count}`);
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("translation count matches source files", () => {
    if (!db) return;
    const row = db
      .prepare("SELECT COUNT(*) as count FROM translations")
      .get() as { count: number };
    expect(row.count).toBe(expectedTotalTranslations);
  });

  it("every translation references a valid entry", () => {
    if (!db) return;
    // Anti-join: translation rows whose entry_id matches no entries.id.
    const rows = db
      .prepare(
        `SELECT t.id, t.entry_id
         FROM translations t
         LEFT JOIN entries e ON e.id = t.entry_id
         WHERE e.id IS NULL`,
      )
      .all() as { id: number; entry_id: number }[];

    const errors = rows.map(
      (r) => `translation ${r.id}: references missing entry ${r.entry_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every entry has a valid language code", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `SELECT id, headword, language FROM entries
         WHERE language NOT IN (${sqlPlaceholders(SUPPORTED_LANGUAGE_CODES.length)})`,
      )
      .all(...SUPPORTED_LANGUAGE_CODES) as {
      id: number;
      headword: string;
      language: string;
    }[];

    const errors = rows.map(
      (r) => `entry ${r.id} "${r.headword}": invalid language "${r.language}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every entry has a valid pos", () => {
    if (!db) return;
    // NOTE(review): the allowed pos values are hard-coded here rather than
    // derived from a shared constant — confirm they match the project's
    // supported pos list and keep them in sync.
    const rows = db
      .prepare(
        `SELECT id, headword, pos FROM entries
         WHERE pos NOT IN ('noun', 'verb', 'adjective', 'adverb')`,
      )
      .all() as { id: number; headword: string; pos: string }[];

    const errors = rows.map(
      (r) => `entry ${r.id} "${r.headword}": invalid pos "${r.pos}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("sense_index is unique per headword, language, pos", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `SELECT headword, language, pos, sense_index, COUNT(*) as c
         FROM entries
         GROUP BY headword, language, pos, sense_index
         HAVING c > 1`,
      )
      .all() as {
      headword: string;
      language: string;
      pos: string;
      sense_index: number;
      c: number;
    }[];

    const errors = rows.map(
      (r) =>
        `"${r.headword}" (${r.language} ${r.pos}): duplicate sense_index ${r.sense_index} (${r.c} rows)`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("non-English entries have no translations", () => {
    if (!db) return;
    const nonEnglishLangs = SUPPORTED_LANGUAGE_CODES.filter((l) => l !== "en");

    const rows = db
      .prepare(
        `SELECT e.headword, e.language, COUNT(t.id) as c
         FROM entries e
         JOIN translations t ON t.entry_id = e.id
         WHERE e.language IN (${sqlPlaceholders(nonEnglishLangs.length)})
         GROUP BY e.id`,
      )
      .all(...nonEnglishLangs) as {
      headword: string;
      language: string;
      c: number;
    }[];

    const errors = rows.map(
      (r) => `"${r.headword}" (${r.language}): unexpected ${r.c} translations`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("all translation target languages are supported and not English", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `SELECT id, target_lang FROM translations
         WHERE target_lang NOT IN (${sqlPlaceholders(SUPPORTED_LANGUAGE_CODES.length)})
         OR target_lang = 'en'`,
      )
      .all(...SUPPORTED_LANGUAGE_CODES) as {
      id: number;
      target_lang: string;
    }[];

    const errors = rows.map(
      (r) => `translation ${r.id}: invalid target_lang "${r.target_lang}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});
|
||||||
192
data-pipeline/tests/validation/stage-1.validation.test.ts
Normal file
192
data-pipeline/tests/validation/stage-1.validation.test.ts
Normal file
|
|
@ -0,0 +1,192 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { describe, it, expect, beforeAll } from "vitest";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
|
||||||
|
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** A single translation attached to an extracted sense. */
type ExtractedTranslation = {
  target_lang: SupportedLanguageCode;
  word: string;
  sense_hint: string | null;
};

/**
 * Shape this suite assumes for each element of a stage 1 `<lang>.json`
 * output file (the parsed JSON is cast to `ExtractedSense[]`, not validated).
 */
type ExtractedSense = {
  headword: string;
  language: SupportedLanguageCode;
  pos: SupportedPos;
  sense_index: number;
  gloss: string | null;
  examples: string[];
  translations: ExtractedTranslation[];
};
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Relative path anchored at the current working directory, so the suite has
// to be launched from the directory that contains `stage-1-extract/`.
const OUTPUT_DIR = path.resolve(process.cwd(), "stage-1-extract/output");
|
||||||
|
|
||||||
|
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Reports whether `err` carries the Node.js `ENOENT` code, i.e. the file
 * being read does not exist.
 */
function isMissingFileError(err: unknown): boolean {
  return (
    typeof err === "object" &&
    err !== null &&
    (err as { code?: unknown }).code === "ENOENT"
  );
}

/**
 * Validates the per-language JSON files written to the stage 1 output
 * directory: presence, basic shape, allowed pos / language values, the
 * translation rules (English senses carry translations, other languages do
 * not) and sense_index uniqueness.
 */
describe("stage 1 — Kaikki extraction output validation", () => {
  // Parsed file contents keyed by language. A language is absent from the map
  // when its file does not exist.
  const sensesByLang = new Map<SupportedLanguageCode, ExtractedSense[]>();

  /**
   * Senses loaded for `lang`, or an empty list when the file is missing or
   * did not contain an array. Those two conditions are reported by the
   * dedicated existence / array tests, so the per-sense tests just skip them
   * instead of crashing.
   */
  const sensesFor = (lang: SupportedLanguageCode): ExtractedSense[] => {
    const loaded: unknown = sensesByLang.get(lang);
    return Array.isArray(loaded) ? (loaded as ExtractedSense[]) : [];
  };

  /** The translations of `sense`, or an empty list when the field is not an array. */
  const translationsOf = (
    sense: ExtractedSense,
  ): ExtractedSense["translations"] =>
    Array.isArray(sense.translations) ? sense.translations : [];

  beforeAll(async () => {
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
      let raw: string;
      try {
        raw = await fs.readFile(filePath, "utf-8");
      } catch (err: unknown) {
        // A missing file is left for the existence test to report with a
        // clear message; any other read failure aborts the whole suite.
        if (isMissingFileError(err)) continue;
        throw err;
      }
      sensesByLang.set(lang, JSON.parse(raw) as ExtractedSense[]);
    }
  }, 30_000);

  it("all five language output files exist", async () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      try {
        await fs.access(path.join(OUTPUT_DIR, `${lang}.json`));
      } catch {
        errors.push(`missing: ${lang}.json`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every language file is a non-empty array", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      // Read the raw map value: a missing file (undefined) is also "not an array".
      const senses: unknown = sensesByLang.get(lang);
      if (!Array.isArray(senses)) errors.push(`${lang}: not an array`);
      else if (senses.length === 0) errors.push(`${lang}: empty array`);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every sense has required fields", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const sense of sensesFor(lang)) {
        if (!sense.headword) errors.push(`${lang}: sense missing headword`);
        if (!sense.language)
          errors.push(`${lang} ${sense.headword}: missing language`);
        if (!sense.pos) errors.push(`${lang} ${sense.headword}: missing pos`);
        // `== null` also catches an explicit JSON null, not just an absent key.
        if (sense.sense_index == null)
          errors.push(`${lang} ${sense.headword}: missing sense_index`);
        if (!Array.isArray(sense.examples))
          errors.push(`${lang} ${sense.headword}: examples not an array`);
        if (!Array.isArray(sense.translations))
          errors.push(`${lang} ${sense.headword}: translations not an array`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every sense has a valid pos", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const sense of sensesFor(lang)) {
        if (!validPos.has(sense.pos)) {
          errors.push(`${lang} ${sense.headword}: invalid pos "${sense.pos}"`);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every sense language code matches its file", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const sense of sensesFor(lang)) {
        if (sense.language !== lang) {
          errors.push(
            `${lang} ${sense.headword}: language field "${sense.language}" does not match file`,
          );
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("no abbreviation senses in output", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const sense of sensesFor(lang)) {
        if (sense.gloss?.toLowerCase().startsWith("abbreviation of")) {
          errors.push(
            `${lang} ${sense.headword}: abbreviation sense not filtered`,
          );
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("English senses all have at least one translation", () => {
    const errors: string[] = [];
    for (const sense of sensesFor("en")) {
      if (translationsOf(sense).length === 0) {
        errors.push(
          `en ${sense.headword} (sense ${sense.sense_index}): no translations`,
        );
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("non-English senses have no translations", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      if (lang === "en") continue;
      for (const sense of sensesFor(lang)) {
        if (translationsOf(sense).length > 0) {
          errors.push(
            `${lang} ${sense.headword}: unexpected translations in non-English file`,
          );
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("all translation target languages are supported and not English", () => {
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const sense of sensesFor("en")) {
      for (const t of translationsOf(sense)) {
        if (!validLangs.has(t.target_lang)) {
          errors.push(
            `en ${sense.headword}: unsupported translation language "${t.target_lang}"`,
          );
        }
        if (t.target_lang === "en") {
          errors.push(
            `en ${sense.headword}: translation to same language "en"`,
          );
        }
        if (!t.word?.trim()) {
          errors.push(
            `en ${sense.headword}: empty translation word for ${t.target_lang}`,
          );
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("sense_index is unique per headword and pos within each language", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      // Sense indexes already seen for each "headword|pos" pair in this language.
      const seen = new Map<string, Set<number>>();
      for (const sense of sensesFor(lang)) {
        const key = `${sense.headword}|${sense.pos}`;
        let indexes = seen.get(key);
        if (!indexes) {
          indexes = new Set();
          seen.set(key, indexes);
        }
        if (indexes.has(sense.sense_index)) {
          errors.push(
            `${lang} ${sense.headword} (${sense.pos}): duplicate sense_index ${sense.sense_index}`,
          );
        }
        indexes.add(sense.sense_index);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});
|
||||||
Loading…
Add table
Add a link
Reference in a new issue