// lila/data-pipeline/tests/validation/db-import.validation.test.ts

// 222 lines
// 7.1 KiB
// TypeScript

import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect, beforeAll } from "vitest";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/** A single translation record nested inside an extracted sense. */
type ExtractedTranslation = {
  target_lang: SupportedLanguageCode;
  word: string;
  sense_hint: string | null;
};

/** Element type that the parsed `<lang>.json` source files are cast to in this suite. */
type ExtractedSense = {
  headword: string;
  language: SupportedLanguageCode;
  pos: SupportedPos;
  sense_index: number;
  gloss: string | null;
  examples: string[];
  translations: ExtractedTranslation[];
};
// ── Paths ─────────────────────────────────────────────────────────────────────
// Both arguments are relative, so path.resolve anchors them to the current
// working directory of the process rather than to this file's location.

// SQLite database file that the suite opens and queries.
const DB_PATH = path.resolve("db/pipeline.db");
// Directory the per-language `<lang>.json` source files are read from.
const OUTPUT_DIR = path.resolve("stage-1-extract/output");
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Reports whether the file at DB_PATH is accessible.
 *
 * @returns `true` when `fs.access` resolves, `false` when it rejects for any reason.
 */
async function dbExists(): Promise<boolean> {
  return fs.access(DB_PATH).then(
    () => true,
    () => false,
  );
}
// ── Tests ─────────────────────────────────────────────────────────────────────
describe("pipeline.db — import validation", () => {
let db: import("better-sqlite3").Database;
let expectedEntriesByLang: Map<SupportedLanguageCode, number>;
let expectedTotalTranslations: number;
// Opens pipeline.db read-only and derives the expected row counts from the
// per-language JSON source files. When the database file is absent nothing is
// initialised: `db` stays unset and the `if (!db) return;` guards in the tests
// make them no-ops.
beforeAll(async () => {
  if (!(await dbExists())) return;
  const Database = (await import("better-sqlite3")).default;
  db = new Database(DB_PATH, { readonly: true });
  db.pragma("foreign_keys = ON");
  expectedEntriesByLang = new Map();
  expectedTotalTranslations = 0;
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const sourcePath = path.join(OUTPUT_DIR, `${lang}.json`);
    let raw: string;
    try {
      raw = await fs.readFile(sourcePath, "utf-8");
    } catch (err: unknown) {
      // Only a missing source file means "no entries expected" for this
      // language. Any other read failure is rethrown so it cannot be mistaken
      // for an empty source and produce a misleading (or falsely passing) count.
      if (err instanceof Error && "code" in err && err.code === "ENOENT") {
        expectedEntriesByLang.set(lang, 0);
        continue;
      }
      throw err;
    }
    // JSON.parse errors propagate on purpose: a corrupt source file must fail
    // the suite instead of silently counting as zero entries.
    const parsed: unknown = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      throw new Error(`${sourcePath}: expected a JSON array of senses`);
    }
    const senses = parsed as ExtractedSense[];
    expectedEntriesByLang.set(lang, senses.length);
    // Only the "en" file contributes to the expected translation total.
    if (lang === "en") {
      for (const sense of senses) {
        expectedTotalTranslations += sense.translations.length;
      }
    }
  }
}, 30_000);
// Fails, after printing a hint, when the database file is missing. The other
// tests in this suite return early on an unset `db` instead of failing.
it("pipeline.db exists — skipping all tests if not", async () => {
  const found = await dbExists();
  if (found === false) {
    console.warn(
      "\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
    );
  }
  expect(found).toBe(true);
});
// Compares, per supported language, the number of rows in `entries` with the
// number of senses read from that language's source file (0 when no count was
// recorded for the language).
it("entry count per language matches source files", () => {
  if (!db) return;
  // Prepared once and re-executed per language; the SQL text is loop-invariant.
  const countForLanguage = db.prepare(
    "SELECT COUNT(*) as count FROM entries WHERE language = ?",
  );
  const errors: string[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const expected = expectedEntriesByLang.get(lang) ?? 0;
    const row = countForLanguage.get(lang) as { count: number };
    if (row.count !== expected) {
      errors.push(`${lang}: expected ${expected} entries, got ${row.count}`);
    }
  }
  // The joined list is passed as the assertion message so every mismatch is
  // reported at once.
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// The total row count of `translations` must equal expectedTotalTranslations,
// which beforeAll sums from the "en" source file only.
it("translation count matches source files", () => {
  if (!db) return;
  const countStatement = db.prepare(
    "SELECT COUNT(*) as count FROM translations",
  );
  const { count } = countStatement.get() as { count: number };
  expect(count).toBe(expectedTotalTranslations);
});
// Anti-join: selects translations whose entry_id matches no row in `entries`.
it("every translation references a valid entry", () => {
  if (!db) return;
  const orphanSql = [
    "SELECT t.id, t.entry_id",
    "FROM translations t",
    "LEFT JOIN entries e ON e.id = t.entry_id",
    "WHERE e.id IS NULL",
  ].join("\n");
  const orphans = db.prepare(orphanSql).all() as {
    id: number;
    entry_id: number;
  }[];
  const errors: string[] = [];
  for (const orphan of orphans) {
    errors.push(
      `translation ${orphan.id}: references missing entry ${orphan.entry_id}`,
    );
  }
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Flags every entry whose language is not one of SUPPORTED_LANGUAGE_CODES.
it("every entry has a valid language code", () => {
  if (!db) return;
  // One "?" per supported code; the codes are bound as parameters instead of
  // being quoted by hand into the SQL text.
  const placeholders = SUPPORTED_LANGUAGE_CODES.map(() => "?").join(", ");
  const rows = db
    .prepare(
      // `NULL NOT IN (...)` evaluates to NULL rather than true, so NULL
      // languages need their own predicate or they slip through unreported.
      `SELECT id, headword, language FROM entries
       WHERE language IS NULL
          OR language NOT IN (${placeholders})`,
    )
    .all(...SUPPORTED_LANGUAGE_CODES) as {
    id: number;
    headword: string;
    language: string | null;
  }[];
  const errors = rows.map(
    (r) => `entry ${r.id} "${r.headword}": invalid language "${r.language}"`,
  );
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Flags every entry whose pos is outside the four accepted values.
// NOTE(review): the list is hard-coded here; presumably it mirrors the
// SupportedPos type from @lila/shared — confirm they stay in sync.
it("every entry has a valid pos", () => {
  if (!db) return;
  const rows = db
    .prepare(
      // `NULL NOT IN (...)` evaluates to NULL rather than true, so a NULL pos
      // needs its own predicate or it slips through unreported.
      `SELECT id, headword, pos FROM entries
       WHERE pos IS NULL
          OR pos NOT IN ('noun', 'verb', 'adjective', 'adverb')`,
    )
    .all() as { id: number; headword: string; pos: string | null }[];
  const errors = rows.map(
    (r) => `entry ${r.id} "${r.headword}": invalid pos "${r.pos}"`,
  );
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Groups entries by (headword, language, pos, sense_index) and reports every
// group that contains more than one row.
it("sense_index is unique per headword, language, pos", () => {
  if (!db) return;
  type DuplicateGroup = {
    headword: string;
    language: string;
    pos: string;
    sense_index: number;
    c: number;
  };
  const duplicateSql = [
    "SELECT headword, language, pos, sense_index, COUNT(*) as c",
    "FROM entries",
    "GROUP BY headword, language, pos, sense_index",
    "HAVING c > 1",
  ].join("\n");
  const duplicates = db.prepare(duplicateSql).all() as DuplicateGroup[];
  const errors: string[] = [];
  for (const dup of duplicates) {
    errors.push(
      `"${dup.headword}" (${dup.language} ${dup.pos}): duplicate sense_index ${dup.sense_index} (${dup.c} rows)`,
    );
  }
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Reports every entry in a supported non-"en" language that has at least one
// row in `translations` (inner join, grouped per entry).
it("non-English entries have no translations", () => {
  if (!db) return;
  const nonEnLangs = SUPPORTED_LANGUAGE_CODES.filter((l) => l !== "en");
  // One "?" per language; the codes are bound as parameters instead of being
  // quoted by hand into the SQL text.
  const placeholders = nonEnLangs.map(() => "?").join(", ");
  const rows = db
    .prepare(
      `SELECT e.headword, e.language, COUNT(t.id) as c
       FROM entries e
       JOIN translations t ON t.entry_id = e.id
       WHERE e.language IN (${placeholders})
       GROUP BY e.id`,
    )
    .all(...nonEnLangs) as { headword: string; language: string; c: number }[];
  const errors = rows.map(
    (r) => `"${r.headword}" (${r.language}): unexpected ${r.c} translations`,
  );
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Flags translations whose target_lang is missing, unsupported, or "en".
it("all translation target languages are supported and not English", () => {
  if (!db) return;
  // One "?" per supported code; the codes are bound as parameters instead of
  // being quoted by hand into the SQL text.
  const placeholders = SUPPORTED_LANGUAGE_CODES.map(() => "?").join(", ");
  const rows = db
    .prepare(
      // `NULL NOT IN (...)` and `NULL = 'en'` both evaluate to NULL, so a NULL
      // target_lang needs its own predicate or it slips through unreported.
      `SELECT id, target_lang FROM translations
       WHERE target_lang IS NULL
          OR target_lang NOT IN (${placeholders})
          OR target_lang = 'en'`,
    )
    .all(...SUPPORTED_LANGUAGE_CODES) as {
    id: number;
    target_lang: string | null;
  }[];
  const errors = rows.map(
    (r) => `translation ${r.id}: invalid target_lang "${r.target_lang}"`,
  );
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
});