diff --git a/data-pipeline/tests/validation/db-import.validation.test.ts b/data-pipeline/tests/validation/db-import.validation.test.ts
new file mode 100644
index 0000000..56262d9
--- /dev/null
+++ b/data-pipeline/tests/validation/db-import.validation.test.ts
@@ -0,0 +1,257 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { describe, it, expect, beforeAll, afterAll } from "vitest";
+import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Shape assumed for each element of a stage-1 `<lang>.json` output file.
+ * The JSON is cast to this type, not validated against it.
+ */
+type ExtractedSense = {
+  headword: string;
+  language: SupportedLanguageCode;
+  pos: SupportedPos;
+  sense_index: number;
+  gloss: string | null;
+  examples: string[];
+  translations: {
+    target_lang: SupportedLanguageCode;
+    word: string;
+    sense_hint: string | null;
+  }[];
+};
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const DB_PATH = path.resolve("db/pipeline.db");
+const OUTPUT_DIR = path.resolve("stage-1-extract/output");
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/** Resolves to `true` when `DB_PATH` is accessible, `false` otherwise. */
+async function dbExists(): Promise<boolean> {
+  try {
+    await fs.access(DB_PATH);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/** Returns `true` when `err` carries the Node.js `ENOENT` error code. */
+function isMissingFileError(err: unknown): boolean {
+  return (
+    typeof err === "object" &&
+    err !== null &&
+    (err as { code?: unknown }).code === "ENOENT"
+  );
+}
+
+/** Builds a `?, ?, ...` bind-parameter list containing `count` placeholders. */
+function placeholders(count: number): string {
+  return Array.from({ length: count }, () => "?").join(", ");
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("pipeline.db — import validation", () => {
+  // Undefined when pipeline.db is absent; DB-backed tests return early.
+  let db: import("better-sqlite3").Database | undefined;
+  const expectedEntriesByLang = new Map<SupportedLanguageCode, number>();
+  let expectedTotalTranslations = 0;
+
+  beforeAll(async () => {
+    if (!(await dbExists())) return;
+
+    const Database = (await import("better-sqlite3")).default;
+    db = new Database(DB_PATH, { readonly: true });
+    db.pragma("foreign_keys = ON");
+
+    // Derive the expected row counts from the stage-1 output files.
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      let senses: ExtractedSense[];
+      try {
+        const raw = await fs.readFile(
+          path.join(OUTPUT_DIR, `${lang}.json`),
+          "utf-8",
+        );
+        senses = JSON.parse(raw) as ExtractedSense[];
+      } catch (err) {
+        // A language with no output file is expected to have no entries.
+        // Any other failure (unreadable file, malformed JSON) is rethrown so
+        // it is not mistaken for "zero expected rows".
+        if (!isMissingFileError(err)) throw err;
+        expectedEntriesByLang.set(lang, 0);
+        continue;
+      }
+
+      expectedEntriesByLang.set(lang, senses.length);
+      // Only the English file's translations are counted.
+      if (lang === "en") {
+        for (const sense of senses) {
+          expectedTotalTranslations += sense.translations.length;
+        }
+      }
+    }
+  }, 30_000);
+
+  afterAll(() => {
+    // Release the SQLite handle opened in beforeAll.
+    db?.close();
+  });
+
+  it("pipeline.db exists — skipping all tests if not", async () => {
+    const exists = await dbExists();
+    if (!exists) {
+      console.warn(
+        "\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
+      );
+    }
+    expect(exists).toBe(true);
+  });
+
+  it("entry count per language matches source files", () => {
+    if (!db) return;
+    const errors: string[] = [];
+
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const expected = expectedEntriesByLang.get(lang) ?? 0;
+      const row = db
+        .prepare("SELECT COUNT(*) as count FROM entries WHERE language = ?")
+        .get(lang) as { count: number };
+
+      if (row.count !== expected) {
+        errors.push(`${lang}: expected ${expected} entries, got ${row.count}`);
+      }
+    }
+
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("translation count matches source files", () => {
+    if (!db) return;
+    const row = db
+      .prepare("SELECT COUNT(*) as count FROM translations")
+      .get() as { count: number };
+    expect(row.count).toBe(expectedTotalTranslations);
+  });
+
+  it("every translation references a valid entry", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `SELECT t.id, t.entry_id
+         FROM translations t
+         LEFT JOIN entries e ON e.id = t.entry_id
+         WHERE e.id IS NULL`,
+      )
+      .all() as { id: number; entry_id: number }[];
+
+    const errors = rows.map(
+      (r) => `translation ${r.id}: references missing entry ${r.entry_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every entry has a valid language code", () => {
+    if (!db) return;
+    const langParams = placeholders(SUPPORTED_LANGUAGE_CODES.length);
+    const rows = db
+      .prepare(
+        `SELECT id, headword, language FROM entries
+         WHERE language NOT IN (${langParams})`,
+      )
+      .all(...SUPPORTED_LANGUAGE_CODES) as {
+      id: number;
+      headword: string;
+      language: string;
+    }[];
+
+    const errors = rows.map(
+      (r) => `entry ${r.id} "${r.headword}": invalid language "${r.language}"`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every entry has a valid pos", () => {
+    if (!db) return;
+    // Use the shared POS list so this check cannot drift from stage 1's.
+    const validPos = [...SUPPORTED_POS];
+    const rows = db
+      .prepare(
+        `SELECT id, headword, pos FROM entries
+         WHERE pos NOT IN (${placeholders(validPos.length)})`,
+      )
+      .all(...validPos) as { id: number; headword: string; pos: string }[];
+
+    const errors = rows.map(
+      (r) => `entry ${r.id} "${r.headword}": invalid pos "${r.pos}"`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("sense_index is unique per headword, language, pos", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `SELECT headword, language, pos, sense_index, COUNT(*) as c
+         FROM entries
+         GROUP BY headword, language, pos, sense_index
+         HAVING c > 1`,
+      )
+      .all() as {
+      headword: string;
+      language: string;
+      pos: string;
+      sense_index: number;
+      c: number;
+    }[];
+
+    const errors = rows.map(
+      (r) =>
+        `"${r.headword}" (${r.language} ${r.pos}): duplicate sense_index ${r.sense_index} (${r.c} rows)`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("non-English entries have no translations", () => {
+    if (!db) return;
+    const nonEnLangs = SUPPORTED_LANGUAGE_CODES.filter((l) => l !== "en");
+
+    const rows = db
+      .prepare(
+        `SELECT e.headword, e.language, COUNT(t.id) as c
+         FROM entries e
+         JOIN translations t ON t.entry_id = e.id
+         WHERE e.language IN (${placeholders(nonEnLangs.length)})
+         GROUP BY e.id`,
+      )
+      .all(...nonEnLangs) as { headword: string; language: string; c: number }[];
+
+    const errors = rows.map(
+      (r) => `"${r.headword}" (${r.language}): unexpected ${r.c} translations`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("all translation target languages are supported and not English", () => {
+    if (!db) return;
+    const langParams = placeholders(SUPPORTED_LANGUAGE_CODES.length);
+
+    const rows = db
+      .prepare(
+        `SELECT id, target_lang FROM translations
+         WHERE target_lang NOT IN (${langParams})
+            OR target_lang = 'en'`,
+      )
+      .all(...SUPPORTED_LANGUAGE_CODES) as { id: number; target_lang: string }[];
+
+    const errors = rows.map(
+      (r) => `translation ${r.id}: invalid target_lang "${r.target_lang}"`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+});
diff --git a/data-pipeline/tests/validation/stage-1.validation.test.ts b/data-pipeline/tests/validation/stage-1.validation.test.ts
new file mode 100644
index 0000000..86edac5
--- /dev/null
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@@ -0,0 +1,228 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { describe, it, expect, beforeAll } from "vitest";
+import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Shape assumed for each element of a stage-1 `<lang>.json` output file.
+ * The JSON is cast to this type, not validated against it.
+ */
+type ExtractedSense = {
+  headword: string;
+  language: SupportedLanguageCode;
+  pos: SupportedPos;
+  sense_index: number;
+  gloss: string | null;
+  examples: string[];
+  translations: {
+    target_lang: SupportedLanguageCode;
+    word: string;
+    sense_hint: string | null;
+  }[];
+};
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const OUTPUT_DIR = path.resolve("stage-1-extract/output");
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/** Returns `true` when `err` carries the Node.js `ENOENT` error code. */
+function isMissingFileError(err: unknown): boolean {
+  return (
+    typeof err === "object" &&
+    err !== null &&
+    (err as { code?: unknown }).code === "ENOENT"
+  );
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("stage 1 — Kaikki extraction output validation", () => {
+  const sensesByLang = new Map<SupportedLanguageCode, ExtractedSense[]>();
+
+  /**
+   * Senses loaded for `lang`, or an empty list when the file was missing or
+   * did not contain an array. Those two cases are reported by the file-level
+   * tests, so the per-sense tests simply have nothing to iterate.
+   */
+  function sensesFor(lang: SupportedLanguageCode): ExtractedSense[] {
+    const senses = sensesByLang.get(lang);
+    return Array.isArray(senses) ? senses : [];
+  }
+
+  beforeAll(async () => {
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
+      try {
+        const raw = await fs.readFile(filePath, "utf-8");
+        sensesByLang.set(lang, JSON.parse(raw) as ExtractedSense[]);
+      } catch (err) {
+        // A missing file must not abort the whole suite: leave the language
+        // unloaded so the existence test can name it. Other errors (for
+        // example malformed JSON) are rethrown.
+        if (!isMissingFileError(err)) throw err;
+      }
+    }
+  }, 30_000);
+
+  it("all five language output files exist", async () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      try {
+        await fs.access(path.join(OUTPUT_DIR, `${lang}.json`));
+      } catch {
+        errors.push(`missing: ${lang}.json`);
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every language file is a non-empty array", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const senses = sensesByLang.get(lang);
+      if (!Array.isArray(senses)) errors.push(`${lang}: not an array`);
+      else if (senses.length === 0) errors.push(`${lang}: empty array`);
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every sense has required fields", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesFor(lang)) {
+        if (!sense.headword) errors.push(`${lang}: sense missing headword`);
+        if (!sense.language)
+          errors.push(`${lang} ${sense.headword}: missing language`);
+        if (!sense.pos) errors.push(`${lang} ${sense.headword}: missing pos`);
+        // `== null` also catches an explicit JSON null, not just a missing key.
+        if (sense.sense_index == null)
+          errors.push(`${lang} ${sense.headword}: missing sense_index`);
+        if (!Array.isArray(sense.examples))
+          errors.push(`${lang} ${sense.headword}: examples not an array`);
+        if (!Array.isArray(sense.translations))
+          errors.push(`${lang} ${sense.headword}: translations not an array`);
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every sense has a valid pos", () => {
+    const errors: string[] = [];
+    const validPos = new Set<string>(SUPPORTED_POS);
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesFor(lang)) {
+        if (!validPos.has(sense.pos)) {
+          errors.push(`${lang} ${sense.headword}: invalid pos "${sense.pos}"`);
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("every sense language code matches its file", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesFor(lang)) {
+        if (sense.language !== lang) {
+          errors.push(
+            `${lang} ${sense.headword}: language field "${sense.language}" does not match file`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("no abbreviation senses in output", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesFor(lang)) {
+        if (sense.gloss?.toLowerCase().startsWith("abbreviation of")) {
+          errors.push(
+            `${lang} ${sense.headword}: abbreviation sense not filtered`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("English senses all have at least one translation", () => {
+    const errors: string[] = [];
+    for (const sense of sensesFor("en")) {
+      if (sense.translations.length === 0) {
+        errors.push(
+          `en ${sense.headword} (sense ${sense.sense_index}): no translations`,
+        );
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("non-English senses have no translations", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      if (lang === "en") continue;
+      for (const sense of sensesFor(lang)) {
+        if (sense.translations.length > 0) {
+          errors.push(
+            `${lang} ${sense.headword}: unexpected translations in non-English file`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("all translation target languages are supported and not English", () => {
+    const errors: string[] = [];
+    const validLangs = new Set<string>(SUPPORTED_LANGUAGE_CODES);
+    for (const sense of sensesFor("en")) {
+      for (const t of sense.translations) {
+        if (!validLangs.has(t.target_lang)) {
+          errors.push(
+            `en ${sense.headword}: unsupported translation language "${t.target_lang}"`,
+          );
+        }
+        if (t.target_lang === "en") {
+          errors.push(
+            `en ${sense.headword}: translation to same language "en"`,
+          );
+        }
+        if (!t.word?.trim()) {
+          errors.push(
+            `en ${sense.headword}: empty translation word for ${t.target_lang}`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("sense_index is unique per headword and pos within each language", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const seen = new Map<string, Set<number>>();
+      for (const sense of sensesFor(lang)) {
+        const key = `${sense.headword}|${sense.pos}`;
+        let indexes = seen.get(key);
+        if (!indexes) {
+          indexes = new Set<number>();
+          seen.set(key, indexes);
+        }
+        if (indexes.has(sense.sense_index)) {
+          errors.push(
+            `${lang} ${sense.headword} (${sense.pos}): duplicate sense_index ${sense.sense_index}`,
+          );
+        }
+        indexes.add(sense.sense_index);
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+});