import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect, beforeAll, afterAll } from "vitest";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";

// ── Types ─────────────────────────────────────────────────────────────────────

/** One record as read from a per-language `<lang>.json` extraction output file. */
type ExtractedSense = {
  headword: string;
  language: SupportedLanguageCode;
  pos: SupportedPos;
  sense_index: number;
  gloss: string | null;
  examples: string[];
  translations: {
    target_lang: SupportedLanguageCode;
    word: string;
    sense_hint: string | null;
  }[];
};

/** Row counts the database is expected to contain for one source file. */
type ExpectedCounts = { entries: number; translations: number };

// ── Paths ─────────────────────────────────────────────────────────────────────

// Both paths are resolved against the current working directory.
const DB_PATH = path.resolve("db/pipeline.db");
const OUTPUT_DIR = path.resolve("stage-1-extract/output");

// ── Helpers ───────────────────────────────────────────────────────────────────

/** Resolves to `true` when `DB_PATH` is accessible, `false` otherwise. */
async function dbExists(): Promise<boolean> {
  try {
    await fs.access(DB_PATH);
    return true;
  } catch {
    return false;
  }
}

/** Tells a "file does not exist" failure apart from any other read/parse error. */
function isMissingFileError(error: unknown): boolean {
  return (
    error instanceof Error &&
    (error as NodeJS.ErrnoException).code === "ENOENT"
  );
}

/**
 * Reads `<OUTPUT_DIR>/<lang>.json` and derives the counts the tests compare
 * against the database.
 *
 * Translations are only summed for the `"en"` file; every other language
 * contributes entries only.
 *
 * @param lang Language code whose output file should be read.
 * @returns The counts for that file, or zeros when the file is missing or
 *   unusable. Anything other than a missing file is reported with a warning
 *   so a corrupt source file is not silently mistaken for an empty one.
 */
async function readExpectedCounts(
  lang: SupportedLanguageCode,
): Promise<ExpectedCounts> {
  const file = path.join(OUTPUT_DIR, `${lang}.json`);
  try {
    const raw = await fs.readFile(file, "utf-8");
    const senses = JSON.parse(raw) as ExtractedSense[];
    if (!Array.isArray(senses)) {
      throw new TypeError("expected a JSON array of senses");
    }

    // Sum into a local first so a failure part-way through cannot leave a
    // partial total behind.
    let translations = 0;
    if (lang === "en") {
      for (const sense of senses) {
        translations += sense.translations.length;
      }
    }
    return { entries: senses.length, translations };
  } catch (error: unknown) {
    if (!isMissingFileError(error)) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`\n could not read ${file}: ${reason}\n`);
    }
    return { entries: 0, translations: 0 };
  }
}

/**
 * Renders values as a comma-separated list of single-quoted SQL string
 * literals, for use inside an `IN (...)` clause.
 *
 * Only used with the language-code constants from `@lila/shared`; no escaping
 * is performed, so it must not be fed untrusted input.
 */
function sqlStringList(values: readonly string[]): string {
  return values.map((value) => `'${value}'`).join(", ");
}

// ── Tests ─────────────────────────────────────────────────────────────────────

describe("pipeline.db — import validation", () => {
  // Stays undefined when the database file is absent; every test other than
  // the existence check then returns early.
  let db: import("better-sqlite3").Database | undefined;
  let expectedEntriesByLang: Map<SupportedLanguageCode, number>;
  let expectedTotalTranslations: number;

  beforeAll(async () => {
    if (!(await dbExists())) return;

    const Database = (await import("better-sqlite3")).default;
    db = new Database(DB_PATH, { readonly: true });
    db.pragma("foreign_keys = ON");

    expectedEntriesByLang = new Map<SupportedLanguageCode, number>();
    expectedTotalTranslations = 0;

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const counts = await readExpectedCounts(lang);
      expectedEntriesByLang.set(lang, counts.entries);
      expectedTotalTranslations += counts.translations;
    }
  }, 30_000);

  afterAll(() => {
    // Release the SQLite handle opened in beforeAll.
    db?.close();
  });

  it("pipeline.db exists — skipping all tests if not", async () => {
    const exists = await dbExists();
    if (!exists) {
      console.warn(
        "\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
      );
    }
    expect(exists).toBe(true);
  });

  it("entry count per language matches source files", () => {
    if (!db) return;

    const countByLanguage = db.prepare(
      "SELECT COUNT(*) as count FROM entries WHERE language = ?",
    );

    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const expected = expectedEntriesByLang.get(lang) ?? 0;
      const row = countByLanguage.get(lang) as { count: number };
      if (row.count !== expected) {
        errors.push(`${lang}: expected ${expected} entries, got ${row.count}`);
      }
    }

    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("translation count matches source files plus reverse links", () => {
    if (!db) return;

    const row = db
      .prepare("SELECT COUNT(*) as count FROM translations")
      .get() as { count: number };
    const reverseLinks = db
      .prepare(
        "SELECT COUNT(*) as count FROM translations WHERE source = 'reverse_link'",
      )
      .get() as { count: number };

    // Rows tagged 'reverse_link' have no counterpart in the source files, so
    // they are added to the expected total rather than compared against it.
    expect(row.count).toBe(expectedTotalTranslations + reverseLinks.count);
  });

  it("every translation references a valid entry", () => {
    if (!db) return;

    const rows = db
      .prepare(
        `SELECT t.id, t.entry_id
         FROM translations t
         LEFT JOIN entries e ON e.id = t.entry_id
         WHERE e.id IS NULL`,
      )
      .all() as { id: number; entry_id: number }[];

    const errors = rows.map(
      (r) => `translation ${r.id}: references missing entry ${r.entry_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every entry has a valid language code", () => {
    if (!db) return;

    const validLangs = sqlStringList(SUPPORTED_LANGUAGE_CODES);
    const rows = db
      .prepare(
        `SELECT id, headword, language
         FROM entries
         WHERE language NOT IN (${validLangs})`,
      )
      .all() as { id: number; headword: string; language: string }[];

    const errors = rows.map(
      (r) => `entry ${r.id} "${r.headword}": invalid language "${r.language}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every entry has a valid pos", () => {
    if (!db) return;

    // NOTE(review): the accepted values are hard-coded here rather than taken
    // from a shared constant — confirm they match SupportedPos.
    const rows = db
      .prepare(
        `SELECT id, headword, pos
         FROM entries
         WHERE pos NOT IN ('noun', 'verb', 'adjective', 'adverb')`,
      )
      .all() as { id: number; headword: string; pos: string }[];

    const errors = rows.map(
      (r) => `entry ${r.id} "${r.headword}": invalid pos "${r.pos}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("sense_index is unique per headword, language, pos", () => {
    if (!db) return;

    const rows = db
      .prepare(
        `SELECT headword, language, pos, sense_index, COUNT(*) as c
         FROM entries
         GROUP BY headword, language, pos, sense_index
         HAVING c > 1`,
      )
      .all() as {
      headword: string;
      language: string;
      pos: string;
      sense_index: number;
      c: number;
    }[];

    const errors = rows.map(
      (r) =>
        `"${r.headword}" (${r.language} ${r.pos}): duplicate sense_index ${r.sense_index} (${r.c} rows)`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("non-English entries have no Kaikki translations", () => {
    if (!db) return;

    const nonEnLangs = sqlStringList(
      SUPPORTED_LANGUAGE_CODES.filter((l) => l !== "en"),
    );
    const rows = db
      .prepare(
        `SELECT e.headword, e.language, COUNT(t.id) as c
         FROM entries e
         JOIN translations t ON t.entry_id = e.id
         WHERE e.language IN (${nonEnLangs}) AND t.source = 'kaikki'
         GROUP BY e.id`,
      )
      .all() as { headword: string; language: string; c: number }[];

    const errors = rows.map(
      (r) =>
        `"${r.headword}" (${r.language}): unexpected ${r.c} Kaikki translations`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("all Kaikki translation target languages are supported and not English", () => {
    if (!db) return;

    const validLangs = sqlStringList(SUPPORTED_LANGUAGE_CODES);
    const rows = db
      .prepare(
        `SELECT t.id, t.target_lang
         FROM translations t
         WHERE t.source = 'kaikki'
           AND (t.target_lang NOT IN (${validLangs}) OR t.target_lang = 'en')`,
      )
      .all() as { id: number; target_lang: string }[];

    const errors = rows.map(
      (r) => `translation ${r.id}: invalid target_lang "${r.target_lang}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});