import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect, beforeAll, afterAll } from "vitest";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";

// ── Types ─────────────────────────────────────────────────────────────────────

/** A usage example together with the dataset it was taken from. */
type Example = { text: string; source: "omw" | "cefr" };

/**
 * One record of a stage-2 annotated output file.
 *
 * NOTE(review): the generic arguments of this type were lost in the copy of
 * the file under review. Only `source_id` and `votes` are read by this test,
 * so the remaining value types are deliberately conservative (`unknown`) —
 * tighten them against the stage-2 writer if it is available.
 */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, unknown>>;
  glosses: Partial<Record<SupportedLanguageCode, unknown>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // Per language: an object whose top-level keys are what this test counts
  // as votes. The value shape is not visible here, hence `unknown`.
  votes: Partial<Record<SupportedLanguageCode, Record<string, unknown>>>;
};

// ── Paths ─────────────────────────────────────────────────────────────────────

// All paths resolve against the process working directory.
const DB_PATH = path.resolve("db/pipeline.db");
const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");

// ── Helpers ───────────────────────────────────────────────────────────────────

/**
 * Reports whether the pipeline database file is accessible.
 *
 * @returns `true` when `fs.access` succeeds on `DB_PATH`, `false` when it
 *   rejects for any reason.
 */
async function dbExists(): Promise<boolean> {
  try {
    await fs.access(DB_PATH);
    return true;
  } catch {
    return false;
  }
}

// ── Tests ─────────────────────────────────────────────────────────────────────

describe("pipeline.db — import validation", () => {
  // Stays undefined when the database file is missing; every test except the
  // existence check then returns early instead of failing on a missing handle.
  let db: import("better-sqlite3").Database | undefined;
  let expectedSynsetCount: number;
  let expectedCefrVoteCount: number;

  beforeAll(async () => {
    if (!(await dbExists())) return;

    const Database = (await import("better-sqlite3")).default;
    db = new Database(DB_PATH, { readonly: true });
    db.pragma("foreign_keys = ON");

    // Count expected synsets from omw.json
    const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
    const omwRecords = JSON.parse(omwRaw) as unknown[];
    expectedSynsetCount = omwRecords.length;

    // Count expected CEFR votes from stage 2 annotated files.
    // Merge all language files the same way the import script does —
    // use en.json as base and merge votes from the other language files.
    const byId = new Map<string, AnnotatedRecord>();

    const baseRaw = await fs.readFile(
      path.join(ANNOTATED_DIR, "en.json"),
      "utf-8",
    );
    const base = JSON.parse(baseRaw) as AnnotatedRecord[];
    for (const record of base) {
      byId.set(record.source_id, record);
    }

    // Files are read one after another so that later languages overwrite
    // earlier ones in a fixed order when vote keys collide.
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      if (lang === "en") continue;

      const raw = await fs.readFile(
        path.join(ANNOTATED_DIR, `${lang}.json`),
        "utf-8",
      );
      const records = JSON.parse(raw) as AnnotatedRecord[];

      for (const record of records) {
        // Records that are absent from en.json are ignored.
        const target = byId.get(record.source_id);
        if (!target) continue;

        for (const [l, langVotes] of Object.entries(record.votes)) {
          const code = l as SupportedLanguageCode;
          // Shallow merge into the existing per-language object, creating it
          // on first sight.
          const merged = target.votes[code] ?? {};
          Object.assign(merged, langVotes);
          target.votes[code] = merged;
        }
      }
    }

    // One expected vote per top-level key of every per-language vote object.
    expectedCefrVoteCount = 0;
    for (const record of byId.values()) {
      for (const langVotes of Object.values(record.votes)) {
        expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
      }
    }
  }, 120_000);

  // Release the SQLite handle once the suite is done so the database file is
  // not left open by the test process.
  afterAll(() => {
    db?.close();
  });

  it("pipeline.db exists — skipping all tests if not", async () => {
    const exists = await dbExists();
    if (!exists) {
      console.warn(
        "\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
      );
    }
    expect(exists).toBe(true);
  });

  it("synsets count matches omw.json", () => {
    if (!db) return;
    const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
      count: number;
    };
    expect(row.count).toBe(expectedSynsetCount);
  });

  it("every synset has at least one translation", () => {
    if (!db) return;
    // Anti-join: synsets with no matching translation row.
    const rows = db
      .prepare(
        `
        SELECT s.source_id
        FROM synsets s
        LEFT JOIN translations t ON t.source_id = s.source_id
        WHERE t.id IS NULL
      `,
      )
      .all() as { source_id: string }[];

    const errors = rows.map((r) => `${r.source_id}: no translations`);
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every translation belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT t.id, t.source_id
        FROM translations t
        LEFT JOIN synsets s ON s.source_id = t.source_id
        WHERE s.source_id IS NULL
      `,
      )
      .all() as { id: number; source_id: string }[];

    const errors = rows.map(
      (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every cefr_source_vote references a valid translation", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT v.id, v.translation_id
        FROM cefr_source_votes v
        LEFT JOIN translations t ON t.id = v.translation_id
        WHERE t.id IS NULL
      `,
      )
      .all() as { id: number; translation_id: number }[];

    const errors = rows.map(
      (r) =>
        `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("cefr_source_votes count matches stage 2 annotated output", () => {
    if (!db) return;
    const row = db
      .prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
      .get() as { count: number };
    expect(row.count).toBe(expectedCefrVoteCount);
  });

  it("every example has a valid source", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT source_id, language, source
        FROM examples
        WHERE source NOT IN ('omw', 'cefr')
      `,
      )
      .all() as { source_id: string; language: string; source: string }[];

    const errors = rows.map(
      (r) =>
        `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every example belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT e.id, e.source_id
        FROM examples e
        LEFT JOIN synsets s ON s.source_id = e.source_id
        WHERE s.source_id IS NULL
      `,
      )
      .all() as { id: number; source_id: string }[];

    const errors = rows.map(
      (r) => `example ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every gloss belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT g.id, g.source_id
        FROM glosses g
        LEFT JOIN synsets s ON s.source_id = g.source_id
        WHERE s.source_id IS NULL
      `,
      )
      .all() as { id: number; source_id: string }[];

    const errors = rows.map(
      (r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});