diff --git a/.gitignore b/.gitignore
index ad49f49..893044a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,6 @@ data-pipeline/stage-1-extract/output/
 data-pipeline/stage-2-annotate/output/
 data-pipeline/stage-3-enrich/output/
 data-pipeline/stage-4-merge/output/
+data-pipeline/reports/
+# SQLite database and its sidecar files (-shm, -wal, -journal)
+data-pipeline/db/pipeline.db*
diff --git a/data-pipeline/db/pipeline.db b/data-pipeline/db/pipeline.db
deleted file mode 100644
index e7c3bbe..0000000
Binary files a/data-pipeline/db/pipeline.db and /dev/null differ
diff --git a/data-pipeline/db/pipeline.db-shm b/data-pipeline/db/pipeline.db-shm
new file mode 100644
index 0000000..fe9ac28
Binary files /dev/null and b/data-pipeline/db/pipeline.db-shm differ
diff --git a/data-pipeline/db/pipeline.db-wal b/data-pipeline/db/pipeline.db-wal
new file mode 100644
index 0000000..e69de29
diff --git a/data-pipeline/tests/validation/db-import.validation.test.ts b/data-pipeline/tests/validation/db-import.validation.test.ts
new file mode 100644
index 0000000..23c56e7
--- /dev/null
+++ b/data-pipeline/tests/validation/db-import.validation.test.ts
@@ -0,0 +1,237 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { describe, it, expect, beforeAll } from "vitest";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+type Example = { text: string; source: "omw" | "cefr" };
+/** Maps language codes to `T`; any language may be absent. */
+type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;
+/** Shape this test assumes for each element of a stage 2 `<lang>.json`. */
+interface AnnotatedRecord {
+  source_id: string;
+  pos: SupportedPos;
+  translations: PerLanguage<string[]>;
+  glosses: PerLanguage<string[]>;
+  examples: PerLanguage<Example[]>;
+  votes: PerLanguage<Record<string, { cefr_source: string }>>;
+}
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const DB_PATH = path.resolve("db/pipeline.db"); // relative to process.cwd()
+const OMW_PATH = path.resolve("stage-1-extract/output/omw.json"); // synsets
+const ANNOTATED_DIR = path.resolve("stage-2-annotate/output"); // <lang>.json
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+// Resolves to true when fs.access(DB_PATH) succeeds. Any rejection (for
+// example a missing file) resolves to false instead of propagating.
+async function dbExists(): Promise<boolean> {
+  return fs.access(DB_PATH).then(
+    () => true,
+    () => false,
+  );
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+describe("pipeline.db — import validation", () => {
+  let db: import("better-sqlite3").Database;
+  let expectedSynsetCount: number;
+  let expectedCefrVoteCount: number;
+
+  // Opens pipeline.db read-only and derives the expected row counts from the
+  // stage 1/2 JSON files; does nothing when the database file is missing.
+  // Vitest runs the returned function after the suite, closing the handle.
+  beforeAll(async () => {
+    if (!(await dbExists())) return undefined;
+
+    const Database = (await import("better-sqlite3")).default;
+    const handle = new Database(DB_PATH, { readonly: true });
+    try {
+      handle.pragma("foreign_keys = ON");
+
+      // Expected synsets: one per element of omw.json.
+      const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
+      expectedSynsetCount = (JSON.parse(omwRaw) as unknown[]).length;
+
+      // Expected CEFR votes: en.json is the base and the votes found in the
+      // other language files are merged into the record with the same
+      // source_id — the same way the import script does.
+      const byId = new Map<string, AnnotatedRecord>();
+      const enPath = path.join(ANNOTATED_DIR, "en.json");
+      const enRaw = await fs.readFile(enPath, "utf-8");
+      for (const record of JSON.parse(enRaw) as AnnotatedRecord[]) {
+        byId.set(record.source_id, record);
+      }
+
+      for (const lang of SUPPORTED_LANGUAGE_CODES) {
+        if (lang === "en") continue;
+        const file = path.join(ANNOTATED_DIR, `${lang}.json`);
+        const raw = await fs.readFile(file, "utf-8");
+        for (const record of JSON.parse(raw) as AnnotatedRecord[]) {
+          const target = byId.get(record.source_id);
+          if (!target) continue; // source_id absent from en.json: ignored
+          for (const [code, langVotes] of Object.entries(record.votes)) {
+            const key = code as SupportedLanguageCode;
+            target.votes[key] = { ...target.votes[key], ...langVotes };
+          }
+        }
+      }
+
+      expectedCefrVoteCount = 0;
+      for (const record of byId.values()) {
+        for (const langVotes of Object.values(record.votes)) {
+          expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
+        }
+      }
+    } catch (error) {
+      handle.close(); // setup failed: do not leak the open connection
+      throw error;
+    }
+    db = handle;
+    return () => handle.close();
+  }, 120_000);
+
+  // Fails, after printing a setup hint, when the database file is missing;
+  // the other tests in this suite return early in that situation.
+  it("pipeline.db exists — skipping all tests if not", async () => {
+    const found = await dbExists();
+    const hint =
+      "\n  pipeline.db not found — run pnpm db:init and pnpm db:import first\n";
+    if (!found) console.warn(hint);
+    expect(found).toBe(true);
+  });
+
+  // The `synsets` table must hold as many rows as omw.json has records.
+  it("synsets count matches omw.json", () => {
+    if (!db) return; // database missing: nothing to compare
+    const sql = "SELECT COUNT(*) as count FROM synsets";
+    const { count } = db.prepare(sql).get() as { count: number };
+    expect(count).toBe(expectedSynsetCount);
+  });
+
+  // Lists synsets with no matching row in `translations`.
+  it("every synset has at least one translation", () => {
+    if (!db) return;
+    const untranslated = db.prepare(
+      `
+        SELECT s.source_id
+        FROM synsets s
+        LEFT JOIN translations t ON t.source_id = s.source_id
+        WHERE t.id IS NULL
+      `,
+    );
+    const rows = untranslated.all() as { source_id: string }[];
+
+    const errors = rows.map((row) => `${row.source_id}: no translations`);
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  // Lists translations whose source_id matches no row in `synsets`.
+  it("every translation belongs to a valid synset", () => {
+    if (!db) return;
+    const orphaned = db.prepare(
+      `
+        SELECT t.id, t.source_id
+        FROM translations t
+        LEFT JOIN synsets s ON s.source_id = t.source_id
+        WHERE s.source_id IS NULL
+      `,
+    );
+    const rows = orphaned.all() as { id: number; source_id: string }[];
+
+    const errors = rows.map(
+      (t) => `translation ${t.id}: references missing synset ${t.source_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  // Lists votes whose translation_id matches no row in `translations`.
+  it("every cefr_source_vote references a valid translation", () => {
+    if (!db) return;
+    const dangling = db.prepare(
+      `
+        SELECT v.id, v.translation_id
+        FROM cefr_source_votes v
+        LEFT JOIN translations t ON t.id = v.translation_id
+        WHERE t.id IS NULL
+      `,
+    );
+    const rows = dangling.all() as { id: number; translation_id: number }[];
+
+    const errors = rows.map(
+      (v) =>
+        `cefr_vote ${v.id}: references missing translation ${v.translation_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  // Compares the `cefr_source_votes` row count with the stage 2 vote tally.
+  it("cefr_source_votes count matches stage 2 annotated output", () => {
+    if (!db) return; // database missing: nothing to compare
+    const sql = "SELECT COUNT(*) as count FROM cefr_source_votes";
+    const { count } = db.prepare(sql).get() as { count: number };
+    expect(count).toBe(expectedCefrVoteCount);
+  });
+
+  // Lists examples whose `source` column is neither 'omw' nor 'cefr'.
+  it("every example has a valid source", () => {
+    if (!db) return;
+    type Row = { source_id: string; language: string; source: string };
+    const badSource = db.prepare(
+      `
+        SELECT source_id, language, source
+        FROM examples
+        WHERE source NOT IN ('omw', 'cefr')
+      `,
+    );
+
+    const errors = (badSource.all() as Row[]).map(
+      (e) =>
+        `${e.source_id} (${e.language}): invalid example source "${e.source}"`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  // Lists examples whose source_id matches no row in `synsets`.
+  it("every example belongs to a valid synset", () => {
+    if (!db) return;
+    const orphaned = db.prepare(
+      `
+        SELECT e.id, e.source_id
+        FROM examples e
+        LEFT JOIN synsets s ON s.source_id = e.source_id
+        WHERE s.source_id IS NULL
+      `,
+    );
+    const rows = orphaned.all() as { id: number; source_id: string }[];
+
+    const errors = rows.map(
+      (e) => `example ${e.id}: references missing synset ${e.source_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  // Lists glosses whose source_id matches no row in `synsets`.
+  it("every gloss belongs to a valid synset", () => {
+    if (!db) return;
+    const orphaned = db.prepare(
+      `
+        SELECT g.id, g.source_id
+        FROM glosses g
+        LEFT JOIN synsets s ON s.source_id = g.source_id
+        WHERE s.source_id IS NULL
+      `,
+    );
+    const rows = orphaned.all() as { id: number; source_id: string }[];
+
+    const errors = rows.map(
+      (g) => `gloss ${g.id}: references missing synset ${g.source_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+});