feat: add db import script, fix duplicate translations in extract, add annotate script

feat: add stage 1 and 2 validation tests
2026-05-03 22:05:10 +02:00 · 2026-05-03 21:36:56 +02:00
11 changed files with 810 additions and 62 deletions
--- a/data-pipeline/db/import.ts
+++ b/data-pipeline/db/import.ts
@ -0,0 +1,222 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
 import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 import { openDb } from "./index.js";
 // ── Types ─────────────────────────────────────────────────────────────────────
 /** One usage example; `source` records whether it came from OMW or from the CEFR data. */
 type Example = { text: string; source: "omw" | "cefr" };
 /**
  * Shape assumed for each entry of a stage 2 `{lang}.json` file.
  * The parsed JSON is cast to this type; nothing here is validated at runtime.
  */
 type AnnotatedRecord = {
  /** Synset identifier; inserted as `synsets.source_id` and used as the merge key. */
  source_id: string;
  pos: SupportedPos;
  /** Words per language; de-duplicated before being inserted into `translations`. */
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  /**
   * CEFR votes per language, keyed by word. On import the word is looked up in
   * `translations` and `cefr_source` is stored as `cefr_source_votes.cefr_level`.
   */
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 // ESM has no built-in __dirname; derive it from this module's URL.
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 // Locations are resolved relative to this file, not the working directory.
 const PATHS = {
  // Directory holding the stage 2 per-language JSON files ({lang}.json).
  annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
 };
 // ── Loading ───────────────────────────────────────────────────────────────────
 /**
  * Loads every stage 2 annotated file and merges them into one record per
  * `source_id`.
  *
  * `en.json` is the base — it has the most complete glosses and examples. The
  * other language files only contribute their CEFR votes and their
  * CEFR-sourced examples. Records found in another language file but not in
  * `en.json` are ignored.
  *
  * CEFR examples are de-duplicated by text per language, so an example that
  * is already present on the base record (or was contributed by an earlier
  * file) is not added a second time.
  *
  * The parsed JSON is cast to `AnnotatedRecord[]` without runtime validation.
  *
  * @returns One merged record per distinct `source_id` in `en.json`.
  * @throws If a language file cannot be read or does not contain valid JSON.
  */
 async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  const baseRaw = await fs.readFile(
    path.join(PATHS.annotatedDir, "en.json"),
    "utf-8",
  );
  const baseRecords = JSON.parse(baseRaw) as AnnotatedRecord[];
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of baseRecords) {
    byId.set(record.source_id, record);
  }
  for (const fileLang of SUPPORTED_LANGUAGE_CODES) {
    if (fileLang === "en") continue;
    const raw = await fs.readFile(
      path.join(PATHS.annotatedDir, `${fileLang}.json`),
      "utf-8",
    );
    const records = JSON.parse(raw) as AnnotatedRecord[];
    for (const record of records) {
      const target = byId.get(record.source_id);
      if (!target) continue;
      // Merge votes: later files overwrite an existing vote for the same word.
      for (const [l, langVotes] of Object.entries(record.votes)) {
        const lang = l as SupportedLanguageCode;
        const merged = target.votes[lang] ?? {};
        Object.assign(merged, langVotes);
        target.votes[lang] = merged;
      }
      // Merge CEFR examples not already in the base record.
      for (const [l, examples] of Object.entries(record.examples)) {
        const lang = l as SupportedLanguageCode;
        const existing = target.examples[lang] ?? [];
        const seenCefrTexts = new Set(
          existing.filter((e) => e.source === "cefr").map((e) => e.text),
        );
        for (const example of examples) {
          if (example.source !== "cefr") continue;
          if (seenCefrTexts.has(example.text)) continue;
          seenCefrTexts.add(example.text);
          existing.push(example);
        }
        // Do not create an empty entry for a language that had no examples.
        if (existing.length > 0) target.examples[lang] = existing;
      }
    }
  }
  return [...byId.values()];
 }
 // ── Import ────────────────────────────────────────────────────────────────────
 /**
  * Imports the merged stage 2 records into pipeline.db.
  *
  * Inserts one `synsets` row per record plus its translations (de-duplicated
  * per language), glosses, examples and CEFR source votes, then logs the row
  * counts. All inserts run inside a single `db.transaction(...)` call.
  *
  * The database handle is always closed, including when a statement or the
  * transaction throws.
  *
  * NOTE(review): a CEFR vote resolves its `translation_id` with a subquery on
  * (source_id, language, word). If a vote's word has no matching translation
  * row the subquery yields NULL — confirm the schema handles that case.
  *
  * @throws If the annotated files cannot be loaded or any insert fails.
  */
 export async function importStage2(): Promise<void> {
  console.log("Loading stage 2 annotated files...");
  const records = await loadAnnotated();
  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);
  const db = openDb();
  try {
    const insertSynset = db.prepare(
      `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
    );
    const insertTranslation = db.prepare(
      `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
    );
    const insertGloss = db.prepare(
      `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
    );
    const insertExample = db.prepare(
      `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
    );
    const insertCefrVote = db.prepare(`
      INSERT INTO cefr_source_votes (translation_id, cefr_level)
      VALUES (
        (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
        ?
      )
    `);
    console.log("\nImporting into pipeline.db...");
    const importAll = db.transaction(() => {
      let synsets = 0;
      let translations = 0;
      let glosses = 0;
      let examples = 0;
      let cefrVotes = 0;
      for (const record of records) {
        insertSynset.run(record.source_id, record.pos);
        synsets++;
        // Translations — de-duplicated so a repeated word is inserted once.
        for (const [lang, words] of Object.entries(record.translations)) {
          const unique = [...new Set(words)];
          for (const word of unique) {
            insertTranslation.run(record.source_id, lang, word);
            translations++;
          }
        }
        // Glosses
        for (const [lang, glossList] of Object.entries(record.glosses)) {
          for (const text of glossList) {
            insertGloss.run(record.source_id, lang, text);
            glosses++;
          }
        }
        // Examples
        for (const [lang, exList] of Object.entries(record.examples)) {
          for (const example of exList) {
            insertExample.run(
              record.source_id,
              lang,
              example.text,
              example.source,
            );
            examples++;
          }
        }
        // CEFR source votes — must run after the translations above, because
        // the insert looks up the translation row by (source_id, language, word).
        for (const [lang, langVotes] of Object.entries(record.votes)) {
          for (const [word, vote] of Object.entries(
            langVotes as Record<string, { cefr_source: string }>,
          )) {
            insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
            cefrVotes++;
          }
        }
      }
      return { synsets, translations, glosses, examples, cefrVotes };
    });
    const counts = importAll();
    console.log(`  synsets:      ${counts.synsets.toLocaleString()}`);
    console.log(`  translations: ${counts.translations.toLocaleString()}`);
    console.log(`  glosses:      ${counts.glosses.toLocaleString()}`);
    console.log(`  examples:     ${counts.examples.toLocaleString()}`);
    console.log(`  cefr votes:   ${counts.cefrVotes.toLocaleString()}`);
  } finally {
    // Release the handle even when the import fails part-way.
    db.close();
  }
  console.log("\nImport complete.");
 }
 // ── Check if already imported ─────────────────────────────────────────────────
 /**
  * Reports whether the base tables are already populated.
  *
  * Opens pipeline.db, counts the rows in `synsets`, and closes the handle
  * again — also when the query throws (for example if the table does not
  * exist yet).
  *
  * @returns `true` when `synsets` contains at least one row.
  */
 export function isImported(): boolean {
  const db = openDb();
  try {
    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
      count: number;
    };
    return row.count > 0;
  } finally {
    db.close();
  }
 }
 // ── Main ─────────────────────────────────────────────────────────────────────
 /**
  * Script entry point: imports the stage 2 output unless `synsets` already
  * has rows, in which case it logs a hint and finishes without importing.
  *
  * The count query's database handle is closed even if the query throws.
  * The "already imported" path returns instead of calling `process.exit(0)`;
  * the exit code is still 0, but a caller of `main()` is not terminated.
  */
 async function main(): Promise<void> {
  const db = openDb();
  let existingSynsets: number;
  try {
    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
      count: number;
    };
    existingSynsets = row.count;
  } finally {
    db.close();
  }
  if (existingSynsets > 0) {
    console.log(
      `pipeline.db already contains ${existingSynsets.toLocaleString()} synsets — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    return;
  }
  await importStage2();
 }
 // Run the CLI only when this file is the process entry point (e.g.
 // `tsx db/import.ts`). This module also exports importStage2/isImported, and
 // importing it for those must not start an import or exit the host process.
 // NOTE(review): the comparison does not resolve symlinks — confirm the script
 // is never launched through a symlinked path.
 if (
  process.argv[1] !== undefined &&
  path.resolve(process.argv[1]) === fileURLToPath(import.meta.url)
 ) {
  main().catch((err) => {
    console.error(err);
    process.exit(1);
  });
 }
--- a/data-pipeline/db/pipeline.db
+++ b/data-pipeline/db/pipeline.db
--- a/data-pipeline/package.json
+++ b/data-pipeline/package.json
@ -4,7 +4,9 @@
  "private": true,
  "type": "module",
  "scripts": {
    "db:import": "tsx db/import.ts",
    "db:init": "tsx db/init.ts",
    "annotate": "tsx stage-2-annotate/scripts/annotate.ts",
    "test": "vitest run",
    "test:watch": "vitest"
  },
--- a/data-pipeline/stage-1-extract/scripts/extract.py
+++ b/data-pipeline/stage-1-extract/scripts/extract.py
@ -80,7 +80,7 @@ def extract_all(
                    continue
                covered += 1
-                lemmas = [str(lemma) for lemma in synset.lemmas()]
+                lemmas = list(dict.fromkeys(str(lemma) for lemma in synset.lemmas()))
                defns = [d for d in synset.definitions() if d]
                examples = [e for e in synset.examples() if e]
--- a/data-pipeline/stage-2-annotate/scripts/annotate.ts
+++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts
@ -196,12 +196,12 @@ async function annotate(): Promise<void> {
        // Add CEFR vote
        if (!annotated.votes[lang]) annotated.votes[lang] = {};
-        annotated.votes[lang]![word] = { cefr_source: cefrEntry.level };
+        annotated.votes[lang][word] = { cefr_source: cefrEntry.level };
        // Add native example if present
        if (cefrEntry.example) {
          if (!annotated.examples[lang]) annotated.examples[lang] = [];
-          annotated.examples[lang]!.push({
+          annotated.examples[lang].push({
            text: cefrEntry.example,
            source: "cefr" as const,
          });
--- a/data-pipeline/tests/fixtures/annotated.fixture.json
+++ b/data-pipeline/tests/fixtures/annotated.fixture.json
@ -0,0 +1,170 @@
 [
  {
    "_fixture": "noun_with_cefr_vote",
    "source_id": "ili:i100955",
    "pos": "noun",
    "translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
    "glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
    "examples": {
      "en": [
        { "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
      ]
    },
    "votes": { "en": { "grain": { "cefr_source": "B1" } } }
  },
  {
    "_fixture": "verb_no_votes_no_translations",
    "source_id": "ili:i21779",
    "pos": "verb",
    "translations": { "en": ["respire"] },
    "glosses": {
      "en": [
        "undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
      ]
    },
    "examples": {},
    "votes": {}
  },
  {
    "_fixture": "verb_with_cefr_vote_all_languages",
    "source_id": "ili:i21778",
    "pos": "verb",
    "translations": {
      "en": ["breathe", "take a breath", "respire", "suspire"],
      "it": ["respirare"],
      "es": ["aspirar", "respirar"],
      "de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
      "fr": ["inspirer", "respirer"]
    },
    "glosses": {
      "en": ["draw air into, and expel out of, the lungs"],
      "de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
    },
    "examples": {
      "en": [
        {
          "text": "I can breathe better when the air is clean",
          "source": "omw"
        },
        { "text": "The patient is respiring", "source": "omw" }
      ]
    },
    "votes": { "en": { "breathe": { "cefr_source": "A1" } } }
  },
  {
    "_fixture": "adjective_all_languages_multiple_translations",
    "source_id": "ili:i10007",
    "pos": "adjective",
    "translations": {
      "en": ["possible"],
      "it": [
        "attuabile",
        "effettuabile",
        "eseguibile",
        "fattibile",
        "operabile",
        "possibile",
        "producibile",
        "realizzabile"
      ],
      "es": ["posible"],
      "de": [
        "möglich",
        "denkbar",
        "eventuell",
        "möglicherweise",
        "allfällig",
        "etwaig",
        "gegebenenfalls",
        "eventuell"
      ],
      "fr": ["possible", "éventuel"]
    },
    "glosses": {
      "en": ["capable of happening or existing"],
      "de": ["in der Lage, zu geschehen oder zu existieren"]
    },
    "examples": {
      "en": [
        { "text": "a breakthrough may be possible next year", "source": "omw" },
        { "text": "anything is possible", "source": "omw" },
        { "text": "warned of possible consequences", "source": "omw" }
      ]
    },
    "votes": { "en": { "possible": { "cefr_source": "A2" } } }
  },
  {
    "_fixture": "adjective_multiple_de_votes_cefr_examples",
    "source_id": "ili:i10000",
    "pos": "adjective",
    "translations": {
      "en": ["negative"],
      "de": [
        "dürftig",
        "zu wünschen übrig lassen",
        "schlecht",
        "widrig",
        "ungut",
        "lausig",
        "negativ",
        "von Nachteil",
        "schädlich",
        "nachteilig",
        "ungünstig"
      ],
      "fr": ["négatif", "strictement négatif"]
    },
    "glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
    "examples": {
      "en": [{ "text": "a negative number", "source": "omw" }],
      "de": [
        { "text": "Die Beweise waren dürftig.", "source": "cefr" },
        { "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
        {
          "text": "Trotz widriger Umstände haben sie es geschafft.",
          "source": "cefr"
        },
        {
          "text": "Er hatte ein ungutes Gefühl bei der Sache.",
          "source": "cefr"
        },
        { "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
        {
          "text": "Rauchen ist schädlich für die Gesundheit.",
          "source": "cefr"
        },
        {
          "text": "Diese Entscheidung könnte nachteilig sein.",
          "source": "cefr"
        },
        {
          "text": "Das Wetter ist heute ungünstig für einen Ausflug.",
          "source": "cefr"
        }
      ]
    },
    "votes": {
      "de": {
        "dürftig": { "cefr_source": "C1" },
        "schlecht": { "cefr_source": "A1" },
        "widrig": { "cefr_source": "C1" },
        "ungut": { "cefr_source": "B2" },
        "negativ": { "cefr_source": "A2" },
        "schädlich": { "cefr_source": "B1" },
        "nachteilig": { "cefr_source": "B1" },
        "ungünstig": { "cefr_source": "B2" }
      }
    }
  },
  {
    "_fixture": "adverb_no_votes",
    "source_id": "ili:i18157",
    "pos": "adverb",
    "translations": { "en": ["a cappella"], "es": ["a capella"] },
    "glosses": { "en": ["without musical accompaniment"] },
    "examples": {
      "en": [{ "text": "they performed a cappella", "source": "omw" }]
    },
    "votes": {}
  }
 ]
--- a/data-pipeline/tests/fixtures/conflicts.fixture.json
+++ b/data-pipeline/tests/fixtures/conflicts.fixture.json
@ -0,0 +1,4 @@
 [
  { "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
  { "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
 ]
--- a/data-pipeline/tests/validation/stage-1.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@ -0,0 +1,166 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { describe, it, expect } from "vitest";
 import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 /**
  * Shape the tests assume for each entry of stage 1's omw.json.
  * The parsed JSON is cast to this type; the tests below are what actually
  * check the fields. Examples are plain strings at this stage.
  */
 type OmwRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, string[]>>;
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 // Stage 1 output under test. The relative path is resolved against the
 // current working directory, so the suite must be started from the
 // directory that contains stage-1-extract/.
 const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
 // ── Helpers ───────────────────────────────────────────────────────────────────
 /** Returns true when `id` is exactly `ili:i` followed by one or more digits. */
 function isValidSourceId(id: string): boolean {
  const iliPattern = /^ili:i\d+$/;
  const matches = iliPattern.test(id);
  return matches;
 }
 // ── Tests ─────────────────────────────────────────────────────────────────────
 // Validates the stage 1 output file (omw.json): structure, identifiers,
 // parts of speech and translations.
 describe("stage 1 — omw.json validation", () => {
  // omw.json is read and parsed once. Every test awaits the same promise, so
  // a missing or malformed file still fails each test with the original error.
  let cachedRecords: Promise<OmwRecord[]> | undefined;
  function loadRecords(): Promise<OmwRecord[]> {
    if (cachedRecords === undefined) {
      cachedRecords = fs
        .readFile(OMW_PATH, "utf-8")
        .then((raw) => JSON.parse(raw) as OmwRecord[]);
    }
    return cachedRecords;
  }
  it("file exists and is valid JSON", async () => {
    const records = await loadRecords();
    expect(records).toBeDefined();
  });
  it("is a non-empty array", async () => {
    const records = await loadRecords();
    expect(Array.isArray(records)).toBe(true);
    expect(records.length).toBeGreaterThan(0);
  });
  it("every record has required fields", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!record.source_id) {
        // Without an id the remaining messages could not name the record.
        errors.push(`missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${record.source_id}: missing translations`);
      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
      if (!record.examples)
        errors.push(`${record.source_id}: missing examples`);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every source_id matches ili:i{number} pattern", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!isValidSourceId(record.source_id)) {
        errors.push(`invalid source_id: ${record.source_id}`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every source_id is unique", async () => {
    const records = await loadRecords();
    const seen = new Set<string>();
    const errors: string[] = [];
    for (const record of records) {
      if (seen.has(record.source_id)) {
        errors.push(`duplicate source_id: ${record.source_id}`);
      }
      seen.add(record.source_id);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every pos is a valid supported value", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const record of records) {
      if (!validPos.has(record.pos)) {
        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every record has at least one translation in at least one language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const record of records) {
      const langs = Object.keys(record.translations) as SupportedLanguageCode[];
      if (langs.length === 0) {
        errors.push(`${record.source_id}: no translations`);
        continue;
      }
      for (const lang of langs) {
        if (!validLangs.has(lang)) {
          errors.push(`${record.source_id}: unsupported language "${lang}"`);
        }
        const words = record.translations[lang] ?? [];
        if (words.length === 0) {
          errors.push(`${record.source_id}: empty translations for "${lang}"`);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("no duplicate translations within a single synset and language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      for (const [lang, words] of Object.entries(record.translations)) {
        const seen = new Set<string>();
        for (const word of words) {
          if (seen.has(word)) {
            errors.push(
              `${record.source_id} (${lang}): duplicate translation "${word}"`,
            );
          }
          seen.add(word);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
 });
--- a/data-pipeline/tests/validation/stage-2.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-2.validation.test.ts
@ -0,0 +1,218 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { describe, it, expect, beforeAll } from "vitest";
 import {
  SUPPORTED_POS,
  SUPPORTED_LANGUAGE_CODES,
  CEFR_LEVELS,
 } from "@lila/shared";
 import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 /** One usage example; the tests accept only "omw" and "cefr" as `source`. */
 type Example = { text: string; source: "omw" | "cefr" };
 /**
  * Shape the tests assume for each entry of a stage 2 `{lang}.json` file.
  * The parsed JSON is cast to this type, not validated on load.
  */
 type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  /** CEFR votes per language, keyed by word; `cefr_source` is checked against CEFR_LEVELS. */
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
 };
 /** Shape assumed for each entry of conflicts.json. */
 type ConflictEntry = {
  word: string;
  pos: string;
  language: SupportedLanguageCode;
  /** The tests require at least two entries, each a valid CEFR level. */
  levels: string[];
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 // Stage 2 output directory under test. The relative path is resolved against
 // the current working directory, so the suite must be started from the
 // directory that contains stage-2-annotate/.
 const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
 // ── Tests ─────────────────────────────────────────────────────────────────────
 // Validates the stage 2 output: one annotated {lang}.json per supported
 // language plus conflicts.json.
 describe("stage 2 — annotated output validation", () => {
  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
  let conflicts: ConflictEntry[] = [];
  // Read/parse failures are collected instead of thrown from beforeAll, so the
  // dedicated tests below can report exactly which file is missing or broken
  // rather than every test failing with the same setup error.
  const loadErrors: string[] = [];

  function describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  // Records for a language, or an empty list when the file failed to load or
  // did not contain an array (both are reported by other tests).
  function recordsFor(lang: SupportedLanguageCode): AnnotatedRecord[] {
    const loaded = recordsByLang.get(lang);
    return Array.isArray(loaded) ? loaded : [];
  }

  beforeAll(async () => {
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      try {
        const raw = await fs.readFile(
          path.join(OUTPUT_DIR, `${lang}.json`),
          "utf-8",
        );
        recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]);
      } catch (err) {
        loadErrors.push(`${lang}.json: ${describeError(err)}`);
      }
    }
    try {
      const raw = await fs.readFile(
        path.join(OUTPUT_DIR, "conflicts.json"),
        "utf-8",
      );
      conflicts = JSON.parse(raw) as ConflictEntry[];
    } catch (err) {
      loadErrors.push(`conflicts.json: ${describeError(err)}`);
    }
  }, 60_000);
  it("all five language files exist", async () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
      try {
        await fs.access(filePath);
      } catch {
        errors.push(`missing file: ${lang}.json`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("conflicts.json exists", async () => {
    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
    await expect(fs.access(filePath)).resolves.toBeUndefined();
  });
  it("every output file can be read and parsed", () => {
    expect(loadErrors, `\n${loadErrors.join("\n")}`).toHaveLength(0);
  });
  it("every language file is a non-empty array", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsByLang.get(lang);
      if (!Array.isArray(records)) {
        errors.push(`${lang}.json: not an array`);
      } else if (records.length === 0) {
        errors.push(`${lang}.json: empty array`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every record has required fields", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        if (!record.source_id) {
          // Without an id the remaining messages could not name the record.
          errors.push(`${lang}: record missing source_id`);
          continue;
        }
        if (!record.pos)
          errors.push(`${lang} ${record.source_id}: missing pos`);
        if (!record.translations)
          errors.push(`${lang} ${record.source_id}: missing translations`);
        if (!record.glosses)
          errors.push(`${lang} ${record.source_id}: missing glosses`);
        if (record.examples === undefined)
          errors.push(`${lang} ${record.source_id}: missing examples`);
        if (record.votes === undefined)
          errors.push(`${lang} ${record.source_id}: missing votes`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every pos is a valid supported value", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        if (!validPos.has(record.pos)) {
          errors.push(
            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
          );
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every example has text and a valid source", () => {
    const errors: string[] = [];
    const validSources = new Set(["omw", "cefr"]);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        for (const [l, examples] of Object.entries(record.examples)) {
          for (const example of examples) {
            if (!example.text) {
              errors.push(
                `${lang} ${record.source_id} (${l}): example missing text`,
              );
            }
            if (!validSources.has(example.source)) {
              errors.push(
                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
              );
            }
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every cefr_source vote is a valid CEFR level", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsFor(lang)) {
        for (const [l, langVotes] of Object.entries(record.votes)) {
          for (const [word, vote] of Object.entries(langVotes ?? {})) {
            if (
              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
            ) {
              errors.push(
                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
              );
            }
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("conflicts.json entries have required fields and valid CEFR levels", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const entry of conflicts) {
      if (!entry.word) errors.push(`conflict missing word`);
      if (!entry.pos) errors.push(`conflict missing pos`);
      if (!entry.language) {
        errors.push(`conflict missing language`);
      } else if (!validLangs.has(entry.language)) {
        errors.push(`conflict invalid language "${entry.language}"`);
      }
      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
        errors.push(`${entry.word}: levels must have at least 2 entries`);
      } else {
        for (const level of entry.levels) {
          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
            errors.push(`${entry.word}: invalid level "${level}"`);
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
 });
--- a/data-pipeline/vitest.config.ts
+++ b/data-pipeline/vitest.config.ts
@ -6,5 +6,6 @@ export default defineConfig({
    globals: true,
    include: ["tests/**/*.test.ts"],
    exclude: ["**/dist/**", "**/node_modules/**"],
    testTimeout: 60_000,
  },
 });
--- a/documentation/data-pipeline.md
+++ b/documentation/data-pipeline.md
@ -63,8 +63,9 @@ The database serves three purposes:
 - **Resolved output** — the final resolved records live here and are read by
  the sync script to seed the production database.
-The schema is defined in `data-pipeline/db/schema.sql`. Never edit `pipeline.db`
+The schema is defined in `data-pipeline/db/schema.sql`. Never edit `pipeline.db` directly — all writes go through the pipeline scripts.
-directly — all writes go through the pipeline scripts.
+
 On first run the orchestrator initialises `pipeline.db` automatically and imports the stage 2 output into the base tables. This happens once — subsequent runs skip the import if the base tables are already populated.
 ## Data sources
@ -230,15 +231,11 @@ Words not present in the CEFR source file will have an empty `votes` object.
 > `http://127.0.0.1:8080/health` and exits with instructions if it is not
 > reachable. See `llm-setup.md` for setup instructions.
-The enrich stage runs in two rounds, both designed to execute overnight one
+The enrich stage runs in two rounds, both designed to execute overnight one model at a time. All output is written to `pipeline.db` atomically per record — runs are fully resumable if interrupted. Each model is run once — one model produces one vote.
 model at a time. All output is written to `pipeline.db` atomically per record
 — runs are fully resumable if interrupted. Each model is run once — one model
 produces one vote.
 **Round 1 — generation**
-Each model processes every word in every language one term at a time and
+Each model processes every word in every language one term at a time and generates:
 generates:
 - A CEFR level vote for each translation
 - A description for each language
@ -246,20 +243,11 @@ generates:
 - A gloss for each language, only if OMW provides none
 - Usage examples for each language, only if OMW provides none
-OMW data is never duplicated — the script checks what OMW already provides
+OMW data is never duplicated — the script checks what OMW already provides before building the prompt. For translations, glosses and examples, if OMW data exists for that language the LLM skips generation entirely. This significantly reduces compute time for languages with good OMW coverage such as English.
 before building the prompt. For translations, glosses and examples, if OMW
 data exists for that language the LLM skips generation entirely. This
 significantly reduces compute time for languages with good OMW coverage such
 as English.
-All model-generated content is stored with an anonymised source (`model_1`,
+All model-generated content is stored with an anonymised source (`model_1`, `model_2` etc.) so models cannot be biased by knowing who generated what in round 2.
 `model_2` etc.) so models cannot be biased by knowing who generated what in
 round 2.
-Each record is written to `pipeline.db` with status `complete` or
+Each record is written to `pipeline.db` with status `complete` or `needs_review` immediately after processing. If a record fails structural validation (invalid JSON, missing required fields, invalid CEFR value) it is marked `needs_review` and skipped — the run continues without interruption.
 `needs_review` immediately after processing. If a record fails structural
 validation (invalid JSON, missing required fields, invalid CEFR value) it is
 marked `needs_review` and skipped — the run continues without interruption.
 **Input:** `stage-2-annotate/output/{lang}.json`
 **Output:** `pipeline.db` — round 1 results per record per model
@ -270,9 +258,7 @@ pnpm --filter @lila/pipeline enrich --round 1 --model {model}
 **Compiling candidates**
-Once all round 1 runs are complete, compile all generated candidates into a
+Once all round 1 runs are complete, compile all generated candidates into a single structured record per term in `pipeline.db`. This is the input to round 2.
 single structured record per term in `pipeline.db`. This is the input to
 round 2.
 ```bash
 pnpm --filter @lila/pipeline enrich --compile-candidates
@ -287,10 +273,7 @@ Each model receives the compiled candidate list for every word and votes on:
 - The best usage examples candidate (if multiple exist)
 - A CEFR level vote for each translation
-OMW data is not put to a vote — it automatically wins over any LLM-generated
+OMW data is not put to a vote — it automatically wins over any LLM-generated candidate. Round 2 only resolves conflicts between model-generated candidates. The prompt is kept small — one word at a time, a clean numbered candidate list — to fit within a limited context window.
 candidate. Round 2 only resolves conflicts between model-generated candidates.
 The prompt is kept small — one word at a time, a clean numbered candidate
 list — to fit within a limited context window.
 **Input:** `pipeline.db` — compiled candidates
 **Output:** `pipeline.db` — round 2 votes per record per model
@ -301,8 +284,7 @@ pnpm --filter @lila/pipeline enrich --round 2 --model {model}
 **Compiling votes**
-Once all round 2 runs are complete, compile all votes into a final votes
+Once all round 2 runs are complete, compile all votes into a final votes record per term in `pipeline.db`. This is the input to the merge stage.
 record per term in `pipeline.db`. This is the input to the merge stage.
 ```bash
 pnpm --filter @lila/pipeline enrich --compile-votes
@ -310,9 +292,7 @@ pnpm --filter @lila/pipeline enrich --compile-votes
 ### 4. Merge
-Reads compiled votes from `pipeline.db` and resolves the final value for
+Reads compiled votes from `pipeline.db` and resolves the final value for every field. Updates each record in `pipeline.db` with status `final` or `flagged`.
-every field. Updates each record in `pipeline.db` with status `final` or
-`flagged`.
 **Merge rules:**
@ -340,18 +320,9 @@ pnpm --filter @lila/pipeline merge
 ### 4b. Tiebreak
-Runs automatically after merge if any translations remain flagged. The script
+Runs automatically after merge if any translations remain flagged. The script queries `pipeline.db` for flagged translations, identifies which configured models have not yet voted on each word, and runs those models on the flagged subset only. Merge is re-run after each tiebreaker pass. This repeats until all flagged translations are resolved or no unused models remain.
-queries `pipeline.db` for flagged translations, identifies which configured
-models have not yet voted on each word, and runs those models on the flagged
-subset only. Merge is re-run after each tiebreaker pass. This repeats until
-all flagged translations are resolved or no unused models remain.
-If unused models are exhausted and flagged translations remain, the script
+If unused models are exhausted and flagged translations remain, the script logs a detailed report showing the exact vote split for each unresolved word and lists available models from OpenRouter that have not been used. Seeding is blocked until all translations are resolved. To continue, add one or more models to the config and re-run the pipeline — the tiebreaker will pick up automatically.
-logs a detailed report showing the exact vote split for each unresolved word
-and lists available models from OpenRouter that have not been used. Seeding
-is blocked until all translations are resolved. To continue, add one or more
-models to the config and re-run the pipeline — the tiebreaker will pick up
-automatically.
 **Input:** `pipeline.db` — flagged translations from merge
 **Output:** `pipeline.db` — flagged translations resolved to `final`
@ -361,9 +332,7 @@ automatically.
 ### 5. Compare / QA
-Read-only. Generates `COVERAGE.md` with a full breakdown of the pipeline
+Read-only. Generates `COVERAGE.md` with a full breakdown of the pipeline output quality per language. Run this after merge to verify output before syncing to the database.
-output quality per language. Run this after merge to verify output before
-syncing to the database.
 **Input:** `pipeline.db` — records with status `final`
 **Output:** `COVERAGE.md`
@ -393,10 +362,7 @@ pnpm --filter @lila/pipeline compare
 ## Sync
-The sync script transfers all records with status `final` in `pipeline.db` to
+The sync script transfers all records with status `final` in `pipeline.db` to the production PostgreSQL database. It is upsert-based and never wipes existing data. For each record it checks whether a matching `source_id` already exists in the target database:
-the production PostgreSQL database. It is upsert-based and never wipes
-existing data. For each record it checks whether a matching `source_id`
-already exists in the target database:
 - **Missing** → insert
 - **Present but changed** → update
@ -408,14 +374,11 @@ Run this after all records are resolved and Compare / QA has been reviewed.
 pnpm --filter @lila/pipeline sync
 ```
-The sync script requires a connection string to the target database. Set
+The sync script requires a connection string to the target database. Set `DATABASE_URL` in your `.env` file before running.
-`DATABASE_URL` in your `.env` file before running.
 ## Reports
-The pipeline generates a report at the end of every run. Reports are written
+The pipeline generates a report at the end of every run. Reports are written to `data-pipeline/reports/` as a JSON file and a markdown file with the same name. The markdown is generated from the JSON and contains identical data.
-to `data-pipeline/reports/` as a JSON file and a markdown file with the same
-name. The markdown is generated from the JSON and contains identical data.
 ```
 data-pipeline/reports/
@ -497,10 +460,7 @@ dataset matures:
 ## Roadmap
-**Current state:** Stages 1 and 2 are complete and output has been reviewed
+**Current state:** Stages 1 and 2 are complete, validated, and imported into `pipeline.db`. Schema, init, import scripts, validation tests, and fixtures are all in place. Stage 3 scripts have not been written yet and llama.cpp is not installed.
-for all five languages. Architecture for stages 3–6, the tiebreaker, and the
-report system are finalised. Stage 3 scripts have not been written yet and
-llama.cpp is not installed.
 **Next action:** Write the stage 3 round 1 script.
@ -523,6 +483,11 @@ llama.cpp is not installed.
 - [x] Write annotation script
 - [x] Run annotation → per-language JSON + `conflicts.json`
+- [x] Add annotate script to package.json
+- [x] Fix duplicate translations in extract.py
+- [x] Write stage 1 and 2 validation tests
+- [x] Write db schema, init, and import scripts
+- [x] Write test fixtures
 ### Stage 3 — Enrich `🔲 not started`
Author	SHA1	Message	Date
lila	f59399be02	feat: add db import script, fix duplicate translations in extract, add annotate script	2026-05-03 22:05:10 +02:00
lila	4a842140b9	feat: add stage 1 and 2 validation tests	2026-05-03 21:36:56 +02:00