feat: add Kaikki extraction and import scripts for stage 1

- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, and skips abbreviations and senses with no translations in supported languages
- Rewrite db/import.ts for the Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for the Kaikki model — entries, translations, LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode is hardcoded to 500 entries for development
2026-05-05 18:11:53 +02:00 · 2026-05-05 18:11:53 +02:00 · 209d52f54b
commit 209d52f54b
parent 963bff4eb8
17 changed files with 346 additions and 1055737 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,6 +12,7 @@ __pycache__/
 data-pipeline/archive/
 data-pipeline/stage-1-extract/output/
 data-pipeline/stage-1-extract/sources/
 data-pipeline/stage-2-annotate/output/
 data-pipeline/stage-3-enrich/output/
 data-pipeline/stage-4-merge/output/
--- a/data-pipeline/audit.md
+++ b/data-pipeline/audit.md
@ -1,362 +0,0 @@
 # OMW German Translation Quality Audit
 Instructions: for each entry, check if the German translations
 match the meaning described by the English gloss.
 Mark QUALITY as:
 OK — all German translations fit the meaning
 PARTIAL — some fit, some don't
 BAD — none of the German translations fit
 USELESS — translations are correct but useless for learners
 ---
 1.  [noun] ili:i98680
    EN gloss: the flowering part of a plant or arrangement of flowers on a stalk
    DE gloss: der blühende Teil einer Pflanze oder die Anordnung von Blüten an einem Stiel
    EN words: inflorescence
    DE words: Blütenstand, Infloreszenz
    QUALITY: correct
 2.  [verb] ili:i24675
    EN gloss: make motionless
    DE gloss: unbeweglich machen
    EN words: still
    DE words: stillen, zum Stillstand bringen
    QUALITY: "stillen" means breastfeeding, so it is completely wrong; "zum Stillstand bringen" is correct, but the gloss "unbeweglich machen" sounds weird — no one says this
 3.  [verb] ili:i22153
    EN gloss: lose interest or become bored with something or somebody
    DE gloss: das Interesse an etwas oder jemandem verlieren oder sich langweilen
    EN words: fatigue, jade, pall, tire, weary
    DE words: Langeweile erzeugen, anöden, ermüden, langweilen, sich langweilen, sich zu Tode langweilen, sich öden
    QUALITY: its ok
 4.  [noun] ili:i74742
    EN gloss: zealous preaching and advocacy of the gospel
    DE gloss: eifriges Predigen und Eintreten für das Evangelium
    EN words: evangelism
    DE words: Evangelisation, Evangelisierung
    QUALITY: ok
 5.  [noun] ili:i115665
    EN gloss: an oxide of iron that is strongly attracted by magnets
    DE gloss: ein Eisenoxid, das stark von Magneten angezogen wird
    EN words: magnetic iron-ore, magnetite
    DE words: Eisenoxiduloxid, Magneteisen, Magneteisenstein, Magnetit
    QUALITY: ok
 6.  [adjective] ili:i17569
    EN gloss: of or relating to fatalism
    DE gloss: von oder im Zusammenhang mit Fatalismus
    EN words: fatalist, fatalistic
    DE words: auf alles gefasst, dem Schicksal ergeben, fatalistisch, gottergeben, schicksalsergeben
    QUALITY: ok
 7.  [adjective] ili:i682
    EN gloss: having no previous example or precedent or parallel
    DE gloss: ohne vorheriges Beispiel oder Präzedenzfall oder Parallele
    EN words: new, unexampled
    DE words: beispiellos, gab es noch nie, ohne Beispiel, ohne Präzedenzfall, ohnegleichen, präzedenzlos, sondergleichen, unvergleichbar
    QUALITY: ok
 8.  [noun] ili:i114018
    EN gloss: a soft silvery metallic element of the rare earth group; isotope 170 emits X-rays and is used in small portable X-ray machines; it occurs in monazite and apatite and xenotime
    DE gloss: ein weiches, silbriges Metallelement der Gruppe der Seltenen Erden; Isotop 170 emittiert Röntgenstrahlen und wird in kleinen tragbaren Röntgengeräten verwendet; es kommt in Monazit und Apatit sowie in Xenotim vor
    EN words: Tm, atomic number 69, thulium
    DE words: Terameter, Tm
    QUALITY: ok
 9.  [noun] ili:i117564
    EN gloss: the rate of some repeating event
    DE gloss: die Geschwindigkeit eines sich wiederholenden Ereignisses
    EN words: pace, tempo
    DE words: Takt, Tempo
    QUALITY: ok
 10. [verb] ili:i31619
    EN gloss: let drop or droop
    DE gloss: fallen oder hängen lassen
    EN words: hang
    DE words: am Galgen sterben lassen, aufhängen, aufknüpfen, erhängen, henken, hängen
    QUALITY: wrong, "let drop" means "fallen lassen", like dropping something? I'm not sure here — does it really mean to hang someone? If so, then it's ok
 11. [noun] ili:i75571
    EN gloss: a heavy dull sound (as made by impact of heavy objects)
    DE gloss: ein schweres, dumpfes Geräusch (wie beim Aufprall schwerer Gegenstände)
    EN words: clump, clunk, thud, thump, thumping
    DE words: Geklacker, Geklapper, Klackern, Klappern
    QUALITY: ok
 12. [noun] ili:i92290
    EN gloss: a person who makes a promise
    DE gloss: eine Person, die ein Versprechen gibt
    EN words: promiser, promisor
    DE words: Freud'scher Versprecher, Lapsus Linguae, Versprecher, freudscher Versprecher
    QUALITY: completely wrong; a "Versprecher" is when you intend to say something but say something else — it has nothing to do with "Versprechen"
 13. [noun] ili:i59450
    EN gloss: a vertical well around which there is a stairway
    DE gloss: ein vertikaler Schacht, um den herum eine Treppe verläuft
    EN words: stairwell
    DE words: Ern, Flur, Hausflur, Stiegenhaus, Treppenhaus
    QUALITY: "Treppenhaus" would be the only correct one, right?
 14. [verb] ili:i21908
    EN gloss: smile affectedly or derisively
    DE gloss: affektiert oder spöttisch lächeln
    EN words: simper, smirk
    DE words: in sich hinein lächeln, schmunzeln, vor sich hin lächeln
    QUALITY: would the glosses also be the words here? "schmunzeln" and "lächeln" are kind of the same, but the "affektiert" and "spöttisch" aspect is missing?
 15. [adjective] ili:i10887
    EN gloss: tending to reserve or introspection
    DE gloss: zur Zurückhaltung oder Introspektion neigend
    EN words: indrawn, withdrawn
    DE words: allein, einsam, eremitenhaft, eremitisch, für sich, solo, wie ein Einsiedler, wie ein Eremit, zurückgezogen
    QUALITY: ok
 16. [noun] ili:i113657
    EN gloss: a substance from which another substance is formed (especially by a metabolic reaction)
    DE gloss: ein Stoff, aus dem ein anderer Stoff gebildet wird (insbesondere durch eine Stoffwechselreaktion)
    EN words: precursor
    DE words: Ausgangsstoff, Edukt, Grundstoff, Präkursor, Vorläufer, biologische Vorstufe
    QUALITY: ok
 17. [adjective] ili:i13251
    EN gloss: tastelessly showy
    DE gloss: geschmacklos und auffällig
    EN words: brassy, cheap, flash, flashy, garish, gaudy, gimcrack, loud, meretricious, tacky, tatty, tawdry, trashy
    DE words: aufdringlich, marktschreierisch, reißerisch
    QUALITY: ok
 18. [noun] ili:i68734
    EN gloss: the branch of chemistry that studies the relation between chemical action and the amount of heat absorbed or generated
    DE gloss: der Zweig der Chemie, der die Beziehung zwischen chemischer Wirkung und der absorbierten oder erzeugten Wärmemenge untersucht
    EN words: thermochemistry
    DE words: Thermochemie, chemische Thermodynamik
    QUALITY: ok
 19. [adjective] ili:i12980
    EN gloss: distinguished from others in excellence
    DE gloss: durch hohe Qualität von anderen unterschieden
    EN words: outstanding
    DE words: I a, ausgezeichnet, außergewöhnlich, außerordentlich, besonders, bestens, eins a, exzeptionell, herausragend, schnafte, splendid, trefflich, vortrefflich, vorzüglich
    QUALITY: ok, aber "eins a"/"1a" ist wirklich sehr starke Umgangssprache, und ich habe noch nie "schnafte" oder "splendid" gehört; der Rest passt
 20. [verb] ili:i30043
    EN gloss: tear down so as to make flat with the ground
    DE gloss: abreißen, um den Boden zu ebnen
    EN words: dismantle, level, pull down, rase, raze, take down, tear down
    DE words: abreißen, aus den Augen verlieren, keinen Kontakt mehr haben zu, nicht länger in Kontakt stehen
    QUALITY: nur abreißen stimmt, der rest passt in diesem zusammenhang gar nicht!
 21. [adjective] ili:i14014
    EN gloss: desired or wished for or sought
    DE gloss: gewünscht oder gewünscht oder gesucht
    EN words: wanted
    DE words: benötigt, gesucht, gewünscht
    QUALITY: ok
 22. [verb] ili:i29481
    EN gloss: mar or spoil the appearance of
    DE gloss: das Aussehen verunstalten
    EN words: blemish, deface, disfigure
    DE words: deformieren, entstellen, verhunzen, verschandeln, verunstalten, verunzieren
    QUALITY: ok
 23. [verb] ili:i28605
    EN gloss: spread thickly
    DE gloss: dick auftragen
    EN words: slather
    DE words: beharken, bestreichen, mit Feuer belegen, mit Sperrfeuer belegen
    QUALITY: kein Wort ist wirklich ein Synonym für "dick auftragen" (I don't even know if the English word fits here?)
 24. [noun] ili:i92029
    EN gloss: someone who is licensed to operate an aircraft in flight
    DE gloss: jemand, der eine Lizenz zum Führen eines Luftfahrzeugs im Flug hat
    EN words: airplane pilot, pilot
    DE words: Führer, Lotse, Pilot
    QUALITY: nur Pilot stimmt hier
 25. [adjective] ili:i8221
    EN gloss: capable of being measured
    DE gloss: in der Lage, gemessen zu werden
    EN words: measurable, mensurable
    DE words: bestimmbar, der Messung zugänglich, erhebbar, mensurabel, messbar
    QUALITY: ok
 26. [noun] ili:i61380
    EN gloss: the spirit of a group that makes the members want the group to succeed
    DE gloss: der Geist einer Gruppe, der die Mitglieder dazu bringt, den Erfolg der Gruppe zu wollen
    EN words: esprit de corps, morale, team spirit
    DE words: Gruppengeist, Teamgeist
    QUALITY: Gruppengeist hört sich so komisch an, das sagt niemand, teamgeist ist in ordnung
 27. [adjective] ili:i10497
    EN gloss: free of restrictions or qualifications
    DE gloss: Zustand, in dem in einer Wohnung niemand wohnt.
    EN words: clean, clear
    DE words: frei, leer stehend, leerstehend, unbewohnt, ungenutzt, verwaist
    QUALITY: ok
 28. [adjective] ili:i6238
    EN gloss: moving and bending with ease
    DE gloss: anmutig schlank und mit Leichtigkeit biegsam und beweglich
    EN words: lissom, lissome, lithe, lithesome, slender, supple, svelte, sylphlike
    DE words: elastisch, geschmeidig, schlangenartig
    QUALITY: \_\_\_
 29. [noun] ili:i57906
    EN gloss: station for the production and transmission of AM or FM radio broadcasts
    DE gloss: Sender für die Produktion und Übertragung von AM- oder FM-Radiosendungen
    EN words: radio station
    DE words: Radiosender, Rundfunkstation, Sender
    QUALITY: \_\_\_
 30. [noun] ili:i112045
    EN gloss: the purple or black-and-blue area resulting from a bruise
    DE gloss: der violette oder schwarzblaue Bereich, der durch einen Bluterguss entsteht
    EN words: ecchymosis
    DE words: Ekchymose, kleinflächige Hautblutung
    QUALITY: \_\_\_
 31. [adjective] ili:i10839
    EN gloss: capable of being replaced
    DE gloss: kann ersetzt werden
    EN words: replaceable
    DE words: austauschbar, ersetzbar, fungibel
    QUALITY: \_\_\_
 32. [verb] ili:i28714
    EN gloss: whip
    DE gloss: peitschen
    EN words: flagellate, scourge
    DE words: auspeitschen, flagellieren, geißeln, peitschen
    QUALITY: \_\_\_
 33. [noun] ili:i52826
    EN gloss: a mechanical or electrical explosive device or a small amount of explosive; can be used to initiate the reaction of a disrupting explosive
    DE gloss: ein mechanischer oder elektrischer Sprengkörper oder eine kleine Menge Sprengstoff; kann verwendet werden, um die Reaktion eines Sprengstoffs auszulösen
    EN words: cap, detonating device, detonator
    DE words: Auslöser, Zünder, Zündvorrichtung
    QUALITY: \_\_\_
 34. [noun] ili:i115477
    EN gloss: ice crystals forming a white deposit (especially on objects outside)
    DE gloss: Eiskristalle, die einen weißen Belag bilden (insbesondere auf Gegenständen im Freien)
    EN words: frost, hoar, hoarfrost, rime
    DE words: Raufrost, Raureif, Reif
    QUALITY: \_\_\_
 35. [noun] ili:i66650
    EN gloss: the ability to see in reduced illumination (as in moonlight)
    DE gloss: die Fähigkeit, bei reduzierter Beleuchtung zu sehen (wie bei Mondlicht)
    EN words: night vision, night-sight, scotopic vision, twilight vision
    DE words: Nachtsehen, skotopisches Sehen
    QUALITY: \_\_\_
 36. [verb] ili:i26849
    EN gloss: express or utter with a hiss
    DE gloss: mit einem Zischen ausdrücken oder aussprechen
    EN words: hiss, sibilate, siss, sizz
    DE words: Stimme dämpfen, flüstern, hauchen, hinter vorgehaltener Hand, ins Ohr sagen, leise sprechen, mit tonloser Stimme, munkeln, raunen, säuseln, tonlos, tuscheln, wispern, zischeln, zuflüstern
    QUALITY: \_\_\_
 37. [noun] ili:i94222
    EN gloss: a teenager or a young adult male
    DE gloss: ein Jugendlicher oder ein junger Erwachsener
    EN words: young buck, young man
    DE words: Bruder, Bürschchen, Cowboy, Freundchen, Jungs, Kinders, Kollege, Kollegin, Leute, Mann Gottes, Meister, Sportsfreund, Verehrtester, der Herr, guter Mann, junger Mann, mein Gutster, mein Herr
    QUALITY: \_\_\_
 38. [noun] ili:i49310
    EN gloss: dusky grey food fish found from Louisiana and Florida southward
    DE gloss: dunkelgrauer Speisefisch, der von Louisiana und Florida südwärts vorkommt
    EN words: Anisotremus surinamensis, black margate, pompon
    DE words: Pompon, Puschel, Tanzwedel
    QUALITY: \_\_\_
 39. [noun] ili:i50315
    EN gloss: a small vehicle with four wheels in which a baby or child is pushed around
    DE gloss: ein kleines Fahrzeug mit vier Rädern, in dem ein Säugling oder ein Kind herumgeschoben wird
    EN words: baby buggy, baby carriage, carriage, go-cart, perambulator, pram, pushchair, pusher, stroller
    DE words: Kinderwagen, Säuglingskutsche
    QUALITY: \_\_\_
 40. [verb] ili:i31857
    EN gloss: meet at a point
    DE gloss: sich an einem Punkt treffen
    EN words: cross, intersect
    DE words: gegen den Wind segeln, kreuzen
    QUALITY: \_\_\_
 41. [noun] ili:i51632
    EN gloss: a sailboat with two parallel hulls held together by single deck
    DE gloss: ein Boot mit zwei parallelen Rümpfen, die durch ein einziges Deck zusammengehalten werden
    EN words: catamaran
    DE words: Doppelrumpfboot, Katamaran, Zweirumpfboot
    QUALITY: \_\_\_
 42. [verb] ili:i34734
    EN gloss: to be found to exist
    DE gloss: als existent befunden werden
    EN words: occur
    DE words: anzutreffen sein, auftreten, nicht ausbleiben, vorkommen, zu finden sein, zu sehen sein
    QUALITY: \_\_\_
 43. [verb] ili:i25187
    EN gloss: assign too high a value to
    DE gloss: einen zu hohen Wert zuweisen
    EN words: overestimate, overvalue
    DE words: zu hoch bewerten, zu viel Gewicht beimessen, zu viel Wichtigkeit beimessen, überbewerten, überschätzen
    QUALITY: \_\_\_
 44. [noun] ili:i73844
    EN gloss: an expressive style of music
    DE gloss: ein ausdrucksstarker Musikstil
    EN words: genre, music genre, musical genre, musical style
    DE words: Genre, Musikgenre, Musikrichtung, Musikstil, Stilrichtung
    QUALITY: \_\_\_
 45. [noun] ili:i113026
    EN gloss: an abnormal condition in which cerebrospinal fluid collects in the ventricles of the brain; in infants it can cause abnormally rapid growth of the head and bulging fontanelles and a small face; in adults the symptoms are primarily neurological
    DE gloss: ein anormaler Zustand, bei dem sich Liquor in den Hirnventrikeln sammelt; bei Säuglingen kann er zu einem anormal schnellen Wachstum des Kopfes, zu wulstigen Fontanellen und einem kleinen Gesicht führen; bei Erwachsenen sind die Symptome hauptsächlich neurologisch
    EN words: hydrocephalus, hydrocephaly
    DE words: Gehirnwassersucht, Hydrocephalus, Hydrozephalus, Wasserkopf
    QUALITY: \_\_\_
 46. [noun] ili:i62720
    EN gloss: habitual uncleanliness
    DE gloss: gewohnheitsmäßige Unreinheit
    EN words: slovenliness
    DE words: Flickarbeit, Flickenteppich, Flickwerk, Gestümper, Mist, Murks, Murkserei, Pfusch, Pfuscharbeit, Pfuscherei, Schlamperei, Schlendrian, Schluderei, Schund, schlechte Arbeit
    QUALITY: \_\_\_
 47. [noun] ili:i80976
    EN gloss: the government agency in the United Kingdom that is responsible for internal security and counterintelligence overseas
    DE gloss: Regierungsbehörde im Vereinigten Königreich, die für die innere Sicherheit und die Spionageabwehr im Ausland zuständig ist.
    EN words: MI, Military Intelligence Section 6, Secret Intelligence Service
    DE words: MI6, SIS, Secret Intelligence Service, Secret Service, britischer Auslandsgeheimdienst
    QUALITY: \_\_\_
 48. [noun] ili:i60476
    EN gloss: an electrical device by which alternating current of one voltage is changed to another voltage
    DE gloss: ein elektrisches Gerät, mit dem Wechselstrom einer bestimmten Spannung in eine andere Spannung umgewandelt wird
    EN words: transformer
    DE words: Spannungswandler, Trafo, Transformator, Transformer
    QUALITY: \_\_\_
 49. [noun] ili:i37037
    EN gloss: wandering from the main path of a journey
    DE gloss: das Abweichen vom Hauptweg einer Reise
    EN words: digression, excursion
    DE words: Abschweifung, Abstecher, Einschub, Exkurs, Umschweif
    QUALITY: \_\_\_
 50. [noun] ili:i77288
    EN gloss: any meat that is minced and spiced and cooked as patties or used to fill sausages
    DE gloss: jegliches Fleisch, das zerkleinert und gewürzt und als Pasteten gekocht oder zur Füllung von Würsten verwendet wird
    EN words: sausage meat
    DE words: Brät, Wurstbrät
    QUALITY: \_\_\_
--- a/data-pipeline/db/import.ts
+++ b/data-pipeline/db/import.ts
@ -1,185 +1,98 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
 import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 import { openDb } from "./index.js";
-
+import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
 // ── Types ─────────────────────────────────────────────────────────────────────
 /** One example sentence plus a tag for where it came from ("omw" or "cefr"). */
 type Example = { text: string; source: "omw" | "cefr" };
 /**
  * Shape that the stage-2 JSON files are cast to after `JSON.parse`.
  * The cast is unchecked, so nothing here is validated at runtime.
  */
 type AnnotatedRecord = {
  // Identifier used to match the same record across the per-language files.
  source_id: string;
  pos: SupportedPos;
  // Per-language lists; a language may be absent (hence `Partial`).
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // Per language: string key -> vote. The importer iterates the keys as words
  // and stores `cefr_source` as the CEFR level — TODO confirm against stage 2.
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const PATHS = {
-  annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
+  extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
 };
 // ── Loading ───────────────────────────────────────────────────────────────────
 /**
  * Loads the annotated per-language JSON files and folds them into one record
  * list keyed by `source_id`.
  *
  * `en.json` supplies the base records. Every other code in
  * `SUPPORTED_LANGUAGE_CODES` is read from `<lang>.json` in the same directory,
  * and its votes and CEFR-sourced examples are merged into the matching base
  * record. Records whose `source_id` is not present in `en.json` are ignored.
  *
  * @returns The base records (mutated in place by the merge), in the order
  *   their `source_id` was first seen in `en.json`.
  * @throws Rejects if any file cannot be read or does not contain valid JSON.
  *   The parsed JSON is cast to `AnnotatedRecord[]` without validation.
  */
 async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // Use en.json as the base — it has the most complete glosses and examples.
  // Merge votes and CEFR examples from the other language files.
  const baseRaw = await fs.readFile(
    path.join(PATHS.annotatedDir, "en.json"),
    "utf-8",
  );
  const base = JSON.parse(baseRaw) as AnnotatedRecord[];
  // Index by source_id; if en.json repeats an id, the later record wins.
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of base) {
    byId.set(record.source_id, record);
  }
  // Files are read one after another, in SUPPORTED_LANGUAGE_CODES order.
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    if (lang === "en") continue;
    const raw = await fs.readFile(
      path.join(PATHS.annotatedDir, `${lang}.json`),
      "utf-8",
    );
    const records = JSON.parse(raw) as AnnotatedRecord[];
    for (const record of records) {
      // NOTE: this `base` shadows the outer array; here it is the single
      // en.json record that shares this record's source_id.
      const base = byId.get(record.source_id);
      if (!base) continue;
      // Merge votes: entries from this file overwrite any existing entry
      // with the same key for that language.
      for (const [l, langVotes] of Object.entries(record.votes)) {
        if (!base.votes[l as SupportedLanguageCode]) {
          base.votes[l as SupportedLanguageCode] = {};
        }
        Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
      }
      // Append this file's CEFR-sourced examples to the base record.
      // NOTE(review): no de-duplication happens here — an example already
      // present in the base is added again.
      for (const [l, examples] of Object.entries(record.examples)) {
        // Shadows the outer loop's `lang`: this is the language of the
        // examples entry, not of the file being read.
        const lang = l as SupportedLanguageCode;
        const cefrExamples = examples.filter((e) => e.source === "cefr");
        if (cefrExamples.length === 0) continue;
        if (!base.examples[lang]) {
          base.examples[lang] = cefrExamples;
        } else {
          base.examples[lang].push(...cefrExamples);
        }
      }
    }
  }
  return [...byId.values()];
 }
 // ── Import ────────────────────────────────────────────────────────────────────
-export async function importStage2(): Promise<void> {
+export async function importKaikki(): Promise<void> {
-  console.log("Loading stage 2 annotated files...");
+  console.log("Loading extracted Kaikki data...");
-  const records = await loadAnnotated();
+  const raw = await fs.readFile(PATHS.extracted, "utf-8");
-  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);
+  const senses = JSON.parse(raw) as ExtractedSense[];
  console.log(`  Loaded ${senses.length.toLocaleString()} senses`);
  const db = openDb();
-  const insertSynset = db.prepare(
+  const insertEntry = db.prepare(`
-    `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
+    INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
-  );
+    VALUES (?, ?, ?, ?, ?, ?)
-
+    ON CONFLICT (headword, language, pos, sense_index)
-  const insertTranslation = db.prepare(
+    DO UPDATE SET
-    `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
+      gloss    = excluded.gloss,
-  );
+      examples = excluded.examples
-
+    RETURNING id
  const insertGloss = db.prepare(
    `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
  );
  const insertExample = db.prepare(
    `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
  );
  const insertCefrVote = db.prepare(`
    INSERT INTO cefr_source_votes (translation_id, cefr_level)
    VALUES (
      (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
      ?
    )
  `);
  const insertTranslation = db.prepare(`
    INSERT INTO translations (entry_id, target_lang, word, sense_hint)
    VALUES (?, ?, ?, ?)
    ON CONFLICT (entry_id, target_lang, word) DO NOTHING
  `);
  // Track next available sense_index per (headword, pos) to handle
  // the same word appearing in multiple JSONL entries with the same POS.
  const senseIndexMap = new Map<string, number>();
  console.log("\nImporting into pipeline.db...");
  const importAll = db.transaction(() => {
-    let synsets = 0;
+    let entries = 0;
    let translations = 0;
-    let glosses = 0;
+    let skipped = 0;
    let examples = 0;
    let cefrVotes = 0;
-    for (const record of records) {
+    for (const sense of senses) {
-      insertSynset.run(record.source_id, record.pos);
+      const key = `${sense.headword}|${sense.pos}`;
-      synsets++;
+      const nextIndex = senseIndexMap.get(key) ?? 0;
-      // Translations
+      // Use the offset sense_index to avoid collisions when the same word
-      for (const [lang, words] of Object.entries(record.translations)) {
+      // appears in multiple JSONL entries with the same POS.
-        const unique = [...new Set(words)];
+      const senseIndex = nextIndex;
-        for (const word of unique) {
+      senseIndexMap.set(key, nextIndex + 1);
-          insertTranslation.run(record.source_id, lang, word);
+
-          translations++;
+      const row = insertEntry.get(
-        }
+        sense.headword,
        "en",
        sense.pos,
        senseIndex,
        sense.gloss ?? null,
        JSON.stringify(sense.examples),
      ) as { id: number } | undefined;
      if (!row) {
        skipped++;
        continue;
      }
-      // Glosses
+      entries++;
      for (const [lang, glossList] of Object.entries(record.glosses)) {
        for (const text of glossList) {
          insertGloss.run(record.source_id, lang, text);
          glosses++;
        }
      }
-      // Examples
+      for (const t of sense.translations) {
-      for (const [lang, exList] of Object.entries(record.examples)) {
+        insertTranslation.run(
-        for (const example of exList) {
+          row.id,
-          insertExample.run(
+          t.target_lang,
-            record.source_id,
+          t.word,
-            lang,
+          t.sense_hint ?? null,
-            example.text,
+        );
-            example.source,
+        translations++;
          );
          examples++;
        }
      }
      // CEFR source votes
      for (const [lang, langVotes] of Object.entries(record.votes)) {
        for (const [word, vote] of Object.entries(
          langVotes as Record<string, { cefr_source: string }>,
        )) {
          insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
          cefrVotes++;
        }
      }
    }
-    return { synsets, translations, glosses, examples, cefrVotes };
+    return { entries, translations, skipped };
  });
  const counts = importAll();
-  console.log(`  synsets:      ${counts.synsets.toLocaleString()}`);
+  console.log(`  entries:      ${counts.entries.toLocaleString()}`);
  console.log(`  translations: ${counts.translations.toLocaleString()}`);
-  console.log(`  glosses:      ${counts.glosses.toLocaleString()}`);
+  console.log(`  skipped:      ${counts.skipped.toLocaleString()}`);
  console.log(`  examples:     ${counts.examples.toLocaleString()}`);
  console.log(`  cefr votes:   ${counts.cefrVotes.toLocaleString()}`);
  db.close();
  console.log("\nImport complete.");
@ -189,7 +102,7 @@ export async function importStage2(): Promise<void> {
 export function isImported(): boolean {
  const db = openDb();
-  const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+  const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
    count: number;
  };
  db.close();
@ -200,20 +113,20 @@ export function isImported(): boolean {
 async function main(): Promise<void> {
  const db = openDb();
-  const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+  const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
    count: number;
  };
  db.close();
  if (row.count > 0) {
    console.log(
-      `pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`,
+      `pipeline.db already contains ${row.count.toLocaleString()} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }
-  await importStage2();
+  await importKaikki();
 }
 if (import.meta.url === `file://${process.argv[1]}`) {
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@ -1,62 +1,58 @@
 -- ── Base data ─────────────────────────────────────────────────────────────────
-- Imported from stage 2 JSON on first run. Never mutated after import.
+-- Imported from Kaikki on first run. Never mutated after import.
-CREATE TABLE IF NOT EXISTS synsets (
+CREATE TABLE IF NOT EXISTS entries (
-  source_id TEXT PRIMARY KEY,
+  id          INTEGER PRIMARY KEY,
-  pos       TEXT NOT NULL
+  headword    TEXT    NOT NULL,
  language    TEXT    NOT NULL,
  pos         TEXT    NOT NULL,
  sense_index INTEGER NOT NULL DEFAULT 0,
  gloss       TEXT,
  examples    TEXT    NOT NULL DEFAULT '[]', -- JSON array of strings
  source      TEXT    NOT NULL DEFAULT 'kaikki',
  UNIQUE (headword, language, pos, sense_index)
 );
 CREATE TABLE IF NOT EXISTS translations (
-  id        INTEGER PRIMARY KEY,
+  id          INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
-  language  TEXT    NOT NULL,
+  target_lang TEXT    NOT NULL,
-  word      TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
-  UNIQUE (source_id, language, word)
+  sense_hint  TEXT,
-);
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
-
+  UNIQUE (entry_id, target_lang, word)
 CREATE TABLE IF NOT EXISTS glosses (
  id        INTEGER PRIMARY KEY,
  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
  language  TEXT    NOT NULL,
  text      TEXT    NOT NULL
 );
 CREATE TABLE IF NOT EXISTS examples (
  id        INTEGER PRIMARY KEY,
  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
  language  TEXT    NOT NULL,
  text      TEXT    NOT NULL,
  source    TEXT    NOT NULL
 );
 CREATE TABLE IF NOT EXISTS cefr_source_votes (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  cefr_level     TEXT    NOT NULL,
  UNIQUE (translation_id)
 );
 -- ── Status tracking ───────────────────────────────────────────────────────────
-- One row per synset per model per stage. Drives resumability.
+-- One row per entry per model per stage. Drives resumability.
 -- Sentinel rows use entry_id = 0 for one-time pipeline steps.
 -- stage:  round1 | round2 | tiebreak
 -- status: pending | complete | needs_review | flagged
 CREATE TABLE IF NOT EXISTS run_status (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL,
+  entry_id   INTEGER NOT NULL,
  model_name TEXT    NOT NULL,
  stage      TEXT    NOT NULL,
  status     TEXT    NOT NULL,
  created_at TEXT    NOT NULL DEFAULT (datetime('now')),
  updated_at TEXT    NOT NULL DEFAULT (datetime('now')),
-  UNIQUE (source_id, model_name, stage)
+  UNIQUE (entry_id, model_name, stage)
 );
 -- ── Round 1 output ────────────────────────────────────────────────────────────
-- One row per translation/language per model. Written atomically per record.
+-- Written atomically per entry per model.
 -- Unique constraints enforce one model one vote.
-CREATE TABLE IF NOT EXISTS model_cefr_votes (
+CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
  id         INTEGER PRIMARY KEY,
  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
  cefr_level TEXT    NOT NULL,
  UNIQUE (entry_id, model_name)
 );
 CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  model_name     TEXT    NOT NULL,
@ -64,38 +60,29 @@ CREATE TABLE IF NOT EXISTS model_cefr_votes (
  UNIQUE (translation_id, model_name)
 );
 CREATE TABLE IF NOT EXISTS model_translation_rejections (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  model_name     TEXT    NOT NULL,
  UNIQUE (translation_id, model_name)
 );
 CREATE TABLE IF NOT EXISTS generated_glosses (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name)
 );
 CREATE TABLE IF NOT EXISTS generated_examples (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name)
 );
-CREATE TABLE IF NOT EXISTS generated_descriptions (
+CREATE TABLE IF NOT EXISTS generated_translations (
-  id         INTEGER PRIMARY KEY,
+  id          INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
-  model_name TEXT    NOT NULL,
+  model_name  TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
+  target_lang TEXT    NOT NULL,
-  text       TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name, target_lang)
 );
 -- ── Round 2 output ────────────────────────────────────────────────────────────
@ -116,20 +103,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
  UNIQUE (example_id, model_name)
 );
-CREATE TABLE IF NOT EXISTS description_candidate_votes (
+CREATE TABLE IF NOT EXISTS translation_candidate_votes (
  id             INTEGER PRIMARY KEY,
-  description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
+  translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
  model_name     TEXT    NOT NULL,
-  UNIQUE (description_id, model_name)
+  UNIQUE (translation_id, model_name)
 );
 -- ── Resolved output ───────────────────────────────────────────────────────────
 -- Written by merge. Never updated after writing.
-- Only fully resolved records are written here — no nulls, no flags.
+-- Only fully resolved records are written here — no nulls.
 -- Absence of a row means unresolved. Flagged status tracked in run_status.
-- source: omw | cefr | model_name
+-- source: kaikki | model_name
-CREATE TABLE IF NOT EXISTS resolved_translations (
+CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
  id         INTEGER PRIMARY KEY,
  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  cefr_level TEXT    NOT NULL,
  difficulty TEXT    NOT NULL,
  UNIQUE (entry_id)
 );
 CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  cefr_level     TEXT    NOT NULL,
@ -138,27 +133,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
 );
 CREATE TABLE IF NOT EXISTS resolved_glosses (
-  id        INTEGER PRIMARY KEY,
+  id         INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
-  language  TEXT    NOT NULL,
+  text       TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
+  source     TEXT    NOT NULL,
-  source    TEXT    NOT NULL,
+  UNIQUE (entry_id)
-  UNIQUE (source_id, language)
 );
 CREATE TABLE IF NOT EXISTS resolved_examples (
-  id        INTEGER PRIMARY KEY,
+  id         INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
-  language  TEXT    NOT NULL,
+  text       TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
+  source     TEXT    NOT NULL
-  source    TEXT    NOT NULL
 );
-CREATE TABLE IF NOT EXISTS resolved_descriptions (
+CREATE TABLE IF NOT EXISTS resolved_generated_translations (
-  id        INTEGER PRIMARY KEY,
+  id          INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
-  language  TEXT    NOT NULL,
+  target_lang TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
-  source    TEXT    NOT NULL,
+  source      TEXT    NOT NULL,
-  UNIQUE (source_id, language)
+  UNIQUE (entry_id, target_lang)
 );
--- a/data-pipeline/stage-1-extract/scripts/extract.py
+++ b/data-pipeline/stage-1-extract/scripts/extract.py
@ -1,204 +0,0 @@
 """
 data-pipeline/stage-1-extract/scripts/extract.py
 Extract all synsets from the Open Multilingual Wordnet (OMW) for all
 supported languages and parts of speech.
 Output: a single combined JSON file, written to stage-1-extract/output/
  omw.json
 The file is a JSON array of synset records, each covering all languages:
  {
    "source_id": "ili:i12345",
    "pos": "noun",
    "translations": { "en": ["dog", "canine"], "it": ["cane"] },
    "glosses":      { "en": ["a domesticated animal..."] },
    "examples":     { "en": ["the dog barked at the stranger"] }
  }
 Usage:
  python stage-1-extract/scripts/extract.py
  python stage-1-extract/scripts/extract.py --sample
 Prerequisites:
  pip install wn
  python -m wn download omw-en:1.4
  python -m wn download omw-it:1.4
  python -m wn download omw-de:1.4
  python -m wn download omw-es:1.4
  python -m wn download omw-fr:1.4
 """
 import json
 import sys
 from pathlib import Path
 import wn
 # Language codes to extract; extract_all loads one wn.Wordnet per code.
 SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]
 # Maps the POS tags passed to Wordnet.synsets(pos=...) to the POS labels
 # written into each output record.
 POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
    "a": "adjective",
    "s": "adjective",  # adjective satellite — collapsed into adjective
    "r": "adverb",
 }
 def extract_all(
    output_dir: str = "stage-1-extract/output", sample: bool = False
 ) -> None:
    """Gather OMW synsets for every supported language and write omw.json.

    Synsets are grouped by ILI so that one output record carries the lemmas,
    glosses and examples of all languages for the same concept.

    Args:
        output_dir: Directory that receives omw.json; created when missing.
        sample: When true, only the first 100 ILIs (in sorted order) are kept.

    Exits the process with status 1 when a wordnet cannot be loaded.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    sample_size = 100 if sample else None

    # Every wordnet is loaded before any extraction starts, so a missing
    # download aborts the run early.
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for code in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[code] = wn.Wordnet(lang=code)
            synset_count = len(wordnets[code].synsets())
            print(f"  {code}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f"  ERROR loading {code}: {e}")
            print(f"  Run: python -m wn download omw-{code}:1.4")
            sys.exit(1)

    # Accumulate per-ILI data across all languages and parts of speech.
    print("\nExtracting synsets...")
    by_ili: dict[str, dict] = {}
    for code, wordnet in wordnets.items():
        for omw_pos, pos_label in POS_MAP.items():
            with_ili = 0
            for synset in wordnet.synsets(pos=omw_pos):
                ili = synset.ili
                if not ili:
                    continue
                with_ili += 1
                fresh = {
                    "lemmas": list(
                        dict.fromkeys(str(lemma) for lemma in synset.lemmas())
                    ),
                    "glosses": [d for d in synset.definitions() if d],
                    "examples": [e for e in synset.examples() if e],
                }
                # The POS label of an ILI is fixed by whichever synset is seen first.
                bucket = by_ili.setdefault(ili, {"pos": pos_label})
                if code not in bucket:
                    bucket[code] = fresh
                    continue
                # The ILI was already seen for this language (both 'a' and 's'
                # map to adjective): merge and drop duplicates, keeping order.
                existing = bucket[code]
                for field in ("lemmas", "glosses", "examples"):
                    existing[field] = list(
                        dict.fromkeys(existing[field] + fresh[field])
                    )
            print(f"  {code} {pos_label}: {with_ili:,} synsets with ILI")

    # Turn the accumulated data into output records.
    print("\nBuilding records...")
    ilis = sorted(by_ili.keys())
    if sample_size:
        ilis = ilis[:sample_size]
    # Output section name -> key used while accumulating.
    sections = (
        ("translations", "lemmas"),
        ("glosses", "glosses"),
        ("examples", "examples"),
    )
    records: list[dict] = []
    for ili in ilis:
        data = by_ili[ili]
        record: dict = {
            "source_id": f"ili:{ili}",
            "pos": data["pos"],
            "translations": {},
            "glosses": {},
            "examples": {},
        }
        for lang, payload in data.items():
            if lang == "pos":
                continue
            # Empty lists are left out of the record entirely.
            for section, source_key in sections:
                if payload[source_key]:
                    record[section][lang] = payload[source_key]
        records.append(record)

    output_file = target_dir / "omw.json"
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(records, handle, indent=2, ensure_ascii=False)
    print(f"\nWrote {len(records):,} synsets → {output_file}")
    _print_coverage(records)
 def _print_coverage(records: list[dict]) -> None:
    """Print the POS breakdown and per-language translation, gloss and example counts.

    Args:
        records: Synset records as built by extract_all; each one has "pos",
            "translations", "glosses" and "examples" keys.

    An empty list is accepted: the per-synset average is then reported as 0.0
    instead of raising ZeroDivisionError.
    """
    lang_stats: dict[str, dict[str, int]] = {
        lang: {"translations": 0, "glosses": 0, "examples": 0}
        for lang in SUPPORTED_LANGUAGE_CODES
    }
    pos_stats: dict[str, int] = {}
    for r in records:
        pos = r["pos"]
        pos_stats[pos] = pos_stats.get(pos, 0) + 1
        # Languages outside SUPPORTED_LANGUAGE_CODES are not counted.
        for field in ("translations", "glosses", "examples"):
            for lang, items in r[field].items():
                if lang in lang_stats:
                    lang_stats[lang][field] += len(items)
    print("\nPOS breakdown:")
    for pos, count in sorted(pos_stats.items()):
        print(f"  {pos}: {count:,}")
    print("\nCoverage per language:")
    total = len(records)
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        e = counts["examples"]
        # Guard the average: with no records there is nothing to divide by.
        average = t / total if total else 0.0
        print(
            f"  {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {average:.1f} translations/synset)"
        )
 if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the two options and run the extraction.
    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
    parser.add_argument(
        "--output-dir",
        default="stage-1-extract/output",
        help="Output directory for the combined omw.json file",
    )
    # The help text matches what extract_all does: it keeps the first 100
    # ILIs of the sorted, combined set, not 100 synsets per language.
    parser.add_argument(
        "--sample",
        action="store_true",
        help="Extract only the first 100 synsets (sorted by ILI) for inspection",
    )
    args = parser.parse_args()
    extract_all(output_dir=args.output_dir, sample=args.sample)
--- a/data-pipeline/stage-1-extract/scripts/extract.ts
+++ b/data-pipeline/stage-1-extract/scripts/extract.ts
@ -0,0 +1,209 @@
 import fs from "node:fs";
 import path from "node:path";
 import readline from "node:readline";
 import { fileURLToPath } from "node:url";
 import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 /**
  * One item of a Kaikki sense's `translations` array. Every field is optional:
  * the parsed JSONL line is cast to these types without validation.
  */
 type KaikkiTranslation = {
  // Language code; extractTranslations reads `code` and falls back to `lang_code`.
  code?: string;
  lang_code?: string;
  // The translated word; entries whose word is blank are dropped.
  word?: string;
  // Copied (trimmed) into `sense_hint` of the extracted translation.
  sense?: string;
 };
 /** One sense of a Kaikki entry, reduced to the fields this script reads. */
 type KaikkiSense = {
  // Only the first gloss is used by processEntry.
  glosses?: string[];
  examples?: { text?: string }[];
  translations?: KaikkiTranslation[];
 };
 /** One parsed line of the Kaikki JSONL dump, reduced to the fields this script reads. */
 type KaikkiEntry = { word?: string; pos?: string; senses?: KaikkiSense[] };
 /** One element of the JSON array written to the output file. */
 export type ExtractedSense = {
  // Trimmed `word` of the source entry.
  headword: string;
  pos: SupportedPos;
  // Zero-based position among the senses kept from one source entry;
  // skipped senses do not consume an index.
  sense_index: number;
  // First gloss of the sense, trimmed; null when the sense has no glosses.
  gloss: string | null;
  examples: string[];
  // Never empty: senses without a supported translation are skipped.
  translations: {
    target_lang: SupportedLanguageCode;
    word: string;
    // Trimmed `sense` text of the Kaikki translation; null when that field is absent.
    sense_hint: string | null;
  }[];
 };
 // ── Constants ─────────────────────────────────────────────────────────────────
 // Directory of this script, derived from import.meta.url.
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 // Input and output locations, resolved relative to this script's directory
 // rather than the working directory.
 const PATHS = {
  source: path.resolve(
    __dirname,
    "../sources/kaikki.org-dictionary-English.jsonl",
  ),
  output: path.resolve(__dirname, "../output/en.json"),
 };
 // Kaikki `pos` value -> supported POS. Entries whose `pos` is not listed
 // here are dropped by processEntry.
 const POS_MAP: Record<string, SupportedPos> = {
  noun: "noun",
  verb: "verb",
  adj: "adjective",
  adv: "adverb",
 };
 // Typed as Set<string> so raw language codes from the dump can be tested
 // for membership before being narrowed to SupportedLanguageCode.
 const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
 // ── Helpers ───────────────────────────────────────────────────────────────────
 /**
  * Map a Kaikki `pos` value onto a supported part of speech.
  *
  * Only POS_MAP's own keys are honoured. A bare bracket lookup also resolves
  * names inherited from Object.prototype (for example "constructor"), which
  * would come back non-null and be treated as a supported POS.
  *
  * @param kaikkiPos Raw `pos` string taken from a Kaikki entry.
  * @returns The mapped POS, or null when the value has no mapping.
  */
 function mapPos(kaikkiPos: string): SupportedPos | null {
  if (!Object.prototype.hasOwnProperty.call(POS_MAP, kaikkiPos)) return null;
  return POS_MAP[kaikkiPos] ?? null;
 }
 /** Reports whether a gloss begins with "abbreviation of", ignoring letter case. */
 function isAbbreviation(gloss: string): boolean {
  const marker = "abbreviation of";
  const lowered = gloss.toLowerCase();
  return lowered.slice(0, marker.length) === marker;
 }
 /**
  * Collect the translations of one sense that target a supported language
  * other than English.
  *
  * Words are trimmed; blank words are dropped. Duplicates (same language and
  * trimmed word) are removed, keeping the first occurrence and its hint.
  *
  * @param sense Kaikki sense whose `translations` are read.
  * @returns The kept translations, in source order; empty when there are none.
  */
 function extractTranslations(
  sense: KaikkiSense,
 ): ExtractedSense["translations"] {
  const seen = new Set<string>();
  const result: ExtractedSense["translations"] = [];
  for (const t of sense.translations ?? []) {
    // `||` rather than `??`: an empty `code` must not block the `lang_code` fallback.
    const code = t.code || t.lang_code;
    if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue;
    const word = t.word?.trim();
    if (!word) continue;
    const key = `${code}:${word}`;
    if (seen.has(key)) continue;
    seen.add(key);
    result.push({
      target_lang: code as SupportedLanguageCode,
      word,
      // A blank hint carries no information; `??` alone would keep it as "".
      sense_hint: t.sense?.trim() || null,
    });
  }
  return result;
 }
 /**
  * Return the trimmed text of every example of a sense, in source order.
  * Examples whose text is missing or blank are left out.
  */
 function extractExamples(sense: KaikkiSense): string[] {
  const texts: string[] = [];
  for (const example of sense.examples ?? []) {
    const trimmed = example.text?.trim();
    if (trimmed) texts.push(trimmed);
  }
  return texts;
 }
 /**
  * Turn one Kaikki entry into the senses worth keeping.
  *
  * An entry with an unsupported POS or a blank headword yields nothing. Within
  * an entry, senses whose first gloss is an abbreviation, and senses without
  * any supported translation, are skipped; the rest are numbered from 0 in
  * the order they are kept.
  */
 function processEntry(entry: KaikkiEntry): ExtractedSense[] {
  const pos = mapPos(entry.pos ?? "");
  if (!pos) return [];
  const headword = entry.word?.trim();
  if (!headword) return [];

  const kept: ExtractedSense[] = [];
  for (const sense of entry.senses ?? []) {
    const firstGloss = sense.glosses?.[0]?.trim() ?? null;
    const abbreviation = firstGloss ? isAbbreviation(firstGloss) : false;
    if (abbreviation) continue;

    const translations = extractTranslations(sense);
    if (translations.length === 0) continue;

    kept.push({
      headword,
      pos,
      // Index among kept senses only, so it equals the current result count.
      sense_index: kept.length,
      gloss: firstGloss,
      examples: extractExamples(sense),
      translations,
    });
  }
  return kept;
 }
 // ── Main ──────────────────────────────────────────────────────────────────────
 /**
  * Stream the Kaikki JSONL dump line by line, collect every extracted sense in
  * memory and write the result to PATHS.output as a pretty-printed JSON array.
  *
  * Blank lines are ignored. Lines that fail to parse are reported with their
  * position in the file and skipped. Entries that yield no senses are counted
  * as skipped and do not count towards `sampleLimit`.
  *
  * @param sampleLimit Stop once this many entries have produced senses.
  *   Omitted (or 0) means no limit.
  */
 async function extract(sampleLimit?: number): Promise<void> {
  console.log("Extracting Kaikki English data...");
  console.log(`  Source: ${PATHS.source}`);
  if (sampleLimit) {
    console.log(`  Sample mode: ${sampleLimit} entries`);
  }
  await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true });
  const fileStream = fs.createReadStream(PATHS.source);
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity,
  });
  const senses: ExtractedSense[] = [];
  // Position in the file, blank lines included — used for diagnostics only.
  let lineNumber = 0;
  // Non-blank lines handed to the parser — reported in the summary.
  let linesRead = 0;
  let entriesProcessed = 0;
  let entriesSkipped = 0;
  try {
    for await (const line of rl) {
      lineNumber++;
      if (!line.trim()) continue;
      if (sampleLimit && entriesProcessed >= sampleLimit) break;
      linesRead++;
      let entry: KaikkiEntry;
      try {
        entry = JSON.parse(line) as KaikkiEntry;
      } catch {
        console.warn(`  Warning: failed to parse line ${lineNumber}, skipping`);
        continue;
      }
      const extracted = processEntry(entry);
      if (extracted.length === 0) {
        entriesSkipped++;
        continue;
      }
      senses.push(...extracted);
      entriesProcessed++;
      if (entriesProcessed % 10_000 === 0) {
        console.log(
          `  Processed ${entriesProcessed.toLocaleString()} entries...`,
        );
      }
    }
  } finally {
    // Closing the readline interface does not release the underlying file
    // stream, so destroy it explicitly — this matters when the sample limit
    // ends the loop early or an error escapes it.
    rl.close();
    fileStream.destroy();
  }
  await fs.promises.writeFile(
    PATHS.output,
    JSON.stringify(senses, null, 2),
    "utf-8",
  );
  console.log(`\nExtraction complete:`);
  console.log(`  Lines read:         ${linesRead.toLocaleString()}`);
  console.log(`  Entries processed:  ${entriesProcessed.toLocaleString()}`);
  console.log(`  Entries skipped:    ${entriesSkipped.toLocaleString()}`);
  console.log(`  Senses extracted:   ${senses.length.toLocaleString()}`);
  console.log(`  Output:             ${PATHS.output}`);
 }
 /** Script entry point: runs the extraction in sample mode. */
 async function main(): Promise<void> {
  // Development cap on the number of entries; call extract() without an
  // argument to process the whole dump.
  const sampleLimit = 500;
  await extract(sampleLimit);
 }
 // Any failure is printed and turned into a non-zero exit status.
 main().catch((failure) => {
  console.error(failure);
  process.exit(1);
 });
--- a/data-pipeline/stage-2-annotate/scripts/annotate.ts
+++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts
@ -1,227 +0,0 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 // ── Types ────────────────────────────────────────────────────────────────────
 // Example sentences are tagged with where they came from: the OMW extract
 // or a CEFR source file.
 type OmwExample = { text: string; source: "omw" };
 type CefrExample = { text: string; source: "cefr" };
 type Example = OmwExample | CefrExample;
 /** One record of the stage 1 OMW extract (input of this script). */
 type OmwRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, string[]>>;
 };
 /** One record of the per-language output files written by annotate(). */
 type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // language -> translation word (as spelled in `translations`) -> CEFR level
  // found for it in the CEFR source.
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
 };
 /** One element of a CEFR source file, as read by loadCefrSource. */
 type CefrSourceEntry = {
  word: string;
  // Free-form POS label; normalised through POS_NORMALIZE.
  pos: string;
  // Only values contained in CEFR_LEVELS are used.
  cefr_level: string;
  example_sentence_native?: string;
 };
 /** A word+POS that the CEFR source lists with more than one level. */
 type ConflictEntry = {
  word: string;
  pos: string;
  language: SupportedLanguageCode;
  levels: string[];
 };
 // ── Constants ─────────────────────────────────────────────────────────────────
 // POS labels accepted from the CEFR source files (looked up after
 // lower-casing and trimming) -> supported POS. Entries with any other label
 // are ignored by loadCefrSource.
 const POS_NORMALIZE: Record<string, SupportedPos> = {
  noun: "noun",
  n: "noun",
  nom: "noun", // French
  verb: "verb",
  verbs: "verb",
  v: "verb",
  v1: "verb",
  adjective: "adjective",
  adjektiv: "adjective", // German
  adj: "adjective",
  adverb: "adverb",
  adverbs: "adverb",
  adv: "adverb",
 };
 // Levels accepted as-is from the source; the comparison is case-sensitive.
 const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
 // Relative paths: they are resolved against the process working directory.
 const PATHS = {
  omw: "stage-1-extract/output/omw.json",
  cefrDir: "stage-2-annotate/sources/cefr",
  outputDir: "stage-2-annotate/output",
 };
 // ── CEFR source loading ───────────────────────────────────────────────────────
 // Key format: "<lower-cased, trimmed word>|<normalised pos>".
 type CefrIndex = Map<string, { level: string; example?: string }>;
 /**
  * Load the CEFR source file of one language and build a lookup index.
  *
  * Entries with an unknown POS label or an unknown CEFR level are ignored.
  * A word+POS listed with more than one level is a conflict: it is reported
  * in `conflicts` and left out of `index`. When a word+POS appears several
  * times with the same level, the last occurrence wins.
  *
  * The word and POS of a conflict are carried alongside its key instead of
  * being recovered by splitting the key on "|", which breaks for any word
  * that itself contains "|".
  *
  * @param lang Language whose `<lang>.json` file is read from PATHS.cefrDir.
  * @returns The index plus the conflicts, in first-seen order.
  */
 async function loadCefrSource(
  lang: SupportedLanguageCode,
 ): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");
  const entries = JSON.parse(raw) as CefrSourceEntry[];
  // Normalise once, so both passes agree on which rows are usable and how
  // they are keyed.
  const usable: {
    key: string;
    word: string;
    pos: SupportedPos;
    entry: CefrSourceEntry;
  }[] = [];
  for (const entry of entries) {
    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
    if (!pos) continue;
    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
    const word = entry.word.toLowerCase().trim();
    usable.push({ key: `${word}|${pos}`, word, pos, entry });
  }
  // First pass — gather the set of levels seen for each word+POS.
  const seen = new Map<
    string,
    { word: string; pos: SupportedPos; levels: Set<string> }
  >();
  for (const { key, word, pos, entry } of usable) {
    let bucket = seen.get(key);
    if (!bucket) {
      bucket = { word, pos, levels: new Set() };
      seen.set(key, bucket);
    }
    bucket.levels.add(entry.cefr_level);
  }
  const conflicts: ConflictEntry[] = [];
  const conflictKeys = new Set<string>();
  for (const [key, { word, pos, levels }] of seen) {
    if (levels.size > 1) {
      conflicts.push({ word, pos, language: lang, levels: [...levels] });
      conflictKeys.add(key);
    }
  }
  // Second pass — build the index, skipping conflicting entries.
  const index: CefrIndex = new Map();
  for (const { key, entry } of usable) {
    if (conflictKeys.has(key)) continue;
    index.set(key, {
      level: entry.cefr_level,
      ...(entry.example_sentence_native
        ? { example: entry.example_sentence_native }
        : {}),
    });
  }
  return { index, conflicts };
 }
 // ── Annotation ────────────────────────────────────────────────────────────────
 /**
  * Annotate the OMW extract with CEFR levels.
  *
  * Reads the OMW records and one CEFR source per supported language, writes
  * all CEFR conflicts to `conflicts.json`, then writes one `<lang>.json` per
  * language. Each file holds every OMW record, with votes (and CEFR example
  * sentences, when the source has one) added for that language's
  * translations that were found in its CEFR index.
  */
 async function annotate(): Promise<void> {
  console.log("Reading OMW extract...");
  const omwText = await fs.readFile(PATHS.omw, "utf-8");
  const omwRecords = JSON.parse(omwText) as OmwRecord[];
  console.log(`  Loaded ${omwRecords.length.toLocaleString()} synsets`);

  console.log("\nLoading CEFR source files...");
  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
  const allConflicts: ConflictEntry[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const loaded = await loadCefrSource(lang);
    cefrIndexes.set(lang, loaded.index);
    for (const conflict of loaded.conflicts) allConflicts.push(conflict);
    console.log(
      `  ${lang}: ${loaded.index.size.toLocaleString()} entries, ${loaded.conflicts.length} conflicts`,
    );
  }

  await fs.mkdir(PATHS.outputDir, { recursive: true });
  const conflictsFile = path.join(PATHS.outputDir, "conflicts.json");
  await fs.writeFile(
    conflictsFile,
    JSON.stringify(allConflicts, null, 2),
    "utf-8",
  );
  console.log(
    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const index = cefrIndexes.get(lang)!;
    const records: AnnotatedRecord[] = [];
    let matched = 0;
    for (const record of omwRecords) {
      // OMW examples of every language are carried over, tagged with their origin.
      const examples: AnnotatedRecord["examples"] = {};
      for (const [code, texts] of Object.entries(record.examples)) {
        examples[code as SupportedLanguageCode] = texts.map((text) => ({
          text,
          source: "omw" as const,
        }));
      }

      // Only the current language's translations are looked up in the CEFR index.
      const votes: AnnotatedRecord["votes"] = {};
      for (const word of record.translations[lang] ?? []) {
        const hit = index.get(`${word.toLowerCase().trim()}|${record.pos}`);
        if (!hit) continue;
        matched++;

        let langVotes = votes[lang];
        if (!langVotes) {
          langVotes = {};
          votes[lang] = langVotes;
        }
        langVotes[word] = { cefr_source: hit.level };

        if (hit.example) {
          let langExamples = examples[lang];
          if (!langExamples) {
            langExamples = [];
            examples[lang] = langExamples;
          }
          langExamples.push({ text: hit.example, source: "cefr" as const });
        }
      }

      records.push({
        source_id: record.source_id,
        pos: record.pos,
        translations: record.translations,
        glosses: record.glosses,
        examples,
        votes,
      });
    }
    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
    console.log(
      `  ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
    );
  }
 }
 // ── Main ─────────────────────────────────────────────────────────────────────
 // Run the script; a rejected run is logged and ends the process with status 1.
 annotate().catch(function reportAndExit(failure) {
  console.error(failure);
  process.exit(1);
 });
--- a/data-pipeline/stage-2-annotate/sources/cefr/de.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/de.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/en.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/en.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/es.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/es.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/fr.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/fr.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/it.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/it.json
--- a/data-pipeline/tests/fixtures/annotated.fixture.json
+++ b/data-pipeline/tests/fixtures/annotated.fixture.json
@ -1,170 +0,0 @@
 [
  {
    "_fixture": "noun_with_cefr_vote",
    "source_id": "ili:i100955",
    "pos": "noun",
    "translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
    "glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
    "examples": {
      "en": [
        { "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
      ]
    },
    "votes": { "en": { "grain": { "cefr_source": "B1" } } }
  },
  {
    "_fixture": "verb_no_votes_no_translations",
    "source_id": "ili:i21779",
    "pos": "verb",
    "translations": { "en": ["respire"] },
    "glosses": {
      "en": [
        "undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
      ]
    },
    "examples": {},
    "votes": {}
  },
  {
    "_fixture": "verb_with_cefr_vote_all_languages",
    "source_id": "ili:i21778",
    "pos": "verb",
    "translations": {
      "en": ["breathe", "take a breath", "respire", "suspire"],
      "it": ["respirare"],
      "es": ["aspirar", "respirar"],
      "de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
      "fr": ["inspirer", "respirer"]
    },
    "glosses": {
      "en": ["draw air into, and expel out of, the lungs"],
      "de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
    },
    "examples": {
      "en": [
        {
          "text": "I can breathe better when the air is clean",
          "source": "omw"
        },
        { "text": "The patient is respiring", "source": "omw" }
      ]
    },
    "votes": { "en": { "breathe": { "cefr_source": "A1" } } }
  },
  {
    "_fixture": "adjective_all_languages_multiple_translations",
    "source_id": "ili:i10007",
    "pos": "adjective",
    "translations": {
      "en": ["possible"],
      "it": [
        "attuabile",
        "effettuabile",
        "eseguibile",
        "fattibile",
        "operabile",
        "possibile",
        "producibile",
        "realizzabile"
      ],
      "es": ["posible"],
      "de": [
        "möglich",
        "denkbar",
        "eventuell",
        "möglicherweise",
        "allfällig",
        "etwaig",
        "gegebenenfalls",
        "eventuell"
      ],
      "fr": ["possible", "éventuel"]
    },
    "glosses": {
      "en": ["capable of happening or existing"],
      "de": ["in der Lage, zu geschehen oder zu existieren"]
    },
    "examples": {
      "en": [
        { "text": "a breakthrough may be possible next year", "source": "omw" },
        { "text": "anything is possible", "source": "omw" },
        { "text": "warned of possible consequences", "source": "omw" }
      ]
    },
    "votes": { "en": { "possible": { "cefr_source": "A2" } } }
  },
  {
    "_fixture": "adjective_multiple_de_votes_cefr_examples",
    "source_id": "ili:i10000",
    "pos": "adjective",
    "translations": {
      "en": ["negative"],
      "de": [
        "dürftig",
        "zu wünschen übrig lassen",
        "schlecht",
        "widrig",
        "ungut",
        "lausig",
        "negativ",
        "von Nachteil",
        "schädlich",
        "nachteilig",
        "ungünstig"
      ],
      "fr": ["négatif", "strictement négatif"]
    },
    "glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
    "examples": {
      "en": [{ "text": "a negative number", "source": "omw" }],
      "de": [
        { "text": "Die Beweise waren dürftig.", "source": "cefr" },
        { "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
        {
          "text": "Trotz widriger Umstände haben sie es geschafft.",
          "source": "cefr"
        },
        {
          "text": "Er hatte ein ungutes Gefühl bei der Sache.",
          "source": "cefr"
        },
        { "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
        {
          "text": "Rauchen ist schädlich für die Gesundheit.",
          "source": "cefr"
        },
        {
          "text": "Diese Entscheidung könnte nachteilig sein.",
          "source": "cefr"
        },
        {
          "text": "Das Wetter ist heute ungünstig für einen Ausflug.",
          "source": "cefr"
        }
      ]
    },
    "votes": {
      "de": {
        "dürftig": { "cefr_source": "C1" },
        "schlecht": { "cefr_source": "A1" },
        "widrig": { "cefr_source": "C1" },
        "ungut": { "cefr_source": "B2" },
        "negativ": { "cefr_source": "A2" },
        "schädlich": { "cefr_source": "B1" },
        "nachteilig": { "cefr_source": "B1" },
        "ungünstig": { "cefr_source": "B2" }
      }
    }
  },
  {
    "_fixture": "adverb_no_votes",
    "source_id": "ili:i18157",
    "pos": "adverb",
    "translations": { "en": ["a cappella"], "es": ["a capella"] },
    "glosses": { "en": ["without musical accompaniment"] },
    "examples": {
      "en": [{ "text": "they performed a cappella", "source": "omw" }]
    },
    "votes": {}
  }
 ]
--- a/data-pipeline/tests/fixtures/conflicts.fixture.json
+++ b/data-pipeline/tests/fixtures/conflicts.fixture.json
@ -1,4 +0,0 @@
 [
  { "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
  { "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
 ]
--- a/data-pipeline/tests/validation/db-import.validation.test.ts
+++ b/data-pipeline/tests/validation/db-import.validation.test.ts
@ -1,237 +0,0 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { describe, it, expect, beforeAll } from "vitest";
 import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 // One example sentence together with a tag naming its origin ("omw" or "cefr").
 type Example = { text: string; source: "omw" | "cefr" };
 // Shape this test assumes for each record of a stage 2 annotated file.
 // Every per-language map is Partial, so a record may carry no data at all
 // for some of the supported languages.
 type AnnotatedRecord = {
  // Record key; used below to match records across the per-language files.
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // language → string key → vote object holding a `cefr_source` string.
  // NOTE(review): the key looks like the translated word — confirm against stage 2.
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 // path.resolve() with a relative argument resolves against the current working
 // directory, so these locations depend on where the test runner is started.
 // NOTE(review): presumably the data-pipeline package root — confirm.
 const DB_PATH = path.resolve("db/pipeline.db");
 const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
 const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");
 // ── Helpers ───────────────────────────────────────────────────────────────────
 /**
  * Reports whether the database file at DB_PATH can be accessed.
  *
  * @returns `true` when `fs.access` succeeds, `false` when it rejects for any
  *   reason. The returned promise itself never rejects.
  */
 async function dbExists(): Promise<boolean> {
  return fs.access(DB_PATH).then(
    () => true,
    () => false,
  );
 }
 // ── Tests ─────────────────────────────────────────────────────────────────────
 // Validation suite for the imported SQLite database at DB_PATH. It compares
 // row counts with the stage 1 / stage 2 output files and checks that child
 // rows (translations, votes, examples, glosses) reference existing parents.
 // When the database file is missing, only the first test fails; every other
 // test returns before asserting anything.
 describe("pipeline.db — import validation", () => {
  // Read-only handle opened in beforeAll; stays unassigned when the DB file is missing.
  let db: import("better-sqlite3").Database;
  // Number of elements in omw.json; compared with the row count of `synsets`.
  let expectedSynsetCount: number;
  // Total number of vote keys over all records and languages after merging the
  // stage 2 files; compared with the row count of `cefr_source_votes`.
  let expectedCefrVoteCount: number;
  beforeAll(async () => {
    // No database: leave `db` unset so the individual tests bail out early.
    if (!(await dbExists())) return;
    // Dynamic import: the module is only loaded once the existence check passed.
    const Database = (await import("better-sqlite3")).default;
    db = new Database(DB_PATH, { readonly: true });
    // Turn on SQLite foreign-key enforcement for this connection.
    db.pragma("foreign_keys = ON");
    // Count expected synsets from omw.json
    const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
    const omwRecords = JSON.parse(omwRaw) as unknown[];
    expectedSynsetCount = omwRecords.length;
    // Count expected CEFR votes from stage 2 annotated files.
    // Merge all language files the same way the import script does —
    // use en.json as base and merge votes from the other language files.
    const byId = new Map<string, AnnotatedRecord>();
    const baseRaw = await fs.readFile(
      path.join(ANNOTATED_DIR, "en.json"),
      "utf-8",
    );
    const base = JSON.parse(baseRaw) as AnnotatedRecord[];
    for (const record of base) {
      byId.set(record.source_id, record);
    }
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      if (lang === "en") continue;
      const raw = await fs.readFile(
        path.join(ANNOTATED_DIR, `${lang}.json`),
        "utf-8",
      );
      const records = JSON.parse(raw) as AnnotatedRecord[];
      for (const record of records) {
        // Note: this `base` shadows the array above; here it is the single
        // en.json record sharing this source_id.
        const base = byId.get(record.source_id);
        // Records that do not appear in en.json are ignored.
        if (!base) continue;
        for (const [l, langVotes] of Object.entries(record.votes)) {
          if (!base.votes[l as SupportedLanguageCode]) {
            base.votes[l as SupportedLanguageCode] = {};
          }
          // Object.assign overwrites on equal keys, so a key seen in several
          // files is kept once and counted once.
          Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
        }
      }
    }
    // Sum the number of vote keys per language over every merged record.
    expectedCefrVoteCount = 0;
    for (const record of byId.values()) {
      for (const langVotes of Object.values(record.votes)) {
        expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
      }
    }
  }, 120_000); // hook timeout in milliseconds
  // Fails (after printing a hint) when the database file is absent. The other
  // tests are not marked as skipped; they return early and pass.
  it("pipeline.db exists — skipping all tests if not", async () => {
    const exists = await dbExists();
    if (!exists) {
      console.warn(
        "\n  pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
      );
    }
    expect(exists).toBe(true);
  });
  it("synsets count matches omw.json", () => {
    // `db` is unset when beforeAll found no database file: return without asserting.
    if (!db) return;
    const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
      count: number;
    };
    expect(row.count).toBe(expectedSynsetCount);
  });
  // Anti-join: selects synsets for which no translation row matches.
  it("every synset has at least one translation", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT s.source_id
        FROM synsets s
        LEFT JOIN translations t ON t.source_id = s.source_id
        WHERE t.id IS NULL
      `,
      )
      .all() as { source_id: string }[];
    // The joined message is passed to expect() so a failure lists every offender.
    const errors = rows.map((r) => `${r.source_id}: no translations`);
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  // Anti-join in the other direction: translations whose synset row is missing.
  it("every translation belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT t.id, t.source_id
        FROM translations t
        LEFT JOIN synsets s ON s.source_id = t.source_id
        WHERE s.source_id IS NULL
      `,
      )
      .all() as { id: number; source_id: string }[];
    const errors = rows.map(
      (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  // Votes whose translation_id matches no row in translations.
  it("every cefr_source_vote references a valid translation", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT v.id, v.translation_id
        FROM cefr_source_votes v
        LEFT JOIN translations t ON t.id = v.translation_id
        WHERE t.id IS NULL
      `,
      )
      .all() as { id: number; translation_id: number }[];
    const errors = rows.map(
      (r) =>
        `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  // Compares the table's row count with the total computed in beforeAll.
  it("cefr_source_votes count matches stage 2 annotated output", () => {
    if (!db) return;
    const row = db
      .prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
      .get() as { count: number };
    expect(row.count).toBe(expectedCefrVoteCount);
  });
  // Only 'omw' and 'cefr' are accepted as example sources.
  it("every example has a valid source", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT source_id, language, source
        FROM examples
        WHERE source NOT IN ('omw', 'cefr')
      `,
      )
      .all() as { source_id: string; language: string; source: string }[];
    const errors = rows.map(
      (r) =>
        `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  // Examples whose source_id matches no synset row.
  it("every example belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT e.id, e.source_id
        FROM examples e
        LEFT JOIN synsets s ON s.source_id = e.source_id
        WHERE s.source_id IS NULL
      `,
      )
      .all() as { id: number; source_id: string }[];
    const errors = rows.map(
      (r) => `example ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  // Glosses whose source_id matches no synset row.
  it("every gloss belongs to a valid synset", () => {
    if (!db) return;
    const rows = db
      .prepare(
        `
        SELECT g.id, g.source_id
        FROM glosses g
        LEFT JOIN synsets s ON s.source_id = g.source_id
        WHERE s.source_id IS NULL
      `,
      )
      .all() as { id: number; source_id: string }[];
    const errors = rows.map(
      (r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
 });
--- a/data-pipeline/tests/validation/stage-1.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@ -1,166 +0,0 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { describe, it, expect } from "vitest";
 import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 // Shape this suite assumes for each element of omw.json. Every per-language
 // map is Partial, so a record may lack entries for some supported languages.
 type OmwRecord = {
  // Expected to look like "ili:i<digits>" (checked by isValidSourceId).
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  // Examples are typed as plain strings here.
  examples: Partial<Record<SupportedLanguageCode, string[]>>;
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 // path.resolve() with a relative argument resolves against the current working
 // directory, so this location depends on where the test runner is started.
 const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
 // ── Helpers ───────────────────────────────────────────────────────────────────
 /**
  * Tells whether an identifier has the exact form `ili:i` followed by one or
  * more digits, with nothing before or after.
  *
  * @param id - Candidate source identifier.
  * @returns `true` when the whole string matches, otherwise `false`.
  */
 function isValidSourceId(id: string): boolean {
  const iliPattern = /^ili:i\d+$/;
  return iliPattern.test(id);
 }
 // ── Tests ─────────────────────────────────────────────────────────────────────
 // Validation suite for the stage 1 output file (OMW_PATH): structure, id
 // format and uniqueness, supported POS / language values, and duplicate-free
 // translation lists.
 describe("stage 1 — omw.json validation", () => {
  // The file is read and parsed once, on first use, and the resulting promise
  // is shared by every test. Previously each test re-read and re-parsed it.
  // A failed read or parse rejects the shared promise, so every test that
  // awaits it reports the same error.
  let recordsPromise: Promise<OmwRecord[]> | undefined;
  function loadRecords(): Promise<OmwRecord[]> {
    if (recordsPromise === undefined) {
      recordsPromise = fs
        .readFile(OMW_PATH, "utf-8")
        .then((raw) => JSON.parse(raw) as OmwRecord[]);
    }
    return recordsPromise;
  }
  // Rejects (and therefore fails) when the file is missing or is not valid JSON.
  it("file exists and is valid JSON", async () => {
    const records = await loadRecords();
    expect(records).toBeDefined();
  });
  it("is a non-empty array", async () => {
    const records = await loadRecords();
    expect(Array.isArray(records)).toBe(true);
    expect(records.length).toBeGreaterThan(0);
  });
  it("every record has required fields", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      // Without a source_id the remaining messages could not name the record.
      if (!record.source_id) {
        errors.push(`missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${record.source_id}: missing translations`);
      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
      if (!record.examples)
        errors.push(`${record.source_id}: missing examples`);
    }
    // The joined message is passed to expect() so a failure lists every offender.
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every source_id matches ili:i{number} pattern", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!isValidSourceId(record.source_id)) {
        errors.push(`invalid source_id: ${record.source_id}`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every source_id is unique", async () => {
    const records = await loadRecords();
    const seen = new Set<string>();
    const errors: string[] = [];
    for (const record of records) {
      if (seen.has(record.source_id)) {
        errors.push(`duplicate source_id: ${record.source_id}`);
      }
      seen.add(record.source_id);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every pos is a valid supported value", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const record of records) {
      if (!validPos.has(record.pos)) {
        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  // Also reports language keys outside SUPPORTED_LANGUAGE_CODES and language
  // keys whose word list is empty.
  it("every record has at least one translation in at least one language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const record of records) {
      const langs = Object.keys(record.translations) as SupportedLanguageCode[];
      if (langs.length === 0) {
        errors.push(`${record.source_id}: no translations`);
        continue;
      }
      for (const lang of langs) {
        if (!validLangs.has(lang)) {
          errors.push(`${record.source_id}: unsupported language "${lang}"`);
        }
        const words = record.translations[lang] ?? [];
        if (words.length === 0) {
          errors.push(`${record.source_id}: empty translations for "${lang}"`);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("no duplicate translations within a single synset and language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      for (const [lang, words] of Object.entries(record.translations)) {
        // Fresh set per record and language: duplicates are only an error
        // inside the same list.
        const seen = new Set<string>();
        for (const word of words) {
          if (seen.has(word)) {
            errors.push(
              `${record.source_id} (${lang}): duplicate translation "${word}"`,
            );
          }
          seen.add(word);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
 });
--- a/data-pipeline/tests/validation/stage-2.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-2.validation.test.ts
@ -1,218 +0,0 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { describe, it, expect, beforeAll } from "vitest";
 import {
  SUPPORTED_POS,
  SUPPORTED_LANGUAGE_CODES,
  CEFR_LEVELS,
 } from "@lila/shared";
 import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 // One example sentence together with a tag naming its origin ("omw" or "cefr").
 type Example = { text: string; source: "omw" | "cefr" };
 // Shape this suite assumes for each record of a `<lang>.json` output file.
 // Every per-language map is Partial, so a record may carry no data at all
 // for some of the supported languages.
 type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // language → string key → vote object; `cefr_source` is validated against
  // CEFR_LEVELS by the suite.
  // NOTE(review): the key looks like the translated word — confirm.
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
 };
 // Shape assumed for each element of conflicts.json: a word, its part of
 // speech and language, and a list of levels (the suite requires at least two).
 type ConflictEntry = {
  word: string;
  pos: string;
  language: SupportedLanguageCode;
  levels: string[];
 };
 // ── Paths ─────────────────────────────────────────────────────────────────────
 // path.resolve() with a relative argument resolves against the current working
 // directory, so this location depends on where the test runner is started.
 const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
 // ── Tests ─────────────────────────────────────────────────────────────────────
 // Validation suite for the stage 2 output directory (OUTPUT_DIR): one
 // `<lang>.json` file per supported language plus conflicts.json. All files are
 // loaded once in beforeAll; the tests then check structure and allowed values.
 describe("stage 2 — annotated output validation", () => {
  // Parsed content of each `<lang>.json`, keyed by language code; filled in beforeAll.
  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
  // Parsed content of conflicts.json; filled in beforeAll.
  let conflicts: ConflictEntry[] = [];
  beforeAll(async () => {
    // No error handling here: a missing or unparsable file rejects out of this hook.
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const raw = await fs.readFile(
        path.join(OUTPUT_DIR, `${lang}.json`),
        "utf-8",
      );
      recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]);
    }
    const raw = await fs.readFile(
      path.join(OUTPUT_DIR, "conflicts.json"),
      "utf-8",
    );
    conflicts = JSON.parse(raw) as ConflictEntry[];
  }, 60_000); // hook timeout in milliseconds
  // Checks file presence independently of the data loaded in beforeAll and
  // reports every missing file at once.
  it("all five language files exist", async () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
      try {
        await fs.access(filePath);
      } catch {
        errors.push(`missing file: ${lang}.json`);
      }
    }
    // The joined message is passed to expect() so a failure lists every offender.
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("conflicts.json exists", async () => {
    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
    // fs.access resolves with undefined on success and rejects otherwise.
    await expect(fs.access(filePath)).resolves.toBeUndefined();
  });
  it("every language file is a non-empty array", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      // Non-null assertion relies on beforeAll having stored an entry for every language.
      const records = recordsByLang.get(lang)!;
      if (!Array.isArray(records)) {
        errors.push(`${lang}.json: not an array`);
      } else if (records.length === 0) {
        errors.push(`${lang}.json: empty array`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every record has required fields", () => {
    const errors: string[] = [];
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsByLang.get(lang)!;
      for (const record of records) {
        // Without a source_id the remaining messages could not name the record.
        if (!record.source_id) {
          errors.push(`${lang}: record missing source_id`);
          continue;
        }
        if (!record.pos)
          errors.push(`${lang} ${record.source_id}: missing pos`);
        if (!record.translations)
          errors.push(`${lang} ${record.source_id}: missing translations`);
        if (!record.glosses)
          errors.push(`${lang} ${record.source_id}: missing glosses`);
        // examples and votes are compared with `undefined` only, so any other
        // value (including null) counts as present.
        if (record.examples === undefined)
          errors.push(`${lang} ${record.source_id}: missing examples`);
        if (record.votes === undefined)
          errors.push(`${lang} ${record.source_id}: missing votes`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every pos is a valid supported value", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsByLang.get(lang)!;
      for (const record of records) {
        if (!validPos.has(record.pos)) {
          errors.push(
            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
          );
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every example has text and a valid source", () => {
    const errors: string[] = [];
    const validSources = new Set(["omw", "cefr"]);
    // `lang` names the file being checked; `l` is the language key inside
    // that record's `examples` map.
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsByLang.get(lang)!;
      for (const record of records) {
        for (const [l, examples] of Object.entries(record.examples)) {
          for (const example of examples) {
            if (!example.text) {
              errors.push(
                `${lang} ${record.source_id} (${l}): example missing text`,
              );
            }
            if (!validSources.has(example.source)) {
              errors.push(
                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
              );
            }
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("every cefr_source vote is a valid CEFR level", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsByLang.get(lang)!;
      for (const record of records) {
        for (const [l, langVotes] of Object.entries(record.votes)) {
          // `?? {}` tolerates a language key whose vote map is null/undefined.
          for (const [word, vote] of Object.entries(langVotes ?? {})) {
            if (
              // The cast only satisfies Set.has's parameter type; the value is
              // an arbitrary string at runtime.
              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
            ) {
              errors.push(
                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
              );
            }
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
  it("conflicts.json entries have required fields and valid CEFR levels", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const entry of conflicts) {
      if (!entry.word) errors.push(`conflict missing word`);
      if (!entry.pos) errors.push(`conflict missing pos`);
      if (!entry.language) {
        errors.push(`conflict missing language`);
      } else if (!validLangs.has(entry.language)) {
        errors.push(`conflict invalid language "${entry.language}"`);
      }
      // An entry needs at least two levels; each level is validated only
      // when that holds.
      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
        errors.push(`${entry.word}: levels must have at least 2 entries`);
      } else {
        for (const level of entry.levels) {
          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
            errors.push(`${entry.word}: invalid level "${level}"`);
          }
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
 });