feat: add Kaikki extraction and import scripts for stage 1

- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, skips abbreviations and senses with no translations in supported languages
- Rewrite db/import.ts for Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for Kaikki model — entries, translations, LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode hardcoded to 500 entries for development
2026-05-05 18:11:53 +02:00 · 2026-05-05 18:11:53 +02:00 · 209d52f54b
commit 209d52f54b
parent 963bff4eb8
17 changed files with 346 additions and 1055737 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ __pycache__/

 data-pipeline/archive/
 data-pipeline/stage-1-extract/output/
+data-pipeline/stage-1-extract/sources/
 data-pipeline/stage-2-annotate/output/
 data-pipeline/stage-3-enrich/output/
 data-pipeline/stage-4-merge/output/
--- a/data-pipeline/audit.md
+++ b/data-pipeline/audit.md
@@ -1,362 +0,0 @@
-# OMW German Translation Quality Audit
-
-Instructions: for each entry, check if the German translations
-match the meaning described by the English gloss.
-
-Mark QUALITY as:
-OK — all German translations fit the meaning
-PARTIAL — some fit, some don't
-BAD — none of the German translations fit
-USELESS — translations are correct but useless for learners
-
----
-
-1.  [noun] ili:i98680
-    EN gloss: the flowering part of a plant or arrangement of flowers on a stalk
-    DE gloss: der blühende Teil einer Pflanze oder die Anordnung von Blüten an einem Stiel
-    EN words: inflorescence
-    DE words: Blütenstand, Infloreszenz
-    QUALITY: correct
-
-2.  [verb] ili:i24675
-    EN gloss: make motionless
-    DE gloss: unbeweglich machen
-    EN words: still
-    DE words: stillen, zum Stillstand bringen
-    QUALITY: stillen means breastfeeding, so completelyworng, zum stillstand bringen is correct but the gloss sounds weird: unbeweglich machen, no one says this
-
-3.  [verb] ili:i22153
-    EN gloss: lose interest or become bored with something or somebody
-    DE gloss: das Interesse an etwas oder jemandem verlieren oder sich langweilen
-    EN words: fatigue, jade, pall, tire, weary
-    DE words: Langeweile erzeugen, anöden, ermüden, langweilen, sich langweilen, sich zu Tode langweilen, sich öden
-    QUALITY: its ok
-
-4.  [noun] ili:i74742
-    EN gloss: zealous preaching and advocacy of the gospel
-    DE gloss: eifriges Predigen und Eintreten für das Evangelium
-    EN words: evangelism
-    DE words: Evangelisation, Evangelisierung
-    QUALITY: ok
-
-5.  [noun] ili:i115665
-    EN gloss: an oxide of iron that is strongly attracted by magnets
-    DE gloss: ein Eisenoxid, das stark von Magneten angezogen wird
-    EN words: magnetic iron-ore, magnetite
-    DE words: Eisenoxiduloxid, Magneteisen, Magneteisenstein, Magnetit
-    QUALITY: ok
-
-6.  [adjective] ili:i17569
-    EN gloss: of or relating to fatalism
-    DE gloss: von oder im Zusammenhang mit Fatalismus
-    EN words: fatalist, fatalistic
-    DE words: auf alles gefasst, dem Schicksal ergeben, fatalistisch, gottergeben, schicksalsergeben
-    QUALITY: ok
-
-7.  [adjective] ili:i682
-    EN gloss: having no previous example or precedent or parallel
-    DE gloss: ohne vorheriges Beispiel oder Präzedenzfall oder Parallele
-    EN words: new, unexampled
-    DE words: beispiellos, gab es noch nie, ohne Beispiel, ohne Präzedenzfall, ohnegleichen, präzedenzlos, sondergleichen, unvergleichbar
-    QUALITY: ok
-
-8.  [noun] ili:i114018
-    EN gloss: a soft silvery metallic element of the rare earth group; isotope 170 emits X-rays and is used in small portable X-ray machines; it occurs in monazite and apatite and xenotime
-    DE gloss: ein weiches, silbriges Metallelement der Gruppe der Seltenen Erden; Isotop 170 emittiert Röntgenstrahlen und wird in kleinen tragbaren Röntgengeräten verwendet; es kommt in Monazit und Apatit sowie in Xenotim vor
-    EN words: Tm, atomic number 69, thulium
-    DE words: Terameter, Tm
-    QUALITY: ok
-
-9.  [noun] ili:i117564
-    EN gloss: the rate of some repeating event
-    DE gloss: die Geschwindigkeit eines sich wiederholenden Ereignisses
-    EN words: pace, tempo
-    DE words: Takt, Tempo
-    QUALITY: ok
-
-10. [verb] ili:i31619
-    EN gloss: let drop or droop
-    DE gloss: fallen oder hängen lassen
-    EN words: hang
-    DE words: am Galgen sterben lassen, aufhängen, aufknüpfen, erhängen, henken, hängen
-    QUALITY: wrong,let drop means fallen lassen, like dropping something? im not sure here, does it really mean to hang some one? if so, then its ok
-
-11. [noun] ili:i75571
-    EN gloss: a heavy dull sound (as made by impact of heavy objects)
-    DE gloss: ein schweres, dumpfes Geräusch (wie beim Aufprall schwerer Gegenstände)
-    EN words: clump, clunk, thud, thump, thumping
-    DE words: Geklacker, Geklapper, Klackern, Klappern
-    QUALITY: ok
-
-12. [noun] ili:i92290
-    EN gloss: a person who makes a promise
-    DE gloss: eine Person, die ein Versprechen gibt
-    EN words: promiser, promisor
-    DE words: Freud'scher Versprecher, Lapsus Linguae, Versprecher, freudscher Versprecher
-    QUALITY: completeley wrong, Versprecher is if you intend to say something but say some thing else, it has nothing to do with Versprechen
-
-13. [noun] ili:i59450
-    EN gloss: a vertical well around which there is a stairway
-    DE gloss: ein vertikaler Schacht, um den herum eine Treppe verläuft
-    EN words: stairwell
-    DE words: Ern, Flur, Hausflur, Stiegenhaus, Treppenhaus
-    QUALITY: treppenhaus woudl be the only correct one right?
-
-14. [verb] ili:i21908
-    EN gloss: smile affectedly or derisively
-    DE gloss: affektiert oder spöttisch lächeln
-    EN words: simper, smirk
-    DE words: in sich hinein lächeln, schmunzeln, vor sich hin lächeln
-    QUALITY: the glosses would be also the words here? schmunzeln and lächeln are kind of the same but the affektiert and spöttisch is missing?
-
-15. [adjective] ili:i10887
-    EN gloss: tending to reserve or introspection
-    DE gloss: zur Zurückhaltung oder Introspektion neigend
-    EN words: indrawn, withdrawn
-    DE words: allein, einsam, eremitenhaft, eremitisch, für sich, solo, wie ein Einsiedler, wie ein Eremit, zurückgezogen
-    QUALITY: ok
-
-16. [noun] ili:i113657
-    EN gloss: a substance from which another substance is formed (especially by a metabolic reaction)
-    DE gloss: ein Stoff, aus dem ein anderer Stoff gebildet wird (insbesondere durch eine Stoffwechselreaktion)
-    EN words: precursor
-    DE words: Ausgangsstoff, Edukt, Grundstoff, Präkursor, Vorläufer, biologische Vorstufe
-    QUALITY: ok
-
-17. [adjective] ili:i13251
-    EN gloss: tastelessly showy
-    DE gloss: geschmacklos und auffällig
-    EN words: brassy, cheap, flash, flashy, garish, gaudy, gimcrack, loud, meretricious, tacky, tatty, tawdry, trashy
-    DE words: aufdringlich, marktschreierisch, reißerisch
-    QUALITY: ok
-
-18. [noun] ili:i68734
-    EN gloss: the branch of chemistry that studies the relation between chemical action and the amount of heat absorbed or generated
-    DE gloss: der Zweig der Chemie, der die Beziehung zwischen chemischer Wirkung und der absorbierten oder erzeugten Wärmemenge untersucht
-    EN words: thermochemistry
-    DE words: Thermochemie, chemische Thermodynamik
-    QUALITY: ok
-
-19. [adjective] ili:i12980
-    EN gloss: distinguished from others in excellence
-    DE gloss: durch hohe Qualität von anderen unterschieden
-    EN words: outstanding
-    DE words: I a, ausgezeichnet, außergewöhnlich, außerordentlich, besonders, bestens, eins a, exzeptionell, herausragend, schnafte, splendid, trefflich, vortrefflich, vorzüglich
-    QUALITY: ok, aber eins a/1a is wirklich sehr starke umgangssprache. und cih habe ncoh nie schnafte oder splendid gehört, der rest passt
-
-20. [verb] ili:i30043
-    EN gloss: tear down so as to make flat with the ground
-    DE gloss: abreißen, um den Boden zu ebnen
-    EN words: dismantle, level, pull down, rase, raze, take down, tear down
-    DE words: abreißen, aus den Augen verlieren, keinen Kontakt mehr haben zu, nicht länger in Kontakt stehen
-    QUALITY: nur abreißen stimmt, der rest passt in diesem zusammenhang gar nicht!
-
-21. [adjective] ili:i14014
-    EN gloss: desired or wished for or sought
-    DE gloss: gewünscht oder gewünscht oder gesucht
-    EN words: wanted
-    DE words: benötigt, gesucht, gewünscht
-    QUALITY: ok
-
-22. [verb] ili:i29481
-    EN gloss: mar or spoil the appearance of
-    DE gloss: das Aussehen verunstalten
-    EN words: blemish, deface, disfigure
-    DE words: deformieren, entstellen, verhunzen, verschandeln, verunstalten, verunzieren
-    QUALITY: ok
-
-23. [verb] ili:i28605
-    EN gloss: spread thickly
-    DE gloss: dick auftragen
-    EN words: slather
-    DE words: beharken, bestreichen, mit Feuer belegen, mit Sperrfeuer belegen
-    QUALITY: kein wort ist wirklich ein synonym für dick auftragen, (i dont even know if the english word fits here?)
-
-24. [noun] ili:i92029
-    EN gloss: someone who is licensed to operate an aircraft in flight
-    DE gloss: jemand, der eine Lizenz zum Führen eines Luftfahrzeugs im Flug hat
-    EN words: airplane pilot, pilot
-    DE words: Führer, Lotse, Pilot
-    QUALITY: nur Pilot stimmt hier
-
-25. [adjective] ili:i8221
-    EN gloss: capable of being measured
-    DE gloss: in der Lage, gemessen zu werden
-    EN words: measurable, mensurable
-    DE words: bestimmbar, der Messung zugänglich, erhebbar, mensurabel, messbar
-    QUALITY: ok
-
-26. [noun] ili:i61380
-    EN gloss: the spirit of a group that makes the members want the group to succeed
-    DE gloss: der Geist einer Gruppe, der die Mitglieder dazu bringt, den Erfolg der Gruppe zu wollen
-    EN words: esprit de corps, morale, team spirit
-    DE words: Gruppengeist, Teamgeist
-    QUALITY: Gruppengeist hört sich so komisch an, das sagt niemand, teamgeist ist in ordnung
-
-27. [adjective] ili:i10497
-    EN gloss: free of restrictions or qualifications
-    DE gloss: Zustand, in dem in einer Wohnung niemand wohnt.
-    EN words: clean, clear
-    DE words: frei, leer stehend, leerstehend, unbewohnt, ungenutzt, verwaist
-    QUALITY: ok
-
-28. [adjective] ili:i6238
-    EN gloss: moving and bending with ease
-    DE gloss: anmutig schlank und mit Leichtigkeit biegsam und beweglich
-    EN words: lissom, lissome, lithe, lithesome, slender, supple, svelte, sylphlike
-    DE words: elastisch, geschmeidig, schlangenartig
-    QUALITY: \_\_\_
-
-29. [noun] ili:i57906
-    EN gloss: station for the production and transmission of AM or FM radio broadcasts
-    DE gloss: Sender für die Produktion und Übertragung von AM- oder FM-Radiosendungen
-    EN words: radio station
-    DE words: Radiosender, Rundfunkstation, Sender
-    QUALITY: \_\_\_
-
-30. [noun] ili:i112045
-    EN gloss: the purple or black-and-blue area resulting from a bruise
-    DE gloss: der violette oder schwarzblaue Bereich, der durch einen Bluterguss entsteht
-    EN words: ecchymosis
-    DE words: Ekchymose, kleinflächige Hautblutung
-    QUALITY: \_\_\_
-
-31. [adjective] ili:i10839
-    EN gloss: capable of being replaced
-    DE gloss: kann ersetzt werden
-    EN words: replaceable
-    DE words: austauschbar, ersetzbar, fungibel
-    QUALITY: \_\_\_
-
-32. [verb] ili:i28714
-    EN gloss: whip
-    DE gloss: peitschen
-    EN words: flagellate, scourge
-    DE words: auspeitschen, flagellieren, geißeln, peitschen
-    QUALITY: \_\_\_
-
-33. [noun] ili:i52826
-    EN gloss: a mechanical or electrical explosive device or a small amount of explosive; can be used to initiate the reaction of a disrupting explosive
-    DE gloss: ein mechanischer oder elektrischer Sprengkörper oder eine kleine Menge Sprengstoff; kann verwendet werden, um die Reaktion eines Sprengstoffs auszulösen
-    EN words: cap, detonating device, detonator
-    DE words: Auslöser, Zünder, Zündvorrichtung
-    QUALITY: \_\_\_
-
-34. [noun] ili:i115477
-    EN gloss: ice crystals forming a white deposit (especially on objects outside)
-    DE gloss: Eiskristalle, die einen weißen Belag bilden (insbesondere auf Gegenständen im Freien)
-    EN words: frost, hoar, hoarfrost, rime
-    DE words: Raufrost, Raureif, Reif
-    QUALITY: \_\_\_
-
-35. [noun] ili:i66650
-    EN gloss: the ability to see in reduced illumination (as in moonlight)
-    DE gloss: die Fähigkeit, bei reduzierter Beleuchtung zu sehen (wie bei Mondlicht)
-    EN words: night vision, night-sight, scotopic vision, twilight vision
-    DE words: Nachtsehen, skotopisches Sehen
-    QUALITY: \_\_\_
-
-36. [verb] ili:i26849
-    EN gloss: express or utter with a hiss
-    DE gloss: mit einem Zischen ausdrücken oder aussprechen
-    EN words: hiss, sibilate, siss, sizz
-    DE words: Stimme dämpfen, flüstern, hauchen, hinter vorgehaltener Hand, ins Ohr sagen, leise sprechen, mit tonloser Stimme, munkeln, raunen, säuseln, tonlos, tuscheln, wispern, zischeln, zuflüstern
-    QUALITY: \_\_\_
-
-37. [noun] ili:i94222
-    EN gloss: a teenager or a young adult male
-    DE gloss: ein Jugendlicher oder ein junger Erwachsener
-    EN words: young buck, young man
-    DE words: Bruder, Bürschchen, Cowboy, Freundchen, Jungs, Kinders, Kollege, Kollegin, Leute, Mann Gottes, Meister, Sportsfreund, Verehrtester, der Herr, guter Mann, junger Mann, mein Gutster, mein Herr
-    QUALITY: \_\_\_
-
-38. [noun] ili:i49310
-    EN gloss: dusky grey food fish found from Louisiana and Florida southward
-    DE gloss: dunkelgrauer Speisefisch, der von Louisiana und Florida südwärts vorkommt
-    EN words: Anisotremus surinamensis, black margate, pompon
-    DE words: Pompon, Puschel, Tanzwedel
-    QUALITY: \_\_\_
-
-39. [noun] ili:i50315
-    EN gloss: a small vehicle with four wheels in which a baby or child is pushed around
-    DE gloss: ein kleines Fahrzeug mit vier Rädern, in dem ein Säugling oder ein Kind herumgeschoben wird
-    EN words: baby buggy, baby carriage, carriage, go-cart, perambulator, pram, pushchair, pusher, stroller
-    DE words: Kinderwagen, Säuglingskutsche
-    QUALITY: \_\_\_
-
-40. [verb] ili:i31857
-    EN gloss: meet at a point
-    DE gloss: sich an einem Punkt treffen
-    EN words: cross, intersect
-    DE words: gegen den Wind segeln, kreuzen
-    QUALITY: \_\_\_
-
-41. [noun] ili:i51632
-    EN gloss: a sailboat with two parallel hulls held together by single deck
-    DE gloss: ein Boot mit zwei parallelen Rümpfen, die durch ein einziges Deck zusammengehalten werden
-    EN words: catamaran
-    DE words: Doppelrumpfboot, Katamaran, Zweirumpfboot
-    QUALITY: \_\_\_
-
-42. [verb] ili:i34734
-    EN gloss: to be found to exist
-    DE gloss: als existent befunden werden
-    EN words: occur
-    DE words: anzutreffen sein, auftreten, nicht ausbleiben, vorkommen, zu finden sein, zu sehen sein
-    QUALITY: \_\_\_
-
-43. [verb] ili:i25187
-    EN gloss: assign too high a value to
-    DE gloss: einen zu hohen Wert zuweisen
-    EN words: overestimate, overvalue
-    DE words: zu hoch bewerten, zu viel Gewicht beimessen, zu viel Wichtigkeit beimessen, überbewerten, überschätzen
-    QUALITY: \_\_\_
-
-44. [noun] ili:i73844
-    EN gloss: an expressive style of music
-    DE gloss: ein ausdrucksstarker Musikstil
-    EN words: genre, music genre, musical genre, musical style
-    DE words: Genre, Musikgenre, Musikrichtung, Musikstil, Stilrichtung
-    QUALITY: \_\_\_
-
-45. [noun] ili:i113026
-    EN gloss: an abnormal condition in which cerebrospinal fluid collects in the ventricles of the brain; in infants it can cause abnormally rapid growth of the head and bulging fontanelles and a small face; in adults the symptoms are primarily neurological
-    DE gloss: ein anormaler Zustand, bei dem sich Liquor in den Hirnventrikeln sammelt; bei Säuglingen kann er zu einem anormal schnellen Wachstum des Kopfes, zu wulstigen Fontanellen und einem kleinen Gesicht führen; bei Erwachsenen sind die Symptome hauptsächlich neurologisch
-    EN words: hydrocephalus, hydrocephaly
-    DE words: Gehirnwassersucht, Hydrocephalus, Hydrozephalus, Wasserkopf
-    QUALITY: \_\_\_
-
-46. [noun] ili:i62720
-    EN gloss: habitual uncleanliness
-    DE gloss: gewohnheitsmäßige Unreinheit
-    EN words: slovenliness
-    DE words: Flickarbeit, Flickenteppich, Flickwerk, Gestümper, Mist, Murks, Murkserei, Pfusch, Pfuscharbeit, Pfuscherei, Schlamperei, Schlendrian, Schluderei, Schund, schlechte Arbeit
-    QUALITY: \_\_\_
-
-47. [noun] ili:i80976
-    EN gloss: the government agency in the United Kingdom that is responsible for internal security and counterintelligence overseas
-    DE gloss: Regierungsbehörde im Vereinigten Königreich, die für die innere Sicherheit und die Spionageabwehr im Ausland zuständig ist.
-    EN words: MI, Military Intelligence Section 6, Secret Intelligence Service
-    DE words: MI6, SIS, Secret Intelligence Service, Secret Service, britischer Auslandsgeheimdienst
-    QUALITY: \_\_\_
-
-48. [noun] ili:i60476
-    EN gloss: an electrical device by which alternating current of one voltage is changed to another voltage
-    DE gloss: ein elektrisches Gerät, mit dem Wechselstrom einer bestimmten Spannung in eine andere Spannung umgewandelt wird
-    EN words: transformer
-    DE words: Spannungswandler, Trafo, Transformator, Transformer
-    QUALITY: \_\_\_
-
-49. [noun] ili:i37037
-    EN gloss: wandering from the main path of a journey
-    DE gloss: das Abweichen vom Hauptweg einer Reise
-    EN words: digression, excursion
-    DE words: Abschweifung, Abstecher, Einschub, Exkurs, Umschweif
-    QUALITY: \_\_\_
-
-50. [noun] ili:i77288
-    EN gloss: any meat that is minced and spiced and cooked as patties or used to fill sausages
-    DE gloss: jegliches Fleisch, das zerkleinert und gewürzt und als Pasteten gekocht oder zur Füllung von Würsten verwendet wird
-    EN words: sausage meat
-    DE words: Brät, Wurstbrät
-    QUALITY: \_\_\_
--- a/data-pipeline/db/import.ts
+++ b/data-pipeline/db/import.ts
@@ -1,185 +1,98 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
-import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 import { openDb } from "./index.js";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
+import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";

 // ── Paths ─────────────────────────────────────────────────────────────────────

 const __dirname = path.dirname(fileURLToPath(import.meta.url));

 const PATHS = {
-  annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
+  extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
 };

-// ── Loading ───────────────────────────────────────────────────────────────────
-
-async function loadAnnotated(): Promise<AnnotatedRecord[]> {
-  // Use en.json as the base — it has the most complete glosses and examples.
-  // Merge votes and CEFR examples from the other language files.
-  const baseRaw = await fs.readFile(
-    path.join(PATHS.annotatedDir, "en.json"),
-    "utf-8",
-  );
-  const base = JSON.parse(baseRaw) as AnnotatedRecord[];
-
-  const byId = new Map<string, AnnotatedRecord>();
-  for (const record of base) {
-    byId.set(record.source_id, record);
-  }
-
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    if (lang === "en") continue;
-
-    const raw = await fs.readFile(
-      path.join(PATHS.annotatedDir, `${lang}.json`),
-      "utf-8",
-    );
-    const records = JSON.parse(raw) as AnnotatedRecord[];
-
-    for (const record of records) {
-      const base = byId.get(record.source_id);
-      if (!base) continue;
-
-      // Merge votes
-      for (const [l, langVotes] of Object.entries(record.votes)) {
-        if (!base.votes[l as SupportedLanguageCode]) {
-          base.votes[l as SupportedLanguageCode] = {};
-        }
-        Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
-      }
-
-      // Merge CEFR examples not already in base
-      for (const [l, examples] of Object.entries(record.examples)) {
-        const lang = l as SupportedLanguageCode;
-        const cefrExamples = examples.filter((e) => e.source === "cefr");
-        if (cefrExamples.length === 0) continue;
-
-        if (!base.examples[lang]) {
-          base.examples[lang] = cefrExamples;
-        } else {
-          base.examples[lang].push(...cefrExamples);
-        }
-      }
-    }
-  }
-
-  return [...byId.values()];
-}
-
 // ── Import ────────────────────────────────────────────────────────────────────

-export async function importStage2(): Promise<void> {
-  console.log("Loading stage 2 annotated files...");
-  const records = await loadAnnotated();
-  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);
+export async function importKaikki(): Promise<void> {
+  console.log("Loading extracted Kaikki data...");
+  const raw = await fs.readFile(PATHS.extracted, "utf-8");
+  const senses = JSON.parse(raw) as ExtractedSense[];
+  console.log(`  Loaded ${senses.length.toLocaleString()} senses`);

  const db = openDb();

-  const insertSynset = db.prepare(
-    `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
-  );
-
-  const insertTranslation = db.prepare(
-    `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
-  );
-
-  const insertGloss = db.prepare(
-    `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
-  );
-
-  const insertExample = db.prepare(
-    `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
-  );
-
-  const insertCefrVote = db.prepare(`
-    INSERT INTO cefr_source_votes (translation_id, cefr_level)
-    VALUES (
-      (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
-      ?
-    )
+  const insertEntry = db.prepare(`
+    INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
+    VALUES (?, ?, ?, ?, ?, ?)
+    ON CONFLICT (headword, language, pos, sense_index)
+    DO UPDATE SET
+      gloss    = excluded.gloss,
+      examples = excluded.examples
+    RETURNING id
  `);

+  const insertTranslation = db.prepare(`
+    INSERT INTO translations (entry_id, target_lang, word, sense_hint)
+    VALUES (?, ?, ?, ?)
+    ON CONFLICT (entry_id, target_lang, word) DO NOTHING
+  `);
+
+  // Track next available sense_index per (headword, pos) to handle
+  // the same word appearing in multiple JSONL entries with the same POS.
+  const senseIndexMap = new Map<string, number>();
+
  console.log("\nImporting into pipeline.db...");

  const importAll = db.transaction(() => {
-    let synsets = 0;
+    let entries = 0;
    let translations = 0;
-    let glosses = 0;
-    let examples = 0;
-    let cefrVotes = 0;
+    let skipped = 0;

-    for (const record of records) {
-      insertSynset.run(record.source_id, record.pos);
-      synsets++;
+    for (const sense of senses) {
+      const key = `${sense.headword}|${sense.pos}`;
+      const nextIndex = senseIndexMap.get(key) ?? 0;

-      // Translations
-      for (const [lang, words] of Object.entries(record.translations)) {
-        const unique = [...new Set(words)];
-        for (const word of unique) {
-          insertTranslation.run(record.source_id, lang, word);
-          translations++;
-        }
+      // Use the offset sense_index to avoid collisions when the same word
+      // appears in multiple JSONL entries with the same POS.
+      const senseIndex = nextIndex;
+      senseIndexMap.set(key, nextIndex + 1);
+
+      const row = insertEntry.get(
+        sense.headword,
+        "en",
+        sense.pos,
+        senseIndex,
+        sense.gloss ?? null,
+        JSON.stringify(sense.examples),
+      ) as { id: number } | undefined;
+
+      if (!row) {
+        skipped++;
+        continue;
      }

-      // Glosses
-      for (const [lang, glossList] of Object.entries(record.glosses)) {
-        for (const text of glossList) {
-          insertGloss.run(record.source_id, lang, text);
-          glosses++;
-        }
-      }
+      entries++;

-      // Examples
-      for (const [lang, exList] of Object.entries(record.examples)) {
-        for (const example of exList) {
-          insertExample.run(
-            record.source_id,
-            lang,
-            example.text,
-            example.source,
-          );
-          examples++;
-        }
-      }
-
-      // CEFR source votes
-      for (const [lang, langVotes] of Object.entries(record.votes)) {
-        for (const [word, vote] of Object.entries(
-          langVotes as Record<string, { cefr_source: string }>,
-        )) {
-          insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
-          cefrVotes++;
-        }
+      for (const t of sense.translations) {
+        insertTranslation.run(
+          row.id,
+          t.target_lang,
+          t.word,
+          t.sense_hint ?? null,
+        );
+        translations++;
      }
    }

-    return { synsets, translations, glosses, examples, cefrVotes };
+    return { entries, translations, skipped };
  });

  const counts = importAll();

-  console.log(`  synsets:      ${counts.synsets.toLocaleString()}`);
+  console.log(`  entries:      ${counts.entries.toLocaleString()}`);
  console.log(`  translations: ${counts.translations.toLocaleString()}`);
-  console.log(`  glosses:      ${counts.glosses.toLocaleString()}`);
-  console.log(`  examples:     ${counts.examples.toLocaleString()}`);
-  console.log(`  cefr votes:   ${counts.cefrVotes.toLocaleString()}`);
+  console.log(`  skipped:      ${counts.skipped.toLocaleString()}`);

  db.close();
  console.log("\nImport complete.");
@@ -189,7 +102,7 @@ export async function importStage2(): Promise<void> {

 export function isImported(): boolean {
  const db = openDb();
-  const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+  const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
    count: number;
  };
  db.close();
@@ -200,20 +113,20 @@ export function isImported(): boolean {

 async function main(): Promise<void> {
  const db = openDb();
-  const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+  const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
    count: number;
  };
  db.close();

  if (row.count > 0) {
    console.log(
-      `pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`,
+      `pipeline.db already contains ${row.count.toLocaleString()} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }

-  await importStage2();
+  await importKaikki();
 }

 if (import.meta.url === `file://${process.argv[1]}`) {
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@@ -1,62 +1,58 @@
 -- ── Base data ─────────────────────────────────────────────────────────────────
--- Imported from stage 2 JSON on first run. Never mutated after import.
+-- Imported from Kaikki on first run. Never mutated after import.

-CREATE TABLE IF NOT EXISTS synsets (
-  source_id TEXT PRIMARY KEY,
-  pos       TEXT NOT NULL
+CREATE TABLE IF NOT EXISTS entries (
+  id          INTEGER PRIMARY KEY,
+  headword    TEXT    NOT NULL,
+  language    TEXT    NOT NULL,
+  pos         TEXT    NOT NULL,
+  sense_index INTEGER NOT NULL DEFAULT 0,
+  gloss       TEXT,
+  examples    TEXT    NOT NULL DEFAULT '[]', -- JSON array of strings
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
+  UNIQUE (headword, language, pos, sense_index)
 );

 CREATE TABLE IF NOT EXISTS translations (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  word      TEXT    NOT NULL,
-  UNIQUE (source_id, language, word)
-);
-
-CREATE TABLE IF NOT EXISTS glosses (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS examples (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS cefr_source_votes (
-  id             INTEGER PRIMARY KEY,
-  translation_id INTEGER NOT NULL REFERENCES translations(id),
-  cefr_level     TEXT    NOT NULL,
-  UNIQUE (translation_id)
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  sense_hint  TEXT,
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
+  UNIQUE (entry_id, target_lang, word)
 );

 -- ── Status tracking ───────────────────────────────────────────────────────────
--- One row per synset per model per stage. Drives resumability.
+-- One row per entry per model per stage. Drives resumability.
+-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
 -- stage:  round1 | round2 | tiebreak
 -- status: pending | complete | needs_review | flagged

 CREATE TABLE IF NOT EXISTS run_status (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL,
+  entry_id   INTEGER NOT NULL,
  model_name TEXT    NOT NULL,
  stage      TEXT    NOT NULL,
  status     TEXT    NOT NULL,
  created_at TEXT    NOT NULL DEFAULT (datetime('now')),
  updated_at TEXT    NOT NULL DEFAULT (datetime('now')),
-  UNIQUE (source_id, model_name, stage)
+  UNIQUE (entry_id, model_name, stage)
 );

 -- ── Round 1 output ────────────────────────────────────────────────────────────
--- One row per translation/language per model. Written atomically per record.
+-- Written atomically per entry per model.
 -- Unique constraints enforce one model one vote.

-CREATE TABLE IF NOT EXISTS model_cefr_votes (
+CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  model_name TEXT    NOT NULL,
+  cefr_level TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  model_name     TEXT    NOT NULL,
@@ -64,38 +60,29 @@ CREATE TABLE IF NOT EXISTS model_cefr_votes (
  UNIQUE (translation_id, model_name)
 );

-CREATE TABLE IF NOT EXISTS model_translation_rejections (
-  id             INTEGER PRIMARY KEY,
-  translation_id INTEGER NOT NULL REFERENCES translations(id),
-  model_name     TEXT    NOT NULL,
-  UNIQUE (translation_id, model_name)
-);
-
 CREATE TABLE IF NOT EXISTS generated_glosses (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name)
 );

 CREATE TABLE IF NOT EXISTS generated_examples (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name)
 );

-CREATE TABLE IF NOT EXISTS generated_descriptions (
-  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
-  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
-  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+CREATE TABLE IF NOT EXISTS generated_translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  model_name  TEXT    NOT NULL,
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name, target_lang)
 );

 -- ── Round 2 output ────────────────────────────────────────────────────────────
@ -116,20 +103,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
  UNIQUE (example_id, model_name)
 );

-CREATE TABLE IF NOT EXISTS description_candidate_votes (
+CREATE TABLE IF NOT EXISTS translation_candidate_votes (
  id             INTEGER PRIMARY KEY,
-  description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
+  translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
  model_name     TEXT    NOT NULL,
-  UNIQUE (description_id, model_name)
+  UNIQUE (translation_id, model_name)
 );

 -- ── Resolved output ───────────────────────────────────────────────────────────
 -- Written by merge. Never updated after writing.
-- Only fully resolved records are written here — no nulls, no flags.
+-- Only fully resolved records are written here — no nulls.
 -- Absence of a row means unresolved. Flagged status tracked in run_status.
-- source: omw | cefr | model_name
+-- source: kaikki | model_name

-CREATE TABLE IF NOT EXISTS resolved_translations (
+CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  cefr_level TEXT    NOT NULL,
+  difficulty TEXT    NOT NULL,
+  UNIQUE (entry_id)
+);
+
+CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  cefr_level     TEXT    NOT NULL,
@ -138,27 +133,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
 );

 CREATE TABLE IF NOT EXISTS resolved_glosses (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL,
-  UNIQUE (source_id, language)
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  text       TEXT    NOT NULL,
+  source     TEXT    NOT NULL,
+  UNIQUE (entry_id)
 );

 CREATE TABLE IF NOT EXISTS resolved_examples (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  text       TEXT    NOT NULL,
+  source     TEXT    NOT NULL
 );

-CREATE TABLE IF NOT EXISTS resolved_descriptions (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL,
-  UNIQUE (source_id, language)
+CREATE TABLE IF NOT EXISTS resolved_generated_translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  source      TEXT    NOT NULL,
+  UNIQUE (entry_id, target_lang)
 );
--- a/data-pipeline/stage-1-extract/scripts/extract.py
+++ b/data-pipeline/stage-1-extract/scripts/extract.py
@ -1,204 +0,0 @@
-"""
-data-pipeline/stage-1-extract/scripts/extract.py
-
-Extract all synsets from the Open Multilingual Wordnet (OMW) for all
-supported languages and parts of speech.
-
-Output: one JSON file per language, written to stage-1-extract/output/
-  en.json, it.json, es.json, de.json, fr.json
-
-Each file is a JSON array of synset records:
-  {
-    "source_id": "ili:i12345",
-    "pos": "noun",
-    "translations": { "en": ["dog", "canine"], "it": ["cane"] },
-    "glosses":      { "en": ["a domesticated animal..."] },
-    "examples":     { "en": ["the dog barked at the stranger"] }
-  }
-
-Usage:
-  python stage-1-extract/scripts/extract.py
-  python stage-1-extract/scripts/extract.py --sample
-
-Prerequisites:
-  pip install wn
-  python -m wn download omw-en:1.4
-  python -m wn download omw-it:1.4
-  python -m wn download omw-de:1.4
-  python -m wn download omw-es:1.4
-  python -m wn download omw-fr:1.4
-"""
-
-import json
-import sys
-from pathlib import Path
-
-import wn
-
-SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]
-POS_MAP: dict[str, str] = {
-    "n": "noun",
-    "v": "verb",
-    "a": "adjective",
-    "s": "adjective",  # adjective satellite — collapsed into adjective
-    "r": "adverb",
-}
-
-
-def extract_all(
-    output_dir: str = "stage-1-extract/output", sample: bool = False
-) -> None:
-    out = Path(output_dir)
-    out.mkdir(parents=True, exist_ok=True)
-
-    sample_size = 100 if sample else None
-
-    # Load one Wordnet object per language up front.
-    print("Loading wordnets...")
-    wordnets: dict[str, wn.Wordnet] = {}
-    for lang in SUPPORTED_LANGUAGE_CODES:
-        try:
-            wordnets[lang] = wn.Wordnet(lang=lang)
-            synset_count = len(wordnets[lang].synsets())
-            print(f"  {lang}: {synset_count:,} total synsets")
-        except wn.Error as e:
-            print(f"  ERROR loading {lang}: {e}")
-            print(f"  Run: python -m wn download omw-{lang}:1.4")
-            sys.exit(1)
-
-    # Collect per-ILI data across all languages and POS.
-    print("\nExtracting synsets...")
-    by_ili: dict[str, dict] = {}
-
-    for lang, wnet in wordnets.items():
-        for omw_pos, pos_label in POS_MAP.items():
-            synsets = wnet.synsets(pos=omw_pos)
-            covered = 0
-            for synset in synsets:
-                ili = synset.ili
-                if not ili:
-                    continue
-                covered += 1
-
-                lemmas = list(dict.fromkeys(str(lemma) for lemma in synset.lemmas()))
-                defns = [d for d in synset.definitions() if d]
-                examples = [e for e in synset.examples() if e]
-
-                if ili not in by_ili:
-                    by_ili[ili] = {"pos": pos_label}
-
-                if lang not in by_ili[ili]:
-                    by_ili[ili][lang] = {
-                        "lemmas": lemmas,
-                        "glosses": defns,
-                        "examples": examples,
-                    }
-                else:
-                    # ILI already exists for this language — merge data.
-                    # Happens when 'a' and 's' both map to adjective for the
-                    # same ILI. Deduplicate to avoid repeated entries.
-                    existing = by_ili[ili][lang]
-                    existing["lemmas"] = list(
-                        dict.fromkeys(existing["lemmas"] + lemmas)
-                    )
-                    existing["glosses"] = list(
-                        dict.fromkeys(existing["glosses"] + defns)
-                    )
-                    existing["examples"] = list(
-                        dict.fromkeys(existing["examples"] + examples)
-                    )
-
-            print(f"  {lang} {pos_label}: {covered:,} synsets with ILI")
-
-    # Build records and write single combined output file.
-    print("\nBuilding records...")
-    ilis = sorted(by_ili.keys())
-    if sample_size:
-        ilis = ilis[:sample_size]
-
-    records: list[dict] = []
-    for ili in ilis:
-        data = by_ili[ili]
-        record: dict = {
-            "source_id": f"ili:{ili}",
-            "pos": data["pos"],
-            "translations": {},
-            "glosses": {},
-            "examples": {},
-        }
-
-        for key, value in data.items():
-            if key == "pos":
-                continue
-            lang = key
-            if value["lemmas"]:
-                record["translations"][lang] = value["lemmas"]
-            if value["glosses"]:
-                record["glosses"][lang] = value["glosses"]
-            if value["examples"]:
-                record["examples"][lang] = value["examples"]
-
-        records.append(record)
-
-    output_file = out / "omw.json"
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(records, f, indent=2, ensure_ascii=False)
-
-    print(f"\nWrote {len(records):,} synsets → {output_file}")
-    _print_coverage(records)
-
-
-def _print_coverage(records: list[dict]) -> None:
-    """Print per-language translation, gloss, and example counts."""
-    lang_stats: dict[str, dict[str, int]] = {}
-    for lang in SUPPORTED_LANGUAGE_CODES:
-        lang_stats[lang] = {"translations": 0, "glosses": 0, "examples": 0}
-
-    pos_stats: dict[str, int] = {}
-
-    for r in records:
-        pos = r["pos"]
-        pos_stats[pos] = pos_stats.get(pos, 0) + 1
-
-        for lang, lemmas in r["translations"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["translations"] += len(lemmas)
-        for lang, gloss_list in r["glosses"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["glosses"] += len(gloss_list)
-        for lang, example_list in r["examples"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["examples"] += len(example_list)
-
-    print("\nPOS breakdown:")
-    for pos, count in sorted(pos_stats.items()):
-        print(f"  {pos}: {count:,}")
-
-    print("\nCoverage per language:")
-    for lang, counts in lang_stats.items():
-        t = counts["translations"]
-        g = counts["glosses"]
-        e = counts["examples"]
-        total = len(records)
-        print(
-            f"  {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {(t / total):.1f} translations/synset)"
-        )
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
-    parser.add_argument(
-        "--output-dir",
-        default="stage-1-extract/output",
-        help="Output directory for JSON files",
-    )
-    parser.add_argument(
-        "--sample",
-        action="store_true",
-        help="Extract only 100 synsets per language for inspection",
-    )
-    args = parser.parse_args()
-
-    extract_all(output_dir=args.output_dir, sample=args.sample)
--- a/data-pipeline/stage-1-extract/scripts/extract.ts
+++ b/data-pipeline/stage-1-extract/scripts/extract.ts
@ -0,0 +1,209 @@
+import fs from "node:fs";
+import path from "node:path";
+import readline from "node:readline";
+import { fileURLToPath } from "node:url";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/** A translation item attached to a Kaikki sense. */
+interface KaikkiTranslation {
+  /** Language code; consulted before `lang_code`. */
+  code?: string;
+  /** Fallback language code, used when `code` is absent. */
+  lang_code?: string;
+  /** The translated word. */
+  word?: string;
+  /** Free-text note; becomes the extracted `sense_hint`. */
+  sense?: string;
+}
+
+/** An example attached to a Kaikki sense. */
+interface KaikkiExample {
+  text?: string;
+}
+
+/** One sense of a Kaikki entry. */
+interface KaikkiSense {
+  glosses?: string[];
+  examples?: KaikkiExample[];
+  translations?: KaikkiTranslation[];
+}
+
+/** One parsed line of the Kaikki JSONL source. */
+interface KaikkiEntry {
+  word?: string;
+  pos?: string;
+  senses?: KaikkiSense[];
+}
+
+/** A translation kept for an extracted sense. */
+type ExtractedTranslation = {
+  target_lang: SupportedLanguageCode;
+  word: string;
+  sense_hint: string | null;
+};
+
+/** One output record: a single retained sense of a headword. */
+export type ExtractedSense = {
+  headword: string;
+  pos: SupportedPos;
+  /** Zero-based position among the senses retained for the entry. */
+  sense_index: number;
+  /** First gloss of the sense, or null when the sense has none. */
+  gloss: string | null;
+  examples: string[];
+  translations: ExtractedTranslation[];
+};
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+// ES modules do not define __dirname, so derive it from this module's URL.
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+// Absolute locations of the Kaikki JSONL input and the JSON output,
+// both resolved relative to this script's directory.
+const SOURCE_FILE = path.resolve(
+  __dirname,
+  "../sources/kaikki.org-dictionary-English.jsonl",
+);
+const OUTPUT_FILE = path.resolve(__dirname, "../output/en.json");
+
+const PATHS = { source: SOURCE_FILE, output: OUTPUT_FILE };
+
+// Kaikki part-of-speech tag → supported POS. Tags missing here are skipped.
+const POS_MAP: Record<string, SupportedPos> = {
+  noun: "noun",
+  verb: "verb",
+  adj: "adjective",
+  adv: "adverb",
+};
+
+// Set form of the supported language codes for constant-time membership tests.
+const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/**
+ * Translate a Kaikki part-of-speech tag into a supported POS.
+ *
+ * Only keys declared directly on POS_MAP count as a match. A bare index
+ * lookup would also find inherited object members ("constructor",
+ * "toString", ...) and return a non-POS value that passes truthiness checks.
+ *
+ * @param kaikkiPos - Raw `pos` value from a Kaikki entry.
+ * @returns The mapped POS, or null when the tag is not a key of POS_MAP.
+ */
+function mapPos(kaikkiPos: string): SupportedPos | null {
+  if (!Object.prototype.hasOwnProperty.call(POS_MAP, kaikkiPos)) return null;
+  return POS_MAP[kaikkiPos] ?? null;
+}
+
+/**
+ * Report whether a gloss marks its sense as an abbreviation.
+ *
+ * @param gloss - Gloss text to inspect.
+ * @returns True when the gloss begins, ignoring case, with "abbreviation of".
+ */
+function isAbbreviation(gloss: string): boolean {
+  const lowered = gloss.toLowerCase();
+  const isMatch = lowered.startsWith("abbreviation of");
+  return isMatch;
+}
+
+/**
+ * Collect the translations of a sense that target a supported language.
+ *
+ * Items are dropped when they have no language code, when the code is not
+ * in SUPPORTED_LANG_SET, when the code is "en", or when the word is blank.
+ * Duplicate (language, word) pairs are emitted once, in first-seen order.
+ *
+ * @param sense - Kaikki sense whose `translations` are inspected.
+ * @returns The retained translations; empty when none qualify.
+ */
+function extractTranslations(
+  sense: KaikkiSense,
+): ExtractedSense["translations"] {
+  const seen = new Set<string>();
+  const result: ExtractedSense["translations"] = [];
+
+  for (const t of sense.translations ?? []) {
+    // Prefer `code`, falling back to `lang_code`.
+    const code = t.code ?? t.lang_code;
+    if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue;
+
+    const word = t.word?.trim();
+    if (!word) continue;
+
+    const key = `${code}:${word}`;
+    if (seen.has(key)) continue;
+    seen.add(key);
+
+    result.push({
+      target_lang: code as SupportedLanguageCode,
+      word,
+      // `||` rather than `??`: a blank hint trims to "" and must become null.
+      sense_hint: t.sense?.trim() || null,
+    });
+  }
+
+  return result;
+}
+
+/**
+ * Gather the non-empty example texts of a sense.
+ *
+ * @param sense - Kaikki sense whose `examples` are inspected.
+ * @returns Trimmed example texts in source order; blank or missing texts are omitted.
+ */
+function extractExamples(sense: KaikkiSense): string[] {
+  const texts: string[] = [];
+  for (const example of sense.examples ?? []) {
+    const trimmed = example.text?.trim();
+    if (trimmed) texts.push(trimmed);
+  }
+  return texts;
+}
+
+/**
+ * Convert one Kaikki entry into zero or more extracted senses.
+ *
+ * The entry is dropped entirely when its POS is not mapped or its headword
+ * is blank. A sense is dropped when its first gloss starts with
+ * "abbreviation of" or when it has no translation in a supported language.
+ *
+ * @param entry - Parsed Kaikki JSONL entry.
+ * @returns The retained senses; `sense_index` counts only retained senses,
+ *   starting at 0 for each entry.
+ */
+function processEntry(entry: KaikkiEntry): ExtractedSense[] {
+  const pos = mapPos(entry.pos ?? "");
+  if (!pos) return [];
+
+  const headword = entry.word?.trim();
+  if (!headword) return [];
+
+  const results: ExtractedSense[] = [];
+
+  for (const sense of entry.senses ?? []) {
+    // `||` rather than `??`: a blank gloss trims to "" and must become null.
+    const gloss = sense.glosses?.[0]?.trim() || null;
+
+    // Skip abbreviation senses
+    if (gloss && isAbbreviation(gloss)) continue;
+
+    const translations = extractTranslations(sense);
+
+    // Skip senses with no translations in our supported languages
+    if (translations.length === 0) continue;
+
+    results.push({
+      headword,
+      pos,
+      // Equals the number of senses already retained for this entry.
+      sense_index: results.length,
+      gloss,
+      examples: extractExamples(sense),
+      translations,
+    });
+  }
+
+  return results;
+}
+
+// ── Main ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Stream the Kaikki JSONL source, extract the supported senses, and write
+ * them as one JSON array to PATHS.output.
+ *
+ * Blank lines are ignored. Lines that are not valid JSON, or whose JSON
+ * value is not an object, are reported with their physical line number and
+ * skipped. Entries that yield no senses are counted as skipped.
+ *
+ * @param sampleLimit - When set to a positive number, stop after this many
+ *   entries have produced at least one sense. Omitted (or 0) means no limit.
+ * @returns Resolves once the output file has been written.
+ */
+async function extract(sampleLimit?: number): Promise<void> {
+  console.log("Extracting Kaikki English data...");
+  console.log(`  Source: ${PATHS.source}`);
+
+  if (sampleLimit) {
+    console.log(`  Sample mode: ${sampleLimit} entries`);
+  }
+
+  await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true });
+
+  const fileStream = fs.createReadStream(PATHS.source);
+  const rl = readline.createInterface({
+    input: fileStream,
+    crlfDelay: Infinity,
+  });
+
+  const senses: ExtractedSense[] = [];
+  let lineNumber = 0; // physical line in the file, blank lines included
+  let linesRead = 0; // non-blank lines handed to the parser
+  let entriesProcessed = 0;
+  let entriesSkipped = 0;
+
+  try {
+    for await (const line of rl) {
+      lineNumber++;
+      if (!line.trim()) continue;
+      if (sampleLimit && entriesProcessed >= sampleLimit) break;
+
+      linesRead++;
+
+      let entry: KaikkiEntry;
+      try {
+        const parsed: unknown = JSON.parse(line);
+        // Valid JSON that is not an object (e.g. `null`) cannot be an entry.
+        if (typeof parsed !== "object" || parsed === null) {
+          throw new TypeError("JSON value is not an object");
+        }
+        entry = parsed as KaikkiEntry;
+      } catch {
+        console.warn(`  Warning: failed to parse line ${lineNumber}, skipping`);
+        continue;
+      }
+
+      const extracted = processEntry(entry);
+
+      if (extracted.length === 0) {
+        entriesSkipped++;
+        continue;
+      }
+
+      senses.push(...extracted);
+      entriesProcessed++;
+
+      if (entriesProcessed % 10_000 === 0) {
+        console.log(
+          `  Processed ${entriesProcessed.toLocaleString()} entries...`,
+        );
+      }
+    }
+  } finally {
+    // Leaving the loop early closes the readline interface but not its
+    // input, so release the file descriptor explicitly.
+    fileStream.destroy();
+  }
+
+  await fs.promises.writeFile(
+    PATHS.output,
+    JSON.stringify(senses, null, 2),
+    "utf-8",
+  );
+
+  console.log(`\nExtraction complete:`);
+  console.log(`  Lines read:         ${linesRead.toLocaleString()}`);
+  console.log(`  Entries processed:  ${entriesProcessed.toLocaleString()}`);
+  console.log(`  Entries skipped:    ${entriesSkipped.toLocaleString()}`);
+  console.log(`  Senses extracted:   ${senses.length.toLocaleString()}`);
+  console.log(`  Output:             ${PATHS.output}`);
+}
+
+/** Script entry point: runs the extraction in sample mode. */
+async function main(): Promise<void> {
+  // Sample limit is hardcoded for initial testing — drop it for a full extraction.
+  const sampleLimit = 500;
+  await extract(sampleLimit);
+}
+
+// Report any failure and exit with a non-zero status.
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
--- a/data-pipeline/stage-2-annotate/scripts/annotate.ts
+++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts
@ -1,227 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
-
-// ── Types ────────────────────────────────────────────────────────────────────
-
-type OmwExample = { text: string; source: "omw" };
-
-type CefrExample = { text: string; source: "cefr" };
-
-type Example = OmwExample | CefrExample;
-
-type OmwRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, string[]>>;
-};
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
-
-type CefrSourceEntry = {
-  word: string;
-  pos: string;
-  cefr_level: string;
-  example_sentence_native?: string;
-};
-
-type ConflictEntry = {
-  word: string;
-  pos: string;
-  language: SupportedLanguageCode;
-  levels: string[];
-};
-
-// ── Constants ─────────────────────────────────────────────────────────────────
-
-const POS_NORMALIZE: Record<string, SupportedPos> = {
-  noun: "noun",
-  n: "noun",
-  nom: "noun", // French
-  verb: "verb",
-  verbs: "verb",
-  v: "verb",
-  v1: "verb",
-  adjective: "adjective",
-  adjektiv: "adjective", // German
-  adj: "adjective",
-  adverb: "adverb",
-  adverbs: "adverb",
-  adv: "adverb",
-};
-
-const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
-
-const PATHS = {
-  omw: "stage-1-extract/output/omw.json",
-  cefrDir: "stage-2-annotate/sources/cefr",
-  outputDir: "stage-2-annotate/output",
-};
-
-// ── CEFR source loading ───────────────────────────────────────────────────────
-
-type CefrIndex = Map<string, { level: string; example?: string }>;
-
-async function loadCefrSource(
-  lang: SupportedLanguageCode,
-): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
-  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
-  const raw = await fs.readFile(filepath, "utf-8");
-  const entries = JSON.parse(raw) as CefrSourceEntry[];
-
-  // First pass — detect conflicts.
-  // Structure: "word|pos" -> Set of CEFR levels seen
-  const seen = new Map<string, Set<string>>();
-
-  for (const entry of entries) {
-    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
-    if (!pos) continue;
-    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
-
-    const key = `${entry.word.toLowerCase().trim()}|${pos}`;
-    if (!seen.has(key)) seen.set(key, new Set());
-    seen.get(key)!.add(entry.cefr_level);
-  }
-
-  const conflicts: ConflictEntry[] = [];
-  for (const [key, levels] of seen.entries()) {
-    if (levels.size > 1) {
-      const [word, pos] = key.split("|") as [string, string];
-      conflicts.push({ word, pos, language: lang, levels: [...levels] });
-    }
-  }
-
-  // Second pass — build index, skip conflicting entries.
-  const conflictKeys = new Set(conflicts.map((c) => `${c.word}|${c.pos}`));
-
-  const index: CefrIndex = new Map();
-  for (const entry of entries) {
-    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
-    if (!pos) continue;
-    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
-
-    const key = `${entry.word.toLowerCase().trim()}|${pos}`;
-    if (conflictKeys.has(key)) continue;
-
-    index.set(key, {
-      level: entry.cefr_level,
-      ...(entry.example_sentence_native
-        ? { example: entry.example_sentence_native }
-        : {}),
-    });
-  }
-
-  return { index, conflicts };
-}
-
-// ── Annotation ────────────────────────────────────────────────────────────────
-
-async function annotate(): Promise<void> {
-  // Load OMW records
-  console.log("Reading OMW extract...");
-  const raw = await fs.readFile(PATHS.omw, "utf-8");
-  const omwRecords = JSON.parse(raw) as OmwRecord[];
-  console.log(`  Loaded ${omwRecords.length.toLocaleString()} synsets`);
-
-  // Load CEFR sources for all languages
-  console.log("\nLoading CEFR source files...");
-  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
-  const allConflicts: ConflictEntry[] = [];
-
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    const { index, conflicts } = await loadCefrSource(lang);
-    cefrIndexes.set(lang, index);
-    allConflicts.push(...conflicts);
-    console.log(
-      `  ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`,
-    );
-  }
-
-  // Write conflicts file
-  await fs.mkdir(PATHS.outputDir, { recursive: true });
-  await fs.writeFile(
-    path.join(PATHS.outputDir, "conflicts.json"),
-    JSON.stringify(allConflicts, null, 2),
-    "utf-8",
-  );
-  console.log(
-    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
-  );
-
-  // Annotate and write one file per language
-  console.log("\nAnnotating...");
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    const index = cefrIndexes.get(lang)!;
-    const records: AnnotatedRecord[] = [];
-    let matched = 0;
-
-    for (const record of omwRecords) {
-      const annotated: AnnotatedRecord = {
-        source_id: record.source_id,
-        pos: record.pos,
-        translations: record.translations,
-        glosses: record.glosses,
-        examples: {},
-        votes: {},
-      };
-
-      // Convert OMW examples to typed format
-      for (const [l, exList] of Object.entries(record.examples)) {
-        annotated.examples[l as SupportedLanguageCode] = exList.map((text) => ({
-          text,
-          source: "omw" as const,
-        }));
-      }
-
-      // Match translations for this language against CEFR index
-      const langTranslations = record.translations[lang] ?? [];
-      for (const word of langTranslations) {
-        const key = `${word.toLowerCase().trim()}|${record.pos}`;
-        const cefrEntry = index.get(key);
-        if (!cefrEntry) continue;
-
-        matched++;
-
-        // Add CEFR vote
-        if (!annotated.votes[lang]) annotated.votes[lang] = {};
-        annotated.votes[lang][word] = { cefr_source: cefrEntry.level };
-
-        // Add native example if present
-        if (cefrEntry.example) {
-          if (!annotated.examples[lang]) annotated.examples[lang] = [];
-          annotated.examples[lang].push({
-            text: cefrEntry.example,
-            source: "cefr" as const,
-          });
-        }
-      }
-
-      records.push(annotated);
-    }
-
-    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
-    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
-    console.log(
-      `  ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
-    );
-  }
-}
-
-// ── Main ─────────────────────────────────────────────────────────────────────
-
-annotate().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
--- a/data-pipeline/stage-2-annotate/sources/cefr/de.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/de.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/en.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/en.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/es.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/es.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/fr.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/fr.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/it.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/it.json
--- a/data-pipeline/tests/fixtures/annotated.fixture.json
+++ b/data-pipeline/tests/fixtures/annotated.fixture.json
@ -1,170 +0,0 @@
-[
-  {
-    "_fixture": "noun_with_cefr_vote",
-    "source_id": "ili:i100955",
-    "pos": "noun",
-    "translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
-    "glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
-    "examples": {
-      "en": [
-        { "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
-      ]
-    },
-    "votes": { "en": { "grain": { "cefr_source": "B1" } } }
-  },
-  {
-    "_fixture": "verb_no_votes_no_translations",
-    "source_id": "ili:i21779",
-    "pos": "verb",
-    "translations": { "en": ["respire"] },
-    "glosses": {
-      "en": [
-        "undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
-      ]
-    },
-    "examples": {},
-    "votes": {}
-  },
-  {
-    "_fixture": "verb_with_cefr_vote_all_languages",
-    "source_id": "ili:i21778",
-    "pos": "verb",
-    "translations": {
-      "en": ["breathe", "take a breath", "respire", "suspire"],
-      "it": ["respirare"],
-      "es": ["aspirar", "respirar"],
-      "de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
-      "fr": ["inspirer", "respirer"]
-    },
-    "glosses": {
-      "en": ["draw air into, and expel out of, the lungs"],
-      "de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
-    },
-    "examples": {
-      "en": [
-        {
-          "text": "I can breathe better when the air is clean",
-          "source": "omw"
-        },
-        { "text": "The patient is respiring", "source": "omw" }
-      ]
-    },
-    "votes": { "en": { "breathe": { "cefr_source": "A1" } } }
-  },
-  {
-    "_fixture": "adjective_all_languages_multiple_translations",
-    "source_id": "ili:i10007",
-    "pos": "adjective",
-    "translations": {
-      "en": ["possible"],
-      "it": [
-        "attuabile",
-        "effettuabile",
-        "eseguibile",
-        "fattibile",
-        "operabile",
-        "possibile",
-        "producibile",
-        "realizzabile"
-      ],
-      "es": ["posible"],
-      "de": [
-        "möglich",
-        "denkbar",
-        "eventuell",
-        "möglicherweise",
-        "allfällig",
-        "etwaig",
-        "gegebenenfalls",
-        "eventuell"
-      ],
-      "fr": ["possible", "éventuel"]
-    },
-    "glosses": {
-      "en": ["capable of happening or existing"],
-      "de": ["in der Lage, zu geschehen oder zu existieren"]
-    },
-    "examples": {
-      "en": [
-        { "text": "a breakthrough may be possible next year", "source": "omw" },
-        { "text": "anything is possible", "source": "omw" },
-        { "text": "warned of possible consequences", "source": "omw" }
-      ]
-    },
-    "votes": { "en": { "possible": { "cefr_source": "A2" } } }
-  },
-  {
-    "_fixture": "adjective_multiple_de_votes_cefr_examples",
-    "source_id": "ili:i10000",
-    "pos": "adjective",
-    "translations": {
-      "en": ["negative"],
-      "de": [
-        "dürftig",
-        "zu wünschen übrig lassen",
-        "schlecht",
-        "widrig",
-        "ungut",
-        "lausig",
-        "negativ",
-        "von Nachteil",
-        "schädlich",
-        "nachteilig",
-        "ungünstig"
-      ],
-      "fr": ["négatif", "strictement négatif"]
-    },
-    "glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
-    "examples": {
-      "en": [{ "text": "a negative number", "source": "omw" }],
-      "de": [
-        { "text": "Die Beweise waren dürftig.", "source": "cefr" },
-        { "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
-        {
-          "text": "Trotz widriger Umstände haben sie es geschafft.",
-          "source": "cefr"
-        },
-        {
-          "text": "Er hatte ein ungutes Gefühl bei der Sache.",
-          "source": "cefr"
-        },
-        { "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
-        {
-          "text": "Rauchen ist schädlich für die Gesundheit.",
-          "source": "cefr"
-        },
-        {
-          "text": "Diese Entscheidung könnte nachteilig sein.",
-          "source": "cefr"
-        },
-        {
-          "text": "Das Wetter ist heute ungünstig für einen Ausflug.",
-          "source": "cefr"
-        }
-      ]
-    },
-    "votes": {
-      "de": {
-        "dürftig": { "cefr_source": "C1" },
-        "schlecht": { "cefr_source": "A1" },
-        "widrig": { "cefr_source": "C1" },
-        "ungut": { "cefr_source": "B2" },
-        "negativ": { "cefr_source": "A2" },
-        "schädlich": { "cefr_source": "B1" },
-        "nachteilig": { "cefr_source": "B1" },
-        "ungünstig": { "cefr_source": "B2" }
-      }
-    }
-  },
-  {
-    "_fixture": "adverb_no_votes",
-    "source_id": "ili:i18157",
-    "pos": "adverb",
-    "translations": { "en": ["a cappella"], "es": ["a capella"] },
-    "glosses": { "en": ["without musical accompaniment"] },
-    "examples": {
-      "en": [{ "text": "they performed a cappella", "source": "omw" }]
-    },
-    "votes": {}
-  }
-]
--- a/data-pipeline/tests/fixtures/conflicts.fixture.json
+++ b/data-pipeline/tests/fixtures/conflicts.fixture.json
@ -1,4 +0,0 @@
-[
-  { "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
-  { "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
-]
--- a/data-pipeline/tests/validation/db-import.validation.test.ts
+++ b/data-pipeline/tests/validation/db-import.validation.test.ts
@ -1,237 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, it, expect, beforeAll } from "vitest";
-import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
-
-// ── Paths ─────────────────────────────────────────────────────────────────────
-
-const DB_PATH = path.resolve("db/pipeline.db");
-const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
-const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");
-
-// ── Helpers ───────────────────────────────────────────────────────────────────
-
-async function dbExists(): Promise<boolean> {
-  try {
-    await fs.access(DB_PATH);
-    return true;
-  } catch {
-    return false;
-  }
-}
-
-// ── Tests ─────────────────────────────────────────────────────────────────────
-
-describe("pipeline.db — import validation", () => {
-  let db: import("better-sqlite3").Database;
-  let expectedSynsetCount: number;
-  let expectedCefrVoteCount: number;
-
-  beforeAll(async () => {
-    if (!(await dbExists())) return;
-
-    const Database = (await import("better-sqlite3")).default;
-    db = new Database(DB_PATH, { readonly: true });
-    db.pragma("foreign_keys = ON");
-
-    // Count expected synsets from omw.json
-    const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
-    const omwRecords = JSON.parse(omwRaw) as unknown[];
-    expectedSynsetCount = omwRecords.length;
-
-    // Count expected CEFR votes from stage 2 annotated files.
-    // Merge all language files the same way the import script does —
-    // use en.json as base and merge votes from the other language files.
-    const byId = new Map<string, AnnotatedRecord>();
-
-    const baseRaw = await fs.readFile(
-      path.join(ANNOTATED_DIR, "en.json"),
-      "utf-8",
-    );
-    const base = JSON.parse(baseRaw) as AnnotatedRecord[];
-    for (const record of base) {
-      byId.set(record.source_id, record);
-    }
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      if (lang === "en") continue;
-      const raw = await fs.readFile(
-        path.join(ANNOTATED_DIR, `${lang}.json`),
-        "utf-8",
-      );
-      const records = JSON.parse(raw) as AnnotatedRecord[];
-      for (const record of records) {
-        const base = byId.get(record.source_id);
-        if (!base) continue;
-        for (const [l, langVotes] of Object.entries(record.votes)) {
-          if (!base.votes[l as SupportedLanguageCode]) {
-            base.votes[l as SupportedLanguageCode] = {};
-          }
-          Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
-        }
-      }
-    }
-
-    expectedCefrVoteCount = 0;
-    for (const record of byId.values()) {
-      for (const langVotes of Object.values(record.votes)) {
-        expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
-      }
-    }
-  }, 120_000);
-
-  it("pipeline.db exists — skipping all tests if not", async () => {
-    const exists = await dbExists();
-    if (!exists) {
-      console.warn(
-        "\n  pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
-      );
-    }
-    expect(exists).toBe(true);
-  });
-
-  it("synsets count matches omw.json", () => {
-    if (!db) return;
-    const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
-      count: number;
-    };
-    expect(row.count).toBe(expectedSynsetCount);
-  });
-
-  it("every synset has at least one translation", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT s.source_id
-        FROM synsets s
-        LEFT JOIN translations t ON t.source_id = s.source_id
-        WHERE t.id IS NULL
-      `,
-      )
-      .all() as { source_id: string }[];
-
-    const errors = rows.map((r) => `${r.source_id}: no translations`);
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every translation belongs to a valid synset", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT t.id, t.source_id
-        FROM translations t
-        LEFT JOIN synsets s ON s.source_id = t.source_id
-        WHERE s.source_id IS NULL
-      `,
-      )
-      .all() as { id: number; source_id: string }[];
-
-    const errors = rows.map(
-      (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every cefr_source_vote references a valid translation", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT v.id, v.translation_id
-        FROM cefr_source_votes v
-        LEFT JOIN translations t ON t.id = v.translation_id
-        WHERE t.id IS NULL
-      `,
-      )
-      .all() as { id: number; translation_id: number }[];
-
-    const errors = rows.map(
-      (r) =>
-        `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("cefr_source_votes count matches stage 2 annotated output", () => {
-    if (!db) return;
-    const row = db
-      .prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
-      .get() as { count: number };
-    expect(row.count).toBe(expectedCefrVoteCount);
-  });
-
-  it("every example has a valid source", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT source_id, language, source
-        FROM examples
-        WHERE source NOT IN ('omw', 'cefr')
-      `,
-      )
-      .all() as { source_id: string; language: string; source: string }[];
-
-    const errors = rows.map(
-      (r) =>
-        `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every example belongs to a valid synset", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT e.id, e.source_id
-        FROM examples e
-        LEFT JOIN synsets s ON s.source_id = e.source_id
-        WHERE s.source_id IS NULL
-      `,
-      )
-      .all() as { id: number; source_id: string }[];
-
-    const errors = rows.map(
-      (r) => `example ${r.id}: references missing synset ${r.source_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every gloss belongs to a valid synset", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT g.id, g.source_id
-        FROM glosses g
-        LEFT JOIN synsets s ON s.source_id = g.source_id
-        WHERE s.source_id IS NULL
-      `,
-      )
-      .all() as { id: number; source_id: string }[];
-
-    const errors = rows.map(
-      (r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-});
--- a/data-pipeline/tests/validation/stage-1.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@ -1,166 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, it, expect } from "vitest";
-import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type OmwRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, string[]>>;
-};
-
-// ── Paths ─────────────────────────────────────────────────────────────────────
-
-const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
-
-// ── Helpers ───────────────────────────────────────────────────────────────────
-
-function isValidSourceId(id: string): boolean {
-  return /^ili:i\d+$/.test(id);
-}
-
-// ── Tests ─────────────────────────────────────────────────────────────────────
-
-describe("stage 1 — omw.json validation", () => {
-  let records: OmwRecord[];
-
-  it("file exists and is valid JSON", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-    expect(records).toBeDefined();
-  });
-
-  it("is a non-empty array", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-    expect(Array.isArray(records)).toBe(true);
-    expect(records.length).toBeGreaterThan(0);
-  });
-
-  it("every record has required fields", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-
-    for (const record of records) {
-      if (!record.source_id) {
-        errors.push(`missing source_id`);
-        continue;
-      }
-      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
-      if (!record.translations)
-        errors.push(`${record.source_id}: missing translations`);
-      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
-      if (!record.examples)
-        errors.push(`${record.source_id}: missing examples`);
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every source_id matches ili:i{number} pattern", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-
-    for (const record of records) {
-      if (!isValidSourceId(record.source_id)) {
-        errors.push(`invalid source_id: ${record.source_id}`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every source_id is unique", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const seen = new Set<string>();
-    const errors: string[] = [];
-
-    for (const record of records) {
-      if (seen.has(record.source_id)) {
-        errors.push(`duplicate source_id: ${record.source_id}`);
-      }
-      seen.add(record.source_id);
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every pos is a valid supported value", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-    const validPos = new Set(SUPPORTED_POS);
-
-    for (const record of records) {
-      if (!validPos.has(record.pos)) {
-        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every record has at least one translation in at least one language", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
-
-    for (const record of records) {
-      const langs = Object.keys(record.translations) as SupportedLanguageCode[];
-
-      if (langs.length === 0) {
-        errors.push(`${record.source_id}: no translations`);
-        continue;
-      }
-
-      for (const lang of langs) {
-        if (!validLangs.has(lang)) {
-          errors.push(`${record.source_id}: unsupported language "${lang}"`);
-        }
-        const words = record.translations[lang] ?? [];
-        if (words.length === 0) {
-          errors.push(`${record.source_id}: empty translations for "${lang}"`);
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("no duplicate translations within a single synset and language", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    const records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-
-    for (const record of records) {
-      for (const [lang, words] of Object.entries(record.translations)) {
-        const seen = new Set<string>();
-        for (const word of words) {
-          if (seen.has(word)) {
-            errors.push(
-              `${record.source_id} (${lang}): duplicate translation "${word}"`,
-            );
-          }
-          seen.add(word);
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-});
--- a/data-pipeline/tests/validation/stage-2.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-2.validation.test.ts
@ -1,218 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, it, expect, beforeAll } from "vitest";
-import {
-  SUPPORTED_POS,
-  SUPPORTED_LANGUAGE_CODES,
-  CEFR_LEVELS,
-} from "@lila/shared";
-import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
-
-type ConflictEntry = {
-  word: string;
-  pos: string;
-  language: SupportedLanguageCode;
-  levels: string[];
-};
-
-// ── Paths ─────────────────────────────────────────────────────────────────────
-
-const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
-
-// ── Tests ─────────────────────────────────────────────────────────────────────
-
-describe("stage 2 — annotated output validation", () => {
-  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
-  let conflicts: ConflictEntry[] = [];
-
-  beforeAll(async () => {
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const raw = await fs.readFile(
-        path.join(OUTPUT_DIR, `${lang}.json`),
-        "utf-8",
-      );
-      recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]);
-    }
-    const raw = await fs.readFile(
-      path.join(OUTPUT_DIR, "conflicts.json"),
-      "utf-8",
-    );
-    conflicts = JSON.parse(raw) as ConflictEntry[];
-  }, 60_000);
-
-  it("all five language files exist", async () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
-      try {
-        await fs.access(filePath);
-      } catch {
-        errors.push(`missing file: ${lang}.json`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("conflicts.json exists", async () => {
-    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
-    await expect(fs.access(filePath)).resolves.toBeUndefined();
-  });
-
-  it("every language file is a non-empty array", () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-      if (!Array.isArray(records)) {
-        errors.push(`${lang}.json: not an array`);
-      } else if (records.length === 0) {
-        errors.push(`${lang}.json: empty array`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every record has required fields", () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        if (!record.source_id) {
-          errors.push(`${lang}: record missing source_id`);
-          continue;
-        }
-        if (!record.pos)
-          errors.push(`${lang} ${record.source_id}: missing pos`);
-        if (!record.translations)
-          errors.push(`${lang} ${record.source_id}: missing translations`);
-        if (!record.glosses)
-          errors.push(`${lang} ${record.source_id}: missing glosses`);
-        if (record.examples === undefined)
-          errors.push(`${lang} ${record.source_id}: missing examples`);
-        if (record.votes === undefined)
-          errors.push(`${lang} ${record.source_id}: missing votes`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every pos is a valid supported value", () => {
-    const errors: string[] = [];
-    const validPos = new Set(SUPPORTED_POS);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        if (!validPos.has(record.pos)) {
-          errors.push(
-            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
-          );
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every example has text and a valid source", () => {
-    const errors: string[] = [];
-    const validSources = new Set(["omw", "cefr"]);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        for (const [l, examples] of Object.entries(record.examples)) {
-          for (const example of examples) {
-            if (!example.text) {
-              errors.push(
-                `${lang} ${record.source_id} (${l}): example missing text`,
-              );
-            }
-            if (!validSources.has(example.source)) {
-              errors.push(
-                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
-              );
-            }
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every cefr_source vote is a valid CEFR level", () => {
-    const errors: string[] = [];
-    const validLevels = new Set(CEFR_LEVELS);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        for (const [l, langVotes] of Object.entries(record.votes)) {
-          for (const [word, vote] of Object.entries(langVotes ?? {})) {
-            if (
-              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
-            ) {
-              errors.push(
-                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
-              );
-            }
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("conflicts.json entries have required fields and valid CEFR levels", () => {
-    const errors: string[] = [];
-    const validLevels = new Set(CEFR_LEVELS);
-    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
-
-    for (const entry of conflicts) {
-      if (!entry.word) errors.push(`conflict missing word`);
-      if (!entry.pos) errors.push(`conflict missing pos`);
-      if (!entry.language) {
-        errors.push(`conflict missing language`);
-      } else if (!validLangs.has(entry.language)) {
-        errors.push(`conflict invalid language "${entry.language}"`);
-      }
-      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
-        errors.push(`${entry.word}: levels must have at least 2 entries`);
-      } else {
-        for (const level of entry.levels) {
-          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
-            errors.push(`${entry.word}: invalid level "${level}"`);
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-});