refactoring documentation

WIP: checkpoint before stage-3 sub-stage rewrite
feat: enrich script working, redesigning to sub-stage architecture
2026-05-15 23:09:54 +02:00 · 2026-05-12 22:13:14 +02:00 · 2026-05-07 13:09:43 +02:00 · 2026-05-05 19:30:18 +02:00 · 2026-05-05 19:28:38 +02:00 · 2026-05-05 19:10:19 +02:00
50 changed files with 3107 additions and 1057495 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,6 +12,7 @@ __pycache__/

 data-pipeline/archive/
 data-pipeline/stage-1-extract/output/
+data-pipeline/stage-1-extract/sources/
 data-pipeline/stage-2-annotate/output/
 data-pipeline/stage-3-enrich/output/
 data-pipeline/stage-4-merge/output/
--- a/apps/api/src/controllers/gameController.test.ts
+++ b/apps/api/src/controllers/gameController.test.ts
@ -64,9 +64,14 @@ const validBody = {
 };

 const fakeTerms = [
-  { termId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
-  { termId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
-  { termId: "t3", sourceText: "house", targetText: "casa", sourceGloss: null },
+  { entryId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
+  { entryId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
+  {
+    entryId: "t3",
+    sourceText: "house",
+    targetText: "casa",
+    sourceGloss: "a building for living in",
+  },
 ];

 beforeEach(() => {
@ -197,7 +202,7 @@ describe("POST /api/v1/game/answer", () => {
    expect(body.success).toBe(false);
    expect(body.error).toContain("Question already answered");
  });
-  
+
  it("returns 400 when a field has an invalid value", async () => {
    const res = await request(app)
      .post("/api/v1/game/start")
--- a/apps/api/src/services/gameService.test.ts
+++ b/apps/api/src/services/gameService.test.ts
@ -19,10 +19,10 @@ const validRequest: GameRequest = {
 };

 const fakeTerms = [
-  { termId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
-  { termId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
+  { entryId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
+  { entryId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
  {
-    termId: "t3",
+    entryId: "t3",
    sourceText: "house",
    targetText: "casa",
    sourceGloss: "a building for living in",
--- a/apps/api/src/services/gameService.ts
+++ b/apps/api/src/services/gameService.ts
@ -38,8 +38,9 @@ export const createGameSession = async (
  const questions: GameQuestion[] = await Promise.all(
    terms.map(async (term) => {
      const distractorTexts = await getDistractors(
-        term.termId,
+        term.entryId,
        term.targetText,
+        request.source_language,
        request.target_language,
        request.pos,
        request.difficulty,
--- a/apps/api/src/services/multiplayerGameService.test.ts
+++ b/apps/api/src/services/multiplayerGameService.test.ts
@ -9,10 +9,10 @@ const mockGetGameTerms = vi.mocked(getGameTerms);
 const mockGetDistractors = vi.mocked(getDistractors);

 const fakeTerms = [
-  { termId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
-  { termId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
+  { entryId: "t1", sourceText: "dog", targetText: "cane", sourceGloss: null },
+  { entryId: "t2", sourceText: "cat", targetText: "gatto", sourceGloss: null },
  {
-    termId: "t3",
+    entryId: "t3",
    sourceText: "house",
    targetText: "casa",
    sourceGloss: "a building for living in",
--- a/apps/api/src/services/multiplayerGameService.ts
+++ b/apps/api/src/services/multiplayerGameService.ts
@ -44,8 +44,9 @@ export const generateMultiplayerQuestions = async (): Promise<
  const questions: MultiplayerQuestion[] = await Promise.all(
    correctAnswers.map(async (correctAnswer) => {
      const distractorTexts = await getDistractors(
-        correctAnswer.termId,
+        correctAnswer.entryId,
        correctAnswer.targetText,
+        MULTIPLAYER_DEFAULTS.sourceLanguage,
        MULTIPLAYER_DEFAULTS.targetLanguage,
        MULTIPLAYER_DEFAULTS.pos,
        MULTIPLAYER_DEFAULTS.difficulty,
--- a/data-pipeline/audit.md
+++ b/data-pipeline/audit.md
@ -1,362 +0,0 @@
-# OMW German Translation Quality Audit
-
-Instructions: for each entry, check if the German translations
-match the meaning described by the English gloss.
-
-Mark QUALITY as:
-OK — all German translations fit the meaning
-PARTIAL — some fit, some don't
-BAD — none of the German translations fit
-USELESS — translations are correct but useless for learners
-
---
-
-1.  [noun] ili:i98680
-    EN gloss: the flowering part of a plant or arrangement of flowers on a stalk
-    DE gloss: der blühende Teil einer Pflanze oder die Anordnung von Blüten an einem Stiel
-    EN words: inflorescence
-    DE words: Blütenstand, Infloreszenz
-    QUALITY: correct
-
-2.  [verb] ili:i24675
-    EN gloss: make motionless
-    DE gloss: unbeweglich machen
-    EN words: still
-    DE words: stillen, zum Stillstand bringen
-    QUALITY: stillen means breastfeeding, so completelyworng, zum stillstand bringen is correct but the gloss sounds weird: unbeweglich machen, no one says this
-
-3.  [verb] ili:i22153
-    EN gloss: lose interest or become bored with something or somebody
-    DE gloss: das Interesse an etwas oder jemandem verlieren oder sich langweilen
-    EN words: fatigue, jade, pall, tire, weary
-    DE words: Langeweile erzeugen, anöden, ermüden, langweilen, sich langweilen, sich zu Tode langweilen, sich öden
-    QUALITY: its ok
-
-4.  [noun] ili:i74742
-    EN gloss: zealous preaching and advocacy of the gospel
-    DE gloss: eifriges Predigen und Eintreten für das Evangelium
-    EN words: evangelism
-    DE words: Evangelisation, Evangelisierung
-    QUALITY: ok
-
-5.  [noun] ili:i115665
-    EN gloss: an oxide of iron that is strongly attracted by magnets
-    DE gloss: ein Eisenoxid, das stark von Magneten angezogen wird
-    EN words: magnetic iron-ore, magnetite
-    DE words: Eisenoxiduloxid, Magneteisen, Magneteisenstein, Magnetit
-    QUALITY: ok
-
-6.  [adjective] ili:i17569
-    EN gloss: of or relating to fatalism
-    DE gloss: von oder im Zusammenhang mit Fatalismus
-    EN words: fatalist, fatalistic
-    DE words: auf alles gefasst, dem Schicksal ergeben, fatalistisch, gottergeben, schicksalsergeben
-    QUALITY: ok
-
-7.  [adjective] ili:i682
-    EN gloss: having no previous example or precedent or parallel
-    DE gloss: ohne vorheriges Beispiel oder Präzedenzfall oder Parallele
-    EN words: new, unexampled
-    DE words: beispiellos, gab es noch nie, ohne Beispiel, ohne Präzedenzfall, ohnegleichen, präzedenzlos, sondergleichen, unvergleichbar
-    QUALITY: ok
-
-8.  [noun] ili:i114018
-    EN gloss: a soft silvery metallic element of the rare earth group; isotope 170 emits X-rays and is used in small portable X-ray machines; it occurs in monazite and apatite and xenotime
-    DE gloss: ein weiches, silbriges Metallelement der Gruppe der Seltenen Erden; Isotop 170 emittiert Röntgenstrahlen und wird in kleinen tragbaren Röntgengeräten verwendet; es kommt in Monazit und Apatit sowie in Xenotim vor
-    EN words: Tm, atomic number 69, thulium
-    DE words: Terameter, Tm
-    QUALITY: ok
-
-9.  [noun] ili:i117564
-    EN gloss: the rate of some repeating event
-    DE gloss: die Geschwindigkeit eines sich wiederholenden Ereignisses
-    EN words: pace, tempo
-    DE words: Takt, Tempo
-    QUALITY: ok
-
-10. [verb] ili:i31619
-    EN gloss: let drop or droop
-    DE gloss: fallen oder hängen lassen
-    EN words: hang
-    DE words: am Galgen sterben lassen, aufhängen, aufknüpfen, erhängen, henken, hängen
-    QUALITY: wrong,let drop means fallen lassen, like dropping something? im not sure here, does it really mean to hang some one? if so, then its ok
-
-11. [noun] ili:i75571
-    EN gloss: a heavy dull sound (as made by impact of heavy objects)
-    DE gloss: ein schweres, dumpfes Geräusch (wie beim Aufprall schwerer Gegenstände)
-    EN words: clump, clunk, thud, thump, thumping
-    DE words: Geklacker, Geklapper, Klackern, Klappern
-    QUALITY: ok
-
-12. [noun] ili:i92290
-    EN gloss: a person who makes a promise
-    DE gloss: eine Person, die ein Versprechen gibt
-    EN words: promiser, promisor
-    DE words: Freud'scher Versprecher, Lapsus Linguae, Versprecher, freudscher Versprecher
-    QUALITY: completeley wrong, Versprecher is if you intend to say something but say some thing else, it has nothing to do with Versprechen
-
-13. [noun] ili:i59450
-    EN gloss: a vertical well around which there is a stairway
-    DE gloss: ein vertikaler Schacht, um den herum eine Treppe verläuft
-    EN words: stairwell
-    DE words: Ern, Flur, Hausflur, Stiegenhaus, Treppenhaus
-    QUALITY: treppenhaus woudl be the only correct one right?
-
-14. [verb] ili:i21908
-    EN gloss: smile affectedly or derisively
-    DE gloss: affektiert oder spöttisch lächeln
-    EN words: simper, smirk
-    DE words: in sich hinein lächeln, schmunzeln, vor sich hin lächeln
-    QUALITY: the glosses would be also the words here? schmunzeln and lächeln are kind of the same but the affektiert and spöttisch is missing?
-
-15. [adjective] ili:i10887
-    EN gloss: tending to reserve or introspection
-    DE gloss: zur Zurückhaltung oder Introspektion neigend
-    EN words: indrawn, withdrawn
-    DE words: allein, einsam, eremitenhaft, eremitisch, für sich, solo, wie ein Einsiedler, wie ein Eremit, zurückgezogen
-    QUALITY: ok
-
-16. [noun] ili:i113657
-    EN gloss: a substance from which another substance is formed (especially by a metabolic reaction)
-    DE gloss: ein Stoff, aus dem ein anderer Stoff gebildet wird (insbesondere durch eine Stoffwechselreaktion)
-    EN words: precursor
-    DE words: Ausgangsstoff, Edukt, Grundstoff, Präkursor, Vorläufer, biologische Vorstufe
-    QUALITY: ok
-
-17. [adjective] ili:i13251
-    EN gloss: tastelessly showy
-    DE gloss: geschmacklos und auffällig
-    EN words: brassy, cheap, flash, flashy, garish, gaudy, gimcrack, loud, meretricious, tacky, tatty, tawdry, trashy
-    DE words: aufdringlich, marktschreierisch, reißerisch
-    QUALITY: ok
-
-18. [noun] ili:i68734
-    EN gloss: the branch of chemistry that studies the relation between chemical action and the amount of heat absorbed or generated
-    DE gloss: der Zweig der Chemie, der die Beziehung zwischen chemischer Wirkung und der absorbierten oder erzeugten Wärmemenge untersucht
-    EN words: thermochemistry
-    DE words: Thermochemie, chemische Thermodynamik
-    QUALITY: ok
-
-19. [adjective] ili:i12980
-    EN gloss: distinguished from others in excellence
-    DE gloss: durch hohe Qualität von anderen unterschieden
-    EN words: outstanding
-    DE words: I a, ausgezeichnet, außergewöhnlich, außerordentlich, besonders, bestens, eins a, exzeptionell, herausragend, schnafte, splendid, trefflich, vortrefflich, vorzüglich
-    QUALITY: ok, aber eins a/1a is wirklich sehr starke umgangssprache. und cih habe ncoh nie schnafte oder splendid gehört, der rest passt
-
-20. [verb] ili:i30043
-    EN gloss: tear down so as to make flat with the ground
-    DE gloss: abreißen, um den Boden zu ebnen
-    EN words: dismantle, level, pull down, rase, raze, take down, tear down
-    DE words: abreißen, aus den Augen verlieren, keinen Kontakt mehr haben zu, nicht länger in Kontakt stehen
-    QUALITY: nur abreißen stimmt, der rest passt in diesem zusammenhang gar nicht!
-
-21. [adjective] ili:i14014
-    EN gloss: desired or wished for or sought
-    DE gloss: gewünscht oder gewünscht oder gesucht
-    EN words: wanted
-    DE words: benötigt, gesucht, gewünscht
-    QUALITY: ok
-
-22. [verb] ili:i29481
-    EN gloss: mar or spoil the appearance of
-    DE gloss: das Aussehen verunstalten
-    EN words: blemish, deface, disfigure
-    DE words: deformieren, entstellen, verhunzen, verschandeln, verunstalten, verunzieren
-    QUALITY: ok
-
-23. [verb] ili:i28605
-    EN gloss: spread thickly
-    DE gloss: dick auftragen
-    EN words: slather
-    DE words: beharken, bestreichen, mit Feuer belegen, mit Sperrfeuer belegen
-    QUALITY: kein wort ist wirklich ein synonym für dick auftragen, (i dont even know if the english word fits here?)
-
-24. [noun] ili:i92029
-    EN gloss: someone who is licensed to operate an aircraft in flight
-    DE gloss: jemand, der eine Lizenz zum Führen eines Luftfahrzeugs im Flug hat
-    EN words: airplane pilot, pilot
-    DE words: Führer, Lotse, Pilot
-    QUALITY: nur Pilot stimmt hier
-
-25. [adjective] ili:i8221
-    EN gloss: capable of being measured
-    DE gloss: in der Lage, gemessen zu werden
-    EN words: measurable, mensurable
-    DE words: bestimmbar, der Messung zugänglich, erhebbar, mensurabel, messbar
-    QUALITY: ok
-
-26. [noun] ili:i61380
-    EN gloss: the spirit of a group that makes the members want the group to succeed
-    DE gloss: der Geist einer Gruppe, der die Mitglieder dazu bringt, den Erfolg der Gruppe zu wollen
-    EN words: esprit de corps, morale, team spirit
-    DE words: Gruppengeist, Teamgeist
-    QUALITY: Gruppengeist hört sich so komisch an, das sagt niemand, teamgeist ist in ordnung
-
-27. [adjective] ili:i10497
-    EN gloss: free of restrictions or qualifications
-    DE gloss: Zustand, in dem in einer Wohnung niemand wohnt.
-    EN words: clean, clear
-    DE words: frei, leer stehend, leerstehend, unbewohnt, ungenutzt, verwaist
-    QUALITY: ok
-
-28. [adjective] ili:i6238
-    EN gloss: moving and bending with ease
-    DE gloss: anmutig schlank und mit Leichtigkeit biegsam und beweglich
-    EN words: lissom, lissome, lithe, lithesome, slender, supple, svelte, sylphlike
-    DE words: elastisch, geschmeidig, schlangenartig
-    QUALITY: \_\_\_
-
-29. [noun] ili:i57906
-    EN gloss: station for the production and transmission of AM or FM radio broadcasts
-    DE gloss: Sender für die Produktion und Übertragung von AM- oder FM-Radiosendungen
-    EN words: radio station
-    DE words: Radiosender, Rundfunkstation, Sender
-    QUALITY: \_\_\_
-
-30. [noun] ili:i112045
-    EN gloss: the purple or black-and-blue area resulting from a bruise
-    DE gloss: der violette oder schwarzblaue Bereich, der durch einen Bluterguss entsteht
-    EN words: ecchymosis
-    DE words: Ekchymose, kleinflächige Hautblutung
-    QUALITY: \_\_\_
-
-31. [adjective] ili:i10839
-    EN gloss: capable of being replaced
-    DE gloss: kann ersetzt werden
-    EN words: replaceable
-    DE words: austauschbar, ersetzbar, fungibel
-    QUALITY: \_\_\_
-
-32. [verb] ili:i28714
-    EN gloss: whip
-    DE gloss: peitschen
-    EN words: flagellate, scourge
-    DE words: auspeitschen, flagellieren, geißeln, peitschen
-    QUALITY: \_\_\_
-
-33. [noun] ili:i52826
-    EN gloss: a mechanical or electrical explosive device or a small amount of explosive; can be used to initiate the reaction of a disrupting explosive
-    DE gloss: ein mechanischer oder elektrischer Sprengkörper oder eine kleine Menge Sprengstoff; kann verwendet werden, um die Reaktion eines Sprengstoffs auszulösen
-    EN words: cap, detonating device, detonator
-    DE words: Auslöser, Zünder, Zündvorrichtung
-    QUALITY: \_\_\_
-
-34. [noun] ili:i115477
-    EN gloss: ice crystals forming a white deposit (especially on objects outside)
-    DE gloss: Eiskristalle, die einen weißen Belag bilden (insbesondere auf Gegenständen im Freien)
-    EN words: frost, hoar, hoarfrost, rime
-    DE words: Raufrost, Raureif, Reif
-    QUALITY: \_\_\_
-
-35. [noun] ili:i66650
-    EN gloss: the ability to see in reduced illumination (as in moonlight)
-    DE gloss: die Fähigkeit, bei reduzierter Beleuchtung zu sehen (wie bei Mondlicht)
-    EN words: night vision, night-sight, scotopic vision, twilight vision
-    DE words: Nachtsehen, skotopisches Sehen
-    QUALITY: \_\_\_
-
-36. [verb] ili:i26849
-    EN gloss: express or utter with a hiss
-    DE gloss: mit einem Zischen ausdrücken oder aussprechen
-    EN words: hiss, sibilate, siss, sizz
-    DE words: Stimme dämpfen, flüstern, hauchen, hinter vorgehaltener Hand, ins Ohr sagen, leise sprechen, mit tonloser Stimme, munkeln, raunen, säuseln, tonlos, tuscheln, wispern, zischeln, zuflüstern
-    QUALITY: \_\_\_
-
-37. [noun] ili:i94222
-    EN gloss: a teenager or a young adult male
-    DE gloss: ein Jugendlicher oder ein junger Erwachsener
-    EN words: young buck, young man
-    DE words: Bruder, Bürschchen, Cowboy, Freundchen, Jungs, Kinders, Kollege, Kollegin, Leute, Mann Gottes, Meister, Sportsfreund, Verehrtester, der Herr, guter Mann, junger Mann, mein Gutster, mein Herr
-    QUALITY: \_\_\_
-
-38. [noun] ili:i49310
-    EN gloss: dusky grey food fish found from Louisiana and Florida southward
-    DE gloss: dunkelgrauer Speisefisch, der von Louisiana und Florida südwärts vorkommt
-    EN words: Anisotremus surinamensis, black margate, pompon
-    DE words: Pompon, Puschel, Tanzwedel
-    QUALITY: \_\_\_
-
-39. [noun] ili:i50315
-    EN gloss: a small vehicle with four wheels in which a baby or child is pushed around
-    DE gloss: ein kleines Fahrzeug mit vier Rädern, in dem ein Säugling oder ein Kind herumgeschoben wird
-    EN words: baby buggy, baby carriage, carriage, go-cart, perambulator, pram, pushchair, pusher, stroller
-    DE words: Kinderwagen, Säuglingskutsche
-    QUALITY: \_\_\_
-
-40. [verb] ili:i31857
-    EN gloss: meet at a point
-    DE gloss: sich an einem Punkt treffen
-    EN words: cross, intersect
-    DE words: gegen den Wind segeln, kreuzen
-    QUALITY: \_\_\_
-
-41. [noun] ili:i51632
-    EN gloss: a sailboat with two parallel hulls held together by single deck
-    DE gloss: ein Boot mit zwei parallelen Rümpfen, die durch ein einziges Deck zusammengehalten werden
-    EN words: catamaran
-    DE words: Doppelrumpfboot, Katamaran, Zweirumpfboot
-    QUALITY: \_\_\_
-
-42. [verb] ili:i34734
-    EN gloss: to be found to exist
-    DE gloss: als existent befunden werden
-    EN words: occur
-    DE words: anzutreffen sein, auftreten, nicht ausbleiben, vorkommen, zu finden sein, zu sehen sein
-    QUALITY: \_\_\_
-
-43. [verb] ili:i25187
-    EN gloss: assign too high a value to
-    DE gloss: einen zu hohen Wert zuweisen
-    EN words: overestimate, overvalue
-    DE words: zu hoch bewerten, zu viel Gewicht beimessen, zu viel Wichtigkeit beimessen, überbewerten, überschätzen
-    QUALITY: \_\_\_
-
-44. [noun] ili:i73844
-    EN gloss: an expressive style of music
-    DE gloss: ein ausdrucksstarker Musikstil
-    EN words: genre, music genre, musical genre, musical style
-    DE words: Genre, Musikgenre, Musikrichtung, Musikstil, Stilrichtung
-    QUALITY: \_\_\_
-
-45. [noun] ili:i113026
-    EN gloss: an abnormal condition in which cerebrospinal fluid collects in the ventricles of the brain; in infants it can cause abnormally rapid growth of the head and bulging fontanelles and a small face; in adults the symptoms are primarily neurological
-    DE gloss: ein anormaler Zustand, bei dem sich Liquor in den Hirnventrikeln sammelt; bei Säuglingen kann er zu einem anormal schnellen Wachstum des Kopfes, zu wulstigen Fontanellen und einem kleinen Gesicht führen; bei Erwachsenen sind die Symptome hauptsächlich neurologisch
-    EN words: hydrocephalus, hydrocephaly
-    DE words: Gehirnwassersucht, Hydrocephalus, Hydrozephalus, Wasserkopf
-    QUALITY: \_\_\_
-
-46. [noun] ili:i62720
-    EN gloss: habitual uncleanliness
-    DE gloss: gewohnheitsmäßige Unreinheit
-    EN words: slovenliness
-    DE words: Flickarbeit, Flickenteppich, Flickwerk, Gestümper, Mist, Murks, Murkserei, Pfusch, Pfuscharbeit, Pfuscherei, Schlamperei, Schlendrian, Schluderei, Schund, schlechte Arbeit
-    QUALITY: \_\_\_
-
-47. [noun] ili:i80976
-    EN gloss: the government agency in the United Kingdom that is responsible for internal security and counterintelligence overseas
-    DE gloss: Regierungsbehörde im Vereinigten Königreich, die für die innere Sicherheit und die Spionageabwehr im Ausland zuständig ist.
-    EN words: MI, Military Intelligence Section 6, Secret Intelligence Service
-    DE words: MI6, SIS, Secret Intelligence Service, Secret Service, britischer Auslandsgeheimdienst
-    QUALITY: \_\_\_
-
-48. [noun] ili:i60476
-    EN gloss: an electrical device by which alternating current of one voltage is changed to another voltage
-    DE gloss: ein elektrisches Gerät, mit dem Wechselstrom einer bestimmten Spannung in eine andere Spannung umgewandelt wird
-    EN words: transformer
-    DE words: Spannungswandler, Trafo, Transformator, Transformer
-    QUALITY: \_\_\_
-
-49. [noun] ili:i37037
-    EN gloss: wandering from the main path of a journey
-    DE gloss: das Abweichen vom Hauptweg einer Reise
-    EN words: digression, excursion
-    DE words: Abschweifung, Abstecher, Einschub, Exkurs, Umschweif
-    QUALITY: \_\_\_
-
-50. [noun] ili:i77288
-    EN gloss: any meat that is minced and spiced and cooked as patties or used to fill sausages
-    DE gloss: jegliches Fleisch, das zerkleinert und gewürzt und als Pasteten gekocht oder zur Füllung von Würsten verwendet wird
-    EN words: sausage meat
-    DE words: Brät, Wurstbrät
-    QUALITY: \_\_\_
--- a/data-pipeline/db/import.ts
+++ b/data-pipeline/db/import.ts
@ -2,194 +2,123 @@ import fs from "node:fs/promises";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
 import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 import { openDb } from "./index.js";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
+import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";

 // ── Paths ─────────────────────────────────────────────────────────────────────

 const __dirname = path.dirname(fileURLToPath(import.meta.url));

-const PATHS = {
-  annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
-};
-
-// ── Loading ───────────────────────────────────────────────────────────────────
-
-async function loadAnnotated(): Promise<AnnotatedRecord[]> {
-  // Use en.json as the base — it has the most complete glosses and examples.
-  // Merge votes and CEFR examples from the other language files.
-  const baseRaw = await fs.readFile(
-    path.join(PATHS.annotatedDir, "en.json"),
-    "utf-8",
-  );
-  const base = JSON.parse(baseRaw) as AnnotatedRecord[];
-
-  const byId = new Map<string, AnnotatedRecord>();
-  for (const record of base) {
-    byId.set(record.source_id, record);
-  }
-
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    if (lang === "en") continue;
-
-    const raw = await fs.readFile(
-      path.join(PATHS.annotatedDir, `${lang}.json`),
-      "utf-8",
-    );
-    const records = JSON.parse(raw) as AnnotatedRecord[];
-
-    for (const record of records) {
-      const base = byId.get(record.source_id);
-      if (!base) continue;
-
-      // Merge votes
-      for (const [l, langVotes] of Object.entries(record.votes)) {
-        if (!base.votes[l as SupportedLanguageCode]) {
-          base.votes[l as SupportedLanguageCode] = {};
-        }
-        Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
-      }
-
-      // Merge CEFR examples not already in base
-      for (const [l, examples] of Object.entries(record.examples)) {
-        const lang = l as SupportedLanguageCode;
-        const cefrExamples = examples.filter((e) => e.source === "cefr");
-        if (cefrExamples.length === 0) continue;
-
-        if (!base.examples[lang]) {
-          base.examples[lang] = cefrExamples;
-        } else {
-          base.examples[lang].push(...cefrExamples);
-        }
-      }
-    }
-  }
-
-  return [...byId.values()];
-}
+const OUTPUT_DIR = path.resolve(__dirname, "../stage-1-extract/output");

 // ── Import ────────────────────────────────────────────────────────────────────

-export async function importStage2(): Promise<void> {
-  console.log("Loading stage 2 annotated files...");
-  const records = await loadAnnotated();
-  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);
-
+export async function importKaikki(): Promise<void> {
  const db = openDb();

-  const insertSynset = db.prepare(
-    `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
-  );
-
-  const insertTranslation = db.prepare(
-    `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
-  );
-
-  const insertGloss = db.prepare(
-    `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
-  );
-
-  const insertExample = db.prepare(
-    `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
-  );
-
-  const insertCefrVote = db.prepare(`
-    INSERT INTO cefr_source_votes (translation_id, cefr_level)
-    VALUES (
-      (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
-      ?
-    )
+  const insertEntry = db.prepare(`
+    INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
+    VALUES (?, ?, ?, ?, ?, ?)
+    ON CONFLICT (headword, language, pos, sense_index)
+    DO UPDATE SET
+      gloss    = excluded.gloss,
+      examples = excluded.examples
+    RETURNING id
  `);

-  console.log("\nImporting into pipeline.db...");
+  const insertTranslation = db.prepare(`
+    INSERT INTO translations (entry_id, target_lang, word, sense_hint)
+    VALUES (?, ?, ?, ?)
+    ON CONFLICT (entry_id, target_lang, word) DO NOTHING
+  `);

-  const importAll = db.transaction(() => {
-    let synsets = 0;
-    let translations = 0;
-    let glosses = 0;
-    let examples = 0;
-    let cefrVotes = 0;
+  let totalEntries = 0;
+  let totalTranslations = 0;
+  let totalSkipped = 0;

-    for (const record of records) {
-      insertSynset.run(record.source_id, record.pos);
-      synsets++;
+  for (const lang of SUPPORTED_LANGUAGE_CODES) {
+    const filePath = path.join(OUTPUT_DIR, `${lang}.json`);

-      // Translations
-      for (const [lang, words] of Object.entries(record.translations)) {
-        const unique = [...new Set(words)];
-        for (const word of unique) {
-          insertTranslation.run(record.source_id, lang, word);
+    let senses: ExtractedSense[];
+    try {
+      const raw = await fs.readFile(filePath, "utf-8");
+      senses = JSON.parse(raw) as ExtractedSense[];
+    } catch {
+      console.warn(`  Warning: no output file found for ${lang}, skipping`);
+      continue;
+    }
+
+    console.log(
+      `  Importing ${lang}: ${senses.length.toLocaleString()} senses...`,
+    );
+
+    // Track next available sense_index per (headword, pos) to handle
+    // the same word appearing in multiple JSONL entries with the same POS.
+    const senseIndexMap = new Map<string, number>();
+
+    const importLang = db.transaction(() => {
+      let entries = 0;
+      let translations = 0;
+      let skipped = 0;
+
+      for (const sense of senses) {
+        const key = `${sense.headword}|${sense.pos}`;
+        const nextIndex = senseIndexMap.get(key) ?? 0;
+        senseIndexMap.set(key, nextIndex + 1);
+
+        const row = insertEntry.get(
+          sense.headword,
+          sense.language,
+          sense.pos,
+          nextIndex,
+          sense.gloss ?? null,
+          JSON.stringify(sense.examples),
+        ) as { id: number } | undefined;
+
+        if (!row) {
+          skipped++;
+          continue;
+        }
+
+        entries++;
+
+        for (const t of sense.translations) {
+          insertTranslation.run(
+            row.id,
+            t.target_lang,
+            t.word,
+            t.sense_hint ?? null,
+          );
          translations++;
        }
      }

-      // Glosses
-      for (const [lang, glossList] of Object.entries(record.glosses)) {
-        for (const text of glossList) {
-          insertGloss.run(record.source_id, lang, text);
-          glosses++;
-        }
-      }
+      return { entries, translations, skipped };
+    });

-      // Examples
-      for (const [lang, exList] of Object.entries(record.examples)) {
-        for (const example of exList) {
-          insertExample.run(
-            record.source_id,
-            lang,
-            example.text,
-            example.source,
-          );
-          examples++;
-        }
-      }
+    const counts = importLang();
+    totalEntries += counts.entries;
+    totalTranslations += counts.translations;
+    totalSkipped += counts.skipped;

-      // CEFR source votes
-      for (const [lang, langVotes] of Object.entries(record.votes)) {
-        for (const [word, vote] of Object.entries(
-          langVotes as Record<string, { cefr_source: string }>,
-        )) {
-          insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
-          cefrVotes++;
-        }
-      }
-    }
-
-    return { synsets, translations, glosses, examples, cefrVotes };
-  });
-
-  const counts = importAll();
-
-  console.log(`  synsets:      ${counts.synsets.toLocaleString()}`);
-  console.log(`  translations: ${counts.translations.toLocaleString()}`);
-  console.log(`  glosses:      ${counts.glosses.toLocaleString()}`);
-  console.log(`  examples:     ${counts.examples.toLocaleString()}`);
-  console.log(`  cefr votes:   ${counts.cefrVotes.toLocaleString()}`);
+    console.log(
+      `    entries: ${counts.entries.toLocaleString()}, translations: ${counts.translations.toLocaleString()}, skipped: ${counts.skipped.toLocaleString()}`,
+    );
+  }

  db.close();
-  console.log("\nImport complete.");
+
+  console.log(`\nImport complete:`);
+  console.log(`  Total entries:      ${totalEntries.toLocaleString()}`);
+  console.log(`  Total translations: ${totalTranslations.toLocaleString()}`);
+  console.log(`  Total skipped:      ${totalSkipped.toLocaleString()}`);
 }

 // ── Check if already imported ─────────────────────────────────────────────────

 export function isImported(): boolean {
  const db = openDb();
-  const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+  const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
    count: number;
  };
  db.close();
@ -200,20 +129,21 @@ export function isImported(): boolean {

 async function main(): Promise<void> {
  const db = openDb();
-  const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
+  const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
    count: number;
  };
  db.close();

  if (row.count > 0) {
    console.log(
-      `pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`,
+      `pipeline.db already contains ${row.count.toLocaleString()} entries — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    process.exit(0);
  }

-  await importStage2();
+  console.log("Importing Kaikki data into pipeline.db...");
+  await importKaikki();
 }

 if (import.meta.url === `file://${process.argv[1]}`) {
--- a/data-pipeline/db/reset.ts
+++ b/data-pipeline/db/reset.ts
@ -0,0 +1,41 @@
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import Database from "better-sqlite3";
+
+// ── Paths ─────────────────────────────────────────────────────────────────────
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const DB_PATH = path.join(__dirname, "pipeline.db");
+
+// ── Main ──────────────────────────────────────────────────────────────────────
+
+/**
+ * CLI entry point: clears rows from `run_status` in pipeline.db.
+ *
+ * The mode is read from the first CLI argument:
+ *   - `round1` deletes every row whose stage starts with "round1"
+ *   - `all`    deletes every row except those with stage "reverse_link"
+ * A missing or unknown mode prints the usage text and exits with code 1.
+ */
+function main(): void {
+  const mode = process.argv[2];
+  const isKnownMode = mode === "round1" || mode === "all";
+
+  if (!isKnownMode) {
+    console.error("Usage: pnpm db:reset round1 | all");
+    console.error("  round1 — delete all round1 sub-stage rows");
+    console.error("  all    — delete all run_status rows except reverse_link");
+    process.exit(1);
+  }
+
+  const db = new Database(DB_PATH);
+
+  // Pick the statement for the requested mode, then run it once.
+  const round1Only = mode === "round1";
+  const deleteSql = round1Only
+    ? "DELETE FROM run_status WHERE stage LIKE 'round1%'"
+    : "DELETE FROM run_status WHERE stage NOT IN ('reverse_link')";
+  const { changes } = db.prepare(deleteSql).run();
+
+  console.log(
+    round1Only
+      ? `Deleted ${changes} round1 rows from run_status`
+      : `Deleted ${changes} rows from run_status`,
+  );
+
+  db.close();
+}
+
+main();
--- a/data-pipeline/db/schema.sql
+++ b/data-pipeline/db/schema.sql
@ -1,62 +1,58 @@
 -- ── Base data ─────────────────────────────────────────────────────────────────
-- Imported from stage 2 JSON on first run. Never mutated after import.
+-- Imported from Kaikki on first run. Never mutated after import.

-CREATE TABLE IF NOT EXISTS synsets (
-  source_id TEXT PRIMARY KEY,
-  pos       TEXT NOT NULL
+CREATE TABLE IF NOT EXISTS entries (
+  id          INTEGER PRIMARY KEY,
+  headword    TEXT    NOT NULL,
+  language    TEXT    NOT NULL,
+  pos         TEXT    NOT NULL,
+  sense_index INTEGER NOT NULL DEFAULT 0,
+  gloss       TEXT,
+  examples    TEXT    NOT NULL DEFAULT '[]', -- JSON array of strings
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
+  UNIQUE (headword, language, pos, sense_index)
 );

 CREATE TABLE IF NOT EXISTS translations (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  word      TEXT    NOT NULL,
-  UNIQUE (source_id, language, word)
-);
-
-CREATE TABLE IF NOT EXISTS glosses (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS examples (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS cefr_source_votes (
-  id             INTEGER PRIMARY KEY,
-  translation_id INTEGER NOT NULL REFERENCES translations(id),
-  cefr_level     TEXT    NOT NULL,
-  UNIQUE (translation_id)
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  sense_hint  TEXT,
+  source      TEXT    NOT NULL DEFAULT 'kaikki',
+  UNIQUE (entry_id, target_lang, word)
 );

 -- ── Status tracking ───────────────────────────────────────────────────────────
-- One row per synset per model per stage. Drives resumability.
+-- One row per entry per model per stage. Drives resumability.
+-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
 -- stage:  round1 | round2 | tiebreak
 -- status: pending | complete | needs_review | flagged

 CREATE TABLE IF NOT EXISTS run_status (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL,
+  entry_id   INTEGER NOT NULL,
  model_name TEXT    NOT NULL,
  stage      TEXT    NOT NULL,
  status     TEXT    NOT NULL,
  created_at TEXT    NOT NULL DEFAULT (datetime('now')),
  updated_at TEXT    NOT NULL DEFAULT (datetime('now')),
-  UNIQUE (source_id, model_name, stage)
+  UNIQUE (entry_id, model_name, stage)
 );

 -- ── Round 1 output ────────────────────────────────────────────────────────────
-- One row per translation/language per model. Written atomically per record.
+-- Written atomically per entry per model.
 -- Unique constraints enforce one model one vote.

-CREATE TABLE IF NOT EXISTS model_cefr_votes (
+CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  model_name TEXT    NOT NULL,
+  cefr_level TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name)
+);
+
+CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  model_name     TEXT    NOT NULL,
@ -73,29 +69,27 @@ CREATE TABLE IF NOT EXISTS model_translation_rejections (

 CREATE TABLE IF NOT EXISTS generated_glosses (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name)
 );

 CREATE TABLE IF NOT EXISTS generated_examples (
  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+  UNIQUE (entry_id, model_name)
 );

-CREATE TABLE IF NOT EXISTS generated_descriptions (
-  id         INTEGER PRIMARY KEY,
-  source_id  TEXT    NOT NULL REFERENCES synsets(source_id),
-  model_name TEXT    NOT NULL,
-  language   TEXT    NOT NULL,
-  text       TEXT    NOT NULL,
-  UNIQUE (source_id, model_name, language)
+CREATE TABLE IF NOT EXISTS generated_translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  model_name  TEXT    NOT NULL,
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  UNIQUE (entry_id, model_name, target_lang)
 );

 -- ── Round 2 output ────────────────────────────────────────────────────────────
@ -116,20 +110,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
  UNIQUE (example_id, model_name)
 );

-CREATE TABLE IF NOT EXISTS description_candidate_votes (
+CREATE TABLE IF NOT EXISTS translation_candidate_votes (
  id             INTEGER PRIMARY KEY,
-  description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
+  translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
  model_name     TEXT    NOT NULL,
-  UNIQUE (description_id, model_name)
+  UNIQUE (translation_id, model_name)
 );

 -- ── Resolved output ───────────────────────────────────────────────────────────
 -- Written by merge. Never updated after writing.
-- Only fully resolved records are written here — no nulls, no flags.
+-- Only fully resolved records are written here — no nulls.
 -- Absence of a row means unresolved. Flagged status tracked in run_status.
-- source: omw | cefr | model_name
+-- source: kaikki | model_name

-CREATE TABLE IF NOT EXISTS resolved_translations (
+CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  cefr_level TEXT    NOT NULL,
+  difficulty TEXT    NOT NULL,
+  UNIQUE (entry_id)
+);
+
+CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
  id             INTEGER PRIMARY KEY,
  translation_id INTEGER NOT NULL REFERENCES translations(id),
  cefr_level     TEXT    NOT NULL,
@ -138,27 +140,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
 );

 CREATE TABLE IF NOT EXISTS resolved_glosses (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL,
-  UNIQUE (source_id, language)
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  text       TEXT    NOT NULL,
+  source     TEXT    NOT NULL,
+  UNIQUE (entry_id)
 );

 CREATE TABLE IF NOT EXISTS resolved_examples (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL
+  id         INTEGER PRIMARY KEY,
+  entry_id   INTEGER NOT NULL REFERENCES entries(id),
+  text       TEXT    NOT NULL,
+  source     TEXT    NOT NULL
 );

-CREATE TABLE IF NOT EXISTS resolved_descriptions (
-  id        INTEGER PRIMARY KEY,
-  source_id TEXT    NOT NULL REFERENCES synsets(source_id),
-  language  TEXT    NOT NULL,
-  text      TEXT    NOT NULL,
-  source    TEXT    NOT NULL,
-  UNIQUE (source_id, language)
+CREATE TABLE IF NOT EXISTS resolved_generated_translations (
+  id          INTEGER PRIMARY KEY,
+  entry_id    INTEGER NOT NULL REFERENCES entries(id),
+  target_lang TEXT    NOT NULL,
+  word        TEXT    NOT NULL,
+  source      TEXT    NOT NULL,
+  UNIQUE (entry_id, target_lang)
 );
--- a/data-pipeline/package.json
+++ b/data-pipeline/package.json
@ -4,9 +4,11 @@
  "private": true,
  "type": "module",
  "scripts": {
+    "db:reset": "tsx db/reset.ts",
+    "extract": "tsx stage-1-extract/scripts/extract.ts",
+    "reverse-link": "tsx stage-2-reverse-link/scripts/reverse-link.ts",
    "db:import": "tsx db/import.ts",
    "db:init": "tsx db/init.ts",
-    "annotate": "tsx stage-2-annotate/scripts/annotate.ts",
    "test": "vitest run",
    "test:watch": "vitest",
    "pipeline:run": "tsx --env-file .env pipeline.ts"
--- a/data-pipeline/pipeline.ts
+++ b/data-pipeline/pipeline.ts
@ -2,10 +2,12 @@ import fs from "node:fs/promises";
 import path from "node:path";
 import { fileURLToPath } from "node:url";
 import { initDb } from "./db/init.js";
-import { isImported, importStage2 } from "./db/import.js";
+import { isImported, importKaikki } from "./db/import.js";
 import { openDb } from "./db/index.js";
+import { reverseLink } from "./stage-2-reverse-link/scripts/reverse-link.js";
 import { ALL_PROVIDERS, validateProviderKey } from "./stage-3-enrich/config.js";
 import type { ProviderConfig } from "./stage-3-enrich/config.js";
+import { enrich } from "./stage-3-enrich/scripts/enrich.js";

 // ── Types ─────────────────────────────────────────────────────────────────────

@ -35,23 +37,23 @@ type RunStats = {
 const __dirname = path.dirname(fileURLToPath(import.meta.url));

 const PATHS = {
-  omw: path.join(__dirname, "stage-1-extract/output/omw.json"),
+  extractedEn: path.join(__dirname, "stage-1-extract/output/en.json"),
  db: path.join(__dirname, "db/pipeline.db"),
  reports: path.join(__dirname, "reports"),
  llamaHealth: "http://127.0.0.1:8080/health",
 };

-const SENTINEL = { sourceId: "system", modelName: "system" };
+const SENTINEL = { entryId: 0, modelName: "system" };

 // ── Startup checks ────────────────────────────────────────────────────────────

-async function checkOmwExists(): Promise<void> {
+/**
+ * Startup check: verifies that the stage-1 English extraction output exists.
+ * Only `PATHS.extractedEn` (en.json) is probed; when it is not accessible the
+ * function prints how to produce it and exits the process with code 1.
+ */
+async function checkExtractedFilesExist(): Promise<void> {
  try {
-    await fs.access(PATHS.omw);
+    await fs.access(PATHS.extractedEn);
  } catch {
-    console.error("\n  ERROR: stage-1-extract/output/omw.json not found.");
+    console.error("\n  ERROR: stage-1-extract/output/en.json not found.");
    console.error("  Run the stage 1 extraction script first:");
-    console.error("    python stage-1-extract/scripts/extract.py\n");
+    console.error("    pnpm extract\n");
    process.exit(1);
  }
 }
@ -67,8 +69,8 @@ async function checkAndInitDb(): Promise<void> {

+/**
+ * Startup check: runs the Kaikki import when `isImported()` reports that the
+ * base tables have not been populated yet; otherwise does nothing.
+ */
 async function checkAndImportDb(): Promise<void> {
  if (!isImported()) {
-    console.log("  Base tables empty — importing stage 2 data...");
-    await importStage2();
+    console.log("  Base tables empty — importing Kaikki data...");
+    await importKaikki();
  }
 }

@ -108,7 +110,7 @@ async function checkProviderReady(provider: ProviderConfig): Promise<void> {
 async function generateRunName(): Promise<string> {
  await fs.mkdir(PATHS.reports, { recursive: true });

-  const date = new Date().toISOString().slice(0, 10);
+  const date = new Date().toISOString().slice(0, 10); // "YYYY-MM-DD" prefix of the ISO timestamp
  const files = await fs.readdir(PATHS.reports);
  const todaysRuns = files.filter(
    (f) => f.startsWith(date) && f.endsWith(".json"),
@ -132,6 +134,7 @@ function registerShutdownHandler(stats: RunStats): void {
  process.on("SIGINT", handler);
  process.on("SIGTERM", handler);
 }
+
 // ── Stage status helpers ──────────────────────────────────────────────────────

 function getSentinelStatus(stage: RunStage): StageStatus {
@ -139,9 +142,9 @@ function getSentinelStatus(stage: RunStage): StageStatus {
  const row = db
    .prepare(
      `SELECT status FROM run_status
-       WHERE source_id = ? AND model_name = ? AND stage = ?`,
+       WHERE entry_id = ? AND model_name = ? AND stage = ?`,
    )
-    .get(SENTINEL.sourceId, SENTINEL.modelName, stage) as
+    .get(SENTINEL.entryId, SENTINEL.modelName, stage) as
    | { status: string }
    | undefined;
  db.close();
@ -151,11 +154,11 @@ function getSentinelStatus(stage: RunStage): StageStatus {
+/**
+ * Records the given stage as complete by upserting the sentinel row
+ * (SENTINEL.entryId / SENTINEL.modelName) in `run_status`. An existing row
+ * for the same stage has its status and `updated_at` overwritten.
+ */
 function markSentinelComplete(stage: RunStage): void {
  const db = openDb();
  db.prepare(
-    `INSERT INTO run_status (source_id, model_name, stage, status)
+    `INSERT INTO run_status (entry_id, model_name, stage, status)
     VALUES (?, ?, ?, 'complete')
-     ON CONFLICT (source_id, model_name, stage)
+     ON CONFLICT (entry_id, model_name, stage)
     DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
-  ).run(SENTINEL.sourceId, SENTINEL.modelName, stage);
+  ).run(SENTINEL.entryId, SENTINEL.modelName, stage);
  db.close();
 }

@ -163,16 +166,17 @@ function getModelRound1Status(modelName: string): StageStatus {
  const db = openDb();

  const total = (
-    db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
-      count: number;
-    }
+    db
+      .prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'")
+      .get() as { count: number }
  ).count;

  const complete = (
    db
      .prepare(
        `SELECT COUNT(*) as count FROM run_status
-         WHERE model_name = ? AND stage = 'round1' AND status = 'complete'`,
+         WHERE model_name = ? AND stage = 'round1_gloss'
+         AND status = 'complete'`,
      )
      .get(modelName) as { count: number }
  ).count;
@ -188,9 +192,9 @@ function getModelRound2Status(modelName: string): StageStatus {
  const db = openDb();

  const total = (
-    db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
-      count: number;
-    }
+    db
+      .prepare("SELECT COUNT(*) as count FROM entries WHERE language = 'en'")
+      .get() as { count: number }
  ).count;

  const complete = (
@ -209,12 +213,52 @@ function getModelRound2Status(modelName: string): StageStatus {
  return "in_progress";
 }

-// ── Stage runners (stubs) ─────────────────────────────────────────────────────
+/**
+ * Reports whether the one-time reverse-link step has been recorded as
+ * complete, by reading the sentinel `run_status` row for stage
+ * 'reverse_link'.
+ *
+ * @returns `true` only when that row exists and its status is "complete".
+ */
+function isReverseLinkDone(): boolean {
+  const db = openDb();
+  const statement = db.prepare(
+    `SELECT status FROM run_status
+       WHERE entry_id = ? AND model_name = ? AND stage = 'reverse_link'`,
+  );
+  const sentinelRow = statement.get(SENTINEL.entryId, SENTINEL.modelName) as
+    | { status: string }
+    | undefined;
+  db.close();
+
+  if (!sentinelRow) return false;
+  return sentinelRow.status === "complete";
+}

-function runRound1(provider: ProviderConfig, stats: RunStats): void {
+/**
+ * Upserts the sentinel `run_status` row for stage 'reverse_link' with status
+ * 'complete'. An existing row has its status and `updated_at` overwritten.
+ */
+function markReverseLinkComplete(): void {
+  const db = openDb();
+  const upsert = db.prepare(
+    `INSERT INTO run_status (entry_id, model_name, stage, status)
+     VALUES (?, ?, 'reverse_link', 'complete')
+     ON CONFLICT (entry_id, model_name, stage)
+     DO UPDATE SET status = 'complete', updated_at = datetime('now')`,
+  );
+  upsert.run(SENTINEL.entryId, SENTINEL.modelName);
+  db.close();
+}
+
+// ── Stage runners ─────────────────────────────────────────────────────────────
+
+/**
+ * Runs the reverse-link stage once: skipped when the sentinel row already
+ * marks it complete, otherwise `reverseLink()` is called and the stage is
+ * then recorded as complete.
+ */
+function runReverseLinkStage(): void {
+  if (!isReverseLinkDone()) {
+    console.log("\n  [reverse link] Syncing reverse translation links...");
+    // NOTE(review): the stage is marked complete right after this call, which
+    // assumes reverseLink() is synchronous — confirm against its definition.
+    reverseLink();
+    markReverseLinkComplete();
+  } else {
+    console.log("\n  [reverse link] Already complete, skipping");
+  }
+}
+
+/**
+ * Runs round-1 enrichment for a single provider.
+ *
+ * Awaits `enrich(provider)` and adds the returned `processed`, `skipped` and
+ * `needsReview` counts to the matching counters on `stats`, then appends the
+ * provider name to `stats.modelsRun`.
+ */
+async function runRound1(
+  provider: ProviderConfig,
+  stats: RunStats,
+): Promise<void> {
  console.log(`\n  [round 1] Running ${provider.name}...`);
-  // TODO: implement round 1 enrich script
-  console.log(`  [round 1] ${provider.name} — not yet implemented`);
+  const counts = await enrich(provider);
+  stats.recordsProcessed += counts.processed;
+  stats.recordsSkipped += counts.skipped;
+  stats.needsReview += counts.needsReview;
  stats.modelsRun.push(provider.name);
 }

@ -247,7 +291,7 @@ function runMerge(): void {
 }

 function runTiebreak(stats: RunStats): void {
-  console.log("\n  [tiebreak] Resolving flagged translations...");
+  console.log("\n  [tiebreak] Resolving flagged entries...");
  // TODO: implement tiebreak logic
  console.log("  [tiebreak] not yet implemented");
  stats.currentStage = "tiebreak";
@ -265,19 +309,19 @@ function runCompare(): void {
 async function generateReport(runName: string, stats: RunStats): Promise<void> {
  const db = openDb();

-  const totalSynsets = (
-    db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
+  const totalEntries = (
+    db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
      count: number;
    }
  ).count;

-  const resolvedTranslations = (
-    db.prepare("SELECT COUNT(*) as count FROM resolved_translations").get() as {
+  const resolvedEntries = (
+    db.prepare("SELECT COUNT(*) as count FROM resolved_entry_cefr").get() as {
      count: number;
    }
  ).count;

-  const flaggedTranslations = (
+  const flaggedEntries = (
    db
      .prepare(
        `SELECT COUNT(*) as count FROM run_status
@ -302,7 +346,7 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
  const durationMin = Math.round(durationMs / 60_000);

  const isFinal =
-    getSentinelStatus("compare") === "complete" && flaggedTranslations === 0;
+    getSentinelStatus("compare") === "complete" && flaggedEntries === 0;

  const report = {
    runName,
@ -310,15 +354,16 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
    durationMinutes: durationMin,
    isFinal,
    progress: {
-      totalSynsets,
-      resolvedTranslations,
-      flaggedTranslations,
+      totalEntries,
+      resolvedEntries,
+      flaggedEntries,
      needsReview,
      recordsProcessedThisRun: stats.recordsProcessed,
      recordsSkippedThisRun: stats.recordsSkipped,
    },
    modelsRun: stats.modelsRun,
    stages: {
+      reverseLink: isReverseLinkDone() ? "complete" : "pending",
      round1: ALL_PROVIDERS.map((p) => ({
        model: p.name,
        status: getModelRound1Status(p.name),
@ -354,15 +399,17 @@ async function generateReport(runName: string, stats: RunStats): Promise<void> {
    ``,
    `| Metric | Value |`,
    `| ------ | ----- |`,
-    `| Total synsets | ${totalSynsets.toLocaleString()} |`,
-    `| Resolved translations | ${resolvedTranslations.toLocaleString()} |`,
-    `| Flagged translations | ${flaggedTranslations.toLocaleString()} |`,
+    `| Total entries | ${totalEntries.toLocaleString()} |`,
+    `| Resolved entries | ${resolvedEntries.toLocaleString()} |`,
+    `| Flagged entries | ${flaggedEntries.toLocaleString()} |`,
    `| Needs review | ${needsReview.toLocaleString()} |`,
    `| Records processed this run | ${stats.recordsProcessed.toLocaleString()} |`,
    `| Records skipped this run | ${stats.recordsSkipped.toLocaleString()} |`,
    ``,
    `## Stage status`,
    ``,
+    `### Reverse link: ${report.stages.reverseLink}`,
+    ``,
    `### Round 1`,
    ``,
    ...report.stages.round1.map(
@ -403,7 +450,7 @@ async function main(): Promise<void> {

  // ── Startup checks
  console.log("Checking prerequisites...");
-  await checkOmwExists();
+  await checkExtractedFilesExist();
  await checkAndInitDb();
  await checkAndImportDb();
  console.log("  Prerequisites OK");
@ -425,6 +472,14 @@ async function main(): Promise<void> {

  registerShutdownHandler(stats);

+  // ── Stage 2 — Reverse link
+  runReverseLinkStage();
+
+  if (shutdownRequested) {
+    await generateReport(runName, stats);
+    process.exit(0);
+  }
+
  // ── Round 1
  console.log("\nRound 1 — generation");
  for (const provider of ALL_PROVIDERS) {
@ -444,7 +499,7 @@ async function main(): Promise<void> {
      console.log(`  [round 1] ${provider.name} — resuming...`);
    }

-    runRound1(provider, stats);
+    await runRound1(provider, stats);
  }

  if (shutdownRequested) {
@ -548,9 +603,9 @@ async function main(): Promise<void> {
    runCompare();
  }

-  // ── Report
-  stats.stoppedAt = new Date();
-  await generateReport(runName, stats);
+  // ── Report (disabled until full pipeline is implemented)
+  // stats.stoppedAt = new Date();
+  // await generateReport(runName, stats);

  console.log("\nPipeline complete.");
 }
--- a/data-pipeline/stage-1-extract/scripts/extract.py
+++ b/data-pipeline/stage-1-extract/scripts/extract.py
@ -1,204 +0,0 @@
-"""
-data-pipeline/stage-1-extract/scripts/extract.py
-
-Extract all synsets from the Open Multilingual Wordnet (OMW) for all
-supported languages and parts of speech.
-
-Output: one JSON file per language, written to stage-1-extract/output/
-  en.json, it.json, es.json, de.json, fr.json
-
-Each file is a JSON array of synset records:
-  {
-    "source_id": "ili:i12345",
-    "pos": "noun",
-    "translations": { "en": ["dog", "canine"], "it": ["cane"] },
-    "glosses":      { "en": ["a domesticated animal..."] },
-    "examples":     { "en": ["the dog barked at the stranger"] }
-  }
-
-Usage:
-  python stage-1-extract/scripts/extract.py
-  python stage-1-extract/scripts/extract.py --sample
-
-Prerequisites:
-  pip install wn
-  python -m wn download omw-en:1.4
-  python -m wn download omw-it:1.4
-  python -m wn download omw-de:1.4
-  python -m wn download omw-es:1.4
-  python -m wn download omw-fr:1.4
-"""
-
-import json
-import sys
-from pathlib import Path
-
-import wn
-
-SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]
-POS_MAP: dict[str, str] = {
-    "n": "noun",
-    "v": "verb",
-    "a": "adjective",
-    "s": "adjective",  # adjective satellite — collapsed into adjective
-    "r": "adverb",
-}
-
-
-def extract_all(
-    output_dir: str = "stage-1-extract/output", sample: bool = False
-) -> None:
-    out = Path(output_dir)
-    out.mkdir(parents=True, exist_ok=True)
-
-    sample_size = 100 if sample else None
-
-    # Load one Wordnet object per language up front.
-    print("Loading wordnets...")
-    wordnets: dict[str, wn.Wordnet] = {}
-    for lang in SUPPORTED_LANGUAGE_CODES:
-        try:
-            wordnets[lang] = wn.Wordnet(lang=lang)
-            synset_count = len(wordnets[lang].synsets())
-            print(f"  {lang}: {synset_count:,} total synsets")
-        except wn.Error as e:
-            print(f"  ERROR loading {lang}: {e}")
-            print(f"  Run: python -m wn download omw-{lang}:1.4")
-            sys.exit(1)
-
-    # Collect per-ILI data across all languages and POS.
-    print("\nExtracting synsets...")
-    by_ili: dict[str, dict] = {}
-
-    for lang, wnet in wordnets.items():
-        for omw_pos, pos_label in POS_MAP.items():
-            synsets = wnet.synsets(pos=omw_pos)
-            covered = 0
-            for synset in synsets:
-                ili = synset.ili
-                if not ili:
-                    continue
-                covered += 1
-
-                lemmas = list(dict.fromkeys(str(lemma) for lemma in synset.lemmas()))
-                defns = [d for d in synset.definitions() if d]
-                examples = [e for e in synset.examples() if e]
-
-                if ili not in by_ili:
-                    by_ili[ili] = {"pos": pos_label}
-
-                if lang not in by_ili[ili]:
-                    by_ili[ili][lang] = {
-                        "lemmas": lemmas,
-                        "glosses": defns,
-                        "examples": examples,
-                    }
-                else:
-                    # ILI already exists for this language — merge data.
-                    # Happens when 'a' and 's' both map to adjective for the
-                    # same ILI. Deduplicate to avoid repeated entries.
-                    existing = by_ili[ili][lang]
-                    existing["lemmas"] = list(
-                        dict.fromkeys(existing["lemmas"] + lemmas)
-                    )
-                    existing["glosses"] = list(
-                        dict.fromkeys(existing["glosses"] + defns)
-                    )
-                    existing["examples"] = list(
-                        dict.fromkeys(existing["examples"] + examples)
-                    )
-
-            print(f"  {lang} {pos_label}: {covered:,} synsets with ILI")
-
-    # Build records and write single combined output file.
-    print("\nBuilding records...")
-    ilis = sorted(by_ili.keys())
-    if sample_size:
-        ilis = ilis[:sample_size]
-
-    records: list[dict] = []
-    for ili in ilis:
-        data = by_ili[ili]
-        record: dict = {
-            "source_id": f"ili:{ili}",
-            "pos": data["pos"],
-            "translations": {},
-            "glosses": {},
-            "examples": {},
-        }
-
-        for key, value in data.items():
-            if key == "pos":
-                continue
-            lang = key
-            if value["lemmas"]:
-                record["translations"][lang] = value["lemmas"]
-            if value["glosses"]:
-                record["glosses"][lang] = value["glosses"]
-            if value["examples"]:
-                record["examples"][lang] = value["examples"]
-
-        records.append(record)
-
-    output_file = out / "omw.json"
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(records, f, indent=2, ensure_ascii=False)
-
-    print(f"\nWrote {len(records):,} synsets → {output_file}")
-    _print_coverage(records)
-
-
-def _print_coverage(records: list[dict]) -> None:
-    """Print per-language translation, gloss, and example counts."""
-    lang_stats: dict[str, dict[str, int]] = {}
-    for lang in SUPPORTED_LANGUAGE_CODES:
-        lang_stats[lang] = {"translations": 0, "glosses": 0, "examples": 0}
-
-    pos_stats: dict[str, int] = {}
-
-    for r in records:
-        pos = r["pos"]
-        pos_stats[pos] = pos_stats.get(pos, 0) + 1
-
-        for lang, lemmas in r["translations"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["translations"] += len(lemmas)
-        for lang, gloss_list in r["glosses"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["glosses"] += len(gloss_list)
-        for lang, example_list in r["examples"].items():
-            if lang in lang_stats:
-                lang_stats[lang]["examples"] += len(example_list)
-
-    print("\nPOS breakdown:")
-    for pos, count in sorted(pos_stats.items()):
-        print(f"  {pos}: {count:,}")
-
-    print("\nCoverage per language:")
-    for lang, counts in lang_stats.items():
-        t = counts["translations"]
-        g = counts["glosses"]
-        e = counts["examples"]
-        total = len(records)
-        print(
-            f"  {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {(t / total):.1f} translations/synset)"
-        )
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
-    parser.add_argument(
-        "--output-dir",
-        default="stage-1-extract/output",
-        help="Output directory for JSON files",
-    )
-    parser.add_argument(
-        "--sample",
-        action="store_true",
-        help="Extract only 100 synsets per language for inspection",
-    )
-    args = parser.parse_args()
-
-    extract_all(output_dir=args.output_dir, sample=args.sample)
--- a/data-pipeline/stage-1-extract/scripts/extract.ts
+++ b/data-pipeline/stage-1-extract/scripts/extract.ts
@ -0,0 +1,257 @@
+import fs from "node:fs";
+import path from "node:path";
+import readline from "node:readline";
+import { fileURLToPath } from "node:url";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/**
+ * One translation item as read from a Kaikki sense. All fields are optional
+ * because lines are parsed with a plain cast, without validation. The target
+ * language is read from `code`, falling back to `lang_code`.
+ */
+type KaikkiTranslation = {
+  code?: string;
+  lang_code?: string;
+  word?: string;
+  sense?: string;
+};
+
+/**
+ * One sense of a Kaikki entry. Only the first gloss, the example texts and
+ * the translations are read by this script.
+ */
+type KaikkiSense = {
+  glosses?: string[];
+  examples?: { text?: string }[];
+  translations?: KaikkiTranslation[];
+};
+
+/**
+ * Shape assumed for one parsed line of a Kaikki JSONL file. The cast from
+ * JSON.parse is unchecked, so every field may be missing.
+ */
+type KaikkiEntry = {
+  word?: string;
+  pos?: string;
+  lang_code?: string;
+  senses?: KaikkiSense[];
+};
+
+/**
+ * One extracted sense, as written to the per-language output JSON.
+ *
+ * `sense_index` is a running counter per headword+pos within one language
+ * file, starting at 0. `translations` is always empty for non-English
+ * sources.
+ */
+export type ExtractedSense = {
+  headword: string;
+  language: SupportedLanguageCode;
+  pos: SupportedPos;
+  sense_index: number;
+  gloss: string | null;
+  examples: string[];
+  translations: {
+    target_lang: SupportedLanguageCode;
+    word: string;
+    sense_hint: string | null;
+  }[];
+};
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+// Directory of this script, derived from the module URL.
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+// Input (Kaikki JSONL dumps) and output (per-language JSON) directories,
+// resolved relative to this script's directory.
+const SOURCES_DIR = path.resolve(__dirname, "../sources");
+const OUTPUT_DIR = path.resolve(__dirname, "../output");
+
+// Source file name inside SOURCES_DIR for each supported language.
+const LANG_TO_FILE: Record<SupportedLanguageCode, string> = {
+  en: "kaikki.org-dictionary-English.jsonl",
+  de: "kaikki.org-dictionary-German.jsonl",
+  it: "kaikki.org-dictionary-Italian.jsonl",
+  fr: "kaikki.org-dictionary-French.jsonl",
+  es: "kaikki.org-dictionary-Spanish.jsonl",
+};
+
+// Kaikki `pos` tag → supported POS. Entries whose tag is not listed here are
+// dropped by processEntry.
+const POS_MAP: Record<string, SupportedPos> = {
+  noun: "noun",
+  verb: "verb",
+  adj: "adjective",
+  adv: "adverb",
+};
+
+// Set form of the supported language codes, used to filter translation
+// targets.
+const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/**
+ * Translates a Kaikki part-of-speech tag into a supported POS.
+ *
+ * @param kaikkiPos - Raw `pos` value from a Kaikki entry.
+ * @returns The mapped POS, or `null` when the tag is not listed in POS_MAP.
+ */
+function mapPos(kaikkiPos: string): SupportedPos | null {
+  // Own-property check: a bare lookup on a plain object would resolve
+  // inherited keys such as "constructor" or "toString" to non-null values,
+  // which callers would then treat as a valid POS.
+  if (!Object.prototype.hasOwnProperty.call(POS_MAP, kaikkiPos)) return null;
+  return POS_MAP[kaikkiPos] ?? null;
+}
+
+/**
+ * Tells whether a gloss merely points at another word as its abbreviation,
+ * i.e. begins with "abbreviation of" in any letter case.
+ */
+function isAbbreviation(gloss: string): boolean {
+  const lowered = gloss.toLowerCase();
+  return lowered.startsWith("abbreviation of");
+}
+
+/**
+ * Collects the translations of one sense that target a supported language.
+ *
+ * Items are skipped when they have no language code, target an unsupported
+ * language, target the source language itself, or have a blank word.
+ * Duplicates (same language and trimmed word) are kept once, first wins.
+ *
+ * @param sense - Kaikki sense whose `translations` are read.
+ * @param sourceLang - Language of the entry being processed.
+ * @returns Translations in input order; `sense_hint` is `null` when the
+ *   source has no hint or only a blank one.
+ */
+function extractTranslations(
+  sense: KaikkiSense,
+  sourceLang: SupportedLanguageCode,
+): ExtractedSense["translations"] {
+  const seen = new Set<string>();
+  const result: ExtractedSense["translations"] = [];
+
+  for (const t of sense.translations ?? []) {
+    const code = t.code ?? t.lang_code;
+    if (!code || !SUPPORTED_LANG_SET.has(code)) continue;
+    if (code === sourceLang) continue; // skip same-language translations
+
+    const word = t.word?.trim();
+    if (!word) continue;
+
+    const key = `${code}:${word}`;
+    if (seen.has(key)) continue;
+    seen.add(key);
+
+    result.push({
+      target_lang: code as SupportedLanguageCode,
+      word,
+      // `||` rather than `??`: a blank hint trims to "" and must become null.
+      sense_hint: t.sense?.trim() || null,
+    });
+  }
+
+  return result;
+}
+
+/**
+ * Returns the trimmed, non-empty example texts of a sense, in input order.
+ */
+function extractExamples(sense: KaikkiSense): string[] {
+  const texts: string[] = [];
+  for (const example of sense.examples ?? []) {
+    const trimmed = example.text?.trim();
+    if (trimmed) texts.push(trimmed);
+  }
+  return texts;
+}
+
+/**
+ * Converts one Kaikki entry into zero or more extracted senses (without
+ * `sense_index`, which the caller assigns).
+ *
+ * Returns an empty array when the entry's POS is unsupported, its word is
+ * blank, or — for non-English sources — its `lang_code` differs from
+ * `sourceLang`. Within an entry, senses whose first gloss starts with
+ * "abbreviation of" are skipped. English senses are kept only when they have
+ * at least one translation into a supported language; non-English senses are
+ * kept with an empty translation list.
+ *
+ * @param entry - Parsed Kaikki JSONL line.
+ * @param sourceLang - Language of the file being processed.
+ * @returns Extracted senses in source order.
+ */
+function processEntry(
+  entry: KaikkiEntry,
+  sourceLang: SupportedLanguageCode,
+): Omit<ExtractedSense, "sense_index">[] {
+  const pos = mapPos(entry.pos ?? "");
+  if (!pos) return [];
+
+  const headword = entry.word?.trim();
+  if (!headword) return [];
+
+  // For non-English files, only process entries in the target language.
+  // `lang_code` is declared on KaikkiEntry, so no cast is needed to read it.
+  const isEnglish = sourceLang === "en";
+  if (!isEnglish && entry.lang_code !== sourceLang) return [];
+
+  const results: Omit<ExtractedSense, "sense_index">[] = [];
+
+  for (const sense of entry.senses ?? []) {
+    // `||` rather than `??`: a blank gloss trims to "" and must become null.
+    const gloss = sense.glosses?.[0]?.trim() || null;
+
+    if (gloss && isAbbreviation(gloss)) continue;
+
+    // English senses need at least one supported translation; other
+    // languages are extracted without translations.
+    const translations = isEnglish
+      ? extractTranslations(sense, sourceLang)
+      : [];
+    if (isEnglish && translations.length === 0) continue;
+
+    results.push({
+      headword,
+      language: sourceLang,
+      pos,
+      gloss,
+      examples: extractExamples(sense),
+      translations,
+    });
+  }
+
+  return results;
+}
+
+// ── Extract ───────────────────────────────────────────────────────────────────
+
+/**
+ * Streams one language's Kaikki JSONL file, extracts its senses and writes
+ * them as a JSON array to `<OUTPUT_DIR>/<lang>.json`.
+ *
+ * Blank lines are ignored and unparseable lines are skipped with a warning.
+ * `sense_index` is assigned per headword+pos in encounter order, from 0.
+ *
+ * @param lang - Language whose source file (see LANG_TO_FILE) is read.
+ * @param sampleLimit - Optional cap on the number of entries that yield at
+ *   least one sense; reading stops once it is reached. Omitted or 0 means no
+ *   cap.
+ * @returns Resolves once the output file has been written. Rejects if the
+ *   source cannot be read or the output cannot be written.
+ */
+export async function extract(
+  lang: SupportedLanguageCode,
+  sampleLimit?: number,
+): Promise<void> {
+  const filename = LANG_TO_FILE[lang];
+  const sourcePath = path.join(SOURCES_DIR, filename);
+  const outputPath = path.join(OUTPUT_DIR, `${lang}.json`);
+
+  console.log(`\nExtracting ${lang}...`);
+  console.log(`  Source: ${sourcePath}`);
+  if (sampleLimit) console.log(`  Sample mode: ${sampleLimit} entries`);
+
+  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
+
+  const fileStream = fs.createReadStream(sourcePath);
+  const rl = readline.createInterface({
+    input: fileStream,
+    crlfDelay: Infinity,
+  });
+
+  const senses: ExtractedSense[] = [];
+  const senseIndexMap = new Map<string, number>();
+  let linesRead = 0; // counts non-blank lines only
+  let entriesProcessed = 0;
+  let entriesSkipped = 0;
+
+  try {
+    for await (const line of rl) {
+      if (!line.trim()) continue;
+      if (sampleLimit && entriesProcessed >= sampleLimit) break;
+
+      linesRead++;
+
+      let entry: KaikkiEntry;
+      try {
+        entry = JSON.parse(line) as KaikkiEntry;
+      } catch {
+        console.warn(`  Warning: failed to parse line ${linesRead}, skipping`);
+        continue;
+      }
+
+      const extracted = processEntry(entry, lang);
+
+      if (extracted.length === 0) {
+        entriesSkipped++;
+        continue;
+      }
+
+      for (const sense of extracted) {
+        const key = `${sense.headword}|${sense.pos}`;
+        const senseIndex = senseIndexMap.get(key) ?? 0;
+        senseIndexMap.set(key, senseIndex + 1);
+        senses.push({ ...sense, sense_index: senseIndex });
+      }
+
+      entriesProcessed++;
+
+      if (entriesProcessed % 10_000 === 0) {
+        console.log(
+          `  Processed ${entriesProcessed.toLocaleString()} entries...`,
+        );
+      }
+    }
+  } finally {
+    // Closing the readline interface does not close its input stream, so an
+    // early exit (sample limit or an error) would otherwise leave the file
+    // descriptor open. Release both explicitly.
+    rl.close();
+    fileStream.destroy();
+  }
+
+  await fs.promises.writeFile(
+    outputPath,
+    JSON.stringify(senses, null, 2),
+    "utf-8",
+  );
+
+  console.log(`  Lines read:        ${linesRead.toLocaleString()}`);
+  console.log(`  Entries processed: ${entriesProcessed.toLocaleString()}`);
+  console.log(`  Entries skipped:   ${entriesSkipped.toLocaleString()}`);
+  console.log(`  Senses extracted:  ${senses.length.toLocaleString()}`);
+  console.log(`  Output:            ${outputPath}`);
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/**
+ * Script entry point: extracts every supported language one after another,
+ * each capped at a fixed sample size.
+ */
+async function main(): Promise<void> {
+  // Development-only cap, hardcoded — drop it to run a full extraction.
+  const sampleLimit = 500;
+
+  for (const languageCode of SUPPORTED_LANGUAGE_CODES) {
+    await extract(languageCode, sampleLimit);
+  }
+
+  console.log("\nExtraction complete.");
+}
+
+// Run main() only when this file is the executed script, not when imported.
+// Filesystem paths are compared instead of a hand-built `file://` string:
+// import.meta.url is percent-encoded (and has a different shape on Windows),
+// so the string comparison never matches for paths with spaces or non-ASCII
+// characters, and the script would then exit without doing anything.
+if (fileURLToPath(import.meta.url) === process.argv[1]) {
+  main().catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}
--- a/data-pipeline/stage-2-annotate/scripts/annotate.ts
+++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts
@ -1,227 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
-
-// ── Types ────────────────────────────────────────────────────────────────────
-
-type OmwExample = { text: string; source: "omw" };
-
-type CefrExample = { text: string; source: "cefr" };
-
-type Example = OmwExample | CefrExample;
-
-type OmwRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, string[]>>;
-};
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
-
-type CefrSourceEntry = {
-  word: string;
-  pos: string;
-  cefr_level: string;
-  example_sentence_native?: string;
-};
-
-type ConflictEntry = {
-  word: string;
-  pos: string;
-  language: SupportedLanguageCode;
-  levels: string[];
-};
-
-// ── Constants ─────────────────────────────────────────────────────────────────
-
-const POS_NORMALIZE: Record<string, SupportedPos> = {
-  noun: "noun",
-  n: "noun",
-  nom: "noun", // French
-  verb: "verb",
-  verbs: "verb",
-  v: "verb",
-  v1: "verb",
-  adjective: "adjective",
-  adjektiv: "adjective", // German
-  adj: "adjective",
-  adverb: "adverb",
-  adverbs: "adverb",
-  adv: "adverb",
-};
-
-const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
-
-const PATHS = {
-  omw: "stage-1-extract/output/omw.json",
-  cefrDir: "stage-2-annotate/sources/cefr",
-  outputDir: "stage-2-annotate/output",
-};
-
-// ── CEFR source loading ───────────────────────────────────────────────────────
-
-type CefrIndex = Map<string, { level: string; example?: string }>;
-
-async function loadCefrSource(
-  lang: SupportedLanguageCode,
-): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
-  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
-  const raw = await fs.readFile(filepath, "utf-8");
-  const entries = JSON.parse(raw) as CefrSourceEntry[];
-
-  // First pass — detect conflicts.
-  // Structure: "word|pos" -> Set of CEFR levels seen
-  const seen = new Map<string, Set<string>>();
-
-  for (const entry of entries) {
-    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
-    if (!pos) continue;
-    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
-
-    const key = `${entry.word.toLowerCase().trim()}|${pos}`;
-    if (!seen.has(key)) seen.set(key, new Set());
-    seen.get(key)!.add(entry.cefr_level);
-  }
-
-  const conflicts: ConflictEntry[] = [];
-  for (const [key, levels] of seen.entries()) {
-    if (levels.size > 1) {
-      const [word, pos] = key.split("|") as [string, string];
-      conflicts.push({ word, pos, language: lang, levels: [...levels] });
-    }
-  }
-
-  // Second pass — build index, skip conflicting entries.
-  const conflictKeys = new Set(conflicts.map((c) => `${c.word}|${c.pos}`));
-
-  const index: CefrIndex = new Map();
-  for (const entry of entries) {
-    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
-    if (!pos) continue;
-    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
-
-    const key = `${entry.word.toLowerCase().trim()}|${pos}`;
-    if (conflictKeys.has(key)) continue;
-
-    index.set(key, {
-      level: entry.cefr_level,
-      ...(entry.example_sentence_native
-        ? { example: entry.example_sentence_native }
-        : {}),
-    });
-  }
-
-  return { index, conflicts };
-}
-
-// ── Annotation ────────────────────────────────────────────────────────────────
-
-async function annotate(): Promise<void> {
-  // Load OMW records
-  console.log("Reading OMW extract...");
-  const raw = await fs.readFile(PATHS.omw, "utf-8");
-  const omwRecords = JSON.parse(raw) as OmwRecord[];
-  console.log(`  Loaded ${omwRecords.length.toLocaleString()} synsets`);
-
-  // Load CEFR sources for all languages
-  console.log("\nLoading CEFR source files...");
-  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
-  const allConflicts: ConflictEntry[] = [];
-
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    const { index, conflicts } = await loadCefrSource(lang);
-    cefrIndexes.set(lang, index);
-    allConflicts.push(...conflicts);
-    console.log(
-      `  ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`,
-    );
-  }
-
-  // Write conflicts file
-  await fs.mkdir(PATHS.outputDir, { recursive: true });
-  await fs.writeFile(
-    path.join(PATHS.outputDir, "conflicts.json"),
-    JSON.stringify(allConflicts, null, 2),
-    "utf-8",
-  );
-  console.log(
-    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
-  );
-
-  // Annotate and write one file per language
-  console.log("\nAnnotating...");
-  for (const lang of SUPPORTED_LANGUAGE_CODES) {
-    const index = cefrIndexes.get(lang)!;
-    const records: AnnotatedRecord[] = [];
-    let matched = 0;
-
-    for (const record of omwRecords) {
-      const annotated: AnnotatedRecord = {
-        source_id: record.source_id,
-        pos: record.pos,
-        translations: record.translations,
-        glosses: record.glosses,
-        examples: {},
-        votes: {},
-      };
-
-      // Convert OMW examples to typed format
-      for (const [l, exList] of Object.entries(record.examples)) {
-        annotated.examples[l as SupportedLanguageCode] = exList.map((text) => ({
-          text,
-          source: "omw" as const,
-        }));
-      }
-
-      // Match translations for this language against CEFR index
-      const langTranslations = record.translations[lang] ?? [];
-      for (const word of langTranslations) {
-        const key = `${word.toLowerCase().trim()}|${record.pos}`;
-        const cefrEntry = index.get(key);
-        if (!cefrEntry) continue;
-
-        matched++;
-
-        // Add CEFR vote
-        if (!annotated.votes[lang]) annotated.votes[lang] = {};
-        annotated.votes[lang][word] = { cefr_source: cefrEntry.level };
-
-        // Add native example if present
-        if (cefrEntry.example) {
-          if (!annotated.examples[lang]) annotated.examples[lang] = [];
-          annotated.examples[lang].push({
-            text: cefrEntry.example,
-            source: "cefr" as const,
-          });
-        }
-      }
-
-      records.push(annotated);
-    }
-
-    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
-    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
-    console.log(
-      `  ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
-    );
-  }
-}
-
-// ── Main ─────────────────────────────────────────────────────────────────────
-
-annotate().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
--- a/data-pipeline/stage-2-annotate/sources/cefr/de.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/de.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/en.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/en.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/es.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/es.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/fr.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/fr.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/it.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/it.json
--- a/data-pipeline/stage-2-reverse-link/scripts/reverse-link.ts
+++ b/data-pipeline/stage-2-reverse-link/scripts/reverse-link.ts
@ -0,0 +1,109 @@
+import { openDb } from "../../db/index.js";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/**
+ * One row of the `translations JOIN entries` query run by reverseLink():
+ * a translation plus the language and headword of the entry it belongs to.
+ */
+type TranslationRow = {
+  translation_id: number;
+  entry_id: number;
+  entry_language: string;
+  entry_headword: string;
+  target_lang: string;
+  word: string;
+  sense_hint: string | null;
+};
+
+/** Result of the entry lookup in reverseLink() — only the id is selected. */
+type EntryRow = { id: number };
+
+// ── Sync ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Adds the mirror image of every existing translation.
+ *
+ * For each translation "entry (language A) → word (language B)", looks for an
+ * entry whose headword is that word in language B and, if one exists, inserts
+ * the reverse translation "that entry → source headword (language A)" with
+ * source 'reverse_link'. Rows that already exist are left untouched.
+ *
+ * All inserts run in a single transaction, and the database handle is closed
+ * even when the query or the transaction throws.
+ */
+export function reverseLink(): void {
+  const db = openDb();
+  const counts = { inserted: 0, skipped: 0, noEntry: 0 };
+
+  try {
+    // Find all translations and their source entry details
+    const translations = db
+      .prepare(
+        `SELECT
+          t.id          AS translation_id,
+          t.entry_id,
+          e.language    AS entry_language,
+          e.headword    AS entry_headword,
+          t.target_lang,
+          t.word,
+          t.sense_hint
+         FROM translations t
+         JOIN entries e ON e.id = t.entry_id`,
+      )
+      .all() as TranslationRow[];
+
+    console.log(
+      `  Found ${translations.length.toLocaleString()} translations to check`,
+    );
+
+    // LIMIT 1: when several entries share headword + language, only the first
+    // one returned receives the reverse link.
+    const findEntry = db.prepare(
+      `SELECT id FROM entries WHERE headword = ? AND language = ? LIMIT 1`,
+    );
+
+    const insertReverseLink = db.prepare(
+      `INSERT INTO translations (entry_id, target_lang, word, sense_hint, source)
+       VALUES (?, ?, ?, ?, 'reverse_link')
+       ON CONFLICT (entry_id, target_lang, word) DO NOTHING`,
+    );
+
+    const sync = db.transaction(() => {
+      for (const t of translations) {
+        // Look for an entry in the target language with the translation word as headword
+        const targetEntry = findEntry.get(t.word, t.target_lang) as
+          | EntryRow
+          | undefined;
+
+        if (!targetEntry) {
+          counts.noEntry++;
+          continue;
+        }
+
+        // Insert reverse link: target entry → source language → source headword
+        const result = insertReverseLink.run(
+          targetEntry.id,
+          t.entry_language,
+          t.entry_headword,
+          t.sense_hint,
+        );
+
+        // changes === 0 means ON CONFLICT DO NOTHING fired: the link already exists.
+        if (result.changes > 0) {
+          counts.inserted++;
+        } else {
+          counts.skipped++;
+        }
+      }
+    });
+
+    sync();
+  } finally {
+    db.close();
+  }
+
+  console.log(`  Inserted: ${counts.inserted.toLocaleString()} reverse links`);
+  console.log(
+    `  Skipped:  ${counts.skipped.toLocaleString()} (already existed)`,
+  );
+  console.log(
+    `  No entry: ${counts.noEntry.toLocaleString()} (target word not in entries)`,
+  );
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/** CLI entry point: runs the reverse-link sync with start and end banners. */
+function main(): void {
+  const startBanner = "Running reverse link sync...";
+  const doneBanner = "\nReverse link sync complete.";
+
+  console.log(startBanner);
+  reverseLink();
+  console.log(doneBanner);
+}
+
+// Only run main() when this file is the script node was started with.
+if (`file://${process.argv[1]}` === import.meta.url) {
+  main();
+}
--- a/data-pipeline/stage-3-enrich/config.ts
+++ b/data-pipeline/stage-3-enrich/config.ts
@ -20,12 +20,20 @@ export type ProviderConfig = {

 // ── Local llama.cpp ───────────────────────────────────────────────────────────

+/**
+ * Provider entry for a locally served Qwen 3.5 4B model.
+ * NOTE(review): shares baseURL 127.0.0.1:8080 with LOCAL_GEMMA4, so presumably
+ * only whichever model the local server has loaded actually answers — confirm.
+ */
+export const LOCAL_QWEN35_4B: ProviderConfig = {
+  name: "local-qwen3.5-4b",
+  baseURL: "http://127.0.0.1:8080/v1",
+  apiKey: "none",
+  model: "qwen3.5-4b",
+  maxTokens: 1024, // no reasoning overhead so 1024 is enough
+};
+
 export const LOCAL_GEMMA4: ProviderConfig = {
  name: "local-gemma4-e4b",
  baseURL: "http://127.0.0.1:8080/v1",
  apiKey: "none", // llama.cpp ignores this
  model: "gemma4-e4b", // llama.cpp ignores model name, uses loaded model
-  maxTokens: 512,
+  maxTokens: 2048,
 };

 export const LOCAL_QWEN7B: ProviderConfig = {
@ -87,13 +95,14 @@ export const ANTHROPIC_SONNET: ProviderConfig = {
 // Add new providers here to include them in the voting pool.

 export const ALL_PROVIDERS: ProviderConfig[] = [
-  LOCAL_GEMMA4,
-  LOCAL_QWEN7B,
-  OR_QWEN3_480B,
-  OR_GEMMA4_31B,
-  OR_QWEN3_80B,
-  OR_NEMOTRON,
-  ANTHROPIC_SONNET,
+  LOCAL_QWEN35_4B,
+  // LOCAL_GEMMA4,
+  // LOCAL_QWEN7B,
+  // OR_QWEN3_480B,
+  // OR_GEMMA4_31B,
+  // OR_QWEN3_80B,
+  // OR_NEMOTRON,
+  // ANTHROPIC_SONNET,
 ];

 // ── Key validation ────────────────────────────────────────────────────────────
--- a/data-pipeline/stage-3-enrich/scripts/enrich.ts
+++ b/data-pipeline/stage-3-enrich/scripts/enrich.ts
@ -0,0 +1,877 @@
+import { openDb } from "../../db/index.js";
+import type { ProviderConfig } from "../config.js";
+import { CEFR_LEVELS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode } from "@lila/shared";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/**
+ * A row of the `entries` table as returned by `SELECT * FROM entries`.
+ */
+type EntryRow = {
+  id: number;
+  headword: string;
+  language: SupportedLanguageCode;
+  pos: string;
+  // Read by buildGlossPrompt ("sense N"). Assumes the entries table carries the
+  // per-headword|pos sense counter produced by stage 1 — TODO confirm against schema.
+  sense_index: number;
+  gloss: string | null;
+  examples: string; // JSON array string
+};
+
+/** The `id, target_lang, word` columns of a translations row, as selected in enrich(). */
+type TranslationRow = {
+  id: number;
+  target_lang: SupportedLanguageCode;
+  word: string;
+};
+
+/** Parsed gloss verdict: accepted as-is, or replaced by an improved gloss. */
+type GlossResult = { status: "ok" } | { status: "improved"; gloss: string };
+
+/** Parsed example verdict: accepted as-is, or replaced by an improved example. */
+type ExampleResult = { status: "ok" } | { status: "improved"; example: string };
+
+/**
+ * Parsed translation review: an "ok"/"reject" vote per language and word, plus
+ * optional newly generated translations keyed by language.
+ */
+type TranslationResult = {
+  translations: Partial<
+    Record<SupportedLanguageCode, Record<string, "ok" | "reject">>
+  >;
+  generated?: Partial<Record<SupportedLanguageCode, string>>;
+};
+
+/** Parsed CEFR assignment for the headword and for each translation word. */
+type CefrResult = {
+  headword_cefr: string;
+  translation_cefr: Partial<
+    Record<SupportedLanguageCode, Record<string, string>>
+  >;
+};
+
+/** Values stored in the `stage` column of run_status for this script. */
+type SubStage =
+  | "round1_gloss"
+  | "round1_example"
+  | "round1_translations"
+  | "round1_cefr";
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+// Set views of the shared constants, typed as Set<string> so that arbitrary
+// strings coming back from the model can be checked with .has().
+const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
+const CEFR_SET = new Set<string>(CEFR_LEVELS);
+
+// ── Shutdown ──────────────────────────────────────────────────────────────────
+
+let shutdownRequested = false;
+let currentCallController: AbortController | null = null;
+let shutdownHandlersRegistered = false;
+
+/**
+ * Installs SIGINT/SIGTERM handlers that flag a shutdown and abort the LLM
+ * call currently in flight.
+ *
+ * Safe to call repeatedly: enrich() invokes this on every run, and without the
+ * guard each call would stack another pair of listeners on `process`.
+ */
+export function registerEnrichShutdown(): void {
+  if (shutdownHandlersRegistered) return;
+  shutdownHandlersRegistered = true;
+
+  const handler = (): void => {
+    // A second signal while shutting down is ignored.
+    if (shutdownRequested) return;
+    shutdownRequested = true;
+    console.log("\n\n  Shutdown requested — aborting current LLM call...");
+    currentCallController?.abort();
+  };
+  process.on("SIGINT", handler);
+  process.on("SIGTERM", handler);
+}
+
+// ── Prompt builders ───────────────────────────────────────────────────────────
+
+/**
+ * Builds the prompt asking the model to accept or improve an entry's gloss.
+ *
+ * `entry.examples` is parsed as a JSON array of strings; a missing gloss and an
+ * empty example list are both rendered as "none". Throws if `entry.examples`
+ * is not valid JSON.
+ */
+function buildGlossPrompt(entry: EntryRow): string {
+  const currentGloss = entry.gloss ?? "none";
+  const parsedExamples = JSON.parse(entry.examples) as string[];
+
+  let exampleLines = "  none";
+  if (parsedExamples.length > 0) {
+    exampleLines = parsedExamples.map((text) => `  - ${text}`).join("\n");
+  }
+
+  return `You are a language learning expert.
+
+Review this gloss for the ${entry.pos} "${entry.headword}" (sense ${entry.sense_index}).
+Gloss: "${currentGloss}"
+Examples of this specific sense:
+${exampleLines}
+
+Is this gloss clear, accurate for this specific sense, and suitable for a language learner?
+- If yes, respond with: {"status": "ok"}
+- If no or if gloss is "none", respond with: {"status": "improved", "gloss": "your improved gloss here"}
+
+IMPORTANT: Your improved gloss must describe THIS SPECIFIC SENSE shown by the examples above,
+not a more common or general meaning of the word.
+
+Respond ONLY with valid JSON and nothing else.`;
+}
+
+/**
+ * Builds the prompt asking the model to vote "ok"/"reject" on each existing
+ * translation and, for supported languages that have none, to generate one.
+ *
+ * @param entry          the source entry being reviewed
+ * @param translations   existing translations of that entry
+ * @param verifiedGloss  the gloss used to pin down the intended meaning
+ */
+function buildTranslationsPrompt(
+  entry: EntryRow,
+  translations: TranslationRow[],
+  verifiedGloss: string,
+): string {
+  // Bucket candidate words by target language, keeping first-seen language order.
+  const wordsByLang = new Map<SupportedLanguageCode, string[]>();
+  for (const { target_lang, word } of translations) {
+    const bucket = wordsByLang.get(target_lang);
+    if (bucket) {
+      bucket.push(word);
+    } else {
+      wordsByLang.set(target_lang, [word]);
+    }
+  }
+
+  // Supported languages, other than the entry's own, with no translation yet.
+  const missingLangs = SUPPORTED_LANGUAGE_CODES.filter(
+    (code) => code !== entry.language && !wordsByLang.has(code),
+  );
+  const hasMissing = missingLangs.length > 0;
+
+  let translationsText = "  none";
+  if (wordsByLang.size > 0) {
+    translationsText = Array.from(
+      wordsByLang,
+      ([lang, words]) => `  ${lang}: ${words.join(", ")}`,
+    ).join("\n");
+  }
+
+  const generateInstruction = hasMissing
+    ? `Also generate the single best translation for these missing languages: ${missingLangs.join(", ")}`
+    : "";
+
+  // Sample answer shown to the model; "generated" appears only when needed.
+  const exampleResponse: Record<string, unknown> = {
+    translations: {
+      de: { frei: "ok", "-frei": "reject" },
+      it: { libero: "ok", free: "reject" },
+    },
+    ...(hasMissing ? { generated: { es: "libre", fr: "libre" } } : {}),
+  };
+
+  return `You are a language learning expert.
+
+For the ${entry.language} ${entry.pos} "${entry.headword}" (meaning: "${verifiedGloss}"), review these translations:
+${translationsText}
+
+For each translation:
+- Write "ok" if it is a valid translation for this specific meaning
+- Write "reject" if it is wrong, a suffix (starts with -), garbled text, or the wrong language
+
+Examples of correct behaviour:
+- "free" listed as Italian → "reject" (it is English, not Italian)
+- "-frei" listed as German → "reject" (it is a suffix, not a standalone word)
+- "libre" listed as Spanish → "ok" (it is a valid Spanish word)
+
+${generateInstruction}
+
+Respond ONLY with valid JSON and nothing else:
+${JSON.stringify(exampleResponse, null, 2)}`;
+}
+
+/**
+ * Builds the prompt asking the model for a CEFR level for the headword and for
+ * each validated translation.
+ *
+ * @param entry                  the source entry
+ * @param verifiedGloss          the gloss describing the intended sense
+ * @param validatedTranslations  accepted translation words, keyed by language
+ */
+function buildCefrPrompt(
+  entry: EntryRow,
+  verifiedGloss: string,
+  validatedTranslations: Map<SupportedLanguageCode, string[]>,
+): string {
+  let translationsText = "  none";
+  if (validatedTranslations.size > 0) {
+    const lines: string[] = [];
+    for (const [lang, words] of validatedTranslations) {
+      lines.push(`  ${lang}: ${words.join(", ")}`);
+    }
+    translationsText = lines.join("\n");
+  }
+
+  return `You are a language learning expert.
+
+Assign CEFR levels (A1, A2, B1, B2, C1, or C2) to this word and its validated translations.
+Base your levels on how commonly a language learner at that level would encounter this specific sense.
+Consider register — slang, technical, and archaic words should be rated higher.
+
+WORD: ${entry.headword} (${entry.pos})
+MEANING: ${verifiedGloss}
+VALIDATED TRANSLATIONS:
+${translationsText}
+
+Respond ONLY with valid JSON and nothing else:
+{
+  "headword_cefr": "B1",
+  "translation_cefr": {
+    "de": { "frei": "A2" },
+    "it": { "libero": "A2" }
+  }
+}`;
+}
+
+// ── Validation ────────────────────────────────────────────────────────────────
+
+/**
+ * Parses a model reply to the gloss prompt.
+ *
+ * @returns `{status: "ok"}`, or `{status: "improved", gloss}` with the gloss
+ *          trimmed; `null` when the reply is not JSON or has any other shape
+ *          (including an empty or whitespace-only gloss).
+ */
+function validateGloss(raw: string): GlossResult | null {
+  try {
+    // Destructuring a parsed `null` throws and lands in the catch below.
+    const { status, gloss } = JSON.parse(raw) as {
+      status?: unknown;
+      gloss?: unknown;
+    };
+
+    if (status === "ok") return { status: "ok" };
+    if (status !== "improved" || typeof gloss !== "string") return null;
+
+    const trimmed = gloss.trim();
+    return trimmed ? { status: "improved", gloss: trimmed } : null;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Parses a model reply to the example prompt.
+ *
+ * @returns `{status: "ok"}`, or `{status: "improved", example}` with the
+ *          example trimmed; `null` when the reply is not JSON or has any other
+ *          shape (including an empty or whitespace-only example).
+ */
+function validateExample(raw: string): ExampleResult | null {
+  try {
+    // Destructuring a parsed `null` throws and lands in the catch below.
+    const { status, example } = JSON.parse(raw) as {
+      status?: unknown;
+      example?: unknown;
+    };
+
+    if (status === "ok") return { status: "ok" };
+    if (status !== "improved" || typeof example !== "string") return null;
+
+    const trimmed = example.trim();
+    return trimmed ? { status: "improved", example: trimmed } : null;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Parses a model reply to the translations prompt.
+ *
+ * Keeps only votes for supported languages whose value is "ok" or "reject",
+ * and only generated translations that are non-empty strings for supported
+ * languages.
+ *
+ * @param raw           the model's reply, expected to be JSON
+ * @param translations  the translations that were sent for review
+ * @returns the parsed result, or `null` when the reply is not JSON, has the
+ *          wrong shape, or is missing a vote for any reviewed translation
+ */
+function validateTranslations(
+  raw: string,
+  translations: TranslationRow[],
+): TranslationResult | null {
+  try {
+    const obj = JSON.parse(raw) as Record<string, unknown>;
+    if (typeof obj["translations"] !== "object" || obj["translations"] === null)
+      return null;
+
+    const result: TranslationResult = { translations: {} };
+    const translationsObj = obj["translations"] as Record<string, unknown>;
+
+    // Validate each language's votes
+    for (const [lang, votes] of Object.entries(translationsObj)) {
+      if (!SUPPORTED_LANG_SET.has(lang)) continue;
+      if (typeof votes !== "object" || votes === null) continue;
+
+      // Votes are keyed by arbitrary words, so the map must have no prototype:
+      // on a plain {} a word such as "constructor" would resolve to an
+      // inherited member and look voted-on, and a "__proto__" vote would be
+      // silently dropped.
+      const langVotes = Object.create(null) as Record<string, "ok" | "reject">;
+      for (const [word, status] of Object.entries(
+        votes as Record<string, unknown>,
+      )) {
+        if (status === "ok" || status === "reject") {
+          langVotes[word] = status;
+        }
+      }
+      result.translations[lang as SupportedLanguageCode] = langVotes;
+    }
+
+    // Validate generated translations
+    if (obj["generated"] !== undefined && obj["generated"] !== null) {
+      if (typeof obj["generated"] !== "object") return null;
+      result.generated = {};
+      for (const [lang, word] of Object.entries(
+        obj["generated"] as Record<string, unknown>,
+      )) {
+        if (!SUPPORTED_LANG_SET.has(lang)) continue;
+        if (typeof word === "string" && word.trim()) {
+          result.generated[lang as SupportedLanguageCode] = word.trim();
+        }
+      }
+    }
+
+    // Check all translations got a vote
+    const byLang = new Map<string, Set<string>>();
+    for (const t of translations) {
+      if (!byLang.has(t.target_lang)) byLang.set(t.target_lang, new Set());
+      byLang.get(t.target_lang)!.add(t.word);
+    }
+
+    for (const [lang, words] of byLang.entries()) {
+      const votes = result.translations[lang as SupportedLanguageCode];
+      if (!votes) return null;
+      for (const word of words) {
+        if (votes[word] === undefined) return null;
+      }
+    }
+
+    return result;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Parses a model reply to the CEFR prompt.
+ *
+ * @param raw                    the model's reply, expected to be JSON
+ * @param validatedTranslations  words that must each receive a valid level
+ * @returns the headword level and the per-language level map exactly as the
+ *          model sent it, or `null` when the reply is not JSON, the headword
+ *          level is not a CEFR level, or any validated word lacks a valid level
+ */
+function validateCefr(
+  raw: string,
+  validatedTranslations: Map<SupportedLanguageCode, string[]>,
+): CefrResult | null {
+  try {
+    const parsed = JSON.parse(raw) as Record<string, unknown>;
+    const headwordLevel = parsed["headword_cefr"];
+    const perLanguage = parsed["translation_cefr"];
+
+    if (typeof headwordLevel !== "string" || !CEFR_SET.has(headwordLevel)) {
+      return null;
+    }
+    if (typeof perLanguage !== "object" || perLanguage === null) {
+      return null;
+    }
+
+    const levelsByLang = perLanguage as Record<string, unknown>;
+
+    // Every validated translation must come back with a recognised level.
+    for (const [lang, words] of validatedTranslations) {
+      const levels = levelsByLang[lang] as Record<string, string> | undefined;
+      if (!levels) return null;
+
+      for (const word of words) {
+        const level = levels[word];
+        if (!level || !CEFR_SET.has(level)) return null;
+      }
+    }
+
+    return {
+      headword_cefr: headwordLevel,
+      translation_cefr: levelsByLang as Partial<
+        Record<SupportedLanguageCode, Record<string, string>>
+      >,
+    };
+  } catch {
+    return null;
+  }
+}
+
+// ── LLM call ──────────────────────────────────────────────────────────────────
+
+/**
+ * Sends a single-message chat completion request to the provider and returns
+ * the reply text with markdown code fences stripped.
+ *
+ * The 120 s timeout and the shutdown abort handle cover the whole exchange,
+ * including reading the response body — fetch() resolves as soon as headers
+ * arrive, so releasing them earlier would leave a stalled body read
+ * unabortable.
+ *
+ * @param prompt    user message content
+ * @param provider  endpoint, credentials, model name and token limit
+ * @throws Error on a non-2xx status or an empty reply; rejects with the fetch
+ *         abort error on timeout or shutdown
+ */
+async function callLlm(
+  prompt: string,
+  provider: ProviderConfig,
+): Promise<string> {
+  const controller = new AbortController();
+  currentCallController = controller;
+  const timeout = setTimeout(() => controller.abort(), 120_000);
+
+  let data: { choices?: { message?: { content?: string } }[] };
+  try {
+    const response = await fetch(`${provider.baseURL}/chat/completions`, {
+      method: "POST",
+      signal: controller.signal,
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${provider.apiKey}`,
+      },
+      body: JSON.stringify({
+        model: provider.model,
+        max_tokens: provider.maxTokens,
+        messages: [{ role: "user", content: prompt }],
+        temperature: 0.1,
+      }),
+    });
+
+    if (!response.ok) {
+      throw new Error(
+        `LLM API error: ${response.status} ${response.statusText}`,
+      );
+    }
+
+    data = (await response.json()) as typeof data;
+  } finally {
+    clearTimeout(timeout);
+    // Release the handle only if it is still ours.
+    if (currentCallController === controller) currentCallController = null;
+  }
+
+  const content = data.choices?.[0]?.message?.content;
+  if (!content) throw new Error("LLM returned empty response");
+
+  // Models often wrap JSON in ```json fences; strip them before parsing.
+  return content
+    .replace(/```json\n?/g, "")
+    .replace(/```\n?/g, "")
+    .trim();
+}
+
+// ── Status helpers ────────────────────────────────────────────────────────────
+
+/**
+ * Looks up the recorded status of one sub-stage for an entry and model.
+ *
+ * @returns "complete" or "needs_review" when run_status holds exactly that
+ *          value; "pending" when there is no row or any other status.
+ *          The database handle is closed even if the query throws.
+ */
+function getSubStageStatus(
+  entryId: number,
+  modelName: string,
+  stage: SubStage,
+): "complete" | "needs_review" | "pending" {
+  const db = openDb();
+  try {
+    const row = db
+      .prepare(
+        `SELECT status FROM run_status
+         WHERE entry_id = ? AND model_name = ? AND stage = ?`,
+      )
+      .get(entryId, modelName, stage) as { status: string } | undefined;
+
+    if (row?.status === "complete") return "complete";
+    if (row?.status === "needs_review") return "needs_review";
+    return "pending";
+  } finally {
+    db.close();
+  }
+}
+
+/**
+ * Records the status of one sub-stage for an entry and model, inserting the
+ * run_status row or updating the existing one (and its updated_at).
+ * The database handle is closed even if the write throws.
+ */
+function markSubStage(
+  entryId: number,
+  modelName: string,
+  stage: SubStage,
+  status: "complete" | "needs_review",
+): void {
+  const db = openDb();
+  try {
+    // status is bound twice: once for the INSERT, once for the UPDATE branch.
+    db.prepare(
+      `INSERT INTO run_status (entry_id, model_name, stage, status)
+       VALUES (?, ?, ?, ?)
+       ON CONFLICT (entry_id, model_name, stage)
+       DO UPDATE SET status = ?, updated_at = datetime('now')`,
+    ).run(entryId, modelName, stage, status, status);
+  } finally {
+    db.close();
+  }
+}
+
+// ── Write helpers ─────────────────────────────────────────────────────────────
+
+/**
+ * Stores a model's improved gloss for an entry. An "ok" result writes nothing,
+ * and an existing row for the same entry and model is left untouched.
+ * The database handle is closed even if the write throws.
+ */
+function writeGloss(
+  entryId: number,
+  modelName: string,
+  result: GlossResult,
+): void {
+  // "ok" means the existing gloss was accepted — nothing to store.
+  if (result.status !== "improved") return;
+
+  const db = openDb();
+  try {
+    db.prepare(
+      `INSERT INTO generated_glosses (entry_id, model_name, text)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    ).run(entryId, modelName, result.gloss);
+  } finally {
+    db.close();
+  }
+}
+
+/**
+ * Stores a model's improved example for an entry. An "ok" result writes
+ * nothing, and an existing row for the same entry and model is left untouched.
+ * The database handle is closed even if the write throws.
+ */
+function writeExample(
+  entryId: number,
+  modelName: string,
+  result: ExampleResult,
+): void {
+  // "ok" means the existing example was accepted — nothing to store.
+  if (result.status !== "improved") return;
+
+  const db = openDb();
+  try {
+    db.prepare(
+      `INSERT INTO generated_examples (entry_id, model_name, text)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    ).run(entryId, modelName, result.example);
+  } finally {
+    db.close();
+  }
+}
+
+/**
+ * Persists a model's translation review in one transaction: a rejection row
+ * for every translation voted "reject", and a generated_translations row for
+ * every generated word. Existing rows are left untouched.
+ *
+ * Statements are prepared once rather than per row, and the database handle is
+ * closed even if a write throws.
+ *
+ * @param entryId       entry the generated translations belong to
+ * @param modelName     model that produced the votes
+ * @param result        validated model output
+ * @param translations  the translations that were reviewed
+ */
+function writeTranslations(
+  entryId: number,
+  modelName: string,
+  result: TranslationResult,
+  translations: TranslationRow[],
+): void {
+  const db = openDb();
+
+  try {
+    const insertRejection = db.prepare(
+      `INSERT INTO model_translation_rejections (translation_id, model_name)
+       VALUES (?, ?)
+       ON CONFLICT (translation_id, model_name) DO NOTHING`,
+    );
+    const insertGenerated = db.prepare(
+      `INSERT INTO generated_translations (entry_id, model_name, target_lang, word)
+       VALUES (?, ?, ?, ?)
+       ON CONFLICT (entry_id, model_name, target_lang) DO NOTHING`,
+    );
+
+    db.transaction(() => {
+      // Write rejections
+      for (const t of translations) {
+        if (result.translations[t.target_lang]?.[t.word] === "reject") {
+          insertRejection.run(t.id, modelName);
+        }
+      }
+
+      // Write generated translations
+      for (const [lang, word] of Object.entries(result.generated ?? {})) {
+        insertGenerated.run(entryId, modelName, lang, word);
+      }
+    })();
+  } finally {
+    db.close();
+  }
+}
+
+/**
+ * Persists a model's CEFR votes in one transaction: one vote for the headword
+ * and one per translation that received a valid level. Existing votes from the
+ * same model are left untouched.
+ *
+ * The per-translation statement is prepared once rather than per row, and the
+ * database handle is closed even if a write throws.
+ *
+ * @param entryId       entry the headword vote belongs to
+ * @param modelName     model that produced the votes
+ * @param result        validated model output
+ * @param translations  translations whose votes should be stored
+ */
+function writeCefr(
+  entryId: number,
+  modelName: string,
+  result: CefrResult,
+  translations: TranslationRow[],
+): void {
+  const db = openDb();
+
+  try {
+    const insertEntryVote = db.prepare(
+      `INSERT INTO model_entry_cefr_votes (entry_id, model_name, cefr_level)
+       VALUES (?, ?, ?)
+       ON CONFLICT (entry_id, model_name) DO NOTHING`,
+    );
+    const insertTranslationVote = db.prepare(
+      `INSERT INTO model_translation_cefr_votes (translation_id, model_name, cefr_level)
+       VALUES (?, ?, ?)
+       ON CONFLICT (translation_id, model_name) DO NOTHING`,
+    );
+
+    db.transaction(() => {
+      // Headword CEFR
+      insertEntryVote.run(entryId, modelName, result.headword_cefr);
+
+      // Translation CEFR votes — result.translation_cefr is the model's raw
+      // map, so each level is re-checked before it is stored.
+      for (const t of translations) {
+        const level = result.translation_cefr[t.target_lang]?.[t.word];
+        if (level && CEFR_SET.has(level)) {
+          insertTranslationVote.run(t.id, modelName, level);
+        }
+      }
+    })();
+  } finally {
+    db.close();
+  }
+}
+
+// ── Progress ──────────────────────────────────────────────────────────────────
+
+/**
+ * Rewrites the current terminal line with overall progress.
+ *
+ * @param processed    entries finished successfully
+ * @param needsReview  entries flagged for review
+ * @param total        entries in this run
+ * @param llmMs        duration of the most recent LLM call, in milliseconds
+ * @param startTime    run start as a Date.now() timestamp
+ */
+function updateProgress(
+  processed: number,
+  needsReview: number,
+  total: number,
+  llmMs: number,
+  startTime: number,
+): void {
+  const done = processed + needsReview;
+  const elapsedSec = (Date.now() - startTime) / 1000;
+
+  // Entries per second so far; stays 0 until some time has elapsed.
+  const perSecond = elapsedSec > 0 ? done / elapsedSec : 0;
+  const secondsLeft = perSecond > 0 ? (total - done) / perSecond : 0;
+
+  let eta: string;
+  if (secondsLeft === 0) {
+    eta = "calculating...";
+  } else if (secondsLeft < 60) {
+    eta = `${Math.round(secondsLeft)}s`;
+  } else {
+    eta = `${Math.round(secondsLeft / 60)}m`;
+  }
+
+  let elapsedLabel: string;
+  if (elapsedSec < 60) {
+    elapsedLabel = `${Math.round(elapsedSec)}s`;
+  } else {
+    elapsedLabel = `${Math.floor(elapsedSec / 60)}m ${Math.round(elapsedSec % 60)}s`;
+  }
+
+  const percent = ((done / total) * 100).toFixed(1);
+  const entrySeconds = (llmMs / 1000).toFixed(1);
+
+  // Leading \r returns to column 0 so each update overwrites the previous one.
+  process.stdout.write(
+    `\r    ${done}/${total} (${percent}%) — entry: ${entrySeconds}s — total: ${elapsedLabel} — ETA: ${eta}    `,
+  );
+}
+
+// ── Main enrich function ──────────────────────────────────────────────────────
+
+export async function enrich(
+  provider: ProviderConfig,
+): Promise<{ processed: number; skipped: number; needsReview: number }> {
+  registerEnrichShutdown();
+  const db = openDb();
+
+  const allEntries = db
+    .prepare(`SELECT * FROM entries WHERE language = 'en'`)
+    .all() as EntryRow[];
+
+  // An entry is fully complete when all 4 sub-stages are complete
+  const completeEntries = db
+    .prepare(
+      `SELECT entry_id FROM run_status
+       WHERE model_name = ? AND stage = 'round1_gloss'
+       AND status = 'complete'`,
+    )
+    .all(provider.name) as { entry_id: number }[];
+
+  const completeIds = new Set(completeEntries.map((r) => r.entry_id));
+  const pending = allEntries.filter((e) => !completeIds.has(e.id)).slice(0, 50);
+
+  db.close();
+
+  console.log(`\n  Model: ${provider.name}`);
+  console.log(`  Total entries: ${allEntries.length.toLocaleString()}`);
+  console.log(`  Already complete: ${completeIds.size.toLocaleString()}`);
+  console.log(`  Pending: ${pending.length.toLocaleString()}`);
+
+  if (pending.length === 0) {
+    console.log("  Nothing to process.");
+    return { processed: 0, skipped: completeIds.size, needsReview: 0 };
+  }
+
+  let processedCount = 0;
+  let needsReviewCount = 0;
+  let llmMs = 0;
+  const startTime = Date.now();
+
+  for (const entry of pending) {
+    if (shutdownRequested) break;
+
+    const db2 = openDb();
+    const translations = db2
+      .prepare(
+        `SELECT id, target_lang, word FROM translations WHERE entry_id = ? AND source = 'kaikki'`,
+      )
+      .all(entry.id) as TranslationRow[];
+    db2.close();
+
+    let entryFailed = false;
+
+    // ── Sub-stage 1: Gloss ────────────────────────────────────────────────────
+
+    let verifiedGloss = entry.gloss ?? "";
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_gloss") !== "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(buildGlossPrompt(entry), provider);
+        llmMs = Date.now() - llmStart;
+
+        const result = validateGloss(raw);
+        if (!result) {
+          markSubStage(entry.id, provider.name, "round1_gloss", "needs_review");
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_gloss — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeGloss(entry.id, provider.name, result);
+          if (result.status === "improved") verifiedGloss = result.gloss;
+          markSubStage(entry.id, provider.name, "round1_gloss", "complete");
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_gloss", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_gloss — ${message}`,
+        );
+        entryFailed = true;
+      }
+    }
+
+    if (entryFailed) {
+      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
+      continue;
+    }
+
+    /*
+    // ── Sub-stages 2, 3, 4 — not yet active ──────────────────────────────────
+    // ── Sub-stage 2: Example ──────────────────────────────────────────────────
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_example") !==
+      "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildExamplePrompt(entry, verifiedGloss),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateExample(raw);
+        if (!result) {
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_example",
+            "needs_review",
+          );
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_example — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeExample(entry.id, provider.name, result);
+          markSubStage(entry.id, provider.name, "round1_example", "complete");
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_example", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_example — ${message}`,
+        );
+        entryFailed = true;
+      }
+    }
+
+    if (entryFailed) {
+      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
+      continue;
+    }
+
+    // ── Sub-stage 3: Translations ─────────────────────────────────────────────
+
+    const validatedTranslations = new Map<SupportedLanguageCode, string[]>();
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_translations") !==
+      "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildTranslationsPrompt(entry, translations, verifiedGloss),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateTranslations(raw, translations);
+        if (!result) {
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_translations",
+            "needs_review",
+          );
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_translations — invalid response`,
+          );
+          entryFailed = true;
+        } else {
+          writeTranslations(entry.id, provider.name, result, translations);
+          markSubStage(
+            entry.id,
+            provider.name,
+            "round1_translations",
+            "complete",
+          );
+
+          // Build validated translations map for CEFR sub-stage
+          // Include kaikki translations that were ok'd + generated translations
+          for (const t of translations) {
+            const vote = result.translations[t.target_lang]?.[t.word];
+            if (vote === "ok") {
+              if (!validatedTranslations.has(t.target_lang)) {
+                validatedTranslations.set(t.target_lang, []);
+              }
+              validatedTranslations.get(t.target_lang)!.push(t.word);
+            }
+          }
+          if (result.generated) {
+            for (const [lang, word] of Object.entries(result.generated)) {
+              const l = lang as SupportedLanguageCode;
+              if (!validatedTranslations.has(l))
+                validatedTranslations.set(l, []);
+              validatedTranslations.get(l)!.push(word);
+            }
+          }
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(
+          entry.id,
+          provider.name,
+          "round1_translations",
+          "needs_review",
+        );
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_translations — ${message}`,
+        );
+        entryFailed = true;
+      }
+    } else {
+      // Already complete — rebuild validated translations from db
+      const db3 = openDb();
+      const rejections = new Set(
+        (
+          db3
+            .prepare(
+              `SELECT translation_id FROM model_translation_rejections WHERE model_name = ?`,
+            )
+            .all(provider.name) as { translation_id: number }[]
+        ).map((r) => r.translation_id),
+      );
+      for (const t of translations) {
+        if (!rejections.has(t.id)) {
+          if (!validatedTranslations.has(t.target_lang)) {
+            validatedTranslations.set(t.target_lang, []);
+          }
+          validatedTranslations.get(t.target_lang)!.push(t.word);
+        }
+      }
+      const generated = db3
+        .prepare(
+          `SELECT target_lang, word FROM generated_translations WHERE entry_id = ? AND model_name = ?`,
+        )
+        .all(entry.id, provider.name) as {
+        target_lang: SupportedLanguageCode;
+        word: string;
+      }[];
+      for (const g of generated) {
+        if (!validatedTranslations.has(g.target_lang))
+          validatedTranslations.set(g.target_lang, []);
+        validatedTranslations.get(g.target_lang)!.push(g.word);
+      }
+      db3.close();
+    }
+
+    if (entryFailed) {
+      needsReviewCount++;
+      updateProgress(
+        processedCount,
+        needsReviewCount,
+        pending.length,
+        llmMs,
+        startTime,
+      );
+      continue;
+    }
+
+    // ── Sub-stage 4: CEFR ─────────────────────────────────────────────────────
+
+    if (
+      getSubStageStatus(entry.id, provider.name, "round1_cefr") !== "complete"
+    ) {
+      try {
+        const llmStart = Date.now();
+        const raw = await callLlm(
+          buildCefrPrompt(entry, verifiedGloss, validatedTranslations),
+          provider,
+        );
+        llmMs = Date.now() - llmStart;
+
+        const result = validateCefr(raw, validatedTranslations);
+        if (!result) {
+          markSubStage(entry.id, provider.name, "round1_cefr", "needs_review");
+          console.warn(
+            `\n    needs_review: entry ${entry.id} round1_cefr — invalid response`,
+          );
+          needsReviewCount++;
+        } else {
+          // Get translation rows for validated words only
+          const validatedRows = translations.filter((t) => {
+            return validatedTranslations.get(t.target_lang)?.includes(t.word);
+          });
+          writeCefr(entry.id, provider.name, result, validatedRows);
+          markSubStage(entry.id, provider.name, "round1_cefr", "complete");
+          processedCount++;
+        }
+      } catch (err) {
+        llmMs = 0;
+        const message = err instanceof Error ? err.message : String(err);
+        markSubStage(entry.id, provider.name, "round1_cefr", "needs_review");
+        console.warn(
+          `\n    needs_review: entry ${entry.id} round1_cefr — ${message}`,
+        );
+        needsReviewCount++;
+      }
+    } else {
+      processedCount++;
+    }
+
+    */
+
+    processedCount++;
+    updateProgress(
+      processedCount,
+      needsReviewCount,
+      pending.length,
+      llmMs,
+      startTime,
+    );
+  }
+
+  process.stdout.write("\n");
+  const totalMs = Date.now() - startTime;
+  const totalMin = Math.floor(totalMs / 60_000);
+  const totalSec = Math.round((totalMs % 60_000) / 1000);
+  console.log(`  Total time: ${totalMin}m ${totalSec}s`);
+  console.log(
+    `  Avg per entry: ${(totalMs / Math.max(processedCount + needsReviewCount, 1) / 1000).toFixed(1)}s`,
+  );
+  console.log(`  Processed: ${processedCount.toLocaleString()}`);
+  console.log(`  Needs review: ${needsReviewCount.toLocaleString()}`);
+
+  return {
+    processed: processedCount,
+    skipped: completeIds.size,
+    needsReview: needsReviewCount,
+  };
+}
--- a/data-pipeline/tests/fixtures/annotated.fixture.json
+++ b/data-pipeline/tests/fixtures/annotated.fixture.json
@ -1,170 +0,0 @@
-[
-  {
-    "_fixture": "noun_with_cefr_vote",
-    "source_id": "ili:i100955",
-    "pos": "noun",
-    "translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
-    "glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
-    "examples": {
-      "en": [
-        { "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
-      ]
-    },
-    "votes": { "en": { "grain": { "cefr_source": "B1" } } }
-  },
-  {
-    "_fixture": "verb_no_votes_no_translations",
-    "source_id": "ili:i21779",
-    "pos": "verb",
-    "translations": { "en": ["respire"] },
-    "glosses": {
-      "en": [
-        "undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
-      ]
-    },
-    "examples": {},
-    "votes": {}
-  },
-  {
-    "_fixture": "verb_with_cefr_vote_all_languages",
-    "source_id": "ili:i21778",
-    "pos": "verb",
-    "translations": {
-      "en": ["breathe", "take a breath", "respire", "suspire"],
-      "it": ["respirare"],
-      "es": ["aspirar", "respirar"],
-      "de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
-      "fr": ["inspirer", "respirer"]
-    },
-    "glosses": {
-      "en": ["draw air into, and expel out of, the lungs"],
-      "de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
-    },
-    "examples": {
-      "en": [
-        {
-          "text": "I can breathe better when the air is clean",
-          "source": "omw"
-        },
-        { "text": "The patient is respiring", "source": "omw" }
-      ]
-    },
-    "votes": { "en": { "breathe": { "cefr_source": "A1" } } }
-  },
-  {
-    "_fixture": "adjective_all_languages_multiple_translations",
-    "source_id": "ili:i10007",
-    "pos": "adjective",
-    "translations": {
-      "en": ["possible"],
-      "it": [
-        "attuabile",
-        "effettuabile",
-        "eseguibile",
-        "fattibile",
-        "operabile",
-        "possibile",
-        "producibile",
-        "realizzabile"
-      ],
-      "es": ["posible"],
-      "de": [
-        "möglich",
-        "denkbar",
-        "eventuell",
-        "möglicherweise",
-        "allfällig",
-        "etwaig",
-        "gegebenenfalls",
-        "eventuell"
-      ],
-      "fr": ["possible", "éventuel"]
-    },
-    "glosses": {
-      "en": ["capable of happening or existing"],
-      "de": ["in der Lage, zu geschehen oder zu existieren"]
-    },
-    "examples": {
-      "en": [
-        { "text": "a breakthrough may be possible next year", "source": "omw" },
-        { "text": "anything is possible", "source": "omw" },
-        { "text": "warned of possible consequences", "source": "omw" }
-      ]
-    },
-    "votes": { "en": { "possible": { "cefr_source": "A2" } } }
-  },
-  {
-    "_fixture": "adjective_multiple_de_votes_cefr_examples",
-    "source_id": "ili:i10000",
-    "pos": "adjective",
-    "translations": {
-      "en": ["negative"],
-      "de": [
-        "dürftig",
-        "zu wünschen übrig lassen",
-        "schlecht",
-        "widrig",
-        "ungut",
-        "lausig",
-        "negativ",
-        "von Nachteil",
-        "schädlich",
-        "nachteilig",
-        "ungünstig"
-      ],
-      "fr": ["négatif", "strictement négatif"]
-    },
-    "glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
-    "examples": {
-      "en": [{ "text": "a negative number", "source": "omw" }],
-      "de": [
-        { "text": "Die Beweise waren dürftig.", "source": "cefr" },
-        { "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
-        {
-          "text": "Trotz widriger Umstände haben sie es geschafft.",
-          "source": "cefr"
-        },
-        {
-          "text": "Er hatte ein ungutes Gefühl bei der Sache.",
-          "source": "cefr"
-        },
-        { "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
-        {
-          "text": "Rauchen ist schädlich für die Gesundheit.",
-          "source": "cefr"
-        },
-        {
-          "text": "Diese Entscheidung könnte nachteilig sein.",
-          "source": "cefr"
-        },
-        {
-          "text": "Das Wetter ist heute ungünstig für einen Ausflug.",
-          "source": "cefr"
-        }
-      ]
-    },
-    "votes": {
-      "de": {
-        "dürftig": { "cefr_source": "C1" },
-        "schlecht": { "cefr_source": "A1" },
-        "widrig": { "cefr_source": "C1" },
-        "ungut": { "cefr_source": "B2" },
-        "negativ": { "cefr_source": "A2" },
-        "schädlich": { "cefr_source": "B1" },
-        "nachteilig": { "cefr_source": "B1" },
-        "ungünstig": { "cefr_source": "B2" }
-      }
-    }
-  },
-  {
-    "_fixture": "adverb_no_votes",
-    "source_id": "ili:i18157",
-    "pos": "adverb",
-    "translations": { "en": ["a cappella"], "es": ["a capella"] },
-    "glosses": { "en": ["without musical accompaniment"] },
-    "examples": {
-      "en": [{ "text": "they performed a cappella", "source": "omw" }]
-    },
-    "votes": {}
-  }
-]
--- a/data-pipeline/tests/fixtures/conflicts.fixture.json
+++ b/data-pipeline/tests/fixtures/conflicts.fixture.json
@ -1,4 +0,0 @@
-[
-  { "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
-  { "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
-]
--- a/data-pipeline/tests/validation/db-import.validation.test.ts
+++ b/data-pipeline/tests/validation/db-import.validation.test.ts
@ -6,24 +6,24 @@ import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";

 // ── Types ─────────────────────────────────────────────────────────────────────

-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
+type ExtractedSense = {
+  headword: string;
+  language: SupportedLanguageCode;
  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
+  sense_index: number;
+  gloss: string | null;
+  examples: string[];
+  translations: {
+    target_lang: SupportedLanguageCode;
+    word: string;
+    sense_hint: string | null;
+  }[];
 };

 // ── Paths ─────────────────────────────────────────────────────────────────────

 const DB_PATH = path.resolve("db/pipeline.db");
-const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
-const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");
+const OUTPUT_DIR = path.resolve("stage-1-extract/output");

 // ── Helpers ───────────────────────────────────────────────────────────────────

@ -40,8 +40,8 @@ async function dbExists(): Promise<boolean> {

 describe("pipeline.db — import validation", () => {
  let db: import("better-sqlite3").Database;
-  let expectedSynsetCount: number;
-  let expectedCefrVoteCount: number;
+  let expectedEntriesByLang: Map<SupportedLanguageCode, number>;
+  let expectedTotalTranslations: number;

  beforeAll(async () => {
    if (!(await dbExists())) return;
@ -50,51 +50,27 @@ describe("pipeline.db — import validation", () => {
    db = new Database(DB_PATH, { readonly: true });
    db.pragma("foreign_keys = ON");

-    // Count expected synsets from omw.json
-    const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
-    const omwRecords = JSON.parse(omwRaw) as unknown[];
-    expectedSynsetCount = omwRecords.length;
-
-    // Count expected CEFR votes from stage 2 annotated files.
-    // Merge all language files the same way the import script does —
-    // use en.json as base and merge votes from the other language files.
-    const byId = new Map<string, AnnotatedRecord>();
-
-    const baseRaw = await fs.readFile(
-      path.join(ANNOTATED_DIR, "en.json"),
-      "utf-8",
-    );
-    const base = JSON.parse(baseRaw) as AnnotatedRecord[];
-    for (const record of base) {
-      byId.set(record.source_id, record);
-    }
+    expectedEntriesByLang = new Map();
+    expectedTotalTranslations = 0;

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      if (lang === "en") continue;
-      const raw = await fs.readFile(
-        path.join(ANNOTATED_DIR, `${lang}.json`),
-        "utf-8",
-      );
-      const records = JSON.parse(raw) as AnnotatedRecord[];
-      for (const record of records) {
-        const base = byId.get(record.source_id);
-        if (!base) continue;
-        for (const [l, langVotes] of Object.entries(record.votes)) {
-          if (!base.votes[l as SupportedLanguageCode]) {
-            base.votes[l as SupportedLanguageCode] = {};
+      try {
+        const raw = await fs.readFile(
+          path.join(OUTPUT_DIR, `${lang}.json`),
+          "utf-8",
+        );
+        const senses = JSON.parse(raw) as ExtractedSense[];
+        expectedEntriesByLang.set(lang, senses.length);
+        if (lang === "en") {
+          for (const sense of senses) {
+            expectedTotalTranslations += sense.translations.length;
          }
-          Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
        }
+      } catch {
+        expectedEntriesByLang.set(lang, 0);
      }
    }
-
-    expectedCefrVoteCount = 0;
-    for (const record of byId.values()) {
-      for (const langVotes of Object.values(record.votes)) {
-        expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
-      }
-    }
-  }, 120_000);
+  }, 30_000);

  it("pipeline.db exists — skipping all tests if not", async () => {
    const exists = await dbExists();
@ -106,131 +82,148 @@ describe("pipeline.db — import validation", () => {
    expect(exists).toBe(true);
  });

-  it("synsets count matches omw.json", () => {
+  it("entry count per language matches source files", () => {
    if (!db) return;
-    const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
-      count: number;
-    };
-    expect(row.count).toBe(expectedSynsetCount);
-  });
+    const errors: string[] = [];

-  it("every synset has at least one translation", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT s.source_id
-        FROM synsets s
-        LEFT JOIN translations t ON t.source_id = s.source_id
-        WHERE t.id IS NULL
-      `,
-      )
-      .all() as { source_id: string }[];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const expected = expectedEntriesByLang.get(lang) ?? 0;
+      const row = db
+        .prepare("SELECT COUNT(*) as count FROM entries WHERE language = ?")
+        .get(lang) as { count: number };
+
+      if (row.count !== expected) {
+        errors.push(`${lang}: expected ${expected} entries, got ${row.count}`);
+      }
+    }

-    const errors = rows.map((r) => `${r.source_id}: no translations`);
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

-  it("every translation belongs to a valid synset", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT t.id, t.source_id
-        FROM translations t
-        LEFT JOIN synsets s ON s.source_id = t.source_id
-        WHERE s.source_id IS NULL
-      `,
-      )
-      .all() as { id: number; source_id: string }[];
-
-    const errors = rows.map(
-      (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every cefr_source_vote references a valid translation", () => {
-    if (!db) return;
-    const rows = db
-      .prepare(
-        `
-        SELECT v.id, v.translation_id
-        FROM cefr_source_votes v
-        LEFT JOIN translations t ON t.id = v.translation_id
-        WHERE t.id IS NULL
-      `,
-      )
-      .all() as { id: number; translation_id: number }[];
-
-    const errors = rows.map(
-      (r) =>
-        `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
-    );
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("cefr_source_votes count matches stage 2 annotated output", () => {
+  it("translation count matches source files plus reverse links", () => {
    if (!db) return;
    const row = db
-      .prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
+      .prepare("SELECT COUNT(*) as count FROM translations")
      .get() as { count: number };
-    expect(row.count).toBe(expectedCefrVoteCount);
+    const reverseLinks = db
+      .prepare(
+        "SELECT COUNT(*) as count FROM translations WHERE source = 'reverse_link'",
+      )
+      .get() as { count: number };
+    expect(row.count).toBe(expectedTotalTranslations + reverseLinks.count);
  });

-  it("every example has a valid source", () => {
+  it("every translation references a valid entry", () => {
    if (!db) return;
    const rows = db
      .prepare(
-        `
-        SELECT source_id, language, source
-        FROM examples
-        WHERE source NOT IN ('omw', 'cefr')
-      `,
+        `SELECT t.id, t.entry_id
+         FROM translations t
+         LEFT JOIN entries e ON e.id = t.entry_id
+         WHERE e.id IS NULL`,
      )
-      .all() as { source_id: string; language: string; source: string }[];
+      .all() as { id: number; entry_id: number }[];
+
+    const errors = rows.map(
+      (r) => `translation ${r.id}: references missing entry ${r.entry_id}`,
+    );
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  // Guards the `language` column: any entry whose code is not listed in
+  // SUPPORTED_LANGUAGE_CODES is reported with its id and headword.
+  it("every entry has a valid language code", () => {
+    // `db` is only assigned in beforeAll when dbExists() succeeds.
+    if (!db) return;
+
+    type BadLanguageRow = { id: number; headword: string; language: string };
+
+    // Render the supported codes as a quoted, comma-separated SQL list.
+    const quotedCodes = SUPPORTED_LANGUAGE_CODES.map((code) => `'${code}'`);
+    const inList = quotedCodes.join(", ");
+    const statement = db.prepare(
+      `SELECT id, headword, language FROM entries
+         WHERE language NOT IN (${inList})`,
+    );
+    const offenders = statement.all() as BadLanguageRow[];
+
+    const errors: string[] = [];
+    for (const row of offenders) {
+      errors.push(
+        `entry ${row.id} "${row.headword}": invalid language "${row.language}"`,
+      );
+    }
+    // The joined list doubles as the assertion message so a failure names each row.
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  // Guards the `pos` column: rows whose value is not one of the four listed
+  // parts of speech are reported with their id and headword.
+  it("every entry has a valid pos", () => {
+    // `db` is only assigned in beforeAll when dbExists() succeeds.
+    if (!db) return;
+
+    type BadPosRow = { id: number; headword: string; pos: string };
+
+    // NOTE(review): the allowed values are spelled out in the SQL text —
+    // presumably they should stay in sync with SupportedPos; confirm.
+    const statement = db.prepare(
+      `SELECT id, headword, pos FROM entries
+         WHERE pos NOT IN ('noun', 'verb', 'adjective', 'adverb')`,
+    );
+    const offenders = statement.all() as BadPosRow[];
+
+    const errors: string[] = [];
+    for (const row of offenders) {
+      errors.push(`entry ${row.id} "${row.headword}": invalid pos "${row.pos}"`);
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("sense_index is unique per headword, language, pos", () => {
+    if (!db) return;
+    const rows = db
+      .prepare(
+        `SELECT headword, language, pos, sense_index, COUNT(*) as c
+         FROM entries
+         GROUP BY headword, language, pos, sense_index
+         HAVING c > 1`,
+      )
+      .all() as {
+      headword: string;
+      language: string;
+      pos: string;
+      sense_index: number;
+      c: number;
+    }[];

    const errors = rows.map(
      (r) =>
-        `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
+        `"${r.headword}" (${r.language} ${r.pos}): duplicate sense_index ${r.sense_index} (${r.c} rows)`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

-  it("every example belongs to a valid synset", () => {
+  it("non-English entries have no Kaikki translations", () => {
    if (!db) return;
+    const nonEnLangs = SUPPORTED_LANGUAGE_CODES.filter((l) => l !== "en")
+      .map((l) => `'${l}'`)
+      .join(", ");
+
    const rows = db
      .prepare(
-        `
-        SELECT e.id, e.source_id
-        FROM examples e
-        LEFT JOIN synsets s ON s.source_id = e.source_id
-        WHERE s.source_id IS NULL
-      `,
+        `SELECT e.headword, e.language, COUNT(t.id) as c
+         FROM entries e
+         JOIN translations t ON t.entry_id = e.id
+         WHERE e.language IN (${nonEnLangs})
+         AND t.source = 'kaikki'
+         GROUP BY e.id`,
      )
-      .all() as { id: number; source_id: string }[];
+      .all() as { headword: string; language: string; c: number }[];

    const errors = rows.map(
-      (r) => `example ${r.id}: references missing synset ${r.source_id}`,
+      (r) =>
+        `"${r.headword}" (${r.language}): unexpected ${r.c} Kaikki translations`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

-  it("every gloss belongs to a valid synset", () => {
+  it("all Kaikki translation target languages are supported and not English", () => {
    if (!db) return;
+    const validLangs = SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", ");
+
    const rows = db
      .prepare(
-        `
-        SELECT g.id, g.source_id
-        FROM glosses g
-        LEFT JOIN synsets s ON s.source_id = g.source_id
-        WHERE s.source_id IS NULL
-      `,
+        `SELECT t.id, t.target_lang
+         FROM translations t
+         WHERE t.source = 'kaikki'
+         AND (t.target_lang NOT IN (${validLangs}) OR t.target_lang = 'en')`,
      )
-      .all() as { id: number; source_id: string }[];
+      .all() as { id: number; target_lang: string }[];

    const errors = rows.map(
-      (r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
+      (r) => `translation ${r.id}: invalid target_lang "${r.target_lang}"`,
    );
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
--- a/data-pipeline/tests/validation/stage-1.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-1.validation.test.ts
@ -1,166 +1,192 @@
 import fs from "node:fs/promises";
 import path from "node:path";
-import { describe, it, expect } from "vitest";
-import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
-import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
+import { describe, it, expect, beforeAll } from "vitest";
+import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";

 // ── Types ─────────────────────────────────────────────────────────────────────

-type OmwRecord = {
-  source_id: string;
+type ExtractedSense = {
+  headword: string;
+  language: SupportedLanguageCode;
  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, string[]>>;
+  sense_index: number;
+  gloss: string | null;
+  examples: string[];
+  translations: {
+    target_lang: SupportedLanguageCode;
+    word: string;
+    sense_hint: string | null;
+  }[];
 };

 // ── Paths ─────────────────────────────────────────────────────────────────────

-const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
-
-// ── Helpers ───────────────────────────────────────────────────────────────────
-
-function isValidSourceId(id: string): boolean {
-  return /^ili:i\d+$/.test(id);
-}
+const OUTPUT_DIR = path.resolve("stage-1-extract/output");

 // ── Tests ─────────────────────────────────────────────────────────────────────

-describe("stage 1 — omw.json validation", () => {
-  let records: OmwRecord[];
+describe("stage 1 — Kaikki extraction output validation", () => {
+  const sensesByLang = new Map<SupportedLanguageCode, ExtractedSense[]>();

-  it("file exists and is valid JSON", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-    expect(records).toBeDefined();
-  });
-
-  it("is a non-empty array", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-    expect(Array.isArray(records)).toBe(true);
-    expect(records.length).toBeGreaterThan(0);
-  });
-
-  it("every record has required fields", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const errors: string[] = [];
-
-    for (const record of records) {
-      if (!record.source_id) {
-        errors.push(`missing source_id`);
-        continue;
-      }
-      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
-      if (!record.translations)
-        errors.push(`${record.source_id}: missing translations`);
-      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
-      if (!record.examples)
-        errors.push(`${record.source_id}: missing examples`);
+  beforeAll(async () => {
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
+      const raw = await fs.readFile(filePath, "utf-8");
+      sensesByLang.set(lang, JSON.parse(raw) as ExtractedSense[]);
    }
+  }, 30_000);

+  it("all five language output files exist", async () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      try {
+        await fs.access(path.join(OUTPUT_DIR, `${lang}.json`));
+      } catch {
+        errors.push(`missing: ${lang}.json`);
+      }
+    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

-  it("every source_id matches ili:i{number} pattern", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
+  it("every language file is a non-empty array", () => {
    const errors: string[] = [];
-
-    for (const record of records) {
-      if (!isValidSourceId(record.source_id)) {
-        errors.push(`invalid source_id: ${record.source_id}`);
-      }
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const senses = sensesByLang.get(lang)!;
+      if (!Array.isArray(senses)) errors.push(`${lang}: not an array`);
+      else if (senses.length === 0) errors.push(`${lang}: empty array`);
    }
-
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

-  it("every source_id is unique", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
-    const seen = new Set<string>();
+  it("every sense has required fields", () => {
    const errors: string[] = [];
-
-    for (const record of records) {
-      if (seen.has(record.source_id)) {
-        errors.push(`duplicate source_id: ${record.source_id}`);
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesByLang.get(lang)!) {
+        if (!sense.headword) errors.push(`${lang}: sense missing headword`);
+        if (!sense.language)
+          errors.push(`${lang} ${sense.headword}: missing language`);
+        if (!sense.pos) errors.push(`${lang} ${sense.headword}: missing pos`);
+        if (sense.sense_index === undefined)
+          errors.push(`${lang} ${sense.headword}: missing sense_index`);
+        if (!Array.isArray(sense.examples))
+          errors.push(`${lang} ${sense.headword}: examples not an array`);
+        if (!Array.isArray(sense.translations))
+          errors.push(`${lang} ${sense.headword}: translations not an array`);
      }
-      seen.add(record.source_id);
    }
-
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

-  it("every pos is a valid supported value", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
-
+  it("every sense has a valid pos", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
-
-    for (const record of records) {
-      if (!validPos.has(record.pos)) {
-        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesByLang.get(lang)!) {
+        if (!validPos.has(sense.pos)) {
+          errors.push(`${lang} ${sense.headword}: invalid pos "${sense.pos}"`);
+        }
      }
    }
-
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

-  it("every record has at least one translation in at least one language", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    records = JSON.parse(raw) as OmwRecord[];
+  it("every sense language code matches its file", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesByLang.get(lang)!) {
+        if (sense.language !== lang) {
+          errors.push(
+            `${lang} ${sense.headword}: language field "${sense.language}" does not match file`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });

+  it("no abbreviation senses in output", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      for (const sense of sensesByLang.get(lang)!) {
+        if (sense.gloss?.toLowerCase().startsWith("abbreviation of")) {
+          errors.push(
+            `${lang} ${sense.headword}: abbreviation sense not filtered`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("English senses all have at least one translation", () => {
+    const errors: string[] = [];
+    for (const sense of sensesByLang.get("en")!) {
+      if (sense.translations.length === 0) {
+        errors.push(
+          `en ${sense.headword} (sense ${sense.sense_index}): no translations`,
+        );
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("non-English senses have no translations", () => {
+    const errors: string[] = [];
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      if (lang === "en") continue;
+      for (const sense of sensesByLang.get(lang)!) {
+        if (sense.translations.length > 0) {
+          errors.push(
+            `${lang} ${sense.headword}: unexpected translations in non-English file`,
+          );
+        }
+      }
+    }
+    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
+  });
+
+  it("all translation target languages are supported and not English", () => {
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
-
-    for (const record of records) {
-      const langs = Object.keys(record.translations) as SupportedLanguageCode[];
-
-      if (langs.length === 0) {
-        errors.push(`${record.source_id}: no translations`);
-        continue;
-      }
-
-      for (const lang of langs) {
-        if (!validLangs.has(lang)) {
-          errors.push(`${record.source_id}: unsupported language "${lang}"`);
+    for (const sense of sensesByLang.get("en")!) {
+      for (const t of sense.translations) {
+        if (!validLangs.has(t.target_lang)) {
+          errors.push(
+            `en ${sense.headword}: unsupported translation language "${t.target_lang}"`,
+          );
        }
-        const words = record.translations[lang] ?? [];
-        if (words.length === 0) {
-          errors.push(`${record.source_id}: empty translations for "${lang}"`);
+        if (t.target_lang === "en") {
+          errors.push(
+            `en ${sense.headword}: translation to same language "en"`,
+          );
+        }
+        if (!t.word?.trim()) {
+          errors.push(
+            `en ${sense.headword}: empty translation word for ${t.target_lang}`,
+          );
        }
      }
    }
-
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

-  it("no duplicate translations within a single synset and language", async () => {
-    const raw = await fs.readFile(OMW_PATH, "utf-8");
-    const records = JSON.parse(raw) as OmwRecord[];
-
+  it("sense_index is unique per headword and pos within each language", () => {
    const errors: string[] = [];
-
-    for (const record of records) {
-      for (const [lang, words] of Object.entries(record.translations)) {
-        const seen = new Set<string>();
-        for (const word of words) {
-          if (seen.has(word)) {
-            errors.push(
-              `${record.source_id} (${lang}): duplicate translation "${word}"`,
-            );
-          }
-          seen.add(word);
+    for (const lang of SUPPORTED_LANGUAGE_CODES) {
+      const seen = new Map<string, Set<number>>();
+      for (const sense of sensesByLang.get(lang)!) {
+        const key = `${sense.headword}|${sense.pos}`;
+        if (!seen.has(key)) seen.set(key, new Set());
+        const indexes = seen.get(key)!;
+        if (indexes.has(sense.sense_index)) {
+          errors.push(
+            `${lang} ${sense.headword} (${sense.pos}): duplicate sense_index ${sense.sense_index}`,
+          );
        }
+        indexes.add(sense.sense_index);
      }
    }
-
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
 });
--- a/data-pipeline/tests/validation/stage-2.validation.test.ts
+++ b/data-pipeline/tests/validation/stage-2.validation.test.ts
@ -1,218 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, it, expect, beforeAll } from "vitest";
-import {
-  SUPPORTED_POS,
-  SUPPORTED_LANGUAGE_CODES,
-  CEFR_LEVELS,
-} from "@lila/shared";
-import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
-
-// ── Types ─────────────────────────────────────────────────────────────────────
-
-type Example = { text: string; source: "omw" | "cefr" };
-
-type AnnotatedRecord = {
-  source_id: string;
-  pos: SupportedPos;
-  translations: Partial<Record<SupportedLanguageCode, string[]>>;
-  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
-  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
-  votes: Partial<
-    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
-  >;
-};
-
-type ConflictEntry = {
-  word: string;
-  pos: string;
-  language: SupportedLanguageCode;
-  levels: string[];
-};
-
-// ── Paths ─────────────────────────────────────────────────────────────────────
-
-const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
-
-// ── Tests ─────────────────────────────────────────────────────────────────────
-
-describe("stage 2 — annotated output validation", () => {
-  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
-  let conflicts: ConflictEntry[] = [];
-
-  beforeAll(async () => {
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const raw = await fs.readFile(
-        path.join(OUTPUT_DIR, `${lang}.json`),
-        "utf-8",
-      );
-      recordsByLang.set(lang, JSON.parse(raw) as AnnotatedRecord[]);
-    }
-    const raw = await fs.readFile(
-      path.join(OUTPUT_DIR, "conflicts.json"),
-      "utf-8",
-    );
-    conflicts = JSON.parse(raw) as ConflictEntry[];
-  }, 60_000);
-
-  it("all five language files exist", async () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const filePath = path.join(OUTPUT_DIR, `${lang}.json`);
-      try {
-        await fs.access(filePath);
-      } catch {
-        errors.push(`missing file: ${lang}.json`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("conflicts.json exists", async () => {
-    const filePath = path.join(OUTPUT_DIR, "conflicts.json");
-    await expect(fs.access(filePath)).resolves.toBeUndefined();
-  });
-
-  it("every language file is a non-empty array", () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-      if (!Array.isArray(records)) {
-        errors.push(`${lang}.json: not an array`);
-      } else if (records.length === 0) {
-        errors.push(`${lang}.json: empty array`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every record has required fields", () => {
-    const errors: string[] = [];
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        if (!record.source_id) {
-          errors.push(`${lang}: record missing source_id`);
-          continue;
-        }
-        if (!record.pos)
-          errors.push(`${lang} ${record.source_id}: missing pos`);
-        if (!record.translations)
-          errors.push(`${lang} ${record.source_id}: missing translations`);
-        if (!record.glosses)
-          errors.push(`${lang} ${record.source_id}: missing glosses`);
-        if (record.examples === undefined)
-          errors.push(`${lang} ${record.source_id}: missing examples`);
-        if (record.votes === undefined)
-          errors.push(`${lang} ${record.source_id}: missing votes`);
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every pos is a valid supported value", () => {
-    const errors: string[] = [];
-    const validPos = new Set(SUPPORTED_POS);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        if (!validPos.has(record.pos)) {
-          errors.push(
-            `${lang} ${record.source_id}: invalid pos "${record.pos}"`,
-          );
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every example has text and a valid source", () => {
-    const errors: string[] = [];
-    const validSources = new Set(["omw", "cefr"]);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        for (const [l, examples] of Object.entries(record.examples)) {
-          for (const example of examples) {
-            if (!example.text) {
-              errors.push(
-                `${lang} ${record.source_id} (${l}): example missing text`,
-              );
-            }
-            if (!validSources.has(example.source)) {
-              errors.push(
-                `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
-              );
-            }
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("every cefr_source vote is a valid CEFR level", () => {
-    const errors: string[] = [];
-    const validLevels = new Set(CEFR_LEVELS);
-
-    for (const lang of SUPPORTED_LANGUAGE_CODES) {
-      const records = recordsByLang.get(lang)!;
-
-      for (const record of records) {
-        for (const [l, langVotes] of Object.entries(record.votes)) {
-          for (const [word, vote] of Object.entries(langVotes ?? {})) {
-            if (
-              !validLevels.has(vote.cefr_source as (typeof CEFR_LEVELS)[number])
-            ) {
-              errors.push(
-                `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
-              );
-            }
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-
-  it("conflicts.json entries have required fields and valid CEFR levels", () => {
-    const errors: string[] = [];
-    const validLevels = new Set(CEFR_LEVELS);
-    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
-
-    for (const entry of conflicts) {
-      if (!entry.word) errors.push(`conflict missing word`);
-      if (!entry.pos) errors.push(`conflict missing pos`);
-      if (!entry.language) {
-        errors.push(`conflict missing language`);
-      } else if (!validLangs.has(entry.language)) {
-        errors.push(`conflict invalid language "${entry.language}"`);
-      }
-      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
-        errors.push(`${entry.word}: levels must have at least 2 entries`);
-      } else {
-        for (const level of entry.levels) {
-          if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
-            errors.push(`${entry.word}: invalid level "${level}"`);
-          }
-        }
-      }
-    }
-
-    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
-  });
-});
--- a/documentation/audits/generating-decks.md
+++ b/documentation/audits/generating-decks.md
@ -1,371 +0,0 @@
-# Code Review: `build-top-english-nouns-deck` seed script
-
-Hey, good work getting this to a finished, working state — that's genuinely the hardest part. Below is feedback structured the way a mentor would give it: what the problem is, why it matters in a real codebase, and how to fix it. Work through these one by one when you refactor.
-
---
-
-## 1. Function names should be imperative, not gerunds
-
-### What you wrote
-
-```ts
-const readingFromWordlist = async () => { ... }
-const checkingSourceWordsAgainstDB = async () => { ... }
-```
-
-### Why it's a problem
-
-Functions represent _actions_. In English, imperative verbs describe actions: `read`, `fetch`, `build`. Gerunds (`reading`, `checking`) describe ongoing processes — they read like you're narrating what's happening rather than declaring what a function does. This isn't just style preference: when you're scanning a call stack or reading `main()`, imperative names parse faster because they match the mental model of "I am calling this to do a thing."
-
-### How to fix it
-
-```ts
-const readWordlist = async () => { ... }
-const resolveSourceTerms = async () => { ... }  // "checking" undersells what it returns
-const writeMissingWords = async () => { ... }
-```
-
-Note the rename of `checkingSourceWordsAgainstDB` → `resolveSourceTerms`. The original name describes the _mechanism_ (checking against DB). A better name describes the _result_ (resolving words into term IDs). Callers don't need to know it hits the DB.
-
-### Further reading
-
- [Clean Code, Chapter 2 – Meaningful Names](https://www.oreilly.com/library/view/clean-code-a/9780136083238/) — specifically the section on "Use Intention-Revealing Names"
- [Google TypeScript Style Guide – Naming](https://google.github.io/styleguide/tsguide.html#naming-style)
-
---
-
-## 2. N+1 query pattern in `validateLanguages` and `logLanguageCoverage`
-
-### What you wrote
-
-```ts
-for (const language of languages) {
-  const rows = await db
-    .selectDistinct({ termId: translations.term_id })
-    .from(translations)
-    .where(
-      and(
-        inArray(translations.term_id, termIds),
-        eq(translations.language_code, language),
-      ),
-    );
-}
-```
-
-### Why it's a problem
-
-This fires one database query _per language_. If you have 15 supported languages, that's 15 round trips. Each round trip has network latency, connection overhead, and query planning cost. The database already knows how to aggregate across all languages in a single pass — you're just not asking it to.
-
-This pattern is called **N+1** (one query to get the list, then N queries for each item in the list) and it's one of the most common performance mistakes in applications that use databases. At 15 languages it's fine. At 50 languages with 100k terms, your script will be the reason someone gets paged at 2am.
-
-### How to fix it
-
-Ask the database to do the grouping for you in a single query:
-
-```ts
-import { count, ne } from "drizzle-orm";
-
-const coverage = await db
-  .select({
-    language: translations.language_code,
-    coveredCount: count(translations.term_id),
-  })
-  .from(translations)
-  .where(
-    and(
-      inArray(translations.term_id, termIds),
-      ne(translations.language_code, sourceLanguage),
-    ),
-  )
-  .groupBy(translations.language_code);
-
-const validatedLanguages = coverage
-  .filter((row) => row.coveredCount === termIds.length)
-  .map((row) => row.language);
-```
-
-One query. The database returns a row per language with the count of covered terms. You filter in JS. Done.
-
-### Further reading
-
- [Drizzle ORM – `groupBy` and aggregations](https://orm.drizzle.team/docs/select#aggregations)
- ["What is the N+1 query problem" — StackOverflow](https://stackoverflow.com/questions/97197/what-is-the-n1-select-query-problem-and-how-can-it-be-avoided)
-
---
-
-## 3. Two functions doing the same database work
-
-### What you wrote
-
-`validateLanguages` and `logLanguageCoverage` both loop over languages and fire the same query per language. You wrote the same logic twice.
-
-### Why it's a problem
-
-This is a violation of **DRY** (Don't Repeat Yourself). The immediate cost is that any bug in the query exists in two places — fixing one doesn't fix the other. The deeper cost is that it doubles your database load for no reason: you fetch the coverage data, use it to compute `validatedLanguages`, throw it away, then fetch it again just to log it.
-
-### How to fix it
-
-Once you apply the fix from point 2, you have a single `coverage` array. Use it for both purposes:
-
-```ts
-const coverage = await db...  // single query from point 2
-
-// Use for validation
-const validatedLanguages = coverage
-  .filter((row) => row.coveredCount === termIds.length)
-  .map((row) => row.language);
-
-// Use for logging
-for (const row of coverage) {
-  console.log(`  ${row.language}: ${row.coveredCount} / ${termIds.length} terms covered`);
-}
-```
-
-No second trip to the database.
-
-### Further reading
-
- [The DRY Principle](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself)
-
---
-
-## 4. Unnecessary array copying inside a loop
-
-### What you wrote
-
-```ts
-const wordToTermIds = new Map<string, string[]>();
-for (const row of rows) {
-  const existing = wordToTermIds.get(word) ?? [];
-  wordToTermIds.set(word, [...existing, row.termId]); // spreads the whole array every iteration
-}
-```
-
-### Why it's a problem
-
-`[...existing, row.termId]` creates a _brand new array_ every time and copies all the previous elements into it. If "bank" has 3 homonyms, you allocate arrays of size 0, 1, 2, and 3 — throwing the first three away. This is an `O(n²)` memory allocation pattern. For 1000 words it's invisible. In a tighter loop or with more data, it adds up.
-
-This pattern comes from functional programming habits (immutability is good there). But in a one-off script building a local data structure, there's no reason to avoid mutation.
-
-### How to fix it
-
-```ts
-const wordToTermIds = new Map<string, string[]>();
-for (const row of rows) {
-  const word = row.text.toLowerCase();
-  if (!wordToTermIds.has(word)) {
-    wordToTermIds.set(word, []);
-  }
-  wordToTermIds.get(word)!.push(row.termId);
-}
-```
-
-Get the array once, push into it. No copies.
-
-### Further reading
-
- [MDN – Array.prototype.push()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/push)
- [Big O Notation primer](https://www.freecodecamp.org/news/big-o-notation-why-it-matters-and-why-it-doesnt-1674cfa8a23c/) — worth understanding O(n²) vs O(n)
-
---
-
-## 5. No database transaction — your "idempotent" script can corrupt state
-
-### What you wrote
-
-```ts
-deckId = await createDeck(validatedLanguages); // step 1
-const addedCount = await addTermsToDeck(deckId, termIds); // step 2
-await updateValidatedLanguages(deckId, validatedLanguages); // step 3
-```
-
-### Why it's a problem
-
-These three operations are separate database round trips with nothing tying them together. If step 2 throws (network blip, constraint violation, anything), you end up with a deck row that has no terms. Run the script again and it finds the existing deck, skips creation, then tries to add terms — but now your `validated_languages` from the previous partial run might be stale. The script _appears_ to recover, but you can't be sure of what state you're in.
-
-A **transaction** is a guarantee: either all steps succeed together, or none of them do. If anything fails mid-way, the database rolls back to the state before the transaction started. This is fundamental to writing scripts that touch multiple tables.
-
-### How to fix it
-
-```ts
-await db.transaction(async (tx) => {
-  const existingDeck = await findExistingDeck(tx);
-
-  let deckId: string;
-  if (!existingDeck) {
-    deckId = await createDeck(tx, validatedLanguages);
-  } else {
-    deckId = existingDeck.id;
-  }
-
-  await addTermsToDeck(tx, deckId, termIds);
-  await updateValidatedLanguages(tx, deckId, validatedLanguages);
-});
-```
-
-You'll need to thread the `tx` (transaction context) through your functions instead of using the global `db` — that's the key change.
-
-### Further reading
-
- [Drizzle ORM – Transactions](https://orm.drizzle.team/docs/transactions)
- [PostgreSQL – What is a Transaction?](https://www.postgresql.org/docs/current/tutorial-transactions.html)
- [ACID properties explained](https://www.databricks.com/glossary/acid-transactions) — Atomicity is what protects you here
-
---
-
-## 6. The `isNewDeck` flag is unnecessary
-
-### What you wrote
-
-```ts
-let isNewDeck: boolean;
-
-if (!existingDeck) {
-  deckId = await createDeck(validatedLanguages);
-  isNewDeck = true;
-} else {
-  deckId = existingDeck.id;
-  isNewDeck = false;
-}
-
-// ...later...
-if (!isNewDeck) {
-  await updateValidatedLanguages(deckId, validatedLanguages);
-}
-```
-
-### Why it's a problem
-
-You introduced `isNewDeck` to avoid calling `updateValidatedLanguages` when the deck was just created — reasoning that you already passed `validatedLanguages` to `createDeck`. But that means you're calling `updateValidatedLanguages` in _one path_ and `createDeck(..., validatedLanguages)` in the _other_ path. The intent (always keep validated languages current) is the same in both cases, but the code splits it into two branches you have to mentally reconcile.
-
-The cleaner model: always call `updateValidatedLanguages` after finding or creating the deck. Then `createDeck` doesn't need `validatedLanguages` at all, and `isNewDeck` disappears.
-
-### How to fix it
-
-```ts
-const deckId = existingDeck ? existingDeck.id : await createDeck(); // no validatedLanguages needed here
-
-await addTermsToDeck(deckId, termIds);
-await updateValidatedLanguages(deckId, validatedLanguages); // always runs
-```
-
-Fewer variables, one clear flow.
-
---
-
-## 7. Comments explain _what_, not _why_
-
-### What you wrote
-
-```ts
-// new Set() automatically discards duplicate values,
-// and spreading it back with ... converts it to a plain array again.
-// So if "bank" appears twice in the file,
-// the resulting array will only contain it once.
-const words = [
-  ...new Set(
-    raw
-      .split("\n")
-      .map((w) => w.trim().toLowerCase())
-      .filter(Boolean),
-  ),
-];
-```
-
-### Why it's a problem
-
-Comments that re-explain what the code literally does are called **noise comments**. They add length without adding understanding — any developer who can read this script already knows what `Set` does. Worse, they can get out of date if the code changes but the comment doesn't.
-
-Good comments explain _why_ a decision was made, not _what_ the code does. The code already says what it does.
-
-Meanwhile, your most complex line — `const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())]` — has no comment at all. That's the one that earns a note.
-
-### How to fix it
-
-```ts
-// Deduplicate: multiple words can map to the same term ID (e.g. via synonyms)
-const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
-```
-
-And remove the Set explanation from `readWordlist`. The code is clear.
-
-### Further reading
-
- [Clean Code, Chapter 4 – Comments](https://www.oreilly.com/library/view/clean-code-a/9780136083238/) — specifically "Explain Yourself in Code" and "Noise Comments"
-
---
-
-## 8. The finished roadmap comment should be deleted
-
-### What you wrote
-
-```ts
-/*
- * roadmap
- * [x] Setup
- * [x] Read wordlist
- * ...all checked off
- */
-```
-
-### Why it's a problem
-
-This was useful _while you were planning_. Now that every item is checked, it communicates nothing except "this is done" — which the existence of a working script already communicates. Leaving it in adds noise to the file header and signals that you're not sure what belongs in source control vs. a task tracker.
-
-### How to fix it
-
-Delete it. Use GitHub Issues, a Notion doc, or even a scratchpad file for planning notes. Source code is the output of planning, not the place to store it.
-
---
-
-## 9. No log levels — everything goes to `console.log`
-
-### What you wrote
-
-```ts
-console.log("📖 Reading word list...");
-console.log(`   ${sourceWords.length} words loaded\n`);
-// ...and so on for every step
-```
-
-### Why it's a problem
-
-In a real environment — CI/CD pipelines, server logs, anything beyond your local terminal — all of this output lands in the same stream at the same priority. Actual errors (`console.error`) get buried in progress logs. There's no way to run the script quietly when you just need the summary, or verbosely when you're debugging.
-
-For a one-off seed script this is low priority, but it's a habit worth building early.
-
-### How to fix it
-
-At minimum, use `console.error` for actual errors (not just in the catch block — also for things like "deck creation returned no ID"). For the detailed per-language breakdown, consider putting it behind a `--verbose` CLI flag so you can run the script cleanly in CI without dumping hundreds of lines of coverage data.
-
-```ts
-// Basic approach
-if (process.argv.includes("--verbose")) {
-  await logLanguageCoverage(termIds);
-}
-```
-
-### Further reading
-
- [Node.js `process.argv`](https://nodejs.org/en/learn/command-line/nodejs-accept-arguments-from-the-command-line)
- For a proper solution later: [pino](https://github.com/pinojs/pino) — a lightweight structured logger widely used in Node.js
-
---
-
-## Summary
-
-| #   | Issue                          | Priority                                |
-| --- | ------------------------------ | --------------------------------------- |
-| 1   | Gerund function names          | Low — style, but builds good habits     |
-| 2   | N+1 queries                    | High — real performance impact          |
-| 3   | Duplicate query logic          | High — bugs in two places               |
-| 4   | Array spread in loop           | Medium — inefficient pattern to unlearn |
-| 5   | No transaction                 | High — can corrupt database state       |
-| 6   | `isNewDeck` flag               | Low — unnecessary complexity            |
-| 7   | Comments explain what, not why | Low — style, but important long-term    |
-| 8   | Roadmap comment left in        | Low — cleanup                           |
-| 9   | No log levels                  | Low — good habit to build               |
-
-Start with **2, 3, and 5** — those are the ones that would cause real problems in production. The rest are about writing code that's easier to read and maintain over time.
-
-Good luck with the refactor. Come back with the updated script when you're done.
--- a/documentation/backlog.md
+++ b/documentation/backlog.md
@ -8,15 +8,18 @@ Labels: `[feature]` `[infra]` `[security]` `[ux]` `[debt]`

 Things that are actively in progress or should be picked up immediately. Mostly operational risk and the remaining phase 7 hardening work.

- **Hetzner domain migration check** `[infra]`
-  Verify whether the lilastudy.com domain needs to be migrated following a Hetzner DNS change. Check Hetzner dashboard for any pending migration notice.
-
 ---

 ## next

 Clearly planned work, not yet started. No hard ordering — sequence based on what unblocks real users first.

+- Admin dashboard to see users and their status.
+
+- Refactor frontend: rename "play" to "learn alone" and "multiplayer" to "learn together"; also split multiplayer into public and private games.
+
+- When the user is logged out, the navbar should not show the "play" and "multiplayer" entries.
+
 - **Batch distractor queries to eliminate N+1** `[debt]`
  createGameSession calls getDistractors once per term in parallel — 3 queries for 3 rounds, 10 for 10. Each query does ORDER BY RANDOM() which can't use an index and gets slower as the translations table grows. Fix: add a getDistractorsForTerms(termIds[], ...) function to @lila/db that batches all distractor fetches into a single query and returns results grouped by term. The service distributes the results per question. Prerequisite: none. Blocked by: nothing, but coordinate with any ongoing @lila/db changes.

--- a/documentation/data-pipeline.md
+++ b/documentation/data-pipeline.md
@ -54,6 +54,97 @@ The schema is defined in `data-pipeline/db/schema.sql`. Never edit `pipeline.db`

 On first run the orchestrator initialises `pipeline.db` automatically and imports the stage 1 output into the base tables. This happens once — subsequent runs skip the import if the base tables are already populated.

+## Common commands
+
+### Starting llama.cpp
+
+```bash
+cd ~/Downloads/llama.cpp
+./build/bin/llama-server \
+  --model models/qwen3.5-4b-q4_k_m.gguf \
+  --port 8080 \
+  --ctx-size 4096 \
+  --n-gpu-layers 999 \
+  --host 127.0.0.1 \
+  --chat-template-kwargs '{"enable_thinking":false}' \
+  --reasoning-budget 0
+```
+
+Verify the server is running:
+
+```bash
+curl http://127.0.0.1:8080/health
+```
+
+### Running the pipeline
+
+```bash
+pnpm --filter @lila/pipeline pipeline:run
+```
+
+The pipeline auto-generates a run name from the date and a counter. It picks up where it left off — completed stages are skipped automatically.
+
+### Stage 1 — Extract
+
+```bash
+pnpm --filter @lila/pipeline extract
+```
+
+Runs in sample mode (500 entries per language) by default. Remove the hardcoded limit in `stage-1-extract/scripts/extract.ts` for a full run.
+
+### Stage 2 — Reverse link sync
+
+```bash
+pnpm --filter @lila/pipeline reverse-link
+```
+
+### Initialising and importing the database
+
+```bash
+# Initialise pipeline.db from schema
+pnpm --filter @lila/pipeline db:init
+
+# Import stage 1 output into pipeline.db
+pnpm --filter @lila/pipeline db:import
+```
+
+### Resetting the database
+
+```bash
+# Full reset — delete and reinitialise
+rm data-pipeline/db/pipeline.db
+pnpm --filter @lila/pipeline db:init
+pnpm --filter @lila/pipeline db:import
+pnpm --filter @lila/pipeline reverse-link
+```
+
+### Resetting enrich stage progress
+
+```bash
+# Reset round 1 only
+pnpm --filter @lila/pipeline db:reset round1
+
+# Reset all stages except reverse link
+pnpm --filter @lila/pipeline db:reset all
+```
+
+### Checking pipeline progress
+
+```bash
+node -e "
+const Database = require('better-sqlite3');
+const db = new Database('data-pipeline/db/pipeline.db', { readonly: true });
+const total = db.prepare('SELECT COUNT(*) as c FROM entries WHERE language = \\'en\\'').get().c;
+const complete = db.prepare(\"SELECT COUNT(*) as c FROM run_status WHERE stage = 'round1' AND status = 'complete'\").get().c;
+const needsReview = db.prepare(\"SELECT COUNT(*) as c FROM run_status WHERE stage = 'round1' AND status = 'needs_review'\").get().c;
+console.log('Total English entries:', total);
+console.log('Round 1 complete:', complete);
+console.log('Needs review:', needsReview);
+console.log('Pending:', total - complete - needsReview);
+db.close();
+"
+```
+
 ## Data source

 ### Kaikki (Wiktionary)
@ -171,24 +262,31 @@ pnpm --filter @lila/pipeline reverse-link

 ### 3. Enrich

-The enrich stage runs LLMs to fill four types of gaps, in this order:
+> **Note:** Before running this stage, ensure the llama.cpp server is running
+> locally. The orchestrator checks for a running server at
+> `http://127.0.0.1:8080/health` and exits with instructions if it is not
+> reachable. See `llm-setup.md` for setup instructions.

-**A — Missing translations:** for each entry that has no translation in one or more supported languages after reverse link sync, the LLM generates the best translation for that language given the entry's headword, gloss, and examples.
+The enrich stage runs in four ordered sub-stages per entry, designed to build context progressively.

-**B — Weak glosses and examples:** for each entry where the gloss is missing or the examples are missing, the LLM generates a natural, learner-friendly gloss and one usage example in the entry's language.
+**Sub-stage order:**

-**C — CEFR levels:** for every entry, the LLM assigns a CEFR level (A1–C2) based on the headword, gloss, and examples. This runs for all entries regardless of whether other enrichment was needed.
+1. **`round1_gloss`** — the LLM reviews the existing gloss. If it is clear and learner-friendly, it confirms it. If not, it generates a better one.

-All output is written to `pipeline.db` atomically per entry — runs are fully resumable if interrupted. Each model is run once — one model produces one vote.
+2. **`round1_example`** — the LLM reviews the existing examples. If they are natural and suitable, it confirms them. If not, it generates one better example sentence in the entry language.

-> **Note:** Before running this stage, ensure the llama.cpp server is running locally. The orchestrator checks for a running server at `http://127.0.0.1:8080/health` and exits with instructions if it is not reachable. See `llm-setup.md` for setup instructions.
+3. **`round1_translations`** — using the verified gloss as context, the LLM reviews each existing translation. Valid translations are confirmed. Invalid ones (wrong language, suffixes, garbled text, wrong sense) are explicitly rejected. Missing languages get a generated translation.
+
+4. **`round1_cefr`** — using only the validated translations from the previous sub-stage, the LLM votes on the CEFR level for the headword and for each confirmed translation. Rejected translations never reach this sub-stage.
+
+This ordering ensures the CEFR voting sub-stage only sees clean, verified data.
+
+All output is written to `pipeline.db` atomically per sub-stage per entry. Interrupted runs resume from the last incomplete sub-stage without losing work. Each model is run once — one model, one vote per sub-stage.

 **Input:** `pipeline.db` — entries after reverse link sync
-**Output:** `pipeline.db` — LLM-generated translations, glosses, examples, and CEFR votes
+**Output:** `pipeline.db` — gloss votes, example votes, translation votes, CEFR votes per entry per model

-```bash
-pnpm --filter @lila/pipeline run --name "night-1"
-```
+> **Note:** The tiebreaker is not a standalone script. It runs automatically as part of the pipeline orchestrator after merge completes.

 ### 4. Merge

@ -314,9 +412,9 @@ These are not part of the current pipeline but are worth considering as the data

 ## Roadmap

-**Current state:** Data source migrated from OMW to Kaikki. Production schema and pipeline being rewritten on `feat/kaikki-vocabulary-schema`. Pipeline infrastructure (orchestrator, db init, reporting, tests) is in place and carries forward.
+**Current state:** Stage 1 extraction and stage 2 reverse link sync complete and verified on sample data. Stage 3 enrich script written and tested — redesigning to sub-stage architecture for better data quality. llama.cpp running with Qwen3.5-4B.

-**Next action:** Rewrite production schema in `packages/db`, then rewrite pipeline extraction stage for Kaikki.
+**Next action:** Rewrite enrich script for sub-stage design.

 | Stage           | Status         |
 | --------------- | -------------- |
@ -328,27 +426,32 @@ These are not part of the current pipeline but are worth considering as the data
 | 5. Compare / QA | 🔲 not started |
 | 6. Sync         | 🔲 not started |

-### Stage 1 — Extract `🔲 not started`
+### Stage 1 — Extract `🔄 in progress`

- [ ] Download Kaikki JSONL files for all 5 languages
- [ ] Write extraction script
- [ ] Write stage 1 validation tests
- [ ] Run extraction → `pipeline.db`
+- [x] Download Kaikki JSONL files for all 5 languages
+- [x] Write extraction script
+- [x] Write stage 1 validation tests
+- [x] Write db schema, init, and import scripts
+- [x] Write db import validation tests
+- [x] Run sample extraction → `stage-1-extract/output/{lang}.json`
+- [ ] Remove sample limit and run full extraction
+- [ ] Re-run full import → `pipeline.db`

-### Stage 2 — Reverse link sync `🔲 not started`
+### Stage 2 — Reverse link sync `🔄 in progress`

- [ ] Write reverse link sync script
- [ ] Write tests
- [ ] Run reverse link sync → `pipeline.db`
+- [x] Write reverse link sync script
+- [x] Run reverse link sync on sample data → 141 links inserted
+- [ ] Run reverse link sync on full data after full extraction

-### Stage 3 — Enrich `🔲 not started`
+### Stage 3 — Enrich `🔄 in progress`

-**Next action:** Write the enrich script after production schema is complete.
+**Next action:** Rewrite enrich script for sub-stage design.

- [ ] Write enrich script (missing translations, glosses, examples, CEFR votes)
- [ ] Write tests
- [ ] Install llama.cpp and verify server
- [ ] Smoke test with sample entries
+- [x] Write initial enrich script (single-prompt design)
+- [x] Install llama.cpp and verify server
+- [x] Smoke test with sample entries
+- [ ] Rewrite enrich script for sub-stage design (round1_gloss, round1_example, round1_translations, round1_cefr)
+- [ ] Write tests for enrich sub-stages
 - [ ] Run full sample, collect metrics
 - [ ] Compare providers (local vs OpenRouter free models)
 - [ ] Production run — all entries, all models
--- a/documentation/design.md
+++ b/documentation/design.md
@ -1,5 +0,0 @@
-# design
-
-## notes
-
-break points
--- a/documentation/llm-setup.md
+++ b/documentation/llm-setup.md
@ -1,17 +1,12 @@
 # LLM Setup — lila pipeline

-This document covers the LLM infrastructure for stage 3 (enrich) of the lila
-data pipeline. It documents the hardware constraints, supported providers,
-model recommendations, and how to configure and swap providers in the test
-and production scripts.
+This document covers the LLM infrastructure for stage 3 (enrich) of the lila data pipeline. It documents the hardware constraints, supported providers, model recommendations, and how to configure and swap providers in the test and production scripts.

 ---

 ## Provider model

-Each provider + model combination counts as one vote in the final majority.
-Running the same model twice is not supported — one model, one vote. To
-increase vote confidence, add more models rather than re-running existing ones.
+Each provider + model combination counts as one vote in the final majority. Running the same model twice is not supported — one model, one vote. To increase vote confidence, add more models rather than re-running existing ones.

 ---

@ -24,17 +19,13 @@ increase vote confidence, add more models rather than re-running existing ones.
 | GPU       | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) |
 | OS        | Debian GNU/Linux 13 (trixie) x86_64                             |

-**Local inference verdict:** viable for small/quantized models, not for
-production runs. See the [Local inference](#local-inference-llamacpp) section
-for details.
+**Local inference verdict:** viable for small/quantized models, not for production runs. See the [Local inference](#local-inference-llamacpp) section for details.

 ---

 ## Provider overview

-The enrich script uses a single, swappable provider config. All providers
-except Anthropic expose an OpenAI-compatible API, so the same client code
-works across all of them — only `baseURL`, `apiKey`, and `model` change.
+The enrich script uses a single, swappable provider config. All providers except Anthropic expose an OpenAI-compatible API, so the same client code works across all of them — only `baseURL`, `apiKey`, and `model` change.

 | Provider               | Use case                                      | Cost               | Rate limits            |
 | ---------------------- | --------------------------------------------- | ------------------ | ---------------------- |
@ -49,20 +40,13 @@ works across all of them — only `baseURL`, `apiKey`, and `model` change.

 ### Why local inference is worth testing

-Time is not a constraint — the pipeline scripts are fully resumable. The
-laptop can run overnight for multiple nights. The only question is output
-quality, which the test script evaluates empirically.
+Time is not a constraint — the pipeline scripts are fully resumable. The laptop can run overnight for multiple nights. The only question is output quality, which the test script evaluates empirically.

 ### Hardware constraints

-The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0).
-llama.cpp supports Maxwell via CUDA backend but newer builds may require
-the `--cuda-no-kv-offload` flag depending on the version.
+The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0). llama.cpp supports Maxwell via CUDA backend but newer builds may require the `--cuda-no-kv-offload` flag depending on the version.

-llama.cpp splits model layers between GPU and CPU automatically via
-`--n-gpu-layers`. You set how many layers go on the GPU; the rest run on
-CPU/RAM. This means a model larger than VRAM is not a dead end — it runs
-in hybrid mode, slower than full-GPU but much faster than pure CPU.
+llama.cpp splits model layers between GPU and CPU automatically via `--n-gpu-layers`. You set how many layers go on the GPU; the rest run on CPU/RAM. This means a model larger than VRAM is not a dead end — it runs in hybrid mode, slower than full-GPU but much faster than pure CPU.

 Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):

@ -75,24 +59,19 @@ Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):

 ### Recommended local models

-Two candidates worth testing, covering different points on the size/quality
-tradeoff:
+Two candidates worth testing, covering different points on the size/quality tradeoff:

 **Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)**

 - GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB)
 - Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF
- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+
-  language support including all five pipeline languages. First candidate
-  to test.
+- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+ language support including all five pipeline languages. First candidate to test.

 **Qwen2.5 7B Instruct (Q4_K_M)**

 - GGUF file: `Qwen2.5-7B-Instruct-Q4_K_M.gguf` (~4.5 GB)
 - Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF
- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s.
-  Stronger multilingual generation than any 3–4B model. Second candidate,
-  for comparison against the smaller Gemma 4 E4B.
+- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s. Stronger multilingual generation than any 3–4B model. Second candidate, for comparison against the smaller Gemma 4 E4B.

 ### Installation

--- a/documentation/model-strategy.md
+++ b/documentation/model-strategy.md
@ -0,0 +1,173 @@
+# Model Strategy
+
+## The problem
+
+The pipeline requires LLMs to perform four tasks per vocabulary entry:
+
+1. **Gloss review** — confirm or improve the existing gloss
+2. **Example review** — confirm or improve existing examples
+3. **Translation validation** — confirm valid translations, reject bad data, generate missing ones
+4. **CEFR assignment** — assign A1-C2 to the headword and each translation
+
+The core challenge is that vocabulary entries have **multiple senses**. The word "cat" appears five times in the database — as an animal, as slang for "guy", as a nautical term, as a verb meaning "to vomit", and as a verb meaning "to hoist an anchor". Each sense requires a different CEFR level and different translations. A model that only knows "cat" is A1 gets four out of five wrong.
+
+This makes CEFR assignment fundamentally a **sense-disambiguation problem**, not just a vocabulary lookup. Specialized CEFR classifiers (like `cefrpy` or `dksysd/cefr-classifier`) operate at the word or sentence level and cannot distinguish between senses of the same word. General LLMs handle sense disambiguation well but introduce quality and reliability problems that depend heavily on model size.
+
+The secondary challenge is **hardware constraints**. The available local hardware (GTX 950M, 4GB VRAM) can only run models up to approximately 4B parameters fully in GPU memory. Larger models run in hybrid CPU/GPU mode which is significantly slower. Free cloud API tiers are generous enough for the sample dataset but have daily limits that make processing 100k+ entries across multiple sub-stages a multi-day or multi-week operation.
+
+## What we tried and why it failed or worked
+
+### Single-prompt design (abandoned)
+
+The first enrich script sent one large prompt per entry covering all four tasks at once — CEFR voting, gloss improvement, example improvement, and translation validation (including generation of missing translations). This produced the following problems:
+
+- The model skipped translations it considered invalid rather than explicitly rejecting them, causing validation failures
+- Bad data in the translation table (`it:free`, `de:-frei`, `es:de fai`) caused consistent validation failures because the model refused to vote on them even when explicitly instructed
+- The combined prompt was large enough to trigger reasoning mode on Gemma 4 E4B, consuming all available tokens on thinking before producing output
+- 20% of entries required manual review
+
+### Sub-stage design (current)
+
+Splitting into four ordered sub-stages fixed the reasoning and validation problems:
+
+1. `round1_gloss` — LLM reviews the gloss in isolation
+2. `round1_example` — LLM reviews examples with verified gloss as context
+3. `round1_translations` — LLM validates translations with verified gloss as context
+4. `round1_cefr` — LLM assigns CEFR levels only to validated translations
+
+This ordering ensures the CEFR sub-stage never sees bad data. The smaller, focused prompts no longer trigger reasoning mode, and they reduced per-entry time from ~120 seconds to ~25 seconds.
+
+### Gloss quality (ongoing)
+
+Testing on 50 entries with Qwen3.5-4B showed ~80% good quality. The remaining ~20% of entries failed for one of three reasons:
+
+- **Category header glosses** — Kaikki occasionally uses "Terms relating to people." or "Terms relating to things." as a gloss instead of a real definition. No model handles these correctly because there is no real meaning to improve.
+- **Rare/obscure senses** — slang, archaic, and theological senses that a 4B model does not have enough knowledge to handle (e.g. "cat" meaning "to vomit", "word" meaning "Logos, Christ").
+- **Short ambiguous glosses** — one or two word glosses with no example context cause hallucination.
+
+### Gemma 4 E4B (rejected)
+
+Gemma 4 E4B is a hybrid reasoning model. Disabling thinking via `--reasoning-budget 0` or `--chat-template-kwargs '{"enable_thinking":false}'` does not work reliably in llama.cpp for the E4B variant — the model either puts reasoning into the content field as plain text or returns empty content with reasoning in `reasoning_content`. Per-entry time exceeded 100 seconds making it impractical.
+
+### Qwen3.5-4B (current local model)
+
+The small models in the Qwen3.5 series are non-thinking by default. Qwen3.5-4B runs fully in 4GB VRAM at ~5 seconds per sub-stage. Its quality is acceptable for common vocabulary (A1-B2), but it struggles with rare and specialized senses. It is used as the primary local voter.
+
+### Specialized CEFR classifiers (rejected for primary use)
+
+HuggingFace hosts several CEFR text classifiers (`dksysd/cefr-classifier`, `AbdulSami/bert-base-cased-cefr`) and the `cefrpy` Python library maps individual words to CEFR levels. These operate at the word or sentence level and cannot distinguish between senses. "cat" would always be assigned A1 regardless of whether the sense is the animal or obscure nautical slang. Useful only as a sanity check signal, not as a primary voter.
+
+## Available free resources
+
+| Resource                     | Type               | Requests/day      | Quality   | Notes                                                                  |
+| ---------------------------- | ------------------ | ----------------- | --------- | ---------------------------------------------------------------------- |
+| Local Qwen3.5-4B Q4_K_M      | Local model        | Unlimited         | Decent    | Non-thinking by default, fits in 4GB VRAM, ~5s per sub-stage           |
+| Local Qwen3.5-9B Q4_K_M      | Local model        | Unlimited         | Good      | Hybrid CPU/GPU mode on 4GB VRAM, slower but better quality             |
+| Local Llama 3.1 8B Q4_K_M    | Local model        | Unlimited         | Decent    | ~4.3GB, fits in VRAM or light hybrid, different architecture from Qwen |
+| Groq — Llama 3.3 70B         | Cloud API          | 1,000             | Excellent | Best free quality available, 5-10x with batching                       |
+| Groq — Llama 3.1 8B          | Cloud API          | 14,400            | Decent    | High volume, similar quality to local 4B                               |
+| Google Gemini AI Studio      | Cloud API          | 1,500             | Very good | Google account required, 5-10x with batching                           |
+| OpenRouter free rotation     | Cloud API          | 50–1,000          | Varies    | Rotates between free models automatically via `openrouter/free`        |
+| Wiktionary API               | Context enrichment | Unlimited         | N/A       | Structured vocabulary data, directly related to Kaikki source          |
+| `cefrpy` Python library      | Word lookup        | Unlimited         | Limited   | Deterministic English word CEFR lookup, no sense disambiguation        |
+| HuggingFace CEFR classifiers | Text classifier    | Unlimited (local) | Limited   | Sentence-level difficulty, not sense-aware                             |
+
+### Batching
+
+All cloud APIs support sending multiple entries in a single request. Sending 5 entries per request multiplies effective daily capacity by 5:
+
+- Groq Llama 3.3 70B: 1,000 requests → ~5,000 entries/day
+- Gemini: 1,500 requests → ~7,500 entries/day
+
+### Multiple accounts
+
+Using multiple accounts to multiply free-tier limits is prohibited by the terms of service of all providers listed above.
+
+## Final approach per sub-stage
+
+The pipeline runs multiple models as independent voters. Each model processes every entry once and writes its votes to `pipeline.db`. The merge stage resolves disagreements by majority vote. A tiebreaker runs additional models on flagged entries where no majority was reached.
+
+### round1_gloss and round1_example
+
+These sub-stages require a model that understands sense context from examples. Specialized classifiers cannot help here — only general LLMs can evaluate whether a gloss correctly describes a specific sense.
+
+**Primary voter:** Local Qwen3.5-9B Q4_K_M — runs overnight, unlimited, expected to handle common vocabulary well (not yet tested; Qwen3.5-4B is the current local voter until the 9B model is verified).
+
+**Secondary voter:** Groq Llama 3.3 70B with 5-entry batching — higher quality, catches errors the local model makes on rare or specialized senses.
+
+**Tertiary voter:** Gemini AI Studio with 5-entry batching — third independent opinion, different training data from both Groq and local model.
+
+**Context enrichment via Wiktionary API:** Before calling any model for the gloss or example sub-stage, the pipeline queries the Wiktionary API for the headword. The API returns the full Wiktionary entry including all senses, usage notes, and examples. This structured data is added to the prompt as additional context, giving the model a much clearer picture of which specific sense it is working with.
+
+This directly fixes the two hardest failure cases:
+- **Category header glosses** ("Terms relating to people.") — the Wiktionary entry contains the real definition which the model can use to generate a proper gloss
+- **Short ambiguous glosses** — the additional sense context prevents the model from guessing the wrong meaning
+
+The Wiktionary API is free, has no rate limits for reasonable use, and is directly related to the Kaikki data source since Kaikki extracts from Wiktionary.
+
+### round1_translations
+
+Same voter stack as gloss/example. The few-shot examples in the prompt (showing that `it:free` → reject and `de:-frei` → reject) handle the bad data cases that caused validation failures in the single-prompt design.
+
+### round1_cefr
+
+This sub-stage only receives translations that survived the validation step. All bad data is already excluded.
+
+**Primary voter:** Local Qwen3.5-9B Q4_K_M.
+
+**Secondary voter:** Groq Llama 3.3 70B with 5-entry batching.
+
+**Tertiary voter:** Gemini AI Studio with 5-entry batching.
+
+**Sanity check:** `cefrpy` provides a deterministic English word CEFR level as a reference signal. If the majority LLM vote disagrees significantly (e.g. LLMs vote C2 for "cat" the animal), the entry is flagged for human review. `cefrpy` does not vote — it only triggers review flags.
+
+### Voter summary
+
+| Sub-stage           | Voter 1            | Voter 2            | Voter 3 |
+| ------------------- | ------------------ | ------------------ | ------- |
+| round1_gloss        | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_example      | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_translations | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+| round1_cefr         | Qwen3.5-9B (local) | Groq Llama 3.3 70B | Gemini  |
+
+Three voters means a majority requires at least two models to agree. Even if the local model gets a difficult sense wrong, the two cloud models will likely agree on the correct answer and outvote it.
+
+## Open questions
+
+### Wiktionary API context extraction
+The Wiktionary API returns the full entry for a word including all senses. For a word like "free" with 8+ senses, dumping the entire entry into the prompt wastes tokens and may confuse the model. The open question is how to extract only the relevant sense — options include matching by sense_index, fuzzy-matching the Kaikki gloss against Wiktionary glosses, or letting the model see all senses and identify the correct one itself.
+
+### Batching prompt design
+Batching 5-10 entries per API call multiplies effective daily capacity significantly. The prompt and validation logic for batched requests is more complex — the model must return a structured JSON object keyed by entry ID, and partial failures (one entry in a batch fails validation) need careful handling. Not yet designed or tested.
+
+### Groq and Gemini API integration
+Neither Groq nor Gemini is integrated into the pipeline yet. Both use OpenAI-compatible APIs so integration is straightforward — add provider configs to `stage-3-enrich/config.ts` and set API keys in `.env`. The batching prompt design needs to be finalised first.
+
+### OpenRouter free model rotation
+OpenRouter's `openrouter/free` router selects a model at random from available free models. This means output style and quality vary between requests, which complicates round 2 voting where models review each other's candidates. May need to pin specific free models rather than using the router.
+
+### Qwen3.5-9B performance on hard cases
+The 9B model has not yet been tested. It is expected to handle rare and specialized senses better than the 4B model but this has not been verified. Needs a test run against the same 50 entries used to evaluate the 4B model.
+
+### Llama.cpp Gemma 4 bug
+The llama.cpp chat template bug preventing reliable JSON output from Gemma 4 E4B may be fixed in a future release. The model fits in 4GB VRAM and would be a useful additional local voter if the bug is resolved. Worth checking periodically.
+
+### Full dataset scale
+The current pipeline runs on a 500-entry sample per language. The full Kaikki English file contains approximately 1.3 million entries, of which a fraction will pass the POS and translation filters. The exact count and the time required to run all sub-stages across all models at full scale are not yet known.
+
+### Category header glosses
+Kaikki occasionally uses category headers ("Terms relating to people.", "Terms relating to things.") as glosses. These are not real definitions and no model produces useful output for them. Options include pre-filtering them before the gloss sub-stage and generating a gloss purely from examples, or flagging them as a special case for human review.
+
+## Model download commands
+
+```bash
+# Llama 3.1 8B Instruct Q4_K_M (~4.3GB)
+wget -O models/llama-3.1-8b-instruct-q4_k_m.gguf \
+  "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
+# Qwen3.5-9B Q4_K_M (5.68GB — hybrid mode, better quality)
+wget -O models/qwen3.5-9b-q4_k_m.gguf \
+  "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf"
+# Qwen3.5-9B Q3_K_S (4.32GB — might fit fully in VRAM)
+wget -O models/qwen3.5-9b-q3_k_s.gguf \
+  "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q3_K_S.gguf"
+```
--- a/documentation/notes.md
+++ b/documentation/notes.md
@ -115,45 +115,4 @@ Manage your app audience in the Audience page of the Google Auth Platform.
 - openapi
 - bruno for api testing
 - tailscale
- husky/lint-staged
 - musicforprogramming.net
-
-## openwordnet
-
-download libraries via
-
-```bash
-python -c 'import wn; wn.download("omw-fr")';
-```
-
-libraries:
-
-odenet:1.4
-omw-es:1.4
-omw-fr:1.4
-omw-it:1.4
-omw-en:1.4
-
-upgrade wn package:
-
-```bash
-pip install --upgrade wn
-```
-
-check if wn is available, eg italian:
-
-```bash
-python -c "import wn; print(len(wn.words(lang='it', lexicon='omw-it:1.4')))"
-```
-
-remove a library:
-
-```bash
-python -c "import wn; wn.remove('oewn:2024')"﬌ python -c "import wn; wn.remove('oewn:2024')"
-```
-
-list all libraries:
-
-```bash
-python -c "import wn; print(wn.lexicons())"
-```
--- a/documentation/roasts/gameService.md
+++ b/documentation/roasts/gameService.md
@ -1,348 +0,0 @@
-# 🔥 GameService Roast: `apps/api/src/services/gameService.ts`
-
-> *"It works on my machine" is not a scalability strategy.*
-
-**Project:** lila — Vocabulary Trainer  
-**File Roasted:** `gameService.ts`  
-**Date:** $(date)  
-**Roaster:** Qwen3.6  
-
---
-
-## 📋 Executive Summary
-
-| Metric        | Score    | Notes                                                |
-| ------------- | -------- | ---------------------------------------------------- |
-| Code Quality  | 8/10     | Clean layering, good types, consistent style         |
-| Correctness   | 6/10     | Race condition + N+1 query are critical              |
-| Test Coverage | 7/10     | Good happy-path tests, missing concurrency tests     |
-| Scalability   | 5/10     | Will choke at ~100 concurrent users without fixes    |
-| **Overall**   | **7/10** | Solid foundation, but fix the footguns before launch |
-
---
-
-## 🚨 Critical Issues (Fix Before Production)
-
-### 1. Race Condition: Lost Update in `evaluateAnswer`
-
-**Location:** `gameService.ts:45-58` + `InMemoryGameSessionStore.ts:update()`
-
-// Current flow (VULNERABLE):
-const session = await store.get(submission.sessionId);  // READ
-const updatedAnswers = new Map(session.answers);         // MODIFY (local copy)
-updatedAnswers.delete(submission.questionId);
-await store.update(submission.sessionId, { answers: updatedAnswers }); // WRITE
-
-The Attack:
-
-    Client submits answer A and answer B for the same question (network retry, bug, or malice)
-    Both requests read the same session.answers Map (question still present)
-    Both delete the question from their local copy
-    Both write back → second write overwrites first
-    Result: One answer is silently lost, session state desyncs
-
-Why Tests Missed It: Vitest runs tests synchronously. Race conditions require deliberate concurrency testing.
-Fix Options:
-
-// Option A: Add atomic operation to store interface
-interface GameSessionStore {
-  deleteAnswer(sessionId: string, questionId: string): Promise<boolean>;
-}
-
-// Option B: Use Valkey Lua script for atomic read-modify-write
-// Option C: Optimistic locking with version numbers
-
-Priority: 🔴 CRITICAL — Data integrity issue
-2. N+1 Query: Database Performance Bomb
-Location: gameService.ts:24-26 + termModel.ts:getDistractors()
-
-// For each of N terms, we call getDistractors():
-const questions: GameQuestion[] = await Promise.all(
-  terms.map(async (term) => {
-    const distractorTexts = await getDistractors(term.termId, ...); // 🚩 N queries!
-  })
-);
-
-Impact Analysis:
-
-| Rounds | DB Queries  | At 50 concurrent users |
-| ------ | ----------- | ---------------------- |
-| 3      | 1 + 3 = 4   | 200 queries/min        |
-| 10     | 1 + 10 = 11 | 550 queries/min        |
-| 20     | 1 + 20 = 21 | 1,050 queries/min      |
-
-Each getDistractors() runs:
-
-SELECT text FROM terms 
-JOIN translations ON ... 
-WHERE pos = $1 AND difficulty = $2 AND term_id != $3 AND text != $4 
-ORDER BY RANDOM() LIMIT 6
-
-Fix: Batch Fetch Distractors
-
-// Fetch all distractors in ONE query
-const allDistractors = await db
-  .select({ termId: terms.id, text: translations.text })
-  .from(terms)
-  .innerJoin(translations, /* ... */)
-  .where(and(
-    eq(terms.pos, pos),
-    eq(translations.difficulty, difficulty),
-    inArray(terms.id, termIds), // Batch!
-  ))
-  .limit(DISTRACTOR_FETCH_COUNT * termIds.length);
-
-// Group by termId in JS, then slice to 3 unique distractors per term
-const distractorsByTerm = groupByTermId(allDistractors);
-
-Priority: 🔴 CRITICAL — Performance/scalability issue
-
-3. Error Handling Inconsistency
-Location: gameService.ts:33-36
-
-if (uniqueDistractors.length < 3) {
-  throw new Error(`Not enough unique distractors for term: ${term.targetText}`); // 🚩
-}
-
-Problem: Raw Error bypasses your errorHandler middleware:
-
-    No HTTP status mapping (defaults to 500)
-    No structured logging
-    Inconsistent API responses
-
-Fix:
-import { UnprocessableEntityError } from "../errors/AppError.js";
-
-if (uniqueDistractors.length < 3) {
-  logger.warn({ termId: term.termId, uniqueCount: uniqueDistractors.length }, 
-              "insufficient_distractors");
-  throw new UnprocessableEntityError(
-    `Not enough unique distractors for term: ${term.targetText}`
-  );
-}
-Priority: 🟡 HIGH — Observability & UX issue
-⚠️ High-Severity Smells
-4. Code Duplication: Singleplayer vs Multiplayer
-Compare: gameService.ts vs multiplayerGameService.ts
-// gameService.ts
-const optionTexts = [term.targetText, ...uniqueDistractors.slice(0, 3)];
-const shuffledTexts = shuffleArray(optionTexts);
-const correctOptionId = shuffledTexts.indexOf(term.targetText);
-
-// multiplayerGameService.ts (lines 35-45)
-const optionTexts = [correctAnswer.targetText, ...distractorTexts];
-const shuffledTexts = shuffle(optionTexts); // Different function, same logic!
-const correctOptionId = shuffledTexts.indexOf(correctAnswer.targetText);
-
-Risks:
-
-    Fix shuffle bias in one place, forget the other
-    Add new option type (e.g., etymology hint), update one service only
-    Harder to test core game logic in isolation
-
-Fix: Extract pure function to @lila/shared or new @lila/game-logic:
-
-// packages/shared/src/game-logic.ts
-export const buildQuestionOptions = (
-  correctAnswer: string,
-  distractors: string[],
-  optionCount: number = 4
-): { options: AnswerOption[]; correctOptionId: number } => {
-  const uniqueDistractors = [...new Set(distractors.filter(d => d !== correctAnswer))];
-  const optionTexts = [correctAnswer, ...uniqueDistractors.slice(0, optionCount - 1)];
-  const shuffled = shuffleSecure(optionTexts);
-  const correctOptionId = shuffled.indexOf(correctAnswer);
-  
-  return {
-    options: shuffled.map((text, idx) => ({ optionId: idx, text })),
-    correctOptionId
-  };
-};
-
-Priority: 🟡 HIGH — Maintainability issue
-5. Shuffle Bias: Math.random() Trap
-Location: utils.ts:shuffleArray() + multiplayerGameService.ts:shuffle()
-
-export const shuffleArray = <T>(array: T[]): T[] => {
-  for (let i = result.length - 1; i > 0; i--) {
-    const j = Math.floor(Math.random() * (i + 1)); // 🚩 Modulo bias + non-crypto RNG
-    // ...
-  }
-};
-
-The Math:
-
-    Math.random() has ~53 bits of entropy (fine for vocab)
-    Math.floor(rand * n) has modulo bias when n isn't a power of 2
-    For n=4: bias is ~0.01% (tiny, but non-zero)
-
-When It Matters:
-
-    Competitive leaderboards ("option 0 is correct 26% of the time")
-    Achievement systems based on answer patterns
-    Security-sensitive features (not applicable here, but principle matters)
-
-Fix (if needed):
-import { randomBytes } from "crypto";
-
-const shuffleSecure = <T>(array: T[]): T[] => {
-  const result = [...array];
-  for (let i = result.length - 1; i > 0; i--) {
-    // Use crypto.randomBytes for a cryptographically secure random source
-    const rand = randomBytes(4).readUInt32LE(0);
-    const j = rand % (i + 1);
-    [result[i], result[j]] = [result[j], result[i]];
-  }
-  return result;
-};
-
-Priority: 🟢 LOW — Document tradeoff and move on for now
-
-6. Test Coverage Gaps
-File: gameService.test.ts
-✅ Well Tested:
-
-    Happy path: session creation, answer evaluation
-    Edge cases: duplicate distractors, empty results, invalid inputs
-    Error propagation from DB layer
-
-❌ Missing Tests:
-
-// 1. Concurrency test (race condition)
-it("rejects duplicate answers for same question under concurrent load", async () => {
-  const session = await createGameSession(validRequest, store, "user-1");
-  const question = session.questions[0]!;
-  
-  // Submit two answers simultaneously
-  const [result1, result2] = await Promise.allSettled([
-    evaluateAnswer({ sessionId, questionId, selectedOptionId: 0 }, store, "user-1"),
-    evaluateAnswer({ sessionId, questionId, selectedOptionId: 1 }, store, "user-1"),
-  ]);
-  
-  // Exactly one should succeed, one should throw ConflictError
-  expect([result1, result2].filter(r => r.status === "fulfilled")).toHaveLength(1);
-});
-
-// 2. TTL expiration test
-it("deletes session after TTL expires", async () => {
-  vi.useFakeTimers();
-  const session = await createGameSession(validRequest, store, "user-1");
-  
-  vi.advanceTimersByTime(31 * 60 * 1000); // 31 minutes
-  
-  await expect(store.get(session.sessionId)).resolves.toBeNull();
-});
-
-// 3. Distractor fallback strategy test
-it("uses fallback when <3 unique distractors available", async () => {
-  mockGetDistractors.mockResolvedValue(["same", "same", "same", "same"]);
-  // Should either: (a) fetch from broader pool, or (b) reduce rounds gracefully
-});
-
-Priority: 🟡 HIGH — Prevents regression on critical fixes
-🧼 Code Quality Nitpicks
-7. Magic Numbers
-
-// gameService.ts:52
-await store.create(sessionId, {...}, 30 * 60 * 1000); // What is this?
-
-// termModel.ts:65
-.limit(count); // count=6, but why?
-
-// shared/schemas/game.ts:15
-optionId: z.number().int().min(0).max(3), // Why 4 options?
-
-Fix: Centralize in @lila/shared/constants.ts:
-
-export const GAME_SESSION_TTL_MS = 30 * 60 * 1000;
-export const DISTRACTOR_FETCH_COUNT = 6;
-export const GAME_OPTION_COUNT = 4;
-export const MIN_UNIQUE_DISTRACTORS = 3;
-
-8. Mutable Reference Leakage
-Location: InMemoryGameSessionStore.ts:get()
-
-get(sessionId: string): Promise<GameSessionData | null> {
-  return Promise.resolve(entry.data); // 🚩 Returns mutable reference to internal state
-}
-
-Risk: Any code that does session.answers.delete(...) mutates the store's internal Map directly.
-Fix:
-
-// Option A: Deep clone (simple, works for this data shape)
-return Promise.resolve(structuredClone(entry.data));
-
-// Option B: Return readonly view (TypeScript-only protection)
-return Promise.resolve(entry.data as Readonly<GameSessionData>);
-
-// Option C: Use immutable data structures (overkill for now)
-
-9. Zero Observability
-Problem: No logging, no metrics. You're flying blind in production.
-Minimal Fix (5 minutes):
-
-
-
-// apps/api/src/lib/logger.ts
-import pino from "pino";
-export const logger = pino({ 
-  level: process.env.LOG_LEVEL || "info",
-  transport: process.env.NODE_ENV === "production" 
-    ? { target: "pino-pretty" } 
-    : undefined 
-});
-
-// In gameService.ts:
-import { logger } from "../lib/logger.js";
-
-logger.info(
-  { userId, sourceLang, targetLang, termCount: terms.length },
-  "game_session_created"
-);
-
-logger.debug(
-  { sessionId, questionId, isCorrect, responseTimeMs },
-  "answer_evaluated"
-);
-
-Bonus: Export a Prometheus histogram for game_service_duration_seconds.
-
-10. ORDER BY RANDOM() Time Bomb
-Location: termModel.ts:getGameTerms() + getDistractors()
-
-.orderBy(sql`RANDOM()`) // 🚩 Fine for 10k rows, slow for 1M
-
-The Comment Admits It:
-
-// TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set...
-
-Reality Check: "Post-MVP" never comes without a ticket.
-Fix Options:
-
-- Option A: Pre-computed random_seed column (updated nightly)
-WHERE ... AND random_seed >= random() 
-ORDER BY random_seed 
-LIMIT $1
-
-- Option B: TABLESAMPLE for approximate sampling (Postgres 9.5+)
-FROM terms TABLESAMPLE SYSTEM(10) 
-WHERE ... 
-LIMIT $1
-
-- Option C: Random offset (simple, but still scans)
-OFFSET floor(random() * (SELECT count(*) FROM terms WHERE ...))
-
-Action: Add a ticket to documentation/tickets/t00009.md now.
--- a/documentation/tickets/blueprint.md
+++ b/documentation/tickets/blueprint.md
@ -1,95 +0,0 @@
-# Ticket Blueprint
-
-Two formats depending on task type. Choose based on whether a meaningful
-decision between options was made.
-
---
-
-## Format A — ADR (architectural/infrastructural decisions)
-
-Use when: you chose between options with long-term consequences.
-Prefix: `adr-`
-
---
-
-# ADR: <title>
-
-## Status
-
-Accepted | Superseded by | Deprecated
-
-## Date
-
-YYYY-MM-DD
-
-## Context
-
-What is the problem? Why does it need to be solved?
-
-## Decision
-
-What was chosen and why in one or two sentences.
-
-## Options considered
-
-### Option A — <name> ✅
-
-Description. Why it was chosen.
-
-### Option B — <name>
-
-Description. Why it was rejected.
-
-## Consequences
-
- What gets better
- What gets worse or more complex
- Operational implications
- What breaks if this needs to be redone
-
-## Affected files / machines
-
- List files, servers, or systems touched
-
-## References
-
- Links to relevant docs
-
---
-
-## Setup guide / implementation notes
-
-Step-by-step of what was actually done.
-
---
-
-## Format B — Task (features, fixes, chores)
-
-Use when: routine task with a clear solution.
-Prefix: `feat-` / `fix-` / `chore-`
-
---
-
-# <prefix>: <title>
-
-## Problem
-
-What was wrong or missing?
-
-## Options considered
-
-### Option A — <name> ✅
-
-### Option B — <name>
-
-## Solution
-
-What was done and why.
-
-## Files changed
-
- `path/to/file.ts`
-
-## Commit
-
-`<type>: <message>`
--- a/documentation/tickets/t00001.md
+++ b/documentation/tickets/t00001.md
@ -1,107 +0,0 @@
-# ADR: Docker Credential Helper Setup
-
-## Status
-
-Accepted
-
-## Date
-
-2026-04-26
-
-## Context
-
-Docker credentials for `git.lilastudy.com` and `dhi.io` were stored as base64-encoded strings in `~/.docker/config.json` on both the dev laptop and the VPS. Base64 is not encryption — anyone with read access to the file can decode the credentials instantly.
-
-## Decision
-
-Use `pass` (GPG-backed password store) as the Docker credential helper on both machines.
-
-## Options considered
-
-### Option A — `pass` (GPG-backed) ✅
-
-Stores credentials encrypted with a GPG key. Works on headless servers and desktops without GNOME. Industry standard for Linux servers.
-
-### Option B — `secretservice` (GNOME keyring)
-
-Uses the desktop keyring daemon. Not suitable for a headless VPS, and not suitable for an i3 desktop without running `gnome-keyring-daemon` manually.
-
-### Option C — `gnome-libsecret`
-
-Same limitations as Option B.
-
-## Consequences
-
- Credentials are now GPG-encrypted at rest on both machines
- Requires GPG passphrase entry when Docker needs to pull credentials
-  in a new session
- Must be set up manually on each machine — not reproducible via the repo
- VPS setup must be repeated if the server is reprovisioned
-
-## Affected machines
-
- Dev laptop (Debian 13, i3)
- VPS (Debian 13, ARM64, headless)
-
-## References
-
- [docker docs](https://docs.docker.com/reference/cli/docker/login/#credential-stores)
- [pass docs](https://www.passwordstore.org/)
-
---
-
-## Setup guide
-
-Repeat these steps on each machine.
-
-### 1. Install dependencies
-
-```bash
-sudo apt-get install -y pass gnupg2 golang-docker-credential-helpers
-```
-
-### 2. Generate a GPG key
-
-```bash
-gpg --full-generate-key
-```
-
-Choose RSA, 4096 bits, no expiry. Set a strong passphrase.
-
-### 3. Get the key ID
-
-```bash
-gpg --list-secret-keys --keyid-format LONG
-```
-
-Copy the hex string after the `/` on the `sec` line.
-
-### 4. Initialise pass
-
-```bash
-pass init <your-key-id>
-```
-
-### 5. Update `~/.docker/config.json`
-
-Replace the entire file contents with:
-
-```json
-{ "credsStore": "pass" }
-```
-
-### 6. Re-login to registries
-
-```bash
-docker login git.lilastudy.com
-# dev laptop only:
-docker login dhi.io
-```
-
-### 7. Verify
-
-```bash
-cat ~/.docker/config.json
-```
-
-Should show only `"credsStore": "pass"` with no `auths` block.
--- a/documentation/tickets/t00002.md
+++ b/documentation/tickets/t00002.md
@ -1,149 +0,0 @@
-# ADR: Change GAME_ROUNDS from strings to numbers
-
-## Status
-
-Accepted
-
-## Date
-
-2026-04-28
-
-## Context
-
-`GAME_ROUNDS` in `packages/shared/src/constants.ts` was typed as `["3", "10"] as const`, making `GameRounds` a string union (`"3" | "10"`). This meant `gameService.ts` had to cast the value with `Number(request.rounds)` deep in business logic — a type conversion happening far from the boundary where data enters the system. The type system was lying: `rounds` was described as a string everywhere but used as a number where it mattered.
-
-## Decision
-
-Change `GAME_ROUNDS` to `[3, 10] as const` and update the Zod schema to use `z.literal(GAME_ROUNDS)` instead of `z.enum(GAME_ROUNDS)`. The single source of truth remains `constants.ts` — adding a new round count (e.g. `20`) requires only editing that file.
-
-## Options considered
-
-### Option A — Numbers everywhere ✅
-
-Change `GAME_ROUNDS` to `[3, 10] as const`. Use `z.literal(GAME_ROUNDS)` in the schema. Update the frontend component state and `SettingGroup` props. Drop `Number()` cast in the service.
-
-Chosen because: JSON carries numbers natively, both ends of the wire are owned by this codebase, and type conversions belong at the boundary — not inside business logic.
-
-### Option B — Keep strings, accept the cast
-
-Leave `GAME_ROUNDS` as `["3", "10"]`. The `Number()` cast stays in `gameService.ts`.
-
-Rejected because: it pushes type conversion into business logic and makes the inferred `GameRequest` type misleading. The cast has to live somewhere — the schema boundary is the right place.
-
-### Option C — Coerce at the schema boundary
-
-Keep `GAME_ROUNDS` as numbers but use `z.coerce.number().pipe(z.literal(GAME_ROUNDS))` so the frontend can keep sending strings.
-
-Rejected because: coercion is for untrusted or uncontrolled inputs (form fields, query params, third-party clients). We control both ends of the wire. Coercing a self-inflicted type mismatch is treating a wound we gave ourselves.
-
-## Consequences
-
- `GameRounds` is now `3 | 10` instead of `"3" | "10"`
- `Number(request.rounds)` cast removed from `gameService.ts`
- `SettingGroup` in `GameSetup.tsx` now accepts `string | number` options
- `useState<string>` for rounds changed to `useState<number>`
- Adding a new round count requires only editing `GAME_ROUNDS` in `constants.ts`
- `z.enum` cannot be used for number literals — `z.literal` must be used instead (this is a Zod constraint, not a project convention)
-
-## Affected files
-
- `packages/shared/src/constants.ts`
- `packages/shared/src/schemas/game.ts`
- `apps/api/src/services/gameService.ts`
- `apps/api/src/services/gameService.test.ts`
- `apps/api/src/controllers/gameController.test.ts`
- `apps/web/src/components/game/GameSetup.tsx`
-
-## References
-
- [Zod literals](https://zod.dev/?id=literals)
-
---
-
-## Setup guide / implementation notes
-
-1. In `packages/shared/src/constants.ts`, change:
-
-   ```ts
-   export const GAME_ROUNDS = ["3", "10"] as const;
-   ```
-
-   to:
-
-   ```ts
-   export const GAME_ROUNDS = [3, 10] as const;
-   ```
-
-2. In `packages/shared/src/schemas/game.ts`, change:
-
-   ```ts
-   rounds: z.enum(GAME_ROUNDS),
-   ```
-
-   to:
-
-   ```ts
-   rounds: z.literal(GAME_ROUNDS),
-   ```
-
-3. In `apps/api/src/services/gameService.ts`, change:
-
-   ```ts
-   Number(request.rounds),
-   ```
-
-   to:
-
-   ```ts
-   request.rounds,
-   ```
-
-4. In `apps/api/src/services/gameService.test.ts`, change:
-
-   ```ts
-   rounds: "3",
-   ```
-
-   to:
-
-   ```ts
-   rounds: 3,
-   ```
-
-5. In `apps/api/src/controllers/gameController.test.ts`, change:
-
-   ```ts
-   rounds: "3",
-   ```
-
-   to:
-
-   ```ts
-   rounds: 3,
-   ```
-
-   Also add a pinning test before the refactor:
-
-   ```ts
-   it("returns 400 when rounds has an invalid value", async () => {
-     const res = await request(app)
-       .post("/api/v1/game/start")
-       .send({ ...validBody, rounds: "invalid" });
-     expect(res.status).toBe(400);
-     expect(res.body.success).toBe(false);
-   });
-   ```
-
-6. In `apps/web/src/components/game/GameSetup.tsx`:
-   - Update `SettingGroup` props to accept `string | number`:
-
-     ```ts
-     type SettingGroupProps = {
-       options: readonly (string | number)[];
-       selected: string | number;
-       onSelect: (value: string | number) => void;
-     };
-     ```
-
-   - Update `LABELS` lookup to `LABELS[String(option)]`
-   - Change rounds state from `useState<string>` to `useState<number>`
--- a/documentation/tickets/t00003.md
+++ b/documentation/tickets/t00003.md
@ -1,37 +0,0 @@
-# refactor: extract shuffleArray to lib/utils, rename correctAnswers to terms
-
-## Problem
-
-Two readability issues in `gameService.ts`:
-
-1. `shuffle` was defined as a private function at the bottom of `gameService.ts`, after the function that calls it. It is a pure generic utility with no dependency on game domain logic, so it had no business living there.
-
-2. The variable holding terms fetched from the database was named `correctAnswers`. These are word pairs — they only become "correct answers" once options are built around them. The name was premature and misleading.
-
-## Options considered
-
-### Option A — Move `shuffle` up in the same file
-
-Simple, no new files. Fixes the ordering issue but keeps a generic utility buried in domain code.
-
-### Option B — Extract to `lib/utils.ts` ✅
-
-Move `shuffle` (renamed `shuffleArray`) to `apps/api/src/lib/utils.ts` and import it. Cleaner separation: domain logic stays in services, generic utilities live in `lib/`.
-
-Chosen because `lib/` already exists, the function is reusable, and it gives future utilities a home.
-
-## Solution
-
- Created `apps/api/src/lib/utils.ts` with `shuffleArray`
- Renamed `shuffle` → `shuffleArray` for clarity at the call site
- Removed the inline `shuffle` from `gameService.ts` and imported from `lib/utils.ts`
- Renamed `correctAnswers` → `terms` and `correctAnswer` → `term` throughout `gameService.ts`
-
-## Files changed
-
- `apps/api/src/lib/utils.ts` — created
- `apps/api/src/services/gameService.ts` — removed `shuffle`, updated import, renamed variables
-
-## Commit
-
-`refactor: extract shuffleArray to lib/utils, rename correctAnswers to terms`
--- a/documentation/tickets/t00004.md
+++ b/documentation/tickets/t00004.md
@ -1,110 +0,0 @@
-# ADR: Dependency injection for GameSessionStore via composition root
-
-## Status
-
-Accepted
-
-## Date
-
-2026-04-28
-
-## Context
-
-`gameService.ts` had a module-level singleton:
-
-```ts
-const gameSessionStore = new InMemoryGameSessionStore();
-```
-
-This made the store invisible to anything outside the file. The `GameSessionStore` interface existed to make the store swappable — but the singleton made that impossible without editing the service itself. Tests shared the same instance across every test run, creating the potential for ghost sessions leaking between tests. The controller also briefly owned the singleton during an intermediate step, which violated the principle that controllers should only handle HTTP concerns.
-
-## Decision
-
-Adopt a composition root pattern. The store is created once in `createApp()` and passed down through factory functions: `createApiRouter(store)` → `createGameRouter(store)` → `createGameController(store)` → service calls. Neither the controller nor the service knows which implementation they're working with — they both see `GameSessionStore`.
-
-## Options considered
-
-### Option A — Composition root ✅
-
-Convert routers and controllers to factory functions. Create the store in `createApp()` and pass it down. The store is created once, at the top, and injected through the call chain.
-
-Chosen because: clean separation of concerns, no layer below `createApp()` needs to know the concrete implementation, swapping to `ValKeyGameSessionStore` is a one-line change in `app.ts`, and tests get fresh isolated store instances.
-
-### Option B — Keep singleton in controller
-
-Leave the store as a module-level singleton in `gameController.ts`. Controllers own the store lifetime.
-
-Rejected because: controllers should only handle HTTP concerns. Owning infrastructure lifetime is not an HTTP concern.
-
-### Option C — DI framework (tsyringe, inversify)
-
-Use a proper dependency injection container.
-
-Rejected because: overkill for the current scale. The composition root pattern achieves the same result with zero dependencies and no magic.
-
-## Consequences
-
- Swapping `InMemoryGameSessionStore` for `ValKeyGameSessionStore` requires editing one line in `app.ts`
- Tests create fresh `InMemoryGameSessionStore` instances per test — no shared state, no ghost sessions
- Routers and controllers are now factory functions instead of module-level singletons — slightly more verbose but explicitly testable
- `gameController.test.ts` uses `createApp()` which owns the store — controller tests remain integration-style and unaffected
- All layers below `createApp()` depend only on the `GameSessionStore` interface, never the concrete implementation
-
-## Affected files
-
- `apps/api/src/app.ts` — creates the store, passes to `createApiRouter`
- `apps/api/src/routes/apiRouter.ts` — converted to `createApiRouter(store)` factory
- `apps/api/src/routes/gameRouter.ts` — converted to `createGameRouter(store)` factory
- `apps/api/src/controllers/gameController.ts` — converted to `createGameController(store)` factory
- `apps/api/src/services/gameService.ts` — `store` parameter added to both functions, singleton removed
- `apps/api/src/services/gameService.test.ts` — fresh store per describe block via `beforeEach`
-
-## References
-
- [Composition root pattern](https://blog.ploeh.dk/2011/07/28/CompositionRoot/)
-
---
-
-## Setup guide / implementation notes
-
-1. `gameService.ts` — remove module-level singleton, add `store: GameSessionStore` parameter to `createGameSession` and `evaluateAnswer`
-
-2. `gameController.ts` — convert exported functions to a factory:
-
-   ```ts
-   export const createGameController = (store: GameSessionStore) => ({
-     createGame: async (req, res, next) => { ... },
-     submitAnswer: async (req, res, next) => { ... },
-   });
-   ```
-
-3. `gameRouter.ts` — convert to factory:
-
-   ```ts
-   export const createGameRouter = (store: GameSessionStore): Router => {
-     const router = express.Router();
-     const controller = createGameController(store);
-     router.post("/start", controller.createGame);
-     router.post("/answer", controller.submitAnswer);
-     return router;
-   };
-   ```
-
-4. `apiRouter.ts` — convert to factory:
-
-   ```ts
-   export const createApiRouter = (store: GameSessionStore): Router => {
-     const router = express.Router();
-     router.use("/game", createGameRouter(store));
-     return router;
-   };
-   ```
-
-5. `app.ts` — create the store at the composition root:
-
-   ```ts
-   const store = new InMemoryGameSessionStore();
-   app.use("/api/v1", createApiRouter(store));
-   ```
-
-6. `gameService.test.ts` — add `let store: InMemoryGameSessionStore` to each `describe` block, reset in `beforeEach`, pass to every service call
--- a/documentation/tickets/t00005.md
+++ b/documentation/tickets/t00005.md
@ -1,93 +0,0 @@
-# ADR: Session lifecycle — TTL and replay protection
-
-## Status
-
-Accepted
-
-## Date
-
-2026-04-28
-
-## Context
-
-`InMemoryGameSessionStore` had no TTL and no cleanup mechanism. Every session created stayed in memory until the process restarted. Additionally, `evaluateAnswer` never removed a question from the answer key after evaluating it, meaning the same question could be submitted multiple times and receive a valid result each time — a potential exploit in multiplayer and a correctness bug in singleplayer.
-
-## Decision
-
-Add a `ttlMs` parameter to `GameSessionStore.create()` so both the in-memory and future Valkey implementations handle expiry consistently. Delete questions from the answer key after evaluation. Delete the session when the last question is answered.
-
-## Options considered
-
-### Option A — Delete on last answer only
-
-Simple. Covers replay protection and normal session completion. Abandoned sessions (player starts game, never finishes) still leak memory.
-
-### Option B — Delete on last answer + TTL on the interface ✅
-
-Delete on answer covers normal flow. TTL covers abandoned sessions. TTL on the interface means `ValKeyGameSessionStore` can use Redis-native `EXPIRE` without any interface changes during migration.
-
-Chosen because it closes the memory leak entirely and makes the Valkey migration a zero-interface-change operation.
-
-### Option C — TTL hardcoded inside InMemoryGameSessionStore only
-
-Simpler short-term. But the interface wouldn't carry the TTL parameter, so `ValKeyGameSessionStore` would need a different mechanism — inconsistency between implementations.
-
-## Consequences
-
- Sessions expire after 30 minutes of inactivity regardless of completion state
- Submitting the same question twice throws `NotFoundError` on the second attempt
- Sessions are deleted automatically when the last question is answered
- `GameSessionStore.create()` now requires a `ttlMs` argument — any future implementation must honour it
- `ValKeyGameSessionStore` can implement TTL via Redis `EXPIRE` with no interface changes
- `InMemoryGameSessionStore` stores `{ data, expiresAt }` entries instead of raw `GameSessionData` — expiry is checked lazily on `get()`
-
-## Affected files
-
- `apps/api/src/gameSessionStore/GameSessionStore.ts` — `ttlMs` added to `create`
- `apps/api/src/gameSessionStore/InMemoryGameSessionStore.ts` — TTL implementation
- `apps/api/src/gameSessionStore/InMemoryGameSessionStore.test.ts` — new test file
- `apps/api/src/services/gameService.ts` — passes TTL to `store.create`, deletes question after evaluation, deletes session when empty
- `apps/api/src/services/gameService.test.ts` — replay protection and session cleanup tests added
-
-## References
-
- [Redis EXPIRE command](https://redis.io/commands/expire/)
-
---
-
-## Setup guide / implementation notes
-
-1. `GameSessionStore.ts` — add `ttlMs` to `create`:
-
-   ```ts
-   create(sessionId: string, data: GameSessionData, ttlMs: number): Promise<void>;
-   ```
-
-2. `InMemoryGameSessionStore.ts` — wrap stored data with expiry:
-
-   ```ts
-   type SessionEntry = { data: GameSessionData; expiresAt: number };
-   ```
-
-   Check expiry on `get()`, delete expired entries lazily.
-
-3. `gameService.ts` — pass TTL when creating session:
-
-   ```ts
-   await store.create(sessionId, { answers: answerKey }, 30 * 60 * 1000);
-   ```
-
-   After evaluating an answer:
-
-   ```ts
-   session.answers.delete(submission.questionId);
-   if (session.answers.size === 0) {
-     await store.delete(submission.sessionId);
-   }
-   ```
-
-4. When implementing `ValKeyGameSessionStore`, apply `ttlMs` as the key's expiry — here via the `EX` option of `SET`, which has the same effect as a separate `EXPIRE` call:
-
-   ```ts
-   await valkey.set(sessionId, serialize(data), "EX", Math.ceil(ttlMs / 1000));
-   ```
--- a/documentation/tickets/t00006.md
+++ b/documentation/tickets/t00006.md
@ -1,125 +0,0 @@
-# ADR: Session ownership check and AuthenticatedRequest type
-
-## Status
-
-Accepted
-
-## Date
-
-2026-04-28
-
-## Context
-
-`evaluateAnswer` accepted any `sessionId` without verifying it belonged to the requesting user. The only protection was the unguessability of a UUID — security through obscurity. If a user intercepted or guessed another user's `sessionId`, they could submit answers on their behalf.
-
-Additionally, protected controller handlers typed their `req` parameter as `Request`, making `session` optional even though `requireAuth` middleware guarantees it is present. This required non-null assertions (`req.session!`) in business logic — a type assertion that could cause a runtime crash if middleware ordering ever changed.
-
-## Decision
-
-Store `userId` in `GameSessionData`. Pass `userId` from the controller into both `createGameSession` and `evaluateAnswer`. Assert ownership on evaluation — if the session's `userId` doesn't match the requesting user's ID, throw `NotFoundError`. Introduce `AuthenticatedRequest` to eliminate non-null assertions in protected handlers.
-
-## Options considered
-
-### Option A — AuthenticatedRequest type ✅
-
-Define `AuthenticatedRequest = Request & { session: { session: Session; user: User } }` in `types/express.d.ts`. Use it in protected controller handlers instead of `Request`. Requires one `as express.RequestHandler` cast per protected route at route registration due to Express's type limitations.
-
-Chosen because: eliminates dangerous non-null assertions in business logic. The cast at route registration only works around a limitation in a third-party library's typings; it does not paper over uncertain logic.
-
-### Option B — Non-null assertion (`req.session!`)
-
-Keep `Request` on all handlers. Assert `req.session!` at every usage.
-
-Rejected because: non-null assertions in business logic are dangerous — they are erased at compile time, so if middleware ordering ever changes, the code still compiles without complaint and the handler crashes at runtime.
-
---
-
-### Option C — NotFoundError (404) on ownership failure ✅
-
-When a session exists but belongs to a different user, throw `NotFoundError` with the same message as a missing session.
-
-Chosen because: session IDs are opaque secrets. Returning 403 would confirm to the caller that the session ID is valid and belongs to someone else — information they shouldn't have. This pattern is used by GitHub, AWS, and most security-conscious APIs.
-
-### Option D — ForbiddenError (403) on ownership failure
-
-Explicit error that distinguishes "not found" from "not allowed".
-
-Rejected because: for user-owned resources identified by opaque IDs, confirming existence to an unauthorised caller is an information leak. 404 is the industry standard for this case.
-
-## Consequences
-
- Alice cannot submit answers for Bob's session — ownership is verified at the service layer
- `req.session.user.id` is accessible without non-null assertions in protected handlers
- `GameSessionData` now carries `userId` — any future `GameSessionStore` implementation must store and return it
- Route registration requires `as express.RequestHandler` cast for protected handlers — one cast per route, in wiring code only
- `ValKeyGameSessionStore` must serialise and deserialise `userId` alongside `answers`
-
-## Affected files
-
- `apps/api/src/types/express.d.ts` — `AuthenticatedRequest` type added
- `apps/api/src/gameSessionStore/GameSessionStore.ts` — `userId` added to `GameSessionData`
- `apps/api/src/gameSessionStore/InMemoryGameSessionStore.test.ts` — updated data fixtures
- `apps/api/src/services/gameService.ts` — `userId` parameter added to both functions, ownership assertion in `evaluateAnswer`
- `apps/api/src/services/gameService.test.ts` — updated all calls, ownership test added
- `apps/api/src/controllers/gameController.ts` — extracts `userId` from `req.session.user.id`, passes to service calls
- `apps/api/src/routes/gameRouter.ts` — `as express.RequestHandler` cast at route registration
-
-## References
-
- [OWASP: Insecure Direct Object Reference](https://owasp.org/www-community/attacks/Insecure_Direct_Object_Reference)
- [HTTP 403 Forbidden vs 401 Unauthorized responses](https://stackoverflow.com/questions/3297048/403-forbidden-vs-401-unauthorized-http-responses)
-
---
-
-## Setup guide / implementation notes
-
-1. `types/express.d.ts` — add:
-
-   ```ts
-   export type AuthenticatedRequest = Request & {
-     session: { session: Session; user: User };
-   };
-   ```
-
-2. `GameSessionStore.ts` — add `userId` to `GameSessionData`:
-
-   ```ts
-   export type GameSessionData = { answers: Map<string, number>; userId: string };
-   ```
-
-3. `gameService.ts` — add `userId` to both function signatures:
-
-   ```ts
-   export const createGameSession = async (
-     request: GameRequest,
-     store: GameSessionStore,
-     userId: string,
-   ): Promise<GameSession>
-   ```
-
-   Store it on create:
-
-   ```ts
-   await store.create(sessionId, { answers: answerKey, userId }, 30 * 60 * 1000);
-   ```
-
-   Assert on evaluate:
-
-   ```ts
-   if (!session || session.userId !== userId) {
-     throw new NotFoundError(`Game session not found: ${submission.sessionId}`);
-   }
-   ```
-
-4. `gameController.ts` — extract from authenticated request:
-
-   ```ts
-   req.session.user.id
-   ```
-
-5. `gameRouter.ts` — cast at registration:
-
-   ```ts
-   router.post("/start", controller.createGame as express.RequestHandler);
-   router.post("/answer", controller.submitAnswer as express.RequestHandler);
-   ```
--- a/documentation/tickets/t00007.md
+++ b/documentation/tickets/t00007.md
@ -1,41 +0,0 @@
-# feat: guard against empty terms in createGameSession
-
-## Problem
-
-If `getGameTerms` returned an empty array — no vocabulary data matched the requested language, difficulty, and part of speech combination — `createGameSession` would create a session with zero questions and return it. The frontend would receive an empty `questions` array, attempt to render the first question, find nothing, and crash with no useful error message shown to the user.
-
-## Options considered
-
-### Option A — `NotFoundError` (404) ✅
-
-Throw when `terms.length === 0` before any session is created. The combination of filters yielded no data — that's a "not found" situation.
-
-Chosen because: the request is technically valid (all filter values are recognised), but the combination has no matching data. 404 is the correct semantic response.
-
-### Option B — `ValidationError` (400)
-
-Treat empty results as a bad request.
-
-Rejected because: the client sent valid input. The problem is missing data, not invalid input. 400 would be misleading.
-
-## Solution
-
-Added a guard in `createGameSession` immediately after `getGameTerms`:
-
-```ts
-if (terms.length === 0) {
-  throw new NotFoundError("No terms found for the given filters");
-}
-```
-
-The error propagates through the controller's `try/catch` to the error handler, which returns a clean 404 response. No session is created.
-
-## Files changed
-
- `apps/api/src/services/gameService.ts` — empty terms guard added
- `apps/api/src/services/gameService.test.ts` — pinning test added
- `apps/api/src/controllers/gameController.test.ts` — pinning test added at HTTP layer
-
-## Commit
-
-`feat: guard against empty terms in createGameSession`
--- a/documentation/tickets/t00008.md
+++ b/documentation/tickets/t00008.md
@ -1,54 +0,0 @@
-# fix: deduplicate distractors, replace tautological test
-
-## Problem
-
-Two issues in `createGameSession` and its test suite:
-
-1. If `getDistractors` returned the correct answer as one of the distractors, `createGameSession` would include it in the options array without filtering it out. `indexOf` would then find the first occurrence, which might not be the one intended as the correct answer — producing a question where the correct answer appears twice and the stored `correctOptionId` is wrong.
-
-2. The test `"distractors are never the correct answer"` was tautological — it filtered the correct answer out of the options array, then asserted the remaining items were not the correct answer. It was testing that `Array.filter()` works. It could never fail.
-
-## Options considered
-
-### Option A — Filter duplicates after fetching, request extra distractors as buffer ✅
-
-Filter out any distractor that matches the correct answer after fetching. Request 6 distractors instead of 3 to ensure enough remain after deduplication. Take the first 3 valid ones with `slice(0, 3)`.
-
-Chosen because: deduplication at the service layer is the right place — `getDistractors` shouldn't need to know what the correct answer is. Requesting extra provides a buffer against collisions.
-
-### Option B — Fix `getDistractors` to never return the correct answer
-
-Add a NOT filter in the database query.
-
-Not chosen for this ticket — the database query is in `@lila/db` and is a separate concern. The service layer should be defensive regardless of what the model layer returns.
-
-## Solution
-
- Filter distractors against the correct answer before building options:
-
-  ```ts
-  const uniqueDistractors = distractorTexts.filter((t) => t !== term.targetText);
-  const optionTexts = [term.targetText, ...uniqueDistractors.slice(0, 3)];
-  ```
-
- Request 6 distractors instead of 3 to account for potential duplicates
- Replaced tautological test with a test that actually exercises the duplicate case:
-
-  ```ts
-  it("correct answer appears exactly once even if getDistractors returns a duplicate", ...)
-  ```
-
- Added distractor failure propagation test:
-
-  ```ts
-  it("propagates getDistractors failure", ...)
-  ```
-
-## Files changed
-
- `apps/api/src/services/gameService.ts` — deduplication logic, distractor count increased to 6
- `apps/api/src/services/gameService.test.ts` — tautological test replaced, failure test added
-
-## Commit
-
-`fix: deduplicate distractors, replace tautological test, add distractor failure test`
--- a/packages/db/drizzle/0011_nice_spyke.sql
+++ b/packages/db/drizzle/0011_nice_spyke.sql
@ -0,0 +1,46 @@
+CREATE TABLE "entry_translations" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"entry_id" uuid NOT NULL,
+	"target_language_code" varchar(10) NOT NULL,
+	"translation" text NOT NULL,
+	"sense_hint" text,
+	"cefr_level" varchar(2),
+	"difficulty" varchar(20),
+	"source" varchar(50) DEFAULT 'kaikki' NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "unique_translation" UNIQUE("entry_id","target_language_code","translation"),
+	CONSTRAINT "target_language_code_check" CHECK ("entry_translations"."target_language_code" IN ('en', 'it', 'de', 'fr', 'es')),
+	CONSTRAINT "cefr_check" CHECK ("entry_translations"."cefr_level" IS NULL OR "entry_translations"."cefr_level" IN ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')),
+	CONSTRAINT "difficulty_check" CHECK ("entry_translations"."difficulty" IS NULL OR "entry_translations"."difficulty" IN ('easy', 'intermediate', 'hard'))
+);
+--> statement-breakpoint
+CREATE TABLE "vocabulary_entries" (
+	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
+	"headword" text NOT NULL,
+	"language_code" varchar(10) NOT NULL,
+	"pos" varchar(20) NOT NULL,
+	"sense_index" smallint DEFAULT 0 NOT NULL,
+	"gloss" text,
+	"examples" text[] DEFAULT '{}' NOT NULL,
+	"cefr_level" varchar(2),
+	"difficulty" varchar(20),
+	"source" varchar(50) DEFAULT 'kaikki' NOT NULL,
+	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
+	CONSTRAINT "unique_entry" UNIQUE("headword","language_code","pos","sense_index"),
+	CONSTRAINT "language_code_check" CHECK ("vocabulary_entries"."language_code" IN ('en', 'it', 'de', 'fr', 'es')),
+	CONSTRAINT "pos_check" CHECK ("vocabulary_entries"."pos" IN ('noun', 'verb', 'adjective', 'adverb')),
+	CONSTRAINT "cefr_check" CHECK ("vocabulary_entries"."cefr_level" IS NULL OR "vocabulary_entries"."cefr_level" IN ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')),
+	CONSTRAINT "difficulty_check" CHECK ("vocabulary_entries"."difficulty" IS NULL OR "vocabulary_entries"."difficulty" IN ('easy', 'intermediate', 'hard'))
+);
+--> statement-breakpoint
+DROP TABLE "deck_terms" CASCADE;--> statement-breakpoint
+DROP TABLE "decks" CASCADE;--> statement-breakpoint
+DROP TABLE "term_examples" CASCADE;--> statement-breakpoint
+DROP TABLE "term_glosses" CASCADE;--> statement-breakpoint
+DROP TABLE "term_topics" CASCADE;--> statement-breakpoint
+DROP TABLE "terms" CASCADE;--> statement-breakpoint
+DROP TABLE "topics" CASCADE;--> statement-breakpoint
+DROP TABLE "translations" CASCADE;--> statement-breakpoint
+ALTER TABLE "entry_translations" ADD CONSTRAINT "entry_translations_entry_id_vocabulary_entries_id_fk" FOREIGN KEY ("entry_id") REFERENCES "public"."vocabulary_entries"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
+CREATE INDEX "idx_translations_target_lang" ON "entry_translations" USING btree ("target_language_code","difficulty","entry_id");--> statement-breakpoint
+CREATE INDEX "idx_entries_lang_pos" ON "vocabulary_entries" USING btree ("language_code","pos","difficulty");
--- a/packages/db/drizzle/meta/0011_snapshot.json
+++ b/packages/db/drizzle/meta/0011_snapshot.json
@ -0,0 +1,750 @@
+{
+  "id": "6f1811a6-8573-4d43-912a-ceb5191341cc",
+  "prevId": "6c1cb049-807d-43d0-b83e-d3575b80de33",
+  "version": "7",
+  "dialect": "postgresql",
+  "tables": {
+    "public.account": {
+      "name": "account",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "account_id": {
+          "name": "account_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "provider_id": {
+          "name": "provider_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "access_token": {
+          "name": "access_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "refresh_token": {
+          "name": "refresh_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "id_token": {
+          "name": "id_token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "access_token_expires_at": {
+          "name": "access_token_expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "refresh_token_expires_at": {
+          "name": "refresh_token_expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "scope": {
+          "name": "scope",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "password": {
+          "name": "password",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        }
+      },
+      "indexes": {
+        "account_userId_idx": {
+          "name": "account_userId_idx",
+          "columns": [
+            {
+              "expression": "user_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "account_user_id_user_id_fk": {
+          "name": "account_user_id_user_id_fk",
+          "tableFrom": "account",
+          "tableTo": "user",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.entry_translations": {
+      "name": "entry_translations",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "entry_id": {
+          "name": "entry_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "target_language_code": {
+          "name": "target_language_code",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "translation": {
+          "name": "translation",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "sense_hint": {
+          "name": "sense_hint",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "cefr_level": {
+          "name": "cefr_level",
+          "type": "varchar(2)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "difficulty": {
+          "name": "difficulty",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "source": {
+          "name": "source",
+          "type": "varchar(50)",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'kaikki'"
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_translations_target_lang": {
+          "name": "idx_translations_target_lang",
+          "columns": [
+            {
+              "expression": "target_language_code",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "difficulty",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "entry_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "entry_translations_entry_id_vocabulary_entries_id_fk": {
+          "name": "entry_translations_entry_id_vocabulary_entries_id_fk",
+          "tableFrom": "entry_translations",
+          "tableTo": "vocabulary_entries",
+          "columnsFrom": ["entry_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "unique_translation": {
+          "name": "unique_translation",
+          "nullsNotDistinct": false,
+          "columns": ["entry_id", "target_language_code", "translation"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {
+        "target_language_code_check": {
+          "name": "target_language_code_check",
+          "value": "\"entry_translations\".\"target_language_code\" IN ('en', 'it', 'de', 'fr', 'es')"
+        },
+        "cefr_check": {
+          "name": "cefr_check",
+          "value": "\"entry_translations\".\"cefr_level\" IS NULL OR \"entry_translations\".\"cefr_level\" IN ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')"
+        },
+        "difficulty_check": {
+          "name": "difficulty_check",
+          "value": "\"entry_translations\".\"difficulty\" IS NULL OR \"entry_translations\".\"difficulty\" IN ('easy', 'intermediate', 'hard')"
+        }
+      },
+      "isRLSEnabled": false
+    },
+    "public.lobbies": {
+      "name": "lobbies",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "code": {
+          "name": "code",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "host_user_id": {
+          "name": "host_user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "status": {
+          "name": "status",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'waiting'"
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "lobbies_host_user_id_user_id_fk": {
+          "name": "lobbies_host_user_id_user_id_fk",
+          "tableFrom": "lobbies",
+          "tableTo": "user",
+          "columnsFrom": ["host_user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "lobbies_code_unique": {
+          "name": "lobbies_code_unique",
+          "nullsNotDistinct": false,
+          "columns": ["code"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {
+        "lobby_status_check": {
+          "name": "lobby_status_check",
+          "value": "\"lobbies\".\"status\" IN ('waiting', 'in_progress', 'finished')"
+        }
+      },
+      "isRLSEnabled": false
+    },
+    "public.lobby_players": {
+      "name": "lobby_players",
+      "schema": "",
+      "columns": {
+        "lobby_id": {
+          "name": "lobby_id",
+          "type": "uuid",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "score": {
+          "name": "score",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "default": 0
+        },
+        "joined_at": {
+          "name": "joined_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {
+        "lobby_players_lobby_id_lobbies_id_fk": {
+          "name": "lobby_players_lobby_id_lobbies_id_fk",
+          "tableFrom": "lobby_players",
+          "tableTo": "lobbies",
+          "columnsFrom": ["lobby_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        },
+        "lobby_players_user_id_user_id_fk": {
+          "name": "lobby_players_user_id_user_id_fk",
+          "tableFrom": "lobby_players",
+          "tableTo": "user",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {
+        "lobby_players_lobby_id_user_id_pk": {
+          "name": "lobby_players_lobby_id_user_id_pk",
+          "columns": ["lobby_id", "user_id"]
+        }
+      },
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.session": {
+      "name": "session",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "expires_at": {
+          "name": "expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "token": {
+          "name": "token",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "ip_address": {
+          "name": "ip_address",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_agent": {
+          "name": "user_agent",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "user_id": {
+          "name": "user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        }
+      },
+      "indexes": {
+        "session_userId_idx": {
+          "name": "session_userId_idx",
+          "columns": [
+            {
+              "expression": "user_id",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {
+        "session_user_id_user_id_fk": {
+          "name": "session_user_id_user_id_fk",
+          "tableFrom": "session",
+          "tableTo": "user",
+          "columnsFrom": ["user_id"],
+          "columnsTo": ["id"],
+          "onDelete": "cascade",
+          "onUpdate": "no action"
+        }
+      },
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "session_token_unique": {
+          "name": "session_token_unique",
+          "nullsNotDistinct": false,
+          "columns": ["token"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.user": {
+      "name": "user",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "name": {
+          "name": "name",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "email": {
+          "name": "email",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "email_verified": {
+          "name": "email_verified",
+          "type": "boolean",
+          "primaryKey": false,
+          "notNull": true,
+          "default": false
+        },
+        "image": {
+          "name": "image",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "user_email_unique": {
+          "name": "user_email_unique",
+          "nullsNotDistinct": false,
+          "columns": ["email"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.verification": {
+      "name": "verification",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true
+        },
+        "identifier": {
+          "name": "identifier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "value": {
+          "name": "value",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "expires_at": {
+          "name": "expires_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "timestamp",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "verification_identifier_idx": {
+          "name": "verification_identifier_idx",
+          "columns": [
+            {
+              "expression": "identifier",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "policies": {},
+      "checkConstraints": {},
+      "isRLSEnabled": false
+    },
+    "public.vocabulary_entries": {
+      "name": "vocabulary_entries",
+      "schema": "",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "uuid",
+          "primaryKey": true,
+          "notNull": true,
+          "default": "gen_random_uuid()"
+        },
+        "headword": {
+          "name": "headword",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "language_code": {
+          "name": "language_code",
+          "type": "varchar(10)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "pos": {
+          "name": "pos",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": true
+        },
+        "sense_index": {
+          "name": "sense_index",
+          "type": "smallint",
+          "primaryKey": false,
+          "notNull": true,
+          "default": 0
+        },
+        "gloss": {
+          "name": "gloss",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "examples": {
+          "name": "examples",
+          "type": "text[]",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'{}'"
+        },
+        "cefr_level": {
+          "name": "cefr_level",
+          "type": "varchar(2)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "difficulty": {
+          "name": "difficulty",
+          "type": "varchar(20)",
+          "primaryKey": false,
+          "notNull": false
+        },
+        "source": {
+          "name": "source",
+          "type": "varchar(50)",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "'kaikki'"
+        },
+        "created_at": {
+          "name": "created_at",
+          "type": "timestamp with time zone",
+          "primaryKey": false,
+          "notNull": true,
+          "default": "now()"
+        }
+      },
+      "indexes": {
+        "idx_entries_lang_pos": {
+          "name": "idx_entries_lang_pos",
+          "columns": [
+            {
+              "expression": "language_code",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "pos",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            },
+            {
+              "expression": "difficulty",
+              "isExpression": false,
+              "asc": true,
+              "nulls": "last"
+            }
+          ],
+          "isUnique": false,
+          "concurrently": false,
+          "method": "btree",
+          "with": {}
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {
+        "unique_entry": {
+          "name": "unique_entry",
+          "nullsNotDistinct": false,
+          "columns": ["headword", "language_code", "pos", "sense_index"]
+        }
+      },
+      "policies": {},
+      "checkConstraints": {
+        "language_code_check": {
+          "name": "language_code_check",
+          "value": "\"vocabulary_entries\".\"language_code\" IN ('en', 'it', 'de', 'fr', 'es')"
+        },
+        "pos_check": {
+          "name": "pos_check",
+          "value": "\"vocabulary_entries\".\"pos\" IN ('noun', 'verb', 'adjective', 'adverb')"
+        },
+        "cefr_check": {
+          "name": "cefr_check",
+          "value": "\"vocabulary_entries\".\"cefr_level\" IS NULL OR \"vocabulary_entries\".\"cefr_level\" IN ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')"
+        },
+        "difficulty_check": {
+          "name": "difficulty_check",
+          "value": "\"vocabulary_entries\".\"difficulty\" IS NULL OR \"vocabulary_entries\".\"difficulty\" IN ('easy', 'intermediate', 'hard')"
+        }
+      },
+      "isRLSEnabled": false
+    }
+  },
+  "enums": {},
+  "schemas": {},
+  "sequences": {},
+  "roles": {},
+  "policies": {},
+  "views": {},
+  "_meta": { "columns": {}, "schemas": {}, "tables": {} }
+}
--- a/packages/db/drizzle/meta/_journal.json
+++ b/packages/db/drizzle/meta/_journal.json
@ -78,6 +78,13 @@
      "when": 1776929932845,
      "tag": "0010_thankful_reaper",
      "breakpoints": true
+    },
+    {
+      "idx": 11,
+      "version": "7",
+      "when": 1777994750330,
+      "tag": "0011_nice_spyke",
+      "breakpoints": true
    }
  ]
 }
--- a/packages/db/src/db/schema.ts
+++ b/packages/db/src/db/schema.ts
@ -10,6 +10,7 @@ import {
  index,
  boolean,
  integer,
+  smallint,
 } from "drizzle-orm/pg-core";

 import { sql, relations } from "drizzle-orm";
@ -18,182 +19,100 @@ import {
  SUPPORTED_POS,
  SUPPORTED_LANGUAGE_CODES,
  CEFR_LEVELS,
-  SUPPORTED_DECK_TYPES,
  DIFFICULTY_LEVELS,
  LOBBY_STATUSES,
 } from "@lila/shared";

-export const terms = pgTable(
-  "terms",
+// ── Vocabulary ────────────────────────────────────────────────────────────────
+
+export const vocabulary_entries = pgTable(
+  "vocabulary_entries",
  {
    id: uuid().primaryKey().defaultRandom(),
-    source: varchar({ length: 50 }), // 'omw', 'wiktionary', null for manual
-    source_id: text(), // synset_id value for omw, wiktionary QID, etc.
+    headword: text().notNull(),
+    language_code: varchar({ length: 10 }).notNull(),
    pos: varchar({ length: 20 }).notNull(),
+    sense_index: smallint().notNull().default(0),
+    gloss: text(),
+    examples: text().array().notNull().default([]),
+    cefr_level: varchar({ length: 2 }),
+    difficulty: varchar({ length: 20 }),
+    source: varchar({ length: 50 }).notNull().default("kaikki"),
    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
  },
  (table) => [
+    unique("unique_entry").on(
+      table.headword,
+      table.language_code,
+      table.pos,
+      table.sense_index,
+    ),
+    check(
+      "language_code_check",
+      sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
+    ),
    check(
      "pos_check",
      sql`${table.pos} IN (${sql.raw(SUPPORTED_POS.map((p) => `'${p}'`).join(", "))})`,
    ),
-    unique("unique_source_id").on(table.source, table.source_id),
-    index("idx_terms_source_pos").on(table.source, table.pos),
-  ],
-);
-
-export const term_glosses = pgTable(
-  "term_glosses",
-  {
-    id: uuid().primaryKey().defaultRandom(),
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-    language_code: varchar({ length: 10 }).notNull(),
-    text: text().notNull(),
-    description: text(),
-    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-  },
-  (table) => [
-    unique("unique_term_gloss").on(table.term_id, table.language_code),
-    check(
-      "language_code_check",
-      sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
-    ),
-  ],
-);
-
-export const term_examples = pgTable(
-  "term_examples",
-  {
-    id: uuid().primaryKey().defaultRandom(),
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-    language_code: varchar({ length: 10 }).notNull(),
-    text: text().notNull(),
-    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-  },
-  (table) => [
-    unique("unique_term_example").on(
-      table.term_id,
-      table.language_code,
-      table.text,
-    ),
-    check(
-      "language_code_check",
-      sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
-    ),
-    index("idx_term_examples_term_id").on(table.term_id, table.language_code),
-  ],
-);
-
-export const translations = pgTable(
-  "translations",
-  {
-    id: uuid().primaryKey().defaultRandom(),
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-    language_code: varchar({ length: 10 }).notNull(),
-    text: text().notNull(),
-    cefr_level: varchar({ length: 2 }),
-    difficulty: varchar({ length: 20 }),
-    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-  },
-  (table) => [
-    unique("unique_translations").on(
-      table.term_id,
-      table.language_code,
-      table.text,
-    ),
-    check(
-      "language_code_check",
-      sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
-    ),
    check(
      "cefr_check",
-      sql`${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
+      sql`${table.cefr_level} IS NULL OR ${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
    ),
    check(
      "difficulty_check",
-      sql`${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
+      sql`${table.difficulty} IS NULL OR ${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
    ),
-    index("idx_translations_lang").on(
+    index("idx_entries_lang_pos").on(
      table.language_code,
+      table.pos,
      table.difficulty,
-      table.cefr_level,
-      table.term_id,
    ),
  ],
 );

-export const decks = pgTable(
-  "decks",
+export const entry_translations = pgTable(
+  "entry_translations",
  {
    id: uuid().primaryKey().defaultRandom(),
-    name: text().notNull(),
-    description: text(),
-    source_language: varchar({ length: 10 }).notNull(),
-    validated_languages: varchar({ length: 10 }).array().notNull().default([]),
-    type: varchar({ length: 20 }).notNull(),
+    entry_id: uuid()
+      .notNull()
+      .references(() => vocabulary_entries.id, { onDelete: "cascade" }),
+    target_language_code: varchar({ length: 10 }).notNull(),
+    translation: text().notNull(),
+    sense_hint: text(),
+    cefr_level: varchar({ length: 2 }),
+    difficulty: varchar({ length: 20 }),
+    source: varchar({ length: 50 }).notNull().default("kaikki"),
    created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
  },
  (table) => [
-    check(
-      "source_language_check",
-      sql`${table.source_language} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
+    unique("unique_translation").on(
+      table.entry_id,
+      table.target_language_code,
+      table.translation,
    ),
    check(
-      "validated_languages_check",
-      sql`validated_languages <@ ARRAY[${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))}]::varchar[]`,
+      "target_language_code_check",
+      sql`${table.target_language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
    ),
    check(
-      "validated_languages_excludes_source",
-      sql`NOT (${table.source_language} = ANY(${table.validated_languages}))`,
+      "cefr_check",
+      sql`${table.cefr_level} IS NULL OR ${table.cefr_level} IN (${sql.raw(CEFR_LEVELS.map((l) => `'${l}'`).join(", "))})`,
    ),
    check(
-      "deck_type_check",
-      sql`${table.type} IN (${sql.raw(SUPPORTED_DECK_TYPES.map((t) => `'${t}'`).join(", "))})`,
+      "difficulty_check",
+      sql`${table.difficulty} IS NULL OR ${table.difficulty} IN (${sql.raw(DIFFICULTY_LEVELS.map((d) => `'${d}'`).join(", "))})`,
+    ),
+    index("idx_translations_target_lang").on(
+      table.target_language_code,
+      table.difficulty,
+      table.entry_id,
    ),
-    unique("unique_deck_name").on(table.name, table.source_language),
-    index("idx_decks_type").on(table.type, table.source_language),
  ],
 );

-export const deck_terms = pgTable(
-  "deck_terms",
-  {
-    deck_id: uuid()
-      .notNull()
-      .references(() => decks.id, { onDelete: "cascade" }),
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-  },
-  (table) => [primaryKey({ columns: [table.deck_id, table.term_id] })],
-);
-
-export const topics = pgTable("topics", {
-  id: uuid().primaryKey().defaultRandom(),
-  slug: varchar({ length: 50 }).notNull().unique(),
-  label: text().notNull(),
-  description: text(),
-  created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
-});
-
-export const term_topics = pgTable(
-  "term_topics",
-  {
-    term_id: uuid()
-      .notNull()
-      .references(() => terms.id, { onDelete: "cascade" }),
-    topic_id: uuid()
-      .notNull()
-      .references(() => topics.id, { onDelete: "cascade" }),
-  },
-  (table) => [primaryKey({ columns: [table.term_id, table.topic_id] })],
-);
+// ── Auth (managed by Better Auth) ─────────────────────────────────────────────

 export const user = pgTable("user", {
  id: text("id").primaryKey(),
@ -204,7 +123,7 @@ export const user = pgTable("user", {
  createdAt: timestamp("created_at").defaultNow().notNull(),
  updatedAt: timestamp("updated_at")
    .defaultNow()
-    .$onUpdate(() => /* @__PURE__ */ new Date())
+    .$onUpdate(() => new Date())
    .notNull(),
 });

@ -216,7 +135,7 @@ export const session = pgTable(
    token: text("token").notNull().unique(),
    createdAt: timestamp("created_at").defaultNow().notNull(),
    updatedAt: timestamp("updated_at")
-      .$onUpdate(() => /* @__PURE__ */ new Date())
+      .$onUpdate(() => new Date())
      .notNull(),
    ipAddress: text("ip_address"),
    userAgent: text("user_agent"),
@ -245,7 +164,7 @@ export const account = pgTable(
    password: text("password"),
    createdAt: timestamp("created_at").defaultNow().notNull(),
    updatedAt: timestamp("updated_at")
-      .$onUpdate(() => /* @__PURE__ */ new Date())
+      .$onUpdate(() => new Date())
      .notNull(),
  },
  (table) => [index("account_userId_idx").on(table.userId)],
@ -261,24 +180,13 @@ export const verification = pgTable(
    createdAt: timestamp("created_at").defaultNow().notNull(),
    updatedAt: timestamp("updated_at")
      .defaultNow()
-      .$onUpdate(() => /* @__PURE__ */ new Date())
+      .$onUpdate(() => new Date())
      .notNull(),
  },
  (table) => [index("verification_identifier_idx").on(table.identifier)],
 );

-export const userRelations = relations(user, ({ many }) => ({
-  sessions: many(session),
-  accounts: many(account),
-}));
-
-export const sessionRelations = relations(session, ({ one }) => ({
-  user: one(user, { fields: [session.userId], references: [user.id] }),
-}));
-
-export const accountRelations = relations(account, ({ one }) => ({
-  user: one(user, { fields: [account.userId], references: [user.id] }),
-}));
+// ── Lobbies ───────────────────────────────────────────────────────────────────

 export const lobbies = pgTable(
  "lobbies",
@ -318,6 +226,36 @@ export const lobby_players = pgTable(
  (table) => [primaryKey({ columns: [table.lobbyId, table.userId] })],
 );

+// ── Relations ─────────────────────────────────────────────────────────────────
+
+export const vocabularyEntryRelations = relations(
+  vocabulary_entries,
+  ({ many }) => ({ translations: many(entry_translations) }),
+);
+
+export const entryTranslationRelations = relations(
+  entry_translations,
+  ({ one }) => ({
+    entry: one(vocabulary_entries, {
+      fields: [entry_translations.entry_id],
+      references: [vocabulary_entries.id],
+    }),
+  }),
+);
+
+export const userRelations = relations(user, ({ many }) => ({
+  sessions: many(session),
+  accounts: many(account),
+}));
+
+export const sessionRelations = relations(session, ({ one }) => ({
+  user: one(user, { fields: [session.userId], references: [user.id] }),
+}));
+
+export const accountRelations = relations(account, ({ one }) => ({
+  user: one(user, { fields: [account.userId], references: [user.id] }),
+}));
+
 export const lobbyRelations = relations(lobbies, ({ one, many }) => ({
  host: one(user, { fields: [lobbies.hostUserId], references: [user.id] }),
  players: many(lobby_players),
--- a/packages/db/src/models/termModel.ts
+++ b/packages/db/src/models/termModel.ts
@ -1,25 +1,27 @@
 import { db } from "@lila/db";
-import { eq, and, isNotNull, sql, ne } from "drizzle-orm";
-import { terms, translations, term_glosses } from "@lila/db/schema";
+import { eq, and, ne, sql, isNotNull } from "drizzle-orm";
+import { vocabulary_entries, entry_translations } from "@lila/db/schema";
 import { alias } from "drizzle-orm/pg-core";
-
 import type {
  SupportedLanguageCode,
  SupportedPos,
  DifficultyLevel,
 } from "@lila/shared";

+// ── Types ─────────────────────────────────────────────────────────────────────
+
 export type TranslationPairRow = {
-  termId: string;
+  entryId: string;
  sourceText: string;
  targetText: string;
  sourceGloss: string | null;
 };

-// Note: difficulty filter is intentionally asymmetric. We filter on the target
-// (answer) side only — a word can be A2 in Italian but B1 in English, and what
-// matters for the learner is the difficulty of the word they're being taught.
+// ── Queries ───────────────────────────────────────────────────────────────────

+// Note: difficulty filter is intentionally on the target (translation) side.
+// A word can be A2 in one language but B1 in another — what matters for the
+// learner is the difficulty of the word they are being tested on.
 export const getGameTerms = async (
  sourceLanguage: SupportedLanguageCode,
  targetLanguage: SupportedLanguageCode,
@ -27,53 +29,36 @@ export const getGameTerms = async (
  difficulty: DifficultyLevel,
  rounds: number,
 ): Promise<TranslationPairRow[]> => {
-  const sourceTranslations = alias(translations, "source_translations");
-  const targetTranslations = alias(translations, "target_translations");
+  const sourceEntries = alias(vocabulary_entries, "source_entries");
+  const targetTranslations = alias(entry_translations, "target_translations");

  const rows = await db
    .select({
-      termId: terms.id,
-      sourceText: sourceTranslations.text,
-      targetText: targetTranslations.text,
-      sourceGloss: term_glosses.text,
+      entryId: sourceEntries.id,
+      sourceText: sourceEntries.headword,
+      targetText: targetTranslations.translation,
+      sourceGloss: sourceEntries.gloss,
    })
-    .from(terms)
-    .innerJoin(
-      sourceTranslations,
-      and(
-        eq(sourceTranslations.term_id, terms.id),
-        eq(sourceTranslations.language_code, sourceLanguage), // Filter here!
-      ),
-    )
+    .from(sourceEntries)
    .innerJoin(
      targetTranslations,
      and(
-        eq(targetTranslations.term_id, terms.id),
-        eq(targetTranslations.language_code, targetLanguage), // Filter here!
-      ),
-    )
-    .leftJoin(
-      term_glosses,
-      and(
-        eq(term_glosses.term_id, terms.id),
-        eq(term_glosses.language_code, sourceLanguage),
+        eq(targetTranslations.entry_id, sourceEntries.id),
+        eq(targetTranslations.target_language_code, targetLanguage),
+        eq(targetTranslations.difficulty, difficulty),
+        isNotNull(targetTranslations.translation),
      ),
    )
    .where(
      and(
-        eq(terms.pos, pos),
-        eq(targetTranslations.difficulty, difficulty),
-        isNotNull(sourceTranslations.difficulty), // Good data quality check!
+        eq(sourceEntries.language_code, sourceLanguage),
+        eq(sourceEntries.pos, pos),
+        isNotNull(sourceEntries.difficulty),
      ),
    )
-    // TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set before
-    // applying LIMIT, which is fine at current data volumes (low thousands of rows
-    // after POS + difficulty filters) but degrades as the terms table grows. Once
-    // the database is fully populated and tagged, replace with one of:
-    //   - TABLESAMPLE BERNOULLI(n) for approximate sampling on large tables
-    //   - Random offset: SELECT ... OFFSET floor(random() * (SELECT count(*) ...))
-    //   - Pre-computed random column with a btree index, reshuffled periodically
-    // Benchmark first — don't optimise until it actually hurts.
+    // TODO(post-mvp): ORDER BY RANDOM() sorts the entire filtered result set
+    // before applying LIMIT — fine at current data volumes, degrades as the table
+    // grows. Options: TABLESAMPLE BERNOULLI, random OFFSET, indexed random column.
    .orderBy(sql`RANDOM()`)
    .limit(rounds);

@ -81,32 +66,33 @@ export const getGameTerms = async (
 };

 export const getDistractors = async (
-  excludeTermId: string,
+  excludeEntryId: string,
  excludeText: string,
+  sourceLanguage: SupportedLanguageCode,
  targetLanguage: SupportedLanguageCode,
  pos: SupportedPos,
  difficulty: DifficultyLevel,
  count: number,
 ): Promise<string[]> => {
  const rows = await db
-    .select({ text: translations.text })
-    .from(terms)
+    .select({ text: entry_translations.translation })
+    .from(vocabulary_entries)
    .innerJoin(
-      translations,
+      entry_translations,
      and(
-        eq(translations.term_id, terms.id),
-        eq(translations.language_code, targetLanguage),
+        eq(entry_translations.entry_id, vocabulary_entries.id),
+        eq(entry_translations.target_language_code, targetLanguage),
+        eq(entry_translations.difficulty, difficulty),
      ),
    )
    .where(
      and(
-        eq(terms.pos, pos),
-        eq(translations.difficulty, difficulty),
-        ne(terms.id, excludeTermId),
-        ne(translations.text, excludeText),
+        eq(vocabulary_entries.language_code, sourceLanguage),
+        eq(vocabulary_entries.pos, pos),
+        ne(vocabulary_entries.id, excludeEntryId),
+        ne(entry_translations.translation, excludeText),
      ),
    )
-    // TODO(post-mvp): same ORDER BY RANDOM() concern as getGameTerms — see comment there.
    .orderBy(sql`RANDOM()`)
    .limit(count);
Author	SHA1	Message	Date
lila	97b0f302d0	refactoring documentation	2026-05-15 23:09:54 +02:00
lila	04a581efe1	WIP: checkpoint before stage-3 sub-stage rewrite	2026-05-12 22:13:14 +02:00
lila	73fb12ac35	feat: enrich script working, redesigning to sub-stage architecture - Enrich script functional with timeout, progress tracking, rejection mechanism - Identified ordering issue: CEFR voting needs validated translations first - Redesign: round1_gloss → round1_example → round1_translations → round1_cefr - Update data-pipeline.md with new sub-stage design and roadmap - Qwen3.5-4B confirmed working with thinking disabled	2026-05-07 13:09:43 +02:00
lila	7f10c35e03	docs: update roadmap — stage 3 enrich script written, llama.cpp next	2026-05-05 19:30:18 +02:00
lila	9642daf6dd	feat: add stage 3 round 1 enrich script and wire into orchestrator	2026-05-05 19:28:38 +02:00
lila	76af2ab093	fix: update db import validation tests to account for reverse links - Translation count test now adds reverse link count to expected total - Non-English translations test now filters to kaikki source only - Target language test now filters to kaikki source only — reverse links to English are valid and expected	2026-05-05 19:10:19 +02:00
lila	1c44ef989b	feat: update pipeline orchestrator for Kaikki — wire up stages 1 and 2 - Replace checkOmwExists with checkExtractedFilesExist - Wire up importKaikki and reverseLink as real stage implementations - Track reverse link completion via sentinel row in run_status - Update report to use resolved_entry_cefr and entry counts - Stages 3 onwards remain as stubs	2026-05-05 19:04:28 +02:00
lila	6f9a42c707	feat: add stage 2 reverse link sync script	2026-05-05 18:57:55 +02:00
lila	b5a76ee178	docs: update roadmap — stage 1 in progress, sample extraction complete	2026-05-05 18:52:10 +02:00
lila	ba2635e3f7	feat: add stage 1 and db import validation tests for Kaikki schema	2026-05-05 18:51:11 +02:00
lila	0cc643e308	feat: update extractor for all 5 languages, update import for multi-language - Extract.ts now processes all 5 language files, filters non-English entries by lang_code, skips translation extraction for non-English (no translations in source files) - Import.ts now imports all 5 language output files, uses language field from ExtractedSense instead of hardcoding en - Sample limit hardcoded to 500 entries per language for development	2026-05-05 18:46:32 +02:00
lila	209d52f54b	feat: add Kaikki extraction and import scripts for stage 1 - Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, skips abbreviations and senses with no translations in supported languages - Rewrite db/import.ts for Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries - Rewrite db/schema.sql for Kaikki model — entries, translations, LLM vote tables, resolved tables - Add extract and db:import scripts to package.json - Sample mode hardcoded to 500 entries for development	2026-05-05 18:11:53 +02:00
lila	963bff4eb8	feat: migrate production schema from OMW to Kaikki flat vocabulary model - Replace terms/translations/term_glosses/term_examples with vocabulary_entries and entry_translations - Remove decks, topics and related tables (deferred) - Add cefr_level and difficulty to entry_translations for game query filtering - Update termModel.ts for new schema — getDistractors now takes sourceLanguage - Update gameService.ts and multiplayerGameService.ts for entryId rename - Update all test fixtures from termId to entryId - Generate and apply migration 0011	2026-05-05 17:39:25 +02:00