feat: add Kaikki extraction and import scripts for stage 1

- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL,
  filters to supported POS and languages, skips abbreviations and
  senses with no translations in supported languages
- Rewrite db/import.ts for Kaikki flat model — tracks sense_index
  offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for Kaikki model — entries, translations,
  LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode hardcoded to 500 entries for development
This commit is contained in:
lila 2026-05-05 18:11:53 +02:00
parent 963bff4eb8
commit 209d52f54b
17 changed files with 346 additions and 1055737 deletions

1
.gitignore vendored
View file

@ -12,6 +12,7 @@ __pycache__/
data-pipeline/archive/
data-pipeline/stage-1-extract/output/
data-pipeline/stage-1-extract/sources/
data-pipeline/stage-2-annotate/output/
data-pipeline/stage-3-enrich/output/
data-pipeline/stage-4-merge/output/

View file

@ -1,362 +0,0 @@
# OMW German Translation Quality Audit
Instructions: for each entry, check if the German translations
match the meaning described by the English gloss.
Mark QUALITY as:
OK — all German translations fit the meaning
PARTIAL — some fit, some don't
BAD — none of the German translations fit
USELESS — translations are correct but useless for learners
---
1. [noun] ili:i98680
EN gloss: the flowering part of a plant or arrangement of flowers on a stalk
DE gloss: der blühende Teil einer Pflanze oder die Anordnung von Blüten an einem Stiel
EN words: inflorescence
DE words: Blütenstand, Infloreszenz
QUALITY: correct
2. [verb] ili:i24675
EN gloss: make motionless
DE gloss: unbeweglich machen
EN words: still
DE words: stillen, zum Stillstand bringen
QUALITY: stillen means breastfeeding, so it is completely wrong; zum Stillstand bringen is correct, but the gloss sounds weird: unbeweglich machen — no one says this
3. [verb] ili:i22153
EN gloss: lose interest or become bored with something or somebody
DE gloss: das Interesse an etwas oder jemandem verlieren oder sich langweilen
EN words: fatigue, jade, pall, tire, weary
DE words: Langeweile erzeugen, anöden, ermüden, langweilen, sich langweilen, sich zu Tode langweilen, sich öden
QUALITY: its ok
4. [noun] ili:i74742
EN gloss: zealous preaching and advocacy of the gospel
DE gloss: eifriges Predigen und Eintreten für das Evangelium
EN words: evangelism
DE words: Evangelisation, Evangelisierung
QUALITY: ok
5. [noun] ili:i115665
EN gloss: an oxide of iron that is strongly attracted by magnets
DE gloss: ein Eisenoxid, das stark von Magneten angezogen wird
EN words: magnetic iron-ore, magnetite
DE words: Eisenoxiduloxid, Magneteisen, Magneteisenstein, Magnetit
QUALITY: ok
6. [adjective] ili:i17569
EN gloss: of or relating to fatalism
DE gloss: von oder im Zusammenhang mit Fatalismus
EN words: fatalist, fatalistic
DE words: auf alles gefasst, dem Schicksal ergeben, fatalistisch, gottergeben, schicksalsergeben
QUALITY: ok
7. [adjective] ili:i682
EN gloss: having no previous example or precedent or parallel
DE gloss: ohne vorheriges Beispiel oder Präzedenzfall oder Parallele
EN words: new, unexampled
DE words: beispiellos, gab es noch nie, ohne Beispiel, ohne Präzedenzfall, ohnegleichen, präzedenzlos, sondergleichen, unvergleichbar
QUALITY: ok
8. [noun] ili:i114018
EN gloss: a soft silvery metallic element of the rare earth group; isotope 170 emits X-rays and is used in small portable X-ray machines; it occurs in monazite and apatite and xenotime
DE gloss: ein weiches, silbriges Metallelement der Gruppe der Seltenen Erden; Isotop 170 emittiert Röntgenstrahlen und wird in kleinen tragbaren Röntgengeräten verwendet; es kommt in Monazit und Apatit sowie in Xenotim vor
EN words: Tm, atomic number 69, thulium
DE words: Terameter, Tm
QUALITY: ok
9. [noun] ili:i117564
EN gloss: the rate of some repeating event
DE gloss: die Geschwindigkeit eines sich wiederholenden Ereignisses
EN words: pace, tempo
DE words: Takt, Tempo
QUALITY: ok
10. [verb] ili:i31619
EN gloss: let drop or droop
DE gloss: fallen oder hängen lassen
EN words: hang
DE words: am Galgen sterben lassen, aufhängen, aufknüpfen, erhängen, henken, hängen
QUALITY: wrong — let drop means fallen lassen, like dropping something? I'm not sure here: does it really mean to hang someone? If so, then it's ok
11. [noun] ili:i75571
EN gloss: a heavy dull sound (as made by impact of heavy objects)
DE gloss: ein schweres, dumpfes Geräusch (wie beim Aufprall schwerer Gegenstände)
EN words: clump, clunk, thud, thump, thumping
DE words: Geklacker, Geklapper, Klackern, Klappern
QUALITY: ok
12. [noun] ili:i92290
EN gloss: a person who makes a promise
DE gloss: eine Person, die ein Versprechen gibt
EN words: promiser, promisor
DE words: Freud'scher Versprecher, Lapsus Linguae, Versprecher, freudscher Versprecher
QUALITY: completely wrong — a Versprecher is when you intend to say one thing but say something else; it has nothing to do with Versprechen
13. [noun] ili:i59450
EN gloss: a vertical well around which there is a stairway
DE gloss: ein vertikaler Schacht, um den herum eine Treppe verläuft
EN words: stairwell
DE words: Ern, Flur, Hausflur, Stiegenhaus, Treppenhaus
QUALITY: Treppenhaus would be the only correct one, right?
14. [verb] ili:i21908
EN gloss: smile affectedly or derisively
DE gloss: affektiert oder spöttisch lächeln
EN words: simper, smirk
DE words: in sich hinein lächeln, schmunzeln, vor sich hin lächeln
QUALITY: the glosses would be also the words here? schmunzeln and lächeln are kind of the same but the affektiert and spöttisch is missing?
15. [adjective] ili:i10887
EN gloss: tending to reserve or introspection
DE gloss: zur Zurückhaltung oder Introspektion neigend
EN words: indrawn, withdrawn
DE words: allein, einsam, eremitenhaft, eremitisch, für sich, solo, wie ein Einsiedler, wie ein Eremit, zurückgezogen
QUALITY: ok
16. [noun] ili:i113657
EN gloss: a substance from which another substance is formed (especially by a metabolic reaction)
DE gloss: ein Stoff, aus dem ein anderer Stoff gebildet wird (insbesondere durch eine Stoffwechselreaktion)
EN words: precursor
DE words: Ausgangsstoff, Edukt, Grundstoff, Präkursor, Vorläufer, biologische Vorstufe
QUALITY: ok
17. [adjective] ili:i13251
EN gloss: tastelessly showy
DE gloss: geschmacklos und auffällig
EN words: brassy, cheap, flash, flashy, garish, gaudy, gimcrack, loud, meretricious, tacky, tatty, tawdry, trashy
DE words: aufdringlich, marktschreierisch, reißerisch
QUALITY: ok
18. [noun] ili:i68734
EN gloss: the branch of chemistry that studies the relation between chemical action and the amount of heat absorbed or generated
DE gloss: der Zweig der Chemie, der die Beziehung zwischen chemischer Wirkung und der absorbierten oder erzeugten Wärmemenge untersucht
EN words: thermochemistry
DE words: Thermochemie, chemische Thermodynamik
QUALITY: ok
19. [adjective] ili:i12980
EN gloss: distinguished from others in excellence
DE gloss: durch hohe Qualität von anderen unterschieden
EN words: outstanding
DE words: I a, ausgezeichnet, außergewöhnlich, außerordentlich, besonders, bestens, eins a, exzeptionell, herausragend, schnafte, splendid, trefflich, vortrefflich, vorzüglich
QUALITY: ok, aber eins a/1a ist wirklich sehr starke Umgangssprache, und ich habe noch nie schnafte oder splendid gehört; der Rest passt
20. [verb] ili:i30043
EN gloss: tear down so as to make flat with the ground
DE gloss: abreißen, um den Boden zu ebnen
EN words: dismantle, level, pull down, rase, raze, take down, tear down
DE words: abreißen, aus den Augen verlieren, keinen Kontakt mehr haben zu, nicht länger in Kontakt stehen
QUALITY: nur abreißen stimmt, der rest passt in diesem zusammenhang gar nicht!
21. [adjective] ili:i14014
EN gloss: desired or wished for or sought
DE gloss: gewünscht oder gewünscht oder gesucht
EN words: wanted
DE words: benötigt, gesucht, gewünscht
QUALITY: ok
22. [verb] ili:i29481
EN gloss: mar or spoil the appearance of
DE gloss: das Aussehen verunstalten
EN words: blemish, deface, disfigure
DE words: deformieren, entstellen, verhunzen, verschandeln, verunstalten, verunzieren
QUALITY: ok
23. [verb] ili:i28605
EN gloss: spread thickly
DE gloss: dick auftragen
EN words: slather
DE words: beharken, bestreichen, mit Feuer belegen, mit Sperrfeuer belegen
QUALITY: kein wort ist wirklich ein synonym für dick auftragen, (i dont even know if the english word fits here?)
24. [noun] ili:i92029
EN gloss: someone who is licensed to operate an aircraft in flight
DE gloss: jemand, der eine Lizenz zum Führen eines Luftfahrzeugs im Flug hat
EN words: airplane pilot, pilot
DE words: Führer, Lotse, Pilot
QUALITY: nur Pilot stimmt hier
25. [adjective] ili:i8221
EN gloss: capable of being measured
DE gloss: in der Lage, gemessen zu werden
EN words: measurable, mensurable
DE words: bestimmbar, der Messung zugänglich, erhebbar, mensurabel, messbar
QUALITY: ok
26. [noun] ili:i61380
EN gloss: the spirit of a group that makes the members want the group to succeed
DE gloss: der Geist einer Gruppe, der die Mitglieder dazu bringt, den Erfolg der Gruppe zu wollen
EN words: esprit de corps, morale, team spirit
DE words: Gruppengeist, Teamgeist
QUALITY: Gruppengeist hört sich so komisch an, das sagt niemand, teamgeist ist in ordnung
27. [adjective] ili:i10497
EN gloss: free of restrictions or qualifications
DE gloss: Zustand, in dem in einer Wohnung niemand wohnt.
EN words: clean, clear
DE words: frei, leer stehend, leerstehend, unbewohnt, ungenutzt, verwaist
QUALITY: ok
28. [adjective] ili:i6238
EN gloss: moving and bending with ease
DE gloss: anmutig schlank und mit Leichtigkeit biegsam und beweglich
EN words: lissom, lissome, lithe, lithesome, slender, supple, svelte, sylphlike
DE words: elastisch, geschmeidig, schlangenartig
QUALITY: \_\_\_
29. [noun] ili:i57906
EN gloss: station for the production and transmission of AM or FM radio broadcasts
DE gloss: Sender für die Produktion und Übertragung von AM- oder FM-Radiosendungen
EN words: radio station
DE words: Radiosender, Rundfunkstation, Sender
QUALITY: \_\_\_
30. [noun] ili:i112045
EN gloss: the purple or black-and-blue area resulting from a bruise
DE gloss: der violette oder schwarzblaue Bereich, der durch einen Bluterguss entsteht
EN words: ecchymosis
DE words: Ekchymose, kleinflächige Hautblutung
QUALITY: \_\_\_
31. [adjective] ili:i10839
EN gloss: capable of being replaced
DE gloss: kann ersetzt werden
EN words: replaceable
DE words: austauschbar, ersetzbar, fungibel
QUALITY: \_\_\_
32. [verb] ili:i28714
EN gloss: whip
DE gloss: peitschen
EN words: flagellate, scourge
DE words: auspeitschen, flagellieren, geißeln, peitschen
QUALITY: \_\_\_
33. [noun] ili:i52826
EN gloss: a mechanical or electrical explosive device or a small amount of explosive; can be used to initiate the reaction of a disrupting explosive
DE gloss: ein mechanischer oder elektrischer Sprengkörper oder eine kleine Menge Sprengstoff; kann verwendet werden, um die Reaktion eines Sprengstoffs auszulösen
EN words: cap, detonating device, detonator
DE words: Auslöser, Zünder, Zündvorrichtung
QUALITY: \_\_\_
34. [noun] ili:i115477
EN gloss: ice crystals forming a white deposit (especially on objects outside)
DE gloss: Eiskristalle, die einen weißen Belag bilden (insbesondere auf Gegenständen im Freien)
EN words: frost, hoar, hoarfrost, rime
DE words: Raufrost, Raureif, Reif
QUALITY: \_\_\_
35. [noun] ili:i66650
EN gloss: the ability to see in reduced illumination (as in moonlight)
DE gloss: die Fähigkeit, bei reduzierter Beleuchtung zu sehen (wie bei Mondlicht)
EN words: night vision, night-sight, scotopic vision, twilight vision
DE words: Nachtsehen, skotopisches Sehen
QUALITY: \_\_\_
36. [verb] ili:i26849
EN gloss: express or utter with a hiss
DE gloss: mit einem Zischen ausdrücken oder aussprechen
EN words: hiss, sibilate, siss, sizz
DE words: Stimme dämpfen, flüstern, hauchen, hinter vorgehaltener Hand, ins Ohr sagen, leise sprechen, mit tonloser Stimme, munkeln, raunen, säuseln, tonlos, tuscheln, wispern, zischeln, zuflüstern
QUALITY: \_\_\_
37. [noun] ili:i94222
EN gloss: a teenager or a young adult male
DE gloss: ein Jugendlicher oder ein junger Erwachsener
EN words: young buck, young man
DE words: Bruder, Bürschchen, Cowboy, Freundchen, Jungs, Kinders, Kollege, Kollegin, Leute, Mann Gottes, Meister, Sportsfreund, Verehrtester, der Herr, guter Mann, junger Mann, mein Gutster, mein Herr
QUALITY: \_\_\_
38. [noun] ili:i49310
EN gloss: dusky grey food fish found from Louisiana and Florida southward
DE gloss: dunkelgrauer Speisefisch, der von Louisiana und Florida südwärts vorkommt
EN words: Anisotremus surinamensis, black margate, pompon
DE words: Pompon, Puschel, Tanzwedel
QUALITY: \_\_\_
39. [noun] ili:i50315
EN gloss: a small vehicle with four wheels in which a baby or child is pushed around
DE gloss: ein kleines Fahrzeug mit vier Rädern, in dem ein Säugling oder ein Kind herumgeschoben wird
EN words: baby buggy, baby carriage, carriage, go-cart, perambulator, pram, pushchair, pusher, stroller
DE words: Kinderwagen, Säuglingskutsche
QUALITY: \_\_\_
40. [verb] ili:i31857
EN gloss: meet at a point
DE gloss: sich an einem Punkt treffen
EN words: cross, intersect
DE words: gegen den Wind segeln, kreuzen
QUALITY: \_\_\_
41. [noun] ili:i51632
EN gloss: a sailboat with two parallel hulls held together by single deck
DE gloss: ein Boot mit zwei parallelen Rümpfen, die durch ein einziges Deck zusammengehalten werden
EN words: catamaran
DE words: Doppelrumpfboot, Katamaran, Zweirumpfboot
QUALITY: \_\_\_
42. [verb] ili:i34734
EN gloss: to be found to exist
DE gloss: als existent befunden werden
EN words: occur
DE words: anzutreffen sein, auftreten, nicht ausbleiben, vorkommen, zu finden sein, zu sehen sein
QUALITY: \_\_\_
43. [verb] ili:i25187
EN gloss: assign too high a value to
DE gloss: einen zu hohen Wert zuweisen
EN words: overestimate, overvalue
DE words: zu hoch bewerten, zu viel Gewicht beimessen, zu viel Wichtigkeit beimessen, überbewerten, überschätzen
QUALITY: \_\_\_
44. [noun] ili:i73844
EN gloss: an expressive style of music
DE gloss: ein ausdrucksstarker Musikstil
EN words: genre, music genre, musical genre, musical style
DE words: Genre, Musikgenre, Musikrichtung, Musikstil, Stilrichtung
QUALITY: \_\_\_
45. [noun] ili:i113026
EN gloss: an abnormal condition in which cerebrospinal fluid collects in the ventricles of the brain; in infants it can cause abnormally rapid growth of the head and bulging fontanelles and a small face; in adults the symptoms are primarily neurological
DE gloss: ein anormaler Zustand, bei dem sich Liquor in den Hirnventrikeln sammelt; bei Säuglingen kann er zu einem anormal schnellen Wachstum des Kopfes, zu wulstigen Fontanellen und einem kleinen Gesicht führen; bei Erwachsenen sind die Symptome hauptsächlich neurologisch
EN words: hydrocephalus, hydrocephaly
DE words: Gehirnwassersucht, Hydrocephalus, Hydrozephalus, Wasserkopf
QUALITY: \_\_\_
46. [noun] ili:i62720
EN gloss: habitual uncleanliness
DE gloss: gewohnheitsmäßige Unreinheit
EN words: slovenliness
DE words: Flickarbeit, Flickenteppich, Flickwerk, Gestümper, Mist, Murks, Murkserei, Pfusch, Pfuscharbeit, Pfuscherei, Schlamperei, Schlendrian, Schluderei, Schund, schlechte Arbeit
QUALITY: \_\_\_
47. [noun] ili:i80976
EN gloss: the government agency in the United Kingdom that is responsible for internal security and counterintelligence overseas
DE gloss: Regierungsbehörde im Vereinigten Königreich, die für die innere Sicherheit und die Spionageabwehr im Ausland zuständig ist.
EN words: MI, Military Intelligence Section 6, Secret Intelligence Service
DE words: MI6, SIS, Secret Intelligence Service, Secret Service, britischer Auslandsgeheimdienst
QUALITY: \_\_\_
48. [noun] ili:i60476
EN gloss: an electrical device by which alternating current of one voltage is changed to another voltage
DE gloss: ein elektrisches Gerät, mit dem Wechselstrom einer bestimmten Spannung in eine andere Spannung umgewandelt wird
EN words: transformer
DE words: Spannungswandler, Trafo, Transformator, Transformer
QUALITY: \_\_\_
49. [noun] ili:i37037
EN gloss: wandering from the main path of a journey
DE gloss: das Abweichen vom Hauptweg einer Reise
EN words: digression, excursion
DE words: Abschweifung, Abstecher, Einschub, Exkurs, Umschweif
QUALITY: \_\_\_
50. [noun] ili:i77288
EN gloss: any meat that is minced and spiced and cooked as patties or used to fill sausages
DE gloss: jegliches Fleisch, das zerkleinert und gewürzt und als Pasteten gekocht oder zur Füllung von Würsten verwendet wird
EN words: sausage meat
DE words: Brät, Wurstbrät
QUALITY: \_\_\_

View file

@ -1,185 +1,98 @@
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
import { openDb } from "./index.js";
// ── Types ─────────────────────────────────────────────────────────────────────
/** A single example sentence, tagged with the source label it carries ("omw" or "cefr"). */
type Example = {
  text: string;
  source: "omw" | "cefr";
};

/** A value of type `T` per supported language code; any language may be missing. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/** The CEFR level string recorded for one word. */
type SourceVote = { cefr_source: string };

/**
 * Shape of one record in a stage 2 annotated JSON file.
 *
 * `votes` is keyed first by language code and then by word; the importer
 * iterates it as `[word, vote]` pairs and stores `vote.cefr_source`.
 */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  votes: PerLanguage<Record<string, SourceVote>>;
};
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
// ── Paths ─────────────────────────────────────────────────────────────────────
// Directory containing this module. ES modules expose import.meta.url instead
// of a built-in __dirname, so it is derived from the module URL here.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Input locations, resolved relative to this module's directory.
const PATHS = {
// Directory from which loadAnnotated() reads the per-language `<lang>.json` files.
annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
// JSON file read by importKaikki() and parsed as an ExtractedSense array.
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
};
// ── Loading ───────────────────────────────────────────────────────────────────
/**
 * Reads every per-language annotated file and folds them into one record list.
 *
 * `en.json` supplies the base records (it has the most complete glosses and
 * examples). The remaining language files are then read one after another, in
 * `SUPPORTED_LANGUAGE_CODES` order, and for each record whose `source_id`
 * exists in the base set:
 *   - its votes are copied onto the base record (later files overwrite
 *     earlier votes for the same language + word);
 *   - its examples whose `source` is "cefr" are appended to the base record.
 * Records that have no counterpart in `en.json` are ignored.
 *
 * @returns The merged base records, in `en.json` order.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // Reads and parses `<code>.json` from the annotated output directory.
  const readRecords = async (code: string): Promise<AnnotatedRecord[]> => {
    const filePath = path.join(PATHS.annotatedDir, `${code}.json`);
    const contents = await fs.readFile(filePath, "utf-8");
    return JSON.parse(contents) as AnnotatedRecord[];
  };

  const merged = new Map<string, AnnotatedRecord>();
  for (const entry of await readRecords("en")) {
    merged.set(entry.source_id, entry);
  }

  for (const code of SUPPORTED_LANGUAGE_CODES) {
    if (code === "en") continue;

    for (const incoming of await readRecords(code)) {
      const target = merged.get(incoming.source_id);
      if (!target) continue;

      // Votes: create the per-language bucket on demand, then copy word votes in.
      for (const [voteLang, wordVotes] of Object.entries(incoming.votes)) {
        const key = voteLang as SupportedLanguageCode;
        if (!target.votes[key]) {
          target.votes[key] = {};
        }
        Object.assign(target.votes[key]!, wordVotes);
      }

      // Examples: only CEFR-sourced ones are carried over from other files.
      for (const [exampleLang, candidates] of Object.entries(
        incoming.examples,
      )) {
        const key = exampleLang as SupportedLanguageCode;
        const fromCefr = candidates.filter(
          (candidate) => candidate.source === "cefr",
        );
        if (fromCefr.length === 0) continue;

        const existing = target.examples[key];
        if (existing) {
          existing.push(...fromCefr);
        } else {
          target.examples[key] = fromCefr;
        }
      }
    }
  }

  return Array.from(merged.values());
}
// ── Import ────────────────────────────────────────────────────────────────────
export async function importStage2(): Promise<void> {
console.log("Loading stage 2 annotated files...");
const records = await loadAnnotated();
console.log(` Loaded ${records.length.toLocaleString()} synsets`);
export async function importKaikki(): Promise<void> {
console.log("Loading extracted Kaikki data...");
const raw = await fs.readFile(PATHS.extracted, "utf-8");
const senses = JSON.parse(raw) as ExtractedSense[];
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
const db = openDb();
const insertSynset = db.prepare(
`INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
);
const insertTranslation = db.prepare(
`INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
);
const insertGloss = db.prepare(
`INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
);
const insertExample = db.prepare(
`INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
);
const insertCefrVote = db.prepare(`
INSERT INTO cefr_source_votes (translation_id, cefr_level)
VALUES (
(SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
?
)
const insertEntry = db.prepare(`
INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT (headword, language, pos, sense_index)
DO UPDATE SET
gloss = excluded.gloss,
examples = excluded.examples
RETURNING id
`);
const insertTranslation = db.prepare(`
INSERT INTO translations (entry_id, target_lang, word, sense_hint)
VALUES (?, ?, ?, ?)
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
`);
// Track next available sense_index per (headword, pos) to handle
// the same word appearing in multiple JSONL entries with the same POS.
const senseIndexMap = new Map<string, number>();
console.log("\nImporting into pipeline.db...");
const importAll = db.transaction(() => {
let synsets = 0;
let entries = 0;
let translations = 0;
let glosses = 0;
let examples = 0;
let cefrVotes = 0;
let skipped = 0;
for (const record of records) {
insertSynset.run(record.source_id, record.pos);
synsets++;
for (const sense of senses) {
const key = `${sense.headword}|${sense.pos}`;
const nextIndex = senseIndexMap.get(key) ?? 0;
// Translations
for (const [lang, words] of Object.entries(record.translations)) {
const unique = [...new Set(words)];
for (const word of unique) {
insertTranslation.run(record.source_id, lang, word);
translations++;
}
// Use the offset sense_index to avoid collisions when the same word
// appears in multiple JSONL entries with the same POS.
const senseIndex = nextIndex;
senseIndexMap.set(key, nextIndex + 1);
const row = insertEntry.get(
sense.headword,
"en",
sense.pos,
senseIndex,
sense.gloss ?? null,
JSON.stringify(sense.examples),
) as { id: number } | undefined;
if (!row) {
skipped++;
continue;
}
// Glosses
for (const [lang, glossList] of Object.entries(record.glosses)) {
for (const text of glossList) {
insertGloss.run(record.source_id, lang, text);
glosses++;
}
}
entries++;
// Examples
for (const [lang, exList] of Object.entries(record.examples)) {
for (const example of exList) {
insertExample.run(
record.source_id,
lang,
example.text,
example.source,
);
examples++;
}
}
// CEFR source votes
for (const [lang, langVotes] of Object.entries(record.votes)) {
for (const [word, vote] of Object.entries(
langVotes as Record<string, { cefr_source: string }>,
)) {
insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
cefrVotes++;
}
for (const t of sense.translations) {
insertTranslation.run(
row.id,
t.target_lang,
t.word,
t.sense_hint ?? null,
);
translations++;
}
}
return { synsets, translations, glosses, examples, cefrVotes };
return { entries, translations, skipped };
});
const counts = importAll();
console.log(` synsets: ${counts.synsets.toLocaleString()}`);
console.log(` entries: ${counts.entries.toLocaleString()}`);
console.log(` translations: ${counts.translations.toLocaleString()}`);
console.log(` glosses: ${counts.glosses.toLocaleString()}`);
console.log(` examples: ${counts.examples.toLocaleString()}`);
console.log(` cefr votes: ${counts.cefrVotes.toLocaleString()}`);
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
db.close();
console.log("\nImport complete.");
@ -189,7 +102,7 @@ export async function importStage2(): Promise<void> {
export function isImported(): boolean {
const db = openDb();
const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
count: number;
};
db.close();
@ -200,20 +113,20 @@ export function isImported(): boolean {
/**
 * Script entry point: runs the import only when the database holds no base rows yet.
 *
 * Opens the database, counts the rows of the base table and closes the
 * connection again. If any rows exist it logs a notice and exits with status 0
 * without importing; otherwise it awaits the import function.
 *
 * NOTE(review): this span appears to be a rendered diff hunk — each removed
 * line (`synsets`, `importStage2`) is directly followed by its replacement
 * (`entries`, `importKaikki`). Confirm against the real file before editing.
 */
async function main(): Promise<void> {
const db = openDb();
// Row count of the base table decides whether an import has already happened.
const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
count: number;
};
// The connection is only needed for the count, so it is closed before importing.
db.close();
if (row.count > 0) {
console.log(
`pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`,
`pipeline.db already contains ${row.count.toLocaleString()} entries — skipping import.`,
);
console.log("Delete pipeline.db and re-run db:init to start fresh.");
// Exit code 0: an already-populated database is treated as success, not an error.
process.exit(0);
}
await importStage2();
await importKaikki();
}
if (import.meta.url === `file://${process.argv[1]}`) {

View file

@ -1,62 +1,58 @@
-- ── Base data ─────────────────────────────────────────────────────────────────
-- Imported from stage 2 JSON on first run. Never mutated after import.
-- Imported from Kaikki on first run. Never mutated after import.
CREATE TABLE IF NOT EXISTS synsets (
source_id TEXT PRIMARY KEY,
pos TEXT NOT NULL
CREATE TABLE IF NOT EXISTS entries (
id INTEGER PRIMARY KEY,
headword TEXT NOT NULL,
language TEXT NOT NULL,
pos TEXT NOT NULL,
sense_index INTEGER NOT NULL DEFAULT 0,
gloss TEXT,
examples TEXT NOT NULL DEFAULT '[]', -- JSON array of strings
source TEXT NOT NULL DEFAULT 'kaikki',
UNIQUE (headword, language, pos, sense_index)
);
CREATE TABLE IF NOT EXISTS translations (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
word TEXT NOT NULL,
UNIQUE (source_id, language, word)
);
CREATE TABLE IF NOT EXISTS glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS cefr_source_votes (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
cefr_level TEXT NOT NULL,
UNIQUE (translation_id)
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
target_lang TEXT NOT NULL,
word TEXT NOT NULL,
sense_hint TEXT,
source TEXT NOT NULL DEFAULT 'kaikki',
UNIQUE (entry_id, target_lang, word)
);
-- ── Status tracking ───────────────────────────────────────────────────────────
-- One row per synset per model per stage. Drives resumability.
-- One row per entry per model per stage. Drives resumability.
-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
-- stage: round1 | round2 | tiebreak
-- status: pending | complete | needs_review | flagged
CREATE TABLE IF NOT EXISTS run_status (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL,
entry_id INTEGER NOT NULL,
model_name TEXT NOT NULL,
stage TEXT NOT NULL,
status TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT (datetime('now')),
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
UNIQUE (source_id, model_name, stage)
UNIQUE (entry_id, model_name, stage)
);
-- ── Round 1 output ────────────────────────────────────────────────────────────
-- One row per translation/language per model. Written atomically per record.
-- Written atomically per entry per model.
-- Unique constraints enforce one model one vote.
CREATE TABLE IF NOT EXISTS model_cefr_votes (
CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
cefr_level TEXT NOT NULL,
UNIQUE (entry_id, model_name)
);
CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
model_name TEXT NOT NULL,
@ -64,38 +60,29 @@ CREATE TABLE IF NOT EXISTS model_cefr_votes (
UNIQUE (translation_id, model_name)
);
CREATE TABLE IF NOT EXISTS model_translation_rejections (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
model_name TEXT NOT NULL,
UNIQUE (translation_id, model_name)
);
CREATE TABLE IF NOT EXISTS generated_glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
UNIQUE (entry_id, model_name)
);
CREATE TABLE IF NOT EXISTS generated_examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
UNIQUE (entry_id, model_name)
);
CREATE TABLE IF NOT EXISTS generated_descriptions (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
model_name TEXT NOT NULL,
language TEXT NOT NULL,
text TEXT NOT NULL,
UNIQUE (source_id, model_name, language)
CREATE TABLE IF NOT EXISTS generated_translations (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
model_name TEXT NOT NULL,
target_lang TEXT NOT NULL,
word TEXT NOT NULL,
UNIQUE (entry_id, model_name, target_lang)
);
-- ── Round 2 output ────────────────────────────────────────────────────────────
@ -116,20 +103,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
UNIQUE (example_id, model_name)
);
CREATE TABLE IF NOT EXISTS description_candidate_votes (
CREATE TABLE IF NOT EXISTS translation_candidate_votes (
id INTEGER PRIMARY KEY,
description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
model_name TEXT NOT NULL,
UNIQUE (description_id, model_name)
UNIQUE (translation_id, model_name)
);
-- ── Resolved output ───────────────────────────────────────────────────────────
-- Written by merge. Never updated after writing.
-- Only fully resolved records are written here — no nulls, no flags.
-- Only fully resolved records are written here — no nulls.
-- Absence of a row means unresolved. Flagged status tracked in run_status.
-- source: omw | cefr | model_name
-- source: kaikki | model_name
CREATE TABLE IF NOT EXISTS resolved_translations (
CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
cefr_level TEXT NOT NULL,
difficulty TEXT NOT NULL,
UNIQUE (entry_id)
);
CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
id INTEGER PRIMARY KEY,
translation_id INTEGER NOT NULL REFERENCES translations(id),
cefr_level TEXT NOT NULL,
@ -138,27 +133,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
);
CREATE TABLE IF NOT EXISTS resolved_glosses (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (source_id, language)
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (entry_id)
);
CREATE TABLE IF NOT EXISTS resolved_examples (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
text TEXT NOT NULL,
source TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS resolved_descriptions (
id INTEGER PRIMARY KEY,
source_id TEXT NOT NULL REFERENCES synsets(source_id),
language TEXT NOT NULL,
text TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (source_id, language)
CREATE TABLE IF NOT EXISTS resolved_generated_translations (
id INTEGER PRIMARY KEY,
entry_id INTEGER NOT NULL REFERENCES entries(id),
target_lang TEXT NOT NULL,
word TEXT NOT NULL,
source TEXT NOT NULL,
UNIQUE (entry_id, target_lang)
);

View file

@ -1,204 +0,0 @@
"""
data-pipeline/stage-1-extract/scripts/extract.py
Extract all synsets from the Open Multilingual Wordnet (OMW) for all
supported languages and parts of speech.
Output: a single combined JSON file, written to stage-1-extract/output/
omw.json
The file is a JSON array of synset records:
{
"source_id": "ili:i12345",
"pos": "noun",
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
"glosses": { "en": ["a domesticated animal..."] },
"examples": { "en": ["the dog barked at the stranger"] }
}
Usage:
python stage-1-extract/scripts/extract.py
python stage-1-extract/scripts/extract.py --sample
Prerequisites:
pip install wn
python -m wn download omw-en:1.4
python -m wn download omw-it:1.4
python -m wn download omw-de:1.4
python -m wn download omw-es:1.4
python -m wn download omw-fr:1.4
"""
import json
import sys
from pathlib import Path
import wn
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]
POS_MAP: dict[str, str] = {
"n": "noun",
"v": "verb",
"a": "adjective",
"s": "adjective", # adjective satellite — collapsed into adjective
"r": "adverb",
}
def extract_all(
    output_dir: str = "stage-1-extract/output", sample: bool = False
) -> None:
    """Dump OMW synsets for every supported language into one JSON file.

    Synsets are grouped by their ILI across all loaded wordnets, turned into
    records and written to ``<output_dir>/omw.json``. A coverage summary is
    printed afterwards.

    Args:
        output_dir: Directory that receives ``omw.json``; created if missing.
        sample: When true, only the first 100 ILIs (in sorted order) are
            written.

    Exits the process with status 1 if a wordnet cannot be loaded.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    limit = 100 if sample else None

    # One Wordnet object per language, loaded before any extraction starts.
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for code in SUPPORTED_LANGUAGE_CODES:
        try:
            loaded = wn.Wordnet(lang=code)
            wordnets[code] = loaded
            total_synsets = len(loaded.synsets())
            print(f" {code}: {total_synsets:,} total synsets")
        except wn.Error as err:
            print(f" ERROR loading {code}: {err}")
            print(f" Run: python -m wn download omw-{code}:1.4")
            sys.exit(1)

    # Gather lemmas, glosses and examples per ILI, across languages and POS.
    print("\nExtracting synsets...")
    by_ili: dict[str, dict] = {}
    for code, wordnet in wordnets.items():
        for omw_pos, pos_label in POS_MAP.items():
            candidates = wordnet.synsets(pos=omw_pos)
            with_ili = 0
            for synset in candidates:
                ili = synset.ili
                if not ili:
                    continue
                with_ili += 1
                fresh = {
                    "lemmas": list(
                        dict.fromkeys(str(lemma) for lemma in synset.lemmas())
                    ),
                    "glosses": [d for d in synset.definitions() if d],
                    "examples": [e for e in synset.examples() if e],
                }
                # The POS label of an ILI is fixed by whichever synset is seen first.
                per_ili = by_ili.setdefault(ili, {"pos": pos_label})
                current = per_ili.get(code)
                if current is None:
                    per_ili[code] = fresh
                    continue
                # Same ILI seen again for this language (happens when both
                # 'a' and 's' map to adjective): merge, keeping first-seen
                # order and dropping duplicates.
                for field, values in fresh.items():
                    current[field] = list(dict.fromkeys(current[field] + values))
            print(f" {code} {pos_label}: {with_ili:,} synsets with ILI")

    # Turn the grouped data into output records.
    print("\nBuilding records...")
    ordered_ilis = sorted(by_ili)
    if limit:
        ordered_ilis = ordered_ilis[:limit]

    records: list[dict] = []
    for ili in ordered_ilis:
        grouped = by_ili[ili]
        per_lang = {key: value for key, value in grouped.items() if key != "pos"}
        # A language only appears under a field when it has data for it.
        records.append(
            {
                "source_id": f"ili:{ili}",
                "pos": grouped["pos"],
                "translations": {
                    lang: data["lemmas"]
                    for lang, data in per_lang.items()
                    if data["lemmas"]
                },
                "glosses": {
                    lang: data["glosses"]
                    for lang, data in per_lang.items()
                    if data["glosses"]
                },
                "examples": {
                    lang: data["examples"]
                    for lang, data in per_lang.items()
                    if data["examples"]
                },
            }
        )

    output_file = target_dir / "omw.json"
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(records, handle, indent=2, ensure_ascii=False)
    print(f"\nWrote {len(records):,} synsets → {output_file}")
    _print_coverage(records)
def _print_coverage(records: list[dict]) -> None:
    """Print per-language translation, gloss, and example counts.

    Also prints the number of records per part of speech. Languages that
    appear in a record but are not in SUPPORTED_LANGUAGE_CODES are ignored.

    Args:
        records: Synset records, each with "pos", "translations", "glosses"
            and "examples" keys (the last three map language -> list).
    """
    fields = ("translations", "glosses", "examples")
    lang_stats: dict[str, dict[str, int]] = {
        lang: {field: 0 for field in fields} for lang in SUPPORTED_LANGUAGE_CODES
    }
    pos_stats: dict[str, int] = {}
    for r in records:
        pos = r["pos"]
        pos_stats[pos] = pos_stats.get(pos, 0) + 1
        for field in fields:
            for lang, items in r[field].items():
                if lang in lang_stats:
                    lang_stats[lang][field] += len(items)

    print("\nPOS breakdown:")
    for pos, count in sorted(pos_stats.items()):
        print(f" {pos}: {count:,}")

    print("\nCoverage per language:")
    total = len(records)
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        e = counts["examples"]
        # Guard the average: an empty record list must not raise ZeroDivisionError.
        avg = t / total if total else 0.0
        print(
            f" {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {avg:.1f} translations/synset)"
        )
if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the options and run the extraction.
    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
    parser.add_argument(
        "--output-dir",
        default="stage-1-extract/output",
        help="Output directory for the combined omw.json file",
    )
    parser.add_argument(
        "--sample",
        action="store_true",
        # The sample is the first 100 ILIs overall, not 100 per language.
        help="Extract only the first 100 synsets (sorted by ILI) for inspection",
    )
    args = parser.parse_args()
    extract_all(output_dir=args.output_dir, sample=args.sample)

View file

@ -0,0 +1,209 @@
import fs from "node:fs";
import path from "node:path";
import readline from "node:readline";
import { fileURLToPath } from "node:url";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/** One translation item of a Kaikki sense as parsed from JSONL; every field may be absent. */
type KaikkiTranslation = {
// Language code of the translation. extractTranslations reads `code` first
// and falls back to `lang_code`.
code?: string;
lang_code?: string;
// Translated word; items with a missing or blank word are dropped.
word?: string;
// Free-text sense label; copied (trimmed) into `sense_hint`.
sense?: string;
};
/** One sense of a Kaikki entry. Only the first gloss is used by processEntry. */
type KaikkiSense = {
glosses?: string[];
examples?: { text?: string }[];
translations?: KaikkiTranslation[];
};
/** One parsed line of the Kaikki JSONL source. */
type KaikkiEntry = { word?: string; pos?: string; senses?: KaikkiSense[] };
/** Output record: one kept sense of a headword, as serialised into the output JSON array. */
export type ExtractedSense = {
headword: string;
pos: SupportedPos;
// Zero-based position among the senses kept for the entry; skipped senses
// do not consume an index.
sense_index: number;
// First gloss of the sense (trimmed), or null when the sense has none.
gloss: string | null;
examples: string[];
translations: {
target_lang: SupportedLanguageCode;
word: string;
sense_hint: string | null;
}[];
};
// ── Constants ─────────────────────────────────────────────────────────────────
// Directory of this module, derived from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
/** Input and output locations, resolved relative to this script's directory. */
const PATHS = {
source: path.resolve(
__dirname,
"../sources/kaikki.org-dictionary-English.jsonl",
),
output: path.resolve(__dirname, "../output/en.json"),
};
/**
 * Maps Kaikki `pos` values to SupportedPos. Entries whose pos is not listed
 * here produce no output (see mapPos / processEntry).
 */
const POS_MAP: Record<string, SupportedPos> = {
noun: "noun",
verb: "verb",
adj: "adjective",
adv: "adverb",
};
/** Set form of SUPPORTED_LANGUAGE_CODES, used for membership checks on translation language codes. */
const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Translates a Kaikki part-of-speech tag into the project's SupportedPos.
 *
 * @param kaikkiPos - The `pos` value of a Kaikki entry.
 * @returns The mapped value from POS_MAP, or null when there is none.
 */
function mapPos(kaikkiPos: string): SupportedPos | null {
  const mapped = POS_MAP[kaikkiPos];
  return mapped == null ? null : mapped;
}
/**
 * Tells whether a gloss marks its sense as an abbreviation.
 *
 * @param gloss - Gloss text; it is compared as given, without trimming.
 * @returns True when the lower-cased gloss begins with "abbreviation of".
 */
function isAbbreviation(gloss: string): boolean {
  const marker = "abbreviation of";
  const lowered = gloss.toLowerCase();
  return lowered.slice(0, marker.length) === marker;
}
/**
 * Collects the translations of one sense that target a supported,
 * non-English language.
 *
 * Items are skipped when they have no language code, when the code is not in
 * SUPPORTED_LANG_SET or is "en", or when the word is missing or blank.
 * Duplicates (same language code and trimmed word) keep only their first
 * occurrence.
 *
 * @param sense - The Kaikki sense to read translations from.
 * @returns The kept translations in source order; `sense_hint` is the trimmed
 *   sense label, or null when the label is absent or blank.
 */
function extractTranslations(
  sense: KaikkiSense,
): ExtractedSense["translations"] {
  const seen = new Set<string>();
  const result: ExtractedSense["translations"] = [];
  for (const t of sense.translations ?? []) {
    // The language code may be carried by either field.
    const code = t.code ?? t.lang_code;
    if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue;
    const word = t.word?.trim();
    if (!word) continue;
    const key = `${code}:${word}`;
    if (seen.has(key)) continue;
    seen.add(key);
    result.push({
      target_lang: code as SupportedLanguageCode,
      word,
      // `||` instead of `??`: a blank label trims to "", which is not
      // nullish and would otherwise be stored as an empty hint.
      sense_hint: t.sense?.trim() || null,
    });
  }
  return result;
}
/**
 * Returns the usable example sentences of a sense.
 *
 * @param sense - The Kaikki sense to read examples from.
 * @returns The trimmed example texts in source order; examples whose text is
 *   missing or empty after trimming are left out.
 */
function extractExamples(sense: KaikkiSense): string[] {
  const texts: string[] = [];
  for (const example of sense.examples ?? []) {
    const text = example.text?.trim();
    if (text) texts.push(text);
  }
  return texts;
}
/**
 * Converts one Kaikki entry into zero or more extracted senses.
 *
 * Returns an empty array when the entry's pos is not in POS_MAP or its word
 * is missing or blank. Within an entry, a sense is skipped when its first
 * gloss starts with "abbreviation of" or when it has no translation in a
 * supported language.
 *
 * @param entry - A parsed Kaikki JSONL record.
 * @returns The kept senses; `sense_index` numbers them 0, 1, 2… in order, so
 *   skipped senses do not leave gaps.
 */
function processEntry(entry: KaikkiEntry): ExtractedSense[] {
  const pos = mapPos(entry.pos ?? "");
  if (!pos) return [];
  const headword = entry.word?.trim();
  if (!headword) return [];

  const results: ExtractedSense[] = [];
  let senseIndex = 0;
  for (const sense of entry.senses ?? []) {
    // `||` instead of `??`: a blank gloss trims to "", which must become
    // null to honour the `string | null` contract of ExtractedSense.gloss.
    const gloss = sense.glosses?.[0]?.trim() || null;
    // Skip abbreviation senses
    if (gloss && isAbbreviation(gloss)) continue;
    const translations = extractTranslations(sense);
    // Skip senses with no translations in our supported languages
    if (translations.length === 0) continue;
    results.push({
      headword,
      pos,
      sense_index: senseIndex++,
      gloss,
      examples: extractExamples(sense),
      translations,
    });
  }
  return results;
}
// ── Main ──────────────────────────────────────────────────────────────────────
/**
 * Streams the Kaikki JSONL source line by line, extracts the supported
 * senses and writes them as one JSON array to PATHS.output.
 *
 * Blank lines are ignored. Lines that fail to parse are reported with a
 * warning and skipped. Entries that yield no sense are counted as skipped.
 *
 * @param sampleLimit - When set to a positive number, reading stops once this
 *   many entries have produced at least one sense. Omitted (or 0) means no
 *   limit.
 */
async function extract(sampleLimit?: number): Promise<void> {
  console.log("Extracting Kaikki English data...");
  console.log(` Source: ${PATHS.source}`);
  if (sampleLimit) {
    console.log(` Sample mode: ${sampleLimit} entries`);
  }
  await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true });

  const fileStream = fs.createReadStream(PATHS.source);
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity,
  });

  const senses: ExtractedSense[] = [];
  let linesRead = 0;
  let entriesProcessed = 0;
  let entriesSkipped = 0;

  try {
    for await (const line of rl) {
      if (!line.trim()) continue;
      if (sampleLimit && entriesProcessed >= sampleLimit) break;
      // Counts non-blank lines only, so the number in the warning below is
      // not necessarily the physical line number in the file.
      linesRead++;
      let entry: KaikkiEntry;
      try {
        entry = JSON.parse(line) as KaikkiEntry;
      } catch {
        console.warn(` Warning: failed to parse line ${linesRead}, skipping`);
        continue;
      }
      const extracted = processEntry(entry);
      if (extracted.length === 0) {
        entriesSkipped++;
        continue;
      }
      senses.push(...extracted);
      entriesProcessed++;
      if (entriesProcessed % 10_000 === 0) {
        console.log(
          ` Processed ${entriesProcessed.toLocaleString()} entries...`,
        );
      }
    }
  } finally {
    // Closing the readline interface does not destroy its input, so release
    // the file stream explicitly — needed on the sample-limit break and when
    // an error aborts the loop.
    rl.close();
    fileStream.destroy();
  }

  await fs.promises.writeFile(
    PATHS.output,
    JSON.stringify(senses, null, 2),
    "utf-8",
  );
  console.log(`\nExtraction complete:`);
  console.log(` Lines read: ${linesRead.toLocaleString()}`);
  console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`);
  console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`);
  console.log(` Senses extracted: ${senses.length.toLocaleString()}`);
  console.log(` Output: ${PATHS.output}`);
}
/** Script entry point: runs the extraction in sample mode. */
async function main(): Promise<void> {
  // Hardcoded sample limit for initial testing — remove for full extraction
  const sampleLimit = 500;
  await extract(sampleLimit);
}

// Any failure is logged and turned into a non-zero exit status.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});

View file

@ -1,227 +0,0 @@
import fs from "node:fs/promises";
import path from "node:path";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
// ── Types ────────────────────────────────────────────────────────────────────
// Example sentences are tagged with where they came from: the OMW extract or
// a CEFR source file.
type OmwExample = { text: string; source: "omw" };
type CefrExample = { text: string; source: "cefr" };
type Example = OmwExample | CefrExample;
/** A record as read from the stage 1 extract (PATHS.omw). */
type OmwRecord = {
source_id: string;
pos: SupportedPos;
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
examples: Partial<Record<SupportedLanguageCode, string[]>>;
};
/** A record as written by this stage: the OMW record plus typed examples and CEFR votes. */
type AnnotatedRecord = {
source_id: string;
pos: SupportedPos;
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
// language -> word (as spelled in `translations`) -> CEFR level from the source list
votes: Partial<
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
>;
};
/** One row of a per-language CEFR source file. */
type CefrSourceEntry = {
word: string;
pos: string;
cefr_level: string;
example_sentence_native?: string;
};
/** A word/pos pair that a CEFR source file lists with more than one level. */
type ConflictEntry = {
word: string;
pos: string;
language: SupportedLanguageCode;
levels: string[];
};
// ── Constants ─────────────────────────────────────────────────────────────────
/**
 * Maps the part-of-speech spellings found in CEFR source files to
 * SupportedPos. Keys are looked up after lower-casing and trimming; entries
 * with any other pos are skipped by loadCefrSource.
 */
const POS_NORMALIZE: Record<string, SupportedPos> = {
noun: "noun",
n: "noun",
nom: "noun", // French
verb: "verb",
verbs: "verb",
v: "verb",
v1: "verb",
adjective: "adjective",
adjektiv: "adjective", // German
adj: "adjective",
adverb: "adverb",
adverbs: "adverb",
adv: "adverb",
};
/** Accepted CEFR levels; source entries with any other level are skipped. */
const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
// Relative paths — presumably resolved against the directory the script is
// run from; verify against the package scripts.
const PATHS = {
omw: "stage-1-extract/output/omw.json",
cefrDir: "stage-2-annotate/sources/cefr",
outputDir: "stage-2-annotate/output",
};
// ── CEFR source loading ───────────────────────────────────────────────────────
/** Lookup from "word|pos" (word lower-cased and trimmed, pos normalised) to CEFR level and optional example. */
type CefrIndex = Map<string, { level: string; example?: string }>;

/**
 * Loads the CEFR source file of one language and builds a lookup index.
 *
 * Entries whose pos is not in POS_NORMALIZE or whose level is not in
 * CEFR_LEVELS are ignored. A word/pos pair listed with more than one level is
 * reported as a conflict and left out of the index. When a pair is listed
 * several times with the same level, the last entry wins.
 *
 * @param lang - Language whose `<lang>.json` file is read from PATHS.cefrDir.
 * @returns The index plus the conflicts found, in first-seen order.
 */
async function loadCefrSource(
  lang: SupportedLanguageCode,
): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");
  const entries = JSON.parse(raw) as CefrSourceEntry[];

  // Normalise each usable entry once; both passes below share the result.
  const usable: {
    key: string;
    word: string;
    pos: SupportedPos;
    entry: CefrSourceEntry;
  }[] = [];
  for (const entry of entries) {
    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
    if (!pos) continue;
    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
    const word = entry.word.toLowerCase().trim();
    usable.push({ key: `${word}|${pos}`, word, pos, entry });
  }

  // First pass — collect the set of levels seen per "word|pos" key. Word and
  // pos are stored next to the levels so the key never has to be split again
  // (splitting on "|" breaks for words that contain "|").
  const seen = new Map<
    string,
    { word: string; pos: SupportedPos; levels: Set<string> }
  >();
  for (const { key, word, pos, entry } of usable) {
    let group = seen.get(key);
    if (!group) {
      group = { word, pos, levels: new Set() };
      seen.set(key, group);
    }
    group.levels.add(entry.cefr_level);
  }

  const conflicts: ConflictEntry[] = [];
  const conflictKeys = new Set<string>();
  for (const [key, group] of seen) {
    if (group.levels.size > 1) {
      conflictKeys.add(key);
      conflicts.push({
        word: group.word,
        pos: group.pos,
        language: lang,
        levels: [...group.levels],
      });
    }
  }

  // Second pass — build index, skip conflicting entries.
  const index: CefrIndex = new Map();
  for (const { key, entry } of usable) {
    if (conflictKeys.has(key)) continue;
    index.set(key, {
      level: entry.cefr_level,
      ...(entry.example_sentence_native
        ? { example: entry.example_sentence_native }
        : {}),
    });
  }
  return { index, conflicts };
}
// ── Annotation ────────────────────────────────────────────────────────────────
/**
 * Runs the annotation stage.
 *
 * Reads the OMW extract, loads the CEFR source of every supported language,
 * writes all conflicts to `conflicts.json`, then writes one annotated file
 * per language. In the file for language L, only the translations of L are
 * matched against L's CEFR index; every record is written whether or not it
 * matched.
 */
async function annotate(): Promise<void> {
  // Stage 1 output
  console.log("Reading OMW extract...");
  const omwRecords = JSON.parse(
    await fs.readFile(PATHS.omw, "utf-8"),
  ) as OmwRecord[];
  console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`);

  // CEFR sources, one per language, loaded in order
  console.log("\nLoading CEFR source files...");
  const indexByLang = new Map<SupportedLanguageCode, CefrIndex>();
  const conflictLog: ConflictEntry[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const loaded = await loadCefrSource(lang);
    indexByLang.set(lang, loaded.index);
    conflictLog.push(...loaded.conflicts);
    console.log(
      ` ${lang}: ${loaded.index.size.toLocaleString()} entries, ${loaded.conflicts.length} conflicts`,
    );
  }

  // Conflicts report
  await fs.mkdir(PATHS.outputDir, { recursive: true });
  const conflictsFile = path.join(PATHS.outputDir, "conflicts.json");
  await fs.writeFile(
    conflictsFile,
    JSON.stringify(conflictLog, null, 2),
    "utf-8",
  );
  console.log(
    `\nWrote ${conflictLog.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  // One annotated file per language
  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const index = indexByLang.get(lang)!;
    let matched = 0;

    const records = omwRecords.map((record): AnnotatedRecord => {
      // OMW examples, tagged with their origin
      const examples: AnnotatedRecord["examples"] = {};
      for (const [exampleLang, texts] of Object.entries(record.examples)) {
        examples[exampleLang as SupportedLanguageCode] = texts.map((text) => ({
          text,
          source: "omw" as const,
        }));
      }

      // CEFR votes (and native examples) for this language's translations
      const votes: AnnotatedRecord["votes"] = {};
      for (const word of record.translations[lang] ?? []) {
        const hit = index.get(`${word.toLowerCase().trim()}|${record.pos}`);
        if (!hit) continue;
        matched++;

        const wordVotes = votes[lang] ?? {};
        wordVotes[word] = { cefr_source: hit.level };
        votes[lang] = wordVotes;

        if (hit.example) {
          const bucket = examples[lang] ?? [];
          bucket.push({ text: hit.example, source: "cefr" as const });
          examples[lang] = bucket;
        }
      }

      return {
        source_id: record.source_id,
        pos: record.pos,
        translations: record.translations,
        glosses: record.glosses,
        examples,
        votes,
      };
    });

    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
    console.log(
      ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
    );
  }
}
// ── Main ─────────────────────────────────────────────────────────────────────
// Run the stage; any failure is logged and turned into a non-zero exit status.
annotate().catch((error) => {
  console.error(error);
  process.exit(1);
});

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,170 +0,0 @@
[
{
"_fixture": "noun_with_cefr_vote",
"source_id": "ili:i100955",
"pos": "noun",
"translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
"glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
"examples": {
"en": [
{ "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
]
},
"votes": { "en": { "grain": { "cefr_source": "B1" } } }
},
{
"_fixture": "verb_no_votes_no_translations",
"source_id": "ili:i21779",
"pos": "verb",
"translations": { "en": ["respire"] },
"glosses": {
"en": [
"undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
]
},
"examples": {},
"votes": {}
},
{
"_fixture": "verb_with_cefr_vote_all_languages",
"source_id": "ili:i21778",
"pos": "verb",
"translations": {
"en": ["breathe", "take a breath", "respire", "suspire"],
"it": ["respirare"],
"es": ["aspirar", "respirar"],
"de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
"fr": ["inspirer", "respirer"]
},
"glosses": {
"en": ["draw air into, and expel out of, the lungs"],
"de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
},
"examples": {
"en": [
{
"text": "I can breathe better when the air is clean",
"source": "omw"
},
{ "text": "The patient is respiring", "source": "omw" }
]
},
"votes": { "en": { "breathe": { "cefr_source": "A1" } } }
},
{
"_fixture": "adjective_all_languages_multiple_translations",
"source_id": "ili:i10007",
"pos": "adjective",
"translations": {
"en": ["possible"],
"it": [
"attuabile",
"effettuabile",
"eseguibile",
"fattibile",
"operabile",
"possibile",
"producibile",
"realizzabile"
],
"es": ["posible"],
"de": [
"möglich",
"denkbar",
"eventuell",
"möglicherweise",
"allfällig",
"etwaig",
"gegebenenfalls",
"eventuell"
],
"fr": ["possible", "éventuel"]
},
"glosses": {
"en": ["capable of happening or existing"],
"de": ["in der Lage, zu geschehen oder zu existieren"]
},
"examples": {
"en": [
{ "text": "a breakthrough may be possible next year", "source": "omw" },
{ "text": "anything is possible", "source": "omw" },
{ "text": "warned of possible consequences", "source": "omw" }
]
},
"votes": { "en": { "possible": { "cefr_source": "A2" } } }
},
{
"_fixture": "adjective_multiple_de_votes_cefr_examples",
"source_id": "ili:i10000",
"pos": "adjective",
"translations": {
"en": ["negative"],
"de": [
"dürftig",
"zu wünschen übrig lassen",
"schlecht",
"widrig",
"ungut",
"lausig",
"negativ",
"von Nachteil",
"schädlich",
"nachteilig",
"ungünstig"
],
"fr": ["négatif", "strictement négatif"]
},
"glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
"examples": {
"en": [{ "text": "a negative number", "source": "omw" }],
"de": [
{ "text": "Die Beweise waren dürftig.", "source": "cefr" },
{ "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
{
"text": "Trotz widriger Umstände haben sie es geschafft.",
"source": "cefr"
},
{
"text": "Er hatte ein ungutes Gefühl bei der Sache.",
"source": "cefr"
},
{ "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
{
"text": "Rauchen ist schädlich für die Gesundheit.",
"source": "cefr"
},
{
"text": "Diese Entscheidung könnte nachteilig sein.",
"source": "cefr"
},
{
"text": "Das Wetter ist heute ungünstig für einen Ausflug.",
"source": "cefr"
}
]
},
"votes": {
"de": {
"dürftig": { "cefr_source": "C1" },
"schlecht": { "cefr_source": "A1" },
"widrig": { "cefr_source": "C1" },
"ungut": { "cefr_source": "B2" },
"negativ": { "cefr_source": "A2" },
"schädlich": { "cefr_source": "B1" },
"nachteilig": { "cefr_source": "B1" },
"ungünstig": { "cefr_source": "B2" }
}
}
},
{
"_fixture": "adverb_no_votes",
"source_id": "ili:i18157",
"pos": "adverb",
"translations": { "en": ["a cappella"], "es": ["a capella"] },
"glosses": { "en": ["without musical accompaniment"] },
"examples": {
"en": [{ "text": "they performed a cappella", "source": "omw" }]
},
"votes": {}
}
]

View file

@ -1,4 +0,0 @@
[
{ "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
{ "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
]

View file

@ -1,237 +0,0 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect, beforeAll } from "vitest";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/** Example sentence tagged with its origin. */
type Example = { text: string; source: "omw" | "cefr" };
/** Shape of the records in the stage 2 per-language files, as read by this test. */
type AnnotatedRecord = {
source_id: string;
pos: SupportedPos;
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
// language -> word -> CEFR level taken from the source list
votes: Partial<
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
>;
};
// ── Paths ─────────────────────────────────────────────────────────────────────
// path.resolve with a relative argument resolves against the current working
// directory, so these depend on where the test runner is started.
const DB_PATH = path.resolve("db/pipeline.db");
const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
const ANNOTATED_DIR = path.resolve("stage-2-annotate/output");
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Checks whether the pipeline database file can be accessed.
 *
 * @returns True when `fs.access` succeeds for DB_PATH, false when it rejects.
 */
async function dbExists(): Promise<boolean> {
  return fs.access(DB_PATH).then(
    () => true,
    () => false,
  );
}
// ── Tests ─────────────────────────────────────────────────────────────────────
// Validates the imported pipeline.db against the stage 1 and stage 2 output
// files. When the database file is missing, beforeAll returns early, `db`
// stays unset and every test except the existence check returns without
// asserting anything.
describe("pipeline.db — import validation", () => {
let db: import("better-sqlite3").Database;
let expectedSynsetCount: number;
let expectedCefrVoteCount: number;
beforeAll(async () => {
if (!(await dbExists())) return;
// Loaded lazily so the suite can still start when the database is absent.
const Database = (await import("better-sqlite3")).default;
db = new Database(DB_PATH, { readonly: true });
db.pragma("foreign_keys = ON");
// Count expected synsets from omw.json
const omwRaw = await fs.readFile(OMW_PATH, "utf-8");
const omwRecords = JSON.parse(omwRaw) as unknown[];
expectedSynsetCount = omwRecords.length;
// Count expected CEFR votes from stage 2 annotated files.
// Merge all language files the same way the import script does —
// use en.json as base and merge votes from the other language files.
const byId = new Map<string, AnnotatedRecord>();
const baseRaw = await fs.readFile(
path.join(ANNOTATED_DIR, "en.json"),
"utf-8",
);
const base = JSON.parse(baseRaw) as AnnotatedRecord[];
for (const record of base) {
byId.set(record.source_id, record);
}
for (const lang of SUPPORTED_LANGUAGE_CODES) {
if (lang === "en") continue;
const raw = await fs.readFile(
path.join(ANNOTATED_DIR, `${lang}.json`),
"utf-8",
);
const records = JSON.parse(raw) as AnnotatedRecord[];
for (const record of records) {
// NOTE: this `base` shadows the outer `base` array; here it is the
// en.json record with the same source_id.
const base = byId.get(record.source_id);
if (!base) continue;
for (const [l, langVotes] of Object.entries(record.votes)) {
if (!base.votes[l as SupportedLanguageCode]) {
base.votes[l as SupportedLanguageCode] = {};
}
Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
}
}
}
// Expected vote count = number of voted words summed over all records and
// languages after the merge.
expectedCefrVoteCount = 0;
for (const record of byId.values()) {
for (const langVotes of Object.values(record.votes)) {
expectedCefrVoteCount += Object.keys(langVotes ?? {}).length;
}
}
}, 120_000);
// The only test that fails when the database is missing.
it("pipeline.db exists — skipping all tests if not", async () => {
const exists = await dbExists();
if (!exists) {
console.warn(
"\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
);
}
expect(exists).toBe(true);
});
it("synsets count matches omw.json", () => {
if (!db) return;
const row = db.prepare("SELECT COUNT(*) as count FROM synsets").get() as {
count: number;
};
expect(row.count).toBe(expectedSynsetCount);
});
// LEFT JOIN ... WHERE t.id IS NULL selects synsets without any translation row.
it("every synset has at least one translation", () => {
if (!db) return;
const rows = db
.prepare(
`
SELECT s.source_id
FROM synsets s
LEFT JOIN translations t ON t.source_id = s.source_id
WHERE t.id IS NULL
`,
)
.all() as { source_id: string }[];
const errors = rows.map((r) => `${r.source_id}: no translations`);
expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// The tests below use the same anti-join pattern to find rows whose parent
// row is missing.
it("every translation belongs to a valid synset", () => {
if (!db) return;
const rows = db
.prepare(
`
SELECT t.id, t.source_id
FROM translations t
LEFT JOIN synsets s ON s.source_id = t.source_id
WHERE s.source_id IS NULL
`,
)
.all() as { id: number; source_id: string }[];
const errors = rows.map(
(r) => `translation ${r.id}: references missing synset ${r.source_id}`,
);
expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
it("every cefr_source_vote references a valid translation", () => {
if (!db) return;
const rows = db
.prepare(
`
SELECT v.id, v.translation_id
FROM cefr_source_votes v
LEFT JOIN translations t ON t.id = v.translation_id
WHERE t.id IS NULL
`,
)
.all() as { id: number; translation_id: number }[];
const errors = rows.map(
(r) =>
`cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
);
expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
it("cefr_source_votes count matches stage 2 annotated output", () => {
if (!db) return;
const row = db
.prepare("SELECT COUNT(*) as count FROM cefr_source_votes")
.get() as { count: number };
expect(row.count).toBe(expectedCefrVoteCount);
});
// Examples may only originate from the OMW extract or a CEFR source.
it("every example has a valid source", () => {
if (!db) return;
const rows = db
.prepare(
`
SELECT source_id, language, source
FROM examples
WHERE source NOT IN ('omw', 'cefr')
`,
)
.all() as { source_id: string; language: string; source: string }[];
const errors = rows.map(
(r) =>
`${r.source_id} (${r.language}): invalid example source "${r.source}"`,
);
expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
it("every example belongs to a valid synset", () => {
if (!db) return;
const rows = db
.prepare(
`
SELECT e.id, e.source_id
FROM examples e
LEFT JOIN synsets s ON s.source_id = e.source_id
WHERE s.source_id IS NULL
`,
)
.all() as { id: number; source_id: string }[];
const errors = rows.map(
(r) => `example ${r.id}: references missing synset ${r.source_id}`,
);
expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
it("every gloss belongs to a valid synset", () => {
if (!db) return;
const rows = db
.prepare(
`
SELECT g.id, g.source_id
FROM glosses g
LEFT JOIN synsets s ON s.source_id = g.source_id
WHERE s.source_id IS NULL
`,
)
.all() as { id: number; source_id: string }[];
const errors = rows.map(
(r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
);
expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
});

View file

@ -1,166 +0,0 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect } from "vitest";
import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/** Shape of one record in omw.json, as checked by the tests in this file. */
type OmwRecord = {
source_id: string;
pos: SupportedPos;
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
examples: Partial<Record<SupportedLanguageCode, string[]>>;
};
// ── Paths ─────────────────────────────────────────────────────────────────────
// A relative path passed to path.resolve is resolved against the current
// working directory.
const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
// ── Helpers ───────────────────────────────────────────────────────────────────
/**
 * Checks that an id has the form `ili:i<digits>`.
 *
 * @param id - The source id to check.
 * @returns True when the whole string is "ili:i" followed by one or more digits.
 */
function isValidSourceId(id: string): boolean {
  const sourceIdPattern = /^ili:i\d+$/;
  return sourceIdPattern.exec(id) !== null;
}
// ── Tests ─────────────────────────────────────────────────────────────────────
// Structural validation of the stage 1 output file (omw.json).
describe("stage 1 — omw.json validation", () => {
  // The file is read and parsed once; every test awaits the same promise
  // instead of re-reading and re-parsing it. If loading fails, each test
  // fails with that same error.
  let cachedRecords: Promise<OmwRecord[]> | undefined;

  /** Returns the parsed contents of omw.json, loading it on first use. */
  function loadRecords(): Promise<OmwRecord[]> {
    if (!cachedRecords) {
      cachedRecords = fs
        .readFile(OMW_PATH, "utf-8")
        .then((raw) => JSON.parse(raw) as OmwRecord[]);
    }
    return cachedRecords;
  }

  it("file exists and is valid JSON", async () => {
    const records = await loadRecords();
    expect(records).toBeDefined();
  });

  it("is a non-empty array", async () => {
    const records = await loadRecords();
    expect(Array.isArray(records)).toBe(true);
    expect(records.length).toBeGreaterThan(0);
  });

  it("every record has required fields", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      // Without a source_id the remaining messages could not name the record.
      if (!record.source_id) {
        errors.push(`missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${record.source_id}: missing translations`);
      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
      if (!record.examples)
        errors.push(`${record.source_id}: missing examples`);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every source_id matches ili:i{number} pattern", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      if (!isValidSourceId(record.source_id)) {
        errors.push(`invalid source_id: ${record.source_id}`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every source_id is unique", async () => {
    const records = await loadRecords();
    const seen = new Set<string>();
    const errors: string[] = [];
    for (const record of records) {
      if (seen.has(record.source_id)) {
        errors.push(`duplicate source_id: ${record.source_id}`);
      }
      seen.add(record.source_id);
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every pos is a valid supported value", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);
    for (const record of records) {
      if (!validPos.has(record.pos)) {
        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("every record has at least one translation in at least one language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);
    for (const record of records) {
      const langs = Object.keys(record.translations) as SupportedLanguageCode[];
      if (langs.length === 0) {
        errors.push(`${record.source_id}: no translations`);
        continue;
      }
      for (const lang of langs) {
        if (!validLangs.has(lang)) {
          errors.push(`${record.source_id}: unsupported language "${lang}"`);
        }
        const words = record.translations[lang] ?? [];
        if (words.length === 0) {
          errors.push(`${record.source_id}: empty translations for "${lang}"`);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });

  it("no duplicate translations within a single synset and language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    for (const record of records) {
      for (const [lang, words] of Object.entries(record.translations)) {
        const seen = new Set<string>();
        for (const word of words) {
          if (seen.has(word)) {
            errors.push(
              `${record.source_id} (${lang}): duplicate translation "${word}"`,
            );
          }
          seen.add(word);
        }
      }
    }
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  });
});

View file

@ -1,218 +0,0 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, it, expect, beforeAll } from "vitest";
import {
SUPPORTED_POS,
SUPPORTED_LANGUAGE_CODES,
CEFR_LEVELS,
} from "@lila/shared";
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
/**
 * One example attached to a record. `source` is restricted to the literals "omw" and "cefr"
 * (presumably naming the dataset the text was taken from — confirm against the stage-2 script).
 */
type Example = { text: string; source: "omw" | "cefr" };
/**
 * Expected shape of one element of a per-language `<lang>.json` output array.
 * The parsed JSON is cast to this type without runtime validation, so the tests in this
 * file are what actually check the data against it.
 */
type AnnotatedRecord = {
source_id: string;
pos: SupportedPos;
// The three maps below are keyed by language code; any language may be absent.
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
// language code → word → vote; `cefr_source` is typed as a plain string and is checked
// against CEFR_LEVELS by the tests at runtime.
votes: Partial<
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
>;
};
/**
 * Expected shape of one element of `conflicts.json` (cast from parsed JSON, not validated).
 * The tests in this file require `levels` to hold at least two entries, each a CEFR_LEVELS value.
 */
type ConflictEntry = {
word: string;
pos: string;
language: SupportedLanguageCode;
levels: string[];
};
// ── Paths ─────────────────────────────────────────────────────────────────────
// Directory containing the stage-2 output files read by this suite. The argument is a
// relative path, so `path.resolve` resolves it against the current working directory.
// NOTE(review): this presumably requires running the tests from the pipeline root — confirm.
const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
// ── Tests ─────────────────────────────────────────────────────────────────────
describe("stage 2 — annotated output validation", () => {
// Parsed contents of each `<lang>.json`, keyed by language code; filled once by beforeAll.
// A language whose file is absent has no entry in the map.
const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
// Parsed contents of conflicts.json; stays empty when that file is absent.
let conflicts: ConflictEntry[] = [];

/**
 * Reads and parses one JSON file from OUTPUT_DIR.
 *
 * @param fileName - file name relative to OUTPUT_DIR
 * @returns the parsed value, or `undefined` when the file does not exist
 * @throws any read error other than ENOENT, and any JSON syntax error
 */
const loadJsonIfPresent = async (fileName: string): Promise<unknown> => {
  let raw: string;
  try {
    raw = await fs.readFile(path.join(OUTPUT_DIR, fileName), "utf-8");
  } catch (err: unknown) {
    // Only "file not found" is tolerated. If the hook rejected on a missing file, the whole
    // suite would fail before the dedicated "exists" tests could say which file is missing.
    if ((err as { code?: unknown } | null)?.code === "ENOENT") return undefined;
    throw err;
  }
  return JSON.parse(raw);
};

// Loads every output file once for the whole suite (60 s timeout). Files are read one at a
// time so that only a single raw file text is held in memory while it is being parsed.
beforeAll(async () => {
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const parsed = await loadJsonIfPresent(`${lang}.json`);
    if (parsed !== undefined) {
      // Unchecked cast: the shape is what the tests in this suite verify.
      recordsByLang.set(lang, parsed as AnnotatedRecord[]);
    }
  }
  const parsedConflicts = await loadJsonIfPresent("conflicts.json");
  if (parsedConflicts !== undefined) {
    conflicts = parsedConflicts as ConflictEntry[];
  }
}, 60_000);
// Probes `<lang>.json` in OUTPUT_DIR for every supported language code and reports each
// file that cannot be accessed, in language order.
it("all five language files exist", async () => {
  const langs = [...SUPPORTED_LANGUAGE_CODES];
  // allSettled keeps one outcome per language, so every missing file is listed, not just the first.
  const outcomes = await Promise.allSettled(
    langs.map((lang) => fs.access(path.join(OUTPUT_DIR, `${lang}.json`))),
  );
  const errors: string[] = [];
  outcomes.forEach((outcome, idx) => {
    if (outcome.status === "rejected") {
      errors.push(`missing file: ${langs[idx]}.json`);
    }
  });
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Passes when conflicts.json in OUTPUT_DIR is accessible: fs.access fulfils with undefined
// on success and rejects otherwise.
it("conflicts.json exists", async () => {
  const accessCheck = fs.access(path.join(OUTPUT_DIR, "conflicts.json"));
  await expect(accessCheck).resolves.toBeUndefined();
});
// Verifies that the value loaded for each language is an array with at least one element.
it("every language file is a non-empty array", () => {
  const errors: string[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const loaded = recordsByLang.get(lang)!;
    const isArray = Array.isArray(loaded);
    if (isArray && loaded.length > 0) continue;
    // Exactly one message per failing language: wrong type takes precedence over emptiness.
    errors.push(isArray ? `${lang}.json: empty array` : `${lang}.json: not an array`);
  }
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Checks that every record carries source_id, pos, translations, glosses, examples and votes.
// A record without source_id gets a single message, since it cannot be identified further.
it("every record has required fields", () => {
  const errors: string[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    for (const rec of recordsByLang.get(lang)!) {
      if (!rec.source_id) {
        errors.push(`${lang}: record missing source_id`);
        continue;
      }
      // pos/translations/glosses fail on any falsy value; examples/votes only when undefined.
      const checks: Array<[boolean, string]> = [
        [!rec.pos, `${lang} ${rec.source_id}: missing pos`],
        [!rec.translations, `${lang} ${rec.source_id}: missing translations`],
        [!rec.glosses, `${lang} ${rec.source_id}: missing glosses`],
        [rec.examples === undefined, `${lang} ${rec.source_id}: missing examples`],
        [rec.votes === undefined, `${lang} ${rec.source_id}: missing votes`],
      ];
      for (const [isMissing, message] of checks) {
        if (isMissing) errors.push(message);
      }
    }
  }
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Validates that the `pos` of every record in every language file is a member of SUPPORTED_POS.
it("every pos is a valid supported value", () => {
  const allowedPos = new Set(SUPPORTED_POS);
  const errors: string[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    for (const { source_id, pos } of recordsByLang.get(lang)!) {
      if (allowedPos.has(pos)) continue;
      errors.push(
        `${lang} ${source_id}: invalid pos "${pos}"`,
      );
    }
  }
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Validates every example of every record: `text` must be truthy and `source` must be
// "omw" or "cefr". Messages name the file language, the record and the example language.
it("every example has text and a valid source", () => {
  const errors: string[] = [];
  const validSources = new Set(["omw", "cefr"]);
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const records = recordsByLang.get(lang)!;
    for (const record of records) {
      for (const [l, examples] of Object.entries(record.examples)) {
        // `examples` is `Example[] | undefined` by type and could be null or a non-array in
        // the JSON. Report that as a validation error instead of letting the loop below
        // throw a TypeError and abort the remaining checks.
        if (!Array.isArray(examples)) {
          errors.push(
            `${lang} ${record.source_id} (${l}): examples is not an array`,
          );
          continue;
        }
        for (const example of examples) {
          if (!example.text) {
            errors.push(
              `${lang} ${record.source_id} (${l}): example missing text`,
            );
          }
          if (!validSources.has(example.source)) {
            errors.push(
              `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
            );
          }
        }
      }
    }
  }
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Validates that the `cefr_source` of every vote (record → language → word) is one of CEFR_LEVELS.
it("every cefr_source vote is a valid CEFR level", () => {
  type CefrLevel = (typeof CEFR_LEVELS)[number];
  const knownLevels = new Set(CEFR_LEVELS);
  const errors: string[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    for (const rec of recordsByLang.get(lang)!) {
      for (const [voteLang, wordVotes] of Object.entries(rec.votes)) {
        // A language with no vote map simply has nothing to check.
        if (wordVotes == null) continue;
        for (const [word, { cefr_source }] of Object.entries(wordVotes)) {
          if (knownLevels.has(cefr_source as CefrLevel)) continue;
          errors.push(
            `${lang} ${rec.source_id} (${voteLang} — "${word}"): invalid cefr_source "${cefr_source}"`,
          );
        }
      }
    }
  }
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
// Validates each conflicts.json entry: word, pos and language must be present, language must
// be a supported code, and `levels` must be an array of at least two CEFR_LEVELS values.
it("conflicts.json entries have required fields and valid CEFR levels", () => {
  type CefrLevel = (typeof CEFR_LEVELS)[number];
  const knownLevels = new Set(CEFR_LEVELS);
  const knownLangs = new Set(SUPPORTED_LANGUAGE_CODES);
  const errors: string[] = [];
  for (const { word, pos, language, levels } of conflicts) {
    if (!word) errors.push(`conflict missing word`);
    if (!pos) errors.push(`conflict missing pos`);
    if (language && !knownLangs.has(language)) {
      errors.push(`conflict invalid language "${language}"`);
    } else if (!language) {
      errors.push(`conflict missing language`);
    }
    // Individual levels are only inspected once the list itself is well-formed.
    if (!Array.isArray(levels) || levels.length < 2) {
      errors.push(`${word}: levels must have at least 2 entries`);
      continue;
    }
    for (const level of levels) {
      if (knownLevels.has(level as CefrLevel)) continue;
      errors.push(`${word}: invalid level "${level}"`);
    }
  }
  expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
});
});