feat: add Kaikki extraction and import scripts for stage 1
- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, skips abbreviations and senses with no translations in supported languages
- Rewrite db/import.ts for Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for Kaikki model — entries, translations, LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode hardcoded to 500 entries for development
This commit is contained in:
parent
963bff4eb8
commit
209d52f54b
17 changed files with 346 additions and 1055737 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -12,6 +12,7 @@ __pycache__/
|
||||||
|
|
||||||
data-pipeline/archive/
|
data-pipeline/archive/
|
||||||
data-pipeline/stage-1-extract/output/
|
data-pipeline/stage-1-extract/output/
|
||||||
|
data-pipeline/stage-1-extract/sources/
|
||||||
data-pipeline/stage-2-annotate/output/
|
data-pipeline/stage-2-annotate/output/
|
||||||
data-pipeline/stage-3-enrich/output/
|
data-pipeline/stage-3-enrich/output/
|
||||||
data-pipeline/stage-4-merge/output/
|
data-pipeline/stage-4-merge/output/
|
||||||
|
|
|
||||||
|
|
@ -1,362 +0,0 @@
|
||||||
# OMW German Translation Quality Audit
|
|
||||||
|
|
||||||
Instructions: for each entry, check if the German translations
|
|
||||||
match the meaning described by the English gloss.
|
|
||||||
|
|
||||||
Mark QUALITY as:
|
|
||||||
OK — all German translations fit the meaning
|
|
||||||
PARTIAL — some fit, some don't
|
|
||||||
BAD — none of the German translations fit
|
|
||||||
USELESS — translations are correct but useless for learners
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
1. [noun] ili:i98680
|
|
||||||
EN gloss: the flowering part of a plant or arrangement of flowers on a stalk
|
|
||||||
DE gloss: der blühende Teil einer Pflanze oder die Anordnung von Blüten an einem Stiel
|
|
||||||
EN words: inflorescence
|
|
||||||
DE words: Blütenstand, Infloreszenz
|
|
||||||
QUALITY: correct
|
|
||||||
|
|
||||||
2. [verb] ili:i24675
|
|
||||||
EN gloss: make motionless
|
|
||||||
DE gloss: unbeweglich machen
|
|
||||||
EN words: still
|
|
||||||
DE words: stillen, zum Stillstand bringen
|
|
||||||
QUALITY: "stillen" means breastfeeding, so it is completely wrong; "zum Stillstand bringen" is correct, but the gloss "unbeweglich machen" sounds weird — no one says this
|
|
||||||
|
|
||||||
3. [verb] ili:i22153
|
|
||||||
EN gloss: lose interest or become bored with something or somebody
|
|
||||||
DE gloss: das Interesse an etwas oder jemandem verlieren oder sich langweilen
|
|
||||||
EN words: fatigue, jade, pall, tire, weary
|
|
||||||
DE words: Langeweile erzeugen, anöden, ermüden, langweilen, sich langweilen, sich zu Tode langweilen, sich öden
|
|
||||||
QUALITY: its ok
|
|
||||||
|
|
||||||
4. [noun] ili:i74742
|
|
||||||
EN gloss: zealous preaching and advocacy of the gospel
|
|
||||||
DE gloss: eifriges Predigen und Eintreten für das Evangelium
|
|
||||||
EN words: evangelism
|
|
||||||
DE words: Evangelisation, Evangelisierung
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
5. [noun] ili:i115665
|
|
||||||
EN gloss: an oxide of iron that is strongly attracted by magnets
|
|
||||||
DE gloss: ein Eisenoxid, das stark von Magneten angezogen wird
|
|
||||||
EN words: magnetic iron-ore, magnetite
|
|
||||||
DE words: Eisenoxiduloxid, Magneteisen, Magneteisenstein, Magnetit
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
6. [adjective] ili:i17569
|
|
||||||
EN gloss: of or relating to fatalism
|
|
||||||
DE gloss: von oder im Zusammenhang mit Fatalismus
|
|
||||||
EN words: fatalist, fatalistic
|
|
||||||
DE words: auf alles gefasst, dem Schicksal ergeben, fatalistisch, gottergeben, schicksalsergeben
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
7. [adjective] ili:i682
|
|
||||||
EN gloss: having no previous example or precedent or parallel
|
|
||||||
DE gloss: ohne vorheriges Beispiel oder Präzedenzfall oder Parallele
|
|
||||||
EN words: new, unexampled
|
|
||||||
DE words: beispiellos, gab es noch nie, ohne Beispiel, ohne Präzedenzfall, ohnegleichen, präzedenzlos, sondergleichen, unvergleichbar
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
8. [noun] ili:i114018
|
|
||||||
EN gloss: a soft silvery metallic element of the rare earth group; isotope 170 emits X-rays and is used in small portable X-ray machines; it occurs in monazite and apatite and xenotime
|
|
||||||
DE gloss: ein weiches, silbriges Metallelement der Gruppe der Seltenen Erden; Isotop 170 emittiert Röntgenstrahlen und wird in kleinen tragbaren Röntgengeräten verwendet; es kommt in Monazit und Apatit sowie in Xenotim vor
|
|
||||||
EN words: Tm, atomic number 69, thulium
|
|
||||||
DE words: Terameter, Tm
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
9. [noun] ili:i117564
|
|
||||||
EN gloss: the rate of some repeating event
|
|
||||||
DE gloss: die Geschwindigkeit eines sich wiederholenden Ereignisses
|
|
||||||
EN words: pace, tempo
|
|
||||||
DE words: Takt, Tempo
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
10. [verb] ili:i31619
|
|
||||||
EN gloss: let drop or droop
|
|
||||||
DE gloss: fallen oder hängen lassen
|
|
||||||
EN words: hang
|
|
||||||
DE words: am Galgen sterben lassen, aufhängen, aufknüpfen, erhängen, henken, hängen
|
|
||||||
QUALITY: wrong — "let drop" means "fallen lassen", like dropping something? I'm not sure here: does it really mean to hang someone? If so, then it's ok
|
|
||||||
|
|
||||||
11. [noun] ili:i75571
|
|
||||||
EN gloss: a heavy dull sound (as made by impact of heavy objects)
|
|
||||||
DE gloss: ein schweres, dumpfes Geräusch (wie beim Aufprall schwerer Gegenstände)
|
|
||||||
EN words: clump, clunk, thud, thump, thumping
|
|
||||||
DE words: Geklacker, Geklapper, Klackern, Klappern
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
12. [noun] ili:i92290
|
|
||||||
EN gloss: a person who makes a promise
|
|
||||||
DE gloss: eine Person, die ein Versprechen gibt
|
|
||||||
EN words: promiser, promisor
|
|
||||||
DE words: Freud'scher Versprecher, Lapsus Linguae, Versprecher, freudscher Versprecher
|
|
||||||
QUALITY: completely wrong; a "Versprecher" is when you intend to say something but say something else — it has nothing to do with "Versprechen"
|
|
||||||
|
|
||||||
13. [noun] ili:i59450
|
|
||||||
EN gloss: a vertical well around which there is a stairway
|
|
||||||
DE gloss: ein vertikaler Schacht, um den herum eine Treppe verläuft
|
|
||||||
EN words: stairwell
|
|
||||||
DE words: Ern, Flur, Hausflur, Stiegenhaus, Treppenhaus
|
|
||||||
QUALITY: "Treppenhaus" would be the only correct one, right?
|
|
||||||
|
|
||||||
14. [verb] ili:i21908
|
|
||||||
EN gloss: smile affectedly or derisively
|
|
||||||
DE gloss: affektiert oder spöttisch lächeln
|
|
||||||
EN words: simper, smirk
|
|
||||||
DE words: in sich hinein lächeln, schmunzeln, vor sich hin lächeln
|
|
||||||
QUALITY: the glosses would also be the words here? "schmunzeln" and "lächeln" are kind of the same, but the "affektiert" and "spöttisch" aspect is missing?
|
|
||||||
|
|
||||||
15. [adjective] ili:i10887
|
|
||||||
EN gloss: tending to reserve or introspection
|
|
||||||
DE gloss: zur Zurückhaltung oder Introspektion neigend
|
|
||||||
EN words: indrawn, withdrawn
|
|
||||||
DE words: allein, einsam, eremitenhaft, eremitisch, für sich, solo, wie ein Einsiedler, wie ein Eremit, zurückgezogen
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
16. [noun] ili:i113657
|
|
||||||
EN gloss: a substance from which another substance is formed (especially by a metabolic reaction)
|
|
||||||
DE gloss: ein Stoff, aus dem ein anderer Stoff gebildet wird (insbesondere durch eine Stoffwechselreaktion)
|
|
||||||
EN words: precursor
|
|
||||||
DE words: Ausgangsstoff, Edukt, Grundstoff, Präkursor, Vorläufer, biologische Vorstufe
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
17. [adjective] ili:i13251
|
|
||||||
EN gloss: tastelessly showy
|
|
||||||
DE gloss: geschmacklos und auffällig
|
|
||||||
EN words: brassy, cheap, flash, flashy, garish, gaudy, gimcrack, loud, meretricious, tacky, tatty, tawdry, trashy
|
|
||||||
DE words: aufdringlich, marktschreierisch, reißerisch
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
18. [noun] ili:i68734
|
|
||||||
EN gloss: the branch of chemistry that studies the relation between chemical action and the amount of heat absorbed or generated
|
|
||||||
DE gloss: der Zweig der Chemie, der die Beziehung zwischen chemischer Wirkung und der absorbierten oder erzeugten Wärmemenge untersucht
|
|
||||||
EN words: thermochemistry
|
|
||||||
DE words: Thermochemie, chemische Thermodynamik
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
19. [adjective] ili:i12980
|
|
||||||
EN gloss: distinguished from others in excellence
|
|
||||||
DE gloss: durch hohe Qualität von anderen unterschieden
|
|
||||||
EN words: outstanding
|
|
||||||
DE words: I a, ausgezeichnet, außergewöhnlich, außerordentlich, besonders, bestens, eins a, exzeptionell, herausragend, schnafte, splendid, trefflich, vortrefflich, vorzüglich
|
|
||||||
QUALITY: ok, aber "eins a"/"1a" ist wirklich sehr starke Umgangssprache, und ich habe noch nie "schnafte" oder "splendid" gehört; der Rest passt
|
|
||||||
|
|
||||||
20. [verb] ili:i30043
|
|
||||||
EN gloss: tear down so as to make flat with the ground
|
|
||||||
DE gloss: abreißen, um den Boden zu ebnen
|
|
||||||
EN words: dismantle, level, pull down, rase, raze, take down, tear down
|
|
||||||
DE words: abreißen, aus den Augen verlieren, keinen Kontakt mehr haben zu, nicht länger in Kontakt stehen
|
|
||||||
QUALITY: nur abreißen stimmt, der rest passt in diesem zusammenhang gar nicht!
|
|
||||||
|
|
||||||
21. [adjective] ili:i14014
|
|
||||||
EN gloss: desired or wished for or sought
|
|
||||||
DE gloss: gewünscht oder gewünscht oder gesucht
|
|
||||||
EN words: wanted
|
|
||||||
DE words: benötigt, gesucht, gewünscht
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
22. [verb] ili:i29481
|
|
||||||
EN gloss: mar or spoil the appearance of
|
|
||||||
DE gloss: das Aussehen verunstalten
|
|
||||||
EN words: blemish, deface, disfigure
|
|
||||||
DE words: deformieren, entstellen, verhunzen, verschandeln, verunstalten, verunzieren
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
23. [verb] ili:i28605
|
|
||||||
EN gloss: spread thickly
|
|
||||||
DE gloss: dick auftragen
|
|
||||||
EN words: slather
|
|
||||||
DE words: beharken, bestreichen, mit Feuer belegen, mit Sperrfeuer belegen
|
|
||||||
QUALITY: kein Wort ist wirklich ein Synonym für "dick auftragen" (I don't even know if the English word fits here?)
|
|
||||||
|
|
||||||
24. [noun] ili:i92029
|
|
||||||
EN gloss: someone who is licensed to operate an aircraft in flight
|
|
||||||
DE gloss: jemand, der eine Lizenz zum Führen eines Luftfahrzeugs im Flug hat
|
|
||||||
EN words: airplane pilot, pilot
|
|
||||||
DE words: Führer, Lotse, Pilot
|
|
||||||
QUALITY: nur Pilot stimmt hier
|
|
||||||
|
|
||||||
25. [adjective] ili:i8221
|
|
||||||
EN gloss: capable of being measured
|
|
||||||
DE gloss: in der Lage, gemessen zu werden
|
|
||||||
EN words: measurable, mensurable
|
|
||||||
DE words: bestimmbar, der Messung zugänglich, erhebbar, mensurabel, messbar
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
26. [noun] ili:i61380
|
|
||||||
EN gloss: the spirit of a group that makes the members want the group to succeed
|
|
||||||
DE gloss: der Geist einer Gruppe, der die Mitglieder dazu bringt, den Erfolg der Gruppe zu wollen
|
|
||||||
EN words: esprit de corps, morale, team spirit
|
|
||||||
DE words: Gruppengeist, Teamgeist
|
|
||||||
QUALITY: Gruppengeist hört sich so komisch an, das sagt niemand, teamgeist ist in ordnung
|
|
||||||
|
|
||||||
27. [adjective] ili:i10497
|
|
||||||
EN gloss: free of restrictions or qualifications
|
|
||||||
DE gloss: Zustand, in dem in einer Wohnung niemand wohnt.
|
|
||||||
EN words: clean, clear
|
|
||||||
DE words: frei, leer stehend, leerstehend, unbewohnt, ungenutzt, verwaist
|
|
||||||
QUALITY: ok
|
|
||||||
|
|
||||||
28. [adjective] ili:i6238
|
|
||||||
EN gloss: moving and bending with ease
|
|
||||||
DE gloss: anmutig schlank und mit Leichtigkeit biegsam und beweglich
|
|
||||||
EN words: lissom, lissome, lithe, lithesome, slender, supple, svelte, sylphlike
|
|
||||||
DE words: elastisch, geschmeidig, schlangenartig
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
29. [noun] ili:i57906
|
|
||||||
EN gloss: station for the production and transmission of AM or FM radio broadcasts
|
|
||||||
DE gloss: Sender für die Produktion und Übertragung von AM- oder FM-Radiosendungen
|
|
||||||
EN words: radio station
|
|
||||||
DE words: Radiosender, Rundfunkstation, Sender
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
30. [noun] ili:i112045
|
|
||||||
EN gloss: the purple or black-and-blue area resulting from a bruise
|
|
||||||
DE gloss: der violette oder schwarzblaue Bereich, der durch einen Bluterguss entsteht
|
|
||||||
EN words: ecchymosis
|
|
||||||
DE words: Ekchymose, kleinflächige Hautblutung
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
31. [adjective] ili:i10839
|
|
||||||
EN gloss: capable of being replaced
|
|
||||||
DE gloss: kann ersetzt werden
|
|
||||||
EN words: replaceable
|
|
||||||
DE words: austauschbar, ersetzbar, fungibel
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
32. [verb] ili:i28714
|
|
||||||
EN gloss: whip
|
|
||||||
DE gloss: peitschen
|
|
||||||
EN words: flagellate, scourge
|
|
||||||
DE words: auspeitschen, flagellieren, geißeln, peitschen
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
33. [noun] ili:i52826
|
|
||||||
EN gloss: a mechanical or electrical explosive device or a small amount of explosive; can be used to initiate the reaction of a disrupting explosive
|
|
||||||
DE gloss: ein mechanischer oder elektrischer Sprengkörper oder eine kleine Menge Sprengstoff; kann verwendet werden, um die Reaktion eines Sprengstoffs auszulösen
|
|
||||||
EN words: cap, detonating device, detonator
|
|
||||||
DE words: Auslöser, Zünder, Zündvorrichtung
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
34. [noun] ili:i115477
|
|
||||||
EN gloss: ice crystals forming a white deposit (especially on objects outside)
|
|
||||||
DE gloss: Eiskristalle, die einen weißen Belag bilden (insbesondere auf Gegenständen im Freien)
|
|
||||||
EN words: frost, hoar, hoarfrost, rime
|
|
||||||
DE words: Raufrost, Raureif, Reif
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
35. [noun] ili:i66650
|
|
||||||
EN gloss: the ability to see in reduced illumination (as in moonlight)
|
|
||||||
DE gloss: die Fähigkeit, bei reduzierter Beleuchtung zu sehen (wie bei Mondlicht)
|
|
||||||
EN words: night vision, night-sight, scotopic vision, twilight vision
|
|
||||||
DE words: Nachtsehen, skotopisches Sehen
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
36. [verb] ili:i26849
|
|
||||||
EN gloss: express or utter with a hiss
|
|
||||||
DE gloss: mit einem Zischen ausdrücken oder aussprechen
|
|
||||||
EN words: hiss, sibilate, siss, sizz
|
|
||||||
DE words: Stimme dämpfen, flüstern, hauchen, hinter vorgehaltener Hand, ins Ohr sagen, leise sprechen, mit tonloser Stimme, munkeln, raunen, säuseln, tonlos, tuscheln, wispern, zischeln, zuflüstern
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
37. [noun] ili:i94222
|
|
||||||
EN gloss: a teenager or a young adult male
|
|
||||||
DE gloss: ein Jugendlicher oder ein junger Erwachsener
|
|
||||||
EN words: young buck, young man
|
|
||||||
DE words: Bruder, Bürschchen, Cowboy, Freundchen, Jungs, Kinders, Kollege, Kollegin, Leute, Mann Gottes, Meister, Sportsfreund, Verehrtester, der Herr, guter Mann, junger Mann, mein Gutster, mein Herr
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
38. [noun] ili:i49310
|
|
||||||
EN gloss: dusky grey food fish found from Louisiana and Florida southward
|
|
||||||
DE gloss: dunkelgrauer Speisefisch, der von Louisiana und Florida südwärts vorkommt
|
|
||||||
EN words: Anisotremus surinamensis, black margate, pompon
|
|
||||||
DE words: Pompon, Puschel, Tanzwedel
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
39. [noun] ili:i50315
|
|
||||||
EN gloss: a small vehicle with four wheels in which a baby or child is pushed around
|
|
||||||
DE gloss: ein kleines Fahrzeug mit vier Rädern, in dem ein Säugling oder ein Kind herumgeschoben wird
|
|
||||||
EN words: baby buggy, baby carriage, carriage, go-cart, perambulator, pram, pushchair, pusher, stroller
|
|
||||||
DE words: Kinderwagen, Säuglingskutsche
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
40. [verb] ili:i31857
|
|
||||||
EN gloss: meet at a point
|
|
||||||
DE gloss: sich an einem Punkt treffen
|
|
||||||
EN words: cross, intersect
|
|
||||||
DE words: gegen den Wind segeln, kreuzen
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
41. [noun] ili:i51632
|
|
||||||
EN gloss: a sailboat with two parallel hulls held together by single deck
|
|
||||||
DE gloss: ein Boot mit zwei parallelen Rümpfen, die durch ein einziges Deck zusammengehalten werden
|
|
||||||
EN words: catamaran
|
|
||||||
DE words: Doppelrumpfboot, Katamaran, Zweirumpfboot
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
42. [verb] ili:i34734
|
|
||||||
EN gloss: to be found to exist
|
|
||||||
DE gloss: als existent befunden werden
|
|
||||||
EN words: occur
|
|
||||||
DE words: anzutreffen sein, auftreten, nicht ausbleiben, vorkommen, zu finden sein, zu sehen sein
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
43. [verb] ili:i25187
|
|
||||||
EN gloss: assign too high a value to
|
|
||||||
DE gloss: einen zu hohen Wert zuweisen
|
|
||||||
EN words: overestimate, overvalue
|
|
||||||
DE words: zu hoch bewerten, zu viel Gewicht beimessen, zu viel Wichtigkeit beimessen, überbewerten, überschätzen
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
44. [noun] ili:i73844
|
|
||||||
EN gloss: an expressive style of music
|
|
||||||
DE gloss: ein ausdrucksstarker Musikstil
|
|
||||||
EN words: genre, music genre, musical genre, musical style
|
|
||||||
DE words: Genre, Musikgenre, Musikrichtung, Musikstil, Stilrichtung
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
45. [noun] ili:i113026
|
|
||||||
EN gloss: an abnormal condition in which cerebrospinal fluid collects in the ventricles of the brain; in infants it can cause abnormally rapid growth of the head and bulging fontanelles and a small face; in adults the symptoms are primarily neurological
|
|
||||||
DE gloss: ein anormaler Zustand, bei dem sich Liquor in den Hirnventrikeln sammelt; bei Säuglingen kann er zu einem anormal schnellen Wachstum des Kopfes, zu wulstigen Fontanellen und einem kleinen Gesicht führen; bei Erwachsenen sind die Symptome hauptsächlich neurologisch
|
|
||||||
EN words: hydrocephalus, hydrocephaly
|
|
||||||
DE words: Gehirnwassersucht, Hydrocephalus, Hydrozephalus, Wasserkopf
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
46. [noun] ili:i62720
|
|
||||||
EN gloss: habitual uncleanliness
|
|
||||||
DE gloss: gewohnheitsmäßige Unreinheit
|
|
||||||
EN words: slovenliness
|
|
||||||
DE words: Flickarbeit, Flickenteppich, Flickwerk, Gestümper, Mist, Murks, Murkserei, Pfusch, Pfuscharbeit, Pfuscherei, Schlamperei, Schlendrian, Schluderei, Schund, schlechte Arbeit
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
47. [noun] ili:i80976
|
|
||||||
EN gloss: the government agency in the United Kingdom that is responsible for internal security and counterintelligence overseas
|
|
||||||
DE gloss: Regierungsbehörde im Vereinigten Königreich, die für die innere Sicherheit und die Spionageabwehr im Ausland zuständig ist.
|
|
||||||
EN words: MI, Military Intelligence Section 6, Secret Intelligence Service
|
|
||||||
DE words: MI6, SIS, Secret Intelligence Service, Secret Service, britischer Auslandsgeheimdienst
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
48. [noun] ili:i60476
|
|
||||||
EN gloss: an electrical device by which alternating current of one voltage is changed to another voltage
|
|
||||||
DE gloss: ein elektrisches Gerät, mit dem Wechselstrom einer bestimmten Spannung in eine andere Spannung umgewandelt wird
|
|
||||||
EN words: transformer
|
|
||||||
DE words: Spannungswandler, Trafo, Transformator, Transformer
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
49. [noun] ili:i37037
|
|
||||||
EN gloss: wandering from the main path of a journey
|
|
||||||
DE gloss: das Abweichen vom Hauptweg einer Reise
|
|
||||||
EN words: digression, excursion
|
|
||||||
DE words: Abschweifung, Abstecher, Einschub, Exkurs, Umschweif
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
||||||
50. [noun] ili:i77288
|
|
||||||
EN gloss: any meat that is minced and spiced and cooked as patties or used to fill sausages
|
|
||||||
DE gloss: jegliches Fleisch, das zerkleinert und gewürzt und als Pasteten gekocht oder zur Füllung von Würsten verwendet wird
|
|
||||||
EN words: sausage meat
|
|
||||||
DE words: Brät, Wurstbrät
|
|
||||||
QUALITY: \_\_\_
|
|
||||||
|
|
@ -1,185 +1,98 @@
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import { fileURLToPath } from "node:url";
|
import { fileURLToPath } from "node:url";
|
||||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
|
||||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
|
||||||
import { openDb } from "./index.js";
|
import { openDb } from "./index.js";
|
||||||
|
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
||||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
type Example = { text: string; source: "omw" | "cefr" };
|
|
||||||
|
|
||||||
type AnnotatedRecord = {
|
|
||||||
source_id: string;
|
|
||||||
pos: SupportedPos;
|
|
||||||
translations: Partial<Record<SupportedLanguageCode, string[]>>;
|
|
||||||
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
|
|
||||||
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
|
|
||||||
votes: Partial<
|
|
||||||
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
|
|
||||||
>;
|
|
||||||
};
|
|
||||||
|
|
||||||
// ── Paths ─────────────────────────────────────────────────────────────────────
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
const PATHS = {
|
const PATHS = {
|
||||||
annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
|
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
|
||||||
};
|
};
|
||||||
|
|
||||||
// ── Loading ───────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
|
|
||||||
// Use en.json as the base — it has the most complete glosses and examples.
|
|
||||||
// Merge votes and CEFR examples from the other language files.
|
|
||||||
const baseRaw = await fs.readFile(
|
|
||||||
path.join(PATHS.annotatedDir, "en.json"),
|
|
||||||
"utf-8",
|
|
||||||
);
|
|
||||||
const base = JSON.parse(baseRaw) as AnnotatedRecord[];
|
|
||||||
|
|
||||||
const byId = new Map<string, AnnotatedRecord>();
|
|
||||||
for (const record of base) {
|
|
||||||
byId.set(record.source_id, record);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const lang of SUPPORTED_LANGUAGE_CODES) {
|
|
||||||
if (lang === "en") continue;
|
|
||||||
|
|
||||||
const raw = await fs.readFile(
|
|
||||||
path.join(PATHS.annotatedDir, `${lang}.json`),
|
|
||||||
"utf-8",
|
|
||||||
);
|
|
||||||
const records = JSON.parse(raw) as AnnotatedRecord[];
|
|
||||||
|
|
||||||
for (const record of records) {
|
|
||||||
const base = byId.get(record.source_id);
|
|
||||||
if (!base) continue;
|
|
||||||
|
|
||||||
// Merge votes
|
|
||||||
for (const [l, langVotes] of Object.entries(record.votes)) {
|
|
||||||
if (!base.votes[l as SupportedLanguageCode]) {
|
|
||||||
base.votes[l as SupportedLanguageCode] = {};
|
|
||||||
}
|
|
||||||
Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Merge CEFR examples not already in base
|
|
||||||
for (const [l, examples] of Object.entries(record.examples)) {
|
|
||||||
const lang = l as SupportedLanguageCode;
|
|
||||||
const cefrExamples = examples.filter((e) => e.source === "cefr");
|
|
||||||
if (cefrExamples.length === 0) continue;
|
|
||||||
|
|
||||||
if (!base.examples[lang]) {
|
|
||||||
base.examples[lang] = cefrExamples;
|
|
||||||
} else {
|
|
||||||
base.examples[lang].push(...cefrExamples);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return [...byId.values()];
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Import ────────────────────────────────────────────────────────────────────
|
// ── Import ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export async function importStage2(): Promise<void> {
|
export async function importKaikki(): Promise<void> {
|
||||||
console.log("Loading stage 2 annotated files...");
|
console.log("Loading extracted Kaikki data...");
|
||||||
const records = await loadAnnotated();
|
const raw = await fs.readFile(PATHS.extracted, "utf-8");
|
||||||
console.log(` Loaded ${records.length.toLocaleString()} synsets`);
|
const senses = JSON.parse(raw) as ExtractedSense[];
|
||||||
|
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
|
||||||
|
|
||||||
const db = openDb();
|
const db = openDb();
|
||||||
|
|
||||||
const insertSynset = db.prepare(
|
const insertEntry = db.prepare(`
|
||||||
`INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
|
INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
|
||||||
);
|
VALUES (?, ?, ?, ?, ?, ?)
|
||||||
|
ON CONFLICT (headword, language, pos, sense_index)
|
||||||
const insertTranslation = db.prepare(
|
DO UPDATE SET
|
||||||
`INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
|
gloss = excluded.gloss,
|
||||||
);
|
examples = excluded.examples
|
||||||
|
RETURNING id
|
||||||
const insertGloss = db.prepare(
|
|
||||||
`INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
|
|
||||||
);
|
|
||||||
|
|
||||||
const insertExample = db.prepare(
|
|
||||||
`INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
|
|
||||||
);
|
|
||||||
|
|
||||||
const insertCefrVote = db.prepare(`
|
|
||||||
INSERT INTO cefr_source_votes (translation_id, cefr_level)
|
|
||||||
VALUES (
|
|
||||||
(SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
|
|
||||||
?
|
|
||||||
)
|
|
||||||
`);
|
`);
|
||||||
|
|
||||||
|
const insertTranslation = db.prepare(`
|
||||||
|
INSERT INTO translations (entry_id, target_lang, word, sense_hint)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
|
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
|
||||||
|
`);
|
||||||
|
|
||||||
|
// Track next available sense_index per (headword, pos) to handle
|
||||||
|
// the same word appearing in multiple JSONL entries with the same POS.
|
||||||
|
const senseIndexMap = new Map<string, number>();
|
||||||
|
|
||||||
console.log("\nImporting into pipeline.db...");
|
console.log("\nImporting into pipeline.db...");
|
||||||
|
|
||||||
const importAll = db.transaction(() => {
|
const importAll = db.transaction(() => {
|
||||||
let synsets = 0;
|
let entries = 0;
|
||||||
let translations = 0;
|
let translations = 0;
|
||||||
let glosses = 0;
|
let skipped = 0;
|
||||||
let examples = 0;
|
|
||||||
let cefrVotes = 0;
|
|
||||||
|
|
||||||
for (const record of records) {
|
for (const sense of senses) {
|
||||||
insertSynset.run(record.source_id, record.pos);
|
const key = `${sense.headword}|${sense.pos}`;
|
||||||
synsets++;
|
const nextIndex = senseIndexMap.get(key) ?? 0;
|
||||||
|
|
||||||
// Translations
|
// Use the offset sense_index to avoid collisions when the same word
|
||||||
for (const [lang, words] of Object.entries(record.translations)) {
|
// appears in multiple JSONL entries with the same POS.
|
||||||
const unique = [...new Set(words)];
|
const senseIndex = nextIndex;
|
||||||
for (const word of unique) {
|
senseIndexMap.set(key, nextIndex + 1);
|
||||||
insertTranslation.run(record.source_id, lang, word);
|
|
||||||
translations++;
|
const row = insertEntry.get(
|
||||||
}
|
sense.headword,
|
||||||
|
"en",
|
||||||
|
sense.pos,
|
||||||
|
senseIndex,
|
||||||
|
sense.gloss ?? null,
|
||||||
|
JSON.stringify(sense.examples),
|
||||||
|
) as { id: number } | undefined;
|
||||||
|
|
||||||
|
if (!row) {
|
||||||
|
skipped++;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Glosses
|
entries++;
|
||||||
for (const [lang, glossList] of Object.entries(record.glosses)) {
|
|
||||||
for (const text of glossList) {
|
|
||||||
insertGloss.run(record.source_id, lang, text);
|
|
||||||
glosses++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Examples
|
for (const t of sense.translations) {
|
||||||
for (const [lang, exList] of Object.entries(record.examples)) {
|
insertTranslation.run(
|
||||||
for (const example of exList) {
|
row.id,
|
||||||
insertExample.run(
|
t.target_lang,
|
||||||
record.source_id,
|
t.word,
|
||||||
lang,
|
t.sense_hint ?? null,
|
||||||
example.text,
|
);
|
||||||
example.source,
|
translations++;
|
||||||
);
|
|
||||||
examples++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// CEFR source votes
|
|
||||||
for (const [lang, langVotes] of Object.entries(record.votes)) {
|
|
||||||
for (const [word, vote] of Object.entries(
|
|
||||||
langVotes as Record<string, { cefr_source: string }>,
|
|
||||||
)) {
|
|
||||||
insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
|
|
||||||
cefrVotes++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return { synsets, translations, glosses, examples, cefrVotes };
|
return { entries, translations, skipped };
|
||||||
});
|
});
|
||||||
|
|
||||||
const counts = importAll();
|
const counts = importAll();
|
||||||
|
|
||||||
console.log(` synsets: ${counts.synsets.toLocaleString()}`);
|
console.log(` entries: ${counts.entries.toLocaleString()}`);
|
||||||
console.log(` translations: ${counts.translations.toLocaleString()}`);
|
console.log(` translations: ${counts.translations.toLocaleString()}`);
|
||||||
console.log(` glosses: ${counts.glosses.toLocaleString()}`);
|
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
|
||||||
console.log(` examples: ${counts.examples.toLocaleString()}`);
|
|
||||||
console.log(` cefr votes: ${counts.cefrVotes.toLocaleString()}`);
|
|
||||||
|
|
||||||
db.close();
|
db.close();
|
||||||
console.log("\nImport complete.");
|
console.log("\nImport complete.");
|
||||||
|
|
@ -189,7 +102,7 @@ export async function importStage2(): Promise<void> {
|
||||||
|
|
||||||
export function isImported(): boolean {
|
export function isImported(): boolean {
|
||||||
const db = openDb();
|
const db = openDb();
|
||||||
const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
|
const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
|
||||||
count: number;
|
count: number;
|
||||||
};
|
};
|
||||||
db.close();
|
db.close();
|
||||||
|
|
@ -200,20 +113,20 @@ export function isImported(): boolean {
|
||||||
|
|
||||||
async function main(): Promise<void> {
|
async function main(): Promise<void> {
|
||||||
const db = openDb();
|
const db = openDb();
|
||||||
const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
|
const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
|
||||||
count: number;
|
count: number;
|
||||||
};
|
};
|
||||||
db.close();
|
db.close();
|
||||||
|
|
||||||
if (row.count > 0) {
|
if (row.count > 0) {
|
||||||
console.log(
|
console.log(
|
||||||
`pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`,
|
`pipeline.db already contains ${row.count.toLocaleString()} entries — skipping import.`,
|
||||||
);
|
);
|
||||||
console.log("Delete pipeline.db and re-run db:init to start fresh.");
|
console.log("Delete pipeline.db and re-run db:init to start fresh.");
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
await importStage2();
|
await importKaikki();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (import.meta.url === `file://${process.argv[1]}`) {
|
if (import.meta.url === `file://${process.argv[1]}`) {
|
||||||
|
|
|
||||||
|
|
@ -1,62 +1,58 @@
|
||||||
-- ── Base data ─────────────────────────────────────────────────────────────────
|
-- ── Base data ─────────────────────────────────────────────────────────────────
|
||||||
-- Imported from stage 2 JSON on first run. Never mutated after import.
|
-- Imported from Kaikki on first run. Never mutated after import.
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS synsets (
|
CREATE TABLE IF NOT EXISTS entries (
|
||||||
source_id TEXT PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
pos TEXT NOT NULL
|
headword TEXT NOT NULL,
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
pos TEXT NOT NULL,
|
||||||
|
sense_index INTEGER NOT NULL DEFAULT 0,
|
||||||
|
gloss TEXT,
|
||||||
|
examples TEXT NOT NULL DEFAULT '[]', -- JSON array of strings
|
||||||
|
source TEXT NOT NULL DEFAULT 'kaikki',
|
||||||
|
UNIQUE (headword, language, pos, sense_index)
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS translations (
|
CREATE TABLE IF NOT EXISTS translations (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
language TEXT NOT NULL,
|
target_lang TEXT NOT NULL,
|
||||||
word TEXT NOT NULL,
|
word TEXT NOT NULL,
|
||||||
UNIQUE (source_id, language, word)
|
sense_hint TEXT,
|
||||||
);
|
source TEXT NOT NULL DEFAULT 'kaikki',
|
||||||
|
UNIQUE (entry_id, target_lang, word)
|
||||||
CREATE TABLE IF NOT EXISTS glosses (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
|
||||||
language TEXT NOT NULL,
|
|
||||||
text TEXT NOT NULL
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS examples (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
|
||||||
language TEXT NOT NULL,
|
|
||||||
text TEXT NOT NULL,
|
|
||||||
source TEXT NOT NULL
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS cefr_source_votes (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
|
||||||
cefr_level TEXT NOT NULL,
|
|
||||||
UNIQUE (translation_id)
|
|
||||||
);
|
);
|
||||||
|
|
||||||
-- ── Status tracking ───────────────────────────────────────────────────────────
|
-- ── Status tracking ───────────────────────────────────────────────────────────
|
||||||
-- One row per synset per model per stage. Drives resumability.
|
-- One row per entry per model per stage. Drives resumability.
|
||||||
|
-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
|
||||||
-- stage: round1 | round2 | tiebreak
|
-- stage: round1 | round2 | tiebreak
|
||||||
-- status: pending | complete | needs_review | flagged
|
-- status: pending | complete | needs_review | flagged
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS run_status (
|
CREATE TABLE IF NOT EXISTS run_status (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
source_id TEXT NOT NULL,
|
entry_id INTEGER NOT NULL,
|
||||||
model_name TEXT NOT NULL,
|
model_name TEXT NOT NULL,
|
||||||
stage TEXT NOT NULL,
|
stage TEXT NOT NULL,
|
||||||
status TEXT NOT NULL,
|
status TEXT NOT NULL,
|
||||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
UNIQUE (source_id, model_name, stage)
|
UNIQUE (entry_id, model_name, stage)
|
||||||
);
|
);
|
||||||
|
|
||||||
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
||||||
-- One row per translation/language per model. Written atomically per record.
|
-- Written atomically per entry per model.
|
||||||
-- Unique constraints enforce one model one vote.
|
-- Unique constraints enforce one model one vote.
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS model_cefr_votes (
|
CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
cefr_level TEXT NOT NULL,
|
||||||
|
UNIQUE (entry_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||||
model_name TEXT NOT NULL,
|
model_name TEXT NOT NULL,
|
||||||
|
|
@ -64,38 +60,29 @@ CREATE TABLE IF NOT EXISTS model_cefr_votes (
|
||||||
UNIQUE (translation_id, model_name)
|
UNIQUE (translation_id, model_name)
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS model_translation_rejections (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
|
||||||
model_name TEXT NOT NULL,
|
|
||||||
UNIQUE (translation_id, model_name)
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS generated_glosses (
|
CREATE TABLE IF NOT EXISTS generated_glosses (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
model_name TEXT NOT NULL,
|
model_name TEXT NOT NULL,
|
||||||
language TEXT NOT NULL,
|
|
||||||
text TEXT NOT NULL,
|
text TEXT NOT NULL,
|
||||||
UNIQUE (source_id, model_name, language)
|
UNIQUE (entry_id, model_name)
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS generated_examples (
|
CREATE TABLE IF NOT EXISTS generated_examples (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
model_name TEXT NOT NULL,
|
model_name TEXT NOT NULL,
|
||||||
language TEXT NOT NULL,
|
|
||||||
text TEXT NOT NULL,
|
text TEXT NOT NULL,
|
||||||
UNIQUE (source_id, model_name, language)
|
UNIQUE (entry_id, model_name)
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS generated_descriptions (
|
CREATE TABLE IF NOT EXISTS generated_translations (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
model_name TEXT NOT NULL,
|
model_name TEXT NOT NULL,
|
||||||
language TEXT NOT NULL,
|
target_lang TEXT NOT NULL,
|
||||||
text TEXT NOT NULL,
|
word TEXT NOT NULL,
|
||||||
UNIQUE (source_id, model_name, language)
|
UNIQUE (entry_id, model_name, target_lang)
|
||||||
);
|
);
|
||||||
|
|
||||||
-- ── Round 2 output ────────────────────────────────────────────────────────────
|
-- ── Round 2 output ────────────────────────────────────────────────────────────
|
||||||
|
|
@ -116,20 +103,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
|
||||||
UNIQUE (example_id, model_name)
|
UNIQUE (example_id, model_name)
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS description_candidate_votes (
|
CREATE TABLE IF NOT EXISTS translation_candidate_votes (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
|
translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
|
||||||
model_name TEXT NOT NULL,
|
model_name TEXT NOT NULL,
|
||||||
UNIQUE (description_id, model_name)
|
UNIQUE (translation_id, model_name)
|
||||||
);
|
);
|
||||||
|
|
||||||
-- ── Resolved output ───────────────────────────────────────────────────────────
|
-- ── Resolved output ───────────────────────────────────────────────────────────
|
||||||
-- Written by merge. Never updated after writing.
|
-- Written by merge. Never updated after writing.
|
||||||
-- Only fully resolved records are written here — no nulls, no flags.
|
-- Only fully resolved records are written here — no nulls.
|
||||||
-- Absence of a row means unresolved. Flagged status tracked in run_status.
|
-- Absence of a row means unresolved. Flagged status tracked in run_status.
|
||||||
-- source: omw | cefr | model_name
|
-- source: kaikki | model_name
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS resolved_translations (
|
CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
|
cefr_level TEXT NOT NULL,
|
||||||
|
difficulty TEXT NOT NULL,
|
||||||
|
UNIQUE (entry_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||||
cefr_level TEXT NOT NULL,
|
cefr_level TEXT NOT NULL,
|
||||||
|
|
@ -138,27 +133,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS resolved_glosses (
|
CREATE TABLE IF NOT EXISTS resolved_glosses (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
language TEXT NOT NULL,
|
text TEXT NOT NULL,
|
||||||
text TEXT NOT NULL,
|
source TEXT NOT NULL,
|
||||||
source TEXT NOT NULL,
|
UNIQUE (entry_id)
|
||||||
UNIQUE (source_id, language)
|
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS resolved_examples (
|
CREATE TABLE IF NOT EXISTS resolved_examples (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
language TEXT NOT NULL,
|
text TEXT NOT NULL,
|
||||||
text TEXT NOT NULL,
|
source TEXT NOT NULL
|
||||||
source TEXT NOT NULL
|
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS resolved_descriptions (
|
CREATE TABLE IF NOT EXISTS resolved_generated_translations (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||||
language TEXT NOT NULL,
|
target_lang TEXT NOT NULL,
|
||||||
text TEXT NOT NULL,
|
word TEXT NOT NULL,
|
||||||
source TEXT NOT NULL,
|
source TEXT NOT NULL,
|
||||||
UNIQUE (source_id, language)
|
UNIQUE (entry_id, target_lang)
|
||||||
);
|
);
|
||||||
|
|
|
||||||
|
|
@ -1,204 +0,0 @@
|
||||||
"""
|
|
||||||
data-pipeline/stage-1-extract/scripts/extract.py
|
|
||||||
|
|
||||||
Extract all synsets from the Open Multilingual Wordnet (OMW) for all
|
|
||||||
supported languages and parts of speech.
|
|
||||||
|
|
||||||
Output: a single combined JSON file, written to stage-1-extract/output/
|
|
||||||
omw.json
|
|
||||||
|
|
||||||
The file is a JSON array of synset records:
|
|
||||||
{
|
|
||||||
"source_id": "ili:i12345",
|
|
||||||
"pos": "noun",
|
|
||||||
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
|
|
||||||
"glosses": { "en": ["a domesticated animal..."] },
|
|
||||||
"examples": { "en": ["the dog barked at the stranger"] }
|
|
||||||
}
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
python stage-1-extract/scripts/extract.py
|
|
||||||
python stage-1-extract/scripts/extract.py --sample
|
|
||||||
|
|
||||||
Prerequisites:
|
|
||||||
pip install wn
|
|
||||||
python -m wn download omw-en:1.4
|
|
||||||
python -m wn download omw-it:1.4
|
|
||||||
python -m wn download omw-de:1.4
|
|
||||||
python -m wn download omw-es:1.4
|
|
||||||
python -m wn download omw-fr:1.4
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import wn
|
|
||||||
|
|
||||||
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]
|
|
||||||
POS_MAP: dict[str, str] = {
|
|
||||||
"n": "noun",
|
|
||||||
"v": "verb",
|
|
||||||
"a": "adjective",
|
|
||||||
"s": "adjective", # adjective satellite — collapsed into adjective
|
|
||||||
"r": "adverb",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def extract_all(
    output_dir: str = "stage-1-extract/output", sample: bool = False
) -> None:
    """Extract OMW synsets for all supported languages into one JSON file.

    Loads one wordnet per entry of SUPPORTED_LANGUAGE_CODES, groups lemmas,
    definitions and examples by ILI across every POS in POS_MAP, then writes
    the combined records to ``omw.json`` inside ``output_dir`` and prints a
    coverage summary. Exits with status 1 if a wordnet fails to load.

    Args:
        output_dir: Directory that receives ``omw.json``; created if missing.
        sample: When true, only the first 100 ILIs (in sorted order) are kept.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    sample_size = 100 if sample else None

    # One Wordnet object per language, loaded before any extraction starts.
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {lang}: {e}")
            print(f" Run: python -m wn download omw-{lang}:1.4")
            sys.exit(1)

    # Accumulate per-ILI data; each ILI maps to {"pos": ..., <lang>: {...}}.
    print("\nExtracting synsets...")
    by_ili: dict[str, dict] = {}

    for lang, wnet in wordnets.items():
        for omw_pos, pos_label in POS_MAP.items():
            covered = 0
            for synset in wnet.synsets(pos=omw_pos):
                ili = synset.ili
                if not ili:
                    continue
                covered += 1

                incoming = {
                    "lemmas": list(
                        dict.fromkeys(str(lemma) for lemma in synset.lemmas())
                    ),
                    "glosses": [d for d in synset.definitions() if d],
                    "examples": [e for e in synset.examples() if e],
                }

                # The POS recorded for an ILI is the one seen first.
                bucket = by_ili.setdefault(ili, {"pos": pos_label})

                if lang not in bucket:
                    bucket[lang] = incoming
                    continue

                # Same ILI reached again for this language (e.g. via both
                # 'a' and 's'): merge, keeping first-seen order, no repeats.
                existing = bucket[lang]
                for field in ("lemmas", "glosses", "examples"):
                    existing[field] = list(
                        dict.fromkeys(existing[field] + incoming[field])
                    )

            print(f" {lang} {pos_label}: {covered:,} synsets with ILI")

    # Turn the grouped data into output records.
    print("\nBuilding records...")
    ilis = sorted(by_ili.keys())
    if sample_size:
        ilis = ilis[:sample_size]

    records: list[dict] = []
    for ili in ilis:
        data = by_ili[ili]
        translations: dict = {}
        glosses: dict = {}
        examples: dict = {}

        for lang, value in data.items():
            if lang == "pos":
                continue
            # Empty lists are omitted rather than written as [].
            if value["lemmas"]:
                translations[lang] = value["lemmas"]
            if value["glosses"]:
                glosses[lang] = value["glosses"]
            if value["examples"]:
                examples[lang] = value["examples"]

        records.append(
            {
                "source_id": f"ili:{ili}",
                "pos": data["pos"],
                "translations": translations,
                "glosses": glosses,
                "examples": examples,
            }
        )

    output_file = out / "omw.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    print(f"\nWrote {len(records):,} synsets → {output_file}")
    _print_coverage(records)
|
|
||||||
|
|
||||||
|
|
||||||
def _print_coverage(records: list[dict]) -> None:
    """Print the POS breakdown and per-language coverage of ``records``.

    Counts translations, glosses and examples for each language listed in
    SUPPORTED_LANGUAGE_CODES; languages outside that list are ignored.

    Args:
        records: Synset records with ``pos``, ``translations``, ``glosses``
            and ``examples`` keys, the latter three mapping language -> list.
    """
    lang_stats: dict[str, dict[str, int]] = {
        lang: {"translations": 0, "glosses": 0, "examples": 0}
        for lang in SUPPORTED_LANGUAGE_CODES
    }
    pos_stats: dict[str, int] = {}

    for r in records:
        pos = r["pos"]
        pos_stats[pos] = pos_stats.get(pos, 0) + 1

        for field in ("translations", "glosses", "examples"):
            for lang, items in r[field].items():
                if lang in lang_stats:
                    lang_stats[lang][field] += len(items)

    print("\nPOS breakdown:")
    for pos, count in sorted(pos_stats.items()):
        print(f" {pos}: {count:,}")

    print("\nCoverage per language:")
    total = len(records)
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        e = counts["examples"]
        # With no records there is nothing to average; report 0.0 instead of
        # dividing by zero.
        average = t / total if total else 0.0
        print(
            f" {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {average:.1f} translations/synset)"
        )
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the two options and run the extraction.
    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
    parser.add_argument(
        "--output-dir",
        default="stage-1-extract/output",
        help="Output directory for the combined omw.json file",
    )
    parser.add_argument(
        "--sample",
        action="store_true",
        # The limit applies to the sorted ILI list as a whole, not per language.
        help="Extract only the first 100 synsets (sorted by ILI) for inspection",
    )
    args = parser.parse_args()

    extract_all(output_dir=args.output_dir, sample=args.sample)
|
|
||||||
209
data-pipeline/stage-1-extract/scripts/extract.ts
Normal file
209
data-pipeline/stage-1-extract/scripts/extract.ts
Normal file
|
|
@ -0,0 +1,209 @@
|
||||||
|
import fs from "node:fs";
|
||||||
|
import path from "node:path";
|
||||||
|
import readline from "node:readline";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||||
|
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
type KaikkiTranslation = {
|
||||||
|
code?: string;
|
||||||
|
lang_code?: string;
|
||||||
|
word?: string;
|
||||||
|
sense?: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
type KaikkiSense = {
|
||||||
|
glosses?: string[];
|
||||||
|
examples?: { text?: string }[];
|
||||||
|
translations?: KaikkiTranslation[];
|
||||||
|
};
|
||||||
|
|
||||||
|
type KaikkiEntry = { word?: string; pos?: string; senses?: KaikkiSense[] };
|
||||||
|
|
||||||
|
export type ExtractedSense = {
|
||||||
|
headword: string;
|
||||||
|
pos: SupportedPos;
|
||||||
|
sense_index: number;
|
||||||
|
gloss: string | null;
|
||||||
|
examples: string[];
|
||||||
|
translations: {
|
||||||
|
target_lang: SupportedLanguageCode;
|
||||||
|
word: string;
|
||||||
|
sense_hint: string | null;
|
||||||
|
}[];
|
||||||
|
};
|
||||||
|
|
||||||
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
|
const PATHS = {
|
||||||
|
source: path.resolve(
|
||||||
|
__dirname,
|
||||||
|
"../sources/kaikki.org-dictionary-English.jsonl",
|
||||||
|
),
|
||||||
|
output: path.resolve(__dirname, "../output/en.json"),
|
||||||
|
};
|
||||||
|
|
||||||
|
const POS_MAP: Record<string, SupportedPos> = {
|
||||||
|
noun: "noun",
|
||||||
|
verb: "verb",
|
||||||
|
adj: "adjective",
|
||||||
|
adv: "adverb",
|
||||||
|
};
|
||||||
|
|
||||||
|
const SUPPORTED_LANG_SET = new Set<string>(SUPPORTED_LANGUAGE_CODES);
|
||||||
|
|
||||||
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Maps a Kaikki `pos` value onto one of the supported part-of-speech labels.
 *
 * Only keys defined directly on POS_MAP are accepted. A plain property read
 * would also resolve names inherited from Object.prototype (for example
 * "constructor"), and `??` would let that non-null value through as a POS.
 *
 * @param kaikkiPos - The raw `pos` string of a Kaikki entry.
 * @returns The mapped POS, or null when the value is not in POS_MAP.
 */
function mapPos(kaikkiPos: string): SupportedPos | null {
  if (!Object.prototype.hasOwnProperty.call(POS_MAP, kaikkiPos)) return null;
  return POS_MAP[kaikkiPos] ?? null;
}
|
||||||
|
|
||||||
|
/**
 * Tells whether a gloss marks its sense as an abbreviation.
 *
 * @param gloss - Gloss text to inspect.
 * @returns True when the gloss begins with "abbreviation of", ignoring case.
 */
function isAbbreviation(gloss: string): boolean {
  const lowered = gloss.toLowerCase();
  return lowered.indexOf("abbreviation of") === 0;
}
|
||||||
|
|
||||||
|
/**
 * Collects the translations of one sense that target a supported language.
 *
 * A translation is kept when its language code (`code`, falling back to
 * `lang_code`) is in SUPPORTED_LANG_SET and is not "en", and its word is
 * non-empty after trimming. Repeated (language, word) pairs are dropped,
 * keeping the first occurrence.
 *
 * @param sense - The Kaikki sense whose `translations` are read.
 * @returns The kept translations, in their original order.
 */
function extractTranslations(
  sense: KaikkiSense,
): ExtractedSense["translations"] {
  const seen = new Set<string>();
  const result: ExtractedSense["translations"] = [];

  for (const t of sense.translations ?? []) {
    const code = t.code ?? t.lang_code;
    if (!code || !SUPPORTED_LANG_SET.has(code) || code === "en") continue;

    const word = t.word?.trim();
    if (!word) continue;

    const key = `${code}:${word}`;
    if (seen.has(key)) continue;
    seen.add(key);

    // A blank hint carries no information; store null rather than "" so
    // "no hint" has a single representation.
    const hint = t.sense?.trim();

    result.push({
      target_lang: code as SupportedLanguageCode,
      word,
      sense_hint: hint ? hint : null,
    });
  }

  return result;
}
|
||||||
|
|
||||||
|
/**
 * Gathers the usable example sentences of a sense.
 *
 * @param sense - The Kaikki sense whose `examples` are read.
 * @returns Each example's trimmed text, leaving out examples whose text is
 *   missing or empty after trimming.
 */
function extractExamples(sense: KaikkiSense): string[] {
  const texts: string[] = [];

  for (const example of sense.examples ?? []) {
    const text = example.text?.trim();
    if (text) {
      texts.push(text);
    }
  }

  return texts;
}
|
||||||
|
|
||||||
|
/**
 * Converts one Kaikki entry into the senses worth keeping.
 *
 * The entry is dropped entirely when its POS does not map to a supported
 * value or its headword is blank. Within an entry, a sense is skipped when
 * its first gloss is an abbreviation or when it has no translations in the
 * supported languages.
 *
 * @param entry - A parsed Kaikki JSONL record.
 * @returns The extracted senses. `sense_index` counts only the senses that
 *   were kept, starting at 0.
 */
function processEntry(entry: KaikkiEntry): ExtractedSense[] {
  const pos = mapPos(entry.pos ?? "");
  if (!pos) return [];

  const headword = entry.word?.trim();
  if (!headword) return [];

  const results: ExtractedSense[] = [];
  let senseIndex = 0;

  for (const sense of entry.senses ?? []) {
    // `||` on purpose: a first gloss that is empty after trimming is treated
    // as missing and stored as null, not as "".
    const gloss = sense.glosses?.[0]?.trim() || null;

    // Skip abbreviation senses
    if (gloss && isAbbreviation(gloss)) continue;

    const translations = extractTranslations(sense);

    // Skip senses with no translations in our supported languages
    if (translations.length === 0) continue;

    results.push({
      headword,
      pos,
      sense_index: senseIndex++,
      gloss,
      examples: extractExamples(sense),
      translations,
    });
  }

  return results;
}
|
||||||
|
|
||||||
|
// ── Main ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Streams the Kaikki JSONL source and writes the extracted senses as JSON.
 *
 * Reads PATHS.source line by line, parses each non-blank line, runs it
 * through processEntry, and writes every resulting sense to PATHS.output as
 * one pretty-printed JSON array. Lines that fail to parse are reported and
 * skipped. A summary is printed at the end.
 *
 * @param sampleLimit - Optional cap on the number of entries that yield at
 *   least one sense; reading stops once it is reached. Omitted (or 0) means
 *   no cap.
 */
async function extract(sampleLimit?: number): Promise<void> {
  console.log("Extracting Kaikki English data...");
  console.log(` Source: ${PATHS.source}`);

  if (sampleLimit) {
    console.log(` Sample mode: ${sampleLimit} entries`);
  }

  await fs.promises.mkdir(path.dirname(PATHS.output), { recursive: true });

  const fileStream = fs.createReadStream(PATHS.source);
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity,
  });

  const senses: ExtractedSense[] = [];
  // Physical line position in the file, used only for diagnostics.
  let lineNumber = 0;
  // Non-blank lines actually handed to the parser.
  let linesRead = 0;
  let entriesProcessed = 0;
  let entriesSkipped = 0;

  try {
    for await (const line of rl) {
      lineNumber++;

      if (!line.trim()) continue;
      if (sampleLimit && entriesProcessed >= sampleLimit) break;

      linesRead++;

      let entry: KaikkiEntry;
      try {
        entry = JSON.parse(line) as KaikkiEntry;
      } catch {
        // Report the real file line so the bad record can be located even
        // when blank lines precede it.
        console.warn(` Warning: failed to parse line ${lineNumber}, skipping`);
        continue;
      }

      const extracted = processEntry(entry);

      if (extracted.length === 0) {
        entriesSkipped++;
        continue;
      }

      senses.push(...extracted);
      entriesProcessed++;

      if (entriesProcessed % 10_000 === 0) {
        console.log(
          ` Processed ${entriesProcessed.toLocaleString()} entries...`,
        );
      }
    }
  } finally {
    // Release the file handle explicitly: leaving the loop early (sample
    // limit or an error) would otherwise leave the read stream open.
    rl.close();
    fileStream.destroy();
  }

  await fs.promises.writeFile(
    PATHS.output,
    JSON.stringify(senses, null, 2),
    "utf-8",
  );

  console.log(`\nExtraction complete:`);
  console.log(` Lines read: ${linesRead.toLocaleString()}`);
  console.log(` Entries processed: ${entriesProcessed.toLocaleString()}`);
  console.log(` Entries skipped: ${entriesSkipped.toLocaleString()}`);
  console.log(` Senses extracted: ${senses.length.toLocaleString()}`);
  console.log(` Output: ${PATHS.output}`);
}
|
||||||
|
|
||||||
|
/**
 * Script entry point: runs the extraction with a fixed sample limit.
 */
async function main(): Promise<void> {
  // Hardcoded sample limit for initial testing — remove for full extraction
  const sampleLimit = 500;
  await extract(sampleLimit);
}

// Any failure is printed and turned into a non-zero exit status.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|
||||||
|
|
@ -1,227 +0,0 @@
|
||||||
import fs from "node:fs/promises";
|
|
||||||
import path from "node:path";
|
|
||||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
|
||||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
|
||||||
|
|
||||||
// ── Types ────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
type OmwExample = { text: string; source: "omw" };
|
|
||||||
|
|
||||||
type CefrExample = { text: string; source: "cefr" };
|
|
||||||
|
|
||||||
type Example = OmwExample | CefrExample;
|
|
||||||
|
|
||||||
type OmwRecord = {
|
|
||||||
source_id: string;
|
|
||||||
pos: SupportedPos;
|
|
||||||
translations: Partial<Record<SupportedLanguageCode, string[]>>;
|
|
||||||
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
|
|
||||||
examples: Partial<Record<SupportedLanguageCode, string[]>>;
|
|
||||||
};
|
|
||||||
|
|
||||||
type AnnotatedRecord = {
|
|
||||||
source_id: string;
|
|
||||||
pos: SupportedPos;
|
|
||||||
translations: Partial<Record<SupportedLanguageCode, string[]>>;
|
|
||||||
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
|
|
||||||
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
|
|
||||||
votes: Partial<
|
|
||||||
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
|
|
||||||
>;
|
|
||||||
};
|
|
||||||
|
|
||||||
type CefrSourceEntry = {
|
|
||||||
word: string;
|
|
||||||
pos: string;
|
|
||||||
cefr_level: string;
|
|
||||||
example_sentence_native?: string;
|
|
||||||
};
|
|
||||||
|
|
||||||
type ConflictEntry = {
|
|
||||||
word: string;
|
|
||||||
pos: string;
|
|
||||||
language: SupportedLanguageCode;
|
|
||||||
levels: string[];
|
|
||||||
};
|
|
||||||
|
|
||||||
// ── Constants ─────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
const POS_NORMALIZE: Record<string, SupportedPos> = {
|
|
||||||
noun: "noun",
|
|
||||||
n: "noun",
|
|
||||||
nom: "noun", // French
|
|
||||||
verb: "verb",
|
|
||||||
verbs: "verb",
|
|
||||||
v: "verb",
|
|
||||||
v1: "verb",
|
|
||||||
adjective: "adjective",
|
|
||||||
adjektiv: "adjective", // German
|
|
||||||
adj: "adjective",
|
|
||||||
adverb: "adverb",
|
|
||||||
adverbs: "adverb",
|
|
||||||
adv: "adverb",
|
|
||||||
};
|
|
||||||
|
|
||||||
const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
|
|
||||||
|
|
||||||
const PATHS = {
|
|
||||||
omw: "stage-1-extract/output/omw.json",
|
|
||||||
cefrDir: "stage-2-annotate/sources/cefr",
|
|
||||||
outputDir: "stage-2-annotate/output",
|
|
||||||
};
|
|
||||||
|
|
||||||
// ── CEFR source loading ───────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
type CefrIndex = Map<string, { level: string; example?: string }>;
|
|
||||||
|
|
||||||
/**
 * Loads the CEFR word list for one language and indexes it by word and POS.
 *
 * Reads `<lang>.json` from PATHS.cefrDir. Entries whose POS is not in
 * POS_NORMALIZE or whose level is not in CEFR_LEVELS are ignored. A
 * (word, pos) pair listed with more than one level is reported as a conflict
 * and left out of the index. For pairs listed several times with the same
 * level, the last entry wins.
 *
 * @param lang - Language code selecting the source file.
 * @returns `index`, keyed by `"<lowercased trimmed word>|<pos>"`, and
 *   `conflicts`, one item per pair with several levels.
 */
async function loadCefrSource(
  lang: SupportedLanguageCode,
): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");
  const entries = JSON.parse(raw) as CefrSourceEntry[];

  // Single normalisation pass: keep the usable entries with their key, and
  // group the levels seen per key. Word and POS are stored alongside the
  // levels so they never have to be parsed back out of the key (a word may
  // itself contain "|").
  const usable: { key: string; entry: CefrSourceEntry }[] = [];
  const groups = new Map<
    string,
    { word: string; pos: SupportedPos; levels: Set<string> }
  >();

  for (const entry of entries) {
    const rawPos = entry.pos.toLowerCase().trim();
    // Own keys only, so inherited names such as "constructor" are rejected.
    if (!Object.prototype.hasOwnProperty.call(POS_NORMALIZE, rawPos)) continue;
    const pos = POS_NORMALIZE[rawPos];
    if (!pos) continue;
    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;

    const word = entry.word.toLowerCase().trim();
    const key = `${word}|${pos}`;

    let group = groups.get(key);
    if (!group) {
      group = { word, pos, levels: new Set() };
      groups.set(key, group);
    }
    group.levels.add(entry.cefr_level);
    usable.push({ key, entry });
  }

  // Pairs with more than one level are conflicts.
  const conflicts: ConflictEntry[] = [];
  const conflictKeys = new Set<string>();
  for (const [key, group] of groups) {
    if (group.levels.size > 1) {
      conflictKeys.add(key);
      conflicts.push({
        word: group.word,
        pos: group.pos,
        language: lang,
        levels: [...group.levels],
      });
    }
  }

  // Build the index from the non-conflicting entries.
  const index: CefrIndex = new Map();
  for (const { key, entry } of usable) {
    if (conflictKeys.has(key)) continue;

    index.set(key, {
      level: entry.cefr_level,
      ...(entry.example_sentence_native
        ? { example: entry.example_sentence_native }
        : {}),
    });
  }

  return { index, conflicts };
}
|
|
||||||
|
|
||||||
// ── Annotation ────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Stage 2 entry point: annotates the OMW extract with CEFR source votes.
 *
 * Steps, in order:
 *  1. Read and parse the OMW extract at `PATHS.omw`.
 *  2. Call `loadCefrSource` once per supported language, keeping each
 *     returned index and accumulating every returned conflict.
 *  3. Write the accumulated conflicts to `conflicts.json` in `PATHS.outputDir`
 *     (the directory is created if needed).
 *  4. For each language, write `<lang>.json` containing one annotated record
 *     per OMW record. Each file carries votes only for its own language.
 *
 * Within one record, a translation string that appears more than once is
 * processed only once, and a CEFR example is appended at most once per
 * normalised index key. Previously a repeated translation (or two variants
 * differing only in case/whitespace) duplicated the example and inflated the
 * logged match count.
 *
 * @returns Resolves once every output file has been written.
 */
async function annotate(): Promise<void> {
  // Load OMW records
  console.log("Reading OMW extract...");
  const raw = await fs.readFile(PATHS.omw, "utf-8");
  const omwRecords = JSON.parse(raw) as OmwRecord[];
  console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`);

  // Load CEFR sources for all languages
  console.log("\nLoading CEFR source files...");
  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
  const allConflicts: ConflictEntry[] = [];

  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const { index, conflicts } = await loadCefrSource(lang);
    cefrIndexes.set(lang, index);
    allConflicts.push(...conflicts);
    console.log(
      ` ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`,
    );
  }

  // Write conflicts file
  await fs.mkdir(PATHS.outputDir, { recursive: true });
  await fs.writeFile(
    path.join(PATHS.outputDir, "conflicts.json"),
    JSON.stringify(allConflicts, null, 2),
    "utf-8",
  );
  console.log(
    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  // Annotate and write one file per language
  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const index = cefrIndexes.get(lang);
    // Every language was indexed in the loop above; this guard only narrows the type.
    if (!index) continue;

    const records: AnnotatedRecord[] = [];
    let matched = 0;

    for (const record of omwRecords) {
      const annotated: AnnotatedRecord = {
        source_id: record.source_id,
        pos: record.pos,
        translations: record.translations,
        glosses: record.glosses,
        examples: {},
        votes: {},
      };

      // Convert OMW examples to typed format
      for (const [l, exList] of Object.entries(record.examples)) {
        annotated.examples[l as SupportedLanguageCode] = exList.map((text) => ({
          text,
          source: "omw" as const,
        }));
      }

      // Match translations for this language against CEFR index
      const seenWords = new Set<string>();
      const exampleKeys = new Set<string>();

      for (const word of record.translations[lang] ?? []) {
        // An exact repeat would overwrite the same vote and re-add the same example.
        if (seenWords.has(word)) continue;
        seenWords.add(word);

        const key = `${word.toLowerCase().trim()}|${record.pos}`;
        const cefrEntry = index.get(key);
        if (!cefrEntry) continue;

        matched++;

        // Add CEFR vote (keyed by the translation exactly as OMW spells it)
        let langVotes = annotated.votes[lang];
        if (!langVotes) {
          langVotes = {};
          annotated.votes[lang] = langVotes;
        }
        langVotes[word] = { cefr_source: cefrEntry.level };

        // Add native example if present, once per index key
        if (cefrEntry.example && !exampleKeys.has(key)) {
          exampleKeys.add(key);
          let langExamples = annotated.examples[lang];
          if (!langExamples) {
            langExamples = [];
            annotated.examples[lang] = langExamples;
          }
          langExamples.push({ text: cefrEntry.example, source: "cefr" as const });
        }
      }

      records.push(annotated);
    }

    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
    console.log(
      ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
    );
  }
}
|
|
||||||
|
|
||||||
// ── Main ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
// Run the annotation; on any rejection, print the reason and exit with status 1.
annotate().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
|
||||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
170
data-pipeline/tests/fixtures/annotated.fixture.json
vendored
170
data-pipeline/tests/fixtures/annotated.fixture.json
vendored
|
|
@ -1,170 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"_fixture": "noun_with_cefr_vote",
|
|
||||||
"source_id": "ili:i100955",
|
|
||||||
"pos": "noun",
|
|
||||||
"translations": { "en": ["grain"], "de": ["Korn", "Kornbrand"] },
|
|
||||||
"glosses": { "en": ["a cereal grass"], "de": ["ein Getreidegras"] },
|
|
||||||
"examples": {
|
|
||||||
"en": [
|
|
||||||
{ "text": "wheat is a grain that is grown in Kansas", "source": "omw" }
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"votes": { "en": { "grain": { "cefr_source": "B1" } } }
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"_fixture": "verb_no_votes_no_translations",
|
|
||||||
"source_id": "ili:i21779",
|
|
||||||
"pos": "verb",
|
|
||||||
"translations": { "en": ["respire"] },
|
|
||||||
"glosses": {
|
|
||||||
"en": [
|
|
||||||
"undergo the biomedical and metabolic processes of respiration by taking up oxygen and producing carbon monoxide"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"examples": {},
|
|
||||||
"votes": {}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"_fixture": "verb_with_cefr_vote_all_languages",
|
|
||||||
"source_id": "ili:i21778",
|
|
||||||
"pos": "verb",
|
|
||||||
"translations": {
|
|
||||||
"en": ["breathe", "take a breath", "respire", "suspire"],
|
|
||||||
"it": ["respirare"],
|
|
||||||
"es": ["aspirar", "respirar"],
|
|
||||||
"de": ["Luft holen", "hauchen", "Luft bekommen", "Luft kriegen", "atmen"],
|
|
||||||
"fr": ["inspirer", "respirer"]
|
|
||||||
},
|
|
||||||
"glosses": {
|
|
||||||
"en": ["draw air into, and expel out of, the lungs"],
|
|
||||||
"de": ["Luft in die Lunge saugen und aus ihr ausstoßen"]
|
|
||||||
},
|
|
||||||
"examples": {
|
|
||||||
"en": [
|
|
||||||
{
|
|
||||||
"text": "I can breathe better when the air is clean",
|
|
||||||
"source": "omw"
|
|
||||||
},
|
|
||||||
{ "text": "The patient is respiring", "source": "omw" }
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"votes": { "en": { "breathe": { "cefr_source": "A1" } } }
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"_fixture": "adjective_all_languages_multiple_translations",
|
|
||||||
"source_id": "ili:i10007",
|
|
||||||
"pos": "adjective",
|
|
||||||
"translations": {
|
|
||||||
"en": ["possible"],
|
|
||||||
"it": [
|
|
||||||
"attuabile",
|
|
||||||
"effettuabile",
|
|
||||||
"eseguibile",
|
|
||||||
"fattibile",
|
|
||||||
"operabile",
|
|
||||||
"possibile",
|
|
||||||
"producibile",
|
|
||||||
"realizzabile"
|
|
||||||
],
|
|
||||||
"es": ["posible"],
|
|
||||||
"de": [
|
|
||||||
"möglich",
|
|
||||||
"denkbar",
|
|
||||||
"eventuell",
|
|
||||||
"möglicherweise",
|
|
||||||
"allfällig",
|
|
||||||
"etwaig",
|
|
||||||
"gegebenenfalls",
|
|
||||||
"eventuell"
|
|
||||||
],
|
|
||||||
"fr": ["possible", "éventuel"]
|
|
||||||
},
|
|
||||||
"glosses": {
|
|
||||||
"en": ["capable of happening or existing"],
|
|
||||||
"de": ["in der Lage, zu geschehen oder zu existieren"]
|
|
||||||
},
|
|
||||||
"examples": {
|
|
||||||
"en": [
|
|
||||||
{ "text": "a breakthrough may be possible next year", "source": "omw" },
|
|
||||||
{ "text": "anything is possible", "source": "omw" },
|
|
||||||
{ "text": "warned of possible consequences", "source": "omw" }
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"votes": { "en": { "possible": { "cefr_source": "A2" } } }
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"_fixture": "adjective_multiple_de_votes_cefr_examples",
|
|
||||||
"source_id": "ili:i10000",
|
|
||||||
"pos": "adjective",
|
|
||||||
"translations": {
|
|
||||||
"en": ["negative"],
|
|
||||||
"de": [
|
|
||||||
"dürftig",
|
|
||||||
"zu wünschen übrig lassen",
|
|
||||||
"schlecht",
|
|
||||||
"widrig",
|
|
||||||
"ungut",
|
|
||||||
"lausig",
|
|
||||||
"negativ",
|
|
||||||
"von Nachteil",
|
|
||||||
"schädlich",
|
|
||||||
"nachteilig",
|
|
||||||
"ungünstig"
|
|
||||||
],
|
|
||||||
"fr": ["négatif", "strictement négatif"]
|
|
||||||
},
|
|
||||||
"glosses": { "en": ["less than zero"], "de": ["kleiner als Null"] },
|
|
||||||
"examples": {
|
|
||||||
"en": [{ "text": "a negative number", "source": "omw" }],
|
|
||||||
"de": [
|
|
||||||
{ "text": "Die Beweise waren dürftig.", "source": "cefr" },
|
|
||||||
{ "text": "Das Wetter ist heute schlecht.", "source": "cefr" },
|
|
||||||
{
|
|
||||||
"text": "Trotz widriger Umstände haben sie es geschafft.",
|
|
||||||
"source": "cefr"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"text": "Er hatte ein ungutes Gefühl bei der Sache.",
|
|
||||||
"source": "cefr"
|
|
||||||
},
|
|
||||||
{ "text": "Er hat eine sehr negative Einstellung.", "source": "cefr" },
|
|
||||||
{
|
|
||||||
"text": "Rauchen ist schädlich für die Gesundheit.",
|
|
||||||
"source": "cefr"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"text": "Diese Entscheidung könnte nachteilig sein.",
|
|
||||||
"source": "cefr"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"text": "Das Wetter ist heute ungünstig für einen Ausflug.",
|
|
||||||
"source": "cefr"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"votes": {
|
|
||||||
"de": {
|
|
||||||
"dürftig": { "cefr_source": "C1" },
|
|
||||||
"schlecht": { "cefr_source": "A1" },
|
|
||||||
"widrig": { "cefr_source": "C1" },
|
|
||||||
"ungut": { "cefr_source": "B2" },
|
|
||||||
"negativ": { "cefr_source": "A2" },
|
|
||||||
"schädlich": { "cefr_source": "B1" },
|
|
||||||
"nachteilig": { "cefr_source": "B1" },
|
|
||||||
"ungünstig": { "cefr_source": "B2" }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"_fixture": "adverb_no_votes",
|
|
||||||
"source_id": "ili:i18157",
|
|
||||||
"pos": "adverb",
|
|
||||||
"translations": { "en": ["a cappella"], "es": ["a capella"] },
|
|
||||||
"glosses": { "en": ["without musical accompaniment"] },
|
|
||||||
"examples": {
|
|
||||||
"en": [{ "text": "they performed a cappella", "source": "omw" }]
|
|
||||||
},
|
|
||||||
"votes": {}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
[
|
|
||||||
{ "word": "macht", "pos": "noun", "language": "de", "levels": ["A2", "B1"] },
|
|
||||||
{ "word": "bleiche", "pos": "noun", "language": "de", "levels": ["B2", "B1"] }
|
|
||||||
]
|
|
||||||
|
|
@ -1,237 +0,0 @@
|
||||||
import fs from "node:fs/promises";
|
|
||||||
import path from "node:path";
|
|
||||||
import { describe, it, expect, beforeAll } from "vitest";
|
|
||||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
|
||||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
|
||||||
|
|
||||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/** One example sentence together with the dataset it came from. */
type Example = { text: string; source: "omw" | "cefr" };

/** A mapping that may hold a value for any subset of the supported languages. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/** Shape this test assumes for each record in the stage 2 `<lang>.json` files. */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  /** Votes keyed by language, then by translation string. */
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
};
|
|
||||||
|
|
||||||
// ── Paths ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
// All three paths are resolved against the current working directory.
const DB_PATH = path.resolve("db", "pipeline.db");
const OMW_PATH = path.resolve("stage-1-extract", "output", "omw.json");
const ANNOTATED_DIR = path.resolve("stage-2-annotate", "output");
|
|
||||||
|
|
||||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Reports whether the database file can be accessed.
 *
 * @returns `true` when `fs.access(DB_PATH)` succeeds, `false` when it rejects
 *          for any reason.
 */
async function dbExists(): Promise<boolean> {
  return fs.access(DB_PATH).then(
    () => true,
    () => false,
  );
}
|
|
||||||
|
|
||||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Checks the imported SQLite database against the stage 1 and stage 2 files.
 *
 * When the database file is absent, `beforeAll` leaves `db` unset: the first
 * test then fails with a hint, and every other test returns early.
 */
describe("pipeline.db — import validation", () => {
  let db: import("better-sqlite3").Database;
  let expectedSynsetCount: number;
  let expectedCefrVoteCount: number;

  /** Reads a UTF-8 file and parses it as JSON (cast to T, not validated). */
  const readJson = async <T>(file: string): Promise<T> =>
    JSON.parse(await fs.readFile(file, "utf-8")) as T;

  /** Runs a query on the open database and returns every row. */
  const selectAll = <Row>(sql: string): Row[] => db.prepare(sql).all() as Row[];

  /** Returns the row count of one table. */
  const countRows = (table: string): number => {
    const row = db.prepare(`SELECT COUNT(*) as count FROM ${table}`).get() as {
      count: number;
    };
    return row.count;
  };

  /** Fails the test, listing every message, unless `errors` is empty. */
  const expectNoErrors = (errors: string[]): void => {
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  };

  beforeAll(async () => {
    if (!(await dbExists())) return;

    const { default: Database } = await import("better-sqlite3");
    db = new Database(DB_PATH, { readonly: true });
    db.pragma("foreign_keys = ON");

    // Count expected synsets from omw.json
    const omwRecords = await readJson<unknown[]>(OMW_PATH);
    expectedSynsetCount = omwRecords.length;

    // Count expected CEFR votes from stage 2 annotated files.
    // Merge all language files the same way the import script does —
    // use en.json as base and merge votes from the other language files.
    const merged = new Map<string, AnnotatedRecord>();
    const english = await readJson<AnnotatedRecord[]>(
      path.join(ANNOTATED_DIR, "en.json"),
    );
    for (const record of english) {
      merged.set(record.source_id, record);
    }

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      if (lang === "en") continue;

      const records = await readJson<AnnotatedRecord[]>(
        path.join(ANNOTATED_DIR, `${lang}.json`),
      );
      for (const record of records) {
        const target = merged.get(record.source_id);
        if (!target) continue;

        for (const [code, langVotes] of Object.entries(record.votes)) {
          const voteLang = code as SupportedLanguageCode;
          target.votes[voteLang] = Object.assign(
            target.votes[voteLang] ?? {},
            langVotes,
          );
        }
      }
    }

    let voteTotal = 0;
    merged.forEach((record) => {
      for (const langVotes of Object.values(record.votes)) {
        voteTotal += Object.keys(langVotes ?? {}).length;
      }
    });
    expectedCefrVoteCount = voteTotal;
  }, 120_000);

  it("pipeline.db exists — skipping all tests if not", async () => {
    const exists = await dbExists();
    if (!exists) {
      console.warn(
        "\n pipeline.db not found — run pnpm db:init and pnpm db:import first\n",
      );
    }
    expect(exists).toBe(true);
  });

  it("synsets count matches omw.json", () => {
    if (!db) return;
    expect(countRows("synsets")).toBe(expectedSynsetCount);
  });

  it("every synset has at least one translation", () => {
    if (!db) return;
    const orphans = selectAll<{ source_id: string }>(
      `
      SELECT s.source_id
      FROM synsets s
      LEFT JOIN translations t ON t.source_id = s.source_id
      WHERE t.id IS NULL
      `,
    );
    expectNoErrors(orphans.map((r) => `${r.source_id}: no translations`));
  });

  it("every translation belongs to a valid synset", () => {
    if (!db) return;
    const orphans = selectAll<{ id: number; source_id: string }>(
      `
      SELECT t.id, t.source_id
      FROM translations t
      LEFT JOIN synsets s ON s.source_id = t.source_id
      WHERE s.source_id IS NULL
      `,
    );
    expectNoErrors(
      orphans.map(
        (r) => `translation ${r.id}: references missing synset ${r.source_id}`,
      ),
    );
  });

  it("every cefr_source_vote references a valid translation", () => {
    if (!db) return;
    const orphans = selectAll<{ id: number; translation_id: number }>(
      `
      SELECT v.id, v.translation_id
      FROM cefr_source_votes v
      LEFT JOIN translations t ON t.id = v.translation_id
      WHERE t.id IS NULL
      `,
    );
    expectNoErrors(
      orphans.map(
        (r) =>
          `cefr_vote ${r.id}: references missing translation ${r.translation_id}`,
      ),
    );
  });

  it("cefr_source_votes count matches stage 2 annotated output", () => {
    if (!db) return;
    expect(countRows("cefr_source_votes")).toBe(expectedCefrVoteCount);
  });

  it("every example has a valid source", () => {
    if (!db) return;
    const invalid = selectAll<{
      source_id: string;
      language: string;
      source: string;
    }>(
      `
      SELECT source_id, language, source
      FROM examples
      WHERE source NOT IN ('omw', 'cefr')
      `,
    );
    expectNoErrors(
      invalid.map(
        (r) =>
          `${r.source_id} (${r.language}): invalid example source "${r.source}"`,
      ),
    );
  });

  it("every example belongs to a valid synset", () => {
    if (!db) return;
    const orphans = selectAll<{ id: number; source_id: string }>(
      `
      SELECT e.id, e.source_id
      FROM examples e
      LEFT JOIN synsets s ON s.source_id = e.source_id
      WHERE s.source_id IS NULL
      `,
    );
    expectNoErrors(
      orphans.map(
        (r) => `example ${r.id}: references missing synset ${r.source_id}`,
      ),
    );
  });

  it("every gloss belongs to a valid synset", () => {
    if (!db) return;
    const orphans = selectAll<{ id: number; source_id: string }>(
      `
      SELECT g.id, g.source_id
      FROM glosses g
      LEFT JOIN synsets s ON s.source_id = g.source_id
      WHERE s.source_id IS NULL
      `,
    );
    expectNoErrors(
      orphans.map(
        (r) => `gloss ${r.id}: references missing synset ${r.source_id}`,
      ),
    );
  });
});
|
|
||||||
|
|
@ -1,166 +0,0 @@
|
||||||
import fs from "node:fs/promises";
|
|
||||||
import path from "node:path";
|
|
||||||
import { describe, it, expect } from "vitest";
|
|
||||||
import { SUPPORTED_POS, SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
|
||||||
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
|
|
||||||
|
|
||||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/** String lists that may be present for any subset of the supported languages. */
type StringsByLanguage = Partial<Record<SupportedLanguageCode, string[]>>;

/** Shape this test assumes for each entry of the stage 1 `omw.json` array. */
type OmwRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: StringsByLanguage;
  glosses: StringsByLanguage;
  examples: StringsByLanguage;
};
|
|
||||||
|
|
||||||
// ── Paths ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
// Stage 1 extract under test. The relative path is resolved against the
// current working directory — presumably the data-pipeline package root; TODO confirm.
const OMW_PATH = path.resolve("stage-1-extract/output/omw.json");
|
|
||||||
|
|
||||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Checks the format of a synset identifier.
 *
 * @param id Candidate identifier.
 * @returns `true` only when the whole string is `ili:i` followed by one or
 *          more decimal digits.
 */
function isValidSourceId(id: string): boolean {
  const pattern = /^ili:i\d+$/;
  return pattern.exec(id) !== null;
}
|
|
||||||
|
|
||||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Structural checks on the stage 1 extract at `OMW_PATH`.
 *
 * The extract is read and parsed once and shared by every test; previously
 * each test re-read and re-parsed the whole file. If reading or parsing fails,
 * every test fails with that same error.
 */
describe("stage 1 — omw.json validation", () => {
  let pending: Promise<OmwRecord[]> | undefined;

  /** Returns the parsed extract, starting the read on first use only. */
  const loadRecords = (): Promise<OmwRecord[]> => {
    if (!pending) {
      pending = fs
        .readFile(OMW_PATH, "utf-8")
        .then((raw) => JSON.parse(raw) as OmwRecord[]);
    }
    return pending;
  };

  /** Fails the test, listing every message, unless `errors` is empty. */
  const expectNoErrors = (errors: string[]): void => {
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  };

  it("file exists and is valid JSON", async () => {
    const records = await loadRecords();
    expect(records).toBeDefined();
  });

  it("is a non-empty array", async () => {
    const records = await loadRecords();
    expect(Array.isArray(records)).toBe(true);
    expect(records.length).toBeGreaterThan(0);
  });

  it("every record has required fields", async () => {
    const records = await loadRecords();
    const errors: string[] = [];

    for (const record of records) {
      if (!record.source_id) {
        // Without an id the remaining messages could not name the record.
        errors.push(`missing source_id`);
        continue;
      }
      if (!record.pos) errors.push(`${record.source_id}: missing pos`);
      if (!record.translations)
        errors.push(`${record.source_id}: missing translations`);
      if (!record.glosses) errors.push(`${record.source_id}: missing glosses`);
      if (!record.examples)
        errors.push(`${record.source_id}: missing examples`);
    }

    expectNoErrors(errors);
  });

  it("every source_id matches ili:i{number} pattern", async () => {
    const records = await loadRecords();
    const errors: string[] = [];

    for (const record of records) {
      if (!isValidSourceId(record.source_id)) {
        errors.push(`invalid source_id: ${record.source_id}`);
      }
    }

    expectNoErrors(errors);
  });

  it("every source_id is unique", async () => {
    const records = await loadRecords();
    const seen = new Set<string>();
    const errors: string[] = [];

    for (const record of records) {
      if (seen.has(record.source_id)) {
        errors.push(`duplicate source_id: ${record.source_id}`);
      }
      seen.add(record.source_id);
    }

    expectNoErrors(errors);
  });

  it("every pos is a valid supported value", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);

    for (const record of records) {
      if (!validPos.has(record.pos)) {
        errors.push(`${record.source_id}: invalid pos "${record.pos}"`);
      }
    }

    expectNoErrors(errors);
  });

  it("every record has at least one translation in at least one language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);

    for (const record of records) {
      // A record with no `translations` object is reported, not a TypeError.
      const translations = record.translations ?? {};
      const langs = Object.keys(translations) as SupportedLanguageCode[];

      if (langs.length === 0) {
        errors.push(`${record.source_id}: no translations`);
        continue;
      }

      for (const lang of langs) {
        if (!validLangs.has(lang)) {
          errors.push(`${record.source_id}: unsupported language "${lang}"`);
        }
        const words = translations[lang] ?? [];
        if (words.length === 0) {
          errors.push(`${record.source_id}: empty translations for "${lang}"`);
        }
      }
    }

    expectNoErrors(errors);
  });

  it("no duplicate translations within a single synset and language", async () => {
    const records = await loadRecords();
    const errors: string[] = [];

    for (const record of records) {
      for (const [lang, words] of Object.entries(record.translations ?? {})) {
        const seen = new Set<string>();
        for (const word of words) {
          if (seen.has(word)) {
            errors.push(
              `${record.source_id} (${lang}): duplicate translation "${word}"`,
            );
          }
          seen.add(word);
        }
      }
    }

    expectNoErrors(errors);
  });
});
|
|
||||||
|
|
@ -1,218 +0,0 @@
|
||||||
import fs from "node:fs/promises";
|
|
||||||
import path from "node:path";
|
|
||||||
import { describe, it, expect, beforeAll } from "vitest";
|
|
||||||
import {
|
|
||||||
SUPPORTED_POS,
|
|
||||||
SUPPORTED_LANGUAGE_CODES,
|
|
||||||
CEFR_LEVELS,
|
|
||||||
} from "@lila/shared";
|
|
||||||
import type { SupportedPos, SupportedLanguageCode } from "@lila/shared";
|
|
||||||
|
|
||||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/** One example sentence together with the dataset it came from. */
type Example = { text: string; source: "omw" | "cefr" };

/** A mapping that may hold a value for any subset of the supported languages. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/** Shape this test assumes for each record in a stage 2 `<lang>.json` file. */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  /** Votes keyed by language, then by translation string. */
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
};

/** Shape this test assumes for each entry of `conflicts.json`. */
type ConflictEntry = {
  word: string;
  pos: string;
  language: SupportedLanguageCode;
  levels: string[];
};
|
|
||||||
|
|
||||||
// ── Paths ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
// Directory holding the stage 2 output files read below. The relative path is
// resolved against the current working directory — presumably the data-pipeline package root; TODO confirm.
const OUTPUT_DIR = path.resolve("stage-2-annotate/output");
|
|
||||||
|
|
||||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Structural checks on the stage 2 output: one `<lang>.json` per supported
 * language plus `conflicts.json`, all read once in `beforeAll`.
 */
describe("stage 2 — annotated output validation", () => {
  const recordsByLang = new Map<SupportedLanguageCode, AnnotatedRecord[]>();
  let conflicts: ConflictEntry[] = [];

  /** Absolute path of a file inside the stage 2 output directory. */
  const outputFile = (name: string): string => path.join(OUTPUT_DIR, name);

  /** Fails the test, listing every message, unless `errors` is empty. */
  const expectNoErrors = (errors: string[]): void => {
    expect(errors, `\n${errors.join("\n")}`).toHaveLength(0);
  };

  /** Calls `visit` for every loaded record, language by language. */
  const forEachRecord = (
    visit: (lang: SupportedLanguageCode, record: AnnotatedRecord) => void,
  ): void => {
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      for (const record of recordsByLang.get(lang)!) {
        visit(lang, record);
      }
    }
  };

  beforeAll(async () => {
    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const text = await fs.readFile(outputFile(`${lang}.json`), "utf-8");
      recordsByLang.set(lang, JSON.parse(text) as AnnotatedRecord[]);
    }
    const conflictText = await fs.readFile(
      outputFile("conflicts.json"),
      "utf-8",
    );
    conflicts = JSON.parse(conflictText) as ConflictEntry[];
  }, 60_000);

  it("all five language files exist", async () => {
    const errors: string[] = [];

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const present = await fs.access(outputFile(`${lang}.json`)).then(
        () => true,
        () => false,
      );
      if (!present) errors.push(`missing file: ${lang}.json`);
    }

    expectNoErrors(errors);
  });

  it("conflicts.json exists", async () => {
    await expect(
      fs.access(outputFile("conflicts.json")),
    ).resolves.toBeUndefined();
  });

  it("every language file is a non-empty array", () => {
    const errors: string[] = [];

    for (const lang of SUPPORTED_LANGUAGE_CODES) {
      const records = recordsByLang.get(lang)!;
      if (!Array.isArray(records)) {
        errors.push(`${lang}.json: not an array`);
        continue;
      }
      if (records.length === 0) {
        errors.push(`${lang}.json: empty array`);
      }
    }

    expectNoErrors(errors);
  });

  it("every record has required fields", () => {
    const errors: string[] = [];

    forEachRecord((lang, record) => {
      if (!record.source_id) {
        // Without an id the remaining messages could not name the record.
        errors.push(`${lang}: record missing source_id`);
        return;
      }
      const label = `${lang} ${record.source_id}`;
      if (!record.pos) errors.push(`${label}: missing pos`);
      if (!record.translations) errors.push(`${label}: missing translations`);
      if (!record.glosses) errors.push(`${label}: missing glosses`);
      if (record.examples === undefined)
        errors.push(`${label}: missing examples`);
      if (record.votes === undefined) errors.push(`${label}: missing votes`);
    });

    expectNoErrors(errors);
  });

  it("every pos is a valid supported value", () => {
    const errors: string[] = [];
    const validPos = new Set(SUPPORTED_POS);

    forEachRecord((lang, record) => {
      if (validPos.has(record.pos)) return;
      errors.push(`${lang} ${record.source_id}: invalid pos "${record.pos}"`);
    });

    expectNoErrors(errors);
  });

  it("every example has text and a valid source", () => {
    const errors: string[] = [];
    const validSources = new Set(["omw", "cefr"]);

    forEachRecord((lang, record) => {
      for (const [l, examples] of Object.entries(record.examples)) {
        for (const example of examples) {
          if (!example.text) {
            errors.push(
              `${lang} ${record.source_id} (${l}): example missing text`,
            );
          }
          if (!validSources.has(example.source)) {
            errors.push(
              `${lang} ${record.source_id} (${l}): invalid example source "${example.source}"`,
            );
          }
        }
      }
    });

    expectNoErrors(errors);
  });

  it("every cefr_source vote is a valid CEFR level", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);

    forEachRecord((lang, record) => {
      for (const [l, langVotes] of Object.entries(record.votes)) {
        for (const [word, vote] of Object.entries(langVotes ?? {})) {
          const level = vote.cefr_source as (typeof CEFR_LEVELS)[number];
          if (validLevels.has(level)) continue;
          errors.push(
            `${lang} ${record.source_id} (${l} — "${word}"): invalid cefr_source "${vote.cefr_source}"`,
          );
        }
      }
    });

    expectNoErrors(errors);
  });

  it("conflicts.json entries have required fields and valid CEFR levels", () => {
    const errors: string[] = [];
    const validLevels = new Set(CEFR_LEVELS);
    const validLangs = new Set(SUPPORTED_LANGUAGE_CODES);

    for (const entry of conflicts) {
      if (!entry.word) errors.push(`conflict missing word`);
      if (!entry.pos) errors.push(`conflict missing pos`);

      if (!entry.language) {
        errors.push(`conflict missing language`);
      } else if (!validLangs.has(entry.language)) {
        errors.push(`conflict invalid language "${entry.language}"`);
      }

      // A conflict needs at least two distinct levels to be a conflict at all.
      if (!Array.isArray(entry.levels) || entry.levels.length < 2) {
        errors.push(`${entry.word}: levels must have at least 2 entries`);
        continue;
      }
      for (const level of entry.levels) {
        if (!validLevels.has(level as (typeof CEFR_LEVELS)[number])) {
          errors.push(`${entry.word}: invalid level "${level}"`);
        }
      }
    }

    expectNoErrors(errors);
  });
});
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue