feat(pipeline): add data pipeline workspace and extraction stage

- rename scripts/ to data-pipeline/, archive existing scripts
- add @lila/pipeline as pnpm workspace package
- add stage-1-extract through stage-5-compare folder structure
- update SUPPORTED_LANGUAGE_CODES (add es, de, fr)
- update SUPPORTED_POS (add adjective, adverb)
- add description field to term_glosses
- add term_examples table
- run and verify db migration
- write and verify extract.py (117,659 synsets across 5 languages)
- write PIPELINE.md
2026-04-21 09:39:36 +02:00 · 2026-04-21 09:39:36 +02:00 · c9cddf68de
commit c9cddf68de
parent e993aac711
7 changed files with 1054164 additions and 33 deletions
--- a/data-pipeline/stage-1-extract/scripts/extract.py
+++ b/data-pipeline/stage-1-extract/scripts/extract.py
@ -0,0 +1,204 @@
+"""
+data-pipeline/stage-1-extract/scripts/extract.py
+
+Extract all synsets from the Open Multilingual Wordnet (OMW) for all
+supported languages and parts of speech.
+
+Output: a single combined JSON file covering all supported languages,
+  written to stage-1-extract/output/omw.json
+
+The file is a JSON array of synset records, one per ILI:
+  {
+    "source_id": "ili:i12345",
+    "pos": "noun",
+    "translations": { "en": ["dog", "canine"], "it": ["cane"] },
+    "glosses":      { "en": ["a domesticated animal..."] },
+    "examples":     { "en": ["the dog barked at the stranger"] }
+  }
+
+Usage:
+  python stage-1-extract/scripts/extract.py
+  python stage-1-extract/scripts/extract.py --sample
+
+Prerequisites:
+  pip install wn
+  python -m wn download omw-en:1.4
+  python -m wn download omw-it:1.4
+  python -m wn download omw-de:1.4
+  python -m wn download omw-es:1.4
+  python -m wn download omw-fr:1.4
+"""
+
+import json
+import sys
+from pathlib import Path
+
+import wn
+
+SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]  # one wordnet is loaded per code
+POS_MAP: dict[str, str] = {  # OMW part-of-speech tag → label stored in each record's "pos"
+    "n": "noun",
+    "v": "verb",
+    "a": "adjective",
+    "s": "adjective",  # adjective satellite — collapsed into adjective
+    "r": "adverb",
+}
+
+
+def extract_all(
+    output_dir: str = "stage-1-extract/output", sample: bool = False
+) -> None:
+    out = Path(output_dir)
+    out.mkdir(parents=True, exist_ok=True)
+
+    sample_size = 100 if sample else None
+
+    # Load one Wordnet object per language up front.
+    print("Loading wordnets...")
+    wordnets: dict[str, wn.Wordnet] = {}
+    for lang in SUPPORTED_LANGUAGE_CODES:
+        try:
+            wordnets[lang] = wn.Wordnet(lang=lang)
+            synset_count = len(wordnets[lang].synsets())
+            print(f"  {lang}: {synset_count:,} total synsets")
+        except wn.Error as e:
+            print(f"  ERROR loading {lang}: {e}")
+            print(f"  Run: python -m wn download omw-{lang}:1.4")
+            sys.exit(1)
+
+    # Collect per-ILI data across all languages and POS.
+    print("\nExtracting synsets...")
+    by_ili: dict[str, dict] = {}
+
+    for lang, wnet in wordnets.items():
+        for omw_pos, pos_label in POS_MAP.items():
+            synsets = wnet.synsets(pos=omw_pos)
+            covered = 0
+            for synset in synsets:
+                ili = synset.ili
+                if not ili:
+                    continue
+                covered += 1
+
+                lemmas = [str(lemma) for lemma in synset.lemmas()]
+                defns = [d for d in synset.definitions() if d]
+                examples = [e for e in synset.examples() if e]
+
+                if ili not in by_ili:
+                    by_ili[ili] = {"pos": pos_label}
+
+                if lang not in by_ili[ili]:
+                    by_ili[ili][lang] = {
+                        "lemmas": lemmas,
+                        "glosses": defns,
+                        "examples": examples,
+                    }
+                else:
+                    # ILI already exists for this language — merge data.
+                    # Happens when 'a' and 's' both map to adjective for the
+                    # same ILI. Deduplicate to avoid repeated entries.
+                    existing = by_ili[ili][lang]
+                    existing["lemmas"] = list(
+                        dict.fromkeys(existing["lemmas"] + lemmas)
+                    )
+                    existing["glosses"] = list(
+                        dict.fromkeys(existing["glosses"] + defns)
+                    )
+                    existing["examples"] = list(
+                        dict.fromkeys(existing["examples"] + examples)
+                    )
+
+            print(f"  {lang} {pos_label}: {covered:,} synsets with ILI")
+
+    # Build records and write single combined output file.
+    print("\nBuilding records...")
+    ilis = sorted(by_ili.keys())
+    if sample_size:
+        ilis = ilis[:sample_size]
+
+    records: list[dict] = []
+    for ili in ilis:
+        data = by_ili[ili]
+        record: dict = {
+            "source_id": f"ili:{ili}",
+            "pos": data["pos"],
+            "translations": {},
+            "glosses": {},
+            "examples": {},
+        }
+
+        for key, value in data.items():
+            if key == "pos":
+                continue
+            lang = key
+            if value["lemmas"]:
+                record["translations"][lang] = value["lemmas"]
+            if value["glosses"]:
+                record["glosses"][lang] = value["glosses"]
+            if value["examples"]:
+                record["examples"][lang] = value["examples"]
+
+        records.append(record)
+
+    output_file = out / "omw.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2, ensure_ascii=False)
+
+    print(f"\nWrote {len(records):,} synsets → {output_file}")
+    _print_coverage(records)
+
+
+def _print_coverage(records: list[dict]) -> None:
+    """Print per-language translation, gloss, and example counts."""
+    lang_stats: dict[str, dict[str, int]] = {}
+    for lang in SUPPORTED_LANGUAGE_CODES:
+        lang_stats[lang] = {"translations": 0, "glosses": 0, "examples": 0}
+
+    pos_stats: dict[str, int] = {}
+
+    for r in records:
+        pos = r["pos"]
+        pos_stats[pos] = pos_stats.get(pos, 0) + 1
+
+        for lang, lemmas in r["translations"].items():
+            if lang in lang_stats:
+                lang_stats[lang]["translations"] += len(lemmas)
+        for lang, gloss_list in r["glosses"].items():
+            if lang in lang_stats:
+                lang_stats[lang]["glosses"] += len(gloss_list)
+        for lang, example_list in r["examples"].items():
+            if lang in lang_stats:
+                lang_stats[lang]["examples"] += len(example_list)
+
+    print("\nPOS breakdown:")
+    for pos, count in sorted(pos_stats.items()):
+        print(f"  {pos}: {count:,}")
+
+    print("\nCoverage per language:")
+    for lang, counts in lang_stats.items():
+        t = counts["translations"]
+        g = counts["glosses"]
+        e = counts["examples"]
+        total = len(records)
+        print(
+            f"  {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {(t / total):.1f} translations/synset)"
+        )
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
+    parser.add_argument(
+        "--output-dir",
+        default="stage-1-extract/output",
+        help="Output directory for JSON files",
+    )
+    parser.add_argument(
+        "--sample",
+        action="store_true",
+        help="Extract only 100 synsets per language for inspection",
+    )
+    args = parser.parse_args()
+
+    extract_all(output_dir=args.output_dir, sample=args.sample)
--- a/data-pipeline/stage-2-annotate/sources/cefr/de.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/de.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/en.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/en.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/es.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/es.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/fr.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/fr.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/it.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/it.json