feat(pipeline): add data pipeline workspace and extraction stage

- rename scripts/ to data-pipeline/, archive existing scripts - add @lila/pipeline as pnpm workspace package - add stage-1-extract through stage-5-compare folder structure - update SUPPORTED_LANGUAGE_CODES (add es, de, fr) - update SUPPORTED_POS (add adjective, adverb) - add description field to term_glosses - add term_examples table - run and verify db migration - write and verify extract.py (117,659 synsets across 5 languages) - write PIPELINE.md
2026-04-21 09:39:36 +02:00 · 2026-04-21 09:39:36 +02:00 · c9cddf68de
commit c9cddf68de
parent e993aac711
7 changed files with 1054164 additions and 33 deletions
--- a/data-pipeline/stage-1-extract/scripts/extract.py
+++ b/data-pipeline/stage-1-extract/scripts/extract.py
@ -0,0 +1,204 @@
+"""
+data-pipeline/stage-1-extract/scripts/extract.py
+
+Extract all synsets from the Open Multilingual Wordnet (OMW) for all
+supported languages and parts of speech.
+
+Output: a single combined JSON file covering all languages:
+  stage-1-extract/output/omw.json
+
+The file is a JSON array of synset records, one per ILI:
+  {
+    "source_id": "ili:i12345",
+    "pos": "noun",
+    "translations": { "en": ["dog", "canine"], "it": ["cane"] },
+    "glosses":      { "en": ["a domesticated animal..."] },
+    "examples":     { "en": ["the dog barked at the stranger"] }
+  }
+
+Usage:
+  python stage-1-extract/scripts/extract.py
+  python stage-1-extract/scripts/extract.py --sample
+
+Prerequisites:
+  pip install wn
+  python -m wn download omw-en:1.4
+  python -m wn download omw-it:1.4
+  python -m wn download omw-de:1.4
+  python -m wn download omw-es:1.4
+  python -m wn download omw-fr:1.4
+"""
+
+import json
+import sys
+from pathlib import Path
+
+import wn
+
+SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]  # one wordnet is loaded per code
+POS_MAP: dict[str, str] = {  # OMW POS tag -> label stored in each record's "pos" field
+    "n": "noun",
+    "v": "verb",
+    "a": "adjective",
+    "s": "adjective",  # adjective satellite — collapsed into adjective
+    "r": "adverb",
+}
+
+
+def extract_all(
+    output_dir: str = "stage-1-extract/output", sample: bool = False
+) -> None:
+    out = Path(output_dir)
+    out.mkdir(parents=True, exist_ok=True)
+
+    sample_size = 100 if sample else None
+
+    # Load one Wordnet object per language up front.
+    print("Loading wordnets...")
+    wordnets: dict[str, wn.Wordnet] = {}
+    for lang in SUPPORTED_LANGUAGE_CODES:
+        try:
+            wordnets[lang] = wn.Wordnet(lang=lang)
+            synset_count = len(wordnets[lang].synsets())
+            print(f"  {lang}: {synset_count:,} total synsets")
+        except wn.Error as e:
+            print(f"  ERROR loading {lang}: {e}")
+            print(f"  Run: python -m wn download omw-{lang}:1.4")
+            sys.exit(1)
+
+    # Collect per-ILI data across all languages and POS.
+    print("\nExtracting synsets...")
+    by_ili: dict[str, dict] = {}
+
+    for lang, wnet in wordnets.items():
+        for omw_pos, pos_label in POS_MAP.items():
+            synsets = wnet.synsets(pos=omw_pos)
+            covered = 0
+            for synset in synsets:
+                ili = synset.ili
+                if not ili:
+                    continue
+                covered += 1
+
+                lemmas = [str(lemma) for lemma in synset.lemmas()]
+                defns = [d for d in synset.definitions() if d]
+                examples = [e for e in synset.examples() if e]
+
+                if ili not in by_ili:
+                    by_ili[ili] = {"pos": pos_label}
+
+                if lang not in by_ili[ili]:
+                    by_ili[ili][lang] = {
+                        "lemmas": lemmas,
+                        "glosses": defns,
+                        "examples": examples,
+                    }
+                else:
+                    # ILI already exists for this language — merge data.
+                    # Happens when 'a' and 's' both map to adjective for the
+                    # same ILI. Deduplicate to avoid repeated entries.
+                    existing = by_ili[ili][lang]
+                    existing["lemmas"] = list(
+                        dict.fromkeys(existing["lemmas"] + lemmas)
+                    )
+                    existing["glosses"] = list(
+                        dict.fromkeys(existing["glosses"] + defns)
+                    )
+                    existing["examples"] = list(
+                        dict.fromkeys(existing["examples"] + examples)
+                    )
+
+            print(f"  {lang} {pos_label}: {covered:,} synsets with ILI")
+
+    # Build records and write single combined output file.
+    print("\nBuilding records...")
+    ilis = sorted(by_ili.keys())
+    if sample_size:
+        ilis = ilis[:sample_size]
+
+    records: list[dict] = []
+    for ili in ilis:
+        data = by_ili[ili]
+        record: dict = {
+            "source_id": f"ili:{ili}",
+            "pos": data["pos"],
+            "translations": {},
+            "glosses": {},
+            "examples": {},
+        }
+
+        for key, value in data.items():
+            if key == "pos":
+                continue
+            lang = key
+            if value["lemmas"]:
+                record["translations"][lang] = value["lemmas"]
+            if value["glosses"]:
+                record["glosses"][lang] = value["glosses"]
+            if value["examples"]:
+                record["examples"][lang] = value["examples"]
+
+        records.append(record)
+
+    output_file = out / "omw.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2, ensure_ascii=False)
+
+    print(f"\nWrote {len(records):,} synsets → {output_file}")
+    _print_coverage(records)
+
+
+def _print_coverage(records: list[dict]) -> None:
+    """Print per-language translation, gloss, and example counts."""
+    lang_stats: dict[str, dict[str, int]] = {}
+    for lang in SUPPORTED_LANGUAGE_CODES:
+        lang_stats[lang] = {"translations": 0, "glosses": 0, "examples": 0}
+
+    pos_stats: dict[str, int] = {}
+
+    for r in records:
+        pos = r["pos"]
+        pos_stats[pos] = pos_stats.get(pos, 0) + 1
+
+        for lang, lemmas in r["translations"].items():
+            if lang in lang_stats:
+                lang_stats[lang]["translations"] += len(lemmas)
+        for lang, gloss_list in r["glosses"].items():
+            if lang in lang_stats:
+                lang_stats[lang]["glosses"] += len(gloss_list)
+        for lang, example_list in r["examples"].items():
+            if lang in lang_stats:
+                lang_stats[lang]["examples"] += len(example_list)
+
+    print("\nPOS breakdown:")
+    for pos, count in sorted(pos_stats.items()):
+        print(f"  {pos}: {count:,}")
+
+    print("\nCoverage per language:")
+    for lang, counts in lang_stats.items():
+        t = counts["translations"]
+        g = counts["glosses"]
+        e = counts["examples"]
+        total = len(records)
+        print(
+            f"  {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {(t / total):.1f} translations/synset)"
+        )
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
+    parser.add_argument(
+        "--output-dir",
+        default="stage-1-extract/output",
+        help="Output directory for JSON files",
+    )
+    parser.add_argument(
+        "--sample",
+        action="store_true",
+        help="Extract only 100 synsets per language for inspection",
+    )
+    args = parser.parse_args()
+
+    extract_all(output_dir=args.output_dir, sample=args.sample)
--- a/data-pipeline/stage-2-annotate/sources/cefr/de.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/de.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/en.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/en.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/es.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/es.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/fr.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/fr.json
--- a/data-pipeline/stage-2-annotate/sources/cefr/it.json
+++ b/data-pipeline/stage-2-annotate/sources/cefr/it.json
--- a/documentation/PIPELINE.md
+++ b/documentation/PIPELINE.md
@ -112,77 +112,83 @@ The pipeline runs in five stages. Each stage is independent and can be re-run wi

 ### 1. Extract

-Reads each language from the OMW SQLite database (`~/.wn_data/wn.db`) and produces a normalized JSON file per language containing all synsets with their translations, glosses, and usage examples across all parts of speech. Adjective satellites are collapsed into adjective at this stage.
+Reads the OMW SQLite database (`~/.wn_data/wn.db`) and produces a single normalized JSON file containing all synsets with their translations, glosses, and usage examples across all five languages and all parts of speech. Adjective satellites are collapsed into adjective at this stage.

 **Input:** `~/.wn_data/wn.db`
-**Output:** `stage-1-extract/output/{lang}.json`
+**Output:** `stage-1-extract/output/omw.json`

 ```bash
-python scripts/extract.py
+python stage-1-extract/scripts/extract.py
 ```

+Add `--sample` to extract 100 synsets for inspection before running the full
+extraction.
+
 Each record in the output looks like this:

 ```json
 {
-  "source_id": "omw-en-12345",
-  "pos": "noun",
+  "source_id": "ili:i1",
+  "pos": "adjective",
  "translations": {
-    "en": ["dog", "canine"],
-    "it": ["cane"]
+    "en": ["able"],
+    "it": ["abile", "intelligente", "valente", "capace"],
+    "es": ["capaz"],
+    "fr": ["comptable"]
  },
  "glosses": {
-    "en": "a domesticated carnivorous mammal"
+    "en": ["(usually followed by 'to') having the necessary means or skill or know-how or authority to do something"]
  },
  "examples": {
-    "en": ["the dog barked at the stranger"]
+    "en": ["able to swim", "she was able to program her computer"]
  }
 }
 ```

-Note: glosses and examples are not available for all languages. French and Spanish have no glosses in the current OMW database. Coverage detail is in `COVERAGE.md`.
-
-<!-- TODO: verify record shape once extract.py is written -->
-
-> **Note for first run:** Before extracting the full dataset, run the script
-> in sample mode to inspect the actual data per language. Real-world wordnet
-> data often contains unexpected formatting, missing fields, or inconsistencies
-> that are better discovered early. A sample of 50–100 synsets per language is
-> enough to verify the output shape and spot anything worth handling before
-> processing the full dataset.
+Note: glosses and examples are not available for all languages. French and Spanish have no glosses or examples in the current OMW database — these will be generated by the LLM in the enrich stage. Coverage detail is in `COVERAGE.md`.

 ### 2. Annotate

-Merges the CEFR source files into the extracted data. Each word in each language is looked up in the corresponding CEFR source file. Matched words receive a `cefr_source` vote which carries into the enrich stage. Unmatched words proceed without a vote — the enrich stage handles them entirely.
+Reads the combined OMW extract and merges CEFR source data into it. Each translation in each language is matched against the corresponding CEFR source
+file by word text and part of speech. Matched translations receive a `cefr_source` vote which carries into the enrich stage. Unmatched translations proceed without a vote.

-This stage is language-agnostic and processes all languages in one run.
+This stage also extracts native example sentences from the CEFR source files and adds them to the record alongside OMW examples, with `source: "cefr"` to distinguish them.

-**Input:** `stage-1-extract/output/{lang}.json` + `stage-2-annotate/sources/cefr/{lang}.json`
-**Output:** `stage-2-annotate/output/{lang}.json`
+Words appearing in the CEFR source file multiple times with different CEFR levels are written to `conflicts.json` for manual review and excluded from voting until resolved.
+
+**Input:** `stage-1-extract/output/omw.json` + `stage-2-annotate/sources/cefr/{lang}.json`
+**Output:**
+- `stage-2-annotate/output/{lang}.json` — one per language
+- `stage-2-annotate/output/conflicts.json` — words with conflicting CEFR levels, for manual review

 ```bash
 pnpm --filter @lila/pipeline annotate
 ```

-Each record in the output extends the extracted record with a `votes` field:
+Each record in the output extends the OMW record with a `votes` field and any additional examples from the CEFR source file:

 ```json
 {
-  "source_id": "omw-en-12345",
-  "pos": "noun",
+  "source_id": "ili:i1",
+  "pos": "adjective",
  "translations": {
-    "en": ["dog", "canine"],
-    "it": ["cane"]
+    "en": ["able"],
+    "it": ["abile", "intelligente", "valente", "capace"],
+    "es": ["capaz"],
+    "fr": ["comptable"]
  },
  "glosses": {
-    "en": "a domesticated carnivorous mammal"
+    "en": ["having the necessary means or skill to do something"]
  },
  "examples": {
-    "en": ["the dog barked at the stranger"]
+    "en": [
+      { "text": "able to swim", "source": "omw" },
+      { "text": "She was able to finish the task.", "source": "cefr" }
+    ]
  },
  "votes": {
    "en": {
-      "cefr_source": "A1"
+      "able": { "cefr_source": "B1" }
    }
  }
 }
@ -196,14 +202,16 @@ The enrich stage runs in two rounds, both designed to execute overnight one mode

 **Round 1 — generation**

-Each model processes every word in every language one term at a time and generates:
+Each model processes every word in every language one term at a time and
+generates:

 - A CEFR level vote for each translation
 - A description for each language
+- A translation for each language, only if OMW provides none
 - A gloss for each language, only if OMW provides none
 - Usage examples for each language, only if OMW provides none

-OMW data is never duplicated — the script checks what OMW already provides before building the prompt. For glosses and examples, if OMW data exists for that language the LLM skips generation entirely. This significantly reduces compute time for languages with good OMW coverage such as English and Italian.
+OMW data is never duplicated — the script checks what OMW already provides before building the prompt. For translations, glosses and examples, if OMW data exists for that language the LLM skips generation entirely. This significantly reduces compute time for languages with good OMW coverage such as English.

 All model-generated content is stored with an anonymised source (`model_1`, `model_2` etc.) so models cannot be biased by knowing who generated what in round 2.