updating seeding pipeline

This commit is contained in:
lila 2026-04-05 19:29:17 +02:00
parent c49c2fe2c3
commit dfeb6a4cb0
2 changed files with 149 additions and 203 deletions

@@ -0,0 +1,149 @@
"""
scripts/extract-omw-data.py
Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
language and POS. Replaces extract-en-it-nouns.py.
Output: one JSON file per POS, written to packages/db/src/data/datafiles/
omw-noun.json
omw-verb.json
Each file is a JSON array of objects matching SynsetRecord in seed.ts:
{
"source_id": "ili:i12345",
"pos": "noun",
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
"glosses": { "en": ["a domesticated animal..."] }
}
Translations and glosses are absent for a language if that wordnet has no
coverage for the synset; the seed script handles sparse data gracefully.
Usage:
python scripts/extract-omw-data.py [output_dir]
output_dir defaults to packages/db/src/data/datafiles/
Prerequisites:
pip install wn
python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
"""
import json
import sys
from pathlib import Path
import wn
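
from typing import TypedDict


# Illustrative only: a TypedDict mirroring the SynsetRecord shape described in
# the docstring (the type lives in seed.ts; it is assumed here, not imported).
# Nothing below depends on it; it just documents what each output record holds.
class SynsetRecord(TypedDict):
    source_id: str                      # e.g. "ili:i12345"
    pos: str                            # a POS_MAP value: "noun" or "verb"
    translations: dict[str, list[str]]  # language code -> lemmas
    glosses: dict[str, list[str]]       # language code -> definitions
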
# Mirror constants.ts — update both places if languages or POS change.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
POS_MAP: dict[str, str] = {
"n": "noun",
"v": "verb",
}
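
# WordNet defines further POS tags ("a" adjective, "r" adverb, "s" adjective
# satellite); if support grows, extend this map and constants.ts together, as
# the comment above notes.
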
def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
out = Path(output_dir)
out.mkdir(parents=True, exist_ok=True)
# Load one Wordnet object per language up front.
print("Loading wordnets...")
wordnets: dict[str, wn.Wordnet] = {}
for lang in SUPPORTED_LANGUAGE_CODES:
try:
wordnets[lang] = wn.Wordnet(lang=lang)
synset_count = len(wordnets[lang].synsets())
print(f" {lang}: {synset_count:,} total synsets")
except wn.Error as e:
print(f" ERROR loading {lang}: {e}")
            print("    See the prerequisites in the module docstring for the matching wn.download() command.")
sys.exit(1)
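
    # Optional sanity check (a sketch: wn.lexicons() is public wn API, the
    # formatting is illustrative). Listing what is actually installed helps
    # diagnose a partial or mismatched download.
    installed = ", ".join(f"{lex.id}:{lex.version}" for lex in wn.lexicons())
    print(f"Installed lexicons: {installed}")
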
for omw_pos, pos_label in POS_MAP.items():
print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")
# Collect per-ILI data across all languages.
# Structure: { ili -> { lang -> { "lemmas": [...], "glosses": [...] } } }
by_ili: dict[str, dict[str, dict[str, list[str]]]] = {}
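        # Using the docstring example, after both language passes this holds
        # (sketch; "i12345" is the docstring's placeholder ILI, not real data):
        #   by_ili["i12345"] == {
        #       "en": {"lemmas": ["dog", "canine"], "glosses": ["a domesticated animal..."]},
        #       "it": {"lemmas": ["cane"], "glosses": []},
        #   }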
for lang, wnet in wordnets.items():
synsets = wnet.synsets(pos=omw_pos)
covered = 0
for synset in synsets:
                # wn returns an ILI object (or None); keep its string id so the
                # by_ili keys sort cleanly and serialize as "ili:i12345" below.
                ili = synset.ili.id if synset.ili else None
                if not ili:
                    continue  # skip synsets without an ILI; can't cross-link
covered += 1
if ili not in by_ili:
by_ili[ili] = {}
lemmas = [str(lemma) for lemma in synset.lemmas()]
defns = [d for d in synset.definitions() if d]
by_ili[ili][lang] = {"lemmas": lemmas, "glosses": defns}
print(f" {lang}: {covered:,} {pos_label} synsets with ILI")
# Build output records — sort by ILI for a stable, diffable file.
records: list[dict] = []
for ili in sorted(by_ili.keys()):
lang_data = by_ili[ili]
translations: dict[str, list[str]] = {}
glosses: dict[str, list[str]] = {}
for lang, data in lang_data.items():
if data["lemmas"]:
translations[lang] = data["lemmas"]
if data["glosses"]:
glosses[lang] = data["glosses"]
# Include the record even if only one language has coverage —
# the seed script imports all terms regardless of cross-language overlap.
records.append(
{
"source_id": f"ili:{ili}",
"pos": pos_label,
"translations": translations,
"glosses": glosses,
}
)
output_file = out / f"omw-{pos_label}.json"
with open(output_file, "w", encoding="utf-8") as f:
json.dump(records, f, indent=2, ensure_ascii=False)
print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
_print_coverage(records, pos_label)


def _print_coverage(records: list[dict], pos_label: str) -> None:
"""Print per-language translation and gloss counts."""
lang_stats: dict[str, dict[str, int]] = {}
for lang in SUPPORTED_LANGUAGE_CODES:
lang_stats[lang] = {"translations": 0, "glosses": 0}
for r in records:
for lang, lemmas in r["translations"].items():
if lang in lang_stats:
lang_stats[lang]["translations"] += len(lemmas)
for lang, gloss_list in r["glosses"].items():
if lang in lang_stats:
lang_stats[lang]["glosses"] += len(gloss_list)
print(f"\nCoverage for {pos_label}s:")
for lang, counts in lang_stats.items():
t = counts["translations"]
g = counts["glosses"]
avg_t = t / len(records) if records else 0
print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")
# Sample output
    print(f"\nSample {pos_label}s (records 1000-1004):")
for r in records[1000:1005]:
print(f" {r['source_id']}: {r['translations']}")
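

# Sketch, not wired into the pipeline: _spot_check is a hypothetical helper
# that re-loads a generated file and confirms every record carries the fields
# the seed script expects. Default path and field set follow the docstring;
# call it manually if useful.
def _spot_check(path: str = "packages/db/src/data/datafiles/omw-noun.json") -> None:
    with open(path, encoding="utf-8") as f:
        records = json.load(f)
    required = {"source_id", "pos", "translations", "glosses"}
    bad = [r.get("source_id", "?") for r in records if not required <= r.keys()]
    print(f"{len(records):,} records checked, {len(bad)} missing required fields")
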
if __name__ == "__main__":
output_dir = sys.argv[1] if len(sys.argv) > 1 else "packages/db/src/data/datafiles/"
extract_all(output_dir)