lila/scripts/extract-own-save-to-json.py
2026-04-05 19:29:17 +02:00

149 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
scripts/extract-omw-data.py
Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
language and POS. Replaces extract-en-it-nouns.py.
Output: one JSON file per POS, written to packages/db/src/data/datafiles/
omw-noun.json
omw-verb.json
Each file is a JSON array of objects matching SynsetRecord in seed.ts:
{
"source_id": "ili:i12345",
"pos": "noun",
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
"glosses": { "en": ["a domesticated animal..."] }
}
Translations and glosses are absent for a language if that wordnet has no
coverage for the synset — the seed script handles sparse data gracefully.
Usage:
python scripts/extract-omw-data.py [output_dir]
output_dir defaults to packages/db/src/data/datafiles/
Prerequisites:
pip install wn
python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
"""
import json
import sys
from pathlib import Path
import wn
# Mirror constants.ts — update both places if languages or POS change.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
# Map the `wn` library's single-letter POS tags to the labels used in the
# output filenames (omw-<label>.json) and in each record's "pos" field.
POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
}
def _load_wordnets() -> dict[str, wn.Wordnet]:
    """Load one wn.Wordnet per language in SUPPORTED_LANGUAGE_CODES.

    Prints a per-language synset count on success; on a wn.Error prints a
    download hint and exits with status 1 (a missing wordnet makes the whole
    extraction pointless).
    """
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {lang}: {e}")
            print(f" Run: python -c \"import wn; wn.download('omw-{lang}:1.4')\"")
            sys.exit(1)
    return wordnets


def _collect_by_ili(
    wordnets: dict[str, wn.Wordnet], omw_pos: str, pos_label: str
) -> dict[str, dict[str, dict[str, list[str]]]]:
    """Group lemmas and glosses by ILI across all languages for one POS.

    Returns { ili -> { lang -> {"lemmas": [...], "glosses": [...]} } }.
    Synsets without an ILI are skipped — they cannot be cross-linked between
    languages. Empty definitions are dropped.
    """
    by_ili: dict[str, dict[str, dict[str, list[str]]]] = {}
    for lang, wnet in wordnets.items():
        covered = 0
        for synset in wnet.synsets(pos=omw_pos):
            # NOTE(review): assumes synset.ili works as a sortable dict key and
            # str-formats to the bare identifier — newer `wn` releases return
            # an ILI object rather than a str; confirm against the installed
            # version.
            ili = synset.ili
            if not ili:
                continue  # skip synsets without an ILI — can't cross-link
            covered += 1
            by_ili.setdefault(ili, {})[lang] = {
                "lemmas": [str(lemma) for lemma in synset.lemmas()],
                "glosses": [d for d in synset.definitions() if d],
            }
        print(f" {lang}: {covered:,} {pos_label} synsets with ILI")
    return by_ili


def _build_records(
    by_ili: dict[str, dict[str, dict[str, list[str]]]], pos_label: str
) -> list[dict]:
    """Turn the per-ILI map into SynsetRecord dicts, sorted by ILI so the
    output file is stable and diffable across runs."""
    records: list[dict] = []
    for ili in sorted(by_ili.keys()):
        translations: dict[str, list[str]] = {}
        glosses: dict[str, list[str]] = {}
        for lang, data in by_ili[ili].items():
            if data["lemmas"]:
                translations[lang] = data["lemmas"]
            if data["glosses"]:
                glosses[lang] = data["glosses"]
        # Include the record even if only one language has coverage —
        # the seed script imports all terms regardless of cross-language overlap.
        records.append(
            {
                "source_id": f"ili:{ili}",
                "pos": pos_label,
                "translations": translations,
                "glosses": glosses,
            }
        )
    return records


def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
    """Extract every synset for each POS in POS_MAP from all supported
    wordnets and write one JSON file per POS (omw-<pos>.json) to *output_dir*.

    Creates *output_dir* (and parents) if needed. Exits the process if any
    wordnet fails to load.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    # Load one Wordnet object per language up front — they are reused for
    # every POS pass.
    wordnets = _load_wordnets()
    for omw_pos, pos_label in POS_MAP.items():
        print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")
        by_ili = _collect_by_ili(wordnets, omw_pos, pos_label)
        records = _build_records(by_ili, pos_label)
        output_file = out / f"omw-{pos_label}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)
        print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
        _print_coverage(records, pos_label)
def _print_coverage(records: list[dict], pos_label: str) -> None:
    """Print per-language translation and gloss counts, plus sample records.

    Totals count individual lemmas/glosses (not synsets); the average is
    lemmas per synset. Languages not in SUPPORTED_LANGUAGE_CODES are ignored.
    """
    lang_stats: dict[str, dict[str, int]] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        lang_stats[lang] = {"translations": 0, "glosses": 0}
    for r in records:
        for lang, lemmas in r["translations"].items():
            if lang in lang_stats:
                lang_stats[lang]["translations"] += len(lemmas)
        for lang, gloss_list in r["glosses"].items():
            if lang in lang_stats:
                lang_stats[lang]["glosses"] += len(gloss_list)
    print(f"\nCoverage for {pos_label}s:")
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        avg_t = t / len(records) if records else 0
        print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")
    # Sample output. Label fixed from the mangled "10001004" to match the
    # [1000:1005] slice below; the slice is simply empty when fewer than
    # 1001 records exist.
    print(f"\nSample {pos_label}s (records 1000-1004):")
    for r in records[1000:1005]:
        print(f" {r['source_id']}: {r['translations']}")
if __name__ == "__main__":
    # CLI entry point: an optional first argument overrides the default
    # output directory.
    cli_args = sys.argv[1:]
    extract_all(cli_args[0] if cli_args else "packages/db/src/data/datafiles/")