"""
scripts/extract-omw-data.py

Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
language and POS. Replaces extract-en-it-nouns.py.

Output: one JSON file per POS, written to packages/db/src/data/datafiles/
    omw-noun.json
    omw-verb.json

Each file is a JSON array of objects matching SynsetRecord in seed.ts:
    {
      "source_id": "ili:i12345",
      "pos": "noun",
      "translations": { "en": ["dog", "canine"], "it": ["cane"] },
      "glosses": { "en": ["a domesticated animal..."] }
    }

Translations and glosses are absent for a language if that wordnet has no
coverage for the synset — the seed script handles sparse data gracefully.

Usage:
    python scripts/extract-omw-data.py [output_dir]

    output_dir defaults to packages/db/src/data/datafiles/

Prerequisites:
    pip install wn
    python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
"""

import json
import sys
from pathlib import Path

import wn

# Mirror constants.ts — update both places if languages or POS change.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
}

# Single source of truth for the default output location; used both as the
# default of extract_all() and by the command-line entry point.
DEFAULT_OUTPUT_DIR: str = "packages/db/src/data/datafiles/"

# Window of records echoed to the console as a sanity check after each write.
_SAMPLE_START: int = 1000
_SAMPLE_SIZE: int = 5


def _load_wordnets() -> dict[str, wn.Wordnet]:
    """Open one Wordnet per supported language and report its synset count.

    Returns:
        Mapping of language code to its loaded ``wn.Wordnet``, in the order of
        ``SUPPORTED_LANGUAGE_CODES``.

    Raises:
        SystemExit: with status 1 if ``wn.Error`` is raised while loading or
            counting a wordnet. A download hint is printed to stderr first.
    """
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            # Diagnostics go to stderr so they are not mixed into piped output.
            print(f" ERROR loading {lang}: {e}", file=sys.stderr)
            print(
                f" Run: python -c \"import wn; wn.download('omw-{lang}:1.4')\"",
                file=sys.stderr,
            )
            sys.exit(1)
    return wordnets


def _collect_by_ili(
    wordnets: dict[str, wn.Wordnet],
    omw_pos: str,
    pos_label: str,
) -> dict[str, dict[str, dict[str, list[str]]]]:
    """Gather lemmas and definitions for one POS, grouped by ILI then language.

    Args:
        wordnets: Loaded wordnets keyed by language code.
        omw_pos: POS code passed to ``Wordnet.synsets`` (a key of ``POS_MAP``).
        pos_label: Human-readable POS name, used only in progress output.

    Returns:
        ``{ ili -> { lang -> { "lemmas": [...], "glosses": [...] } } }``.
        Synsets without an ILI are skipped because they cannot be cross-linked.
    """
    by_ili: dict[str, dict[str, dict[str, list[str]]]] = {}
    for lang, wnet in wordnets.items():
        covered = 0
        for synset in wnet.synsets(pos=omw_pos):
            ili = synset.ili
            # NOTE(review): the key must be a plain string so it can be sorted
            # and embedded in "source_id". synset.ili is assumed to be either
            # that string or an object exposing it as `.id` — confirm against
            # the installed `wn` version. For a string this is a no-op.
            ili_id = getattr(ili, "id", ili)
            if not ili_id:
                continue  # skip synsets without an ILI — can't cross-link
            covered += 1
            by_ili.setdefault(str(ili_id), {})[lang] = {
                "lemmas": [str(lemma) for lemma in synset.lemmas()],
                "glosses": [d for d in synset.definitions() if d],
            }
        print(f" {lang}: {covered:,} {pos_label} synsets with ILI")
    return by_ili


def _build_records(
    by_ili: dict[str, dict[str, dict[str, list[str]]]],
    pos_label: str,
) -> list[dict]:
    """Turn the per-ILI mapping into output records sorted by ILI.

    Languages with no lemmas (or no glosses) for a synset are omitted from
    that record's ``translations`` (or ``glosses``) mapping. A record is
    emitted for every ILI, even if only one language has coverage — the seed
    script imports all terms regardless of cross-language overlap.
    """
    records: list[dict] = []
    # Sort by ILI for a stable, diffable file.
    for ili in sorted(by_ili):
        lang_data = by_ili[ili]
        records.append(
            {
                "source_id": f"ili:{ili}",
                "pos": pos_label,
                "translations": {
                    lang: data["lemmas"]
                    for lang, data in lang_data.items()
                    if data["lemmas"]
                },
                "glosses": {
                    lang: data["glosses"]
                    for lang, data in lang_data.items()
                    if data["glosses"]
                },
            }
        )
    return records


def extract_all(output_dir: str = DEFAULT_OUTPUT_DIR) -> None:
    """Extract every supported POS and write one ``omw-<pos>.json`` per POS.

    Args:
        output_dir: Directory to write into; created (with parents) if it
            does not exist.

    Raises:
        SystemExit: with status 1 if a wordnet for a supported language
            cannot be loaded.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Load one Wordnet object per language up front.
    wordnets = _load_wordnets()

    for omw_pos, pos_label in POS_MAP.items():
        print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")

        by_ili = _collect_by_ili(wordnets, omw_pos, pos_label)
        records = _build_records(by_ili, pos_label)

        output_file = out / f"omw-{pos_label}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)

        print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
        _print_coverage(records, pos_label)


def _print_coverage(records: list[dict], pos_label: str) -> None:
    """Print per-language translation and gloss counts, then a short sample.

    Args:
        records: Output records as produced for one POS; each must have
            ``"source_id"``, ``"translations"`` and ``"glosses"`` keys.
        pos_label: Human-readable POS name used in the printed headings.
    """
    lang_stats: dict[str, dict[str, int]] = {
        lang: {"translations": 0, "glosses": 0}
        for lang in SUPPORTED_LANGUAGE_CODES
    }

    for r in records:
        for lang, lemmas in r["translations"].items():
            if lang in lang_stats:
                lang_stats[lang]["translations"] += len(lemmas)
        for lang, gloss_list in r["glosses"].items():
            if lang in lang_stats:
                lang_stats[lang]["glosses"] += len(gloss_list)

    print(f"\nCoverage for {pos_label}s:")
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        avg_t = t / len(records) if records else 0
        print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")

    # Sample output. Start at _SAMPLE_START when there are enough records,
    # otherwise fall back to the first few so the sample is never an empty
    # list under a misleading heading.
    start = _SAMPLE_START if len(records) > _SAMPLE_START else 0
    sample = records[start:start + _SAMPLE_SIZE]
    if sample:
        last = start + len(sample) - 1
        print(f"\nSample {pos_label}s (records {start}–{last}):")
        for r in sample:
            print(f" {r['source_id']}: {r['translations']}")


if __name__ == "__main__":
    output_dir = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_OUTPUT_DIR
    extract_all(output_dir)