lila/scripts/extract-own-save-to-json.py
2026-04-05 19:29:17 +02:00

149 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
scripts/extract-omw-data.py
Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
language and POS. Replaces extract-en-it-nouns.py.
Output: one JSON file per POS, written to packages/db/src/data/datafiles/
omw-noun.json
omw-verb.json
Each file is a JSON array of objects matching SynsetRecord in seed.ts:
{
"source_id": "ili:i12345",
"pos": "noun",
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
"glosses": { "en": ["a domesticated animal..."] }
}
Translations and glosses are absent for a language if that wordnet has no
coverage for the synset — the seed script handles sparse data gracefully.
Usage:
python scripts/extract-omw-data.py [output_dir]
output_dir defaults to packages/db/src/data/datafiles/
Prerequisites:
pip install wn
python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
"""
import json
import sys
from pathlib import Path
import wn
# Mirror constants.ts — update both places if languages or POS change.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
# Map the `wn` library's single-letter POS tags to the labels used in the
# output filenames (omw-<label>.json) and in each record's "pos" field.
POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
}
def _load_wordnets() -> dict[str, wn.Wordnet]:
    """Load one wn.Wordnet per language in SUPPORTED_LANGUAGE_CODES.

    Prints a per-language synset count on success; on a wn.Error prints a
    download hint and exits with status 1 (a missing wordnet makes the whole
    extraction pointless).
    """
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {lang}: {e}")
            print(f" Run: python -c \"import wn; wn.download('omw-{lang}:1.4')\"")
            sys.exit(1)
    return wordnets


def _collect_by_ili(
    wordnets: dict[str, wn.Wordnet], omw_pos: str, pos_label: str
) -> dict[str, dict[str, dict[str, list[str]]]]:
    """Group lemmas and glosses by ILI across all languages for one POS.

    Returns { ili -> { lang -> {"lemmas": [...], "glosses": [...]} } }.
    Synsets without an ILI are skipped — they cannot be cross-linked between
    languages. Empty definitions are dropped.
    """
    by_ili: dict[str, dict[str, dict[str, list[str]]]] = {}
    for lang, wnet in wordnets.items():
        covered = 0
        for synset in wnet.synsets(pos=omw_pos):
            # NOTE(review): assumes synset.ili works as a sortable dict key and
            # str-formats to the bare identifier — newer `wn` releases return
            # an ILI object rather than a str; confirm against the installed
            # version.
            ili = synset.ili
            if not ili:
                continue  # skip synsets without an ILI — can't cross-link
            covered += 1
            by_ili.setdefault(ili, {})[lang] = {
                "lemmas": [str(lemma) for lemma in synset.lemmas()],
                "glosses": [d for d in synset.definitions() if d],
            }
        print(f" {lang}: {covered:,} {pos_label} synsets with ILI")
    return by_ili


def _build_records(
    by_ili: dict[str, dict[str, dict[str, list[str]]]], pos_label: str
) -> list[dict]:
    """Turn the per-ILI map into SynsetRecord dicts, sorted by ILI so the
    output file is stable and diffable across runs."""
    records: list[dict] = []
    for ili in sorted(by_ili.keys()):
        translations: dict[str, list[str]] = {}
        glosses: dict[str, list[str]] = {}
        for lang, data in by_ili[ili].items():
            if data["lemmas"]:
                translations[lang] = data["lemmas"]
            if data["glosses"]:
                glosses[lang] = data["glosses"]
        # Include the record even if only one language has coverage —
        # the seed script imports all terms regardless of cross-language overlap.
        records.append(
            {
                "source_id": f"ili:{ili}",
                "pos": pos_label,
                "translations": translations,
                "glosses": glosses,
            }
        )
    return records


def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
    """Extract every synset for each POS in POS_MAP from all supported
    wordnets and write one JSON file per POS (omw-<pos>.json) to *output_dir*.

    Creates *output_dir* (and parents) if needed. Exits the process if any
    wordnet fails to load.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    # Load one Wordnet object per language up front — they are reused for
    # every POS pass.
    wordnets = _load_wordnets()
    for omw_pos, pos_label in POS_MAP.items():
        print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")
        by_ili = _collect_by_ili(wordnets, omw_pos, pos_label)
        records = _build_records(by_ili, pos_label)
        output_file = out / f"omw-{pos_label}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)
        print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
        _print_coverage(records, pos_label)
def _print_coverage(records: list[dict], pos_label: str) -> None:
    """Print per-language translation and gloss counts, plus sample records.

    Totals count individual lemmas/glosses (not synsets); the average is
    lemmas per synset. Languages not in SUPPORTED_LANGUAGE_CODES are ignored.
    """
    lang_stats: dict[str, dict[str, int]] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        lang_stats[lang] = {"translations": 0, "glosses": 0}
    for r in records:
        for lang, lemmas in r["translations"].items():
            if lang in lang_stats:
                lang_stats[lang]["translations"] += len(lemmas)
        for lang, gloss_list in r["glosses"].items():
            if lang in lang_stats:
                lang_stats[lang]["glosses"] += len(gloss_list)
    print(f"\nCoverage for {pos_label}s:")
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        avg_t = t / len(records) if records else 0
        print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")
    # Sample output. Label fixed from the mangled "10001004" to match the
    # [1000:1005] slice below; the slice is simply empty when fewer than
    # 1001 records exist.
    print(f"\nSample {pos_label}s (records 1000-1004):")
    for r in records[1000:1005]:
        print(f" {r['source_id']}: {r['translations']}")
if __name__ == "__main__":
    # CLI entry point: an optional first argument overrides the default
    # output directory.
    cli_args = sys.argv[1:]
    extract_all(cli_args[0] if cli_args else "packages/db/src/data/datafiles/")