updating seeding pipeline
This commit is contained in:
parent
c49c2fe2c3
commit
dfeb6a4cb0
2 changed files with 149 additions and 203 deletions
149
scripts/extract-own-save-to-json.py
Normal file
149
scripts/extract-own-save-to-json.py
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
"""
|
||||
scripts/extract-omw-data.py
|
||||
|
||||
Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
|
||||
language and POS. Replaces extract-en-it-nouns.py.
|
||||
|
||||
Output: one JSON file per POS, written to packages/db/src/data/datafiles/
|
||||
omw-noun.json
|
||||
omw-verb.json
|
||||
|
||||
Each file is a JSON array of objects matching SynsetRecord in seed.ts:
|
||||
{
|
||||
"source_id": "ili:i12345",
|
||||
"pos": "noun",
|
||||
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
|
||||
"glosses": { "en": ["a domesticated animal..."] }
|
||||
}
|
||||
|
||||
Translations and glosses are absent for a language if that wordnet has no
|
||||
coverage for the synset — the seed script handles sparse data gracefully.
|
||||
|
||||
Usage:
|
||||
python scripts/extract-omw-data.py [output_dir]
|
||||
|
||||
output_dir defaults to packages/db/src/data/datafiles/
|
||||
|
||||
Prerequisites:
|
||||
pip install wn
|
||||
python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import wn
|
||||
|
||||
# Mirror constants.ts — update both places if languages or POS change.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]

# Maps wn's single-character POS tags to the labels used in output
# filenames (omw-<label>.json) and in each record's "pos" field.
POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
}
|
||||
|
||||
|
||||
def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
    """Extract every synset for each POS in POS_MAP and write one JSON file
    per POS (omw-<pos>.json) into *output_dir*.

    Args:
        output_dir: destination directory; created if it does not exist.

    Exits the process with status 1 if any supported language's wordnet
    is not downloaded locally.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    wordnets = _load_wordnets()

    for omw_pos, pos_label in POS_MAP.items():
        print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")

        by_ili = _collect_by_ili(wordnets, omw_pos, pos_label)
        records = _build_records(by_ili, pos_label)

        output_file = out / f"omw-{pos_label}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)

        print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
        _print_coverage(records, pos_label)


def _load_wordnets() -> dict:
    """Load one wn.Wordnet per supported language up front.

    Exits with download instructions if a wordnet is missing, so the rest
    of the pipeline can assume every language is available.
    """
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {lang}: {e}")
            print(f" Run: python -c \"import wn; wn.download('omw-{lang}:1.4')\"")
            sys.exit(1)
    return wordnets


def _collect_by_ili(
    wordnets: dict, omw_pos: str, pos_label: str
) -> dict[str, dict[str, dict[str, list[str]]]]:
    """Collect per-ILI lemma/gloss data across all languages for one POS.

    Returns: { ili -> { lang -> { "lemmas": [...], "glosses": [...] } } }.
    Synsets without an ILI are skipped — they can't be cross-linked.
    """
    by_ili: dict[str, dict[str, dict[str, list[str]]]] = {}
    for lang, wnet in wordnets.items():
        covered = 0
        for synset in wnet.synsets(pos=omw_pos):
            ili = synset.ili
            if not ili:
                continue  # skip synsets without an ILI — can't cross-link
            covered += 1

            lemmas = [str(lemma) for lemma in synset.lemmas()]
            defns = [d for d in synset.definitions() if d]

            # Merge rather than overwrite: if two synsets in the same
            # language map to the same ILI, the original assignment kept
            # only the last one seen and silently dropped earlier data.
            slot = by_ili.setdefault(ili, {}).setdefault(
                lang, {"lemmas": [], "glosses": []}
            )
            slot["lemmas"].extend(x for x in lemmas if x not in slot["lemmas"])
            slot["glosses"].extend(x for x in defns if x not in slot["glosses"])

        print(f" {lang}: {covered:,} {pos_label} synsets with ILI")
    return by_ili


def _build_records(
    by_ili: dict[str, dict[str, dict[str, list[str]]]], pos_label: str
) -> list[dict]:
    """Turn the per-ILI map into SynsetRecord dicts.

    Sorted by ILI for a stable, diffable output file. Empty per-language
    lemma/gloss lists are omitted from the record entirely.
    """
    records: list[dict] = []
    for ili in sorted(by_ili.keys()):
        translations: dict[str, list[str]] = {}
        glosses: dict[str, list[str]] = {}

        for lang, data in by_ili[ili].items():
            if data["lemmas"]:
                translations[lang] = data["lemmas"]
            if data["glosses"]:
                glosses[lang] = data["glosses"]

        # Include the record even if only one language has coverage —
        # the seed script imports all terms regardless of cross-language overlap.
        records.append(
            {
                "source_id": f"ili:{ili}",
                "pos": pos_label,
                "translations": translations,
                "glosses": glosses,
            }
        )
    return records
|
||||
|
||||
|
||||
def _print_coverage(records: list[dict], pos_label: str) -> None:
|
||||
"""Print per-language translation and gloss counts."""
|
||||
lang_stats: dict[str, dict[str, int]] = {}
|
||||
for lang in SUPPORTED_LANGUAGE_CODES:
|
||||
lang_stats[lang] = {"translations": 0, "glosses": 0}
|
||||
|
||||
for r in records:
|
||||
for lang, lemmas in r["translations"].items():
|
||||
if lang in lang_stats:
|
||||
lang_stats[lang]["translations"] += len(lemmas)
|
||||
for lang, gloss_list in r["glosses"].items():
|
||||
if lang in lang_stats:
|
||||
lang_stats[lang]["glosses"] += len(gloss_list)
|
||||
|
||||
print(f"\nCoverage for {pos_label}s:")
|
||||
for lang, counts in lang_stats.items():
|
||||
t = counts["translations"]
|
||||
g = counts["glosses"]
|
||||
avg_t = t / len(records) if records else 0
|
||||
print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")
|
||||
|
||||
# Sample output
|
||||
print(f"\nSample {pos_label}s (records 1000–1004):")
|
||||
for r in records[1000:1005]:
|
||||
print(f" {r['source_id']}: {r['translations']}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
output_dir = sys.argv[1] if len(sys.argv) > 1 else "packages/db/src/data/datafiles/"
|
||||
extract_all(output_dir)
|
||||
Loading…
Add table
Add a link
Reference in a new issue