updating seeding pipeline

This commit is contained in:
lila 2026-04-05 19:29:47 +02:00
parent dfeb6a4cb0
commit e3c05b5596

View file

@ -1,105 +0,0 @@
"""
scripts/extract-en-it-nouns.py
Extract ALL bilingual nouns from Open Multilingual Wordnet (OMW).
Output mirrors the terms table schema exactly — no filtering, no ranking.
Decks handle curation later.
"""
import json
import os
import sys
from pathlib import Path
import wn
def extract_bilingual_nouns(
    source_lang: str = "en",
    target_lang: str = "it",
    output_path: str = "datafiles/en-it-nouns.json",
) -> None:
    """
    Extract all noun synsets present in both languages via ILI.

    Writes a JSON array of term dicts mirroring the terms table schema
    (synset_id / pos / translations). The id and created_at columns are
    added later by seed.ts during insert.

    Args:
        source_lang: Source language code (e.g., "en" for English)
        target_lang: Target language code (e.g., "it" for Italian)
        output_path: Where to write the seed JSON

    Raises:
        SystemExit: if the required WordNet data is not downloaded.
    """
    print(f"Loading WordNets: {source_lang=}, {target_lang=}")
    try:
        source_wn = wn.Wordnet(lang=source_lang)
        target_wn = wn.Wordnet(lang=target_lang)
    except wn.Error as e:
        print(f"Error loading WordNet: {e}")
        print(f"Run: wn download omw-{target_lang}:1.4 oewn:2024")
        sys.exit(1)

    def _index_nouns_by_ili(wordnet) -> dict:
        """Map ILI (Inter-Lingual Index) -> noun synset, skipping ILI-less synsets."""
        return {s.ili: s for s in wordnet.synsets(pos="n") if s.ili}

    source_by_ili = _index_nouns_by_ili(source_wn)
    target_by_ili = _index_nouns_by_ili(target_wn)

    # Bilingual synsets are those whose ILI appears in both languages.
    common_ilis = source_by_ili.keys() & target_by_ili.keys()
    print(f"Found {len(common_ilis):,} bilingual noun synsets")

    # Build seed data matching the schema exactly, in stable numeric ILI
    # order ("i12345" -> 12345). str() guards against wn versions where
    # synset.ili is an ILI object rather than a plain string -- the
    # original int(x[1:]) would raise on a non-subscriptable ILI object.
    terms: list[dict] = []
    for ili in sorted(common_ilis, key=lambda x: int(str(x)[1:])):
        src_syn = source_by_ili[ili]
        tgt_syn = target_by_ili[ili]
        terms.append(
            {
                "synset_id": f"ili:{ili}",  # e.g., "ili:i12345"
                "pos": "noun",
                # All lemmas (synonyms) for each language.
                "translations": {
                    source_lang: [str(lemma) for lemma in src_syn.lemmas()],
                    target_lang: [str(lemma) for lemma in tgt_syn.lemmas()],
                },
                # Note: id, created_at added by seed.ts during insert
            }
        )

    # Write JSON, creating the output directory if needed.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(terms, f, indent=2, ensure_ascii=False)
    print(f"Wrote {len(terms):,} terms to {output_path}")

    if not terms:
        # Guard: the averages below would raise ZeroDivisionError when the
        # two WordNets share no noun synsets.
        print("No bilingual synsets found; skipping stats and samples.")
        return

    # Summary stats.
    # NOTE(review): labels say "English"/"Italian" even for other language
    # pairs -- kept to preserve output, but worth parameterizing.
    total_en_lemmas = sum(len(t["translations"][source_lang]) for t in terms)
    total_it_lemmas = sum(len(t["translations"][target_lang]) for t in terms)
    print("\nLemma counts:")
    print(
        f" English: {total_en_lemmas:,} total ({total_en_lemmas / len(terms):.1f} avg per synset)"
    )
    print(
        f" Italian: {total_it_lemmas:,} total ({total_it_lemmas / len(terms):.1f} avg per synset)"
    )

    # Sample output: a mid-list slice, falling back to the first few terms
    # when fewer than 1000 exist (the original slice printed nothing then).
    print("\n--- Sample terms ---")
    for t in terms[1000:1005] or terms[:5]:
        print(
            f"{t['synset_id']}: {t['translations'][source_lang]} -> {t['translations'][target_lang]}"
        )
# Script entry point: run the extraction with the default en -> it settings.
if __name__ == "__main__":
    extract_bilingual_nouns()