""" scripts/extract-en-it-nouns.py Extract ALL bilingual nouns from Open Multilingual Wordnet (OMW). Output mirrors the terms table schema exactly — no filtering, no ranking. Decks handle curation later. """ import json import os import sys from pathlib import Path import wn def extract_bilingual_nouns( source_lang: str = "en", target_lang: str = "it", output_path: str = "datafiles/en-it-nouns.json", ) -> None: """ Extract all noun synsets present in both languages via ILI. Args: source_lang: Source language code (e.g., "en" for English) target_lang: Target language code (e.g., "it" for Italian) output_path: Where to write the seed JSON """ print(f"Loading WordNets: {source_lang=}, {target_lang=}") try: source_wn = wn.Wordnet(lang=source_lang) target_wn = wn.Wordnet(lang=target_lang) except wn.Error as e: print(f"Error loading WordNet: {e}") print(f"Run: wn download omw-{target_lang}:1.4 oewn:2024") sys.exit(1) # Index nouns by ILI (Inter-Lingual Index) source_by_ili: dict[str, wn.Synset] = {} for synset in source_wn.synsets(pos="n"): if synset.ili: source_by_ili[synset.ili] = synset target_by_ili: dict[str, wn.Synset] = {} for synset in target_wn.synsets(pos="n"): if synset.ili: target_by_ili[synset.ili] = synset # Find bilingual synsets (present in both languages) common_ilis = set(source_by_ili.keys()) & set(target_by_ili.keys()) print(f"Found {len(common_ilis):,} bilingual noun synsets") # Build seed data matching schema exactly terms: list[dict] = [] for ili in sorted(common_ilis, key=lambda x: int(x[1:])): en_syn = source_by_ili[ili] it_syn = target_by_ili[ili] # All lemmas (synonyms) for each language en_lemmas = [str(lemma) for lemma in en_syn.lemmas()] it_lemmas = [str(lemma) for lemma in it_syn.lemmas()] term = { "synset_id": f"ili:{ili}", # e.g., "ili:i12345" "pos": "noun", "translations": {source_lang: en_lemmas, target_lang: it_lemmas}, # Note: id, created_at added by seed.ts during insert } terms.append(term) # Ensure output directory exists output_file = Path(output_path) output_file.parent.mkdir(parents=True, exist_ok=True) # Write JSON with open(output_file, "w", encoding="utf-8") as f: json.dump(terms, f, indent=2, ensure_ascii=False) print(f"Wrote {len(terms):,} terms to {output_path}") # Summary stats total_en_lemmas = sum(len(t["translations"][source_lang]) for t in terms) total_it_lemmas = sum(len(t["translations"][target_lang]) for t in terms) print(f"\nLemma counts:") print( f" English: {total_en_lemmas:,} total ({total_en_lemmas / len(terms):.1f} avg per synset)" ) print( f" Italian: {total_it_lemmas:,} total ({total_it_lemmas / len(terms):.1f} avg per synset)" ) # Sample output print(f"\n--- Sample terms ---") for t in terms[1000:1005]: print( f"{t['synset_id']}: {t['translations'][source_lang]} -> {t['translations'][target_lang]}" ) if __name__ == "__main__": extract_bilingual_nouns()