updating seeding pipeline
This commit is contained in:
parent
dfeb6a4cb0
commit
e3c05b5596
1 changed files with 0 additions and 105 deletions
|
|
@ -1,105 +0,0 @@
|
||||||
"""
|
|
||||||
scripts/extract-en-it-nouns.py
|
|
||||||
|
|
||||||
Extract ALL bilingual nouns from Open Multilingual Wordnet (OMW).
|
|
||||||
Output mirrors the terms table schema exactly — no filtering, no ranking.
|
|
||||||
Decks handle curation later.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import wn
|
|
||||||
|
|
||||||
|
|
||||||
def extract_bilingual_nouns(
    source_lang: str = "en",
    target_lang: str = "it",
    output_path: str = "datafiles/en-it-nouns.json",
) -> None:
    """
    Extract all noun synsets present in both languages via ILI.

    Output mirrors the terms table schema exactly — no filtering, no
    ranking. Decks handle curation later.

    Args:
        source_lang: Source language code (e.g., "en" for English)
        target_lang: Target language code (e.g., "it" for Italian)
        output_path: Where to write the seed JSON

    Exits the process with status 1 if either WordNet cannot be loaded.
    """
    print(f"Loading WordNets: {source_lang=}, {target_lang=}")

    try:
        source_wn = wn.Wordnet(lang=source_lang)
        target_wn = wn.Wordnet(lang=target_lang)
    except wn.Error as e:
        print(f"Error loading WordNet: {e}")
        print(f"Run: wn download omw-{target_lang}:1.4 oewn:2024")
        sys.exit(1)

    # Index nouns by ILI (Inter-Lingual Index) — the language-neutral key
    # that lets us join synsets across the two wordnets.
    source_by_ili: dict[str, wn.Synset] = {
        synset.ili: synset
        for synset in source_wn.synsets(pos="n")
        if synset.ili
    }
    target_by_ili: dict[str, wn.Synset] = {
        synset.ili: synset
        for synset in target_wn.synsets(pos="n")
        if synset.ili
    }

    # Find bilingual synsets (present in both languages)
    common_ilis = source_by_ili.keys() & target_by_ili.keys()
    print(f"Found {len(common_ilis):,} bilingual noun synsets")

    # Build seed data matching schema exactly.
    # NOTE(review): the sort key assumes each ILI renders as "i<digits>";
    # confirm against the installed `wn` version (some releases expose an
    # ILI object rather than a plain str).
    terms: list[dict] = []

    for ili in sorted(common_ilis, key=lambda x: int(str(x)[1:])):
        en_syn = source_by_ili[ili]
        it_syn = target_by_ili[ili]

        # All lemmas (synonyms) for each language
        en_lemmas = [str(lemma) for lemma in en_syn.lemmas()]
        it_lemmas = [str(lemma) for lemma in it_syn.lemmas()]

        term = {
            "synset_id": f"ili:{ili}",  # e.g., "ili:i12345"
            "pos": "noun",
            "translations": {source_lang: en_lemmas, target_lang: it_lemmas},
            # Note: id, created_at added by seed.ts during insert
        }
        terms.append(term)

    # Ensure output directory exists
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Write JSON
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(terms, f, indent=2, ensure_ascii=False)

    print(f"Wrote {len(terms):,} terms to {output_path}")

    # Fix: the original divided by len(terms) unconditionally, raising
    # ZeroDivisionError when no bilingual synsets were found.
    if not terms:
        print("No bilingual synsets found; nothing to summarize.")
        return

    # Summary stats
    total_en_lemmas = sum(len(t["translations"][source_lang]) for t in terms)
    total_it_lemmas = sum(len(t["translations"][target_lang]) for t in terms)

    print("\nLemma counts:")  # fix: dropped f-prefix on placeholder-free string
    print(
        f" English: {total_en_lemmas:,} total ({total_en_lemmas / len(terms):.1f} avg per synset)"
    )
    print(
        f" Italian: {total_it_lemmas:,} total ({total_it_lemmas / len(terms):.1f} avg per synset)"
    )

    # Sample output — fall back to the first few terms when fewer than ~1000
    # were extracted (fix: original slice could silently print nothing).
    print("\n--- Sample terms ---")
    for t in terms[1000:1005] or terms[:5]:
        print(
            f"{t['synset_id']}: {t['translations'][source_lang]} -> {t['translations'][target_lang]}"
        )
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: run the extraction with the default en -> it settings
# and the default output path.
if __name__ == "__main__":
    extract_bilingual_nouns()
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue