updating seeding pipeline

This commit is contained in:
lila 2026-04-05 19:29:47 +02:00
parent dfeb6a4cb0
commit e3c05b5596

View file

@ -1,105 +0,0 @@
"""
scripts/extract-en-it-nouns.py
Extract ALL bilingual nouns from Open Multilingual Wordnet (OMW).
Output mirrors the terms table schema exactly — no filtering, no ranking.
Decks handle curation later.
"""
import json
import os
import sys
from pathlib import Path
import wn
def extract_bilingual_nouns(
    source_lang: str = "en",
    target_lang: str = "it",
    output_path: str = "datafiles/en-it-nouns.json",
) -> None:
    """
    Extract all noun synsets present in both languages via ILI.

    Writes a JSON array of term dicts mirroring the terms table schema
    (synset_id / pos / translations). The id and created_at columns are
    added later by seed.ts during insert.

    Args:
        source_lang: Source language code (e.g., "en" for English)
        target_lang: Target language code (e.g., "it" for Italian)
        output_path: Where to write the seed JSON

    Raises:
        SystemExit: if the required WordNet data is not downloaded.
    """
    print(f"Loading WordNets: {source_lang=}, {target_lang=}")
    try:
        source_wn = wn.Wordnet(lang=source_lang)
        target_wn = wn.Wordnet(lang=target_lang)
    except wn.Error as e:
        print(f"Error loading WordNet: {e}")
        print(f"Run: wn download omw-{target_lang}:1.4 oewn:2024")
        sys.exit(1)

    def _index_nouns_by_ili(wordnet) -> dict:
        """Map ILI (Inter-Lingual Index) -> noun synset, skipping ILI-less synsets."""
        return {s.ili: s for s in wordnet.synsets(pos="n") if s.ili}

    source_by_ili = _index_nouns_by_ili(source_wn)
    target_by_ili = _index_nouns_by_ili(target_wn)

    # Bilingual synsets are those whose ILI appears in both languages.
    common_ilis = source_by_ili.keys() & target_by_ili.keys()
    print(f"Found {len(common_ilis):,} bilingual noun synsets")

    # Build seed data matching the schema exactly, in stable numeric ILI
    # order ("i12345" -> 12345). str() guards against wn versions where
    # synset.ili is an ILI object rather than a plain string -- the
    # original int(x[1:]) would raise on a non-subscriptable ILI object.
    terms: list[dict] = []
    for ili in sorted(common_ilis, key=lambda x: int(str(x)[1:])):
        src_syn = source_by_ili[ili]
        tgt_syn = target_by_ili[ili]
        terms.append(
            {
                "synset_id": f"ili:{ili}",  # e.g., "ili:i12345"
                "pos": "noun",
                # All lemmas (synonyms) for each language.
                "translations": {
                    source_lang: [str(lemma) for lemma in src_syn.lemmas()],
                    target_lang: [str(lemma) for lemma in tgt_syn.lemmas()],
                },
                # Note: id, created_at added by seed.ts during insert
            }
        )

    # Write JSON, creating the output directory if needed.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(terms, f, indent=2, ensure_ascii=False)
    print(f"Wrote {len(terms):,} terms to {output_path}")

    if not terms:
        # Guard: the averages below would raise ZeroDivisionError when the
        # two WordNets share no noun synsets.
        print("No bilingual synsets found; skipping stats and samples.")
        return

    # Summary stats.
    # NOTE(review): labels say "English"/"Italian" even for other language
    # pairs -- kept to preserve output, but worth parameterizing.
    total_en_lemmas = sum(len(t["translations"][source_lang]) for t in terms)
    total_it_lemmas = sum(len(t["translations"][target_lang]) for t in terms)
    print("\nLemma counts:")
    print(
        f" English: {total_en_lemmas:,} total ({total_en_lemmas / len(terms):.1f} avg per synset)"
    )
    print(
        f" Italian: {total_it_lemmas:,} total ({total_it_lemmas / len(terms):.1f} avg per synset)"
    )

    # Sample output: a mid-list slice, falling back to the first few terms
    # when fewer than 1000 exist (the original slice printed nothing then).
    print("\n--- Sample terms ---")
    for t in terms[1000:1005] or terms[:5]:
        print(
            f"{t['synset_id']}: {t['translations'][source_lang]} -> {t['translations'][target_lang]}"
        )
# Script entry point: run the extraction with the default en -> it settings.
if __name__ == "__main__":
    extract_bilingual_nouns()