"""
scripts/extract-en-it-nouns.py

Extract ALL bilingual nouns from Open Multilingual Wordnet (OMW).
Output mirrors the terms table schema exactly — no filtering, no ranking.
Decks handle curation later.
"""

import json
import sys
from pathlib import Path

import wn


def _index_nouns_by_ili(wordnet: "wn.Wordnet") -> dict:
    """Map each noun synset's ILI to the synset; synsets without an ILI are skipped."""
    return {synset.ili: synset for synset in wordnet.synsets(pos="n") if synset.ili}


def extract_bilingual_nouns(
    source_lang: str = "en",
    target_lang: str = "it",
    output_path: str = "datafiles/en-it-nouns.json",
) -> None:
    """
    Extract all noun synsets present in both languages via ILI.

    Args:
        source_lang: Source language code (e.g., "en" for English)
        target_lang: Target language code (e.g., "it" for Italian)
        output_path: Where to write the seed JSON

    Side effects:
        Writes the seed JSON to ``output_path`` (creating parent directories)
        and prints progress/summary lines. Exits with status 1 if either
        wordnet cannot be loaded.
    """
    print(f"Loading WordNets: {source_lang=}, {target_lang=}")

    try:
        source_wn = wn.Wordnet(lang=source_lang)
        target_wn = wn.Wordnet(lang=target_lang)
    except wn.Error as e:
        print(f"Error loading WordNet: {e}")
        print(f"Run: wn download omw-{target_lang}:1.4 oewn:2024")
        sys.exit(1)

    # Index nouns by ILI (Inter-Lingual Index) so the two wordnets can be joined.
    source_by_ili = _index_nouns_by_ili(source_wn)
    target_by_ili = _index_nouns_by_ili(target_wn)

    # Bilingual synsets are those whose ILI appears in both languages.
    # dict views support set intersection directly — no intermediate sets needed.
    common_ilis = source_by_ili.keys() & target_by_ili.keys()
    print(f"Found {len(common_ilis):,} bilingual noun synsets")

    # Build seed data matching the terms table schema exactly.
    terms: list[dict] = []

    # NOTE(review): the sort key assumes str-like ILIs of the form "i12345";
    # confirm `synset.ili` is a string (not an ILI object) in the installed
    # `wn` release before relying on numeric ordering.
    for ili in sorted(common_ilis, key=lambda x: int(x[1:])):
        en_syn = source_by_ili[ili]
        it_syn = target_by_ili[ili]

        # All lemmas (synonyms) for each language.
        en_lemmas = [str(lemma) for lemma in en_syn.lemmas()]
        it_lemmas = [str(lemma) for lemma in it_syn.lemmas()]

        terms.append(
            {
                "synset_id": f"ili:{ili}",  # e.g., "ili:i12345"
                "pos": "noun",
                "translations": {source_lang: en_lemmas, target_lang: it_lemmas},
                # Note: id, created_at added by seed.ts during insert
            }
        )

    # Ensure output directory exists.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Write JSON (ensure_ascii=False keeps Italian accented characters readable).
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(terms, f, indent=2, ensure_ascii=False)

    print(f"Wrote {len(terms):,} terms to {output_path}")

    # Summary stats — bail out early if nothing was found so the averages
    # below cannot raise ZeroDivisionError (bug fix: original divided by
    # len(terms) unconditionally).
    if not terms:
        print("No bilingual synsets found; skipping stats and samples.")
        return

    total_en_lemmas = sum(len(t["translations"][source_lang]) for t in terms)
    total_it_lemmas = sum(len(t["translations"][target_lang]) for t in terms)

    print("\nLemma counts:")
    print(
        f"  English: {total_en_lemmas:,} total ({total_en_lemmas / len(terms):.1f} avg per synset)"
    )
    print(
        f"  Italian: {total_it_lemmas:,} total ({total_it_lemmas / len(terms):.1f} avg per synset)"
    )

    # Sample output (an out-of-range slice is simply empty, never an error).
    print("\n--- Sample terms ---")
    for t in terms[1000:1005]:
        print(
            f"{t['synset_id']}: {t['translations'][source_lang]} -> {t['translations'][target_lang]}"
        )


if __name__ == "__main__":
    extract_bilingual_nouns()