refactor: migrate to deck-based vocabulary curation
Database Schema:
- Add decks table for curated word lists (A1, Most Common, etc.)
- Add deck_terms join table with position ordering
- Link rooms to decks via rooms.deck_id FK
- Remove frequency_rank from terms (now deck-scoped)
- Change users.id to uuid, add openauth_sub for auth mapping
- Add room_players.left_at for disconnect tracking
- Add rooms.updated_at for stale room recovery
- Add CHECK constraints for data integrity (pos, status, etc.)

Extraction Script:
- Rewrite extract.py to mirror complete OMW dataset
- Extract all 25,204 bilingual noun synsets (en-it)
- Remove frequency filtering and block lists
- Output all lemmas per synset for full synonym support
- Seed data now uncurated; decks handle selection

Architecture:
- Separate concerns: raw OMW data in DB, curation in decks
- Enables user-created decks and multiple difficulty levels
- Rooms select vocabulary by choosing a deck
This commit is contained in:
parent
e9e750da3e
commit
be7a7903c5
9 changed files with 349148 additions and 492 deletions
348711
scripts/datafiles/en-it-nouns.json
Normal file
348711
scripts/datafiles/en-it-nouns.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@@ -1,18 +0,0 @@
|
|||
import nltk


def main() -> None:
    """Download the NLTK corpora required for extraction.

    Fetches WordNet, the Open Multilingual Wordnet 1.4, and the
    WordNet information-content files, announcing each step.
    """
    downloads = (
        ("WordNet", "wordnet"),
        ("OMW 1.4", "omw-1.4"),
        ("WordNet IC", "wordnet_ic"),
    )
    for label, resource in downloads:
        print(f"Downloading {label}...")
        nltk.download(resource)
    print("Done.")


if __name__ == "__main__":
    main()
|
||||
105
scripts/extract-en-it-nouns.py
Normal file
105
scripts/extract-en-it-nouns.py
Normal file
|
|
@@ -0,0 +1,105 @@
|
|||
"""
|
||||
scripts/extract-en-it-nouns.py
|
||||
|
||||
Extract ALL bilingual nouns from Open Multilingual Wordnet (OMW).
|
||||
Output mirrors the terms table schema exactly — no filtering, no ranking.
|
||||
Decks handle curation later.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import wn
|
||||
|
||||
|
||||
def extract_bilingual_nouns(
    source_lang: str = "en",
    target_lang: str = "it",
    output_path: str = "datafiles/en-it-nouns.json",
) -> None:
    """
    Extract all noun synsets present in both languages via ILI.

    Output mirrors the terms table schema exactly — no filtering, no
    ranking. Decks handle curation later.

    Args:
        source_lang: Source language code (e.g., "en" for English)
        target_lang: Target language code (e.g., "it" for Italian)
        output_path: Where to write the seed JSON

    Exits with status 1 when either WordNet cannot be loaded locally.
    """
    print(f"Loading WordNets: {source_lang=}, {target_lang=}")

    try:
        source_wn = wn.Wordnet(lang=source_lang)
        target_wn = wn.Wordnet(lang=target_lang)
    except wn.Error as e:
        print(f"Error loading WordNet: {e}")
        print(f"Run: wn download omw-{target_lang}:1.4 oewn:2024")
        sys.exit(1)

    def _index_nouns_by_ili(wordnet: wn.Wordnet) -> dict[str, wn.Synset]:
        # Key each noun synset by its Inter-Lingual Index so the two
        # languages can be joined on a shared identifier. Synsets without
        # an ILI cannot be aligned cross-lingually and are skipped.
        return {s.ili: s for s in wordnet.synsets(pos="n") if s.ili}

    source_by_ili = _index_nouns_by_ili(source_wn)
    target_by_ili = _index_nouns_by_ili(target_wn)

    # Find bilingual synsets (present in both languages)
    common_ilis = set(source_by_ili.keys()) & set(target_by_ili.keys())
    print(f"Found {len(common_ilis):,} bilingual noun synsets")

    # Build seed data matching schema exactly
    terms: list[dict] = []

    # Sort numerically on the ILI id (assumes ids look like "i12345" —
    # holds for OMW's ILI keys) for deterministic output across runs.
    for ili in sorted(common_ilis, key=lambda ili_id: int(ili_id[1:])):
        en_syn = source_by_ili[ili]
        it_syn = target_by_ili[ili]

        # All lemmas (synonyms) for each language
        en_lemmas = [str(lemma) for lemma in en_syn.lemmas()]
        it_lemmas = [str(lemma) for lemma in it_syn.lemmas()]

        terms.append(
            {
                "synset_id": f"ili:{ili}",  # e.g., "ili:i12345"
                "pos": "noun",
                "translations": {source_lang: en_lemmas, target_lang: it_lemmas},
                # Note: id, created_at added by seed.ts during insert
            }
        )

    # Ensure output directory exists
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Write JSON (ensure_ascii=False keeps Italian accents readable)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(terms, f, indent=2, ensure_ascii=False)

    print(f"Wrote {len(terms):,} terms to {output_path}")

    if not terms:
        # Nothing matched: the (empty) file is still written above, but
        # the per-synset averages below would divide by zero.
        return

    # Summary stats
    total_en_lemmas = sum(len(t["translations"][source_lang]) for t in terms)
    total_it_lemmas = sum(len(t["translations"][target_lang]) for t in terms)

    print("\nLemma counts:")
    print(
        f" English: {total_en_lemmas:,} total ({total_en_lemmas / len(terms):.1f} avg per synset)"
    )
    print(
        f" Italian: {total_it_lemmas:,} total ({total_it_lemmas / len(terms):.1f} avg per synset)"
    )

    # Sample output — a mid-list slice; safely empty if fewer terms exist
    print("\n--- Sample terms ---")
    for t in terms[1000:1005]:
        print(
            f"{t['synset_id']}: {t['translations'][source_lang]} -> {t['translations'][target_lang]}"
        )
|
||||
|
||||
|
||||
# Run the extraction with the default en->it configuration when
# invoked as a script (rather than imported).
if __name__ == "__main__":
    extract_bilingual_nouns()
|
||||
|
|
@@ -1 +1 @@
|
|||
nltk>=3.8
|
||||
wn==1.1.0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue