refactor: migrate to deck-based vocabulary curation

Database Schema:
- Add decks table for curated word lists (A1, Most Common, etc.)
- Add deck_terms join table with position ordering
- Link rooms to decks via rooms.deck_id FK
- Remove frequency_rank from terms (now deck-scoped)
- Change users.id to uuid, add openauth_sub for auth mapping
- Add room_players.left_at for disconnect tracking
- Add rooms.updated_at for stale room recovery
- Add CHECK constraints for data integrity (pos, status, etc.)

Extraction Script:
- Rewrite extract.py to mirror complete OMW dataset
- Extract all 25,204 bilingual noun synsets (en-it)
- Remove frequency filtering and block lists
- Output all lemmas per synset for full synonym support
- Seed data now uncurated; decks handle selection

Architecture:
- Separate concerns: raw OMW data in DB, curation in decks
- Enables user-created decks and multiple difficulty levels
- Rooms select vocabulary by choosing a deck
This commit is contained in:
lila 2026-03-27 16:53:26 +01:00
parent e9e750da3e
commit be7a7903c5
9 changed files with 349148 additions and 492 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,18 +0,0 @@
import nltk


def main():
    """Fetch the NLTK corpora this project needs (WordNet, OMW 1.4, WordNet IC)."""
    # (label for the progress message, nltk resource identifier)
    resources = [
        ("WordNet", "wordnet"),
        ("OMW 1.4", "omw-1.4"),
        ("WordNet IC", "wordnet_ic"),
    ]
    for label, resource in resources:
        print(f"Downloading {label}...")
        nltk.download(resource)
    print("Done.")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,105 @@
"""
scripts/extract-en-it-nouns.py
Extract ALL bilingual nouns from Open Multilingual Wordnet (OMW).
Output mirrors the terms table schema exactly — no filtering, no ranking.
Decks handle curation later.
"""
import json
import os
import sys
from pathlib import Path
import wn
def _index_nouns_by_ili(wordnet) -> dict:
    """Map ILI identifier -> noun synset for one wordnet.

    Synsets without an ILI cannot be aligned across languages and are skipped.
    If several synsets share an ILI, the last one seen wins (matches the
    original loop's behavior).
    """
    by_ili: dict = {}
    for synset in wordnet.synsets(pos="n"):
        if synset.ili:
            by_ili[synset.ili] = synset
    return by_ili


def extract_bilingual_nouns(
    source_lang: str = "en",
    target_lang: str = "it",
    output_path: str = "datafiles/en-it-nouns.json",
) -> None:
    """
    Extract all noun synsets present in both languages via ILI.

    Writes a JSON array of term objects mirroring the terms table schema
    (synset_id, pos, translations). No filtering or ranking is applied;
    deck curation happens downstream.

    Args:
        source_lang: Source language code (e.g., "en" for English)
        target_lang: Target language code (e.g., "it" for Italian)
        output_path: Where to write the seed JSON

    Exits with status 1 if either wordnet is not installed locally.
    """
    print(f"Loading WordNets: {source_lang=}, {target_lang=}")
    try:
        source_wn = wn.Wordnet(lang=source_lang)
        target_wn = wn.Wordnet(lang=target_lang)
    except wn.Error as e:
        print(f"Error loading WordNet: {e}")
        print(f"Run: wn download omw-{target_lang}:1.4 oewn:2024")
        sys.exit(1)

    # Index nouns by ILI (Inter-Lingual Index) for each language.
    source_by_ili = _index_nouns_by_ili(source_wn)
    target_by_ili = _index_nouns_by_ili(target_wn)

    # Bilingual synsets are those present in both indexes.
    common_ilis = set(source_by_ili.keys()) & set(target_by_ili.keys())
    print(f"Found {len(common_ilis):,} bilingual noun synsets")

    # Build seed data matching the terms table schema exactly.
    # Sort numerically on the ILI id (assumes keys look like "i12345" —
    # a one-letter prefix followed by digits; TODO confirm against the
    # installed wn version's ILI representation).
    terms: list[dict] = []
    for ili in sorted(common_ilis, key=lambda x: int(x[1:])):
        en_syn = source_by_ili[ili]
        it_syn = target_by_ili[ili]
        # All lemmas (synonyms) for each language, for full synonym support.
        en_lemmas = [str(lemma) for lemma in en_syn.lemmas()]
        it_lemmas = [str(lemma) for lemma in it_syn.lemmas()]
        terms.append(
            {
                "synset_id": f"ili:{ili}",  # e.g., "ili:i12345"
                "pos": "noun",
                "translations": {source_lang: en_lemmas, target_lang: it_lemmas},
                # Note: id, created_at added by seed.ts during insert
            }
        )

    # Ensure output directory exists, then write the seed JSON.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(terms, f, indent=2, ensure_ascii=False)
    print(f"Wrote {len(terms):,} terms to {output_path}")

    if not terms:
        # Guard: the per-synset averages below divide by len(terms).
        print("No bilingual synsets found; skipping summary stats.")
        return

    # Summary stats.
    total_en_lemmas = sum(len(t["translations"][source_lang]) for t in terms)
    total_it_lemmas = sum(len(t["translations"][target_lang]) for t in terms)
    print("\nLemma counts:")
    print(
        f"  English: {total_en_lemmas:,} total ({total_en_lemmas / len(terms):.1f} avg per synset)"
    )
    print(
        f"  Italian: {total_it_lemmas:,} total ({total_it_lemmas / len(terms):.1f} avg per synset)"
    )

    # Sample output (empty slice — and no output — if fewer than 1001 terms).
    print("\n--- Sample terms ---")
    for t in terms[1000:1005]:
        print(
            f"{t['synset_id']}: {t['translations'][source_lang]} -> {t['translations'][target_lang]}"
        )


if __name__ == "__main__":
    extract_bilingual_nouns()

View file

@ -1 +1 @@
nltk>=3.8
wn==1.1.0