refactor: migrate to deck-based vocabulary curation
Database Schema:
- Add decks table for curated word lists (A1, Most Common, etc.)
- Add deck_terms join table with position ordering
- Link rooms to decks via rooms.deck_id FK
- Remove frequency_rank from terms (now deck-scoped)
- Change users.id to uuid, add openauth_sub for auth mapping
- Add room_players.left_at for disconnect tracking
- Add rooms.updated_at for stale room recovery
- Add CHECK constraints for data integrity (pos, status, etc.)

Extraction Script:
- Rewrite extract.py to mirror complete OMW dataset
- Extract all 25,204 bilingual noun synsets (en-it)
- Remove frequency filtering and block lists
- Output all lemmas per synset for full synonym support
- Seed data now uncurated; decks handle selection

Architecture:
- Separate concerns: raw OMW data in DB, curation in decks
- Enables user-created decks and multiple difficulty levels
- Rooms select vocabulary by choosing a deck
This commit is contained in:
parent
e9e750da3e
commit
be7a7903c5
9 changed files with 349148 additions and 492 deletions
348711
scripts/datafiles/en-it-nouns.json
Normal file
348711
scripts/datafiles/en-it-nouns.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@@ -1,18 +0,0 @@
|
|||
import nltk


def main() -> None:
    """Download the NLTK corpora required for extraction.

    Fetches WordNet, the Open Multilingual Wordnet 1.4, and the
    WordNet information-content files, announcing each step.
    """
    downloads = (
        ("WordNet", "wordnet"),
        ("OMW 1.4", "omw-1.4"),
        ("WordNet IC", "wordnet_ic"),
    )
    for label, resource in downloads:
        print(f"Downloading {label}...")
        nltk.download(resource)
    print("Done.")


if __name__ == "__main__":
    main()
|
||||
105
scripts/extract-en-it-nouns.py
Normal file
105
scripts/extract-en-it-nouns.py
Normal file
|
|
@@ -0,0 +1,105 @@
|
|||
"""
|
||||
scripts/extract-en-it-nouns.py
|
||||
|
||||
Extract ALL bilingual nouns from Open Multilingual Wordnet (OMW).
|
||||
Output mirrors the terms table schema exactly — no filtering, no ranking.
|
||||
Decks handle curation later.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import wn
|
||||
|
||||
|
||||
def extract_bilingual_nouns(
    source_lang: str = "en",
    target_lang: str = "it",
    output_path: str = "datafiles/en-it-nouns.json",
) -> None:
    """
    Extract all noun synsets present in both languages via ILI.

    Output mirrors the terms table schema exactly — no filtering, no
    ranking. Decks handle curation later.

    Args:
        source_lang: Source language code (e.g., "en" for English)
        target_lang: Target language code (e.g., "it" for Italian)
        output_path: Where to write the seed JSON

    Exits with status 1 when either WordNet cannot be loaded locally.
    """
    print(f"Loading WordNets: {source_lang=}, {target_lang=}")

    try:
        source_wn = wn.Wordnet(lang=source_lang)
        target_wn = wn.Wordnet(lang=target_lang)
    except wn.Error as e:
        print(f"Error loading WordNet: {e}")
        print(f"Run: wn download omw-{target_lang}:1.4 oewn:2024")
        sys.exit(1)

    def _index_nouns_by_ili(wordnet: wn.Wordnet) -> dict[str, wn.Synset]:
        # Key each noun synset by its Inter-Lingual Index so the two
        # languages can be joined on a shared identifier. Synsets without
        # an ILI cannot be aligned cross-lingually and are skipped.
        return {s.ili: s for s in wordnet.synsets(pos="n") if s.ili}

    source_by_ili = _index_nouns_by_ili(source_wn)
    target_by_ili = _index_nouns_by_ili(target_wn)

    # Find bilingual synsets (present in both languages)
    common_ilis = set(source_by_ili.keys()) & set(target_by_ili.keys())
    print(f"Found {len(common_ilis):,} bilingual noun synsets")

    # Build seed data matching schema exactly
    terms: list[dict] = []

    # Sort numerically on the ILI id (assumes ids look like "i12345" —
    # holds for OMW's ILI keys) for deterministic output across runs.
    for ili in sorted(common_ilis, key=lambda ili_id: int(ili_id[1:])):
        en_syn = source_by_ili[ili]
        it_syn = target_by_ili[ili]

        # All lemmas (synonyms) for each language
        en_lemmas = [str(lemma) for lemma in en_syn.lemmas()]
        it_lemmas = [str(lemma) for lemma in it_syn.lemmas()]

        terms.append(
            {
                "synset_id": f"ili:{ili}",  # e.g., "ili:i12345"
                "pos": "noun",
                "translations": {source_lang: en_lemmas, target_lang: it_lemmas},
                # Note: id, created_at added by seed.ts during insert
            }
        )

    # Ensure output directory exists
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Write JSON (ensure_ascii=False keeps Italian accents readable)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(terms, f, indent=2, ensure_ascii=False)

    print(f"Wrote {len(terms):,} terms to {output_path}")

    if not terms:
        # Nothing matched: the (empty) file is still written above, but
        # the per-synset averages below would divide by zero.
        return

    # Summary stats
    total_en_lemmas = sum(len(t["translations"][source_lang]) for t in terms)
    total_it_lemmas = sum(len(t["translations"][target_lang]) for t in terms)

    print("\nLemma counts:")
    print(
        f" English: {total_en_lemmas:,} total ({total_en_lemmas / len(terms):.1f} avg per synset)"
    )
    print(
        f" Italian: {total_it_lemmas:,} total ({total_it_lemmas / len(terms):.1f} avg per synset)"
    )

    # Sample output — a mid-list slice; safely empty if fewer terms exist
    print("\n--- Sample terms ---")
    for t in terms[1000:1005]:
        print(
            f"{t['synset_id']}: {t['translations'][source_lang]} -> {t['translations'][target_lang]}"
        )
|
||||
|
||||
|
||||
# Run the extraction with the default en->it configuration when
# invoked as a script (rather than imported).
if __name__ == "__main__":
    extract_bilingual_nouns()
|
||||
|
|
@@ -1 +1 @@
|
|||
nltk>=3.8
|
||||
wn==1.1.0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue