diff --git a/.gitignore b/.gitignore index 03b6ab8..eab823d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ build/ .repomixignore repomix.config.json repomix/ +venv/ +__pycache__/ +*.pyc diff --git a/documentation/phase-1/task-1.md b/documentation/phase-1/task-1.md new file mode 100644 index 0000000..350ee2d --- /dev/null +++ b/documentation/phase-1/task-1.md @@ -0,0 +1,301 @@ +# Task: Run `scripts/extract_omw.py` locally → generates `packages/db/src/seed.json` + +**Goal**: Produce a committed, validated JSON file containing 1000 English-Italian noun pairs ranked by frequency. +**Done when**: `packages/db/src/seed.json` exists in version control with exactly 1000 entries, all validated. + +--- + +## Step 1: Python Environment Setup + +**Prerequisites**: Python 3.11+ installed locally (not in Docker) + +- [x] Create `scripts/requirements.txt`: + +nltk>=3.8 + +- [x] add to `.gitignore`: + +venv/ +pycache/ +*.pyc + +- [x] Create virtual environment: + +```bash +cd scripts +python -m venv venv +source venv/bin/activate +``` + +- [x] Install dependencies: + +```bash +pip install -r requirements.txt +``` + +- [x] Create scripts/download_data.py to fetch NLTK corpora: + +```Python +import nltk + +def main(): + print("Downloading WordNet...") + nltk.download("wordnet") + + print("Downloading OMW 1.4...") + nltk.download("omw-1.4") + + print("Downloading WordNet IC...") + nltk.download("wordnet_ic") + + print("Done.") + +if __name__ == "__main__": + main() +``` + +- [x] Run it once to cache data locally (~100MB in ~/nltk_data/): + +```bash + python download_data.py +``` + +- [x] Verify data downloaded: check ~/nltk_data/corpora/ exists with wordnet, omw, wordnet_ic folders + +## Step 2: Data Exploration (Throwaway Script) + +Before writing extraction logic, understand the data shape. 

    [ ] Create scripts/explore.py:
        Import nltk.corpus.wordnet as wn
        Import nltk.corpus.wordnet_ic as wnic
        Load semcor_ic = wnic.ic('ic-semcor.dat')
    [ ] Print sample synset:
    Python
    Copy

    dog = wn.synset('dog.n.01')
    print(f"Offset: {dog.offset():08d}")
    print(f"POS: {dog.pos()}")
    print(f"English lemmas: {dog.lemma_names()}")
    print(f"Italian lemmas: {dog.lemma_names(lang='ita')}")
    print(f"IC score (NOT a frequency — see checkpoint below): {dog.res_similarity(dog, semcor_ic)}")
    print(f"SemCor occurrence count: {sum(l.count() for l in dog.lemmas())}")

    [ ] Test 5 common words: dog, house, car, water, time
    [ ] Document findings:
    [ ] Synset ID format confirmed: {offset:08d}{pos} → 02084071n
    [ ] Italian availability: what percentage have translations?
    [ ] Multi-word handling: underscores in lemma_names()?
    [ ] Frequency scores: numeric range and distribution
    [ ] Test edge cases:
    [ ] Word with multiple synsets (homonyms): bank, run
    [ ] Word with multiple Italian translations per synset
    [ ] Word with no Italian translation
    [ ] Delete or keep explore.py (optional reference)

Decision checkpoint: Confirm frequency ranking strategy

    Option A: raw SemCor occurrence counts — sum of Lemma.count() over the synset's lemmas (NLTK has no synset.count(); counts live on lemmas). Higher = more frequent.
    Option B: res_similarity with SemCor IC — information content score. CAUTION: IC is -log p, so it is HIGHER for RARER synsets; sorting it descending ranks the rarest words first, the opposite of the goal. If Option B is chosen, sort ascending (or negate the score), and note that synsets missing IC data need a sentinel that sorts last.
    Document choice and rationale in comments

## Step 3: Extraction Script Implementation

Create scripts/extract_omw.py with the full pipeline. 

    [ ] Imports and setup:
    Python
    Copy

    import json
    from collections import defaultdict
    from nltk.corpus import wordnet as wn
    from nltk.corpus import wordnet_ic

    [ ] Load information content (only needed for IC exploration from Step 2; the ranking below uses raw SemCor counts — see the decision-checkpoint caveat):
    Python
    Copy

    semcor_ic = wordnet_ic.ic('ic-semcor.dat')

    [ ] Define target count and output path:
    Python
    Copy

    TARGET_COUNT = 1000
    OUTPUT_PATH = '../packages/db/src/seed.json'

    [ ] Iterate all noun synsets and collect candidates:
    Python
    Copy

    candidates = []
    for synset in wn.all_synsets(pos='n'):
        italian_lemmas = synset.lemma_names(lang='ita')
        if not italian_lemmas:
            continue

        english_lemmas = synset.lemma_names()

        # Calculate frequency score: total SemCor occurrence count across the
        # synset's lemmas (higher = more frequent).
        # Do NOT use res_similarity(synset, synset, semcor_ic) here: that
        # returns the information content (-log p), which is LARGER for RARER
        # synsets, so sorting it descending would select the rarest words.
        freq_score = sum(lemma.count() for lemma in synset.lemmas())

        candidates.append({
            'synset': synset,
            'offset': synset.offset(),
            'pos': synset.pos(),
            'freq_score': freq_score,
            'english': english_lemmas,
            'italian': italian_lemmas,
        })

    [ ] Sort by frequency descending (Python's sort is stable, so ties — including the many synsets with count 0 — keep WordNet iteration order; add a secondary key such as offset if fully deterministic output is required):
    Python
    Copy

    candidates.sort(key=lambda x: x['freq_score'], reverse=True)

    [ ] Slice top 1000:
    Python
    Copy

    top_1000 = candidates[:TARGET_COUNT]

    [ ] Build output structure matching schema needs:
    Python
    Copy

    seed_data = []
    for rank, candidate in enumerate(top_1000, start=1):
        # Normalize multi-word expressions: replace underscores with spaces
        english_normalized = [w.replace('_', ' ') for w in candidate['english']]
        italian_normalized = [w.replace('_', ' ') for w in candidate['italian']]

        seed_data.append({
            'synset_id': f"wn:{candidate['offset']:08d}{candidate['pos']}",
            'pos': 'noun',  # Map 'n' to full word for schema
            'frequency_rank': rank,
            'english_lemmas': english_normalized,
            'italian_lemmas': italian_normalized,
        })

    [ ] Write formatted JSON:
    Python
    Copy

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(seed_data, f, 
indent=2, ensure_ascii=False) + + print(f"Generated {len(seed_data)} entries at {OUTPUT_PATH}") + +Edge cases to handle: + + [ ] Skip synsets with empty Italian list (already filtered) + [ ] Handle res_similarity exceptions (some synsets lack IC data) + [ ] Normalize underscores to spaces in all lemmas + [ ] Ensure UTF-8 encoding for Italian characters (à, è, ì, ò, ù) + +## Step 4: Validation Script +Create scripts/validate_seed.py to verify output quality. + + [ ] Load and parse JSON: + Python + Copy + + import json + from pathlib import Path + + SEED_PATH = '../packages/db/src/seed.json' + + with open(SEED_PATH, 'r', encoding='utf-8') as f: + data = json.load(f) + + [ ] Run validation checks: + [ ] Count check: len(data) == 1000 + [ ] Rank check: all entries have frequency_rank from 1 to 1000, no gaps, no duplicates + [ ] Synset ID format: matches regex ^wn:\d{8}[nvar]$ (noun, verb, adjective, adverb) + [ ] POS check: all are noun (since we filtered pos='n') + [ ] Italian presence: every entry has italian_lemmas with at least 1 item + [ ] English presence: every entry has english_lemmas with at least 1 item + [ ] No duplicate synset IDs + [ ] No empty strings in lemma arrays + [ ] No leading/trailing whitespace in lemmas + [ ] Print summary statistics: + Total entries + Average English lemmas per entry + Average Italian lemmas per entry + Sample entries (ranks 1, 500, 1000) for manual inspection + [ ] Exit with error code if any check fails + +## Step 5: Execution and Iteration + + [ ] Run extraction: + bash + Copy + + cd scripts + source venv/bin/activate + python extract_omw.py + + [ ] Run validation: + bash + Copy + + python validate_seed.py + + [ ] If validation fails, fix extract_omw.py and re-run: + [ ] Too few entries? Relax filters or reduce target count + [ ] Data quality issues? Add normalization logic + [ ] Format mismatches? 
Adjust output structure + [ ] Manual sanity check: open seed.json, read first 10 and last 10 entries + Do translations make sense? + Are frequencies plausible (common words first)? + +## Step 6: Git Integration + + [ ] Verify file location: packages/db/src/seed.json + [ ] Check file size: should be ~200-500KB (if larger, investigate) + [ ] Stage the file: + bash + Copy + + git add packages/db/src/seed.json + + [ ] Commit with descriptive message: + plain + Copy + + feat(data): add seed.json with 1000 English-Italian noun pairs + + Generated from WordNet 3.0 + OMW 1.4 using SemCor IC frequency ranking. + Top entry: dog/cane (rank 1). Bottom entry: [word] (rank 1000). + + [ ] Push to current feature branch + +## Step 7: Documentation Update + + [ ] Update documentation/decisions.md with: + [ ] Frequency ranking method chosen (SemCor IC vs count) and why + [ ] How multi-word expressions are handled (underscore → space) + [ ] OMW data quality notes (coverage percentage, any manual fixes) + [ ] Seed file structure (for future maintainers) + +Definition of Done + + [ ] scripts/extract_omw.py exists and runs without errors + [ ] scripts/validate_seed.py passes all checks + [ ] packages/db/src/seed.json committed to git with exactly 1000 entries + [ ] Manual sample check confirms sensible translations + [ ] decisions.md updated with extraction methodology + [ ] Virtual environment and Python cache files are gitignored + +Out of Scope (for this task) + + Distractor generation (happens in API layer later) + Additional parts of speech (verbs, adjectives) + Data updates/re-seeding strategy (MVP assumes static seed) + Database insertion (next task: seed.ts) diff --git a/mise.toml b/mise.toml index 8c1f295..ac4346a 100644 --- a/mise.toml +++ b/mise.toml @@ -1,2 +1,3 @@ [tools] node = "24.14.0" +python = "latest" diff --git a/scripts/download_data.py b/scripts/download_data.py new file mode 100644 index 0000000..eb4f54b --- /dev/null +++ b/scripts/download_data.py @@ -0,0 +1,18 
@@ +import nltk + + +def main(): + print("Downloading WordNet...") + nltk.download("wordnet") + + print("Downloading OMW 1.4...") + nltk.download("omw-1.4") + + print("Downloading WordNet IC...") + nltk.download("wordnet_ic") + + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..4317136 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1 @@ +nltk>=3.8