#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-cefrj-csv.py

Extracts CEFR data from cefrj.csv (CEFR-J vocabulary profile).
Filters for supported POS (noun, verb).

Input:  scripts/data-sources/english/cefrj.csv
Output: scripts/data-sources/english/cefrj-extracted.json

Output format (normalized):
[
  { "word": "ability", "pos": "noun", "cefr": "A2", "source": "cefrj" }
]
"""

import csv
import json
from collections import Counter
from pathlib import Path
from typing import Optional

# Constants matching @lila/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/cefrj.csv")
OUTPUT_FILE = Path("scripts/data-sources/english/cefrj-extracted.json")


def _field(row: dict, key: str) -> str:
    """Return the whitespace-stripped value of *key* in *row*, or "" if absent.

    csv.DictReader stores None (its default ``restval``) for cells missing
    from a short row, so ``row.get(key, "")`` alone can still yield None.
    """
    return (row.get(key) or "").strip()


def extract(
    input_file: Optional[Path] = None,
    output_file: Optional[Path] = None,
) -> None:
    """Extract noun/verb entries with a valid CEFR level from the CSV.

    Reads the ``pos``, ``CEFR`` and ``headword`` columns of each row, keeps
    rows whose POS is in SUPPORTED_POS, whose level is in CEFR_LEVELS and
    whose headword is non-empty, writes them as a JSON array, and prints
    summary statistics to stdout.

    Args:
        input_file: CSV file to read. Defaults to INPUT_FILE.
        output_file: JSON file to write. Defaults to OUTPUT_FILE.

    Raises:
        OSError: if the input cannot be opened or the output cannot be
            written (propagated from ``open``).
    """
    source = INPUT_FILE if input_file is None else input_file
    target = OUTPUT_FILE if output_file is None else output_file

    print(f"Reading: {source}")

    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0

    # utf-8-sig transparently drops a leading BOM (otherwise the first header
    # name would be "\ufeff..."); newline="" is what the csv module requires
    # so that newlines embedded in quoted fields are parsed correctly.
    with open(source, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            total_rows += 1

            # Filter: must have supported POS
            pos = _field(row, "pos").lower()
            if pos not in SUPPORTED_POS:
                skipped_pos += 1
                continue

            # Filter: must have valid CEFR level
            cefr = _field(row, "CEFR").upper()
            if cefr not in CEFR_LEVELS:
                skipped_invalid_cefr += 1
                continue

            # Normalize word
            word = _field(row, "headword").lower()
            if not word:
                skipped_empty_word += 1
                continue

            records.append(
                {"word": word, "pos": pos, "cefr": cefr, "source": "cefrj"}
            )

    # Write output
    with open(target, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: one pass per dimension instead of one pass per value
    pos_counts = Counter(r["pos"] for r in records)
    cefr_counts = Counter(r["cefr"] for r in records)

    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")

    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        # Only levels that actually occur are listed, in CEFR order.
        if cefr_counts[level] > 0:
            print(f" - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")

    print(f"\nOutput: {target}")


if __name__ == "__main__":
    extract()