#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-en_m3.py

Extracts CEFR data from en_m3.xls (M3 wordlist).

Reads the first worksheet of the workbook, keeps the rows whose part of
speech and CEFR level are supported, and writes the normalized records to
a JSON file. A summary of extracted and skipped rows is printed to stdout.
"""

import json
from collections import Counter
from pathlib import Path

import xlrd

# Constants matching @lila/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# POS mapping (case-insensitive)
POS_MAP = {
    "noun": "noun",
    "verb": "verb",
}

# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/en_m3.xls")
OUTPUT_FILE = Path("scripts/data-sources/english/en_m3-extracted.json")

# Sheet columns: ID number, Word, Part of Speech, CEFR, Points
_WORD_COL = 1
_POS_COL = 2
_CEFR_COL = 3

# Unicode smart double quotes (U+201C and U+201D) seen around CEFR values
_SMART_QUOTES = "\u201c\u201d"


def _normalize_pos(pos_raw) -> str:
    """Return the canonical POS for a raw cell value, or "" if unsupported."""
    pos = str(pos_raw).lower().strip() if pos_raw else ""
    # Every value in POS_MAP is non-empty, so "" is a safe "not found" marker.
    return POS_MAP.get(pos, "")


def _normalize_cefr(cefr_raw) -> str:
    """Return the upper-cased CEFR text of a raw cell value (not validated)."""
    cefr_str = str(cefr_raw).strip() if cefr_raw else ""
    # Trim whitespace again after removing the quotes: a cell such as
    # "\u201cA1 \u201d" would otherwise keep its inner padding and be
    # rejected as an invalid level.
    return cefr_str.strip(_SMART_QUOTES).strip().upper()


def _normalize_word(word_raw) -> str:
    """Return the lower-cased, trimmed word of a raw cell value ("" if empty)."""
    return str(word_raw).lower().strip() if word_raw else ""


def extract() -> None:
    """Extract noun/verb CEFR records from INPUT_FILE into OUTPUT_FILE.

    Row 0 of the first sheet is treated as a header. Each following row is
    checked in order for: supported POS, valid CEFR level, non-empty word.
    The first failing check decides which "skipped" counter the row goes to.
    Accepted rows are written as a JSON list of objects with the keys
    ``word``, ``pos``, ``cefr`` and ``source``.

    Raises:
        Whatever ``xlrd.open_workbook`` or ``open`` raise when the input
        cannot be read or the output cannot be written; nothing is caught.
    """
    print(f"Reading: {INPUT_FILE}")

    workbook = xlrd.open_workbook(INPUT_FILE)
    sheet = workbook.sheet_by_index(0)

    records = []
    skipped = Counter()

    # Skip header row, start from row 1
    total_rows = max(sheet.nrows - 1, 0)
    for row_idx in range(1, sheet.nrows):
        pos = _normalize_pos(sheet.cell_value(row_idx, _POS_COL))
        if not pos:
            skipped["pos"] += 1
            continue

        cefr = _normalize_cefr(sheet.cell_value(row_idx, _CEFR_COL))
        if cefr not in CEFR_LEVELS:
            skipped["cefr"] += 1
            continue

        word = _normalize_word(sheet.cell_value(row_idx, _WORD_COL))
        if not word:
            skipped["word"] += 1
            continue

        records.append({"word": word, "pos": pos, "cefr": cefr, "source": "en_m3"})

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: one pass per dimension instead of one scan per POS / level
    pos_counts = Counter(record["pos"] for record in records)
    cefr_counts = Counter(record["cefr"] for record in records)

    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")

    print("\nCEFR distribution:")
    # Iterate CEFR_LEVELS (not the counter) to keep A1..C2 ordering and to
    # omit levels with no records.
    for level in CEFR_LEVELS:
        if cefr_counts[level] > 0:
            print(f" - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped['pos']}")
    print(f" - Invalid CEFR: {skipped['cefr']}")
    print(f" - Empty word: {skipped['word']}")

    print(f"\nOutput: {OUTPUT_FILE}")


if __name__ == "__main__":
    extract()