# lila/scripts/extraction-scripts/english/extract-cefrj-csv.py

#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-cefrj-csv.py

Extracts CEFR data from cefrj.csv (CEFR-J vocabulary profile).
Filters for supported POS (noun, verb).

Input:  scripts/data-sources/english/cefrj.csv
Output: scripts/data-sources/english/cefrj-extracted.json

Output format (normalized):
[
  { "word": "ability", "pos": "noun", "cefr": "A2", "source": "cefrj" }
]
"""

import csv
import json
from collections import Counter
from pathlib import Path

# Constants matching @glossa/shared
# Parts of speech that are kept; rows with any other POS are skipped.
SUPPORTED_POS = [
    "noun",
    "verb",
]
# Accepted CEFR levels, listed from lowest to highest.
CEFR_LEVELS = [
    "A1",
    "A2",
    "B1",
    "B2",
    "C1",
    "C2",
]

# Paths (relative to project root)
# Source CSV that is read.
INPUT_FILE = Path("scripts/data-sources/english/cefrj.csv")
# Normalized JSON that is written.
OUTPUT_FILE = Path("scripts/data-sources/english/cefrj-extracted.json")


def _cell(row, column):
    """Return the stripped text of *column* in *row*, or "" when it is absent.

    csv.DictReader fills columns missing from a short row with None, so
    ``row.get(column, "")`` can still return None; ``or ""`` covers both the
    missing-key and the None-value case.
    """
    return (row.get(column) or "").strip()


def extract() -> None:
    """Extract supported-POS CEFR entries from the CEFR-J CSV into JSON.

    Reads INPUT_FILE and keeps every row whose ``pos`` column is in
    SUPPORTED_POS, whose ``CEFR`` column is in CEFR_LEVELS and whose
    ``headword`` column is non-empty. Kept rows are written to OUTPUT_FILE as
    a JSON array of ``{"word", "pos", "cefr", "source"}`` objects, and summary
    statistics are printed to stdout.

    The three filters run in the order above, so a skipped row is counted
    only under the first check it fails.

    Raises:
        OSError: Propagated from ``open`` when INPUT_FILE cannot be read or
            OUTPUT_FILE cannot be written.
    """
    print(f"Reading: {INPUT_FILE}")

    records = []
    pos_counts = Counter()
    cefr_counts = Counter()
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0

    # utf-8-sig drops a leading BOM (otherwise the first header would become
    # "\ufeffheadword"); newline="" is what the csv module requires so that
    # newlines embedded in quoted fields are parsed correctly.
    with open(INPUT_FILE, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            total_rows += 1

            # Filter: must have supported POS
            pos = _cell(row, "pos").lower()
            if pos not in SUPPORTED_POS:
                skipped_pos += 1
                continue

            # Filter: must have valid CEFR level
            cefr = _cell(row, "CEFR").upper()
            if cefr not in CEFR_LEVELS:
                skipped_invalid_cefr += 1
                continue

            # Normalize word
            word = _cell(row, "headword").lower()
            if not word:
                skipped_empty_word += 1
                continue

            records.append(
                {"word": word, "pos": pos, "cefr": cefr, "source": "cefrj"}
            )
            # Tally while iterating instead of rescanning records afterwards.
            pos_counts[pos] += 1
            cefr_counts[cefr] += 1

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats
    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f"  - Nouns: {pos_counts['noun']}")
    print(f"  - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    # Iterate CEFR_LEVELS (not the counter) to keep level order; levels with
    # no records are omitted.
    for level in CEFR_LEVELS:
        if cefr_counts[level] > 0:
            print(f"  - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f"  - Unsupported POS: {skipped_pos}")
    print(f"  - Invalid CEFR: {skipped_invalid_cefr}")
    print(f"  - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")


# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    extract()