# lila/scripts/extraction-scripts/italian/extract-it_m3.py

#!/usr/bin/env python3
"""
scripts/extraction-scripts/italian/extract-it_m3.py

Extracts CEFR data from it_m3.xls (Italian M3 wordlist).
"""

import json
from pathlib import Path

import xlrd

# Shared vocabulary – mirrors the constants exposed by @glossa/shared.
SUPPORTED_POS = [
    "noun",
    "verb",
]
CEFR_LEVELS = [
    "A1",
    "A2",
    "B1",
    "B2",
    "C1",
    "C2",
]

# Spreadsheet POS abbreviation -> canonical POS name. Lookups are done on the
# lower-cased cell value; the abbreviations are the ones observed in the data.
POS_MAP = dict(
    n="noun",  # nome
    v="verb",  # verbo
)

# 0-based column positions (verified from a sample of the sheet):
# Lemma, Pos, Points (CEFR level).
WORD_COL, POS_COL, CEFR_COL = range(3)

# Input/output locations, relative to the project root.
_DATA_DIR = Path("scripts") / "data-sources" / "italian"
INPUT_FILE = _DATA_DIR / "it_m3.xls"
OUTPUT_FILE = _DATA_DIR / "it_m3-extracted.json"


def _clean_cell(value) -> str:
    """Return a raw cell value as stripped text.

    Falsy cell values (empty string, ``None``, ``0``) are mapped to ``""``.
    """
    return str(value).strip() if value else ""


def _normalize_row(word_raw, pos_raw, cefr_raw):
    """Convert one raw spreadsheet row into an output record.

    Args:
        word_raw: Raw value of the lemma column.
        pos_raw: Raw value of the POS column.
        cefr_raw: Raw value of the CEFR column.

    Returns:
        A ``(record, skip_reason)`` pair. For a usable row ``record`` is the
        output dict and ``skip_reason`` is ``None``. For a rejected row
        ``record`` is ``None`` and ``skip_reason`` is ``"pos"``, ``"cefr"`` or
        ``"word"``. Checks run in that order, so a row failing several checks
        is reported under the first one.
    """
    # POS abbreviations are matched case-insensitively.
    pos_key = _clean_cell(pos_raw).lower()
    if pos_key not in POS_MAP:
        return None, "pos"

    # CEFR values may be wrapped in Unicode smart quotes. Strip whitespace
    # again after removing them so that padding inside the quotes does not
    # make an otherwise valid level look invalid.
    cefr = _clean_cell(cefr_raw).strip("\u201c\u201d").strip().upper()
    if cefr not in CEFR_LEVELS:
        return None, "cefr"

    # Multi-form entries such as "il, lo, la" are deliberately kept as one
    # lower-cased string; they are not split into separate variants.
    word = _clean_cell(word_raw).lower()
    if not word:
        return None, "word"

    record = {"word": word, "pos": POS_MAP[pos_key], "cefr": cefr, "source": "it_m3"}
    return record, None


def extract() -> None:
    """Extract noun/verb CEFR records from the first sheet of ``INPUT_FILE``.

    Reads every row after the header, keeps rows whose POS is in ``POS_MAP``,
    whose CEFR value is in ``CEFR_LEVELS`` and whose word is non-empty, writes
    the records to ``OUTPUT_FILE`` as UTF-8 JSON, and prints summary
    statistics to stdout.
    """
    print(f"Reading: {INPUT_FILE}")

    records = []
    skipped = {"pos": 0, "cefr": 0, "word": 0}

    wb = xlrd.open_workbook(INPUT_FILE)
    ws = wb.sheet_by_index(0)

    # Row 0 is the header; every following row counts as a data row.
    total_rows = max(ws.nrows - 1, 0)
    for row_idx in range(1, ws.nrows):
        record, skip_reason = _normalize_row(
            ws.cell_value(row_idx, WORD_COL),
            ws.cell_value(row_idx, POS_COL),
            ws.cell_value(row_idx, CEFR_COL),
        )
        if record is None:
            skipped[skip_reason] += 1
        else:
            records.append(record)

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats – tallied in a single pass over the extracted records.
    pos_counts = {}
    cefr_counts = {}
    for rec in records:
        pos_counts[rec["pos"]] = pos_counts.get(rec["pos"], 0) + 1
        cefr_counts[rec["cefr"]] = cefr_counts.get(rec["cefr"], 0) + 1

    noun_count = pos_counts.get("noun", 0)
    verb_count = pos_counts.get("verb", 0)

    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f"  - Nouns: {noun_count}")
    print(f"  - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    # Report levels in canonical order, omitting levels with no records.
    for level in CEFR_LEVELS:
        if level in cefr_counts:
            print(f"  - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f"  - Unsupported POS: {skipped['pos']}")
    print(f"  - Invalid CEFR: {skipped['cefr']}")
    print(f"  - Empty word: {skipped['word']}")
    print(f"\nOutput: {OUTPUT_FILE}")


# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    extract()