lila/scripts/extraction-scripts/italian/extract-it_m3.py
lila 3374bd8b20 feat(scripts): add Italian CEFR data pipeline
- Add extractors for Italian sources: it_m3.xls and italian.json
- Add comparison script (compare-italian.py) to report source overlaps and conflicts
- Add merge script (merge-italian-json.py) with priority order ['italian', 'it_m3']
- Output authoritative dataset to datafiles/italian-merged.json
- Update README to document both English and Italian pipelines
2026-04-08 18:32:03 +02:00

114 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
scripts/extraction-scripts/italian/extract-it_m3.py
Extracts CEFR data from it_m3.xls (Italian M3 wordlist).
"""
import json
from pathlib import Path
import xlrd
# --- Shared vocabulary (kept in sync with @glossa/shared) -------------------
SUPPORTED_POS = [
    "noun",
    "verb",
]
CEFR_LEVELS = [
    "A1", "A2",
    "B1", "B2",
    "C1", "C2",
]

# Source POS abbreviation -> canonical POS name. Lookups are done on the
# lowercased abbreviation, so the keys here are lowercase.
POS_MAP = {"n": "noun", "v": "verb"}  # n = nome, v = verbo

# --- Sheet layout: 0-based column positions, verified from a sample ---------
# Lemma, Pos, Points (the CEFR level), in that order.
WORD_COL, POS_COL, CEFR_COL = 0, 1, 2

# --- File locations, relative to the project root ---------------------------
INPUT_FILE = Path("scripts/data-sources/italian/it_m3.xls")
OUTPUT_FILE = Path("scripts/data-sources/italian/it_m3-extracted.json")
def extract() -> None:
    """Extract noun/verb CEFR records from the it_m3 workbook into JSON.

    Reads the first sheet of ``INPUT_FILE`` and treats row 0 as a header.
    Every following row is normalized and kept only when:

    * its POS abbreviation (case-insensitive) is a key of ``POS_MAP``,
    * its CEFR value (after trimming whitespace and Unicode smart quotes,
      then upper-casing) is in ``CEFR_LEVELS``, and
    * its lemma is non-empty after trimming.

    The checks run in that order, and a rejected row is counted under the
    first check it fails. Kept rows are written to ``OUTPUT_FILE`` as a JSON
    array of ``{"word", "pos", "cefr", "source"}`` objects (UTF-8, non-ASCII
    characters preserved), and a summary is printed to stdout.
    """
    print(f"Reading: {INPUT_FILE}")

    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    wb = xlrd.open_workbook(INPUT_FILE)
    ws = wb.sheet_by_index(0)

    # Row 0 is the header; everything after it is data.
    total_rows = max(ws.nrows - 1, 0)

    for row_idx in range(1, ws.nrows):
        word_raw = ws.cell_value(row_idx, WORD_COL)
        pos_raw = ws.cell_value(row_idx, POS_COL)
        cefr_raw = ws.cell_value(row_idx, CEFR_COL)

        # POS: match the source abbreviation case-insensitively.
        pos_key = str(pos_raw).lower().strip() if pos_raw else ""
        if pos_key not in POS_MAP:
            skipped_pos += 1
            continue
        pos = POS_MAP[pos_key]

        # CEFR: trim whitespace, drop surrounding Unicode smart quotes, then
        # trim again so padding *inside* the quotes (e.g. "\u201cA1 \u201d")
        # does not cause a valid level to be rejected.
        cefr_str = str(cefr_raw).strip() if cefr_raw else ""
        cefr = cefr_str.strip("\u201c\u201d").strip().upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Word: multi-form lemmas such as "il, lo, la" are deliberately kept
        # as the full string (lowercased) rather than split on commas, so no
        # variant is lost at this stage.
        word = str(word_raw).strip().lower() if word_raw else ""
        if not word:
            skipped_empty_word += 1
            continue

        records.append(
            {"word": word, "pos": pos, "cefr": cefr, "source": "it_m3"}
        )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: tally POS and CEFR in a single pass over the kept records.
    pos_counts = {}
    cefr_counts = {}
    for record in records:
        pos_counts[record["pos"]] = pos_counts.get(record["pos"], 0) + 1
        cefr_counts[record["cefr"]] = cefr_counts.get(record["cefr"], 0) + 1

    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {pos_counts.get('noun', 0)}")
    print(f" - Verbs: {pos_counts.get('verb', 0)}")

    print("\nCEFR distribution:")
    # Report in canonical level order, omitting levels with no records.
    for level in CEFR_LEVELS:
        if level in cefr_counts:
            print(f" - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    extract()