feat(scripts): add Italian CEFR data pipeline

- Add extractors for Italian sources: it_m3.xls and italian.json
- Add comparison script (compare-italian.py) to report source overlaps and conflicts
- Add merge script (merge-italian-json.py) with priority order ['italian', 'it_m3']
- Output authoritative dataset to datafiles/italian-merged.json
- Update README to document both English and Italian pipelines
2026-04-08 18:32:03 +02:00 · 2026-04-08 18:32:03 +02:00 · 3374bd8b20
commit 3374bd8b20
parent 59152950d6
9 changed files with 208535 additions and 26 deletions
--- a/scripts/extraction-scripts/english/extract-en_m3.py
+++ b/scripts/extraction-scripts/english/extract-en_m3.py
@ -91,12 +91,12 @@ def extract() -> None:
    print(f"Extracted: {len(records)} records")
    print(f"  - Nouns: {noun_count}")
    print(f"  - Verbs: {verb_count}")
-    print(f"\nCEFR distribution:")
+    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if level in cefr_distribution:
            print(f"  - {level}: {cefr_distribution[level]}")

-    print(f"\nSkipped:")
+    print("\nSkipped:")
    print(f"  - Unsupported POS: {skipped_pos}")
    print(f"  - Invalid CEFR: {skipped_invalid_cefr}")
    print(f"  - Empty word: {skipped_empty_word}")
--- a/scripts/extraction-scripts/italian/extract-it_m3.py
+++ b/scripts/extraction-scripts/italian/extract-it_m3.py
@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""
+scripts/extraction-scripts/italian/extract-it_m3.py
+
+Extracts CEFR data from it_m3.xls (Italian M3 wordlist).
+"""
+
+import json
+from pathlib import Path
+
+import xlrd
+
+# Constants matching @glossa/shared
+# NOTE: SUPPORTED_POS is not referenced anywhere else in this script; POS
+# filtering goes through POS_MAP below, whose values are the same two tags.
+SUPPORTED_POS = ["noun", "verb"]
+# Accepted CEFR levels; also the order used when printing the distribution.
+CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
+
+# POS mapping (case-insensitive) – based on observed abbreviations
+# Keys are the lowercased, stripped cell values; rows with any other POS
+# value are skipped and counted as "Unsupported POS".
+POS_MAP = {
+    "n": "noun",  # nome
+    "v": "verb",  # verbo
+}
+
+# Column indices (0-based) – verified from sample
+# The trailing comments give the column headers of the sheet.
+WORD_COL = 0  # Lemma
+POS_COL = 1  # Pos
+CEFR_COL = 2  # Points (CEFR level)
+
+# Paths (relative to project root)
+# The script must therefore be run with the project root as the working
+# directory; the output is written next to the input workbook.
+INPUT_FILE = Path("scripts/data-sources/italian/it_m3.xls")
+OUTPUT_FILE = Path("scripts/data-sources/italian/it_m3-extracted.json")
+
+
+def extract() -> None:
+    """Extract noun/verb CEFR records from the it_m3 workbook into JSON.
+
+    Reads the first sheet of INPUT_FILE (skipping the header row), keeps
+    rows whose POS abbreviation is a key of POS_MAP and whose CEFR value
+    is one of CEFR_LEVELS, writes the kept rows to OUTPUT_FILE as a JSON
+    list of {"word", "pos", "cefr", "source"} objects, and prints a
+    summary of kept and skipped counts.
+
+    Checks run in the order POS, CEFR, word; a rejected row is counted
+    only under the first check it fails.
+    """
+    print(f"Reading: {INPUT_FILE}")
+
+    records = []
+    skipped_pos = 0
+    skipped_invalid_cefr = 0
+    skipped_empty_word = 0
+    total_rows = 0
+
+    wb = xlrd.open_workbook(INPUT_FILE)
+    ws = wb.sheet_by_index(0)
+
+    # Row 0 is the header, so data starts at row 1.
+    for row_idx in range(1, ws.nrows):
+        total_rows += 1
+
+        word_raw = ws.cell_value(row_idx, WORD_COL)
+        pos_raw = ws.cell_value(row_idx, POS_COL)
+        cefr_raw = ws.cell_value(row_idx, CEFR_COL)
+
+        # Normalize POS (case-insensitive) and map the abbreviation.
+        pos = str(pos_raw).lower().strip() if pos_raw else ""
+        if pos not in POS_MAP:
+            skipped_pos += 1
+            continue
+
+        pos = POS_MAP[pos]
+
+        # Normalize CEFR: drop surrounding Unicode smart quotes, then any
+        # whitespace that sat inside them, before comparing.
+        cefr_str = str(cefr_raw).strip() if cefr_raw else ""
+        cefr = cefr_str.strip("\u201c\u201d").strip().upper()
+
+        if cefr not in CEFR_LEVELS:
+            skipped_invalid_cefr += 1
+            continue
+
+        # Multi-form lemmas such as "il, lo, la" are kept whole (only
+        # trimmed and lowercased) so that no variant is silently dropped.
+        word = str(word_raw).strip().lower() if word_raw else ""
+        if not word:
+            skipped_empty_word += 1
+            continue
+
+        record = {"word": word, "pos": pos, "cefr": cefr, "source": "it_m3"}
+        records.append(record)
+
+    # Write output
+    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2, ensure_ascii=False)
+
+    # Stats
+    noun_count = sum(1 for r in records if r["pos"] == "noun")
+    verb_count = sum(1 for r in records if r["pos"] == "verb")
+
+    # Single pass over the records; every kept record has a valid level.
+    cefr_distribution = {level: 0 for level in CEFR_LEVELS}
+    for r in records:
+        cefr_distribution[r["cefr"]] += 1
+
+    print(f"\nTotal rows in XLS: {total_rows}")
+    print(f"Extracted: {len(records)} records")
+    print(f"  - Nouns: {noun_count}")
+    print(f"  - Verbs: {verb_count}")
+    print("\nCEFR distribution:")
+    for level in CEFR_LEVELS:
+        # Levels with no records are omitted from the report.
+        if cefr_distribution[level]:
+            print(f"  - {level}: {cefr_distribution[level]}")
+
+    print("\nSkipped:")
+    print(f"  - Unsupported POS: {skipped_pos}")
+    print(f"  - Invalid CEFR: {skipped_invalid_cefr}")
+    print(f"  - Empty word: {skipped_empty_word}")
+    print(f"\nOutput: {OUTPUT_FILE}")
+
+
+if __name__ == "__main__":
+    extract()
--- a/scripts/extraction-scripts/italian/extract-random-json.py
+++ b/scripts/extraction-scripts/italian/extract-random-json.py
@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+scripts/extraction-scripts/italian/extract-random-json.py
+
+Extracts CEFR data from italian.json (Italian flashcard source).
+Filters for useful_for_flashcard=true and supported POS (noun, verb).
+"""
+
+import json
+from pathlib import Path
+
+# Constants matching @glossa/shared
+# Entries whose normalized "pos" is not in SUPPORTED_POS are skipped.
+SUPPORTED_POS = ["noun", "verb"]
+# Accepted CEFR levels; also the order used when printing the distribution.
+CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
+
+# Paths (relative to project root)
+# The script must therefore be run with the project root as the working
+# directory; the output is written next to the input file.
+INPUT_FILE = Path("scripts/data-sources/italian/italian.json")
+OUTPUT_FILE = Path("scripts/data-sources/italian/italian-extracted.json")
+
+
+def _field(entry: dict, key: str) -> str:
+    """Return entry[key] when it is a string, otherwise an empty string.
+
+    A missing key, a JSON null, or any non-string value is treated as
+    empty so the caller can skip the entry instead of crashing.
+    """
+    value = entry.get(key)
+    return value if isinstance(value, str) else ""
+
+
+def extract() -> None:
+    """Extract flashcard-worthy noun/verb CEFR records from italian.json.
+
+    Loads the JSON list in INPUT_FILE and keeps entries that are flagged
+    useful_for_flashcard, have a "pos" in SUPPORTED_POS, a "cefr_level"
+    in CEFR_LEVELS and a non-empty "word". Kept entries are written to
+    OUTPUT_FILE as a JSON list of {"word", "pos", "cefr", "source"}
+    objects, and a summary of kept and skipped counts is printed.
+
+    Checks run in the order flashcard flag, POS, CEFR, word; a rejected
+    entry is counted only under the first check it fails.
+    """
+    print(f"Reading: {INPUT_FILE}")
+
+    with open(INPUT_FILE, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    records = []
+    skipped_pos = 0
+    skipped_not_useful = 0
+    skipped_invalid_cefr = 0
+    skipped_empty_word = 0
+
+    for entry in data:
+        # Filter: must be useful for flashcard
+        if not entry.get("useful_for_flashcard", False):
+            skipped_not_useful += 1
+            continue
+
+        # Filter: must have supported POS (null/non-string counts as none)
+        pos = _field(entry, "pos").lower().strip()
+        if pos not in SUPPORTED_POS:
+            skipped_pos += 1
+            continue
+
+        # Filter: must have valid CEFR level
+        cefr = _field(entry, "cefr_level").upper().strip()
+        if cefr not in CEFR_LEVELS:
+            skipped_invalid_cefr += 1
+            continue
+
+        # Normalize word
+        word = _field(entry, "word").lower().strip()
+        if not word:
+            skipped_empty_word += 1
+            continue
+
+        record = {"word": word, "pos": pos, "cefr": cefr, "source": "italian"}
+        records.append(record)
+
+    # Write output
+    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2, ensure_ascii=False)
+
+    # Stats
+    noun_count = sum(1 for r in records if r["pos"] == "noun")
+    verb_count = sum(1 for r in records if r["pos"] == "verb")
+
+    # Single pass over the records; every kept record has a valid level.
+    cefr_distribution = {level: 0 for level in CEFR_LEVELS}
+    for r in records:
+        cefr_distribution[r["cefr"]] += 1
+
+    print(f"\nExtracted: {len(records)} records")
+    print(f"  - Nouns: {noun_count}")
+    print(f"  - Verbs: {verb_count}")
+    print("\nCEFR distribution:")
+    for level in CEFR_LEVELS:
+        # Levels with no records are omitted from the report.
+        if cefr_distribution[level]:
+            print(f"  - {level}: {cefr_distribution[level]}")
+
+    print("\nSkipped:")
+    print(f"  - Not useful for flashcard: {skipped_not_useful}")
+    print(f"  - Unsupported POS: {skipped_pos}")
+    print(f"  - Invalid CEFR: {skipped_invalid_cefr}")
+    print(f"  - Empty word: {skipped_empty_word}")
+    print(f"\nOutput: {OUTPUT_FILE}")
+
+
+if __name__ == "__main__":
+    extract()