feat(scripts): add Italian CEFR data pipeline
- Add extractors for Italian sources: it_m3.xls and italian.json
- Add comparison script (compare-italian.py) to report source overlaps and conflicts
- Add merge script (merge-italian-json.py) with priority order ['italian', 'it_m3']
- Output authoritative dataset to datafiles/italian-merged.json
- Update README to document both English and Italian pipelines
This commit is contained in:
parent
59152950d6
commit
3374bd8b20
9 changed files with 208535 additions and 26 deletions
|
|
@ -91,12 +91,12 @@ def extract() -> None:
|
|||
print(f"Extracted: {len(records)} records")
|
||||
print(f" - Nouns: {noun_count}")
|
||||
print(f" - Verbs: {verb_count}")
|
||||
print(f"\nCEFR distribution:")
|
||||
print("\nCEFR distribution:")
|
||||
for level in CEFR_LEVELS:
|
||||
if level in cefr_distribution:
|
||||
print(f" - {level}: {cefr_distribution[level]}")
|
||||
|
||||
print(f"\nSkipped:")
|
||||
print("\nSkipped:")
|
||||
print(f" - Unsupported POS: {skipped_pos}")
|
||||
print(f" - Invalid CEFR: {skipped_invalid_cefr}")
|
||||
print(f" - Empty word: {skipped_empty_word}")
|
||||
|
|
|
|||
|
|
@ -0,0 +1,114 @@
|
|||
#!/usr/bin/env python3
"""
scripts/extraction-scripts/italian/extract-it_m3.py

Extracts CEFR data from it_m3.xls (Italian M3 wordlist).

Reads the first worksheet, keeps the rows whose POS abbreviation is listed in
POS_MAP and whose level is one of CEFR_LEVELS, and writes the normalized
records to it_m3-extracted.json.
"""

import json
from collections import Counter
from pathlib import Path

import xlrd

# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# POS mapping (case-insensitive) – based on observed abbreviations
POS_MAP = {
    "n": "noun",  # nome
    "v": "verb",  # verbo
}

# Column indices (0-based) – verified from sample
WORD_COL = 0  # Lemma
POS_COL = 1  # Pos
CEFR_COL = 2  # Points (CEFR level)

# Unicode smart double quotes that are stripped from CEFR cell values
_SMART_QUOTES = "\u201c\u201d"

# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/italian/it_m3.xls")
OUTPUT_FILE = Path("scripts/data-sources/italian/it_m3-extracted.json")


def _cell_text(value: object) -> str:
    """Return a cell value as stripped text; falsy cells become ""."""
    return str(value).strip() if value else ""


def extract() -> None:
    """Extract noun/verb CEFR records from INPUT_FILE into OUTPUT_FILE.

    Every row after the header is checked in this order: POS, CEFR level,
    word. A row is dropped at the first check it fails and counted under
    that reason only. Surviving rows are written as a JSON list of
    ``{"word", "pos", "cefr", "source"}`` objects, and a summary is printed
    to stdout.
    """
    print(f"Reading: {INPUT_FILE}")

    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    wb = xlrd.open_workbook(INPUT_FILE)
    ws = wb.sheet_by_index(0)

    # Row 0 is the header, so the data rows are 1 .. nrows - 1.
    total_rows = max(ws.nrows - 1, 0)

    for row_idx in range(1, ws.nrows):
        word_raw = ws.cell_value(row_idx, WORD_COL)
        pos_raw = ws.cell_value(row_idx, POS_COL)
        cefr_raw = ws.cell_value(row_idx, CEFR_COL)

        # Normalize POS (case-insensitive) and map the abbreviation.
        pos_key = _cell_text(pos_raw).lower()
        if pos_key not in POS_MAP:
            skipped_pos += 1
            continue
        pos = POS_MAP[pos_key]

        # Normalize CEFR: drop surrounding smart quotes, then any whitespace
        # that was sitting inside them, so "“A1 ”" is still read as "A1".
        cefr = _cell_text(cefr_raw).strip(_SMART_QUOTES).strip().upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Multi-form lemmas such as "il, lo, la" are kept as the full
        # comma-separated string (only lowercased) so no variant is lost.
        word = _cell_text(word_raw).lower()
        if not word:
            skipped_empty_word += 1
            continue

        records.append(
            {"word": word, "pos": pos, "cefr": cefr, "source": "it_m3"}
        )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: one pass per dimension instead of one scan per level.
    pos_counts = Counter(record["pos"] for record in records)
    cefr_counts = Counter(record["cefr"] for record in records)

    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if cefr_counts[level]:
            print(f" - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")


if __name__ == "__main__":
    extract()
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
#!/usr/bin/env python3
"""
scripts/extraction-scripts/italian/extract-italian-json.py

Extracts CEFR data from italian.json (Italian flashcard source).
Filters for useful_for_flashcard=true and supported POS (noun, verb).
"""

import json
from collections import Counter
from pathlib import Path

# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/italian/italian.json")
OUTPUT_FILE = Path("scripts/data-sources/italian/italian-extracted.json")


def _text_field(entry: dict, key: str) -> str:
    """Return ``entry[key]`` stripped, or "" if missing, null or not a string.

    ``entry.get(key, "")`` only falls back when the key is absent; a JSON
    ``null`` would come back as ``None`` and break the string methods.
    """
    value = entry.get(key)
    return value.strip() if isinstance(value, str) else ""


def extract() -> None:
    """Extract noun/verb CEFR records from INPUT_FILE into OUTPUT_FILE.

    Each entry is checked in this order: ``useful_for_flashcard``, POS,
    CEFR level, word. An entry is dropped at the first check it fails and
    counted under that reason only. Fields that are missing, null or not
    strings are treated as empty, so the entry is skipped instead of
    aborting the run. Surviving entries are written as a JSON list of
    ``{"word", "pos", "cefr", "source"}`` objects, and a summary is printed
    to stdout.
    """
    print(f"Reading: {INPUT_FILE}")

    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    skipped_pos = 0
    skipped_not_useful = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for entry in data:
        # Filter: must be useful for flashcard
        if not entry.get("useful_for_flashcard", False):
            skipped_not_useful += 1
            continue

        # Filter: must have supported POS
        pos = _text_field(entry, "pos").lower()
        if pos not in SUPPORTED_POS:
            skipped_pos += 1
            continue

        # Filter: must have valid CEFR level
        cefr = _text_field(entry, "cefr_level").upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Normalize word
        word = _text_field(entry, "word").lower()
        if not word:
            skipped_empty_word += 1
            continue

        records.append(
            {"word": word, "pos": pos, "cefr": cefr, "source": "italian"}
        )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: one pass per dimension instead of one scan per level.
    pos_counts = Counter(record["pos"] for record in records)
    cefr_counts = Counter(record["cefr"] for record in records)

    print(f"\nExtracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if cefr_counts[level]:
            print(f" - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f" - Not useful for flashcard: {skipped_not_useful}")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")


if __name__ == "__main__":
    extract()
|
||||
Loading…
Add table
Add a link
Reference in a new issue