feat(scripts): add Italian CEFR data pipeline
- Add extractors for Italian sources: it_m3.xls and italian.json
- Add comparison script (compare-italian.py) to report source overlaps and conflicts
- Add merge script (merge-italian-json.py) with priority order ['italian', 'it_m3']
- Output authoritative dataset to datafiles/italian-merged.json
- Update README to document both English and Italian pipelines
This commit is contained in:
parent
59152950d6
commit
3374bd8b20
9 changed files with 208535 additions and 26 deletions
|
|
@@ -91,12 +91,12 @@ def extract() -> None:
|
|||
print(f"Extracted: {len(records)} records")
|
||||
print(f" - Nouns: {noun_count}")
|
||||
print(f" - Verbs: {verb_count}")
|
||||
print(f"\nCEFR distribution:")
|
||||
print("\nCEFR distribution:")
|
||||
for level in CEFR_LEVELS:
|
||||
if level in cefr_distribution:
|
||||
print(f" - {level}: {cefr_distribution[level]}")
|
||||
|
||||
print(f"\nSkipped:")
|
||||
print("\nSkipped:")
|
||||
print(f" - Unsupported POS: {skipped_pos}")
|
||||
print(f" - Invalid CEFR: {skipped_invalid_cefr}")
|
||||
print(f" - Empty word: {skipped_empty_word}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue