# Changelog note: package names renamed from @glossa/* to @lila/*;
# imports, container names, volume names, and documentation references
# updated; database recreated with new credentials.
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-en_m3.py

Extracts CEFR data from en_m3.xls (M3 wordlist).
"""
|
|
|
|
import json
from collections import Counter
from pathlib import Path

import xlrd
|
|
|
|
# Constants matching @lila/shared
|
|
SUPPORTED_POS = ["noun", "verb"]
|
|
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
|
|
|
|
# POS mapping (case-insensitive)
|
|
POS_MAP = {
|
|
"noun": "noun",
|
|
"verb": "verb",
|
|
}
|
|
|
|
# Paths (relative to project root)
|
|
INPUT_FILE = Path("scripts/data-sources/english/en_m3.xls")
|
|
OUTPUT_FILE = Path("scripts/data-sources/english/en_m3-extracted.json")
|
|
|
|
|
|
def extract() -> None:
|
|
print(f"Reading: {INPUT_FILE}")
|
|
|
|
records = []
|
|
skipped_pos = 0
|
|
skipped_invalid_cefr = 0
|
|
skipped_empty_word = 0
|
|
total_rows = 0
|
|
|
|
wb = xlrd.open_workbook(INPUT_FILE)
|
|
ws = wb.sheet_by_index(0)
|
|
|
|
# Skip header row, start from row 1
|
|
for row_idx in range(1, ws.nrows):
|
|
total_rows += 1
|
|
|
|
# Unpack columns: ID number, Word, Part of Speech, CEFR, Points
|
|
word_raw = ws.cell_value(row_idx, 1)
|
|
pos_raw = ws.cell_value(row_idx, 2)
|
|
cefr_raw = ws.cell_value(row_idx, 3)
|
|
|
|
# Normalize POS (case-insensitive)
|
|
pos = str(pos_raw).lower().strip() if pos_raw else ""
|
|
if pos not in POS_MAP:
|
|
skipped_pos += 1
|
|
continue
|
|
|
|
pos = POS_MAP[pos]
|
|
|
|
# Normalize CEFR - handle smart quotes
|
|
cefr_str = str(cefr_raw).strip() if cefr_raw else ""
|
|
# Strip Unicode smart quotes (U+201C and U+201D)
|
|
cefr_str = cefr_str.strip("\u201c\u201d")
|
|
cefr = cefr_str.upper()
|
|
|
|
if cefr not in CEFR_LEVELS:
|
|
skipped_invalid_cefr += 1
|
|
continue
|
|
|
|
# Normalize word
|
|
word = str(word_raw).lower().strip() if word_raw else ""
|
|
if not word:
|
|
skipped_empty_word += 1
|
|
continue
|
|
|
|
record = {"word": word, "pos": pos, "cefr": cefr, "source": "en_m3"}
|
|
records.append(record)
|
|
|
|
# Write output
|
|
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
|
|
json.dump(records, f, indent=2, ensure_ascii=False)
|
|
|
|
# Stats
|
|
noun_count = sum(1 for r in records if r["pos"] == "noun")
|
|
verb_count = sum(1 for r in records if r["pos"] == "verb")
|
|
|
|
cefr_distribution = {}
|
|
for level in CEFR_LEVELS:
|
|
count = sum(1 for r in records if r["cefr"] == level)
|
|
if count > 0:
|
|
cefr_distribution[level] = count
|
|
|
|
print(f"\nTotal rows in XLS: {total_rows}")
|
|
print(f"Extracted: {len(records)} records")
|
|
print(f" - Nouns: {noun_count}")
|
|
print(f" - Verbs: {verb_count}")
|
|
print("\nCEFR distribution:")
|
|
for level in CEFR_LEVELS:
|
|
if level in cefr_distribution:
|
|
print(f" - {level}: {cefr_distribution[level]}")
|
|
|
|
print("\nSkipped:")
|
|
print(f" - Unsupported POS: {skipped_pos}")
|
|
print(f" - Invalid CEFR: {skipped_invalid_cefr}")
|
|
print(f" - Empty word: {skipped_empty_word}")
|
|
print(f"\nOutput: {OUTPUT_FILE}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
extract()
|