Extraction, comparison, and merging scripts for English are done; the final english.json exists.

This commit is contained in:
lila 2026-04-08 17:50:25 +02:00
parent 3596f76492
commit 59152950d6
14 changed files with 206319 additions and 0 deletions

View file

@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-en_m3.py
Extracts CEFR data from en_m3.xls (M3 wordlist).
Filters for supported POS (noun, verb).
Input: scripts/data-sources/english/en_m3.xls
Output: scripts/data-sources/english/en_m3-extracted.json
Output format (normalized):
[
  { "word": "example", "pos": "noun", "cefr": "C1", "source": "en_m3" }
]
"""
import json
from pathlib import Path
import xlrd
# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]  # NOTE(review): unused in this script — POS_MAP drives the filter
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
# POS mapping (case-insensitive). Identity mapping today; presumably kept so
# source-specific POS labels can be remapped later — confirm before removing.
POS_MAP = {
    "noun": "noun",
    "verb": "verb",
}
# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/en_m3.xls")
OUTPUT_FILE = Path("scripts/data-sources/english/en_m3-extracted.json")
def extract() -> None:
    """Extract noun/verb CEFR records from the M3 wordlist XLS.

    Reads INPUT_FILE (en_m3.xls) with xlrd, normalizes POS and CEFR
    values, and writes surviving records to OUTPUT_FILE as JSON. Rows
    with an unsupported POS, an invalid CEFR level, or an empty word
    are skipped and counted for the summary printed at the end.

    Returns nothing; side effects are the JSON file and stdout stats.
    """
    print(f"Reading: {INPUT_FILE}")
    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0
    wb = xlrd.open_workbook(INPUT_FILE)
    ws = wb.sheet_by_index(0)
    # Skip header row, start from row 1.
    # Columns: 0 = ID number, 1 = Word, 2 = Part of Speech, 3 = CEFR, 4 = Points
    for row_idx in range(1, ws.nrows):
        total_rows += 1
        word_raw = ws.cell_value(row_idx, 1)
        pos_raw = ws.cell_value(row_idx, 2)
        cefr_raw = ws.cell_value(row_idx, 3)
        # Normalize POS (case-insensitive); only keys of POS_MAP survive.
        pos = str(pos_raw).lower().strip() if pos_raw else ""
        if pos not in POS_MAP:
            skipped_pos += 1
            continue
        pos = POS_MAP[pos]
        # Normalize CEFR. The source wraps some levels in Unicode smart
        # quotes (U+201C / U+201D), so strip those before validating.
        cefr_str = str(cefr_raw).strip() if cefr_raw else ""
        cefr_str = cefr_str.strip("\u201c\u201d")
        cefr = cefr_str.upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue
        # Normalize word to lowercase; drop rows with no word at all.
        word = str(word_raw).lower().strip() if word_raw else ""
        if not word:
            skipped_empty_word += 1
            continue
        record = {"word": word, "pos": pos, "cefr": cefr, "source": "en_m3"}
        records.append(record)
    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    # Stats
    noun_count = sum(1 for r in records if r["pos"] == "noun")
    verb_count = sum(1 for r in records if r["pos"] == "verb")
    cefr_distribution = {}
    for level in CEFR_LEVELS:
        count = sum(1 for r in records if r["cefr"] == level)
        if count > 0:
            cefr_distribution[level] = count
    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {noun_count}")
    print(f" - Verbs: {verb_count}")
    # FIX: these two headers were pointless f-strings (ruff F541); plain
    # strings now, matching the sibling extract-octanove.py script.
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if level in cefr_distribution:
            print(f" - {level}: {cefr_distribution[level]}")
    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")
if __name__ == "__main__":
    extract()

View file

@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-octanove.py
Extracts CEFR data from octanove.csv (Octanove vocabulary profile).
Filters for supported POS (noun, verb).
Input: scripts/data-sources/english/octanove.csv
Output: scripts/data-sources/english/octanove-extracted.json
Output format (normalized):
[
  { "word": "example", "pos": "noun", "cefr": "C1", "source": "octanove" }
]
"""
import csv
import json
from pathlib import Path
# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/octanove.csv")
OUTPUT_FILE = Path("scripts/data-sources/english/octanove-extracted.json")
def extract() -> None:
    """Extract noun/verb CEFR records from the Octanove CSV.

    Reads INPUT_FILE row by row, keeps rows whose POS is supported and
    whose CEFR level is valid, normalizes them, writes the result to
    OUTPUT_FILE as JSON, and prints a summary of what was kept/skipped.
    """
    print(f"Reading: {INPUT_FILE}")
    kept = []
    skipped = {"pos": 0, "cefr": 0, "word": 0}
    total_rows = 0
    with open(INPUT_FILE, "r", encoding="utf-8") as fh:
        for row in csv.DictReader(fh):
            total_rows += 1
            # POS must be one of the supported categories (noun / verb).
            pos = row.get("pos", "").lower().strip()
            if pos not in SUPPORTED_POS:
                skipped["pos"] += 1
                continue
            # CEFR must be one of the six canonical levels.
            cefr = row.get("CEFR", "").upper().strip()
            if cefr not in CEFR_LEVELS:
                skipped["cefr"] += 1
                continue
            # Headword must be non-empty after normalization.
            word = row.get("headword", "").lower().strip()
            if not word:
                skipped["word"] += 1
                continue
            kept.append({"word": word, "pos": pos, "cefr": cefr, "source": "octanove"})
    # Write output
    OUTPUT_FILE.write_text(
        json.dumps(kept, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    # Stats: tally POS and CEFR distributions in a single pass.
    pos_tally = {"noun": 0, "verb": 0}
    level_tally = {}
    for rec in kept:
        pos_tally[rec["pos"]] += 1
        level_tally[rec["cefr"]] = level_tally.get(rec["cefr"], 0) + 1
    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(kept)} records")
    print(f" - Nouns: {pos_tally['noun']}")
    print(f" - Verbs: {pos_tally['verb']}")
    print("\nCEFR distribution:")
    # Iterate CEFR_LEVELS (not the tally) to keep canonical A1..C2 order.
    for lvl in CEFR_LEVELS:
        if lvl in level_tally:
            print(f" - {lvl}: {level_tally[lvl]}")
    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped['pos']}")
    print(f" - Invalid CEFR: {skipped['cefr']}")
    print(f" - Empty word: {skipped['word']}")
    print(f"\nOutput: {OUTPUT_FILE}")
if __name__ == "__main__":
    extract()