#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-random-json.py

Extracts CEFR data from random.json (English flashcard source).
Filters for useful_for_flashcard=true and supported POS (noun, verb).

Input: scripts/data-sources/english/random.json
Output: scripts/data-sources/english/random-extracted.json

Output format (normalized):
[
  { "word": "be", "pos": "verb", "cefr": "A1", "source": "random" }
]
"""

import json
from collections import Counter
from pathlib import Path

# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/random.json")
OUTPUT_FILE = Path("scripts/data-sources/english/random-extracted.json")


def _as_text(value: object) -> str:
    """Return *value* unchanged when it is a string, otherwise ``""``.

    ``dict.get(key, "")`` only falls back to the default when the key is
    absent; a key that is present with JSON ``null`` (or a number) would
    otherwise reach ``.lower()`` / ``.upper()`` and raise ``AttributeError``.
    """
    return value if isinstance(value, str) else ""


def _select_records(data) -> tuple:
    """Filter and normalize raw entries.

    Filters are applied in a fixed order (useful flag, POS, CEFR level,
    word) and an entry is counted only under the first filter it fails.

    Args:
        data: Iterable of dict entries as loaded from the input JSON.

    Returns:
        A ``(records, skipped)`` pair: ``records`` is the list of normalized
        output dicts, ``skipped`` is a ``Counter`` keyed by ``"not_useful"``,
        ``"pos"``, ``"cefr"`` and ``"word"``.
    """
    records = []
    skipped = Counter()

    for entry in data:
        # Filter: must be useful for flashcard
        if not entry.get("useful_for_flashcard", False):
            skipped["not_useful"] += 1
            continue

        # Filter: must have supported POS
        pos = _as_text(entry.get("pos")).lower().strip()
        if pos not in SUPPORTED_POS:
            skipped["pos"] += 1
            continue

        # Filter: must have valid CEFR level
        cefr = _as_text(entry.get("cefr_level")).upper().strip()
        if cefr not in CEFR_LEVELS:
            skipped["cefr"] += 1
            continue

        # Normalize word
        word = _as_text(entry.get("word")).lower().strip()
        if not word:
            skipped["word"] += 1
            continue

        records.append(
            {"word": word, "pos": pos, "cefr": cefr, "source": "random"}
        )

    return records, skipped


def extract() -> None:
    """Read INPUT_FILE, write the filtered records to OUTPUT_FILE, print stats.

    Both paths are relative, so they resolve against the current working
    directory. Entries whose ``pos``, ``cefr_level`` or ``word`` value is
    missing, null or not a string are skipped and counted rather than
    aborting the run.
    """
    print(f"Reading: {INPUT_FILE}")

    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    records, skipped = _select_records(data)

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: a Counter returns 0 for keys it has never seen, so categories
    # with no matches still print as 0.
    pos_counts = Counter(r["pos"] for r in records)
    cefr_counts = Counter(r["cefr"] for r in records)

    print(f"\nExtracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")

    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        # Only levels that actually occur are listed.
        if cefr_counts[level] > 0:
            print(f" - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f" - Not useful for flashcard: {skipped['not_useful']}")
    print(f" - Unsupported POS: {skipped['pos']}")
    print(f" - Invalid CEFR: {skipped['cefr']}")
    print(f" - Empty word: {skipped['word']}")

    print(f"\nOutput: {OUTPUT_FILE}")


if __name__ == "__main__":
    extract()