extraction, comparison and merging scripts for english are done, final english.json exists
This commit is contained in:
parent
3596f76492
commit
59152950d6
14 changed files with 206319 additions and 0 deletions
159
scripts/merge-scripts/merge-english-json.py
Normal file
159
scripts/merge-scripts/merge-english-json.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
#!/usr/bin/env python3
"""
CEFR Data Pipeline - Stage 3: English Merge

Merges extracted JSON files for English into an authoritative dataset.

Reads every ``*-extracted.json`` file from ``../data-sources/english``
(relative to this script) and writes the merged result to
``../datafiles/english-merged.json``.  Conflicting CEFR labels are
resolved by source priority (see ``PRIORITY_ORDER``).
"""
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
# Supported CEFR levels and difficulty mapping
CEFR_LEVELS = {"A1", "A2", "B1", "B2", "C1", "C2"}

# Collapse the six CEFR levels into a three-bucket difficulty label
# used by the merged output ("difficulty" field).
DIFFICULTY_MAP = {
    "A1": "easy",
    "A2": "easy",
    "B1": "intermediate",
    "B2": "intermediate",
    "C1": "hard",
    "C2": "hard",
}

# Source priority order (from lowest to highest priority)
# Higher index = higher authority when conflicts occur
PRIORITY_ORDER = ["random", "octanove", "cefrj", "en_m3"]
||||
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Collect every ``*-extracted.json`` file under *data_dir*.

    Returns a mapping from source name (the file stem without the
    ``-extracted`` suffix) to the list of entries it contains.  Files
    whose top-level JSON value is not a list are skipped with a warning.
    """
    loaded: Dict[str, List[dict]] = {}
    for file_path in data_dir.glob("*-extracted.json"):
        payload = json.loads(file_path.read_text(encoding="utf-8"))
        if not isinstance(payload, list):
            # Malformed source file: warn and move on rather than abort.
            print(f"Warning: {file_path} does not contain a list, skipping.")
            continue
        loaded[file_path.stem.replace("-extracted", "")] = payload
    return loaded
||||
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Build the case-folded, whitespace-trimmed (word, pos) merge key."""
    word = entry["word"].strip().lower()
    pos = entry["pos"].strip().lower()
    return word, pos
||||
def get_source_priority(source_name: str) -> int:
    """Return the rank of *source_name* in PRIORITY_ORDER (higher = more authoritative).

    Sources that are not listed rank below every known source (-1).
    """
    for rank, known_name in enumerate(PRIORITY_ORDER):
        if known_name == source_name:
            return rank
    return -1
||||
def merge_entries(sources: Dict[str, List[dict]]) -> List[dict]:
    """Merge entries from multiple sources, resolving conflicts by priority.

    Entries are grouped by the normalized (word, pos) key.  When several
    sources disagree on the CEFR level, the level reported by the
    highest-priority source (per PRIORITY_ORDER) wins; every contributing
    source is still recorded in the output's ``sources`` list.

    Args:
        sources: Mapping of source name to its list of raw entries
            (each entry must carry ``word``, ``pos`` and ``cefr`` keys).

    Returns:
        List of merged entry dicts with keys ``word``, ``pos``, ``cefr``,
        ``difficulty`` and ``sources``.
    """
    grouped = defaultdict(list)
    for src_name, entries in sources.items():
        for entry in entries:
            key = normalize_entry(entry)
            grouped[key].append((src_name, entry["cefr"], entry))

    merged = []
    conflicts_resolved = 0
    total_multi_source = 0

    for (word, pos), src_entries in grouped.items():
        if len(src_entries) == 1:
            src_name, cefr, _ = src_entries[0]
            final_cefr = cefr
            contributing_sources = [src_name]
        else:
            total_multi_source += 1
            # O(n) max beats sorting the whole group just to read its head;
            # like a stable reverse sort, max keeps the first of equal keys.
            _, highest_cefr, _ = max(
                src_entries, key=lambda x: get_source_priority(x[0])
            )
            all_cefrs = {e[1] for e in src_entries}
            if len(all_cefrs) > 1:
                conflicts_resolved += 1

            final_cefr = highest_cefr
            contributing_sources = [e[0] for e in src_entries]

        # Unknown/malformed CEFR labels fall back to "unknown".
        difficulty = DIFFICULTY_MAP.get(final_cefr, "unknown")

        merged.append(
            {
                "word": word,
                "pos": pos,
                "cefr": final_cefr,
                "difficulty": difficulty,
                "sources": sorted(contributing_sources),
            }
        )

    # Plain string here: the original used an f-string with no placeholders.
    print("Merge statistics:")
    print(f" Total unique entries: {len(merged)}")
    print(f" Entries with multiple sources: {total_multi_source}")
    print(f" Conflicts resolved by priority: {conflicts_resolved}")

    return merged
||||
def print_summary(merged: List[dict]):
    """Print distribution of CEFR levels and difficulty in the final dataset."""
    cefr_counts: Dict[str, int] = {}
    diff_counts: Dict[str, int] = {}

    for item in merged:
        cefr_counts[item["cefr"]] = cefr_counts.get(item["cefr"], 0) + 1
        diff_counts[item["difficulty"]] = diff_counts.get(item["difficulty"], 0) + 1

    print("\n📊 Final CEFR distribution:")
    for level in sorted(CEFR_LEVELS):
        # Only levels that actually occur are shown.
        if cefr_counts.get(level, 0):
            print(f" {level}: {cefr_counts[level]}")

    print("\n📊 Final difficulty distribution:")
    for diff in ["easy", "intermediate", "hard"]:
        print(f" {diff}: {diff_counts.get(diff, 0)}")
||||
def main():
    """Entry point: load extracted sources, merge them, write the dataset."""
    here = Path(__file__).parent
    project_root = here.parent
    data_dir = project_root / "data-sources" / "english"
    out_dir = project_root / "datafiles"
    out_path = out_dir / "english-merged.json"

    if not data_dir.exists():
        print(f"Error: English data directory not found: {data_dir}")
        return

    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading extracted files from {data_dir}...")
    sources = load_extracted_files(data_dir)
    if not sources:
        print("No extracted files found.")
        return

    print(f"Found sources: {', '.join(sources.keys())}")
    print(f"Priority order (lowest to highest): {PRIORITY_ORDER}")

    merged = merge_entries(sources)

    out_path.write_text(
        json.dumps(merged, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    print(f"\n✅ Merged dataset written to: {out_path}")
    print_summary(merged)
||||
# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading…
Add table
Add a link
Reference in a new issue