Extraction, comparison, and merging scripts for English are done; the final english.json exists

2026-04-08 17:50:25 +02:00 · 2026-04-08 17:50:25 +02:00 · 59152950d6
commit 59152950d6
parent 3596f76492
14 changed files with 206319 additions and 0 deletions
--- a/scripts/extraction-scripts/english/extract-octanove.py
+++ b/scripts/extraction-scripts/english/extract-octanove.py
@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""
+scripts/extraction-scripts/english/extract-octanove.py
+Extracts CEFR data from octanove.csv (Octanove vocabulary profile).
+Filters for supported POS (noun, verb).
+Input:  scripts/data-sources/english/octanove.csv
+Output: scripts/data-sources/english/octanove-extracted.json
+Output format (normalized):
+[
+  { "word": "example", "pos": "noun", "cefr": "C1", "source": "octanove" }
+]
+"""
+
+import csv
+import json
+from collections import Counter
+from pathlib import Path
+
+# Constants matching @glossa/shared
+# Parts of speech kept by extract(); rows with any other value are skipped.
+SUPPORTED_POS = ["noun", "verb"]
+# Accepted CEFR levels; also the order in which the distribution is printed.
+CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
+
+# Paths (relative to project root)
+# NOTE: being relative, they resolve against the current working directory,
+# so the script has to be launched from the project root.
+INPUT_FILE = Path("scripts/data-sources/english/octanove.csv")
+OUTPUT_FILE = Path("scripts/data-sources/english/octanove-extracted.json")
+
+
+def extract() -> None:
+    print(f"Reading: {INPUT_FILE}")
+    records = []
+    skipped_pos = 0
+    skipped_invalid_cefr = 0
+    skipped_empty_word = 0
+    total_rows = 0
+
+    with open(INPUT_FILE, "r", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            total_rows += 1
+
+            # Filter: must have supported POS
+            pos = row.get("pos", "").lower().strip()
+            if pos not in SUPPORTED_POS:
+                skipped_pos += 1
+                continue
+
+            # Filter: must have valid CEFR level
+            cefr = row.get("CEFR", "").upper().strip()
+            if cefr not in CEFR_LEVELS:
+                skipped_invalid_cefr += 1
+                continue
+
+            # Normalize word
+            word = row.get("headword", "").lower().strip()
+            if not word:
+                skipped_empty_word += 1
+                continue
+
+            record = {"word": word, "pos": pos, "cefr": cefr, "source": "octanove"}
+            records.append(record)
+
+    # Write output
+    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2, ensure_ascii=False)
+
+    # Stats
+    noun_count = sum(1 for r in records if r["pos"] == "noun")
+    verb_count = sum(1 for r in records if r["pos"] == "verb")
+    cefr_distribution = {}
+    for level in CEFR_LEVELS:
+        count = sum(1 for r in records if r["cefr"] == level)
+        if count > 0:
+            cefr_distribution[level] = count
+
+    print(f"\nTotal rows in CSV: {total_rows}")
+    print(f"Extracted: {len(records)} records")
+    print(f"  - Nouns: {noun_count}")
+    print(f"  - Verbs: {verb_count}")
+    print("\nCEFR distribution:")
+    for level in CEFR_LEVELS:
+        if level in cefr_distribution:
+            print(f"  - {level}: {cefr_distribution[level]}")
+    print("\nSkipped:")
+    print(f"  - Unsupported POS: {skipped_pos}")
+    print(f"  - Invalid CEFR: {skipped_invalid_cefr}")
+    print(f"  - Empty word: {skipped_empty_word}")
+    print(f"\nOutput: {OUTPUT_FILE}")
+
+
+# Run the extraction only when executed as a script, not when imported.
+if __name__ == "__main__":
+    extract()