Extraction datafiles with CEFR annotations

This commit is contained in:
lila 2026-04-08 13:09:47 +02:00
parent e79fa6922b
commit 3596f76492
19 changed files with 2368633 additions and 2 deletions

View file

@ -11,7 +11,7 @@
"format": "prettier --write .", "format": "prettier --write .",
"format:check": "prettier --check ." "format:check": "prettier --check ."
}, },
"packageManager": "pnpm@10.32.1", "packageManager": "pnpm@10.33.0",
"devDependencies": { "devDependencies": {
"@eslint/js": "^10.0.1", "@eslint/js": "^10.0.1",
"@tanstack/eslint-plugin-router": "^1.161.6", "@tanstack/eslint-plugin-router": "^1.161.6",

View file

@ -14,7 +14,8 @@
"@glossa/shared": "workspace:*", "@glossa/shared": "workspace:*",
"dotenv": "^17.3.1", "dotenv": "^17.3.1",
"drizzle-orm": "^0.45.1", "drizzle-orm": "^0.45.1",
"pg": "^8.20.0" "pg": "^8.20.0",
"xlsx": "^0.18.5"
}, },
"devDependencies": { "devDependencies": {
"@types/pg": "^8.20.0", "@types/pg": "^8.20.0",

72
pnpm-lock.yaml generated
View file

@ -124,6 +124,9 @@ importers:
pg: pg:
specifier: ^8.20.0 specifier: ^8.20.0
version: 8.20.0 version: 8.20.0
xlsx:
specifier: ^0.18.5
version: 0.18.5
devDependencies: devDependencies:
'@types/pg': '@types/pg':
specifier: ^8.20.0 specifier: ^8.20.0
@ -1299,6 +1302,10 @@ packages:
engines: {node: '>=0.4.0'} engines: {node: '>=0.4.0'}
hasBin: true hasBin: true
adler-32@1.3.1:
resolution: {integrity: sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==}
engines: {node: '>=0.8'}
ajv@6.14.0: ajv@6.14.0:
resolution: {integrity: sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==} resolution: {integrity: sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==}
@ -1383,6 +1390,10 @@ packages:
caniuse-lite@1.0.30001780: caniuse-lite@1.0.30001780:
resolution: {integrity: sha512-llngX0E7nQci5BPJDqoZSbuZ5Bcs9F5db7EtgfwBerX9XGtkkiO4NwfDDIRzHTTwcYC8vC7bmeUEPGrKlR/TkQ==} resolution: {integrity: sha512-llngX0E7nQci5BPJDqoZSbuZ5Bcs9F5db7EtgfwBerX9XGtkkiO4NwfDDIRzHTTwcYC8vC7bmeUEPGrKlR/TkQ==}
cfb@1.2.2:
resolution: {integrity: sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==}
engines: {node: '>=0.8'}
chai@6.2.2: chai@6.2.2:
resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==} resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==}
engines: {node: '>=18'} engines: {node: '>=18'}
@ -1403,6 +1414,10 @@ packages:
resolution: {integrity: sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==} resolution: {integrity: sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==}
engines: {node: '>=6'} engines: {node: '>=6'}
codepage@1.15.0:
resolution: {integrity: sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==}
engines: {node: '>=0.8'}
color-convert@2.0.1: color-convert@2.0.1:
resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==} resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==}
engines: {node: '>=7.0.0'} engines: {node: '>=7.0.0'}
@ -1437,6 +1452,11 @@ packages:
resolution: {integrity: sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==} resolution: {integrity: sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==}
engines: {node: '>= 0.6'} engines: {node: '>= 0.6'}
crc-32@1.2.2:
resolution: {integrity: sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==}
engines: {node: '>=0.8'}
hasBin: true
cross-spawn@7.0.6: cross-spawn@7.0.6:
resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
engines: {node: '>= 8'} engines: {node: '>= 8'}
@ -1769,6 +1789,10 @@ packages:
resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==} resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==}
engines: {node: '>= 0.6'} engines: {node: '>= 0.6'}
frac@1.1.2:
resolution: {integrity: sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==}
engines: {node: '>=0.8'}
fresh@2.0.0: fresh@2.0.0:
resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==} resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==}
engines: {node: '>= 0.8'} engines: {node: '>= 0.8'}
@ -2377,6 +2401,10 @@ packages:
resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==} resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==}
engines: {node: '>= 10.x'} engines: {node: '>= 10.x'}
ssf@0.11.2:
resolution: {integrity: sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==}
engines: {node: '>=0.8'}
stackback@0.0.2: stackback@0.0.2:
resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==} resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==}
@ -2638,10 +2666,18 @@ packages:
engines: {node: '>=8'} engines: {node: '>=8'}
hasBin: true hasBin: true
wmf@1.0.2:
resolution: {integrity: sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==}
engines: {node: '>=0.8'}
word-wrap@1.2.5: word-wrap@1.2.5:
resolution: {integrity: sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==} resolution: {integrity: sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==}
engines: {node: '>=0.10.0'} engines: {node: '>=0.10.0'}
word@0.3.0:
resolution: {integrity: sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==}
engines: {node: '>=0.8'}
wrap-ansi@7.0.0: wrap-ansi@7.0.0:
resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==} resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==}
engines: {node: '>=10'} engines: {node: '>=10'}
@ -2649,6 +2685,11 @@ packages:
wrappy@1.0.2: wrappy@1.0.2:
resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==}
xlsx@0.18.5:
resolution: {integrity: sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==}
engines: {node: '>=0.8'}
hasBin: true
xml-name-validator@5.0.0: xml-name-validator@5.0.0:
resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==} resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==}
engines: {node: '>=18'} engines: {node: '>=18'}
@ -3648,6 +3689,8 @@ snapshots:
acorn@8.16.0: {} acorn@8.16.0: {}
adler-32@1.3.1: {}
ajv@6.14.0: ajv@6.14.0:
dependencies: dependencies:
fast-deep-equal: 3.1.3 fast-deep-equal: 3.1.3
@ -3745,6 +3788,11 @@ snapshots:
caniuse-lite@1.0.30001780: {} caniuse-lite@1.0.30001780: {}
cfb@1.2.2:
dependencies:
adler-32: 1.3.1
crc-32: 1.2.2
chai@6.2.2: {} chai@6.2.2: {}
chalk@4.1.2: chalk@4.1.2:
@ -3772,6 +3820,8 @@ snapshots:
clsx@2.1.1: {} clsx@2.1.1: {}
codepage@1.15.0: {}
color-convert@2.0.1: color-convert@2.0.1:
dependencies: dependencies:
color-name: 1.1.4 color-name: 1.1.4
@ -3799,6 +3849,8 @@ snapshots:
cookie@0.7.2: {} cookie@0.7.2: {}
crc-32@1.2.2: {}
cross-spawn@7.0.6: cross-spawn@7.0.6:
dependencies: dependencies:
path-key: 3.1.1 path-key: 3.1.1
@ -4138,6 +4190,8 @@ snapshots:
forwarded@0.2.0: {} forwarded@0.2.0: {}
frac@1.1.2: {}
fresh@2.0.0: {} fresh@2.0.0: {}
fsevents@2.3.3: fsevents@2.3.3:
@ -4700,6 +4754,10 @@ snapshots:
split2@4.2.0: {} split2@4.2.0: {}
ssf@0.11.2:
dependencies:
frac: 1.1.2
stackback@0.0.2: {} stackback@0.0.2: {}
statuses@2.0.2: {} statuses@2.0.2: {}
@ -4918,8 +4976,12 @@ snapshots:
siginfo: 2.0.0 siginfo: 2.0.0
stackback: 0.0.2 stackback: 0.0.2
wmf@1.0.2: {}
word-wrap@1.2.5: {} word-wrap@1.2.5: {}
word@0.3.0: {}
wrap-ansi@7.0.0: wrap-ansi@7.0.0:
dependencies: dependencies:
ansi-styles: 4.3.0 ansi-styles: 4.3.0
@ -4928,6 +4990,16 @@ snapshots:
wrappy@1.0.2: {} wrappy@1.0.2: {}
xlsx@0.18.5:
dependencies:
adler-32: 1.3.1
cfb: 1.2.2
codepage: 1.15.0
crc-32: 1.2.2
ssf: 0.11.2
wmf: 1.0.2
word: 0.3.0
xml-name-validator@5.0.0: {} xml-name-validator@5.0.0: {}
xmlchars@2.2.0: {} xmlchars@2.2.0: {}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-cefrj-csv.py

Extracts CEFR data from cefrj.csv (CEFR-J vocabulary profile).
Filters for supported POS (noun, verb).

Input: scripts/data-sources/english/cefrj.csv
Output: scripts/data-sources/english/cefrj-extracted.json

Output format (normalized):
[
  { "word": "ability", "pos": "noun", "cefr": "A2", "source": "cefrj" }
]
"""
import csv
import json
from pathlib import Path

# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Default paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/cefrj.csv")
OUTPUT_FILE = Path("scripts/data-sources/english/cefrj-extracted.json")


def extract(input_file: Path = INPUT_FILE, output_file: Path = OUTPUT_FILE) -> None:
    """Read the CEFR-J CSV, normalize rows, and write filtered JSON records.

    Args:
        input_file: CSV with at least ``headword``, ``pos``, ``CEFR`` columns.
        output_file: Destination JSON file (a list of record dicts).

    Rows are skipped when the POS is not in SUPPORTED_POS, the CEFR level
    is not one of A1-C2, or the headword is empty. Summary statistics are
    printed to stdout.
    """
    print(f"Reading: {input_file}")
    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0

    with open(input_file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            total_rows += 1

            # Filter: must have supported POS.
            # `or ""` guards against None: DictReader fills missing trailing
            # fields with restval (default None), so .get(..., "") alone is
            # not enough and .lower() would raise AttributeError.
            pos = (row.get("pos") or "").lower().strip()
            if pos not in SUPPORTED_POS:
                skipped_pos += 1
                continue

            # Filter: must have valid CEFR level
            cefr = (row.get("CEFR") or "").upper().strip()
            if cefr not in CEFR_LEVELS:
                skipped_invalid_cefr += 1
                continue

            # Normalize word: lowercase, trimmed; drop empty headwords
            word = (row.get("headword") or "").lower().strip()
            if not word:
                skipped_empty_word += 1
                continue

            records.append({"word": word, "pos": pos, "cefr": cefr, "source": "cefrj"})

    # Write output (ensure_ascii=False keeps any non-ASCII headwords readable)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats
    noun_count = sum(1 for r in records if r["pos"] == "noun")
    verb_count = sum(1 for r in records if r["pos"] == "verb")
    cefr_distribution = {}
    for level in CEFR_LEVELS:
        count = sum(1 for r in records if r["cefr"] == level)
        if count > 0:
            cefr_distribution[level] = count

    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f"  - Nouns: {noun_count}")
    print(f"  - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if level in cefr_distribution:
            print(f"  - {level}: {cefr_distribution[level]}")
    print("\nSkipped:")
    print(f"  - Unsupported POS: {skipped_pos}")
    print(f"  - Invalid CEFR: {skipped_invalid_cefr}")
    print(f"  - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {output_file}")


if __name__ == "__main__":
    extract()

View file

@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-random-json.py

Extracts CEFR data from random.json (English flashcard source).
Filters for useful_for_flashcard=true and supported POS (noun, verb).

Input: scripts/data-sources/english/random.json
Output: scripts/data-sources/english/random-extracted.json

Output format (normalized):
[
  { "word": "be", "pos": "verb", "cefr": "A1", "source": "random" }
]
"""
import json
from pathlib import Path

# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Default paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/random.json")
OUTPUT_FILE = Path("scripts/data-sources/english/random-extracted.json")


def extract(input_file: Path = INPUT_FILE, output_file: Path = OUTPUT_FILE) -> None:
    """Read the flashcard JSON, normalize entries, and write filtered records.

    Args:
        input_file: JSON array of entry objects with ``word``, ``pos``,
            ``cefr_level`` and ``useful_for_flashcard`` keys.
        output_file: Destination JSON file (a list of record dicts).

    Entries are skipped when not flagged useful_for_flashcard, when the
    POS is not in SUPPORTED_POS, when the CEFR level is not one of A1-C2,
    or when the word is empty. Summary statistics are printed to stdout.
    """
    print(f"Reading: {input_file}")
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    skipped_pos = 0
    skipped_not_useful = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for entry in data:
        # Filter: must be flagged useful for flashcard
        if not entry.get("useful_for_flashcard", False):
            skipped_not_useful += 1
            continue

        # Filter: must have supported POS.
        # `or ""` guards against an explicit JSON null, which .get's
        # default would not replace and .lower() would crash on.
        pos = (entry.get("pos") or "").lower().strip()
        if pos not in SUPPORTED_POS:
            skipped_pos += 1
            continue

        # Filter: must have valid CEFR level
        cefr = (entry.get("cefr_level") or "").upper().strip()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Normalize word: lowercase, trimmed; drop empty words
        word = (entry.get("word") or "").lower().strip()
        if not word:
            skipped_empty_word += 1
            continue

        records.append({"word": word, "pos": pos, "cefr": cefr, "source": "random"})

    # Write output (ensure_ascii=False keeps any non-ASCII words readable)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats
    noun_count = sum(1 for r in records if r["pos"] == "noun")
    verb_count = sum(1 for r in records if r["pos"] == "verb")
    cefr_distribution = {}
    for level in CEFR_LEVELS:
        count = sum(1 for r in records if r["cefr"] == level)
        if count > 0:
            cefr_distribution[level] = count

    print(f"\nExtracted: {len(records)} records")
    print(f"  - Nouns: {noun_count}")
    print(f"  - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if level in cefr_distribution:
            print(f"  - {level}: {cefr_distribution[level]}")
    print("\nSkipped:")
    print(f"  - Not useful for flashcard: {skipped_not_useful}")
    print(f"  - Unsupported POS: {skipped_pos}")
    print(f"  - Invalid CEFR: {skipped_invalid_cefr}")
    print(f"  - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {output_file}")


if __name__ == "__main__":
    extract()