extraction datafiles with cefr annotations
This commit is contained in:
parent
e79fa6922b
commit
3596f76492
19 changed files with 2368633 additions and 2 deletions
|
|
@ -11,7 +11,7 @@
|
||||||
"format": "prettier --write .",
|
"format": "prettier --write .",
|
||||||
"format:check": "prettier --check ."
|
"format:check": "prettier --check ."
|
||||||
},
|
},
|
||||||
"packageManager": "pnpm@10.32.1",
|
"packageManager": "pnpm@10.33.0",
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@eslint/js": "^10.0.1",
|
"@eslint/js": "^10.0.1",
|
||||||
"@tanstack/eslint-plugin-router": "^1.161.6",
|
"@tanstack/eslint-plugin-router": "^1.161.6",
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,8 @@
|
||||||
"@glossa/shared": "workspace:*",
|
"@glossa/shared": "workspace:*",
|
||||||
"dotenv": "^17.3.1",
|
"dotenv": "^17.3.1",
|
||||||
"drizzle-orm": "^0.45.1",
|
"drizzle-orm": "^0.45.1",
|
||||||
"pg": "^8.20.0"
|
"pg": "^8.20.0",
|
||||||
|
"xlsx": "^0.18.5"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/pg": "^8.20.0",
|
"@types/pg": "^8.20.0",
|
||||||
|
|
|
||||||
72
pnpm-lock.yaml
generated
72
pnpm-lock.yaml
generated
|
|
@ -124,6 +124,9 @@ importers:
|
||||||
pg:
|
pg:
|
||||||
specifier: ^8.20.0
|
specifier: ^8.20.0
|
||||||
version: 8.20.0
|
version: 8.20.0
|
||||||
|
xlsx:
|
||||||
|
specifier: ^0.18.5
|
||||||
|
version: 0.18.5
|
||||||
devDependencies:
|
devDependencies:
|
||||||
'@types/pg':
|
'@types/pg':
|
||||||
specifier: ^8.20.0
|
specifier: ^8.20.0
|
||||||
|
|
@ -1299,6 +1302,10 @@ packages:
|
||||||
engines: {node: '>=0.4.0'}
|
engines: {node: '>=0.4.0'}
|
||||||
hasBin: true
|
hasBin: true
|
||||||
|
|
||||||
|
adler-32@1.3.1:
|
||||||
|
resolution: {integrity: sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
|
||||||
ajv@6.14.0:
|
ajv@6.14.0:
|
||||||
resolution: {integrity: sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==}
|
resolution: {integrity: sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==}
|
||||||
|
|
||||||
|
|
@ -1383,6 +1390,10 @@ packages:
|
||||||
caniuse-lite@1.0.30001780:
|
caniuse-lite@1.0.30001780:
|
||||||
resolution: {integrity: sha512-llngX0E7nQci5BPJDqoZSbuZ5Bcs9F5db7EtgfwBerX9XGtkkiO4NwfDDIRzHTTwcYC8vC7bmeUEPGrKlR/TkQ==}
|
resolution: {integrity: sha512-llngX0E7nQci5BPJDqoZSbuZ5Bcs9F5db7EtgfwBerX9XGtkkiO4NwfDDIRzHTTwcYC8vC7bmeUEPGrKlR/TkQ==}
|
||||||
|
|
||||||
|
cfb@1.2.2:
|
||||||
|
resolution: {integrity: sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
|
||||||
chai@6.2.2:
|
chai@6.2.2:
|
||||||
resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==}
|
resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==}
|
||||||
engines: {node: '>=18'}
|
engines: {node: '>=18'}
|
||||||
|
|
@ -1403,6 +1414,10 @@ packages:
|
||||||
resolution: {integrity: sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==}
|
resolution: {integrity: sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==}
|
||||||
engines: {node: '>=6'}
|
engines: {node: '>=6'}
|
||||||
|
|
||||||
|
codepage@1.15.0:
|
||||||
|
resolution: {integrity: sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
|
||||||
color-convert@2.0.1:
|
color-convert@2.0.1:
|
||||||
resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==}
|
resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==}
|
||||||
engines: {node: '>=7.0.0'}
|
engines: {node: '>=7.0.0'}
|
||||||
|
|
@ -1437,6 +1452,11 @@ packages:
|
||||||
resolution: {integrity: sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==}
|
resolution: {integrity: sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==}
|
||||||
engines: {node: '>= 0.6'}
|
engines: {node: '>= 0.6'}
|
||||||
|
|
||||||
|
crc-32@1.2.2:
|
||||||
|
resolution: {integrity: sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
hasBin: true
|
||||||
|
|
||||||
cross-spawn@7.0.6:
|
cross-spawn@7.0.6:
|
||||||
resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
|
resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
|
||||||
engines: {node: '>= 8'}
|
engines: {node: '>= 8'}
|
||||||
|
|
@ -1769,6 +1789,10 @@ packages:
|
||||||
resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==}
|
resolution: {integrity: sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==}
|
||||||
engines: {node: '>= 0.6'}
|
engines: {node: '>= 0.6'}
|
||||||
|
|
||||||
|
frac@1.1.2:
|
||||||
|
resolution: {integrity: sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
|
||||||
fresh@2.0.0:
|
fresh@2.0.0:
|
||||||
resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==}
|
resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==}
|
||||||
engines: {node: '>= 0.8'}
|
engines: {node: '>= 0.8'}
|
||||||
|
|
@ -2377,6 +2401,10 @@ packages:
|
||||||
resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==}
|
resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==}
|
||||||
engines: {node: '>= 10.x'}
|
engines: {node: '>= 10.x'}
|
||||||
|
|
||||||
|
ssf@0.11.2:
|
||||||
|
resolution: {integrity: sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
|
||||||
stackback@0.0.2:
|
stackback@0.0.2:
|
||||||
resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==}
|
resolution: {integrity: sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==}
|
||||||
|
|
||||||
|
|
@ -2638,10 +2666,18 @@ packages:
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
hasBin: true
|
hasBin: true
|
||||||
|
|
||||||
|
wmf@1.0.2:
|
||||||
|
resolution: {integrity: sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
|
||||||
word-wrap@1.2.5:
|
word-wrap@1.2.5:
|
||||||
resolution: {integrity: sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==}
|
resolution: {integrity: sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==}
|
||||||
engines: {node: '>=0.10.0'}
|
engines: {node: '>=0.10.0'}
|
||||||
|
|
||||||
|
word@0.3.0:
|
||||||
|
resolution: {integrity: sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
|
||||||
wrap-ansi@7.0.0:
|
wrap-ansi@7.0.0:
|
||||||
resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==}
|
resolution: {integrity: sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==}
|
||||||
engines: {node: '>=10'}
|
engines: {node: '>=10'}
|
||||||
|
|
@ -2649,6 +2685,11 @@ packages:
|
||||||
wrappy@1.0.2:
|
wrappy@1.0.2:
|
||||||
resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==}
|
resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==}
|
||||||
|
|
||||||
|
xlsx@0.18.5:
|
||||||
|
resolution: {integrity: sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==}
|
||||||
|
engines: {node: '>=0.8'}
|
||||||
|
hasBin: true
|
||||||
|
|
||||||
xml-name-validator@5.0.0:
|
xml-name-validator@5.0.0:
|
||||||
resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==}
|
resolution: {integrity: sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==}
|
||||||
engines: {node: '>=18'}
|
engines: {node: '>=18'}
|
||||||
|
|
@ -3648,6 +3689,8 @@ snapshots:
|
||||||
|
|
||||||
acorn@8.16.0: {}
|
acorn@8.16.0: {}
|
||||||
|
|
||||||
|
adler-32@1.3.1: {}
|
||||||
|
|
||||||
ajv@6.14.0:
|
ajv@6.14.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
fast-deep-equal: 3.1.3
|
fast-deep-equal: 3.1.3
|
||||||
|
|
@ -3745,6 +3788,11 @@ snapshots:
|
||||||
|
|
||||||
caniuse-lite@1.0.30001780: {}
|
caniuse-lite@1.0.30001780: {}
|
||||||
|
|
||||||
|
cfb@1.2.2:
|
||||||
|
dependencies:
|
||||||
|
adler-32: 1.3.1
|
||||||
|
crc-32: 1.2.2
|
||||||
|
|
||||||
chai@6.2.2: {}
|
chai@6.2.2: {}
|
||||||
|
|
||||||
chalk@4.1.2:
|
chalk@4.1.2:
|
||||||
|
|
@ -3772,6 +3820,8 @@ snapshots:
|
||||||
|
|
||||||
clsx@2.1.1: {}
|
clsx@2.1.1: {}
|
||||||
|
|
||||||
|
codepage@1.15.0: {}
|
||||||
|
|
||||||
color-convert@2.0.1:
|
color-convert@2.0.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
color-name: 1.1.4
|
color-name: 1.1.4
|
||||||
|
|
@ -3799,6 +3849,8 @@ snapshots:
|
||||||
|
|
||||||
cookie@0.7.2: {}
|
cookie@0.7.2: {}
|
||||||
|
|
||||||
|
crc-32@1.2.2: {}
|
||||||
|
|
||||||
cross-spawn@7.0.6:
|
cross-spawn@7.0.6:
|
||||||
dependencies:
|
dependencies:
|
||||||
path-key: 3.1.1
|
path-key: 3.1.1
|
||||||
|
|
@ -4138,6 +4190,8 @@ snapshots:
|
||||||
|
|
||||||
forwarded@0.2.0: {}
|
forwarded@0.2.0: {}
|
||||||
|
|
||||||
|
frac@1.1.2: {}
|
||||||
|
|
||||||
fresh@2.0.0: {}
|
fresh@2.0.0: {}
|
||||||
|
|
||||||
fsevents@2.3.3:
|
fsevents@2.3.3:
|
||||||
|
|
@ -4700,6 +4754,10 @@ snapshots:
|
||||||
|
|
||||||
split2@4.2.0: {}
|
split2@4.2.0: {}
|
||||||
|
|
||||||
|
ssf@0.11.2:
|
||||||
|
dependencies:
|
||||||
|
frac: 1.1.2
|
||||||
|
|
||||||
stackback@0.0.2: {}
|
stackback@0.0.2: {}
|
||||||
|
|
||||||
statuses@2.0.2: {}
|
statuses@2.0.2: {}
|
||||||
|
|
@ -4918,8 +4976,12 @@ snapshots:
|
||||||
siginfo: 2.0.0
|
siginfo: 2.0.0
|
||||||
stackback: 0.0.2
|
stackback: 0.0.2
|
||||||
|
|
||||||
|
wmf@1.0.2: {}
|
||||||
|
|
||||||
word-wrap@1.2.5: {}
|
word-wrap@1.2.5: {}
|
||||||
|
|
||||||
|
word@0.3.0: {}
|
||||||
|
|
||||||
wrap-ansi@7.0.0:
|
wrap-ansi@7.0.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
ansi-styles: 4.3.0
|
ansi-styles: 4.3.0
|
||||||
|
|
@ -4928,6 +4990,16 @@ snapshots:
|
||||||
|
|
||||||
wrappy@1.0.2: {}
|
wrappy@1.0.2: {}
|
||||||
|
|
||||||
|
xlsx@0.18.5:
|
||||||
|
dependencies:
|
||||||
|
adler-32: 1.3.1
|
||||||
|
cfb: 1.2.2
|
||||||
|
codepage: 1.15.0
|
||||||
|
crc-32: 1.2.2
|
||||||
|
ssf: 0.11.2
|
||||||
|
wmf: 1.0.2
|
||||||
|
word: 0.3.0
|
||||||
|
|
||||||
xml-name-validator@5.0.0: {}
|
xml-name-validator@5.0.0: {}
|
||||||
|
|
||||||
xmlchars@2.2.0: {}
|
xmlchars@2.2.0: {}
|
||||||
|
|
|
||||||
32642
scripts/data-sources/english/cefrj-extracted.json
Normal file
32642
scripts/data-sources/english/cefrj-extracted.json
Normal file
File diff suppressed because it is too large
Load diff
7800
scripts/data-sources/english/cefrj.csv
Normal file
7800
scripts/data-sources/english/cefrj.csv
Normal file
File diff suppressed because it is too large
Load diff
BIN
scripts/data-sources/english/en_m3.xls
Normal file
BIN
scripts/data-sources/english/en_m3.xls
Normal file
Binary file not shown.
2137
scripts/data-sources/english/octanove.csv
Normal file
2137
scripts/data-sources/english/octanove.csv
Normal file
File diff suppressed because it is too large
Load diff
89750
scripts/data-sources/english/random-extracted.json
Normal file
89750
scripts/data-sources/english/random-extracted.json
Normal file
File diff suppressed because it is too large
Load diff
186374
scripts/data-sources/english/random.json
Normal file
186374
scripts/data-sources/english/random.json
Normal file
File diff suppressed because it is too large
Load diff
193382
scripts/data-sources/french/french.json
Normal file
193382
scripts/data-sources/french/french.json
Normal file
File diff suppressed because it is too large
Load diff
324482
scripts/data-sources/german/german.json
Normal file
324482
scripts/data-sources/german/german.json
Normal file
File diff suppressed because it is too large
Load diff
2987
scripts/data-sources/italian/it-list_with_glossas.csv
Normal file
2987
scripts/data-sources/italian/it-list_with_glossas.csv
Normal file
File diff suppressed because it is too large
Load diff
BIN
scripts/data-sources/italian/it_m3.xls
Normal file
BIN
scripts/data-sources/italian/it_m3.xls
Normal file
Binary file not shown.
185759
scripts/data-sources/italian/italian.json
Normal file
185759
scripts/data-sources/italian/italian.json
Normal file
File diff suppressed because it is too large
Load diff
517565
scripts/data-sources/italian/subtlex-it.csv
Normal file
517565
scripts/data-sources/italian/subtlex-it.csv
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
163922
scripts/data-sources/spanish/spanish.json
Normal file
163922
scripts/data-sources/spanish/spanish.json
Normal file
File diff suppressed because it is too large
Load diff
96
scripts/extraction-scripts/english/extract-cefrj-csv.py
Normal file
96
scripts/extraction-scripts/english/extract-cefrj-csv.py
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
scripts/extraction-scripts/english/extract-cefrj-csv.py
|
||||||
|
|
||||||
|
Extracts CEFR data from cefrj.csv (CEFR-J vocabulary profile).
|
||||||
|
Filters for supported POS (noun, verb).
|
||||||
|
|
||||||
|
Input: scripts/data-sources/english/cefrj.csv
|
||||||
|
Output: scripts/data-sources/english/cefrj-extracted.json
|
||||||
|
|
||||||
|
Output format (normalized):
|
||||||
|
[
|
||||||
|
{ "word": "ability", "pos": "noun", "cefr": "A2", "source": "cefrj" }
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/cefrj.csv")
OUTPUT_FILE = Path("scripts/data-sources/english/cefrj-extracted.json")


def _clean(value: object) -> str:
    """Return *value* without surrounding whitespace, or "" if it is not a string.

    csv.DictReader fills the missing trailing fields of a short row with
    None, so ``row.get(key, "")`` can still return None; calling a string
    method on it directly would raise AttributeError.
    """
    return value.strip() if isinstance(value, str) else ""


def extract() -> None:
    """Extract supported, CEFR-tagged words from INPUT_FILE into OUTPUT_FILE.

    Rows are read with csv.DictReader and looked up by the header names
    "pos", "CEFR" and "headword". A row is kept only when, in this order,
    its POS is in SUPPORTED_POS, its CEFR value is in CEFR_LEVELS and its
    headword is non-empty. Kept rows are lower-cased (word, pos) or
    upper-cased (cefr) and written as a JSON array of
    {"word", "pos", "cefr", "source": "cefrj"} objects.

    Both paths are relative, so the script has to be started from the
    project root. A summary of kept and skipped rows is printed to stdout.

    Raises:
        OSError: if INPUT_FILE cannot be read or OUTPUT_FILE cannot be written.
    """
    # Local import: keeps this block independent of the file-level imports.
    from collections import Counter

    print(f"Reading: {INPUT_FILE}")

    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0

    # newline="" lets the csv module handle line endings itself (including
    # newlines embedded in quoted fields). "utf-8-sig" decodes plain UTF-8 as
    # well, but drops a leading BOM that would otherwise become part of the
    # first header name and make every lookup of that column come back empty.
    with open(INPUT_FILE, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            total_rows += 1

            # Filter: must have supported POS
            pos = _clean(row.get("pos")).lower()
            if pos not in SUPPORTED_POS:
                skipped_pos += 1
                continue

            # Filter: must have valid CEFR level
            cefr = _clean(row.get("CEFR")).upper()
            if cefr not in CEFR_LEVELS:
                skipped_invalid_cefr += 1
                continue

            # Normalize word
            word = _clean(row.get("headword")).lower()
            if not word:
                skipped_empty_word += 1
                continue

            records.append(
                {"word": word, "pos": pos, "cefr": cefr, "source": "cefrj"}
            )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: one pass per dimension instead of one scan per POS / CEFR level.
    pos_counts = Counter(record["pos"] for record in records)
    cefr_counts = Counter(record["cefr"] for record in records)

    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        # Levels with no records are left out of the report.
        if cefr_counts[level] > 0:
            print(f" - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")


if __name__ == "__main__":
    extract()
|
||||||
99
scripts/extraction-scripts/english/extract-random-json.py
Normal file
99
scripts/extraction-scripts/english/extract-random-json.py
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
scripts/extraction-scripts/english/extract-random-json.py
|
||||||
|
|
||||||
|
Extracts CEFR data from random.json (English flashcard source).
|
||||||
|
Filters for useful_for_flashcard=true and supported POS (noun, verb).
|
||||||
|
|
||||||
|
Input: scripts/data-sources/english/random.json
|
||||||
|
Output: scripts/data-sources/english/random-extracted.json
|
||||||
|
|
||||||
|
Output format (normalized):
|
||||||
|
[
|
||||||
|
{ "word": "be", "pos": "verb", "cefr": "A1", "source": "random" }
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/random.json")
OUTPUT_FILE = Path("scripts/data-sources/english/random-extracted.json")


def _clean(value: object) -> str:
    """Return *value* without surrounding whitespace, or "" if it is not a string.

    ``dict.get(key, "")`` only supplies the default when the key is absent;
    a JSON ``null`` (or a number) still comes back as a non-string, and
    calling a string method on it directly would raise AttributeError.
    """
    return value.strip() if isinstance(value, str) else ""


def extract() -> None:
    """Extract flashcard-worthy, CEFR-tagged words from INPUT_FILE into OUTPUT_FILE.

    The input is iterated entry by entry and each entry is queried with
    ``.get`` (assumes a top-level JSON array of objects; verify against
    the data file). An entry is kept only when, in this order,
    "useful_for_flashcard" is truthy, "pos" is in SUPPORTED_POS,
    "cefr_level" is in CEFR_LEVELS and "word" is non-empty. Kept entries
    are lower-cased (word, pos) or upper-cased (cefr) and written as a
    JSON array of {"word", "pos", "cefr", "source": "random"} objects.

    Both paths are relative, so the script has to be started from the
    project root. A summary of kept and skipped entries is printed to stdout.

    Raises:
        OSError: if INPUT_FILE cannot be read or OUTPUT_FILE cannot be written.
        json.JSONDecodeError: if INPUT_FILE is not valid JSON.
    """
    # Local import: keeps this block independent of the file-level imports.
    from collections import Counter

    print(f"Reading: {INPUT_FILE}")

    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    skipped_pos = 0
    skipped_not_useful = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for entry in data:
        # Filter: must be useful for flashcard
        if not entry.get("useful_for_flashcard", False):
            skipped_not_useful += 1
            continue

        # Filter: must have supported POS
        pos = _clean(entry.get("pos")).lower()
        if pos not in SUPPORTED_POS:
            skipped_pos += 1
            continue

        # Filter: must have valid CEFR level
        cefr = _clean(entry.get("cefr_level")).upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Normalize word
        word = _clean(entry.get("word")).lower()
        if not word:
            skipped_empty_word += 1
            continue

        records.append(
            {"word": word, "pos": pos, "cefr": cefr, "source": "random"}
        )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: one pass per dimension instead of one scan per POS / CEFR level.
    pos_counts = Counter(record["pos"] for record in records)
    cefr_counts = Counter(record["cefr"] for record in records)

    print(f"\nExtracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        # Levels with no records are left out of the report.
        if cefr_counts[level] > 0:
            print(f" - {level}: {cefr_counts[level]}")

    print("\nSkipped:")
    print(f" - Not useful for flashcard: {skipped_not_useful}")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")


if __name__ == "__main__":
    extract()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue