wip
This commit is contained in:
parent
9d1a82bdf0
commit
488f0dab11
6 changed files with 666 additions and 82 deletions
13
packages/db/drizzle/0001_medical_fabian_cortez.sql
Normal file
13
packages/db/drizzle/0001_medical_fabian_cortez.sql
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
ALTER TABLE "decks" DROP CONSTRAINT "unique_deck_name";--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP CONSTRAINT "decks_language_pair_id_language_pairs_id_fk";
|
||||
--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP CONSTRAINT "decks_created_by_users_id_fk";
|
||||
--> statement-breakpoint
|
||||
DROP INDEX "idx_decks_created_by";--> statement-breakpoint
|
||||
DROP INDEX "idx_decks_language_pair";--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD COLUMN "validated_for_languages" varchar(10)[] DEFAULT '{}' NOT NULL;--> statement-breakpoint
|
||||
ALTER TABLE "deck_terms" DROP COLUMN "position";--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP COLUMN "language_pair_id";--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP COLUMN "created_by";--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD CONSTRAINT "unique_deck_name" UNIQUE("name");--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD CONSTRAINT "validated_languages_check" CHECK (validated_for_languages <@ ARRAY['en', 'it']::varchar[]);
|
||||
587
packages/db/drizzle/meta/0001_snapshot.json
Normal file
587
packages/db/drizzle/meta/0001_snapshot.json
Normal file
|
|
@ -0,0 +1,587 @@
|
|||
{
|
||||
"id": "d6bed73d-ee69-44b1-a3ce-3ae25898a6f0",
|
||||
"prevId": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd",
|
||||
"version": "7",
|
||||
"dialect": "postgresql",
|
||||
"tables": {
|
||||
"public.deck_terms": {
|
||||
"name": "deck_terms",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"deck_id": {
|
||||
"name": "deck_id",
|
||||
"type": "uuid",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"term_id": {
|
||||
"name": "term_id",
|
||||
"type": "uuid",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"added_at": {
|
||||
"name": "added_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_deck_terms_term": {
|
||||
"name": "idx_deck_terms_term",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "term_id",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {
|
||||
"deck_terms_deck_id_decks_id_fk": {
|
||||
"name": "deck_terms_deck_id_decks_id_fk",
|
||||
"tableFrom": "deck_terms",
|
||||
"tableTo": "decks",
|
||||
"columnsFrom": [
|
||||
"deck_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
},
|
||||
"deck_terms_term_id_terms_id_fk": {
|
||||
"name": "deck_terms_term_id_terms_id_fk",
|
||||
"tableFrom": "deck_terms",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
},
|
||||
"compositePrimaryKeys": {
|
||||
"deck_terms_deck_id_term_id_pk": {
|
||||
"name": "deck_terms_deck_id_term_id_pk",
|
||||
"columns": [
|
||||
"deck_id",
|
||||
"term_id"
|
||||
]
|
||||
}
|
||||
},
|
||||
"uniqueConstraints": {},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.decks": {
|
||||
"name": "decks",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"description": {
|
||||
"name": "description",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"validated_for_languages": {
|
||||
"name": "validated_for_languages",
|
||||
"type": "varchar(10)[]",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "'{}'"
|
||||
},
|
||||
"is_public": {
|
||||
"name": "is_public",
|
||||
"type": "boolean",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": false
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_deck_name": {
|
||||
"name": "unique_deck_name",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"name"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {
|
||||
"validated_languages_check": {
|
||||
"name": "validated_languages_check",
|
||||
"value": "validated_for_languages <@ ARRAY['en', 'it']::varchar[]"
|
||||
}
|
||||
},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.language_pairs": {
|
||||
"name": "language_pairs",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"source_language": {
|
||||
"name": "source_language",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"target_language": {
|
||||
"name": "target_language",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"label": {
|
||||
"name": "label",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"active": {
|
||||
"name": "active",
|
||||
"type": "boolean",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": true
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_pairs_active": {
|
||||
"name": "idx_pairs_active",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "active",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
},
|
||||
{
|
||||
"expression": "source_language",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
},
|
||||
{
|
||||
"expression": "target_language",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_source_target": {
|
||||
"name": "unique_source_target",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"source_language",
|
||||
"target_language"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {
|
||||
"source_language_check": {
|
||||
"name": "source_language_check",
|
||||
"value": "\"language_pairs\".\"source_language\" IN ('en', 'it')"
|
||||
},
|
||||
"target_language_check": {
|
||||
"name": "target_language_check",
|
||||
"value": "\"language_pairs\".\"target_language\" IN ('en', 'it')"
|
||||
},
|
||||
"no_self_pair": {
|
||||
"name": "no_self_pair",
|
||||
"value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\""
|
||||
}
|
||||
},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.term_glosses": {
|
||||
"name": "term_glosses",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"term_id": {
|
||||
"name": "term_id",
|
||||
"type": "uuid",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"language_code": {
|
||||
"name": "language_code",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"text": {
|
||||
"name": "text",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_term_glosses_term": {
|
||||
"name": "idx_term_glosses_term",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "term_id",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {
|
||||
"term_glosses_term_id_terms_id_fk": {
|
||||
"name": "term_glosses_term_id_terms_id_fk",
|
||||
"tableFrom": "term_glosses",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_term_gloss": {
|
||||
"name": "unique_term_gloss",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"term_id",
|
||||
"language_code",
|
||||
"text"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.terms": {
|
||||
"name": "terms",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"synset_id": {
|
||||
"name": "synset_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"pos": {
|
||||
"name": "pos",
|
||||
"type": "varchar(20)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_terms_pos": {
|
||||
"name": "idx_terms_pos",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "pos",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"terms_synset_id_unique": {
|
||||
"name": "terms_synset_id_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"synset_id"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {
|
||||
"pos_check": {
|
||||
"name": "pos_check",
|
||||
"value": "\"terms\".\"pos\" IN ('noun')"
|
||||
}
|
||||
},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.translations": {
|
||||
"name": "translations",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"term_id": {
|
||||
"name": "term_id",
|
||||
"type": "uuid",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"language_code": {
|
||||
"name": "language_code",
|
||||
"type": "varchar(10)",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"text": {
|
||||
"name": "text",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
}
|
||||
},
|
||||
"indexes": {
|
||||
"idx_translations_lang": {
|
||||
"name": "idx_translations_lang",
|
||||
"columns": [
|
||||
{
|
||||
"expression": "language_code",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
},
|
||||
{
|
||||
"expression": "term_id",
|
||||
"isExpression": false,
|
||||
"asc": true,
|
||||
"nulls": "last"
|
||||
}
|
||||
],
|
||||
"isUnique": false,
|
||||
"concurrently": false,
|
||||
"method": "btree",
|
||||
"with": {}
|
||||
}
|
||||
},
|
||||
"foreignKeys": {
|
||||
"translations_term_id_terms_id_fk": {
|
||||
"name": "translations_term_id_terms_id_fk",
|
||||
"tableFrom": "translations",
|
||||
"tableTo": "terms",
|
||||
"columnsFrom": [
|
||||
"term_id"
|
||||
],
|
||||
"columnsTo": [
|
||||
"id"
|
||||
],
|
||||
"onDelete": "cascade",
|
||||
"onUpdate": "no action"
|
||||
}
|
||||
},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"unique_translations": {
|
||||
"name": "unique_translations",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"term_id",
|
||||
"language_code",
|
||||
"text"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"isRLSEnabled": false
|
||||
},
|
||||
"public.users": {
|
||||
"name": "users",
|
||||
"schema": "",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "uuid",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"default": "gen_random_uuid()"
|
||||
},
|
||||
"openauth_sub": {
|
||||
"name": "openauth_sub",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true
|
||||
},
|
||||
"email": {
|
||||
"name": "email",
|
||||
"type": "varchar(255)",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"display_name": {
|
||||
"name": "display_name",
|
||||
"type": "varchar(100)",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"default": "now()"
|
||||
},
|
||||
"last_login_at": {
|
||||
"name": "last_login_at",
|
||||
"type": "timestamp with time zone",
|
||||
"primaryKey": false,
|
||||
"notNull": false
|
||||
}
|
||||
},
|
||||
"indexes": {},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {
|
||||
"users_openauth_sub_unique": {
|
||||
"name": "users_openauth_sub_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"openauth_sub"
|
||||
]
|
||||
},
|
||||
"users_email_unique": {
|
||||
"name": "users_email_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"email"
|
||||
]
|
||||
},
|
||||
"users_display_name_unique": {
|
||||
"name": "users_display_name_unique",
|
||||
"nullsNotDistinct": false,
|
||||
"columns": [
|
||||
"display_name"
|
||||
]
|
||||
}
|
||||
},
|
||||
"policies": {},
|
||||
"checkConstraints": {},
|
||||
"isRLSEnabled": false
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"schemas": {},
|
||||
"sequences": {},
|
||||
"roles": {},
|
||||
"policies": {},
|
||||
"views": {},
|
||||
"_meta": {
|
||||
"columns": {},
|
||||
"schemas": {},
|
||||
"tables": {}
|
||||
}
|
||||
}
|
||||
|
|
@ -8,6 +8,13 @@
|
|||
"when": 1774721919883,
|
||||
"tag": "0000_bitter_turbo",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 1,
|
||||
"version": "7",
|
||||
"when": 1774970553186,
|
||||
"tag": "0001_medical_fabian_cortez",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@ -5,6 +5,7 @@
|
|||
"type": "module",
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"generate": "drizzle-kit generate",
|
||||
"migrate": "drizzle-kit migrate"
|
||||
},
|
||||
"dependencies": {
|
||||
|
|
|
|||
|
|
@ -1,55 +0,0 @@
|
|||
import fs from "node:fs/promises";
|
||||
import { db } from "@glossa/db";
|
||||
import { translations } from "@glossa/db/schema";
|
||||
import { inArray } from "drizzle-orm";
|
||||
|
||||
const wordlistPath = "./src/data/wordlists/top1000englishnouns";
|
||||
const unmatchedOutputPath =
|
||||
"./src/data/wordlists/top1000englishnouns-unmatched";
|
||||
|
||||
const main = async () => {
|
||||
// 1. Read and normalise the word list
|
||||
console.log("📖 Reading word list...");
|
||||
const raw = await fs.readFile(wordlistPath, "utf8");
|
||||
const words = raw
|
||||
.split("\n")
|
||||
.map((w) => w.trim().toLowerCase())
|
||||
.filter(Boolean);
|
||||
console.log(` ${words.length} words loaded\n`);
|
||||
|
||||
// 2. Query DB for matches
|
||||
console.log("🔍 Checking against database...");
|
||||
const rows = await db
|
||||
.select({ text: translations.text })
|
||||
.from(translations)
|
||||
.where(inArray(translations.text, words));
|
||||
|
||||
const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));
|
||||
|
||||
// 3. Split into matched / unmatched
|
||||
const matched = words.filter((w) => matchedSet.has(w));
|
||||
const unmatched = words.filter((w) => !matchedSet.has(w));
|
||||
|
||||
// 4. Terminal output
|
||||
console.log(`✅ Matched: ${matched.length}/${words.length}`);
|
||||
console.log(`❌ Unmatched: ${unmatched.length}/${words.length}`);
|
||||
console.log(
|
||||
`📊 Coverage: ${((matched.length / words.length) * 100).toFixed(1)}%\n`,
|
||||
);
|
||||
|
||||
if (unmatched.length > 0) {
|
||||
console.log("❌ Unmatched words:");
|
||||
for (const w of unmatched) {
|
||||
console.log(` ${w}`);
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Write unmatched to file
|
||||
// await fs.writeFile(unmatchedOutputPath, unmatched.join("\n"), "utf8");
|
||||
console.log(`\n💾 Unmatched words written to ${unmatchedOutputPath}`);
|
||||
};
|
||||
|
||||
main().catch((error) => {
|
||||
console.error(error);
|
||||
process.exit(1);
|
||||
});
|
||||
|
|
@ -1,46 +1,77 @@
|
|||
/*
|
||||
Parse CLI args → resolve the word list file path
|
||||
Connect to the database
|
||||
Read the word list file into an ordered array of strings
|
||||
Look up the en→it language pair ID from language_pairs
|
||||
Batch-fetch all matching rows from translations where language_code = 'en' and text IN (words)
|
||||
Build a word → termId map from the results
|
||||
Walk the ordered word list → split into hits (word found, capture position) and misses (skip)
|
||||
Check if a deck with this name already exists → if so, delete its deck_terms then the deck itself
|
||||
Insert the new decks row
|
||||
Insert all deck_terms rows in batches (deckId, termId, position)
|
||||
Log the skipped words
|
||||
Close the DB connection
|
||||
- [x] Setup — hardcoded path, name, description, source language, POS
|
||||
- [x] Read wordlist — load the 1000 nouns
|
||||
- [x] Query terms — match to database, find which ones have translations
|
||||
- [ ] Validation — determine validated languages from the data (e.g., ["en", "it"] if all matched terms have both)
|
||||
- [ ] Check idempotency — skip if deck exists
|
||||
- [ ] Create deck — insert with discovered validated_for_languages
|
||||
- [ ] Link terms — insert deck_terms
|
||||
- [ ] Report — summary
|
||||
*/
|
||||
|
||||
import fs from "node:fs/promises";
|
||||
import { db } from "@glossa/db";
|
||||
import { translations } from "@glossa/db/schema";
|
||||
import { inArray } from "drizzle-orm";
|
||||
import { translations, terms } from "@glossa/db/schema";
|
||||
import { inArray, and, eq } from "drizzle-orm";
|
||||
|
||||
const wordlistPath = "./src/data/wordlists/top1000englishnouns";
|
||||
const pathToWordlist = "./src/data/wordlists/top1000englishnouns";
|
||||
const nameOfDeck = "top english nouns";
|
||||
const descriptionOfDeck =
|
||||
"Most frequently used English nouns for vocabulary practice";
|
||||
const sourceLanguage = "en";
|
||||
const sourcePOS = "noun";
|
||||
|
||||
const main = async () => {
|
||||
// Read and normalise the word list
|
||||
console.log("📖 Reading word list...");
|
||||
const raw = await fs.readFile(wordlistPath, "utf8");
|
||||
const readingFromWordlist = async () => {
|
||||
const raw = await fs.readFile(pathToWordlist, "utf8");
|
||||
const words = raw
|
||||
.split("\n")
|
||||
.map((w) => w.trim().toLowerCase())
|
||||
.filter(Boolean);
|
||||
console.log(` ${words.length} words loaded\n`);
|
||||
return words;
|
||||
};
|
||||
|
||||
// Query DB for matches
|
||||
console.log("🔍 Checking against database...");
|
||||
const checkingSourceWordsAgainstDB = async (words: string[]) => {
|
||||
const rows = await db
|
||||
.select({ text: translations.text, termId: translations.term_id })
|
||||
.from(translations)
|
||||
.where(inArray(translations.text, words));
|
||||
.innerJoin(terms, eq(translations.term_id, terms.id))
|
||||
.where(
|
||||
and(
|
||||
inArray(translations.text, words),
|
||||
eq(translations.language_code, sourceLanguage),
|
||||
eq(terms.pos, sourcePOS),
|
||||
),
|
||||
);
|
||||
|
||||
const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));
|
||||
const wordsInDb = words.filter((w) => matchedSet.has(w));
|
||||
// map word text to term_id
|
||||
const wordToTermId = new Map<string, string>();
|
||||
for (const row of rows) {
|
||||
const word = row.text.toLowerCase();
|
||||
if (!wordToTermId.has(word)) {
|
||||
wordToTermId.set(word, row.termId);
|
||||
}
|
||||
}
|
||||
const termIds = Array.from(wordToTermId.values());
|
||||
const missingWords = words.filter((w) => !wordToTermId.has(w));
|
||||
|
||||
console.log("wordsInDb: ", wordsInDb);
|
||||
return { termIds, missingWords };
|
||||
};
|
||||
|
||||
const writeMissingWordsToFile = async (words: string[]) => {};
|
||||
|
||||
const main = async () => {
|
||||
// Read and normalise the word list
|
||||
console.log("📖 Reading word list...");
|
||||
const sourceWords = await readingFromWordlist();
|
||||
console.log(` ${sourceWords.length} words loaded\n`);
|
||||
// check if sourceWords exist in database
|
||||
console.log("🔍 Checking against database...");
|
||||
const { termIds, missingWords } =
|
||||
await checkingSourceWordsAgainstDB(sourceWords);
|
||||
console.log("words found in db: ", termIds.length);
|
||||
console.log("words NOT found in db: ", missingWords.length);
|
||||
// write missing words to file
|
||||
await writeMissingWordsToFile(missingWords);
|
||||
};
|
||||
|
||||
main().catch((error) => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue