feat(db): add incremental upsert seed script for WordNet vocabulary
Implements packages/db/src/seed.ts — reads all JSON files from scripts/datafiles/, validates filenames against supported language codes and POS, and upserts synsets into the `terms` and `translations` tables via onConflictDoNothing. Safe to re-run; produces 0 writes on a duplicate run.
This commit is contained in:
parent
55885336ba
commit
2b177aad5b
12 changed files with 1349 additions and 10 deletions
337
documentation/data-seeding-notes.md
Normal file
337
documentation/data-seeding-notes.md
Normal file
|
|
@ -0,0 +1,337 @@
|
||||||
|
# WordNet Seeding Script — Session Summary
|
||||||
|
|
||||||
|
## Project Context
|
||||||
|
|
||||||
|
A multiplayer English–Italian vocabulary trainer (Glossa) built as a pnpm monorepo. Vocabulary data comes from Open Multilingual Wordnet (OMW) and is extracted into JSON files, then seeded into a PostgreSQL database via Drizzle ORM.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. JSON Extraction Format
|
||||||
|
|
||||||
|
Each synset extracted from WordNet is represented as:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"synset_id": "ili:i35545",
|
||||||
|
"pos": "noun",
|
||||||
|
"translations": {
|
||||||
|
"en": ["entity"],
|
||||||
|
"it": ["cosa", "entità"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fields:**
|
||||||
|
- `synset_id` — OMW Interlingual Index ID, maps to `terms.synset_id` in the DB
|
||||||
|
- `pos` — part of speech, matches the CHECK constraint on `terms.pos`
|
||||||
|
- `translations` — object of language code → array of lemmas (synonyms within a synset)
|
||||||
|
|
||||||
|
**Glosses** are not extracted — the `term_glosses` table exists in the schema for future use but is not needed for the MVP quiz mechanic.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Database Schema (relevant tables)
|
||||||
|
|
||||||
|
```
|
||||||
|
terms
|
||||||
|
id uuid PK
|
||||||
|
synset_id text UNIQUE
|
||||||
|
pos varchar(20)
|
||||||
|
created_at timestamptz
|
||||||
|
|
||||||
|
translations
|
||||||
|
id uuid PK
|
||||||
|
term_id uuid FK → terms.id (CASCADE)
|
||||||
|
language_code varchar(10)
|
||||||
|
text text
|
||||||
|
created_at timestamptz
|
||||||
|
UNIQUE (term_id, language_code, text)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Seeding Script — v1 (batch, truncate-based)
|
||||||
|
|
||||||
|
### Approach
|
||||||
|
- Read a single JSON file
|
||||||
|
- Batch inserts into `terms` and `translations` in groups of 500
|
||||||
|
- Truncate tables before each run for a clean slate
|
||||||
|
|
||||||
|
### Key decisions made during development
|
||||||
|
|
||||||
|
| Issue | Resolution |
|
||||||
|
|-------|-----------|
|
||||||
|
| `JSON.parse` returns `any` | Added `Array.isArray` check before casting |
|
||||||
|
| `forEach` doesn't await | Switched to `for...of` |
|
||||||
|
| Empty array types | Used Drizzle's `$inferInsert` types |
|
||||||
|
| `translations` naming conflict | Renamed local variable to `translationRows` |
|
||||||
|
| Final batch not flushed | Added `if (termsArray.length > 0)` guard after loop |
|
||||||
|
| Exact batch size check `=== 500` | Changed to `>= 500` |
|
||||||
|
|
||||||
|
### Final script structure
|
||||||
|
|
||||||
|
```ts
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
|
||||||
|
import { db } from "@glossa/db";
|
||||||
|
import { terms, translations } from "@glossa/db/schema";
|
||||||
|
|
||||||
|
type POS = (typeof SUPPORTED_POS)[number];
|
||||||
|
type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];
|
||||||
|
type TermInsert = typeof terms.$inferInsert;
|
||||||
|
type TranslationInsert = typeof translations.$inferInsert;
|
||||||
|
type Synset = {
|
||||||
|
synset_id: string;
|
||||||
|
pos: POS;
|
||||||
|
translations: Record<LANGUAGE_CODE, string[]>;
|
||||||
|
};
|
||||||
|
|
||||||
|
const dataDir = "../../scripts/datafiles/";
|
||||||
|
|
||||||
|
const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
|
||||||
|
const data = await fs.readFile(filepath, "utf8");
|
||||||
|
const parsed = JSON.parse(data);
|
||||||
|
if (!Array.isArray(parsed)) throw new Error("Expected a JSON array");
|
||||||
|
return parsed as Synset[];
|
||||||
|
};
|
||||||
|
|
||||||
|
const uploadToDB = async (
|
||||||
|
termsData: TermInsert[],
|
||||||
|
translationsData: TranslationInsert[],
|
||||||
|
) => {
|
||||||
|
await db.insert(terms).values(termsData);
|
||||||
|
await db.insert(translations).values(translationsData);
|
||||||
|
};
|
||||||
|
|
||||||
|
const main = async () => {
|
||||||
|
console.log("Reading JSON file...");
|
||||||
|
const allSynsets = await readFromJsonFile(dataDir + "en-it-nouns.json");
|
||||||
|
console.log(`Loaded ${allSynsets.length} synsets`);
|
||||||
|
|
||||||
|
const termsArray: TermInsert[] = [];
|
||||||
|
const translationsArray: TranslationInsert[] = [];
|
||||||
|
let batchCount = 0;
|
||||||
|
|
||||||
|
for (const synset of allSynsets) {
|
||||||
|
const term = {
|
||||||
|
id: crypto.randomUUID(),
|
||||||
|
synset_id: synset.synset_id,
|
||||||
|
pos: synset.pos,
|
||||||
|
};
|
||||||
|
|
||||||
|
const translationRows = Object.entries(synset.translations).flatMap(
|
||||||
|
([lang, lemmas]) =>
|
||||||
|
lemmas.map((lemma) => ({
|
||||||
|
id: crypto.randomUUID(),
|
||||||
|
term_id: term.id,
|
||||||
|
language_code: lang as LANGUAGE_CODE,
|
||||||
|
text: lemma,
|
||||||
|
})),
|
||||||
|
);
|
||||||
|
|
||||||
|
translationsArray.push(...translationRows);
|
||||||
|
termsArray.push(term);
|
||||||
|
|
||||||
|
if (termsArray.length >= 500) {
|
||||||
|
batchCount++;
|
||||||
|
console.log(`Uploading batch ${batchCount} (${batchCount * 500}/${allSynsets.length} synsets)...`);
|
||||||
|
await uploadToDB(termsArray, translationsArray);
|
||||||
|
termsArray.length = 0;
|
||||||
|
translationsArray.length = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (termsArray.length > 0) {
|
||||||
|
batchCount++;
|
||||||
|
console.log(`Uploading final batch (${allSynsets.length}/${allSynsets.length} synsets)...`);
|
||||||
|
await uploadToDB(termsArray, translationsArray);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`Seeding complete — ${allSynsets.length} synsets inserted`);
|
||||||
|
};
|
||||||
|
|
||||||
|
main().catch((error) => {
|
||||||
|
console.error(error);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Pitfalls Encountered
|
||||||
|
|
||||||
|
### Duplicate key on re-run
|
||||||
|
Running the script twice causes `duplicate key value violates unique constraint "terms_synset_id_unique"`. Fix: truncate before seeding.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker exec -it glossa-database psql -U glossa -d glossa -c "TRUNCATE translations, terms CASCADE;"
|
||||||
|
```
|
||||||
|
|
||||||
|
### `onConflictDoNothing` breaks FK references
|
||||||
|
When `onConflictDoNothing` skips a `terms` insert, the in-memory UUID is never written to the DB. Subsequent `translations` inserts reference that non-existent UUID, causing a FK violation. This is why the truncate approach is correct for batch seeding.
|
||||||
|
|
||||||
|
### DATABASE_URL misconfigured
|
||||||
|
Correct format:
|
||||||
|
```
|
||||||
|
DATABASE_URL=postgresql://glossa:glossa@localhost:5432/glossa
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tables not found after `docker compose up`
|
||||||
|
Migrations must be applied first: `npx drizzle-kit migrate`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Running the Script
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Start the DB container
|
||||||
|
docker compose up -d postgres
|
||||||
|
|
||||||
|
# Apply migrations
|
||||||
|
npx drizzle-kit migrate
|
||||||
|
|
||||||
|
# Truncate existing data (if re-seeding)
|
||||||
|
docker exec -it glossa-database psql -U glossa -d glossa -c "TRUNCATE translations, terms CASCADE;"
|
||||||
|
|
||||||
|
# Run the seed script
|
||||||
|
npx tsx src/seed-en-it-nouns.ts
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
docker exec -it glossa-database psql -U glossa -d glossa -c "SELECT COUNT(*) FROM terms; SELECT COUNT(*) FROM translations;"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Seeding Script — v2 (incremental upsert, multi-file)
|
||||||
|
|
||||||
|
### Motivation
|
||||||
|
The truncate approach is fine for dev but unsuitable for production — it wipes all data. The v2 approach extends the database incrementally without ever truncating.
|
||||||
|
|
||||||
|
### File naming convention
|
||||||
|
One JSON file per language pair per POS:
|
||||||
|
```
|
||||||
|
scripts/datafiles/
|
||||||
|
en-it-nouns.json
|
||||||
|
en-fr-nouns.json
|
||||||
|
en-it-verbs.json
|
||||||
|
de-it-nouns.json
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
### How incremental upsert works
|
||||||
|
For a concept like "dog" already in the DB with English and Italian:
|
||||||
|
1. Import `en-fr-nouns.json`
|
||||||
|
2. Upsert `terms` by `synset_id` — finds existing row, returns its real ID
|
||||||
|
3. `dog (en)` already exists → skipped by `onConflictDoNothing`
|
||||||
|
4. `chien (fr)` is new → inserted
|
||||||
|
|
||||||
|
The concept is **extended**, not replaced.
|
||||||
|
|
||||||
|
### Tradeoff vs batch approach
|
||||||
|
The v1-style batching (with client-generated IDs) is no longer possible, since you need the real `term.id` from the DB before inserting translations. Each synset is processed individually. For 25k rows this is still fast enough.
|
||||||
|
|
||||||
|
### Key types added
|
||||||
|
|
||||||
|
```ts
|
||||||
|
type Synset = {
|
||||||
|
synset_id: string;
|
||||||
|
pos: POS;
|
||||||
|
translations: Partial<Record<LANGUAGE_CODE, string[]>>; // Partial — file only contains subset of languages
|
||||||
|
};
|
||||||
|
|
||||||
|
type FileName = {
|
||||||
|
sourceLang: LANGUAGE_CODE;
|
||||||
|
targetLang: LANGUAGE_CODE;
|
||||||
|
pos: POS;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Filename validation
|
||||||
|
|
||||||
|
```ts
|
||||||
|
const parseFilename = (filename: string): FileName => {
|
||||||
|
const parts = filename.replace(".json", "").split("-");
|
||||||
|
if (parts.length !== 3)
|
||||||
|
throw new Error(`Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`);
|
||||||
|
const [sourceLang, targetLang, pos] = parts;
|
||||||
|
if (!SUPPORTED_LANGUAGE_CODES.includes(sourceLang as LANGUAGE_CODE))
|
||||||
|
throw new Error(`Unsupported language code: ${sourceLang}`);
|
||||||
|
if (!SUPPORTED_LANGUAGE_CODES.includes(targetLang as LANGUAGE_CODE))
|
||||||
|
throw new Error(`Unsupported language code: ${targetLang}`);
|
||||||
|
if (!SUPPORTED_POS.includes(pos as POS))
|
||||||
|
throw new Error(`Unsupported POS: ${pos}`);
|
||||||
|
return {
|
||||||
|
sourceLang: sourceLang as LANGUAGE_CODE,
|
||||||
|
targetLang: targetLang as LANGUAGE_CODE,
|
||||||
|
pos: pos as POS,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Upsert function (WIP)
|
||||||
|
|
||||||
|
```ts
|
||||||
|
const upsertSynset = async (
|
||||||
|
synset: Synset,
|
||||||
|
fileInfo: FileName,
|
||||||
|
): Promise<{ termInserted: boolean; translationsInserted: number }> => {
|
||||||
|
const [upsertedTerm] = await db
|
||||||
|
.insert(terms)
|
||||||
|
.values({ synset_id: synset.synset_id, pos: synset.pos })
|
||||||
|
.onConflictDoUpdate({
|
||||||
|
target: terms.synset_id,
|
||||||
|
set: { pos: synset.pos },
|
||||||
|
})
|
||||||
|
.returning({ id: terms.id, created_at: terms.created_at });
|
||||||
|
|
||||||
|
const termInserted = upsertedTerm.created_at > new Date(Date.now() - 1000);
|
||||||
|
|
||||||
|
const translationRows = Object.entries(synset.translations).flatMap(
|
||||||
|
([lang, lemmas]) =>
|
||||||
|
lemmas!.map((lemma) => ({
|
||||||
|
id: crypto.randomUUID(),
|
||||||
|
term_id: upsertedTerm.id,
|
||||||
|
language_code: lang as LANGUAGE_CODE,
|
||||||
|
text: lemma,
|
||||||
|
})),
|
||||||
|
);
|
||||||
|
|
||||||
|
const result = await db
|
||||||
|
.insert(translations)
|
||||||
|
.values(translationRows)
|
||||||
|
.onConflictDoNothing()
|
||||||
|
.returning({ id: translations.id });
|
||||||
|
|
||||||
|
return { termInserted, translationsInserted: result.length };
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Strategy Comparison
|
||||||
|
|
||||||
|
| Strategy | Use case | Pros | Cons |
|
||||||
|
|----------|----------|------|------|
|
||||||
|
| Truncate + batch | Dev / first-time setup | Fast, simple | Wipes all data |
|
||||||
|
| Incremental upsert | Production / adding languages | Safe, non-destructive | No batching, slower |
|
||||||
|
| Migrations-as-data | Production audit trail | Clean history | Files accumulate |
|
||||||
|
| Diff-based sync | Large production datasets | Minimal writes | Complex to implement |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. packages/db — package.json exports fix
|
||||||
|
|
||||||
|
The `exports` field must be an object, not an array:
|
||||||
|
|
||||||
|
```json
|
||||||
|
"exports": {
|
||||||
|
".": "./src/index.ts",
|
||||||
|
"./schema": "./src/db/schema.ts"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Imports then resolve as:
|
||||||
|
```ts
|
||||||
|
import { db } from "@glossa/db";
|
||||||
|
import { terms, translations } from "@glossa/db/schema";
|
||||||
|
```
|
||||||
|
|
@ -6,7 +6,7 @@
|
||||||
- add this to drizzle migrations file:
|
- add this to drizzle migrations file:
|
||||||
✅ ALTER TABLE terms ADD CHECK (pos IN ('noun', 'verb', 'adjective', etc));
|
✅ ALTER TABLE terms ADD CHECK (pos IN ('noun', 'verb', 'adjective', etc));
|
||||||
|
|
||||||
## open word net
|
## openwordnet
|
||||||
|
|
||||||
download libraries via
|
download libraries via
|
||||||
|
|
||||||
|
|
@ -45,3 +45,17 @@ list all libraries:
|
||||||
```bash
|
```bash
|
||||||
python -c "import wn; print(wn.lexicons())"
|
python -c "import wn; print(wn.lexicons())"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## drizzle
|
||||||
|
|
||||||
|
generate migration file, go to packages/db, then:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pnpm drizzle-kit generate
|
||||||
|
```
|
||||||
|
|
||||||
|
execute migration, go to packages/db (docker containers need to be running):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
DATABASE_URL=postgresql://username:password@localhost:5432/database pnpm drizzle-kit migrate
|
||||||
|
```
|
||||||
|
|
|
||||||
|
|
@ -26,17 +26,17 @@ Done when: `GET /api/decks/1/terms?limit=10` returns 10 terms from a specific de
|
||||||
|
|
||||||
[x] Run `extract-en-it-nouns.py` locally → generates `datafiles/en-it-nouns.json`
|
[x] Run `extract-en-it-nouns.py` locally → generates `datafiles/en-it-nouns.json`
|
||||||
-- Import ALL available OMW noun synsets (no frequency filtering)
|
-- Import ALL available OMW noun synsets (no frequency filtering)
|
||||||
[ ] Write Drizzle schema: `terms`, `translations`, `language_pairs`, `term_glosses`, `decks`, `deck_terms`
|
[x] Write Drizzle schema: `terms`, `translations`, `language_pairs`, `term_glosses`, `decks`, `deck_terms`
|
||||||
[ ] Write and run migration (includes CHECK constraints for `pos`, `gloss_type`)
|
[x] Write and run migration (includes CHECK constraints for `pos`, `gloss_type`)
|
||||||
[ ] Write `packages/db/src/seed.ts` (imports ALL terms + translations, NO decks)
|
[x] Write `packages/db/src/seed.ts` (imports ALL terms + translations, NO decks)
|
||||||
[ ] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks)
|
|
||||||
[ ] Download CEFR A1/A2 noun lists (from GitHub repos)
|
[ ] Download CEFR A1/A2 noun lists (from GitHub repos)
|
||||||
|
[ ] Write `scripts/build_decks.ts` (reads external CEFR lists, matches to DB, creates decks)
|
||||||
[ ] Run `pnpm db:seed` → populates terms
|
[ ] Run `pnpm db:seed` → populates terms
|
||||||
[ ] Run `pnpm db:build-decks` → creates curated decks
|
[ ] Run `pnpm db:build-decks` → creates curated decks
|
||||||
|
[ ] Define Zod response schemas in `packages/shared`
|
||||||
[ ] Implement `DeckRepository.getTerms(deckId, limit, offset)`
|
[ ] Implement `DeckRepository.getTerms(deckId, limit, offset)`
|
||||||
[ ] Implement `QuizService.attachDistractors(terms)` — same POS, server-side, no duplicates
|
[ ] Implement `QuizService.attachDistractors(terms)` — same POS, server-side, no duplicates
|
||||||
[ ] Implement `GET /language-pairs`, `GET /decks`, `GET /decks/:id/terms` endpoints
|
[ ] Implement `GET /language-pairs`, `GET /decks`, `GET /decks/:id/terms` endpoints
|
||||||
[ ] Define Zod response schemas in `packages/shared`
|
|
||||||
[ ] Unit tests for `QuizService` (correct POS filtering, never includes the answer)
|
[ ] Unit tests for `QuizService` (correct POS filtering, never includes the answer)
|
||||||
[ ] update decisions.md
|
[ ] update decisions.md
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -205,7 +205,6 @@ term_glosses
|
||||||
term_id uuid FK → terms.id
|
term_id uuid FK → terms.id
|
||||||
language_code varchar(10) -- NOT NULL
|
language_code varchar(10) -- NOT NULL
|
||||||
text text -- NOT NULL
|
text text -- NOT NULL
|
||||||
type varchar(20) -- CHECK (type IN ('definition', 'example')), NULLABLE
|
|
||||||
created_at timestamptz DEFAULT now()
|
created_at timestamptz DEFAULT now()
|
||||||
|
|
||||||
language_pairs
|
language_pairs
|
||||||
|
|
|
||||||
84
packages/db/drizzle/0000_bitter_turbo.sql
Normal file
84
packages/db/drizzle/0000_bitter_turbo.sql
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
CREATE TABLE "deck_terms" (
|
||||||
|
"deck_id" uuid NOT NULL,
|
||||||
|
"term_id" uuid NOT NULL,
|
||||||
|
"position" integer NOT NULL,
|
||||||
|
"added_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||||
|
CONSTRAINT "deck_terms_deck_id_term_id_pk" PRIMARY KEY("deck_id","term_id")
|
||||||
|
);
|
||||||
|
--> statement-breakpoint
|
||||||
|
CREATE TABLE "decks" (
|
||||||
|
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
|
||||||
|
"name" text NOT NULL,
|
||||||
|
"description" text,
|
||||||
|
"language_pair_id" uuid NOT NULL,
|
||||||
|
"created_by" uuid NOT NULL,
|
||||||
|
"is_public" boolean DEFAULT false NOT NULL,
|
||||||
|
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||||
|
CONSTRAINT "unique_deck_name" UNIQUE("name","created_by")
|
||||||
|
);
|
||||||
|
--> statement-breakpoint
|
||||||
|
CREATE TABLE "language_pairs" (
|
||||||
|
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
|
||||||
|
"source_language" varchar(10) NOT NULL,
|
||||||
|
"target_language" varchar(10) NOT NULL,
|
||||||
|
"label" text,
|
||||||
|
"active" boolean DEFAULT true NOT NULL,
|
||||||
|
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||||
|
CONSTRAINT "unique_source_target" UNIQUE("source_language","target_language"),
|
||||||
|
CONSTRAINT "source_language_check" CHECK ("language_pairs"."source_language" IN ('en', 'it')),
|
||||||
|
CONSTRAINT "target_language_check" CHECK ("language_pairs"."target_language" IN ('en', 'it')),
|
||||||
|
CONSTRAINT "no_self_pair" CHECK ("language_pairs"."source_language" != "language_pairs"."target_language")
|
||||||
|
);
|
||||||
|
--> statement-breakpoint
|
||||||
|
CREATE TABLE "term_glosses" (
|
||||||
|
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
|
||||||
|
"term_id" uuid NOT NULL,
|
||||||
|
"language_code" varchar(10) NOT NULL,
|
||||||
|
"text" text NOT NULL,
|
||||||
|
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||||
|
CONSTRAINT "unique_term_gloss" UNIQUE("term_id","language_code","text")
|
||||||
|
);
|
||||||
|
--> statement-breakpoint
|
||||||
|
CREATE TABLE "terms" (
|
||||||
|
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
|
||||||
|
"synset_id" text NOT NULL,
|
||||||
|
"pos" varchar(20) NOT NULL,
|
||||||
|
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||||
|
CONSTRAINT "terms_synset_id_unique" UNIQUE("synset_id"),
|
||||||
|
CONSTRAINT "pos_check" CHECK ("terms"."pos" IN ('noun'))
|
||||||
|
);
|
||||||
|
--> statement-breakpoint
|
||||||
|
CREATE TABLE "translations" (
|
||||||
|
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
|
||||||
|
"term_id" uuid NOT NULL,
|
||||||
|
"language_code" varchar(10) NOT NULL,
|
||||||
|
"text" text NOT NULL,
|
||||||
|
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||||
|
CONSTRAINT "unique_translations" UNIQUE("term_id","language_code","text")
|
||||||
|
);
|
||||||
|
--> statement-breakpoint
|
||||||
|
CREATE TABLE "users" (
|
||||||
|
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
|
||||||
|
"openauth_sub" text NOT NULL,
|
||||||
|
"email" varchar(255),
|
||||||
|
"display_name" varchar(100),
|
||||||
|
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||||
|
"last_login_at" timestamp with time zone,
|
||||||
|
CONSTRAINT "users_openauth_sub_unique" UNIQUE("openauth_sub"),
|
||||||
|
CONSTRAINT "users_email_unique" UNIQUE("email"),
|
||||||
|
CONSTRAINT "users_display_name_unique" UNIQUE("display_name")
|
||||||
|
);
|
||||||
|
--> statement-breakpoint
|
||||||
|
ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_deck_id_decks_id_fk" FOREIGN KEY ("deck_id") REFERENCES "public"."decks"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||||
|
ALTER TABLE "deck_terms" ADD CONSTRAINT "deck_terms_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||||
|
ALTER TABLE "decks" ADD CONSTRAINT "decks_language_pair_id_language_pairs_id_fk" FOREIGN KEY ("language_pair_id") REFERENCES "public"."language_pairs"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||||
|
ALTER TABLE "decks" ADD CONSTRAINT "decks_created_by_users_id_fk" FOREIGN KEY ("created_by") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||||
|
ALTER TABLE "term_glosses" ADD CONSTRAINT "term_glosses_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||||
|
ALTER TABLE "translations" ADD CONSTRAINT "translations_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||||
|
CREATE INDEX "idx_deck_terms_term" ON "deck_terms" USING btree ("term_id");--> statement-breakpoint
|
||||||
|
CREATE INDEX "idx_decks_created_by" ON "decks" USING btree ("created_by");--> statement-breakpoint
|
||||||
|
CREATE INDEX "idx_decks_language_pair" ON "decks" USING btree ("language_pair_id");--> statement-breakpoint
|
||||||
|
CREATE INDEX "idx_pairs_active" ON "language_pairs" USING btree ("active","source_language","target_language");--> statement-breakpoint
|
||||||
|
CREATE INDEX "idx_term_glosses_term" ON "term_glosses" USING btree ("term_id");--> statement-breakpoint
|
||||||
|
CREATE INDEX "idx_terms_pos" ON "terms" USING btree ("pos");--> statement-breakpoint
|
||||||
|
CREATE INDEX "idx_translations_lang" ON "translations" USING btree ("language_code","term_id");
|
||||||
652
packages/db/drizzle/meta/0000_snapshot.json
Normal file
652
packages/db/drizzle/meta/0000_snapshot.json
Normal file
|
|
@ -0,0 +1,652 @@
|
||||||
|
{
|
||||||
|
"id": "5830ce3b-dc0e-44a7-83d6-bc74016ca4fd",
|
||||||
|
"prevId": "00000000-0000-0000-0000-000000000000",
|
||||||
|
"version": "7",
|
||||||
|
"dialect": "postgresql",
|
||||||
|
"tables": {
|
||||||
|
"public.deck_terms": {
|
||||||
|
"name": "deck_terms",
|
||||||
|
"schema": "",
|
||||||
|
"columns": {
|
||||||
|
"deck_id": {
|
||||||
|
"name": "deck_id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"term_id": {
|
||||||
|
"name": "term_id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"position": {
|
||||||
|
"name": "position",
|
||||||
|
"type": "integer",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"added_at": {
|
||||||
|
"name": "added_at",
|
||||||
|
"type": "timestamp with time zone",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "now()"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"idx_deck_terms_term": {
|
||||||
|
"name": "idx_deck_terms_term",
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"expression": "term_id",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isUnique": false,
|
||||||
|
"concurrently": false,
|
||||||
|
"method": "btree",
|
||||||
|
"with": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {
|
||||||
|
"deck_terms_deck_id_decks_id_fk": {
|
||||||
|
"name": "deck_terms_deck_id_decks_id_fk",
|
||||||
|
"tableFrom": "deck_terms",
|
||||||
|
"tableTo": "decks",
|
||||||
|
"columnsFrom": [
|
||||||
|
"deck_id"
|
||||||
|
],
|
||||||
|
"columnsTo": [
|
||||||
|
"id"
|
||||||
|
],
|
||||||
|
"onDelete": "cascade",
|
||||||
|
"onUpdate": "no action"
|
||||||
|
},
|
||||||
|
"deck_terms_term_id_terms_id_fk": {
|
||||||
|
"name": "deck_terms_term_id_terms_id_fk",
|
||||||
|
"tableFrom": "deck_terms",
|
||||||
|
"tableTo": "terms",
|
||||||
|
"columnsFrom": [
|
||||||
|
"term_id"
|
||||||
|
],
|
||||||
|
"columnsTo": [
|
||||||
|
"id"
|
||||||
|
],
|
||||||
|
"onDelete": "cascade",
|
||||||
|
"onUpdate": "no action"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"compositePrimaryKeys": {
|
||||||
|
"deck_terms_deck_id_term_id_pk": {
|
||||||
|
"name": "deck_terms_deck_id_term_id_pk",
|
||||||
|
"columns": [
|
||||||
|
"deck_id",
|
||||||
|
"term_id"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"uniqueConstraints": {},
|
||||||
|
"policies": {},
|
||||||
|
"checkConstraints": {},
|
||||||
|
"isRLSEnabled": false
|
||||||
|
},
|
||||||
|
"public.decks": {
|
||||||
|
"name": "decks",
|
||||||
|
"schema": "",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "gen_random_uuid()"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"name": "name",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"description": {
|
||||||
|
"name": "description",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false
|
||||||
|
},
|
||||||
|
"language_pair_id": {
|
||||||
|
"name": "language_pair_id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"created_by": {
|
||||||
|
"name": "created_by",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"is_public": {
|
||||||
|
"name": "is_public",
|
||||||
|
"type": "boolean",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
|
"created_at": {
|
||||||
|
"name": "created_at",
|
||||||
|
"type": "timestamp with time zone",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "now()"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"idx_decks_created_by": {
|
||||||
|
"name": "idx_decks_created_by",
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"expression": "created_by",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isUnique": false,
|
||||||
|
"concurrently": false,
|
||||||
|
"method": "btree",
|
||||||
|
"with": {}
|
||||||
|
},
|
||||||
|
"idx_decks_language_pair": {
|
||||||
|
"name": "idx_decks_language_pair",
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"expression": "language_pair_id",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isUnique": false,
|
||||||
|
"concurrently": false,
|
||||||
|
"method": "btree",
|
||||||
|
"with": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {
|
||||||
|
"decks_language_pair_id_language_pairs_id_fk": {
|
||||||
|
"name": "decks_language_pair_id_language_pairs_id_fk",
|
||||||
|
"tableFrom": "decks",
|
||||||
|
"tableTo": "language_pairs",
|
||||||
|
"columnsFrom": [
|
||||||
|
"language_pair_id"
|
||||||
|
],
|
||||||
|
"columnsTo": [
|
||||||
|
"id"
|
||||||
|
],
|
||||||
|
"onDelete": "cascade",
|
||||||
|
"onUpdate": "no action"
|
||||||
|
},
|
||||||
|
"decks_created_by_users_id_fk": {
|
||||||
|
"name": "decks_created_by_users_id_fk",
|
||||||
|
"tableFrom": "decks",
|
||||||
|
"tableTo": "users",
|
||||||
|
"columnsFrom": [
|
||||||
|
"created_by"
|
||||||
|
],
|
||||||
|
"columnsTo": [
|
||||||
|
"id"
|
||||||
|
],
|
||||||
|
"onDelete": "cascade",
|
||||||
|
"onUpdate": "no action"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {
|
||||||
|
"unique_deck_name": {
|
||||||
|
"name": "unique_deck_name",
|
||||||
|
"nullsNotDistinct": false,
|
||||||
|
"columns": [
|
||||||
|
"name",
|
||||||
|
"created_by"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"policies": {},
|
||||||
|
"checkConstraints": {},
|
||||||
|
"isRLSEnabled": false
|
||||||
|
},
|
||||||
|
"public.language_pairs": {
|
||||||
|
"name": "language_pairs",
|
||||||
|
"schema": "",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "gen_random_uuid()"
|
||||||
|
},
|
||||||
|
"source_language": {
|
||||||
|
"name": "source_language",
|
||||||
|
"type": "varchar(10)",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"target_language": {
|
||||||
|
"name": "target_language",
|
||||||
|
"type": "varchar(10)",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"label": {
|
||||||
|
"name": "label",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false
|
||||||
|
},
|
||||||
|
"active": {
|
||||||
|
"name": "active",
|
||||||
|
"type": "boolean",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": true
|
||||||
|
},
|
||||||
|
"created_at": {
|
||||||
|
"name": "created_at",
|
||||||
|
"type": "timestamp with time zone",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "now()"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"idx_pairs_active": {
|
||||||
|
"name": "idx_pairs_active",
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"expression": "active",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expression": "source_language",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expression": "target_language",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isUnique": false,
|
||||||
|
"concurrently": false,
|
||||||
|
"method": "btree",
|
||||||
|
"with": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {
|
||||||
|
"unique_source_target": {
|
||||||
|
"name": "unique_source_target",
|
||||||
|
"nullsNotDistinct": false,
|
||||||
|
"columns": [
|
||||||
|
"source_language",
|
||||||
|
"target_language"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"policies": {},
|
||||||
|
"checkConstraints": {
|
||||||
|
"source_language_check": {
|
||||||
|
"name": "source_language_check",
|
||||||
|
"value": "\"language_pairs\".\"source_language\" IN ('en', 'it')"
|
||||||
|
},
|
||||||
|
"target_language_check": {
|
||||||
|
"name": "target_language_check",
|
||||||
|
"value": "\"language_pairs\".\"target_language\" IN ('en', 'it')"
|
||||||
|
},
|
||||||
|
"no_self_pair": {
|
||||||
|
"name": "no_self_pair",
|
||||||
|
"value": "\"language_pairs\".\"source_language\" != \"language_pairs\".\"target_language\""
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"isRLSEnabled": false
|
||||||
|
},
|
||||||
|
"public.term_glosses": {
|
||||||
|
"name": "term_glosses",
|
||||||
|
"schema": "",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "gen_random_uuid()"
|
||||||
|
},
|
||||||
|
"term_id": {
|
||||||
|
"name": "term_id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"language_code": {
|
||||||
|
"name": "language_code",
|
||||||
|
"type": "varchar(10)",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"name": "text",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"created_at": {
|
||||||
|
"name": "created_at",
|
||||||
|
"type": "timestamp with time zone",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "now()"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"idx_term_glosses_term": {
|
||||||
|
"name": "idx_term_glosses_term",
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"expression": "term_id",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isUnique": false,
|
||||||
|
"concurrently": false,
|
||||||
|
"method": "btree",
|
||||||
|
"with": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {
|
||||||
|
"term_glosses_term_id_terms_id_fk": {
|
||||||
|
"name": "term_glosses_term_id_terms_id_fk",
|
||||||
|
"tableFrom": "term_glosses",
|
||||||
|
"tableTo": "terms",
|
||||||
|
"columnsFrom": [
|
||||||
|
"term_id"
|
||||||
|
],
|
||||||
|
"columnsTo": [
|
||||||
|
"id"
|
||||||
|
],
|
||||||
|
"onDelete": "cascade",
|
||||||
|
"onUpdate": "no action"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {
|
||||||
|
"unique_term_gloss": {
|
||||||
|
"name": "unique_term_gloss",
|
||||||
|
"nullsNotDistinct": false,
|
||||||
|
"columns": [
|
||||||
|
"term_id",
|
||||||
|
"language_code",
|
||||||
|
"text"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"policies": {},
|
||||||
|
"checkConstraints": {},
|
||||||
|
"isRLSEnabled": false
|
||||||
|
},
|
||||||
|
"public.terms": {
|
||||||
|
"name": "terms",
|
||||||
|
"schema": "",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "gen_random_uuid()"
|
||||||
|
},
|
||||||
|
"synset_id": {
|
||||||
|
"name": "synset_id",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"pos": {
|
||||||
|
"name": "pos",
|
||||||
|
"type": "varchar(20)",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"created_at": {
|
||||||
|
"name": "created_at",
|
||||||
|
"type": "timestamp with time zone",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "now()"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"idx_terms_pos": {
|
||||||
|
"name": "idx_terms_pos",
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"expression": "pos",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isUnique": false,
|
||||||
|
"concurrently": false,
|
||||||
|
"method": "btree",
|
||||||
|
"with": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {
|
||||||
|
"terms_synset_id_unique": {
|
||||||
|
"name": "terms_synset_id_unique",
|
||||||
|
"nullsNotDistinct": false,
|
||||||
|
"columns": [
|
||||||
|
"synset_id"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"policies": {},
|
||||||
|
"checkConstraints": {
|
||||||
|
"pos_check": {
|
||||||
|
"name": "pos_check",
|
||||||
|
"value": "\"terms\".\"pos\" IN ('noun')"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"isRLSEnabled": false
|
||||||
|
},
|
||||||
|
"public.translations": {
|
||||||
|
"name": "translations",
|
||||||
|
"schema": "",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "gen_random_uuid()"
|
||||||
|
},
|
||||||
|
"term_id": {
|
||||||
|
"name": "term_id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"language_code": {
|
||||||
|
"name": "language_code",
|
||||||
|
"type": "varchar(10)",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"name": "text",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"created_at": {
|
||||||
|
"name": "created_at",
|
||||||
|
"type": "timestamp with time zone",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "now()"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {
|
||||||
|
"idx_translations_lang": {
|
||||||
|
"name": "idx_translations_lang",
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"expression": "language_code",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expression": "term_id",
|
||||||
|
"isExpression": false,
|
||||||
|
"asc": true,
|
||||||
|
"nulls": "last"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"isUnique": false,
|
||||||
|
"concurrently": false,
|
||||||
|
"method": "btree",
|
||||||
|
"with": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"foreignKeys": {
|
||||||
|
"translations_term_id_terms_id_fk": {
|
||||||
|
"name": "translations_term_id_terms_id_fk",
|
||||||
|
"tableFrom": "translations",
|
||||||
|
"tableTo": "terms",
|
||||||
|
"columnsFrom": [
|
||||||
|
"term_id"
|
||||||
|
],
|
||||||
|
"columnsTo": [
|
||||||
|
"id"
|
||||||
|
],
|
||||||
|
"onDelete": "cascade",
|
||||||
|
"onUpdate": "no action"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {
|
||||||
|
"unique_translations": {
|
||||||
|
"name": "unique_translations",
|
||||||
|
"nullsNotDistinct": false,
|
||||||
|
"columns": [
|
||||||
|
"term_id",
|
||||||
|
"language_code",
|
||||||
|
"text"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"policies": {},
|
||||||
|
"checkConstraints": {},
|
||||||
|
"isRLSEnabled": false
|
||||||
|
},
|
||||||
|
"public.users": {
|
||||||
|
"name": "users",
|
||||||
|
"schema": "",
|
||||||
|
"columns": {
|
||||||
|
"id": {
|
||||||
|
"name": "id",
|
||||||
|
"type": "uuid",
|
||||||
|
"primaryKey": true,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "gen_random_uuid()"
|
||||||
|
},
|
||||||
|
"openauth_sub": {
|
||||||
|
"name": "openauth_sub",
|
||||||
|
"type": "text",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true
|
||||||
|
},
|
||||||
|
"email": {
|
||||||
|
"name": "email",
|
||||||
|
"type": "varchar(255)",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false
|
||||||
|
},
|
||||||
|
"display_name": {
|
||||||
|
"name": "display_name",
|
||||||
|
"type": "varchar(100)",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false
|
||||||
|
},
|
||||||
|
"created_at": {
|
||||||
|
"name": "created_at",
|
||||||
|
"type": "timestamp with time zone",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": true,
|
||||||
|
"default": "now()"
|
||||||
|
},
|
||||||
|
"last_login_at": {
|
||||||
|
"name": "last_login_at",
|
||||||
|
"type": "timestamp with time zone",
|
||||||
|
"primaryKey": false,
|
||||||
|
"notNull": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"indexes": {},
|
||||||
|
"foreignKeys": {},
|
||||||
|
"compositePrimaryKeys": {},
|
||||||
|
"uniqueConstraints": {
|
||||||
|
"users_openauth_sub_unique": {
|
||||||
|
"name": "users_openauth_sub_unique",
|
||||||
|
"nullsNotDistinct": false,
|
||||||
|
"columns": [
|
||||||
|
"openauth_sub"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"users_email_unique": {
|
||||||
|
"name": "users_email_unique",
|
||||||
|
"nullsNotDistinct": false,
|
||||||
|
"columns": [
|
||||||
|
"email"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"users_display_name_unique": {
|
||||||
|
"name": "users_display_name_unique",
|
||||||
|
"nullsNotDistinct": false,
|
||||||
|
"columns": [
|
||||||
|
"display_name"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"policies": {},
|
||||||
|
"checkConstraints": {},
|
||||||
|
"isRLSEnabled": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"enums": {},
|
||||||
|
"schemas": {},
|
||||||
|
"sequences": {},
|
||||||
|
"roles": {},
|
||||||
|
"policies": {},
|
||||||
|
"views": {},
|
||||||
|
"_meta": {
|
||||||
|
"columns": {},
|
||||||
|
"schemas": {},
|
||||||
|
"tables": {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1 +1,13 @@
|
||||||
{"version":"7","dialect":"postgresql","entries":[]}
|
{
|
||||||
|
"version": "7",
|
||||||
|
"dialect": "postgresql",
|
||||||
|
"entries": [
|
||||||
|
{
|
||||||
|
"idx": 0,
|
||||||
|
"version": "7",
|
||||||
|
"when": 1774721919883,
|
||||||
|
"tag": "0000_bitter_turbo",
|
||||||
|
"breakpoints": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -16,5 +16,8 @@
|
||||||
"@types/pg": "^8.20.0",
|
"@types/pg": "^8.20.0",
|
||||||
"drizzle-kit": "^0.31.10"
|
"drizzle-kit": "^0.31.10"
|
||||||
},
|
},
|
||||||
"exports": "./src/index.ts"
|
"exports": {
|
||||||
|
".": "./src/index.ts",
|
||||||
|
"./schema": "./src/db/schema.ts"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,6 @@ export const term_glosses = pgTable(
|
||||||
table.language_code,
|
table.language_code,
|
||||||
table.text,
|
table.text,
|
||||||
),
|
),
|
||||||
,
|
|
||||||
index("idx_term_glosses_term").on(table.term_id),
|
index("idx_term_glosses_term").on(table.term_id),
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
|
|
|
||||||
203
packages/db/src/seeding-datafiles.ts
Normal file
203
packages/db/src/seeding-datafiles.ts
Normal file
|
|
@ -0,0 +1,203 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import { eq } from "drizzle-orm";
|
||||||
|
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
|
||||||
|
import { db } from "@glossa/db";
|
||||||
|
import { terms, translations } from "@glossa/db/schema";
|
||||||
|
|
||||||
|
// Element types of the shared const arrays: each alias is the union of the
// values listed in the corresponding imported array.
type POS = (typeof SUPPORTED_POS)[number];
type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];

/** One datafile entry as it is read from JSON. */
interface Synset {
  /** Identifier written to `terms.synset_id`. */
  synset_id: string;
  /** Part of speech written to `terms.pos`. */
  pos: POS;
  /** Lemmas grouped by language code; any language may be missing. */
  translations: Partial<Record<LANGUAGE_CODE, string[]>>;
}

/** The three components encoded in a datafile name (`sourcelang-targetlang-pos.json`). */
interface FileName {
  sourceLang: LANGUAGE_CODE;
  targetLang: LANGUAGE_CODE;
  pos: POS;
}

// Directory scanned for datafiles. The path is relative, so it depends on the
// working directory the script is started from. The trailing slash matters:
// file paths are built by plain string concatenation.
const dataDir = "../../scripts/datafiles/";
|
||||||
|
|
||||||
|
/** Type guard: true when `value` is one of the supported language codes. */
const isLanguageCode = (value: string): value is LANGUAGE_CODE =>
  (SUPPORTED_LANGUAGE_CODES as readonly string[]).includes(value);

/** Type guard: true when `value` is one of the supported parts of speech. */
const isPos = (value: string): value is POS =>
  (SUPPORTED_POS as readonly string[]).includes(value);

/**
 * Splits a datafile name of the form `sourcelang-targetlang-pos.json` into its
 * three validated components.
 *
 * @param filename - Bare file name (no directory part).
 * @returns The source language, target language and part of speech.
 * @throws Error when the name does not have exactly three `-`-separated parts,
 *   or when a part is not a supported language code / part of speech.
 */
const parseFilename = (filename: string): FileName => {
  // Remove the extension only when it is a suffix. `replace(".json", "")`
  // would strip the first occurrence wherever it appears in the name.
  const extension = ".json";
  const stem = filename.endsWith(extension)
    ? filename.slice(0, -extension.length)
    : filename;

  const parts = stem.split("-");
  if (parts.length !== 3)
    throw new Error(
      `Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`,
    );

  // The defaults are never used (length is exactly 3); they only keep the
  // bindings typed as `string` under `noUncheckedIndexedAccess`.
  const [sourceLang = "", targetLang = "", pos = ""] = parts;

  if (!isLanguageCode(sourceLang))
    throw new Error(`Unsupported language code: ${sourceLang}`);
  if (!isLanguageCode(targetLang))
    throw new Error(`Unsupported language code: ${targetLang}`);
  if (!isPos(pos)) throw new Error(`Unsupported POS: ${pos}`);

  // The guards above have narrowed all three values; no casts needed.
  return { sourceLang, targetLang, pos };
};
|
||||||
|
|
||||||
|
/**
 * Structural check for one datafile entry: an object with string `synset_id`
 * and `pos`, and a `translations` object whose values are arrays of strings.
 * Language codes and POS values are not checked here.
 */
const hasSynsetShape = (value: unknown): boolean => {
  if (typeof value !== "object" || value === null) return false;
  const { synset_id, pos, translations } = value as Record<string, unknown>;
  if (typeof synset_id !== "string" || typeof pos !== "string") return false;
  if (
    typeof translations !== "object" ||
    translations === null ||
    Array.isArray(translations)
  )
    return false;
  return Object.values(translations).every(
    (lemmas) =>
      Array.isArray(lemmas) &&
      lemmas.every((lemma) => typeof lemma === "string"),
  );
};

/**
 * Reads a datafile and returns its synset entries.
 *
 * @param filepath - Path of a UTF-8 JSON file whose root is an array of synsets.
 * @returns The parsed entries, in file order.
 * @throws Error when the file cannot be read, is not valid JSON, its root is
 *   not an array, or an entry does not have the expected shape. Every message
 *   produced here names the offending file.
 */
const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
  const raw = await fs.readFile(filepath, "utf8");

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    // Re-throw with the path: a bare SyntaxError does not say which file broke.
    const reason = e instanceof Error ? e.message : String(e);
    throw new Error(`Invalid JSON in ${filepath}: ${reason}`);
  }

  if (!Array.isArray(parsed))
    throw new Error(`Expected a JSON array in ${filepath}`);

  // Validate before casting so malformed entries fail here with a clear
  // message instead of later, in the middle of a database write.
  parsed.forEach((entry: unknown, index: number) => {
    if (!hasSynsetShape(entry))
      throw new Error(`Invalid synset at index ${index} in ${filepath}`);
  });

  return parsed as Synset[];
};
|
||||||
|
|
||||||
|
/**
 * Fetches the primary key of the `terms` row that holds the given synset id.
 * Used after an insert was skipped because of a conflict.
 *
 * @throws Error when no row with that synset id is found.
 */
const findExistingTermId = async (synsetId: string): Promise<string> => {
  const [existing] = await db
    .select({ id: terms.id })
    .from(terms)
    .where(eq(terms.synset_id, synsetId))
    .limit(1);
  if (!existing)
    throw new Error(`Term not found after conflict: ${synsetId}`);
  return existing.id;
};

/**
 * Writes one synset to the database: a `terms` row plus one `translations`
 * row per lemma. Both inserts use `onConflictDoNothing`, so rows that already
 * exist are skipped instead of raising.
 *
 * @param synset - The entry to store.
 * @param _fileInfo - Filename metadata; currently unused.
 * @returns Whether a new term row was created, and how many translation rows
 *   the insert returned (rows skipped on conflict are not returned).
 * @throws Error when the term insert is skipped but the existing row cannot be
 *   found; database errors propagate unchanged.
 */
const uploadSynsetToDB = async (
  synset: Synset,
  _fileInfo: FileName,
): Promise<{ termInserted: boolean; translationsInserted: number }> => {
  // 1. Try to insert the term. On conflict nothing is returned, so an empty
  //    result means the term already exists.
  const [created] = await db
    .insert(terms)
    .values({ synset_id: synset.synset_id, pos: synset.pos })
    .onConflictDoNothing()
    .returning({ id: terms.id });

  const termInserted = created !== undefined;
  // The translations need the real row id for their foreign key.
  const termId = created?.id ?? (await findExistingTermId(synset.synset_id));

  // 2. One row per (language, lemma). A language whose lemma list is missing
  //    or null contributes no rows instead of crashing on `.map`.
  const translationRows = Object.entries(synset.translations).flatMap(
    ([lang, lemmas]) =>
      (lemmas ?? []).map((lemma) => ({
        id: crypto.randomUUID(),
        term_id: termId,
        language_code: lang as LANGUAGE_CODE,
        text: lemma,
      })),
  );

  // Nothing to insert; skip the second statement entirely.
  if (translationRows.length === 0) {
    return { termInserted, translationsInserted: 0 };
  }

  const result = await db
    .insert(translations)
    .values(translationRows)
    .onConflictDoNothing()
    .returning({ id: translations.id });

  return { termInserted, translationsInserted: result.length };
};
|
||||||
|
|
||||||
|
/**
 * Runs the seeding pipeline in four logged steps:
 *   1. list the `.json` files in `dataDir`,
 *   2. keep the ones whose name parses (others are skipped with a warning),
 *   3. upload every synset of every kept file, one at a time,
 *   4. print the totals.
 *
 * Returns early, without throwing, when there are no JSON files or none has a
 * valid name. Read, parse and database errors are not caught here.
 */
const main = async () => {
  // Banner that opens each step: two "\n" logs, then the title between rules.
  const announceStep = (title: string) => {
    const rule = "##########################################";
    console.log("\n");
    console.log("\n");
    console.log(rule);
    console.log(title);
    console.log(rule);
  };

  // step 1: discovering files
  announceStep("step 1: discovering files");

  console.log("🔍 Scanning datafiles directory...");
  const directoryEntries = await fs.readdir(dataDir);
  const jsonFiles = directoryEntries.filter((entry) => entry.endsWith(".json"));

  if (jsonFiles.length === 0) {
    console.warn("⚠️  No JSON files found in", dataDir);
    return;
  }
  console.log(`📁 Found ${jsonFiles.length} file(s)\n`);

  // step 2: validating filenames
  announceStep("step 2: validating filenames");

  const accepted: { filename: string; fileInfo: FileName }[] = [];
  for (const candidate of jsonFiles) {
    try {
      const info = parseFilename(candidate);
      accepted.push({ filename: candidate, fileInfo: info });
      console.log(
        ` ✅ ${candidate} — ${info.sourceLang} → ${info.targetLang} (${info.pos})`,
      );
    } catch (e) {
      // A badly named file is reported and skipped; the run continues.
      console.warn(` ⚠️  Skipping ${candidate}: ${(e as Error).message}`);
    }
  }

  if (accepted.length === 0) {
    console.error("❌ No valid files to process. Exiting.");
    return;
  }

  // step 3: processing each file
  announceStep("step 3: processing each file");

  const totals = { terms: 0, translations: 0 };
  let fileNumber = 0;

  for (const { filename, fileInfo } of accepted) {
    fileNumber += 1;
    const prefix = `[${fileNumber}/${accepted.length}]`;

    console.log(`\n${prefix} 📄 ${filename}`);

    const synsets = await readFromJsonFile(dataDir + filename);
    console.log(`${prefix} Loaded ${synsets.length} synsets`);

    let newTerms = 0;
    let newTranslations = 0;
    let processed = 0;

    for (const synset of synsets) {
      // Progress line every 500 entries, printed before the entry is uploaded.
      if (processed > 0 && processed % 500 === 0) {
        console.log(
          `${prefix} ⏳ ${processed}/${synsets.length} synsets processed...`,
        );
      }

      const outcome = await uploadSynsetToDB(synset, fileInfo);
      newTerms += outcome.termInserted ? 1 : 0;
      newTranslations += outcome.translationsInserted;
      processed += 1;
    }

    console.log(
      `${prefix} ✅ Done — ${newTerms} new terms, ${newTranslations} new translations`,
    );
    totals.terms += newTerms;
    totals.translations += newTranslations;
  }

  // step 4: Final summary
  announceStep("step 4: final summary");
  console.log(`\n🎉 Seeding complete!`);
  console.log(` Terms inserted: ${totals.terms}`);
  console.log(` Translations inserted: ${totals.translations}`);
};
|
||||||
|
|
||||||
|
// Script entry point: start the pipeline; if it rejects, log the failure and
// exit with status 1.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});
|
||||||
36
scripts/datafiles/test.json
Normal file
36
scripts/datafiles/test.json
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"synset_id": "ili:i35545",
|
||||||
|
"pos": "noun",
|
||||||
|
"translations": { "en": ["entity"], "it": ["cosa", "entità"] }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"synset_id": "ili:i35547",
|
||||||
|
"pos": "noun",
|
||||||
|
"translations": {
|
||||||
|
"en": ["abstraction", "abstract entity"],
|
||||||
|
"it": ["astrazione"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"synset_id": "ili:i35549",
|
||||||
|
"pos": "noun",
|
||||||
|
"translations": {
|
||||||
|
"en": ["object", "physical object"],
|
||||||
|
"it": ["oggetto", "cosa"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"synset_id": "ili:i35550",
|
||||||
|
"pos": "noun",
|
||||||
|
"translations": { "en": ["whole", "unit"], "it": ["insieme", "tutto"] }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"synset_id": "ili:i35553",
|
||||||
|
"pos": "noun",
|
||||||
|
"translations": {
|
||||||
|
"en": ["organism", "being"],
|
||||||
|
"it": ["essere vivente", "organismo", "organismo vivente"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
Loading…
Add table
Add a link
Reference in a new issue