adding datafiles and seeding script

2026-03-31 10:05:36 +02:00 · 2026-03-31 10:05:36 +02:00 · 20fa6a9331
commit 20fa6a9331
parent 068949b4cb
7 changed files with 349852 additions and 3 deletions
--- a/packages/db/src/check-noun-coverage.ts
+++ b/packages/db/src/check-noun-coverage.ts
@ -0,0 +1,55 @@
+import fs from "node:fs/promises";
+import { db } from "@glossa/db";
+import { translations } from "@glossa/db/schema";
+import { inArray } from "drizzle-orm";
+
+// Word list to check, one entry per line (relative paths resolve against the
+// current working directory).
+const wordlistPath = "./src/data/wordlists/top1000englishnouns";
+// Destination file for the words that have no match in the database.
+const unmatchedOutputPath =
+  "./src/data/wordlists/top1000englishnouns-unmatched";
+
+/**
+ * Reports how many words of the word list have a row in `translations`.
+ *
+ * Prints matched/unmatched counts and the coverage percentage, lists every
+ * unmatched word, and writes the unmatched words (one per line) to
+ * `unmatchedOutputPath`.
+ *
+ * @throws Error when the word list contains no words; file-system and
+ *   database errors propagate unchanged.
+ */
+const main = async () => {
+  // 1. Read and normalise the word list
+  console.log("📖 Reading word list...");
+  const raw = await fs.readFile(wordlistPath, "utf8");
+  const words = raw
+    .split("\n")
+    .map((w) => w.trim().toLowerCase())
+    .filter(Boolean);
+  console.log(`   ${words.length} words loaded\n`);
+
+  // An empty list would turn the coverage below into 0/0 (printed as "NaN%")
+  // and would hand `inArray` an empty value list, so fail early instead.
+  if (words.length === 0) {
+    throw new Error(`Word list at ${wordlistPath} contains no words`);
+  }
+
+  // 2. Query DB for matches
+  // NOTE(review): the lookup uses the lower-cased words as-is, so whether a
+  // row stored as e.g. "Japan" matches depends on the database collation —
+  // confirm this is the intended comparison.
+  // NOTE(review): no language filter is applied here; presumably a match can
+  // come from any language's rows — verify against the schema.
+  console.log("🔍 Checking against database...");
+  const rows = await db
+    .select({ text: translations.text })
+    .from(translations)
+    .where(inArray(translations.text, words));
+
+  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));
+
+  // 3. Split into matched / unmatched
+  const matched = words.filter((w) => matchedSet.has(w));
+  const unmatched = words.filter((w) => !matchedSet.has(w));
+
+  // 4. Terminal output
+  console.log(`✅ Matched:   ${matched.length}/${words.length}`);
+  console.log(`❌ Unmatched: ${unmatched.length}/${words.length}`);
+  console.log(
+    `📊 Coverage:  ${((matched.length / words.length) * 100).toFixed(1)}%\n`,
+  );
+
+  if (unmatched.length > 0) {
+    console.log("❌ Unmatched words:");
+    for (const w of unmatched) {
+      console.log(`   ${w}`);
+    }
+  }
+
+  // 5. Write unmatched to file (one word per line), so the message below is
+  // only printed after the file has actually been written.
+  await fs.writeFile(unmatchedOutputPath, unmatched.join("\n"), "utf8");
+  console.log(`\n💾 Unmatched words written to ${unmatchedOutputPath}`);
+};
+
+// Entry point: run the coverage check; any rejection is printed and turned
+// into a non-zero exit code.
+const exitAfterCoverageFailure = (reason: unknown): never => {
+  console.error(reason);
+  process.exit(1);
+};
+
+main().catch(exitAfterCoverageFailure);
--- a/packages/db/src/create-test-deck.ts
+++ b/packages/db/src/create-test-deck.ts
@ -0,0 +1,49 @@
+/*
+Planned flow for this script. So far the code below implements steps 3 and 5
+(step 5 without the language_code filter) and then just prints the matched
+words; the remaining steps are still to be written.
+
+1. Parse CLI args → resolve the word list file path
+2. Connect to the database
+3. Read the word list file into an ordered array of strings
+4. Look up the en→it language pair ID from language_pairs
+5. Batch-fetch all matching rows from translations where language_code = 'en' and text IN (words)
+6. Build a word → termId map from the results
+7. Walk the ordered word list → split into hits (word found, capture position) and misses (skip)
+8. Check if a deck with this name already exists → if so, delete its deck_terms then the deck itself
+9. Insert the new decks row
+10. Insert all deck_terms rows in batches (deckId, termId, position)
+11. Log the skipped words
+12. Close the DB connection
+*/
+
+import fs from "node:fs/promises";
+import { db } from "@glossa/db";
+import { translations } from "@glossa/db/schema";
+import { inArray, and, eq } from "drizzle-orm";
+
+// Word list the deck is built from, one entry per line (relative paths
+// resolve against the current working directory).
+const wordlistPath = "./src/data/wordlists/top1000englishnouns";
+
+/**
+ * Loads the word list, looks the words up in `translations` and prints the
+ * ones that were found, in word-list order.
+ *
+ * @throws Error when the word list contains no words; file-system and
+ *   database errors propagate unchanged.
+ */
+const main = async () => {
+  // Read and normalise the word list
+  console.log("📖 Reading word list...");
+  const raw = await fs.readFile(wordlistPath, "utf8");
+  const words = raw
+    .split("\n")
+    .map((w) => w.trim().toLowerCase())
+    .filter(Boolean);
+  console.log(`   ${words.length} words loaded\n`);
+
+  // Nothing to look up: fail early rather than handing `inArray` an empty
+  // value list.
+  if (words.length === 0) {
+    throw new Error(`Word list at ${wordlistPath} contains no words`);
+  }
+
+  // Query DB for matches
+  // `termId` is selected but not used yet — presumably for the deck_terms rows
+  // a later step will insert (TODO confirm).
+  // NOTE(review): no language filter is applied to this lookup yet.
+  console.log("🔍 Checking against database...");
+  const rows = await db
+    .select({ text: translations.text, termId: translations.term_id })
+    .from(translations)
+    .where(inArray(translations.text, words));
+
+  // Keep the word-list order: filter the ordered list by what the DB returned.
+  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));
+  const wordsInDb = words.filter((w) => matchedSet.has(w));
+
+  console.log("wordsInDb: ", wordsInDb);
+};
+
+// Entry point: run the deck script; any rejection is printed and turned into
+// a non-zero exit code.
+const exitAfterDeckFailure = (reason: unknown): never => {
+  console.error(reason);
+  process.exit(1);
+};
+
+main().catch(exitAfterDeckFailure);
--- a/packages/db/src/data/datafiles/en-it-noun.json
+++ b/packages/db/src/data/datafiles/en-it-noun.json
--- a/packages/db/src/data/wordlists/top1000englishnouns
+++ b/packages/db/src/data/wordlists/top1000englishnouns
--- a/packages/db/src/data/wordlists/top1000englishnouns-unmatched
+++ b/packages/db/src/data/wordlists/top1000englishnouns-unmatched
@ -0,0 +1,34 @@
+a
+other
+us
+may
+st
+paul
+new
+software
+oxford
+english
+mary
+japan
+while
+pp
+membership
+manchester
+tony
+alan
+jones
+un
+northern
+simon
+behalf
+co
+graham
+joe
+guy
+lewis
+jane
+taylor
+co-operation
+travel
+self
+thatcher
--- a/packages/db/src/seeding-datafiles.ts
+++ b/packages/db/src/seeding-datafiles.ts
@ -21,7 +21,7 @@ type FileName = {
  pos: POS;
 };

-const dataDir = "../../scripts/datafiles/";
+const dataDir = "./src/data/datafiles/";

 const parseFilename = (filename: string): FileName => {
  const parts = filename.replace(".json", "").split("-");
--- a/packages/db/tsconfig.json
+++ b/packages/db/tsconfig.json
@ -5,7 +5,7 @@
    "moduleResolution": "NodeNext",
    "outDir": "./dist",
    "resolveJsonModule": true,
-    "types": ["vitest/globals"]
+    "types": ["vitest/globals"],
  },
-  "include": ["src", "vitest.config.ts"]
+  "include": ["src", "vitest.config.ts"],
 }