adding datafiles and seeding script

2026-03-31 10:05:36 +02:00 · 2026-03-31 10:05:36 +02:00 · 20fa6a9331
commit 20fa6a9331
parent 068949b4cb
7 changed files with 349852 additions and 3 deletions
--- a/packages/db/src/check-noun-coverage.ts
+++ b/packages/db/src/check-noun-coverage.ts
@ -0,0 +1,55 @@
 import fs from "node:fs/promises";
 import { db } from "@glossa/db";
 import { translations } from "@glossa/db/schema";
 import { inArray } from "drizzle-orm";
 // Input word list, read relative to the process's working directory.
 const wordlistPath = "./src/data/wordlists/top1000englishnouns";
 // Destination for the words that had no match. Derived from the input path
 // so the two locations cannot drift apart when the list is swapped.
 const unmatchedOutputPath = `${wordlistPath}-unmatched`;
 /**
  * Reports how many entries of the word list have a row in `translations`
  * with the same text, prints the unmatched entries and saves them to
  * `unmatchedOutputPath`.
  *
  * Rejects when the word list cannot be read, the query fails, or the output
  * file cannot be written.
  */
 const main = async () => {
  // 1. Read and normalise the word list (trim() also strips a trailing "\r")
  console.log("📖 Reading word list...");
  const raw = await fs.readFile(wordlistPath, "utf8");
  const words = raw
    .split("\n")
    .map((w) => w.trim().toLowerCase())
    .filter(Boolean);
  console.log(`   ${words.length} words loaded\n`);
  if (words.length === 0) {
    // Nothing to check: skip the query (an empty IN list is pointless and may
    // be rejected depending on the drizzle-orm version) and avoid printing a
    // 0/0 "NaN%" coverage figure.
    console.log("⚠️  Word list is empty, nothing to check.");
    return;
  }
  // 2. Query DB for matches
  // NOTE(review): the lookup compares the lower-cased words against
  // `translations.text` as stored and is not restricted to one language.
  // Confirm that this is intended for an English word list.
  console.log("🔍 Checking against database...");
  const rows = await db
    .select({ text: translations.text })
    .from(translations)
    .where(inArray(translations.text, words));
  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));
  // 3. Split into matched / unmatched (word-list order is preserved)
  const matched = words.filter((w) => matchedSet.has(w));
  const unmatched = words.filter((w) => !matchedSet.has(w));
  // 4. Terminal output
  console.log(`✅ Matched:   ${matched.length}/${words.length}`);
  console.log(`❌ Unmatched: ${unmatched.length}/${words.length}`);
  console.log(
    `📊 Coverage:  ${((matched.length / words.length) * 100).toFixed(1)}%\n`,
  );
  if (unmatched.length > 0) {
    console.log("❌ Unmatched words:");
    for (const w of unmatched) {
      console.log(`   ${w}`);
    }
  }
  // 5. Write unmatched to file. The write has to happen before the log line
  //    below, otherwise the message reports an output that was never produced.
  await fs.writeFile(unmatchedOutputPath, unmatched.join("\n"), "utf8");
  console.log(`\n💾 Unmatched words written to ${unmatchedOutputPath}`);
 };
 // Entry point: run the script; a rejection is printed and the process exits
 // with status 1.
 main().catch((failure) => {
  console.error(failure);
  process.exit(1);
 });
--- a/packages/db/src/create-test-deck.ts
+++ b/packages/db/src/create-test-deck.ts
@ -0,0 +1,49 @@
 /*
 Planned flow for this script. So far only reading the word list and looking
 the words up in translations are implemented; the remaining steps are still
 to be written.
 Parse CLI args → resolve the word list file path
 Connect to the database
 Read the word list file into an ordered array of strings
 Look up the en→it language pair ID from language_pairs
 Batch-fetch all matching rows from translations where language_code = 'en' and text IN (words)
 Build a word → termId map from the results
 Walk the ordered word list → split into hits (word found, capture position) and misses (skip)
 Check if a deck with this name already exists → if so, delete its deck_terms then the deck itself
 Insert the new decks row
 Insert all deck_terms rows in batches (deckId, termId, position)
 Log the skipped words
 Close the DB connection
 */
 import fs from "node:fs/promises";
 import { db } from "@glossa/db";
 import { translations } from "@glossa/db/schema";
 import { inArray, and, eq } from "drizzle-orm";
 // Word list to build the deck from; read relative to the process's working
 // directory.
 const wordlistPath = "./src/data/wordlists/top1000englishnouns";
 /**
  * Loads the word list and logs which of its entries have a row in
  * `translations` with the same text. No deck is created by this function.
  *
  * Rejects when the word list cannot be read or the query fails.
  */
 const main = async () => {
  // Read and normalise the word list (trim() also strips a trailing "\r")
  console.log("📖 Reading word list...");
  const raw = await fs.readFile(wordlistPath, "utf8");
  const words = raw
    .split("\n")
    .map((w) => w.trim().toLowerCase())
    .filter(Boolean);
  console.log(`   ${words.length} words loaded\n`);
  if (words.length === 0) {
    // Nothing to look up: skip the query, since an empty IN list is pointless
    // and may be rejected depending on the drizzle-orm version.
    console.log("⚠️  Word list is empty, nothing to look up.");
    return;
  }
  // Query DB for matches
  // NOTE(review): this matches on text only. The file header calls for
  // restricting the lookup to language_code = 'en'; that filter is not applied
  // here. Confirm the schema column before adding it to the where clause.
  console.log("🔍 Checking against database...");
  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .where(inArray(translations.text, words));
  // termId is selected but not consumed yet; only the matched texts are used.
  const matchedSet = new Set(rows.map((r) => r.text.toLowerCase()));
  const wordsInDb = words.filter((w) => matchedSet.has(w));
  console.log("wordsInDb: ", wordsInDb);
 };
 // Entry point: run the script; a rejection is printed and the process exits
 // with status 1.
 main().catch((failure) => {
  console.error(failure);
  process.exit(1);
 });
--- a/packages/db/src/data/datafiles/en-it-noun.json
+++ b/packages/db/src/data/datafiles/en-it-noun.json
--- a/packages/db/src/data/wordlists/top1000englishnouns
+++ b/packages/db/src/data/wordlists/top1000englishnouns
--- a/packages/db/src/data/wordlists/top1000englishnouns-unmatched
+++ b/packages/db/src/data/wordlists/top1000englishnouns-unmatched
@ -0,0 +1,34 @@
 a
 other
 us
 may
 st
 paul
 new
 software
 oxford
 english
 mary
 japan
 while
 pp
 membership
 manchester
 tony
 alan
 jones
 un
 northern
 simon
 behalf
 co
 graham
 joe
 guy
 lewis
 jane
 taylor
 co-operation
 travel
 self
 thatcher
--- a/packages/db/src/seeding-datafiles.ts
+++ b/packages/db/src/seeding-datafiles.ts
@ -21,7 +21,7 @@ type FileName = {
  pos: POS;
 };
-const dataDir = "../../scripts/datafiles/";
+const dataDir = "./src/data/datafiles/";
 const parseFilename = (filename: string): FileName => {
  const parts = filename.replace(".json", "").split("-");
--- a/packages/db/tsconfig.json
+++ b/packages/db/tsconfig.json
@ -5,7 +5,7 @@
    "moduleResolution": "NodeNext",
    "outDir": "./dist",
    "resolveJsonModule": true,
-    "types": ["vitest/globals"]
+    "types": ["vitest/globals"],
  },
-  "include": ["src", "vitest.config.ts"]
+  "include": ["src", "vitest.config.ts"],
 }