updating seeding pipeline

2026-04-05 19:29:17 +02:00 · 2026-04-05 19:29:17 +02:00 · dfeb6a4cb0
commit dfeb6a4cb0
parent c49c2fe2c3
2 changed files with 149 additions and 203 deletions
--- a/packages/db/src/seeding-datafiles.ts
+++ b/packages/db/src/seeding-datafiles.ts
@ -1,203 +0,0 @@
 import fs from "node:fs/promises";
 import { eq } from "drizzle-orm";
 import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
 import { db } from "@glossa/db";
 import { terms, translations } from "@glossa/db/schema";
 // Element types of the shared arrays, so these stay in sync with whatever
 // @glossa/shared exports (literal unions, assuming the arrays are declared
 // `as const` — TODO confirm).
 type POS = (typeof SUPPORTED_POS)[number];
 type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];
 // One record of a data file as consumed by uploadSynsetToDB.
 type Synset = {
  synset_id: string;
  pos: POS;
  // Lemmas keyed by language code; any language may be missing.
  translations: Partial<Record<LANGUAGE_CODE, string[]>>;
 };
 // The three components encoded in a data filename (see parseFilename).
 type FileName = {
  sourceLang: LANGUAGE_CODE;
  targetLang: LANGUAGE_CODE;
  pos: POS;
 };
 // Relative path with a trailing slash: main() concatenates filenames onto it,
 // and Node resolves it against the process working directory.
 const dataDir = "./src/data/datafiles/";
 /**
  * Parse a data filename of the form `sourcelang-targetlang-pos.json`.
  *
  * A trailing `.json` extension is optional and is the only part that gets
  * stripped; a `.json` occurring anywhere else is treated as part of the name.
  *
  * @param filename - Bare filename (no directory component).
  * @returns The validated source language, target language and part of speech.
  * @throws Error if the name does not have exactly three `-`-separated parts,
  *   or if a part is not in SUPPORTED_LANGUAGE_CODES / SUPPORTED_POS.
  */
 const parseFilename = (filename: string): FileName => {
  // `replace(".json", "")` would drop the FIRST ".json" wherever it occurs
  // (turning "e.jsonn-it-noun" into "en-it-noun"), so only cut a real suffix.
  const extension = ".json";
  const stem = filename.endsWith(extension)
    ? filename.slice(0, -extension.length)
    : filename;
  const parts = stem.split("-");
  if (parts.length !== 3)
    throw new Error(
      `Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`,
    );
  const [sourceLang, targetLang, pos] = parts;
  // Local type guards let the compiler narrow the parts instead of trusting casts.
  const isLanguage = (value: string | undefined): value is LANGUAGE_CODE =>
    value !== undefined &&
    (SUPPORTED_LANGUAGE_CODES as readonly string[]).includes(value);
  const isPos = (value: string | undefined): value is POS =>
    value !== undefined && (SUPPORTED_POS as readonly string[]).includes(value);
  if (!isLanguage(sourceLang))
    throw new Error(`Unsupported language code: ${sourceLang}`);
  if (!isLanguage(targetLang))
    throw new Error(`Unsupported language code: ${targetLang}`);
  if (!isPos(pos)) throw new Error(`Unsupported POS: ${pos}`);
  return { sourceLang, targetLang, pos };
 };
 /**
  * Read a data file and return its top-level JSON array.
  *
  * Only the top-level shape is checked; the elements are NOT validated against
  * `Synset` and are returned as parsed.
  *
  * @param filepath - Path of the JSON file to read (UTF-8).
  * @returns The parsed array, typed as `Synset[]`.
  * @throws SyntaxError if the file is not valid JSON (message names the file).
  * @throws Error if the top-level JSON value is not an array (message names the file).
  */
 const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
  const raw = await fs.readFile(filepath, "utf8");
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (cause) {
    // Keep the SyntaxError type, but say which file is broken.
    const reason = cause instanceof Error ? cause.message : String(cause);
    throw new SyntaxError(`Invalid JSON in ${filepath}: ${reason}`);
  }
  if (!Array.isArray(parsed))
    throw new Error(`Expected a JSON array in ${filepath}`);
  return parsed as Synset[];
 };
 /**
  * Insert one synset as a term plus its translations, skipping rows that
  * already exist.
  *
  * The two inserts are separate statements (no transaction), so a failure in
  * the translation insert leaves the term row in place.
  *
  * @param synset - Record to store; `translations` entries that are missing,
  *   null or not arrays are ignored rather than crashing the run.
  * @param _fileInfo - Parsed filename of the source file; currently unused.
  * @returns Whether a new term row was created and how many translation rows
  *   the insert reported back.
  * @throws Error if the term insert returned no row and the term still cannot
  *   be found by `synset_id`.
  */
 const uploadSynsetToDB = async (
  synset: Synset,
  _fileInfo: FileName,
 ): Promise<{ termInserted: boolean; translationsInserted: number }> => {
  // 1. Try to insert the term. With onConflictDoNothing an existing term
  //    yields an empty result, so `created` is undefined in that case.
  const [created] = await db
    .insert(terms)
    .values({ synset_id: synset.synset_id, pos: synset.pos })
    .onConflictDoNothing()
    .returning({ id: terms.id });
  const termInserted = created !== undefined;
  let termId: string;
  if (created) {
    termId = created.id;
  } else {
    // Term already exists — fetch its real DB id for the FK
    const [existing] = await db
      .select({ id: terms.id })
      .from(terms)
      .where(eq(terms.synset_id, synset.synset_id))
      .limit(1);
    if (!existing)
      throw new Error(`Term not found after conflict: ${synset.synset_id}`);
    termId = existing.id;
  }
  // 2. Build translation rows. The input comes from JSON that was cast without
  //    validation, so tolerate a missing map and null / non-array lemma lists.
  const translationRows = Object.entries(synset.translations ?? {}).flatMap(
    ([lang, lemmas]) =>
      (Array.isArray(lemmas) ? lemmas : []).map((lemma) => ({
        id: crypto.randomUUID(),
        term_id: termId,
        language_code: lang as LANGUAGE_CODE,
        text: lemma,
      })),
  );
  if (translationRows.length === 0) {
    return { termInserted, translationsInserted: 0 };
  }
  // Duplicates are skipped silently; only rows actually written are returned.
  const result = await db
    .insert(translations)
    .values(translationRows)
    .onConflictDoNothing()
    .returning({ id: translations.id });
  return { termInserted, translationsInserted: result.length };
 };
 /**
  * Seed the database from every valid JSON file in `dataDir`.
  *
  * Runs four logged steps: list the directory, validate filenames (invalid
  * ones are skipped with a warning), upload each file's synsets one at a
  * time, then print the totals. Returns early when there is nothing to do.
  */
 const main = async () => {
  const rule = "##########################################";
  // Prints the spacer lines and boxed title that open each step.
  const announce = (title: string): void => {
    console.log("\n");
    console.log("\n");
    console.log(rule);
    console.log(title);
    console.log(rule);
  };

  announce("step 1: discovering files");
  console.log("🔍 Scanning datafiles directory...");
  const dirEntries = await fs.readdir(dataDir);
  const candidates = dirEntries.filter((name) => name.endsWith(".json"));
  if (candidates.length === 0) {
    console.warn("⚠️  No JSON files found in", dataDir);
    return;
  }
  console.log(`📁 Found ${candidates.length} file(s)\n`);

  announce("step 2: validating filenames");
  const accepted: { filename: string; fileInfo: FileName }[] = [];
  for (const filename of candidates) {
    try {
      const fileInfo = parseFilename(filename);
      accepted.push({ filename, fileInfo });
      console.log(
        `  ✅ ${filename} — ${fileInfo.sourceLang} → ${fileInfo.targetLang} (${fileInfo.pos})`,
      );
    } catch (problem) {
      console.warn(`  ⚠️  Skipping ${filename}: ${(problem as Error).message}`);
    }
  }
  if (accepted.length === 0) {
    console.error("❌ No valid files to process. Exiting.");
    return;
  }

  announce("step 3: processing each file");
  let grandTerms = 0;
  let grandTranslations = 0;
  let fileNo = 0;
  for (const { filename, fileInfo } of accepted) {
    fileNo += 1;
    const tag = `[${fileNo}/${accepted.length}]`;
    console.log(`\n${tag} 📄 ${filename}`);
    const synsets = await readFromJsonFile(dataDir + filename);
    console.log(`${tag} Loaded ${synsets.length} synsets`);
    let newTerms = 0;
    let newTranslations = 0;
    let done = 0;
    // Uploads are awaited one by one, in file order.
    for (const synset of synsets) {
      // Progress line every 500 synsets (never before the first one).
      if (done > 0 && done % 500 === 0) {
        console.log(`${tag} ⏳ ${done}/${synsets.length} synsets processed...`);
      }
      const outcome = await uploadSynsetToDB(synset, fileInfo);
      if (outcome.termInserted) newTerms++;
      newTranslations += outcome.translationsInserted;
      done += 1;
    }
    console.log(
      `${tag} ✅ Done — ${newTerms} new terms, ${newTranslations} new translations`,
    );
    grandTerms += newTerms;
    grandTranslations += newTranslations;
  }

  announce("step 4: final summary");
  console.log(`\n🎉 Seeding complete!`);
  console.log(`   Terms inserted:        ${grandTerms}`);
  console.log(`   Translations inserted: ${grandTranslations}`);
 };
 // Script entry point: log any failure from main() and exit with status 1.
 main().catch((failure) => {
  console.error(failure);
  process.exit(1);
 });
--- a/scripts/extract-own-save-to-json.py
+++ b/scripts/extract-own-save-to-json.py
@ -0,0 +1,149 @@
 """
 scripts/extract-own-save-to-json.py
 Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
 language and POS. Replaces extract-en-it-nouns.py.
 Output: one JSON file per POS, written to packages/db/src/data/datafiles/
  omw-noun.json
  omw-verb.json
 Each file is a JSON array of objects matching SynsetRecord in seed.ts:
  {
    "source_id": "ili:i12345",
    "pos": "noun",
    "translations": { "en": ["dog", "canine"], "it": ["cane"] },
    "glosses":      { "en": ["a domesticated animal..."] }
  }
 Translations and glosses are absent for a language if that wordnet has no
 coverage for the synset — the seed script handles sparse data gracefully.
 Usage:
  python scripts/extract-own-save-to-json.py [output_dir]
  output_dir defaults to packages/db/src/data/datafiles/
 Prerequisites:
  pip install wn
  python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
 """
 import json
 import sys
 from pathlib import Path
 import wn
 # Languages to extract; one Wordnet object is loaded per code.
 # Mirror constants.ts — update both places if languages or POS change.
 SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
 # Key: POS tag passed to Wordnet.synsets(pos=...).
 # Value: label written to each record's "pos" field and used in the
 # output filename (omw-<label>.json).
 POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
 }
 def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
    """Write one ``omw-<pos>.json`` file per entry of POS_MAP into output_dir.

    For every supported language the matching wordnet is loaded, then for each
    POS the synsets that carry an ILI are grouped by that ILI. Each group
    becomes one record with per-language lemma lists ("translations") and
    definition lists ("glosses"); languages with nothing to contribute are
    left out of the respective mapping. Synsets without an ILI are skipped.

    Args:
        output_dir: Directory to write into; created if it does not exist.

    Exits the process with status 1 if a wordnet cannot be loaded.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # All wordnets are opened first so a missing download fails fast.
    print("Loading wordnets...")
    loaded: dict[str, wn.Wordnet] = {}
    for code in SUPPORTED_LANGUAGE_CODES:
        try:
            loaded[code] = wn.Wordnet(lang=code)
            total = len(loaded[code].synsets())
            print(f"  {code}: {total:,} total synsets")
        except wn.Error as exc:
            print(f"  ERROR loading {code}: {exc}")
            print(f"  Run: python -c \"import wn; wn.download('omw-{code}:1.4')\"")
            sys.exit(1)

    for tag, label in POS_MAP.items():
        print(f"\n--- Extracting {label}s (pos='{tag}') ---")

        # ili -> lang -> {"lemmas": [...], "glosses": [...]}
        # A later synset with the same ILI and language replaces the earlier one.
        merged: dict[str, dict[str, dict[str, list[str]]]] = {}
        for code, wordnet in loaded.items():
            linked = 0
            for syn in wordnet.synsets(pos=tag):
                ili = syn.ili
                if ili:  # without an ILI the synset cannot be cross-linked
                    linked += 1
                    merged.setdefault(ili, {})[code] = {
                        "lemmas": [str(lemma) for lemma in syn.lemmas()],
                        "glosses": [text for text in syn.definitions() if text],
                    }
            print(f"  {code}: {linked:,} {label} synsets with ILI")

        # Sorted by ILI so the output file is stable and diffable. Records are
        # kept even when only one language has coverage.
        records: list[dict] = []
        for ili in sorted(merged):
            per_lang = merged[ili]
            records.append(
                {
                    "source_id": f"ili:{ili}",
                    "pos": label,
                    "translations": {
                        code: entry["lemmas"]
                        for code, entry in per_lang.items()
                        if entry["lemmas"]
                    },
                    "glosses": {
                        code: entry["glosses"]
                        for code, entry in per_lang.items()
                        if entry["glosses"]
                    },
                }
            )

        output_file = target_dir / f"omw-{label}.json"
        with output_file.open("w", encoding="utf-8") as handle:
            json.dump(records, handle, indent=2, ensure_ascii=False)
        print(f"\nWrote {len(records):,} {label} synsets → {output_file}")
        _print_coverage(records, label)
 def _print_coverage(records: list[dict], pos_label: str) -> None:
    """Print translation and gloss totals per supported language.

    Also prints records 1000-1004 as a sample; nothing follows the sample
    header when there are fewer than 1001 records.

    Args:
        records: Output records, each with "source_id", "translations" and
            "glosses" keys.
        pos_label: POS label used in the printed headings.
    """
    totals: dict[str, dict[str, int]] = {
        code: {"translations": 0, "glosses": 0} for code in SUPPORTED_LANGUAGE_CODES
    }
    for record in records:
        for field in ("translations", "glosses"):
            for code, items in record[field].items():
                # Languages outside SUPPORTED_LANGUAGE_CODES are not counted.
                if code in totals:
                    totals[code][field] += len(items)

    print(f"\nCoverage for {pos_label}s:")
    for code, tally in totals.items():
        n_trans = tally["translations"]
        n_gloss = tally["glosses"]
        mean = n_trans / len(records) if records else 0
        print(f"  {code}: {n_trans:,} translations ({mean:.1f} avg/synset), {n_gloss:,} glosses")

    # Sample output
    print(f"\nSample {pos_label}s (records 1000–1004):")
    for sample in records[1000:1005]:
        print(f"  {sample['source_id']}: {sample['translations']}")
 if __name__ == "__main__":
    # An optional first CLI argument overrides the default output directory.
    cli_args = sys.argv[1:]
    output_dir = cli_args[0] if cli_args else "packages/db/src/data/datafiles/"
    extract_all(output_dir)