From dfeb6a4cb0a64720e25d4f2c886cd55a968ac814 Mon Sep 17 00:00:00 2001
From: lila
Date: Sun, 5 Apr 2026 19:29:17 +0200
Subject: [PATCH] updating seeding pipeline

---
 packages/db/src/seeding-datafiles.ts | 203 ---------------------------
 scripts/extract-own-save-to-json.py  | 149 ++++++++++++++++++++
 2 files changed, 149 insertions(+), 203 deletions(-)
 create mode 100644 scripts/extract-own-save-to-json.py

diff --git a/packages/db/src/seeding-datafiles.ts b/packages/db/src/seeding-datafiles.ts
index da854a2..e69de29 100644
--- a/packages/db/src/seeding-datafiles.ts
+++ b/packages/db/src/seeding-datafiles.ts
@@ -1,203 +0,0 @@
-import fs from "node:fs/promises";
-import { eq } from "drizzle-orm";
-
-import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
-import { db } from "@glossa/db";
-import { terms, translations } from "@glossa/db/schema";
-
-// the following generate unions of the imported const arrays
-type POS = (typeof SUPPORTED_POS)[number];
-type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];
-
-type Synset = {
-  synset_id: string;
-  pos: POS;
-  translations: Partial<Record<LANGUAGE_CODE, string[]>>;
-};
-
-type FileName = {
-  sourceLang: LANGUAGE_CODE;
-  targetLang: LANGUAGE_CODE;
-  pos: POS;
-};
-
-const dataDir = "./src/data/datafiles/";
-
-const parseFilename = (filename: string): FileName => {
-  const parts = filename.replace(".json", "").split("-");
-  if (parts.length !== 3)
-    throw new Error(
-      `Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`,
-    );
-  const [sourceLang, targetLang, pos] = parts;
-  if (!SUPPORTED_LANGUAGE_CODES.includes(sourceLang as LANGUAGE_CODE))
-    throw new Error(`Unsupported language code: ${sourceLang}`);
-  if (!SUPPORTED_LANGUAGE_CODES.includes(targetLang as LANGUAGE_CODE))
-    throw new Error(`Unsupported language code: ${targetLang}`);
-  if (!SUPPORTED_POS.includes(pos as POS))
-    throw new Error(`Unsupported POS: ${pos}`);
-  return {
-    sourceLang: sourceLang as LANGUAGE_CODE,
-    targetLang: targetLang as LANGUAGE_CODE,
-    pos: pos as POS,
-  };
-};
-
-const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
-  const data = await fs.readFile(filepath, "utf8");
-  const parsed = JSON.parse(data);
-  if (!Array.isArray(parsed)) throw new Error("Expected a JSON array");
-  return parsed as Synset[];
-};
-
-const uploadSynsetToDB = async (
-  synset: Synset,
-  _fileInfo: FileName,
-): Promise<{ termInserted: boolean; translationsInserted: number }> => {
-  // 1. Try to insert the term — skip if synset_id already exists
-  const inserted = await db
-    .insert(terms)
-    .values({ synset_id: synset.synset_id, pos: synset.pos })
-    .onConflictDoNothing()
-    .returning({ id: terms.id });
-
-  let termId: string;
-  let termInserted: boolean;
-
-  if (inserted.length > 0) {
-    termId = inserted[0]!.id;
-    termInserted = true;
-  } else {
-    // Term already exists — fetch its real DB id for the FK
-    const [existing] = await db
-      .select({ id: terms.id })
-      .from(terms)
-      .where(eq(terms.synset_id, synset.synset_id))
-      .limit(1);
-    if (!existing)
-      throw new Error(`Term not found after conflict: ${synset.synset_id}`);
-    termId = existing.id;
-    termInserted = false;
-  }
-
-  // 2. Build translation rows and upsert — skip duplicates silently
-  const translationRows = Object.entries(synset.translations).flatMap(
-    ([lang, lemmas]) =>
-      lemmas!.map((lemma) => ({
-        id: crypto.randomUUID(),
-        term_id: termId,
-        language_code: lang as LANGUAGE_CODE,
-        text: lemma,
-      })),
-  );
-
-  if (translationRows.length === 0) {
-    return { termInserted, translationsInserted: 0 };
-  }
-
-  const result = await db
-    .insert(translations)
-    .values(translationRows)
-    .onConflictDoNothing()
-    .returning({ id: translations.id });
-
-  return { termInserted, translationsInserted: result.length };
-};
-
-const main = async () => {
-  // step 1: discovering files
-  console.log("\n");
-  console.log("\n");
-  console.log("##########################################");
-  console.log("step 1: discovering files");
-  console.log("##########################################");
-
-  console.log("🔍 Scanning datafiles directory...");
-  const allFiles = await fs.readdir(dataDir);
-  const jsonFiles = allFiles.filter((f) => f.endsWith(".json"));
-
-  if (jsonFiles.length === 0) {
-    console.warn("⚠️ No JSON files found in", dataDir);
-    return;
-  }
-  console.log(`📁 Found ${jsonFiles.length} file(s)\n`);
-
-  // step 2: validating filenames
-  console.log("\n");
-  console.log("\n");
-  console.log("##########################################");
-  console.log("step 2: validating filenames");
-  console.log("##########################################");
-  const validFiles: { filename: string; fileInfo: FileName }[] = [];
-  for (const filename of jsonFiles) {
-    try {
-      const fileInfo = parseFilename(filename);
-      validFiles.push({ filename, fileInfo });
-      console.log(
-        ` ✅ ${filename} — ${fileInfo.sourceLang} → ${fileInfo.targetLang} (${fileInfo.pos})`,
-      );
-    } catch (e) {
-      console.warn(` ⚠️ Skipping ${filename}: ${(e as Error).message}`);
-    }
-  }
-
-  if (validFiles.length === 0) {
-    console.error("❌ No valid files to process. Exiting.");
-    return;
-  }
-
-  // step 3: processing each file
-  console.log("\n");
-  console.log("\n");
-  console.log("##########################################");
-  console.log("step 3: processing each file");
-  console.log("##########################################");
-  let totalTermsInserted = 0;
-  let totalTranslationsInserted = 0;
-
-  for (const [i, { filename, fileInfo }] of validFiles.entries()) {
-    const prefix = `[${i + 1}/${validFiles.length}]`;
-
-    console.log(`\n${prefix} 📄 ${filename}`);
-
-    const synsets = await readFromJsonFile(dataDir + filename);
-    console.log(`${prefix} Loaded ${synsets.length} synsets`);
-
-    let fileTermsInserted = 0;
-    let fileTranslationsInserted = 0;
-
-    for (const [j, synset] of synsets.entries()) {
-      if (j > 0 && j % 500 === 0) {
-        console.log(`${prefix} ⏳ ${j}/${synsets.length} synsets processed...`);
-      }
-
-      const { termInserted, translationsInserted } = await uploadSynsetToDB(
-        synset,
-        fileInfo,
-      );
-      if (termInserted) fileTermsInserted++;
-      fileTranslationsInserted += translationsInserted;
-    }
-
-    console.log(
-      `${prefix} ✅ Done — ${fileTermsInserted} new terms, ${fileTranslationsInserted} new translations`,
-    );
-    totalTermsInserted += fileTermsInserted;
-    totalTranslationsInserted += fileTranslationsInserted;
-  }
-
-  // step 4: Final summary
-  console.log("\n");
-  console.log("\n");
-  console.log("##########################################");
-  console.log("step 4: final summary");
-  console.log("##########################################");
-  console.log(`\n🎉 Seeding complete!`);
-  console.log(` Terms inserted: ${totalTermsInserted}`);
-  console.log(` Translations inserted: ${totalTranslationsInserted}`);
-};
-
-main().catch((error) => {
-  console.error(error);
-  process.exit(1);
-});
diff --git a/scripts/extract-own-save-to-json.py b/scripts/extract-own-save-to-json.py
new file mode 100644
index 0000000..2c27d78
--- /dev/null
+++ b/scripts/extract-own-save-to-json.py
@@ -0,0 +1,149 @@
+"""
+scripts/extract-own-save-to-json.py
+
+Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
+language and POS. Replaces extract-en-it-nouns.py.
+
+Output: one JSON file per POS, written to packages/db/src/data/datafiles/
+  omw-noun.json
+  omw-verb.json
+
+Each file is a JSON array of objects matching SynsetRecord in seed.ts:
+  {
+    "source_id": "ili:i12345",
+    "pos": "noun",
+    "translations": { "en": ["dog", "canine"], "it": ["cane"] },
+    "glosses": { "en": ["a domesticated animal..."] }
+  }
+
+Translations and glosses are absent for a language if that wordnet has no
+coverage for the synset — the seed script handles sparse data gracefully.
+
+Usage:
+  python scripts/extract-own-save-to-json.py [output_dir]
+
+  output_dir defaults to packages/db/src/data/datafiles/
+
+Prerequisites:
+  pip install wn
+  python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
+"""
+
+import json
+import sys
+from pathlib import Path
+
+import wn
+
+# Mirror constants.ts — update both places if languages or POS change.
+SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
+POS_MAP: dict[str, str] = {
+    "n": "noun",
+    "v": "verb",
+}
+
+
+def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
+    out = Path(output_dir)
+    out.mkdir(parents=True, exist_ok=True)
+
+    # Load one Wordnet object per language up front.
+    print("Loading wordnets...")
+    wordnets: dict[str, wn.Wordnet] = {}
+    for lang in SUPPORTED_LANGUAGE_CODES:
+        try:
+            wordnets[lang] = wn.Wordnet(lang=lang)
+            synset_count = len(wordnets[lang].synsets())
+            print(f" {lang}: {synset_count:,} total synsets")
+        except wn.Error as e:
+            print(f" ERROR loading {lang}: {e}")
+            print(f" Run: python -c \"import wn; wn.download('omw-{lang}:1.4')\"")
+            sys.exit(1)
+
+    for omw_pos, pos_label in POS_MAP.items():
+        print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")
+
+        # Collect per-ILI data across all languages.
+        # Structure: { ili -> { lang -> { "lemmas": [...], "glosses": [...] } } }
+        by_ili: dict[str, dict[str, dict[str, list[str]]]] = {}
+
+        for lang, wnet in wordnets.items():
+            synsets = wnet.synsets(pos=omw_pos)
+            covered = 0
+            for synset in synsets:
+                ili = synset.ili
+                if not ili:
+                    continue  # skip synsets without an ILI — can't cross-link
+                covered += 1
+                if ili not in by_ili:
+                    by_ili[ili] = {}
+
+                lemmas = [str(lemma) for lemma in synset.lemmas()]
+                defns = [d for d in synset.definitions() if d]
+
+                by_ili[ili][lang] = {"lemmas": lemmas, "glosses": defns}
+
+            print(f" {lang}: {covered:,} {pos_label} synsets with ILI")
+
+        # Build output records — sort by ILI for a stable, diffable file.
+        records: list[dict] = []
+        for ili in sorted(by_ili.keys()):
+            lang_data = by_ili[ili]
+            translations: dict[str, list[str]] = {}
+            glosses: dict[str, list[str]] = {}
+
+            for lang, data in lang_data.items():
+                if data["lemmas"]:
+                    translations[lang] = data["lemmas"]
+                if data["glosses"]:
+                    glosses[lang] = data["glosses"]
+
+            # Include the record even if only one language has coverage —
+            # the seed script imports all terms regardless of cross-language overlap.
+            records.append(
+                {
+                    "source_id": f"ili:{ili}",
+                    "pos": pos_label,
+                    "translations": translations,
+                    "glosses": glosses,
+                }
+            )
+
+        output_file = out / f"omw-{pos_label}.json"
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(records, f, indent=2, ensure_ascii=False)
+
+        print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
+        _print_coverage(records, pos_label)
+
+
+def _print_coverage(records: list[dict], pos_label: str) -> None:
+    """Print per-language lemma and gloss totals, then records[1000:1005] as a sample."""
+    lang_stats: dict[str, dict[str, int]] = {}
+    for lang in SUPPORTED_LANGUAGE_CODES:
+        lang_stats[lang] = {"translations": 0, "glosses": 0}
+
+    for r in records:
+        for lang, lemmas in r["translations"].items():
+            if lang in lang_stats:
+                lang_stats[lang]["translations"] += len(lemmas)
+        for lang, gloss_list in r["glosses"].items():
+            if lang in lang_stats:
+                lang_stats[lang]["glosses"] += len(gloss_list)
+
+    print(f"\nCoverage for {pos_label}s:")
+    for lang, counts in lang_stats.items():
+        t = counts["translations"]
+        g = counts["glosses"]
+        avg_t = t / len(records) if records else 0
+        print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")
+
+    # Sample output
+    print(f"\nSample {pos_label}s (records 1000–1004):")
+    for r in records[1000:1005]:
+        print(f" {r['source_id']}: {r['translations']}")
+
+
+if __name__ == "__main__":
+    output_dir = sys.argv[1] if len(sys.argv) > 1 else "packages/db/src/data/datafiles/"
+    extract_all(output_dir)