updating seeding pipeline
This commit is contained in:
parent
c49c2fe2c3
commit
dfeb6a4cb0
2 changed files with 149 additions and 203 deletions
|
|
@ -1,203 +0,0 @@
|
||||||
import fs from "node:fs/promises";
|
|
||||||
import { eq } from "drizzle-orm";
|
|
||||||
|
|
||||||
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@glossa/shared";
|
|
||||||
import { db } from "@glossa/db";
|
|
||||||
import { terms, translations } from "@glossa/db/schema";
|
|
||||||
|
|
||||||
// Union types derived from the const arrays imported from @glossa/shared.
// NOTE(review): the unions are only literal unions if those arrays are
// declared `as const` — confirm in the shared package.

/** Any one element of SUPPORTED_POS. */
type POS = (typeof SUPPORTED_POS)[number];

/** Any one element of SUPPORTED_LANGUAGE_CODES. */
type LANGUAGE_CODE = (typeof SUPPORTED_LANGUAGE_CODES)[number];
|
|
||||||
|
|
||||||
/** One synset record as read from a JSON data file. */
type Synset = {
  /** Identifier stored in `terms.synset_id`; used to detect existing terms. */
  synset_id: string;
  /** Part of speech stored in `terms.pos`. */
  pos: POS;
  /** Lemmas keyed by language code; a language may be missing entirely. */
  translations: Partial<Record<LANGUAGE_CODE, string[]>>;
};
|
|
||||||
|
|
||||||
/** Components encoded in a data file name (`sourcelang-targetlang-pos.json`). */
type FileName = {
  /** First `-`-separated part of the file name. */
  sourceLang: LANGUAGE_CODE;
  /** Second `-`-separated part of the file name. */
  targetLang: LANGUAGE_CODE;
  /** Third `-`-separated part of the file name. */
  pos: POS;
};
|
|
||||||
|
|
||||||
// Directory scanned for JSON data files. It is a relative path and keeps its
// trailing slash because file names are appended to it by plain concatenation.
const dataDir = "./src/data/datafiles/";
|
|
||||||
|
|
||||||
/**
 * Parse a data file name of the form `sourcelang-targetlang-pos.json`.
 *
 * @param filename - Bare file name, without a directory part.
 * @returns The validated source language, target language and part of speech.
 * @throws Error when the name does not consist of exactly three
 *   `-`-separated parts, or when a part is not listed in
 *   SUPPORTED_LANGUAGE_CODES / SUPPORTED_POS.
 */
const parseFilename = (filename: string): FileName => {
  const extension = ".json";
  // Strip only a trailing extension. `String.prototype.replace` with a string
  // pattern removes the first occurrence wherever it appears, which mangles a
  // name such as "en.json-it-noun.json".
  const stem = filename.endsWith(extension)
    ? filename.slice(0, -extension.length)
    : filename;

  const parts = stem.split("-");
  if (parts.length !== 3)
    throw new Error(
      `Invalid filename format: ${filename}. Expected: sourcelang-targetlang-pos.json`,
    );

  const [sourceLang, targetLang, pos] = parts;

  // The casts below only satisfy `includes`' parameter type; the runtime
  // membership test is what actually validates each part.
  if (!SUPPORTED_LANGUAGE_CODES.includes(sourceLang as LANGUAGE_CODE))
    throw new Error(`Unsupported language code: ${sourceLang}`);
  if (!SUPPORTED_LANGUAGE_CODES.includes(targetLang as LANGUAGE_CODE))
    throw new Error(`Unsupported language code: ${targetLang}`);
  if (!SUPPORTED_POS.includes(pos as POS))
    throw new Error(`Unsupported POS: ${pos}`);

  return {
    sourceLang: sourceLang as LANGUAGE_CODE,
    targetLang: targetLang as LANGUAGE_CODE,
    pos: pos as POS,
  };
};
|
|
||||||
|
|
||||||
/**
 * Read a UTF-8 JSON file and return its top-level array.
 *
 * Only the top-level shape is checked: the elements are returned as `Synset`
 * without per-record validation.
 *
 * @param filepath - Path of the JSON file to read.
 * @returns The parsed array.
 * @throws Error when the parsed document is not an array. Read failures and
 *   JSON syntax errors propagate unchanged from `fs.readFile` / `JSON.parse`.
 */
const readFromJsonFile = async (filepath: string): Promise<Synset[]> => {
  const data = await fs.readFile(filepath, "utf8");
  // Keep the parse result as `unknown` so nothing is used before the check.
  const parsed: unknown = JSON.parse(data);
  if (!Array.isArray(parsed))
    throw new Error(`Expected a JSON array in ${filepath}`);
  return parsed as Synset[];
};
|
|
||||||
|
|
||||||
/**
 * Insert one synset's term and its translations, skipping rows that conflict
 * with existing ones.
 *
 * The term insert and the translation insert are two separate statements; they
 * are not wrapped in a transaction here.
 *
 * @param synset - Record to store.
 * @param _fileInfo - Parsed file name of the source file; currently unused.
 * @returns Whether a new term row was created, and how many translation rows
 *   the insert returned.
 * @throws Error when the term insert returns no row and the follow-up lookup
 *   by `synset_id` finds nothing either.
 */
const uploadSynsetToDB = async (
  synset: Synset,
  _fileInfo: FileName,
): Promise<{ termInserted: boolean; translationsInserted: number }> => {
  // 1. Try to insert the term — nothing is returned if it conflicted.
  const [created] = await db
    .insert(terms)
    .values({ synset_id: synset.synset_id, pos: synset.pos })
    .onConflictDoNothing()
    .returning({ id: terms.id });

  let termId: string;
  let termInserted: boolean;

  if (created) {
    termId = created.id;
    termInserted = true;
  } else {
    // Term already exists — fetch its real DB id for the FK.
    const [existing] = await db
      .select({ id: terms.id })
      .from(terms)
      .where(eq(terms.synset_id, synset.synset_id))
      .limit(1);
    if (!existing)
      throw new Error(`Term not found after conflict: ${synset.synset_id}`);
    termId = existing.id;
    termInserted = false;
  }

  // 2. Build one translation row per lemma. A language whose value is
  //    null/undefined contributes no rows instead of throwing.
  const translationRows = Object.entries(synset.translations).flatMap(
    ([lang, lemmas]) =>
      (lemmas ?? []).map((lemma) => ({
        id: crypto.randomUUID(),
        term_id: termId,
        language_code: lang as LANGUAGE_CODE,
        text: lemma,
      })),
  );

  if (translationRows.length === 0) {
    return { termInserted, translationsInserted: 0 };
  }

  // 3. Insert the rows; conflicting rows are skipped and not returned, so the
  //    length of the result is the number actually inserted.
  const result = await db
    .insert(translations)
    .values(translationRows)
    .onConflictDoNothing()
    .returning({ id: translations.id });

  return { termInserted, translationsInserted: result.length };
};
|
|
||||||
|
|
||||||
/** Print the two spacer lines and the `#`-framed title that open each step. */
const printStepBanner = (title: string): void => {
  console.log("\n");
  console.log("\n");
  console.log("##########################################");
  console.log(title);
  console.log("##########################################");
};

/**
 * Run the seeding pipeline:
 *   1. list the `.json` files in `dataDir`,
 *   2. keep those whose names parse (others are skipped with a warning),
 *   3. upload every synset of every kept file, one at a time, in order,
 *   4. print the totals.
 *
 * Returns early (without throwing) when there are no JSON files or no valid
 * file names. Errors from reading a file or from the database are not caught
 * here and reject the returned promise.
 */
const main = async () => {
  // step 1: discovering files
  printStepBanner("step 1: discovering files");

  console.log("🔍 Scanning datafiles directory...");
  const allFiles = await fs.readdir(dataDir);
  const jsonFiles = allFiles.filter((f) => f.endsWith(".json"));

  if (jsonFiles.length === 0) {
    console.warn("⚠️ No JSON files found in", dataDir);
    return;
  }
  console.log(`📁 Found ${jsonFiles.length} file(s)\n`);

  // step 2: validating filenames
  printStepBanner("step 2: validating filenames");
  const validFiles: { filename: string; fileInfo: FileName }[] = [];
  for (const filename of jsonFiles) {
    try {
      const fileInfo = parseFilename(filename);
      validFiles.push({ filename, fileInfo });
      console.log(
        ` ✅ ${filename} — ${fileInfo.sourceLang} → ${fileInfo.targetLang} (${fileInfo.pos})`,
      );
    } catch (e) {
      // A caught value is not guaranteed to be an Error; narrow before use.
      const reason = e instanceof Error ? e.message : String(e);
      console.warn(` ⚠️ Skipping ${filename}: ${reason}`);
    }
  }

  if (validFiles.length === 0) {
    console.error("❌ No valid files to process. Exiting.");
    return;
  }

  // step 3: processing each file
  printStepBanner("step 3: processing each file");
  let totalTermsInserted = 0;
  let totalTranslationsInserted = 0;

  for (const [i, { filename, fileInfo }] of validFiles.entries()) {
    const prefix = `[${i + 1}/${validFiles.length}]`;

    console.log(`\n${prefix} 📄 ${filename}`);

    const synsets = await readFromJsonFile(dataDir + filename);
    console.log(`${prefix} Loaded ${synsets.length} synsets`);

    let fileTermsInserted = 0;
    let fileTranslationsInserted = 0;

    for (const [j, synset] of synsets.entries()) {
      // Progress line every 500 synsets (not before the first one).
      if (j > 0 && j % 500 === 0) {
        console.log(`${prefix} ⏳ ${j}/${synsets.length} synsets processed...`);
      }

      const { termInserted, translationsInserted } = await uploadSynsetToDB(
        synset,
        fileInfo,
      );
      if (termInserted) fileTermsInserted++;
      fileTranslationsInserted += translationsInserted;
    }

    console.log(
      `${prefix} ✅ Done — ${fileTermsInserted} new terms, ${fileTranslationsInserted} new translations`,
    );
    totalTermsInserted += fileTermsInserted;
    totalTranslationsInserted += fileTranslationsInserted;
  }

  // step 4: final summary
  printStepBanner("step 4: final summary");
  console.log(`\n🎉 Seeding complete!`);
  console.log(` Terms inserted: ${totalTermsInserted}`);
  console.log(` Translations inserted: ${totalTranslationsInserted}`);
};
|
|
||||||
|
|
||||||
// Script entry point: run the pipeline; if it rejects, log the error and exit
// with status 1.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|
|
||||||
149
scripts/extract-own-save-to-json.py
Normal file
149
scripts/extract-own-save-to-json.py
Normal file
|
|
@ -0,0 +1,149 @@
|
||||||
|
"""
|
||||||
|
scripts/extract-own-save-to-json.py

Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
language and POS. Replaces extract-en-it-nouns.py.

Output: one JSON file per POS, written to packages/db/src/data/datafiles/
    omw-noun.json
    omw-verb.json

Each file is a JSON array of objects matching SynsetRecord in seed.ts:
    {
        "source_id": "ili:i12345",
        "pos": "noun",
        "translations": { "en": ["dog", "canine"], "it": ["cane"] },
        "glosses": { "en": ["a domesticated animal..."] }
    }

Translations and glosses are absent for a language if that wordnet has no
coverage for the synset — the seed script handles sparse data gracefully.

Usage:
    python scripts/extract-own-save-to-json.py [output_dir]
|
||||||
|
|
||||||
|
output_dir defaults to packages/db/src/data/datafiles/
|
||||||
|
|
||||||
|
Prerequisites:
|
||||||
|
pip install wn
|
||||||
|
python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import wn
|
||||||
|
|
||||||
|
# Mirror constants.ts — update both places if languages or POS change.
# Language codes to extract; one wordnet is loaded per entry.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
# Wordnet POS tag (passed to Wordnet.synsets(pos=...)) -> label written to the
# "pos" field and used in the output file name (omw-<label>.json).
POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _append_missing(target: list[str], items: list[str]) -> None:
    """Append to ``target`` every item of ``items`` it does not already hold."""
    for item in items:
        if item not in target:
            target.append(item)


def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
    """Write one ``omw-<pos>.json`` file per entry of POS_MAP.

    For each POS, synsets of every language in SUPPORTED_LANGUAGE_CODES are
    grouped by their ILI and written as a JSON array sorted by ILI. Synsets
    without an ILI are skipped. A language key is omitted from
    "translations" / "glosses" when it has no lemmas / definitions.

    Args:
        output_dir: Directory to write into; created if missing.

    Exits the process with status 1 if a wordnet cannot be loaded.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Load one Wordnet object per language up front.
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {lang}: {e}")
            print(f" Run: python -c \"import wn; wn.download('omw-{lang}:1.4')\"")
            sys.exit(1)

    for omw_pos, pos_label in POS_MAP.items():
        print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")

        # Collect per-ILI data across all languages.
        # Structure: { ili -> { lang -> { "lemmas": [...], "glosses": [...] } } }
        by_ili: dict[str, dict[str, dict[str, list[str]]]] = {}

        for lang, wnet in wordnets.items():
            covered = 0
            for synset in wnet.synsets(pos=omw_pos):
                ili = synset.ili
                if not ili:
                    continue  # skip synsets without an ILI — can't cross-link
                # NOTE(review): depending on the installed `wn` version,
                # `synset.ili` may be an identifier string or an object with
                # an `.id` attribute — confirm. Normalise to a plain string so
                # the keys sort and format consistently.
                ili_id = getattr(ili, "id", ili)
                if not ili_id:
                    continue
                ili_key = str(ili_id)
                covered += 1

                lemmas = [str(lemma) for lemma in synset.lemmas()]
                defns = [d for d in synset.definitions() if d]

                per_lang = by_ili.setdefault(ili_key, {})
                existing = per_lang.get(lang)
                if existing is None:
                    per_lang[lang] = {"lemmas": lemmas, "glosses": defns}
                else:
                    # A second synset of the same language with the same ILI:
                    # merge instead of overwriting the earlier one.
                    _append_missing(existing["lemmas"], lemmas)
                    _append_missing(existing["glosses"], defns)

            print(f" {lang}: {covered:,} {pos_label} synsets with ILI")

        # Build output records — sort by ILI for a stable, diffable file.
        records: list[dict] = []
        for ili_key in sorted(by_ili.keys()):
            lang_data = by_ili[ili_key]
            translations: dict[str, list[str]] = {}
            glosses: dict[str, list[str]] = {}

            for lang, data in lang_data.items():
                if data["lemmas"]:
                    translations[lang] = data["lemmas"]
                if data["glosses"]:
                    glosses[lang] = data["glosses"]

            # Include the record even if only one language has coverage —
            # the seed script imports all terms regardless of cross-language overlap.
            records.append(
                {
                    "source_id": f"ili:{ili_key}",
                    "pos": pos_label,
                    "translations": translations,
                    "glosses": glosses,
                }
            )

        output_file = out / f"omw-{pos_label}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)

        print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
        _print_coverage(records, pos_label)
|
||||||
|
|
||||||
|
|
||||||
|
def _print_coverage(
    records: list[dict],
    pos_label: str,
    sample_start: int = 1000,
    sample_size: int = 5,
) -> None:
    """Print per-language translation and gloss counts, then a few sample records.

    Args:
        records: Output records, each with "source_id", "translations" and
            "glosses" keys.
        pos_label: POS label used in the printed headings.
        sample_start: Index of the first record shown in the sample.
        sample_size: Number of records shown in the sample. If ``records`` is
            shorter than the window, only the heading (and any records that
            do fall inside it) is printed.
    """
    lang_stats: dict[str, dict[str, int]] = {
        lang: {"translations": 0, "glosses": 0} for lang in SUPPORTED_LANGUAGE_CODES
    }

    # Languages not listed in SUPPORTED_LANGUAGE_CODES are ignored.
    for r in records:
        for lang, lemmas in r["translations"].items():
            if lang in lang_stats:
                lang_stats[lang]["translations"] += len(lemmas)
        for lang, gloss_list in r["glosses"].items():
            if lang in lang_stats:
                lang_stats[lang]["glosses"] += len(gloss_list)

    print(f"\nCoverage for {pos_label}s:")
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        avg_t = t / len(records) if records else 0  # avoid dividing by zero
        print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")

    # Sample output
    sample_end = sample_start + sample_size
    print(f"\nSample {pos_label}s (records {sample_start}–{sample_end - 1}):")
    for r in records[sample_start:sample_end]:
        print(f" {r['source_id']}: {r['translations']}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: the optional first CLI argument overrides the output
# directory.
if __name__ == "__main__":
    output_dir = sys.argv[1] if len(sys.argv) > 1 else "packages/db/src/data/datafiles/"
    extract_all(output_dir)
|
||||||
Loading…
Add table
Add a link
Reference in a new issue