222 lines
7.3 KiB
TypeScript
222 lines
7.3 KiB
TypeScript
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { fileURLToPath } from "node:url";
|
|
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
|
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
|
import { openDb } from "./index.js";
|
|
|
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * A single example sentence attached to a record.
 *
 * `source` tags where the sentence came from; only the two literals below are
 * accepted. The merge step in this file treats "cefr"-tagged examples
 * specially.
 */
type Example = {
  text: string;
  source: "omw" | "cefr";
};
|
|
|
|
/** Maps any subset of the supported language codes to a per-language payload. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/**
 * One entry of a stage 2 annotated JSON file, in the shape this module reads.
 *
 * Every per-language field may omit languages entirely.
 */
type AnnotatedRecord = {
  /** Identifier used to match the same record across the per-language files. */
  source_id: string;
  pos: SupportedPos;
  /** Words per language; de-duplicated before being inserted. */
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  /**
   * Per language, keyed by word. `cefr_source` is the value written to the
   * `cefr_level` column of `cefr_source_votes`.
   */
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
};
|
|
|
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
|
|
|
// ES modules do not provide __filename/__dirname, so both are derived from
// this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** Filesystem locations this script reads from. */
const PATHS = {
  // Directory holding the per-language stage 2 files (`<lang>.json`).
  annotatedDir: path.resolve(__dirname, "..", "stage-2-annotate", "output"),
};
|
|
|
|
// ── Loading ───────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Reads and parses `<annotatedDir>/<lang>.json`.
 *
 * The parsed JSON is cast to `AnnotatedRecord[]` without validation, so the
 * files are trusted to have that shape.
 */
async function readAnnotatedFile(lang: string): Promise<AnnotatedRecord[]> {
  const raw = await fs.readFile(
    path.join(PATHS.annotatedDir, `${lang}.json`),
    "utf-8",
  );
  return JSON.parse(raw) as AnnotatedRecord[];
}

/**
 * Copies every vote of `incoming` into `target`, language by language.
 * A word that already has a vote in `target` is overwritten.
 */
function mergeVotes(target: AnnotatedRecord, incoming: AnnotatedRecord): void {
  for (const [code, langVotes] of Object.entries(incoming.votes)) {
    const lang = code as SupportedLanguageCode;
    const merged = target.votes[lang] ?? {};
    Object.assign(merged, langVotes);
    target.votes[lang] = merged;
  }
}

/**
 * Appends the "cefr"-tagged examples of `incoming` to `target`, skipping any
 * whose text `target` already holds as a "cefr" example for that language.
 * Examples with any other source tag are never copied.
 */
function mergeCefrExamples(
  target: AnnotatedRecord,
  incoming: AnnotatedRecord,
): void {
  for (const [code, examples] of Object.entries(incoming.examples)) {
    const lang = code as SupportedLanguageCode;
    const current = target.examples[lang];

    // Texts of the CEFR examples already present, so repeats are not appended
    // a second time (also guards against repeats inside `examples` itself).
    const seen = new Set<string>();
    for (const existing of current ?? []) {
      if (existing.source === "cefr") seen.add(existing.text);
    }

    const fresh: Example[] = [];
    for (const example of examples) {
      if (example.source !== "cefr" || seen.has(example.text)) continue;
      seen.add(example.text);
      fresh.push(example);
    }
    if (fresh.length === 0) continue;

    if (current) {
      current.push(...fresh);
    } else {
      target.examples[lang] = fresh;
    }
  }
}

/**
 * Builds the merged record list from the stage 2 output directory.
 *
 * `en.json` is the base — it has the most complete glosses and examples.
 * From every other supported language file only the votes and the CEFR
 * examples are merged in; their translations and glosses are not read.
 * Records whose `source_id` does not appear in `en.json` are ignored.
 *
 * @returns The base records, mutated in place with the merged data, in the
 *   order they were first seen in `en.json`.
 * @throws If any `<lang>.json` cannot be read or is not valid JSON.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of await readAnnotatedFile("en")) {
    byId.set(record.source_id, record);
  }

  // Files are read one at a time, in SUPPORTED_LANGUAGE_CODES order, so a
  // later language wins when two files vote on the same word.
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    if (lang === "en") continue;

    for (const record of await readAnnotatedFile(lang)) {
      const target = byId.get(record.source_id);
      if (!target) continue;

      mergeVotes(target, record);
      mergeCefrExamples(target, record);
    }
  }

  return [...byId.values()];
}
|
|
|
|
// ── Import ────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Loads the merged stage 2 records and writes them into pipeline.db.
 *
 * For every record this inserts one `synsets` row, followed by its
 * translations (de-duplicated per language), glosses, examples and CEFR
 * source votes. All inserts run inside a single `db.transaction(...)` call,
 * and the per-table row counts are logged afterwards.
 *
 * The database handle is closed on every path once it has been opened, so a
 * failing statement no longer leaks it.
 *
 * @throws If the annotated files cannot be loaded or any statement fails.
 */
export async function importStage2(): Promise<void> {
  console.log("Loading stage 2 annotated files...");
  const records = await loadAnnotated();
  console.log(` Loaded ${records.length.toLocaleString()} synsets`);

  const db = openDb();
  try {
    const insertSynset = db.prepare(
      `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
    );

    const insertTranslation = db.prepare(
      `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
    );

    const insertGloss = db.prepare(
      `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
    );

    const insertExample = db.prepare(
      `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
    );

    // The vote row points at the translation inserted earlier for the same
    // (source_id, language, word) triple.
    // NOTE(review): if a voted word has no matching translation the sub-select
    // yields NULL — depending on the schema that either fails the whole import
    // or stores a dangling vote. Confirm which is intended.
    const insertCefrVote = db.prepare(`
      INSERT INTO cefr_source_votes (translation_id, cefr_level)
      VALUES (
        (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
        ?
      )
    `);

    console.log("\nImporting into pipeline.db...");

    // NOTE(review): presumably a better-sqlite3 style transaction wrapper that
    // rolls back when the callback throws — confirm against openDb().
    const importAll = db.transaction(() => {
      let synsets = 0;
      let translations = 0;
      let glosses = 0;
      let examples = 0;
      let cefrVotes = 0;

      for (const record of records) {
        insertSynset.run(record.source_id, record.pos);
        synsets++;

        // Translations — repeated words within one language are inserted once.
        for (const [lang, words] of Object.entries(record.translations)) {
          for (const word of new Set(words)) {
            insertTranslation.run(record.source_id, lang, word);
            translations++;
          }
        }

        // Glosses
        for (const [lang, glossList] of Object.entries(record.glosses)) {
          for (const text of glossList) {
            insertGloss.run(record.source_id, lang, text);
            glosses++;
          }
        }

        // Examples
        for (const [lang, exList] of Object.entries(record.examples)) {
          for (const example of exList) {
            insertExample.run(
              record.source_id,
              lang,
              example.text,
              example.source,
            );
            examples++;
          }
        }

        // CEFR source votes — must come after the translations of this record
        // so the sub-select above can find them.
        for (const [lang, langVotes] of Object.entries(record.votes)) {
          for (const [word, vote] of Object.entries(
            langVotes as Record<string, { cefr_source: string }>,
          )) {
            insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
            cefrVotes++;
          }
        }
      }

      return { synsets, translations, glosses, examples, cefrVotes };
    });

    const counts = importAll();

    console.log(` synsets: ${counts.synsets.toLocaleString()}`);
    console.log(` translations: ${counts.translations.toLocaleString()}`);
    console.log(` glosses: ${counts.glosses.toLocaleString()}`);
    console.log(` examples: ${counts.examples.toLocaleString()}`);
    console.log(` cefr votes: ${counts.cefrVotes.toLocaleString()}`);
  } finally {
    db.close();
  }

  console.log("\nImport complete.");
}
|
|
|
|
// ── Check if already imported ─────────────────────────────────────────────────
|
|
|
|
/**
 * Reports whether the `synsets` table already holds at least one row.
 *
 * Opens its own database handle and always closes it again, including when
 * the query throws.
 *
 * @returns `true` when `synsets` is non-empty.
 */
export function isImported(): boolean {
  const db = openDb();
  try {
    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
      count: number;
    };
    return row.count > 0;
  } finally {
    db.close();
  }
}
|
|
|
|
// ── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Script entry point: imports the stage 2 data unless the `synsets` table
 * already contains rows, in which case it only reports that and returns.
 *
 * The row count is queried here rather than through a boolean check because
 * the skip message prints the exact number.
 */
async function main(): Promise<void> {
  const db = openDb();
  let existing: number;
  try {
    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
      count: number;
    };
    existing = row.count;
  } finally {
    // Close before importing: importStage2() opens its own handle.
    db.close();
  }

  if (existing > 0) {
    console.log(
      `pipeline.db already contains ${existing.toLocaleString()} synsets — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    // Returning lets the process end on its own with status 0 once pending
    // output is flushed, instead of forcing it with process.exit().
    return;
  }

  await importStage2();
}
|
|
|
|
/** Logs a failure of the script and terminates the process with status 1. */
function reportFatal(error: unknown): void {
  console.error(error);
  process.exit(1);
}

// Runs as soon as this module is evaluated.
main().catch(reportFatal);
|