feat: add db import script, fix duplicate translations in extract, add annotate script
This commit is contained in:
parent
4a842140b9
commit
f59399be02
7 changed files with 274 additions and 62 deletions
222
data-pipeline/db/import.ts
Normal file
222
data-pipeline/db/import.ts
Normal file
|
|
@ -0,0 +1,222 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||
import { openDb } from "./index.js";
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Per-language lookup; a language key may be absent. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/** An example sentence together with a tag naming where it came from. */
type Example = {
  text: string;
  source: "omw" | "cefr";
};

/** Vote payload stored under each word in `AnnotatedRecord.votes`. */
type CefrVote = { cefr_source: string };

/**
 * One entry of an annotated JSON file as this importer reads it.
 * All per-language fields are keyed by language code; `votes` is additionally
 * keyed by word within each language.
 */
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  votes: PerLanguage<Record<string, CefrVote>>;
};
|
||||
|
||||
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// ES modules have no built-in __dirname, so derive it from this module's URL.
const moduleFile = fileURLToPath(import.meta.url);
const __dirname = path.dirname(moduleFile);

/** Filesystem locations this script reads from, resolved to absolute paths. */
const PATHS = {
  // Directory holding the per-language annotated JSON files.
  annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
};
|
||||
|
||||
// ── Loading ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Reads the annotated JSON file of every supported language and folds them
 * into a single list with one record per `source_id` found in `en.json`.
 *
 * `en.json` is the base. From each other language file, votes are merged into
 * the matching base record (later files overwrite earlier entries for the same
 * language + word), and CEFR-sourced examples are appended unless the base
 * record already holds a CEFR example with the same text for that language.
 * Records whose `source_id` does not appear in `en.json` are ignored.
 *
 * @returns The merged records, one per distinct `source_id` in `en.json`.
 * @throws Whatever `fs.readFile` or `JSON.parse` throws when a language file
 *   is missing, unreadable, or not valid JSON.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // NOTE(review): the parsed JSON is cast without validation — the file shape
  // is assumed to match AnnotatedRecord[]; confirm against the stage-2 writer.
  const readRecords = async (fileLang: string): Promise<AnnotatedRecord[]> => {
    const raw = await fs.readFile(
      path.join(PATHS.annotatedDir, `${fileLang}.json`),
      "utf-8",
    );
    return JSON.parse(raw) as AnnotatedRecord[];
  };

  // Use en.json as the base — it has the most complete glosses and examples.
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of await readRecords("en")) {
    byId.set(record.source_id, record);
  }

  // Files are read one at a time so only one non-base file is held in memory,
  // and so the merge order follows SUPPORTED_LANGUAGE_CODES.
  for (const fileLang of SUPPORTED_LANGUAGE_CODES) {
    if (fileLang === "en") continue;

    for (const record of await readRecords(fileLang)) {
      const target = byId.get(record.source_id);
      if (!target) continue;

      // Merge votes
      for (const [key, langVotes] of Object.entries(record.votes)) {
        const voteLang = key as SupportedLanguageCode;
        const merged = target.votes[voteLang] ?? {};
        Object.assign(merged, langVotes);
        target.votes[voteLang] = merged;
      }

      // Merge CEFR examples not already in base
      for (const [key, examples] of Object.entries(record.examples)) {
        const exampleLang = key as SupportedLanguageCode;
        const existing = target.examples[exampleLang] ?? [];

        // Texts of CEFR examples already present; used to skip repeats that
        // show up in more than one language file (or twice in the same file).
        const seenCefrTexts = new Set(
          existing.filter((e) => e.source === "cefr").map((e) => e.text),
        );

        for (const example of examples) {
          if (example.source !== "cefr") continue;
          if (seenCefrTexts.has(example.text)) continue;
          seenCefrTexts.add(example.text);
          existing.push(example);
        }

        // Only create the language key when there is something to store.
        if (existing.length > 0) {
          target.examples[exampleLang] = existing;
        }
      }
    }
  }

  return [...byId.values()];
}
|
||||
|
||||
// ── Import ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Loads the merged annotated records and inserts them into the database
 * returned by `openDb()`.
 *
 * For each record it inserts one `synsets` row, then its translations
 * (de-duplicated per language), glosses, examples, and CEFR source votes.
 * All inserts run inside a single `db.transaction(...)` callback, and the
 * per-table row counts are printed afterwards.
 *
 * The database handle is always closed, including when a statement fails.
 *
 * @throws Propagates errors from `loadAnnotated()` and from any database call.
 */
export async function importStage2(): Promise<void> {
  console.log("Loading stage 2 annotated files...");
  const records = await loadAnnotated();
  console.log(` Loaded ${records.length.toLocaleString()} synsets`);

  const db = openDb();
  try {
    const insertSynset = db.prepare(
      `INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
    );

    const insertTranslation = db.prepare(
      `INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
    );

    const insertGloss = db.prepare(
      `INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
    );

    const insertExample = db.prepare(
      `INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
    );

    // The vote row references its translation through a lookup on
    // (source_id, language, word).
    // NOTE(review): if no translation row matches, the subquery yields NULL —
    // confirm the schema rejects or tolerates that as intended.
    const insertCefrVote = db.prepare(`
      INSERT INTO cefr_source_votes (translation_id, cefr_level)
      VALUES (
        (SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
        ?
      )
    `);

    console.log("\nImporting into pipeline.db...");

    // NOTE(review): presumably a better-sqlite3-style transaction wrapper that
    // rolls back when the callback throws — confirm against openDb().
    const importAll = db.transaction(() => {
      let synsets = 0;
      let translations = 0;
      let glosses = 0;
      let examples = 0;
      let cefrVotes = 0;

      for (const record of records) {
        insertSynset.run(record.source_id, record.pos);
        synsets++;

        // Translations — repeated words within one language are inserted once.
        for (const [lang, words] of Object.entries(record.translations)) {
          const unique = [...new Set(words)];
          for (const word of unique) {
            insertTranslation.run(record.source_id, lang, word);
            translations++;
          }
        }

        // Glosses
        for (const [lang, glossList] of Object.entries(record.glosses)) {
          for (const text of glossList) {
            insertGloss.run(record.source_id, lang, text);
            glosses++;
          }
        }

        // Examples
        for (const [lang, exList] of Object.entries(record.examples)) {
          for (const example of exList) {
            insertExample.run(
              record.source_id,
              lang,
              example.text,
              example.source,
            );
            examples++;
          }
        }

        // CEFR source votes — must run after this record's translations so
        // the lookup in insertCefrVote can find them.
        for (const [lang, langVotes] of Object.entries(record.votes)) {
          for (const [word, vote] of Object.entries(
            langVotes as Record<string, { cefr_source: string }>,
          )) {
            insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
            cefrVotes++;
          }
        }
      }

      return { synsets, translations, glosses, examples, cefrVotes };
    });

    const counts = importAll();

    console.log(` synsets: ${counts.synsets.toLocaleString()}`);
    console.log(` translations: ${counts.translations.toLocaleString()}`);
    console.log(` glosses: ${counts.glosses.toLocaleString()}`);
    console.log(` examples: ${counts.examples.toLocaleString()}`);
    console.log(` cefr votes: ${counts.cefrVotes.toLocaleString()}`);
  } finally {
    // Release the handle even when a prepare or the transaction throws.
    db.close();
  }

  console.log("\nImport complete.");
}
|
||||
|
||||
// ── Check if already imported ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Reports whether the `synsets` table already holds at least one row.
 *
 * Opens its own database handle via `openDb()` and always closes it, even if
 * the query throws.
 *
 * @returns `true` when `SELECT COUNT(*)` on `synsets` is greater than zero.
 * @throws Propagates any error raised while preparing or running the query.
 */
export function isImported(): boolean {
  const db = openDb();
  try {
    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
      count: number;
    };
    return row.count > 0;
  } finally {
    db.close();
  }
}
|
||||
|
||||
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: runs the import unless the `synsets` table already
 * contains rows, in which case it prints a notice and does nothing.
 *
 * @throws Propagates errors from the row-count query and from `importStage2()`.
 */
async function main(): Promise<void> {
  const db = openDb();
  let existing: number;
  try {
    const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
      count: number;
    };
    existing = row.count;
  } finally {
    // Close before importing: importStage2() opens its own handle.
    db.close();
  }

  if (existing > 0) {
    console.log(
      `pipeline.db already contains ${existing.toLocaleString()} synsets — skipping import.`,
    );
    console.log("Delete pipeline.db and re-run db:init to start fresh.");
    // Return instead of process.exit(0): the process still ends with status 0
    // once nothing is pending, and buffered stdout is not cut short.
    return;
  }

  await importStage2();
}
|
||||
|
||||
// Kick off the script. Any rejection from main() is printed to stderr and the
// process exits with status 1.
// NOTE(review): this runs whenever the module is evaluated, which includes
// another module importing `importStage2` or `isImported` — confirm intended.
main().then(undefined, (failure: unknown) => {
  console.error(failure);
  process.exit(1);
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue