feat: add Kaikki extraction and import scripts for stage 1
- Add stage-1-extract/scripts/extract.ts — streams Kaikki JSONL, filters to supported POS and languages, and skips abbreviations and senses with no translations in supported languages
- Rewrite db/import.ts for the Kaikki flat model — tracks sense_index offsets per headword+pos to handle duplicate JSONL entries
- Rewrite db/schema.sql for the Kaikki model — entries, translations, LLM vote tables, resolved tables
- Add extract and db:import scripts to package.json
- Sample mode is hardcoded to 500 entries for development
This commit is contained in:
parent
963bff4eb8
commit
209d52f54b
17 changed files with 346 additions and 1055737 deletions
|
|
@ -1,185 +1,98 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||
import { openDb } from "./index.js";
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
type Example = { text: string; source: "omw" | "cefr" };
|
||||
|
||||
type AnnotatedRecord = {
|
||||
source_id: string;
|
||||
pos: SupportedPos;
|
||||
translations: Partial<Record<SupportedLanguageCode, string[]>>;
|
||||
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
|
||||
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
|
||||
votes: Partial<
|
||||
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
|
||||
>;
|
||||
};
|
||||
import type { ExtractedSense } from "../stage-1-extract/scripts/extract.js";
|
||||
|
||||
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
const PATHS = {
|
||||
annotatedDir: path.resolve(__dirname, "../stage-2-annotate/output"),
|
||||
extracted: path.resolve(__dirname, "../stage-1-extract/output/en.json"),
|
||||
};
|
||||
|
||||
// ── Loading ───────────────────────────────────────────────────────────────────
|
||||
|
||||
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
|
||||
// Use en.json as the base — it has the most complete glosses and examples.
|
||||
// Merge votes and CEFR examples from the other language files.
|
||||
const baseRaw = await fs.readFile(
|
||||
path.join(PATHS.annotatedDir, "en.json"),
|
||||
"utf-8",
|
||||
);
|
||||
const base = JSON.parse(baseRaw) as AnnotatedRecord[];
|
||||
|
||||
const byId = new Map<string, AnnotatedRecord>();
|
||||
for (const record of base) {
|
||||
byId.set(record.source_id, record);
|
||||
}
|
||||
|
||||
for (const lang of SUPPORTED_LANGUAGE_CODES) {
|
||||
if (lang === "en") continue;
|
||||
|
||||
const raw = await fs.readFile(
|
||||
path.join(PATHS.annotatedDir, `${lang}.json`),
|
||||
"utf-8",
|
||||
);
|
||||
const records = JSON.parse(raw) as AnnotatedRecord[];
|
||||
|
||||
for (const record of records) {
|
||||
const base = byId.get(record.source_id);
|
||||
if (!base) continue;
|
||||
|
||||
// Merge votes
|
||||
for (const [l, langVotes] of Object.entries(record.votes)) {
|
||||
if (!base.votes[l as SupportedLanguageCode]) {
|
||||
base.votes[l as SupportedLanguageCode] = {};
|
||||
}
|
||||
Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes);
|
||||
}
|
||||
|
||||
// Merge CEFR examples not already in base
|
||||
for (const [l, examples] of Object.entries(record.examples)) {
|
||||
const lang = l as SupportedLanguageCode;
|
||||
const cefrExamples = examples.filter((e) => e.source === "cefr");
|
||||
if (cefrExamples.length === 0) continue;
|
||||
|
||||
if (!base.examples[lang]) {
|
||||
base.examples[lang] = cefrExamples;
|
||||
} else {
|
||||
base.examples[lang].push(...cefrExamples);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [...byId.values()];
|
||||
}
|
||||
|
||||
// ── Import ────────────────────────────────────────────────────────────────────
|
||||
|
||||
export async function importStage2(): Promise<void> {
|
||||
console.log("Loading stage 2 annotated files...");
|
||||
const records = await loadAnnotated();
|
||||
console.log(` Loaded ${records.length.toLocaleString()} synsets`);
|
||||
export async function importKaikki(): Promise<void> {
|
||||
console.log("Loading extracted Kaikki data...");
|
||||
const raw = await fs.readFile(PATHS.extracted, "utf-8");
|
||||
const senses = JSON.parse(raw) as ExtractedSense[];
|
||||
console.log(` Loaded ${senses.length.toLocaleString()} senses`);
|
||||
|
||||
const db = openDb();
|
||||
|
||||
const insertSynset = db.prepare(
|
||||
`INSERT INTO synsets (source_id, pos) VALUES (?, ?)`,
|
||||
);
|
||||
|
||||
const insertTranslation = db.prepare(
|
||||
`INSERT INTO translations (source_id, language, word) VALUES (?, ?, ?)`,
|
||||
);
|
||||
|
||||
const insertGloss = db.prepare(
|
||||
`INSERT INTO glosses (source_id, language, text) VALUES (?, ?, ?)`,
|
||||
);
|
||||
|
||||
const insertExample = db.prepare(
|
||||
`INSERT INTO examples (source_id, language, text, source) VALUES (?, ?, ?, ?)`,
|
||||
);
|
||||
|
||||
const insertCefrVote = db.prepare(`
|
||||
INSERT INTO cefr_source_votes (translation_id, cefr_level)
|
||||
VALUES (
|
||||
(SELECT id FROM translations WHERE source_id = ? AND language = ? AND word = ?),
|
||||
?
|
||||
)
|
||||
const insertEntry = db.prepare(`
|
||||
INSERT INTO entries (headword, language, pos, sense_index, gloss, examples)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT (headword, language, pos, sense_index)
|
||||
DO UPDATE SET
|
||||
gloss = excluded.gloss,
|
||||
examples = excluded.examples
|
||||
RETURNING id
|
||||
`);
|
||||
|
||||
const insertTranslation = db.prepare(`
|
||||
INSERT INTO translations (entry_id, target_lang, word, sense_hint)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT (entry_id, target_lang, word) DO NOTHING
|
||||
`);
|
||||
|
||||
// Track next available sense_index per (headword, pos) to handle
|
||||
// the same word appearing in multiple JSONL entries with the same POS.
|
||||
const senseIndexMap = new Map<string, number>();
|
||||
|
||||
console.log("\nImporting into pipeline.db...");
|
||||
|
||||
const importAll = db.transaction(() => {
|
||||
let synsets = 0;
|
||||
let entries = 0;
|
||||
let translations = 0;
|
||||
let glosses = 0;
|
||||
let examples = 0;
|
||||
let cefrVotes = 0;
|
||||
let skipped = 0;
|
||||
|
||||
for (const record of records) {
|
||||
insertSynset.run(record.source_id, record.pos);
|
||||
synsets++;
|
||||
for (const sense of senses) {
|
||||
const key = `${sense.headword}|${sense.pos}`;
|
||||
const nextIndex = senseIndexMap.get(key) ?? 0;
|
||||
|
||||
// Translations
|
||||
for (const [lang, words] of Object.entries(record.translations)) {
|
||||
const unique = [...new Set(words)];
|
||||
for (const word of unique) {
|
||||
insertTranslation.run(record.source_id, lang, word);
|
||||
translations++;
|
||||
}
|
||||
// Use the offset sense_index to avoid collisions when the same word
|
||||
// appears in multiple JSONL entries with the same POS.
|
||||
const senseIndex = nextIndex;
|
||||
senseIndexMap.set(key, nextIndex + 1);
|
||||
|
||||
const row = insertEntry.get(
|
||||
sense.headword,
|
||||
"en",
|
||||
sense.pos,
|
||||
senseIndex,
|
||||
sense.gloss ?? null,
|
||||
JSON.stringify(sense.examples),
|
||||
) as { id: number } | undefined;
|
||||
|
||||
if (!row) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Glosses
|
||||
for (const [lang, glossList] of Object.entries(record.glosses)) {
|
||||
for (const text of glossList) {
|
||||
insertGloss.run(record.source_id, lang, text);
|
||||
glosses++;
|
||||
}
|
||||
}
|
||||
entries++;
|
||||
|
||||
// Examples
|
||||
for (const [lang, exList] of Object.entries(record.examples)) {
|
||||
for (const example of exList) {
|
||||
insertExample.run(
|
||||
record.source_id,
|
||||
lang,
|
||||
example.text,
|
||||
example.source,
|
||||
);
|
||||
examples++;
|
||||
}
|
||||
}
|
||||
|
||||
// CEFR source votes
|
||||
for (const [lang, langVotes] of Object.entries(record.votes)) {
|
||||
for (const [word, vote] of Object.entries(
|
||||
langVotes as Record<string, { cefr_source: string }>,
|
||||
)) {
|
||||
insertCefrVote.run(record.source_id, lang, word, vote.cefr_source);
|
||||
cefrVotes++;
|
||||
}
|
||||
for (const t of sense.translations) {
|
||||
insertTranslation.run(
|
||||
row.id,
|
||||
t.target_lang,
|
||||
t.word,
|
||||
t.sense_hint ?? null,
|
||||
);
|
||||
translations++;
|
||||
}
|
||||
}
|
||||
|
||||
return { synsets, translations, glosses, examples, cefrVotes };
|
||||
return { entries, translations, skipped };
|
||||
});
|
||||
|
||||
const counts = importAll();
|
||||
|
||||
console.log(` synsets: ${counts.synsets.toLocaleString()}`);
|
||||
console.log(` entries: ${counts.entries.toLocaleString()}`);
|
||||
console.log(` translations: ${counts.translations.toLocaleString()}`);
|
||||
console.log(` glosses: ${counts.glosses.toLocaleString()}`);
|
||||
console.log(` examples: ${counts.examples.toLocaleString()}`);
|
||||
console.log(` cefr votes: ${counts.cefrVotes.toLocaleString()}`);
|
||||
console.log(` skipped: ${counts.skipped.toLocaleString()}`);
|
||||
|
||||
db.close();
|
||||
console.log("\nImport complete.");
|
||||
|
|
@ -189,7 +102,7 @@ export async function importStage2(): Promise<void> {
|
|||
|
||||
export function isImported(): boolean {
|
||||
const db = openDb();
|
||||
const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
|
||||
const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
|
||||
count: number;
|
||||
};
|
||||
db.close();
|
||||
|
|
@ -200,20 +113,20 @@ export function isImported(): boolean {
|
|||
|
||||
async function main(): Promise<void> {
|
||||
const db = openDb();
|
||||
const row = db.prepare(`SELECT COUNT(*) as count FROM synsets`).get() as {
|
||||
const row = db.prepare("SELECT COUNT(*) as count FROM entries").get() as {
|
||||
count: number;
|
||||
};
|
||||
db.close();
|
||||
|
||||
if (row.count > 0) {
|
||||
console.log(
|
||||
`pipeline.db already contains ${row.count.toLocaleString()} synsets — skipping import.`,
|
||||
`pipeline.db already contains ${row.count.toLocaleString()} entries — skipping import.`,
|
||||
);
|
||||
console.log("Delete pipeline.db and re-run db:init to start fresh.");
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
await importStage2();
|
||||
await importKaikki();
|
||||
}
|
||||
|
||||
if (import.meta.url === `file://${process.argv[1]}`) {
|
||||
|
|
|
|||
|
|
@ -1,62 +1,58 @@
|
|||
-- ── Base data ─────────────────────────────────────────────────────────────────
|
||||
-- Imported from stage 2 JSON on first run. Never mutated after import.
|
||||
-- Imported from Kaikki on first run. Never mutated after import.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS synsets (
|
||||
source_id TEXT PRIMARY KEY,
|
||||
pos TEXT NOT NULL
|
||||
CREATE TABLE IF NOT EXISTS entries (
|
||||
id INTEGER PRIMARY KEY,
|
||||
headword TEXT NOT NULL,
|
||||
language TEXT NOT NULL,
|
||||
pos TEXT NOT NULL,
|
||||
sense_index INTEGER NOT NULL DEFAULT 0,
|
||||
gloss TEXT,
|
||||
examples TEXT NOT NULL DEFAULT '[]', -- JSON array of strings
|
||||
source TEXT NOT NULL DEFAULT 'kaikki',
|
||||
UNIQUE (headword, language, pos, sense_index)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS translations (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
language TEXT NOT NULL,
|
||||
word TEXT NOT NULL,
|
||||
UNIQUE (source_id, language, word)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS glosses (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
language TEXT NOT NULL,
|
||||
text TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS examples (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
language TEXT NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
source TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS cefr_source_votes (
|
||||
id INTEGER PRIMARY KEY,
|
||||
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||
cefr_level TEXT NOT NULL,
|
||||
UNIQUE (translation_id)
|
||||
id INTEGER PRIMARY KEY,
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
target_lang TEXT NOT NULL,
|
||||
word TEXT NOT NULL,
|
||||
sense_hint TEXT,
|
||||
source TEXT NOT NULL DEFAULT 'kaikki',
|
||||
UNIQUE (entry_id, target_lang, word)
|
||||
);
|
||||
|
||||
-- ── Status tracking ───────────────────────────────────────────────────────────
|
||||
-- One row per synset per model per stage. Drives resumability.
|
||||
-- One row per entry per model per stage. Drives resumability.
|
||||
-- Sentinel rows use entry_id = 0 for one-time pipeline steps.
|
||||
-- stage: round1 | round2 | tiebreak
|
||||
-- status: pending | complete | needs_review | flagged
|
||||
|
||||
CREATE TABLE IF NOT EXISTS run_status (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL,
|
||||
entry_id INTEGER NOT NULL,
|
||||
model_name TEXT NOT NULL,
|
||||
stage TEXT NOT NULL,
|
||||
status TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
UNIQUE (source_id, model_name, stage)
|
||||
UNIQUE (entry_id, model_name, stage)
|
||||
);
|
||||
|
||||
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
||||
-- One row per translation/language per model. Written atomically per record.
|
||||
-- Written atomically per entry per model.
|
||||
-- Unique constraints enforce one model one vote.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS model_cefr_votes (
|
||||
CREATE TABLE IF NOT EXISTS model_entry_cefr_votes (
|
||||
id INTEGER PRIMARY KEY,
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
model_name TEXT NOT NULL,
|
||||
cefr_level TEXT NOT NULL,
|
||||
UNIQUE (entry_id, model_name)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS model_translation_cefr_votes (
|
||||
id INTEGER PRIMARY KEY,
|
||||
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||
model_name TEXT NOT NULL,
|
||||
|
|
@ -64,38 +60,29 @@ CREATE TABLE IF NOT EXISTS model_cefr_votes (
|
|||
UNIQUE (translation_id, model_name)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS model_translation_rejections (
|
||||
id INTEGER PRIMARY KEY,
|
||||
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||
model_name TEXT NOT NULL,
|
||||
UNIQUE (translation_id, model_name)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS generated_glosses (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
model_name TEXT NOT NULL,
|
||||
language TEXT NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
UNIQUE (source_id, model_name, language)
|
||||
UNIQUE (entry_id, model_name)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS generated_examples (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
model_name TEXT NOT NULL,
|
||||
language TEXT NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
UNIQUE (source_id, model_name, language)
|
||||
UNIQUE (entry_id, model_name)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS generated_descriptions (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
model_name TEXT NOT NULL,
|
||||
language TEXT NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
UNIQUE (source_id, model_name, language)
|
||||
CREATE TABLE IF NOT EXISTS generated_translations (
|
||||
id INTEGER PRIMARY KEY,
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
model_name TEXT NOT NULL,
|
||||
target_lang TEXT NOT NULL,
|
||||
word TEXT NOT NULL,
|
||||
UNIQUE (entry_id, model_name, target_lang)
|
||||
);
|
||||
|
||||
-- ── Round 2 output ────────────────────────────────────────────────────────────
|
||||
|
|
@ -116,20 +103,28 @@ CREATE TABLE IF NOT EXISTS example_candidate_votes (
|
|||
UNIQUE (example_id, model_name)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS description_candidate_votes (
|
||||
CREATE TABLE IF NOT EXISTS translation_candidate_votes (
|
||||
id INTEGER PRIMARY KEY,
|
||||
description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
|
||||
translation_id INTEGER NOT NULL REFERENCES generated_translations(id),
|
||||
model_name TEXT NOT NULL,
|
||||
UNIQUE (description_id, model_name)
|
||||
UNIQUE (translation_id, model_name)
|
||||
);
|
||||
|
||||
-- ── Resolved output ───────────────────────────────────────────────────────────
|
||||
-- Written by merge. Never updated after writing.
|
||||
-- Only fully resolved records are written here — no nulls, no flags.
|
||||
-- Only fully resolved records are written here — no nulls.
|
||||
-- Absence of a row means unresolved. Flagged status tracked in run_status.
|
||||
-- source: omw | cefr | model_name
|
||||
-- source: kaikki | model_name
|
||||
|
||||
CREATE TABLE IF NOT EXISTS resolved_translations (
|
||||
CREATE TABLE IF NOT EXISTS resolved_entry_cefr (
|
||||
id INTEGER PRIMARY KEY,
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
cefr_level TEXT NOT NULL,
|
||||
difficulty TEXT NOT NULL,
|
||||
UNIQUE (entry_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS resolved_translation_cefr (
|
||||
id INTEGER PRIMARY KEY,
|
||||
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||
cefr_level TEXT NOT NULL,
|
||||
|
|
@ -138,27 +133,25 @@ CREATE TABLE IF NOT EXISTS resolved_translations (
|
|||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS resolved_glosses (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
language TEXT NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
source TEXT NOT NULL,
|
||||
UNIQUE (source_id, language)
|
||||
id INTEGER PRIMARY KEY,
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
text TEXT NOT NULL,
|
||||
source TEXT NOT NULL,
|
||||
UNIQUE (entry_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS resolved_examples (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
language TEXT NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
source TEXT NOT NULL
|
||||
id INTEGER PRIMARY KEY,
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
text TEXT NOT NULL,
|
||||
source TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS resolved_descriptions (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||
language TEXT NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
source TEXT NOT NULL,
|
||||
UNIQUE (source_id, language)
|
||||
CREATE TABLE IF NOT EXISTS resolved_generated_translations (
|
||||
id INTEGER PRIMARY KEY,
|
||||
entry_id INTEGER NOT NULL REFERENCES entries(id),
|
||||
target_lang TEXT NOT NULL,
|
||||
word TEXT NOT NULL,
|
||||
source TEXT NOT NULL,
|
||||
UNIQUE (entry_id, target_lang)
|
||||
);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue