From 6f9a42c7073579817733bf130b67c14caa3c0107 Mon Sep 17 00:00:00 2001
From: lila
Date: Tue, 5 May 2026 18:57:55 +0200
Subject: [PATCH] feat: add stage 2 reverse link sync script

---
 data-pipeline/package.json                    |   1 +
 .../scripts/reverse-link.ts                   | 145 ++++++++++++++++++
 2 files changed, 146 insertions(+)
 create mode 100644 data-pipeline/stage-2-reverse-link/scripts/reverse-link.ts

diff --git a/data-pipeline/package.json b/data-pipeline/package.json
index b25f26d..e47936a 100644
--- a/data-pipeline/package.json
+++ b/data-pipeline/package.json
@@ -5,6 +5,7 @@
   "type": "module",
   "scripts": {
     "extract": "tsx stage-1-extract/scripts/extract.ts",
+    "reverse-link": "tsx stage-2-reverse-link/scripts/reverse-link.ts",
     "db:import": "tsx db/import.ts",
     "db:init": "tsx db/init.ts",
     "annotate": "tsx stage-2-annotate/scripts/annotate.ts",
diff --git a/data-pipeline/stage-2-reverse-link/scripts/reverse-link.ts b/data-pipeline/stage-2-reverse-link/scripts/reverse-link.ts
new file mode 100644
index 0000000..da8c9b6
--- /dev/null
+++ b/data-pipeline/stage-2-reverse-link/scripts/reverse-link.ts
@@ -0,0 +1,145 @@
+import { pathToFileURL } from "node:url";
+
+import { openDb } from "../../db/index.js";
+
+// ── Types ─────────────────────────────────────────────────────────────────────
+
+/**
+ * One `translations` row joined with the language and headword of the entry
+ * it belongs to.
+ */
+type TranslationRow = {
+  translation_id: number;
+  entry_id: number;
+  entry_language: string;
+  entry_headword: string;
+  target_lang: string;
+  word: string;
+  sense_hint: string | null;
+};
+
+/** Projection of an `entries` row down to its `id` column. */
+type EntryRow = { id: number };
+
+// ── Sync ──────────────────────────────────────────────────────────────────────
+
+/**
+ * Adds the missing reverse direction for every existing translation.
+ *
+ * For each translation "source entry → target_lang: word", looks up an entry
+ * whose headword is `word` and whose language is `target_lang`. When one
+ * exists, inserts a translation on that entry pointing back at the source
+ * entry's language and headword, with `source` set to 'reverse_link'. Rows
+ * that already exist are left untouched (ON CONFLICT ... DO NOTHING).
+ *
+ * All inserts run inside a single transaction, and the database handle is
+ * closed even when the sync throws. Counts are written to stdout.
+ */
+export function reverseLink(): void {
+  const db = openDb();
+
+  try {
+    // Find all translations and their source entry details
+    const translations = db
+      .prepare(
+        `SELECT
+           t.id AS translation_id,
+           t.entry_id,
+           e.language AS entry_language,
+           e.headword AS entry_headword,
+           t.target_lang,
+           t.word,
+           t.sense_hint
+         FROM translations t
+         JOIN entries e ON e.id = t.entry_id`,
+      )
+      .all() as TranslationRow[];
+
+    console.log(
+      ` Found ${translations.length.toLocaleString()} translations to check`,
+    );
+
+    // ORDER BY keeps the pick stable when several entries share a headword;
+    // a bare LIMIT 1 would return whichever row the query plan visits first.
+    const findEntry = db.prepare(
+      `SELECT id FROM entries
+       WHERE headword = ? AND language = ?
+       ORDER BY id
+       LIMIT 1`,
+    );
+
+    const insertReverseLink = db.prepare(
+      `INSERT INTO translations (entry_id, target_lang, word, sense_hint, source)
+       VALUES (?, ?, ?, ?, 'reverse_link')
+       ON CONFLICT (entry_id, target_lang, word) DO NOTHING`,
+    );
+
+    const sync = db.transaction(() => {
+      let inserted = 0;
+      let skipped = 0;
+      let noEntry = 0;
+
+      for (const t of translations) {
+        // Look for an entry in the target language with the translation word as headword
+        const targetEntry = findEntry.get(t.word, t.target_lang) as
+          | EntryRow
+          | undefined;
+
+        if (!targetEntry) {
+          noEntry++;
+          continue;
+        }
+
+        // Insert reverse link: target entry → source language → source headword
+        const result = insertReverseLink.run(
+          targetEntry.id,
+          t.entry_language,
+          t.entry_headword,
+          t.sense_hint,
+        );
+
+        // `changes` is 0 when ON CONFLICT suppressed the insert
+        if (result.changes > 0) {
+          inserted++;
+        } else {
+          skipped++;
+        }
+      }
+
+      return { inserted, skipped, noEntry };
+    });
+
+    const counts = sync();
+
+    console.log(` Inserted: ${counts.inserted.toLocaleString()} reverse links`);
+    console.log(
+      ` Skipped: ${counts.skipped.toLocaleString()} (already existed)`,
+    );
+    console.log(
+      ` No entry: ${counts.noEntry.toLocaleString()} (target word not in entries)`,
+    );
+  } finally {
+    // Release the handle on both the success and the failure path
+    db.close();
+  }
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+/** CLI entry point: runs the sync between a start and a completion message. */
+function main(): void {
+  console.log("Running reverse link sync...");
+  reverseLink();
+  console.log("\nReverse link sync complete.");
+}
+
+// Run only when executed directly. pathToFileURL yields the same encoded
+// file URL form as import.meta.url, so the comparison also holds for Windows
+// paths and paths containing spaces or other escaped characters.
+const invokedPath = process.argv[1];
+if (
+  invokedPath !== undefined &&
+  import.meta.url === pathToFileURL(invokedPath).href
+) {
+  main();
+}