Compare commits
2 commits
6007fe1e38
...
4fa3073412
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4fa3073412 | ||
|
|
74cfc82bdd |
14 changed files with 370 additions and 41 deletions
24
data-pipeline/db/index.ts
Normal file
24
data-pipeline/db/index.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
import path from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import Database from "better-sqlite3";
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
|
const DB_PATH = path.join(__dirname, "pipeline.db");
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export type Db = InstanceType<typeof Database>;
|
||||||
|
|
||||||
|
// ── Open ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export function openDb(): Db {
|
||||||
|
const db = new Database(DB_PATH);
|
||||||
|
|
||||||
|
db.pragma("journal_mode = WAL");
|
||||||
|
db.pragma("foreign_keys = ON");
|
||||||
|
|
||||||
|
return db;
|
||||||
|
}
|
||||||
39
data-pipeline/db/init.ts
Normal file
39
data-pipeline/db/init.ts
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import Database from "better-sqlite3";
|
||||||
|
|
||||||
|
// ── Paths ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
|
const PATHS = {
|
||||||
|
schema: path.join(__dirname, "schema.sql"),
|
||||||
|
db: path.join(__dirname, "pipeline.db"),
|
||||||
|
};
|
||||||
|
|
||||||
|
// ── Init ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
export async function initDb(): Promise<void> {
|
||||||
|
const schema = await fs.readFile(PATHS.schema, "utf-8");
|
||||||
|
const db = new Database(PATHS.db);
|
||||||
|
|
||||||
|
db.pragma("journal_mode = WAL");
|
||||||
|
db.pragma("foreign_keys = ON");
|
||||||
|
db.exec(schema);
|
||||||
|
db.close();
|
||||||
|
|
||||||
|
console.log(` pipeline.db initialised → ${PATHS.db}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async function main(): Promise<void> {
|
||||||
|
console.log("Initialising pipeline.db...");
|
||||||
|
await initDb();
|
||||||
|
}
|
||||||
|
|
||||||
|
main().catch((err) => {
|
||||||
|
console.error(err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
BIN
data-pipeline/db/pipeline.db
Normal file
BIN
data-pipeline/db/pipeline.db
Normal file
Binary file not shown.
157
data-pipeline/db/schema.sql
Normal file
157
data-pipeline/db/schema.sql
Normal file
|
|
@ -0,0 +1,157 @@
|
||||||
|
-- ── Base data ─────────────────────────────────────────────────────────────────
|
||||||
|
-- Imported from stage 2 JSON on first run. Never mutated after import.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS synsets (
|
||||||
|
source_id TEXT PRIMARY KEY,
|
||||||
|
pos TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS translations (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
word TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, language, word)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS glosses (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS examples (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS cefr_source_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||||
|
cefr_level TEXT NOT NULL,
|
||||||
|
UNIQUE (translation_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ── Status tracking ───────────────────────────────────────────────────────────
|
||||||
|
-- One row per synset per model per stage. Drives resumability.
|
||||||
|
-- stage: round1 | round2 | tiebreak
|
||||||
|
-- status: pending | complete | needs_review | flagged
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS run_status (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
stage TEXT NOT NULL,
|
||||||
|
status TEXT NOT NULL,
|
||||||
|
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||||
|
UNIQUE (source_id, model_name, stage)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ── Round 1 output ────────────────────────────────────────────────────────────
|
||||||
|
-- One row per translation/language per model. Written atomically per record.
|
||||||
|
-- Unique constraints enforce one model one vote.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS model_cefr_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
cefr_level TEXT NOT NULL,
|
||||||
|
UNIQUE (translation_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS generated_glosses (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, model_name, language)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS generated_examples (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, model_name, language)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS generated_descriptions (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, model_name, language)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ── Round 2 output ────────────────────────────────────────────────────────────
|
||||||
|
-- Each row represents one model voting for one candidate.
|
||||||
|
-- The candidate with the most votes wins in merge.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS gloss_candidate_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
gloss_id INTEGER NOT NULL REFERENCES generated_glosses(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
UNIQUE (gloss_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS example_candidate_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
example_id INTEGER NOT NULL REFERENCES generated_examples(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
UNIQUE (example_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS description_candidate_votes (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
description_id INTEGER NOT NULL REFERENCES generated_descriptions(id),
|
||||||
|
model_name TEXT NOT NULL,
|
||||||
|
UNIQUE (description_id, model_name)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- ── Resolved output ───────────────────────────────────────────────────────────
|
||||||
|
-- Written by merge. Never updated after writing.
|
||||||
|
-- Only fully resolved records are written here — no nulls, no flags.
|
||||||
|
-- Absence of a row means unresolved. Flagged status tracked in run_status.
|
||||||
|
-- source: omw | cefr | model_name
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_translations (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
translation_id INTEGER NOT NULL REFERENCES translations(id),
|
||||||
|
cefr_level TEXT NOT NULL,
|
||||||
|
difficulty TEXT NOT NULL,
|
||||||
|
UNIQUE (translation_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_glosses (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, language)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_examples (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS resolved_descriptions (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
source_id TEXT NOT NULL REFERENCES synsets(source_id),
|
||||||
|
language TEXT NOT NULL,
|
||||||
|
text TEXT NOT NULL,
|
||||||
|
source TEXT NOT NULL,
|
||||||
|
UNIQUE (source_id, language)
|
||||||
|
);
|
||||||
|
|
@ -3,7 +3,11 @@
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {},
|
"scripts": {
|
||||||
|
"db:init": "tsx db/init.ts",
|
||||||
|
"test": "vitest run",
|
||||||
|
"test:watch": "vitest"
|
||||||
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@lila/shared": "workspace:*",
|
"@lila/shared": "workspace:*",
|
||||||
"better-sqlite3": "^12.9.0"
|
"better-sqlite3": "^12.9.0"
|
||||||
|
|
@ -12,6 +16,7 @@
|
||||||
"@types/better-sqlite3": "^7.6.13",
|
"@types/better-sqlite3": "^7.6.13",
|
||||||
"@types/node": "^24.12.0",
|
"@types/node": "^24.12.0",
|
||||||
"tsx": "^4.21.0",
|
"tsx": "^4.21.0",
|
||||||
"typescript": "^5.9.3"
|
"typescript": "^5.9.3",
|
||||||
|
"vitest": "^4.1.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -154,7 +154,7 @@ async function loadAnnotated(): Promise<AnnotatedRecord[]> {
|
||||||
for (const [l, examples] of Object.entries(record.examples)) {
|
for (const [l, examples] of Object.entries(record.examples)) {
|
||||||
const lang = l as SupportedLanguageCode;
|
const lang = l as SupportedLanguageCode;
|
||||||
if (!base.examples[lang]) {
|
if (!base.examples[lang]) {
|
||||||
base.examples[lang] = examples as Example[];
|
base.examples[lang] = examples;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -8,5 +8,5 @@
|
||||||
"types": ["node"]
|
"types": ["node"]
|
||||||
},
|
},
|
||||||
"references": [{ "path": "../packages/shared" }],
|
"references": [{ "path": "../packages/shared" }],
|
||||||
"include": ["./**/*"]
|
"include": ["./**/*", "vitest.config.ts"]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
10
data-pipeline/vitest.config.ts
Normal file
10
data-pipeline/vitest.config.ts
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
import { defineConfig } from "vitest/config";
|
||||||
|
|
||||||
|
export default defineConfig({
|
||||||
|
test: {
|
||||||
|
environment: "node",
|
||||||
|
globals: true,
|
||||||
|
include: ["tests/**/*.test.ts"],
|
||||||
|
exclude: ["**/dist/**", "**/node_modules/**"],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
@ -7,7 +7,7 @@
|
||||||
> (huggingface.co/UniversalCEFR) AND CEFR-J
|
> (huggingface.co/UniversalCEFR) AND CEFR-J
|
||||||
> (github.com/openlanguageprofiles/olp-en-cefrj) AS STARTING POINTS.**
|
> (github.com/openlanguageprofiles/olp-en-cefrj) AS STARTING POINTS.**
|
||||||
|
|
||||||
This pipeline extracts vocabulary data from the Open Multilingual Wordnet (OMW), annotates it with CEFR levels from curated source files, verifies and enriches annotations using local LLMs, and produces authoritative JSON files per language. These files are consumed by the seeder in `packages/db` to populate the database with terms, translations, glosses, CEFR levels, difficulty ratings, and LLM-generated descriptions.
|
This pipeline extracts vocabulary data from the Open Multilingual Wordnet (OMW), annotates it with CEFR levels from curated source files, verifies and enriches annotations using local LLMs, and produces authoritative output in `pipeline.db`. This database is consumed by the sync script to populate the production database with terms, translations, glosses, CEFR levels, difficulty ratings, and LLM-generated descriptions.
|
||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
|
|
@ -20,6 +20,7 @@ flowchart LR
|
||||||
enrich[Enrich]
|
enrich[Enrich]
|
||||||
pipelinedb[(pipeline.db)]
|
pipelinedb[(pipeline.db)]
|
||||||
merge[Merge]
|
merge[Merge]
|
||||||
|
tiebreak[Tiebreak]
|
||||||
compare[Compare]
|
compare[Compare]
|
||||||
sync[Sync]
|
sync[Sync]
|
||||||
db[(PostgreSQL)]
|
db[(PostgreSQL)]
|
||||||
|
|
@ -31,6 +32,8 @@ flowchart LR
|
||||||
enrich --> pipelinedb
|
enrich --> pipelinedb
|
||||||
pipelinedb --> merge
|
pipelinedb --> merge
|
||||||
merge --> pipelinedb
|
merge --> pipelinedb
|
||||||
|
pipelinedb --> tiebreak
|
||||||
|
tiebreak --> pipelinedb
|
||||||
pipelinedb --> compare
|
pipelinedb --> compare
|
||||||
pipelinedb --> sync
|
pipelinedb --> sync
|
||||||
sync --> db
|
sync --> db
|
||||||
|
|
@ -42,7 +45,26 @@ Stage 1 is a manual prerequisite and is not run by the pipeline orchestrator. Se
|
||||||
|
|
||||||
The enrich stage is designed to run overnight, one model at a time. Each model processes every word and writes results to `pipeline.db` atomically per record — interrupted runs resume from the last unprocessed record.
|
The enrich stage is designed to run overnight, one model at a time. Each model processes every word and writes results to `pipeline.db` atomically per record — interrupted runs resume from the last unprocessed record.
|
||||||
|
|
||||||
Only fully resolved records reach the database. Records where LLMs could not reach a majority vote are marked `flagged` in `pipeline.db` and wait for manual review before syncing.
|
Only fully resolved records reach the production database. Records where LLMs could not reach a majority vote are handled automatically by the tiebreaker stage before seeding.
|
||||||
|
|
||||||
|
## pipeline.db
|
||||||
|
|
||||||
|
All pipeline state from stage 3 onwards is stored in `pipeline.db` — a SQLite
|
||||||
|
database in `data-pipeline/db/`. It is created automatically on first run and
|
||||||
|
is not committed to git.
|
||||||
|
|
||||||
|
The database serves three purposes:
|
||||||
|
|
||||||
|
- **Resumability** — every record is written atomically with a status. Interrupted
|
||||||
|
overnight runs resume from the last pending record without losing work.
|
||||||
|
- **Vote tracking** — all model votes for CEFR levels and generated text are
|
||||||
|
stored per model per record, giving full auditability of how every decision
|
||||||
|
was reached.
|
||||||
|
- **Resolved output** — the final resolved records live here and are read by
|
||||||
|
the sync script to seed the production database.
|
||||||
|
|
||||||
|
The schema is defined in `data-pipeline/db/schema.sql`. Never edit `pipeline.db`
|
||||||
|
directly — all writes go through the pipeline scripts.
|
||||||
|
|
||||||
## Data sources
|
## Data sources
|
||||||
|
|
||||||
|
|
@ -77,7 +99,9 @@ CEFR levels are determined by a majority vote combining all available sources:
|
||||||
|
|
||||||
The LLMs verify existing annotations as well as filling gaps — a source file entry does not automatically win. Majority vote across all sources determines the final level.
|
The LLMs verify existing annotations as well as filling gaps — a source file entry does not automatically win. Majority vote across all sources determines the final level.
|
||||||
|
|
||||||
If no majority is reached, the word is flagged for manual review and excluded from the database until resolved.
|
Words appearing in the CEFR source file multiple times with different CEFR levels are written to `conflicts.json` and excluded from `cefr_source_votes`. They are still present in `translations` and the LLMs vote on them like any other unannotated word — the conflict is resolved by majority vote.
|
||||||
|
|
||||||
|
If no majority is reached after all model runs, the word is handled automatically by the tiebreaker stage.
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
|
|
@ -99,19 +123,21 @@ to git.
|
||||||
|
|
||||||
### LLM setup
|
### LLM setup
|
||||||
|
|
||||||
See `LLM-SETUP.md`.
|
See `llm-setup.md`.
|
||||||
|
|
||||||
## Pipeline stages
|
## Pipeline stages
|
||||||
|
|
||||||
The pipeline runs in five stages. Each stage is independent and can be re-run without affecting the others.
|
The pipeline runs in six stages plus a tiebreaker. Each stage is independent and can be re-run without affecting the others.
|
||||||
|
|
||||||
| Stage | What it does |
|
| Stage | What it does |
|
||||||
| ----------- | -------------------------------------------------------------------- |
|
| ------------ | -------------------------------------------------------------------- |
|
||||||
| 1. Extract | Reads OMW SQLite database, outputs normalized JSON per language |
|
| 1. Extract | Reads OMW SQLite database, outputs normalized JSON per language |
|
||||||
| 2. Annotate | Merges CEFR source files into extracted data, adds source file votes |
|
| 2. Annotate | Merges CEFR source files into extracted data, adds source file votes |
|
||||||
| 3. Enrich | Runs local LLMs in two rounds — generation then voting |
|
| 3. Enrich | Runs local LLMs in two rounds — generation then voting |
|
||||||
| 4. Merge | Resolves votes, derives difficulty, splits into final and flagged |
|
| 4. Merge | Resolves votes, derives difficulty, splits into final and flagged |
|
||||||
| 5. Compare | Generates COVERAGE.md with detailed quality report |
|
| 4b. Tiebreak | Runs unused models on flagged translations until majority is reached |
|
||||||
|
| 5. Compare | Generates COVERAGE.md with detailed quality report |
|
||||||
|
| 6. Sync | Upserts resolved records into production PostgreSQL |
|
||||||
|
|
||||||
### 1. Extract
|
### 1. Extract
|
||||||
|
|
||||||
|
|
@ -156,18 +182,17 @@ Note: glosses and examples are not available for all languages. French and Spani
|
||||||
|
|
||||||
### 2. Annotate
|
### 2. Annotate
|
||||||
|
|
||||||
Reads the combined OMW extract and merges CEFR source data into it. Each translation in each language is matched against the corresponding CEFR source
|
Reads the combined OMW extract and merges CEFR source data into it. Each translation in each language is matched against the corresponding CEFR source file by word text and part of speech. Matched translations receive a `cefr_source` vote which carries into the enrich stage. Unmatched translations proceed without a vote.
|
||||||
file by word text and part of speech. Matched translations receive a `cefr_source` vote which carries into the enrich stage. Unmatched translations proceed without a vote.
|
|
||||||
|
|
||||||
This stage also extracts native example sentences from the CEFR source files and adds them to the record alongside OMW examples, with `source: "cefr"` to distinguish them.
|
This stage also extracts native example sentences from the CEFR source files and adds them to the record alongside OMW examples, with `source: "cefr"` to distinguish them.
|
||||||
|
|
||||||
Words appearing in the CEFR source file multiple times with different CEFR levels are written to `conflicts.json` for manual review and excluded from voting until resolved.
|
Words appearing in the CEFR source file multiple times with different CEFR levels are written to `conflicts.json` and excluded from source voting. The LLMs handle these words like any other unannotated word.
|
||||||
|
|
||||||
**Input:** `stage-1-extract/output/omw.json` + `stage-2-annotate/sources/cefr/{lang}.json`
|
**Input:** `stage-1-extract/output/omw.json` + `stage-2-annotate/sources/cefr/{lang}.json`
|
||||||
**Output:**
|
**Output:**
|
||||||
|
|
||||||
- `stage-2-annotate/output/{lang}.json` — one per language
|
- `stage-2-annotate/output/{lang}.json` — one per language
|
||||||
- `stage-2-annotate/output/conflicts.json` — cross-language conflicts for review
|
- `stage-2-annotate/output/conflicts.json` — cross-language conflicts for reference
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pnpm --filter @lila/pipeline annotate
|
pnpm --filter @lila/pipeline annotate
|
||||||
|
|
@ -293,11 +318,10 @@ every field. Updates each record in `pipeline.db` with status `final` or
|
||||||
|
|
||||||
- OMW data wins automatically and is never overridden
|
- OMW data wins automatically and is never overridden
|
||||||
- For CEFR levels: the level with the most votes wins. If no majority is
|
- For CEFR levels: the level with the most votes wins. If no majority is
|
||||||
reached, that translation is flagged
|
reached, that translation is flagged for the tiebreaker
|
||||||
- For LLM-generated text fields (gloss, examples, descriptions): the
|
- For LLM-generated text fields (gloss, examples, descriptions): the
|
||||||
candidate with the most votes wins
|
candidate with the most votes wins. If no majority is reached, the
|
||||||
|
tiebreaker runs for that record as well
|
||||||
<!-- TODO: decide fallback strategy when no majority is reached for text fields -->
|
|
||||||
|
|
||||||
**Difficulty mapping:**
|
**Difficulty mapping:**
|
||||||
|
|
||||||
|
|
@ -314,11 +338,26 @@ every field. Updates each record in `pipeline.db` with status `final` or
|
||||||
pnpm --filter @lila/pipeline merge
|
pnpm --filter @lila/pipeline merge
|
||||||
```
|
```
|
||||||
|
|
||||||
**Resolving flagged words:**
|
### 4b. Tiebreak
|
||||||
|
|
||||||
Query `pipeline.db` for all records with status `flagged`, manually set the
|
Runs automatically after merge if any translations remain flagged. The script
|
||||||
correct `cefr_level` and `difficulty` for each flagged translation, and update
|
queries `pipeline.db` for flagged translations, identifies which configured
|
||||||
the record status to `final`. Re-run the sync script after resolving.
|
models have not yet voted on each word, and runs those models on the flagged
|
||||||
|
subset only. Merge is re-run after each tiebreaker pass. This repeats until
|
||||||
|
all flagged translations are resolved or no unused models remain.
|
||||||
|
|
||||||
|
If unused models are exhausted and flagged translations remain, the script
|
||||||
|
logs a detailed report showing the exact vote split for each unresolved word
|
||||||
|
and lists available models from OpenRouter that have not been used. Seeding
|
||||||
|
is blocked until all translations are resolved. To continue, add one or more
|
||||||
|
models to the config and re-run the pipeline — the tiebreaker will pick up
|
||||||
|
automatically.
|
||||||
|
|
||||||
|
**Input:** `pipeline.db` — flagged translations from merge
|
||||||
|
**Output:** `pipeline.db` — flagged translations resolved to `final`
|
||||||
|
|
||||||
|
> **Note:** The tiebreaker is not a standalone script. It runs automatically
|
||||||
|
> as part of the pipeline orchestrator after merge completes.
|
||||||
|
|
||||||
### 5. Compare / QA
|
### 5. Compare / QA
|
||||||
|
|
||||||
|
|
@ -326,7 +365,7 @@ Read-only. Generates `COVERAGE.md` with a full breakdown of the pipeline
|
||||||
output quality per language. Run this after merge to verify output before
|
output quality per language. Run this after merge to verify output before
|
||||||
syncing to the database.
|
syncing to the database.
|
||||||
|
|
||||||
**Input:** `pipeline.db` — records with status `final` and `flagged`
|
**Input:** `pipeline.db` — records with status `final`
|
||||||
**Output:** `COVERAGE.md`
|
**Output:** `COVERAGE.md`
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
@ -363,7 +402,7 @@ already exists in the target database:
|
||||||
- **Present but changed** → update
|
- **Present but changed** → update
|
||||||
- **Present and unchanged** → skip
|
- **Present and unchanged** → skip
|
||||||
|
|
||||||
Run this after merge and after manually resolving any flagged entries.
|
Run this after all records are resolved and Compare / QA has been reviewed.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pnpm --filter @lila/pipeline sync
|
pnpm --filter @lila/pipeline sync
|
||||||
|
|
@ -372,6 +411,41 @@ pnpm --filter @lila/pipeline sync
|
||||||
The sync script requires a connection string to the target database. Set
|
The sync script requires a connection string to the target database. Set
|
||||||
`DATABASE_URL` in your `.env` file before running.
|
`DATABASE_URL` in your `.env` file before running.
|
||||||
|
|
||||||
|
## Reports
|
||||||
|
|
||||||
|
The pipeline generates a report at the end of every run. Reports are written
|
||||||
|
to `data-pipeline/reports/` as a JSON file and a markdown file with the same
|
||||||
|
name. The markdown is generated from the JSON and contains identical data.
|
||||||
|
|
||||||
|
```
|
||||||
|
data-pipeline/reports/
|
||||||
|
2026-05-03_night-1.json
|
||||||
|
2026-05-03_night-1.md
|
||||||
|
```
|
||||||
|
|
||||||
|
The report name is provided when starting the pipeline:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pnpm --filter @lila/pipeline run --name "night-1"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Nightly report** contains:
|
||||||
|
|
||||||
|
- Records processed this run vs total
|
||||||
|
- Records remaining per stage
|
||||||
|
- Average processing speed and estimated nights remaining
|
||||||
|
- `needs_review` count — records that failed structural validation
|
||||||
|
- Per-model progress breakdown
|
||||||
|
|
||||||
|
**Final report** (generated when all records are processed) additionally contains:
|
||||||
|
|
||||||
|
- Full vote breakdown per model
|
||||||
|
- Flagged translations with exact vote splits
|
||||||
|
- Available unused models from OpenRouter for tiebreaking
|
||||||
|
- Per-model quality metrics — CEFR agreement rate, field coverage, JSON parse rate
|
||||||
|
|
||||||
|
Reports are not committed to git.
|
||||||
|
|
||||||
## Adding a new language
|
## Adding a new language
|
||||||
|
|
||||||
1. Add the language code to `SUPPORTED_LANGUAGE_CODES` in `packages/shared/src/constants.ts`
|
1. Add the language code to `SUPPORTED_LANGUAGE_CODES` in `packages/shared/src/constants.ts`
|
||||||
|
|
@ -424,9 +498,9 @@ dataset matures:
|
||||||
## Roadmap
|
## Roadmap
|
||||||
|
|
||||||
**Current state:** Stages 1 and 2 are complete and output has been reviewed
|
**Current state:** Stages 1 and 2 are complete and output has been reviewed
|
||||||
for all five languages. Architecture for stages 3–5 and the sync script is
|
for all five languages. Architecture for stages 3–6, the tiebreaker, and the
|
||||||
finalised. Stage 3 scripts have not been written yet and llama.cpp is not
|
report system are finalised. Stage 3 scripts have not been written yet and
|
||||||
installed.
|
llama.cpp is not installed.
|
||||||
|
|
||||||
**Next action:** Write the stage 3 round 1 script.
|
**Next action:** Write the stage 3 round 1 script.
|
||||||
|
|
||||||
|
|
@ -436,7 +510,9 @@ installed.
|
||||||
| 2. Annotate | ✅ complete |
|
| 2. Annotate | ✅ complete |
|
||||||
| 3. Enrich | 🔲 not started |
|
| 3. Enrich | 🔲 not started |
|
||||||
| 4. Merge | 🔲 not started |
|
| 4. Merge | 🔲 not started |
|
||||||
|
| 4b. Tiebreak | 🔲 not started |
|
||||||
| 5. Compare / QA | 🔲 not started |
|
| 5. Compare / QA | 🔲 not started |
|
||||||
|
| 6. Sync | 🔲 not started |
|
||||||
|
|
||||||
### Stage 1 — Extract `✅ complete`
|
### Stage 1 — Extract `✅ complete`
|
||||||
|
|
||||||
|
|
@ -452,6 +528,7 @@ installed.
|
||||||
|
|
||||||
**Next action:** Write the round 1 generation script.
|
**Next action:** Write the round 1 generation script.
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 3
|
||||||
- [ ] Write round 1 script (generation)
|
- [ ] Write round 1 script (generation)
|
||||||
- [ ] Write compile-candidates script
|
- [ ] Write compile-candidates script
|
||||||
- [ ] Write round 2 script (voting)
|
- [ ] Write round 2 script (voting)
|
||||||
|
|
@ -461,21 +538,38 @@ installed.
|
||||||
- [ ] Run full 100-record sample, collect metrics
|
- [ ] Run full 100-record sample, collect metrics
|
||||||
- [ ] Compare providers (local vs OpenRouter free models)
|
- [ ] Compare providers (local vs OpenRouter free models)
|
||||||
- [ ] Production run — all records, all models
|
- [ ] Production run — all records, all models
|
||||||
- [ ] Compile candidates → `stage-3-enrich/output/candidates/{lang}_candidates.json`
|
- [ ] Compile candidates → `pipeline.db`
|
||||||
- [ ] Compile votes → `stage-3-enrich/output/votes/{lang}_votes.json`
|
- [ ] Compile votes → `pipeline.db`
|
||||||
|
|
||||||
### Stage 4 — Merge `🔲 not started`
|
### Stage 4 — Merge `🔲 not started`
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 3
|
||||||
- [ ] Write merge script
|
- [ ] Write merge script
|
||||||
- [ ] Run merge → `final/` and `flagged/`
|
- [ ] Run merge → `pipeline.db`
|
||||||
- [ ] Manually resolve flagged entries
|
- [ ] Confirm tiebreaker resolves all flagged translations
|
||||||
|
|
||||||
|
### Stage 4b — Tiebreak `🔲 not started`
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 3
|
||||||
|
- [ ] Write tiebreak logic
|
||||||
|
- [ ] Run tiebreaker for all flagged translations
|
||||||
|
- [ ] Confirm no flagged translations remain before seeding
|
||||||
|
|
||||||
### Stage 5 — Compare / QA `🔲 not started`
|
### Stage 5 — Compare / QA `🔲 not started`
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 3
|
||||||
- [ ] Write compare script
|
- [ ] Write compare script
|
||||||
- [ ] Run compare → `COVERAGE.md`
|
- [ ] Run compare → `COVERAGE.md`
|
||||||
- [ ] Review output quality before seeding
|
- [ ] Review output quality before seeding
|
||||||
|
|
||||||
|
### Stage 6 — Sync `🔲 not started`
|
||||||
|
|
||||||
|
- [ ] Write tests for stage 3
|
||||||
|
- [ ] Write sync script
|
||||||
|
- [ ] Configure `DATABASE_URL` in `.env`
|
||||||
|
- [ ] Run sync → production PostgreSQL
|
||||||
|
- [ ] Verify seeded data in production
|
||||||
|
|
||||||
### Utilities
|
### Utilities
|
||||||
|
|
||||||
**`test/`** — Runs the pipeline against a small sample to produce human-readable output for a quick sanity check before committing to a full run. Run this after any script change before running the full pipeline.
|
**`test/`** — Runs the pipeline against a small sample to produce human-readable output for a quick sanity check before committing to a full run. Run this after any script change before running the full pipeline.
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,6 @@ export default defineConfig([
|
||||||
"node_modules/",
|
"node_modules/",
|
||||||
"routeTree.gen.ts",
|
"routeTree.gen.ts",
|
||||||
"scripts/**",
|
"scripts/**",
|
||||||
"data-pipeline/**/*",
|
|
||||||
]),
|
]),
|
||||||
|
|
||||||
eslint.configs.recommended,
|
eslint.configs.recommended,
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@
|
||||||
"prettier --write"
|
"prettier --write"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"packageManager": "pnpm@10.33.1",
|
"packageManager": "pnpm@10.33.2",
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@eslint/js": "^10.0.1",
|
"@eslint/js": "^10.0.1",
|
||||||
"@tanstack/eslint-plugin-router": "^1.161.6",
|
"@tanstack/eslint-plugin-router": "^1.161.6",
|
||||||
|
|
|
||||||
5
pnpm-lock.yaml
generated
5
pnpm-lock.yaml
generated
|
|
@ -173,6 +173,9 @@ importers:
|
||||||
typescript:
|
typescript:
|
||||||
specifier: ^5.9.3
|
specifier: ^5.9.3
|
||||||
version: 5.9.3
|
version: 5.9.3
|
||||||
|
vitest:
|
||||||
|
specifier: ^4.1.0
|
||||||
|
version: 4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))
|
||||||
|
|
||||||
packages/db:
|
packages/db:
|
||||||
dependencies:
|
dependencies:
|
||||||
|
|
@ -4391,7 +4394,6 @@ snapshots:
|
||||||
magic-string: 0.30.21
|
magic-string: 0.30.21
|
||||||
optionalDependencies:
|
optionalDependencies:
|
||||||
vite: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)
|
vite: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)
|
||||||
optional: true
|
|
||||||
|
|
||||||
'@vitest/mocker@4.1.0(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))':
|
'@vitest/mocker@4.1.0(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3))':
|
||||||
dependencies:
|
dependencies:
|
||||||
|
|
@ -6136,7 +6138,6 @@ snapshots:
|
||||||
jsdom: 29.0.1(@noble/hashes@2.2.0)
|
jsdom: 29.0.1(@noble/hashes@2.2.0)
|
||||||
transitivePeerDependencies:
|
transitivePeerDependencies:
|
||||||
- msw
|
- msw
|
||||||
optional: true
|
|
||||||
|
|
||||||
vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)):
|
vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.3)):
|
||||||
dependencies:
|
dependencies:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue