adding documentation for the llm setup for the data pipeline

2026-04-21 13:22:27 +02:00 · 2026-04-21 13:22:27 +02:00 · 849fcdad86
commit 849fcdad86
parent 214a597e99
3 changed files with 4992 additions and 0 deletions
--- a/data-pipeline/test/output/sample.json
+++ b/data-pipeline/test/output/sample.json
--- a/data-pipeline/test/scripts/sample.ts
+++ b/data-pipeline/test/scripts/sample.ts
@ -0,0 +1,205 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
 import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
 // ── Types ─────────────────────────────────────────────────────────────────────
 /** An optional value of type `T` for each supported language code. */
 type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;
 /** One example sentence, tagged with the source label it carries. */
 type Example = { text: string; source: "omw" | "cefr" };
 /** Payload stored under each string key of a language's vote map. */
 type CefrVote = { cefr_source: string };
 /**
  * A record as read from the stage-2 annotated JSON files: an id, a part of
  * speech, and per-language translations, glosses, examples and votes.
  */
 type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  votes: PerLanguage<Record<string, CefrVote>>;
 };
 /** An annotated record plus the name of the bucket that selected it. */
 type SampleRecord = AnnotatedRecord & { _sample_bucket: string };
 // ── Constants ─────────────────────────────────────────────────────────────────
 // Number of records requested per bucket. The pos_spread sampler divides
 // this same budget evenly across its parts of speech.
 const BUCKET_SIZE = 20;
 // Input directory and output file. Both are relative, so they resolve
 // against the working directory the script is started from.
 const PATHS = {
  output: "test/output/sample.json",
  annotatedDir: "stage-2-annotate/output",
 };
 // ── Bucket predicates ─────────────────────────────────────────────────────────
 /** A named sampling category plus the test that decides membership. */
 type Bucket = { name: string; predicate: (record: AnnotatedRecord) => boolean };
 /**
  * True when a per-language entry is present AND non-empty.
  *
  * An empty array or empty object counts as "no data". Every bucket goes
  * through this one check so they all agree on what "empty" means; plain
  * truthiness would treat `[]` and `{}` as data.
  */
 function hasEntries(
  entry: unknown[] | Record<string, unknown> | null | undefined,
 ): boolean {
  if (entry == null) return false;
  return Array.isArray(entry)
    ? entry.length > 0
    : Object.keys(entry).length > 0;
 }
 /** True when at least one language in the map holds a non-empty entry. */
 function anyLanguageHasEntries(
  byLanguage: Partial<Record<string, unknown[] | Record<string, unknown>>>,
 ): boolean {
  return Object.values(byLanguage).some((entry) => hasEntries(entry));
 }
 /**
  * Sampling buckets, in the order they are filled.
  *
  * `has_cefr_vote` and `no_cefr_vote` are exact complements of each other.
  */
 const BUCKETS: Bucket[] = [
  {
    name: "has_cefr_vote",
    predicate: (r) => anyLanguageHasEntries(r.votes),
  },
  {
    name: "no_cefr_vote",
    predicate: (r) => !anyLanguageHasEntries(r.votes),
  },
  {
    name: "has_glosses_and_examples",
    // Requires real content: a language key holding an empty array does not
    // count as having glosses or examples.
    predicate: (r) =>
      anyLanguageHasEntries(r.glosses) && anyLanguageHasEntries(r.examples),
  },
  {
    name: "no_glosses_no_examples",
    // Despite the name, votes must be absent as well as glosses and examples,
    // and only fr and es are inspected.
    // NOTE(review): presumably these are the languages with the sparsest
    // source data; confirm the intent.
    predicate: (r) =>
      (["fr", "es"] as const).every(
        (lang) =>
          !hasEntries(r.glosses[lang]) &&
          !hasEntries(r.examples[lang]) &&
          !hasEntries(r.votes[lang]),
      ),
  },
  {
    name: "pos_spread",
    predicate: () => true, // sampled separately to ensure POS coverage
  },
 ];
 // ── Sampling ──────────────────────────────────────────────────────────────────
 /**
  * Picks up to `size` random records that satisfy `predicate` and whose
  * `source_id` is not in `exclude`.
  *
  * The input array is not modified; shuffling happens on a filtered copy.
  *
  * @param records - Pool to draw from.
  * @param predicate - Membership test; not called for excluded records.
  * @param size - Maximum number of records to return.
  * @param exclude - `source_id`s that must not be picked.
  * @returns At most `size` matching records, in shuffled order.
  */
 function sampleBucket(
  records: AnnotatedRecord[],
  predicate: (r: AnnotatedRecord) => boolean,
  size: number,
  exclude: Set<string>,
 ): AnnotatedRecord[] {
  const isEligible = (record: AnnotatedRecord): boolean => {
    if (exclude.has(record.source_id)) return false;
    return predicate(record);
  };
  const pool = records.filter((record) => isEligible(record));
  // Fisher–Yates shuffle, walking from the last slot down to the second.
  let last = pool.length - 1;
  while (last > 0) {
    const pick = Math.floor(Math.random() * (last + 1));
    const held = pool[last]!;
    pool[last] = pool[pick]!;
    pool[pick] = held;
    last -= 1;
  }
  return pool.slice(0, size);
 }
 /**
  * Samples an equal share of records for each of noun, verb, adjective and
  * adverb, so the combined result spans all four parts of speech.
  *
  * Each part of speech gets `floor(BUCKET_SIZE / 4)` slots. A part of speech
  * with fewer eligible records contributes fewer; the shortfall is not
  * backfilled. `exclude` is read but not updated here.
  *
  * @param records - Pool to draw from.
  * @param exclude - `source_id`s that must not be picked.
  * @returns The picks for each part of speech, concatenated in list order.
  */
 function samplePosBucket(
  records: AnnotatedRecord[],
  exclude: Set<string>,
 ): AnnotatedRecord[] {
  const partsOfSpeech: SupportedPos[] = ["noun", "verb", "adjective", "adverb"];
  const quota = Math.floor(BUCKET_SIZE / partsOfSpeech.length);
  return partsOfSpeech.flatMap((wanted) =>
    sampleBucket(
      records,
      (candidate) => candidate.pos === wanted,
      quota,
      exclude,
    ),
  );
 }
 // ── Loading ───────────────────────────────────────────────────────────────────
 /**
  * Reads every per-language annotated file and folds them into one record list.
  *
  * `en.json` supplies the records themselves (it has the most complete glosses
  * and examples). For every other supported language file:
  *  - votes are merged into the English record with the same `source_id`;
  *  - examples are copied over for any language the English record has no
  *    examples entry for.
  * Records whose `source_id` does not occur in `en.json` are skipped.
  *
  * @returns The English records, mutated in place with the merged data.
  * @throws If a language file cannot be read or does not contain valid JSON.
  */
 async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // The parsed JSON is trusted to be AnnotatedRecord[]; nothing validates it.
  const readRecords = async (fileName: string): Promise<AnnotatedRecord[]> => {
    const text = await fs.readFile(
      path.join(PATHS.annotatedDir, fileName),
      "utf-8",
    );
    return JSON.parse(text) as AnnotatedRecord[];
  };
  const englishRecords = await readRecords("en.json");
  // Index by source_id for fast lookup; a repeated id keeps the later record.
  const byId = new Map<string, AnnotatedRecord>(
    englishRecords.map((record): [string, AnnotatedRecord] => [
      record.source_id,
      record,
    ]),
  );
  // Files are read one at a time, in SUPPORTED_LANGUAGE_CODES order.
  for (const fileLang of SUPPORTED_LANGUAGE_CODES) {
    if (fileLang === "en") continue;
    const incoming = await readRecords(`${fileLang}.json`);
    for (const other of incoming) {
      const target = byId.get(other.source_id);
      if (!target) continue;
      // Votes: add the incoming entries to whatever the target already has.
      for (const [key, langVotes] of Object.entries(other.votes)) {
        const code = key as SupportedLanguageCode;
        if (!target.votes[code]) {
          target.votes[code] = {};
        }
        Object.assign(target.votes[code]!, langVotes);
      }
      // Examples: fill gaps only, never overwrite an existing entry.
      for (const [key, examples] of Object.entries(other.examples)) {
        const code = key as SupportedLanguageCode;
        if (!target.examples[code]) {
          target.examples[code] = examples as Example[];
        }
      }
    }
  }
  return Array.from(byId.values());
 }
 // ── Main ─────────────────────────────────────────────────────────────────────
 /**
  * Builds the sample file: loads the merged records, fills each bucket without
  * reusing a record across buckets, and writes the tagged picks to
  * `PATHS.output` as pretty-printed JSON.
  */
 async function main(): Promise<void> {
  console.log("Loading annotated files...");
  const records = await loadAnnotated();
  console.log(`  Loaded ${records.length.toLocaleString()} synsets`);
  const sampled: SampleRecord[] = [];
  const seen = new Set<string>();
  // Tags each pick with its bucket and marks it as used for later buckets.
  const collect = (picks: AnnotatedRecord[], bucketName: string): void => {
    for (const pick of picks) {
      seen.add(pick.source_id);
      sampled.push({ ...pick, _sample_bucket: bucketName });
    }
  };
  for (const { name, predicate } of BUCKETS) {
    // pos_spread has its own per-part-of-speech sampler, run after this loop.
    if (name === "pos_spread") continue;
    const picks = sampleBucket(records, predicate, BUCKET_SIZE, seen);
    collect(picks, name);
    console.log(`  ${name}: ${picks.length} records`);
  }
  const posPicks = samplePosBucket(records, seen);
  collect(posPicks, "pos_spread");
  console.log(`  pos_spread: ${posPicks.length} records`);
  console.log(`\nTotal sampled: ${sampled.length} records`);
  // Create the output directory if needed, then write the sample.
  const outputDir = path.dirname(PATHS.output);
  await fs.mkdir(outputDir, { recursive: true });
  const serialized = JSON.stringify(sampled, null, 2);
  await fs.writeFile(PATHS.output, serialized, "utf-8");
  console.log(`Wrote sample → ${PATHS.output}`);
 }
 // Script entry point: any failure is logged and the process exits with
 // status 1.
 main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
 });
--- a/documentation/llm-setup.md
+++ b/documentation/llm-setup.md
@ -0,0 +1,295 @@
 # LLM Setup — lila pipeline
 This document covers the LLM infrastructure for stage 3 (enrich) of the lila
 data pipeline. It documents the hardware constraints, supported providers,
 model recommendations, and how to configure and swap providers in the test
 and production scripts.
 ---
 ## Hardware (dev machine)
 | Component | Spec |
 |---|---|
 | CPU | Intel Core i7-6500U (2 cores / 4 threads @ 3.10 GHz) |
 | RAM | 8 GB |
 | GPU | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) |
 | OS | Debian GNU/Linux 13 (trixie) x86_64 |
 **Local inference verdict:** viable for small/quantized models, not for
 production runs. See the [Local inference](#local-inference-llamacpp) section
 for details.
 ---
 ## Provider overview
 The enrich script uses a single, swappable provider config. Every provider
 except Anthropic exposes an OpenAI-compatible API, so the same client code
 works across llama.cpp and OpenRouter — only `baseURL`, `apiKey`, and `model`
 change. Anthropic needs its own adapter.
 | Provider | Use case | Cost | Rate limits |
 |---|---|---|---|
 | llama.cpp (local) | Quality testing, overnight dev runs | Free (electricity) | None |
 | OpenRouter (free tier) | Quality comparison, multi-model evaluation | Free | 50 req/day, 20 req/min |
 | OpenRouter (paid) | Production runs if local quality insufficient | Pay-per-token | None |
 | Anthropic API | Quality baseline / reference | Pay-per-token | Standard |
 ---
 ## Local inference (llama.cpp)
 ### Why local inference is worth testing
 Time is not a constraint — the pipeline scripts are fully resumable. The
 laptop can run overnight for multiple nights. The only question is output
 quality, which the test script evaluates empirically.
 ### Hardware constraints
 The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0).
 llama.cpp supports Maxwell via the CUDA backend, but newer builds may
 require the `--no-kv-offload` flag depending on the version.
 llama.cpp can split a model's layers between GPU and CPU. You choose how
 many layers go on the GPU with `--n-gpu-layers`; the rest run on CPU/RAM.
 This means a model larger than VRAM is not a dead end — it runs in hybrid
 mode, slower than full-GPU but much faster than pure CPU.
 Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):
 | Model size | Q4 VRAM | Mode | Est. speed |
 |---|---|---|---|
 | 3B | ~2.0 GB | Full GPU | ~15–20 tok/s |
 | 4B | ~2.5 GB | Full GPU | ~12–18 tok/s |
 | 7B | ~4.5 GB | Hybrid (~26/32 layers on GPU) | ~8–12 tok/s |
 | 13B+ | ~8 GB+ | CPU-heavy hybrid | too slow |
 ### Recommended local models
 Two candidates worth testing, covering different points on the size/quality
 tradeoff:
 **Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)**
 - GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB)
 - Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF
 - Runs fully on GPU. Brand new (April 2026), built for edge hardware, 140+
  language support including all five pipeline languages. First candidate
  to test.
 **Qwen2.5 7B Instruct (Q4_K_M)**
 - GGUF file: `qwen2.5-7b-instruct-q4_k_m.gguf` (~4.5 GB)
 - Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF
 - Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s.
  Stronger multilingual generation than any 3–4B model. Second candidate,
  for comparison against the smaller Gemma 4 E4B.
 ### Installation
 ```bash
 # Install build dependencies
 sudo apt install build-essential cmake git
 # Clone llama.cpp
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 # Build with CUDA support (GTX 950M — compute 5.0)
 cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=50
 cmake --build build --config Release -j$(nproc)
 # Download a model (this example fetches Qwen2.5 3B — substitute the URL and filename of the model you want to test)
 mkdir -p models
 wget -O models/qwen2.5-3b-instruct-q4_k_m.gguf \
  https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_k_m.gguf
 ```
 ### Starting the server
 **Gemma 4 E4B** (full GPU):
 ```bash
 ./build/bin/llama-server \
  --model models/gemma-4-E4B-it-UD-Q4_K_XL.gguf \
  --port 8080 \
  --ctx-size 4096 \
  --n-gpu-layers 999 \
  --host 127.0.0.1
 ```
 **Qwen2.5 7B** (hybrid — tune `--n-gpu-layers` to fit your VRAM):
 ```bash
 ./build/bin/llama-server \
  --model models/qwen2.5-7b-instruct-q4_k_m.gguf \
  --port 8080 \
  --ctx-size 4096 \
  --n-gpu-layers 28 \
  --host 127.0.0.1
 ```
 `--n-gpu-layers 999` means "put everything on GPU" — llama.cpp caps at the
 actual layer count automatically, so 999 is safe as a "full offload" value.
 For the 7B hybrid, start with `28` and reduce by 2 if the server reports
 out-of-memory at startup.
 ### Verify the server is running
 ```bash
 curl http://127.0.0.1:8080/health
 # Expected: {"status":"ok"}
 ```
 ---
 ## OpenRouter (free tier)
 OpenRouter exposes all models via an OpenAI-compatible API. No code changes
 are needed to switch from local llama.cpp to OpenRouter — only the config
 object changes.
 ### Rate limits (free tier)
 - **50 requests per day** (account total, not per model)
 - 20 requests per minute
 > **Implication for testing:** at one request per record, a 10-record test
 > set leaves headroom to test 4–5 models per day. A 100-record test set
 > exceeds the daily cap, so plan on two days per model.
 > **Implication for production:** the free tier is not viable for 117k
 > records. If local quality is insufficient, use paid OpenRouter credits or
 > a dedicated provider.
 ### Free models recommended for this pipeline
 Ranked by expected multilingual generation quality for en/it/de/fr/es:
 | Model ID | Params | Notes |
 |---|---|---|
 | `qwen/qwen3-coder:free` | 480B MoE (35B active) | Best free option. Strong multilingual despite "coder" label. Use as quality ceiling. |
 | `qwen/qwen3-next-80b-a3b-instruct:free` | 80B MoE (3B active) | Smaller Qwen, useful comparison point. |
 | `nvidia/nemotron-3-super-120b-a12b:free` | 120B MoE (12B active) | 262K context, supports structured output. |
 | `google/gemma-4-31b-it:free` | 31B | 140+ language support, good European language coverage. |
 | `zhipuai/glm-4.5-air:free` | MoE | Multilingual-focused. |
 **Skip for this pipeline:**
 - Llama models — weaker European language generation than Qwen/Gemma
 - Mistral free tier — requests may be used for model training
 ### API endpoint
 ```
 https://openrouter.ai/api/v1/chat/completions
 ```
 Set `Authorization: Bearer <OPENROUTER_API_KEY>` in the request headers.
 ---
 ## Provider configuration in the test script
 The enrich test script reads a single config object. To switch providers,
 change this object and re-run.
 ```typescript
 // config.ts
 export type ProviderConfig = {
  name: string;           // used for output folder naming
  baseURL: string;
  apiKey: string;
  model: string;
  maxTokens: number;
 };
 // Local llama.cpp
 export const LOCAL_QWEN3B: ProviderConfig = {
  name: "local-qwen2.5-3b",
  baseURL: "http://127.0.0.1:8080/v1",
  apiKey: "none",          // llama.cpp ignores this
  model: "qwen2.5-3b",     // llama.cpp ignores model name, uses loaded model
  maxTokens: 512,
 };
 // OpenRouter — Qwen3 480B (free)
 export const OR_QWEN3_480B: ProviderConfig = {
  name: "or-qwen3-480b",
  baseURL: "https://openrouter.ai/api/v1",
  apiKey: process.env.OPENROUTER_API_KEY!,
  model: "qwen/qwen3-coder:free",
  maxTokens: 512,
 };
 // OpenRouter — Gemma 4 31B (free)
 export const OR_GEMMA4_31B: ProviderConfig = {
  name: "or-gemma4-31b",
  baseURL: "https://openrouter.ai/api/v1",
  apiKey: process.env.OPENROUTER_API_KEY!,
  model: "google/gemma-4-31b-it:free",
  maxTokens: 512,
 };
 // Anthropic (reference baseline — different adapter required)
 export const ANTHROPIC_SONNET: ProviderConfig = {
  name: "anthropic-sonnet",
  baseURL: "https://api.anthropic.com/v1",  // adapter handles format difference
  apiKey: process.env.ANTHROPIC_API_KEY!,
  model: "claude-sonnet-4-6",
  maxTokens: 512,
 };
 ```
 Output from each run lands in:
 ```
 stage-3-enrich/test/output/{provider.name}/results.json
 stage-3-enrich/test/output/{provider.name}/metrics.json
 ```
 The evaluate script compares all `metrics.json` files side by side.
 ---
 ## Evaluation metrics
 The test script measures the following per provider run:
 | Metric | What it measures |
 |---|---|
 | **JSON parse rate** | % of responses that are valid, schema-compliant JSON. Critical — a failed parse is a wasted call. Target: ≥97% |
 | **Field coverage** | % of records where all required fields are present (cefr votes for all translations, descriptions for all languages, glosses/examples for fr/es) |
 | **CEFR agreement** | For records that have a `cefr_source` vote, % where the model agrees. Measures calibration. |
 | **Language correctness** | Manual spot-check only — automated detection not reliable enough |
 | **Tokens/second** | Local only. Indicates overnight run feasibility |
 ### Decision thresholds
 | Metric | Threshold | Action if below |
 |---|---|---|
 | JSON parse rate | < 97% | Do not use this model for production |
 | Field coverage | < 95% | Prompt needs revision before production |
 | CEFR agreement | < 70% | Model lacks vocabulary knowledge for this task |
 ---
 ## Recommended test sequence
 1. **Start local, minimal dataset (5–10 records)**
   Install llama.cpp, run Qwen2.5 3B against 5–10 hand-picked records.
   Verify the server works, the output parses, and the model produces
   something reasonable. This is purely a smoke test.
 2. **Expand local to full 100-record sample**
   Once the pipeline is confirmed working, run all 100 records locally.
   Collect metrics. This is your local quality baseline.
 3. **Run the same 100 records through OpenRouter free models**
   At one request per record, each model takes two days (50 req/day limit).
   Start with `qwen/qwen3-coder:free` as the quality ceiling.
 4. **Compare metrics side by side**
   If local 3B is within acceptable range of the cloud models on CEFR
   agreement and field coverage, proceed with local overnight runs for
   production. If not, use the cloud model that passed.
 5. **Production run**
   Full 117k records. Resume-safe — the script checkpoints after each
   record so overnight runs can be stopped and continued.