adding documentation for the llm setup for the data pipeline
This commit is contained in:
parent
214a597e99
commit
849fcdad86
3 changed files with 4992 additions and 0 deletions
4492
data-pipeline/test/output/sample.json
Normal file
4492
data-pipeline/test/output/sample.json
Normal file
File diff suppressed because it is too large
Load diff
205
data-pipeline/test/scripts/sample.ts
Normal file
205
data-pipeline/test/scripts/sample.ts
Normal file
|
|
@ -0,0 +1,205 @@
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import path from "node:path";
|
||||||
|
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||||
|
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** One example sentence, tagged with the source label it carries ("omw" or "cefr"). */
type Example = {
  text: string;
  source: "omw" | "cefr";
};

/** Map keyed by language code; a language may be missing entirely. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/**
 * Shape of one entry in the stage-2 annotated JSON files, as this script
 * reads it. Every per-language field is sparse: a language key is simply
 * absent when there is nothing recorded for it.
 */
type AnnotatedRecord = {
  /** Identifier used to match the same record across language files. */
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  /** Per language: string key → vote. NOTE(review): keys look like words/lemmas — confirm. */
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
};

/** An annotated record plus the name of the bucket it was sampled for. */
type SampleRecord = AnnotatedRecord & { _sample_bucket: string };
|
||||||
|
|
||||||
|
// ── Constants ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * File-system locations used by this script, relative to the working
 * directory the script is run from. Marked `as const` so the values are
 * readonly literals and cannot be reassigned by accident.
 */
const PATHS = {
  /** Directory holding the per-language annotated files (`<lang>.json`). */
  annotatedDir: "stage-2-annotate/output",
  /** Destination of the generated sample. */
  output: "test/output/sample.json",
} as const;

/** Maximum number of records drawn for each bucket. */
const BUCKET_SIZE = 20;
|
||||||
|
|
||||||
|
// ── Bucket predicates ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** A named sampling category and the test that decides membership. */
type Bucket = {
  name: string;
  predicate: (record: AnnotatedRecord) => boolean;
};

/** True when at least one language entry in `votes` holds one or more votes. */
const hasAnyVote = (record: AnnotatedRecord): boolean => {
  for (const langVotes of Object.values(record.votes)) {
    if (Object.keys(langVotes ?? {}).length > 0) return true;
  }
  return false;
};

/** Languages checked by the "no_glosses_no_examples" bucket. */
const EMPTY_CHECK_LANGS = ["fr", "es"] as const;

/**
 * Buckets in the order they are sampled. Order matters to the caller:
 * records picked for an earlier bucket are excluded from later ones.
 */
const BUCKETS: Bucket[] = [
  {
    name: "has_cefr_vote",
    predicate: hasAnyVote,
  },
  {
    // Exact complement of "has_cefr_vote": every language entry is empty.
    name: "no_cefr_vote",
    predicate: (record) => !hasAnyVote(record),
  },
  {
    // Counts language keys only; it does not inspect the arrays behind them.
    name: "has_glosses_and_examples",
    predicate: ({ glosses, examples }) =>
      Object.keys(glosses).length > 0 && Object.keys(examples).length > 0,
  },
  {
    // Nothing at all recorded for fr or es: no glosses, examples, or votes.
    name: "no_glosses_no_examples",
    predicate: (record) =>
      EMPTY_CHECK_LANGS.every(
        (lang) =>
          !record.glosses[lang] && !record.examples[lang] && !record.votes[lang],
      ),
  },
  {
    name: "pos_spread",
    predicate: () => true, // sampled separately to ensure POS coverage
  },
];
|
||||||
|
|
||||||
|
// ── Sampling ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Draws up to `size` records at random from those that satisfy `predicate`
 * and whose `source_id` is not already in `exclude`.
 *
 * The input array is never modified: filtering produces a fresh array, which
 * is then shuffled in place (Fisher–Yates) and truncated. `exclude` is only
 * read here — the caller decides when to add the returned ids to it.
 *
 * @param records   Pool of records to draw from.
 * @param predicate Membership test for the bucket being sampled.
 * @param size      Maximum number of records to return; fewer are returned
 *                  when not enough candidates exist.
 * @param exclude   `source_id`s that must not be picked.
 * @param random    Source of uniform numbers in [0, 1). Defaults to
 *                  `Math.random`; pass a seeded generator for reproducible
 *                  samples.
 * @returns The sampled records, in shuffled order.
 */
function sampleBucket(
  records: AnnotatedRecord[],
  predicate: (r: AnnotatedRecord) => boolean,
  size: number,
  exclude: Set<string>,
  random: () => number = Math.random,
): AnnotatedRecord[] {
  const candidates = records.filter(
    (r) => !exclude.has(r.source_id) && predicate(r),
  );

  // Fisher–Yates: walk from the end, swapping each slot with a uniformly
  // chosen slot at or before it.
  for (let i = candidates.length - 1; i > 0; i--) {
    const j = Math.floor(random() * (i + 1));
    [candidates[i], candidates[j]] = [candidates[j]!, candidates[i]!];
  }

  return candidates.slice(0, size);
}
|
||||||
|
|
||||||
|
/**
 * Builds the "pos_spread" sample: an equal quota of records for each of
 * noun, verb, adjective and adverb, so every part of speech is represented.
 *
 * The quota is `BUCKET_SIZE` divided evenly (rounded down) across the four
 * parts of speech. A part of speech with fewer eligible records than its
 * quota contributes only what it has.
 *
 * @param records Pool of records to draw from.
 * @param exclude `source_id`s that must not be picked; not modified here.
 * @returns Sampled records grouped in the order noun, verb, adjective, adverb.
 */
function samplePosBucket(
  records: AnnotatedRecord[],
  exclude: Set<string>,
): AnnotatedRecord[] {
  const partsOfSpeech: SupportedPos[] = ["noun", "verb", "adjective", "adverb"];
  const quota = Math.floor(BUCKET_SIZE / partsOfSpeech.length);

  return partsOfSpeech.flatMap((wanted) =>
    sampleBucket(records, (candidate) => candidate.pos === wanted, quota, exclude),
  );
}
|
||||||
|
|
||||||
|
// ── Loading ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Reads and parses `<lang>.json` from the annotated directory.
 *
 * @throws Error when the file's JSON root is not an array; read and parse
 *         failures from `fs.readFile` / `JSON.parse` propagate unchanged.
 */
async function readAnnotatedFile(
  lang: SupportedLanguageCode,
): Promise<AnnotatedRecord[]> {
  const filePath = path.join(PATHS.annotatedDir, `${lang}.json`);
  const parsed: unknown = JSON.parse(await fs.readFile(filePath, "utf-8"));
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected a JSON array of records in ${filePath}`);
  }
  // Element shape is trusted, not validated — only the array root is checked.
  return parsed as AnnotatedRecord[];
}

/**
 * Loads every language file and merges them into a single record set.
 *
 * `en.json` supplies the base records since it has the most complete glosses
 * and examples. Each remaining language file is then folded in, one file at
 * a time:
 *  - votes are merged per language key (later files overwrite equal keys);
 *  - examples are copied only for languages the base record does not
 *    already have examples for.
 * Records whose `source_id` does not appear in `en.json` are ignored.
 *
 * @returns The merged base records, in `en.json` order.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // Index the base records by source_id for fast lookup while merging.
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of await readAnnotatedFile("en")) {
    byId.set(record.source_id, record);
  }

  for (const fileLang of SUPPORTED_LANGUAGE_CODES) {
    if (fileLang === "en") continue;

    for (const record of await readAnnotatedFile(fileLang)) {
      const target = byId.get(record.source_id);
      if (!target) continue;

      // Merge votes
      for (const [code, langVotes] of Object.entries(record.votes)) {
        const voteLang = code as SupportedLanguageCode;
        target.votes[voteLang] = Object.assign(
          target.votes[voteLang] ?? {},
          langVotes,
        );
      }

      // Merge examples from CEFR source files not in base
      for (const [code, examples] of Object.entries(record.examples)) {
        const exampleLang = code as SupportedLanguageCode;
        if (!target.examples[exampleLang]) {
          target.examples[exampleLang] = examples as Example[];
        }
      }
    }
  }

  return [...byId.values()];
}
|
||||||
|
|
||||||
|
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Script entry point: loads the merged annotated records, draws a sample for
 * every bucket, and writes the combined sample to `PATHS.output` as
 * pretty-printed JSON.
 *
 * Buckets are sampled in `BUCKETS` order with "pos_spread" handled last.
 * A record picked for one bucket is excluded from all later ones, so each
 * record appears at most once in the output.
 */
async function main(): Promise<void> {
  console.log("Loading annotated files...");
  const records = await loadAnnotated();
  console.log(` Loaded ${records.length.toLocaleString()} synsets`);

  const sampled: SampleRecord[] = [];
  const seen = new Set<string>();

  // Registers one bucket's picks: marks them as used, tags them with the
  // bucket name, and reports how many were drawn.
  const collect = (bucketName: string, picks: AnnotatedRecord[]): void => {
    for (const pick of picks) {
      seen.add(pick.source_id);
      sampled.push({ ...pick, _sample_bucket: bucketName });
    }
    console.log(` ${bucketName}: ${picks.length} records`);
  };

  // Predicate-driven buckets; pos_spread needs its own per-POS sampling.
  for (const bucket of BUCKETS) {
    if (bucket.name === "pos_spread") continue;
    collect(
      bucket.name,
      sampleBucket(records, bucket.predicate, BUCKET_SIZE, seen),
    );
  }

  collect("pos_spread", samplePosBucket(records, seen));

  console.log(`\nTotal sampled: ${sampled.length} records`);

  // Make sure the output directory exists before writing.
  const outputDir = path.dirname(PATHS.output);
  await fs.mkdir(outputDir, { recursive: true });
  const serialized = JSON.stringify(sampled, null, 2);
  await fs.writeFile(PATHS.output, serialized, "utf-8");
  console.log(`Wrote sample → ${PATHS.output}`);
}
|
||||||
|
|
||||||
|
// Run the script; on any failure, print the error and exit with status 1.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
|
||||||
295
documentation/llm-setup.md
Normal file
295
documentation/llm-setup.md
Normal file
|
|
@ -0,0 +1,295 @@
|
||||||
|
# LLM Setup — lila pipeline
|
||||||
|
|
||||||
|
This document covers the LLM infrastructure for stage 3 (enrich) of the lila
|
||||||
|
data pipeline. It documents the hardware constraints, supported providers,
|
||||||
|
model recommendations, and how to configure and swap providers in the test
|
||||||
|
and production scripts.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Hardware (dev machine)
|
||||||
|
|
||||||
|
| Component | Spec |
|
||||||
|
|---|---|
|
||||||
|
| CPU | Intel Core i7-6500U (2 cores / 4 threads @ 3.10 GHz) |
|
||||||
|
| RAM | 8 GB |
|
||||||
|
| GPU | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) |
|
||||||
|
| OS | Debian GNU/Linux 13 (trixie) x86_64 |
|
||||||
|
|
||||||
|
**Local inference verdict:** viable for small/quantized models, not for
|
||||||
|
production runs. See the [Local inference](#local-inference-llamacpp) section
|
||||||
|
for details.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Provider overview
|
||||||
|
|
||||||
|
The enrich script uses a single, swappable provider config. All providers
|
||||||
|
except Anthropic expose an OpenAI-compatible API, so the same client code
|
||||||
|
works across all of those — only `baseURL`, `apiKey`, and `model` change. Anthropic needs a separate adapter.
|
||||||
|
|
||||||
|
| Provider | Use case | Cost | Rate limits |
|
||||||
|
|---|---|---|---|
|
||||||
|
| llama.cpp (local) | Quality testing, overnight dev runs | Free (electricity) | None |
|
||||||
|
| OpenRouter (free tier) | Quality comparison, multi-model evaluation | Free | 50 req/day, 20 req/min |
|
||||||
|
| OpenRouter (paid) | Production runs if local quality insufficient | Pay-per-token | None |
|
||||||
|
| Anthropic API | Quality baseline / reference | Pay-per-token | Standard |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Local inference (llama.cpp)
|
||||||
|
|
||||||
|
### Why local inference is worth testing
|
||||||
|
|
||||||
|
Time is not a constraint — the pipeline scripts are fully resumable. The
|
||||||
|
laptop can run overnight for multiple nights. The only question is output
|
||||||
|
quality, which the test script evaluates empirically.
|
||||||
|
|
||||||
|
### Hardware constraints
|
||||||
|
|
||||||
|
The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0).
|
||||||
|
llama.cpp supports Maxwell via CUDA backend but newer builds may require
|
||||||
|
the `--no-kv-offload` flag depending on the version.
|
||||||
|
|
||||||
|
llama.cpp can split a model's layers between GPU and CPU via
|
||||||
|
`--n-gpu-layers`. You set how many layers go on the GPU; the rest run on
|
||||||
|
CPU/RAM. This means a model larger than VRAM is not a dead end — it runs
|
||||||
|
in hybrid mode, slower than full-GPU but much faster than pure CPU.
|
||||||
|
|
||||||
|
Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):
|
||||||
|
|
||||||
|
| Model size | Q4 VRAM | Mode | Est. speed |
|
||||||
|
|---|---|---|---|
|
||||||
|
| 3B | ~2.0 GB | Full GPU | ~15–20 tok/s |
|
||||||
|
| 4B | ~2.5 GB | Full GPU | ~12–18 tok/s |
|
||||||
|
| 7B | ~4.5 GB | Hybrid (~26/32 layers on GPU) | ~8–12 tok/s |
|
||||||
|
| 13B+ | ~8 GB+ | CPU-heavy hybrid | too slow |
|
||||||
|
|
||||||
|
### Recommended local models
|
||||||
|
|
||||||
|
Two candidates worth testing, covering different points on the size/quality
|
||||||
|
tradeoff:
|
||||||
|
|
||||||
|
**Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)**
|
||||||
|
- GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB)
|
||||||
|
- Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF
|
||||||
|
- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+
|
||||||
|
language support including all five pipeline languages. First candidate
|
||||||
|
to test.
|
||||||
|
|
||||||
|
**Qwen2.5 7B Instruct (Q4_K_M)**
|
||||||
|
- GGUF file: `Qwen2.5-7B-Instruct-Q4_K_M.gguf` (~4.5 GB)
|
||||||
|
- Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF
|
||||||
|
- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s.
|
||||||
|
Stronger multilingual generation than any 3–4B model. Second candidate,
|
||||||
|
for comparison against the smaller Gemma 4 E4B.
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install build dependencies
|
||||||
|
sudo apt install build-essential cmake git
|
||||||
|
|
||||||
|
# Clone llama.cpp
|
||||||
|
git clone https://github.com/ggerganov/llama.cpp
|
||||||
|
cd llama.cpp
|
||||||
|
|
||||||
|
# Build with CUDA support (GTX 950M — compute 5.0)
|
||||||
|
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=50
|
||||||
|
cmake --build build --config Release -j$(nproc)
|
||||||
|
|
||||||
|
# Download a model (example uses Qwen2.5 3B — substitute the GGUF URL and output filename of the model you are testing)
|
||||||
|
mkdir -p models
|
||||||
|
wget -O models/qwen2.5-3b-instruct-q4_k_m.gguf \
|
||||||
|
https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_k_m.gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
### Starting the server
|
||||||
|
|
||||||
|
**Gemma 4 E4B** (full GPU):
|
||||||
|
```bash
|
||||||
|
./build/bin/llama-server \
|
||||||
|
--model models/gemma-4-e4b-it-ud-q4_k_xl.gguf \
|
||||||
|
--port 8080 \
|
||||||
|
--ctx-size 4096 \
|
||||||
|
--n-gpu-layers 999 \
|
||||||
|
--host 127.0.0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
**Qwen2.5 7B** (hybrid — tune `--n-gpu-layers` to fit your VRAM):
|
||||||
|
```bash
|
||||||
|
./build/bin/llama-server \
|
||||||
|
--model models/qwen2.5-7b-instruct-q4_k_m.gguf \
|
||||||
|
--port 8080 \
|
||||||
|
--ctx-size 4096 \
|
||||||
|
--n-gpu-layers 28 \
|
||||||
|
--host 127.0.0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
`--n-gpu-layers 999` means "put everything on GPU" — llama.cpp caps at the
|
||||||
|
actual layer count automatically, so 999 is safe as a "full offload" value.
|
||||||
|
For the 7B hybrid, start with `28` and reduce by 2 if the server reports
|
||||||
|
out-of-memory at startup.
|
||||||
|
|
||||||
|
### Verify the server is running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://127.0.0.1:8080/health
|
||||||
|
# Expected: {"status":"ok"}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## OpenRouter (free tier)
|
||||||
|
|
||||||
|
OpenRouter exposes all models via an OpenAI-compatible API. No code changes
|
||||||
|
are needed to switch from local llama.cpp to OpenRouter — only the config
|
||||||
|
object changes.
|
||||||
|
|
||||||
|
### Rate limits (free tier)
|
||||||
|
|
||||||
|
- **50 requests per day** (account total, not per model)
|
||||||
|
- 20 requests per minute
|
||||||
|
|
||||||
|
> **Implication for testing:** with a 10-record test set you have headroom
|
||||||
|
> to test 4–5 models per day. With a 100-record test set (one request per
> record), one model exceeds the 50-request daily cap, so plan two days per model.
|
||||||
|
|
||||||
|
> **Implication for production:** the free tier is not viable for 117k
|
||||||
|
> records. If local quality is insufficient, use paid OpenRouter credits or
|
||||||
|
> a dedicated provider.
|
||||||
|
|
||||||
|
### Free models recommended for this pipeline
|
||||||
|
|
||||||
|
Ranked by expected multilingual generation quality for en/it/de/fr/es:
|
||||||
|
|
||||||
|
| Model ID | Params | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `qwen/qwen3-coder:free` | 480B MoE (35B active) | Best free option. Strong multilingual despite "coder" label. Use as quality ceiling. |
|
||||||
|
| `qwen/qwen3-next-80b-a3b-instruct:free` | 80B MoE (3B active) | Smaller Qwen, useful comparison point. |
|
||||||
|
| `nvidia/nemotron-3-super-120b-a12b:free` | 120B MoE (12B active) | 262K context, supports structured output. |
|
||||||
|
| `google/gemma-4-31b-it:free` | 31B | 140+ language support, good European language coverage. |
|
||||||
|
| `zhipuai/glm-4.5-air:free` | MoE | Multilingual-focused. |
|
||||||
|
|
||||||
|
**Skip for this pipeline:**
|
||||||
|
- Llama models — weaker European language generation than Qwen/Gemma
|
||||||
|
- Mistral free tier — requests may be used for model training
|
||||||
|
|
||||||
|
### API endpoint
|
||||||
|
|
||||||
|
```
|
||||||
|
https://openrouter.ai/api/v1/chat/completions
|
||||||
|
```
|
||||||
|
|
||||||
|
Set `Authorization: Bearer <OPENROUTER_API_KEY>` in the request headers.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Provider configuration in the test script
|
||||||
|
|
||||||
|
The enrich test script reads a single config object. To switch providers,
|
||||||
|
change this object and re-run.
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// config.ts
|
||||||
|
|
||||||
|
export type ProviderConfig = {
|
||||||
|
name: string; // used for output folder naming
|
||||||
|
baseURL: string;
|
||||||
|
apiKey: string;
|
||||||
|
model: string;
|
||||||
|
maxTokens: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Local llama.cpp
|
||||||
|
export const LOCAL_QWEN3B: ProviderConfig = {
|
||||||
|
name: "local-qwen2.5-3b",
|
||||||
|
baseURL: "http://127.0.0.1:8080/v1",
|
||||||
|
apiKey: "none", // llama.cpp ignores this
|
||||||
|
model: "qwen2.5-3b", // llama.cpp ignores model name, uses loaded model
|
||||||
|
maxTokens: 512,
|
||||||
|
};
|
||||||
|
|
||||||
|
// OpenRouter — Qwen3 480B (free)
|
||||||
|
export const OR_QWEN3_480B: ProviderConfig = {
|
||||||
|
name: "or-qwen3-480b",
|
||||||
|
baseURL: "https://openrouter.ai/api/v1",
|
||||||
|
apiKey: process.env.OPENROUTER_API_KEY!,
|
||||||
|
model: "qwen/qwen3-coder:free",
|
||||||
|
maxTokens: 512,
|
||||||
|
};
|
||||||
|
|
||||||
|
// OpenRouter — Gemma 4 31B (free)
|
||||||
|
export const OR_GEMMA4_31B: ProviderConfig = {
|
||||||
|
name: "or-gemma4-31b",
|
||||||
|
baseURL: "https://openrouter.ai/api/v1",
|
||||||
|
apiKey: process.env.OPENROUTER_API_KEY!,
|
||||||
|
model: "google/gemma-4-31b-it:free",
|
||||||
|
maxTokens: 512,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Anthropic (reference baseline — different adapter required)
|
||||||
|
export const ANTHROPIC_SONNET: ProviderConfig = {
|
||||||
|
name: "anthropic-sonnet",
|
||||||
|
baseURL: "https://api.anthropic.com/v1", // adapter handles format difference
|
||||||
|
apiKey: process.env.ANTHROPIC_API_KEY!,
|
||||||
|
model: "claude-sonnet-4-6",
|
||||||
|
maxTokens: 512,
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
Output from each run lands in:
|
||||||
|
```
|
||||||
|
stage-3-enrich/test/output/{provider.name}/results.json
|
||||||
|
stage-3-enrich/test/output/{provider.name}/metrics.json
|
||||||
|
```
|
||||||
|
|
||||||
|
The evaluate script compares all `metrics.json` files side by side.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Evaluation metrics
|
||||||
|
|
||||||
|
The test script measures the following per provider run:
|
||||||
|
|
||||||
|
| Metric | What it measures |
|
||||||
|
|---|---|
|
||||||
|
| **JSON parse rate** | % of responses that are valid, schema-compliant JSON. Critical — a failed parse is a wasted call. Target: ≥97% |
|
||||||
|
| **Field coverage** | % of records where all required fields are present (cefr votes for all translations, descriptions for all languages, glosses/examples for fr/es) |
|
||||||
|
| **CEFR agreement** | For records that have a `cefr_source` vote, % where the model agrees. Measures calibration. |
|
||||||
|
| **Language correctness** | Manual spot-check only — automated detection not reliable enough |
|
||||||
|
| **Tokens/second** | Local only. Indicates overnight run feasibility |
|
||||||
|
|
||||||
|
### Decision thresholds
|
||||||
|
|
||||||
|
| Metric | Threshold | Action if below |
|
||||||
|
|---|---|---|
|
||||||
|
| JSON parse rate | < 97% | Do not use this model for production |
|
||||||
|
| Field coverage | < 95% | Prompt needs revision before production |
|
||||||
|
| CEFR agreement | < 70% | Model lacks vocabulary knowledge for this task |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommended test sequence
|
||||||
|
|
||||||
|
1. **Start local, minimal dataset (5–10 records)**
|
||||||
|
Install llama.cpp, run Qwen2.5 3B against 5–10 hand-picked records.
|
||||||
|
Verify the server works, the output parses, and the model produces
|
||||||
|
something reasonable. This is purely a smoke test.
|
||||||
|
|
||||||
|
2. **Expand local to full 100-record sample**
|
||||||
|
Once the pipeline is confirmed working, run all 100 records locally.
|
||||||
|
Collect metrics. This is your local quality baseline.
|
||||||
|
|
||||||
|
3. **Run the same 100 records through OpenRouter free models**
|
||||||
|
Each model needs two days (100 requests vs. the 50 req/day limit). Start with `qwen/qwen3-coder:free`
|
||||||
|
as the quality ceiling.
|
||||||
|
|
||||||
|
4. **Compare metrics side by side**
|
||||||
|
If local 3B is within acceptable range of the cloud models on CEFR
|
||||||
|
agreement and field coverage, proceed with local overnight runs for
|
||||||
|
production. If not, use the cloud model that passed.
|
||||||
|
|
||||||
|
5. **Production run**
|
||||||
|
Full 117k records. Resume-safe — the script checkpoints after each
|
||||||
|
record so overnight runs can be stopped and continued.
|
||||||
Loading…
Add table
Add a link
Reference in a new issue