Compare commits

...

23 commits

Author SHA1 Message Date
lila
0dba68904e adding labels
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 2m12s
2026-04-21 14:44:14 +02:00
lila
1715726ec6 excluding temporary status of data-pipeline 2026-04-21 14:44:01 +02:00
lila
849fcdad86 adding documentation for the llm setup for the data pipeline 2026-04-21 13:22:27 +02:00
lila
214a597e99 feat(pipeline): add annotate stage
- write annotate.ts — matches CEFR source files against OMW translations
- match by word text + normalized POS
- add cefr_source vote to matched translations
- extract native example sentences from CEFR source files
- write one annotated JSON per language to stage-2-annotate/output/
- write conflicts.json for words with multiple CEFR levels
- update tsconfig to support all stage directories
- 2 German conflicts found (macht, bleiche)
- match rates: en 47k, fr 44k, de 26k, it 26k, es 26k
2026-04-21 12:01:56 +02:00
lila
9ea35568e5 updating config 2026-04-21 12:01:29 +02:00
lila
c9cddf68de feat(pipeline): add data pipeline workspace and extraction stage
- rename scripts/ to data-pipeline/, archive existing scripts
- add @lila/pipeline as pnpm workspace package
- add stage-1-extract through stage-5-compare folder structure
- update SUPPORTED_LANGUAGE_CODES (add es, de, fr)
- update SUPPORTED_POS (add adjective, adverb)
- add description field to term_glosses
- add term_examples table
- run and verify db migration
- write and verify extract.py (117,659 synsets across 5 languages)
- write PIPELINE.md
2026-04-21 09:39:36 +02:00
lila
e993aac711 adding task to separate user db 2026-04-21 08:39:38 +02:00
lila
07fe256abd documenting the pipeline to enrich the db data, reorganizing the file structure of the data pipeline 2026-04-20 18:28:10 +02:00
lila
0ac2cef6e1 adding term examples table 2026-04-20 18:27:32 +02:00
lila
e718d188d5 archiving old seeding scripts, removing them from package.json scripts 2026-04-20 10:10:28 +02:00
lila
a3d19d36f6 adding the data-pipeline to ts and pnpm workspaces 2026-04-20 09:05:27 +02:00
lila
200b14ef64 reoganising folders/files 2026-04-20 08:50:27 +02:00
lila
eacdd35295 updating schema to have a description field on term_glosses 2026-04-20 08:46:05 +02:00
lila
091a901485 adding remaining languages and pos 2026-04-20 08:01:57 +02:00
lila
1f42239779 reorganising file structure 2026-04-20 07:48:44 +02:00
lila
3f125ba162 reorganising data-pipeline folder 2026-04-20 07:37:02 +02:00
lila
cfd2927c4c removing unnecessary word files 2026-04-20 07:13:10 +02:00
lila
d2314168f8 Merge branch 'dev'
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 1m12s
2026-04-19 19:26:25 +02:00
lila
bbc9a3d630 update documentation
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 2m23s
2026-04-19 08:38:12 +02:00
lila
e5595b5039 updating documentation
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 1m3s
2026-04-14 19:35:49 +02:00
lila
201f462447 cleaning up
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 1m7s
2026-04-14 19:19:07 +02:00
lila
3b2ecf6ee3 adding debugging step
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 2m10s
2026-04-14 18:56:59 +02:00
lila
46fb7dbdd2 adding docker and openssh client installation
Some checks failed
Build and Deploy / build-and-deploy (push) Failing after 20s
2026-04-14 18:33:30 +02:00
85 changed files with 8651 additions and 5599688 deletions

View file

@ -8,6 +8,9 @@ jobs:
build-and-deploy:
runs-on: docker
steps:
- name: Install tools
run: apt-get update && apt-get install -y docker.io openssh-client
- name: Checkout code
uses: https://data.forgejo.org/actions/checkout@v4

6
.gitignore vendored
View file

@ -9,3 +9,9 @@ repomix/
venv/
__pycache__/
*.pyc
data-pipeline/archive/
data-pipeline/stage-1-extract/output/
data-pipeline/stage-2-annotate/output/
data-pipeline/stage-3-enrich/output/
data-pipeline/stage-4-merge/output/

View file

@ -10,6 +10,9 @@ import type { GameRequest } from "@lila/shared";
const LABELS: Record<string, string> = {
en: "English",
it: "Italian",
de: "German",
fr: "French",
es: "Spanish",
noun: "Nouns",
verb: "Verbs",
easy: "Easy",

View file

View file

@ -0,0 +1,17 @@
{
"name": "@lila/pipeline",
"version": "1.0.0",
"private": true,
"type": "module",
"scripts": {},
"dependencies": {
"@lila/shared": "workspace:*",
"better-sqlite3": "^12.9.0"
},
"devDependencies": {
"@types/better-sqlite3": "^7.6.13",
"@types/node": "^24.12.0",
"tsx": "^4.21.0",
"typescript": "^5.9.3"
}
}

View file

@ -0,0 +1,204 @@
"""
data-pipeline/stage-1-extract/scripts/extract.py
Extract all synsets from the Open Multilingual Wordnet (OMW) for all
supported languages and parts of speech.
Output: a single combined JSON file covering all languages, written to
stage-1-extract/output/omw.json
The file is a JSON array of synset records:
{
"source_id": "ili:i12345",
"pos": "noun",
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
"glosses": { "en": ["a domesticated animal..."] },
"examples": { "en": ["the dog barked at the stranger"] }
}
Usage:
python stage-1-extract/scripts/extract.py
python stage-1-extract/scripts/extract.py --sample
Prerequisites:
pip install wn
python -m wn download omw-en:1.4
python -m wn download omw-it:1.4
python -m wn download omw-de:1.4
python -m wn download omw-es:1.4
python -m wn download omw-fr:1.4
"""
import json
import sys
from pathlib import Path
import wn
# Language codes whose wordnets are loaded and extracted; the coverage summary
# is also reported for exactly these codes.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]
# Maps the single-letter POS tags passed to Wordnet.synsets(pos=...) to the
# POS labels written into the output records.
POS_MAP: dict[str, str] = {
"n": "noun",
"v": "verb",
"a": "adjective",
"s": "adjective", # adjective satellite — collapsed into adjective
"r": "adverb",
}
def extract_all(
    output_dir: str = "stage-1-extract/output", sample: bool = False
) -> None:
    """Extract synsets for every supported language into one JSON file.

    Loads one wordnet per entry in SUPPORTED_LANGUAGE_CODES, groups lemmas,
    definitions and examples by ILI across all languages and every POS tag
    in POS_MAP, writes the resulting records to ``<output_dir>/omw.json``
    and finally prints a coverage summary.

    Args:
        output_dir: Directory that receives ``omw.json``; created if missing.
        sample: When true, only the first 100 ILIs (in sorted order) are
            written.

    Exits the process with status 1 if loading a wordnet raises ``wn.Error``.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    limit = 100 if sample else None

    # Open every wordnet first so a missing download aborts before any
    # extraction work is done.
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for code in SUPPORTED_LANGUAGE_CODES:
        try:
            loaded = wn.Wordnet(lang=code)
            wordnets[code] = loaded
            total = len(loaded.synsets())
            print(f" {code}: {total:,} total synsets")
        except wn.Error as exc:
            print(f" ERROR loading {code}: {exc}")
            print(f" Run: python -m wn download omw-{code}:1.4")
            sys.exit(1)

    # Group everything by ILI. Each entry holds "pos" plus one sub-dict per
    # language code that contributed data.
    print("\nExtracting synsets...")
    by_ili: dict[str, dict] = {}
    for code, wordnet in wordnets.items():
        for omw_pos, pos_label in POS_MAP.items():
            with_ili = 0
            for synset in wordnet.synsets(pos=omw_pos):
                ili = synset.ili
                if not ili:
                    continue
                with_ili += 1
                fresh = {
                    "lemmas": [str(lemma) for lemma in synset.lemmas()],
                    "glosses": [d for d in synset.definitions() if d],
                    "examples": [e for e in synset.examples() if e],
                }
                # The first synset seen for an ILI fixes its POS label.
                entry = by_ili.setdefault(ili, {"pos": pos_label})
                current = entry.get(code)
                if current is None:
                    entry[code] = fresh
                    continue
                # Same ILI seen again for this language (e.g. 'a' and 's'
                # both map to adjective): append while dropping duplicates
                # and keeping first-seen order.
                for field, values in fresh.items():
                    current[field] = list(
                        dict.fromkeys(current[field] + values)
                    )
            print(f" {code} {pos_label}: {with_ili:,} synsets with ILI")

    # Turn the grouped data into output records, one per ILI.
    print("\nBuilding records...")
    ordered = sorted(by_ili)
    if limit:
        ordered = ordered[:limit]
    field_pairs = (
        ("translations", "lemmas"),
        ("glosses", "glosses"),
        ("examples", "examples"),
    )
    records: list[dict] = []
    for ili in ordered:
        entry = by_ili[ili]
        record: dict = {
            "source_id": f"ili:{ili}",
            "pos": entry["pos"],
            "translations": {},
            "glosses": {},
            "examples": {},
        }
        for code, payload in entry.items():
            if code == "pos":
                continue
            # Only non-empty lists are copied, so languages without data for
            # a field are simply absent from that field.
            for out_field, in_field in field_pairs:
                if payload[in_field]:
                    record[out_field][code] = payload[in_field]
        records.append(record)

    destination = target_dir / "omw.json"
    with destination.open("w", encoding="utf-8") as handle:
        json.dump(records, handle, indent=2, ensure_ascii=False)
    print(f"\nWrote {len(records):,} synsets → {destination}")
    _print_coverage(records)
def _print_coverage(records: list[dict]) -> None:
    """Print per-POS record counts and per-language coverage totals.

    For every language in SUPPORTED_LANGUAGE_CODES the number of
    translations, glosses and examples across ``records`` is printed,
    together with the average number of translations per record.

    Args:
        records: Output records as built by ``extract_all``; each has a
            ``pos`` label and ``translations``/``glosses``/``examples``
            dicts keyed by language code.

    An empty ``records`` list is reported with an average of 0.0 rather than
    raising ZeroDivisionError.
    """
    fields = ("translations", "glosses", "examples")
    lang_stats: dict[str, dict[str, int]] = {
        lang: dict.fromkeys(fields, 0) for lang in SUPPORTED_LANGUAGE_CODES
    }
    pos_stats: dict[str, int] = {}
    for r in records:
        pos = r["pos"]
        pos_stats[pos] = pos_stats.get(pos, 0) + 1
        for field in fields:
            for lang, items in r[field].items():
                # Languages outside SUPPORTED_LANGUAGE_CODES are ignored.
                if lang in lang_stats:
                    lang_stats[lang][field] += len(items)
    print("\nPOS breakdown:")
    for pos, count in sorted(pos_stats.items()):
        print(f" {pos}: {count:,}")
    print("\nCoverage per language:")
    total = len(records)
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        e = counts["examples"]
        # Guard the average: with no records there is nothing to divide by.
        average = t / total if total else 0.0
        print(
            f" {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {average:.1f} translations/synset)"
        )
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the extraction.
    import argparse

    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
    parser.add_argument(
        "--output-dir",
        default="stage-1-extract/output",
        help="Directory that receives the combined omw.json file",
    )
    parser.add_argument(
        "--sample",
        action="store_true",
        # The sample is the first 100 ILIs overall, not 100 per language.
        help="Extract only the first 100 synsets (all languages combined) for inspection",
    )
    args = parser.parse_args()
    extract_all(output_dir=args.output_dir, sample=args.sample)

View file

@ -0,0 +1,227 @@
import fs from "node:fs/promises";
import path from "node:path";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
// ── Types ────────────────────────────────────────────────────────────────────
// Example sentence that came from the OMW extract.
type OmwExample = { text: string; source: "omw" };
// Example sentence that came from a CEFR source file.
type CefrExample = { text: string; source: "cefr" };
// Any example sentence, tagged with its origin.
type Example = OmwExample | CefrExample;
// Shape of one record parsed from the stage-1 extract (PATHS.omw).
type OmwRecord = {
source_id: string;
pos: SupportedPos;
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
examples: Partial<Record<SupportedLanguageCode, string[]>>;
};
// Shape of one record written by this stage: the OMW record with examples
// converted to tagged objects and a `votes` map added.
type AnnotatedRecord = {
source_id: string;
pos: SupportedPos;
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
// Per language, keyed by the translation text; `cefr_source` holds the
// CEFR level found for that word in the source file.
votes: Partial<
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
>;
};
// One entry as parsed from a CEFR source file.
type CefrSourceEntry = {
word: string;
pos: string;
cefr_level: string;
example_sentence_native?: string;
};
// A word/POS pair that appears with more than one CEFR level in the source
// file of `language`.
type ConflictEntry = {
word: string;
pos: string;
language: SupportedLanguageCode;
levels: string[];
};
// ── Constants ─────────────────────────────────────────────────────────────────
// Maps POS labels from the CEFR source files (after lower-casing and
// trimming) to a SupportedPos. Entries whose label is not listed here are
// skipped when the source file is loaded.
const POS_NORMALIZE: Record<string, SupportedPos> = {
noun: "noun",
n: "noun",
nom: "noun", // French
verb: "verb",
verbs: "verb",
v: "verb",
v1: "verb",
adjective: "adjective",
adjektiv: "adjective", // German
adj: "adjective",
adverb: "adverb",
adverbs: "adverb",
adv: "adverb",
};
// Accepted CEFR levels; source entries with any other `cefr_level` value are
// skipped. The comparison is exact (case-sensitive, no trimming).
const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
// Input and output locations used by this stage.
// NOTE(review): the paths are relative, so they presumably assume the
// data-pipeline package root as the working directory — confirm.
const PATHS = {
omw: "stage-1-extract/output/omw.json",
cefrDir: "stage-2-annotate/sources/cefr",
outputDir: "stage-2-annotate/output",
};
// ── CEFR source loading ───────────────────────────────────────────────────────
// Lookup from "word|pos" (lower-cased, trimmed word + normalized POS) to the
// CEFR level and optional native example sentence for that word.
type CefrIndex = Map<string, { level: string; example?: string }>;

/**
 * Load the CEFR source file of one language and index it by "word|pos".
 *
 * Entries whose POS label is not in POS_NORMALIZE or whose level is not in
 * CEFR_LEVELS are ignored. A word/POS pair seen with more than one distinct
 * level is reported in `conflicts` and left out of `index`.
 *
 * @param lang Language code; the file read is `<PATHS.cefrDir>/<lang>.json`.
 * @returns The lookup index and the list of conflicting word/POS pairs.
 * @throws If the file cannot be read or is not valid JSON.
 */
async function loadCefrSource(
  lang: SupportedLanguageCode,
): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");
  const entries = JSON.parse(raw) as CefrSourceEntry[];

  // Single pass: group entries by key. Word and POS are stored alongside the
  // key instead of being recovered later with key.split("|"), which breaks
  // for any word that itself contains "|".
  type Group = {
    word: string;
    pos: SupportedPos;
    levels: Set<string>;
    latest: CefrSourceEntry;
  };
  const groups = new Map<string, Group>();
  for (const entry of entries) {
    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
    if (!pos) continue;
    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;
    const word = entry.word.toLowerCase().trim();
    const key = `${word}|${pos}`;
    let group = groups.get(key);
    if (!group) {
      group = { word, pos, levels: new Set(), latest: entry };
      groups.set(key, group);
    }
    group.levels.add(entry.cefr_level);
    // When a key repeats with the same level, the last entry supplies the
    // indexed level and example.
    group.latest = entry;
  }

  const conflicts: ConflictEntry[] = [];
  const index: CefrIndex = new Map();
  for (const [key, group] of groups) {
    if (group.levels.size > 1) {
      conflicts.push({
        word: group.word,
        pos: group.pos,
        language: lang,
        levels: [...group.levels],
      });
      continue;
    }
    const example = group.latest.example_sentence_native;
    index.set(key, {
      level: group.latest.cefr_level,
      // Only attach `example` when the source provides a non-empty one.
      ...(example ? { example } : {}),
    });
  }
  return { index, conflicts };
}
// ── Annotation ────────────────────────────────────────────────────────────────
/**
 * Run the annotate stage.
 *
 * Reads the OMW extract, loads the CEFR index of every supported language,
 * writes `conflicts.json`, then writes one `<lang>.json` per language in
 * which translations of that language that match the CEFR index carry a
 * `cefr_source` vote and, where available, a CEFR example sentence.
 */
async function annotate(): Promise<void> {
  console.log("Reading OMW extract...");
  const omwRecords = JSON.parse(
    await fs.readFile(PATHS.omw, "utf-8"),
  ) as OmwRecord[];
  console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`);

  // Build the CEFR lookup of every language and gather all conflicts.
  console.log("\nLoading CEFR source files...");
  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
  const allConflicts: ConflictEntry[] = [];
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const loaded = await loadCefrSource(lang);
    cefrIndexes.set(lang, loaded.index);
    for (const conflict of loaded.conflicts) allConflicts.push(conflict);
    console.log(
      ` ${lang}: ${loaded.index.size.toLocaleString()} entries, ${loaded.conflicts.length} conflicts`,
    );
  }

  await fs.mkdir(PATHS.outputDir, { recursive: true });
  const conflictsFile = path.join(PATHS.outputDir, "conflicts.json");
  await fs.writeFile(
    conflictsFile,
    JSON.stringify(allConflicts, null, 2),
    "utf-8",
  );
  console.log(
    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const index = cefrIndexes.get(lang)!;
    const records: AnnotatedRecord[] = [];
    let matched = 0;

    for (const record of omwRecords) {
      // Every OMW example becomes a tagged object, for all languages.
      const examples: AnnotatedRecord["examples"] = {};
      for (const l of Object.keys(record.examples) as SupportedLanguageCode[]) {
        examples[l] = record.examples[l]!.map((text) => ({
          text,
          source: "omw" as const,
        }));
      }

      // Votes are only produced for the language of the file being written.
      const votes: AnnotatedRecord["votes"] = {};
      for (const word of record.translations[lang] ?? []) {
        const hit = index.get(`${word.toLowerCase().trim()}|${record.pos}`);
        if (hit === undefined) continue;
        matched += 1;

        const langVotes = votes[lang] ?? (votes[lang] = {});
        langVotes[word] = { cefr_source: hit.level };

        if (hit.example) {
          const langExamples = examples[lang] ?? (examples[lang] = []);
          langExamples.push({ text: hit.example, source: "cefr" as const });
        }
      }

      records.push({
        source_id: record.source_id,
        pos: record.pos,
        translations: record.translations,
        glosses: record.glosses,
        examples,
        votes,
      });
    }

    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
    console.log(
      ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
    );
  }
}
// ── Main ─────────────────────────────────────────────────────────────────────
// Script entry point: run the stage; on any rejection log the error and exit
// with status 1.
annotate().catch((err) => {
console.error(err);
process.exit(1);
});

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,205 @@
import fs from "node:fs/promises";
import path from "node:path";
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
// ── Types ─────────────────────────────────────────────────────────────────────
// Example sentence tagged with the source it came from.
type Example = { text: string; source: "omw" | "cefr" };
// Shape of one record parsed from the annotated per-language files in
// PATHS.annotatedDir.
type AnnotatedRecord = {
source_id: string;
pos: SupportedPos;
translations: Partial<Record<SupportedLanguageCode, string[]>>;
glosses: Partial<Record<SupportedLanguageCode, string[]>>;
examples: Partial<Record<SupportedLanguageCode, Example[]>>;
// Per language, keyed by translation text.
votes: Partial<
Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
>;
};
// An annotated record plus the name of the bucket it was sampled for.
type SampleRecord = AnnotatedRecord & { _sample_bucket: string };
// ── Constants ─────────────────────────────────────────────────────────────────
// Input directory (annotated per-language files) and output file of the
// sampler. NOTE(review): relative paths — presumably resolved against the
// data-pipeline package root; confirm.
const PATHS = {
annotatedDir: "stage-2-annotate/output",
output: "test/output/sample.json",
};
// Maximum number of records sampled per bucket; the pos_spread bucket
// divides this evenly between the POS values it samples.
const BUCKET_SIZE = 20;
// ── Bucket predicates ─────────────────────────────────────────────────────────
// A named sampling bucket: records for which `predicate` returns true are
// candidates for that bucket.
type Bucket = { name: string; predicate: (record: AnnotatedRecord) => boolean };

// True when at least one language of the record has a non-empty vote map.
const hasAnyVote = (r: AnnotatedRecord): boolean => {
  for (const langVotes of Object.values(r.votes)) {
    if (Object.keys(langVotes ?? {}).length > 0) return true;
  }
  return false;
};

// Languages inspected by the "no_glosses_no_examples" bucket.
const SPARSE_LANGS: SupportedLanguageCode[] = ["fr", "es"];

// Buckets are sampled in this order; "pos_spread" is handled separately by
// samplePosBucket, so its predicate accepts everything.
const BUCKETS: Bucket[] = [
  { name: "has_cefr_vote", predicate: hasAnyVote },
  { name: "no_cefr_vote", predicate: (r) => !hasAnyVote(r) },
  {
    name: "has_glosses_and_examples",
    predicate: (r) => {
      if (Object.keys(r.glosses).length === 0) return false;
      return Object.keys(r.examples).length !== 0;
    },
  },
  {
    name: "no_glosses_no_examples",
    // Matches records with no gloss, example or vote entry for any of the
    // SPARSE_LANGS languages.
    predicate: (r) =>
      SPARSE_LANGS.every(
        (l) => !(r.glosses[l] || r.examples[l] || r.votes[l]),
      ),
  },
  {
    name: "pos_spread",
    predicate: () => true, // sampled separately to ensure POS coverage
  },
];
// ── Sampling ──────────────────────────────────────────────────────────────────
/**
 * Pick up to `size` random records that satisfy `predicate` and whose
 * `source_id` is not in `exclude`.
 *
 * @param records   Records to sample from (not modified).
 * @param predicate Bucket membership test.
 * @param size      Maximum number of records to return.
 * @param exclude   `source_id`s that must not be returned.
 * @returns The first `size` entries of the shuffled candidate list.
 */
function sampleBucket(
  records: AnnotatedRecord[],
  predicate: (r: AnnotatedRecord) => boolean,
  size: number,
  exclude: Set<string>,
): AnnotatedRecord[] {
  const pool: AnnotatedRecord[] = [];
  for (const rec of records) {
    if (exclude.has(rec.source_id)) continue;
    if (predicate(rec)) pool.push(rec);
  }
  // In-place shuffle from the end of the pool towards the front, swapping
  // each position with a randomly chosen position at or before it.
  let last = pool.length - 1;
  while (last > 0) {
    const pick = Math.floor(Math.random() * (last + 1));
    const held = pool[last]!;
    pool[last] = pool[pick]!;
    pool[pick] = held;
    last -= 1;
  }
  return pool.slice(0, size);
}
/**
 * Sample an equal share of records for each part of speech.
 *
 * BUCKET_SIZE is divided (rounding down) by the number of POS values; each
 * POS is then sampled with sampleBucket using the same `exclude` set.
 *
 * @param records Records to sample from.
 * @param exclude `source_id`s that must not be returned.
 * @returns The sampled records, grouped in POS order.
 */
function samplePosBucket(
  records: AnnotatedRecord[],
  exclude: Set<string>,
): AnnotatedRecord[] {
  const posList: SupportedPos[] = ["noun", "verb", "adjective", "adverb"];
  const quota = Math.floor(BUCKET_SIZE / posList.length);
  const collected: AnnotatedRecord[] = [];
  posList.forEach((pos) => {
    const picked = sampleBucket(records, (r) => r.pos === pos, quota, exclude);
    for (const rec of picked) collected.push(rec);
  });
  return collected;
}
// ── Loading ───────────────────────────────────────────────────────────────────
/**
 * Load every annotated language file and merge them into one record set.
 *
 * `en.json` supplies the base records. For each other supported language,
 * the votes and examples of records with a matching `source_id` are merged
 * into the base record; records missing from the base are ignored.
 *
 * @returns The merged records, in the order of `en.json`.
 * @throws If any language file cannot be read or parsed.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  const baseRaw = await fs.readFile(
    path.join(PATHS.annotatedDir, "en.json"),
    "utf-8",
  );
  const baseRecords = JSON.parse(baseRaw) as AnnotatedRecord[];

  // Map for fast lookup by source_id.
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of baseRecords) {
    byId.set(record.source_id, record);
  }

  // Identity of an example for de-duplication; `source` is a fixed literal
  // without ":", so the key is unambiguous.
  const exampleKey = (ex: Example): string => `${ex.source}:${ex.text}`;

  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    if (lang === "en") continue;
    const raw = await fs.readFile(
      path.join(PATHS.annotatedDir, `${lang}.json`),
      "utf-8",
    );
    const records = JSON.parse(raw) as AnnotatedRecord[];

    for (const record of records) {
      const target = byId.get(record.source_id);
      if (!target) continue;

      // Merge votes language by language.
      for (const [l, langVotes] of Object.entries(record.votes)) {
        const voteLang = l as SupportedLanguageCode;
        const merged = target.votes[voteLang] ?? {};
        Object.assign(merged, langVotes);
        target.votes[voteLang] = merged;
      }

      // Merge examples. Examples are appended when they are not yet present
      // instead of being copied only when the base has none for that
      // language; otherwise CEFR examples are lost whenever the base already
      // holds OMW examples for the same language.
      for (const [l, examples] of Object.entries(record.examples)) {
        const exampleLang = l as SupportedLanguageCode;
        const incoming = (examples ?? []) as Example[];
        const existing = target.examples[exampleLang];
        if (!existing) {
          target.examples[exampleLang] = [...incoming];
          continue;
        }
        const known = new Set(existing.map(exampleKey));
        for (const ex of incoming) {
          const key = exampleKey(ex);
          if (known.has(key)) continue;
          known.add(key);
          existing.push(ex);
        }
      }
    }
  }
  return [...byId.values()];
}
// ── Main ─────────────────────────────────────────────────────────────────────
/**
 * Build the sample file.
 *
 * Loads the merged annotated records, samples every bucket in BUCKETS order
 * (pos_spread last, via samplePosBucket), never picking the same source_id
 * twice, and writes the tagged sample to PATHS.output.
 */
async function main(): Promise<void> {
  console.log("Loading annotated files...");
  const records = await loadAnnotated();
  console.log(` Loaded ${records.length.toLocaleString()} synsets`);

  const sampled: SampleRecord[] = [];
  const seen = new Set<string>();

  // Record the picks of one bucket: mark them as used, tag them with the
  // bucket name and report how many were taken.
  const take = (bucketName: string, picked: AnnotatedRecord[]): void => {
    picked.forEach((rec) => {
      seen.add(rec.source_id);
      sampled.push({ ...rec, _sample_bucket: bucketName });
    });
    console.log(` ${bucketName}: ${picked.length} records`);
  };

  for (const bucket of BUCKETS) {
    if (bucket.name === "pos_spread") continue;
    take(
      bucket.name,
      sampleBucket(records, bucket.predicate, BUCKET_SIZE, seen),
    );
  }
  take("pos_spread", samplePosBucket(records, seen));

  console.log(`\nTotal sampled: ${sampled.length} records`);

  await fs.mkdir(path.dirname(PATHS.output), { recursive: true });
  await fs.writeFile(PATHS.output, JSON.stringify(sampled, null, 2), "utf-8");
  console.log(`Wrote sample → ${PATHS.output}`);
}
// Script entry point: run the sampler; on any rejection log the error and
// exit with status 1.
main().catch((err) => {
console.error(err);
process.exit(1);
});

View file

@ -0,0 +1,12 @@
{
"extends": "../tsconfig.base.json",
"compilerOptions": {
"module": "NodeNext",
"moduleResolution": "NodeNext",
"outDir": "dist",
"rootDir": ".",
"types": ["node"],
},
"references": [{ "path": "../packages/shared" }],
"include": ["./**/*"],
}

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

473
documentation/PIPELINE.md Normal file
View file

@ -0,0 +1,473 @@
# lila data pipeline
> **NOTE: BEFORE RUNNING THE PIPELINE, CONSIDER IMPROVING THE CEFR SOURCE
> FILES IN `stage-2-annotate/sources/cefr/`. BETTER SOURCE COVERAGE MEANS
> FEWER WORDS FOR THE LLM TO ANNOTATE FROM SCRATCH, FASTER OVERNIGHT RUNS,
> AND HIGHER CONFIDENCE IN THE FINAL OUTPUT. SEE UNIVERSALCEFR
> (huggingface.co/UniversalCEFR) AND CEFR-J
> (github.com/openlanguageprofiles/olp-en-cefrj) AS STARTING POINTS.**
This pipeline extracts vocabulary data from the Open Multilingual Wordnet (OMW), annotates it with CEFR levels from curated source files, verifies and enriches annotations using local LLMs, and produces authoritative JSON files per language. These files are consumed by the seeder in `packages/db` to populate the database with terms, translations, glosses, CEFR levels, difficulty ratings, and LLM-generated descriptions.
## Overview
```mermaid
flowchart LR
omw[(OMW SQLite DBs)]
cefr[(CEFR JSON files)]
extract[Extract]
annotate[Annotate]
enrich[Enrich]
merge[Merge]
final[(final/lang.json)]
flagged[(flagged/lang.json)]
seeder[packages/db seeder]
db[(Database)]
omw --> extract
cefr --> annotate
extract --> annotate
annotate --> enrich
enrich --> merge
merge --> final
merge --> flagged
final --> seeder
seeder --> db
```
Each stage is a standalone script that reads from the previous stage's output. The extract stage produces a single combined file (`omw.json`); the later stages produce one JSON file per language. Stages can be re-run independently without affecting earlier or later stages.
The enrich stage is the exception — it produces one checkpoint file per model run per language, plus a compiled votes file once all runs are complete. It is designed to run overnight, one model at a time, and is fully resumable if interrupted.
Only fully annotated output in `stage-4-merge/output/final/` reaches the database. Words where LLMs could not reach a majority vote land in `stage-4-merge/output/flagged/` and wait for manual review before seeding.
## Data sources
### OMW / WordNet
The Open Multilingual Wordnet (OMW) is the base vocabulary source. It provides synsets — groups of synonymous words — with translations and glosses across multiple languages. The data for each language is downloaded with the `wn` Python library, which stores it in a local SQLite database (`~/.wn_data/wn.db`). This database is not committed to git.
All four parts of speech are extracted: noun, verb, adjective, adverb. WordNet's adjective satellites are collapsed into adjective — this is a WordNet-internal distinction that has no relevance for language learning. Alongside translations and glosses, usage examples are extracted where available and stored in the database as term_examples.
See **Setup** for download instructions.
### CEFR source files
Per-language JSON files in `stage-2-annotate/sources/cefr/` provide the initial CEFR level annotations (the paths in the table below are relative to `stage-2-annotate/`). These files do not cover the full vocabulary extracted from OMW — coverage varies by language. Gaps and disagreements are handled by the enrich stage.
| Language | File |
|---|---|
| English | `sources/cefr/en.json` |
| Italian | `sources/cefr/it.json` |
| Spanish | `sources/cefr/es.json` |
| German | `sources/cefr/de.json` |
| French | `sources/cefr/fr.json` |
These files are committed to git. For per-language coverage detail see `COVERAGE.md`.
### CEFR annotation and verification
CEFR levels are determined by a majority vote combining all available sources:
- The CEFR source file counts as one vote (if it has an entry for the word)
- Each LLM model run counts as one vote
The LLMs verify existing annotations as well as filling gaps — a source file entry does not automatically win. Majority vote across all sources determines the final level.
If no majority is reached, the word is flagged for manual review and excluded from the database until resolved.
## Setup
### OMW databases
Download the OMW SQLite database for each language using the `wn` Python
library:
```bash
python -m wn download omw-en:1.4
python -m wn download omw-it:1.4
python -m wn download omw-de:1.4
python -m wn download omw-es:1.4
python -m wn download omw-fr:1.4
```
The data is stored automatically at `~/.wn_data/wn.db` and is not committed
to git.
### LLM setup
See `LLM-SETUP.md`.
## Pipeline stages
The pipeline runs in five stages. Each stage is independent and can be re-run without affecting the others.
| Stage | What it does |
|---|---|
| 1. Extract | Reads OMW SQLite database, outputs one normalized JSON file covering all languages |
| 2. Annotate | Merges CEFR source files into extracted data, adds source file votes |
| 3. Enrich | Runs local LLMs in two rounds — generation then voting |
| 4. Merge | Resolves votes, derives difficulty, splits into final and flagged |
| 5. Compare | Generates COVERAGE.md with detailed quality report |
### 1. Extract
Reads the OMW SQLite database (`~/.wn_data/wn.db`) and produces a single normalized JSON file containing all synsets with their translations, glosses, and usage examples across all five languages and all parts of speech. Adjective satellites are collapsed into adjective at this stage.
**Input:** `~/.wn_data/wn.db`
**Output:** `stage-1-extract/output/omw.json`
```bash
python stage-1-extract/scripts/extract.py
```
Add `--sample` to extract 100 synsets for inspection before running the full
extraction.
Each record in the output looks like this:
```json
{
"source_id": "ili:i1",
"pos": "adjective",
"translations": {
"en": ["able"],
"it": ["abile", "intelligente", "valente", "capace"],
"es": ["capaz"],
"fr": ["comptable"]
},
"glosses": {
"en": ["(usually followed by 'to') having the necessary means or skill or know-how or authority to do something"]
},
"examples": {
"en": ["able to swim", "she was able to program her computer"]
}
}
```
Note: glosses and examples are not available for all languages. French and Spanish have no glosses or examples in the current OMW database — these will be generated by the LLM in the enrich stage. Coverage detail is in `COVERAGE.md`.
### 2. Annotate
Reads the combined OMW extract and merges CEFR source data into it. Each translation in each language is matched against the corresponding CEFR source
file by word text and part of speech. Matched translations receive a `cefr_source` vote which carries into the enrich stage. Unmatched translations proceed without a vote.
This stage also extracts native example sentences from the CEFR source files and adds them to the record alongside OMW examples, with `source: "cefr"` to distinguish them.
Words appearing in the CEFR source file multiple times with different CEFR levels are written to `conflicts.json` for manual review and excluded from voting until resolved.
**Input:** `stage-1-extract/output/omw.json` + `stage-2-annotate/sources/cefr/{lang}.json`
**Output:**
- `stage-2-annotate/output/{lang}.json` — one per language
- `stage-2-annotate/output/conflicts.json` — words with conflicting CEFR levels, collected from the source files of all languages, for review
```bash
pnpm --filter @lila/pipeline annotate
```
Each record in the output extends the OMW record with a `votes` field and any additional examples from the CEFR source file:
```json
{
"source_id": "ili:i1",
"pos": "adjective",
"translations": {
"en": ["able"],
"it": ["abile", "intelligente", "valente", "capace"],
"es": ["capaz"],
"fr": ["comptable"]
},
"glosses": {
"en": ["having the necessary means or skill to do something"]
},
"examples": {
"en": [
{ "text": "able to swim", "source": "omw" },
{ "text": "She was able to finish the task.", "source": "cefr" }
]
},
"votes": {
"en": {
"able": { "cefr_source": "B1" }
}
}
}
```
Words not present in the CEFR source file will have an empty `votes` object.
### 3. Enrich
The enrich stage runs in two rounds, both designed to execute overnight one model at a time. The llama.cpp server must be running locally before starting either round. See `LLM-SETUP.md` for setup instructions.
**Round 1 — generation**
Each model processes every word in every language one term at a time and
generates:
- A CEFR level vote for each translation
- A description for each language
- A translation for each language, only if OMW provides none
- A gloss for each language, only if OMW provides none
- Usage examples for each language, only if OMW provides none
OMW data is never duplicated — the script checks what OMW already provides before building the prompt. For translations, glosses and examples, if OMW data exists for that language the LLM skips generation entirely. This significantly reduces compute time for languages with good OMW coverage such as English.
All model-generated content is stored with an anonymised source (`model_1`, `model_2` etc.) so models cannot be biased by knowing who generated what in round 2.
**Input:** `stage-2-annotate/output/{lang}.json`
**Output:** `stage-3-enrich/output/round1/{lang}_{model}.json` per run
```bash
pnpm --filter @lila/pipeline enrich --round 1 --model {model}
```
**Compiling candidates**
Once all round 1 runs are complete, compile all generated candidates into a single structured file per language. This is the input to round 2.
**Input:** `stage-3-enrich/output/round1/{lang}_{model}.json`
**Output:** `stage-3-enrich/output/candidates/{lang}_candidates.json`
```bash
pnpm --filter @lila/pipeline enrich --compile-candidates
```
**Round 2 — voting**
Each model receives the compiled candidate list for every word and votes on:
- The best gloss candidate (if multiple exist)
- The best description candidate (if multiple exist)
- The best usage examples candidate (if multiple exist)
- A CEFR level vote for each translation
OMW data is not put to a vote — it automatically wins over any LLM-generated candidate. Round 2 only resolves conflicts between model-generated candidates. The prompt is kept small — one word at a time, a clean numbered candidate list — to fit within a limited context window.
**Input:** `stage-3-enrich/output/candidates/{lang}_candidates.json`
**Output:** `stage-3-enrich/output/round2/{lang}_{model}.json` per run
```bash
pnpm --filter @lila/pipeline enrich --round 2 --model {model}
```
**Compiling votes**
Once all round 2 runs are complete, compile all votes into a single file per language. This is the input to the merge stage.
**Input:** `stage-3-enrich/output/round2/{lang}_{model}.json`
**Output:** `stage-3-enrich/output/votes/{lang}_votes.json`
```bash
pnpm --filter @lila/pipeline enrich --compile-votes
```
Each record in the votes file looks like this:
```json
{
"source_id": "omw-en-12345",
"pos": "noun",
"translations": {
"en": [
{
"text": "dog",
"votes": { "cefr_source": "A1", "model_1": "A1", "model_2": "A1" }
},
{
"text": "canine",
"votes": { "cefr_source": "B2", "model_1": "B2", "model_2": "B1" }
}
],
"it": [
{
"text": "cane",
"votes": { "cefr_source": "A1", "model_1": "A1", "model_2": "A1" }
}
]
},
"glosses": {
"en": { "text": "a domesticated carnivorous mammal", "source": "omw" },
"fr": {
"candidates": [
{ "text": "un mammifère carnivore domestiqué", "source": "model_1" },
{ "text": "un animal domestique carnivore", "source": "model_2" }
],
"votes": { "model_1": 1, "model_2": 1 }
}
},
"examples": {
"en": [
{ "text": "the dog barked at the stranger", "source": "omw" }
],
"fr": {
"candidates": [
{ "text": "le chien a aboyé", "source": "model_1" },
{ "text": "le chien gardait la maison", "source": "model_2" }
],
"votes": { "model_1": 2, "model_2": 1 }
}
},
"descriptions": {
"en": {
"candidates": [
{ "text": "a common household pet known for loyalty", "source": "model_1" },
{ "text": "a domesticated animal and loyal companion", "source": "model_2" }
],
"votes": { "model_1": 2, "model_2": 1 }
}
}
}
```
### 4. Merge
Reads the votes file per language and resolves the final value for every field. Produces two output files per language — fully resolved records ready for seeding, and flagged records that need manual review.
**Merge rules:**
- OMW data wins automatically and is never overridden
- For CEFR levels: the level with the most votes wins. If no majority is reached, that translation is flagged
- For LLM-generated text fields (gloss, examples, descriptions): the candidate with the most votes wins
<!-- TODO: decide fallback strategy when no majority is reached for text fields -->
**Difficulty mapping:**
| CEFR | Difficulty |
|---|---|
| A1, A2 | easy |
| B1, B2 | intermediate |
| C1, C2 | hard |
**Input:** `stage-3-enrich/output/votes/{lang}_votes.json`
**Output:**
- `stage-4-merge/output/final/{lang}.json` — fully resolved, ready for seeding
- `stage-4-merge/output/flagged/{lang}.json` — CEFR majority not reached, needs manual review before seeding
```bash
pnpm --filter @lila/pipeline merge
```
Each record in `final/{lang}.json` looks like this:
```json
{
"source_id": "omw-en-12345",
"pos": "noun",
"translations": {
"en": [
{ "text": "dog", "cefr_level": "A1", "difficulty": "easy" },
{ "text": "canine", "cefr_level": "B2", "difficulty": "intermediate" }
],
"it": [
{ "text": "cane", "cefr_level": "A1", "difficulty": "easy" }
]
},
"glosses": {
"en": { "text": "a domesticated carnivorous mammal", "source": "omw" },
"fr": { "text": "un mammifère carnivore domestiqué", "source": "model_1" }
},
"examples": {
"en": [
{ "text": "the dog barked at the stranger", "source": "omw" }
],
"fr": [
{ "text": "le chien a aboyé", "source": "model_1" }
]
},
"descriptions": {
"en": {
"text": "a common household pet known for loyalty and companionship",
"source": "model_1"
},
"it": {
"text": "un animale domestico comune noto per la sua fedeltà",
"source": "model_2"
}
}
}
```
**Resolving flagged words:**
Open `stage-4-merge/output/flagged/{lang}.json`, manually set the correct `cefr_level` and `difficulty` for each flagged translation, then move the resolved entries into `stage-4-merge/output/final/{lang}.json`. Re-run the seeder after resolving.
### 5. Compare / QA
Read-only. Generates `COVERAGE.md` with a full breakdown of the pipeline
output quality per language. Run this after merge to verify output before
seeding the database.
**Input:**
- `stage-4-merge/output/final/{lang}.json`
- `stage-4-merge/output/flagged/{lang}.json`
**Output:** `COVERAGE.md`
```bash
pnpm --filter @lila/pipeline compare
```
`COVERAGE.md` reports the following per language:
- Total synsets extracted
- Total translations per language
- POS breakdown per language — word counts for noun, verb, adjective, adverb
- CEFR coverage per language — how many translations have a resolved CEFR level, broken down by level (A1, A2, B1, B2, C1, C2)
- Difficulty breakdown per language — word counts for easy, intermediate, hard
- Flagged count per language — how many translations are awaiting manual review
- Gloss coverage per language — total glosses, broken down by source (omw vs LLM-generated) and which languages have no glosses at all
- Example coverage per language — same breakdown as glosses
- Description coverage per language — how many translations have a description, broken down by source
- CEFR source file coverage per language — how many words from the source file were matched against OMW translations
- LLM model contribution — how many CEFR votes and text candidates each anonymised model contributed
## Adding a new language
1. Add the language code to `SUPPORTED_LANGUAGE_CODES` in `packages/shared/src/constants.ts`
2. Build shared: `pnpm --filter @lila/shared build`
3. Generate and run a DB migration: `pnpm --filter @lila/db generate` then `pnpm --filter @lila/db migrate`
4. Download the OMW lexicon for the language using the `wn` Python library
5. Add a CEFR source file at `stage-2-annotate/sources/cefr/{lang}.json`
6. Run the full pipeline
## Constants and constraints
These values are defined in `packages/shared/src/constants.ts` and enforced by database check constraints. The pipeline filters out any entries that violate them.
| Constant | Values |
|---|---|
| Languages | `en`, `it`, `de`, `es`, `fr` |
| Parts of speech | `noun`, `verb`, `adjective`, `adverb` |
| CEFR levels | `A1`, `A2`, `B1`, `B2`, `C1`, `C2` |
| Difficulty | `easy`, `intermediate`, `hard` |
Adding a new value to any of these requires a constants update and a database migration before re-running the pipeline. See **Adding a new language** for the full steps — the same process applies for new parts of speech.
## Further extensions
These are not part of the current pipeline but are worth considering as the
dataset matures:
- **Grammatical gender and articles** — Wiktionary dumps contain gender and
article data for nouns across all supported languages. Could be extracted
and stored as a new `translation_forms` table.
- **Conjugations** — Wiktionary also carries verb conjugation tables. Useful
for a future grammar-focused quiz mode.
- **IPA pronunciations** — Wiktionary and Forvo are potential sources for
phonetic transcriptions per language.
- **TTS audio files** — Generate pronunciation audio for each translation
using a local or cloud TTS engine. Stored as static files, served alongside
the quiz UI.
- **Images** — Associate an image with each synset to support visual
vocabulary learning. Could be sourced from open image datasets like
ImageNet or WikiMedia Commons.
- **Frequency data** — Word frequency rankings per language from sources like
the Google Ngram dataset. Useful for smarter difficulty calibration beyond
CEFR levels alone.
- **Improved CEFR source files** — See note at the top of this document.
UniversalCEFR and CEFR-J are good starting points.
- **Additional languages** — The pipeline is language-agnostic. Adding a new
language requires an OMW lexicon, a CEFR source file, and a constants
update. See **Adding a new language**.

View file

@ -225,9 +225,59 @@ Host git.lilastudy.com
This allows standard git commands without specifying the port.
## CI/CD Pipeline
Automated build and deploy via Forgejo Actions. On every push to `main`, the pipeline builds ARM64 images natively on the VPS, pushes them to the Forgejo registry, and restarts the app containers.
### Components
- **Forgejo Actions** — enabled by default, workflow files in `.forgejo/workflows/`
- **Forgejo Runner** — runs as a container (`lila-ci-runner`) on the VPS, uses the host's Docker socket to build images natively on ARM64
- **Workflow file** — `.forgejo/workflows/deploy.yml`
### Pipeline Steps
1. Install Docker CLI and SSH client in the job container
2. Checkout the repository
3. Login to the Forgejo container registry
4. Build API image (target: `runner`)
5. Build Web image (target: `production`, with `VITE_API_URL` baked in)
6. Push both images to `git.lilastudy.com`
7. SSH into the VPS, pull new images, restart `api` and `web` containers, prune old images
### Secrets (stored in Forgejo repo settings → Actions → Secrets)
| Secret | Value |
|---|---|
| REGISTRY_USER | Forgejo username |
| REGISTRY_PASSWORD | Forgejo password |
| SSH_PRIVATE_KEY | Contents of `~/.ssh/ci-runner` on the VPS |
| SSH_HOST | VPS IP address |
| SSH_USER | `lila` |
### Runner Configuration
The runner config is at `/data/config.yml` inside the `lila-ci-runner` container. Key settings:
- `docker_host: "automount"` — mounts the host Docker socket into job containers
- `valid_volumes: ["/var/run/docker.sock"]` — allows the socket mount
- `privileged: true` — required for Docker access from job containers
- `options: "--group-add 989"` — adds the host's docker group (GID 989) to job containers
The runner command must explicitly reference the config file:
```yaml
command: '/bin/sh -c "sleep 5; forgejo-runner -c /data/config.yml daemon"'
```
### Deploy Cycle
Push to main → pipeline runs automatically (~2-5 min) → app is updated. No manual steps required.
To manually trigger a re-run: go to the repo's Actions tab, click on the latest run, and use the re-run button.
## Known Issues and Future Work
- **CI/CD**: Currently manual build-push-pull cycle. Plan: Forgejo Actions with a runner on the VPS building ARM images natively (eliminates QEMU cross-compilation)
- **Backups**: Offsite backup storage (Hetzner Object Storage or similar) should be added
- **Valkey**: Not in the production stack yet. Will be added when multiplayer requires session/room state
- **Monitoring/logging**: No centralized logging or uptime monitoring configured

295
documentation/llm-setup.md Normal file
View file

@ -0,0 +1,295 @@
# LLM Setup — lila pipeline
This document covers the LLM infrastructure for stage 3 (enrich) of the lila
data pipeline. It documents the hardware constraints, supported providers,
model recommendations, and how to configure and swap providers in the test
and production scripts.
---
## Hardware (dev machine)
| Component | Spec |
|---|---|
| CPU | Intel Core i7-6500U (2 cores / 4 threads @ 3.10 GHz) |
| RAM | 8 GB |
| GPU | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) |
| OS | Debian GNU/Linux 13 (trixie) x86_64 |
**Local inference verdict:** viable for small/quantized models, not for
production runs. See the [Local inference](#local-inference-llamacpp) section
for details.
---
## Provider overview
The enrich script uses a single, swappable provider config. All providers
except Anthropic expose an OpenAI-compatible API, so the same client code
works across all of them — only `baseURL`, `apiKey`, and `model` change.
| Provider | Use case | Cost | Rate limits |
|---|---|---|---|
| llama.cpp (local) | Quality testing, overnight dev runs | Free (electricity) | None |
| OpenRouter (free tier) | Quality comparison, multi-model evaluation | Free | 50 req/day, 20 req/min |
| OpenRouter (paid) | Production runs if local quality insufficient | Pay-per-token | None |
| Anthropic API | Quality baseline / reference | Pay-per-token | Standard |
---
## Local inference (llama.cpp)
### Why local inference is worth testing
Time is not a constraint — the pipeline scripts are fully resumable. The
laptop can run overnight for multiple nights. The only question is output
quality, which the test script evaluates empirically.
### Hardware constraints
The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0).
llama.cpp supports Maxwell via CUDA backend but newer builds may require
the `--cuda-no-kv-offload` flag depending on the version.
llama.cpp splits model layers between GPU and CPU automatically via
`--n-gpu-layers`. You set how many layers go on the GPU; the rest run on
CPU/RAM. This means a model larger than VRAM is not a dead end — it runs
in hybrid mode, slower than full-GPU but much faster than pure CPU.
Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):
| Model size | Q4 VRAM | Mode | Est. speed |
|---|---|---|---|
| 3B | ~2.0 GB | Full GPU | ~15–20 tok/s |
| 4B | ~2.5 GB | Full GPU | ~12–18 tok/s |
| 7B | ~4.5 GB | Hybrid (~26/32 layers on GPU) | ~8–12 tok/s |
| 13B+ | ~8 GB+ | CPU-heavy hybrid | too slow |
### Recommended local models
Two candidates worth testing, covering different points on the size/quality
tradeoff:
**Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)**
- GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB)
- Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF
- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+
language support including all five pipeline languages. First candidate
to test.
**Qwen2.5 7B Instruct (Q4_K_M)**
- GGUF file: `Qwen2.5-7B-Instruct-Q4_K_M.gguf` (~4.5 GB)
- Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF
- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s.
Stronger multilingual generation than any 3–4B model. Second candidate,
for comparison against the smaller Gemma 4 E4B.
### Installation
```bash
# Install build dependencies
sudo apt install build-essential cmake git
# Clone llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
# Build with CUDA support (GTX 950M — compute 5.0)
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=50
cmake --build build --config Release -j$(nproc)
# Download model (example — adjust path as needed)
mkdir -p models
wget -O models/qwen2.5-3b-instruct-q4_k_m.gguf \
https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_k_m.gguf
```
### Starting the server
**Gemma 4 E4B** (full GPU):
```bash
./build/bin/llama-server \
--model models/gemma-4-e4b-it-ud-q4_k_xl.gguf \
--port 8080 \
--ctx-size 4096 \
--n-gpu-layers 999 \
--host 127.0.0.1
```
**Qwen2.5 7B** (hybrid — tune `--n-gpu-layers` to fit your VRAM):
```bash
./build/bin/llama-server \
--model models/qwen2.5-7b-instruct-q4_k_m.gguf \
--port 8080 \
--ctx-size 4096 \
--n-gpu-layers 28 \
--host 127.0.0.1
```
`--n-gpu-layers 999` means "put everything on GPU" — llama.cpp caps at the
actual layer count automatically, so 999 is safe as a "full offload" value.
For the 7B hybrid, start with `28` and reduce by 2 if the server reports
out-of-memory at startup.
### Verify the server is running
```bash
curl http://127.0.0.1:8080/health
# Expected: {"status":"ok"}
```
---
## OpenRouter (free tier)
OpenRouter exposes all models via an OpenAI-compatible API. No code changes
are needed to switch from local llama.cpp to OpenRouter — only the config
object changes.
### Rate limits (free tier)
- **50 requests per day** (account total, not per model)
- 20 requests per minute
> **Implication for testing:** with a 10-record test set you have headroom
> to test 4–5 models per day. With a 100-record test set, plan one model per
> day.
> **Implication for production:** the free tier is not viable for 117k
> records. If local quality is insufficient, use paid OpenRouter credits or
> a dedicated provider.
### Free models recommended for this pipeline
Ranked by expected multilingual generation quality for en/it/de/fr/es:
| Model ID | Params | Notes |
|---|---|---|
| `qwen/qwen3-coder:free` | 480B MoE (35B active) | Best free option. Strong multilingual despite "coder" label. Use as quality ceiling. |
| `qwen/qwen3-next-80b-a3b-instruct:free` | 80B MoE (3B active) | Smaller Qwen, useful comparison point. |
| `nvidia/nemotron-3-super-120b-a12b:free` | 120B MoE (12B active) | 262K context, supports structured output. |
| `google/gemma-4-31b-it:free` | 31B | 140+ language support, good European language coverage. |
| `zhipuai/glm-4.5-air:free` | MoE | Multilingual-focused. |
**Skip for this pipeline:**
- Llama models — weaker European language generation than Qwen/Gemma
- Mistral free tier — requests may be used for model training
### API endpoint
```
https://openrouter.ai/api/v1/chat/completions
```
Set `Authorization: Bearer <OPENROUTER_API_KEY>` in the request headers.
---
## Provider configuration in the test script
The enrich test script reads a single config object. To switch providers,
change this object and re-run.
```typescript
// config.ts
export type ProviderConfig = {
name: string; // used for output folder naming
baseURL: string;
apiKey: string;
model: string;
maxTokens: number;
};
// Local llama.cpp
export const LOCAL_QWEN3B: ProviderConfig = {
name: "local-qwen2.5-3b",
baseURL: "http://127.0.0.1:8080/v1",
apiKey: "none", // llama.cpp ignores this
model: "qwen2.5-3b", // llama.cpp ignores model name, uses loaded model
maxTokens: 512,
};
// OpenRouter — Qwen3 480B (free)
export const OR_QWEN3_480B: ProviderConfig = {
name: "or-qwen3-480b",
baseURL: "https://openrouter.ai/api/v1",
apiKey: process.env.OPENROUTER_API_KEY!,
model: "qwen/qwen3-coder:free",
maxTokens: 512,
};
// OpenRouter — Gemma 4 31B (free)
export const OR_GEMMA4_31B: ProviderConfig = {
name: "or-gemma4-31b",
baseURL: "https://openrouter.ai/api/v1",
apiKey: process.env.OPENROUTER_API_KEY!,
model: "google/gemma-4-31b-it:free",
maxTokens: 512,
};
// Anthropic (reference baseline — different adapter required)
export const ANTHROPIC_SONNET: ProviderConfig = {
name: "anthropic-sonnet",
baseURL: "https://api.anthropic.com/v1", // adapter handles format difference
apiKey: process.env.ANTHROPIC_API_KEY!,
model: "claude-sonnet-4-6",
maxTokens: 512,
};
```
Output from each run lands in:
```
stage-3-enrich/test/output/{provider.name}/results.json
stage-3-enrich/test/output/{provider.name}/metrics.json
```
The evaluate script compares all `metrics.json` files side by side.
---
## Evaluation metrics
The test script measures the following per provider run:
| Metric | What it measures |
|---|---|
| **JSON parse rate** | % of responses that are valid, schema-compliant JSON. Critical — a failed parse is a wasted call. Target: >97% |
| **Field coverage** | % of records where all required fields are present (cefr votes for all translations, descriptions for all languages, glosses/examples for fr/es) |
| **CEFR agreement** | For records that have a `cefr_source` vote, % where the model agrees. Measures calibration. |
| **Language correctness** | Manual spot-check only — automated detection not reliable enough |
| **Tokens/second** | Local only. Indicates overnight run feasibility |
### Decision thresholds
| Metric | Threshold | Action if below |
|---|---|---|
| JSON parse rate | < 97% | Do not use this model for production |
| Field coverage | < 95% | Prompt needs revision before production |
| CEFR agreement | < 70% | Model lacks vocabulary knowledge for this task |
---
## Recommended test sequence
1. **Start local, minimal dataset (5–10 records)**
Install llama.cpp, run Qwen2.5 3B against 5–10 hand-picked records.
Verify the server works, the output parses, and the model produces
something reasonable. This is purely a smoke test.
2. **Expand local to full 100-record sample**
Once the pipeline is confirmed working, run all 100 records locally.
Collect metrics. This is your local quality baseline.
3. **Run the same 100 records through OpenRouter free models**
One model per day (50 req/day limit). Start with `qwen/qwen3-coder:free`
as the quality ceiling.
4. **Compare metrics side by side**
If local 3B is within acceptable range of the cloud models on CEFR
agreement and field coverage, proceed with local overnight runs for
production. If not, use the cloud model that passed.
5. **Production run**
Full 117k records. Resume-safe — the script checkpoints after each
record so overnight runs can be stopped and continued.

View file

@ -2,6 +2,7 @@
## tasks
- put users in separate db
- pinning dependencies in package.json files
- rethink organisation of datafiles and wordlists
- admin dashboard for user management, also overview of words and languages and all their stats
@ -29,6 +30,18 @@ laptop: verify if docker containers run on startup (they shouldnt)
### vps setup
- monitoring and logging (e.g. via chkrootkit or rkhunter, logwatch/monit => daily mails with summary)
- ~~keep the vps clean (e.g. old docker images/containers)~~ ✅ CI/CD pipeline runs `docker image prune -f` after deploy
### ~~cd/ci pipeline~~ ✅ RESOLVED
Forgejo Actions with runner on VPS, Forgejo built-in container registry. See `deployment.md`.
### ~~postgres backups~~ ✅ RESOLVED
Daily pg_dump cron job, 7-day retention, dev laptop auto-sync via rsync. See `deployment.md`.
### try now option

View file

@ -287,6 +287,17 @@ After completing a task: share the code, ask what to refactor and why. The LLM s
## 11. Post-MVP Ladder
<<<<<<< HEAD
| Phase | What it adds | Status |
| ----------------- | ------------------------------------------------------------------------------- | ------ |
| Auth | Better Auth (Google + GitHub), embedded in Express API, user rows in DB | ✅ |
| Deployment | Docker Compose, Caddy, Forgejo, CI/CD, Hetzner VPS | ✅ |
| Hardening (partial) | CI/CD pipeline, DB backups | ✅ |
| User Stats | Games played, score history, profile page | ❌ |
| Multiplayer Lobby | Room creation, join by code, WebSocket connection | ❌ |
| Multiplayer Game | Simultaneous answers, server timer, live scores, winner screen | ❌ |
| Hardening (rest) | Rate limiting, error boundaries, monitoring, accessibility | ❌ |
=======
| Phase | What it adds | Status |
| ------------------- | ----------------------------------------------------------------------- | ------ |
| Auth | Better Auth (Google + GitHub), embedded in Express API, user rows in DB | ✅ |
@ -296,6 +307,7 @@ After completing a task: share the code, ask what to refactor and why. The LLM s
| Multiplayer Lobby | Room creation, join by code, WebSocket connection | ✅ |
| Multiplayer Game | Simultaneous answers, server timer, live scores, winner screen | ✅ |
| Hardening (rest) | Rate limiting, error boundaries, monitoring, accessibility | ❌ |
>>>>>>> dev
### Future Data Model Extensions (deferred, additive)

View file

@ -14,6 +14,7 @@ export default defineConfig([
"**/*.config.ts",
"routeTree.gen.ts",
"scripts/**",
"data-pipeline/**/*",
]),
eslint.configs.recommended,

View file

@ -11,5 +11,5 @@ export default defineConfig({
out: "./drizzle",
schema: "./src/db/schema.ts",
dialect: "postgresql",
dbCredentials: { url: process.env["DATABASE_URL"]! },
dbCredentials: { url: process.env["DATABASE_URL_LOCAL"]! },
});

View file

@ -0,0 +1,12 @@
ALTER TABLE "decks" DROP CONSTRAINT "source_language_check";--> statement-breakpoint
ALTER TABLE "decks" DROP CONSTRAINT "validated_languages_check";--> statement-breakpoint
ALTER TABLE "term_glosses" DROP CONSTRAINT "language_code_check";--> statement-breakpoint
ALTER TABLE "terms" DROP CONSTRAINT "pos_check";--> statement-breakpoint
ALTER TABLE "translations" DROP CONSTRAINT "language_code_check";--> statement-breakpoint
ALTER TABLE "lobbies" ALTER COLUMN "status" SET DEFAULT 'waiting';--> statement-breakpoint
ALTER TABLE "term_glosses" ADD COLUMN "description" text;--> statement-breakpoint
ALTER TABLE "decks" ADD CONSTRAINT "source_language_check" CHECK ("decks"."source_language" IN ('en', 'it', 'de', 'fr', 'es'));--> statement-breakpoint
ALTER TABLE "decks" ADD CONSTRAINT "validated_languages_check" CHECK (validated_languages <@ ARRAY['en', 'it', 'de', 'fr', 'es']::varchar[]);--> statement-breakpoint
ALTER TABLE "term_glosses" ADD CONSTRAINT "language_code_check" CHECK ("term_glosses"."language_code" IN ('en', 'it', 'de', 'fr', 'es'));--> statement-breakpoint
ALTER TABLE "terms" ADD CONSTRAINT "pos_check" CHECK ("terms"."pos" IN ('noun', 'verb', 'adjective', 'adverb'));--> statement-breakpoint
ALTER TABLE "translations" ADD CONSTRAINT "language_code_check" CHECK ("translations"."language_code" IN ('en', 'it', 'de', 'fr', 'es'));

View file

@ -0,0 +1,12 @@
CREATE TABLE "term_examples" (
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
"term_id" uuid NOT NULL,
"language_code" varchar(10) NOT NULL,
"text" text NOT NULL,
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
CONSTRAINT "unique_term_example" UNIQUE("term_id","language_code","text"),
CONSTRAINT "language_code_check" CHECK ("term_examples"."language_code" IN ('en', 'it', 'de', 'fr', 'es'))
);
--> statement-breakpoint
ALTER TABLE "term_examples" ADD CONSTRAINT "term_examples_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
CREATE INDEX "idx_term_examples_term_id" ON "term_examples" USING btree ("term_id","language_code");

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -50,6 +50,20 @@
"when": 1776270391189,
"tag": "0006_certain_adam_destine",
"breakpoints": true
},
{
"idx": 7,
"version": "7",
"when": 1776665111607,
"tag": "0007_nosy_leper_queen",
"breakpoints": true
},
{
"idx": 8,
"version": "7",
"when": 1776695279870,
"tag": "0008_far_energizer",
"breakpoints": true
}
]
}
}

View file

@ -6,9 +6,7 @@
"scripts": {
"build": "tsc",
"generate": "drizzle-kit generate",
"migrate": "drizzle-kit migrate",
"db:seed": "npx tsx src/seeding-datafiles.ts",
"db:build-deck": "npx tsx src/generating-deck.ts"
"migrate": "drizzle-kit migrate"
},
"dependencies": {
"@lila/shared": "workspace:*",

View file

@ -1,183 +0,0 @@
/*
This script performs a cross-reference check between two specific data sets:
- The "Target" List: It reads the {language}-merged.json file (e.g., en-merged.json). This represents the vocabulary you want to have CEFR levels for.
- The "Source of Truth": It queries your Database (translations table). This represents the vocabulary you currently have in your app.
What it calculates:
It tells you: "Of all the words in my merged JSON file, how many actually exist in my database?"
Matched: The word from the JSON file was found in the DB. (Ready for enrichment).
Unmatched: The word from the JSON file was not found in the DB. (These will be skipped during enrichment).
*/
import fs from "node:fs/promises";
import { eq } from "drizzle-orm";
import {
SUPPORTED_LANGUAGE_CODES,
SUPPORTED_POS,
CEFR_LEVELS,
DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { terms, translations } from "@lila/db/schema";
type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type CEFRLevel = (typeof CEFR_LEVELS)[number];
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
type MergedRecord = {
word: string;
pos: POS;
cefr: CEFRLevel;
difficulty: Difficulty;
sources: string[];
};
type CoverageStats = {
total: number;
matched: number;
unmatched: number;
byCefr: Record<CEFRLevel, { total: number; matched: number }>;
byDifficulty: Record<Difficulty, { total: number; matched: number }>;
unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
};
const dataDir = "./src/data/";
/**
 * Formats `part / whole` as a percentage string with one decimal place.
 *
 * Returns "0.0" when `whole` is 0 so that an empty merged file is reported
 * as "0.0%" instead of "NaN%".
 */
const formatCoveragePercent = (part: number, whole: number): string =>
  whole === 0 ? "0.0" : ((part / whole) * 100).toFixed(1);

/**
 * Prints a coverage report for one language.
 *
 * Reads `{language}-merged.json` from `dataDir`, loads every translation of
 * that language (joined with its term's POS) from the database, and counts
 * how many merged records have a matching lowercased `word|pos` pair in the
 * database. Totals are broken down by CEFR level and by difficulty, and a
 * sample of unmatched words is printed.
 *
 * If the file cannot be read, is not valid JSON, or does not contain a JSON
 * array, a warning is logged and the language is skipped.
 *
 * @param language - Language code whose merged file and translations are checked.
 */
async function checkCoverage(language: LanguageCode): Promise<void> {
  // Maximum number of unmatched words kept for the sample printout.
  const SAMPLE_LIMIT = 20;

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;
  console.log(`\n📄 Checking ${filename}...`);

  // Load merged data
  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    const parsed: unknown = JSON.parse(raw);
    // JSON.parse accepts any JSON value; everything below assumes an array.
    if (!Array.isArray(parsed)) {
      throw new Error("expected a JSON array of records");
    }
    records = parsed as MergedRecord[];
  } catch (e) {
    console.warn(` ⚠️ Could not read file: ${(e as Error).message}`);
    return;
  }
  console.log(` Loaded ${records.length.toLocaleString("en-US")} entries`);

  // Initialize stats with a zeroed bucket for every known level
  const stats: CoverageStats = {
    total: records.length,
    matched: 0,
    unmatched: 0,
    byCefr: {} as Record<CEFRLevel, { total: number; matched: number }>,
    byDifficulty: {} as Record<Difficulty, { total: number; matched: number }>,
    unmatchedWords: [],
  };
  for (const level of CEFR_LEVELS) {
    stats.byCefr[level] = { total: 0, matched: 0 };
  }
  for (const diff of DIFFICULTY_LEVELS) {
    stats.byDifficulty[diff] = { total: 0, matched: 0 };
  }

  // ── BATCHED LOOKUP: one query for the whole language instead of one per record ──
  console.log(` 🔍 Querying database for existing translations...`);
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // Set of "word|pos" keys for constant-time membership checks
  const existingSet = new Set(
    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
  );

  // ── Process records against the in-memory Set ──
  for (const record of records) {
    stats.byCefr[record.cefr].total++;
    stats.byDifficulty[record.difficulty].total++;

    const key = `${record.word.toLowerCase()}|${record.pos}`;
    if (existingSet.has(key)) {
      stats.matched++;
      stats.byCefr[record.cefr].matched++;
      stats.byDifficulty[record.difficulty].matched++;
    } else {
      stats.unmatched++;
      // Keep only a bounded sample; the full count lives in stats.unmatched.
      if (stats.unmatchedWords.length < SAMPLE_LIMIT) {
        stats.unmatchedWords.push({
          word: record.word,
          pos: record.pos,
          cefr: record.cefr,
        });
      }
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(` Total entries: ${stats.total.toLocaleString("en-US")}`);
  console.log(
    ` Matched in DB: ${stats.matched.toLocaleString("en-US")} (${formatCoveragePercent(stats.matched, stats.total)}%)`,
  );
  console.log(
    ` Unmatched: ${stats.unmatched.toLocaleString("en-US")} (${formatCoveragePercent(stats.unmatched, stats.total)}%)`,
  );

  console.log(`\n By CEFR level:`);
  for (const level of CEFR_LEVELS) {
    const { total, matched } = stats.byCefr[level];
    if (total > 0) {
      const pct = formatCoveragePercent(matched, total);
      console.log(
        ` ${level}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct}%)`,
      );
    }
  }

  console.log(`\n By difficulty:`);
  for (const diff of DIFFICULTY_LEVELS) {
    const { total, matched } = stats.byDifficulty[diff];
    if (total > 0) {
      const pct = formatCoveragePercent(matched, total);
      console.log(
        ` ${diff}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct}%)`,
      );
    }
  }

  if (stats.unmatchedWords.length > 0) {
    console.log(`\n⚠ Sample unmatched words (first ${SAMPLE_LIMIT}):`);
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (stats.unmatched > SAMPLE_LIMIT) {
      console.log(` ... and ${stats.unmatched - SAMPLE_LIMIT} more`);
    }
  }
}
/**
 * Script entry point: prints a banner, runs the coverage check for every
 * supported language in sequence, then prints a closing banner.
 */
const main = async () => {
  const rule = "##########################################";

  console.log(rule);
  console.log("lila — CEFR Coverage Check");
  console.log(rule);

  // Languages are processed one at a time so their reports do not interleave.
  for (const languageCode of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(languageCode);
  }

  console.log(`\n${rule}`);
  console.log("Done");
  console.log(rule);
};

// Any unhandled failure is logged and reported through a non-zero exit code.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -51,6 +51,7 @@ export const term_glosses = pgTable(
.references(() => terms.id, { onDelete: "cascade" }),
language_code: varchar({ length: 10 }).notNull(),
text: text().notNull(),
description: text(),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
},
(table) => [
@ -62,6 +63,31 @@ export const term_glosses = pgTable(
],
);
// Table "term_examples": example texts attached to a term, stored per language.
// Rows reference terms.id and are removed together with their term
// (onDelete: "cascade").
export const term_examples = pgTable(
"term_examples",
{
id: uuid().primaryKey().defaultRandom(),
term_id: uuid()
.notNull()
.references(() => terms.id, { onDelete: "cascade" }),
language_code: varchar({ length: 10 }).notNull(),
text: text().notNull(),
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
},
(table) => [
// The same text may be stored only once per (term, language).
unique("unique_term_example").on(
table.term_id,
table.language_code,
table.text,
),
// Restricts language_code to the values in SUPPORTED_LANGUAGE_CODES. The list
// is inlined via sql.raw as quoted literals, so the constraint text depends on
// the constant's contents at the time the schema is generated.
check(
"language_code_check",
sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
),
// Index over (term_id, language_code).
// NOTE(review): the unique constraint above starts with the same two columns —
// confirm whether this separate index is still needed.
index("idx_term_examples_term_id").on(table.term_id, table.language_code),
],
);
export const translations = pgTable(
"translations",
{

View file

@ -1,211 +0,0 @@
import fs from "node:fs/promises";
import { db } from "@lila/db";
import { translations, terms, decks, deck_terms } from "@lila/db/schema";
import { inArray, and, eq, ne, countDistinct } from "drizzle-orm";
type DbOrTx = Parameters<Parameters<typeof db.transaction>[0]>[0];
// Static settings for this seeding script. `as const` keeps every value as a
// literal type.
const config = {
// Path of the word list to read; readWordList splits it on newlines.
pathToWordlist: "./src/data/wordlists/top1000englishnouns",
// Name and description used to find or create the deck row.
deckName: "top english nouns",
deckDescription: "Most frequently used English nouns for vocabulary practice",
// Language and part of speech used to filter translations/terms when
// resolving words to term IDs.
sourceLanguage: "en",
sourcePOS: "noun",
} as const;
/**
 * Reads the word list at `config.pathToWordlist` and returns its entries.
 *
 * Every line is trimmed and lower-cased; empty lines are dropped and
 * duplicates are removed while keeping first-seen order.
 */
const readWordList = async () => {
  const contents = await fs.readFile(config.pathToWordlist, "utf8");

  const seen = new Set<string>();
  for (const line of contents.split("\n")) {
    const word = line.trim().toLowerCase();
    if (word) seen.add(word);
  }

  return Array.from(seen);
};
/**
 * Resolves words to term IDs.
 *
 * Only translations whose text is one of `words`, whose language is
 * `config.sourceLanguage`, and whose term has POS `config.sourcePOS` are
 * considered.
 *
 * @param words Words to look up.
 * @returns `termIds` — the distinct term IDs that matched;
 *          `missingWords` — the input words for which no row was found.
 */
const resolveSourceTerms = async (words: string[]) => {
  const textMatches = inArray(translations.text, words);
  const languageMatches = eq(translations.language_code, config.sourceLanguage);
  const posMatches = eq(terms.pos, config.sourcePOS);

  const matches = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(and(textMatches, languageMatches, posMatches));

  // Group the matched term IDs under the lower-cased word they belong to.
  const idsByWord = new Map<string, string[]>();
  for (const { text, termId } of matches) {
    const key = text.toLowerCase();
    const bucket = idsByWord.get(key);
    if (bucket) {
      bucket.push(termId);
    } else {
      idsByWord.set(key, [termId]);
    }
  }

  // Several words can point at the same term (e.g. synonyms), so collapse
  // repeated IDs after flattening the per-word groups.
  const termIds = [...new Set([...idsByWord.values()].flat())];
  const missingWords = words.filter((candidate) => !idsByWord.has(candidate));

  return { termIds, missingWords };
};
/**
 * Writes the unmatched words, one per line, to a file next to the word list
 * (the word list path with "-missing" appended).
 *
 * @param missingWords Words that could not be resolved.
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const target = `${config.pathToWordlist}-missing`;
  const body = missingWords.join("\n");
  await fs.writeFile(target, body, "utf8");
};
/**
 * Measures, per non-source language, how many of the given terms have at
 * least one translation.
 *
 * @param sourceLanguage Language code excluded from the breakdown.
 * @param termIds        Term IDs the deck is built from.
 * @returns `coverage` — one row per language with its distinct covered-term
 *          count; `validatedLanguages` — the languages whose count equals
 *          `termIds.length`.
 */
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  const inScope = and(
    inArray(translations.term_id, termIds),
    ne(translations.language_code, sourceLanguage),
  );

  const coverage = await db
    .select({
      language: translations.language_code,
      coveredCount: countDistinct(translations.term_id),
    })
    .from(translations)
    .where(inScope)
    .groupBy(translations.language_code);

  // A language counts as validated only when every term is covered.
  const validatedLanguages: (typeof coverage)[number]["language"][] = [];
  for (const { language, coveredCount } of coverage) {
    if (Number(coveredCount) === termIds.length) {
      validatedLanguages.push(language);
    }
  }

  return { coverage, validatedLanguages };
};
/**
 * Looks for a deck whose name and source language match `config`.
 *
 * @param tx Transaction (or db handle) to query through.
 * @returns The first matching deck's id and validated languages, or `null`
 *          when no deck matches.
 */
const findExistingDeck = async (tx: DbOrTx) => {
  const sameDeck = and(
    eq(decks.name, config.deckName),
    eq(decks.source_language, config.sourceLanguage),
  );

  const [firstMatch] = await tx
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(sameDeck);

  return firstMatch ?? null;
};
/**
 * Inserts a new "core" deck described by `config`.
 *
 * @param tx                 Transaction (or db handle) to insert through.
 * @param validatedLanguages Languages stored as the deck's validated set.
 * @returns The id of the created deck.
 * @throws Error when the insert returns no row.
 */
const createDeck = async (tx: DbOrTx, validatedLanguages: string[]) => {
  const [created] = await tx
    .insert(decks)
    .values({
      name: config.deckName,
      description: config.deckDescription,
      source_language: config.sourceLanguage,
      validated_languages: validatedLanguages,
      type: "core",
    })
    .returning({ id: decks.id });

  if (created === undefined) {
    throw new Error("Failed to create deck: no row returned");
  }
  return created.id;
};
/**
 * Links the given terms to a deck, skipping links that already exist.
 *
 * @param tx      Transaction (or db handle) to insert through.
 * @param deckId  Deck receiving the terms.
 * @param termIds Term IDs to link; an empty list is a no-op.
 * @returns The number of links actually inserted. Rows skipped by
 *          `onConflictDoNothing()` are not counted, so callers can derive
 *          "already present" as `termIds.length - result`.
 */
const addTermsToDeck = async (
  tx: DbOrTx,
  deckId: string,
  termIds: string[],
): Promise<number> => {
  if (termIds.length === 0) return 0;

  // `.returning()` yields only the rows that were really inserted; conflicting
  // rows produce nothing, which is what makes the count accurate on re-runs.
  const inserted = await tx
    .insert(deck_terms)
    .values(termIds.map((termId) => ({ deck_id: deckId, term_id: termId })))
    .onConflictDoNothing()
    .returning({ termId: deck_terms.term_id });

  return inserted.length;
};
/**
 * Overwrites the validated-languages list of one deck.
 *
 * @param tx                 Transaction (or db handle) to update through.
 * @param deckId             Deck to update.
 * @param validatedLanguages New value for `validated_languages`.
 */
const updateValidatedLanguages = async (
  tx: DbOrTx,
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  const targetDeck = eq(decks.id, deckId);
  await tx
    .update(decks)
    .set({ validated_languages: validatedLanguages })
    .where(targetDeck);
};
// Script entry point. Steps, in order:
//   1. read the word list,
//   2. resolve the words to term IDs and write the unresolved ones to a file,
//   3. compute per-language coverage for the resolved terms,
//   4. inside one transaction: find or create the deck, add the terms, and
//      update the deck's validated languages when they differ,
//   5. print a summary.
const main = async () => {
console.log("📖 Reading word list...");
const sourceWords = await readWordList();
console.log(` ${sourceWords.length} words loaded\n`);
console.log("🔍 Checking against database...");
const { termIds, missingWords } = await resolveSourceTerms(sourceWords);
console.log(` ${termIds.length} terms found`);
console.log(` ${missingWords.length} words not found in DB\n`);
console.log("🖊️ Writing missing words to file...\n");
await writeMissingWordsToFile(missingWords);
console.log("✅ Validating languages...");
const { coverage, validatedLanguages } = await validateLanguages(
config.sourceLanguage,
termIds,
);
console.log(
` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
);
console.log("🔬 Language coverage breakdown...");
for (const row of coverage) {
console.log(
` ${row.language}: ${row.coveredCount} / ${termIds.length} terms covered`,
);
}
console.log("🃏 Looking for existing deck...");
// Deck lookup/creation, term linking and the language update share one
// transaction.
const addedCount = await db.transaction(async (tx) => {
const existingDeck = await findExistingDeck(tx);
const deckId = existingDeck
? existingDeck.id
: await createDeck(tx, validatedLanguages);
const addedCount = await addTermsToDeck(tx, deckId, termIds);
// For a deck created just above, existingDeck is null, so the comparison
// below runs against an empty list.
const currentLanguages = existingDeck?.validatedForLanguages ?? [];
// Order-insensitive comparison: both lists are copied and sorted before
// being serialized.
const hasChanged =
JSON.stringify([...currentLanguages].sort()) !==
JSON.stringify([...validatedLanguages].sort());
if (hasChanged) {
await updateValidatedLanguages(tx, deckId, validatedLanguages);
}
return addedCount;
});
// Derived from the value addTermsToDeck reported; it is only meaningful if
// that value reflects rows actually inserted.
const alreadyPresentCount = termIds.length - addedCount;
console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
console.log("📊 Summary");
console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
console.log(` Words loaded from wordlist : ${sourceWords.length}`);
console.log(
` Words matched in DB : ${sourceWords.length - missingWords.length}`,
);
console.log(` Words not found in DB : ${missingWords.length}`);
console.log(` Term IDs resolved : ${termIds.length}`);
console.log(` Terms added to deck : ${addedCount}`);
console.log(` Terms already in deck : ${alreadyPresentCount}`);
console.log(
` Validated languages : ${validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none"}`,
);
console.log("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
};
// Log any failure and exit with a non-zero status.
main().catch((error) => {
console.error(error);
process.exit(1);
});

View file

@ -1,148 +0,0 @@
import fs from "node:fs/promises";
import { eq, inArray } from "drizzle-orm";
import {
SUPPORTED_LANGUAGE_CODES,
SUPPORTED_POS,
CEFR_LEVELS,
DIFFICULTY_LEVELS,
} from "@lila/shared";
import { db } from "@lila/db";
import { translations, terms } from "@lila/db/schema";
type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type CEFRLevel = (typeof CEFR_LEVELS)[number];
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
type MergedRecord = {
word: string;
pos: POS;
cefr: CEFRLevel;
difficulty: Difficulty;
sources: string[];
};
const dataDir = "./src/data/";
const BATCH_SIZE = 500;
// ────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────
/**
 * Splits `arr` into consecutive slices of at most `size` elements.
 * The last slice may be shorter; an empty input yields an empty result.
 *
 * @param arr  Items to split (not modified).
 * @param size Maximum slice length; must be a positive integer.
 * @returns The slices, in input order.
 * @throws RangeError when `size` is not a positive integer. Without this
 *         check a size of 0 or less never advances the loop, and NaN ends it
 *         after one empty slice.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
  return out;
}
/**
 * Formats a number for log output using en-US conventions
 * (for example 1234567 becomes "1,234,567").
 */
function fmt(n: number): string {
  const enUS = new Intl.NumberFormat("en-US");
  return enUS.format(n);
}
// ────────────────────────────────────────────────────────────
// Enrichment per language
// ────────────────────────────────────────────────────────────
/**
 * Applies CEFR level and difficulty from `<language>-merged.json` (read from
 * `dataDir`) to the matching translations of that language in the database.
 *
 * Records are matched to translations by lower-cased word text plus POS.
 * If the file cannot be read or parsed, a warning is printed and the language
 * is skipped.
 *
 * @param language Language code whose merged file is processed.
 */
async function enrichLanguage(language: LanguageCode): Promise<void> {
const filename = `${language}-merged.json`;
const filepath = dataDir + filename;
console.log(`\n📝 Enriching ${filename}...`);
let records: MergedRecord[];
try {
const raw = await fs.readFile(filepath, "utf8");
// The parsed JSON is cast, not validated.
records = JSON.parse(raw) as MergedRecord[];
} catch (e) {
// Read and parse errors both end up here; the language is skipped.
console.warn(` ⚠️ Could not read file: ${(e as Error).message}`);
return;
}
console.log(` Loaded ${fmt(records.length)} entries`);
// 1. Bulk fetch existing translations for this language
console.log(` 🔍 Fetching existing translations from DB...`);
const existingTranslations = await db
.select({ id: translations.id, text: translations.text, pos: terms.pos })
.from(translations)
.innerJoin(terms, eq(translations.term_id, terms.id))
.where(eq(translations.language_code, language));
// 2. Build lookup map: "lowercase_word|pos" -> translation IDs
//    (one key can hold several IDs when the same word/POS appears on
//    more than one translation row)
const translationMap = new Map<string, string[]>();
for (const t of existingTranslations) {
const key = `${t.text.toLowerCase()}|${t.pos}`;
if (!translationMap.has(key)) translationMap.set(key, []);
translationMap.get(key)!.push(t.id);
}
// 3. Match records to DB IDs and group by target (cefr, difficulty)
//    so each distinct value pair needs only one UPDATE per ID batch
const updatesByValue = new Map<string, string[]>();
const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
for (const rec of records) {
const key = `${rec.word.toLowerCase()}|${rec.pos}`;
const ids = translationMap.get(key);
if (ids && ids.length > 0) {
const valueKey = `${rec.cefr}|${rec.difficulty}`;
if (!updatesByValue.has(valueKey)) updatesByValue.set(valueKey, []);
updatesByValue.get(valueKey)!.push(...ids);
} else {
unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
}
}
// 4. Batch updates grouped by (cefr, difficulty)
// NOTE(review): IDs are de-duplicated only within one (cefr, difficulty)
// group. An ID reached by records with different values is updated once per
// group (the group processed last wins) and counted each time in totalUpdated.
let totalUpdated = 0;
for (const [valueKey, ids] of updatesByValue.entries()) {
const [cefr, difficulty] = valueKey.split("|") as [CEFRLevel, Difficulty];
const uniqueIds = [...new Set(ids)]; // Deduplicate synonyms/duplicates
for (const idBatch of chunk(uniqueIds, BATCH_SIZE)) {
await db
.update(translations)
.set({ cefr_level: cefr, difficulty })
.where(inArray(translations.id, idBatch));
// Counts IDs sent to the UPDATE, not rows reported back by the database.
totalUpdated += idBatch.length;
}
}
// 5. Summary
console.log(`\n ✅ Updated ${fmt(totalUpdated)} translations`);
console.log(` ⚠️ Unmatched: ${fmt(unmatchedWords.length)}`);
if (unmatchedWords.length > 0) {
console.log(`\n Sample unmatched words (first 20):`);
for (const { word, pos, cefr } of unmatchedWords.slice(0, 20)) {
console.log(` "${word}" (${pos}, ${cefr})`);
}
if (unmatchedWords.length > 20) {
console.log(` ... and ${fmt(unmatchedWords.length - 20)} more`);
}
}
}
// ────────────────────────────────────────────────────────────
// Main
// ────────────────────────────────────────────────────────────
/**
 * Script entry point for the CEFR enrichment.
 *
 * Prints an opening banner, awaits `enrichLanguage` for every entry of
 * SUPPORTED_LANGUAGE_CODES in turn, then prints a closing banner.
 */
const main = async () => {
  const rule = "##########################################";

  console.log(rule);
  console.log("lila — CEFR Enrichment");
  console.log(`${rule}\n`);

  // Each language is awaited before the next one starts.
  for (const code of SUPPORTED_LANGUAGE_CODES) {
    await enrichLanguage(code);
  }

  console.log(`\n${rule}`);
  console.log("Done");
  console.log(rule);
};

// A rejected run is logged and reported through a non-zero exit code.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});

View file

@ -1,212 +0,0 @@
import fs from "node:fs/promises";
import { and, count, eq, inArray } from "drizzle-orm";
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
import { db } from "@lila/db";
import { terms, translations, term_glosses } from "@lila/db/schema";
type POS = (typeof SUPPORTED_POS)[number];
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
type SynsetRecord = {
source_id: string;
pos: POS;
translations: Partial<Record<LanguageCode, string[]>>;
glosses: Partial<Record<LanguageCode, string[]>>;
};
const dataDir = "./src/data/";
const BATCH_SIZE = 500;
// ────────────────────────────────────────────────────────────
// Helpers
// ────────────────────────────────────────────────────────────
/**
 * Splits `arr` into consecutive slices of at most `size` elements.
 * The last slice may be shorter; an empty input yields an empty result.
 *
 * @param arr  Items to split (not modified).
 * @param size Maximum slice length; must be a positive integer.
 * @returns The slices, in input order.
 * @throws RangeError when `size` is not a positive integer. Without this
 *         check a size of 0 or less never advances the loop, and NaN ends it
 *         after one empty slice.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
  return out;
}
/**
 * Formats a number for log output using en-US conventions
 * (for example 117659 becomes "117,659").
 */
function fmt(n: number): string {
  const enUS = new Intl.NumberFormat("en-US");
  return enUS.format(n);
}
// ────────────────────────────────────────────────────────────
// Stats
// ────────────────────────────────────────────────────────────
// Run-wide counters, mutated by processBatch and printed by main.
// "inserted" counts rows returned by an insert; "skipped" counts rows that
// were submitted but not returned (onConflictDoNothing).
const stats = {
terms: { inserted: 0, skipped: 0 },
translations: { inserted: 0, skipped: 0 },
glosses: { inserted: 0, skipped: 0 },
};
// ────────────────────────────────────────────────────────────
// Per-batch processing
// ────────────────────────────────────────────────────────────
/**
 * Seeds one batch of synset records: inserts the terms, resolves their
 * database IDs, then inserts the translations and glosses that hang off them.
 * Every insert uses onConflictDoNothing, and the module-level `stats`
 * counters are updated after each insert.
 *
 * @param batch Synset records to seed.
 */
async function processBatch(batch: SynsetRecord[]): Promise<void> {
// 1. Insert terms — idempotency key: (source, source_id)
const termValues = batch.map((r) => ({
source: "omw" as const,
source_id: r.source_id,
pos: r.pos,
}));
const insertedTerms = await db
.insert(terms)
.values(termValues)
.onConflictDoNothing()
.returning({ id: terms.id });
stats.terms.inserted += insertedTerms.length;
stats.terms.skipped += batch.length - insertedTerms.length;
// 2. Resolve UUIDs for every source_id in this batch (new + pre-existing).
// We can't rely solely on the .returning() above because onConflictDoNothing
// returns nothing for rows that already existed.
const sourceIds = batch.map((r) => r.source_id);
const termRows = await db
.select({ id: terms.id, source_id: terms.source_id })
.from(terms)
.where(and(eq(terms.source, "omw"), inArray(terms.source_id, sourceIds)));
const sourceIdToTermId = new Map(termRows.map((r) => [r.source_id, r.id]));
// 3. Build and insert translation rows
//    (one row per lemma per language; records whose term ID could not be
//    resolved contribute no rows)
const translationRows = batch.flatMap((r) => {
const termId = sourceIdToTermId.get(r.source_id);
if (!termId) return [];
// NOTE(review): unlike r.glosses below, r.translations has no `?? {}`
// fallback — a record without a translations object would throw here.
return Object.entries(r.translations).flatMap(([lang, lemmas]) =>
(lemmas ?? []).map((text) => ({
term_id: termId,
language_code: lang as LanguageCode,
text,
})),
);
});
// Rows are inserted in chunks of BATCH_SIZE.
for (const tBatch of chunk(translationRows, BATCH_SIZE)) {
const inserted = await db
.insert(translations)
.values(tBatch)
.onConflictDoNothing()
.returning({ id: translations.id });
stats.translations.inserted += inserted.length;
stats.translations.skipped += tBatch.length - inserted.length;
}
// 4. Build and insert gloss rows
//    (same shape as the translation rows; a missing glosses object is
//    treated as empty)
const glossRows = batch.flatMap((r) => {
const termId = sourceIdToTermId.get(r.source_id);
if (!termId) return [];
return Object.entries(r.glosses ?? {}).flatMap(([lang, texts]) =>
(texts ?? []).map((text) => ({
term_id: termId,
language_code: lang as LanguageCode,
text,
})),
);
});
for (const gBatch of chunk(glossRows, BATCH_SIZE)) {
const inserted = await db
.insert(term_glosses)
.values(gBatch)
.onConflictDoNothing()
.returning({ id: term_glosses.id });
stats.glosses.inserted += inserted.length;
stats.glosses.skipped += gBatch.length - inserted.length;
}
}
// ────────────────────────────────────────────────────────────
// Main
// ────────────────────────────────────────────────────────────
// Script entry point for the OMW seed. For every POS in SUPPORTED_POS it reads
// `omw-<pos>.json` from dataDir, feeds the records to processBatch in chunks of
// BATCH_SIZE, then prints the insert counters and the per-language row totals
// currently in the database.
const main = async () => {
console.log("\n##########################################");
console.log("lila — OMW seed");
console.log("##########################################\n");
// One file per POS — names are derived from SUPPORTED_POS so adding a new
// constant value automatically picks up a new file on the next run.
const posToFile = Object.fromEntries(
SUPPORTED_POS.map((pos) => [pos, `omw-${pos}.json`]),
) as Record<POS, string>;
for (const pos of SUPPORTED_POS) {
const filename = posToFile[pos];
const filepath = dataDir + filename;
console.log(`📄 ${filename}`);
let records: SynsetRecord[];
try {
const raw = await fs.readFile(filepath, "utf8");
// The parsed JSON is cast, not validated.
records = JSON.parse(raw) as SynsetRecord[];
} catch (e) {
// Read and parse errors both end up here; this POS file is skipped and
// the loop continues with the next one.
console.warn(
` ⚠️ Skipping — could not read file: ${(e as Error).message}\n`,
);
continue;
}
console.log(` Loaded ${fmt(records.length)} synsets`);
const batches = chunk(records, BATCH_SIZE);
for (const [i, batch] of batches.entries()) {
// Progress every 5 000 synsets
// (every 10th batch at BATCH_SIZE = 500; printed before the batch runs,
// so `processed` is the number of synsets already handled)
if (i > 0 && i % 10 === 0) {
const processed = i * BATCH_SIZE;
console.log(`${fmt(processed)} / ${fmt(records.length)}`);
}
await processBatch(batch);
}
console.log(` ✅ Done\n`);
}
// ── Summary ───────────────────────────────────────────────
console.log("##########################################");
console.log("Summary");
console.log("##########################################\n");
// Pads labels to a fixed width so the counter columns line up.
const pad = (label: string) => label.padEnd(14);
console.log(
`${pad("Terms:")}inserted ${fmt(stats.terms.inserted)}, skipped ${fmt(stats.terms.skipped)}`,
);
console.log(
`${pad("Translations:")}inserted ${fmt(stats.translations.inserted)}, skipped ${fmt(stats.translations.skipped)}`,
);
console.log(
`${pad("Glosses:")}inserted ${fmt(stats.glosses.inserted)}, skipped ${fmt(stats.glosses.skipped)}`,
);
// Query actual DB totals — insert-based counters show 0 on re-runs.
console.log("\nCoverage per language (total in DB):");
for (const lang of SUPPORTED_LANGUAGE_CODES) {
// Two count queries per language, run one after the other.
const [tRow] = await db
.select({ n: count() })
.from(translations)
.where(eq(translations.language_code, lang));
const [gRow] = await db
.select({ n: count() })
.from(term_glosses)
.where(eq(term_glosses.language_code, lang));
console.log(
` ${lang}: ${fmt(tRow?.n ?? 0)} translations, ${fmt(gRow?.n ?? 0)} glosses`,
);
}
};
// Log any failure and exit with a non-zero status.
main().catch((err) => {
console.error(err);
process.exit(1);
});

View file

@ -5,7 +5,11 @@
"moduleResolution": "NodeNext",
"outDir": "./dist",
"resolveJsonModule": true,
"types": ["vitest/globals"]
"types": ["vitest/globals"],
},
"include": ["src", "vitest.config.ts"]
"include": [
"src",
"vitest.config.ts",
"../../data-pipeline/archive/packages-db-src-old-seeding-scripts/data",
],
}

View file

@ -1,7 +1,7 @@
export const SUPPORTED_LANGUAGE_CODES = ["en", "it"] as const;
export const SUPPORTED_LANGUAGE_CODES = ["en", "it", "de", "fr", "es"] as const;
export type SupportedLanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
export const SUPPORTED_POS = ["noun", "verb"] as const;
export const SUPPORTED_POS = ["noun", "verb", "adjective", "adverb"] as const;
export type SupportedPos = (typeof SUPPORTED_POS)[number];
export const GAME_ROUNDS = ["3", "10"] as const;

297
pnpm-lock.yaml generated
View file

@ -55,7 +55,7 @@ importers:
version: link:../../packages/shared
better-auth:
specifier: ^1.6.2
version: 1.6.2(@opentelemetry/api@1.9.1)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)))
version: 1.6.2(@opentelemetry/api@1.9.1)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)))
cors:
specifier: ^2.8.6
version: 2.8.6
@ -101,7 +101,7 @@ importers:
version: 1.166.10(@tanstack/react-router@1.168.1(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(@tanstack/router-core@1.168.1)(csstype@3.2.3)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
better-auth:
specifier: ^1.6.2
version: 1.6.2(@opentelemetry/api@1.9.1)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)))
version: 1.6.2(@opentelemetry/api@1.9.1)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)))
react:
specifier: ^19.2.4
version: 19.2.4
@ -134,6 +134,28 @@ importers:
specifier: ^8.0.1
version: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)
data-pipeline:
dependencies:
'@lila/shared':
specifier: workspace:*
version: link:../packages/shared
better-sqlite3:
specifier: ^12.9.0
version: 12.9.0
devDependencies:
'@types/better-sqlite3':
specifier: ^7.6.13
version: 7.6.13
'@types/node':
specifier: ^24.12.0
version: 24.12.0
tsx:
specifier: ^4.21.0
version: 4.21.0
typescript:
specifier: ^5.9.3
version: 5.9.3
packages/db:
dependencies:
'@lila/shared':
@ -144,7 +166,7 @@ importers:
version: 17.3.1
drizzle-orm:
specifier: ^0.45.1
version: 0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0)
version: 0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)
pg:
specifier: ^8.20.0
version: 8.20.0
@ -1243,6 +1265,9 @@ packages:
'@tybys/wasm-util@0.10.1':
resolution: {integrity: sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==}
'@types/better-sqlite3@7.6.13':
resolution: {integrity: sha512-NMv9ASNARoKksWtsq/SHakpYAYnhBrQgGD8zkLYk/jaK8jUGn08CfEdTRgYhMypUQAfzSP8W6gNLe0q19/t4VA==}
'@types/body-parser@1.19.6':
resolution: {integrity: sha512-HLFeCYgz89uk22N5Qg3dvGvsv46B8GLvKKo1zKG4NybA8U2DiEO3w9lqGg29t/tfLRJpJ6iQxnVw4OnB7MoM9g==}
@ -1491,6 +1516,9 @@ packages:
resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==}
engines: {node: 18 || 20 || >=22}
base64-js@1.5.1:
resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
baseline-browser-mapping@2.10.9:
resolution: {integrity: sha512-OZd0e2mU11ClX8+IdXe3r0dbqMEznRiT4TfbhYIbcRPZkqJ7Qwer8ij3GZAmLsRKa+II9V1v5czCkvmHH3XZBg==}
engines: {node: '>=6.0.0'}
@ -1566,6 +1594,10 @@ packages:
zod:
optional: true
better-sqlite3@12.9.0:
resolution: {integrity: sha512-wqUv4Gm3toFpHDQmaKD4QhZm3g1DjUBI0yzS4UBl6lElUmXFYdTQmmEDpAFa5o8FiFiymURypEnfVHzILKaxqQ==}
engines: {node: 20.x || 22.x || 23.x || 24.x || 25.x}
bidi-js@1.0.3:
resolution: {integrity: sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==}
@ -1573,6 +1605,12 @@ packages:
resolution: {integrity: sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==}
engines: {node: '>=8'}
bindings@1.5.0:
resolution: {integrity: sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==}
bl@4.1.0:
resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==}
body-parser@2.2.2:
resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==}
engines: {node: '>=18'}
@ -1593,6 +1631,9 @@ packages:
buffer-from@1.1.2:
resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==}
buffer@5.7.1:
resolution: {integrity: sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==}
bytes@3.1.2:
resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==}
engines: {node: '>= 0.8'}
@ -1624,6 +1665,9 @@ packages:
resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==}
engines: {node: '>= 8.10.0'}
chownr@1.1.4:
resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==}
cliui@8.0.1:
resolution: {integrity: sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==}
engines: {node: '>=12'}
@ -1716,6 +1760,14 @@ packages:
decimal.js@10.6.0:
resolution: {integrity: sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==}
decompress-response@6.0.0:
resolution: {integrity: sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==}
engines: {node: '>=10'}
deep-extend@0.6.0:
resolution: {integrity: sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==}
engines: {node: '>=4.0.0'}
deep-is@0.1.4:
resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==}
@ -1858,6 +1910,9 @@ packages:
resolution: {integrity: sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==}
engines: {node: '>= 0.8'}
end-of-stream@1.4.5:
resolution: {integrity: sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==}
enhanced-resolve@5.20.1:
resolution: {integrity: sha512-Qohcme7V1inbAfvjItgw0EaxVX5q2rdVEZHRBrEQdRZTssLDGsL8Lwrznl8oQ/6kuTJONLaDcGjkNP247XEhcA==}
engines: {node: '>=10.13.0'}
@ -1982,6 +2037,10 @@ packages:
resolution: {integrity: sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==}
engines: {node: '>= 0.6'}
expand-template@2.0.3:
resolution: {integrity: sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==}
engines: {node: '>=6'}
expect-type@1.3.0:
resolution: {integrity: sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==}
engines: {node: '>=12.0.0'}
@ -2015,6 +2074,9 @@ packages:
resolution: {integrity: sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==}
engines: {node: '>=16.0.0'}
file-uri-to-path@1.0.0:
resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==}
fill-range@7.1.1:
resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==}
engines: {node: '>=8'}
@ -2054,6 +2116,9 @@ packages:
resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==}
engines: {node: '>= 0.8'}
fs-constants@1.0.0:
resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==}
fsevents@2.3.3:
resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==}
engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0}
@ -2081,6 +2146,9 @@ packages:
get-tsconfig@4.13.6:
resolution: {integrity: sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw==}
github-from-package@0.0.0:
resolution: {integrity: sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==}
glob-parent@5.1.2:
resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==}
engines: {node: '>= 6'}
@ -2138,6 +2206,9 @@ packages:
resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==}
engines: {node: '>=0.10.0'}
ieee754@1.2.1:
resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==}
ignore@5.3.2:
resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==}
engines: {node: '>= 4'}
@ -2153,6 +2224,9 @@ packages:
inherits@2.0.4:
resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==}
ini@1.3.8:
resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==}
ipaddr.js@1.9.1:
resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==}
engines: {node: '>= 0.10'}
@ -2389,10 +2463,20 @@ packages:
engines: {node: '>=4.0.0'}
hasBin: true
mimic-response@3.1.0:
resolution: {integrity: sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==}
engines: {node: '>=10'}
minimatch@10.2.4:
resolution: {integrity: sha512-oRjTw/97aTBN0RHbYCdtF1MQfvusSIBQM0IZEgzl6426+8jSC0nF1a/GmnVLpfB9yyr6g6FTqWqiZVbxrtaCIg==}
engines: {node: 18 || 20 || >=22}
minimist@1.2.8:
resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==}
mkdirp-classic@0.5.3:
resolution: {integrity: sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==}
ms@2.1.3:
resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
@ -2405,6 +2489,9 @@ packages:
resolution: {integrity: sha512-F0wCzbsH80G7XXo0Jd9/AVQC7ouWY6idUCTnMwW5t/Rv9W8qmO6endavDwg7TNp5GbugwSukFMVZqzPSrSMndg==}
engines: {node: ^20.0.0 || >=22.0.0}
napi-build-utils@2.0.0:
resolution: {integrity: sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==}
natural-compare@1.4.0:
resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==}
@ -2412,6 +2499,10 @@ packages:
resolution: {integrity: sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==}
engines: {node: '>= 0.6'}
node-abi@3.89.0:
resolution: {integrity: sha512-6u9UwL0HlAl21+agMN3YAMXcKByMqwGx+pq+P76vii5f7hTPtKDp08/H9py6DY+cfDw7kQNTGEj/rly3IgbNQA==}
engines: {node: '>=10'}
node-releases@2.0.36:
resolution: {integrity: sha512-TdC8FSgHz8Mwtw9g5L4gR/Sh9XhSP/0DEkQxfEFXOpiul5IiHgHan2VhYYb6agDSfp4KuvltmGApc8HMgUrIkA==}
@ -2535,6 +2626,12 @@ packages:
resolution: {integrity: sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==}
engines: {node: '>=0.10.0'}
prebuild-install@7.1.3:
resolution: {integrity: sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==}
engines: {node: '>=10'}
deprecated: No longer maintained. Please contact the author of the relevant native addon; alternatives are available.
hasBin: true
prelude-ls@1.2.1:
resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==}
engines: {node: '>= 0.8.0'}
@ -2548,6 +2645,9 @@ packages:
resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==}
engines: {node: '>= 0.10'}
pump@3.0.4:
resolution: {integrity: sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA==}
punycode@2.3.1:
resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==}
engines: {node: '>=6'}
@ -2564,6 +2664,10 @@ packages:
resolution: {integrity: sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==}
engines: {node: '>= 0.10'}
rc@1.2.8:
resolution: {integrity: sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==}
hasBin: true
react-dom@19.2.4:
resolution: {integrity: sha512-AXJdLo8kgMbimY95O2aKQqsz2iWi9jMgKJhRBAxECE4IFxfcazB2LmzloIoibJI3C12IlY20+KFaLv+71bUJeQ==}
peerDependencies:
@ -2573,6 +2677,10 @@ packages:
resolution: {integrity: sha512-9nfp2hYpCwOjAN+8TZFGhtWEwgvWHXqESH8qT89AT/lWklpLON22Lc8pEtnpsZz7VmawabSU0gCjnj8aC0euHQ==}
engines: {node: '>=0.10.0'}
readable-stream@3.6.2:
resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==}
engines: {node: '>= 6'}
readdirp@3.6.0:
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
engines: {node: '>=8.10.0'}
@ -2607,6 +2715,9 @@ packages:
rxjs@7.8.2:
resolution: {integrity: sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==}
safe-buffer@5.2.1:
resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
safer-buffer@2.1.2:
resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==}
@ -2681,6 +2792,12 @@ packages:
siginfo@2.0.0:
resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==}
simple-concat@1.0.1:
resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==}
simple-get@4.0.1:
resolution: {integrity: sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==}
source-map-js@1.2.1:
resolution: {integrity: sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==}
engines: {node: '>=0.10.0'}
@ -2718,10 +2835,17 @@ packages:
resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
engines: {node: '>=8'}
string_decoder@1.3.0:
resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
strip-ansi@6.0.1:
resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==}
engines: {node: '>=8'}
strip-json-comments@2.0.1:
resolution: {integrity: sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==}
engines: {node: '>=0.10.0'}
superagent@10.3.0:
resolution: {integrity: sha512-B+4Ik7ROgVKrQsXTV0Jwp2u+PXYLSlqtDAhYnkkD+zn3yg8s/zjA2MeGayPoY/KICrbitwneDHrjSotxKL+0XQ==}
engines: {node: '>=14.18.0'}
@ -2748,6 +2872,13 @@ packages:
resolution: {integrity: sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==}
engines: {node: '>=6'}
tar-fs@2.1.4:
resolution: {integrity: sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==}
tar-stream@2.2.0:
resolution: {integrity: sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==}
engines: {node: '>=6'}
tiny-invariant@1.3.3:
resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==}
@ -2810,6 +2941,9 @@ packages:
engines: {node: '>=18.0.0'}
hasBin: true
tunnel-agent@0.6.0:
resolution: {integrity: sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==}
type-check@0.4.0:
resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==}
engines: {node: '>= 0.8.0'}
@ -2862,6 +2996,9 @@ packages:
peerDependencies:
react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
util-deprecate@1.0.2:
resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
vary@1.1.2:
resolution: {integrity: sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==}
engines: {node: '>= 0.8'}
@ -3198,12 +3335,12 @@ snapshots:
nanostores: 1.2.0
zod: 4.3.6
'@better-auth/drizzle-adapter@1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))':
'@better-auth/drizzle-adapter@1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))':
dependencies:
'@better-auth/core': 1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0)
'@better-auth/utils': 0.4.0
optionalDependencies:
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0)
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)
'@better-auth/kysely-adapter@1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(kysely@0.28.16)':
dependencies:
@ -3843,6 +3980,10 @@ snapshots:
tslib: 2.8.1
optional: true
'@types/better-sqlite3@7.6.13':
dependencies:
'@types/node': 24.12.0
'@types/body-parser@1.19.6':
dependencies:
'@types/connect': 3.4.38
@ -4160,12 +4301,14 @@ snapshots:
balanced-match@4.0.4: {}
base64-js@1.5.1: {}
baseline-browser-mapping@2.10.9: {}
better-auth@1.6.2(@opentelemetry/api@1.9.1)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0))):
better-auth@1.6.2(@opentelemetry/api@1.9.1)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0))):
dependencies:
'@better-auth/core': 1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0)
'@better-auth/drizzle-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))
'@better-auth/drizzle-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))
'@better-auth/kysely-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(kysely@0.28.16)
'@better-auth/memory-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)
'@better-auth/mongo-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)
@ -4182,8 +4325,9 @@ snapshots:
nanostores: 1.2.0
zod: 4.3.6
optionalDependencies:
better-sqlite3: 12.9.0
drizzle-kit: 0.31.10
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0)
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)
pg: 8.20.0
react: 19.2.4
react-dom: 19.2.4(react@19.2.4)
@ -4192,10 +4336,10 @@ snapshots:
- '@cloudflare/workers-types'
- '@opentelemetry/api'
better-auth@1.6.2(@opentelemetry/api@1.9.1)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0))):
better-auth@1.6.2(@opentelemetry/api@1.9.1)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0))):
dependencies:
'@better-auth/core': 1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0)
'@better-auth/drizzle-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))
'@better-auth/drizzle-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))
'@better-auth/kysely-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(kysely@0.28.16)
'@better-auth/memory-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)
'@better-auth/mongo-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)
@ -4212,8 +4356,9 @@ snapshots:
nanostores: 1.2.0
zod: 4.3.6
optionalDependencies:
better-sqlite3: 12.9.0
drizzle-kit: 0.31.10
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0)
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)
pg: 8.20.0
react: 19.2.4
react-dom: 19.2.4(react@19.2.4)
@ -4231,12 +4376,27 @@ snapshots:
optionalDependencies:
zod: 4.3.6
better-sqlite3@12.9.0:
dependencies:
bindings: 1.5.0
prebuild-install: 7.1.3
bidi-js@1.0.3:
dependencies:
require-from-string: 2.0.2
binary-extensions@2.3.0: {}
bindings@1.5.0:
dependencies:
file-uri-to-path: 1.0.0
bl@4.1.0:
dependencies:
buffer: 5.7.1
inherits: 2.0.4
readable-stream: 3.6.2
body-parser@2.2.2:
dependencies:
bytes: 3.1.2
@ -4269,6 +4429,11 @@ snapshots:
buffer-from@1.1.2: {}
buffer@5.7.1:
dependencies:
base64-js: 1.5.1
ieee754: 1.2.1
bytes@3.1.2: {}
call-bind-apply-helpers@1.0.2:
@ -4307,6 +4472,8 @@ snapshots:
optionalDependencies:
fsevents: 2.3.3
chownr@1.1.4: {}
cliui@8.0.1:
dependencies:
string-width: 4.2.3
@ -4385,6 +4552,12 @@ snapshots:
decimal.js@10.6.0: {}
decompress-response@6.0.0:
dependencies:
mimic-response: 3.1.0
deep-extend@0.6.0: {}
deep-is@0.1.4: {}
defu@6.1.7: {}
@ -4411,10 +4584,12 @@ snapshots:
esbuild: 0.25.12
tsx: 4.21.0
drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0):
drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0):
optionalDependencies:
'@opentelemetry/api': 1.9.1
'@types/better-sqlite3': 7.6.13
'@types/pg': 8.20.0
better-sqlite3: 12.9.0
kysely: 0.28.16
pg: 8.20.0
@ -4432,6 +4607,10 @@ snapshots:
encodeurl@2.0.0: {}
end-of-stream@1.4.5:
dependencies:
once: 1.4.0
enhanced-resolve@5.20.1:
dependencies:
graceful-fs: 4.2.11
@ -4638,6 +4817,8 @@ snapshots:
etag@1.8.1: {}
expand-template@2.0.3: {}
expect-type@1.3.0: {}
express@5.2.1:
@ -4689,6 +4870,8 @@ snapshots:
dependencies:
flat-cache: 4.0.1
file-uri-to-path@1.0.0: {}
fill-range@7.1.1:
dependencies:
to-regex-range: 5.0.1
@ -4736,6 +4919,8 @@ snapshots:
fresh@2.0.0: {}
fs-constants@1.0.0: {}
fsevents@2.3.3:
optional: true
@ -4767,6 +4952,8 @@ snapshots:
dependencies:
resolve-pkg-maps: 1.0.0
github-from-package@0.0.0: {}
glob-parent@5.1.2:
dependencies:
is-glob: 4.0.3
@ -4821,6 +5008,8 @@ snapshots:
dependencies:
safer-buffer: 2.1.2
ieee754@1.2.1: {}
ignore@5.3.2: {}
ignore@7.0.5: {}
@ -4829,6 +5018,8 @@ snapshots:
inherits@2.0.4: {}
ini@1.3.8: {}
ipaddr.js@1.9.1: {}
is-binary-path@2.1.0:
@ -5018,20 +5209,32 @@ snapshots:
mime@2.6.0: {}
mimic-response@3.1.0: {}
minimatch@10.2.4:
dependencies:
brace-expansion: 5.0.4
minimist@1.2.8: {}
mkdirp-classic@0.5.3: {}
ms@2.1.3: {}
nanoid@3.3.11: {}
nanostores@1.2.0: {}
napi-build-utils@2.0.0: {}
natural-compare@1.4.0: {}
negotiator@1.0.0: {}
node-abi@3.89.0:
dependencies:
semver: 7.7.4
node-releases@2.0.36: {}
normalize-path@3.0.0: {}
@ -5138,6 +5341,21 @@ snapshots:
dependencies:
xtend: 4.0.2
prebuild-install@7.1.3:
dependencies:
detect-libc: 2.1.2
expand-template: 2.0.3
github-from-package: 0.0.0
minimist: 1.2.8
mkdirp-classic: 0.5.3
napi-build-utils: 2.0.0
node-abi: 3.89.0
pump: 3.0.4
rc: 1.2.8
simple-get: 4.0.1
tar-fs: 2.1.4
tunnel-agent: 0.6.0
prelude-ls@1.2.1: {}
prettier@3.8.1: {}
@ -5147,6 +5365,11 @@ snapshots:
forwarded: 0.2.0
ipaddr.js: 1.9.1
pump@3.0.4:
dependencies:
end-of-stream: 1.4.5
once: 1.4.0
punycode@2.3.1: {}
qs@6.15.0:
@ -5162,6 +5385,13 @@ snapshots:
iconv-lite: 0.7.2
unpipe: 1.0.0
rc@1.2.8:
dependencies:
deep-extend: 0.6.0
ini: 1.3.8
minimist: 1.2.8
strip-json-comments: 2.0.1
react-dom@19.2.4(react@19.2.4):
dependencies:
react: 19.2.4
@ -5169,6 +5399,12 @@ snapshots:
react@19.2.4: {}
readable-stream@3.6.2:
dependencies:
inherits: 2.0.4
string_decoder: 1.3.0
util-deprecate: 1.0.2
readdirp@3.6.0:
dependencies:
picomatch: 2.3.1
@ -5224,6 +5460,8 @@ snapshots:
dependencies:
tslib: 2.8.1
safe-buffer@5.2.1: {}
safer-buffer@2.1.2: {}
saxes@6.0.0:
@ -5309,6 +5547,14 @@ snapshots:
siginfo@2.0.0: {}
simple-concat@1.0.1: {}
simple-get@4.0.1:
dependencies:
decompress-response: 6.0.0
once: 1.4.0
simple-concat: 1.0.1
source-map-js@1.2.1: {}
source-map-support@0.5.21:
@ -5338,10 +5584,16 @@ snapshots:
is-fullwidth-code-point: 3.0.0
strip-ansi: 6.0.1
string_decoder@1.3.0:
dependencies:
safe-buffer: 5.2.1
strip-ansi@6.0.1:
dependencies:
ansi-regex: 5.0.1
strip-json-comments@2.0.1: {}
superagent@10.3.0:
dependencies:
component-emitter: 1.3.1
@ -5378,6 +5630,21 @@ snapshots:
tapable@2.3.0: {}
tar-fs@2.1.4:
dependencies:
chownr: 1.1.4
mkdirp-classic: 0.5.3
pump: 3.0.4
tar-stream: 2.2.0
tar-stream@2.2.0:
dependencies:
bl: 4.1.0
end-of-stream: 1.4.5
fs-constants: 1.0.0
inherits: 2.0.4
readable-stream: 3.6.2
tiny-invariant@1.3.3: {}
tiny-warning@1.0.3: {}
@ -5428,6 +5695,10 @@ snapshots:
optionalDependencies:
fsevents: 2.3.3
tunnel-agent@0.6.0:
dependencies:
safe-buffer: 5.2.1
type-check@0.4.0:
dependencies:
prelude-ls: 1.2.1
@ -5481,6 +5752,8 @@ snapshots:
dependencies:
react: 19.2.4
util-deprecate@1.0.2: {}
vary@1.1.2: {}
vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0):

View file

@ -1,6 +1,8 @@
packages:
- apps/*
- packages/*
- data-pipeline
allowBuilds:
"@swc/core": true
better-sqlite3: true
esbuild: true

View file

@ -1,205 +0,0 @@
# CEFR Data Pipeline
This directory contains the source data files and extraction/merge pipeline for generating CEFR-enriched datasets. The final outputs (`english-merged.json`, `italian-merged.json`) are consumed by the database seeding process in `packages/db`.
## Overview
The pipeline transforms raw vocabulary data from multiple sources into a standardized format, resolves conflicts between sources, and produces an authoritative CEFR dataset per language. This dataset is then used by the lila database package to update translation records.
## Supported Languages
- ✅ English (`en`)
- ✅ Italian (`it`)
## Pipeline Stages
### Stage 1: Extraction
Each source file is processed by a dedicated extractor script. The extractor reads the source-specific format, normalizes the data, filters for supported parts of speech, and outputs a standardized JSON file.
**Input:** Raw source files (JSON, CSV, XLS)
**Output:** `{source}-extracted.json` files (same directory as source)
**Normalization rules:**
- Words are lowercased and trimmed
- Part of speech is mapped to supported values (noun, verb)
- Entries with unsupported POS are skipped
- CEFR levels are validated against A1-C2
- Each record includes the source identifier for traceability
**Extractor Scripts:**
| Language | Source | Script |
| -------- | -------------- | ---------------------------------------------------- |
| English | `cefrj.csv` | `extraction-scripts/english/extract-cefrj-csv.py` |
| English | `en_m3.xls` | `extraction-scripts/english/extract-en_m3.py` |
| English | `octanove.csv` | `extraction-scripts/english/extract-octanove.py` |
| English | `random.json` | `extraction-scripts/english/extract-random-json.py` |
| Italian | `it_m3.xls` | `extraction-scripts/italian/extract-it_m3.py` |
| Italian | `italian.json` | `extraction-scripts/italian/extract-italian-json.py` |
### Stage 2: Comparison
Before merging, sources are compared to identify agreements and conflicts. This stage is read-only and serves as a quality gate.
**Input:** All `{source}-extracted.json` files for a language
**Output:** Console report showing:
- Entry counts per source and CEFR level
- Overlap between sources (words appearing in multiple sources)
- Agreement rate (sources assigning the same CEFR level)
- Conflicts (same word/POS with different CEFR levels)
**Comparison Scripts:**
| Language | Script |
| -------- | --------------------------------------- |
| English | `comparison-scripts/compare-english.py` |
| Italian | `comparison-scripts/compare-italian.py` |
Run from the `scripts/` directory:
python comparison-scripts/compare-english.py
python comparison-scripts/compare-italian.py
### Stage 3: Merge
Multiple extracted sources are merged into a single authoritative JSON file per language. When the same word/POS appears in multiple sources with different CEFR levels, the conflict is resolved using a predefined priority order.
**Input:** All `{source}-extracted.json` files for a language
**Output:** `{language}-merged.json` in `../datafiles/`
**Merge rules:**
- Single source: use that source's CEFR level
- Multiple sources agree: use the agreed CEFR level
- Multiple sources conflict: use the level from the highest-priority source
**Difficulty derivation:**
Difficulty is not extracted from sources. It is derived from the final CEFR level:
- A1, A2 → easy
- B1, B2 → intermediate
- C1, C2 → hard
The merged file includes both CEFR level and derived difficulty, plus a list of sources that contributed to each entry.
**Merge Scripts & Priorities:**
| Language | Script | Priority (lowest → highest) |
| -------- | ------------------------------------- | -------------------------------------- |
| English | `merge-scripts/merge-english-json.py` | `random`, `octanove`, `cefrj`, `en_m3` |
| Italian | `merge-scripts/merge-italian-json.py` | `italian`, `it_m3` |
Run from the `scripts/` directory:
python merge-scripts/merge-english-json.py
python merge-scripts/merge-italian-json.py
### Stage 4: Enrichment
The authoritative merged file is consumed by the database package (packages/db) during the seeding or update process. This stage is implemented in TypeScript and is not part of the Python scripts in this directory.
## File Organization
```
scripts/
├── comparison-scripts/
│ ├── compare-english.py
│ └── compare-italian.py # Stage 2: compare extracted data
├── datafiles/
│ ├── english-merged.json # Stage 3 output (authoritative)
│ ├── italian-merged.json # Stage 3 output (authoritative)
│ ├── omw-noun.json
│ └── omw-verb.json
├── data-sources/
│ ├── english/
│ │ ├── cefrj.csv
│ │ ├── cefrj-extracted.json
│ │ ├── en_m3.xls
│ │ ├── en_m3-extracted.json
│ │ ├── octanove.csv
│ │ ├── octanove-extracted.json
│ │ ├── random.json
│ │ └── random-extracted.json
│ ├── french/ # (future)
│ ├── german/ # (future)
│ ├── italian/
│ │ ├── it_m3.xls
│ │ ├── it_m3-extracted.json
│ │ ├── italian.json
│ │ └── italian-extracted.json
│ └── spanish/ # (future)
├── extraction-scripts/
│ └── english/
│ ├── extract-cefrj-csv.py
│ ├── extract-en_m3.py
│ ├── extract-octanove.py
│ └── extract-random-json.py
│ └── italian/
│ ├── extract-it_m3.py
│ └── extract-italian-json.py
├── merge-scripts/
│ └── merge-english-json.py # Stage 3: merge into authority
├── extract-own-save-to-json.py # script to extract words from wordnet
├── requirements.txt
└── README.md # This file
```
Extracted files are co-located with their sources for easy traceability. Merged files live in `../datafiles/`.
## Source Priority by Language
Source priority determines which CEFR level wins when sources conflict. Sources are listed from highest to lowest priority:
**English:**
1. en_m3
2. cefrj
3. octanove
4. random
**Italian:**
1. it_m3
2. italian
Priority is defined in each language's merge script (for example, `merge-scripts/merge-english-json.py` for English). Higher-priority sources override lower-priority sources when conflicts occur.
## Data Flow Summary
```
Raw Source → Extracted JSON → Merged JSON → Database
(1) (2) (3) (4)
```
1. **Extract:** Transform source formats to normalized records
2. **Compare:** Validate source quality and surface conflicts
3. **Merge:** Resolve conflicts, derive difficulty, create authority
4. **Enrich:** Write to database (handled in packages/db)
## Adding New Sources
To add a new source:
1. Place the raw file in the appropriate `data-sources/{language}/` directory
2. Create an extractor script in `extraction-scripts/{language}/`
3. Run the extractor to generate `{source}-extracted.json`
4. Run comparison to assess coverage and conflicts
5. Update source priority in the merge configuration if needed
6. Run merge to regenerate the authoritative file
7. Run enrichment to update the database
## Constants and Constraints
The pipeline respects these constraints from the lila shared constants:
- **Supported languages:** en, it
- **Supported parts of speech:** noun, verb
- **CEFR levels:** A1, A2, B1, B2, C1, C2
- **Difficulty levels:** easy, intermediate, hard
Entries violating these constraints are filtered out during extraction.

View file

@ -1,166 +0,0 @@
#!/usr/bin/env python3
"""
CEFR Data Pipeline - Stage 2: English Comparison
Compares extracted JSON files for English and reports agreements and conflicts.
"""
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
# Supported CEFR levels. print_report treats any other level string
# (except "UNKNOWN") as non-standard. This is a set, so iterating it
# directly does not yield the levels in a defined order.
CEFR_LEVELS = {"A1", "A2", "B1", "B2", "C1", "C2"}
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Load every ``*-extracted.json`` file from the English data directory.

    Args:
        data_dir: Directory searched (non-recursively) for files whose name
            ends in ``-extracted.json``.

    Returns:
        A mapping from source name (the file stem with its trailing
        ``-extracted`` marker removed) to the list parsed from that file.
        Files whose top-level JSON value is not a list are skipped and a
        warning is printed to stdout.
    """
    suffix = "-extracted"
    sources = {}
    # Sort the matches so the source order (and therefore the printed
    # report) is identical on every run; Path.glob yields arbitrary order.
    for file_path in sorted(data_dir.glob(f"*{suffix}.json")):
        # Strip only the trailing marker. str.replace would also remove an
        # "-extracted" that happens to occur earlier in the file name.
        source_name = file_path.stem[: -len(suffix)]
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            sources[source_name] = data
        else:
            print(f"Warning: {file_path} does not contain a list, skipping.")
    return sources
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Build the ``(word, pos)`` comparison key for one extracted entry.

    Both fields are lowercased and then stripped of surrounding whitespace.
    A missing ``"word"`` or ``"pos"`` key raises ``KeyError``.
    """
    word, pos = (entry[field].lower().strip() for field in ("word", "pos"))
    return word, pos
def compute_statistics(sources: Dict[str, List[dict]]) -> dict:
    """Compute overlap, agreement, and conflict statistics across sources.

    Args:
        sources: Mapping of source name to that source's extracted entries.
            Each entry must provide ``"word"`` and ``"pos"``; an entry
            without ``"cefr"`` is counted under the level ``"UNKNOWN"``.

    Returns:
        A dict with the keys:
          * ``source_counts`` - per source, entry count per CEFR level
          * ``total_entries`` - number of entries summed over all sources
          * ``unique_words`` - number of distinct (word, pos) keys
          * ``overlap_distribution`` - number of keys seen in N sources
          * ``agreements`` - multi-source keys where every source agrees
          * ``conflicts`` - multi-source keys with differing CEFR levels
          * ``conflict_details`` - one record per conflict with the
            per-source assignments
    """
    source_counts = {}
    # (word, pos) -> {source name -> CEFR level assigned by that source}
    word_map = defaultdict(dict)
    for src, entries in sources.items():
        cefr_counts = defaultdict(int)
        for e in entries:
            # Use the same default in both places so an entry without a
            # "cefr" field is reported as UNKNOWN instead of raising
            # KeyError halfway through the comparison.
            cefr = e.get("cefr", "UNKNOWN")
            cefr_counts[cefr] += 1
            word_map[normalize_entry(e)][src] = cefr
        source_counts[src] = dict(cefr_counts)

    total_entries = sum(len(entries) for entries in sources.values())
    unique_words = len(word_map)

    overlap_stats = defaultdict(int)
    agreement_count = 0
    conflict_count = 0
    conflict_details = []

    for key, src_cefr_map in word_map.items():
        num_sources = len(src_cefr_map)
        overlap_stats[num_sources] += 1
        if num_sources <= 1:
            # A key seen in one source can neither agree nor conflict.
            continue
        if len(set(src_cefr_map.values())) == 1:
            agreement_count += 1
        else:
            conflict_count += 1
            conflict_details.append(
                {"word": key[0], "pos": key[1], "assignments": dict(src_cefr_map)}
            )

    return {
        "source_counts": source_counts,
        "total_entries": total_entries,
        "unique_words": unique_words,
        "overlap_distribution": dict(overlap_stats),
        "agreements": agreement_count,
        "conflicts": conflict_count,
        "conflict_details": conflict_details,
    }
def print_report(stats: dict, sources: Dict[str, List[dict]]):
    """Print the formatted English comparison report to stdout.

    Args:
        stats: Statistics dict as returned by ``compute_statistics``.
        sources: The loaded sources. Not read by this function; kept so the
            call signature stays unchanged.
    """
    print(f"\n{'=' * 60}")
    print("CEFR COMPARISON REPORT - ENGLISH")
    print(f"{'=' * 60}")

    # Source entry counts
    print("\n📊 ENTRIES PER SOURCE AND CEFR LEVEL")
    print("-" * 50)
    # CEFR_LEVELS is a set; sort it so levels print A1..C2 on every run
    # instead of in arbitrary set-iteration order.
    ordered_levels = sorted(CEFR_LEVELS)
    for src, counts in stats["source_counts"].items():
        total = sum(counts.values())
        print(f"\n{src}: {total} total entries")
        for level in ordered_levels:
            cnt = counts.get(level, 0)
            if cnt > 0:
                print(f" {level}: {cnt}")
        # Show non-standard levels. "UNKNOWN" is included in the total
        # above but is not listed as its own line.
        for level, cnt in counts.items():
            if level not in CEFR_LEVELS and level != "UNKNOWN":
                print(f" {level}: {cnt} (non-standard)")

    # Overlap statistics
    print("\n🔄 OVERLAP BETWEEN SOURCES")
    print("-" * 50)
    print(f"Total unique (word, POS) combinations: {stats['unique_words']}")
    print(f"Total entries across all sources: {stats['total_entries']}")
    overlap = stats["overlap_distribution"]
    # The distribution is empty when there are no unique words, so the
    # division below is never reached with a zero denominator.
    for n_sources in sorted(overlap.keys()):
        count = overlap[n_sources]
        pct = (count / stats["unique_words"]) * 100
        print(f"Words appearing in {n_sources} source(s): {count} ({pct:.1f}%)")

    # Agreement and conflicts
    print("\n⚖️ AGREEMENT / CONFLICT SUMMARY")
    print("-" * 50)
    print(f"Words with >1 source: {stats['agreements'] + stats['conflicts']}")
    print(f" ✅ Agreements (same CEFR): {stats['agreements']}")
    print(f" ❌ Conflicts (different CEFR): {stats['conflicts']}")
    if stats["conflicts"] > 0:
        agreement_rate = (
            stats["agreements"] / (stats["agreements"] + stats["conflicts"])
        ) * 100
        print(f" Agreement rate: {agreement_rate:.1f}%")
        print("\n📋 CONFLICT DETAILS (first 10 shown):")
        for i, conflict in enumerate(stats["conflict_details"][:10]):
            print(f" {i + 1}. {conflict['word']} ({conflict['pos']})")
            for src, cefr in conflict["assignments"].items():
                print(f" {src}: {cefr}")
        if len(stats["conflict_details"]) > 10:
            print(f" ... and {len(stats['conflict_details']) - 10} more conflicts.")
    print(f"\n{'=' * 60}\n")
def main():
    """Load the English extracted sources and print the comparison report.

    The data directory is ``data-sources/english`` next to this script's
    parent directory. If that directory is missing, or contains no
    extracted files, a message is printed and the function returns without
    producing a report.
    """
    english_dir = Path(__file__).parent.parent / "data-sources" / "english"

    if not english_dir.exists():
        print(f"Error: English data directory not found: {english_dir}")
        return

    print(f"Loading extracted files from {english_dir}...")
    loaded = load_extracted_files(english_dir)
    if not loaded:
        print("No extracted files found.")
        return

    source_names = ", ".join(loaded.keys())
    print(f"Found sources: {source_names}")
    print_report(compute_statistics(loaded), loaded)
if __name__ == "__main__":
main()

View file

@ -1,166 +0,0 @@
#!/usr/bin/env python3
"""
CEFR Data Pipeline - Stage 2: Italian Comparison
Compares extracted JSON files for Italian and reports agreements and conflicts.
"""
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
# Supported CEFR levels. print_report treats any other level string
# (except "UNKNOWN") as non-standard. This is a set, so iterating it
# directly does not yield the levels in a defined order.
CEFR_LEVELS = {"A1", "A2", "B1", "B2", "C1", "C2"}
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Load every ``*-extracted.json`` file from the Italian data directory.

    Args:
        data_dir: Directory searched (non-recursively) for files whose name
            ends in ``-extracted.json``.

    Returns:
        A mapping from source name (the file stem with its trailing
        ``-extracted`` marker removed) to the list parsed from that file.
        Files whose top-level JSON value is not a list are skipped and a
        warning is printed to stdout.
    """
    suffix = "-extracted"
    sources = {}
    # Sort the matches so the source order (and therefore the printed
    # report) is identical on every run; Path.glob yields arbitrary order.
    for file_path in sorted(data_dir.glob(f"*{suffix}.json")):
        # Strip only the trailing marker. str.replace would also remove an
        # "-extracted" that happens to occur earlier in the file name.
        source_name = file_path.stem[: -len(suffix)]
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            sources[source_name] = data
        else:
            print(f"Warning: {file_path} does not contain a list, skipping.")
    return sources
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Build the ``(word, pos)`` comparison key for one extracted entry.

    Both fields are lowercased and then stripped of surrounding whitespace.
    A missing ``"word"`` or ``"pos"`` key raises ``KeyError``.
    """
    word, pos = (entry[field].lower().strip() for field in ("word", "pos"))
    return word, pos
def compute_statistics(sources: Dict[str, List[dict]]) -> dict:
    """Compute overlap, agreement, and conflict statistics across sources.

    Args:
        sources: Mapping of source name to that source's extracted entries.
            Each entry must provide ``"word"`` and ``"pos"``; an entry
            without ``"cefr"`` is counted under the level ``"UNKNOWN"``.

    Returns:
        A dict with the keys:
          * ``source_counts`` - per source, entry count per CEFR level
          * ``total_entries`` - number of entries summed over all sources
          * ``unique_words`` - number of distinct (word, pos) keys
          * ``overlap_distribution`` - number of keys seen in N sources
          * ``agreements`` - multi-source keys where every source agrees
          * ``conflicts`` - multi-source keys with differing CEFR levels
          * ``conflict_details`` - one record per conflict with the
            per-source assignments
    """
    source_counts = {}
    # (word, pos) -> {source name -> CEFR level assigned by that source}
    word_map = defaultdict(dict)
    for src, entries in sources.items():
        cefr_counts = defaultdict(int)
        for e in entries:
            # Use the same default in both places so an entry without a
            # "cefr" field is reported as UNKNOWN instead of raising
            # KeyError halfway through the comparison.
            cefr = e.get("cefr", "UNKNOWN")
            cefr_counts[cefr] += 1
            word_map[normalize_entry(e)][src] = cefr
        source_counts[src] = dict(cefr_counts)

    total_entries = sum(len(entries) for entries in sources.values())
    unique_words = len(word_map)

    overlap_stats = defaultdict(int)
    agreement_count = 0
    conflict_count = 0
    conflict_details = []

    for key, src_cefr_map in word_map.items():
        num_sources = len(src_cefr_map)
        overlap_stats[num_sources] += 1
        if num_sources <= 1:
            # A key seen in one source can neither agree nor conflict.
            continue
        if len(set(src_cefr_map.values())) == 1:
            agreement_count += 1
        else:
            conflict_count += 1
            conflict_details.append(
                {"word": key[0], "pos": key[1], "assignments": dict(src_cefr_map)}
            )

    return {
        "source_counts": source_counts,
        "total_entries": total_entries,
        "unique_words": unique_words,
        "overlap_distribution": dict(overlap_stats),
        "agreements": agreement_count,
        "conflicts": conflict_count,
        "conflict_details": conflict_details,
    }
def print_report(stats: dict, sources: Dict[str, List[dict]]):
    """Print the formatted Italian comparison report to stdout.

    Args:
        stats: Statistics dict as returned by ``compute_statistics``.
        sources: The loaded sources. Not read by this function; kept so the
            call signature stays unchanged.
    """
    print(f"\n{'=' * 60}")
    print("CEFR COMPARISON REPORT - ITALIAN")
    print(f"{'=' * 60}")

    # Source entry counts
    print("\n📊 ENTRIES PER SOURCE AND CEFR LEVEL")
    print("-" * 50)
    # CEFR_LEVELS is a set; sort it so levels print A1..C2 on every run
    # instead of in arbitrary set-iteration order.
    ordered_levels = sorted(CEFR_LEVELS)
    for src, counts in stats["source_counts"].items():
        total = sum(counts.values())
        print(f"\n{src}: {total} total entries")
        for level in ordered_levels:
            cnt = counts.get(level, 0)
            if cnt > 0:
                print(f" {level}: {cnt}")
        # Show non-standard levels. "UNKNOWN" is included in the total
        # above but is not listed as its own line.
        for level, cnt in counts.items():
            if level not in CEFR_LEVELS and level != "UNKNOWN":
                print(f" {level}: {cnt} (non-standard)")

    # Overlap statistics
    print("\n🔄 OVERLAP BETWEEN SOURCES")
    print("-" * 50)
    print(f"Total unique (word, POS) combinations: {stats['unique_words']}")
    print(f"Total entries across all sources: {stats['total_entries']}")
    overlap = stats["overlap_distribution"]
    # The distribution is empty when there are no unique words, so the
    # division below is never reached with a zero denominator.
    for n_sources in sorted(overlap.keys()):
        count = overlap[n_sources]
        pct = (count / stats["unique_words"]) * 100
        print(f"Words appearing in {n_sources} source(s): {count} ({pct:.1f}%)")

    # Agreement and conflicts
    print("\n⚖️ AGREEMENT / CONFLICT SUMMARY")
    print("-" * 50)
    print(f"Words with >1 source: {stats['agreements'] + stats['conflicts']}")
    print(f" ✅ Agreements (same CEFR): {stats['agreements']}")
    print(f" ❌ Conflicts (different CEFR): {stats['conflicts']}")
    if stats["conflicts"] > 0:
        agreement_rate = (
            stats["agreements"] / (stats["agreements"] + stats["conflicts"])
        ) * 100
        print(f" Agreement rate: {agreement_rate:.1f}%")
        print("\n📋 CONFLICT DETAILS (first 10 shown):")
        for i, conflict in enumerate(stats["conflict_details"][:10]):
            print(f" {i + 1}. {conflict['word']} ({conflict['pos']})")
            for src, cefr in conflict["assignments"].items():
                print(f" {src}: {cefr}")
        if len(stats["conflict_details"]) > 10:
            print(f" ... and {len(stats['conflict_details']) - 10} more conflicts.")
    print(f"\n{'=' * 60}\n")
def main():
    """Load the extracted Italian source files and print the comparison report.

    The data directory is resolved relative to this script as
    ``../data-sources/italian``. Prints a message and returns early when the
    directory is missing or no extracted files are found.
    """
    data_dir = Path(__file__).parent.parent / "data-sources" / "italian"
    if not data_dir.exists():
        print(f"Error: Italian data directory not found: {data_dir}")
        return

    print(f"Loading extracted files from {data_dir}...")
    sources = load_extracted_files(data_dir)
    if not sources:
        print("No extracted files found.")
        return

    source_names = ", ".join(sources.keys())
    print(f"Found sources: {source_names}")
    print_report(compute_statistics(sources), sources)


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,149 +0,0 @@
"""
scripts/extract-omw-data.py
Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
language and POS. Replaces extract-en-it-nouns.py.
Output: one JSON file per POS, written to packages/db/src/data/datafiles/
omw-noun.json
omw-verb.json
Each file is a JSON array of objects matching SynsetRecord in seed.ts:
{
"source_id": "ili:i12345",
"pos": "noun",
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
"glosses": { "en": ["a domesticated animal..."] }
}
Translations and glosses are absent for a language if that wordnet has no
coverage for the synset; the seed script handles sparse data gracefully.
Usage:
python scripts/extract-omw-data.py [output_dir]
output_dir defaults to packages/db/src/data/datafiles/
Prerequisites:
pip install wn
python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
"""
import json
import sys
from pathlib import Path
import wn
# Mirror constants.ts — update both places if languages or POS change.
# Language codes for which a wordnet is loaded and extracted.
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
# Maps the single-letter POS code passed to `synsets(pos=...)` to the label
# written into each record's "pos" field and used in the output file name.
POS_MAP: dict[str, str] = {
"n": "noun",
"v": "verb",
}
def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
    """Extract every POS in POS_MAP from the loaded wordnets into JSON files.

    For each POS, synsets of every language in SUPPORTED_LANGUAGE_CODES are
    grouped by ILI and written to ``<output_dir>/omw-<pos_label>.json``,
    followed by a coverage summary. Exits the process with status 1 if a
    wordnet cannot be loaded.

    Args:
        output_dir: Directory for the output files; created if missing.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Load one Wordnet object per language up front.
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnet = wn.Wordnet(lang=lang)
            wordnets[lang] = wordnet
            synset_count = len(wordnet.synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {lang}: {e}")
            print(f" Run: python -c \"import wn; wn.download('omw-{lang}:1.4')\"")
            sys.exit(1)

    for omw_pos, pos_label in POS_MAP.items():
        print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")

        # Per-ILI data across all languages:
        # { ili -> { lang -> { "lemmas": [...], "glosses": [...] } } }
        by_ili: dict[str, dict[str, dict[str, list[str]]]] = {}
        for lang, wnet in wordnets.items():
            covered = 0
            for synset in wnet.synsets(pos=omw_pos):
                ili = synset.ili
                if not ili:
                    # Without an ILI the synset cannot be cross-linked.
                    continue
                covered += 1
                by_ili.setdefault(ili, {})[lang] = {
                    "lemmas": [str(lemma) for lemma in synset.lemmas()],
                    "glosses": [d for d in synset.definitions() if d],
                }
            print(f" {lang}: {covered:,} {pos_label} synsets with ILI")

        # Sort by ILI so the output file is stable and diffable.
        records: list[dict] = []
        for ili in sorted(by_ili.keys()):
            per_lang = by_ili[ili].items()
            translations: dict[str, list[str]] = {
                lang: data["lemmas"] for lang, data in per_lang if data["lemmas"]
            }
            glosses: dict[str, list[str]] = {
                lang: data["glosses"] for lang, data in per_lang if data["glosses"]
            }
            # A record is kept even if only one language covers it; the seed
            # script imports all terms regardless of cross-language overlap.
            records.append(
                {
                    "source_id": f"ili:{ili}",
                    "pos": pos_label,
                    "translations": translations,
                    "glosses": glosses,
                }
            )

        output_file = target_dir / f"omw-{pos_label}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)
        print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
        _print_coverage(records, pos_label)
def _print_coverage(records: list[dict], pos_label: str) -> None:
    """Print per-language translation and gloss counts, then a small sample.

    Args:
        records: Output records; each has "source_id", "translations" and
            "glosses", the latter two mapping language code to a list.
        pos_label: POS label used in the printed headings.
    """
    lang_stats: dict[str, dict[str, int]] = {
        lang: {"translations": 0, "glosses": 0}
        for lang in SUPPORTED_LANGUAGE_CODES
    }
    for r in records:
        # Languages outside SUPPORTED_LANGUAGE_CODES are ignored.
        for lang, lemmas in r["translations"].items():
            if lang in lang_stats:
                lang_stats[lang]["translations"] += len(lemmas)
        for lang, gloss_list in r["glosses"].items():
            if lang in lang_stats:
                lang_stats[lang]["glosses"] += len(gloss_list)

    print(f"\nCoverage for {pos_label}s:")
    for lang, counts in lang_stats.items():
        t = counts["translations"]
        g = counts["glosses"]
        avg_t = t / len(records) if records else 0
        print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")

    # Sample output. The label names the index range of the slice below;
    # nothing is listed when there are 1000 records or fewer.
    print(f"\nSample {pos_label}s (records 1000–1004):")
    for r in records[1000:1005]:
        print(f" {r['source_id']}: {r['translations']}")
if __name__ == "__main__":
    # An optional first CLI argument overrides the default output directory.
    cli_args = sys.argv[1:]
    extract_all(cli_args[0] if cli_args else "packages/db/src/data/datafiles/")

View file

@ -1,96 +0,0 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-cefrj-csv.py
Extracts CEFR data from cefrj.csv (CEFR-J vocabulary profile).
Filters for supported POS (noun, verb).
Input: scripts/data-sources/english/cefrj.csv
Output: scripts/data-sources/english/cefrj-extracted.json
Output format (normalized):
[
{ "word": "ability", "pos": "noun", "cefr": "A2", "source": "cefrj" }
]
"""
import csv
import json
from pathlib import Path
# Constants matching @lila/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Default paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/cefrj.csv")
OUTPUT_FILE = Path("scripts/data-sources/english/cefrj-extracted.json")


def extract(input_file: Path = INPUT_FILE, output_file: Path = OUTPUT_FILE) -> None:
    """Extract noun/verb CEFR records from the CEFR-J CSV into normalized JSON.

    Rows are read from the ``headword``, ``pos`` and ``CEFR`` columns. A row
    is kept only if its POS is in SUPPORTED_POS, its level is in CEFR_LEVELS
    and its headword is non-empty. Values are trimmed; word and POS are
    lowercased and the level is uppercased. A summary is printed to stdout.

    Args:
        input_file: CSV file to read. Defaults to INPUT_FILE.
        output_file: JSON file to write. Defaults to OUTPUT_FILE.
    """
    print(f"Reading: {input_file}")
    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0

    # newline="" is what the csv module expects so it can handle embedded
    # line breaks in quoted fields itself.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            total_rows += 1
            # DictReader fills cells missing from a short row with None, so
            # a plain `.get(key, "")` is not enough; `or ""` covers both.
            pos = (row.get("pos") or "").lower().strip()
            if pos not in SUPPORTED_POS:
                skipped_pos += 1
                continue
            cefr = (row.get("CEFR") or "").upper().strip()
            if cefr not in CEFR_LEVELS:
                skipped_invalid_cefr += 1
                continue
            word = (row.get("headword") or "").lower().strip()
            if not word:
                skipped_empty_word += 1
                continue
            records.append({"word": word, "pos": pos, "cefr": cefr, "source": "cefrj"})

    # Write output
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: single pass over the kept records.
    pos_counts = {pos: 0 for pos in SUPPORTED_POS}
    cefr_distribution = {level: 0 for level in CEFR_LEVELS}
    for r in records:
        pos_counts[r["pos"]] += 1
        cefr_distribution[r["cefr"]] += 1

    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if cefr_distribution[level] > 0:
            print(f" - {level}: {cefr_distribution[level]}")
    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {output_file}")


if __name__ == "__main__":
    extract()

View file

@ -1,107 +0,0 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-en_m3.py
Extracts CEFR data from en_m3.xls (M3 wordlist).
"""
import json
from pathlib import Path
import xlrd
# Constants matching @lila/shared
# NOTE(review): SUPPORTED_POS is not referenced by extract() in this script,
# which filters through POS_MAP instead.
SUPPORTED_POS = ["noun", "verb"]
# Accepted CEFR levels; rows with any other value are skipped.
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
# POS mapping (case-insensitive): lowercased spreadsheet value -> output label.
POS_MAP = {
"noun": "noun",
"verb": "verb",
}
# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/en_m3.xls")
OUTPUT_FILE = Path("scripts/data-sources/english/en_m3-extracted.json")
def extract() -> None:
    """Extract noun/verb CEFR records from the M3 spreadsheet into JSON.

    Reads the first sheet of INPUT_FILE, skips the header row, and takes the
    word, part of speech and CEFR level from columns 1, 2 and 3. Rows with a
    POS outside POS_MAP, a level outside CEFR_LEVELS or an empty word are
    counted and skipped. Kept records are written to OUTPUT_FILE and a
    summary is printed.
    """
    print(f"Reading: {INPUT_FILE}")
    records = []
    skipped_pos = skipped_invalid_cefr = skipped_empty_word = 0
    total_rows = 0

    sheet = xlrd.open_workbook(INPUT_FILE).sheet_by_index(0)

    # Row 0 is the header; data starts at row 1.
    for row_idx in range(1, sheet.nrows):
        total_rows += 1
        # Columns: ID number, Word, Part of Speech, CEFR, Points
        word_raw = sheet.cell_value(row_idx, 1)
        pos_raw = sheet.cell_value(row_idx, 2)
        cefr_raw = sheet.cell_value(row_idx, 3)

        pos_key = str(pos_raw).lower().strip() if pos_raw else ""
        if pos_key not in POS_MAP:
            skipped_pos += 1
            continue

        cefr = ""
        if cefr_raw:
            # Remove surrounding Unicode smart quotes (U+201C / U+201D).
            cefr = str(cefr_raw).strip().strip("\u201c\u201d").upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        word = str(word_raw).lower().strip() if word_raw else ""
        if not word:
            skipped_empty_word += 1
            continue

        records.append(
            {"word": word, "pos": POS_MAP[pos_key], "cefr": cefr, "source": "en_m3"}
        )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats
    per_pos = {"noun": 0, "verb": 0}
    per_level = dict.fromkeys(CEFR_LEVELS, 0)
    for rec in records:
        per_pos[rec["pos"]] += 1
        per_level[rec["cefr"]] += 1

    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {per_pos['noun']}")
    print(f" - Verbs: {per_pos['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if per_level[level] > 0:
            print(f" - {level}: {per_level[level]}")
    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")


if __name__ == "__main__":
    extract()

View file

@ -1,90 +0,0 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-octanove.py
Extracts CEFR data from octanove.csv (Octanove vocabulary profile).
Filters for supported POS (noun, verb).
Input: scripts/data-sources/english/octanove.csv
Output: scripts/data-sources/english/octanove-extracted.json
Output format (normalized):
[
{ "word": "example", "pos": "noun", "cefr": "C1", "source": "octanove" }
]
"""
import csv
import json
from pathlib import Path
# Constants matching @lila/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Default paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/octanove.csv")
OUTPUT_FILE = Path("scripts/data-sources/english/octanove-extracted.json")


def extract(input_file: Path = INPUT_FILE, output_file: Path = OUTPUT_FILE) -> None:
    """Extract noun/verb CEFR records from the Octanove CSV into normalized JSON.

    Rows are read from the ``headword``, ``pos`` and ``CEFR`` columns. A row
    is kept only if its POS is in SUPPORTED_POS, its level is in CEFR_LEVELS
    and its headword is non-empty. Values are trimmed; word and POS are
    lowercased and the level is uppercased. A summary is printed to stdout.

    Args:
        input_file: CSV file to read. Defaults to INPUT_FILE.
        output_file: JSON file to write. Defaults to OUTPUT_FILE.
    """
    print(f"Reading: {input_file}")
    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0

    # newline="" is what the csv module expects so it can handle embedded
    # line breaks in quoted fields itself.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            total_rows += 1
            # DictReader fills cells missing from a short row with None, so
            # a plain `.get(key, "")` is not enough; `or ""` covers both.
            pos = (row.get("pos") or "").lower().strip()
            if pos not in SUPPORTED_POS:
                skipped_pos += 1
                continue
            cefr = (row.get("CEFR") or "").upper().strip()
            if cefr not in CEFR_LEVELS:
                skipped_invalid_cefr += 1
                continue
            word = (row.get("headword") or "").lower().strip()
            if not word:
                skipped_empty_word += 1
                continue
            records.append(
                {"word": word, "pos": pos, "cefr": cefr, "source": "octanove"}
            )

    # Write output
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: single pass over the kept records.
    pos_counts = {pos: 0 for pos in SUPPORTED_POS}
    cefr_distribution = {level: 0 for level in CEFR_LEVELS}
    for r in records:
        pos_counts[r["pos"]] += 1
        cefr_distribution[r["cefr"]] += 1

    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if cefr_distribution[level] > 0:
            print(f" - {level}: {cefr_distribution[level]}")
    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {output_file}")


if __name__ == "__main__":
    extract()

View file

@ -1,99 +0,0 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/english/extract-random-json.py
Extracts CEFR data from random.json (English flashcard source).
Filters for useful_for_flashcard=true and supported POS (noun, verb).
Input: scripts/data-sources/english/random.json
Output: scripts/data-sources/english/random-extracted.json
Output format (normalized):
[
{ "word": "be", "pos": "verb", "cefr": "A1", "source": "random" }
]
"""
import json
from pathlib import Path
# Constants matching @lila/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Default paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/english/random.json")
OUTPUT_FILE = Path("scripts/data-sources/english/random-extracted.json")


def extract(input_file: Path = INPUT_FILE, output_file: Path = OUTPUT_FILE) -> None:
    """Extract noun/verb CEFR records from the English flashcard JSON source.

    An entry is kept only if ``useful_for_flashcard`` is truthy, ``pos`` is in
    SUPPORTED_POS, ``cefr_level`` is in CEFR_LEVELS and ``word`` is non-empty.
    Values are trimmed; word and POS are lowercased and the level is
    uppercased. A summary is printed to stdout.

    Args:
        input_file: JSON file (a list of entry objects) to read. Defaults to
            INPUT_FILE.
        output_file: JSON file to write. Defaults to OUTPUT_FILE.
    """
    print(f"Reading: {input_file}")
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    skipped_pos = 0
    skipped_not_useful = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for entry in data:
        # Filter: must be useful for flashcard
        if not entry.get("useful_for_flashcard", False):
            skipped_not_useful += 1
            continue
        # A JSON null arrives as None, which `.get(key, "")` passes through;
        # `or ""` turns it into an ordinary skipped entry instead of a crash.
        pos = (entry.get("pos") or "").lower().strip()
        if pos not in SUPPORTED_POS:
            skipped_pos += 1
            continue
        cefr = (entry.get("cefr_level") or "").upper().strip()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue
        word = (entry.get("word") or "").lower().strip()
        if not word:
            skipped_empty_word += 1
            continue
        records.append({"word": word, "pos": pos, "cefr": cefr, "source": "random"})

    # Write output
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: single pass over the kept records.
    pos_counts = {pos: 0 for pos in SUPPORTED_POS}
    cefr_distribution = {level: 0 for level in CEFR_LEVELS}
    for r in records:
        pos_counts[r["pos"]] += 1
        cefr_distribution[r["cefr"]] += 1

    print(f"\nExtracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if cefr_distribution[level] > 0:
            print(f" - {level}: {cefr_distribution[level]}")
    print("\nSkipped:")
    print(f" - Not useful for flashcard: {skipped_not_useful}")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {output_file}")


if __name__ == "__main__":
    extract()

View file

@ -1,114 +0,0 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/italian/extract-it_m3.py
Extracts CEFR data from it_m3.xls (Italian M3 wordlist).
"""
import json
from pathlib import Path
import xlrd
# Constants matching @glossa/shared
# NOTE(review): SUPPORTED_POS is not referenced by extract() in this script,
# which filters through POS_MAP instead.
SUPPORTED_POS = ["noun", "verb"]
# Accepted CEFR levels; rows with any other value are skipped.
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
# POS mapping (case-insensitive) based on observed abbreviations:
# lowercased spreadsheet value -> output label.
POS_MAP = {
"n": "noun", # nome
"v": "verb", # verbo
}
# Column indices (0-based) verified from sample
WORD_COL = 0 # Lemma
POS_COL = 1 # Pos
CEFR_COL = 2 # Points (CEFR level)
# Paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/italian/it_m3.xls")
OUTPUT_FILE = Path("scripts/data-sources/italian/it_m3-extracted.json")
def extract() -> None:
    """Extract noun/verb CEFR records from the Italian M3 spreadsheet into JSON.

    Reads the first sheet of INPUT_FILE, skips the header row, and takes the
    word, part of speech and CEFR level from WORD_COL, POS_COL and CEFR_COL.
    Rows with a POS outside POS_MAP, a level outside CEFR_LEVELS or an empty
    word are counted and skipped. Kept records are written to OUTPUT_FILE and
    a summary is printed.
    """
    print(f"Reading: {INPUT_FILE}")
    records = []
    skipped_pos = skipped_invalid_cefr = skipped_empty_word = 0
    total_rows = 0

    sheet = xlrd.open_workbook(INPUT_FILE).sheet_by_index(0)

    # Row 0 is the header; data starts at row 1.
    for row_idx in range(1, sheet.nrows):
        total_rows += 1
        word_raw = sheet.cell_value(row_idx, WORD_COL)
        pos_raw = sheet.cell_value(row_idx, POS_COL)
        cefr_raw = sheet.cell_value(row_idx, CEFR_COL)

        pos_key = str(pos_raw).lower().strip() if pos_raw else ""
        if pos_key not in POS_MAP:
            skipped_pos += 1
            continue

        cefr = ""
        if cefr_raw:
            # Remove surrounding Unicode smart quotes (U+201C / U+201D).
            cefr = str(cefr_raw).strip().strip("\u201c\u201d").upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Multi-form cells such as "il, lo, la" are kept whole (trimmed and
        # lowercased) rather than split on the comma, so no variant is lost.
        word = (str(word_raw).strip() if word_raw else "").lower()
        if not word:
            skipped_empty_word += 1
            continue

        records.append(
            {"word": word, "pos": POS_MAP[pos_key], "cefr": cefr, "source": "it_m3"}
        )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats
    per_pos = {"noun": 0, "verb": 0}
    per_level = dict.fromkeys(CEFR_LEVELS, 0)
    for rec in records:
        per_pos[rec["pos"]] += 1
        per_level[rec["cefr"]] += 1

    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {per_pos['noun']}")
    print(f" - Verbs: {per_pos['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if per_level[level] > 0:
            print(f" - {level}: {per_level[level]}")
    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")


if __name__ == "__main__":
    extract()

View file

@ -1,91 +0,0 @@
#!/usr/bin/env python3
"""
scripts/extraction-scripts/italian/extract-italian-json.py
Extracts CEFR data from italian.json (Italian flashcard source).
Filters for useful_for_flashcard=true and supported POS (noun, verb).
"""
import json
from pathlib import Path
# Constants matching @glossa/shared
SUPPORTED_POS = ["noun", "verb"]
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Default paths (relative to project root)
INPUT_FILE = Path("scripts/data-sources/italian/italian.json")
OUTPUT_FILE = Path("scripts/data-sources/italian/italian-extracted.json")


def extract(input_file: Path = INPUT_FILE, output_file: Path = OUTPUT_FILE) -> None:
    """Extract noun/verb CEFR records from the Italian flashcard JSON source.

    An entry is kept only if ``useful_for_flashcard`` is truthy, ``pos`` is in
    SUPPORTED_POS, ``cefr_level`` is in CEFR_LEVELS and ``word`` is non-empty.
    Values are trimmed; word and POS are lowercased and the level is
    uppercased. A summary is printed to stdout.

    Args:
        input_file: JSON file (a list of entry objects) to read. Defaults to
            INPUT_FILE.
        output_file: JSON file to write. Defaults to OUTPUT_FILE.
    """
    print(f"Reading: {input_file}")
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    skipped_pos = 0
    skipped_not_useful = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for entry in data:
        # Filter: must be useful for flashcard
        if not entry.get("useful_for_flashcard", False):
            skipped_not_useful += 1
            continue
        # A JSON null arrives as None, which `.get(key, "")` passes through;
        # `or ""` turns it into an ordinary skipped entry instead of a crash.
        pos = (entry.get("pos") or "").lower().strip()
        if pos not in SUPPORTED_POS:
            skipped_pos += 1
            continue
        cefr = (entry.get("cefr_level") or "").upper().strip()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue
        word = (entry.get("word") or "").lower().strip()
        if not word:
            skipped_empty_word += 1
            continue
        records.append({"word": word, "pos": pos, "cefr": cefr, "source": "italian"})

    # Write output
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats: single pass over the kept records.
    pos_counts = {pos: 0 for pos in SUPPORTED_POS}
    cefr_distribution = {level: 0 for level in CEFR_LEVELS}
    for r in records:
        pos_counts[r["pos"]] += 1
        cefr_distribution[r["cefr"]] += 1

    print(f"\nExtracted: {len(records)} records")
    print(f" - Nouns: {pos_counts['noun']}")
    print(f" - Verbs: {pos_counts['verb']}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if cefr_distribution[level] > 0:
            print(f" - {level}: {cefr_distribution[level]}")
    print("\nSkipped:")
    print(f" - Not useful for flashcard: {skipped_not_useful}")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {output_file}")


if __name__ == "__main__":
    extract()

View file

@ -1,58 +0,0 @@
/**
 * Manual smoke test for the game API on localhost:3000.
 *
 * Starts a game, answers every question with option 0, then sends two
 * requests and logs their status codes: a start request with an incomplete
 * body (logged as "400 test") and an answer for a session id of all zeros
 * (logged as "404 test").
 */
async function main() {
  const baseUrl = "http://localhost:3000/api/v1/game";

  // Every request in this script is a JSON POST; build them in one place.
  const postJson = (path, payload) =>
    fetch(`${baseUrl}${path}`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
    });

  // Step 1: start a game
  const startResponse = await postJson("/start", {
    source_language: "en",
    target_language: "it",
    pos: "noun",
    difficulty: "easy",
    rounds: "3",
  });
  const game = await startResponse.json();
  console.log("Game started:", JSON.stringify(game, null, 2));

  // Stop with a readable error instead of a TypeError on `game.data` when
  // the start request did not return a playable game.
  if (!startResponse.ok || !game.data || !Array.isArray(game.data.questions)) {
    throw new Error(`Could not start game (HTTP ${startResponse.status})`);
  }

  // Step 2: answer each question (always pick option 0)
  for (const question of game.data.questions) {
    const answerResponse = await postJson("/answer", {
      sessionId: game.data.sessionId,
      questionId: question.questionId,
      selectedOptionId: 0,
    });
    const result = await answerResponse.json();
    console.log("Raw result:", JSON.stringify(result, null, 2));
    console.log(
      `${question.prompt}: ${result.data.isCorrect ? "✓" : "✗"} (picked ${0}, correct was ${result.data.correctOptionId})`,
    );
  }

  // Incomplete body: only source_language is sent.
  const badRequest = await postJson("/start", { source_language: "en" });
  console.log("400 test:", badRequest.status, await badRequest.json());

  // Send a valid shape but a session that doesn't exist
  const notFound = await postJson("/answer", {
    sessionId: "00000000-0000-0000-0000-000000000000",
    questionId: "00000000-0000-0000-0000-000000000000",
    selectedOptionId: 0,
  });
  console.log("404 test:", notFound.status, await notFound.json());
}

// Report failures (e.g. server not running) instead of leaving an unhandled
// promise rejection.
main().catch((error) => {
  console.error("Smoke test failed:", error);
  process.exitCode = 1;
});

View file

@ -1,159 +0,0 @@
#!/usr/bin/env python3
"""
CEFR Data Pipeline - Stage 3: English Merge
Merges extracted JSON files for English into an authoritative dataset.
"""
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
# Supported CEFR levels and difficulty mapping
CEFR_LEVELS = {"A1", "A2", "B1", "B2", "C1", "C2"}
# Maps each CEFR level to the difficulty label stored on merged entries.
DIFFICULTY_MAP = {
"A1": "easy",
"A2": "easy",
"B1": "intermediate",
"B2": "intermediate",
"C1": "hard",
"C2": "hard",
}
# Source priority order (from lowest to highest priority)
# Higher index = higher authority when conflicts occur
# Names are matched against the "<name>-extracted.json" file stems.
PRIORITY_ORDER = ["random", "octanove", "cefrj", "en_m3"]
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Load all ``*-extracted.json`` files from the English data directory.

    Args:
        data_dir: Directory to search (non-recursively).

    Returns:
        Mapping of source name (the file stem without its trailing
        ``-extracted``) to the list loaded from that file. Files whose JSON
        root is not a list are reported and skipped.
    """
    suffix = "-extracted"
    sources: Dict[str, List[dict]] = {}
    # Path.glob yields files in arbitrary order; sort for reproducible runs.
    for file_path in sorted(data_dir.glob(f"*{suffix}.json")):
        # Drop only the trailing marker; str.replace would also remove an
        # earlier "-extracted" that is part of the source name.
        source_name = file_path.stem[: -len(suffix)]
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            sources[source_name] = data
        else:
            print(f"Warning: {file_path} does not contain a list, skipping.")
    return sources
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Build the ``(word, pos)`` merge key: both lowercased and trimmed."""
    word = entry["word"].lower().strip()
    pos = entry["pos"].lower().strip()
    return word, pos
def get_source_priority(source_name: str) -> int:
    """Return the source's index in PRIORITY_ORDER (higher = more authoritative).

    Sources that are not listed get -1, i.e. the lowest priority.
    """
    if source_name in PRIORITY_ORDER:
        return PRIORITY_ORDER.index(source_name)
    return -1
def merge_entries(sources: Dict[str, List[dict]]) -> List[dict]:
    """Merge entries from multiple sources, resolving conflicts by priority.

    Entries are grouped by their normalized ``(word, pos)`` key. For each
    group the CEFR level of the highest-priority source wins, and the
    matching difficulty is looked up in DIFFICULTY_MAP ("unknown" if the
    level is not mapped). Merge statistics are printed to stdout.

    Args:
        sources: Mapping of source name to its list of entries; each entry
            needs the keys "word", "pos" and "cefr".

    Returns:
        One dict per unique ``(word, pos)`` with the keys "word", "pos",
        "cefr", "difficulty" and "sources" (sorted, distinct source names).
    """
    grouped = defaultdict(list)
    for src_name, entries in sources.items():
        for entry in entries:
            grouped[normalize_entry(entry)].append((src_name, entry["cefr"]))

    merged = []
    conflicts_resolved = 0
    total_multi_source = 0
    for (word, pos), candidates in grouped.items():
        # Distinct names only: a word listed twice by one source is still a
        # single-source word and must not repeat that source's name.
        contributing_sources = sorted({src for src, _ in candidates})
        if len(contributing_sources) > 1:
            total_multi_source += 1
        # Any disagreement on the level counts as a conflict to resolve.
        if len({cefr for _, cefr in candidates}) > 1:
            conflicts_resolved += 1
        # max() returns the first maximal item, so among equal priorities
        # the entry that was loaded first wins.
        _, final_cefr = max(candidates, key=lambda c: get_source_priority(c[0]))
        merged.append(
            {
                "word": word,
                "pos": pos,
                "cefr": final_cefr,
                "difficulty": DIFFICULTY_MAP.get(final_cefr, "unknown"),
                "sources": contributing_sources,
            }
        )

    print("Merge statistics:")
    print(f" Total unique entries: {len(merged)}")
    print(f" Entries with multiple sources: {total_multi_source}")
    print(f" Conflicts resolved by priority: {conflicts_resolved}")
    return merged
def print_summary(merged: List[dict]):
    """Print how many merged entries fall into each CEFR level and difficulty.

    CEFR levels with no entries are omitted; the three difficulty labels are
    always printed, with 0 when unused.
    """
    cefr_counts = {}
    difficulty_counts = {}
    for item in merged:
        cefr_counts[item["cefr"]] = cefr_counts.get(item["cefr"], 0) + 1
        difficulty_counts[item["difficulty"]] = (
            difficulty_counts.get(item["difficulty"], 0) + 1
        )

    print("\n📊 Final CEFR distribution:")
    for level in sorted(CEFR_LEVELS):
        if level in cefr_counts:
            print(f" {level}: {cefr_counts[level]}")

    print("\n📊 Final difficulty distribution:")
    for label in ("easy", "intermediate", "hard"):
        print(f" {label}: {difficulty_counts.get(label, 0)}")
def main():
    """Merge the extracted English sources and write ``english-merged.json``.

    Input is read from ``../data-sources/english`` and output is written to
    ``../datafiles`` (both relative to this script). Prints a message and
    returns early when the input directory is missing or holds no extracted
    files.
    """
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data-sources" / "english"
    output_dir = base_dir / "datafiles"
    output_file = output_dir / "english-merged.json"

    if not data_dir.exists():
        print(f"Error: English data directory not found: {data_dir}")
        return
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading extracted files from {data_dir}...")
    sources = load_extracted_files(data_dir)
    if not sources:
        print("No extracted files found.")
        return

    source_names = ", ".join(sources.keys())
    print(f"Found sources: {source_names}")
    print(f"Priority order (lowest to highest): {PRIORITY_ORDER}")

    merged = merge_entries(sources)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Merged dataset written to: {output_file}")
    print_summary(merged)


if __name__ == "__main__":
    main()

View file

@ -1,159 +0,0 @@
#!/usr/bin/env python3
"""
CEFR Data Pipeline - Stage 3: Italian Merge
Merges extracted JSON files for Italian into an authoritative dataset.
"""
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
# Supported CEFR levels and difficulty mapping
CEFR_LEVELS = {"A1", "A2", "B1", "B2", "C1", "C2"}
# Maps each CEFR level to the difficulty label stored on merged entries.
DIFFICULTY_MAP = {
"A1": "easy",
"A2": "easy",
"B1": "intermediate",
"B2": "intermediate",
"C1": "hard",
"C2": "hard",
}
# Source priority order (from lowest to highest priority)
# Higher index = higher authority when conflicts occur
# Names are matched against the "<name>-extracted.json" file stems.
PRIORITY_ORDER = ["italian", "it_m3"]
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Load all ``*-extracted.json`` files from the Italian data directory.

    Args:
        data_dir: Directory to search (non-recursively).

    Returns:
        Mapping of source name (the file stem without its trailing
        ``-extracted``) to the list loaded from that file. Files whose JSON
        root is not a list are reported and skipped.
    """
    suffix = "-extracted"
    sources: Dict[str, List[dict]] = {}
    # Path.glob yields files in arbitrary order; sort for reproducible runs.
    for file_path in sorted(data_dir.glob(f"*{suffix}.json")):
        # Drop only the trailing marker; str.replace would also remove an
        # earlier "-extracted" that is part of the source name.
        source_name = file_path.stem[: -len(suffix)]
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            sources[source_name] = data
        else:
            print(f"Warning: {file_path} does not contain a list, skipping.")
    return sources
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Build the ``(word, pos)`` merge key: both lowercased and trimmed."""
    word = entry["word"].lower().strip()
    pos = entry["pos"].lower().strip()
    return word, pos
def get_source_priority(source_name: str) -> int:
    """Return the source's index in PRIORITY_ORDER (higher = more authoritative).

    Sources that are not listed get -1, i.e. the lowest priority.
    """
    if source_name in PRIORITY_ORDER:
        return PRIORITY_ORDER.index(source_name)
    return -1
def merge_entries(sources: Dict[str, List[dict]]) -> List[dict]:
    """Merge entries from multiple sources, resolving conflicts by priority.

    Entries are grouped by their normalized ``(word, pos)`` key. For each
    group the CEFR level of the highest-priority source wins, and the
    matching difficulty is looked up in DIFFICULTY_MAP ("unknown" if the
    level is not mapped). Merge statistics are printed to stdout.

    Args:
        sources: Mapping of source name to its list of entries; each entry
            needs the keys "word", "pos" and "cefr".

    Returns:
        One dict per unique ``(word, pos)`` with the keys "word", "pos",
        "cefr", "difficulty" and "sources" (sorted, distinct source names).
    """
    grouped = defaultdict(list)
    for src_name, entries in sources.items():
        for entry in entries:
            grouped[normalize_entry(entry)].append((src_name, entry["cefr"]))

    merged = []
    conflicts_resolved = 0
    total_multi_source = 0
    for (word, pos), candidates in grouped.items():
        # Distinct names only: a word listed twice by one source is still a
        # single-source word and must not repeat that source's name.
        contributing_sources = sorted({src for src, _ in candidates})
        if len(contributing_sources) > 1:
            total_multi_source += 1
        # Any disagreement on the level counts as a conflict to resolve.
        if len({cefr for _, cefr in candidates}) > 1:
            conflicts_resolved += 1
        # max() returns the first maximal item, so among equal priorities
        # the entry that was loaded first wins.
        _, final_cefr = max(candidates, key=lambda c: get_source_priority(c[0]))
        merged.append(
            {
                "word": word,
                "pos": pos,
                "cefr": final_cefr,
                "difficulty": DIFFICULTY_MAP.get(final_cefr, "unknown"),
                "sources": contributing_sources,
            }
        )

    print("Merge statistics:")
    print(f" Total unique entries: {len(merged)}")
    print(f" Entries with multiple sources: {total_multi_source}")
    print(f" Conflicts resolved by priority: {conflicts_resolved}")
    return merged
def print_summary(merged: List[dict]):
    """Print how many merged entries fall into each CEFR level and difficulty.

    CEFR levels with no entries are omitted; the three difficulty labels are
    always printed, with 0 when unused.
    """
    cefr_counts = {}
    difficulty_counts = {}
    for item in merged:
        cefr_counts[item["cefr"]] = cefr_counts.get(item["cefr"], 0) + 1
        difficulty_counts[item["difficulty"]] = (
            difficulty_counts.get(item["difficulty"], 0) + 1
        )

    print("\n📊 Final CEFR distribution:")
    for level in sorted(CEFR_LEVELS):
        if level in cefr_counts:
            print(f" {level}: {cefr_counts[level]}")

    print("\n📊 Final difficulty distribution:")
    for label in ("easy", "intermediate", "hard"):
        print(f" {label}: {difficulty_counts.get(label, 0)}")
def main():
    """Merge the extracted Italian sources and write ``italian-merged.json``.

    Input is read from ``../data-sources/italian`` and output is written to
    ``../datafiles`` (both relative to this script). Prints a message and
    returns early when the input directory is missing or holds no extracted
    files.
    """
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "data-sources" / "italian"
    output_dir = base_dir / "datafiles"
    output_file = output_dir / "italian-merged.json"

    if not data_dir.exists():
        print(f"Error: Italian data directory not found: {data_dir}")
        return
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading extracted files from {data_dir}...")
    sources = load_extracted_files(data_dir)
    if not sources:
        print("No extracted files found.")
        return

    source_names = ", ".join(sources.keys())
    print(f"Found sources: {source_names}")
    print(f"Priority order (lowest to highest): {PRIORITY_ORDER}")

    merged = merge_entries(sources)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Merged dataset written to: {output_file}")
    print_summary(merged)


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,2 +0,0 @@
wn==1.1.0
openpyxl==3.1.5

View file

@ -3,7 +3,8 @@
{ "path": "./packages/shared" },
{ "path": "./packages/db" },
{ "path": "./apps/web" },
{ "path": "./apps/api" }
{ "path": "./apps/api" },
{ "path": "./data-pipeline" },
],
"files": []
"files": [],
}