Compare commits
No commits in common. "0dba68904e3240d27f157a103b403dbb592ad57e" and "0a0bafa0ec2072c7058c074852f276d3e4d213c3" have entirely different histories.
0dba68904e
...
0a0bafa0ec
85 changed files with 5599688 additions and 8651 deletions
|
|
@ -8,9 +8,6 @@ jobs:
|
|||
build-and-deploy:
|
||||
runs-on: docker
|
||||
steps:
|
||||
- name: Install tools
|
||||
run: apt-get update && apt-get install -y docker.io openssh-client
|
||||
|
||||
- name: Checkout code
|
||||
uses: https://data.forgejo.org/actions/checkout@v4
|
||||
|
||||
|
|
|
|||
6
.gitignore
vendored
6
.gitignore
vendored
|
|
@ -9,9 +9,3 @@ repomix/
|
|||
venv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
||||
data-pipeline/archive/
|
||||
data-pipeline/stage-1-extract/output/
|
||||
data-pipeline/stage-2-annotate/output/
|
||||
data-pipeline/stage-3-enrich/output/
|
||||
data-pipeline/stage-4-merge/output/
|
||||
|
|
|
|||
|
|
@ -10,9 +10,6 @@ import type { GameRequest } from "@lila/shared";
|
|||
const LABELS: Record<string, string> = {
|
||||
en: "English",
|
||||
it: "Italian",
|
||||
de: "German",
|
||||
fr: "French",
|
||||
es: "Spanish",
|
||||
noun: "Nouns",
|
||||
verb: "Verbs",
|
||||
easy: "Easy",
|
||||
|
|
|
|||
|
|
@ -1,17 +0,0 @@
|
|||
{
|
||||
"name": "@lila/pipeline",
|
||||
"version": "1.0.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {},
|
||||
"dependencies": {
|
||||
"@lila/shared": "workspace:*",
|
||||
"better-sqlite3": "^12.9.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/better-sqlite3": "^7.6.13",
|
||||
"@types/node": "^24.12.0",
|
||||
"tsx": "^4.21.0",
|
||||
"typescript": "^5.9.3"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,204 +0,0 @@
|
|||
"""
|
||||
data-pipeline/stage-1-extract/scripts/extract.py
|
||||
|
||||
Extract all synsets from the Open Multilingual Wordnet (OMW) for all
|
||||
supported languages and parts of speech.
|
||||
|
||||
Output: a single combined JSON file, written to stage-1-extract/output/
    omw.json

The file is a JSON array of synset records:
|
||||
{
|
||||
"source_id": "ili:i12345",
|
||||
"pos": "noun",
|
||||
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
|
||||
"glosses": { "en": ["a domesticated animal..."] },
|
||||
"examples": { "en": ["the dog barked at the stranger"] }
|
||||
}
|
||||
|
||||
Usage:
|
||||
python stage-1-extract/scripts/extract.py
|
||||
python stage-1-extract/scripts/extract.py --sample
|
||||
|
||||
Prerequisites:
|
||||
pip install wn
|
||||
python -m wn download omw-en:1.4
|
||||
python -m wn download omw-it:1.4
|
||||
python -m wn download omw-de:1.4
|
||||
python -m wn download omw-es:1.4
|
||||
python -m wn download omw-fr:1.4
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import wn
|
||||
|
||||
# Language codes for which extract_all() opens a wordnet (wn.Wordnet(lang=...)).
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it", "es", "de", "fr"]
# Maps the POS tags passed to Wordnet.synsets(pos=...) to the labels written
# into the "pos" field of the output records.
POS_MAP: dict[str, str] = {
    "n": "noun",
    "v": "verb",
    "a": "adjective",
    "s": "adjective",  # adjective satellite — collapsed into adjective
    "r": "adverb",
}
|
||||
|
||||
|
||||
def extract_all(
    output_dir: str = "stage-1-extract/output", sample: bool = False
) -> None:
    """Extract synsets for every supported language into one JSON file.

    Opens one wordnet per entry in SUPPORTED_LANGUAGE_CODES, groups the
    lemmas, definitions and examples of every synset that has an ILI by that
    ILI, and writes the resulting records to ``<output_dir>/omw.json``.
    A coverage summary is printed at the end.

    Args:
        output_dir: Directory that receives ``omw.json``; created if missing.
        sample: When true, only the first 100 ILIs (in sorted order) are
            written.

    The process exits with status 1 if any wordnet raises ``wn.Error``
    while loading.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    limit = 100 if sample else None

    # Open every wordnet first so a missing download fails fast.
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for code in SUPPORTED_LANGUAGE_CODES:
        try:
            loaded = wn.Wordnet(lang=code)
            wordnets[code] = loaded
            total_synsets = len(loaded.synsets())
            print(f" {code}: {total_synsets:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {code}: {e}")
            print(f" Run: python -m wn download omw-{code}:1.4")
            sys.exit(1)

    # by_ili[ili] holds a "pos" label plus one sub-dict per language code.
    print("\nExtracting synsets...")
    by_ili: dict[str, dict] = {}
    merged_fields = ("lemmas", "glosses", "examples")

    for code, net in wordnets.items():
        for pos_tag, pos_name in POS_MAP.items():
            with_ili = 0
            for syn in net.synsets(pos=pos_tag):
                ili = syn.ili
                if not ili:
                    continue
                with_ili += 1

                fresh = {
                    "lemmas": [str(lemma) for lemma in syn.lemmas()],
                    "glosses": [d for d in syn.definitions() if d],
                    "examples": [x for x in syn.examples() if x],
                }

                # The first synset seen for an ILI decides its POS label.
                entry = by_ili.setdefault(ili, {"pos": pos_name})
                current = entry.get(code)
                if current is None:
                    entry[code] = fresh
                    continue

                # Same ILI seen again for this language (e.g. 'a' and 's'
                # both map to adjective): merge, keeping first-seen order
                # and dropping duplicates.
                for field in merged_fields:
                    current[field] = list(
                        dict.fromkeys(current[field] + fresh[field])
                    )

            print(f" {code} {pos_name}: {with_ili:,} synsets with ILI")

    # Turn the grouped data into output records.
    print("\nBuilding records...")
    ordered = sorted(by_ili)
    selected = ordered[:limit] if limit else ordered

    records: list[dict] = []
    for ili in selected:
        data = by_ili[ili]
        per_lang = {k: v for k, v in data.items() if k != "pos"}
        # Languages with an empty list for a field are left out of that field.
        records.append(
            {
                "source_id": f"ili:{ili}",
                "pos": data["pos"],
                "translations": {
                    k: v["lemmas"] for k, v in per_lang.items() if v["lemmas"]
                },
                "glosses": {
                    k: v["glosses"] for k, v in per_lang.items() if v["glosses"]
                },
                "examples": {
                    k: v["examples"] for k, v in per_lang.items() if v["examples"]
                },
            }
        )

    output_file = target_dir / "omw.json"
    with output_file.open("w", encoding="utf-8") as handle:
        json.dump(records, handle, indent=2, ensure_ascii=False)

    print(f"\nWrote {len(records):,} synsets → {output_file}")
    _print_coverage(records)
|
||||
|
||||
|
||||
def _print_coverage(records: list[dict]) -> None:
|
||||
"""Print per-language translation, gloss, and example counts."""
|
||||
lang_stats: dict[str, dict[str, int]] = {}
|
||||
for lang in SUPPORTED_LANGUAGE_CODES:
|
||||
lang_stats[lang] = {"translations": 0, "glosses": 0, "examples": 0}
|
||||
|
||||
pos_stats: dict[str, int] = {}
|
||||
|
||||
for r in records:
|
||||
pos = r["pos"]
|
||||
pos_stats[pos] = pos_stats.get(pos, 0) + 1
|
||||
|
||||
for lang, lemmas in r["translations"].items():
|
||||
if lang in lang_stats:
|
||||
lang_stats[lang]["translations"] += len(lemmas)
|
||||
for lang, gloss_list in r["glosses"].items():
|
||||
if lang in lang_stats:
|
||||
lang_stats[lang]["glosses"] += len(gloss_list)
|
||||
for lang, example_list in r["examples"].items():
|
||||
if lang in lang_stats:
|
||||
lang_stats[lang]["examples"] += len(example_list)
|
||||
|
||||
print("\nPOS breakdown:")
|
||||
for pos, count in sorted(pos_stats.items()):
|
||||
print(f" {pos}: {count:,}")
|
||||
|
||||
print("\nCoverage per language:")
|
||||
for lang, counts in lang_stats.items():
|
||||
t = counts["translations"]
|
||||
g = counts["glosses"]
|
||||
e = counts["examples"]
|
||||
total = len(records)
|
||||
print(
|
||||
f" {lang}: {t:,} translations, {g:,} glosses, {e:,} examples (avg {(t / total):.1f} translations/synset)"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the options and run the extraction.
    parser = argparse.ArgumentParser(description="Extract OMW data to JSON")
    parser.add_argument(
        "--output-dir",
        default="stage-1-extract/output",
        help="Output directory for the combined omw.json file",
    )
    parser.add_argument(
        "--sample",
        action="store_true",
        # extract_all() truncates the sorted ILI list, not each language.
        help="Extract only the first 100 synsets (sorted by ILI) for inspection",
    )
    args = parser.parse_args()

    extract_all(output_dir=args.output_dir, sample=args.sample)
|
||||
|
|
@ -1,227 +0,0 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||
|
||||
// ── Types ────────────────────────────────────────────────────────────────────
|
||||
|
||||
// Example sentence copied from the OMW extract.
type OmwExample = { text: string; source: "omw" };

// Example sentence taken from a CEFR source entry.
type CefrExample = { text: string; source: "cefr" };

type Example = OmwExample | CefrExample;

// One record of the stage-1 extract (PATHS.omw), as parsed by annotate().
type OmwRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, string[]>>;
};

// One record of a per-language output file written by annotate().
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // language -> translation word -> CEFR level found in the source index
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
};

// One entry of a CEFR source file (<PATHS.cefrDir>/<lang>.json).
type CefrSourceEntry = {
  word: string;
  pos: string;
  cefr_level: string;
  // Optional; when present it is stored in the index as `example`.
  example_sentence_native?: string;
};

// A word/POS pair that appears with more than one CEFR level in a source file.
type ConflictEntry = {
  word: string;
  pos: string;
  language: SupportedLanguageCode;
  levels: string[];
};
|
||||
|
||||
// ── Constants ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Maps POS labels found in CEFR source files (looked up lower-cased and
// trimmed) to the shared POS names. Entries with a label not listed here are
// skipped by loadCefrSource().
const POS_NORMALIZE: Record<string, SupportedPos> = {
  noun: "noun",
  n: "noun",
  nom: "noun", // French
  verb: "verb",
  verbs: "verb",
  v: "verb",
  v1: "verb",
  adjective: "adjective",
  adjektiv: "adjective", // German
  adj: "adjective",
  adverb: "adverb",
  adverbs: "adverb",
  adv: "adverb",
};

// Accepted CEFR level strings; entries with any other level are skipped.
const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);

// Input and output locations. They are relative paths, so they resolve
// against the current working directory.
const PATHS = {
  omw: "stage-1-extract/output/omw.json",
  cefrDir: "stage-2-annotate/sources/cefr",
  outputDir: "stage-2-annotate/output",
};
|
||||
|
||||
// ── CEFR source loading ───────────────────────────────────────────────────────
|
||||
|
||||
// Lookup table keyed by "<lower-cased trimmed word>|<normalized pos>".
type CefrIndex = Map<string, { level: string; example?: string }>;

/**
 * Load the CEFR source file for one language and build a lookup index.
 *
 * Reads `<PATHS.cefrDir>/<lang>.json`, skips entries whose POS is not in
 * POS_NORMALIZE or whose level is not in CEFR_LEVELS, and groups the rest by
 * "word|pos". Groups seen with more than one level are reported as conflicts
 * and left out of the index.
 *
 * Each entry is normalized once and its word and POS are kept on the group,
 * so conflicts are reported correctly even for words that contain "|".
 * Entries whose `word` or `pos` is not a string are skipped instead of
 * throwing. When a word/POS pair occurs several times with the same level,
 * the last non-empty example sentence is kept.
 *
 * @param lang Language whose source file is loaded.
 * @returns The index plus the list of conflicting word/POS pairs.
 * @throws If the file cannot be read or is not valid JSON.
 */
async function loadCefrSource(
  lang: SupportedLanguageCode,
): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
  const raw = await fs.readFile(filepath, "utf-8");
  const entries = JSON.parse(raw) as CefrSourceEntry[];

  type Group = {
    word: string;
    pos: SupportedPos;
    level: string;
    levels: Set<string>;
    example?: string;
  };

  // Single pass: normalize each entry once and group by "word|pos".
  const groups = new Map<string, Group>();
  for (const entry of entries) {
    if (typeof entry?.word !== "string" || typeof entry?.pos !== "string") {
      continue;
    }
    const pos = POS_NORMALIZE[entry.pos.toLowerCase().trim()];
    if (!pos) continue;
    if (!CEFR_LEVELS.has(entry.cefr_level)) continue;

    const word = entry.word.toLowerCase().trim();
    const key = `${word}|${pos}`;

    let group = groups.get(key);
    if (!group) {
      group = { word, pos, level: entry.cefr_level, levels: new Set() };
      groups.set(key, group);
    }
    group.levels.add(entry.cefr_level);
    group.level = entry.cefr_level;
    if (entry.example_sentence_native) {
      group.example = entry.example_sentence_native;
    }
  }

  // Split the groups into conflicts and index entries (first-seen order).
  const conflicts: ConflictEntry[] = [];
  const index: CefrIndex = new Map();
  for (const [key, group] of groups) {
    if (group.levels.size > 1) {
      conflicts.push({
        word: group.word,
        pos: group.pos,
        language: lang,
        levels: [...group.levels],
      });
      continue;
    }
    index.set(key, {
      level: group.level,
      ...(group.example ? { example: group.example } : {}),
    });
  }

  return { index, conflicts };
}
|
||||
|
||||
// ── Annotation ────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Build the annotated form of one OMW record for one language.
 *
 * OMW examples for all languages are converted to `{ text, source: "omw" }`.
 * Each translation in `lang` is looked up in `index` under
 * "<lower-cased trimmed word>|<pos>"; a hit adds a vote and, if the index
 * entry has an example, a `{ source: "cefr" }` example for `lang`.
 *
 * @returns The annotated record and the number of translations that matched.
 */
function annotateRecord(
  record: OmwRecord,
  lang: SupportedLanguageCode,
  index: CefrIndex,
): { annotated: AnnotatedRecord; matched: number } {
  const annotated: AnnotatedRecord = {
    source_id: record.source_id,
    pos: record.pos,
    translations: record.translations,
    glosses: record.glosses,
    examples: {},
    votes: {},
  };

  // Convert OMW examples to typed format
  for (const [l, exList] of Object.entries(record.examples)) {
    if (!exList) continue; // tolerate a missing/null list in the input
    annotated.examples[l as SupportedLanguageCode] = exList.map((text) => ({
      text,
      source: "omw" as const,
    }));
  }

  // Match translations for this language against CEFR index
  let matched = 0;
  for (const word of record.translations[lang] ?? []) {
    const cefrEntry = index.get(`${word.toLowerCase().trim()}|${record.pos}`);
    if (!cefrEntry) continue;

    matched++;

    // Add CEFR vote (keyed by the translation exactly as it appears in OMW)
    let langVotes = annotated.votes[lang];
    if (!langVotes) {
      langVotes = {};
      annotated.votes[lang] = langVotes;
    }
    langVotes[word] = { cefr_source: cefrEntry.level };

    // Add native example if present
    const exampleText = cefrEntry.example;
    if (!exampleText) continue;

    let langExamples = annotated.examples[lang];
    if (!langExamples) {
      langExamples = [];
      annotated.examples[lang] = langExamples;
    }
    // Two translations can normalize to the same index key; add the shared
    // example only once.
    const alreadyPresent = langExamples.some(
      (ex) => ex.source === "cefr" && ex.text === exampleText,
    );
    if (!alreadyPresent) {
      langExamples.push({ text: exampleText, source: "cefr" as const });
    }
  }

  return { annotated, matched };
}

/**
 * Annotate the OMW extract with CEFR levels.
 *
 * Reads PATHS.omw, loads the CEFR index of every supported language, writes
 * all detected conflicts to `<PATHS.outputDir>/conflicts.json`, then writes
 * one `<lang>.json` per language containing every OMW record annotated
 * against that language's index.
 */
async function annotate(): Promise<void> {
  // Load OMW records
  console.log("Reading OMW extract...");
  const raw = await fs.readFile(PATHS.omw, "utf-8");
  const omwRecords = JSON.parse(raw) as OmwRecord[];
  console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`);

  // Load CEFR sources for all languages
  console.log("\nLoading CEFR source files...");
  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
  const allConflicts: ConflictEntry[] = [];

  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const { index, conflicts } = await loadCefrSource(lang);
    cefrIndexes.set(lang, index);
    allConflicts.push(...conflicts);
    console.log(
      ` ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`,
    );
  }

  // Write conflicts file
  await fs.mkdir(PATHS.outputDir, { recursive: true });
  await fs.writeFile(
    path.join(PATHS.outputDir, "conflicts.json"),
    JSON.stringify(allConflicts, null, 2),
    "utf-8",
  );
  console.log(
    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
  );

  // Annotate and write one file per language
  console.log("\nAnnotating...");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const index = cefrIndexes.get(lang)!;
    const records: AnnotatedRecord[] = [];
    let matched = 0;

    for (const record of omwRecords) {
      const result = annotateRecord(record, lang, index);
      matched += result.matched;
      records.push(result.annotated);
    }

    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
    console.log(
      ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
    );
  }
}

// ── Main ─────────────────────────────────────────────────────────────────────

annotate().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,205 +0,0 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// Example sentence tagged with the source it came from.
type Example = { text: string; source: "omw" | "cefr" };

// One record of an annotated per-language file read from PATHS.annotatedDir.
type AnnotatedRecord = {
  source_id: string;
  pos: SupportedPos;
  translations: Partial<Record<SupportedLanguageCode, string[]>>;
  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
  // language -> word -> CEFR vote
  votes: Partial<
    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
  >;
};

// An annotated record plus the name of the bucket it was sampled for.
type SampleRecord = AnnotatedRecord & { _sample_bucket: string };
|
||||
|
||||
// ── Constants ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Input directory (annotated per-language files) and output file of the
// sampler. Both are relative paths.
const PATHS = {
  annotatedDir: "stage-2-annotate/output",
  output: "test/output/sample.json",
};

// Maximum number of records drawn per bucket; the pos_spread bucket divides
// this evenly across its POS list.
const BUCKET_SIZE = 20;
|
||||
|
||||
// ── Bucket predicates ─────────────────────────────────────────────────────────
|
||||
|
||||
// A named sampling bucket: records for which `predicate` returns true are
// candidates for the bucket.
type Bucket = { name: string; predicate: (record: AnnotatedRecord) => boolean };

// Buckets are sampled in this order by main(); a record already picked for an
// earlier bucket is excluded from later ones.
const BUCKETS: Bucket[] = [
  {
    // At least one language has a non-empty votes map.
    name: "has_cefr_vote",
    predicate: (r) =>
      Object.values(r.votes).some(
        (langVotes) => Object.keys(langVotes ?? {}).length > 0,
      ),
  },
  {
    // No language has any vote; also true when `votes` has no keys at all,
    // because `every` on an empty array returns true.
    name: "no_cefr_vote",
    predicate: (r) =>
      Object.values(r.votes).every(
        (langVotes) => Object.keys(langVotes ?? {}).length === 0,
      ),
  },
  {
    // `glosses` and `examples` each contain at least one language key.
    name: "has_glosses_and_examples",
    predicate: (r) =>
      Object.keys(r.glosses).length > 0 && Object.keys(r.examples).length > 0,
  },
  {
    // NOTE(review): despite the name, this only inspects "fr" and "es" and
    // also requires that neither has votes; glosses/examples in other
    // languages are not checked. Confirm this is intended.
    name: "no_glosses_no_examples",
    predicate: (r) =>
      !r.glosses["fr"] &&
      !r.examples["fr"] &&
      !r.votes["fr"] &&
      !r.glosses["es"] &&
      !r.examples["es"] &&
      !r.votes["es"],
  },
  {
    // main() filters this bucket out of the generic loop and fills it with
    // samplePosBucket() instead, so this predicate is not used there.
    name: "pos_spread",
    predicate: () => true, // sampled separately to ensure POS coverage
  },
];
|
||||
|
||||
// ── Sampling ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Draw up to `size` random records that satisfy `predicate`.
 *
 * Records whose `source_id` is in `exclude` are never considered. The
 * candidate list is shuffled in place with `Math.random()` (swapping from
 * the last index down to 1) and the first `size` items are returned, so
 * fewer than `size` records come back when there are not enough candidates.
 */
function sampleBucket(
  records: AnnotatedRecord[],
  predicate: (r: AnnotatedRecord) => boolean,
  size: number,
  exclude: Set<string>,
): AnnotatedRecord[] {
  const pool: AnnotatedRecord[] = [];
  for (const rec of records) {
    if (exclude.has(rec.source_id)) continue;
    if (predicate(rec)) pool.push(rec);
  }

  // Walk backwards, swapping each slot with a random slot at or before it.
  let last = pool.length - 1;
  while (last > 0) {
    const pick = Math.floor(Math.random() * (last + 1));
    const held = pool[last]!;
    pool[last] = pool[pick]!;
    pool[pick] = held;
    last--;
  }

  return pool.slice(0, size);
}
|
||||
|
||||
/**
 * Sample an equal share of BUCKET_SIZE for each part of speech.
 *
 * Calls sampleBucket() once per POS (noun, verb, adjective, adverb) with a
 * quota of floor(BUCKET_SIZE / 4) and returns the results concatenated in
 * that POS order. `exclude` is passed through unchanged.
 */
function samplePosBucket(
  records: AnnotatedRecord[],
  exclude: Set<string>,
): AnnotatedRecord[] {
  const allPos: SupportedPos[] = ["noun", "verb", "adjective", "adverb"];
  const quota = Math.floor(BUCKET_SIZE / allPos.length);

  return allPos.flatMap((wanted) =>
    sampleBucket(records, (r) => r.pos === wanted, quota, exclude),
  );
}
|
||||
|
||||
// ── Loading ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Load every annotated language file and merge them into one record set.
 *
 * `en.json` supplies the base records. For each other supported language,
 * records with a matching `source_id` contribute their votes (merged per
 * language with Object.assign) and their examples. Examples are appended
 * when the base record does not already contain the same source/text pair,
 * so CEFR examples are kept even when the base record already has OMW
 * examples for that language.
 *
 * @returns The merged records, in the order of `en.json`.
 * @throws If any language file cannot be read or parsed.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  const baseRaw = await fs.readFile(
    path.join(PATHS.annotatedDir, "en.json"),
    "utf-8",
  );
  const baseRecords = JSON.parse(baseRaw) as AnnotatedRecord[];

  // Build a map for fast lookup by source_id
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of baseRecords) {
    byId.set(record.source_id, record);
  }

  // Identity of an example for de-duplication.
  const exampleKey = (ex: Example): string => `${ex.source}\u0000${ex.text}`;

  // Merge votes and examples from the remaining language files
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    if (lang === "en") continue;
    const raw = await fs.readFile(
      path.join(PATHS.annotatedDir, `${lang}.json`),
      "utf-8",
    );
    const records = JSON.parse(raw) as AnnotatedRecord[];

    for (const record of records) {
      const target = byId.get(record.source_id);
      if (!target) continue;

      // Merge votes
      for (const [l, langVotes] of Object.entries(record.votes)) {
        const voteLang = l as SupportedLanguageCode;
        if (!target.votes[voteLang]) {
          target.votes[voteLang] = {};
        }
        Object.assign(target.votes[voteLang]!, langVotes);
      }

      // Merge examples: add those the base record does not have yet
      for (const [l, examples] of Object.entries(record.examples)) {
        const exampleLang = l as SupportedLanguageCode;
        const incoming = (examples ?? []) as Example[];
        const existing = target.examples[exampleLang];
        if (!existing) {
          target.examples[exampleLang] = [...incoming];
          continue;
        }
        const known = new Set(existing.map(exampleKey));
        for (const ex of incoming) {
          const key = exampleKey(ex);
          if (known.has(key)) continue;
          known.add(key);
          existing.push(ex);
        }
      }
    }
  }

  return [...byId.values()];
}
|
||||
|
||||
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Build the review sample and write it to PATHS.output.
 *
 * Samples every bucket in BUCKETS except "pos_spread" with sampleBucket(),
 * then fills "pos_spread" with samplePosBucket(). Records already picked are
 * excluded from later buckets, and each sampled record is tagged with the
 * name of its bucket.
 */
async function main(): Promise<void> {
  console.log("Loading annotated files...");
  const records = await loadAnnotated();
  console.log(` Loaded ${records.length.toLocaleString()} synsets`);

  const sampled: SampleRecord[] = [];
  const seen = new Set<string>();

  // Record the picks of one bucket and report how many there were.
  const collect = (picked: AnnotatedRecord[], bucketName: string): void => {
    for (const item of picked) {
      seen.add(item.source_id);
      sampled.push({ ...item, _sample_bucket: bucketName });
    }
    console.log(` ${bucketName}: ${picked.length} records`);
  };

  // Predicate-driven buckets first ...
  for (const bucket of BUCKETS) {
    if (bucket.name === "pos_spread") continue;
    collect(
      sampleBucket(records, bucket.predicate, BUCKET_SIZE, seen),
      bucket.name,
    );
  }

  // ... then the POS-balanced bucket.
  collect(samplePosBucket(records, seen), "pos_spread");

  console.log(`\nTotal sampled: ${sampled.length} records`);

  // Write output
  const outputDir = path.dirname(PATHS.output);
  await fs.mkdir(outputDir, { recursive: true });
  const serialized = JSON.stringify(sampled, null, 2);
  await fs.writeFile(PATHS.output, serialized, "utf-8");
  console.log(`Wrote sample → ${PATHS.output}`);
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
{
|
||||
"extends": "../tsconfig.base.json",
|
||||
"compilerOptions": {
|
||||
"module": "NodeNext",
|
||||
"moduleResolution": "NodeNext",
|
||||
"outDir": "dist",
|
||||
"rootDir": ".",
|
||||
"types": ["node"],
|
||||
},
|
||||
"references": [{ "path": "../packages/shared" }],
|
||||
"include": ["./**/*"],
|
||||
}
|
||||
7800
data-sources/english/cefrj-vocabulary-profile-1.5.csv
Normal file
7800
data-sources/english/cefrj-vocabulary-profile-1.5.csv
Normal file
File diff suppressed because it is too large
Load diff
BIN
data-sources/english/en_m3.xls
Normal file
BIN
data-sources/english/en_m3.xls
Normal file
Binary file not shown.
2137
data-sources/english/octanove-vocabulary-profile-c1c2-1.0.csv
Normal file
2137
data-sources/english/octanove-vocabulary-profile-c1c2-1.0.csv
Normal file
File diff suppressed because it is too large
Load diff
2987
data-sources/italian/it-list_with_glossas.csv
Normal file
2987
data-sources/italian/it-list_with_glossas.csv
Normal file
File diff suppressed because it is too large
Load diff
BIN
data-sources/italian/it_m3.xls
Normal file
BIN
data-sources/italian/it_m3.xls
Normal file
Binary file not shown.
517565
data-sources/italian/subtlex-it.csv
Normal file
517565
data-sources/italian/subtlex-it.csv
Normal file
File diff suppressed because it is too large
Load diff
661563
data-sources/italian/wordlist_of_italian_words_660000_parole_italiane.txt
Normal file
661563
data-sources/italian/wordlist_of_italian_words_660000_parole_italiane.txt
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,473 +0,0 @@
|
|||
# lila data pipeline
|
||||
|
||||
> **NOTE: BEFORE RUNNING THE PIPELINE, CONSIDER IMPROVING THE CEFR SOURCE
|
||||
> FILES IN `stage-2-annotate/sources/cefr/`. BETTER SOURCE COVERAGE MEANS
|
||||
> FEWER WORDS FOR THE LLM TO ANNOTATE FROM SCRATCH, FASTER OVERNIGHT RUNS,
|
||||
> AND HIGHER CONFIDENCE IN THE FINAL OUTPUT. SEE UNIVERSALCEFR
|
||||
> (huggingface.co/UniversalCEFR) AND CEFR-J
|
||||
> (github.com/openlanguageprofiles/olp-en-cefrj) AS STARTING POINTS.**
|
||||
|
||||
This pipeline extracts vocabulary data from the Open Multilingual Wordnet (OMW), annotates it with CEFR levels from curated source files, verifies and enriches annotations using local LLMs, and produces authoritative JSON files per language. These files are consumed by the seeder in `packages/db` to populate the database with terms, translations, glosses, CEFR levels, difficulty ratings, and LLM-generated descriptions.
|
||||
|
||||
## Overview
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
omw[(OMW SQLite DBs)]
|
||||
cefr[(CEFR JSON files)]
|
||||
extract[Extract]
|
||||
annotate[Annotate]
|
||||
enrich[Enrich]
|
||||
merge[Merge]
|
||||
final[(final/lang.json)]
|
||||
flagged[(flagged/lang.json)]
|
||||
seeder[packages/db seeder]
|
||||
db[(Database)]
|
||||
|
||||
omw --> extract
|
||||
cefr --> annotate
|
||||
extract --> annotate
|
||||
annotate --> enrich
|
||||
enrich --> merge
|
||||
merge --> final
|
||||
merge --> flagged
|
||||
final --> seeder
|
||||
seeder --> db
|
||||
```
|
||||
|
||||
Each stage is a standalone script that reads from the previous stage's output and produces one JSON file per language. Stages can be re-run independently without affecting earlier or later stages.
|
||||
|
||||
The enrich stage is the exception — it produces one checkpoint file per model run per language, plus a compiled votes file once all runs are complete. It is designed to run overnight, one model at a time, and is fully resumable if interrupted.
|
||||
|
||||
Only fully annotated output in `stage-4-merge/output/final/` reaches the database. Words for which no CEFR majority is reached across the source-file and LLM votes land in `stage-4-merge/output/flagged/` and wait for manual review before seeding.
|
||||
|
||||
## Data sources
|
||||
|
||||
### OMW / WordNet
|
||||
|
||||
The Open Multilingual Wordnet (OMW) is the base vocabulary source. It provides synsets — groups of synonymous words — with translations and glosses across multiple languages. The lexicon for each language is downloaded with the `wn` Python library, which stores all of them in a single SQLite database at `~/.wn_data/wn.db`. This file is not committed to git.
|
||||
|
||||
All four parts of speech are extracted: noun, verb, adjective, adverb. WordNet's adjective satellites are collapsed into adjective — this is a WordNet-internal distinction that has no relevance for language learning. Alongside translations and glosses, usage examples are extracted where available and stored in the database as term_examples.
|
||||
|
||||
See **Setup** for download instructions.
|
||||
|
||||
### CEFR source files
|
||||
|
||||
Per-language JSON files in `stage-2-annotate/sources/cefr/` provide the initial CEFR level annotations (paths in the table below are relative to `stage-2-annotate/`). These files do not cover the full vocabulary extracted from OMW — coverage varies by language. Gaps and disagreements are handled by the enrich stage.
|
||||
|
||||
| Language | File |
|
||||
|---|---|
|
||||
| English | `sources/cefr/en.json` |
|
||||
| Italian | `sources/cefr/it.json` |
|
||||
| Spanish | `sources/cefr/es.json` |
|
||||
| German | `sources/cefr/de.json` |
|
||||
| French | `sources/cefr/fr.json` |
|
||||
|
||||
These files are committed to git. For per-language coverage detail see `COVERAGE.md`.
|
||||
|
||||
### CEFR annotation and verification
|
||||
|
||||
CEFR levels are determined by a majority vote combining all available sources:
|
||||
|
||||
- The CEFR source file counts as one vote (if it has an entry for the word)
|
||||
- Each LLM model run counts as one vote
|
||||
|
||||
The LLMs verify existing annotations as well as filling gaps — a source file entry does not automatically win. Majority vote across all sources determines the final level.
|
||||
|
||||
If no majority is reached, the word is flagged for manual review and excluded from the database until resolved.
|
||||
|
||||
## Setup
|
||||
|
||||
### OMW databases
|
||||
|
||||
Download the OMW lexicon for each language using the `wn` Python
|
||||
library:
|
||||
|
||||
```bash
|
||||
python -m wn download omw-en:1.4
|
||||
python -m wn download omw-it:1.4
|
||||
python -m wn download omw-de:1.4
|
||||
python -m wn download omw-es:1.4
|
||||
python -m wn download omw-fr:1.4
|
||||
```
|
||||
|
||||
The data is stored automatically at `~/.wn_data/wn.db` and is not committed
|
||||
to git.
|
||||
|
||||
### LLM setup
|
||||
|
||||
See `LLM-SETUP.md`.
|
||||
|
||||
## Pipeline stages
|
||||
|
||||
The pipeline runs in five stages. Each stage is independent and can be re-run without affecting the others.
|
||||
|
||||
| Stage | What it does |
|
||||
|---|---|
|
||||
| 1. Extract | Reads OMW SQLite database, outputs normalized JSON per language |
|
||||
| 2. Annotate | Merges CEFR source files into extracted data, adds source file votes |
|
||||
| 3. Enrich | Runs local LLMs in two rounds — generation then voting |
|
||||
| 4. Merge | Resolves votes, derives difficulty, splits into final and flagged |
|
||||
| 5. Compare | Generates COVERAGE.md with detailed quality report |
|
||||
|
||||
### 1. Extract
|
||||
|
||||
Reads the OMW SQLite database (`~/.wn_data/wn.db`) and produces a single normalized JSON file containing all synsets with their translations, glosses, and usage examples across all five languages and all parts of speech. Adjective satellites are collapsed into adjective at this stage.
|
||||
|
||||
**Input:** `~/.wn_data/wn.db`
|
||||
**Output:** `stage-1-extract/output/omw.json`
|
||||
|
||||
```bash
|
||||
python stage-1-extract/scripts/extract.py
|
||||
```
|
||||
|
||||
Add `--sample` to extract 100 synsets for inspection before running the full
|
||||
extraction.
|
||||
|
||||
Each record in the output looks like this:
|
||||
|
||||
```json
|
||||
{
|
||||
"source_id": "ili:i1",
|
||||
"pos": "adjective",
|
||||
"translations": {
|
||||
"en": ["able"],
|
||||
"it": ["abile", "intelligente", "valente", "capace"],
|
||||
"es": ["capaz"],
|
||||
"fr": ["comptable"]
|
||||
},
|
||||
"glosses": {
|
||||
"en": ["(usually followed by 'to') having the necessary means or skill or know-how or authority to do something"]
|
||||
},
|
||||
"examples": {
|
||||
"en": ["able to swim", "she was able to program her computer"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Note: glosses and examples are not available for all languages. French and Spanish have no glosses or examples in the current OMW database — these will be generated by the LLM in the enrich stage. Coverage detail is in `COVERAGE.md`.
|
||||
|
||||
### 2. Annotate
|
||||
|
||||
Reads the combined OMW extract and merges CEFR source data into it. Each translation in each language is matched against the corresponding CEFR source
|
||||
file by word text and part of speech. Matched translations receive a `cefr_source` vote which carries into the enrich stage. Unmatched translations proceed without a vote.
|
||||
|
||||
This stage also extracts native example sentences from the CEFR source files and adds them to the record alongside OMW examples, with `source: "cefr"` to distinguish them.
|
||||
|
||||
Words appearing in the CEFR source file multiple times with different CEFR levels are written to `conflicts.json` for manual review and excluded from voting until resolved.
|
||||
|
||||
**Input:** `stage-1-extract/output/omw.json` + `stage-2-annotate/sources/cefr/{lang}.json`
|
||||
**Output:**
|
||||
- `stage-2-annotate/output/{lang}.json` — one per language
|
||||
- `stage-2-annotate/output/conflicts.json` — words with conflicting CEFR levels in the source files, collected across all languages for manual review
|
||||
|
||||
```bash
|
||||
pnpm --filter @lila/pipeline annotate
|
||||
```
|
||||
|
||||
Each record in the output extends the OMW record with a `votes` field and any additional examples from the CEFR source file:
|
||||
|
||||
```json
|
||||
{
|
||||
"source_id": "ili:i1",
|
||||
"pos": "adjective",
|
||||
"translations": {
|
||||
"en": ["able"],
|
||||
"it": ["abile", "intelligente", "valente", "capace"],
|
||||
"es": ["capaz"],
|
||||
"fr": ["comptable"]
|
||||
},
|
||||
"glosses": {
|
||||
"en": ["having the necessary means or skill to do something"]
|
||||
},
|
||||
"examples": {
|
||||
"en": [
|
||||
{ "text": "able to swim", "source": "omw" },
|
||||
{ "text": "She was able to finish the task.", "source": "cefr" }
|
||||
]
|
||||
},
|
||||
"votes": {
|
||||
"en": {
|
||||
"able": { "cefr_source": "B1" }
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Words not present in the CEFR source file will have an empty `votes` object.
|
||||
|
||||
### 3. Enrich
|
||||
|
||||
The enrich stage runs in two rounds, both designed to execute overnight one model at a time. The llama.cpp server must be running locally before starting either round. See `LLM-SETUP.md` for setup instructions.
|
||||
|
||||
**Round 1 — generation**
|
||||
|
||||
Each model processes every word in every language one term at a time and
|
||||
generates:
|
||||
|
||||
- A CEFR level vote for each translation
|
||||
- A description for each language
|
||||
- A translation for each language, only if OMW provides none
|
||||
- A gloss for each language, only if OMW provides none
|
||||
- Usage examples for each language, only if OMW provides none
|
||||
|
||||
OMW data is never duplicated — the script checks what OMW already provides before building the prompt. For translations, glosses and examples, if OMW data exists for that language the LLM skips generation entirely. This significantly reduces compute time for languages with good OMW coverage such as English.
|
||||
|
||||
All model-generated content is stored with an anonymised source (`model_1`, `model_2`, etc.) so that in round 2 no model can be biased by knowing which model produced which candidate.
|
||||
|
||||
**Input:** `stage-2-annotate/output/{lang}.json`
|
||||
**Output:** `stage-3-enrich/output/round1/{lang}_{model}.json` per run
|
||||
|
||||
```bash
|
||||
pnpm --filter @lila/pipeline enrich --round 1 --model {model}
|
||||
```
|
||||
|
||||
**Compiling candidates**
|
||||
|
||||
Once all round 1 runs are complete, compile all generated candidates into a single structured file per language. This is the input to round 2.
|
||||
|
||||
**Input:** `stage-3-enrich/output/round1/{lang}_{model}.json`
|
||||
**Output:** `stage-3-enrich/output/candidates/{lang}_candidates.json`
|
||||
|
||||
```bash
|
||||
pnpm --filter @lila/pipeline enrich --compile-candidates
|
||||
```
|
||||
|
||||
**Round 2 — voting**
|
||||
|
||||
Each model receives the compiled candidate list for every word and votes on:
|
||||
|
||||
- The best gloss candidate (if multiple exist)
|
||||
- The best description candidate (if multiple exist)
|
||||
- The best usage examples candidate (if multiple exist)
|
||||
- A CEFR level vote for each translation
|
||||
|
||||
OMW data is not put to a vote — it automatically wins over any LLM-generated candidate. Round 2 only resolves conflicts between model-generated candidates. The prompt is kept small — one word at a time, a clean numbered candidate list — to fit within a limited context window.
|
||||
|
||||
**Input:** `stage-3-enrich/output/candidates/{lang}_candidates.json`
|
||||
**Output:** `stage-3-enrich/output/round2/{lang}_{model}.json` per run
|
||||
|
||||
```bash
|
||||
pnpm --filter @lila/pipeline enrich --round 2 --model {model}
|
||||
```
|
||||
|
||||
**Compiling votes**
|
||||
|
||||
Once all round 2 runs are complete, compile all votes into a single file per language. This is the input to the merge stage.
|
||||
|
||||
**Input:** `stage-3-enrich/output/round2/{lang}_{model}.json`
|
||||
**Output:** `stage-3-enrich/output/votes/{lang}_votes.json`
|
||||
|
||||
```bash
|
||||
pnpm --filter @lila/pipeline enrich --compile-votes
|
||||
```
|
||||
|
||||
Each record in the votes file looks like this:
|
||||
|
||||
```json
|
||||
{
|
||||
"source_id": "omw-en-12345",
|
||||
"pos": "noun",
|
||||
"translations": {
|
||||
"en": [
|
||||
{
|
||||
"text": "dog",
|
||||
"votes": { "cefr_source": "A1", "model_1": "A1", "model_2": "A1" }
|
||||
},
|
||||
{
|
||||
"text": "canine",
|
||||
"votes": { "cefr_source": "B2", "model_1": "B2", "model_2": "B1" }
|
||||
}
|
||||
],
|
||||
"it": [
|
||||
{
|
||||
"text": "cane",
|
||||
"votes": { "cefr_source": "A1", "model_1": "A1", "model_2": "A1" }
|
||||
}
|
||||
]
|
||||
},
|
||||
"glosses": {
|
||||
"en": { "text": "a domesticated carnivorous mammal", "source": "omw" },
|
||||
"fr": {
|
||||
"candidates": [
|
||||
{ "text": "un mammifère carnivore domestiqué", "source": "model_1" },
|
||||
{ "text": "un animal domestique carnivore", "source": "model_2" }
|
||||
],
|
||||
"votes": { "model_1": 1, "model_2": 1 }
|
||||
}
|
||||
},
|
||||
"examples": {
|
||||
"en": [
|
||||
{ "text": "the dog barked at the stranger", "source": "omw" }
|
||||
],
|
||||
"fr": {
|
||||
"candidates": [
|
||||
{ "text": "le chien a aboyé", "source": "model_1" },
|
||||
{ "text": "le chien gardait la maison", "source": "model_2" }
|
||||
],
|
||||
"votes": { "model_1": 2, "model_2": 1 }
|
||||
}
|
||||
},
|
||||
"descriptions": {
|
||||
"en": {
|
||||
"candidates": [
|
||||
{ "text": "a common household pet known for loyalty", "source": "model_1" },
|
||||
{ "text": "a domesticated animal and loyal companion", "source": "model_2" }
|
||||
],
|
||||
"votes": { "model_1": 2, "model_2": 1 }
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Merge
|
||||
|
||||
Reads the votes file per language and resolves the final value for every field. Produces two output files per language — fully resolved records ready for seeding, and flagged records that need manual review.
|
||||
|
||||
**Merge rules:**
|
||||
|
||||
- OMW data wins automatically and is never overridden
|
||||
- For CEFR levels: the level with the most votes wins. If no majority is reached, that translation is flagged
|
||||
- For LLM-generated text fields (gloss, examples, descriptions): the candidate with the most votes wins
|
||||
|
||||
<!-- TODO: decide fallback strategy when no majority is reached for text fields -->
|
||||
|
||||
**Difficulty mapping:**
|
||||
|
||||
| CEFR | Difficulty |
|
||||
|---|---|
|
||||
| A1, A2 | easy |
|
||||
| B1, B2 | intermediate |
|
||||
| C1, C2 | hard |
|
||||
|
||||
**Input:** `stage-3-enrich/output/votes/{lang}_votes.json`
|
||||
**Output:**
|
||||
- `stage-4-merge/output/final/{lang}.json` — fully resolved, ready for seeding
|
||||
- `stage-4-merge/output/flagged/{lang}.json` — CEFR majority not reached, needs manual review before seeding
|
||||
|
||||
```bash
|
||||
pnpm --filter @lila/pipeline merge
|
||||
```
|
||||
|
||||
Each record in `final/{lang}.json` looks like this:
|
||||
|
||||
```json
|
||||
{
|
||||
"source_id": "omw-en-12345",
|
||||
"pos": "noun",
|
||||
"translations": {
|
||||
"en": [
|
||||
{ "text": "dog", "cefr_level": "A1", "difficulty": "easy" },
|
||||
{ "text": "canine", "cefr_level": "B2", "difficulty": "intermediate" }
|
||||
],
|
||||
"it": [
|
||||
{ "text": "cane", "cefr_level": "A1", "difficulty": "easy" }
|
||||
]
|
||||
},
|
||||
"glosses": {
|
||||
"en": { "text": "a domesticated carnivorous mammal", "source": "omw" },
|
||||
"fr": { "text": "un mammifère carnivore domestiqué", "source": "model_1" }
|
||||
},
|
||||
"examples": {
|
||||
"en": [
|
||||
{ "text": "the dog barked at the stranger", "source": "omw" }
|
||||
],
|
||||
"fr": [
|
||||
{ "text": "le chien a aboyé", "source": "model_1" }
|
||||
]
|
||||
},
|
||||
"descriptions": {
|
||||
"en": {
|
||||
"text": "a common household pet known for loyalty and companionship",
|
||||
"source": "model_1"
|
||||
},
|
||||
"it": {
|
||||
"text": "un animale domestico comune noto per la sua fedeltà",
|
||||
"source": "model_2"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Resolving flagged words:**
|
||||
|
||||
Open `stage-4-merge/output/flagged/{lang}.json`, manually set the correct `cefr_level` and `difficulty` for each flagged translation, then move the resolved entries into `stage-4-merge/output/final/{lang}.json`. Re-run the seeder after resolving.
|
||||
|
||||
### 5. Compare / QA
|
||||
|
||||
Read-only. Generates `COVERAGE.md` with a full breakdown of the pipeline
|
||||
output quality per language. Run this after merge to verify output before
|
||||
seeding the database.
|
||||
|
||||
**Input:**
|
||||
- `stage-4-merge/output/final/{lang}.json`
|
||||
- `stage-4-merge/output/flagged/{lang}.json`
|
||||
|
||||
**Output:** `COVERAGE.md`
|
||||
|
||||
```bash
|
||||
pnpm --filter @lila/pipeline compare
|
||||
```
|
||||
|
||||
`COVERAGE.md` reports the following per language:
|
||||
|
||||
- Total synsets extracted
|
||||
- Total translations per language
|
||||
- POS breakdown per language — word counts for noun, verb, adjective, adverb
|
||||
- CEFR coverage per language — how many translations have a resolved CEFR level, broken down by level (A1, A2, B1, B2, C1, C2)
|
||||
- Difficulty breakdown per language — word counts for easy, intermediate, hard
|
||||
- Flagged count per language — how many translations are awaiting manual review
|
||||
- Gloss coverage per language — total glosses, broken down by source (omw vs LLM-generated) and which languages have no glosses at all
|
||||
- Example coverage per language — same breakdown as glosses
|
||||
- Description coverage per language — how many translations have a description, broken down by source
|
||||
- CEFR source file coverage per language — how many words from the source file were matched against OMW translations
|
||||
- LLM model contribution — how many CEFR votes and text candidates each anonymised model contributed
|
||||
|
||||
## Adding a new language
|
||||
|
||||
1. Add the language code to `SUPPORTED_LANGUAGE_CODES` in `packages/shared/src/constants.ts`
|
||||
2. Build shared: `pnpm --filter @lila/shared build`
|
||||
3. Generate and run a DB migration: `pnpm --filter @lila/db generate` then `pnpm --filter @lila/db migrate`
|
||||
4. Download the OMW lexicon for the language using the `wn` Python library
|
||||
5. Add a CEFR source file at `stage-2-annotate/sources/cefr/{lang}.json`
|
||||
6. Run the full pipeline
|
||||
|
||||
## Constants and constraints
|
||||
|
||||
These values are defined in `packages/shared/src/constants.ts` and enforced by database check constraints. The pipeline filters out any entries that violate them.
|
||||
|
||||
| Constant | Values |
|
||||
|---|---|
|
||||
| Languages | `en`, `it`, `de`, `es`, `fr` |
|
||||
| Parts of speech | `noun`, `verb`, `adjective`, `adverb` |
|
||||
| CEFR levels | `A1`, `A2`, `B1`, `B2`, `C1`, `C2` |
|
||||
| Difficulty | `easy`, `intermediate`, `hard` |
|
||||
|
||||
Adding a new value to any of these requires a constants update and a database migration before re-running the pipeline. See **Adding a new language** for the full steps — the same process applies for new parts of speech.
|
||||
|
||||
## Further extensions
|
||||
|
||||
These are not part of the current pipeline but are worth considering as the
|
||||
dataset matures:
|
||||
|
||||
- **Grammatical gender and articles** — Wiktionary dumps contain gender and
|
||||
article data for nouns across all supported languages. Could be extracted
|
||||
and stored as a new `translation_forms` table.
|
||||
- **Conjugations** — Wiktionary also carries verb conjugation tables. Useful
|
||||
for a future grammar-focused quiz mode.
|
||||
- **IPA pronunciations** — Wiktionary and Forvo are potential sources for
|
||||
phonetic transcriptions per language.
|
||||
- **TTS audio files** — Generate pronunciation audio for each translation
|
||||
using a local or cloud TTS engine. Stored as static files, served alongside
|
||||
the quiz UI.
|
||||
- **Images** — Associate an image with each synset to support visual
|
||||
vocabulary learning. Could be sourced from open image datasets like
|
||||
ImageNet or Wikimedia Commons.
|
||||
- **Frequency data** — Word frequency rankings per language from sources like
|
||||
the Google Ngram dataset. Useful for smarter difficulty calibration beyond
|
||||
CEFR levels alone.
|
||||
- **Improved CEFR source files** — See note at the top of this document.
|
||||
UniversalCEFR and CEFR-J are good starting points.
|
||||
- **Additional languages** — The pipeline is language-agnostic. Adding a new
|
||||
language requires an OMW lexicon, a CEFR source file, and a constants
|
||||
update. See **Adding a new language**.
|
||||
|
|
@ -225,59 +225,9 @@ Host git.lilastudy.com
|
|||
|
||||
This allows standard git commands without specifying the port.
|
||||
|
||||
## CI/CD Pipeline
|
||||
|
||||
Automated build and deploy via Forgejo Actions. On every push to `main`, the pipeline builds ARM64 images natively on the VPS, pushes them to the Forgejo registry, and restarts the app containers.
|
||||
|
||||
### Components
|
||||
|
||||
- **Forgejo Actions** — enabled by default, workflow files in `.forgejo/workflows/`
|
||||
- **Forgejo Runner** — runs as a container (`lila-ci-runner`) on the VPS, uses the host's Docker socket to build images natively on ARM64
|
||||
- **Workflow file** — `.forgejo/workflows/deploy.yml`
|
||||
|
||||
### Pipeline Steps
|
||||
|
||||
1. Install Docker CLI and SSH client in the job container
|
||||
2. Checkout the repository
|
||||
3. Login to the Forgejo container registry
|
||||
4. Build API image (target: `runner`)
|
||||
5. Build Web image (target: `production`, with `VITE_API_URL` baked in)
|
||||
6. Push both images to `git.lilastudy.com`
|
||||
7. SSH into the VPS, pull new images, restart `api` and `web` containers, prune old images
|
||||
|
||||
### Secrets (stored in Forgejo repo settings → Actions → Secrets)
|
||||
|
||||
| Secret | Value |
|
||||
|---|---|
|
||||
| REGISTRY_USER | Forgejo username |
|
||||
| REGISTRY_PASSWORD | Forgejo password |
|
||||
| SSH_PRIVATE_KEY | Contents of `~/.ssh/ci-runner` on the VPS |
|
||||
| SSH_HOST | VPS IP address |
|
||||
| SSH_USER | `lila` |
|
||||
|
||||
### Runner Configuration
|
||||
|
||||
The runner config is at `/data/config.yml` inside the `lila-ci-runner` container. Key settings:
|
||||
|
||||
- `docker_host: "automount"` — mounts the host Docker socket into job containers
|
||||
- `valid_volumes: ["/var/run/docker.sock"]` — allows the socket mount
|
||||
- `privileged: true` — required for Docker access from job containers
|
||||
- `options: "--group-add 989"` — adds the host's docker group (GID 989) to job containers
|
||||
|
||||
The runner command must explicitly reference the config file:
|
||||
|
||||
```yaml
|
||||
command: '/bin/sh -c "sleep 5; forgejo-runner -c /data/config.yml daemon"'
|
||||
```
|
||||
|
||||
### Deploy Cycle
|
||||
|
||||
Push to main → pipeline runs automatically (~2-5 min) → app is updated. No manual steps required.
|
||||
|
||||
To manually trigger a re-run: go to the repo's Actions tab, click on the latest run, and use the re-run button.
|
||||
|
||||
## Known Issues and Future Work
|
||||
|
||||
- **CI/CD**: Currently manual build-push-pull cycle. Plan: Forgejo Actions with a runner on the VPS building ARM images natively (eliminates QEMU cross-compilation)
|
||||
- **Backups**: Offsite backup storage (Hetzner Object Storage or similar) should be added
|
||||
- **Valkey**: Not in the production stack yet. Will be added when multiplayer requires session/room state
|
||||
- **Monitoring/logging**: No centralized logging or uptime monitoring configured
|
||||
|
|
|
|||
|
|
@ -1,295 +0,0 @@
|
|||
# LLM Setup — lila pipeline
|
||||
|
||||
This document covers the LLM infrastructure for stage 3 (enrich) of the lila
|
||||
data pipeline. It documents the hardware constraints, supported providers,
|
||||
model recommendations, and how to configure and swap providers in the test
|
||||
and production scripts.
|
||||
|
||||
---
|
||||
|
||||
## Hardware (dev machine)
|
||||
|
||||
| Component | Spec |
|
||||
|---|---|
|
||||
| CPU | Intel Core i7-6500U (2 cores / 4 threads @ 3.10 GHz) |
|
||||
| RAM | 8 GB |
|
||||
| GPU | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) |
|
||||
| OS | Debian GNU/Linux 13 (trixie) x86_64 |
|
||||
|
||||
**Local inference verdict:** viable for small/quantized models, not for
|
||||
production runs. See the [Local inference](#local-inference-llamacpp) section
|
||||
for details.
|
||||
|
||||
---
|
||||
|
||||
## Provider overview
|
||||
|
||||
The enrich script uses a single, swappable provider config. All providers
|
||||
except Anthropic expose an OpenAI-compatible API, so the same client code
|
||||
works across all of them — only `baseURL`, `apiKey`, and `model` change.
|
||||
|
||||
| Provider | Use case | Cost | Rate limits |
|
||||
|---|---|---|---|
|
||||
| llama.cpp (local) | Quality testing, overnight dev runs | Free (electricity) | None |
|
||||
| OpenRouter (free tier) | Quality comparison, multi-model evaluation | Free | 50 req/day, 20 req/min |
|
||||
| OpenRouter (paid) | Production runs if local quality insufficient | Pay-per-token | None |
|
||||
| Anthropic API | Quality baseline / reference | Pay-per-token | Standard |
|
||||
|
||||
---
|
||||
|
||||
## Local inference (llama.cpp)
|
||||
|
||||
### Why local inference is worth testing
|
||||
|
||||
Time is not a constraint — the pipeline scripts are fully resumable. The
|
||||
laptop can run overnight for multiple nights. The only question is output
|
||||
quality, which the test script evaluates empirically.
|
||||
|
||||
### Hardware constraints
|
||||
|
||||
The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0).
|
||||
llama.cpp supports Maxwell via CUDA backend but newer builds may require
|
||||
the `--no-kv-offload` flag depending on the version.
|
||||
|
||||
llama.cpp can split model layers between GPU and CPU, controlled via
|
||||
`--n-gpu-layers`. You set how many layers go on the GPU; the rest run on
|
||||
CPU/RAM. This means a model larger than VRAM is not a dead end — it runs
|
||||
in hybrid mode, slower than full-GPU but much faster than pure CPU.
|
||||
|
||||
Practical estimates for this hardware (~3.5 GB VRAM usable after drivers):
|
||||
|
||||
| Model size | Q4 VRAM | Mode | Est. speed |
|
||||
|---|---|---|---|
|
||||
| 3B | ~2.0 GB | Full GPU | ~15–20 tok/s |
|
||||
| 4B | ~2.5 GB | Full GPU | ~12–18 tok/s |
|
||||
| 7B | ~4.5 GB | Hybrid (~26/32 layers on GPU) | ~8–12 tok/s |
|
||||
| 13B+ | ~8 GB+ | CPU-heavy hybrid | too slow |
|
||||
|
||||
### Recommended local models
|
||||
|
||||
Two candidates worth testing, covering different points on the size/quality
|
||||
tradeoff:
|
||||
|
||||
**Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)**
|
||||
- GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB)
|
||||
- Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF
|
||||
- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+
|
||||
language support including all five pipeline languages. First candidate
|
||||
to test.
|
||||
|
||||
**Qwen2.5 7B Instruct (Q4_K_M)**
|
||||
- GGUF file: `Qwen2.5-7B-Instruct-Q4_K_M.gguf` (~4.5 GB)
|
||||
- Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF
|
||||
- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s.
|
||||
Stronger multilingual generation than any 3–4B model. Second candidate,
|
||||
for comparison against the smaller Gemma 4 E4B.
|
||||
|
||||
### Installation
|
||||
|
||||
```bash
|
||||
# Install build dependencies
|
||||
sudo apt install build-essential cmake git
|
||||
|
||||
# Clone llama.cpp
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
|
||||
# Build with CUDA support (GTX 950M — compute 5.0)
|
||||
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=50
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
# Download a model (example: Qwen2.5 3B — substitute the GGUF URL and filename of the model you want to run)
|
||||
mkdir -p models
|
||||
wget -O models/qwen2.5-3b-instruct-q4_k_m.gguf \
|
||||
https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_k_m.gguf
|
||||
```
|
||||
|
||||
### Starting the server
|
||||
|
||||
**Gemma 4 E4B** (full GPU):
|
||||
```bash
|
||||
./build/bin/llama-server \
|
||||
--model models/gemma-4-e4b-it-ud-q4_k_xl.gguf \
|
||||
--port 8080 \
|
||||
--ctx-size 4096 \
|
||||
--n-gpu-layers 999 \
|
||||
--host 127.0.0.1
|
||||
```
|
||||
|
||||
**Qwen2.5 7B** (hybrid — tune `--n-gpu-layers` to fit your VRAM):
|
||||
```bash
|
||||
./build/bin/llama-server \
|
||||
--model models/qwen2.5-7b-instruct-q4_k_m.gguf \
|
||||
--port 8080 \
|
||||
--ctx-size 4096 \
|
||||
--n-gpu-layers 28 \
|
||||
--host 127.0.0.1
|
||||
```
|
||||
|
||||
`--n-gpu-layers 999` means "put everything on GPU" — llama.cpp caps at the
|
||||
actual layer count automatically, so 999 is safe as a "full offload" value.
|
||||
For the 7B hybrid, start with `28` and reduce by 2 if the server reports
|
||||
out-of-memory at startup.
|
||||
|
||||
### Verify the server is running
|
||||
|
||||
```bash
|
||||
curl http://127.0.0.1:8080/health
|
||||
# Expected: {"status":"ok"}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## OpenRouter (free tier)
|
||||
|
||||
OpenRouter exposes all models via an OpenAI-compatible API. No code changes
|
||||
are needed to switch from local llama.cpp to OpenRouter — only the config
|
||||
object changes.
|
||||
|
||||
### Rate limits (free tier)
|
||||
|
||||
- **50 requests per day** (account total, not per model)
|
||||
- 20 requests per minute
|
||||
|
||||
> **Implication for testing:** with a 10-record test set you have headroom
|
||||
> to test 4–5 models per day. A 100-record test set exceeds the 50-request
|
||||
> daily limit, so plan at least two days per model.
|
||||
|
||||
> **Implication for production:** the free tier is not viable for 117k
|
||||
> records. If local quality is insufficient, use paid OpenRouter credits or
|
||||
> a dedicated provider.
|
||||
|
||||
### Free models recommended for this pipeline
|
||||
|
||||
Ranked by expected multilingual generation quality for en/it/de/fr/es:
|
||||
|
||||
| Model ID | Params | Notes |
|
||||
|---|---|---|
|
||||
| `qwen/qwen3-coder:free` | 480B MoE (35B active) | Best free option. Strong multilingual despite "coder" label. Use as quality ceiling. |
|
||||
| `qwen/qwen3-next-80b-a3b-instruct:free` | 80B MoE (3B active) | Smaller Qwen, useful comparison point. |
|
||||
| `nvidia/nemotron-3-super-120b-a12b:free` | 120B MoE (12B active) | 262K context, supports structured output. |
|
||||
| `google/gemma-4-31b-it:free` | 31B | 140+ language support, good European language coverage. |
|
||||
| `zhipuai/glm-4.5-air:free` | MoE | Multilingual-focused. |
|
||||
|
||||
**Skip for this pipeline:**
|
||||
- Llama models — weaker European language generation than Qwen/Gemma
|
||||
- Mistral free tier — requests may be used for model training
|
||||
|
||||
### API endpoint
|
||||
|
||||
```
|
||||
https://openrouter.ai/api/v1/chat/completions
|
||||
```
|
||||
|
||||
Set `Authorization: Bearer <OPENROUTER_API_KEY>` in the request headers.
|
||||
|
||||
---
|
||||
|
||||
## Provider configuration in the test script
|
||||
|
||||
The enrich test script reads a single config object. To switch providers,
|
||||
change this object and re-run.
|
||||
|
||||
```typescript
|
||||
// config.ts
|
||||
|
||||
export type ProviderConfig = {
|
||||
name: string; // used for output folder naming
|
||||
baseURL: string;
|
||||
apiKey: string;
|
||||
model: string;
|
||||
maxTokens: number;
|
||||
};
|
||||
|
||||
// Local llama.cpp
|
||||
export const LOCAL_QWEN3B: ProviderConfig = {
|
||||
name: "local-qwen2.5-3b",
|
||||
baseURL: "http://127.0.0.1:8080/v1",
|
||||
apiKey: "none", // llama.cpp ignores this
|
||||
model: "qwen2.5-3b", // llama.cpp ignores model name, uses loaded model
|
||||
maxTokens: 512,
|
||||
};
|
||||
|
||||
// OpenRouter — Qwen3 480B (free)
|
||||
export const OR_QWEN3_480B: ProviderConfig = {
|
||||
name: "or-qwen3-480b",
|
||||
baseURL: "https://openrouter.ai/api/v1",
|
||||
apiKey: process.env.OPENROUTER_API_KEY!,
|
||||
model: "qwen/qwen3-coder:free",
|
||||
maxTokens: 512,
|
||||
};
|
||||
|
||||
// OpenRouter — Gemma 4 31B (free)
|
||||
export const OR_GEMMA4_31B: ProviderConfig = {
|
||||
name: "or-gemma4-31b",
|
||||
baseURL: "https://openrouter.ai/api/v1",
|
||||
apiKey: process.env.OPENROUTER_API_KEY!,
|
||||
model: "google/gemma-4-31b-it:free",
|
||||
maxTokens: 512,
|
||||
};
|
||||
|
||||
// Anthropic (reference baseline — different adapter required)
|
||||
export const ANTHROPIC_SONNET: ProviderConfig = {
|
||||
name: "anthropic-sonnet",
|
||||
baseURL: "https://api.anthropic.com/v1", // adapter handles format difference
|
||||
apiKey: process.env.ANTHROPIC_API_KEY!,
|
||||
model: "claude-sonnet-4-6",
|
||||
maxTokens: 512,
|
||||
};
|
||||
```
|
||||
|
||||
Output from each run lands in:
|
||||
```
|
||||
stage-3-enrich/test/output/{provider.name}/results.json
|
||||
stage-3-enrich/test/output/{provider.name}/metrics.json
|
||||
```
|
||||
|
||||
The evaluate script compares all `metrics.json` files side by side.
|
||||
|
||||
---
|
||||
|
||||
## Evaluation metrics
|
||||
|
||||
The test script measures the following per provider run:
|
||||
|
||||
| Metric | What it measures |
|
||||
|---|---|
|
||||
| **JSON parse rate** | % of responses that are valid, schema-compliant JSON. Critical — a failed parse is a wasted call. Target: ≥97% |
|
||||
| **Field coverage** | % of records where all required fields are present (cefr votes for all translations, descriptions for all languages, glosses/examples for fr/es) |
|
||||
| **CEFR agreement** | For records that have a `cefr_source` vote, % where the model agrees. Measures calibration. |
|
||||
| **Language correctness** | Manual spot-check only — automated detection not reliable enough |
|
||||
| **Tokens/second** | Local only. Indicates overnight run feasibility |
|
||||
|
||||
### Decision thresholds
|
||||
|
||||
| Metric | Threshold | Action if below |
|
||||
|---|---|---|
|
||||
| JSON parse rate | < 97% | Do not use this model for production |
|
||||
| Field coverage | < 95% | Prompt needs revision before production |
|
||||
| CEFR agreement | < 70% | Model lacks vocabulary knowledge for this task |
|
||||
|
||||
---
|
||||
|
||||
## Recommended test sequence
|
||||
|
||||
1. **Start local, minimal dataset (5–10 records)**
|
||||
Install llama.cpp, run Qwen2.5 3B against 5–10 hand-picked records.
|
||||
Verify the server works, the output parses, and the model produces
|
||||
something reasonable. This is purely a smoke test.
|
||||
|
||||
2. **Expand local to full 100-record sample**
|
||||
Once the pipeline is confirmed working, run all 100 records locally.
|
||||
Collect metrics. This is your local quality baseline.
|
||||
|
||||
3. **Run the same 100 records through OpenRouter free models**
|
||||
One model per day (50 req/day limit). Start with `qwen/qwen3-coder:free`
|
||||
as the quality ceiling.
|
||||
|
||||
4. **Compare metrics side by side**
|
||||
If local 3B is within acceptable range of the cloud models on CEFR
|
||||
agreement and field coverage, proceed with local overnight runs for
|
||||
production. If not, use the cloud model that passed.
|
||||
|
||||
5. **Production run**
|
||||
Full 117k records. Resume-safe — the script checkpoints after each
|
||||
record so overnight runs can be stopped and continued.
|
||||
|
|
@ -2,7 +2,6 @@
|
|||
|
||||
## tasks
|
||||
|
||||
- put users in separate db
|
||||
- pinning dependencies in package.json files
|
||||
- rethink organisation of datafiles and wordlists
|
||||
- admin dashboard for user management, also overview of words and languages and all their stats
|
||||
|
|
@ -30,18 +29,6 @@ laptop: verify if docker containers run on startup (they shouldnt)
|
|||
### vps setup
|
||||
|
||||
- monitoring and logging (e.g. via chkrootkit or rkhunter, logwatch/monit => daily summary mails)
|
||||
<<<<<<< HEAD
|
||||
- ~~keep the vps clean (e.g. old docker images/containers)~~ ✅ CI/CD pipeline runs `docker image prune -f` after deploy
|
||||
|
||||
### ~~cd/ci pipeline~~ ✅ RESOLVED
|
||||
|
||||
Forgejo Actions with runner on VPS, Forgejo built-in container registry. See `deployment.md`.
|
||||
|
||||
### ~~postgres backups~~ ✅ RESOLVED
|
||||
|
||||
Daily pg_dump cron job, 7-day retention, dev laptop auto-sync via rsync. See `deployment.md`.
|
||||
=======
|
||||
>>>>>>> dev
|
||||
|
||||
### try now option
|
||||
|
||||
|
|
|
|||
|
|
@ -287,17 +287,6 @@ After completing a task: share the code, ask what to refactor and why. The LLM s
|
|||
|
||||
## 11. Post-MVP Ladder
|
||||
|
||||
<<<<<<< HEAD
|
||||
| Phase | What it adds | Status |
|
||||
| ----------------- | ------------------------------------------------------------------------------- | ------ |
|
||||
| Auth | Better Auth (Google + GitHub), embedded in Express API, user rows in DB | ✅ |
|
||||
| Deployment | Docker Compose, Caddy, Forgejo, CI/CD, Hetzner VPS | ✅ |
|
||||
| Hardening (partial) | CI/CD pipeline, DB backups | ✅ |
|
||||
| User Stats | Games played, score history, profile page | ❌ |
|
||||
| Multiplayer Lobby | Room creation, join by code, WebSocket connection | ❌ |
|
||||
| Multiplayer Game | Simultaneous answers, server timer, live scores, winner screen | ❌ |
|
||||
| Hardening (rest) | Rate limiting, error boundaries, monitoring, accessibility | ❌ |
|
||||
=======
|
||||
| Phase | What it adds | Status |
|
||||
| ------------------- | ----------------------------------------------------------------------- | ------ |
|
||||
| Auth | Better Auth (Google + GitHub), embedded in Express API, user rows in DB | ✅ |
|
||||
|
|
@ -307,7 +296,6 @@ After completing a task: share the code, ask what to refactor and why. The LLM s
|
|||
| Multiplayer Lobby | Room creation, join by code, WebSocket connection | ✅ |
|
||||
| Multiplayer Game | Simultaneous answers, server timer, live scores, winner screen | ✅ |
|
||||
| Hardening (rest) | Rate limiting, error boundaries, monitoring, accessibility | ❌ |
|
||||
>>>>>>> dev
|
||||
|
||||
### Future Data Model Extensions (deferred, additive)
|
||||
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@ export default defineConfig([
|
|||
"**/*.config.ts",
|
||||
"routeTree.gen.ts",
|
||||
"scripts/**",
|
||||
"data-pipeline/**/*",
|
||||
]),
|
||||
|
||||
eslint.configs.recommended,
|
||||
|
|
|
|||
|
|
@ -11,5 +11,5 @@ export default defineConfig({
|
|||
out: "./drizzle",
|
||||
schema: "./src/db/schema.ts",
|
||||
dialect: "postgresql",
|
||||
dbCredentials: { url: process.env["DATABASE_URL_LOCAL"]! },
|
||||
dbCredentials: { url: process.env["DATABASE_URL"]! },
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,12 +0,0 @@
|
|||
ALTER TABLE "decks" DROP CONSTRAINT "source_language_check";--> statement-breakpoint
|
||||
ALTER TABLE "decks" DROP CONSTRAINT "validated_languages_check";--> statement-breakpoint
|
||||
ALTER TABLE "term_glosses" DROP CONSTRAINT "language_code_check";--> statement-breakpoint
|
||||
ALTER TABLE "terms" DROP CONSTRAINT "pos_check";--> statement-breakpoint
|
||||
ALTER TABLE "translations" DROP CONSTRAINT "language_code_check";--> statement-breakpoint
|
||||
ALTER TABLE "lobbies" ALTER COLUMN "status" SET DEFAULT 'waiting';--> statement-breakpoint
|
||||
ALTER TABLE "term_glosses" ADD COLUMN "description" text;--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD CONSTRAINT "source_language_check" CHECK ("decks"."source_language" IN ('en', 'it', 'de', 'fr', 'es'));--> statement-breakpoint
|
||||
ALTER TABLE "decks" ADD CONSTRAINT "validated_languages_check" CHECK (validated_languages <@ ARRAY['en', 'it', 'de', 'fr', 'es']::varchar[]);--> statement-breakpoint
|
||||
ALTER TABLE "term_glosses" ADD CONSTRAINT "language_code_check" CHECK ("term_glosses"."language_code" IN ('en', 'it', 'de', 'fr', 'es'));--> statement-breakpoint
|
||||
ALTER TABLE "terms" ADD CONSTRAINT "pos_check" CHECK ("terms"."pos" IN ('noun', 'verb', 'adjective', 'adverb'));--> statement-breakpoint
|
||||
ALTER TABLE "translations" ADD CONSTRAINT "language_code_check" CHECK ("translations"."language_code" IN ('en', 'it', 'de', 'fr', 'es'));
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
CREATE TABLE "term_examples" (
|
||||
"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
|
||||
"term_id" uuid NOT NULL,
|
||||
"language_code" varchar(10) NOT NULL,
|
||||
"text" text NOT NULL,
|
||||
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||
CONSTRAINT "unique_term_example" UNIQUE("term_id","language_code","text"),
|
||||
CONSTRAINT "language_code_check" CHECK ("term_examples"."language_code" IN ('en', 'it', 'de', 'fr', 'es'))
|
||||
);
|
||||
--> statement-breakpoint
|
||||
ALTER TABLE "term_examples" ADD CONSTRAINT "term_examples_term_id_terms_id_fk" FOREIGN KEY ("term_id") REFERENCES "public"."terms"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
CREATE INDEX "idx_term_examples_term_id" ON "term_examples" USING btree ("term_id","language_code");
|
||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -50,20 +50,6 @@
|
|||
"when": 1776270391189,
|
||||
"tag": "0006_certain_adam_destine",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 7,
|
||||
"version": "7",
|
||||
"when": 1776665111607,
|
||||
"tag": "0007_nosy_leper_queen",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 8,
|
||||
"version": "7",
|
||||
"when": 1776695279870,
|
||||
"tag": "0008_far_energizer",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,9 @@
|
|||
"scripts": {
|
||||
"build": "tsc",
|
||||
"generate": "drizzle-kit generate",
|
||||
"migrate": "drizzle-kit migrate"
|
||||
"migrate": "drizzle-kit migrate",
|
||||
"db:seed": "npx tsx src/seeding-datafiles.ts",
|
||||
"db:build-deck": "npx tsx src/generating-deck.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@lila/shared": "workspace:*",
|
||||
|
|
|
|||
183
packages/db/src/checking-cefr-coverage.ts
Normal file
183
packages/db/src/checking-cefr-coverage.ts
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
/*
|
||||
|
||||
This script performs a cross-reference check between two specific data sets:
|
||||
|
||||
- The "Target" List: It reads the {language}-merged.json file (e.g., en-merged.json). This represents the vocabulary you want to have CEFR levels for.
|
||||
- The "Source of Truth": It queries your Database (translations table). This represents the vocabulary you currently have in your app.
|
||||
|
||||
What it calculates:
|
||||
It tells you: "Of all the words in my merged JSON file, how many actually exist in my database?"
|
||||
|
||||
Matched: The word from the JSON file was found in the DB. (Ready for enrichment).
|
||||
Unmatched: The word from the JSON file was not found in the DB. (These will be skipped during enrichment).
|
||||
|
||||
*/
|
||||
|
||||
import fs from "node:fs/promises";
|
||||
import { eq } from "drizzle-orm";
|
||||
|
||||
import {
|
||||
SUPPORTED_LANGUAGE_CODES,
|
||||
SUPPORTED_POS,
|
||||
CEFR_LEVELS,
|
||||
DIFFICULTY_LEVELS,
|
||||
} from "@lila/shared";
|
||||
import { db } from "@lila/db";
|
||||
import { terms, translations } from "@lila/db/schema";
|
||||
|
||||
type POS = (typeof SUPPORTED_POS)[number];
|
||||
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
|
||||
type CEFRLevel = (typeof CEFR_LEVELS)[number];
|
||||
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
|
||||
|
||||
type MergedRecord = {
|
||||
word: string;
|
||||
pos: POS;
|
||||
cefr: CEFRLevel;
|
||||
difficulty: Difficulty;
|
||||
sources: string[];
|
||||
};
|
||||
|
||||
type CoverageStats = {
|
||||
total: number;
|
||||
matched: number;
|
||||
unmatched: number;
|
||||
byCefr: Record<CEFRLevel, { total: number; matched: number }>;
|
||||
byDifficulty: Record<Difficulty, { total: number; matched: number }>;
|
||||
unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }>;
|
||||
};
|
||||
|
||||
const dataDir = "./src/data/";
|
||||
|
||||
/**
 * Report how much of `{language}-merged.json` is already present in the DB.
 *
 * Loads the merged word list for `language`, fetches every translation row of
 * that language (joined to its term to obtain the part of speech) and counts
 * how many `(word, pos)` pairs from the file have a case-insensitive match.
 * Results are printed to the console; nothing is written to the database.
 *
 * A missing, unparsable, non-array or empty file is reported and skipped.
 *
 * @param language Language code whose merged file and translations are compared.
 */
async function checkCoverage(language: LanguageCode): Promise<void> {
  // Maximum number of unmatched entries kept for the sample printout.
  const SAMPLE_LIMIT = 20;

  type Bucket = { total: number; matched: number };

  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📄 Checking ${filename}...`);

  // Load merged data
  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    records = JSON.parse(raw) as MergedRecord[];
  } catch (e) {
    console.warn(` ⚠️ Could not read file: ${(e as Error).message}`);
    return;
  }

  // JSON.parse succeeds on any valid JSON value; only an array is usable here.
  if (!Array.isArray(records)) {
    console.warn(` ⚠️ Could not read file: expected a JSON array`);
    return;
  }

  console.log(` Loaded ${records.length.toLocaleString("en-US")} entries`);

  // Nothing to compare; returning here also avoids printing "NaN%" below.
  if (records.length === 0) {
    console.log(` Nothing to check`);
    return;
  }

  // Percentage with one decimal; a zero denominator yields "0.0" instead of NaN.
  const pct = (part: number, whole: number): string =>
    whole === 0 ? "0.0" : ((part / whole) * 100).toFixed(1);

  // Initialize stats with one zeroed bucket per known level.
  const stats: CoverageStats = {
    total: records.length,
    matched: 0,
    unmatched: 0,
    byCefr: Object.fromEntries(
      CEFR_LEVELS.map((level) => [level, { total: 0, matched: 0 }]),
    ) as Record<CEFRLevel, Bucket>,
    byDifficulty: Object.fromEntries(
      DIFFICULTY_LEVELS.map((diff) => [diff, { total: 0, matched: 0 }]),
    ) as Record<Difficulty, Bucket>,
    unmatchedWords: [],
  };

  // ── BATCHED LOOKUP: Build a Set of existing (word, pos) pairs in DB ──
  console.log(` 🔍 Querying database for existing translations...`);

  // One query for all translations of this language (every part of speech);
  // the part of speech comes from the joined term row.
  const existingRows = await db
    .select({ text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // Set of "lowercased word|pos" keys for O(1) membership tests.
  const existingSet = new Set(
    existingRows.map((row) => `${row.text.toLowerCase()}|${row.pos}`),
  );

  // ── Process records against the in-memory Set ──
  for (const record of records) {
    // The file is external data: a level outside the known lists has no
    // bucket, so it is counted in the totals only instead of crashing.
    const cefrBucket = stats.byCefr[record.cefr] as Bucket | undefined;
    const difficultyBucket = stats.byDifficulty[record.difficulty] as
      | Bucket
      | undefined;

    if (cefrBucket) cefrBucket.total++;
    if (difficultyBucket) difficultyBucket.total++;

    const key = `${record.word.toLowerCase()}|${record.pos}`;

    if (existingSet.has(key)) {
      stats.matched++;
      if (cefrBucket) cefrBucket.matched++;
      if (difficultyBucket) difficultyBucket.matched++;
    } else {
      stats.unmatched++;
      if (stats.unmatchedWords.length < SAMPLE_LIMIT) {
        stats.unmatchedWords.push({
          word: record.word,
          pos: record.pos,
          cefr: record.cefr,
        });
      }
    }
  }

  // ── Print results ──
  console.log(`\n📊 Coverage for ${language}:`);
  console.log(` Total entries: ${stats.total.toLocaleString("en-US")}`);
  console.log(
    ` Matched in DB: ${stats.matched.toLocaleString("en-US")} (${pct(stats.matched, stats.total)}%)`,
  );
  console.log(
    ` Unmatched: ${stats.unmatched.toLocaleString("en-US")} (${pct(stats.unmatched, stats.total)}%)`,
  );

  console.log(`\n By CEFR level:`);
  for (const level of CEFR_LEVELS) {
    const { total, matched } = stats.byCefr[level];
    if (total > 0) {
      console.log(
        ` ${level}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct(matched, total)}%)`,
      );
    }
  }

  console.log(`\n By difficulty:`);
  for (const diff of DIFFICULTY_LEVELS) {
    const { total, matched } = stats.byDifficulty[diff];
    if (total > 0) {
      console.log(
        ` ${diff}: ${matched.toLocaleString("en-US")}/${total.toLocaleString("en-US")} (${pct(matched, total)}%)`,
      );
    }
  }

  if (stats.unmatchedWords.length > 0) {
    console.log(`\n⚠️ Sample unmatched words (first ${SAMPLE_LIMIT}):`);
    for (const { word, pos, cefr } of stats.unmatchedWords) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (stats.unmatched > SAMPLE_LIMIT) {
      console.log(` ... and ${stats.unmatched - SAMPLE_LIMIT} more`);
    }
  }
}
|
||||
|
||||
/**
 * Entry point: print a banner, run the coverage check for every supported
 * language one after another, then print a closing banner.
 */
const main = async () => {
  const banner = "##########################################";

  console.log(banner);
  console.log("lila — CEFR Coverage Check");
  console.log(banner);

  // Sequential on purpose: each language's report is printed as one block.
  for (const language of SUPPORTED_LANGUAGE_CODES) {
    await checkCoverage(language);
  }

  console.log(`\n${banner}`);
  console.log("Done");
  console.log(banner);
};

// Any unhandled failure is logged and turned into a non-zero exit code.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||
120906
packages/db/src/data/en-merged.json
Normal file
120906
packages/db/src/data/en-merged.json
Normal file
File diff suppressed because it is too large
Load diff
85710
packages/db/src/data/it-merged.json
Normal file
85710
packages/db/src/data/it-merged.json
Normal file
File diff suppressed because it is too large
Load diff
747568
packages/db/src/data/omw-noun.json
Normal file
747568
packages/db/src/data/omw-noun.json
Normal file
File diff suppressed because it is too large
Load diff
102492
packages/db/src/data/omw-verb.json
Normal file
102492
packages/db/src/data/omw-verb.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -51,7 +51,6 @@ export const term_glosses = pgTable(
|
|||
.references(() => terms.id, { onDelete: "cascade" }),
|
||||
language_code: varchar({ length: 10 }).notNull(),
|
||||
text: text().notNull(),
|
||||
description: text(),
|
||||
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
|
||||
},
|
||||
(table) => [
|
||||
|
|
@ -63,31 +62,6 @@ export const term_glosses = pgTable(
|
|||
],
|
||||
);
|
||||
|
||||
export const term_examples = pgTable(
|
||||
"term_examples",
|
||||
{
|
||||
id: uuid().primaryKey().defaultRandom(),
|
||||
term_id: uuid()
|
||||
.notNull()
|
||||
.references(() => terms.id, { onDelete: "cascade" }),
|
||||
language_code: varchar({ length: 10 }).notNull(),
|
||||
text: text().notNull(),
|
||||
created_at: timestamp({ withTimezone: true }).defaultNow().notNull(),
|
||||
},
|
||||
(table) => [
|
||||
unique("unique_term_example").on(
|
||||
table.term_id,
|
||||
table.language_code,
|
||||
table.text,
|
||||
),
|
||||
check(
|
||||
"language_code_check",
|
||||
sql`${table.language_code} IN (${sql.raw(SUPPORTED_LANGUAGE_CODES.map((l) => `'${l}'`).join(", "))})`,
|
||||
),
|
||||
index("idx_term_examples_term_id").on(table.term_id, table.language_code),
|
||||
],
|
||||
);
|
||||
|
||||
export const translations = pgTable(
|
||||
"translations",
|
||||
{
|
||||
|
|
|
|||
211
packages/db/src/generating-deck.ts
Normal file
211
packages/db/src/generating-deck.ts
Normal file
|
|
@ -0,0 +1,211 @@
|
|||
import fs from "node:fs/promises";
|
||||
import { db } from "@lila/db";
|
||||
import { translations, terms, decks, deck_terms } from "@lila/db/schema";
|
||||
import { inArray, and, eq, ne, countDistinct } from "drizzle-orm";
|
||||
|
||||
type DbOrTx = Parameters<Parameters<typeof db.transaction>[0]>[0];
|
||||
|
||||
const config = {
|
||||
pathToWordlist: "./src/data/wordlists/top1000englishnouns",
|
||||
deckName: "top english nouns",
|
||||
deckDescription: "Most frequently used English nouns for vocabulary practice",
|
||||
sourceLanguage: "en",
|
||||
sourcePOS: "noun",
|
||||
} as const;
|
||||
|
||||
/**
 * Read the configured word list and return its entries trimmed, lowercased
 * and de-duplicated, in order of first appearance. Blank lines are dropped.
 */
const readWordList = async () => {
  const fileContents = await fs.readFile(config.pathToWordlist, "utf8");

  // A Set keeps insertion order, so the first occurrence of each word wins.
  const uniqueWords = new Set<string>();
  for (const line of fileContents.split("\n")) {
    const normalized = line.trim().toLowerCase();
    if (normalized) uniqueWords.add(normalized);
  }

  return Array.from(uniqueWords);
};
|
||||
|
||||
/**
 * Resolve word-list entries to term IDs via the `translations` table.
 *
 * Only translations in the configured source language whose term has the
 * configured part of speech are considered.
 *
 * NOTE(review): the query compares `translations.text` to `words` exactly,
 * while callers pass lowercased words — a translation stored with capital
 * letters would therefore be reported as missing. Confirm whether that is
 * intended.
 *
 * @param words Normalized (lowercased) words to look up.
 * @returns `termIds` — de-duplicated IDs of all matching terms;
 *          `missingWords` — input words with no matching translation.
 */
const resolveSourceTerms = async (words: string[]) => {
  // Nothing to look up: skip the query instead of sending an empty IN list.
  if (words.length === 0) {
    return { termIds: [] as string[], missingWords: [] as string[] };
  }

  const rows = await db
    .select({ text: translations.text, termId: translations.term_id })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(
      and(
        inArray(translations.text, words),
        eq(translations.language_code, config.sourceLanguage),
        eq(terms.pos, config.sourcePOS),
      ),
    );

  // word -> every term ID that has this word as a translation
  const wordToTermIds = new Map<string, string[]>();
  for (const row of rows) {
    const word = row.text.toLowerCase();
    const bucket = wordToTermIds.get(word);
    if (bucket) {
      bucket.push(row.termId);
    } else {
      wordToTermIds.set(word, [row.termId]);
    }
  }

  // Deduplicate: multiple words can map to the same term ID (e.g. via synonyms)
  const termIds = [...new Set(Array.from(wordToTermIds.values()).flat())];
  const missingWords = words.filter((w) => !wordToTermIds.has(w));

  return { termIds, missingWords };
};
|
||||
|
||||
/**
 * Write the unresolved words, one per line, to a sibling file of the word
 * list named `<wordlist>-missing` (overwriting any previous content).
 */
const writeMissingWordsToFile = async (missingWords: string[]) => {
  const contents = missingWords.join("\n");
  await fs.writeFile(`${config.pathToWordlist}-missing`, contents, "utf8");
};
|
||||
|
||||
/**
 * Determine which target languages cover every term of the deck.
 *
 * For each language other than `sourceLanguage`, counts the distinct terms
 * from `termIds` that have at least one translation. A language is
 * "validated" when that count equals `termIds.length`. Languages with no
 * matching translation at all do not appear in `coverage`.
 *
 * @param sourceLanguage Language code excluded from the check.
 * @param termIds        Term IDs the deck consists of.
 * @returns `coverage` — per-language covered-term counts;
 *          `validatedLanguages` — languages covering all terms.
 */
const validateLanguages = async (sourceLanguage: string, termIds: string[]) => {
  // No terms means nothing can be validated; skip the query instead of
  // sending an empty IN list.
  if (termIds.length === 0) {
    const coverage: { language: string; coveredCount: number }[] = [];
    return { coverage, validatedLanguages: [] as string[] };
  }

  const coverage = await db
    .select({
      language: translations.language_code,
      coveredCount: countDistinct(translations.term_id),
    })
    .from(translations)
    .where(
      and(
        inArray(translations.term_id, termIds),
        ne(translations.language_code, sourceLanguage),
      ),
    )
    .groupBy(translations.language_code);

  // Number(): defensive, in case the driver hands the aggregate back as a string.
  const validatedLanguages = coverage
    .filter((row) => Number(row.coveredCount) === termIds.length)
    .map((row) => row.language);

  return { coverage, validatedLanguages };
};
|
||||
|
||||
/**
 * Look up the deck with the configured name and source language.
 *
 * @returns The first matching deck's id and validated languages, or `null`
 *          when no such deck exists.
 */
const findExistingDeck = async (tx: DbOrTx) => {
  const isConfiguredDeck = and(
    eq(decks.name, config.deckName),
    eq(decks.source_language, config.sourceLanguage),
  );

  const [firstMatch] = await tx
    .select({ id: decks.id, validatedForLanguages: decks.validated_languages })
    .from(decks)
    .where(isConfiguredDeck);

  return firstMatch ?? null;
};
|
||||
|
||||
/**
 * Insert a new "core" deck using the configured name, description and source
 * language, and return its id.
 *
 * @throws Error when the insert returns no row.
 */
const createDeck = async (tx: DbOrTx, validatedLanguages: string[]) => {
  const newDeck = {
    name: config.deckName,
    description: config.deckDescription,
    source_language: config.sourceLanguage,
    validated_languages: validatedLanguages,
    type: "core" as const,
  };

  const [created] = await tx
    .insert(decks)
    .values(newDeck)
    .returning({ id: decks.id });

  if (created === undefined) {
    throw new Error("Failed to create deck: no row returned");
  }
  return created.id;
};
|
||||
|
||||
/**
 * Link the given terms to a deck, ignoring links that already exist.
 *
 * @param tx      Database handle or open transaction.
 * @param deckId  Deck to add the terms to.
 * @param termIds Terms to link.
 * @returns The number of links actually inserted. Terms that were already in
 *          the deck are skipped by the conflict clause and are not counted,
 *          so callers can derive "already present" as
 *          `termIds.length - result`.
 */
const addTermsToDeck = async (
  tx: DbOrTx,
  deckId: string,
  termIds: string[],
): Promise<number> => {
  if (termIds.length === 0) return 0;

  // RETURNING yields only the rows that were inserted; rows skipped by
  // ON CONFLICT DO NOTHING are absent, which gives the true insert count.
  const inserted = await tx
    .insert(deck_terms)
    .values(termIds.map((termId) => ({ deck_id: deckId, term_id: termId })))
    .onConflictDoNothing()
    .returning({ termId: deck_terms.term_id });

  return inserted.length;
};
|
||||
|
||||
/**
 * Overwrite the `validated_languages` column of the deck with id `deckId`.
 */
const updateValidatedLanguages = async (
  tx: DbOrTx,
  deckId: string,
  validatedLanguages: string[],
): Promise<void> => {
  const patch = { validated_languages: validatedLanguages };
  await tx.update(decks).set(patch).where(eq(decks.id, deckId));
};
|
||||
|
||||
/**
 * Entry point of the deck builder.
 *
 * Steps: read the word list, resolve the words to term IDs, write the
 * unresolved words to a file, compute per-language coverage, then — inside a
 * single transaction — find or create the deck, add the terms, and update the
 * deck's validated languages when they differ. Finishes with a summary.
 */
const main = async () => {
  console.log("📖 Reading word list...");
  const sourceWords = await readWordList();
  console.log(` ${sourceWords.length} words loaded\n`);

  console.log("🔍 Checking against database...");
  const resolved = await resolveSourceTerms(sourceWords);
  const { termIds, missingWords } = resolved;
  console.log(` ${termIds.length} terms found`);
  console.log(` ${missingWords.length} words not found in DB\n`);

  console.log("🖊️ Writing missing words to file...\n");
  await writeMissingWordsToFile(missingWords);

  console.log("✅ Validating languages...");
  const { coverage, validatedLanguages } = await validateLanguages(
    config.sourceLanguage,
    termIds,
  );
  console.log(
    ` Validated languages: ${JSON.stringify(validatedLanguages)}\n`,
  );

  console.log("🔬 Language coverage breakdown...");
  coverage.forEach((row) => {
    console.log(
      ` ${row.language}: ${row.coveredCount} / ${termIds.length} terms covered`,
    );
  });

  // Order-insensitive fingerprint of a language list, used to detect changes.
  const fingerprint = (languages: readonly string[]): string =>
    JSON.stringify([...languages].sort());

  console.log("🃏 Looking for existing deck...");
  const addedCount = await db.transaction(async (tx) => {
    const existingDeck = await findExistingDeck(tx);

    let deckId: string;
    if (existingDeck) {
      deckId = existingDeck.id;
    } else {
      deckId = await createDeck(tx, validatedLanguages);
    }

    const inserted = await addTermsToDeck(tx, deckId, termIds);

    // A deck created just now counts as having no stored languages yet.
    const storedLanguages = existingDeck?.validatedForLanguages ?? [];
    if (fingerprint(storedLanguages) !== fingerprint(validatedLanguages)) {
      await updateValidatedLanguages(tx, deckId, validatedLanguages);
    }

    return inserted;
  });
  const alreadyPresentCount = termIds.length - addedCount;

  const rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
  const summaryLines = [
    rule,
    "📊 Summary",
    rule,
    ` Words loaded from wordlist : ${sourceWords.length}`,
    ` Words matched in DB : ${sourceWords.length - missingWords.length}`,
    ` Words not found in DB : ${missingWords.length}`,
    ` Term IDs resolved : ${termIds.length}`,
    ` Terms added to deck : ${addedCount}`,
    ` Terms already in deck : ${alreadyPresentCount}`,
    ` Validated languages : ${validatedLanguages.length > 0 ? validatedLanguages.join(", ") : "none"}`,
    rule,
  ];
  for (const line of summaryLines) {
    console.log(line);
  }
};

// Any unhandled failure is logged and turned into a non-zero exit code.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|
||||
148
packages/db/src/seeding-cefr-levels.ts
Normal file
148
packages/db/src/seeding-cefr-levels.ts
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
import fs from "node:fs/promises";
|
||||
import { eq, inArray } from "drizzle-orm";
|
||||
|
||||
import {
|
||||
SUPPORTED_LANGUAGE_CODES,
|
||||
SUPPORTED_POS,
|
||||
CEFR_LEVELS,
|
||||
DIFFICULTY_LEVELS,
|
||||
} from "@lila/shared";
|
||||
import { db } from "@lila/db";
|
||||
import { translations, terms } from "@lila/db/schema";
|
||||
|
||||
type POS = (typeof SUPPORTED_POS)[number];
|
||||
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
|
||||
type CEFRLevel = (typeof CEFR_LEVELS)[number];
|
||||
type Difficulty = (typeof DIFFICULTY_LEVELS)[number];
|
||||
|
||||
type MergedRecord = {
|
||||
word: string;
|
||||
pos: POS;
|
||||
cefr: CEFRLevel;
|
||||
difficulty: Difficulty;
|
||||
sources: string[];
|
||||
};
|
||||
|
||||
const dataDir = "./src/data/";
|
||||
const BATCH_SIZE = 500;
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Helpers
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Split `arr` into consecutive slices of at most `size` elements.
 *
 * The last slice may be shorter; an empty input yields an empty result.
 * The input array is not modified.
 *
 * @param arr  Items to split.
 * @param size Maximum slice length; must be a positive integer.
 * @throws RangeError when `size` is not a positive integer (a size of 0 would
 *         otherwise never advance the loop).
 */
function chunk<T>(arr: T[], size: number): T[][] {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }

  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
  return out;
}
|
||||
|
||||
/** Format a number for display using en-US conventions (e.g. 1,234,567). */
function fmt(n: number): string {
  const usNumberFormat = new Intl.NumberFormat("en-US");
  return usNumberFormat.format(n);
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Enrichment per language
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Copy CEFR level and difficulty from `{language}-merged.json` onto the
 * matching rows of the `translations` table.
 *
 * A record matches every translation of `language` whose lowercased text and
 * term part of speech equal the record's lowercased word and `pos`. Each
 * matched translation is updated exactly once: when several records target
 * the same translation with different values, the first record in the file
 * wins and the conflict is counted in the summary. Updates are issued in
 * batches of `BATCH_SIZE` ids, grouped by target value.
 *
 * A missing or unparsable file is reported as a warning and skipped.
 *
 * @param language Language code whose merged file is applied.
 */
async function enrichLanguage(language: LanguageCode): Promise<void> {
  const filename = `${language}-merged.json`;
  const filepath = dataDir + filename;

  console.log(`\n📝 Enriching ${filename}...`);

  let records: MergedRecord[];
  try {
    const raw = await fs.readFile(filepath, "utf8");
    records = JSON.parse(raw) as MergedRecord[];
  } catch (e) {
    console.warn(` ⚠️ Could not read file: ${(e as Error).message}`);
    return;
  }

  console.log(` Loaded ${fmt(records.length)} entries`);

  // 1. Bulk fetch existing translations for this language
  console.log(` 🔍 Fetching existing translations from DB...`);
  const existingTranslations = await db
    .select({ id: translations.id, text: translations.text, pos: terms.pos })
    .from(translations)
    .innerJoin(terms, eq(translations.term_id, terms.id))
    .where(eq(translations.language_code, language));

  // 2. Build lookup map: "lowercase_word|pos" -> translation IDs
  const translationMap = new Map<string, string[]>();
  for (const t of existingTranslations) {
    const key = `${t.text.toLowerCase()}|${t.pos}`;
    const bucket = translationMap.get(key);
    if (bucket) {
      bucket.push(t.id);
    } else {
      translationMap.set(key, [t.id]);
    }
  }

  // 3. Give every matched translation exactly one target value
  //    ("cefr|difficulty"). Assigning per ID — rather than collecting IDs per
  //    value — prevents one row from being updated repeatedly with different
  //    values and keeps the updated-row count accurate.
  const valueById = new Map<string, string>();
  const unmatchedWords: Array<{ word: string; pos: POS; cefr: CEFRLevel }> = [];
  let conflictCount = 0;

  for (const rec of records) {
    const key = `${rec.word.toLowerCase()}|${rec.pos}`;
    const ids = translationMap.get(key);

    if (!ids || ids.length === 0) {
      unmatchedWords.push({ word: rec.word, pos: rec.pos, cefr: rec.cefr });
      continue;
    }

    const valueKey = `${rec.cefr}|${rec.difficulty}`;
    for (const id of ids) {
      const assigned = valueById.get(id);
      if (assigned === undefined) {
        valueById.set(id, valueKey);
      } else if (assigned !== valueKey) {
        conflictCount++; // first record wins; later disagreeing ones are ignored
      }
    }
  }

  // Regroup by target value so each batch is a single UPDATE statement.
  const updatesByValue = new Map<string, string[]>();
  for (const [id, valueKey] of valueById) {
    const group = updatesByValue.get(valueKey);
    if (group) {
      group.push(id);
    } else {
      updatesByValue.set(valueKey, [id]);
    }
  }

  // 4. Batch updates grouped by (cefr, difficulty)
  let totalUpdated = 0;
  for (const [valueKey, ids] of updatesByValue.entries()) {
    const [cefr, difficulty] = valueKey.split("|") as [CEFRLevel, Difficulty];

    for (const idBatch of chunk(ids, BATCH_SIZE)) {
      await db
        .update(translations)
        .set({ cefr_level: cefr, difficulty })
        .where(inArray(translations.id, idBatch));
      totalUpdated += idBatch.length;
    }
  }

  // 5. Summary
  console.log(`\n ✅ Updated ${fmt(totalUpdated)} translations`);
  console.log(` ⚠️ Unmatched: ${fmt(unmatchedWords.length)}`);
  if (conflictCount > 0) {
    console.log(
      ` ⚠️ Conflicting entries ignored (first value kept): ${fmt(conflictCount)}`,
    );
  }

  if (unmatchedWords.length > 0) {
    console.log(`\n Sample unmatched words (first 20):`);
    for (const { word, pos, cefr } of unmatchedWords.slice(0, 20)) {
      console.log(` "${word}" (${pos}, ${cefr})`);
    }
    if (unmatchedWords.length > 20) {
      console.log(` ... and ${fmt(unmatchedWords.length - 20)} more`);
    }
  }
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Main
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Entry point: print a banner, enrich every supported language one after
 * another, then print a closing banner.
 */
const main = async () => {
  const banner = "##########################################";

  console.log(banner);
  console.log("lila — CEFR Enrichment");
  console.log(`${banner}\n`);

  // Sequential on purpose: each language's log output stays together.
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    await enrichLanguage(lang);
  }

  console.log(`\n${banner}`);
  console.log("Done");
  console.log(banner);
};

// Any unhandled failure is logged and turned into a non-zero exit code.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
|
||||
212
packages/db/src/seeding-datafiles.ts
Normal file
212
packages/db/src/seeding-datafiles.ts
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
import fs from "node:fs/promises";
|
||||
import { and, count, eq, inArray } from "drizzle-orm";
|
||||
|
||||
import { SUPPORTED_LANGUAGE_CODES, SUPPORTED_POS } from "@lila/shared";
|
||||
import { db } from "@lila/db";
|
||||
import { terms, translations, term_glosses } from "@lila/db/schema";
|
||||
|
||||
// Part-of-speech tag: any member of SUPPORTED_POS.
type POS = (typeof SUPPORTED_POS)[number];

// Language tag: any member of SUPPORTED_LANGUAGE_CODES.
type LanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];

// One synset entry as parsed from the seed JSON files.
type SynsetRecord = {
  // Identifier of the synset in its source; paired with the source name it
  // forms the idempotency key used when inserting terms.
  source_id: string;
  // Part of speech shared by every lemma of the synset.
  pos: POS;
  // Lemmas per language; a language may be absent.
  translations: { [Lang in LanguageCode]?: string[] };
  // Gloss texts per language; a language may be absent.
  glosses: { [Lang in LanguageCode]?: string[] };
};
|
||||
|
||||
// Directory prefix that seed file names are appended to (relative path).
const dataDir = "./src/data/";

// Number of items per chunk for both synset batches and row inserts.
const BATCH_SIZE = 500;
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Helpers
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Splits an array into consecutive slices of at most `size` elements.
 *
 * The input is not modified; an empty input yields an empty result, and the
 * last slice may be shorter than `size`.
 *
 * @param arr  Items to split.
 * @param size Maximum number of items per slice; must be at least 1.
 * @returns The slices, in input order.
 * @throws RangeError if `size` is NaN or smaller than 1. Without this guard a
 *         size of 0 or a negative size never advances the loop (infinite
 *         loop), and NaN silently returns a single empty slice.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(size >= 1)) {
    throw new RangeError(`chunk size must be >= 1, received ${size}`);
  }

  const out: T[][] = [];
  for (let start = 0; start < arr.length; start += size) {
    out.push(arr.slice(start, start + size));
  }
  return out;
}
|
||||
|
||||
/**
 * Renders a number using en-US conventions (e.g. comma thousands separators)
 * for log output.
 *
 * @param n Number to render.
 * @returns The formatted text.
 */
function fmt(n: number): string {
  const usFormatter = new Intl.NumberFormat("en-US");
  return usFormatter.format(n);
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Stats
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
// Builds a fresh pair of counters starting at zero.
const newCounter = () => ({ inserted: 0, skipped: 0 });

// Running totals for the whole run, one counter pair per table.
// "skipped" counts rows that were attempted but not inserted.
const stats = {
  terms: newCounter(),
  translations: newCounter(),
  glosses: newCounter(),
};
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Per-batch processing
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Flattens the per-language string lists of each synset into insertable rows.
 *
 * Records whose source_id does not resolve to a term id are skipped, and a
 * missing language map or a missing list is treated as empty.
 *
 * @param batch         Synset records to expand.
 * @param resolveTermId Returns the term id for a source_id, or undefined if unknown.
 * @param pick          Selects which per-language map of a record to expand.
 * @returns One row per (term, language, text) combination, in input order.
 */
function buildLocalizedRows<Id>(
  batch: SynsetRecord[],
  resolveTermId: (sourceId: string) => Id | undefined,
  pick: (
    record: SynsetRecord,
  ) => Partial<Record<LanguageCode, string[]>> | undefined,
): { term_id: Id; language_code: LanguageCode; text: string }[] {
  return batch.flatMap((record) => {
    const termId = resolveTermId(record.source_id);
    if (!termId) return [];
    return Object.entries(pick(record) ?? {}).flatMap(([lang, texts]) =>
      (texts ?? []).map((text) => ({
        term_id: termId,
        language_code: lang as LanguageCode,
        text,
      })),
    );
  });
}

/**
 * Seeds one batch of synsets: inserts the terms, then their translations and
 * glosses, updating the module-level `stats` counters along the way.
 *
 * Every insert uses onConflictDoNothing, so rows that are not inserted are
 * counted as skipped instead of failing the run.
 *
 * @param batch Synset records to seed. The caller passes chunks produced by
 *              `chunk`, which are never empty.
 */
async function processBatch(batch: SynsetRecord[]): Promise<void> {
  // 1. Insert terms — idempotency key: (source, source_id)
  const termValues = batch.map((r) => ({
    source: "omw" as const,
    source_id: r.source_id,
    pos: r.pos,
  }));

  const insertedTerms = await db
    .insert(terms)
    .values(termValues)
    .onConflictDoNothing()
    .returning({ id: terms.id });

  stats.terms.inserted += insertedTerms.length;
  stats.terms.skipped += batch.length - insertedTerms.length;

  // 2. Resolve ids for every source_id in this batch (new + pre-existing).
  //    The .returning() above is not enough on its own because
  //    onConflictDoNothing returns nothing for rows that already existed.
  const sourceIds = batch.map((r) => r.source_id);
  const termRows = await db
    .select({ id: terms.id, source_id: terms.source_id })
    .from(terms)
    .where(and(eq(terms.source, "omw"), inArray(terms.source_id, sourceIds)));

  const sourceIdToTermId = new Map(termRows.map((r) => [r.source_id, r.id]));
  const resolveTermId = (sourceId: string) => sourceIdToTermId.get(sourceId);

  // 3. Build and insert translation rows
  const translationRows = buildLocalizedRows(
    batch,
    resolveTermId,
    (r) => r.translations,
  );

  for (const tBatch of chunk(translationRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(translations)
      .values(tBatch)
      .onConflictDoNothing()
      .returning({ id: translations.id });

    stats.translations.inserted += inserted.length;
    stats.translations.skipped += tBatch.length - inserted.length;
  }

  // 4. Build and insert gloss rows
  const glossRows = buildLocalizedRows(batch, resolveTermId, (r) => r.glosses);

  for (const gBatch of chunk(glossRows, BATCH_SIZE)) {
    const inserted = await db
      .insert(term_glosses)
      .values(gBatch)
      .onConflictDoNothing()
      .returning({ id: term_glosses.id });

    stats.glosses.inserted += inserted.length;
    stats.glosses.skipped += gBatch.length - inserted.length;
  }
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// Main
|
||||
// ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: seeds the database from the per-POS OMW JSON files,
 * then prints insert counters and the per-language totals found in the DB.
 *
 * A file that cannot be read or parsed is reported with a warning and
 * skipped; the remaining files are still processed.
 */
async function main(): Promise<void> {
  console.log("\n##########################################");
  console.log("lila — OMW seed");
  console.log("##########################################\n");

  // One file per POS — the name is derived from the SUPPORTED_POS value, so
  // adding a new constant value automatically picks up a new file next run.
  for (const pos of SUPPORTED_POS) {
    const filename = `omw-${pos}.json`;
    const filepath = dataDir + filename;

    console.log(`📄 ${filename}`);

    let records: SynsetRecord[];
    try {
      const fileText = await fs.readFile(filepath, "utf8");
      records = JSON.parse(fileText) as SynsetRecord[];
    } catch (e) {
      console.warn(
        ` ⚠️ Skipping — could not read file: ${(e as Error).message}\n`,
      );
      continue;
    }

    console.log(` Loaded ${fmt(records.length)} synsets`);

    let batchIndex = 0;
    for (const batch of chunk(records, BATCH_SIZE)) {
      // Progress every 10 batches (5 000 synsets at the current batch size)
      if (batchIndex > 0 && batchIndex % 10 === 0) {
        const processed = batchIndex * BATCH_SIZE;
        console.log(` ⏳ ${fmt(processed)} / ${fmt(records.length)}`);
      }
      await processBatch(batch);
      batchIndex += 1;
    }

    console.log(` ✅ Done\n`);
  }

  // ── Summary ───────────────────────────────────────────────

  console.log("##########################################");
  console.log("Summary");
  console.log("##########################################\n");

  const summaryRows = [
    ["Terms:", stats.terms],
    ["Translations:", stats.translations],
    ["Glosses:", stats.glosses],
  ] as const;

  for (const [label, counters] of summaryRows) {
    console.log(
      `${label.padEnd(14)}inserted ${fmt(counters.inserted)}, skipped ${fmt(counters.skipped)}`,
    );
  }

  // Query actual DB totals — insert-based counters show 0 on re-runs.
  console.log("\nCoverage per language (total in DB):");
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    const translationCounts = await db
      .select({ n: count() })
      .from(translations)
      .where(eq(translations.language_code, lang));
    const glossCounts = await db
      .select({ n: count() })
      .from(term_glosses)
      .where(eq(term_glosses.language_code, lang));

    const [tRow] = translationCounts;
    const [gRow] = glossCounts;
    console.log(
      ` ${lang}: ${fmt(tRow?.n ?? 0)} translations, ${fmt(gRow?.n ?? 0)} glosses`,
    );
  }
}
|
||||
|
||||
// Run the seed; any rejection is printed and turned into a non-zero exit code.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
|
||||
|
|
@ -5,11 +5,7 @@
|
|||
"moduleResolution": "NodeNext",
|
||||
"outDir": "./dist",
|
||||
"resolveJsonModule": true,
|
||||
"types": ["vitest/globals"],
|
||||
"types": ["vitest/globals"]
|
||||
},
|
||||
"include": [
|
||||
"src",
|
||||
"vitest.config.ts",
|
||||
"../../data-pipeline/archive/packages-db-src-old-seeding-scripts/data",
|
||||
],
|
||||
"include": ["src", "vitest.config.ts"]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
export const SUPPORTED_LANGUAGE_CODES = ["en", "it", "de", "fr", "es"] as const;
|
||||
export const SUPPORTED_LANGUAGE_CODES = ["en", "it"] as const;
|
||||
export type SupportedLanguageCode = (typeof SUPPORTED_LANGUAGE_CODES)[number];
|
||||
|
||||
export const SUPPORTED_POS = ["noun", "verb", "adjective", "adverb"] as const;
|
||||
export const SUPPORTED_POS = ["noun", "verb"] as const;
|
||||
export type SupportedPos = (typeof SUPPORTED_POS)[number];
|
||||
|
||||
export const GAME_ROUNDS = ["3", "10"] as const;
|
||||
|
|
|
|||
297
pnpm-lock.yaml
generated
297
pnpm-lock.yaml
generated
|
|
@ -55,7 +55,7 @@ importers:
|
|||
version: link:../../packages/shared
|
||||
better-auth:
|
||||
specifier: ^1.6.2
|
||||
version: 1.6.2(@opentelemetry/api@1.9.1)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)))
|
||||
version: 1.6.2(@opentelemetry/api@1.9.1)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)))
|
||||
cors:
|
||||
specifier: ^2.8.6
|
||||
version: 2.8.6
|
||||
|
|
@ -101,7 +101,7 @@ importers:
|
|||
version: 1.166.10(@tanstack/react-router@1.168.1(react-dom@19.2.4(react@19.2.4))(react@19.2.4))(@tanstack/router-core@1.168.1)(csstype@3.2.3)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)
|
||||
better-auth:
|
||||
specifier: ^1.6.2
|
||||
version: 1.6.2(@opentelemetry/api@1.9.1)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)))
|
||||
version: 1.6.2(@opentelemetry/api@1.9.1)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)))
|
||||
react:
|
||||
specifier: ^19.2.4
|
||||
version: 19.2.4
|
||||
|
|
@ -134,28 +134,6 @@ importers:
|
|||
specifier: ^8.0.1
|
||||
version: 8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0)
|
||||
|
||||
data-pipeline:
|
||||
dependencies:
|
||||
'@lila/shared':
|
||||
specifier: workspace:*
|
||||
version: link:../packages/shared
|
||||
better-sqlite3:
|
||||
specifier: ^12.9.0
|
||||
version: 12.9.0
|
||||
devDependencies:
|
||||
'@types/better-sqlite3':
|
||||
specifier: ^7.6.13
|
||||
version: 7.6.13
|
||||
'@types/node':
|
||||
specifier: ^24.12.0
|
||||
version: 24.12.0
|
||||
tsx:
|
||||
specifier: ^4.21.0
|
||||
version: 4.21.0
|
||||
typescript:
|
||||
specifier: ^5.9.3
|
||||
version: 5.9.3
|
||||
|
||||
packages/db:
|
||||
dependencies:
|
||||
'@lila/shared':
|
||||
|
|
@ -166,7 +144,7 @@ importers:
|
|||
version: 17.3.1
|
||||
drizzle-orm:
|
||||
specifier: ^0.45.1
|
||||
version: 0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)
|
||||
version: 0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0)
|
||||
pg:
|
||||
specifier: ^8.20.0
|
||||
version: 8.20.0
|
||||
|
|
@ -1265,9 +1243,6 @@ packages:
|
|||
'@tybys/wasm-util@0.10.1':
|
||||
resolution: {integrity: sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==}
|
||||
|
||||
'@types/better-sqlite3@7.6.13':
|
||||
resolution: {integrity: sha512-NMv9ASNARoKksWtsq/SHakpYAYnhBrQgGD8zkLYk/jaK8jUGn08CfEdTRgYhMypUQAfzSP8W6gNLe0q19/t4VA==}
|
||||
|
||||
'@types/body-parser@1.19.6':
|
||||
resolution: {integrity: sha512-HLFeCYgz89uk22N5Qg3dvGvsv46B8GLvKKo1zKG4NybA8U2DiEO3w9lqGg29t/tfLRJpJ6iQxnVw4OnB7MoM9g==}
|
||||
|
||||
|
|
@ -1516,9 +1491,6 @@ packages:
|
|||
resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==}
|
||||
engines: {node: 18 || 20 || >=22}
|
||||
|
||||
base64-js@1.5.1:
|
||||
resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
|
||||
|
||||
baseline-browser-mapping@2.10.9:
|
||||
resolution: {integrity: sha512-OZd0e2mU11ClX8+IdXe3r0dbqMEznRiT4TfbhYIbcRPZkqJ7Qwer8ij3GZAmLsRKa+II9V1v5czCkvmHH3XZBg==}
|
||||
engines: {node: '>=6.0.0'}
|
||||
|
|
@ -1594,10 +1566,6 @@ packages:
|
|||
zod:
|
||||
optional: true
|
||||
|
||||
better-sqlite3@12.9.0:
|
||||
resolution: {integrity: sha512-wqUv4Gm3toFpHDQmaKD4QhZm3g1DjUBI0yzS4UBl6lElUmXFYdTQmmEDpAFa5o8FiFiymURypEnfVHzILKaxqQ==}
|
||||
engines: {node: 20.x || 22.x || 23.x || 24.x || 25.x}
|
||||
|
||||
bidi-js@1.0.3:
|
||||
resolution: {integrity: sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==}
|
||||
|
||||
|
|
@ -1605,12 +1573,6 @@ packages:
|
|||
resolution: {integrity: sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
bindings@1.5.0:
|
||||
resolution: {integrity: sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==}
|
||||
|
||||
bl@4.1.0:
|
||||
resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==}
|
||||
|
||||
body-parser@2.2.2:
|
||||
resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==}
|
||||
engines: {node: '>=18'}
|
||||
|
|
@ -1631,9 +1593,6 @@ packages:
|
|||
buffer-from@1.1.2:
|
||||
resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==}
|
||||
|
||||
buffer@5.7.1:
|
||||
resolution: {integrity: sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==}
|
||||
|
||||
bytes@3.1.2:
|
||||
resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==}
|
||||
engines: {node: '>= 0.8'}
|
||||
|
|
@ -1665,9 +1624,6 @@ packages:
|
|||
resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==}
|
||||
engines: {node: '>= 8.10.0'}
|
||||
|
||||
chownr@1.1.4:
|
||||
resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==}
|
||||
|
||||
cliui@8.0.1:
|
||||
resolution: {integrity: sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==}
|
||||
engines: {node: '>=12'}
|
||||
|
|
@ -1760,14 +1716,6 @@ packages:
|
|||
decimal.js@10.6.0:
|
||||
resolution: {integrity: sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==}
|
||||
|
||||
decompress-response@6.0.0:
|
||||
resolution: {integrity: sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
deep-extend@0.6.0:
|
||||
resolution: {integrity: sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==}
|
||||
engines: {node: '>=4.0.0'}
|
||||
|
||||
deep-is@0.1.4:
|
||||
resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==}
|
||||
|
||||
|
|
@ -1910,9 +1858,6 @@ packages:
|
|||
resolution: {integrity: sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==}
|
||||
engines: {node: '>= 0.8'}
|
||||
|
||||
end-of-stream@1.4.5:
|
||||
resolution: {integrity: sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==}
|
||||
|
||||
enhanced-resolve@5.20.1:
|
||||
resolution: {integrity: sha512-Qohcme7V1inbAfvjItgw0EaxVX5q2rdVEZHRBrEQdRZTssLDGsL8Lwrznl8oQ/6kuTJONLaDcGjkNP247XEhcA==}
|
||||
engines: {node: '>=10.13.0'}
|
||||
|
|
@ -2037,10 +1982,6 @@ packages:
|
|||
resolution: {integrity: sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==}
|
||||
engines: {node: '>= 0.6'}
|
||||
|
||||
expand-template@2.0.3:
|
||||
resolution: {integrity: sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==}
|
||||
engines: {node: '>=6'}
|
||||
|
||||
expect-type@1.3.0:
|
||||
resolution: {integrity: sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==}
|
||||
engines: {node: '>=12.0.0'}
|
||||
|
|
@ -2074,9 +2015,6 @@ packages:
|
|||
resolution: {integrity: sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==}
|
||||
engines: {node: '>=16.0.0'}
|
||||
|
||||
file-uri-to-path@1.0.0:
|
||||
resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==}
|
||||
|
||||
fill-range@7.1.1:
|
||||
resolution: {integrity: sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==}
|
||||
engines: {node: '>=8'}
|
||||
|
|
@ -2116,9 +2054,6 @@ packages:
|
|||
resolution: {integrity: sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==}
|
||||
engines: {node: '>= 0.8'}
|
||||
|
||||
fs-constants@1.0.0:
|
||||
resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==}
|
||||
|
||||
fsevents@2.3.3:
|
||||
resolution: {integrity: sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==}
|
||||
engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0}
|
||||
|
|
@ -2146,9 +2081,6 @@ packages:
|
|||
get-tsconfig@4.13.6:
|
||||
resolution: {integrity: sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw==}
|
||||
|
||||
github-from-package@0.0.0:
|
||||
resolution: {integrity: sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==}
|
||||
|
||||
glob-parent@5.1.2:
|
||||
resolution: {integrity: sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==}
|
||||
engines: {node: '>= 6'}
|
||||
|
|
@ -2206,9 +2138,6 @@ packages:
|
|||
resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
|
||||
ieee754@1.2.1:
|
||||
resolution: {integrity: sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==}
|
||||
|
||||
ignore@5.3.2:
|
||||
resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==}
|
||||
engines: {node: '>= 4'}
|
||||
|
|
@ -2224,9 +2153,6 @@ packages:
|
|||
inherits@2.0.4:
|
||||
resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==}
|
||||
|
||||
ini@1.3.8:
|
||||
resolution: {integrity: sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==}
|
||||
|
||||
ipaddr.js@1.9.1:
|
||||
resolution: {integrity: sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==}
|
||||
engines: {node: '>= 0.10'}
|
||||
|
|
@ -2463,20 +2389,10 @@ packages:
|
|||
engines: {node: '>=4.0.0'}
|
||||
hasBin: true
|
||||
|
||||
mimic-response@3.1.0:
|
||||
resolution: {integrity: sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
minimatch@10.2.4:
|
||||
resolution: {integrity: sha512-oRjTw/97aTBN0RHbYCdtF1MQfvusSIBQM0IZEgzl6426+8jSC0nF1a/GmnVLpfB9yyr6g6FTqWqiZVbxrtaCIg==}
|
||||
engines: {node: 18 || 20 || >=22}
|
||||
|
||||
minimist@1.2.8:
|
||||
resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==}
|
||||
|
||||
mkdirp-classic@0.5.3:
|
||||
resolution: {integrity: sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==}
|
||||
|
||||
ms@2.1.3:
|
||||
resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
|
||||
|
||||
|
|
@ -2489,9 +2405,6 @@ packages:
|
|||
resolution: {integrity: sha512-F0wCzbsH80G7XXo0Jd9/AVQC7ouWY6idUCTnMwW5t/Rv9W8qmO6endavDwg7TNp5GbugwSukFMVZqzPSrSMndg==}
|
||||
engines: {node: ^20.0.0 || >=22.0.0}
|
||||
|
||||
napi-build-utils@2.0.0:
|
||||
resolution: {integrity: sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==}
|
||||
|
||||
natural-compare@1.4.0:
|
||||
resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==}
|
||||
|
||||
|
|
@ -2499,10 +2412,6 @@ packages:
|
|||
resolution: {integrity: sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==}
|
||||
engines: {node: '>= 0.6'}
|
||||
|
||||
node-abi@3.89.0:
|
||||
resolution: {integrity: sha512-6u9UwL0HlAl21+agMN3YAMXcKByMqwGx+pq+P76vii5f7hTPtKDp08/H9py6DY+cfDw7kQNTGEj/rly3IgbNQA==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
node-releases@2.0.36:
|
||||
resolution: {integrity: sha512-TdC8FSgHz8Mwtw9g5L4gR/Sh9XhSP/0DEkQxfEFXOpiul5IiHgHan2VhYYb6agDSfp4KuvltmGApc8HMgUrIkA==}
|
||||
|
||||
|
|
@ -2626,12 +2535,6 @@ packages:
|
|||
resolution: {integrity: sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
|
||||
prebuild-install@7.1.3:
|
||||
resolution: {integrity: sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==}
|
||||
engines: {node: '>=10'}
|
||||
deprecated: No longer maintained. Please contact the author of the relevant native addon; alternatives are available.
|
||||
hasBin: true
|
||||
|
||||
prelude-ls@1.2.1:
|
||||
resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==}
|
||||
engines: {node: '>= 0.8.0'}
|
||||
|
|
@ -2645,9 +2548,6 @@ packages:
|
|||
resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==}
|
||||
engines: {node: '>= 0.10'}
|
||||
|
||||
pump@3.0.4:
|
||||
resolution: {integrity: sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA==}
|
||||
|
||||
punycode@2.3.1:
|
||||
resolution: {integrity: sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==}
|
||||
engines: {node: '>=6'}
|
||||
|
|
@ -2664,10 +2564,6 @@ packages:
|
|||
resolution: {integrity: sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==}
|
||||
engines: {node: '>= 0.10'}
|
||||
|
||||
rc@1.2.8:
|
||||
resolution: {integrity: sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==}
|
||||
hasBin: true
|
||||
|
||||
react-dom@19.2.4:
|
||||
resolution: {integrity: sha512-AXJdLo8kgMbimY95O2aKQqsz2iWi9jMgKJhRBAxECE4IFxfcazB2LmzloIoibJI3C12IlY20+KFaLv+71bUJeQ==}
|
||||
peerDependencies:
|
||||
|
|
@ -2677,10 +2573,6 @@ packages:
|
|||
resolution: {integrity: sha512-9nfp2hYpCwOjAN+8TZFGhtWEwgvWHXqESH8qT89AT/lWklpLON22Lc8pEtnpsZz7VmawabSU0gCjnj8aC0euHQ==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
|
||||
readable-stream@3.6.2:
|
||||
resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==}
|
||||
engines: {node: '>= 6'}
|
||||
|
||||
readdirp@3.6.0:
|
||||
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
|
||||
engines: {node: '>=8.10.0'}
|
||||
|
|
@ -2715,9 +2607,6 @@ packages:
|
|||
rxjs@7.8.2:
|
||||
resolution: {integrity: sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==}
|
||||
|
||||
safe-buffer@5.2.1:
|
||||
resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
|
||||
|
||||
safer-buffer@2.1.2:
|
||||
resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==}
|
||||
|
||||
|
|
@ -2792,12 +2681,6 @@ packages:
|
|||
siginfo@2.0.0:
|
||||
resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==}
|
||||
|
||||
simple-concat@1.0.1:
|
||||
resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==}
|
||||
|
||||
simple-get@4.0.1:
|
||||
resolution: {integrity: sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==}
|
||||
|
||||
source-map-js@1.2.1:
|
||||
resolution: {integrity: sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
|
|
@ -2835,17 +2718,10 @@ packages:
|
|||
resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
string_decoder@1.3.0:
|
||||
resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==}
|
||||
|
||||
strip-ansi@6.0.1:
|
||||
resolution: {integrity: sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
strip-json-comments@2.0.1:
|
||||
resolution: {integrity: sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
|
||||
superagent@10.3.0:
|
||||
resolution: {integrity: sha512-B+4Ik7ROgVKrQsXTV0Jwp2u+PXYLSlqtDAhYnkkD+zn3yg8s/zjA2MeGayPoY/KICrbitwneDHrjSotxKL+0XQ==}
|
||||
engines: {node: '>=14.18.0'}
|
||||
|
|
@ -2872,13 +2748,6 @@ packages:
|
|||
resolution: {integrity: sha512-g9ljZiwki/LfxmQADO3dEY1CbpmXT5Hm2fJ+QaGKwSXUylMybePR7/67YW7jOrrvjEgL1Fmz5kzyAjWVWLlucg==}
|
||||
engines: {node: '>=6'}
|
||||
|
||||
tar-fs@2.1.4:
|
||||
resolution: {integrity: sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==}
|
||||
|
||||
tar-stream@2.2.0:
|
||||
resolution: {integrity: sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==}
|
||||
engines: {node: '>=6'}
|
||||
|
||||
tiny-invariant@1.3.3:
|
||||
resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==}
|
||||
|
||||
|
|
@ -2941,9 +2810,6 @@ packages:
|
|||
engines: {node: '>=18.0.0'}
|
||||
hasBin: true
|
||||
|
||||
tunnel-agent@0.6.0:
|
||||
resolution: {integrity: sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==}
|
||||
|
||||
type-check@0.4.0:
|
||||
resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==}
|
||||
engines: {node: '>= 0.8.0'}
|
||||
|
|
@ -2996,9 +2862,6 @@ packages:
|
|||
peerDependencies:
|
||||
react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
|
||||
|
||||
util-deprecate@1.0.2:
|
||||
resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
|
||||
|
||||
vary@1.1.2:
|
||||
resolution: {integrity: sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==}
|
||||
engines: {node: '>= 0.8'}
|
||||
|
|
@ -3335,12 +3198,12 @@ snapshots:
|
|||
nanostores: 1.2.0
|
||||
zod: 4.3.6
|
||||
|
||||
'@better-auth/drizzle-adapter@1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))':
|
||||
'@better-auth/drizzle-adapter@1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))':
|
||||
dependencies:
|
||||
'@better-auth/core': 1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0)
|
||||
'@better-auth/utils': 0.4.0
|
||||
optionalDependencies:
|
||||
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)
|
||||
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0)
|
||||
|
||||
'@better-auth/kysely-adapter@1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(kysely@0.28.16)':
|
||||
dependencies:
|
||||
|
|
@ -3980,10 +3843,6 @@ snapshots:
|
|||
tslib: 2.8.1
|
||||
optional: true
|
||||
|
||||
'@types/better-sqlite3@7.6.13':
|
||||
dependencies:
|
||||
'@types/node': 24.12.0
|
||||
|
||||
'@types/body-parser@1.19.6':
|
||||
dependencies:
|
||||
'@types/connect': 3.4.38
|
||||
|
|
@ -4301,14 +4160,12 @@ snapshots:
|
|||
|
||||
balanced-match@4.0.4: {}
|
||||
|
||||
base64-js@1.5.1: {}
|
||||
|
||||
baseline-browser-mapping@2.10.9: {}
|
||||
|
||||
better-auth@1.6.2(@opentelemetry/api@1.9.1)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0))):
|
||||
better-auth@1.6.2(@opentelemetry/api@1.9.1)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@24.12.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0))):
|
||||
dependencies:
|
||||
'@better-auth/core': 1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0)
|
||||
'@better-auth/drizzle-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))
|
||||
'@better-auth/drizzle-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))
|
||||
'@better-auth/kysely-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(kysely@0.28.16)
|
||||
'@better-auth/memory-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)
|
||||
'@better-auth/mongo-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)
|
||||
|
|
@ -4325,9 +4182,8 @@ snapshots:
|
|||
nanostores: 1.2.0
|
||||
zod: 4.3.6
|
||||
optionalDependencies:
|
||||
better-sqlite3: 12.9.0
|
||||
drizzle-kit: 0.31.10
|
||||
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)
|
||||
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0)
|
||||
pg: 8.20.0
|
||||
react: 19.2.4
|
||||
react-dom: 19.2.4(react@19.2.4)
|
||||
|
|
@ -4336,10 +4192,10 @@ snapshots:
|
|||
- '@cloudflare/workers-types'
|
||||
- '@opentelemetry/api'
|
||||
|
||||
better-auth@1.6.2(@opentelemetry/api@1.9.1)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0))):
|
||||
better-auth@1.6.2(@opentelemetry/api@1.9.1)(drizzle-kit@0.31.10)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))(pg@8.20.0)(react-dom@19.2.4(react@19.2.4))(react@19.2.4)(vitest@4.1.0(@opentelemetry/api@1.9.1)(@types/node@25.5.0)(jsdom@29.0.1(@noble/hashes@2.2.0))(vite@8.0.1(@types/node@25.5.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0))):
|
||||
dependencies:
|
||||
'@better-auth/core': 1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0)
|
||||
'@better-auth/drizzle-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0))
|
||||
'@better-auth/drizzle-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0))
|
||||
'@better-auth/kysely-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)(kysely@0.28.16)
|
||||
'@better-auth/memory-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)
|
||||
'@better-auth/mongo-adapter': 1.6.2(@better-auth/core@1.6.2(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.2.0))(@better-auth/utils@0.4.0)
|
||||
|
|
@ -4356,9 +4212,8 @@ snapshots:
|
|||
nanostores: 1.2.0
|
||||
zod: 4.3.6
|
||||
optionalDependencies:
|
||||
better-sqlite3: 12.9.0
|
||||
drizzle-kit: 0.31.10
|
||||
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)
|
||||
drizzle-orm: 0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0)
|
||||
pg: 8.20.0
|
||||
react: 19.2.4
|
||||
react-dom: 19.2.4(react@19.2.4)
|
||||
|
|
@ -4376,27 +4231,12 @@ snapshots:
|
|||
optionalDependencies:
|
||||
zod: 4.3.6
|
||||
|
||||
better-sqlite3@12.9.0:
|
||||
dependencies:
|
||||
bindings: 1.5.0
|
||||
prebuild-install: 7.1.3
|
||||
|
||||
bidi-js@1.0.3:
|
||||
dependencies:
|
||||
require-from-string: 2.0.2
|
||||
|
||||
binary-extensions@2.3.0: {}
|
||||
|
||||
bindings@1.5.0:
|
||||
dependencies:
|
||||
file-uri-to-path: 1.0.0
|
||||
|
||||
bl@4.1.0:
|
||||
dependencies:
|
||||
buffer: 5.7.1
|
||||
inherits: 2.0.4
|
||||
readable-stream: 3.6.2
|
||||
|
||||
body-parser@2.2.2:
|
||||
dependencies:
|
||||
bytes: 3.1.2
|
||||
|
|
@ -4429,11 +4269,6 @@ snapshots:
|
|||
|
||||
buffer-from@1.1.2: {}
|
||||
|
||||
buffer@5.7.1:
|
||||
dependencies:
|
||||
base64-js: 1.5.1
|
||||
ieee754: 1.2.1
|
||||
|
||||
bytes@3.1.2: {}
|
||||
|
||||
call-bind-apply-helpers@1.0.2:
|
||||
|
|
@ -4472,8 +4307,6 @@ snapshots:
|
|||
optionalDependencies:
|
||||
fsevents: 2.3.3
|
||||
|
||||
chownr@1.1.4: {}
|
||||
|
||||
cliui@8.0.1:
|
||||
dependencies:
|
||||
string-width: 4.2.3
|
||||
|
|
@ -4552,12 +4385,6 @@ snapshots:
|
|||
|
||||
decimal.js@10.6.0: {}
|
||||
|
||||
decompress-response@6.0.0:
|
||||
dependencies:
|
||||
mimic-response: 3.1.0
|
||||
|
||||
deep-extend@0.6.0: {}
|
||||
|
||||
deep-is@0.1.4: {}
|
||||
|
||||
defu@6.1.7: {}
|
||||
|
|
@ -4584,12 +4411,10 @@ snapshots:
|
|||
esbuild: 0.25.12
|
||||
tsx: 4.21.0
|
||||
|
||||
drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/better-sqlite3@7.6.13)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0):
|
||||
drizzle-orm@0.45.1(@opentelemetry/api@1.9.1)(@types/pg@8.20.0)(kysely@0.28.16)(pg@8.20.0):
|
||||
optionalDependencies:
|
||||
'@opentelemetry/api': 1.9.1
|
||||
'@types/better-sqlite3': 7.6.13
|
||||
'@types/pg': 8.20.0
|
||||
better-sqlite3: 12.9.0
|
||||
kysely: 0.28.16
|
||||
pg: 8.20.0
|
||||
|
||||
|
|
@ -4607,10 +4432,6 @@ snapshots:
|
|||
|
||||
encodeurl@2.0.0: {}
|
||||
|
||||
end-of-stream@1.4.5:
|
||||
dependencies:
|
||||
once: 1.4.0
|
||||
|
||||
enhanced-resolve@5.20.1:
|
||||
dependencies:
|
||||
graceful-fs: 4.2.11
|
||||
|
|
@ -4817,8 +4638,6 @@ snapshots:
|
|||
|
||||
etag@1.8.1: {}
|
||||
|
||||
expand-template@2.0.3: {}
|
||||
|
||||
expect-type@1.3.0: {}
|
||||
|
||||
express@5.2.1:
|
||||
|
|
@ -4870,8 +4689,6 @@ snapshots:
|
|||
dependencies:
|
||||
flat-cache: 4.0.1
|
||||
|
||||
file-uri-to-path@1.0.0: {}
|
||||
|
||||
fill-range@7.1.1:
|
||||
dependencies:
|
||||
to-regex-range: 5.0.1
|
||||
|
|
@ -4919,8 +4736,6 @@ snapshots:
|
|||
|
||||
fresh@2.0.0: {}
|
||||
|
||||
fs-constants@1.0.0: {}
|
||||
|
||||
fsevents@2.3.3:
|
||||
optional: true
|
||||
|
||||
|
|
@ -4952,8 +4767,6 @@ snapshots:
|
|||
dependencies:
|
||||
resolve-pkg-maps: 1.0.0
|
||||
|
||||
github-from-package@0.0.0: {}
|
||||
|
||||
glob-parent@5.1.2:
|
||||
dependencies:
|
||||
is-glob: 4.0.3
|
||||
|
|
@ -5008,8 +4821,6 @@ snapshots:
|
|||
dependencies:
|
||||
safer-buffer: 2.1.2
|
||||
|
||||
ieee754@1.2.1: {}
|
||||
|
||||
ignore@5.3.2: {}
|
||||
|
||||
ignore@7.0.5: {}
|
||||
|
|
@ -5018,8 +4829,6 @@ snapshots:
|
|||
|
||||
inherits@2.0.4: {}
|
||||
|
||||
ini@1.3.8: {}
|
||||
|
||||
ipaddr.js@1.9.1: {}
|
||||
|
||||
is-binary-path@2.1.0:
|
||||
|
|
@ -5209,32 +5018,20 @@ snapshots:
|
|||
|
||||
mime@2.6.0: {}
|
||||
|
||||
mimic-response@3.1.0: {}
|
||||
|
||||
minimatch@10.2.4:
|
||||
dependencies:
|
||||
brace-expansion: 5.0.4
|
||||
|
||||
minimist@1.2.8: {}
|
||||
|
||||
mkdirp-classic@0.5.3: {}
|
||||
|
||||
ms@2.1.3: {}
|
||||
|
||||
nanoid@3.3.11: {}
|
||||
|
||||
nanostores@1.2.0: {}
|
||||
|
||||
napi-build-utils@2.0.0: {}
|
||||
|
||||
natural-compare@1.4.0: {}
|
||||
|
||||
negotiator@1.0.0: {}
|
||||
|
||||
node-abi@3.89.0:
|
||||
dependencies:
|
||||
semver: 7.7.4
|
||||
|
||||
node-releases@2.0.36: {}
|
||||
|
||||
normalize-path@3.0.0: {}
|
||||
|
|
@ -5341,21 +5138,6 @@ snapshots:
|
|||
dependencies:
|
||||
xtend: 4.0.2
|
||||
|
||||
prebuild-install@7.1.3:
|
||||
dependencies:
|
||||
detect-libc: 2.1.2
|
||||
expand-template: 2.0.3
|
||||
github-from-package: 0.0.0
|
||||
minimist: 1.2.8
|
||||
mkdirp-classic: 0.5.3
|
||||
napi-build-utils: 2.0.0
|
||||
node-abi: 3.89.0
|
||||
pump: 3.0.4
|
||||
rc: 1.2.8
|
||||
simple-get: 4.0.1
|
||||
tar-fs: 2.1.4
|
||||
tunnel-agent: 0.6.0
|
||||
|
||||
prelude-ls@1.2.1: {}
|
||||
|
||||
prettier@3.8.1: {}
|
||||
|
|
@ -5365,11 +5147,6 @@ snapshots:
|
|||
forwarded: 0.2.0
|
||||
ipaddr.js: 1.9.1
|
||||
|
||||
pump@3.0.4:
|
||||
dependencies:
|
||||
end-of-stream: 1.4.5
|
||||
once: 1.4.0
|
||||
|
||||
punycode@2.3.1: {}
|
||||
|
||||
qs@6.15.0:
|
||||
|
|
@ -5385,13 +5162,6 @@ snapshots:
|
|||
iconv-lite: 0.7.2
|
||||
unpipe: 1.0.0
|
||||
|
||||
rc@1.2.8:
|
||||
dependencies:
|
||||
deep-extend: 0.6.0
|
||||
ini: 1.3.8
|
||||
minimist: 1.2.8
|
||||
strip-json-comments: 2.0.1
|
||||
|
||||
react-dom@19.2.4(react@19.2.4):
|
||||
dependencies:
|
||||
react: 19.2.4
|
||||
|
|
@ -5399,12 +5169,6 @@ snapshots:
|
|||
|
||||
react@19.2.4: {}
|
||||
|
||||
readable-stream@3.6.2:
|
||||
dependencies:
|
||||
inherits: 2.0.4
|
||||
string_decoder: 1.3.0
|
||||
util-deprecate: 1.0.2
|
||||
|
||||
readdirp@3.6.0:
|
||||
dependencies:
|
||||
picomatch: 2.3.1
|
||||
|
|
@ -5460,8 +5224,6 @@ snapshots:
|
|||
dependencies:
|
||||
tslib: 2.8.1
|
||||
|
||||
safe-buffer@5.2.1: {}
|
||||
|
||||
safer-buffer@2.1.2: {}
|
||||
|
||||
saxes@6.0.0:
|
||||
|
|
@ -5547,14 +5309,6 @@ snapshots:
|
|||
|
||||
siginfo@2.0.0: {}
|
||||
|
||||
simple-concat@1.0.1: {}
|
||||
|
||||
simple-get@4.0.1:
|
||||
dependencies:
|
||||
decompress-response: 6.0.0
|
||||
once: 1.4.0
|
||||
simple-concat: 1.0.1
|
||||
|
||||
source-map-js@1.2.1: {}
|
||||
|
||||
source-map-support@0.5.21:
|
||||
|
|
@ -5584,16 +5338,10 @@ snapshots:
|
|||
is-fullwidth-code-point: 3.0.0
|
||||
strip-ansi: 6.0.1
|
||||
|
||||
string_decoder@1.3.0:
|
||||
dependencies:
|
||||
safe-buffer: 5.2.1
|
||||
|
||||
strip-ansi@6.0.1:
|
||||
dependencies:
|
||||
ansi-regex: 5.0.1
|
||||
|
||||
strip-json-comments@2.0.1: {}
|
||||
|
||||
superagent@10.3.0:
|
||||
dependencies:
|
||||
component-emitter: 1.3.1
|
||||
|
|
@ -5630,21 +5378,6 @@ snapshots:
|
|||
|
||||
tapable@2.3.0: {}
|
||||
|
||||
tar-fs@2.1.4:
|
||||
dependencies:
|
||||
chownr: 1.1.4
|
||||
mkdirp-classic: 0.5.3
|
||||
pump: 3.0.4
|
||||
tar-stream: 2.2.0
|
||||
|
||||
tar-stream@2.2.0:
|
||||
dependencies:
|
||||
bl: 4.1.0
|
||||
end-of-stream: 1.4.5
|
||||
fs-constants: 1.0.0
|
||||
inherits: 2.0.4
|
||||
readable-stream: 3.6.2
|
||||
|
||||
tiny-invariant@1.3.3: {}
|
||||
|
||||
tiny-warning@1.0.3: {}
|
||||
|
|
@ -5695,10 +5428,6 @@ snapshots:
|
|||
optionalDependencies:
|
||||
fsevents: 2.3.3
|
||||
|
||||
tunnel-agent@0.6.0:
|
||||
dependencies:
|
||||
safe-buffer: 5.2.1
|
||||
|
||||
type-check@0.4.0:
|
||||
dependencies:
|
||||
prelude-ls: 1.2.1
|
||||
|
|
@ -5752,8 +5481,6 @@ snapshots:
|
|||
dependencies:
|
||||
react: 19.2.4
|
||||
|
||||
util-deprecate@1.0.2: {}
|
||||
|
||||
vary@1.1.2: {}
|
||||
|
||||
vite@8.0.1(@types/node@24.12.0)(esbuild@0.27.4)(jiti@2.6.1)(tsx@4.21.0):
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
packages:
|
||||
- apps/*
|
||||
- packages/*
|
||||
- data-pipeline
|
||||
allowBuilds:
|
||||
"@swc/core": true
|
||||
better-sqlite3: true
|
||||
esbuild: true
|
||||
|
|
|
|||
205
scripts/README.md
Normal file
205
scripts/README.md
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
# CEFR Data Pipeline
|
||||
|
||||
This directory contains the source data files and extraction/merge pipeline for generating CEFR-enriched datasets. The final outputs (`english-merged.json`, `italian-merged.json`) are consumed by the database seeding process in `packages/db`.
|
||||
|
||||
## Overview
|
||||
|
||||
The pipeline transforms raw vocabulary data from multiple sources into a standardized format, resolves conflicts between sources, and produces an authoritative CEFR dataset per language. This dataset is then used by the lila database package to update translation records.
|
||||
|
||||
## Supported Languages
|
||||
|
||||
- ✅ English (`en`)
|
||||
- ✅ Italian (`it`)
|
||||
|
||||
## Pipeline Stages
|
||||
|
||||
### Stage 1: Extraction
|
||||
|
||||
Each source file is processed by a dedicated extractor script. The extractor reads the source-specific format, normalizes the data, filters for supported parts of speech, and outputs a standardized JSON file.
|
||||
|
||||
**Input:** Raw source files (JSON, CSV, XLS)
|
||||
**Output:** `{source}-extracted.json` files (same directory as source)
|
||||
|
||||
**Normalization rules:**
|
||||
|
||||
- Words are lowercased and trimmed
|
||||
- Part of speech is mapped to supported values (noun, verb)
|
||||
- Entries with unsupported POS are skipped
|
||||
- CEFR levels are validated against A1-C2
|
||||
- Each record includes the source identifier for traceability
|
||||
|
||||
**Extractor Scripts:**
|
||||
|
||||
| Language | Source | Script |
|
||||
| -------- | -------------- | ---------------------------------------------------- |
|
||||
| English | `cefrj.csv` | `extraction-scripts/english/extract-cefrj-csv.py` |
|
||||
| English | `en_m3.xls` | `extraction-scripts/english/extract-en_m3.py` |
|
||||
| English | `octanove.csv` | `extraction-scripts/english/extract-octanove.py` |
|
||||
| English | `random.json` | `extraction-scripts/english/extract-random-json.py` |
|
||||
| Italian | `it_m3.xls` | `extraction-scripts/italian/extract-it_m3.py` |
|
||||
| Italian | `italian.json` | `extraction-scripts/italian/extract-italian-json.py` |
|
||||
|
||||
### Stage 2: Comparison
|
||||
|
||||
Before merging, sources are compared to identify agreements and conflicts. This stage is read-only and serves as a quality gate.
|
||||
|
||||
**Input:** All `{source}-extracted.json` files for a language
|
||||
**Output:** Console report showing:
|
||||
|
||||
- Entry counts per source and CEFR level
|
||||
- Overlap between sources (words appearing in multiple sources)
|
||||
- Agreement rate (sources assigning the same CEFR level)
|
||||
- Conflicts (same word/POS with different CEFR levels)
|
||||
|
||||
**Comparison Scripts:**
|
||||
|
||||
| Language | Script |
|
||||
| -------- | --------------------------------------- |
|
||||
| English | `comparison-scripts/compare-english.py` |
|
||||
| Italian | `comparison-scripts/compare-italian.py` |
|
||||
|
||||
Run from the `scripts/` directory:
|
||||
|
||||
python comparison-scripts/compare-english.py
|
||||
python comparison-scripts/compare-italian.py
|
||||
|
||||
### Stage 3: Merge
|
||||
|
||||
Multiple extracted sources are merged into a single authoritative JSON file per language. When the same word/POS appears in multiple sources with different CEFR levels, the conflict is resolved using a predefined priority order.
|
||||
|
||||
**Input:** All `{source}-extracted.json` files for a language
|
||||
**Output:** `{language}-merged.json` in `datafiles/` (relative to the `scripts/` directory)
|
||||
|
||||
**Merge rules:**
|
||||
|
||||
- Single source: use that source's CEFR level
|
||||
- Multiple sources agree: use the agreed CEFR level
|
||||
- Multiple sources conflict: use the level from the highest-priority source
|
||||
|
||||
**Difficulty derivation:**
|
||||
Difficulty is not extracted from sources. It is derived from the final CEFR level:
|
||||
|
||||
- A1, A2 → easy
|
||||
- B1, B2 → intermediate
|
||||
- C1, C2 → hard
|
||||
|
||||
The merged file includes both CEFR level and derived difficulty, plus a list of sources that contributed to each entry.
|
||||
|
||||
**Merge Scripts & Priorities:**
|
||||
|
||||
| Language | Script | Priority (lowest → highest) |
|
||||
| -------- | ------------------------------------- | -------------------------------------- |
|
||||
| English | `merge-scripts/merge-english-json.py` | `random`, `octanove`, `cefrj`, `en_m3` |
|
||||
| Italian | `merge-scripts/merge-italian-json.py` | `italian`, `it_m3` |
|
||||
|
||||
Run from the `scripts/` directory:
|
||||
|
||||
python merge-scripts/merge-english-json.py
|
||||
python merge-scripts/merge-italian-json.py
|
||||
|
||||
### Stage 4: Enrichment
|
||||
|
||||
The authoritative merged file is consumed by the database package (packages/db) during the seeding or update process. This stage is implemented in TypeScript and is not part of the Python scripts in this directory.
|
||||
|
||||
## File Organization
|
||||
|
||||
```
|
||||
scripts/
|
||||
├── comparison-scripts/
|
||||
│ ├── compare-english.py
|
||||
│ └── compare-italian.py # Stage 2: compare extracted data
|
||||
├── datafiles/
|
||||
│ ├── english-merged.json # Stage 3 output (authoritative)
|
||||
│ ├── italian-merged.json # Stage 3 output (authoritative)
|
||||
│ ├── omw-noun.json
|
||||
│ └── omw-verb.json
|
||||
├── data-sources/
|
||||
│ ├── english/
|
||||
│ │ ├── cefrj.csv
|
||||
│ │ ├── cefrj-extracted.json
|
||||
│ │ ├── en_m3.xls
|
||||
│ │ ├── en_m3-extracted.json
|
||||
│ │ ├── octanove.csv
|
||||
│ │ ├── octanove-extracted.json
|
||||
│ │ ├── random.json
|
||||
│ │ └── random-extracted.json
|
||||
│ ├── french/ # (future)
|
||||
│ ├── german/ # (future)
|
||||
│ ├── italian/
|
||||
│ │ ├── it_m3.xls
|
||||
│ │ ├── it_m3-extracted.json
|
||||
│ │ ├── italian.json
|
||||
│ │ └── italian-extracted.json
|
||||
│ └── spanish/ # (future)
|
||||
├── extraction-scripts/
|
||||
│ ├── english/
|
||||
│ ├── extract-cefrj-csv.py
|
||||
│ ├── extract-en_m3.py
|
||||
│ ├── extract-octanove.py
|
||||
│ └── extract-random-json.py
|
||||
│ └── italian/
|
||||
│ ├── extract-it_m3.py
|
||||
│ └── extract-italian-json.py
|
||||
├── merge-scripts/
|
||||
│ └── merge-english-json.py # Stage 3: merge into authority
|
||||
├── extract-own-save-to-json.py # script to extract words from wordnet
|
||||
├── requirements.txt
|
||||
└── README.md # This file
|
||||
```
|
||||
|
||||
Extracted files are co-located with their sources for easy traceability. Merged files live in `datafiles/` (relative to the `scripts/` directory).
|
||||
|
||||
## Source Priority by Language
|
||||
|
||||
Source priority determines which CEFR level wins when sources conflict. Sources are listed below from highest to lowest priority:
|
||||
|
||||
**English:**
|
||||
|
||||
1. en_m3
|
||||
2. cefrj
|
||||
3. octanove
|
||||
4. random
|
||||
|
||||
**Italian:**
|
||||
|
||||
1. it_m3
|
||||
2. italian
|
||||
|
||||
Priority is defined in the merge configuration. Higher priority sources override lower priority sources when conflicts occur.
|
||||
|
||||
The priority order is defined in each language's merge script (`merge-scripts/merge-english-json.py` and `merge-scripts/merge-italian-json.py`).
|
||||
|
||||
## Data Flow Summary
|
||||
|
||||
```
|
||||
Raw Source → Extracted JSON → Merged JSON → Database
|
||||
(1) (2) (3) (4)
|
||||
```
|
||||
|
||||
1. **Extract:** Transform source formats to normalized records
|
||||
2. **Compare:** Validate source quality and surface conflicts
|
||||
3. **Merge:** Resolve conflicts, derive difficulty, create authority
|
||||
4. **Enrich:** Write to database (handled in packages/db)
|
||||
|
||||
## Adding New Sources
|
||||
|
||||
To add a new source:
|
||||
|
||||
1. Place the raw file in the appropriate `data-sources/{language}/` directory
|
||||
2. Create an extractor script in `extraction-scripts/{language}/`
|
||||
3. Run the extractor to generate `{source}-extracted.json`
|
||||
4. Run comparison to assess coverage and conflicts
|
||||
5. Update source priority in the merge configuration if needed
|
||||
6. Run merge to regenerate the authoritative file
|
||||
7. Run enrichment to update the database
|
||||
|
||||
## Constants and Constraints
|
||||
|
||||
The pipeline respects these constraints from the lila shared constants:
|
||||
|
||||
- **Supported languages:** en, it
|
||||
- **Supported parts of speech:** noun, verb
|
||||
- **CEFR levels:** A1, A2, B1, B2, C1, C2
|
||||
- **Difficulty levels:** easy, intermediate, hard
|
||||
|
||||
Entries violating these constraints are filtered out during extraction.
|
||||
166
scripts/comparison-scripts/compare-english.py
Normal file
166
scripts/comparison-scripts/compare-english.py
Normal file
|
|
@ -0,0 +1,166 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
CEFR Data Pipeline - Stage 2: English Comparison
|
||||
Compares extracted JSON files for English and reports agreements and conflicts.
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
# Supported CEFR levels
|
||||
CEFR_LEVELS = {"A1", "A2", "B1", "B2", "C1", "C2"}
|
||||
|
||||
|
||||
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Load all *-extracted.json files from the English data directory.

    Args:
        data_dir: Directory to scan (non-recursively) for files matching
            ``*-extracted.json``.

    Returns:
        Mapping of source name (the file name without the trailing
        ``-extracted.json``) to the list parsed from that file. Files whose
        top-level JSON value is not a list are skipped with a warning.

    Raises:
        json.JSONDecodeError: If a matching file is not valid JSON.
    """
    suffix = "-extracted"
    sources = {}
    # glob() yields paths in arbitrary filesystem order; sort so that the
    # resulting dict (and every report built from it) is reproducible.
    for file_path in sorted(data_dir.glob("*-extracted.json")):
        # Strip only the trailing marker: str.replace() would also remove
        # "-extracted" from the middle of a source name.
        source_name = file_path.stem[: -len(suffix)]
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            sources[source_name] = data
        else:
            print(f"Warning: {file_path} does not contain a list, skipping.")
    return sources
|
||||
|
||||
|
||||
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Build the (word, pos) key used to match an entry across sources.

    Both fields are lowercased and stripped of surrounding whitespace so that
    differences in casing or padding do not prevent a match.
    """
    word = entry["word"].lower().strip()
    pos = entry["pos"].lower().strip()
    return word, pos
|
||||
|
||||
|
||||
def compute_statistics(sources: Dict[str, List[dict]]) -> dict:
    """Compute overlap, agreement, and conflict statistics.

    Args:
        sources: Mapping of source name to its list of extracted entries.
            Every entry must provide "word" and "pos"; a missing "cefr" is
            treated as the level "UNKNOWN".

    Returns:
        A dict with the keys:
            source_counts: {source: {cefr_level: entry_count}}
            total_entries: number of entries summed over all sources
            unique_words: number of distinct (word, pos) keys
            overlap_distribution: {number_of_sources: key_count}
            agreements: keys found in >1 source that all share one level
            conflicts: keys found in >1 source with differing levels
            conflict_details: list of {"word", "pos", "assignments"} dicts,
                where "assignments" maps source name to its CEFR level

    Raises:
        KeyError: If an entry lacks "word" or "pos".
    """
    source_counts = {}
    # (word, pos) -> {source: cefr}. If one source lists the same key more
    # than once, its last entry wins.
    word_map = defaultdict(dict)
    for src, entries in sources.items():
        cefr_counts = defaultdict(int)
        for e in entries:
            # Use the same fallback for counting and for comparison so an
            # entry without "cefr" is reported instead of raising KeyError.
            cefr = e.get("cefr", "UNKNOWN")
            cefr_counts[cefr] += 1
            word_map[normalize_entry(e)][src] = cefr
        source_counts[src] = dict(cefr_counts)

    total_entries = sum(len(entries) for entries in sources.values())

    overlap_stats = defaultdict(int)
    agreement_count = 0
    conflict_details = []
    for (word, pos), src_cefr_map in word_map.items():
        num_sources = len(src_cefr_map)
        overlap_stats[num_sources] += 1
        if num_sources < 2:
            # A single source can neither agree nor conflict.
            continue
        if len(set(src_cefr_map.values())) == 1:
            agreement_count += 1
        else:
            conflict_details.append(
                {"word": word, "pos": pos, "assignments": dict(src_cefr_map)}
            )

    return {
        "source_counts": source_counts,
        "total_entries": total_entries,
        "unique_words": len(word_map),
        "overlap_distribution": dict(overlap_stats),
        "agreements": agreement_count,
        "conflicts": len(conflict_details),
        "conflict_details": conflict_details,
    }
|
||||
|
||||
|
||||
def print_report(stats: dict, sources: Dict[str, List[dict]]):
    """Print formatted comparison report.

    Args:
        stats: Statistics dict with the keys "source_counts",
            "total_entries", "unique_words", "overlap_distribution",
            "agreements", "conflicts" and "conflict_details".
        sources: Loaded source entries. Not read here; kept so existing
            callers do not need to change.
    """
    print(f"\n{'=' * 60}")
    print("CEFR COMPARISON REPORT - ENGLISH")
    print(f"{'=' * 60}")

    # Source entry counts
    print("\n📊 ENTRIES PER SOURCE AND CEFR LEVEL")
    print("-" * 50)
    for src, counts in stats["source_counts"].items():
        total = sum(counts.values())
        print(f"\n{src}: {total} total entries")
        # CEFR_LEVELS is a set, whose iteration order is not stable between
        # runs; sort so levels always print A1..C2.
        for level in sorted(CEFR_LEVELS):
            cnt = counts.get(level, 0)
            if cnt > 0:
                print(f" {level}: {cnt}")
        # Show non-standard levels
        for level, cnt in counts.items():
            if level not in CEFR_LEVELS and level != "UNKNOWN":
                print(f" {level}: {cnt} (non-standard)")

    # Overlap statistics
    print("\n🔄 OVERLAP BETWEEN SOURCES")
    print("-" * 50)
    print(f"Total unique (word, POS) combinations: {stats['unique_words']}")
    print(f"Total entries across all sources: {stats['total_entries']}")

    overlap = stats["overlap_distribution"]
    for n_sources in sorted(overlap):
        count = overlap[n_sources]
        pct = (count / stats["unique_words"]) * 100
        print(f"Words appearing in {n_sources} source(s): {count} ({pct:.1f}%)")

    # Agreement and conflicts
    multi_source = stats["agreements"] + stats["conflicts"]
    print("\n⚖️ AGREEMENT / CONFLICT SUMMARY")
    print("-" * 50)
    print(f"Words with >1 source: {multi_source}")
    print(f" ✅ Agreements (same CEFR): {stats['agreements']}")
    print(f" ❌ Conflicts (different CEFR): {stats['conflicts']}")

    if stats["conflicts"] > 0:
        agreement_rate = (stats["agreements"] / multi_source) * 100
        print(f" Agreement rate: {agreement_rate:.1f}%")

        details = stats["conflict_details"]
        print("\n📋 CONFLICT DETAILS (first 10 shown):")
        for i, conflict in enumerate(details[:10], start=1):
            print(f" {i}. {conflict['word']} ({conflict['pos']})")
            for src, cefr in conflict["assignments"].items():
                print(f" {src}: {cefr}")
        if len(details) > 10:
            print(f" ... and {len(details) - 10} more conflicts.")

    print(f"\n{'=' * 60}\n")
|
||||
|
||||
|
||||
def main():
    """Load the English extracted files, compare them, and print the report.

    Prints a message and returns early when the data directory is missing or
    contains no usable extracted files.
    """
    # The data lives next to this script's parent: <scripts>/data-sources/english
    english_dir = Path(__file__).parent.parent / "data-sources" / "english"
    if not english_dir.exists():
        print(f"Error: English data directory not found: {english_dir}")
        return

    print(f"Loading extracted files from {english_dir}...")
    loaded = load_extracted_files(english_dir)
    if not loaded:
        print("No extracted files found.")
        return

    print(f"Found sources: {', '.join(loaded)}")
    print_report(compute_statistics(loaded), loaded)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
166
scripts/comparison-scripts/compare-italian.py
Normal file
166
scripts/comparison-scripts/compare-italian.py
Normal file
|
|
@ -0,0 +1,166 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
CEFR Data Pipeline - Stage 2: Italian Comparison
|
||||
Compares extracted JSON files for Italian and reports agreements and conflicts.
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
# Supported CEFR levels
|
||||
CEFR_LEVELS = {"A1", "A2", "B1", "B2", "C1", "C2"}
|
||||
|
||||
|
||||
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Load all *-extracted.json files from the Italian data directory.

    Args:
        data_dir: Directory to scan (non-recursively) for files matching
            ``*-extracted.json``.

    Returns:
        Mapping of source name (the file name without the trailing
        ``-extracted.json``) to the list parsed from that file. Files whose
        top-level JSON value is not a list are skipped with a warning.

    Raises:
        json.JSONDecodeError: If a matching file is not valid JSON.
    """
    suffix = "-extracted"
    sources = {}
    # glob() yields paths in arbitrary filesystem order; sort so that the
    # resulting dict (and every report built from it) is reproducible.
    for file_path in sorted(data_dir.glob("*-extracted.json")):
        # Strip only the trailing marker: str.replace() would also remove
        # "-extracted" from the middle of a source name.
        source_name = file_path.stem[: -len(suffix)]
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            sources[source_name] = data
        else:
            print(f"Warning: {file_path} does not contain a list, skipping.")
    return sources
|
||||
|
||||
|
||||
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Build the (word, pos) key used to match an entry across sources.

    Both fields are lowercased and stripped of surrounding whitespace so that
    differences in casing or padding do not prevent a match.
    """
    word = entry["word"].lower().strip()
    pos = entry["pos"].lower().strip()
    return word, pos
|
||||
|
||||
|
||||
def compute_statistics(sources: Dict[str, List[dict]]) -> dict:
    """Compute overlap, agreement, and conflict statistics.

    Args:
        sources: Mapping of source name to its list of extracted entries.
            Every entry must provide "word" and "pos"; a missing "cefr" is
            treated as the level "UNKNOWN".

    Returns:
        A dict with the keys:
            source_counts: {source: {cefr_level: entry_count}}
            total_entries: number of entries summed over all sources
            unique_words: number of distinct (word, pos) keys
            overlap_distribution: {number_of_sources: key_count}
            agreements: keys found in >1 source that all share one level
            conflicts: keys found in >1 source with differing levels
            conflict_details: list of {"word", "pos", "assignments"} dicts,
                where "assignments" maps source name to its CEFR level

    Raises:
        KeyError: If an entry lacks "word" or "pos".
    """
    source_counts = {}
    # (word, pos) -> {source: cefr}. If one source lists the same key more
    # than once, its last entry wins.
    word_map = defaultdict(dict)
    for src, entries in sources.items():
        cefr_counts = defaultdict(int)
        for e in entries:
            # Use the same fallback for counting and for comparison so an
            # entry without "cefr" is reported instead of raising KeyError.
            cefr = e.get("cefr", "UNKNOWN")
            cefr_counts[cefr] += 1
            word_map[normalize_entry(e)][src] = cefr
        source_counts[src] = dict(cefr_counts)

    total_entries = sum(len(entries) for entries in sources.values())

    overlap_stats = defaultdict(int)
    agreement_count = 0
    conflict_details = []
    for (word, pos), src_cefr_map in word_map.items():
        num_sources = len(src_cefr_map)
        overlap_stats[num_sources] += 1
        if num_sources < 2:
            # A single source can neither agree nor conflict.
            continue
        if len(set(src_cefr_map.values())) == 1:
            agreement_count += 1
        else:
            conflict_details.append(
                {"word": word, "pos": pos, "assignments": dict(src_cefr_map)}
            )

    return {
        "source_counts": source_counts,
        "total_entries": total_entries,
        "unique_words": len(word_map),
        "overlap_distribution": dict(overlap_stats),
        "agreements": agreement_count,
        "conflicts": len(conflict_details),
        "conflict_details": conflict_details,
    }
|
||||
|
||||
|
||||
def print_report(stats: dict, sources: Dict[str, List[dict]]):
    """Print formatted comparison report.

    Args:
        stats: Statistics dict with the keys "source_counts",
            "total_entries", "unique_words", "overlap_distribution",
            "agreements", "conflicts" and "conflict_details".
        sources: Loaded source entries. Not read here; kept so existing
            callers do not need to change.
    """
    print(f"\n{'=' * 60}")
    print("CEFR COMPARISON REPORT - ITALIAN")
    print(f"{'=' * 60}")

    # Source entry counts
    print("\n📊 ENTRIES PER SOURCE AND CEFR LEVEL")
    print("-" * 50)
    for src, counts in stats["source_counts"].items():
        total = sum(counts.values())
        print(f"\n{src}: {total} total entries")
        # CEFR_LEVELS is a set, whose iteration order is not stable between
        # runs; sort so levels always print A1..C2.
        for level in sorted(CEFR_LEVELS):
            cnt = counts.get(level, 0)
            if cnt > 0:
                print(f" {level}: {cnt}")
        # Show non-standard levels
        for level, cnt in counts.items():
            if level not in CEFR_LEVELS and level != "UNKNOWN":
                print(f" {level}: {cnt} (non-standard)")

    # Overlap statistics
    print("\n🔄 OVERLAP BETWEEN SOURCES")
    print("-" * 50)
    print(f"Total unique (word, POS) combinations: {stats['unique_words']}")
    print(f"Total entries across all sources: {stats['total_entries']}")

    overlap = stats["overlap_distribution"]
    for n_sources in sorted(overlap):
        count = overlap[n_sources]
        pct = (count / stats["unique_words"]) * 100
        print(f"Words appearing in {n_sources} source(s): {count} ({pct:.1f}%)")

    # Agreement and conflicts
    multi_source = stats["agreements"] + stats["conflicts"]
    print("\n⚖️ AGREEMENT / CONFLICT SUMMARY")
    print("-" * 50)
    print(f"Words with >1 source: {multi_source}")
    print(f" ✅ Agreements (same CEFR): {stats['agreements']}")
    print(f" ❌ Conflicts (different CEFR): {stats['conflicts']}")

    if stats["conflicts"] > 0:
        agreement_rate = (stats["agreements"] / multi_source) * 100
        print(f" Agreement rate: {agreement_rate:.1f}%")

        details = stats["conflict_details"]
        print("\n📋 CONFLICT DETAILS (first 10 shown):")
        for i, conflict in enumerate(details[:10], start=1):
            print(f" {i}. {conflict['word']} ({conflict['pos']})")
            for src, cefr in conflict["assignments"].items():
                print(f" {src}: {cefr}")
        if len(details) > 10:
            print(f" ... and {len(details) - 10} more conflicts.")

    print(f"\n{'=' * 60}\n")
|
||||
|
||||
|
||||
def main():
    """Compare the extracted Italian CEFR sources and print a report.

    The Italian data directory is resolved relative to this script
    (``../data-sources/italian``). The function returns early, after printing
    a message, when that directory is missing or holds no extracted files.
    """
    italian_dir = Path(__file__).parent.parent / "data-sources" / "italian"

    if not italian_dir.exists():
        print(f"Error: Italian data directory not found: {italian_dir}")
        return

    print(f"Loading extracted files from {italian_dir}...")
    sources = load_extracted_files(italian_dir)
    if not sources:
        print("No extracted files found.")
        return

    # Iterating the mapping yields the source names (its keys).
    print(f"Found sources: {', '.join(sources)}")

    print_report(compute_statistics(sources), sources)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
5932
scripts/data-sources/english/cefrj-extracted.json
Normal file
5932
scripts/data-sources/english/cefrj-extracted.json
Normal file
File diff suppressed because it is too large
Load diff
7800
scripts/data-sources/english/cefrj.csv
Normal file
7800
scripts/data-sources/english/cefrj.csv
Normal file
File diff suppressed because it is too large
Load diff
5458
scripts/data-sources/english/en_m3-extracted.json
Normal file
5458
scripts/data-sources/english/en_m3-extracted.json
Normal file
File diff suppressed because it is too large
Load diff
BIN
scripts/data-sources/english/en_m3.xls
Normal file
BIN
scripts/data-sources/english/en_m3.xls
Normal file
Binary file not shown.
1716
scripts/data-sources/english/octanove-extracted.json
Normal file
1716
scripts/data-sources/english/octanove-extracted.json
Normal file
File diff suppressed because it is too large
Load diff
2137
scripts/data-sources/english/octanove.csv
Normal file
2137
scripts/data-sources/english/octanove.csv
Normal file
File diff suppressed because it is too large
Load diff
15370
scripts/data-sources/english/random-extracted.json
Normal file
15370
scripts/data-sources/english/random-extracted.json
Normal file
File diff suppressed because it is too large
Load diff
186374
scripts/data-sources/english/random.json
Normal file
186374
scripts/data-sources/english/random.json
Normal file
File diff suppressed because it is too large
Load diff
193382
scripts/data-sources/french/french.json
Normal file
193382
scripts/data-sources/french/french.json
Normal file
File diff suppressed because it is too large
Load diff
324482
scripts/data-sources/german/german.json
Normal file
324482
scripts/data-sources/german/german.json
Normal file
File diff suppressed because it is too large
Load diff
3726
scripts/data-sources/italian/it_m3-extracted.json
Normal file
3726
scripts/data-sources/italian/it_m3-extracted.json
Normal file
File diff suppressed because it is too large
Load diff
BIN
scripts/data-sources/italian/it_m3.xls
Normal file
BIN
scripts/data-sources/italian/it_m3.xls
Normal file
Binary file not shown.
13672
scripts/data-sources/italian/italian-extracted.json
Normal file
13672
scripts/data-sources/italian/italian-extracted.json
Normal file
File diff suppressed because it is too large
Load diff
185759
scripts/data-sources/italian/italian.json
Normal file
185759
scripts/data-sources/italian/italian.json
Normal file
File diff suppressed because it is too large
Load diff
163922
scripts/data-sources/spanish/spanish.json
Normal file
163922
scripts/data-sources/spanish/spanish.json
Normal file
File diff suppressed because it is too large
Load diff
120906
scripts/datafiles/english-merged.json
Normal file
120906
scripts/datafiles/english-merged.json
Normal file
File diff suppressed because it is too large
Load diff
85710
scripts/datafiles/italian-merged.json
Normal file
85710
scripts/datafiles/italian-merged.json
Normal file
File diff suppressed because it is too large
Load diff
747568
scripts/datafiles/omw-noun.json
Normal file
747568
scripts/datafiles/omw-noun.json
Normal file
File diff suppressed because it is too large
Load diff
102492
scripts/datafiles/omw-verb.json
Normal file
102492
scripts/datafiles/omw-verb.json
Normal file
File diff suppressed because it is too large
Load diff
149
scripts/extract-own-save-to-json.py
Normal file
149
scripts/extract-own-save-to-json.py
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
"""
|
||||
scripts/extract-own-save-to-json.py
|
||||
|
||||
Extract ALL synsets from Open Multilingual Wordnet (OMW) for every supported
|
||||
language and POS. Replaces extract-en-it-nouns.py.
|
||||
|
||||
Output: one JSON file per POS, written to packages/db/src/data/datafiles/
|
||||
omw-noun.json
|
||||
omw-verb.json
|
||||
|
||||
Each file is a JSON array of objects matching SynsetRecord in seed.ts:
|
||||
{
|
||||
"source_id": "ili:i12345",
|
||||
"pos": "noun",
|
||||
"translations": { "en": ["dog", "canine"], "it": ["cane"] },
|
||||
"glosses": { "en": ["a domesticated animal..."] }
|
||||
}
|
||||
|
||||
Translations and glosses are absent for a language if that wordnet has no
|
||||
coverage for the synset — the seed script handles sparse data gracefully.
|
||||
|
||||
Usage:
|
||||
python scripts/extract-own-save-to-json.py [output_dir]
|
||||
|
||||
output_dir defaults to packages/db/src/data/datafiles/
|
||||
|
||||
Prerequisites:
|
||||
pip install wn
|
||||
python -c "import wn; wn.download('oewn:2024'); wn.download('omw-it:1.4')"
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import wn
|
||||
|
||||
# Mirror constants.ts — update both places if languages or POS change.
|
||||
SUPPORTED_LANGUAGE_CODES: list[str] = ["en", "it"]
|
||||
POS_MAP: dict[str, str] = {
|
||||
"n": "noun",
|
||||
"v": "verb",
|
||||
}
|
||||
|
||||
|
||||
def extract_all(output_dir: str = "packages/db/src/data/datafiles/") -> None:
    """Write one ``omw-<pos>.json`` file per entry in ``POS_MAP``.

    A wordnet is opened for every code in ``SUPPORTED_LANGUAGE_CODES``; if any
    of them fails to load, a download hint is printed and the process exits
    with status 1. For each part of speech, synsets from all languages are
    grouped by their ILI and emitted as records sorted by ILI.

    Args:
        output_dir: Directory receiving the JSON files; created if missing.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Open every wordnet first so a missing download fails before any output.
    print("Loading wordnets...")
    wordnets: dict[str, wn.Wordnet] = {}
    for lang in SUPPORTED_LANGUAGE_CODES:
        try:
            wordnets[lang] = wn.Wordnet(lang=lang)
            synset_count = len(wordnets[lang].synsets())
            print(f" {lang}: {synset_count:,} total synsets")
        except wn.Error as e:
            print(f" ERROR loading {lang}: {e}")
            print(f" Run: python -c \"import wn; wn.download('omw-{lang}:1.4')\"")
            sys.exit(1)

    for omw_pos, pos_label in POS_MAP.items():
        print(f"\n--- Extracting {pos_label}s (pos='{omw_pos}') ---")

        # { ili -> { lang -> { "lemmas": [...], "glosses": [...] } } }
        per_ili: dict[str, dict[str, dict[str, list[str]]]] = {}

        for lang, wordnet in wordnets.items():
            linked = 0
            for synset in wordnet.synsets(pos=omw_pos):
                ili = synset.ili
                if not ili:
                    # Without an ILI the synset cannot be cross-linked.
                    continue
                linked += 1
                per_ili.setdefault(ili, {})[lang] = {
                    "lemmas": [str(lemma) for lemma in synset.lemmas()],
                    "glosses": [d for d in synset.definitions() if d],
                }
            print(f" {lang}: {linked:,} {pos_label} synsets with ILI")

        # Sorted by ILI so the output file is stable and diffable. Records
        # are kept even when only one language covers the synset.
        records: list[dict] = [
            {
                "source_id": f"ili:{ili}",
                "pos": pos_label,
                "translations": {
                    lang: entry["lemmas"]
                    for lang, entry in per_ili[ili].items()
                    if entry["lemmas"]
                },
                "glosses": {
                    lang: entry["glosses"]
                    for lang, entry in per_ili[ili].items()
                    if entry["glosses"]
                },
            }
            for ili in sorted(per_ili)
        ]

        output_file = target_dir / f"omw-{pos_label}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(records, f, indent=2, ensure_ascii=False)

        print(f"\nWrote {len(records):,} {pos_label} synsets → {output_file}")
        _print_coverage(records, pos_label)
|
||||
|
||||
|
||||
def _print_coverage(records: list[dict], pos_label: str) -> None:
    """Print per-language translation and gloss totals, plus a small sample.

    Only languages listed in ``SUPPORTED_LANGUAGE_CODES`` are tallied; the
    sample shows the records at positions 1000-1004 when that many exist.
    """
    totals: dict[str, dict[str, int]] = {
        lang: {"translations": 0, "glosses": 0} for lang in SUPPORTED_LANGUAGE_CODES
    }

    for record in records:
        for field in ("translations", "glosses"):
            for lang, values in record[field].items():
                if lang in totals:
                    totals[lang][field] += len(values)

    print(f"\nCoverage for {pos_label}s:")
    for lang, counts in totals.items():
        t = counts["translations"]
        g = counts["glosses"]
        # Guard against an empty record list when averaging.
        avg_t = t / len(records) if records else 0
        print(f" {lang}: {t:,} translations ({avg_t:.1f} avg/synset), {g:,} glosses")

    # Sample output
    print(f"\nSample {pos_label}s (records 1000–1004):")
    for sample in records[1000:1005]:
        print(f" {sample['source_id']}: {sample['translations']}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
output_dir = sys.argv[1] if len(sys.argv) > 1 else "packages/db/src/data/datafiles/"
|
||||
extract_all(output_dir)
|
||||
96
scripts/extraction-scripts/english/extract-cefrj-csv.py
Normal file
96
scripts/extraction-scripts/english/extract-cefrj-csv.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
scripts/extraction-scripts/english/extract-cefrj-csv.py
|
||||
|
||||
Extracts CEFR data from cefrj.csv (CEFR-J vocabulary profile).
|
||||
Filters for supported POS (noun, verb).
|
||||
|
||||
Input: scripts/data-sources/english/cefrj.csv
|
||||
Output: scripts/data-sources/english/cefrj-extracted.json
|
||||
|
||||
Output format (normalized):
|
||||
[
|
||||
{ "word": "ability", "pos": "noun", "cefr": "A2", "source": "cefrj" }
|
||||
]
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Constants matching @lila/shared
|
||||
SUPPORTED_POS = ["noun", "verb"]
|
||||
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
|
||||
# Paths (relative to project root)
|
||||
INPUT_FILE = Path("scripts/data-sources/english/cefrj.csv")
|
||||
OUTPUT_FILE = Path("scripts/data-sources/english/cefrj-extracted.json")
|
||||
|
||||
|
||||
def extract() -> None:
    """Extract noun/verb CEFR entries from the CEFR-J CSV into normalized JSON.

    Reads ``INPUT_FILE`` (columns ``headword``, ``pos``, ``CEFR``), keeps rows
    whose POS is in ``SUPPORTED_POS`` and whose level is in ``CEFR_LEVELS``,
    writes the records to ``OUTPUT_FILE`` and prints a summary. Rows are
    filtered in the order POS, CEFR, word; each skipped row is counted once,
    under the first filter it fails.
    """
    print(f"Reading: {INPUT_FILE}")

    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0

    # newline="" is what the csv module expects so it can handle embedded
    # newlines itself; utf-8-sig drops a leading BOM (if any) that would
    # otherwise become part of the first header name.
    with open(INPUT_FILE, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            total_rows += 1

            # DictReader fills the missing fields of a short row with None,
            # so the .get() default never applies there; `or ""` covers both
            # an absent column and a None value.
            pos = (row.get("pos") or "").lower().strip()
            if pos not in SUPPORTED_POS:
                skipped_pos += 1
                continue

            cefr = (row.get("CEFR") or "").upper().strip()
            if cefr not in CEFR_LEVELS:
                skipped_invalid_cefr += 1
                continue

            word = (row.get("headword") or "").lower().strip()
            if not word:
                skipped_empty_word += 1
                continue

            records.append({"word": word, "pos": pos, "cefr": cefr, "source": "cefrj"})

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats, tallied in a single pass over the records.
    noun_count = sum(1 for r in records if r["pos"] == "noun")
    verb_count = sum(1 for r in records if r["pos"] == "verb")

    cefr_distribution = {}
    for r in records:
        cefr_distribution[r["cefr"]] = cefr_distribution.get(r["cefr"], 0) + 1

    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {noun_count}")
    print(f" - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if level in cefr_distribution:
            print(f" - {level}: {cefr_distribution[level]}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
extract()
|
||||
107
scripts/extraction-scripts/english/extract-en_m3.py
Normal file
107
scripts/extraction-scripts/english/extract-en_m3.py
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
scripts/extraction-scripts/english/extract-en_m3.py
|
||||
|
||||
Extracts CEFR data from en_m3.xls (M3 wordlist).
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import xlrd
|
||||
|
||||
# Constants matching @lila/shared
|
||||
SUPPORTED_POS = ["noun", "verb"]
|
||||
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
|
||||
# POS mapping (case-insensitive)
|
||||
POS_MAP = {
|
||||
"noun": "noun",
|
||||
"verb": "verb",
|
||||
}
|
||||
|
||||
# Paths (relative to project root)
|
||||
INPUT_FILE = Path("scripts/data-sources/english/en_m3.xls")
|
||||
OUTPUT_FILE = Path("scripts/data-sources/english/en_m3-extracted.json")
|
||||
|
||||
|
||||
def extract() -> None:
    """Extract noun/verb CEFR entries from the M3 spreadsheet into JSON.

    Reads the first sheet of ``INPUT_FILE`` (row 0 is the header; columns 1-3
    hold word, part of speech and CEFR level), keeps rows with a POS found in
    ``POS_MAP`` and a level found in ``CEFR_LEVELS``, writes the records to
    ``OUTPUT_FILE`` and prints a summary.
    """

    def cell_text(value) -> str:
        # Falsy cells (empty string, 0) are treated as missing.
        return str(value) if value else ""

    print(f"Reading: {INPUT_FILE}")

    sheet = xlrd.open_workbook(INPUT_FILE).sheet_by_index(0)

    # Skip header row, start from row 1
    data_rows = range(1, sheet.nrows)
    total_rows = len(data_rows)

    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for row_idx in data_rows:
        # Columns: ID number, Word, Part of Speech, CEFR, Points
        word_raw = sheet.cell_value(row_idx, 1)
        pos_raw = sheet.cell_value(row_idx, 2)
        cefr_raw = sheet.cell_value(row_idx, 3)

        pos_key = cell_text(pos_raw).lower().strip()
        if pos_key not in POS_MAP:
            skipped_pos += 1
            continue

        # Some cells wrap the level in Unicode smart quotes (U+201C/U+201D).
        cefr = cell_text(cefr_raw).strip().strip("\u201c\u201d").upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        word = cell_text(word_raw).lower().strip()
        if not word:
            skipped_empty_word += 1
            continue

        records.append(
            {"word": word, "pos": POS_MAP[pos_key], "cefr": cefr, "source": "en_m3"}
        )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats
    noun_count = len([r for r in records if r["pos"] == "noun"])
    verb_count = len([r for r in records if r["pos"] == "verb"])

    cefr_distribution = {}
    for level in CEFR_LEVELS:
        matching = len([r for r in records if r["cefr"] == level])
        if matching > 0:
            cefr_distribution[level] = matching

    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {noun_count}")
    print(f" - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    for level, matching in cefr_distribution.items():
        print(f" - {level}: {matching}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
extract()
|
||||
90
scripts/extraction-scripts/english/extract-octanove.py
Normal file
90
scripts/extraction-scripts/english/extract-octanove.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
scripts/extraction-scripts/english/extract-octanove.py
|
||||
Extracts CEFR data from octanove.csv (Octanove vocabulary profile).
|
||||
Filters for supported POS (noun, verb).
|
||||
Input: scripts/data-sources/english/octanove.csv
|
||||
Output: scripts/data-sources/english/octanove-extracted.json
|
||||
Output format (normalized):
|
||||
[
|
||||
{ "word": "example", "pos": "noun", "cefr": "C1", "source": "octanove" }
|
||||
]
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Constants matching @lila/shared
|
||||
SUPPORTED_POS = ["noun", "verb"]
|
||||
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
|
||||
# Paths (relative to project root)
|
||||
INPUT_FILE = Path("scripts/data-sources/english/octanove.csv")
|
||||
OUTPUT_FILE = Path("scripts/data-sources/english/octanove-extracted.json")
|
||||
|
||||
|
||||
def extract() -> None:
    """Extract noun/verb CEFR entries from the Octanove CSV into JSON.

    Reads ``INPUT_FILE`` (columns ``headword``, ``pos``, ``CEFR``), keeps rows
    whose POS is in ``SUPPORTED_POS`` and whose level is in ``CEFR_LEVELS``,
    writes the records to ``OUTPUT_FILE`` and prints a summary. Rows are
    filtered in the order POS, CEFR, word; each skipped row is counted once,
    under the first filter it fails.
    """
    print(f"Reading: {INPUT_FILE}")

    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0
    total_rows = 0

    # newline="" is what the csv module expects so it can handle embedded
    # newlines itself; utf-8-sig drops a leading BOM (if any) that would
    # otherwise become part of the first header name.
    with open(INPUT_FILE, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            total_rows += 1

            # DictReader fills the missing fields of a short row with None,
            # so the .get() default never applies there; `or ""` covers both
            # an absent column and a None value.
            pos = (row.get("pos") or "").lower().strip()
            if pos not in SUPPORTED_POS:
                skipped_pos += 1
                continue

            cefr = (row.get("CEFR") or "").upper().strip()
            if cefr not in CEFR_LEVELS:
                skipped_invalid_cefr += 1
                continue

            word = (row.get("headword") or "").lower().strip()
            if not word:
                skipped_empty_word += 1
                continue

            records.append(
                {"word": word, "pos": pos, "cefr": cefr, "source": "octanove"}
            )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats, tallied in a single pass over the records.
    noun_count = sum(1 for r in records if r["pos"] == "noun")
    verb_count = sum(1 for r in records if r["pos"] == "verb")

    cefr_distribution = {}
    for r in records:
        cefr_distribution[r["cefr"]] = cefr_distribution.get(r["cefr"], 0) + 1

    print(f"\nTotal rows in CSV: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {noun_count}")
    print(f" - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if level in cefr_distribution:
            print(f" - {level}: {cefr_distribution[level]}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
extract()
|
||||
99
scripts/extraction-scripts/english/extract-random-json.py
Normal file
99
scripts/extraction-scripts/english/extract-random-json.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
scripts/extraction-scripts/english/extract-random-json.py
|
||||
|
||||
Extracts CEFR data from random.json (English flashcard source).
|
||||
Filters for useful_for_flashcard=true and supported POS (noun, verb).
|
||||
|
||||
Input: scripts/data-sources/english/random.json
|
||||
Output: scripts/data-sources/english/random-extracted.json
|
||||
|
||||
Output format (normalized):
|
||||
[
|
||||
{ "word": "be", "pos": "verb", "cefr": "A1", "source": "random" }
|
||||
]
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Constants matching @lila/shared
|
||||
SUPPORTED_POS = ["noun", "verb"]
|
||||
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
|
||||
# Paths (relative to project root)
|
||||
INPUT_FILE = Path("scripts/data-sources/english/random.json")
|
||||
OUTPUT_FILE = Path("scripts/data-sources/english/random-extracted.json")
|
||||
|
||||
|
||||
def extract() -> None:
    """Extract noun/verb CEFR entries from random.json into normalized JSON.

    Loads the entry list from ``INPUT_FILE``, keeps entries flagged
    ``useful_for_flashcard`` whose POS is in ``SUPPORTED_POS`` and whose
    ``cefr_level`` is in ``CEFR_LEVELS``, writes the records to
    ``OUTPUT_FILE`` and prints a summary. Each skipped entry is counted once,
    under the first filter it fails.
    """

    def text_field(entry: dict, key: str) -> str:
        # dict.get's default only applies when the key is absent; a JSON null
        # (or any other non-string value) must also count as "no value"
        # instead of crashing on .lower()/.upper().
        value = entry.get(key)
        return value if isinstance(value, str) else ""

    print(f"Reading: {INPUT_FILE}")

    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    skipped_pos = 0
    skipped_not_useful = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for entry in data:
        # Filter: must be useful for flashcard
        if not entry.get("useful_for_flashcard", False):
            skipped_not_useful += 1
            continue

        # Filter: must have supported POS
        pos = text_field(entry, "pos").lower().strip()
        if pos not in SUPPORTED_POS:
            skipped_pos += 1
            continue

        # Filter: must have valid CEFR level
        cefr = text_field(entry, "cefr_level").upper().strip()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Normalize word
        word = text_field(entry, "word").lower().strip()
        if not word:
            skipped_empty_word += 1
            continue

        records.append({"word": word, "pos": pos, "cefr": cefr, "source": "random"})

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats, tallied in a single pass over the records.
    noun_count = sum(1 for r in records if r["pos"] == "noun")
    verb_count = sum(1 for r in records if r["pos"] == "verb")

    cefr_distribution = {}
    for r in records:
        cefr_distribution[r["cefr"]] = cefr_distribution.get(r["cefr"], 0) + 1

    print(f"\nExtracted: {len(records)} records")
    print(f" - Nouns: {noun_count}")
    print(f" - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if level in cefr_distribution:
            print(f" - {level}: {cefr_distribution[level]}")

    print("\nSkipped:")
    print(f" - Not useful for flashcard: {skipped_not_useful}")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
extract()
|
||||
114
scripts/extraction-scripts/italian/extract-it_m3.py
Normal file
114
scripts/extraction-scripts/italian/extract-it_m3.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
scripts/extraction-scripts/italian/extract-it_m3.py
|
||||
|
||||
Extracts CEFR data from it_m3.xls (Italian M3 wordlist).
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import xlrd
|
||||
|
||||
# Constants matching @lila/shared
|
||||
SUPPORTED_POS = ["noun", "verb"]
|
||||
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
|
||||
# POS mapping (case-insensitive) – based on observed abbreviations
|
||||
POS_MAP = {
|
||||
"n": "noun", # nome
|
||||
"v": "verb", # verbo
|
||||
}
|
||||
|
||||
# Column indices (0-based) – verified from sample
|
||||
WORD_COL = 0 # Lemma
|
||||
POS_COL = 1 # Pos
|
||||
CEFR_COL = 2 # Points (CEFR level)
|
||||
|
||||
# Paths (relative to project root)
|
||||
INPUT_FILE = Path("scripts/data-sources/italian/it_m3.xls")
|
||||
OUTPUT_FILE = Path("scripts/data-sources/italian/it_m3-extracted.json")
|
||||
|
||||
|
||||
def extract() -> None:
    """Extract noun/verb CEFR entries from the Italian M3 spreadsheet.

    Reads the first sheet of ``INPUT_FILE`` (row 0 is the header; the word,
    POS and CEFR cells come from ``WORD_COL``, ``POS_COL`` and ``CEFR_COL``),
    keeps rows with a POS found in ``POS_MAP`` and a level found in
    ``CEFR_LEVELS``, writes the records to ``OUTPUT_FILE`` and prints a
    summary.
    """

    def cell_text(value) -> str:
        # Falsy cells (empty string, 0) are treated as missing.
        return str(value) if value else ""

    print(f"Reading: {INPUT_FILE}")

    sheet = xlrd.open_workbook(INPUT_FILE).sheet_by_index(0)

    # Skip header row, start from row 1
    data_rows = range(1, sheet.nrows)
    total_rows = len(data_rows)

    records = []
    skipped_pos = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for row_idx in data_rows:
        word_raw = sheet.cell_value(row_idx, WORD_COL)
        pos_raw = sheet.cell_value(row_idx, POS_COL)
        cefr_raw = sheet.cell_value(row_idx, CEFR_COL)

        pos_key = cell_text(pos_raw).lower().strip()
        if pos_key not in POS_MAP:
            skipped_pos += 1
            continue

        # Some cells wrap the level in Unicode smart quotes (U+201C/U+201D).
        cefr = cell_text(cefr_raw).strip().strip("\u201c\u201d").upper()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Multi-form lemmas such as "il, lo, la" are kept whole (not split on
        # the comma); the string is only trimmed and lowercased.
        word = cell_text(word_raw).strip().lower()
        if not word:
            skipped_empty_word += 1
            continue

        records.append(
            {"word": word, "pos": POS_MAP[pos_key], "cefr": cefr, "source": "it_m3"}
        )

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats
    noun_count = len([r for r in records if r["pos"] == "noun"])
    verb_count = len([r for r in records if r["pos"] == "verb"])

    cefr_distribution = {}
    for level in CEFR_LEVELS:
        matching = len([r for r in records if r["cefr"] == level])
        if matching > 0:
            cefr_distribution[level] = matching

    print(f"\nTotal rows in XLS: {total_rows}")
    print(f"Extracted: {len(records)} records")
    print(f" - Nouns: {noun_count}")
    print(f" - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    for level, matching in cefr_distribution.items():
        print(f" - {level}: {matching}")

    print("\nSkipped:")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
extract()
|
||||
91
scripts/extraction-scripts/italian/extract-random-json.py
Normal file
91
scripts/extraction-scripts/italian/extract-random-json.py
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
scripts/extraction-scripts/italian/extract-random-json.py
|
||||
|
||||
Extracts CEFR data from italian.json (Italian flashcard source).
|
||||
Filters for useful_for_flashcard=true and supported POS (noun, verb).
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Constants matching @lila/shared
|
||||
SUPPORTED_POS = ["noun", "verb"]
|
||||
CEFR_LEVELS = ["A1", "A2", "B1", "B2", "C1", "C2"]
|
||||
|
||||
# Paths (relative to project root)
|
||||
INPUT_FILE = Path("scripts/data-sources/italian/italian.json")
|
||||
OUTPUT_FILE = Path("scripts/data-sources/italian/italian-extracted.json")
|
||||
|
||||
|
||||
def extract() -> None:
    """Extract noun/verb CEFR entries from italian.json into normalized JSON.

    Loads the entry list from ``INPUT_FILE``, keeps entries flagged
    ``useful_for_flashcard`` whose POS is in ``SUPPORTED_POS`` and whose
    ``cefr_level`` is in ``CEFR_LEVELS``, writes the records to
    ``OUTPUT_FILE`` and prints a summary. Each skipped entry is counted once,
    under the first filter it fails.
    """

    def text_field(entry: dict, key: str) -> str:
        # dict.get's default only applies when the key is absent; a JSON null
        # (or any other non-string value) must also count as "no value"
        # instead of crashing on .lower()/.upper().
        value = entry.get(key)
        return value if isinstance(value, str) else ""

    print(f"Reading: {INPUT_FILE}")

    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    skipped_pos = 0
    skipped_not_useful = 0
    skipped_invalid_cefr = 0
    skipped_empty_word = 0

    for entry in data:
        # Filter: must be useful for flashcard
        if not entry.get("useful_for_flashcard", False):
            skipped_not_useful += 1
            continue

        # Filter: must have supported POS
        pos = text_field(entry, "pos").lower().strip()
        if pos not in SUPPORTED_POS:
            skipped_pos += 1
            continue

        # Filter: must have valid CEFR level
        cefr = text_field(entry, "cefr_level").upper().strip()
        if cefr not in CEFR_LEVELS:
            skipped_invalid_cefr += 1
            continue

        # Normalize word
        word = text_field(entry, "word").lower().strip()
        if not word:
            skipped_empty_word += 1
            continue

        records.append({"word": word, "pos": pos, "cefr": cefr, "source": "italian"})

    # Write output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

    # Stats, tallied in a single pass over the records.
    noun_count = sum(1 for r in records if r["pos"] == "noun")
    verb_count = sum(1 for r in records if r["pos"] == "verb")

    cefr_distribution = {}
    for r in records:
        cefr_distribution[r["cefr"]] = cefr_distribution.get(r["cefr"], 0) + 1

    print(f"\nExtracted: {len(records)} records")
    print(f" - Nouns: {noun_count}")
    print(f" - Verbs: {verb_count}")
    print("\nCEFR distribution:")
    for level in CEFR_LEVELS:
        if level in cefr_distribution:
            print(f" - {level}: {cefr_distribution[level]}")

    print("\nSkipped:")
    print(f" - Not useful for flashcard: {skipped_not_useful}")
    print(f" - Unsupported POS: {skipped_pos}")
    print(f" - Invalid CEFR: {skipped_invalid_cefr}")
    print(f" - Empty word: {skipped_empty_word}")
    print(f"\nOutput: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
extract()
|
||||
58
scripts/gametest/test-game.ts
Normal file
58
scripts/gametest/test-game.ts
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
/**
 * Manual smoke test against a locally running API (localhost:3000).
 *
 * Starts a game, answers every question with option 0, then sends one
 * incomplete start request and one answer for an unknown session, logging
 * the status and body of each of those two responses.
 */
async function main() {
  // Every call in this script is a JSON POST to the game API.
  const apiBase = "http://localhost:3000/api/v1/game";
  const postJson = (endpoint: string, payload: unknown) =>
    fetch(`${apiBase}/${endpoint}`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
    });

  const pickedOption = 0;
  const unknownId = "00000000-0000-0000-0000-000000000000";

  // Step 1: start a game
  const startResponse = await postJson("start", {
    source_language: "en",
    target_language: "it",
    pos: "noun",
    difficulty: "easy",
    rounds: "3",
  });
  const game = await startResponse.json();
  console.log("Game started:", JSON.stringify(game, null, 2));

  // Step 2: answer each question (always pick the same option)
  for (const question of game.data.questions) {
    const answerResponse = await postJson("answer", {
      sessionId: game.data.sessionId,
      questionId: question.questionId,
      selectedOptionId: pickedOption,
    });
    const result = await answerResponse.json();
    console.log("Raw result:", JSON.stringify(result, null, 2));

    const mark = result.data.isCorrect ? "✓" : "✗";
    console.log(
      `${question.prompt}: ${mark} (picked ${pickedOption}, correct was ${result.data.correctOptionId})`,
    );
  }

  // Incomplete payload: only one of the start fields is sent.
  const badRequest = await postJson("start", { source_language: "en" });
  console.log("400 test:", badRequest.status, await badRequest.json());

  // Send a valid shape but a session that doesn't exist
  const notFound = await postJson("answer", {
    sessionId: unknownId,
    questionId: unknownId,
    selectedOptionId: pickedOption,
  });
  console.log("404 test:", notFound.status, await notFound.json());
}
|
||||
|
||||
main();
|
||||
159
scripts/merge-scripts/merge-english-json.py
Normal file
159
scripts/merge-scripts/merge-english-json.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
CEFR Data Pipeline - Stage 3: English Merge
|
||||
Merges extracted JSON files for English into an authoritative dataset.
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
# CEFR level -> coarse difficulty bucket used by the game.
DIFFICULTY_MAP = {
    "A1": "easy",
    "A2": "easy",
    "B1": "intermediate",
    "B2": "intermediate",
    "C1": "hard",
    "C2": "hard",
}

# Every supported CEFR level; derived from the mapping so the two stay in sync.
CEFR_LEVELS = set(DIFFICULTY_MAP)

# Source names ordered from least to most authoritative: when sources
# disagree on a level, the one with the higher index wins.
PRIORITY_ORDER = ["random", "octanove", "cefrj", "en_m3"]
|
||||
|
||||
|
||||
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Load all *-extracted.json files from the English data directory.

    Files are visited in sorted filename order so the returned mapping, and
    everything derived from its iteration order, is reproducible across runs
    and filesystems (``Path.glob`` yields entries in arbitrary order).

    Args:
        data_dir: Directory containing the ``<source>-extracted.json`` files.

    Returns:
        Mapping of source name (the file stem without its trailing
        ``-extracted``) to the list parsed from that file. A file whose
        top-level JSON value is not a list is reported and skipped.

    Raises:
        json.JSONDecodeError: If a matching file is not valid JSON.
    """
    marker = "-extracted"
    sources: Dict[str, List[dict]] = {}
    for file_path in sorted(data_dir.glob(f"*{marker}.json")):
        # Drop only the trailing marker; str.replace would also remove an
        # "-extracted" that happens to occur inside the source name.
        source_name = file_path.stem[: -len(marker)]
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            sources[source_name] = data
        else:
            print(f"Warning: {file_path} does not contain a list, skipping.")
    return sources
|
||||
|
||||
|
||||
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Return (word, pos) key for merging.

    Both fields are Unicode-normalized to NFC, lower-cased and stripped of
    surrounding whitespace. The NFC step makes a composed and a decomposed
    spelling of the same word (for example "é" versus "e" followed by a
    combining accent) produce the same key instead of two separate entries.

    Args:
        entry: Source record with string values under "word" and "pos".

    Returns:
        The normalized ``(word, pos)`` tuple.

    Raises:
        KeyError: If "word" or "pos" is missing from *entry*.
    """
    # Imported locally because the module header does not import unicodedata.
    import unicodedata

    def _clean(text: str) -> str:
        return unicodedata.normalize("NFC", text).lower().strip()

    return _clean(entry["word"]), _clean(entry["pos"])
|
||||
|
||||
|
||||
def get_source_priority(source_name: str) -> int:
    """Rank a source by its position in PRIORITY_ORDER.

    Returns the index of *source_name* in PRIORITY_ORDER (a larger index
    means a more authoritative source), or -1 when the source is not listed
    so that it ranks below every known source.
    """
    if source_name not in PRIORITY_ORDER:
        return -1
    return PRIORITY_ORDER.index(source_name)
|
||||
|
||||
|
||||
def merge_entries(sources: Dict[str, List[dict]]) -> List[dict]:
    """Merge entries from multiple sources, resolving conflicts by priority.

    Entries are grouped by their normalized (word, pos) key. For each group
    the CEFR level of the highest-priority source wins; among candidates of
    equal priority the one encountered first is kept. Merge statistics are
    printed to stdout.

    Args:
        sources: Mapping of source name to its list of entries. Each entry
            must provide the keys "word", "pos" and "cefr".

    Returns:
        One dict per unique (word, pos), in first-seen order, with the keys
        "word", "pos", "cefr", "difficulty" and "sources" (sorted, without
        duplicates).

    Raises:
        KeyError: If an entry lacks "word", "pos" or "cefr".
    """
    # key -> [(source name, CEFR level), ...] in encounter order
    grouped = defaultdict(list)
    for src_name, entries in sources.items():
        for entry in entries:
            grouped[normalize_entry(entry)].append((src_name, entry["cefr"]))

    merged = []
    conflicts_resolved = 0
    total_multi_source = 0

    for (word, pos), candidates in grouped.items():
        # A set, so repeated rows from one source are neither listed twice
        # nor mistaken for agreement between several sources.
        contributing_sources = {src for src, _ in candidates}
        if len(contributing_sources) > 1:
            total_multi_source += 1
        if len({cefr for _, cefr in candidates}) > 1:
            conflicts_resolved += 1

        # max() returns the first maximal element, which matches picking the
        # head of a stable descending sort by priority.
        _, final_cefr = max(candidates, key=lambda c: get_source_priority(c[0]))

        merged.append(
            {
                "word": word,
                "pos": pos,
                "cefr": final_cefr,
                "difficulty": DIFFICULTY_MAP.get(final_cefr, "unknown"),
                "sources": sorted(contributing_sources),
            }
        )

    print("Merge statistics:")
    print(f" Total unique entries: {len(merged)}")
    print(f" Entries with multiple sources: {total_multi_source}")
    print(f" Conflicts resolved by priority: {conflicts_resolved}")

    return merged
|
||||
|
||||
|
||||
def print_summary(merged: List[dict]):
    """Print distribution of CEFR levels and difficulty in final dataset.

    Recognized CEFR levels are listed only when they occur; the three
    difficulty buckets are always listed. Entries whose level or difficulty
    falls outside the recognized values are reported on an extra line when
    present, so the printed counts always add up to ``len(merged)``.

    Args:
        merged: Merged entries, each with the keys "cefr" and "difficulty".
    """
    cefr_counts = defaultdict(int)
    diff_counts = defaultdict(int)
    for entry in merged:
        cefr_counts[entry["cefr"]] += 1
        diff_counts[entry["difficulty"]] += 1

    print("\n📊 Final CEFR distribution:")
    for level in sorted(CEFR_LEVELS):
        count = cefr_counts.get(level, 0)
        if count:
            print(f" {level}: {count}")
    unrecognized = sum(
        count for level, count in cefr_counts.items() if level not in CEFR_LEVELS
    )
    if unrecognized:
        print(f" unrecognized: {unrecognized}")

    print("\n📊 Final difficulty distribution:")
    buckets = ["easy", "intermediate", "hard"]
    for diff in buckets:
        count = diff_counts.get(diff, 0)
        print(f" {diff}: {count}")
    # The merge step labels levels it cannot map as "unknown"; surface them
    # instead of silently dropping them from the report.
    leftover = sum(count for diff, count in diff_counts.items() if diff not in buckets)
    if leftover:
        print(f" unknown: {leftover}")
|
||||
|
||||
|
||||
def main():
    """Merge the extracted English sources into datafiles/english-merged.json.

    Reads ``../data-sources/english/*-extracted.json`` relative to this
    script, merges the sources by priority, writes the result as UTF-8 JSON
    and prints a summary.

    Raises:
        SystemExit: With the error message (printed to stderr, exit status 1)
            when the data directory is missing or contains no usable
            extracted files, so calling pipelines can detect the failure.
    """
    script_dir = Path(__file__).parent
    data_dir = script_dir.parent / "data-sources" / "english"
    output_dir = script_dir.parent / "datafiles"
    output_file = output_dir / "english-merged.json"

    if not data_dir.is_dir():
        raise SystemExit(f"Error: English data directory not found: {data_dir}")

    print(f"Loading extracted files from {data_dir}...")
    sources = load_extracted_files(data_dir)

    if not sources:
        raise SystemExit("No extracted files found.")

    print(f"Found sources: {', '.join(sources.keys())}")
    print(f"Priority order (lowest to highest): {PRIORITY_ORDER}")

    merged = merge_entries(sources)

    # Create the output directory only once there is something to write.
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Merged dataset written to: {output_file}")
    print_summary(merged)


if __name__ == "__main__":
    main()
|
||||
159
scripts/merge-scripts/merge-italian-json.py
Normal file
159
scripts/merge-scripts/merge-italian-json.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
CEFR Data Pipeline - Stage 3: Italian Merge
|
||||
Merges extracted JSON files for Italian into an authoritative dataset.
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
# CEFR level -> coarse difficulty bucket used by the game.
DIFFICULTY_MAP = {
    "A1": "easy",
    "A2": "easy",
    "B1": "intermediate",
    "B2": "intermediate",
    "C1": "hard",
    "C2": "hard",
}

# Every supported CEFR level; derived from the mapping so the two stay in sync.
CEFR_LEVELS = set(DIFFICULTY_MAP)

# Source names ordered from least to most authoritative: when sources
# disagree on a level, the one with the higher index wins.
PRIORITY_ORDER = ["italian", "it_m3"]
|
||||
|
||||
|
||||
def load_extracted_files(data_dir: Path) -> Dict[str, List[dict]]:
    """Load all *-extracted.json files from the Italian data directory.

    Files are visited in sorted filename order so the returned mapping, and
    everything derived from its iteration order, is reproducible across runs
    and filesystems (``Path.glob`` yields entries in arbitrary order).

    Args:
        data_dir: Directory containing the ``<source>-extracted.json`` files.

    Returns:
        Mapping of source name (the file stem without its trailing
        ``-extracted``) to the list parsed from that file. A file whose
        top-level JSON value is not a list is reported and skipped.

    Raises:
        json.JSONDecodeError: If a matching file is not valid JSON.
    """
    marker = "-extracted"
    sources: Dict[str, List[dict]] = {}
    for file_path in sorted(data_dir.glob(f"*{marker}.json")):
        # Drop only the trailing marker; str.replace would also remove an
        # "-extracted" that happens to occur inside the source name.
        source_name = file_path.stem[: -len(marker)]
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            sources[source_name] = data
        else:
            print(f"Warning: {file_path} does not contain a list, skipping.")
    return sources
|
||||
|
||||
|
||||
def normalize_entry(entry: dict) -> Tuple[str, str]:
    """Return (word, pos) key for merging.

    Both fields are Unicode-normalized to NFC, lower-cased and stripped of
    surrounding whitespace. The NFC step makes a composed and a decomposed
    spelling of the same accented word (for example "città" written with a
    precomposed "à" versus "a" followed by a combining grave accent) produce
    the same key instead of two separate entries.

    Args:
        entry: Source record with string values under "word" and "pos".

    Returns:
        The normalized ``(word, pos)`` tuple.

    Raises:
        KeyError: If "word" or "pos" is missing from *entry*.
    """
    # Imported locally because the module header does not import unicodedata.
    import unicodedata

    def _clean(text: str) -> str:
        return unicodedata.normalize("NFC", text).lower().strip()

    return _clean(entry["word"]), _clean(entry["pos"])
|
||||
|
||||
|
||||
def get_source_priority(source_name: str) -> int:
    """Rank a source by its position in PRIORITY_ORDER.

    Returns the index of *source_name* in PRIORITY_ORDER (a larger index
    means a more authoritative source), or -1 when the source is not listed
    so that it ranks below every known source.
    """
    if source_name not in PRIORITY_ORDER:
        return -1
    return PRIORITY_ORDER.index(source_name)
|
||||
|
||||
|
||||
def merge_entries(sources: Dict[str, List[dict]]) -> List[dict]:
    """Merge entries from multiple sources, resolving conflicts by priority.

    Entries are grouped by their normalized (word, pos) key. For each group
    the CEFR level of the highest-priority source wins; among candidates of
    equal priority the one encountered first is kept. Merge statistics are
    printed to stdout.

    Args:
        sources: Mapping of source name to its list of entries. Each entry
            must provide the keys "word", "pos" and "cefr".

    Returns:
        One dict per unique (word, pos), in first-seen order, with the keys
        "word", "pos", "cefr", "difficulty" and "sources" (sorted, without
        duplicates).

    Raises:
        KeyError: If an entry lacks "word", "pos" or "cefr".
    """
    # key -> [(source name, CEFR level), ...] in encounter order
    grouped = defaultdict(list)
    for src_name, entries in sources.items():
        for entry in entries:
            grouped[normalize_entry(entry)].append((src_name, entry["cefr"]))

    merged = []
    conflicts_resolved = 0
    total_multi_source = 0

    for (word, pos), candidates in grouped.items():
        # A set, so repeated rows from one source are neither listed twice
        # nor mistaken for agreement between several sources.
        contributing_sources = {src for src, _ in candidates}
        if len(contributing_sources) > 1:
            total_multi_source += 1
        if len({cefr for _, cefr in candidates}) > 1:
            conflicts_resolved += 1

        # max() returns the first maximal element, which matches picking the
        # head of a stable descending sort by priority.
        _, final_cefr = max(candidates, key=lambda c: get_source_priority(c[0]))

        merged.append(
            {
                "word": word,
                "pos": pos,
                "cefr": final_cefr,
                "difficulty": DIFFICULTY_MAP.get(final_cefr, "unknown"),
                "sources": sorted(contributing_sources),
            }
        )

    print("Merge statistics:")
    print(f" Total unique entries: {len(merged)}")
    print(f" Entries with multiple sources: {total_multi_source}")
    print(f" Conflicts resolved by priority: {conflicts_resolved}")

    return merged
|
||||
|
||||
|
||||
def print_summary(merged: List[dict]):
    """Print distribution of CEFR levels and difficulty in final dataset.

    Recognized CEFR levels are listed only when they occur; the three
    difficulty buckets are always listed. Entries whose level or difficulty
    falls outside the recognized values are reported on an extra line when
    present, so the printed counts always add up to ``len(merged)``.

    Args:
        merged: Merged entries, each with the keys "cefr" and "difficulty".
    """
    cefr_counts = defaultdict(int)
    diff_counts = defaultdict(int)
    for entry in merged:
        cefr_counts[entry["cefr"]] += 1
        diff_counts[entry["difficulty"]] += 1

    print("\n📊 Final CEFR distribution:")
    for level in sorted(CEFR_LEVELS):
        count = cefr_counts.get(level, 0)
        if count:
            print(f" {level}: {count}")
    unrecognized = sum(
        count for level, count in cefr_counts.items() if level not in CEFR_LEVELS
    )
    if unrecognized:
        print(f" unrecognized: {unrecognized}")

    print("\n📊 Final difficulty distribution:")
    buckets = ["easy", "intermediate", "hard"]
    for diff in buckets:
        count = diff_counts.get(diff, 0)
        print(f" {diff}: {count}")
    # The merge step labels levels it cannot map as "unknown"; surface them
    # instead of silently dropping them from the report.
    leftover = sum(count for diff, count in diff_counts.items() if diff not in buckets)
    if leftover:
        print(f" unknown: {leftover}")
|
||||
|
||||
|
||||
def main():
    """Merge the extracted Italian sources into datafiles/italian-merged.json.

    Reads ``../data-sources/italian/*-extracted.json`` relative to this
    script, merges the sources by priority, writes the result as UTF-8 JSON
    and prints a summary.

    Raises:
        SystemExit: With the error message (printed to stderr, exit status 1)
            when the data directory is missing or contains no usable
            extracted files, so calling pipelines can detect the failure.
    """
    script_dir = Path(__file__).parent
    data_dir = script_dir.parent / "data-sources" / "italian"
    output_dir = script_dir.parent / "datafiles"
    output_file = output_dir / "italian-merged.json"

    if not data_dir.is_dir():
        raise SystemExit(f"Error: Italian data directory not found: {data_dir}")

    print(f"Loading extracted files from {data_dir}...")
    sources = load_extracted_files(data_dir)

    if not sources:
        raise SystemExit("No extracted files found.")

    print(f"Found sources: {', '.join(sources.keys())}")
    print(f"Priority order (lowest to highest): {PRIORITY_ORDER}")

    merged = merge_entries(sources)

    # Create the output directory only once there is something to write.
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Merged dataset written to: {output_file}")
    print_summary(merged)


if __name__ == "__main__":
    main()
|
||||
2987
scripts/random-datafiles/italian/it-list_with_glossas.csv
Normal file
2987
scripts/random-datafiles/italian/it-list_with_glossas.csv
Normal file
File diff suppressed because it is too large
Load diff
517565
scripts/random-datafiles/italian/subtlex-it.csv
Normal file
517565
scripts/random-datafiles/italian/subtlex-it.csv
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
2
scripts/requirements.txt
Normal file
2
scripts/requirements.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
wn==1.1.0
|
||||
openpyxl==3.1.5
|
||||
|
|
@ -3,8 +3,7 @@
|
|||
{ "path": "./packages/shared" },
|
||||
{ "path": "./packages/db" },
|
||||
{ "path": "./apps/web" },
|
||||
{ "path": "./apps/api" },
|
||||
{ "path": "./data-pipeline" },
|
||||
{ "path": "./apps/api" }
|
||||
],
|
||||
"files": [],
|
||||
"files": []
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue