diff --git a/data-pipeline/package.json b/data-pipeline/package.json
index a4b4523..1d27a98 100644
--- a/data-pipeline/package.json
+++ b/data-pipeline/package.json
@@ -3,13 +3,7 @@
   "version": "1.0.0",
   "private": true,
   "type": "module",
-  "scripts": {
-    "extract": "tsx scripts/extract.ts",
-    "annotate": "tsx scripts/annotate.ts",
-    "enrich": "tsx scripts/enrich.ts",
-    "merge": "tsx scripts/merge.ts",
-    "compare": "tsx scripts/compare.ts"
-  },
+  "scripts": {},
   "dependencies": {
     "@lila/shared": "workspace:*",
     "better-sqlite3": "^12.9.0"
diff --git a/data-pipeline/stage-2-annotate/scripts/annotate.ts b/data-pipeline/stage-2-annotate/scripts/annotate.ts
new file mode 100644
index 0000000..bb71f60
--- /dev/null
+++ b/data-pipeline/stage-2-annotate/scripts/annotate.ts
@@ -0,0 +1,303 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
+import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
+
+// ── Types ────────────────────────────────────────────────────────────────────
+
+// NOTE(review): the generic type arguments in this file were missing from the
+// copy under review and are reconstructed from how each value is used below.
+// Confirm them against the original commit.
+
+type OmwExample = { text: string; source: "omw" };
+
+type CefrExample = { text: string; source: "cefr" };
+
+type Example = OmwExample | CefrExample;
+
+/** One synset as read from `PATHS.omw`; its examples are plain strings. */
+type OmwRecord = {
+  source_id: string;
+  pos: SupportedPos;
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  examples: Partial<Record<SupportedLanguageCode, string[]>>;
+};
+
+/** Output record: the OMW fields plus source-tagged examples and CEFR votes. */
+type AnnotatedRecord = {
+  source_id: string;
+  pos: SupportedPos;
+  translations: Partial<Record<SupportedLanguageCode, string[]>>;
+  glosses: Partial<Record<SupportedLanguageCode, string[]>>;
+  examples: Partial<Record<SupportedLanguageCode, Example[]>>;
+  votes: Partial<
+    Record<SupportedLanguageCode, Record<string, { cefr_source: string }>>
+  >;
+};
+
+/** One row of a per-language CEFR word list. */
+type CefrSourceEntry = {
+  word: string;
+  pos: string;
+  cefr_level: string;
+  example_sentence_native?: string;
+};
+
+/** A `word|pos` key that the CEFR list gives more than one level. */
+type ConflictEntry = {
+  word: string;
+  pos: string;
+  language: SupportedLanguageCode;
+  levels: string[];
+};
+
+// ── Constants ─────────────────────────────────────────────────────────────────
+
+/** Maps POS tags used by the CEFR lists onto the shared POS values. */
+const POS_NORMALIZE: Record<string, SupportedPos> = {
+  noun: "noun",
+  n: "noun",
+  nom: "noun", // French
+  verb: "verb",
+  verbs: "verb",
+  v: "verb",
+  v1: "verb",
+  adjective: "adjective",
+  adjektiv: "adjective", // German
+  adj: "adjective",
+  adverb: "adverb",
+  adverbs: "adverb",
+  adv: "adverb",
+};
+
+const CEFR_LEVELS = new Set(["A1", "A2", "B1", "B2", "C1", "C2"]);
+
+const PATHS = {
+  omw: "stage-1-extract/output/omw.json",
+  cefrDir: "stage-2-annotate/sources/cefr",
+  outputDir: "stage-2-annotate/output",
+};
+
+// ── CEFR source loading ───────────────────────────────────────────────────────
+
+type CefrIndex = Map<string, { level: string; example?: string }>;
+
+/** Builds the `word|pos` key shared by the CEFR index and the OMW lookup. */
+function cefrKey(word: string, pos: SupportedPos): string {
+  return `${word.toLowerCase().trim()}|${pos}`;
+}
+
+/**
+ * Resolves a raw POS tag from a CEFR list to a shared POS value.
+ * Only own keys of `POS_NORMALIZE` count, so a tag such as "constructor" can
+ * never resolve to a member inherited from `Object.prototype`.
+ */
+function normalizePos(tag: unknown): SupportedPos | undefined {
+  if (typeof tag !== "string") return undefined;
+  const cleaned = tag.toLowerCase().trim();
+  return Object.prototype.hasOwnProperty.call(POS_NORMALIZE, cleaned)
+    ? POS_NORMALIZE[cleaned]
+    : undefined;
+}
+
+/**
+ * Reads the CEFR word list for `lang` and splits it into a lookup index and a
+ * list of conflicts.
+ *
+ * Rows with a non-string word, an unknown POS tag or an unknown CEFR level are
+ * skipped. A key seen with more than one level is reported as a conflict and
+ * left out of the index.
+ *
+ * @param lang Language whose `<lang>.json` file under `PATHS.cefrDir` is read.
+ * @returns The index keyed by `cefrKey`, and the conflicting keys.
+ * @throws If the file cannot be read, is not valid JSON, or is not an array.
+ */
+async function loadCefrSource(
+  lang: SupportedLanguageCode,
+): Promise<{ index: CefrIndex; conflicts: ConflictEntry[] }> {
+  const filepath = path.join(PATHS.cefrDir, `${lang}.json`);
+  const raw = await fs.readFile(filepath, "utf-8");
+  const parsed: unknown = JSON.parse(raw);
+  if (!Array.isArray(parsed)) {
+    throw new Error(`${filepath}: expected a JSON array of CEFR entries`);
+  }
+
+  // Group usable rows by key. The normalised word and POS are stored with the
+  // group so they never have to be recovered by splitting the key on "|".
+  type Group = {
+    word: string;
+    pos: SupportedPos;
+    levels: Set<string>;
+    example?: string;
+  };
+  const groups = new Map<string, Group>();
+
+  for (const row of parsed as Partial<CefrSourceEntry>[]) {
+    if (typeof row !== "object" || row === null) continue;
+    const { word, cefr_level: level, example_sentence_native: example } = row;
+    const pos = normalizePos(row.pos);
+    if (typeof word !== "string" || pos === undefined) continue;
+    if (typeof level !== "string" || !CEFR_LEVELS.has(level)) continue;
+
+    const key = cefrKey(word, pos);
+    let group = groups.get(key);
+    if (!group) {
+      const cleanWord = word.toLowerCase().trim();
+      group = { word: cleanWord, pos, levels: new Set<string>() };
+      groups.set(key, group);
+    }
+    group.levels.add(level);
+    // A later duplicate without an example must not erase an earlier example.
+    if (typeof example === "string" && example) group.example = example;
+  }
+
+  const index: CefrIndex = new Map();
+  const conflicts: ConflictEntry[] = [];
+  for (const [key, group] of groups) {
+    const levels = [...group.levels];
+    if (levels.length > 1) {
+      conflicts.push({
+        word: group.word,
+        pos: group.pos,
+        language: lang,
+        levels,
+      });
+      continue;
+    }
+    const level = levels[0];
+    if (level === undefined) continue; // every group holds at least one level
+    index.set(key, {
+      level,
+      ...(group.example ? { example: group.example } : {}),
+    });
+  }
+
+  return { index, conflicts };
+}
+
+// ── Annotation ────────────────────────────────────────────────────────────────
+
+/**
+ * Annotates one synset for `lang`: tags the OMW examples with their source and
+ * adds a CEFR vote (plus the CEFR example sentence, when the index has one)
+ * for every `lang` translation found in `index`.
+ *
+ * @returns The annotated record and how many translations matched.
+ */
+function annotateRecord(
+  record: OmwRecord,
+  lang: SupportedLanguageCode,
+  index: CefrIndex,
+): { annotated: AnnotatedRecord; matched: number } {
+  const annotated: AnnotatedRecord = {
+    source_id: record.source_id,
+    pos: record.pos,
+    translations: record.translations,
+    glosses: record.glosses,
+    examples: {},
+    votes: {},
+  };
+
+  // Convert OMW examples (every language) to the typed format.
+  for (const [code, texts] of Object.entries(record.examples)) {
+    if (!texts) continue;
+    const tagged: Example[] = texts.map((text) => ({ text, source: "omw" }));
+    annotated.examples[code as SupportedLanguageCode] = tagged;
+  }
+
+  // Match this language's translations against the CEFR index.
+  const votes: Record<string, { cefr_source: string }> = {};
+  const cefrExamples: CefrExample[] = [];
+  const seenTexts = new Set<string>();
+  let matched = 0;
+
+  for (const word of record.translations[lang] ?? []) {
+    const hit = index.get(cefrKey(word, record.pos));
+    if (!hit) continue;
+
+    matched++;
+    votes[word] = { cefr_source: hit.level };
+
+    // Translations that share a key (a repeated word, or two spellings that
+    // differ only in case) must not add the same example sentence twice.
+    if (hit.example && !seenTexts.has(hit.example)) {
+      seenTexts.add(hit.example);
+      cefrExamples.push({ text: hit.example, source: "cefr" });
+    }
+  }
+
+  if (matched > 0) annotated.votes[lang] = votes;
+  if (cefrExamples.length > 0) {
+    const existing = annotated.examples[lang] ?? [];
+    annotated.examples[lang] = [...existing, ...cefrExamples];
+  }
+
+  return { annotated, matched };
+}
+
+/**
+ * Runs stage 2: reads the OMW extract and every CEFR word list, writes
+ * `conflicts.json`, then writes one annotated `<lang>.json` per language into
+ * `PATHS.outputDir`.
+ */
+async function annotate(): Promise<void> {
+  // Load OMW records
+  console.log("Reading OMW extract...");
+  const raw = await fs.readFile(PATHS.omw, "utf-8");
+  const omwRecords = JSON.parse(raw) as OmwRecord[];
+  console.log(` Loaded ${omwRecords.length.toLocaleString()} synsets`);
+
+  // Load CEFR sources for all languages
+  console.log("\nLoading CEFR source files...");
+  const cefrIndexes = new Map<SupportedLanguageCode, CefrIndex>();
+  const allConflicts: ConflictEntry[] = [];
+
+  for (const lang of SUPPORTED_LANGUAGE_CODES) {
+    const { index, conflicts } = await loadCefrSource(lang);
+    cefrIndexes.set(lang, index);
+    // Plain loop rather than push(...conflicts): spreading a very large array
+    // into call arguments can exceed the engine's argument limit.
+    for (const conflict of conflicts) allConflicts.push(conflict);
+    console.log(
+      ` ${lang}: ${index.size.toLocaleString()} entries, ${conflicts.length} conflicts`,
+    );
+  }
+
+  // Write conflicts file
+  await fs.mkdir(PATHS.outputDir, { recursive: true });
+  await fs.writeFile(
+    path.join(PATHS.outputDir, "conflicts.json"),
+    JSON.stringify(allConflicts, null, 2),
+    "utf-8",
+  );
+  console.log(
+    `\nWrote ${allConflicts.length} conflicts → ${PATHS.outputDir}/conflicts.json`,
+  );
+
+  // Annotate and write one file per language
+  console.log("\nAnnotating...");
+  for (const [lang, index] of cefrIndexes) {
+    const records: AnnotatedRecord[] = [];
+    let matched = 0;
+
+    for (const record of omwRecords) {
+      const result = annotateRecord(record, lang, index);
+      records.push(result.annotated);
+      matched += result.matched;
+    }
+
+    const outputFile = path.join(PATHS.outputDir, `${lang}.json`);
+    await fs.writeFile(outputFile, JSON.stringify(records, null, 2), "utf-8");
+    console.log(
+      ` ${lang}: ${matched.toLocaleString()} matches → ${outputFile}`,
+    );
+  }
+}
+
+// ── Main ─────────────────────────────────────────────────────────────────────
+
+annotate().catch((err: unknown) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/data-pipeline/test/output/sample.json b/data-pipeline/test/output/sample.json new file mode 100644 index 0000000..5dd774f --- /dev/null +++ b/data-pipeline/test/output/sample.json @@ -0,0 +1,4492 @@ +[ + { + "source_id": "ili:i90862", + "pos": "noun", + "translations": { + "en": [ + "kinsman" + ], + "es": [ + "pariente" + ], + "de": [ + "Gevatter", + "Anverwandter", + "Familienmitglied", + "Verwandter", + "Familienangehöriger", + "Angehöriger", + "Verwandte" + ], + "fr": [ + 
"parent" + ] + }, + "glosses": { + "en": [ + "a male relative" + ], + "de": [ + "ein männlicher Verwandter" + ] + }, + "examples": { + "de": [ + { + "text": "Jedes Familienmitglied hat seine Aufgaben.", + "source": "cefr" + }, + { + "text": "Er ist ein entfernter Verwandter von mir.", + "source": "cefr" + }, + { + "text": "Alle Familienangehörigen kamen zum Treffen.", + "source": "cefr" + }, + { + "text": "Er ist ein Angehöriger der Familie.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Ses parents sont très fiers de lui.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Tengo muchos parientes viviendo en esta ciudad.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "kinsman": { + "cefr_source": "C1" + } + }, + "de": { + "Familienmitglied": { + "cefr_source": "A2" + }, + "Verwandter": { + "cefr_source": "B1" + }, + "Familienangehöriger": { + "cefr_source": "B1" + }, + "Angehöriger": { + "cefr_source": "B2" + } + }, + "fr": { + "parent": { + "cefr_source": "A1" + } + }, + "es": { + "pariente": { + "cefr_source": "A2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i23087", + "pos": "verb", + "translations": { + "en": [ + "teach" + ], + "it": [ + "addestrare", + "ammaestrare", + "insegnare" + ], + "es": [ + "enseñar" + ], + "fr": [ + "enseigner", + "apprendre", + "guider" + ] + }, + "glosses": { + "en": [ + "accustom gradually to some action or attitude" + ] + }, + "examples": { + "en": [ + { + "text": "The child is taught to obey her parents", + "source": "omw" + } + ], + "it": [ + { + "text": "Stiamo addestrando il nostro cane.", + "source": "cefr" + }, + { + "text": "Lei insegna italiano ai bambini.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Elle enseigne le français au lycée.", + "source": "cefr" + }, + { + "text": "J'apprends le français.", + "source": "cefr" + }, + { + "text": "Il va nous guider à travers la forêt.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Ella enseña español en la 
universidad.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "teach": { + "cefr_source": "A1" + } + }, + "it": { + "addestrare": { + "cefr_source": "B1" + }, + "insegnare": { + "cefr_source": "A1" + } + }, + "fr": { + "enseigner": { + "cefr_source": "A2" + }, + "apprendre": { + "cefr_source": "A1" + }, + "guider": { + "cefr_source": "A2" + } + }, + "es": { + "enseñar": { + "cefr_source": "A1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i26718", + "pos": "verb", + "translations": { + "en": [ + "dub", + "nickname" + ], + "it": [ + "battezzare", + "cognominare", + "doppiare", + "soprannominare" + ], + "es": [ + "apodar" + ], + "fr": [ + "surnom", + "baptiser" + ] + }, + "glosses": { + "en": [ + "give a nickname to" + ] + }, + "examples": { + "it": [ + { + "text": "Hanno deciso di battezzare il loro figlio la prossima primavera.", + "source": "cefr" + }, + { + "text": "Lo hanno soprannominato 'il Professore'.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Ils ont décidé de baptiser leur enfant Marie.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "dub": { + "cefr_source": "B2" + } + }, + "it": { + "battezzare": { + "cefr_source": "B1" + }, + "soprannominare": { + "cefr_source": "B2" + } + }, + "fr": { + "baptiser": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i4448", + "pos": "adjective", + "translations": { + "en": [ + "drab", + "dreary" + ], + "es": [ + "igual", + "rutinario" + ], + "fr": [ + "morne", + "maussade", + "sombre" + ] + }, + "glosses": { + "en": [ + "lacking in liveliness or charm or surprise" + ] + }, + "examples": { + "en": [ + { + "text": "her drab personality", + "source": "omw" + }, + { + "text": "life was drab compared with the more exciting life style overseas", + "source": "omw" + }, + { + "text": "a series of dreary dinner parties", + "source": "omw" + } + ], + "fr": [ + { + "text": "Le temps était morne et pluvieux.", + 
"source": "cefr" + }, + { + "text": "Le temps était maussade toute la journée.", + "source": "cefr" + }, + { + "text": "La pièce était sombre sans lumière.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Todos somos iguales.", + "source": "cefr" + }, + { + "text": "Su trabajo se ha vuelto muy rutinario.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "drab": { + "cefr_source": "B2" + }, + "dreary": { + "cefr_source": "B2" + } + }, + "fr": { + "morne": { + "cefr_source": "B2" + }, + "maussade": { + "cefr_source": "B2" + }, + "sombre": { + "cefr_source": "B1" + } + }, + "es": { + "igual": { + "cefr_source": "A2" + }, + "rutinario": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i85845", + "pos": "noun", + "translations": { + "en": [ + "natural depression", + "depression" + ], + "it": [ + "avvallamento" + ], + "es": [ + "depresión", + "depresión natural" + ], + "fr": [ + "dépression" + ] + }, + "glosses": { + "en": [ + "a sunken or depressed geological formation" + ] + }, + "examples": { + "fr": [ + { + "text": "Elle souffre de dépression.", + "source": "cefr" + } + ], + "es": [ + { + "text": "La depresión es una enfermedad grave.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "depression": { + "cefr_source": "B2" + } + }, + "fr": { + "dépression": { + "cefr_source": "B2" + } + }, + "es": { + "depresión": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i27202", + "pos": "verb", + "translations": { + "en": [ + "jump" + ], + "fr": [ + "sauter" + ] + }, + "glosses": { + "en": [ + "make a sudden physical attack on" + ] + }, + "examples": { + "en": [ + { + "text": "The muggers jumped the woman in the fur coat", + "source": "omw" + } + ], + "fr": [ + { + "text": "Le chien aime sauter par-dessus la clôture.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "jump": { + "cefr_source": "A1" + } + }, + "fr": { + "sauter": { + 
"cefr_source": "A2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i27830", + "pos": "verb", + "translations": { + "en": [ + "run into", + "bump into", + "jar against", + "butt against", + "knock against" + ], + "it": [ + "urtare" + ], + "es": [ + "chocar", + "colisionar", + "golpearse contra", + "topar" + ], + "de": [ + "anraunzen", + "anfahren", + "anschnauzen", + "ankläffen", + "anschreien", + "anblaffen", + "anblaffen", + "anbelfern", + "anbrüllen", + "anbellen" + ] + }, + "glosses": { + "en": [ + "collide violently with an obstacle" + ], + "de": [ + "heftig mit einem Hindernis zusammenstoßen" + ] + }, + "examples": { + "en": [ + { + "text": "I ran into the telephone pole", + "source": "omw" + } + ], + "it": [ + { + "text": "Ho urtato il tavolo con il gomito.", + "source": "cefr" + } + ], + "de": [ + { + "text": "Der Bus fuhr an die Haltestelle an.", + "source": "cefr" + }, + { + "text": "Er hat mich ohne Grund angeschrien.", + "source": "cefr" + } + ], + "es": [ + { + "text": "El coche chocó contra un árbol.", + "source": "cefr" + }, + { + "text": "Me topé con un viejo amigo en la calle.", + "source": "cefr" + } + ] + }, + "votes": { + "it": { + "urtare": { + "cefr_source": "B1" + } + }, + "de": { + "anfahren": { + "cefr_source": "B1" + }, + "anschreien": { + "cefr_source": "B1" + } + }, + "es": { + "chocar": { + "cefr_source": "A2" + }, + "topar": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i27676", + "pos": "verb", + "translations": { + "en": [ + "fumble" + ] + }, + "glosses": { + "en": [ + "handle clumsily" + ] + }, + "examples": {}, + "votes": { + "en": { + "fumble": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i30768", + "pos": "verb", + "translations": { + "en": [ + "attract", + "appeal" + ], + "it": [ + "allettare", + "attirare", + "attrarre" + ], + "es": [ + "atraer" + ], + "de": [ + "anziehen", + "etwas 
überziehen", + "einkleiden", + "etwas überstreifen", + "bekleiden", + "hineinschlüpfen", + "überstülpen", + "ankleiden", + "Kleidung anlegen" + ], + "fr": [ + "allécher", + "attirer" + ] + }, + "glosses": { + "en": [ + "be attractive to" + ], + "de": [ + "ein Kleidungsstück in der dafür vorgesehenen Weise auf den Körper bringen" + ] + }, + "examples": { + "en": [ + { + "text": "The idea of a vacation appeals to me", + "source": "omw" + }, + { + "text": "The beautiful garden attracted many people", + "source": "omw" + } + ], + "de": [ + { + "text": "Sie zog sich das Kleid an.", + "source": "omw" + } + ], + "it": [ + { + "text": "Il nuovo negozio attira molti clienti.", + "source": "cefr" + }, + { + "text": "Il magnete attrae il metallo.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "La promesse d'un salaire élevé a alléché de nombreux candidats.", + "source": "cefr" + }, + { + "text": "Cette publicité attire l'attention.", + "source": "cefr" + } + ], + "es": [ + { + "text": "El imán atrae el metal.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "attract": { + "cefr_source": "B1" + } + }, + "it": { + "attirare": { + "cefr_source": "B2" + }, + "attrarre": { + "cefr_source": "B1" + } + }, + "de": { + "anziehen": { + "cefr_source": "A2" + }, + "bekleiden": { + "cefr_source": "B2" + } + }, + "fr": { + "allécher": { + "cefr_source": "C1" + }, + "attirer": { + "cefr_source": "B1" + } + }, + "es": { + "atraer": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i112909", + "pos": "noun", + "translations": { + "en": [ + "regulation" + ], + "es": [ + "reglamento" + ], + "fr": [ + "réglementation", + "gouvernement", + "tenue" + ] + }, + "glosses": { + "en": [ + "the state of being controlled or governed" + ] + }, + "examples": { + "fr": [ + { + "text": "La nouvelle réglementation est très stricte.", + "source": "cefr" + }, + { + "text": "Le gouvernement a annoncé de nouvelles mesures.", + "source": "cefr" + 
}, + { + "text": "Elle a choisi une tenue élégante pour la soirée.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Debemos seguir el reglamento.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "regulation": { + "cefr_source": "B2" + } + }, + "fr": { + "réglementation": { + "cefr_source": "B2" + }, + "gouvernement": { + "cefr_source": "B1" + }, + "tenue": { + "cefr_source": "B1" + } + }, + "es": { + "reglamento": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i46846", + "pos": "noun", + "translations": { + "en": [ + "ladybug", + "ladybeetle", + "lady beetle", + "ladybird", + "ladybird beetle" + ], + "it": [ + "coccinella" + ], + "fr": [ + "coccinelle" + ] + }, + "glosses": { + "en": [ + "small round bright-colored and spotted beetle that usually feeds on aphids and other insect pests" + ] + }, + "examples": { + "fr": [ + { + "text": "Une coccinelle s'est posée sur ma main.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "ladybug": { + "cefr_source": "A2" + } + }, + "fr": { + "coccinelle": { + "cefr_source": "A2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i15517", + "pos": "adjective", + "translations": { + "en": [ + "judicial" + ], + "it": [ + "giudiziale", + "giudiziario" + ], + "es": [ + "judicial" + ], + "de": [ + "durch einen Richter", + "durch ein Gericht", + "durch den Richter", + "richterlich" + ], + "fr": [ + "judiciaire" + ] + }, + "glosses": { + "en": [ + "belonging or appropriate to the office of a judge" + ], + "de": [ + "zum Amt eines Richters gehörend oder diesem zugehörig" + ] + }, + "examples": { + "en": [ + { + "text": "judicial robes", + "source": "omw" + } + ], + "it": [ + { + "text": "Hanno avviato un'azione giudiziale.", + "source": "cefr" + }, + { + "text": "Il sistema giudiziario italiano è complesso.", + "source": "cefr" + } + ], + "de": [ + { + "text": "Es bedarf einer richterlichen Anordnung.", + "source": "cefr" + } + ], + 
"fr": [ + { + "text": "L'affaire est en cours de procédure judiciaire.", + "source": "cefr" + } + ], + "es": [ + { + "text": "El proceso judicial fue largo.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "judicial": { + "cefr_source": "C1" + } + }, + "it": { + "giudiziale": { + "cefr_source": "C1" + }, + "giudiziario": { + "cefr_source": "C1" + } + }, + "de": { + "richterlich": { + "cefr_source": "C1" + } + }, + "fr": { + "judiciaire": { + "cefr_source": "B2" + } + }, + "es": { + "judicial": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i11095", + "pos": "adjective", + "translations": { + "en": [ + "poor" + ], + "es": [ + "pobre" + ], + "fr": [ + "pauvre" + ] + }, + "glosses": { + "en": [ + "characterized by or indicating poverty" + ] + }, + "examples": { + "en": [ + { + "text": "the country had a poor economy", + "source": "omw" + }, + { + "text": "they lived in the poor section of town", + "source": "omw" + } + ], + "fr": [ + { + "text": "Il est très pauvre.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Es un hombre muy pobre.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "poor": { + "cefr_source": "A2" + } + }, + "fr": { + "pauvre": { + "cefr_source": "A1" + } + }, + "es": { + "pobre": { + "cefr_source": "A1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i62321", + "pos": "noun", + "translations": { + "en": [ + "flashiness", + "garishness", + "gaudiness", + "loudness", + "brashness", + "meretriciousness", + "tawdriness", + "glitz" + ], + "it": [ + "pacchianeria", + "vistosità" + ], + "es": [ + "astracanada", + "chabacanería", + "garrulería", + "horterada", + "mal gusto", + "ordinariez", + "zafiedad" + ], + "de": [ + "Aufdringlichkeit", + "Zudringlichkeit", + "Penetranz" + ], + "fr": [ + "culot" + ] + }, + "glosses": { + "en": [ + "tasteless showiness" + ], + "de": [ + "geschmacklose Aufdringlichkeit" + ] + }, + "examples": { + "fr": [ + { + 
"text": "Il a eu le culot de me demander de l'argent après tout ça.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "loudness": { + "cefr_source": "B2" + }, + "glitz": { + "cefr_source": "B2" + } + }, + "fr": { + "culot": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i22613", + "pos": "verb", + "translations": { + "en": [ + "scavenge", + "clean" + ], + "es": [ + "limpiar" + ], + "fr": [ + "nettoyer" + ] + }, + "glosses": { + "en": [ + "remove unwanted substances from" + ] + }, + "examples": { + "fr": [ + { + "text": "Je dois nettoyer ma chambre.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Necesito limpiar mi habitación.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "scavenge": { + "cefr_source": "B2" + } + }, + "fr": { + "nettoyer": { + "cefr_source": "A1" + } + }, + "es": { + "limpiar": { + "cefr_source": "A1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i4857", + "pos": "adjective", + "translations": { + "en": [ + "enthusiastic" + ], + "it": [ + "caloroso", + "entusiastico", + "fervido", + "entusiasta" + ], + "fr": [ + "courageux", + "enthousiaste" + ] + }, + "glosses": { + "en": [ + "having or showing great excitement and interest" + ] + }, + "examples": { + "en": [ + { + "text": "enthusiastic crowds filled the streets", + "source": "omw" + }, + { + "text": "an enthusiastic response", + "source": "omw" + }, + { + "text": "was enthusiastic about taking ballet lessons", + "source": "omw" + } + ], + "it": [ + { + "text": "Abbiamo ricevuto un'accoglienza molto calorosa.", + "source": "cefr" + }, + { + "text": "Ha espresso un fervido desiderio di pace.", + "source": "cefr" + }, + { + "text": "Era molto entusiasta del nuovo progetto.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "C'est une personne très courageuse.", + "source": "cefr" + }, + { + "text": "Elle est très enthousiaste à l'idée de ce voyage.", + "source": "cefr" + } + ] + }, 
+ "votes": { + "en": { + "enthusiastic": { + "cefr_source": "B1" + } + }, + "it": { + "caloroso": { + "cefr_source": "B1" + }, + "fervido": { + "cefr_source": "C1" + }, + "entusiasta": { + "cefr_source": "B1" + } + }, + "fr": { + "courageux": { + "cefr_source": "A2" + }, + "enthousiaste": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i104521", + "pos": "noun", + "translations": { + "en": [ + "veronica", + "speedwell" + ], + "it": [ + "veronica" + ], + "de": [ + "Allerweltsheil", + "Grundheil", + "Ehrenpreis", + "Männertreu", + "Köhlerkraut", + "Schlangenkraut" + ], + "fr": [ + "veronica", + "véronique" + ] + }, + "glosses": { + "en": [ + "any plant of the genus Veronica" + ], + "de": [ + "jede Pflanze der Gattung Veronica" + ] + }, + "examples": { + "de": [ + { + "text": "Er erhielt den Ehrenpreis für sein Lebenswerk.", + "source": "cefr" + } + ] + }, + "votes": { + "de": { + "Ehrenpreis": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i958", + "pos": "adjective", + "translations": { + "en": [ + "gracious" + ], + "es": [ + "amable" + ] + }, + "glosses": { + "en": [ + "disposed to bestow favors" + ] + }, + "examples": { + "en": [ + { + "text": "thanks to the gracious gods", + "source": "omw" + } + ], + "es": [ + { + "text": "Siempre es muy amable con todos.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "gracious": { + "cefr_source": "B2" + } + }, + "es": { + "amable": { + "cefr_source": "A2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i109447", + "pos": "noun", + "translations": { + "en": [ + "declension" + ], + "it": [ + "declinazione" + ], + "es": [ + "declinación" + ], + "de": [ + "Deklination", + "Ortsmissweisung", + "Missweisung" + ], + "fr": [ + "déclinaison" + ] + }, + "glosses": { + "en": [ + "the inflection of nouns and pronouns and adjectives in Indo-European languages" + ], + "de": [ + "die Beugung 
von Substantiven, Pronomen und Adjektiven in den indogermanischen Sprachen" + ] + }, + "examples": { + "it": [ + { + "text": "La declinazione dei nomi latini può essere complessa.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "En latin, les noms ont des déclinaisons.", + "source": "cefr" + } + ] + }, + "votes": { + "it": { + "declinazione": { + "cefr_source": "B2" + } + }, + "fr": { + "déclinaison": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i18812", + "pos": "adverb", + "translations": { + "en": [ + "fairly", + "fair", + "evenhandedly" + ], + "es": [ + "con justicia", + "imparcialmente", + "justamente" + ] + }, + "glosses": { + "en": [ + "without favoring one party, in a fair evenhanded manner" + ] + }, + "examples": { + "en": [ + { + "text": "deal fairly with one another", + "source": "omw" + } + ], + "es": [ + { + "text": "Llegó justamente a tiempo para la reunión.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "fairly": { + "cefr_source": "B1" + } + }, + "es": { + "justamente": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_cefr_vote" + }, + { + "source_id": "ili:i44747", + "pos": "noun", + "translations": { + "en": [ + "Centrocercus", + "genus Centrocercus" + ], + "es": [ + "Centrocercus", + "género Centrocercus" + ], + "fr": [ + "centrocercus" + ] + }, + "glosses": { + "en": [ + "sage grouse" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i20736", + "pos": "adverb", + "translations": { + "en": [ + "insinuatingly" + ] + }, + "glosses": { + "en": [ + "in an insinuating manner" + ] + }, + "examples": { + "en": [ + { + "text": "the art book has art to sell, insinuatingly, and for a purpose, like the American muse, which has in fact a tradition to sell, and one which doesn't exist, in painting", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i25017", + 
"pos": "verb", + "translations": { + "en": [ + "superordinate" + ] + }, + "glosses": { + "en": [ + "place in a superior order or rank" + ] + }, + "examples": { + "en": [ + { + "text": "These two notions are superordinated to a third", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i46616", + "pos": "noun", + "translations": { + "en": [ + "sand cat" + ], + "fr": [ + "chat de marguerite", + "chat du désert", + "chat du général marguerite", + "chat des sables" + ] + }, + "glosses": { + "en": [ + "a desert wildcat" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i83491", + "pos": "noun", + "translations": { + "en": [ + "Bangor" + ] + }, + "glosses": { + "en": [ + "a university town in northwestern Wales on the Menai Strait" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i72819", + "pos": "noun", + "translations": { + "en": [ + "Missouri" + ], + "fr": [ + "Saint Peters", + "Joplin", + "Missouri" + ] + }, + "glosses": { + "en": [ + "a dialect of the Chiwere language spoken by the Missouri" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i99797", + "pos": "noun", + "translations": { + "en": [ + "prickly poppy", + "argemone", + "white thistle", + "devil's fig" + ], + "es": [ + "argemone" + ], + "fr": [ + "argemone" + ] + }, + "glosses": { + "en": [ + "any plant of the genus Argemone having large white or yellow flowers and prickly leaves and stems and pods; chiefly of tropical America" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i90317", + "pos": "noun", + "translations": { + "en": [ + "great-uncle", + "granduncle" + ], + "it": [ + "protio", + "prozio" + ], + "es": [ + "tío abuelo" + ], + "fr": [ + "grand-oncle" + ] + }, + "glosses": { + "en": [ + "an uncle of your father or 
mother" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i53881", + "pos": "noun", + "translations": { + "en": [ + "flour bin" + ], + "es": [ + "frasco de harina", + "tarro de harina" + ] + }, + "glosses": { + "en": [ + "a bin for holding flour" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i58210", + "pos": "noun", + "translations": { + "en": [ + "road map" + ], + "fr": [ + "carte routière" + ] + }, + "glosses": { + "en": [ + "a map showing roads (for automobile travel)" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i82638", + "pos": "noun", + "translations": { + "en": [ + "South American country", + "South American nation" + ] + }, + "glosses": { + "en": [ + "any one of the countries occupying the South American continent" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i71111", + "pos": "noun", + "translations": { + "en": [ + "weekly" + ], + "it": [ + "ebdomadario", + "eddomadario", + "settimanale" + ], + "fr": [ + "hebdomadaire" + ] + }, + "glosses": { + "en": [ + "a periodical that is published every week (or 52 issues per year)" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i10131", + "pos": "adjective", + "translations": { + "en": [ + "embattled" + ], + "it": [ + "GAP!", + "in difficoltà" + ] + }, + "glosses": { + "en": [ + "prepared for battle" + ] + }, + "examples": { + "en": [ + { + "text": "an embattled city", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i108195", + "pos": "noun", + "translations": { + "en": [ + "mass unit" + ], + "es": [ + "unidad de masa" + ] + }, + "glosses": { + "en": [ + "a unit of measurement for mass" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" 
+ }, + { + "source_id": "ili:i82225", + "pos": "noun", + "translations": { + "en": [ + "Wrangell-St. Elias National Park" + ] + }, + "glosses": { + "en": [ + "the largest national park of the United States; located in Alaska" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i47159", + "pos": "noun", + "translations": { + "en": [ + "Fenusa", + "genus-Fenusa" + ], + "es": [ + "Fenusa", + "género Fenusa" + ] + }, + "glosses": { + "en": [ + "birch leaf miner" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i106504", + "pos": "noun", + "translations": { + "en": [ + "entail" + ] + }, + "glosses": { + "en": [ + "land received by fee tail" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i46047", + "pos": "noun", + "translations": { + "en": [ + "Polynesian tattler", + "Heteroscelus incanus" + ], + "fr": [ + "heteroscelus incanus" + ] + }, + "glosses": { + "en": [ + "tattler of Pacific coastal regions" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i71598", + "pos": "noun", + "translations": { + "en": [ + "market letter" + ] + }, + "glosses": { + "en": [ + "a newsletter written by an analyst of the stock market and sold to subscribers" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i115719", + "pos": "noun", + "translations": { + "en": [ + "monosaccharide", + "monosaccharose", + "simple sugar" + ], + "it": [ + "manosio", + "monosaccaride", + "monosio", + "monoso" + ], + "es": [ + "monosacárido" + ], + "de": [ + "Monosaccharid", + "Einfachzucker" + ], + "fr": [ + "ose", + "Ose", + "monosaccharide" + ] + }, + "glosses": { + "en": [ + "a sugar (like sucrose or fructose) that does not hydrolyse to give other sugars; the simplest group of carbohydrates" + ], + "de": [ + "ein Zucker (wie 
Saccharose oder Fruktose), der nicht zu anderen Zuckern hydrolysiert wird; die einfachste Gruppe der Kohlenhydrate" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_cefr_vote" + }, + { + "source_id": "ili:i74228", + "pos": "noun", + "translations": { + "en": [ + "negotiation", + "dialogue", + "talks" + ], + "it": [ + "contrattazione", + "deal", + "dialogo", + "negoziato", + "negoziazione", + "trattativa" + ], + "es": [ + "gestión", + "negociación", + "tramitación" + ], + "de": [ + "Besprechung", + "Verhandlung" + ], + "fr": [ + "dialogue", + "négociation" + ] + }, + "glosses": { + "en": [ + "a discussion intended to produce an agreement" + ], + "de": [ + "Diskussion zur Ausarbeitung eines Abkommens" + ] + }, + "examples": { + "en": [ + { + "text": "the buyout negotiation lasted several days", + "source": "omw" + }, + { + "text": "they disagreed but kept an open dialogue", + "source": "omw" + }, + { + "text": "talks between Israelis and Palestinians", + "source": "omw" + } + ], + "it": [ + { + "text": "La contrattazione collettiva è importante per i lavoratori.", + "source": "cefr" + }, + { + "text": "Abbiamo chiuso un buon deal.", + "source": "cefr" + }, + { + "text": "È importante mantenere un dialogo aperto.", + "source": "cefr" + }, + { + "text": "Il negoziato per la pace è stato lungo e difficile.", + "source": "cefr" + }, + { + "text": "Le negoziazioni per il nuovo contratto sono state lunghe e complesse.", + "source": "cefr" + }, + { + "text": "Le trattative sono in corso.", + "source": "cefr" + } + ], + "de": [ + { + "text": "Wir haben morgen eine wichtige Besprechung.", + "source": "cefr" + }, + { + "text": "Die Verhandlungen dauerten den ganzen Tag.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Le dialogue est essentiel pour résoudre les conflits.", + "source": "cefr" + }, + { + "text": "Les négociations ont été longues et difficiles.", + "source": "cefr" + } + ], + "es": [ + { + "text": "La gestión del proyecto fue excelente.", + 
"source": "cefr" + }, + { + "text": "Las negociaciones fueron difíciles.", + "source": "cefr" + }, + { + "text": "La tramitación de los documentos puede llevar tiempo.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "negotiation": { + "cefr_source": "B2" + }, + "dialogue": { + "cefr_source": "B2" + } + }, + "it": { + "contrattazione": { + "cefr_source": "B2" + }, + "deal": { + "cefr_source": "B1" + }, + "dialogo": { + "cefr_source": "B1" + }, + "negoziato": { + "cefr_source": "B2" + }, + "negoziazione": { + "cefr_source": "B2" + }, + "trattativa": { + "cefr_source": "B2" + } + }, + "de": { + "Besprechung": { + "cefr_source": "B1" + }, + "Verhandlung": { + "cefr_source": "B2" + } + }, + "fr": { + "dialogue": { + "cefr_source": "B1" + }, + "négociation": { + "cefr_source": "B2" + } + }, + "es": { + "gestión": { + "cefr_source": "B2" + }, + "negociación": { + "cefr_source": "B2" + }, + "tramitación": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i408", + "pos": "adjective", + "translations": { + "en": [ + "aground" + ], + "es": [ + "encallado", + "varado" + ], + "de": [ + "aufgrund", + "dank", + "aufgrund von", + "auf Grund von", + "vermöge", + "infolge", + "auf Grund" + ] + }, + "glosses": { + "en": [ + "stuck in a place where a ship can no longer float" + ], + "de": [ + "an einer Stelle feststecken, an der ein Schiff nicht mehr schwimmen kann" + ] + }, + "examples": { + "en": [ + { + "text": "a ship aground offshore", + "source": "omw" + }, + { + "text": "a boat aground on the beach waiting for the tide to lift it", + "source": "omw" + } + ], + "es": [ + { + "text": "El barco quedó varado en la arena.", + "source": "cefr" + } + ] + }, + "votes": { + "es": { + "varado": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i41575", + "pos": "noun", + "translations": { + "en": [ + "walkout" + ] + }, + "glosses": { + "en": [ + "the 
act of walking out (of a meeting or organization) as a sign of protest" + ] + }, + "examples": { + "en": [ + { + "text": "there was a walkout by the Black members as the chairman rose to speak", + "source": "omw" + } + ] + }, + "votes": { + "en": { + "walkout": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i67480", + "pos": "noun", + "translations": { + "en": [ + "tasting" + ], + "fr": [ + "dégustation" + ] + }, + "glosses": { + "en": [ + "a small amount (especially of food or wine)" + ] + }, + "examples": { + "fr": [ + { + "text": "Nous avons participé à une dégustation de vins.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "tasting": { + "cefr_source": "B1" + } + }, + "fr": { + "dégustation": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i11256", + "pos": "adjective", + "translations": { + "en": [ + "hobnailed" + ] + }, + "glosses": { + "en": [ + "marked by the wearing of heavy boots studded with hobnails" + ] + }, + "examples": { + "en": [ + { + "text": "hobnailed laborers", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i86151", + "pos": "noun", + "translations": { + "en": [ + "sediment", + "deposit" + ], + "it": [ + "deposito", + "posatura", + "sedimento" + ], + "es": [ + "depósito", + "sedimento" + ], + "de": [ + "Ablagerung", + "Sedimentation", + "Sedimentierung", + "Sedimentbildung" + ], + "fr": [ + "sédiment", + "dépôt" + ] + }, + "glosses": { + "en": [ + "matter that has been deposited by some natural process" + ], + "de": [ + "Materie, die durch einen natürlichen Prozess abgelagert wurde" + ] + }, + "examples": { + "it": [ + { + "text": "Ho lasciato i bagagli al deposito.", + "source": "cefr" + }, + { + "text": "C'era un sedimento sul fondo della bottiglia.", + "source": "cefr" + } + ], + "de": [ + { + "text": "Es gab Ablagerungen in 
den Rohren.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Le sédiment au fond du lac est très fin.", + "source": "cefr" + }, + { + "text": "J'ai fait un dépôt à la banque.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Hice un depósito en el banco.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "sediment": { + "cefr_source": "C1" + }, + "deposit": { + "cefr_source": "B1" + } + }, + "it": { + "deposito": { + "cefr_source": "B1" + }, + "sedimento": { + "cefr_source": "B2" + } + }, + "de": { + "Ablagerung": { + "cefr_source": "B2" + } + }, + "fr": { + "sédiment": { + "cefr_source": "B2" + }, + "dépôt": { + "cefr_source": "B1" + } + }, + "es": { + "depósito": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i45550", + "pos": "noun", + "translations": { + "en": [ + "conch" + ], + "fr": [ + "conque" + ] + }, + "glosses": { + "en": [ + "any of various edible tropical marine gastropods of the genus Strombus having a brightly-colored spiral shell with large outer lip" + ] + }, + "examples": { + "fr": [ + { + "text": "On entend la mer dans une conque.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "conch": { + "cefr_source": "B1" + } + }, + "fr": { + "conque": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i117521", + "pos": "noun", + "translations": { + "en": [ + "moratorium" + ], + "it": [ + "moratoria" + ], + "fr": [ + "moratoire" + ] + }, + "glosses": { + "en": [ + "a legally authorized postponement before some obligation must be discharged" + ] + }, + "examples": { + "it": [ + { + "text": "Il governo ha imposto una moratoria sui nuovi progetti edilizi.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Le gouvernement a décrété un moratoire sur la pêche.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "moratorium": { + "cefr_source": "C1" + } + }, + "it": { + "moratoria": { + 
"cefr_source": "C1" + } + }, + "fr": { + "moratoire": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i31764", + "pos": "verb", + "translations": { + "en": [ + "return" + ], + "fr": [ + "rendre", + "retourner", + "revenir" + ] + }, + "glosses": { + "en": [ + "return to a previous position; in mathematics" + ] + }, + "examples": { + "en": [ + { + "text": "The point returned to the interior of the figure", + "source": "omw" + } + ], + "fr": [ + { + "text": "Il doit rendre les livres à la bibliothèque.", + "source": "cefr" + }, + { + "text": "Je dois retourner ce livre à la bibliothèque.", + "source": "cefr" + }, + { + "text": "Je dois revenir demain.", + "source": "cefr" + } + ] + }, + "votes": { + "fr": { + "rendre": { + "cefr_source": "A2" + }, + "retourner": { + "cefr_source": "A2" + }, + "revenir": { + "cefr_source": "A1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i48149", + "pos": "noun", + "translations": { + "en": [ + "post horse", + "post-horse", + "poster" + ], + "it": [ + "cavallo di posta" + ], + "fr": [ + "affiche" + ] + }, + "glosses": { + "en": [ + "a horse kept at an inn or post house for use by mail carriers or for rent to travelers" + ] + }, + "examples": { + "fr": [ + { + "text": "L'affiche du concert est très colorée.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "poster": { + "cefr_source": "A2" + } + }, + "fr": { + "affiche": { + "cefr_source": "A2" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i51126", + "pos": "noun", + "translations": { + "en": [ + "brickwork" + ], + "it": [ + "GAP!", + "muratura in mattoni" + ], + "es": [ + "aparejo", + "calicanto", + "enladrillado", + "mampostería" + ], + "fr": [ + "appareil" + ] + }, + "glosses": { + "en": [ + "masonry done with bricks and mortar" + ] + }, + "examples": { + "fr": [ + { + "text": "J'ai acheté un nouvel appareil photo.", + 
"source": "cefr" + } + ] + }, + "votes": { + "en": { + "brickwork": { + "cefr_source": "B2" + } + }, + "fr": { + "appareil": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i17542", + "pos": "adjective", + "translations": { + "en": [ + "interdisciplinary" + ], + "it": [ + "interdisciplinare", + "multidisciplinare" + ], + "de": [ + "multidisziplinär", + "fachübergreifend", + "interdisziplinär", + "fächerübergreifend" + ], + "fr": [ + "interdisciplinaire" + ] + }, + "glosses": { + "en": [ + "drawing from or characterized by participation of two or more fields of study" + ], + "de": [ + "die Zusammenarbeit mehrerer Disziplinen betreffend\">" + ] + }, + "examples": { + "en": [ + { + "text": "interdisciplinary studies", + "source": "omw" + }, + { + "text": "an interdisciplinary conference", + "source": "omw" + } + ], + "it": [ + { + "text": "Il progetto richiede un approccio interdisciplinare.", + "source": "cefr" + } + ], + "de": [ + { + "text": "Das Projekt ist interdisziplinär angelegt.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Ce projet de recherche est résolument interdisciplinaire.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "interdisciplinary": { + "cefr_source": "C1" + } + }, + "it": { + "interdisciplinare": { + "cefr_source": "C1" + } + }, + "de": { + "interdisziplinär": { + "cefr_source": "C1" + } + }, + "fr": { + "interdisciplinaire": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i69459", + "pos": "noun", + "translations": { + "en": [ + "new edition" + ], + "it": [ + "riedizione" + ], + "fr": [ + "new edition" + ] + }, + "glosses": { + "en": [ + "a publication (such as a book) that has been modified or updated and offered again for sale" + ] + }, + "examples": { + "it": [ + { + "text": "Il libro è stato pubblicato in una nuova riedizione.", + "source": "cefr" + } + ] + }, + "votes": { + "it": { + 
"riedizione": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i75841", + "pos": "noun", + "translations": { + "en": [ + "stampede" + ], + "de": [ + "Stampede", + "Herdenpanik" + ], + "fr": [ + "débandade" + ] + }, + "glosses": { + "en": [ + "a wild headlong rush of frightened animals (horses or cattle)" + ], + "de": [ + "eine wilde, kopfüber laufende Flucht von verängstigten Tieren (Pferden oder Rindern)" + ] + }, + "examples": { + "fr": [ + { + "text": "Après l'explosion, ce fut la débandade générale.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "stampede": { + "cefr_source": "B2" + } + }, + "fr": { + "débandade": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i67108", + "pos": "noun", + "translations": { + "en": [ + "stocktaking", + "stock-taking" + ], + "it": [ + "inventario" + ], + "es": [ + "balance" + ] + }, + "glosses": { + "en": [ + "reappraisal of a situation or position or outlook" + ] + }, + "examples": { + "it": [ + { + "text": "Dobbiamo fare l'inventario del magazzino.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Es importante mantener un balance entre trabajo y vida personal.", + "source": "cefr" + } + ] + }, + "votes": { + "it": { + "inventario": { + "cefr_source": "B2" + } + }, + "es": { + "balance": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i14270", + "pos": "adjective", + "translations": { + "en": [ + "cockamamie", + "cockamamy", + "goofy", + "sappy", + "silly", + "wacky", + "whacky", + "zany" + ], + "es": [ + "tonto" + ], + "de": [ + "albern", + "naiv", + "unreif", + "kindsköpfig", + "kindlich", + "kindisch", + "unentwickelt", + "kindhaft", + "pueril", + "infantil", + "puerilistisch" + ], + "fr": [ + "déraisonnable", + "fou", + "drôle", + "aberrant" + ] + }, + "glosses": { + "en": [ + "ludicrous, foolish" + ], + "de": [ + 
"lächerlich, töricht" + ] + }, + "examples": { + "en": [ + { + "text": "gave me a cockamamie reason for not going", + "source": "omw" + }, + { + "text": "wore a goofy hat", + "source": "omw" + }, + { + "text": "a silly idea", + "source": "omw" + }, + { + "text": "some wacky plan for selling more books", + "source": "omw" + } + ], + "de": [ + { + "text": "Hör auf, so albern zu sein!", + "source": "cefr" + }, + { + "text": "Sie ist manchmal etwas naiv.", + "source": "cefr" + }, + { + "text": "Die Früchte sind noch unreif.", + "source": "cefr" + }, + { + "text": "Sie hat eine sehr kindliche Freude.", + "source": "cefr" + }, + { + "text": "Sein Verhalten war ziemlich kindisch.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Ses exigences sont déraisonnables.", + "source": "cefr" + }, + { + "text": "C'est une idée folle.", + "source": "cefr" + }, + { + "text": "C'est une histoire drôle.", + "source": "cefr" + }, + { + "text": "Son comportement était aberrant et choquant.", + "source": "cefr" + } + ], + "es": [ + { + "text": "No seas tonto, eso no es verdad.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "goofy": { + "cefr_source": "B1" + }, + "sappy": { + "cefr_source": "B2" + }, + "silly": { + "cefr_source": "A2" + }, + "wacky": { + "cefr_source": "B2" + }, + "zany": { + "cefr_source": "B2" + } + }, + "de": { + "albern": { + "cefr_source": "B1" + }, + "naiv": { + "cefr_source": "B1" + }, + "unreif": { + "cefr_source": "B1" + }, + "kindlich": { + "cefr_source": "B1" + }, + "kindisch": { + "cefr_source": "B1" + } + }, + "fr": { + "déraisonnable": { + "cefr_source": "B2" + }, + "fou": { + "cefr_source": "B1" + }, + "drôle": { + "cefr_source": "A2" + }, + "aberrant": { + "cefr_source": "C1" + } + }, + "es": { + "tonto": { + "cefr_source": "A2" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i1291", + "pos": "adjective", + "translations": { + "en": [ + "unifacial" + ] + }, + "glosses": { + "en": [ + "having but one 
principal or specialized surface" + ] + }, + "examples": { + "en": [ + { + "text": "a primitive unifacial flint tool", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i73668", + "pos": "noun", + "translations": { + "en": [ + "cantata", + "oratorio" + ], + "it": [ + "cantata", + "oratorio" + ], + "es": [ + "oratorio" + ], + "de": [ + "Andachtsraum", + "Oratorium", + "Gebetsraum" + ], + "fr": [ + "oratorio", + "cantate" + ] + }, + "glosses": { + "en": [ + "a musical composition for voices and orchestra based on a religious text" + ], + "de": [ + "eine musikalische Komposition für Stimmen und Orchester auf der Grundlage eines religiösen Textes" + ] + }, + "examples": { + "it": [ + { + "text": "I bambini giocano nell'oratorio della chiesa.", + "source": "cefr" + } + ], + "de": [ + { + "text": "Händels \"Messiah\" ist ein berühmtes Oratorium.", + "source": "cefr" + } + ], + "es": [ + { + "text": "El oratorio de la iglesia es un lugar de paz y reflexión.", + "source": "cefr" + } + ] + }, + "votes": { + "it": { + "oratorio": { + "cefr_source": "B1" + } + }, + "de": { + "Oratorium": { + "cefr_source": "C1" + } + }, + "es": { + "oratorio": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i39774", + "pos": "noun", + "translations": { + "en": [ + "respiration" + ], + "es": [ + "respiración" + ] + }, + "glosses": { + "en": [ + "a single complete act of breathing in and out" + ] + }, + "examples": { + "en": [ + { + "text": "thirty respirations per minute", + "source": "omw" + } + ], + "es": [ + { + "text": "Su respiración era lenta y profunda.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "respiration": { + "cefr_source": "B2" + } + }, + "es": { + "respiración": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i28838", + "pos": "verb", + "translations": { + "en": [ + 
"unplug", + "disconnect" + ], + "fr": [ + "débrancher" + ] + }, + "glosses": { + "en": [ + "pull the plug of (electrical appliances) and render inoperable" + ] + }, + "examples": { + "en": [ + { + "text": "unplug the hair dryer after using it", + "source": "omw" + } + ], + "fr": [ + { + "text": "N'oubliez pas de débrancher l'appareil après utilisation.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "unplug": { + "cefr_source": "A2" + } + }, + "fr": { + "débrancher": { + "cefr_source": "A2" + } + } + }, + "_sample_bucket": "has_glosses_and_examples" + }, + { + "source_id": "ili:i85884", + "pos": "noun", + "translations": { + "en": [ + "North Sea" + ], + "es": [ + "Mar del Norte" + ], + "de": [ + "Nordsee", + "Deutsches Meer" + ], + "fr": [ + "mer du Nord", + "Mer du Nord" + ] + }, + "glosses": { + "en": [ + "an arm of the North Atlantic between the British Isles and Scandinavia; oil was discovered under the North Sea in 1970" + ], + "de": [ + "ein Arm des Nordatlantiks zwischen den Britischen Inseln und Skandinavien; 1970 wurde unter der Nordsee Öl entdeckt" + ] + }, + "examples": { + "de": [ + { + "text": "Wir fahren im Sommer an die Nordsee.", + "source": "cefr" + } + ] + }, + "votes": { + "de": { + "Nordsee": { + "cefr_source": "A2" + } + } + }, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i57058", + "pos": "noun", + "translations": { + "en": [ + "patriarchal cross" + ], + "es": [ + "cruz patriarcal" + ], + "de": [ + "Erzbischofskreuz", + "Spanisches Kreuz", + "Ungarisches Kreuz", + "Patriarchenkreuz", + "Patriarchenhochkreuz" + ] + }, + "glosses": { + "en": [ + "a cross with two crossbars" + ], + "de": [ + "ein Kreuz mit zwei Querbalken" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i14067", + "pos": "adjective", + "translations": { + "en": [ + "maximizing", + "maximising" + ], + "fr": [ + "maximaliste" + ] + }, + "glosses": { + "en": [ + "making as 
great as possible" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i57206", + "pos": "noun", + "translations": { + "en": [ + "photocathode" + ], + "es": [ + "fotocátodo" + ], + "de": [ + "Photokathode", + "Fotokathode" + ], + "fr": [ + "photocathode" + ] + }, + "glosses": { + "en": [ + "a cathode that emits electrons when illuminated" + ], + "de": [ + "eine Kathode, die bei Beleuchtung Elektronen abgibt" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i97025", + "pos": "noun", + "translations": { + "en": [ + "Stockton", + "Frank Stockton", + "Francis Richard Stockton" + ], + "es": [ + "Francis Richard Stockton", + "Frank Stockton", + "Stockton" + ], + "fr": [ + "Stockton" + ] + }, + "glosses": { + "en": [ + "United States writer (1834-1902)" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i101248", + "pos": "noun", + "translations": { + "en": [ + "obeche" + ] + }, + "glosses": { + "en": [ + "the wood of an African obeche tree; used especially for veneering" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i94985", + "pos": "noun", + "translations": { + "en": [ + "Eames", + "Charles Eames" + ], + "es": [ + "Charles Eames" + ] + }, + "glosses": { + "en": [ + "United States designer noted for an innovative series of chairs (1907-1978)" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i16699", + "pos": "adjective", + "translations": { + "en": [ + "mensural", + "measured", + "mensurable" + ], + "es": [ + "mensural" + ] + }, + "glosses": { + "en": [ + "having notes of fixed rhythmic value" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i99999", + "pos": "noun", + 
"translations": { + "en": [ + "China aster", + "Callistephus chinensis" + ], + "fr": [ + "callistephus chinensis" + ] + }, + "glosses": { + "en": [ + "valued for their beautiful flowers in a wide range of clear bright colors; grown primarily for cutting" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i75135", + "pos": "noun", + "translations": { + "en": [ + "kiss of death" + ], + "fr": [ + "baiser de la mort" + ] + }, + "glosses": { + "en": [ + "something that is ruinous" + ] + }, + "examples": { + "en": [ + { + "text": "if this were known it would be the kiss of death for my political career", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i36428", + "pos": "noun", + "translations": { + "en": [ + "dark adaptation" + ] + }, + "glosses": { + "en": [ + "the process of adjusting the eyes to low levels of illumination; cones adapt first; rods continue to adapt for up to four hours" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i103092", + "pos": "noun", + "translations": { + "en": [ + "saw palmetto", + "scrub palmetto", + "Serenoa repens" + ] + }, + "glosses": { + "en": [ + "small hardy clump-forming spiny palm of southern United States" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i14834", + "pos": "adjective", + "translations": { + "en": [ + "zoic" + ] + }, + "glosses": { + "en": [ + "pertaining to animals or animal life or action" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i25953", + "pos": "verb", + "translations": { + "en": [ + "blog" + ], + "es": [ + "blogear" + ] + }, + "glosses": { + "en": [ + "read, write, or edit a shared on-line journal" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": 
"no_glosses_no_examples" + }, + { + "source_id": "ili:i24441", + "pos": "verb", + "translations": { + "en": [ + "ream" + ], + "es": [ + "taladrar" + ] + }, + "glosses": { + "en": [ + "enlarge with a reamer" + ] + }, + "examples": { + "en": [ + { + "text": "ream a hole", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i60874", + "pos": "noun", + "translations": { + "en": [ + "virtual memory", + "virtual storage" + ], + "it": [ + "memoria virtuale" + ], + "es": [ + "memoria virtual" + ], + "fr": [ + "mémoire virtuelle" + ] + }, + "glosses": { + "en": [ + "(computer science) memory created by using the hard disk to simulate additional random-access memory; the addressable storage space available to the user of a computer system in which virtual addresses are mapped into real addresses" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i105979", + "pos": "noun", + "translations": { + "en": [ + "Dryopteris", + "genus Dryopteris" + ], + "fr": [ + "Dryopteris", + "dryopteris" + ] + }, + "glosses": { + "en": [ + "large widespread genus of medium-sized terrestrial ferns; in some classification systems placed in Polypodiaceae" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i44411", + "pos": "noun", + "translations": { + "en": [ + "blue racer", + "Coluber constrictor flaviventris" + ], + "fr": [ + "coluber constrictor" + ] + }, + "glosses": { + "en": [ + "bluish-green blacksnake found from Ohio to Texas" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i14592", + "pos": "adjective", + "translations": { + "en": [ + "anagrammatic", + "anagrammatical" + ], + "it": [ + "anagrammatico" + ] + }, + "glosses": { + "en": [ + "related to anagrams or containing or making an anagram" + ] + }, + "examples": {}, + 
"votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i5174", + "pos": "adjective", + "translations": { + "en": [ + "protrusile", + "protrusible" + ], + "fr": [ + "protrusible" + ] + }, + "glosses": { + "en": [ + "capable of being thrust forward, as the tongue" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "no_glosses_no_examples" + }, + { + "source_id": "ili:i99278", + "pos": "noun", + "translations": { + "en": [ + "pink calla", + "Zantedeschia rehmanii" + ] + }, + "glosses": { + "en": [ + "calla having a rose-colored spathe" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i97983", + "pos": "noun", + "translations": { + "en": [ + "phosphorescence" + ], + "it": [ + "fosforescenza", + "fotoluminescenza" + ], + "fr": [ + "phosphorescence" + ] + }, + "glosses": { + "en": [ + "a fluorescence that persists after the bombarding radiation has ceased" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i54194", + "pos": "noun", + "translations": { + "en": [ + "garrison cap", + "overseas cap" + ] + }, + "glosses": { + "en": [ + "a wedge-shaped wool or cotton cap; worn as part of a uniform" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i102972", + "pos": "noun", + "translations": { + "en": [ + "Tipuana", + "genus Tipuana" + ], + "fr": [ + "tipuana" + ] + }, + "glosses": { + "en": [ + "one species: South American tree: tipu tree" + ] + }, + "examples": {}, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i55386", + "pos": "noun", + "translations": { + "en": [ + "king" + ], + "fr": [ + "roi" + ] + }, + "glosses": { + "en": [ + "a checker that has been moved to the opponent's first row where it is promoted to a piece that is free to move either forward or backward" + ] + }, + "examples": { + "fr": [ + { + "text": "Le roi a visité la 
ville.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "king": { + "cefr_source": "A2" + } + }, + "fr": { + "roi": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i26482", + "pos": "verb", + "translations": { + "en": [ + "articulate", + "enunciate", + "vocalize", + "vocalise" + ], + "it": [ + "articolare", + "enunciare", + "enunziare", + "scandire" + ], + "es": [ + "articular" + ], + "de": [ + "ausdrücken", + "artikulieren" + ], + "fr": [ + "articuler", + "exprimer", + "énoncer", + "formuler", + "vocaliser" + ] + }, + "glosses": { + "en": [ + "express or state clearly" + ], + "de": [ + "klar ausdrücken oder erklären" + ] + }, + "examples": { + "it": [ + { + "text": "È importante articolare bene le parole.", + "source": "cefr" + } + ], + "de": [ + { + "text": "Er konnte seine Gefühle nicht ausdrücken.", + "source": "cefr" + }, + { + "text": "Er konnte seine Gedanken nicht klar artikulieren.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Il faut bien articuler pour être compris.", + "source": "cefr" + }, + { + "text": "Il est difficile d'exprimer ses sentiments.", + "source": "cefr" + }, + { + "text": "Le professeur a énoncé les règles clairement.", + "source": "cefr" + }, + { + "text": "Il a formulé une question très pertinente.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Es importante articular bien las palabras al hablar en público.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "articulate": { + "cefr_source": "B2" + } + }, + "it": { + "articolare": { + "cefr_source": "B2" + } + }, + "de": { + "ausdrücken": { + "cefr_source": "B1" + }, + "artikulieren": { + "cefr_source": "B2" + } + }, + "fr": { + "articuler": { + "cefr_source": "B1" + }, + "exprimer": { + "cefr_source": "B1" + }, + "énoncer": { + "cefr_source": "B2" + }, + "formuler": { + "cefr_source": "B2" + } + }, + "es": { + "articular": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + 
"source_id": "ili:i22492", + "pos": "verb", + "translations": { + "en": [ + "spike" + ] + }, + "glosses": { + "en": [ + "manifest a sharp increase" + ] + }, + "examples": { + "en": [ + { + "text": "the voltage spiked", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i26383", + "pos": "verb", + "translations": { + "en": [ + "redefine" + ], + "fr": [ + "redéfinir" + ] + }, + "glosses": { + "en": [ + "give a new or different definition of (a word)" + ] + }, + "examples": { + "fr": [ + { + "text": "Il est temps de redéfinir nos objectifs.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "redefine": { + "cefr_source": "B2" + } + }, + "fr": { + "redéfinir": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i22943", + "pos": "verb", + "translations": { + "en": [ + "slake", + "abate", + "slack" + ], + "es": [ + "aflojar", + "reducir" + ], + "fr": [ + "descendre", + "cesser", + "réduire", + "ralentir", + "amoindrir", + "diminuer", + "supprimer" + ] + }, + "glosses": { + "en": [ + "make less active or intense" + ] + }, + "examples": { + "fr": [ + { + "text": "Nous allons descendre au rez-de-chaussée.", + "source": "cefr" + }, + { + "text": "La pluie a cessé de tomber.", + "source": "cefr" + }, + { + "text": "Nous devons réduire nos dépenses.", + "source": "cefr" + }, + { + "text": "Il faut ralentir avant le virage.", + "source": "cefr" + }, + { + "text": "Ces mesures visent à amoindrir l'impact de la crise.", + "source": "cefr" + }, + { + "text": "Les prix ont commencé à diminuer.", + "source": "cefr" + }, + { + "text": "Il faut supprimer les fichiers inutiles.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Tienes que aflojar el nudo.", + "source": "cefr" + }, + { + "text": "Necesitamos reducir el consumo de energía.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "abate": { + "cefr_source": "C1" + } + }, + "fr": { + "descendre": { + 
"cefr_source": "A2" + }, + "cesser": { + "cefr_source": "B1" + }, + "réduire": { + "cefr_source": "B1" + }, + "ralentir": { + "cefr_source": "B1" + }, + "amoindrir": { + "cefr_source": "C1" + }, + "diminuer": { + "cefr_source": "B1" + }, + "supprimer": { + "cefr_source": "B2" + } + }, + "es": { + "aflojar": { + "cefr_source": "B1" + }, + "reducir": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i31348", + "pos": "verb", + "translations": { + "en": [ + "romp" + ] + }, + "glosses": { + "en": [ + "run easily and fairly fast" + ] + }, + "examples": {}, + "votes": { + "en": { + "romp": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i10413", + "pos": "adjective", + "translations": { + "en": [ + "imprudent" + ], + "it": [ + "imprudente", + "incauto" + ], + "es": [ + "imprudente", + "insensato" + ], + "fr": [ + "imprudent" + ] + }, + "glosses": { + "en": [ + "not prudent or wise" + ] + }, + "examples": { + "en": [ + { + "text": "very imprudent of her mother to encourage her in such silly romantic ideas", + "source": "omw" + }, + { + "text": "\"would be imprudent for a noneconomist to talk about the details of economic policy\"- A.M.Schlesinger", + "source": "omw" + } + ], + "it": [ + { + "text": "È stato imprudente guidare così velocemente.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "C'était imprudent de traverser sans regarder.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Fue una decisión imprudente conducir tan rápido.", + "source": "cefr" + }, + { + "text": "Fue una decisión insensata.", + "source": "cefr" + } + ] + }, + "votes": { + "it": { + "imprudente": { + "cefr_source": "B2" + } + }, + "fr": { + "imprudent": { + "cefr_source": "B2" + } + }, + "es": { + "imprudente": { + "cefr_source": "B2" + }, + "insensato": { + "cefr_source": "B2" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i8645", + "pos": "adjective", + 
"translations": { + "en": [ + "metaphysical" + ], + "es": [ + "metafísico" + ], + "fr": [ + "métaphysique" + ] + }, + "glosses": { + "en": [ + "without material form or substance" + ] + }, + "examples": { + "en": [ + { + "text": "metaphysical forces", + "source": "omw" + } + ] + }, + "votes": { + "en": { + "metaphysical": { + "cefr_source": "C1" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i6969", + "pos": "adjective", + "translations": { + "en": [ + "all-important", + "all important", + "crucial", + "essential", + "of the essence" + ], + "it": [ + "essenziale" + ], + "es": [ + "crucial", + "esencial" + ], + "de": [ + "bedeutsam", + "wesentlich", + "wichtig", + "prägnant", + "hauptsächlich", + "gehaltvoll", + "aussagekräftig", + "signifikant" + ], + "fr": [ + "essentiel" + ] + }, + "glosses": { + "en": [ + "of the greatest importance" + ], + "de": [ + "von allergrößter Bedeutung" + ] + }, + "examples": { + "en": [ + { + "text": "the all-important subject of disarmament", + "source": "omw" + }, + { + "text": "crucial information", + "source": "omw" + }, + { + "text": "in chess cool nerves are of the essence", + "source": "omw" + } + ], + "it": [ + { + "text": "L'acqua è essenziale per la vita.", + "source": "cefr" + } + ], + "de": [ + { + "text": "Das war ein bedeutsamer Moment in der Geschichte.", + "source": "cefr" + }, + { + "text": "Das ist ein wesentlicher Unterschied.", + "source": "cefr" + }, + { + "text": "Das ist eine wichtige Information.", + "source": "cefr" + }, + { + "text": "Er formulierte seine Gedanken sehr prägnant.", + "source": "cefr" + }, + { + "text": "Die Studie lieferte aussagekräftige Ergebnisse.", + "source": "cefr" + }, + { + "text": "Es gab eine signifikante Veränderung.", + "source": "cefr" + } + ], + "fr": [ + { + "text": "C'est essentiel de bien manger pour rester en forme.", + "source": "cefr" + } + ], + "es": [ + { + "text": "Es crucial que lleguemos a tiempo.", + "source": "cefr" + }, + { + "text": "El 
agua es esencial para la vida.", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "crucial": { + "cefr_source": "B2" + }, + "essential": { + "cefr_source": "B1" + } + }, + "it": { + "essenziale": { + "cefr_source": "B1" + } + }, + "de": { + "bedeutsam": { + "cefr_source": "B2" + }, + "wesentlich": { + "cefr_source": "B1" + }, + "wichtig": { + "cefr_source": "A1" + }, + "prägnant": { + "cefr_source": "B2" + }, + "aussagekräftig": { + "cefr_source": "B2" + }, + "signifikant": { + "cefr_source": "C1" + } + }, + "fr": { + "essentiel": { + "cefr_source": "B1" + } + }, + "es": { + "crucial": { + "cefr_source": "B2" + }, + "esencial": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i13690", + "pos": "adjective", + "translations": { + "en": [ + "round-arm" + ] + }, + "glosses": { + "en": [ + "with the arm swung round at shoulder height" + ] + }, + "examples": { + "en": [ + { + "text": "round-arm bowling", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i16993", + "pos": "adjective", + "translations": { + "en": [ + "Monacan", + "Monegasque" + ], + "it": [ + "monegasco" + ], + "fr": [ + "monégasque" + ] + }, + "glosses": { + "en": [ + "of or relating to or characteristic of Monaco or its people" + ] + }, + "examples": { + "fr": [ + { + "text": "Il est de nationalité monégasque.", + "source": "cefr" + } + ] + }, + "votes": { + "fr": { + "monégasque": { + "cefr_source": "B1" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i18824", + "pos": "adverb", + "translations": { + "en": [ + "here", + "hither" + ], + "it": [ + "qua", + "qui" + ], + "fr": [ + "ici", + "çà", + "par ici" + ] + }, + "glosses": { + "en": [ + "to this place (especially toward the speaker)" + ] + }, + "examples": { + "en": [ + { + "text": "come here, please", + "source": "omw" + } + ], + "it": [ + { + "text": "Vieni qua, per favore.", + "source": "cefr" + }, + { + 
"text": "Vieni qui!", + "source": "cefr" + } + ], + "fr": [ + { + "text": "Venez ici !", + "source": "cefr" + } + ] + }, + "votes": { + "en": { + "here": { + "cefr_source": "A1" + }, + "hither": { + "cefr_source": "C2" + } + }, + "it": { + "qua": { + "cefr_source": "A1" + }, + "qui": { + "cefr_source": "A1" + } + }, + "fr": { + "ici": { + "cefr_source": "A1" + } + } + }, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i19641", + "pos": "adverb", + "translations": { + "en": [ + "head-on" + ], + "es": [ + "de frente" + ] + }, + "glosses": { + "en": [ + "with the front foremost" + ] + }, + "examples": { + "en": [ + { + "text": "the cars collided head-on", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i21417", + "pos": "adverb", + "translations": { + "en": [ + "sweepingly" + ] + }, + "glosses": { + "en": [ + "in a sweeping manner" + ] + }, + "examples": { + "en": [ + { + "text": "he sweepingly condemned the entire population of the country for the war crimes", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i20131", + "pos": "adverb", + "translations": { + "en": [ + "gallantly", + "chivalrously" + ], + "it": [ + "galantemente" + ], + "fr": [ + "chevaleresquement" + ] + }, + "glosses": { + "en": [ + "in a gallant manner" + ] + }, + "examples": { + "en": [ + { + "text": "he gallantly offered to take her home", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "pos_spread" + }, + { + "source_id": "ili:i20516", + "pos": "adverb", + "translations": { + "en": [ + "fractiously" + ] + }, + "glosses": { + "en": [ + "in a fractious manner" + ] + }, + "examples": { + "en": [ + { + "text": "the horse was behaving fractiously and refused to jump", + "source": "omw" + } + ] + }, + "votes": {}, + "_sample_bucket": "pos_spread" + } +] \ No newline at end of file diff --git a/data-pipeline/test/scripts/sample.ts 
b/data-pipeline/test/scripts/sample.ts new file mode 100644 index 0000000..63ead71 --- /dev/null +++ b/data-pipeline/test/scripts/sample.ts @@ -0,0 +1,205 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared"; +import type { SupportedLanguageCode, SupportedPos } from "@lila/shared"; + +// ── Types ───────────────────────────────────────────────────────────────────── + +type Example = { text: string; source: "omw" | "cefr" }; + +type AnnotatedRecord = { + source_id: string; + pos: SupportedPos; + translations: Partial>; + glosses: Partial>; + examples: Partial>; + votes: Partial< + Record> + >; +}; + +type SampleRecord = AnnotatedRecord & { _sample_bucket: string }; + +// ── Constants ───────────────────────────────────────────────────────────────── + +const PATHS = { + annotatedDir: "stage-2-annotate/output", + output: "test/output/sample.json", +}; + +const BUCKET_SIZE = 20; + +// ── Bucket predicates ───────────────────────────────────────────────────────── + +type Bucket = { name: string; predicate: (record: AnnotatedRecord) => boolean }; + +const BUCKETS: Bucket[] = [ + { + name: "has_cefr_vote", + predicate: (r) => + Object.values(r.votes).some( + (langVotes) => Object.keys(langVotes ?? {}).length > 0, + ), + }, + { + name: "no_cefr_vote", + predicate: (r) => + Object.values(r.votes).every( + (langVotes) => Object.keys(langVotes ?? 
{}).length === 0, + ), + }, + { + name: "has_glosses_and_examples", + predicate: (r) => + Object.keys(r.glosses).length > 0 && Object.keys(r.examples).length > 0, + }, + { + name: "no_glosses_no_examples", + predicate: (r) => + !r.glosses["fr"] && + !r.examples["fr"] && + !r.votes["fr"] && + !r.glosses["es"] && + !r.examples["es"] && + !r.votes["es"], + }, + { + name: "pos_spread", + predicate: () => true, // sampled separately to ensure POS coverage + }, +]; + +// ── Sampling ────────────────────────────────────────────────────────────────── + +function sampleBucket( + records: AnnotatedRecord[], + predicate: (r: AnnotatedRecord) => boolean, + size: number, + exclude: Set, +): AnnotatedRecord[] { + const candidates = records.filter( + (r) => !exclude.has(r.source_id) && predicate(r), + ); + + // Shuffle for random sampling + for (let i = candidates.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + [candidates[i], candidates[j]] = [candidates[j]!, candidates[i]!]; + } + + return candidates.slice(0, size); +} + +function samplePosBucket( + records: AnnotatedRecord[], + exclude: Set, +): AnnotatedRecord[] { + const posList: SupportedPos[] = ["noun", "verb", "adjective", "adverb"]; + const perPos = Math.floor(BUCKET_SIZE / posList.length); + const result: AnnotatedRecord[] = []; + + for (const pos of posList) { + const sampled = sampleBucket( + records, + (r) => r.pos === pos, + perPos, + exclude, + ); + result.push(...sampled); + } + + return result; +} + +// ── Loading ─────────────────────────────────────────────────────────────────── + +async function loadAnnotated(): Promise { + // Load all language files and merge votes into a single record set. + // Use en.json as the base record structure since it has the most complete + // glosses and examples. Votes from all other languages are merged in. 
+ const baseRaw = await fs.readFile( + path.join(PATHS.annotatedDir, "en.json"), + "utf-8", + ); + const base = JSON.parse(baseRaw) as AnnotatedRecord[]; + + // Build a map for fast lookup by source_id + const byId = new Map(); + for (const record of base) { + byId.set(record.source_id, record); + } + + // Merge votes from remaining language files + for (const lang of SUPPORTED_LANGUAGE_CODES) { + if (lang === "en") continue; + const raw = await fs.readFile( + path.join(PATHS.annotatedDir, `${lang}.json`), + "utf-8", + ); + const records = JSON.parse(raw) as AnnotatedRecord[]; + + for (const record of records) { + const base = byId.get(record.source_id); + if (!base) continue; + + // Merge votes + for (const [l, langVotes] of Object.entries(record.votes)) { + if (!base.votes[l as SupportedLanguageCode]) { + base.votes[l as SupportedLanguageCode] = {}; + } + Object.assign(base.votes[l as SupportedLanguageCode]!, langVotes); + } + + // Merge examples from CEFR source files not in base + for (const [l, examples] of Object.entries(record.examples)) { + const lang = l as SupportedLanguageCode; + if (!base.examples[lang]) { + base.examples[lang] = examples as Example[]; + } + } + } + } + + return [...byId.values()]; +} + +// ── Main ───────────────────────────────────────────────────────────────────── + +async function main(): Promise { + console.log("Loading annotated files..."); + const records = await loadAnnotated(); + console.log(` Loaded ${records.length.toLocaleString()} synsets`); + + const sampled: SampleRecord[] = []; + const seen = new Set(); + + // Sample each bucket except pos_spread + for (const bucket of BUCKETS.filter((b) => b.name !== "pos_spread")) { + const results = sampleBucket(records, bucket.predicate, BUCKET_SIZE, seen); + for (const r of results) { + seen.add(r.source_id); + sampled.push({ ...r, _sample_bucket: bucket.name }); + } + console.log(` ${bucket.name}: ${results.length} records`); + } + + // Sample pos_spread bucket + const posResults = 
samplePosBucket(records, seen); + for (const r of posResults) { + seen.add(r.source_id); + sampled.push({ ...r, _sample_bucket: "pos_spread" }); + } + console.log(` pos_spread: ${posResults.length} records`); + + console.log(`\nTotal sampled: ${sampled.length} records`); + + // Write output + await fs.mkdir(path.dirname(PATHS.output), { recursive: true }); + await fs.writeFile(PATHS.output, JSON.stringify(sampled, null, 2), "utf-8"); + console.log(`Wrote sample → ${PATHS.output}`); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/data-pipeline/tsconfig.json b/data-pipeline/tsconfig.json index 19bf9bb..83c3053 100644 --- a/data-pipeline/tsconfig.json +++ b/data-pipeline/tsconfig.json @@ -4,8 +4,9 @@ "module": "NodeNext", "moduleResolution": "NodeNext", "outDir": "dist", - "rootDir": "scripts", + "rootDir": ".", + "types": ["node"], }, "references": [{ "path": "../packages/shared" }], - "include": ["scripts/**/*"], + "include": ["./**/*"], } diff --git a/documentation/llm-setup.md b/documentation/llm-setup.md new file mode 100644 index 0000000..6cc1f91 --- /dev/null +++ b/documentation/llm-setup.md @@ -0,0 +1,295 @@ +# LLM Setup — lila pipeline + +This document covers the LLM infrastructure for stage 3 (enrich) of the lila +data pipeline. It documents the hardware constraints, supported providers, +model recommendations, and how to configure and swap providers in the test +and production scripts. + +--- + +## Hardware (dev machine) + +| Component | Spec | +|---|---| +| CPU | Intel Core i7-6500U (2 cores / 4 threads @ 3.10 GHz) | +| RAM | 8 GB | +| GPU | NVIDIA GeForce GTX 950M — 4 GB VRAM (Maxwell, CUDA compute 5.0) | +| OS | Debian GNU/Linux 13 (trixie) x86_64 | + +**Local inference verdict:** viable for small/quantized models, not for +production runs. See the [Local inference](#local-inference-llamacpp) section +for details. + +--- + +## Provider overview + +The enrich script uses a single, swappable provider config. 
All providers +except Anthropic expose an OpenAI-compatible API, so the same client code +works across all of them — only `baseURL`, `apiKey`, and `model` change. + +| Provider | Use case | Cost | Rate limits | +|---|---|---|---| +| llama.cpp (local) | Quality testing, overnight dev runs | Free (electricity) | None | +| OpenRouter (free tier) | Quality comparison, multi-model evaluation | Free | 50 req/day, 20 req/min | +| OpenRouter (paid) | Production runs if local quality insufficient | Pay-per-token | None | +| Anthropic API | Quality baseline / reference | Pay-per-token | Standard | + +--- + +## Local inference (llama.cpp) + +### Why local inference is worth testing + +Time is not a constraint — the pipeline scripts are fully resumable. The +laptop can run overnight for multiple nights. The only question is output +quality, which the test script evaluates empirically. + +### Hardware constraints + +The GTX 950M has 4 GB VRAM and Maxwell architecture (CUDA compute 5.0). +llama.cpp supports Maxwell via CUDA backend but newer builds may require +the `--no-kv-offload` flag depending on the version. + +llama.cpp splits model layers between GPU and CPU according to +`--n-gpu-layers`. You set how many layers go on the GPU; the rest run on +CPU/RAM. This means a model larger than VRAM is not a dead end — it runs +in hybrid mode, slower than full-GPU but much faster than pure CPU. + +Practical estimates for this hardware (~3.5 GB VRAM usable after drivers): + +| Model size | Q4 VRAM | Mode | Est. 
speed | +|---|---|---|---| +| 3B | ~2.0 GB | Full GPU | ~15–20 tok/s | +| 4B | ~2.5 GB | Full GPU | ~12–18 tok/s | +| 7B | ~4.5 GB | Hybrid (~26/32 layers on GPU) | ~8–12 tok/s | +| 13B+ | ~8 GB+ | CPU-heavy hybrid | too slow | + +### Recommended local models + +Two candidates worth testing, covering different points on the size/quality +tradeoff: + +**Gemma 4 E4B Instruct (Q4 / UD-Q4_K_XL)** +- GGUF file: `gemma-4-E4B-it-UD-Q4_K_XL.gguf` (~2.5 GB) +- Source: https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF +- Runs fully on GPU. Brand new (April 2025), built for edge hardware, 140+ + language support including all five pipeline languages. First candidate + to test. + +**Qwen2.5 7B Instruct (Q4_K_M)** +- GGUF file: `Qwen2.5-7B-Instruct-Q4_K_M.gguf` (~4.5 GB) +- Source: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF +- Runs in hybrid mode (~26 of 32 layers on GPU, rest on CPU), ~8–12 tok/s. + Stronger multilingual generation than any 3–4B model. Second candidate, + for comparison against the smaller Gemma 4 E4B. 
+ +### Installation + +```bash +# Install build dependencies +sudo apt install build-essential cmake git + +# Clone llama.cpp +git clone https://github.com/ggerganov/llama.cpp +cd llama.cpp + +# Build with CUDA support (GTX 950M — compute 5.0) +cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=50 +cmake --build build --config Release -j$(nproc) + +# Download model (example — adjust path as needed) +mkdir -p models +wget -O models/qwen2.5-3b-instruct-q4_k_m.gguf \ + https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_k_m.gguf +``` + +### Starting the server + +**Gemma 4 E4B** (full GPU): +```bash +./build/bin/llama-server \ + --model models/gemma-4-e4b-it-ud-q4_k_xl.gguf \ + --port 8080 \ + --ctx-size 4096 \ + --n-gpu-layers 999 \ + --host 127.0.0.1 +``` + +**Qwen2.5 7B** (hybrid — tune `--n-gpu-layers` to fit your VRAM): +```bash +./build/bin/llama-server \ + --model models/qwen2.5-7b-instruct-q4_k_m.gguf \ + --port 8080 \ + --ctx-size 4096 \ + --n-gpu-layers 28 \ + --host 127.0.0.1 +``` + +`--n-gpu-layers 999` means "put everything on GPU" — llama.cpp caps at the +actual layer count automatically, so 999 is safe as a "full offload" value. +For the 7B hybrid, start with `28` and reduce by 2 if the server reports +out-of-memory at startup. + +### Verify the server is running + +```bash +curl http://127.0.0.1:8080/health +# Expected: {"status":"ok"} +``` + +--- + +## OpenRouter (free tier) + +OpenRouter exposes all models via an OpenAI-compatible API. No code changes +are needed to switch from local llama.cpp to OpenRouter — only the config +object changes. + +### Rate limits (free tier) + +- **50 requests per day** (account total, not per model) +- 20 requests per minute + +> **Implication for testing:** with a 10-record test set you have headroom +> to test 4–5 models per day. A 100-record test set needs two days per +> model at one request per record. + +> **Implication for production:** the free tier is not viable for 117k +> records. 
If local quality is insufficient, use paid OpenRouter credits or +> a dedicated provider. + +### Free models recommended for this pipeline + +Ranked by expected multilingual generation quality for en/it/de/fr/es: + +| Model ID | Params | Notes | +|---|---|---| +| `qwen/qwen3-coder:free` | 480B MoE (35B active) | Best free option. Strong multilingual despite "coder" label. Use as quality ceiling. | +| `qwen/qwen3-next-80b-a3b-instruct:free` | 80B MoE (3B active) | Smaller Qwen, useful comparison point. | +| `nvidia/nemotron-3-super-120b-a12b:free` | 120B MoE (12B active) | 262K context, supports structured output. | +| `google/gemma-4-31b-it:free` | 31B | 140+ language support, good European language coverage. | +| `zhipuai/glm-4.5-air:free` | MoE | Multilingual-focused. | + +**Skip for this pipeline:** +- Llama models — weaker European language generation than Qwen/Gemma +- Mistral free tier — requests may be used for model training + +### API endpoint + +``` +https://openrouter.ai/api/v1/chat/completions +``` + +Set `Authorization: Bearer $OPENROUTER_API_KEY` in the request headers. + +--- + +## Provider configuration in the test script + +The enrich test script reads a single config object. To switch providers, +change this object and re-run. 
+ +```typescript +// config.ts + +export type ProviderConfig = { + name: string; // used for output folder naming + baseURL: string; + apiKey: string; + model: string; + maxTokens: number; +}; + +// Local llama.cpp +export const LOCAL_QWEN3B: ProviderConfig = { + name: "local-qwen2.5-3b", + baseURL: "http://127.0.0.1:8080/v1", + apiKey: "none", // llama.cpp ignores this + model: "qwen2.5-3b", // llama.cpp ignores model name, uses loaded model + maxTokens: 512, +}; + +// OpenRouter — Qwen3 480B (free) +export const OR_QWEN3_480B: ProviderConfig = { + name: "or-qwen3-480b", + baseURL: "https://openrouter.ai/api/v1", + apiKey: process.env.OPENROUTER_API_KEY!, + model: "qwen/qwen3-coder:free", + maxTokens: 512, +}; + +// OpenRouter — Gemma 4 31B (free) +export const OR_GEMMA4_31B: ProviderConfig = { + name: "or-gemma4-31b", + baseURL: "https://openrouter.ai/api/v1", + apiKey: process.env.OPENROUTER_API_KEY!, + model: "google/gemma-4-31b-it:free", + maxTokens: 512, +}; + +// Anthropic (reference baseline — different adapter required) +export const ANTHROPIC_SONNET: ProviderConfig = { + name: "anthropic-sonnet", + baseURL: "https://api.anthropic.com/v1", // adapter handles format difference + apiKey: process.env.ANTHROPIC_API_KEY!, + model: "claude-sonnet-4-6", + maxTokens: 512, +}; +``` + +Output from each run lands in: +``` +stage-3-enrich/test/output/{provider.name}/results.json +stage-3-enrich/test/output/{provider.name}/metrics.json +``` + +The evaluate script compares all `metrics.json` files side by side. + +--- + +## Evaluation metrics + +The test script measures the following per provider run: + +| Metric | What it measures | +|---|---| +| **JSON parse rate** | % of responses that are valid, schema-compliant JSON. Critical — a failed parse is a wasted call. 
Target: >97% | +| **Field coverage** | % of records where all required fields are present (cefr votes for all translations, descriptions for all languages, glosses/examples for fr/es) | +| **CEFR agreement** | For records that have a `cefr_source` vote, % where the model agrees. Measures calibration. | +| **Language correctness** | Manual spot-check only — automated detection not reliable enough | +| **Tokens/second** | Local only. Indicates overnight run feasibility | + +### Decision thresholds + +| Metric | Threshold | Action if below | +|---|---|---| +| JSON parse rate | < 97% | Do not use this model for production | +| Field coverage | < 95% | Prompt needs revision before production | +| CEFR agreement | < 70% | Model lacks vocabulary knowledge for this task | + +--- + +## Recommended test sequence + +1. **Start local, minimal dataset (5–10 records)** + Install llama.cpp, run Qwen2.5 3B against 5–10 hand-picked records. + Verify the server works, the output parses, and the model produces + something reasonable. This is purely a smoke test. + +2. **Expand local to full 100-record sample** + Once the pipeline is confirmed working, run all 100 records locally. + Collect metrics. This is your local quality baseline. + +3. **Run the same 100 records through OpenRouter free models** + Two days per model at one request per record (50 req/day limit). Start + with `qwen/qwen3-coder:free` as the quality ceiling. + +4. **Compare metrics side by side** + If local 3B is within acceptable range of the cloud models on CEFR + agreement and field coverage, proceed with local overnight runs for + production. If not, use the cloud model that passed. + +5. **Production run** + Full 117k records. Resume-safe — the script checkpoints after each + record so overnight runs can be stopped and continued.