feat: add db schema, init, and vitest config
This commit is contained in:
parent
74cfc82bdd
commit
4fa3073412
13 changed files with 248 additions and 8 deletions
2472
data-pipeline/sample/output/sample.json
Normal file
2472
data-pipeline/sample/output/sample.json
Normal file
File diff suppressed because it is too large
Load diff
205
data-pipeline/sample/scripts/sample.ts
Normal file
205
data-pipeline/sample/scripts/sample.ts
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { SUPPORTED_LANGUAGE_CODES } from "@lila/shared";
|
||||
import type { SupportedLanguageCode, SupportedPos } from "@lila/shared";
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One usage example: its text plus a literal tag ("omw" or "cefr") recording its source. */
type Example = { text: string; source: "omw" | "cefr" };
|
||||
|
||||
/** Maps supported language codes to a value; any language may be absent. */
type PerLanguage<T> = Partial<Record<SupportedLanguageCode, T>>;

/**
 * Shape this script assumes for every entry of an annotated language file.
 * All per-language fields are sparse: a language key may be missing entirely.
 */
type AnnotatedRecord = {
  /** Identifier used to match the same record across language files. */
  source_id: string;
  pos: SupportedPos;
  translations: PerLanguage<string[]>;
  glosses: PerLanguage<string[]>;
  examples: PerLanguage<Example[]>;
  /** Per language: a string-keyed map whose entries each carry a `cefr_source`. */
  votes: PerLanguage<Record<string, { cefr_source: string }>>;
};
|
||||
|
||||
/** An annotated record tagged with the name of the sampling bucket that selected it. */
type SampleRecord = AnnotatedRecord & { _sample_bucket: string };
|
||||
|
||||
// ── Constants ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Filesystem locations used by this script. Both are relative, so they are
 * resolved against the working directory the script is started from.
 */
const PATHS = {
  // Directory holding one `<lang>.json` file per language (read by loadAnnotated).
  annotatedDir: "stage-2-annotate/output",

  // Destination of the generated sample file.
  // NOTE(review): the committed sample lives under `sample/output/` — confirm
  // that `test/output/` is still the intended target.
  output: "test/output/sample.json",
};
|
||||
|
||||
// Maximum number of records drawn per sampling bucket; the pos_spread bucket
// divides this evenly across its parts of speech.
const BUCKET_SIZE = 20;
|
||||
|
||||
// ── Bucket predicates ─────────────────────────────────────────────────────────
|
||||
|
||||
/** A named sampling bucket; `predicate` decides whether a record is eligible for it. */
type Bucket = { name: string; predicate: (record: AnnotatedRecord) => boolean };
|
||||
|
||||
/** True when at least one language in `record.votes` has a non-empty vote map. */
function hasAnyCefrVote(record: AnnotatedRecord): boolean {
  for (const langVotes of Object.values(record.votes)) {
    if (langVotes && Object.keys(langVotes).length > 0) return true;
  }
  return false;
}

// Languages inspected by the "no_glosses_no_examples" bucket.
const SPARSE_BUCKET_LANGS: readonly SupportedLanguageCode[] = ["fr", "es"];

/**
 * Sampling buckets, in the order they are drawn. Records already taken by an
 * earlier bucket are excluded from later ones by the caller, so order matters.
 */
const BUCKETS: Bucket[] = [
  // At least one language has one or more votes.
  { name: "has_cefr_vote", predicate: hasAnyCefrVote },

  // No language has any vote (records with an empty `votes` object qualify).
  { name: "no_cefr_vote", predicate: (record) => !hasAnyCefrVote(record) },

  {
    // Counts language keys only; a key holding an empty array still qualifies.
    name: "has_glosses_and_examples",
    predicate: ({ glosses, examples }) =>
      Object.keys(glosses).length > 0 && Object.keys(examples).length > 0,
  },

  {
    // Only "fr" and "es" are checked, and an entry in `votes` also disqualifies.
    // NOTE(review): confirm that both restrictions are intended given the name.
    name: "no_glosses_no_examples",
    predicate: (record) =>
      SPARSE_BUCKET_LANGS.every(
        (lang) =>
          !(record.glosses[lang] || record.examples[lang] || record.votes[lang]),
      ),
  },

  // Accepts everything; sampled separately to ensure POS coverage.
  { name: "pos_spread", predicate: () => true },
];
|
||||
|
||||
// ── Sampling ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Draws up to `size` records at random from those that satisfy `predicate`
 * and whose `source_id` is not in `exclude`.
 *
 * Neither `records` nor `exclude` is modified; the shuffle works on a filtered copy.
 *
 * @param records   Pool to sample from.
 * @param predicate Keeps only the records eligible for this bucket.
 * @param size      Maximum number of records to return. Fractions are rounded
 *                  down; zero, negative and NaN values yield an empty result.
 * @param exclude   `source_id`s that must not be picked.
 * @param random    Source of numbers in [0, 1); defaults to `Math.random`.
 *                  Pass a seeded generator to get reproducible samples.
 * @returns At most `size` records in random order (fewer when there are not
 *          enough eligible candidates).
 */
function sampleBucket(
  records: AnnotatedRecord[],
  predicate: (r: AnnotatedRecord) => boolean,
  size: number,
  exclude: Set<string>,
  random: () => number = Math.random,
): AnnotatedRecord[] {
  const candidates = records.filter(
    (r) => !exclude.has(r.source_id) && predicate(r),
  );

  // `!(size > 0)` also catches NaN. Without this guard a negative size would
  // reach `slice(0, size)` and return almost every candidate.
  if (!(size > 0)) return [];
  const take = Math.min(Math.floor(size), candidates.length);

  // Partial Fisher–Yates: only the first `take` slots need to be randomised.
  for (let i = 0; i < take; i++) {
    const remaining = candidates.length - i;
    // Clamp so a generator that returns exactly 1 cannot index out of bounds.
    const j = i + Math.min(remaining - 1, Math.floor(random() * remaining));
    // Both indices are within bounds by construction.
    const picked = candidates[j]!;
    candidates[j] = candidates[i]!;
    candidates[i] = picked;
  }

  return candidates.slice(0, take);
}
|
||||
|
||||
/**
 * Builds the "pos_spread" sample: an equal share of BUCKET_SIZE is drawn for
 * each listed part of speech, skipping records whose id is in `exclude`.
 *
 * @param records Pool to sample from.
 * @param exclude `source_id`s that must not be picked.
 * @returns The per-POS samples concatenated in the order noun, verb,
 *          adjective, adverb.
 */
function samplePosBucket(
  records: AnnotatedRecord[],
  exclude: Set<string>,
): AnnotatedRecord[] {
  const partsOfSpeech: SupportedPos[] = ["noun", "verb", "adjective", "adverb"];
  // Integer share per POS; any remainder of BUCKET_SIZE is simply not used.
  const quota = Math.floor(BUCKET_SIZE / partsOfSpeech.length);

  return partsOfSpeech.flatMap((wanted) =>
    sampleBucket(records, (candidate) => candidate.pos === wanted, quota, exclude),
  );
}
|
||||
|
||||
// ── Loading ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Reads one annotated language file from PATHS.annotatedDir.
 *
 * @throws Error when the file content is not a JSON array.
 */
async function readAnnotatedFile(fileName: string): Promise<AnnotatedRecord[]> {
  const filePath = path.join(PATHS.annotatedDir, fileName);
  const parsed: unknown = JSON.parse(await fs.readFile(filePath, "utf-8"));
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected a JSON array of records in ${filePath}`);
  }
  // Only the top-level shape is checked; individual records are trusted.
  return parsed as AnnotatedRecord[];
}

/** Folds the votes and any missing examples of `incoming` into `target` (mutates `target`). */
function mergeLanguageRecord(
  target: AnnotatedRecord,
  incoming: AnnotatedRecord,
): void {
  // Votes: entries are merged per language; on a key clash the incoming entry wins.
  for (const [code, langVotes] of Object.entries(incoming.votes)) {
    const key = code as SupportedLanguageCode;
    const merged = target.votes[key] ?? {};
    Object.assign(merged, langVotes);
    target.votes[key] = merged;
  }

  // Examples: copied only for languages the target has no examples for yet.
  for (const [code, examples] of Object.entries(incoming.examples)) {
    const key = code as SupportedLanguageCode;
    if (!target.examples[key]) {
      target.examples[key] = examples;
    }
  }
}

/**
 * Loads every language file and merges them into a single record set.
 *
 * `en.json` supplies the base records (the original author notes it has the
 * most complete glosses and examples). For every other supported language,
 * votes and missing examples are merged into the base record with the same
 * `source_id`; records that do not exist in `en.json` are ignored.
 *
 * @returns The merged base records, in `en.json` order.
 * @throws If a language file cannot be read, is not valid JSON, or is not an array.
 */
async function loadAnnotated(): Promise<AnnotatedRecord[]> {
  // Index the base records by source_id for fast lookup while merging.
  const byId = new Map<string, AnnotatedRecord>();
  for (const record of await readAnnotatedFile("en.json")) {
    byId.set(record.source_id, record);
  }

  // Read sequentially: this keeps at most one non-English file in memory at a time.
  for (const lang of SUPPORTED_LANGUAGE_CODES) {
    if (lang === "en") continue;

    const records = await readAnnotatedFile(`${lang}.json`);
    for (const record of records) {
      const target = byId.get(record.source_id);
      if (target) mergeLanguageRecord(target, record);
    }
  }

  return [...byId.values()];
}
|
||||
|
||||
// ── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Script entry point: loads the merged records, draws every bucket in order
 * (never picking the same `source_id` twice), and writes the tagged sample to
 * PATHS.output as pretty-printed JSON.
 */
async function main(): Promise<void> {
  console.log("Loading annotated files...");
  const records = await loadAnnotated();
  console.log(` Loaded ${records.length.toLocaleString()} synsets`);

  const sampled: SampleRecord[] = [];
  const seen = new Set<string>();

  // Tags the picks with their bucket, marks them as used, and reports the count.
  const collect = (bucketName: string, picks: AnnotatedRecord[]): void => {
    picks.forEach((pick) => {
      seen.add(pick.source_id);
      sampled.push({ ...pick, _sample_bucket: bucketName });
    });
    console.log(` ${bucketName}: ${picks.length} records`);
  };

  // Predicate-driven buckets first; pos_spread has its own sampler below.
  for (const { name, predicate } of BUCKETS) {
    if (name === "pos_spread") continue;
    collect(name, sampleBucket(records, predicate, BUCKET_SIZE, seen));
  }
  collect("pos_spread", samplePosBucket(records, seen));

  console.log(`\nTotal sampled: ${sampled.length} records`);

  // Make sure the destination directory exists before writing the file.
  await fs.mkdir(path.dirname(PATHS.output), { recursive: true });
  await fs.writeFile(PATHS.output, JSON.stringify(sampled, null, 2), "utf-8");
  console.log(`Wrote sample → ${PATHS.output}`);
}
|
||||
|
||||
// Run the script; on any failure print the error and exit with status 1.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue