From ac98b03dbd231c1550495f906e2c1821878c4fbb Mon Sep 17 00:00:00 2001 From: MH4GF Date: Wed, 8 Oct 2025 11:48:32 +0900 Subject: [PATCH 1/5] refactor(schema-bench): unify JSON file loading with loadJsonFiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace specialized loadInputFiles with generic loadJsonFiles function. - Rename loadInputFiles.ts to loadJsonFiles.ts - Update executeLiamDbShared.ts to use loadJsonFiles with explicit path - Update executeOpenaiUnified.ts to use loadJsonFiles with explicit path - Change return value key from 'input' to 'data' for consistency - Remove loadInputFiles export from utils/index.ts This simplifies the codebase by having a single, flexible function for loading JSON files from any directory. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../src/cli/executeLiamDbShared.ts | 14 ++++---- .../src/cli/executeOpenaiUnified.ts | 16 ++++++---- .../schema-bench/src/cli/utils/index.ts | 2 +- .../{loadInputFiles.ts => loadJsonFiles.ts} | 32 ++++++++----------- 4 files changed, 30 insertions(+), 34 deletions(-) rename frontend/internal-packages/schema-bench/src/cli/utils/{loadInputFiles.ts => loadJsonFiles.ts} (63%) diff --git a/frontend/internal-packages/schema-bench/src/cli/executeLiamDbShared.ts b/frontend/internal-packages/schema-bench/src/cli/executeLiamDbShared.ts index cd7379c8b3..e6f7614588 100644 --- a/frontend/internal-packages/schema-bench/src/cli/executeLiamDbShared.ts +++ b/frontend/internal-packages/schema-bench/src/cli/executeLiamDbShared.ts @@ -1,11 +1,11 @@ #!/usr/bin/env node -import { resolve } from 'node:path' +import { join, resolve } from 'node:path' import { config } from 'dotenv' import { err, ok, type Result } from 'neverthrow' import * as v from 'valibot' import { execute, type LiamDbExecutorInput } from '../executors/liamDb/index.ts' -import { loadInputFiles, saveOutputFile } from './utils' +import { loadJsonFiles, 
saveOutputFile } from './utils' config({ path: resolve(__dirname, '../../../../../.env') }) @@ -52,10 +52,10 @@ export async function processDataset( datasetPath: string, ): Promise { // Load input files - const inputsResult = await loadInputFiles< + const inputsResult = await loadJsonFiles< typeof InputSchema, LiamDbExecutorInput - >(datasetPath, InputSchema, (value) => ({ + >(join(datasetPath, 'execution', 'input'), InputSchema, (value) => ({ input: typeof value === 'string' ? value : value.input, })) if (inputsResult.isErr()) { @@ -76,10 +76,10 @@ export async function processDataset( let failureCount = 0 const processBatch = async ( - batch: Array<{ caseId: string; input: LiamDbExecutorInput }>, + batch: Array<{ caseId: string; data: LiamDbExecutorInput }>, ) => { - const promises = batch.map(({ caseId, input }) => - executeCase(datasetPath, caseId, input), + const promises = batch.map(({ caseId, data }) => + executeCase(datasetPath, caseId, data), ) const results = await Promise.allSettled(promises) diff --git a/frontend/internal-packages/schema-bench/src/cli/executeOpenaiUnified.ts b/frontend/internal-packages/schema-bench/src/cli/executeOpenaiUnified.ts index 1dbec40fed..a9e943f668 100644 --- a/frontend/internal-packages/schema-bench/src/cli/executeOpenaiUnified.ts +++ b/frontend/internal-packages/schema-bench/src/cli/executeOpenaiUnified.ts @@ -1,6 +1,6 @@ #!/usr/bin/env node -import { resolve } from 'node:path' +import { join, resolve } from 'node:path' import { config as loadEnv } from 'dotenv' import { err, ok, type Result } from 'neverthrow' import * as v from 'valibot' @@ -11,7 +11,7 @@ import { getWorkspacePath, handleCliError, handleUnexpectedError, - loadInputFiles, + loadJsonFiles, parseArgs, saveOutputFile, selectTargetDatasets, @@ -53,10 +53,12 @@ async function processDataset( datasetName: string, datasetPath: string, ): Promise { - const inputsResult = await loadInputFiles< + const inputsResult = await loadJsonFiles< typeof InputSchema, 
OpenAIExecutorInput - >(datasetPath, InputSchema, (value) => ({ input: value.input })) + >(join(datasetPath, 'execution', 'input'), InputSchema, (value) => ({ + input: value.input, + })) if (inputsResult.isErr()) { console.warn(`⚠️ ${datasetName}: ${inputsResult.error.message}`) return { datasetName, success: 0, failure: 1 } @@ -73,10 +75,10 @@ async function processDataset( let failureCount = 0 const processBatch = async ( - batch: Array<{ caseId: string; input: OpenAIExecutorInput }>, + batch: Array<{ caseId: string; data: OpenAIExecutorInput }>, ) => { - const promises = batch.map(({ caseId, input }) => - executeCase(executor, datasetPath, caseId, input), + const promises = batch.map(({ caseId, data }) => + executeCase(executor, datasetPath, caseId, data), ) const results = await Promise.allSettled(promises) results.forEach((result) => { diff --git a/frontend/internal-packages/schema-bench/src/cli/utils/index.ts b/frontend/internal-packages/schema-bench/src/cli/utils/index.ts index 0091f5de42..2fbed6f5da 100644 --- a/frontend/internal-packages/schema-bench/src/cli/utils/index.ts +++ b/frontend/internal-packages/schema-bench/src/cli/utils/index.ts @@ -2,7 +2,7 @@ export * from './discoverDefaultDatasets.ts' export * from './error.ts' export * from './filterAndResolveDatasets.ts' export * from './listAllDatasets.ts' -export * from './loadInputFiles.ts' +export * from './loadJsonFiles.ts' export * from './parseArgs.ts' export * from './saveOutputFile.ts' export * from './selectTargetDatasets.ts' diff --git a/frontend/internal-packages/schema-bench/src/cli/utils/loadInputFiles.ts b/frontend/internal-packages/schema-bench/src/cli/utils/loadJsonFiles.ts similarity index 63% rename from frontend/internal-packages/schema-bench/src/cli/utils/loadInputFiles.ts rename to frontend/internal-packages/schema-bench/src/cli/utils/loadJsonFiles.ts index 6f66059fcc..64885efac5 100644 --- a/frontend/internal-packages/schema-bench/src/cli/utils/loadInputFiles.ts +++ 
b/frontend/internal-packages/schema-bench/src/cli/utils/loadJsonFiles.ts @@ -11,39 +11,33 @@ import { import * as v from 'valibot' /** - * Load and validate input JSON files under `execution/input` for a dataset. - * The schema and normalize function allow callers to adapt to per-executor needs. + * Load and validate JSON files from a specified directory. + * Generic function that can be used for both input and reference files. */ -export async function loadInputFiles< +export async function loadJsonFiles< Schema extends v.BaseSchema>, T, >( - datasetPath: string, + directory: string, schema: Schema, normalize: (value: v.InferOutput) => T, -): Promise, Error>> { - const inputDir = join(datasetPath, 'execution/input') - - if (!existsSync(inputDir)) { - return err( - new Error( - `Input directory not found: ${inputDir}. Please run setup-workspace first.`, - ), - ) +): Promise, Error>> { + if (!existsSync(directory)) { + return err(new Error(`Directory not found: ${directory}`)) } - const filesResult = await fromPromise(readdir(inputDir), (error) => + const filesResult = await fromPromise(readdir(directory), (error) => error instanceof Error ? error : new Error('Failed to read directory'), ) if (filesResult.isErr()) return err(filesResult.error) const jsonFiles = filesResult.value.filter((file) => file.endsWith('.json')) - const inputs: Array<{ caseId: string; input: T }> = [] + const results: Array<{ caseId: string; data: T }> = [] for (const file of jsonFiles) { const caseId = file.replace('.json', '') const contentResult = await fromPromise( - readFile(join(inputDir, file), 'utf-8'), + readFile(join(directory, file), 'utf-8'), (error) => error instanceof Error ? 
error @@ -64,14 +58,14 @@ export async function loadInputFiles< if (!validationResult.success) { return err( new Error( - `Invalid input format in ${file}: ${JSON.stringify(validationResult.issues)}`, + `Invalid format in ${file}: ${JSON.stringify(validationResult.issues)}`, ), ) } const normalized: T = normalize(validationResult.output) - inputs.push({ caseId, input: normalized }) + results.push({ caseId, data: normalized }) } - return ok(inputs) + return ok(results) } From b4b4c31eb6bea7e8b6c6675a0e55f59bec9bde52 Mon Sep 17 00:00:00 2001 From: MH4GF Date: Wed, 8 Oct 2025 12:14:10 +0900 Subject: [PATCH 2/5] feat(schema-bench): add LangSmith integration for evaluation tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive LangSmith integration to schema-bench for advanced evaluation tracking, visualization, and experiment comparison. Key features: - LangSmith dataset upload from local workspace files - Evaluation execution with LiamDB and OpenAI executors - Schema metrics tracking (table/column counts, F1 scores, recall) - Configurable repetitions and concurrency - neverthrow-based error handling for type safety Implementation details: - Created evaluateWithLangsmith.ts for running evaluations - Created uploadDatasetToLangsmith.ts for dataset management - Added schemaEvaluator for computing schema similarity metrics - Integrated @liam-hq/neverthrow for functional error handling - Added loadJsonFiles utility for consistent file loading - Used ResultAsync.combine for parallel async operations 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../internal-packages/schema-bench/README.md | 55 ++++++ .../schema-bench/package.json | 3 + .../src/cli/evaluateWithLangsmith.ts | 162 ++++++++++++++++ .../src/cli/uploadDatasetToLangsmith.ts | 174 ++++++++++++++++++ .../src/langsmith/schemaEvaluator.ts | 61 ++++++ .../schema-bench/src/langsmith/types.ts | 15 ++ pnpm-lock.yaml | 
9 +- 7 files changed, 476 insertions(+), 3 deletions(-) create mode 100644 frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts create mode 100644 frontend/internal-packages/schema-bench/src/cli/uploadDatasetToLangsmith.ts create mode 100644 frontend/internal-packages/schema-bench/src/langsmith/schemaEvaluator.ts create mode 100644 frontend/internal-packages/schema-bench/src/langsmith/types.ts diff --git a/frontend/internal-packages/schema-bench/README.md b/frontend/internal-packages/schema-bench/README.md index 5806bb8a24..b61d71068e 100644 --- a/frontend/internal-packages/schema-bench/README.md +++ b/frontend/internal-packages/schema-bench/README.md @@ -187,3 +187,58 @@ export OPENAI_API_KEY="your-api-key" - Model comparison across datasets - Quality assurance for schema generation - Repeatable benchmarking with standardized metrics + +## LangSmith Integration + +Schema-bench integrates with [LangSmith](https://smith.langchain.com) for advanced evaluation tracking, visualization, and experiment comparison. + +### Setup + +1. Set your LangSmith API key: +```bash +export LANGSMITH_API_KEY="your-api-key" +``` + +2. Upload datasets to LangSmith (one-time setup): +```bash +# Upload all datasets +pnpm --filter @liam-hq/schema-bench langsmith:upload -all + +# Upload specific datasets +pnpm --filter @liam-hq/schema-bench langsmith:upload -default +pnpm --filter @liam-hq/schema-bench langsmith:upload -entity-extraction +``` + +This creates LangSmith datasets from your local benchmark workspace files. + +### Running Evaluations + +LangSmith combines execution and evaluation in a single command. 
Use the same dataset flags as the regular executors: + +```bash +# LiamDB: Run on all datasets +pnpm --filter @liam-hq/schema-bench langsmith -all --liamdb + +# LiamDB: Run on specific datasets +pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb +pnpm --filter @liam-hq/schema-bench langsmith -default -entity-extraction --liamdb + +# OpenAI: Run on all datasets +pnpm --filter @liam-hq/schema-bench langsmith -all --openai + +# OpenAI: Run on specific datasets +pnpm --filter @liam-hq/schema-bench langsmith -default --openai +pnpm --filter @liam-hq/schema-bench langsmith -entity-extraction -relational-inference --openai + +# Advanced options +pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb --num-repetitions=5 +pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb --max-concurrency=5 +``` + +**Options:** +- `--num-repetitions=N`: Number of times to run each test case (default: 3) +- `--max-concurrency=N`: Maximum concurrent executions (default: 3) + +### Viewing Results + +After the evaluation completes, a LangSmith URL will be displayed in the console. Open this URL to view detailed evaluation results. 
diff --git a/frontend/internal-packages/schema-bench/package.json b/frontend/internal-packages/schema-bench/package.json index 089887f961..f3e1f7acfe 100644 --- a/frontend/internal-packages/schema-bench/package.json +++ b/frontend/internal-packages/schema-bench/package.json @@ -6,6 +6,7 @@ "dependencies": { "@huggingface/transformers": "3.3.3", "@liam-hq/agent": "workspace:*", + "@liam-hq/neverthrow": "workspace:*", "@liam-hq/schema": "workspace:*", "dotenv": "16.5.0", "langsmith": "0.3.69", @@ -30,6 +31,8 @@ "fmt": "concurrently \"pnpm:fmt:*\"", "fmt:biome": "biome check --write --unsafe .", "fmt:eslint": "eslint --fix .", + "langsmith": "tsx src/cli/evaluateWithLangsmith.ts", + "langsmith:upload": "tsx src/cli/uploadDatasetToLangsmith.ts", "lint": "concurrently \"pnpm:lint:*\"", "lint:biome": "biome check .", "lint:eslint": "eslint .", diff --git a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts new file mode 100644 index 0000000000..cde090087d --- /dev/null +++ b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts @@ -0,0 +1,162 @@ +#!/usr/bin/env node + +import { resolve } from 'node:path' +import { config } from 'dotenv' +import { evaluate } from 'langsmith/evaluation' +import { fromPromise, ResultAsync } from 'neverthrow' +import * as v from 'valibot' +import { execute as executeLiamDb } from '../executors/liamDb/liamDbExecutor.ts' +import { OpenAIExecutor } from '../executors/openai/openaiExecutor.ts' +import { schemaEvaluator } from '../langsmith/schemaEvaluator.ts' +import type { LangSmithInput, LangSmithOutput } from '../langsmith/types.ts' +import { + filterAndResolveDatasets, + getWorkspacePath, + handleCliError, + handleUnexpectedError, + parseArgs, + selectTargetDatasets, +} from './utils/index.ts' + +config({ path: resolve(__dirname, '../../../../../.env') }) + +const executorTypeSchema = v.picklist(['liamdb', 'openai']) +const 
positiveIntegerSchema = v.pipe( + v.union([v.pipe(v.string(), v.transform(Number)), v.number()]), + v.integer(), + v.minValue(1), +) +const optionsSchema = v.object({ + executorType: v.optional(executorTypeSchema, 'liamdb'), + numRepetitions: v.optional(positiveIntegerSchema, 3), + maxConcurrency: v.optional(positiveIntegerSchema, 3), +}) + +type ExecutorOptions = v.InferOutput +type ExecutorType = v.InferOutput + +const parseExecutorAndOptions = (argv: string[]): ExecutorOptions => { + const args = argv.slice(2) + + const rawOptions: Record = {} + + for (const arg of args) { + if (arg === '--openai') { + rawOptions['executorType'] = 'openai' + } else if (arg === '--liamdb') { + rawOptions['executorType'] = 'liamdb' + } else if (arg.startsWith('--num-repetitions=')) { + rawOptions['numRepetitions'] = arg.split('=')[1] + } else if (arg.startsWith('--max-concurrency=')) { + rawOptions['maxConcurrency'] = arg.split('=')[1] + } + } + + return v.parse(optionsSchema, rawOptions) +} + +const createTarget = ( + executorType: ExecutorType, +): ((input: LangSmithInput) => Promise) => { + if (executorType === 'liamdb') { + return async (input: LangSmithInput): Promise => { + const prompt = input.prompt || input.input || '' + + const result = await executeLiamDb({ input: prompt }) + + if (result.isErr()) { + throw result.error + } + + return { schema: result.value } + } + } + + if (executorType === 'openai') { + const apiKey = process.env['OPENAI_API_KEY'] + if (!apiKey) { + handleCliError('OPENAI_API_KEY environment variable is required') + } + + const executor = new OpenAIExecutor({ apiKey: apiKey || '' }) + + return async (input: LangSmithInput): Promise => { + const prompt = input.prompt || input.input || '' + + const result = await executor.execute({ input: prompt }) + + if (result.isErr()) { + throw result.error + } + + return { schema: result.value } + } + } + + return handleCliError(`Unknown executor type: ${executorType}`) +} + +type ExperimentResults = Awaited> + 
+const runEvaluation = ( + datasetName: string, + options: ExecutorOptions, +): ResultAsync => { + const target = createTarget(options.executorType) + + return fromPromise( + evaluate(target, { + data: `schema-bench-${datasetName}`, + evaluators: [schemaEvaluator], + experimentPrefix: `${options.executorType}-${datasetName}`, + maxConcurrency: options.maxConcurrency, + numRepetitions: options.numRepetitions, + }), + (error) => { + return error instanceof Error ? error : new Error(String(error)) + }, + ) +} + +const runDatasets = async ( + datasets: Array<{ name: string }>, + options: ExecutorOptions, +) => { + const results = datasets.map(({ name }) => runEvaluation(name, options)) + return ResultAsync.combineWithAllErrors(results) +} + +const main = async () => { + // Filter out executor options (--xxx) for parseArgs + const datasetArgs = process.argv.filter((arg) => !arg.startsWith('--')) + + // Parse dataset flags using existing utility + const cliOptions = parseArgs(datasetArgs) + + // Parse executor and evaluation options + const options = parseExecutorAndOptions(process.argv) + + // Get workspace and select datasets + const workspacePath = getWorkspacePath() + const targetDatasets = selectTargetDatasets(cliOptions, workspacePath) + + if (targetDatasets.length === 0) { + handleCliError('No datasets found to process. 
Use -all or -<dataset-name>') + } + + const validDatasets = filterAndResolveDatasets(targetDatasets, workspacePath) + + if (validDatasets.length === 0) { + handleCliError('No valid datasets found in workspace') + } + + const result = await runDatasets(validDatasets, options) + + if (result.isErr()) { + process.exit(1) + } +} + +if (import.meta.url === `file://${process.argv[1]}`) { + main().catch(handleUnexpectedError) +} diff --git a/frontend/internal-packages/schema-bench/src/cli/uploadDatasetToLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/uploadDatasetToLangsmith.ts new file mode 100644 index 0000000000..5d4d4482f8 --- /dev/null +++ b/frontend/internal-packages/schema-bench/src/cli/uploadDatasetToLangsmith.ts @@ -0,0 +1,174 @@ +#!/usr/bin/env node + +import { join, resolve } from 'node:path' +import { fromPromise } from '@liam-hq/neverthrow' +import { type Schema, schemaSchema } from '@liam-hq/schema' +import { config } from 'dotenv' +import { Client } from 'langsmith' +import { ResultAsync } from 'neverthrow' +import * as v from 'valibot' +import type { LangSmithDatasetConfig } from '../langsmith/types.ts' +import { + filterAndResolveDatasets, + getWorkspacePath, + handleCliError, + handleUnexpectedError, + loadJsonFiles, + parseArgs, + selectTargetDatasets, +} from './utils/index.ts' + +config({ path: resolve(__dirname, '../../../../../.env') }) + +const inputSchema = v.union([v.string(), v.object({ input: v.string() })]) + +const getOrCreateDataset = (client: Client, datasetName: string) => { + return fromPromise(client.readDataset({ datasetName })).orElse(() => + fromPromise(client.createDataset(datasetName)), + ) +} + +const findExistingExample = async ( + client: Client, + datasetId: string, + caseId: string, +) => { + for await (const example of client.listExamples({ datasetId })) { + const metadata = example.metadata + if (metadata && 'caseId' in metadata && metadata['caseId'] === caseId) { + return example + } + } + return null +} + +const 
uploadOrUpdateExample = async ( + client: Client, + datasetId: string, + caseId: string, + input: { input: string }, + reference: { schema: Schema }, +) => { + const existingExample = await findExistingExample(client, datasetId, caseId) + + if (existingExample) { + await client.updateExample({ + id: existingExample.id, + inputs: input, + outputs: reference, + }) + } else { + await client.createExample({ + inputs: input, + outputs: reference, + dataset_id: datasetId, + metadata: { caseId }, + }) + } +} + +type DatasetResult = Awaited> + +const uploadExamples = ( + client: Client, + dataset: DatasetResult, + inputs: Array<{ caseId: string; data: { input: string } }>, + references: Array<{ caseId: string; data: { schema: Schema } }>, +): ResultAsync => { + const uploadExamplesPromise = async () => { + for (const inputItem of inputs) { + const reference = references.find((r) => r.caseId === inputItem.caseId) + + if (!reference) { + console.warn( + `⚠️ No reference found for case: ${inputItem.caseId}, skipping`, + ) + continue + } + + await uploadOrUpdateExample( + client, + dataset.id, + inputItem.caseId, + inputItem.data, + reference.data, + ) + } + } + + return fromPromise(uploadExamplesPromise()) +} + +const uploadDataset = ( + config: LangSmithDatasetConfig, +): ResultAsync => { + const client = new Client() + + const inputDir = join(config.workspacePath, 'execution', 'input') + const referenceDir = join(config.workspacePath, 'execution', 'reference') + + const datasetResult = getOrCreateDataset(client, config.datasetName) + const inputsResult = fromPromise( + loadJsonFiles(inputDir, inputSchema, (value) => + typeof value === 'string' ? 
{ input: value } : { input: value.input }, + ).then((result) => { + if (result.isErr()) throw result.error + return result.value + }), + ) + const referencesResult = fromPromise( + loadJsonFiles(referenceDir, schemaSchema, (value) => ({ + schema: value, + })).then((result) => { + if (result.isErr()) throw result.error + return result.value + }), + ) + + return ResultAsync.combine([ + datasetResult, + inputsResult, + referencesResult, + ]).andThen(([dataset, inputs, references]) => + uploadExamples(client, dataset, inputs, references), + ) +} + +const runUploads = (datasets: Array<{ name: string; path: string }>) => { + const results = datasets.map(({ name, path }) => + uploadDataset({ + datasetName: `schema-bench-${name}`, + workspacePath: path, + }), + ) + return ResultAsync.combineWithAllErrors(results) +} + +const main = async () => { + // Parse dataset flags using existing utility + const cliOptions = parseArgs(process.argv) + + // Get workspace and select datasets + const workspacePath = getWorkspacePath() + const targetDatasets = selectTargetDatasets(cliOptions, workspacePath) + + if (targetDatasets.length === 0) { + handleCliError('No datasets found to process. 
Use -all or -<dataset-name>') + } + + const validDatasets = filterAndResolveDatasets(targetDatasets, workspacePath) + + if (validDatasets.length === 0) { + handleCliError('No valid datasets found in workspace') + } + + const result = await runUploads(validDatasets) + + if (result.isErr()) { + process.exit(1) + } +} + +if (import.meta.url === `file://${process.argv[1]}`) { + main().catch(handleUnexpectedError) +} diff --git a/frontend/internal-packages/schema-bench/src/langsmith/schemaEvaluator.ts b/frontend/internal-packages/schema-bench/src/langsmith/schemaEvaluator.ts new file mode 100644 index 0000000000..2d7d422b74 --- /dev/null +++ b/frontend/internal-packages/schema-bench/src/langsmith/schemaEvaluator.ts @@ -0,0 +1,61 @@ +import { type Schema, schemaSchema } from '@liam-hq/schema' +import type { EvaluatorT } from 'langsmith/evaluation' +import type { Example, Run } from 'langsmith/schemas' +import * as v from 'valibot' +import { evaluate } from '../evaluate/evaluate.ts' + +const validateSchema = (data: unknown): Schema => v.parse(schemaSchema, data) + +const countTables = (schema: Schema): number => { + return Object.keys(schema.tables).length +} + +const countColumns = (schema: Schema): number => { + return Object.values(schema.tables).reduce( + (total, table) => total + Object.keys(table.columns).length, + 0, + ) +} + +export const schemaEvaluator: EvaluatorT = async (args: { + run: Run + example: Example + inputs: Record + outputs: Record + referenceOutputs?: Record +}) => { + const referenceSchema = validateSchema(args.referenceOutputs?.['schema']) + const outputSchema = validateSchema(args.outputs['schema']) + const result = await evaluate(referenceSchema, outputSchema) + + return [ + { + key: 'Table Count', + score: countTables(outputSchema), + }, + { + key: 'Column Count', + score: countColumns(outputSchema), + }, + { + key: 'Table F1 Score', + score: result.tableF1Score, + }, + { + key: 'Table Recall', + score: result.tableRecall, + }, + { + key: 'Column F1 Score 
Average', + score: result.columnF1ScoreAverage, + }, + { + key: 'Column Recall Average', + score: result.columnRecallAverage, + }, + { + key: 'Column All Correct Rate Average', + score: result.columnAllCorrectRateAverage, + }, + ] +} diff --git a/frontend/internal-packages/schema-bench/src/langsmith/types.ts b/frontend/internal-packages/schema-bench/src/langsmith/types.ts new file mode 100644 index 0000000000..ea4a56852b --- /dev/null +++ b/frontend/internal-packages/schema-bench/src/langsmith/types.ts @@ -0,0 +1,15 @@ +import type { Schema } from '@liam-hq/schema' + +export type LangSmithInput = { + prompt?: string + input?: string +} + +export type LangSmithOutput = { + schema: Schema +} + +export type LangSmithDatasetConfig = { + datasetName: string + workspacePath: string +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0378ee8647..d1974981bb 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -115,7 +115,7 @@ importers: version: link:../../packages/ui '@next/third-parties': specifier: 15.3.5 - version: 15.3.5(next@15.4.7(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1) + version: 15.3.5(next@15.4.7(@babel/core@7.28.4)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1) '@sentry/nextjs': specifier: '9' version: 9.46.0(@opentelemetry/context-async-hooks@2.1.0(@opentelemetry/api@1.9.0))(@opentelemetry/core@2.1.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.1.0(@opentelemetry/api@1.9.0))(next@15.4.7(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)(webpack@5.102.0(@swc/core@1.12.11)) @@ -254,7 +254,7 @@ importers: version: link:../../packages/ui '@next/third-parties': specifier: 15.3.5 - version: 15.3.5(next@15.4.7(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1) + version: 
15.3.5(next@15.4.7(@babel/core@7.28.4)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1) '@rive-app/react-canvas': specifier: 4.23.3 version: 4.23.3(react@19.1.1) @@ -675,6 +675,9 @@ importers: '@liam-hq/agent': specifier: workspace:* version: link:../agent + '@liam-hq/neverthrow': + specifier: workspace:* + version: link:../neverthrow '@liam-hq/schema': specifier: workspace:* version: link:../../packages/schema @@ -13337,7 +13340,7 @@ snapshots: '@next/swc-win32-x64-msvc@15.4.7': optional: true - '@next/third-parties@15.3.5(next@15.4.7(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)': + '@next/third-parties@15.3.5(next@15.4.7(@babel/core@7.28.4)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)': dependencies: next: 15.4.7(@babel/core@7.28.4)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1) react: 19.1.1 From 7433be982410787516327fbea187eba511286c06 Mon Sep 17 00:00:00 2001 From: MH4GF Date: Wed, 8 Oct 2025 12:18:09 +0900 Subject: [PATCH 3/5] refactor(schema-bench): use @liam-hq/neverthrow default error handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace manual error handler in fromPromise with @liam-hq/neverthrow's default error handler for consistency across the codebase. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../schema-bench/src/cli/evaluateWithLangsmith.ts | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts index cde090087d..0c6a8f8965 100644 --- a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts +++ b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts @@ -1,9 +1,10 @@ #!/usr/bin/env node import { resolve } from 'node:path' +import { fromPromise } from '@liam-hq/neverthrow' import { config } from 'dotenv' import { evaluate } from 'langsmith/evaluation' -import { fromPromise, ResultAsync } from 'neverthrow' +import { ResultAsync } from 'neverthrow' import * as v from 'valibot' import { execute as executeLiamDb } from '../executors/liamDb/liamDbExecutor.ts' import { OpenAIExecutor } from '../executors/openai/openaiExecutor.ts' @@ -112,9 +113,6 @@ const runEvaluation = ( maxConcurrency: options.maxConcurrency, numRepetitions: options.numRepetitions, }), - (error) => { - return error instanceof Error ? 
error : new Error(String(error)) - }, ) } From 0667f1ddabc758a8b1a0f8704fb59831542b111e Mon Sep 17 00:00:00 2001 From: Hirotaka Miyagi <31152321+MH4GF@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:27:16 +0900 Subject: [PATCH 4/5] remove unnecessary fallback Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../schema-bench/src/cli/evaluateWithLangsmith.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts index 0c6a8f8965..cb78ce377a 100644 --- a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts +++ b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts @@ -79,7 +79,7 @@ const createTarget = ( handleCliError('OPENAI_API_KEY environment variable is required') } - const executor = new OpenAIExecutor({ apiKey: apiKey || '' }) + const executor = new OpenAIExecutor({ apiKey: apiKey }) return async (input: LangSmithInput): Promise => { const prompt = input.prompt || input.input || '' From 8852d2557e6033553dd3eb85037997a3afd7937d Mon Sep 17 00:00:00 2001 From: MH4GF Date: Wed, 8 Oct 2025 12:33:33 +0900 Subject: [PATCH 5/5] fix(schema-bench): resolve TypeScript type error in evaluateWithLangsmith MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add non-null assertion for apiKey after handleCliError check. Since handleCliError returns never, apiKey is guaranteed to be defined after the check, but TypeScript's control flow analysis doesn't recognize this. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../schema-bench/src/cli/evaluateWithLangsmith.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts index cb78ce377a..0dc7e1a40f 100644 --- a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts +++ b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts @@ -79,7 +79,7 @@ const createTarget = ( handleCliError('OPENAI_API_KEY environment variable is required') } - const executor = new OpenAIExecutor({ apiKey: apiKey }) + const executor = new OpenAIExecutor({ apiKey: apiKey! }) return async (input: LangSmithInput): Promise => { const prompt = input.prompt || input.input || ''