55 changes: 55 additions & 0 deletions frontend/internal-packages/schema-bench/README.md
@@ -187,3 +187,58 @@ export OPENAI_API_KEY="your-api-key"
- Model comparison across datasets
- Quality assurance for schema generation
- Repeatable benchmarking with standardized metrics

## LangSmith Integration

Schema-bench integrates with [LangSmith](https://smith.langchain.com) for advanced evaluation tracking, visualization, and experiment comparison.

### Setup

1. Set your LangSmith API key:
```bash
export LANGSMITH_API_KEY="your-api-key"
```

2. Upload datasets to LangSmith (one-time setup):
```bash
# Upload all datasets
pnpm --filter @liam-hq/schema-bench langsmith:upload -all

# Upload specific datasets
pnpm --filter @liam-hq/schema-bench langsmith:upload -default
pnpm --filter @liam-hq/schema-bench langsmith:upload -entity-extraction
```

This creates LangSmith datasets from your local benchmark workspace files.
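Each dataset in the local workspace keeps its cases as JSON files under an `execution/input` directory (this is the path the updated executors later in this diff read from). A case file holds either a bare JSON string or an object with an `input` field; the OpenAI executor expects the object form. A rough sketch of the layout (file and directory names below the dataset level are illustrative):

```text
<workspace>/
  default/
    execution/
      input/
        case-01.json   # {"input": "Design a schema for ..."} or a bare JSON string
        case-02.json
  entity-extraction/
    execution/
      input/
        ...
```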

### Running Evaluations

The `langsmith` command combines execution and evaluation in a single run. Use the same dataset flags as the regular executors:

```bash
# LiamDB: Run on all datasets
pnpm --filter @liam-hq/schema-bench langsmith -all --liamdb

# LiamDB: Run on specific datasets
pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb
pnpm --filter @liam-hq/schema-bench langsmith -default -entity-extraction --liamdb

# OpenAI: Run on all datasets
pnpm --filter @liam-hq/schema-bench langsmith -all --openai

# OpenAI: Run on specific datasets
pnpm --filter @liam-hq/schema-bench langsmith -default --openai
pnpm --filter @liam-hq/schema-bench langsmith -entity-extraction -relational-inference --openai

# Advanced options
pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb --num-repetitions=5
pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb --max-concurrency=5
```

**Options:**
- `--num-repetitions=N`: Number of times to run each test case (default: 3)
- `--max-concurrency=N`: Maximum concurrent executions (default: 3)
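
Both flags are passed straight through to LangSmith's `evaluate()` call in `evaluateWithLangsmith.ts` (shown later in this diff). As a condensed sketch, simplified from the real CLI (which also handles dataset selection, the OpenAI executor, and error handling), `pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb` boils down to roughly:

```typescript
import { evaluate } from 'langsmith/evaluation'
import { execute as executeLiamDb } from '../executors/liamDb/liamDbExecutor.ts'
import { schemaEvaluator } from '../langsmith/schemaEvaluator.ts'

// Target: run LiamDB on one LangSmith example and return the generated schema.
const target = async (input: { input: string }) => {
  const result = await executeLiamDb({ input: input.input })
  if (result.isErr()) throw result.error
  return { schema: result.value }
}

// One experiment per dataset; repetitions and concurrency come from the CLI flags.
await evaluate(target, {
  data: 'schema-bench-default',      // dataset created by `langsmith:upload`
  evaluators: [schemaEvaluator],
  experimentPrefix: 'liamdb-default',
  maxConcurrency: 3,                 // --max-concurrency=3
  numRepetitions: 3,                 // --num-repetitions=3
})
```

Each selected dataset produces its own experiment, named by the `experimentPrefix`.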

### Viewing Results

After the evaluation completes, a LangSmith URL will be displayed in the console. Open this URL to view detailed evaluation results.
3 changes: 3 additions & 0 deletions frontend/internal-packages/schema-bench/package.json
@@ -6,6 +6,7 @@
   "dependencies": {
     "@huggingface/transformers": "3.3.3",
     "@liam-hq/agent": "workspace:*",
+    "@liam-hq/neverthrow": "workspace:*",
     "@liam-hq/schema": "workspace:*",
     "dotenv": "16.5.0",
     "langsmith": "0.3.69",
@@ -30,6 +31,8 @@
     "fmt": "concurrently \"pnpm:fmt:*\"",
     "fmt:biome": "biome check --write --unsafe .",
     "fmt:eslint": "eslint --fix .",
+    "langsmith": "tsx src/cli/evaluateWithLangsmith.ts",
+    "langsmith:upload": "tsx src/cli/uploadDatasetToLangsmith.ts",
     "lint": "concurrently \"pnpm:lint:*\"",
     "lint:biome": "biome check .",
     "lint:eslint": "eslint .",
160 changes: 160 additions & 0 deletions frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
@@ -0,0 +1,160 @@
#!/usr/bin/env node

import { resolve } from 'node:path'
import { fromPromise } from '@liam-hq/neverthrow'
import { config } from 'dotenv'
import { evaluate } from 'langsmith/evaluation'
import { ResultAsync } from 'neverthrow'
import * as v from 'valibot'
import { execute as executeLiamDb } from '../executors/liamDb/liamDbExecutor.ts'
import { OpenAIExecutor } from '../executors/openai/openaiExecutor.ts'
import { schemaEvaluator } from '../langsmith/schemaEvaluator.ts'
import type { LangSmithInput, LangSmithOutput } from '../langsmith/types.ts'
import {
  filterAndResolveDatasets,
  getWorkspacePath,
  handleCliError,
  handleUnexpectedError,
  parseArgs,
  selectTargetDatasets,
} from './utils/index.ts'

config({ path: resolve(__dirname, '../../../../../.env') })

const executorTypeSchema = v.picklist(['liamdb', 'openai'])
const positiveIntegerSchema = v.pipe(
  v.union([v.pipe(v.string(), v.transform(Number)), v.number()]),
  v.integer(),
  v.minValue(1),
)
const optionsSchema = v.object({
  executorType: v.optional(executorTypeSchema, 'liamdb'),
  numRepetitions: v.optional(positiveIntegerSchema, 3),
  maxConcurrency: v.optional(positiveIntegerSchema, 3),
})

type ExecutorOptions = v.InferOutput<typeof optionsSchema>
type ExecutorType = v.InferOutput<typeof executorTypeSchema>

const parseExecutorAndOptions = (argv: string[]): ExecutorOptions => {
  const args = argv.slice(2)

  const rawOptions: Record<string, unknown> = {}

  for (const arg of args) {
    if (arg === '--openai') {
      rawOptions['executorType'] = 'openai'
    } else if (arg === '--liamdb') {
      rawOptions['executorType'] = 'liamdb'
    } else if (arg.startsWith('--num-repetitions=')) {
      rawOptions['numRepetitions'] = arg.split('=')[1]
    } else if (arg.startsWith('--max-concurrency=')) {
      rawOptions['maxConcurrency'] = arg.split('=')[1]
    }
  }

  return v.parse(optionsSchema, rawOptions)
}

const createTarget = (
  executorType: ExecutorType,
): ((input: LangSmithInput) => Promise<LangSmithOutput>) => {
  if (executorType === 'liamdb') {
    return async (input: LangSmithInput): Promise<LangSmithOutput> => {
      const prompt = input.prompt || input.input || ''

      const result = await executeLiamDb({ input: prompt })

      if (result.isErr()) {
        throw result.error
      }

      return { schema: result.value }
    }
  }

  if (executorType === 'openai') {
    const apiKey = process.env['OPENAI_API_KEY']
    if (!apiKey) {
      handleCliError('OPENAI_API_KEY environment variable is required')
    }

    const executor = new OpenAIExecutor({ apiKey: apiKey! })

    return async (input: LangSmithInput): Promise<LangSmithOutput> => {
      const prompt = input.prompt || input.input || ''

      const result = await executor.execute({ input: prompt })

      if (result.isErr()) {
        throw result.error
      }

      return { schema: result.value }
    }
  }

  return handleCliError(`Unknown executor type: ${executorType}`)
}

type ExperimentResults = Awaited<ReturnType<typeof evaluate>>

const runEvaluation = (
  datasetName: string,
  options: ExecutorOptions,
): ResultAsync<ExperimentResults, Error> => {
  const target = createTarget(options.executorType)

  return fromPromise(
    evaluate(target, {
      data: `schema-bench-${datasetName}`,
      evaluators: [schemaEvaluator],
      experimentPrefix: `${options.executorType}-${datasetName}`,
      maxConcurrency: options.maxConcurrency,
      numRepetitions: options.numRepetitions,
    }),
Author comment on lines +109 to +115:

This is the main function for submitting evaluations to LangSmith.
ref: https://docs.langchain.com/langsmith/code-evaluator

Pass in the target and evaluators to execute and evaluate in one go. Use numRepetitions to run multiple executions.
  )
}

const runDatasets = async (
  datasets: Array<{ name: string }>,
  options: ExecutorOptions,
) => {
  const results = datasets.map(({ name }) => runEvaluation(name, options))
  return ResultAsync.combineWithAllErrors(results)
}

const main = async () => {
  // Filter out executor options (--xxx) for parseArgs
  const datasetArgs = process.argv.filter((arg) => !arg.startsWith('--'))

  // Parse dataset flags using existing utility
  const cliOptions = parseArgs(datasetArgs)

  // Parse executor and evaluation options
  const options = parseExecutorAndOptions(process.argv)

  // Get workspace and select datasets
  const workspacePath = getWorkspacePath()
  const targetDatasets = selectTargetDatasets(cliOptions, workspacePath)

  if (targetDatasets.length === 0) {
    handleCliError('No datasets found to process. Use -all or -<dataset-name>')
  }

  const validDatasets = filterAndResolveDatasets(targetDatasets, workspacePath)

  if (validDatasets.length === 0) {
    handleCliError('No valid datasets found in workspace')
  }

  const result = await runDatasets(validDatasets, options)

  if (result.isErr()) {
    process.exit(1)
  }
}

if (import.meta.url === `file://${process.argv[1]}`) {
  main().catch(handleUnexpectedError)
}
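
The `schemaEvaluator` imported above lives in `src/langsmith/schemaEvaluator.ts` and is not part of this diff. Purely as an illustrative sketch of the code-evaluator shape it plugs into (the actual scoring logic is not shown here and will differ), an evaluator comparing a generated schema against the reference example might look like:

```typescript
import type { EvaluationResult } from 'langsmith/evaluation'
import type { Example, Run } from 'langsmith/schemas'

// Hypothetical sketch: score how many reference tables appear in the generated schema.
export const schemaEvaluator = (run: Run, example?: Example): EvaluationResult => {
  const generated = run.outputs?.['schema']
  const reference = example?.outputs?.['schema']

  const generatedTables = Object.keys(generated?.tables ?? {})
  const referenceTables = Object.keys(reference?.tables ?? {})
  const matched = referenceTables.filter((name) => generatedTables.includes(name))

  return {
    key: 'table_recall',
    score: referenceTables.length === 0 ? 1 : matched.length / referenceTables.length,
    comment: `${matched.length}/${referenceTables.length} reference tables generated`,
  }
}
```

LangSmith records the returned key/score pair as feedback on each run of the experiment.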
@@ -1,11 +1,11 @@
 #!/usr/bin/env node
 
-import { resolve } from 'node:path'
+import { join, resolve } from 'node:path'
 import { config } from 'dotenv'
 import { err, ok, type Result } from 'neverthrow'
 import * as v from 'valibot'
 import { execute, type LiamDbExecutorInput } from '../executors/liamDb/index.ts'
-import { loadInputFiles, saveOutputFile } from './utils'
+import { loadJsonFiles, saveOutputFile } from './utils'
 
 config({ path: resolve(__dirname, '../../../../../.env') })
 
@@ -52,10 +52,10 @@ export async function processDataset(
   datasetPath: string,
 ): Promise<DatasetResult> {
   // Load input files
-  const inputsResult = await loadInputFiles<
+  const inputsResult = await loadJsonFiles<
     typeof InputSchema,
     LiamDbExecutorInput
-  >(datasetPath, InputSchema, (value) => ({
+  >(join(datasetPath, 'execution', 'input'), InputSchema, (value) => ({
     input: typeof value === 'string' ? value : value.input,
   }))
   if (inputsResult.isErr()) {
@@ -76,10 +76,10 @@ export async function processDataset(
   let failureCount = 0
 
   const processBatch = async (
-    batch: Array<{ caseId: string; input: LiamDbExecutorInput }>,
+    batch: Array<{ caseId: string; data: LiamDbExecutorInput }>,
   ) => {
-    const promises = batch.map(({ caseId, input }) =>
-      executeCase(datasetPath, caseId, input),
+    const promises = batch.map(({ caseId, data }) =>
+      executeCase(datasetPath, caseId, data),
     )
     const results = await Promise.allSettled(promises)
 
@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 
-import { resolve } from 'node:path'
+import { join, resolve } from 'node:path'
 import { config as loadEnv } from 'dotenv'
 import { err, ok, type Result } from 'neverthrow'
 import * as v from 'valibot'
@@ -11,7 +11,7 @@ import {
   getWorkspacePath,
   handleCliError,
   handleUnexpectedError,
-  loadInputFiles,
+  loadJsonFiles,
   parseArgs,
   saveOutputFile,
   selectTargetDatasets,
@@ -53,10 +53,12 @@ async function processDataset(
   datasetName: string,
   datasetPath: string,
 ): Promise<DatasetResult> {
-  const inputsResult = await loadInputFiles<
+  const inputsResult = await loadJsonFiles<
     typeof InputSchema,
     OpenAIExecutorInput
-  >(datasetPath, InputSchema, (value) => ({ input: value.input }))
+  >(join(datasetPath, 'execution', 'input'), InputSchema, (value) => ({
+    input: value.input,
+  }))
   if (inputsResult.isErr()) {
     console.warn(`⚠️ ${datasetName}: ${inputsResult.error.message}`)
     return { datasetName, success: 0, failure: 1 }
@@ -73,10 +75,10 @@
   let failureCount = 0
 
   const processBatch = async (
-    batch: Array<{ caseId: string; input: OpenAIExecutorInput }>,
+    batch: Array<{ caseId: string; data: OpenAIExecutorInput }>,
   ) => {
-    const promises = batch.map(({ caseId, input }) =>
-      executeCase(executor, datasetPath, caseId, input),
+    const promises = batch.map(({ caseId, data }) =>
+      executeCase(executor, datasetPath, caseId, data),
     )
     const results = await Promise.allSettled(promises)
     results.forEach((result) => {
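
Both executors now read from an explicit `execution/input` directory via the renamed `loadJsonFiles` helper, and loaded cases carry a generic `data` field instead of `input`. The helper itself is outside this diff; inferred from the call sites above, its shape is roughly the following (an assumption, not the actual implementation):

```typescript
import type { Result } from 'neverthrow'
import type * as v from 'valibot'

// Assumed signature, reconstructed from the call sites above: read every JSON
// file in a directory, validate it against a valibot schema, and map it to the
// executor's input type, keyed by case ID.
type LoadJsonFiles = <S extends v.GenericSchema, T>(
  dirPath: string,
  schema: S,
  map: (value: v.InferOutput<S>) => T,
) => Promise<Result<Array<{ caseId: string; data: T }>, Error>>
```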