55 changes: 55 additions & 0 deletions frontend/internal-packages/schema-bench/README.md
@@ -187,3 +187,58 @@ export OPENAI_API_KEY="your-api-key"
- Model comparison across datasets
- Quality assurance for schema generation
- Repeatable benchmarking with standardized metrics

## LangSmith Integration

Schema-bench integrates with [LangSmith](https://smith.langchain.com) for advanced evaluation tracking, visualization, and experiment comparison.

### Setup

1. Set your LangSmith API key:
```bash
export LANGSMITH_API_KEY="your-api-key"
```

2. Upload datasets to LangSmith (one-time setup):
```bash
# Upload all datasets
pnpm --filter @liam-hq/schema-bench langsmith:upload -all

# Upload specific datasets
pnpm --filter @liam-hq/schema-bench langsmith:upload -default
pnpm --filter @liam-hq/schema-bench langsmith:upload -entity-extraction
```

This creates LangSmith datasets from your local benchmark workspace files.
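Each dataset in the local workspace keeps its cases as JSON files under an `execution/input` directory (this is the path the updated executors later in this diff read from). A case file holds either a bare JSON string or an object with an `input` field; the OpenAI executor expects the object form. A rough sketch of the layout (file and directory names below the dataset level are illustrative):

```text
<workspace>/
  default/
    execution/
      input/
        case-01.json   # {"input": "Design a schema for ..."} or a bare JSON string
        case-02.json
  entity-extraction/
    execution/
      input/
        ...
```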

### Running Evaluations

The `langsmith` command combines execution and evaluation in a single run. Use the same dataset flags as the regular executors:

```bash
# LiamDB: Run on all datasets
pnpm --filter @liam-hq/schema-bench langsmith -all --liamdb

# LiamDB: Run on specific datasets
pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb
pnpm --filter @liam-hq/schema-bench langsmith -default -entity-extraction --liamdb

# OpenAI: Run on all datasets
pnpm --filter @liam-hq/schema-bench langsmith -all --openai

# OpenAI: Run on specific datasets
pnpm --filter @liam-hq/schema-bench langsmith -default --openai
pnpm --filter @liam-hq/schema-bench langsmith -entity-extraction -relational-inference --openai

# Advanced options
pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb --num-repetitions=5
pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb --max-concurrency=5
```

**Options:**
- `--num-repetitions=N`: Number of times to run each test case (default: 3)
- `--max-concurrency=N`: Maximum concurrent executions (default: 3)
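
Both flags are passed straight through to LangSmith's `evaluate()` call in `evaluateWithLangsmith.ts` (shown later in this diff). As a condensed sketch, simplified from the real CLI (which also handles dataset selection, the OpenAI executor, and error handling), `pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb` boils down to roughly:

```typescript
import { evaluate } from 'langsmith/evaluation'
import { execute as executeLiamDb } from '../executors/liamDb/liamDbExecutor.ts'
import { schemaEvaluator } from '../langsmith/schemaEvaluator.ts'

// Target: run LiamDB on one LangSmith example and return the generated schema.
const target = async (input: { input: string }) => {
  const result = await executeLiamDb({ input: input.input })
  if (result.isErr()) throw result.error
  return { schema: result.value }
}

// One experiment per dataset; repetitions and concurrency come from the CLI flags.
await evaluate(target, {
  data: 'schema-bench-default',      // dataset created by `langsmith:upload`
  evaluators: [schemaEvaluator],
  experimentPrefix: 'liamdb-default',
  maxConcurrency: 3,                 // --max-concurrency=3
  numRepetitions: 3,                 // --num-repetitions=3
})
```

Each selected dataset produces its own experiment, named by the `experimentPrefix`.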

### Viewing Results

After the evaluation completes, a LangSmith URL will be displayed in the console. Open this URL to view detailed evaluation results.
3 changes: 3 additions & 0 deletions frontend/internal-packages/schema-bench/package.json
@@ -6,6 +6,7 @@
   "dependencies": {
     "@huggingface/transformers": "3.3.3",
     "@liam-hq/agent": "workspace:*",
+    "@liam-hq/neverthrow": "workspace:*",
     "@liam-hq/schema": "workspace:*",
     "dotenv": "16.5.0",
     "langsmith": "0.3.69",
@@ -30,6 +31,8 @@
     "fmt": "concurrently \"pnpm:fmt:*\"",
     "fmt:biome": "biome check --write --unsafe .",
     "fmt:eslint": "eslint --fix .",
+    "langsmith": "tsx src/cli/evaluateWithLangsmith.ts",
+    "langsmith:upload": "tsx src/cli/uploadDatasetToLangsmith.ts",
     "lint": "concurrently \"pnpm:lint:*\"",
     "lint:biome": "biome check .",
     "lint:eslint": "eslint .",
160 changes: 160 additions & 0 deletions frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
@@ -0,0 +1,160 @@
#!/usr/bin/env node

import { resolve } from 'node:path'
import { fromPromise } from '@liam-hq/neverthrow'
import { config } from 'dotenv'
import { evaluate } from 'langsmith/evaluation'
import { ResultAsync } from 'neverthrow'
import * as v from 'valibot'
import { execute as executeLiamDb } from '../executors/liamDb/liamDbExecutor.ts'
import { OpenAIExecutor } from '../executors/openai/openaiExecutor.ts'
import { schemaEvaluator } from '../langsmith/schemaEvaluator.ts'
import type { LangSmithInput, LangSmithOutput } from '../langsmith/types.ts'
import {
  filterAndResolveDatasets,
  getWorkspacePath,
  handleCliError,
  handleUnexpectedError,
  parseArgs,
  selectTargetDatasets,
} from './utils/index.ts'

config({ path: resolve(__dirname, '../../../../../.env') })

const executorTypeSchema = v.picklist(['liamdb', 'openai'])
const positiveIntegerSchema = v.pipe(
  v.union([v.pipe(v.string(), v.transform(Number)), v.number()]),
  v.integer(),
  v.minValue(1),
)
const optionsSchema = v.object({
  executorType: v.optional(executorTypeSchema, 'liamdb'),
  numRepetitions: v.optional(positiveIntegerSchema, 3),
  maxConcurrency: v.optional(positiveIntegerSchema, 3),
})

type ExecutorOptions = v.InferOutput<typeof optionsSchema>
type ExecutorType = v.InferOutput<typeof executorTypeSchema>

const parseExecutorAndOptions = (argv: string[]): ExecutorOptions => {
  const args = argv.slice(2)

  const rawOptions: Record<string, unknown> = {}

  for (const arg of args) {
    if (arg === '--openai') {
      rawOptions['executorType'] = 'openai'
    } else if (arg === '--liamdb') {
      rawOptions['executorType'] = 'liamdb'
    } else if (arg.startsWith('--num-repetitions=')) {
      rawOptions['numRepetitions'] = arg.split('=')[1]
    } else if (arg.startsWith('--max-concurrency=')) {
      rawOptions['maxConcurrency'] = arg.split('=')[1]
    }
  }

  return v.parse(optionsSchema, rawOptions)
}

const createTarget = (
  executorType: ExecutorType,
): ((input: LangSmithInput) => Promise<LangSmithOutput>) => {
  if (executorType === 'liamdb') {
    return async (input: LangSmithInput): Promise<LangSmithOutput> => {
      const prompt = input.prompt || input.input || ''

      const result = await executeLiamDb({ input: prompt })

      if (result.isErr()) {
        throw result.error
      }

      return { schema: result.value }
    }
  }

  if (executorType === 'openai') {
    const apiKey = process.env['OPENAI_API_KEY']
    if (!apiKey) {
      handleCliError('OPENAI_API_KEY environment variable is required')
    }

    const executor = new OpenAIExecutor({ apiKey: apiKey! })

    return async (input: LangSmithInput): Promise<LangSmithOutput> => {
      const prompt = input.prompt || input.input || ''

      const result = await executor.execute({ input: prompt })

      if (result.isErr()) {
        throw result.error
      }

      return { schema: result.value }
    }
  }

  return handleCliError(`Unknown executor type: ${executorType}`)
}

type ExperimentResults = Awaited<ReturnType<typeof evaluate>>

const runEvaluation = (
  datasetName: string,
  options: ExecutorOptions,
): ResultAsync<ExperimentResults, Error> => {
  const target = createTarget(options.executorType)

  return fromPromise(
    evaluate(target, {
      data: `schema-bench-${datasetName}`,
      evaluators: [schemaEvaluator],
      experimentPrefix: `${options.executorType}-${datasetName}`,
      maxConcurrency: options.maxConcurrency,
      numRepetitions: options.numRepetitions,
    }),
Author comment on lines +109 to +115:

This is the main function for submitting evaluations to LangSmith.
ref: https://docs.langchain.com/langsmith/code-evaluator

Pass in the target and evaluators to execute and evaluate in one go. Use numRepetitions to run multiple executions.
  )
}

const runDatasets = async (
  datasets: Array<{ name: string }>,
  options: ExecutorOptions,
) => {
  const results = datasets.map(({ name }) => runEvaluation(name, options))
  return ResultAsync.combineWithAllErrors(results)
}

const main = async () => {
  // Filter out executor options (--xxx) for parseArgs
  const datasetArgs = process.argv.filter((arg) => !arg.startsWith('--'))

  // Parse dataset flags using existing utility
  const cliOptions = parseArgs(datasetArgs)

  // Parse executor and evaluation options
  const options = parseExecutorAndOptions(process.argv)

  // Get workspace and select datasets
  const workspacePath = getWorkspacePath()
  const targetDatasets = selectTargetDatasets(cliOptions, workspacePath)

  if (targetDatasets.length === 0) {
    handleCliError('No datasets found to process. Use -all or -<dataset-name>')
  }

  const validDatasets = filterAndResolveDatasets(targetDatasets, workspacePath)

  if (validDatasets.length === 0) {
    handleCliError('No valid datasets found in workspace')
  }

  const result = await runDatasets(validDatasets, options)

  if (result.isErr()) {
    process.exit(1)
  }
}

if (import.meta.url === `file://${process.argv[1]}`) {
  main().catch(handleUnexpectedError)
}
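
The `schemaEvaluator` imported above lives in `src/langsmith/schemaEvaluator.ts` and is not part of this diff. Purely as an illustrative sketch of the code-evaluator shape it plugs into (the actual scoring logic is not shown here and will differ), an evaluator comparing a generated schema against the reference example might look like:

```typescript
import type { EvaluationResult } from 'langsmith/evaluation'
import type { Example, Run } from 'langsmith/schemas'

// Hypothetical sketch: score how many reference tables appear in the generated schema.
export const schemaEvaluator = (run: Run, example?: Example): EvaluationResult => {
  const generated = run.outputs?.['schema']
  const reference = example?.outputs?.['schema']

  const generatedTables = Object.keys(generated?.tables ?? {})
  const referenceTables = Object.keys(reference?.tables ?? {})
  const matched = referenceTables.filter((name) => generatedTables.includes(name))

  return {
    key: 'table_recall',
    score: referenceTables.length === 0 ? 1 : matched.length / referenceTables.length,
    comment: `${matched.length}/${referenceTables.length} reference tables generated`,
  }
}
```

LangSmith records the returned key/score pair as feedback on each run of the experiment.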
@@ -1,11 +1,11 @@
 #!/usr/bin/env node
 
-import { resolve } from 'node:path'
+import { join, resolve } from 'node:path'
 import { config } from 'dotenv'
 import { err, ok, type Result } from 'neverthrow'
 import * as v from 'valibot'
 import { execute, type LiamDbExecutorInput } from '../executors/liamDb/index.ts'
-import { loadInputFiles, saveOutputFile } from './utils'
+import { loadJsonFiles, saveOutputFile } from './utils'
 
 config({ path: resolve(__dirname, '../../../../../.env') })
 
@@ -52,10 +52,10 @@ export async function processDataset(
   datasetPath: string,
 ): Promise<DatasetResult> {
   // Load input files
-  const inputsResult = await loadInputFiles<
+  const inputsResult = await loadJsonFiles<
     typeof InputSchema,
     LiamDbExecutorInput
-  >(datasetPath, InputSchema, (value) => ({
+  >(join(datasetPath, 'execution', 'input'), InputSchema, (value) => ({
     input: typeof value === 'string' ? value : value.input,
   }))
   if (inputsResult.isErr()) {
@@ -76,10 +76,10 @@ export async function processDataset(
   let failureCount = 0
 
   const processBatch = async (
-    batch: Array<{ caseId: string; input: LiamDbExecutorInput }>,
+    batch: Array<{ caseId: string; data: LiamDbExecutorInput }>,
   ) => {
-    const promises = batch.map(({ caseId, input }) =>
-      executeCase(datasetPath, caseId, input),
+    const promises = batch.map(({ caseId, data }) =>
+      executeCase(datasetPath, caseId, data),
     )
     const results = await Promise.allSettled(promises)
 
@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 
-import { resolve } from 'node:path'
+import { join, resolve } from 'node:path'
 import { config as loadEnv } from 'dotenv'
 import { err, ok, type Result } from 'neverthrow'
 import * as v from 'valibot'
@@ -11,7 +11,7 @@ import {
   getWorkspacePath,
   handleCliError,
   handleUnexpectedError,
-  loadInputFiles,
+  loadJsonFiles,
   parseArgs,
   saveOutputFile,
   selectTargetDatasets,
@@ -53,10 +53,12 @@ async function processDataset(
   datasetName: string,
   datasetPath: string,
 ): Promise<DatasetResult> {
-  const inputsResult = await loadInputFiles<
+  const inputsResult = await loadJsonFiles<
     typeof InputSchema,
     OpenAIExecutorInput
-  >(datasetPath, InputSchema, (value) => ({ input: value.input }))
+  >(join(datasetPath, 'execution', 'input'), InputSchema, (value) => ({
+    input: value.input,
+  }))
   if (inputsResult.isErr()) {
     console.warn(`⚠️ ${datasetName}: ${inputsResult.error.message}`)
     return { datasetName, success: 0, failure: 1 }
@@ -73,10 +75,10 @@
   let failureCount = 0
 
   const processBatch = async (
-    batch: Array<{ caseId: string; input: OpenAIExecutorInput }>,
+    batch: Array<{ caseId: string; data: OpenAIExecutorInput }>,
   ) => {
-    const promises = batch.map(({ caseId, input }) =>
-      executeCase(executor, datasetPath, caseId, input),
+    const promises = batch.map(({ caseId, data }) =>
+      executeCase(executor, datasetPath, caseId, data),
     )
     const results = await Promise.allSettled(promises)
     results.forEach((result) => {
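
Both executors now read from an explicit `execution/input` directory via the renamed `loadJsonFiles` helper, and loaded cases carry a generic `data` field instead of `input`. The helper itself is outside this diff; inferred from the call sites above, its shape is roughly the following (an assumption, not the actual implementation):

```typescript
import type { Result } from 'neverthrow'
import type * as v from 'valibot'

// Assumed signature, reconstructed from the call sites above: read every JSON
// file in a directory, validate it against a valibot schema, and map it to the
// executor's input type, keyed by case ID.
type LoadJsonFiles = <S extends v.GenericSchema, T>(
  dirPath: string,
  schema: S,
  map: (value: v.InferOutput<S>) => T,
) => Promise<Result<Array<{ caseId: string; data: T }>, Error>>
```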