From ac98b03dbd231c1550495f906e2c1821878c4fbb Mon Sep 17 00:00:00 2001
From: MH4GF <h.miyagi.cnw@gmail.com>
Date: Wed, 8 Oct 2025 11:48:32 +0900
Subject: [PATCH 1/5] refactor(schema-bench): unify JSON file loading with
 loadJsonFiles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace specialized loadInputFiles with generic loadJsonFiles function.

- Rename loadInputFiles.ts to loadJsonFiles.ts
- Update executeLiamDbShared.ts to use loadJsonFiles with explicit path
- Update executeOpenaiUnified.ts to use loadJsonFiles with explicit path
- Change return value key from 'input' to 'data' for consistency
- Remove loadInputFiles export from utils/index.ts

This simplifies the codebase by having a single, flexible function for
loading JSON files from any directory.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../src/cli/executeLiamDbShared.ts            | 14 ++++----
 .../src/cli/executeOpenaiUnified.ts           | 16 ++++++----
 .../schema-bench/src/cli/utils/index.ts       |  2 +-
 .../{loadInputFiles.ts => loadJsonFiles.ts}   | 32 ++++++++-----------
 4 files changed, 30 insertions(+), 34 deletions(-)
 rename frontend/internal-packages/schema-bench/src/cli/utils/{loadInputFiles.ts => loadJsonFiles.ts} (63%)
diff --git a/frontend/internal-packages/schema-bench/src/cli/executeLiamDbShared.ts b/frontend/internal-packages/schema-bench/src/cli/executeLiamDbShared.ts
index cd7379c8b3..e6f7614588 100644
--- a/frontend/internal-packages/schema-bench/src/cli/executeLiamDbShared.ts
+++ b/frontend/internal-packages/schema-bench/src/cli/executeLiamDbShared.ts
@@ -1,11 +1,11 @@
 #!/usr/bin/env node
 
-import { resolve } from 'node:path'
+import { join, resolve } from 'node:path'
 import { config } from 'dotenv'
 import { err, ok, type Result } from 'neverthrow'
 import * as v from 'valibot'
 import { execute, type LiamDbExecutorInput } from '../executors/liamDb/index.ts'
-import { loadInputFiles, saveOutputFile } from './utils'
+import { loadJsonFiles, saveOutputFile } from './utils'
 
 config({ path: resolve(__dirname, '../../../../../.env') })
 
@@ -52,10 +52,10 @@ export async function processDataset(
   datasetPath: string,
 ): Promise<DatasetResult> {
   // Load input files
-  const inputsResult = await loadInputFiles<
+  const inputsResult = await loadJsonFiles<
     typeof InputSchema,
     LiamDbExecutorInput
-  >(datasetPath, InputSchema, (value) => ({
+  >(join(datasetPath, 'execution', 'input'), InputSchema, (value) => ({
     input: typeof value === 'string' ? value : value.input,
   }))
   if (inputsResult.isErr()) {
@@ -76,10 +76,10 @@ export async function processDataset(
   let failureCount = 0
 
   const processBatch = async (
-    batch: Array<{ caseId: string; input: LiamDbExecutorInput }>,
+    batch: Array<{ caseId: string; data: LiamDbExecutorInput }>,
   ) => {
-    const promises = batch.map(({ caseId, input }) =>
-      executeCase(datasetPath, caseId, input),
+    const promises = batch.map(({ caseId, data }) =>
+      executeCase(datasetPath, caseId, data),
     )
     const results = await Promise.allSettled(promises)
 
diff --git a/frontend/internal-packages/schema-bench/src/cli/executeOpenaiUnified.ts b/frontend/internal-packages/schema-bench/src/cli/executeOpenaiUnified.ts
index 1dbec40fed..a9e943f668 100644
--- a/frontend/internal-packages/schema-bench/src/cli/executeOpenaiUnified.ts
+++ b/frontend/internal-packages/schema-bench/src/cli/executeOpenaiUnified.ts
@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 
-import { resolve } from 'node:path'
+import { join, resolve } from 'node:path'
 import { config as loadEnv } from 'dotenv'
 import { err, ok, type Result } from 'neverthrow'
 import * as v from 'valibot'
@@ -11,7 +11,7 @@ import {
   getWorkspacePath,
   handleCliError,
   handleUnexpectedError,
-  loadInputFiles,
+  loadJsonFiles,
   parseArgs,
   saveOutputFile,
   selectTargetDatasets,
@@ -53,10 +53,12 @@ async function processDataset(
   datasetName: string,
   datasetPath: string,
 ): Promise<DatasetResult> {
-  const inputsResult = await loadInputFiles<
+  const inputsResult = await loadJsonFiles<
     typeof InputSchema,
     OpenAIExecutorInput
-  >(datasetPath, InputSchema, (value) => ({ input: value.input }))
+  >(join(datasetPath, 'execution', 'input'), InputSchema, (value) => ({
+    input: value.input,
+  }))
   if (inputsResult.isErr()) {
     console.warn(`⚠️  ${datasetName}: ${inputsResult.error.message}`)
     return { datasetName, success: 0, failure: 1 }
@@ -73,10 +75,10 @@ async function processDataset(
   let failureCount = 0
 
   const processBatch = async (
-    batch: Array<{ caseId: string; input: OpenAIExecutorInput }>,
+    batch: Array<{ caseId: string; data: OpenAIExecutorInput }>,
   ) => {
-    const promises = batch.map(({ caseId, input }) =>
-      executeCase(executor, datasetPath, caseId, input),
+    const promises = batch.map(({ caseId, data }) =>
+      executeCase(executor, datasetPath, caseId, data),
     )
     const results = await Promise.allSettled(promises)
     results.forEach((result) => {
diff --git a/frontend/internal-packages/schema-bench/src/cli/utils/index.ts b/frontend/internal-packages/schema-bench/src/cli/utils/index.ts
index 0091f5de42..2fbed6f5da 100644
--- a/frontend/internal-packages/schema-bench/src/cli/utils/index.ts
+++ b/frontend/internal-packages/schema-bench/src/cli/utils/index.ts
@@ -2,7 +2,7 @@ export * from './discoverDefaultDatasets.ts'
 export * from './error.ts'
 export * from './filterAndResolveDatasets.ts'
 export * from './listAllDatasets.ts'
-export * from './loadInputFiles.ts'
+export * from './loadJsonFiles.ts'
 export * from './parseArgs.ts'
 export * from './saveOutputFile.ts'
 export * from './selectTargetDatasets.ts'
diff --git a/frontend/internal-packages/schema-bench/src/cli/utils/loadInputFiles.ts b/frontend/internal-packages/schema-bench/src/cli/utils/loadJsonFiles.ts
similarity index 63%
rename from frontend/internal-packages/schema-bench/src/cli/utils/loadInputFiles.ts
rename to frontend/internal-packages/schema-bench/src/cli/utils/loadJsonFiles.ts
index 6f66059fcc..64885efac5 100644
--- a/frontend/internal-packages/schema-bench/src/cli/utils/loadInputFiles.ts
+++ b/frontend/internal-packages/schema-bench/src/cli/utils/loadJsonFiles.ts
@@ -11,39 +11,33 @@ import {
 import * as v from 'valibot'
 
 /**
- * Load and validate input JSON files under `execution/input` for a dataset.
- * The schema and normalize function allow callers to adapt to per-executor needs.
+ * Load and validate JSON files from a specified directory.
+ * Generic function that can be used for both input and reference files.
  */
-export async function loadInputFiles<
+export async function loadJsonFiles<
   Schema extends v.BaseSchema<unknown, unknown, v.BaseIssue<unknown>>,
   T,
 >(
-  datasetPath: string,
+  directory: string,
   schema: Schema,
   normalize: (value: v.InferOutput<Schema>) => T,
-): Promise<Result<Array<{ caseId: string; input: T }>, Error>> {
-  const inputDir = join(datasetPath, 'execution/input')
-
-  if (!existsSync(inputDir)) {
-    return err(
-      new Error(
-        `Input directory not found: ${inputDir}. Please run setup-workspace first.`,
-      ),
-    )
+): Promise<Result<Array<{ caseId: string; data: T }>, Error>> {
+  if (!existsSync(directory)) {
+    return err(new Error(`Directory not found: ${directory}`))
   }
 
-  const filesResult = await fromPromise(readdir(inputDir), (error) =>
+  const filesResult = await fromPromise(readdir(directory), (error) =>
     error instanceof Error ? error : new Error('Failed to read directory'),
   )
   if (filesResult.isErr()) return err(filesResult.error)
 
   const jsonFiles = filesResult.value.filter((file) => file.endsWith('.json'))
-  const inputs: Array<{ caseId: string; input: T }> = []
+  const results: Array<{ caseId: string; data: T }> = []
 
   for (const file of jsonFiles) {
     const caseId = file.replace('.json', '')
     const contentResult = await fromPromise(
-      readFile(join(inputDir, file), 'utf-8'),
+      readFile(join(directory, file), 'utf-8'),
       (error) =>
         error instanceof Error
           ? error
@@ -64,14 +58,14 @@ export async function loadInputFiles<
     if (!validationResult.success) {
       return err(
         new Error(
-          `Invalid input format in ${file}: ${JSON.stringify(validationResult.issues)}`,
+          `Invalid format in ${file}: ${JSON.stringify(validationResult.issues)}`,
         ),
       )
     }
 
     const normalized: T = normalize(validationResult.output)
-    inputs.push({ caseId, input: normalized })
+    results.push({ caseId, data: normalized })
   }
 
-  return ok(inputs)
+  return ok(results)
 }

From b4b4c31eb6bea7e8b6c6675a0e55f59bec9bde52 Mon Sep 17 00:00:00 2001
From: MH4GF <h.miyagi.cnw@gmail.com>
Date: Wed, 8 Oct 2025 12:14:10 +0900
Subject: [PATCH 2/5] feat(schema-bench): add LangSmith integration for
 evaluation tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add comprehensive LangSmith integration to schema-bench for advanced evaluation tracking, visualization, and experiment comparison.

Key features:
- LangSmith dataset upload from local workspace files
- Evaluation execution with LiamDB and OpenAI executors
- Schema metrics tracking (table/column counts, F1 scores, recall)
- Configurable repetitions and concurrency
- neverthrow-based error handling for type safety

Implementation details:
- Created evaluateWithLangsmith.ts for running evaluations
- Created uploadDatasetToLangsmith.ts for dataset management
- Added schemaEvaluator for computing schema similarity metrics
- Integrated @liam-hq/neverthrow for functional error handling
- Reused the loadJsonFiles utility (introduced in the previous commit) to load both input and reference files
- Used ResultAsync.combine for parallel async operations

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../internal-packages/schema-bench/README.md  |  55 ++++++
 .../schema-bench/package.json                 |   3 +
 .../src/cli/evaluateWithLangsmith.ts          | 162 ++++++++++++++++
 .../src/cli/uploadDatasetToLangsmith.ts       | 174 ++++++++++++++++++
 .../src/langsmith/schemaEvaluator.ts          |  61 ++++++
 .../schema-bench/src/langsmith/types.ts       |  15 ++
 pnpm-lock.yaml                                |   9 +-
 7 files changed, 476 insertions(+), 3 deletions(-)
 create mode 100644 frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
 create mode 100644 frontend/internal-packages/schema-bench/src/cli/uploadDatasetToLangsmith.ts
 create mode 100644 frontend/internal-packages/schema-bench/src/langsmith/schemaEvaluator.ts
 create mode 100644 frontend/internal-packages/schema-bench/src/langsmith/types.ts

diff --git a/frontend/internal-packages/schema-bench/README.md b/frontend/internal-packages/schema-bench/README.md
index 5806bb8a24..b61d71068e 100644
--- a/frontend/internal-packages/schema-bench/README.md
+++ b/frontend/internal-packages/schema-bench/README.md
@@ -187,3 +187,58 @@ export OPENAI_API_KEY="your-api-key"
 - Model comparison across datasets
 - Quality assurance for schema generation
 - Repeatable benchmarking with standardized metrics
+
+## LangSmith Integration
+
+Schema-bench integrates with [LangSmith](https://smith.langchain.com) for advanced evaluation tracking, visualization, and experiment comparison.
+
+### Setup
+
+1. Set your LangSmith API key:
+```bash
+export LANGSMITH_API_KEY="your-api-key"
+```
+
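+2. Upload datasets to LangSmith (run once, then again whenever the local benchmark files change; existing examples are updated rather than duplicated):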
+```bash
+# Upload all datasets
+pnpm --filter @liam-hq/schema-bench langsmith:upload -all
+
+# Upload specific datasets
+pnpm --filter @liam-hq/schema-bench langsmith:upload -default
+pnpm --filter @liam-hq/schema-bench langsmith:upload -entity-extraction
+```
+
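+This creates one LangSmith dataset per local dataset, named `schema-bench-<dataset-name>` (an existing dataset with that name is reused), from your local benchmark workspace files.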
+
+### Running Evaluations
+
+The `langsmith` script runs execution and evaluation in a single command. Use the same dataset flags as the regular executors, plus `--liamdb` or `--openai` to choose the executor (default: `--liamdb`):
+
+```bash
+# LiamDB: Run on all datasets
+pnpm --filter @liam-hq/schema-bench langsmith -all --liamdb
+
+# LiamDB: Run on specific datasets
+pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb
+pnpm --filter @liam-hq/schema-bench langsmith -default -entity-extraction --liamdb
+
+# OpenAI: Run on all datasets
+pnpm --filter @liam-hq/schema-bench langsmith -all --openai
+
+# OpenAI: Run on specific datasets
+pnpm --filter @liam-hq/schema-bench langsmith -default --openai
+pnpm --filter @liam-hq/schema-bench langsmith -entity-extraction -relational-inference --openai
+
+# Advanced options
+pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb --num-repetitions=5
+pnpm --filter @liam-hq/schema-bench langsmith -default --liamdb --max-concurrency=5
+```
+
+**Options:**
+- `--num-repetitions=N`: Number of times to run each test case (default: 3)
+- `--max-concurrency=N`: Maximum concurrent executions (default: 3)
+
+### Viewing Results
+
+After the evaluation completes, a LangSmith URL will be displayed in the console. Open this URL to view detailed evaluation results.
diff --git a/frontend/internal-packages/schema-bench/package.json b/frontend/internal-packages/schema-bench/package.json
index 089887f961..f3e1f7acfe 100644
--- a/frontend/internal-packages/schema-bench/package.json
+++ b/frontend/internal-packages/schema-bench/package.json
@@ -6,6 +6,7 @@
   "dependencies": {
     "@huggingface/transformers": "3.3.3",
     "@liam-hq/agent": "workspace:*",
+    "@liam-hq/neverthrow": "workspace:*",
     "@liam-hq/schema": "workspace:*",
     "dotenv": "16.5.0",
     "langsmith": "0.3.69",
@@ -30,6 +31,8 @@
     "fmt": "concurrently \"pnpm:fmt:*\"",
     "fmt:biome": "biome check --write --unsafe .",
     "fmt:eslint": "eslint --fix .",
+    "langsmith": "tsx src/cli/evaluateWithLangsmith.ts",
+    "langsmith:upload": "tsx src/cli/uploadDatasetToLangsmith.ts",
     "lint": "concurrently \"pnpm:lint:*\"",
     "lint:biome": "biome check .",
     "lint:eslint": "eslint .",
diff --git a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
new file mode 100644
index 0000000000..cde090087d
--- /dev/null
+++ b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
@@ -0,0 +1,162 @@
+#!/usr/bin/env node
+
+import { resolve } from 'node:path'
+import { config } from 'dotenv'
+import { evaluate } from 'langsmith/evaluation'
+import { fromPromise, ResultAsync } from 'neverthrow'
+import * as v from 'valibot'
+import { execute as executeLiamDb } from '../executors/liamDb/liamDbExecutor.ts'
+import { OpenAIExecutor } from '../executors/openai/openaiExecutor.ts'
+import { schemaEvaluator } from '../langsmith/schemaEvaluator.ts'
+import type { LangSmithInput, LangSmithOutput } from '../langsmith/types.ts'
+import {
+  filterAndResolveDatasets,
+  getWorkspacePath,
+  handleCliError,
+  handleUnexpectedError,
+  parseArgs,
+  selectTargetDatasets,
+} from './utils/index.ts'
+
+config({ path: resolve(__dirname, '../../../../../.env') })
+
+const executorTypeSchema = v.picklist(['liamdb', 'openai'])
+const positiveIntegerSchema = v.pipe(
+  v.union([v.pipe(v.string(), v.transform(Number)), v.number()]),
+  v.integer(),
+  v.minValue(1),
+)
+const optionsSchema = v.object({
+  executorType: v.optional(executorTypeSchema, 'liamdb'),
+  numRepetitions: v.optional(positiveIntegerSchema, 3),
+  maxConcurrency: v.optional(positiveIntegerSchema, 3),
+})
+
+type ExecutorOptions = v.InferOutput<typeof optionsSchema>
+type ExecutorType = v.InferOutput<typeof executorTypeSchema>
+
+const parseExecutorAndOptions = (argv: string[]): ExecutorOptions => {
+  const args = argv.slice(2)
+
+  const rawOptions: Record<string, unknown> = {}
+
+  for (const arg of args) {
+    if (arg === '--openai') {
+      rawOptions['executorType'] = 'openai'
+    } else if (arg === '--liamdb') {
+      rawOptions['executorType'] = 'liamdb'
+    } else if (arg.startsWith('--num-repetitions=')) {
+      rawOptions['numRepetitions'] = arg.split('=')[1]
+    } else if (arg.startsWith('--max-concurrency=')) {
+      rawOptions['maxConcurrency'] = arg.split('=')[1]
+    }
+  }
+
+  return v.parse(optionsSchema, rawOptions)
+}
+
+const createTarget = (
+  executorType: ExecutorType,
+): ((input: LangSmithInput) => Promise<LangSmithOutput>) => {
+  if (executorType === 'liamdb') {
+    return async (input: LangSmithInput): Promise<LangSmithOutput> => {
+      const prompt = input.prompt || input.input || ''
+
+      const result = await executeLiamDb({ input: prompt })
+
+      if (result.isErr()) {
+        throw result.error
+      }
+
+      return { schema: result.value }
+    }
+  }
+
+  if (executorType === 'openai') {
+    const apiKey = process.env['OPENAI_API_KEY']
+    if (!apiKey) {
+      handleCliError('OPENAI_API_KEY environment variable is required')
+    }
+
+    const executor = new OpenAIExecutor({ apiKey: apiKey || '' })
+
+    return async (input: LangSmithInput): Promise<LangSmithOutput> => {
+      const prompt = input.prompt || input.input || ''
+
+      const result = await executor.execute({ input: prompt })
+
+      if (result.isErr()) {
+        throw result.error
+      }
+
+      return { schema: result.value }
+    }
+  }
+
+  return handleCliError(`Unknown executor type: ${executorType}`)
+}
+
+type ExperimentResults = Awaited<ReturnType<typeof evaluate>>
+
+const runEvaluation = (
+  datasetName: string,
+  options: ExecutorOptions,
+): ResultAsync<ExperimentResults, Error> => {
+  const target = createTarget(options.executorType)
+
+  return fromPromise(
+    evaluate(target, {
+      data: `schema-bench-${datasetName}`,
+      evaluators: [schemaEvaluator],
+      experimentPrefix: `${options.executorType}-${datasetName}`,
+      maxConcurrency: options.maxConcurrency,
+      numRepetitions: options.numRepetitions,
+    }),
+    (error) => {
+      return error instanceof Error ? error : new Error(String(error))
+    },
+  )
+}
+
+const runDatasets = async (
+  datasets: Array<{ name: string }>,
+  options: ExecutorOptions,
+) => {
+  const results = datasets.map(({ name }) => runEvaluation(name, options))
+  return ResultAsync.combineWithAllErrors(results)
+}
+
+const main = async () => {
+  // Filter out executor options (--xxx) for parseArgs
+  const datasetArgs = process.argv.filter((arg) => !arg.startsWith('--'))
+
+  // Parse dataset flags using existing utility
+  const cliOptions = parseArgs(datasetArgs)
+
+  // Parse executor and evaluation options
+  const options = parseExecutorAndOptions(process.argv)
+
+  // Get workspace and select datasets
+  const workspacePath = getWorkspacePath()
+  const targetDatasets = selectTargetDatasets(cliOptions, workspacePath)
+
+  if (targetDatasets.length === 0) {
+    handleCliError('No datasets found to process. Use -all or -<dataset-name>')
+  }
+
+  const validDatasets = filterAndResolveDatasets(targetDatasets, workspacePath)
+
+  if (validDatasets.length === 0) {
+    handleCliError('No valid datasets found in workspace')
+  }
+
+  const result = await runDatasets(validDatasets, options)
+
+  if (result.isErr()) {
+    process.exit(1)
+  }
+}
+
+if (import.meta.url === `file://${process.argv[1]}`) {
+  main().catch(handleUnexpectedError)
+}
diff --git a/frontend/internal-packages/schema-bench/src/cli/uploadDatasetToLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/uploadDatasetToLangsmith.ts
new file mode 100644
index 0000000000..5d4d4482f8
--- /dev/null
+++ b/frontend/internal-packages/schema-bench/src/cli/uploadDatasetToLangsmith.ts
@@ -0,0 +1,174 @@
+#!/usr/bin/env node
+
+import { join, resolve } from 'node:path'
+import { fromPromise } from '@liam-hq/neverthrow'
+import { type Schema, schemaSchema } from '@liam-hq/schema'
+import { config } from 'dotenv'
+import { Client } from 'langsmith'
+import { ResultAsync } from 'neverthrow'
+import * as v from 'valibot'
+import type { LangSmithDatasetConfig } from '../langsmith/types.ts'
+import {
+  filterAndResolveDatasets,
+  getWorkspacePath,
+  handleCliError,
+  handleUnexpectedError,
+  loadJsonFiles,
+  parseArgs,
+  selectTargetDatasets,
+} from './utils/index.ts'
+
+config({ path: resolve(__dirname, '../../../../../.env') })
+
+const inputSchema = v.union([v.string(), v.object({ input: v.string() })])
+
+const getOrCreateDataset = (client: Client, datasetName: string) => {
+  return fromPromise(client.readDataset({ datasetName })).orElse(() =>
+    fromPromise(client.createDataset(datasetName)),
+  )
+}
+
+const findExistingExample = async (
+  client: Client,
+  datasetId: string,
+  caseId: string,
+) => {
+  for await (const example of client.listExamples({ datasetId })) {
+    const metadata = example.metadata
+    if (metadata && 'caseId' in metadata && metadata['caseId'] === caseId) {
+      return example
+    }
+  }
+  return null
+}
+
+const uploadOrUpdateExample = async (
+  client: Client,
+  datasetId: string,
+  caseId: string,
+  input: { input: string },
+  reference: { schema: Schema },
+) => {
+  const existingExample = await findExistingExample(client, datasetId, caseId)
+
+  if (existingExample) {
+    await client.updateExample({
+      id: existingExample.id,
+      inputs: input,
+      outputs: reference,
+    })
+  } else {
+    await client.createExample({
+      inputs: input,
+      outputs: reference,
+      dataset_id: datasetId,
+      metadata: { caseId },
+    })
+  }
+}
+
+type DatasetResult = Awaited<ReturnType<typeof Client.prototype.readDataset>>
+
+const uploadExamples = (
+  client: Client,
+  dataset: DatasetResult,
+  inputs: Array<{ caseId: string; data: { input: string } }>,
+  references: Array<{ caseId: string; data: { schema: Schema } }>,
+): ResultAsync<void, Error> => {
+  const uploadExamplesPromise = async () => {
+    for (const inputItem of inputs) {
+      const reference = references.find((r) => r.caseId === inputItem.caseId)
+
+      if (!reference) {
+        console.warn(
+          `⚠️  No reference found for case: ${inputItem.caseId}, skipping`,
+        )
+        continue
+      }
+
+      await uploadOrUpdateExample(
+        client,
+        dataset.id,
+        inputItem.caseId,
+        inputItem.data,
+        reference.data,
+      )
+    }
+  }
+
+  return fromPromise(uploadExamplesPromise())
+}
+
+const uploadDataset = (
+  config: LangSmithDatasetConfig,
+): ResultAsync<void, Error> => {
+  const client = new Client()
+
+  const inputDir = join(config.workspacePath, 'execution', 'input')
+  const referenceDir = join(config.workspacePath, 'execution', 'reference')
+
+  const datasetResult = getOrCreateDataset(client, config.datasetName)
+  const inputsResult = fromPromise(
+    loadJsonFiles(inputDir, inputSchema, (value) =>
+      typeof value === 'string' ? { input: value } : { input: value.input },
+    ).then((result) => {
+      if (result.isErr()) throw result.error
+      return result.value
+    }),
+  )
+  const referencesResult = fromPromise(
+    loadJsonFiles(referenceDir, schemaSchema, (value) => ({
+      schema: value,
+    })).then((result) => {
+      if (result.isErr()) throw result.error
+      return result.value
+    }),
+  )
+
+  return ResultAsync.combine([
+    datasetResult,
+    inputsResult,
+    referencesResult,
+  ]).andThen(([dataset, inputs, references]) =>
+    uploadExamples(client, dataset, inputs, references),
+  )
+}
+
+const runUploads = (datasets: Array<{ name: string; path: string }>) => {
+  const results = datasets.map(({ name, path }) =>
+    uploadDataset({
+      datasetName: `schema-bench-${name}`,
+      workspacePath: path,
+    }),
+  )
+  return ResultAsync.combineWithAllErrors(results)
+}
+
+const main = async () => {
+  // Parse dataset flags using existing utility
+  const cliOptions = parseArgs(process.argv)
+
+  // Get workspace and select datasets
+  const workspacePath = getWorkspacePath()
+  const targetDatasets = selectTargetDatasets(cliOptions, workspacePath)
+
+  if (targetDatasets.length === 0) {
+    handleCliError('No datasets found to process. Use -all or -<dataset-name>')
+  }
+
+  const validDatasets = filterAndResolveDatasets(targetDatasets, workspacePath)
+
+  if (validDatasets.length === 0) {
+    handleCliError('No valid datasets found in workspace')
+  }
+
+  const result = await runUploads(validDatasets)
+
+  if (result.isErr()) {
+    process.exit(1)
+  }
+}
+
+if (import.meta.url === `file://${process.argv[1]}`) {
+  main().catch(handleUnexpectedError)
+}
diff --git a/frontend/internal-packages/schema-bench/src/langsmith/schemaEvaluator.ts b/frontend/internal-packages/schema-bench/src/langsmith/schemaEvaluator.ts
new file mode 100644
index 0000000000..2d7d422b74
--- /dev/null
+++ b/frontend/internal-packages/schema-bench/src/langsmith/schemaEvaluator.ts
@@ -0,0 +1,61 @@
+import { type Schema, schemaSchema } from '@liam-hq/schema'
+import type { EvaluatorT } from 'langsmith/evaluation'
+import type { Example, Run } from 'langsmith/schemas'
+import * as v from 'valibot'
+import { evaluate } from '../evaluate/evaluate.ts'
+
+const validateSchema = (data: unknown): Schema => v.parse(schemaSchema, data)
+
+const countTables = (schema: Schema): number => {
+  return Object.keys(schema.tables).length
+}
+
+const countColumns = (schema: Schema): number => {
+  return Object.values(schema.tables).reduce(
+    (total, table) => total + Object.keys(table.columns).length,
+    0,
+  )
+}
+
+export const schemaEvaluator: EvaluatorT = async (args: {
+  run: Run
+  example: Example
+  inputs: Record<string, unknown>
+  outputs: Record<string, unknown>
+  referenceOutputs?: Record<string, unknown>
+}) => {
+  const referenceSchema = validateSchema(args.referenceOutputs?.['schema'])
+  const outputSchema = validateSchema(args.outputs['schema'])
+  const result = await evaluate(referenceSchema, outputSchema)
+
+  return [
+    {
+      key: 'Table Count',
+      score: countTables(outputSchema),
+    },
+    {
+      key: 'Column Count',
+      score: countColumns(outputSchema),
+    },
+    {
+      key: 'Table F1 Score',
+      score: result.tableF1Score,
+    },
+    {
+      key: 'Table Recall',
+      score: result.tableRecall,
+    },
+    {
+      key: 'Column F1 Score Average',
+      score: result.columnF1ScoreAverage,
+    },
+    {
+      key: 'Column Recall Average',
+      score: result.columnRecallAverage,
+    },
+    {
+      key: 'Column All Correct Rate Average',
+      score: result.columnAllCorrectRateAverage,
+    },
+  ]
+}
diff --git a/frontend/internal-packages/schema-bench/src/langsmith/types.ts b/frontend/internal-packages/schema-bench/src/langsmith/types.ts
new file mode 100644
index 0000000000..ea4a56852b
--- /dev/null
+++ b/frontend/internal-packages/schema-bench/src/langsmith/types.ts
@@ -0,0 +1,15 @@
+import type { Schema } from '@liam-hq/schema'
+
+export type LangSmithInput = {
+  prompt?: string
+  input?: string
+}
+
+export type LangSmithOutput = {
+  schema: Schema
+}
+
+export type LangSmithDatasetConfig = {
+  datasetName: string
+  workspacePath: string
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 0378ee8647..d1974981bb 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -115,7 +115,7 @@ importers:
         version: link:../../packages/ui
       '@next/third-parties':
         specifier: 15.3.5
-        version: 15.3.5(next@15.4.7(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)
+        version: 15.3.5(next@15.4.7(@babel/core@7.28.4)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)
       '@sentry/nextjs':
         specifier: '9'
         version: 9.46.0(@opentelemetry/context-async-hooks@2.1.0(@opentelemetry/api@1.9.0))(@opentelemetry/core@2.1.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.1.0(@opentelemetry/api@1.9.0))(next@15.4.7(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)(webpack@5.102.0(@swc/core@1.12.11))
@@ -254,7 +254,7 @@ importers:
         version: link:../../packages/ui
       '@next/third-parties':
         specifier: 15.3.5
-        version: 15.3.5(next@15.4.7(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)
+        version: 15.3.5(next@15.4.7(@babel/core@7.28.4)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)
       '@rive-app/react-canvas':
         specifier: 4.23.3
         version: 4.23.3(react@19.1.1)
@@ -675,6 +675,9 @@ importers:
       '@liam-hq/agent':
         specifier: workspace:*
         version: link:../agent
+      '@liam-hq/neverthrow':
+        specifier: workspace:*
+        version: link:../neverthrow
       '@liam-hq/schema':
         specifier: workspace:*
         version: link:../../packages/schema
@@ -13337,7 +13340,7 @@ snapshots:
   '@next/swc-win32-x64-msvc@15.4.7':
     optional: true
 
-  '@next/third-parties@15.3.5(next@15.4.7(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)':
+  '@next/third-parties@15.3.5(next@15.4.7(@babel/core@7.28.4)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1))(react@19.1.1)':
     dependencies:
       next: 15.4.7(@babel/core@7.28.4)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@19.1.1(react@19.1.1))(react@19.1.1)
       react: 19.1.1

From 7433be982410787516327fbea187eba511286c06 Mon Sep 17 00:00:00 2001
From: MH4GF <h.miyagi.cnw@gmail.com>
Date: Wed, 8 Oct 2025 12:18:09 +0900
Subject: [PATCH 3/5] refactor(schema-bench): use @liam-hq/neverthrow default
 error handler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace manual error handler in fromPromise with @liam-hq/neverthrow's
default error handler for consistency across the codebase.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../schema-bench/src/cli/evaluateWithLangsmith.ts           | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
index cde090087d..0c6a8f8965 100644
--- a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
+++ b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
@@ -1,9 +1,10 @@
 #!/usr/bin/env node
 
 import { resolve } from 'node:path'
+import { fromPromise } from '@liam-hq/neverthrow'
 import { config } from 'dotenv'
 import { evaluate } from 'langsmith/evaluation'
-import { fromPromise, ResultAsync } from 'neverthrow'
+import { ResultAsync } from 'neverthrow'
 import * as v from 'valibot'
 import { execute as executeLiamDb } from '../executors/liamDb/liamDbExecutor.ts'
 import { OpenAIExecutor } from '../executors/openai/openaiExecutor.ts'
@@ -112,9 +113,6 @@ const runEvaluation = (
       maxConcurrency: options.maxConcurrency,
       numRepetitions: options.numRepetitions,
     }),
-    (error) => {
-      return error instanceof Error ? error : new Error(String(error))
-    },
   )
 }
 

From 0667f1ddabc758a8b1a0f8704fb59831542b111e Mon Sep 17 00:00:00 2001
From: Hirotaka Miyagi <31152321+MH4GF@users.noreply.github.com>
Date: Wed, 8 Oct 2025 12:27:16 +0900
Subject: [PATCH 4/5] remove unnecessary fallback

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../schema-bench/src/cli/evaluateWithLangsmith.ts               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
index 0c6a8f8965..cb78ce377a 100644
--- a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
+++ b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
@@ -79,7 +79,7 @@ const createTarget = (
       handleCliError('OPENAI_API_KEY environment variable is required')
     }
 
-    const executor = new OpenAIExecutor({ apiKey: apiKey || '' })
+    const executor = new OpenAIExecutor({ apiKey: apiKey })
 
     return async (input: LangSmithInput): Promise<LangSmithOutput> => {
       const prompt = input.prompt || input.input || ''

From 8852d2557e6033553dd3eb85037997a3afd7937d Mon Sep 17 00:00:00 2001
From: MH4GF <h.miyagi.cnw@gmail.com>
Date: Wed, 8 Oct 2025 12:33:33 +0900
Subject: [PATCH 5/5] fix(schema-bench): resolve TypeScript type error in
 evaluateWithLangsmith
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add non-null assertion for apiKey after handleCliError check.

Since handleCliError returns never, apiKey is guaranteed to be defined
after the check, but TypeScript's control flow analysis doesn't recognize this.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../schema-bench/src/cli/evaluateWithLangsmith.ts               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
index cb78ce377a..0dc7e1a40f 100644
--- a/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
+++ b/frontend/internal-packages/schema-bench/src/cli/evaluateWithLangsmith.ts
@@ -79,7 +79,7 @@ const createTarget = (
       handleCliError('OPENAI_API_KEY environment variable is required')
     }
 
-    const executor = new OpenAIExecutor({ apiKey: apiKey })
+    const executor = new OpenAIExecutor({ apiKey: apiKey! })
 
     return async (input: LangSmithInput): Promise<LangSmithOutput> => {
       const prompt = input.prompt || input.input || ''