diff --git a/docs/api/README.md b/docs/api/README.md index de76fe2..310b4be 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -137,13 +137,24 @@ interface HttpBridgeOptions { ## Codec Utilities ```ts -import { decodeValue, decodeValueAsync, registerArrowDecoder, clearArrowDecoder } from 'tywrap'; +import { + decodeValue, + decodeValueAsync, + autoRegisterArrowDecoder, + registerArrowDecoder, + clearArrowDecoder, +} from 'tywrap'; + +// NodeBridge auto-registers when apache-arrow is installed. +// If you're decoding outside the bridge, call autoRegisterArrowDecoder() or register manually: +const arrowReady = await autoRegisterArrowDecoder(); +// if (!arrowReady) throw new Error('Install apache-arrow or enable JSON fallback'); +// registerArrowDecoder(bytes => bytes); -registerArrowDecoder(bytes => bytes); const value = await decodeValueAsync(pythonValue); ``` -Arrow-encoded payloads throw unless you register a decoder or enable JSON fallback on the Python bridge. +Arrow-encoded payloads throw unless a decoder is registered or JSON fallback is enabled on the Python bridge. ## Error Types diff --git a/docs/codec-roadmap.md b/docs/codec-roadmap.md index 9d59e0a..66af6d2 100644 --- a/docs/codec-roadmap.md +++ b/docs/codec-roadmap.md @@ -14,6 +14,13 @@ This is a forward-looking plan for adding codecs beyond numpy/pandas. The focus - Keep failures explicit unless a user opts into lossy fallbacks. - Avoid heavy implicit conversions (CPU/GPU) without clear config. +## DX Defaults (Decisions) + +- Arrow is the default for ndarray/dataframe/series. The JS runtime should auto-register an Arrow decoder when `apache-arrow` is installed so users do not have to wire it manually. +- JSON fallback is opt-in only (via `TYWRAP_CODEC_FALLBACK=json`) and remains explicitly lossy for dtype/NA fidelity. +- GPU handling stays explicit: no implicit `.cpu()` or contiguous copies. 
Opt-in copy/transfer remains available, and GPU-native transport is a follow-up track (DLPack/Arrow CUDA). +- Large payloads should not be forced through single-line JSONL forever; add an artifact/chunked transport to keep responses reliable without silent truncation. + ## Envelope Conventions All tywrap codec envelopes share: @@ -28,6 +35,11 @@ The subprocess JSONL transport is not streaming: large results must fit in memor - Set `TYWRAP_CODEC_MAX_BYTES` (bytes, UTF-8) to cap the maximum serialized response size emitted by the Python bridge. - If exceeded, the call fails with an explicit error instead of attempting a silent fallback. +- Planned: add an artifact/chunked transport path for large payloads to avoid JSONL size ceilings. + +### Feature Detection + +Bridge metadata should surface optional codec availability to help the JS side decide when to rely on SciPy/Torch/Sklearn codecs. ## SciPy (sparse matrices) @@ -93,6 +105,7 @@ Notes: - Default to CPU tensors; require opt-in for `.cpu()` conversion. - Reject non-contiguous tensors unless explicitly allowed. - Opt-in copy/transfer via `TYWRAP_TORCH_ALLOW_COPY=1`. +- Future: GPU-native transport (DLPack/Arrow CUDA) to avoid implicit device transfers. ## Sklearn (models + outputs) @@ -123,9 +136,7 @@ Notes: 1. Define envelope specs and validation tests (round-trip + size limits). 2. Implement Python bridge serialization with feature detection. 3. Implement JS decoder + type mapping presets. -4. Add performance gates and CI coverage for the new codecs. - -## Open Questions - -- GPU handling: do we allow implicit device transfers? -- Max payload sizes: per-call limits vs global caps beyond `TYWRAP_CODEC_MAX_BYTES`? +4. Make Arrow frictionless (auto-register decoder + living app on Arrow path). +5. Add payload scaling via artifact/chunked transport (protocol versioned if needed). +6. Improve scientific codec ergonomics (explicit GPU opt-in, SciPy format expansion, safe sklearn opt-ins). +7. 
Add performance gates and CI coverage for the new codecs. diff --git a/docs/runtimes/browser.md b/docs/runtimes/browser.md index c701c6b..284305b 100644 --- a/docs/runtimes/browser.md +++ b/docs/runtimes/browser.md @@ -63,7 +63,7 @@ loading, rely on Pyodide directly. ## Data Transport -Arrow envelopes are supported in the browser if you register an Arrow decoder: +Arrow envelopes are supported in the browser if you register an Arrow decoder (Node auto-registers when `apache-arrow` is installed): ```ts import { registerArrowDecoder } from 'tywrap'; diff --git a/docs/runtimes/nodejs.md b/docs/runtimes/nodejs.md index 9603921..96644f0 100644 --- a/docs/runtimes/nodejs.md +++ b/docs/runtimes/nodejs.md @@ -176,18 +176,21 @@ pip install pyarrow npm install apache-arrow ``` +Auto path (when apache-arrow is installed): + +```typescript +import { autoRegisterArrowDecoder } from 'tywrap'; + +await autoRegisterArrowDecoder(); +``` + +Manual path (customize decoding outside NodeBridge): + ```typescript -import { createRequire } from 'node:module'; import { registerArrowDecoder } from 'tywrap'; +import { tableFromIPC } from 'apache-arrow'; -// Register Arrow decoder for optimal performance -const require = createRequire(import.meta.url); -const { tableFromIPC } = require('apache-arrow'); registerArrowDecoder(bytes => tableFromIPC(bytes)); - -// If you don't register a decoder, Arrow-encoded payloads will throw. -// To accept raw bytes, register a passthrough decoder: -// registerArrowDecoder(bytes => bytes); ``` ### JSON Fallback diff --git a/examples/living-app/README.md b/examples/living-app/README.md index fb48a6b..d04eee8 100644 --- a/examples/living-app/README.md +++ b/examples/living-app/README.md @@ -4,7 +4,7 @@ This is a small but non-trivial “living example” that exercises tywrap end-t - TypeScript (Node) calls into Python via `NodeBridge` - Python uses `pandas` + `numpy` for data work and `pydantic` for config validation -- Rich results (e.g. 
`pandas.DataFrame`) come back via Arrow IPC and are decoded in Node with `apache-arrow` +- Rich results (e.g. `pandas.DataFrame`) come back via Arrow IPC and are decoded in Node with `apache-arrow` (auto-registered by `NodeBridge`) ## What it does diff --git a/examples/living-app/src/index.ts b/examples/living-app/src/index.ts index c76a03c..99adc4b 100644 --- a/examples/living-app/src/index.ts +++ b/examples/living-app/src/index.ts @@ -2,11 +2,9 @@ import { existsSync, mkdtempSync, rmSync } from 'node:fs'; import { dirname, join, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; import { tmpdir } from 'node:os'; -import { createRequire } from 'node:module'; - import { NodeBridge } from 'tywrap/node'; import { setRuntimeBridge } from 'tywrap/runtime'; -import { clearArrowDecoder, registerArrowDecoder } from 'tywrap'; +import { autoRegisterArrowDecoder, clearArrowDecoder } from 'tywrap'; import { driftReport, @@ -52,30 +50,14 @@ function resolveCodecMode(argv: readonly string[]): CodecMode { * Register an Arrow decoder for this Node process. * * Why: `apache-arrow` is an optional dependency and tywrap should run without it in JSON mode. - * We use `require()` instead of ESM `import()` so Node/TypeScript resolve the package's "node" - * export + typings correctly (the ESM export map can otherwise select the DOM build/types). */ async function enableArrowDecoder(): Promise<void> { - const require = createRequire(import.meta.url); - let arrowModule: unknown; - try { - arrowModule = require('apache-arrow'); - } catch (err) { - const code = (err as { code?: unknown }).code; - if (code === 'MODULE_NOT_FOUND') { - throw new Error( - "Arrow mode requires the optional dependency 'apache-arrow'. Install it with `npm install apache-arrow`." 
- ); - } - throw err; - } - const arrow = arrowModule as { - tableFromIPC?: (bytes: Uint8Array) => { toArray?: () => unknown[] }; - }; - if (typeof arrow.tableFromIPC !== 'function') { - throw new Error('apache-arrow does not export tableFromIPC'); + const registered = await autoRegisterArrowDecoder(); + if (!registered) { + throw new Error( + "Arrow mode requires the optional dependency 'apache-arrow'. Install it with `npm install apache-arrow`." + ); } - registerArrowDecoder((bytes: Uint8Array) => arrow.tableFromIPC!(bytes)); } /** diff --git a/runtime/python_bridge.py b/runtime/python_bridge.py index 4ad2e5b..3c9e76a 100644 --- a/runtime/python_bridge.py +++ b/runtime/python_bridge.py @@ -2,6 +2,7 @@ import sys import json import importlib +import importlib.util import os import traceback import base64 @@ -84,14 +85,40 @@ class ProtocolError(Exception): def arrow_available(): + """ + Return True when pyarrow can be imported. + + Why: advertise Arrow capability to the TS side without crashing startup when + pyarrow is optional or missing. + """ try: - import pyarrow # noqa: F401 - except Exception: + import pyarrow + except (ImportError, OSError): return False return True +def module_available(module_name: str) -> bool: + """ + Lightweight feature detection for optional codec dependencies. + + Why: exposes availability in bridge metadata without importing heavy modules or triggering + side effects, so the TS side can decide when to rely on optional codecs. These flags are + best-effort hints; serialization still performs its own import checks for correctness. + """ + try: + return importlib.util.find_spec(module_name) is not None + except (ImportError, AttributeError, TypeError, ValueError): + # Why: guard against unusual importlib edge cases without masking other failures. + return False + + def is_numpy_array(obj): + """ + Detect numpy arrays when NumPy is installed. + + Why: keep NumPy optional while enabling ndarray serialization. 
+ """ try: import numpy as np # noqa: F401 except Exception: @@ -100,6 +127,11 @@ def is_numpy_array(obj): def is_pandas_dataframe(obj): + """ + Detect pandas DataFrame instances when pandas is installed. + + Why: avoid hard pandas dependency while enabling dataframe encoding. + """ try: import pandas as pd # noqa: F401 except Exception: @@ -108,6 +140,11 @@ def is_pandas_dataframe(obj): def is_pandas_series(obj): + """ + Detect pandas Series instances when pandas is installed. + + Why: avoid hard pandas dependency while enabling series encoding. + """ try: import pandas as pd # noqa: F401 except Exception: @@ -116,6 +153,11 @@ def is_pandas_series(obj): def is_scipy_sparse(obj): + """ + Detect scipy sparse matrices when scipy is installed. + + Why: allow sparse matrix encoding without importing scipy in all environments. + """ try: import scipy.sparse as sp # noqa: F401 except Exception: @@ -127,6 +169,11 @@ def is_scipy_sparse(obj): def is_torch_tensor(obj): + """ + Detect torch tensors when torch is installed. + + Why: allow tensor encoding without a hard torch dependency. + """ try: import torch # noqa: F401 except Exception: @@ -138,6 +185,11 @@ def is_torch_tensor(obj): def is_sklearn_estimator(obj): + """ + Detect sklearn estimators for metadata-only serialization. + + Why: allow feature-gated estimator metadata without importing sklearn by default. + """ try: from sklearn.base import BaseEstimator # noqa: F401 except Exception: @@ -547,6 +599,11 @@ def handle_dispose_instance(params): def handle_meta(): + """ + Return bridge metadata for capability detection. + + Why: the Node side uses this to decide whether optional codecs can be used. 
+ """ return { 'protocol': PROTOCOL, 'protocolVersion': PROTOCOL_VERSION, @@ -555,6 +612,9 @@ def handle_meta(): 'pid': os.getpid(), 'codecFallback': 'json' if FALLBACK_JSON else 'none', 'arrowAvailable': arrow_available(), + 'scipyAvailable': module_available('scipy'), + 'torchAvailable': module_available('torch'), + 'sklearnAvailable': module_available('sklearn'), 'instances': len(instances), } diff --git a/src/index.ts b/src/index.ts index 4620522..0a7ba54 100644 --- a/src/index.ts +++ b/src/index.ts @@ -69,6 +69,7 @@ export { detectRuntime, isNodejs, isDeno, isBun, isBrowser } from './utils/runti export { decodeValue, decodeValueAsync, + autoRegisterArrowDecoder, registerArrowDecoder, clearArrowDecoder, } from './utils/codec.js'; diff --git a/src/runtime/node.ts b/src/runtime/node.ts index 8cc6190..c083038 100644 --- a/src/runtime/node.ts +++ b/src/runtime/node.ts @@ -5,8 +5,9 @@ import { existsSync } from 'node:fs'; import { delimiter, isAbsolute, join, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; +import { createRequire } from 'node:module'; -import { decodeValueAsync } from '../utils/codec.js'; +import { autoRegisterArrowDecoder, decodeValueAsync } from '../utils/codec.js'; import { getDefaultPythonPath } from '../utils/python.js'; import { getVenvBinDir, getVenvPythonExe } from '../utils/runtime.js'; import type { BridgeInfo } from '../types/index.js'; @@ -273,6 +274,10 @@ export class NodeBridge extends RuntimeBridge { private async startProcess(): Promise<void> { try { + const require = createRequire(import.meta.url); + await autoRegisterArrowDecoder({ + loader: () => require('apache-arrow'), + }); const { spawn } = await import('child_process'); const allowedPrefixes = ['TYWRAP_']; const allowedKeys = new Set(['PATH', 'PYTHONPATH', 'VIRTUAL_ENV', 'PYTHONHOME']); diff --git a/src/runtime/optimized-node.ts b/src/runtime/optimized-node.ts index 04a2524..486af34 100644 --- a/src/runtime/optimized-node.ts +++ b/src/runtime/optimized-node.ts 
@@ -5,11 +5,12 @@ import { delimiter, isAbsolute, join, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; +import { createRequire } from 'node:module'; import type { ChildProcess } from 'child_process'; import { EventEmitter } from 'events'; import { globalCache } from '../utils/cache.js'; -import { decodeValueAsync } from '../utils/codec.js'; +import { autoRegisterArrowDecoder, decodeValueAsync } from '../utils/codec.js'; import { getDefaultPythonPath } from '../utils/python.js'; import { getVenvBinDir, getVenvPythonExe } from '../utils/runtime.js'; @@ -186,6 +187,11 @@ export class OptimizedNodeBridge extends RuntimeBridge { throw new Error('Bridge has been disposed'); } + const require = createRequire(import.meta.url); + await autoRegisterArrowDecoder({ + loader: () => require('apache-arrow'), + }); + // Ensure minimum processes are available while (this.processPool.length < this.options.minProcesses) { await this.spawnProcess(); diff --git a/src/types/index.ts b/src/types/index.ts index 6581534..2783ff8 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -317,6 +317,9 @@ export interface BridgeInfo { pid: number; codecFallback: 'json' | 'none'; arrowAvailable: boolean; + scipyAvailable: boolean; + torchAvailable: boolean; + sklearnAvailable: boolean; instances: number; } diff --git a/src/utils/codec.ts b/src/utils/codec.ts index fb17e08..bf29cb3 100644 --- a/src/utils/codec.ts +++ b/src/utils/codec.ts @@ -128,6 +128,71 @@ export function hasArrowDecoder(): boolean { return typeof arrowTableFrom === 'function'; } +type ArrowModuleLoader = () => unknown | Promise<unknown>; + +/** + * Detect Node.js runtime capabilities without hard dependencies. + * + * Why: keep browser/bundler builds safe while still enabling Node-only paths. 
+ */ +function isNodeRuntime(): boolean { + return ( + typeof process !== 'undefined' && + typeof (process as { versions?: { node?: string } }).versions?.node === 'string' + ); +} + +/** + * Validate the Arrow module shape and register its IPC decoder. + * + * Why: centralize tableFromIPC checks so callers get consistent errors and can + * rely on a single registration path. + */ +function registerArrowDecoderFromModule(module: { tableFromIPC?: unknown }): void { + const tableFromIPC = module.tableFromIPC; + if (typeof tableFromIPC !== 'function') { + throw new Error('apache-arrow does not export tableFromIPC'); + } + registerArrowDecoder((bytes: Uint8Array) => tableFromIPC(bytes)); +} + +/** + * Attempt to lazily register an Arrow decoder at runtime. + * + * Why: keep apache-arrow optional while letting NodeBridge (or callers) enable + * Arrow decoding when the module is present. + */ +export async function autoRegisterArrowDecoder( + options: { loader?: ArrowModuleLoader } = {} +): Promise<boolean> { + if (hasArrowDecoder()) { + return true; + } + const loader: ArrowModuleLoader | undefined = + options.loader ?? + (isNodeRuntime() + ? 
(async (): Promise<unknown> => { + try { + const nodeModule = await import('node:module'); + const require = nodeModule.createRequire(import.meta.url); + return require('apache-arrow') as unknown; + } catch { + return await import('apache-arrow'); + } + }) + : undefined); + if (!loader) { + return false; + } + try { + const arrowModule = await loader(); + registerArrowDecoderFromModule(arrowModule as { tableFromIPC?: unknown }); + return true; + } catch { + return false; + } +} + function isObject(value: unknown): value is { [k: string]: unknown } { return typeof value === 'object' && value !== null; } diff --git a/test/runtime_codec.test.ts b/test/runtime_codec.test.ts index 1c784de..4b8c027 100644 --- a/test/runtime_codec.test.ts +++ b/test/runtime_codec.test.ts @@ -7,6 +7,7 @@ import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; import { decodeValueAsync, decodeValue, + autoRegisterArrowDecoder, registerArrowDecoder, clearArrowDecoder, hasArrowDecoder, @@ -51,6 +52,47 @@ describe('Cross-Runtime Data Transfer Codec', () => { expect(hasArrowDecoder()).toBe(true); }); + it('should auto-register Arrow decoder from loader', async () => { + const tableFromIPC = vi.fn().mockReturnValue({ numRows: 1, numCols: 1 }); + const loader = vi.fn().mockResolvedValue({ tableFromIPC }); + + const registered = await autoRegisterArrowDecoder({ loader }); + + expect(registered).toBe(true); + expect(loader).toHaveBeenCalled(); + expect(hasArrowDecoder()).toBe(true); + }); + + it('should skip loader when decoder already registered', async () => { + registerArrowDecoder(bytes => bytes); + const loader = vi.fn().mockImplementation(() => { + throw new Error('loader should not be called'); + }); + + const registered = await autoRegisterArrowDecoder({ loader }); + + expect(registered).toBe(true); + expect(loader).not.toHaveBeenCalled(); + }); + + it('should return false when loader lacks tableFromIPC', async () => { + const loader = vi.fn().mockResolvedValue({}); + + const 
registered = await autoRegisterArrowDecoder({ loader }); + + expect(registered).toBe(false); + expect(hasArrowDecoder()).toBe(false); + }); + + it('should return false when loader throws', async () => { + const loader = vi.fn().mockRejectedValue(new Error('missing')); + + const registered = await autoRegisterArrowDecoder({ loader }); + + expect(registered).toBe(false); + expect(hasArrowDecoder()).toBe(false); + }); + it('should initially have no Arrow decoder', () => { clearArrowDecoder(); expect(hasArrowDecoder()).toBe(false); diff --git a/test/runtime_node.test.ts b/test/runtime_node.test.ts index b9e845b..a788140 100644 --- a/test/runtime_node.test.ts +++ b/test/runtime_node.test.ts @@ -140,6 +140,9 @@ describeNodeOnly('Node.js Runtime Bridge', () => { expect(info.protocol).toBe('tywrap/1'); expect(info.protocolVersion).toBeGreaterThan(0); expect(info.pythonVersion).toMatch(/^\d+\.\d+\.\d+$/); + expect(typeof info.scipyAvailable).toBe('boolean'); + expect(typeof info.torchAvailable).toBe('boolean'); + expect(typeof info.sklearnAvailable).toBe('boolean'); const before = info.instances; const handle = await bridge.instantiate('collections', 'Counter', [[1, 2, 2]]);