Skip to content

Commit a51a6e1

Browse files
committed
feat: Use CSafeLoader for faster YAML parsing where available. CSafeLoader can offer 9–11x performance improvements in some cases while still performing the safe loading behavior of safe_load (no code execution)
Signed-off-by: Brian Axelson <86568017+baxeaz@users.noreply.github.com>
1 parent 3920096 commit a51a6e1

File tree

3 files changed

+342
-1
lines changed

3 files changed

+342
-1
lines changed

src/openjd/model/_parse.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22

33
import json
4+
import os
45
from dataclasses import is_dataclass
56
from decimal import Decimal
67
from enum import Enum
@@ -33,6 +34,18 @@ class DocumentType(str, Enum):
3334
YAML = "YAML"
3435

3536

37+
# Environment variable can optionally disable CSafeLoader (used for benchmarking)
use_csafe_loader = os.environ.get("OPENJD_USE_CSAFE_LOADER", "true").lower() in ("true", "1", "yes")

# Prefer the libyaml-backed CSafeLoader when it is available and not disabled;
# fall back to the pure-Python SafeLoader otherwise. Both are "safe" loaders
# (plain data only, no arbitrary object construction / code execution).
# Note: the original raised ImportError by hand to reuse the except branch;
# an explicit if/else avoids using exceptions for control flow and keeps the
# try body limited to the one import that can genuinely fail.
if use_csafe_loader:
    try:
        from yaml import CSafeLoader as _YamlLoader
    except ImportError:
        # PyYAML was built without the libyaml C extension.
        from yaml import SafeLoader as _YamlLoader
else:
    from yaml import SafeLoader as _YamlLoader
47+
48+
3649
# Pydantic injects a __pydantic_model__ attribute into all dataclasses. To be able to parse
3750
# dataclass models we need to be able to invoke Model.__pydantic_model__.model_validate(), but
3851
# type checkers do not realize that pydantic dataclasses have a __pydantic_model__ attribute.
@@ -111,7 +124,7 @@ def document_string_to_object(*, document: str, document_type: DocumentType) ->
111124
if document_type == DocumentType.JSON:
112125
parsed_document = json.loads(document)
113126
else: # YAML
114-
parsed_document = yaml.safe_load(document)
127+
parsed_document = yaml.load(document, Loader=_YamlLoader)
115128
if not isinstance(parsed_document, dict):
116129
raise ValueError()
117130
return parsed_document
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,7 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
3+
"""
4+
Benchmark tests for OpenJD model performance testing.
5+
6+
This package contains performance benchmarks for various components of the OpenJD model.
7+
"""
Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
3+
"""
4+
Benchmark tests for YAML loader performance comparison between CSafeLoader and SafeLoader.
5+
6+
This module provides comprehensive benchmarking of YAML parsing performance with different
7+
loader implementations, testing both small and large template scenarios.
8+
"""
9+
10+
import time
11+
import statistics
12+
import logging
13+
from typing import Dict, List
14+
15+
import pytest
16+
import yaml
17+
18+
# Configure logging so benchmark results are visible when the module runs.
# NOTE(review): basicConfig() at import time mutates the process-wide root
# logger; acceptable for a standalone benchmark module, but would be an
# anti-pattern in library code — confirm this is intentional.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Dedicated logger for benchmark output.
logger = logging.getLogger("openjd.model.benchmark")
23+
24+
25+
class YAMLLoaderBenchmark:
    """Benchmark suite for YAML loader performance testing.

    Generates synthetic OpenJD job templates of varying sizes and times how
    long PyYAML takes to parse them with the C-backed CSafeLoader versus the
    pure-Python SafeLoader.
    """

    def __init__(self):
        # NOTE(review): never written to by any method in this class —
        # run_benchmark_comparison() returns its results instead of storing
        # them here. Consider removing or wiring it up.
        self.results: Dict[str, List[float]] = {}

    def create_small_template(self) -> str:
        """Create a small OpenJD template for testing.

        Returns:
            A fixed, hand-written jobtemplate-2023-09 YAML document with one
            step, three job parameters, and one task parameter.
        """
        return """
specificationVersion: jobtemplate-2023-09
name: SmallBenchmarkJob
description: A small template for performance testing
parameterDefinitions:
  - name: InputFile
    type: PATH
    objectType: FILE
    dataFlow: IN
  - name: OutputDir
    type: PATH
    objectType: DIRECTORY
    dataFlow: OUT
  - name: FrameRange
    type: STRING
    default: "1-10"
steps:
  - name: RenderStep
    description: Main rendering step
    parameterSpace:
      taskParameterDefinitions:
        - name: Frame
          type: INT
          range: "{{Param.FrameRange}}"
    script:
      actions:
        onRun:
          command: render
          args:
            - "--input"
            - "{{Param.InputFile}}"
            - "--output"
            - "{{Param.OutputDir}}/frame_{{Task.Param.Frame}}.exr"
            - "--frame"
            - "{{Task.Param.Frame}}"
    env:
      - name: RENDER_THREADS
        value: "4"
      - name: RENDER_QUALITY
        value: "high"
"""

    def create_large_template(self, num_steps: int = 50, num_params_per_step: int = 10) -> str:
        """Create a large OpenJD template for stress testing.

        Builds the document line-by-line (joined at the end) rather than via a
        data structure + dump, so the output size scales linearly with the
        arguments.

        Args:
            num_steps: Number of steps to generate.
            num_params_per_step: Number of task parameters per step.

        Returns:
            The generated YAML document as a single string.
        """
        template_parts = [
            "specificationVersion: jobtemplate-2023-09",
            "name: LargeBenchmarkJob",
            "description: A large template for performance stress testing",
            "parameterDefinitions:",
        ]

        # Add global parameters
        for i in range(20):
            template_parts.extend(
                [
                    f"  - name: GlobalParam{i}",
                    "    type: STRING",
                    f'    default: "value{i}"',
                    f"    description: Global parameter {i} for testing",
                ]
            )

        template_parts.append("steps:")

        # Add multiple steps
        for step_idx in range(num_steps):
            template_parts.extend(
                [
                    f"  - name: Step{step_idx}",
                    f"    description: Processing step {step_idx}",
                    "    parameterSpace:",
                    "      taskParameterDefinitions:",
                ]
            )

            # Add task parameters for each step
            for param_idx in range(num_params_per_step):
                template_parts.extend(
                    [
                        f"        - name: TaskParam{param_idx}",
                        "          type: INT",
                        f'          range: "1-{param_idx + 5}"',
                    ]
                )

            # Combination expression over (at most) the first three task params.
            template_parts.extend(
                [
                    '      combination: "('
                    + ", ".join([f"TaskParam{i}" for i in range(min(3, num_params_per_step))])
                    + ')"',
                    "    script:",
                    "      actions:",
                    "        onRun:",
                    f"          command: process_step_{step_idx}",
                    "          args:",
                ]
            )

            # Add multiple arguments
            for arg_idx in range(5):
                # Quadruple braces produce literal {{...}} in the f-string output.
                template_parts.append(
                    f'            - "--arg{arg_idx}={{{{Task.Param.TaskParam{arg_idx % num_params_per_step}}}}}"'
                )

            template_parts.extend(["    env:"])

            # Add environment variables
            for env_idx in range(3):
                template_parts.extend(
                    [
                        f"      - name: ENV_VAR_{env_idx}",
                        f'        value: "{{{{Param.GlobalParam{env_idx % 20}}}}}"',
                    ]
                )

            # Add dependencies for later steps
            if step_idx > 0:
                template_parts.extend(["    dependencies:"])
                # Add dependencies to previous steps (at most 3)
                for dep_idx in range(min(3, step_idx)):
                    template_parts.append(f"      - dependsOn: Step{dep_idx}")

        return "\n".join(template_parts)

    def benchmark_loader(
        self, template_content: str, loader_type: str, iterations: int = 10
    ) -> List[float]:
        """Benchmark a specific loader type with given template content.

        Args:
            template_content: YAML document to parse.
            loader_type: "CSafeLoader" or "SafeLoader"; any other value is
                treated as SafeLoader.
            iterations: Number of timed parses to run.

        Returns:
            Per-iteration wall-clock parse times, in milliseconds.
        """

        times = []

        # Select the appropriate loader directly
        if loader_type == "CSafeLoader":
            try:
                from yaml import CSafeLoader as YamlLoader
            except ImportError:
                # PyYAML built without libyaml: silently falls back, so the
                # "CSafeLoader" numbers then actually measure SafeLoader.
                from yaml import SafeLoader as YamlLoader
        else:
            from yaml import SafeLoader as YamlLoader

        for _ in range(iterations):
            start_time = time.perf_counter()
            # Parse YAML directly instead of using document_string_to_object
            # to avoid the module-level loader selection
            yaml.load(template_content, Loader=YamlLoader)
            end_time = time.perf_counter()
            times.append((end_time - start_time) * 1000)  # Convert to milliseconds

        return times

    def run_benchmark_comparison(
        self, template_content: str, template_name: str, iterations: int = 10
    ) -> Dict[str, Dict[str, float]]:
        """Run benchmark comparison between CSafeLoader and SafeLoader.

        Args:
            template_content: YAML document to benchmark.
            template_name: Human-readable label used in log output only.
            iterations: Number of timed parses per loader.

        Returns:
            Mapping of loader name -> stats dict with keys "mean", "median",
            "min", "max", "stdev" (milliseconds) and "times" (raw samples).
        """
        logger.info(f"=== BENCHMARKING {template_name.upper()} ===")
        logger.info(f"Template size: {len(template_content):,} characters")
        logger.info(f"Running {iterations} iterations per loader...")

        results = {}

        for loader_type in ["SafeLoader", "CSafeLoader"]:
            logger.info(f"Testing {loader_type}...")
            times = self.benchmark_loader(template_content, loader_type, iterations)

            stats = {
                "mean": statistics.mean(times),
                "median": statistics.median(times),
                "min": min(times),
                "max": max(times),
                # stdev requires >= 2 samples; guard the single-iteration case.
                "stdev": statistics.stdev(times) if len(times) > 1 else 0.0,
                "times": times,
            }

            results[loader_type] = stats

            logger.info(f"  Mean: {stats['mean']:.2f}ms")
            logger.info(f"  Median: {stats['median']:.2f}ms")
            logger.info(f"  Min: {stats['min']:.2f}ms")
            logger.info(f"  Max: {stats['max']:.2f}ms")
            logger.info(f"  StdDev: {stats['stdev']:.2f}ms")

        # Calculate performance improvement
        safe_mean = results["SafeLoader"]["mean"]
        csafe_mean = results["CSafeLoader"]["mean"]
        improvement = safe_mean / csafe_mean if csafe_mean > 0 else 0

        logger.info("=== PERFORMANCE SUMMARY ===")
        logger.info(f"SafeLoader mean: {safe_mean:.2f}ms")
        logger.info(f"CSafeLoader mean: {csafe_mean:.2f}ms")
        logger.info(f"Performance improvement: {improvement:.1f}x faster")
        logger.info(f"Time saved per parse: {safe_mean - csafe_mean:.2f}ms")

        return results
226+
227+
228+
class TestYAMLLoaderPerformance:
    """Test class for YAML loader performance benchmarks.

    All tests assert that CSafeLoader beats SafeLoader by a minimum factor.
    When PyYAML is built without libyaml, benchmark_loader() silently falls
    back to SafeLoader for the "CSafeLoader" side, making the comparison
    meaningless and the strict assertions fail spuriously — so the fixture
    skips the tests in that case.
    """

    @pytest.fixture
    def benchmark_suite(self):
        """Fixture providing a benchmark suite instance.

        Skips the requesting test when the libyaml-backed CSafeLoader is not
        importable, since the speedup assertions would otherwise compare
        SafeLoader against itself.
        """
        try:
            from yaml import CSafeLoader  # noqa: F401
        except ImportError:
            pytest.skip("CSafeLoader (libyaml) not available; benchmark comparison is meaningless")
        return YAMLLoaderBenchmark()

    def test_small_template_performance(self, benchmark_suite):
        """Test performance with small templates."""
        template_content = benchmark_suite.create_small_template()
        results = benchmark_suite.run_benchmark_comparison(
            template_content, "Small Template", iterations=20
        )

        # Assertions to ensure CSafeLoader is faster
        csafe_mean = results["CSafeLoader"]["mean"]
        safe_mean = results["SafeLoader"]["mean"]

        assert (
            csafe_mean < safe_mean
        ), f"CSafeLoader ({csafe_mean:.2f}ms) should be faster than SafeLoader ({safe_mean:.2f}ms)"

        # Expect at least 2x improvement for small templates
        improvement = safe_mean / csafe_mean
        assert improvement >= 2.0, f"Expected at least 2x improvement, got {improvement:.1f}x"

    def test_large_template_performance(self, benchmark_suite):
        """Test performance with large templates."""
        template_content = benchmark_suite.create_large_template(
            num_steps=30, num_params_per_step=8
        )
        results = benchmark_suite.run_benchmark_comparison(
            template_content, "Large Template", iterations=10
        )

        # Assertions to ensure CSafeLoader is faster
        csafe_mean = results["CSafeLoader"]["mean"]
        safe_mean = results["SafeLoader"]["mean"]

        assert (
            csafe_mean < safe_mean
        ), f"CSafeLoader ({csafe_mean:.2f}ms) should be faster than SafeLoader ({safe_mean:.2f}ms)"

        # Expect at least 4x improvement for large templates
        improvement = safe_mean / csafe_mean
        assert improvement >= 4.0, f"Expected at least 4x improvement, got {improvement:.1f}x"

    def test_extra_large_template_performance(self, benchmark_suite):
        """Test performance with extra large templates for stress testing."""
        template_content = benchmark_suite.create_large_template(
            num_steps=100, num_params_per_step=15
        )
        results = benchmark_suite.run_benchmark_comparison(
            template_content, "Extra Large Template", iterations=5
        )

        # Assertions to ensure CSafeLoader is faster
        csafe_mean = results["CSafeLoader"]["mean"]
        safe_mean = results["SafeLoader"]["mean"]

        assert (
            csafe_mean < safe_mean
        ), f"CSafeLoader ({csafe_mean:.2f}ms) should be faster than SafeLoader ({safe_mean:.2f}ms)"

        # Expect significant improvement for extra large templates
        improvement = safe_mean / csafe_mean
        assert improvement >= 5.0, f"Expected at least 5x improvement, got {improvement:.1f}x"

    def test_template_file_benchmark(self, benchmark_suite, tmp_path):
        """Test performance using temporary files."""
        # Create a medium-sized template
        template_content = benchmark_suite.create_large_template(
            num_steps=20, num_params_per_step=6
        )

        # Write to temporary file
        temp_file = tmp_path / "benchmark_template.yaml"
        temp_file.write_text(template_content)

        # Read and benchmark
        file_content = temp_file.read_text()
        results = benchmark_suite.run_benchmark_comparison(
            file_content, f"File-based Template ({temp_file.name})", iterations=15
        )

        # Verify file was processed correctly
        assert len(file_content) > 1000, "Template file should be substantial"

        # Performance assertions
        csafe_mean = results["CSafeLoader"]["mean"]
        safe_mean = results["SafeLoader"]["mean"]
        improvement = safe_mean / csafe_mean

        assert improvement >= 3.0, f"Expected at least 3x improvement, got {improvement:.1f}x"

0 commit comments

Comments
 (0)