Skip to content

Commit 21f5426

Browse files
authored
Support deserializing non-UTF-8 attributes in rare cases (#184)
Even though https://onnx.ai/onnx/repo-docs/IR.html#attributes requires string attributes to be UTF-8 encoded bytes, custom ops may still store arbitrary data there (see SentencepieceTokenizer in https://github.com/microsoft/onnxruntime-extensions). This change allows such a value to stay as bytes and be reserialized as-is. Fixes #182. --------- Signed-off-by: Justin Chu <[email protected]>
1 parent b9ecb98 commit 21f5426

File tree

1 file changed

+25
-2
lines changed

1 file changed

+25
-2
lines changed

src/onnx_ir/serde.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,7 +1143,19 @@ def _deserialize_attribute(
11431143
if type_ == _enums.AttributeType.FLOAT:
11441144
return _core.AttrFloat32(name, proto.f, doc_string=doc_string)
11451145
if type_ == _enums.AttributeType.STRING:
1146-
return _core.AttrString(name, proto.s.decode("utf-8"), doc_string=doc_string)
1146+
try:
1147+
return _core.AttrString(name, proto.s.decode("utf-8"), doc_string=doc_string)
1148+
except UnicodeDecodeError:
1149+
# Even though onnx.ai/onnx/repo-docs/IR.html#attributes requires the attribute
1150+
# for strings to be utf-8 encoded bytes, custom ops may still store arbitrary data there
1151+
logger.warning(
1152+
"Attribute %r contains invalid UTF-8 bytes. ONNX spec requires string attributes "
1153+
"to be UTF-8 encoded so the model is invalid. We will skip decoding the attribute and "
1154+
"use the bytes as attribute value",
1155+
name,
1156+
)
1157+
return _core.Attr(name, type_, proto.s, doc_string=doc_string)
1158+
11471159
if type_ == _enums.AttributeType.INTS:
11481160
return _core.AttrInt64s(name, proto.ints, doc_string=doc_string)
11491161
if type_ == _enums.AttributeType.FLOATS:
@@ -1792,7 +1804,18 @@ def _fill_in_value_for_attribute(
17921804
attribute_proto.type = onnx.AttributeProto.FLOAT
17931805
elif type_ == _enums.AttributeType.STRING:
17941806
# value: str
1795-
attribute_proto.s = value.encode("utf-8")
1807+
if type(value) is bytes:
1808+
# Even though onnx.ai/onnx/repo-docs/IR.html#attributes requires the attribute
1809+
# for strings to be utf-8 encoded bytes, custom ops may still store arbitrary data there
1810+
logger.warning(
1811+
"Value in attribute %r should be a string but is instead bytes. ONNX "
1812+
"spec requires string attributes to be UTF-8 encoded so the model is invalid. "
1813+
"We will skip encoding the attribute and use the bytes as attribute value",
1814+
attribute_proto.name,
1815+
)
1816+
attribute_proto.s = value
1817+
else:
1818+
attribute_proto.s = value.encode("utf-8")
17961819
attribute_proto.type = onnx.AttributeProto.STRING
17971820
elif type_ == _enums.AttributeType.INTS:
17981821
# value: Sequence[int]

0 commit comments

Comments
 (0)