From a88f416435f638a57034e31b7267ad02b89f1008 Mon Sep 17 00:00:00 2001
From: airmang <38392618+airmang@users.noreply.github.com>
Date: Tue, 5 May 2026 00:34:07 +0900
Subject: [PATCH] Prevent Hancom from rejecting HWPX roundtrips

Keep section/header XML roots and archive metadata aligned with Hancom-authored packages so simple read-modify-save operations do not produce files that look damaged or tampered with.

Constraint: Hancom Office is stricter than generic XML parsers about HWPML root declarations, standalone XML declarations, and OPC ZIP entry metadata.
Rejected: Relying on XML well-formedness alone | it lets through files that parse cleanly in Python but that Hancom can still reject.
Confidence: high
Scope-risk: moderate
Directive: Preserve Hancom-compatible HWPML root metadata when adding new serializers or pack/unpack paths.
Tested: python -m pytest tests/test_gap_closure_tools.py tests/test_opc_package.py -q; python -m pytest -q; pyright; real HWPX add_paragraph roundtrip validator/root namespace audit
Not-tested: Manual opening in Hancom Office GUI
---
src/hwpx/opc/package.py | 73 ++++++++++++++++++++++++-----
src/hwpx/oxml/document.py | 16 +++++++
src/hwpx/oxml/namespaces.py | 25 ++++++++++
src/hwpx/tools/archive_cli.py | 32 ++++++++++++-
src/hwpx/tools/package_validator.py | 63 ++++++++++++++++++++++++-
tests/test_gap_closure_tools.py | 71 +++++++++++++++++++++++++---
tests/test_opc_package.py | 29 ++++++++++++
7 files changed, 290 insertions(+), 19 deletions(-)
diff --git a/src/hwpx/opc/package.py b/src/hwpx/opc/package.py
index 08dc0b9..90424b4 100644
--- a/src/hwpx/opc/package.py
+++ b/src/hwpx/opc/package.py
@@ -9,7 +9,7 @@
import tempfile
from dataclasses import dataclass
from pathlib import Path, PurePosixPath
-from typing import BinaryIO, Iterable, Iterator, Mapping, MutableMapping
+from typing import BinaryIO, Iterable, Iterator, Mapping, MutableMapping, Sequence
from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile, ZipInfo
from lxml import etree # type: ignore[reportAttributeAccessIssue]
@@ -133,11 +133,16 @@ def __init__(
rootfiles: Iterable[RootFile],
version_info: VersionInfo,
mimetype: str,
+ *,
+ zip_infos: Mapping[str, ZipInfo] | None = None,
+ zip_order: Sequence[str] | None = None,
) -> None:
self._files = files
self._rootfiles = list(rootfiles)
self._version = version_info
self._mimetype = mimetype
+ self._zip_infos = dict(zip_infos or {})
+ self._zip_order = list(zip_order or files.keys())
self._manifest_tree: etree._Element | None = None
self._spine_cache: list[str] | None = None
self._section_paths_cache: list[str] | None = None
@@ -156,14 +161,24 @@ def open(cls, pkg_file: str | Path | bytes | bytearray | BinaryIO) -> HwpxPackag
stream = pkg_file
with ZipFile(stream, "r") as zf:
- files = {info.filename: zf.read(info.filename) for info in zf.infolist()}
+ infos = [info for info in zf.infolist() if not info.is_dir()]
+ files = {info.filename: zf.read(info.filename) for info in infos}
+ zip_infos = {info.filename: info for info in infos}
+ zip_order = [info.filename for info in infos]
logger.debug("HWPX 패키지 파일 목록 %d개를 로드했습니다.", len(files))
if cls.MIMETYPE_PATH not in files:
raise HwpxStructureError("HWPX package is missing the mandatory 'mimetype' file.")
mimetype = files[cls.MIMETYPE_PATH].decode("utf-8")
rootfiles = cls._parse_container(files.get(cls.CONTAINER_PATH))
version_info = cls._parse_version(files.get(cls.VERSION_PATH))
- package = cls(files, rootfiles, version_info, mimetype)
+ package = cls(
+ files,
+ rootfiles,
+ version_info,
+ mimetype,
+ zip_infos=zip_infos,
+ zip_order=zip_order,
+ )
return package
@staticmethod
@@ -243,6 +258,8 @@ def write(self, path: str, data: bytes | str) -> None:
elif norm_path == self.VERSION_PATH:
pending_version = self._parse_version(data)
self._files[norm_path] = data
+ if norm_path not in self._zip_order:
+ self._zip_order.append(norm_path)
if norm_path == self.MIMETYPE_PATH:
self._mimetype = mimetype
elif norm_path == self.CONTAINER_PATH:
@@ -263,6 +280,8 @@ def delete(self, path: str) -> None:
"Cannot remove mandatory files ('mimetype', 'container.xml', 'version.xml')."
)
del self._files[norm_path]
+ self._zip_infos.pop(norm_path, None)
+ self._zip_order = [name for name in self._zip_order if name != norm_path]
self._invalidate_caches(norm_path)
self._validate_structure()
@@ -553,15 +572,47 @@ def _save_to_zip(self, pkg_file: str | Path | BinaryIO) -> None:
def _write_archive(self, zf: ZipFile) -> None:
self._write_mimetype(zf)
- for name in sorted(self._files):
- if name == self.MIMETYPE_PATH:
- continue
+ written = {self.MIMETYPE_PATH}
+ ordered_names = [
+ name
+ for name in self._zip_order
+ if name != self.MIMETYPE_PATH and name in self._files
+ ]
+ new_names = sorted(
+ name for name in self._files if name not in written and name not in ordered_names
+ )
+ for name in [*ordered_names, *new_names]:
self._write_zip_entry(zf, name, self._files[name], ZIP_DEFLATED)
-
- @staticmethod
- def _write_zip_entry(zf: ZipFile, path: str, payload: bytes, compress_type: int) -> None:
- info = ZipInfo(path)
- info.compress_type = compress_type
+ written.add(name)
+
+ def _zip_info_for_write(self, path: str, compress_type: int) -> ZipInfo:
+ original = self._zip_infos.get(path)
+ if original is None:
+ info = ZipInfo(path)
+ info.compress_type = compress_type
+ return info
+
+ info = ZipInfo(path, original.date_time)
+ info.compress_type = original.compress_type
+ info.comment = original.comment
+ info.extra = original.extra
+ info.create_system = original.create_system
+ info.create_version = original.create_version
+ info.extract_version = original.extract_version
+ info.flag_bits = original.flag_bits
+ info.volume = original.volume
+ info.internal_attr = original.internal_attr
+ info.external_attr = original.external_attr
+ return info
+
+ def _write_zip_entry(
+ self,
+ zf: ZipFile,
+ path: str,
+ payload: bytes,
+ compress_type: int,
+ ) -> None:
+ info = self._zip_info_for_write(path, compress_type)
zf.writestr(info, payload)
def _write_mimetype(self, zf: ZipFile) -> None:
diff --git a/src/hwpx/oxml/document.py b/src/hwpx/oxml/document.py
index f92c73c..e6faf56 100644
--- a/src/hwpx/oxml/document.py
+++ b/src/hwpx/oxml/document.py
@@ -31,6 +31,7 @@
parse_track_change_authors,
parse_track_changes,
)
+from .namespaces import HWPML_COMPAT_ROOT_NAMESPACES
from .utils import parse_int
ET.register_namespace("hp", "http://www.hancom.co.kr/hwpml/2011/paragraph")
@@ -104,6 +105,21 @@ def _sanitize_text(value: str) -> str:
def _serialize_xml(element: ET.Element) -> bytes:
"""Return a UTF-8 encoded XML document for *element*."""
+ xml_bytes = ET.tostring(element, encoding="utf-8", xml_declaration=False)
+ if element.tag in {_HS + "sec", _HH + "head"}:
+ root = LET.fromstring(xml_bytes)
+ wrapped = LET.Element(root.tag, nsmap=HWPML_COMPAT_ROOT_NAMESPACES)
+ wrapped.attrib.update(root.attrib)
+ wrapped.text = root.text
+ wrapped.tail = root.tail
+ for child in root:
+ wrapped.append(child)
+ return LET.tostring(
+ wrapped,
+ encoding="UTF-8",
+ xml_declaration=True,
+ standalone=True,
+ )
return ET.tostring(element, encoding="utf-8", xml_declaration=True)
diff --git a/src/hwpx/oxml/namespaces.py b/src/hwpx/oxml/namespaces.py
index 5f01087..f62facc 100644
--- a/src/hwpx/oxml/namespaces.py
+++ b/src/hwpx/oxml/namespaces.py
@@ -23,3 +23,28 @@
HS10_NS = "http://www.hancom.co.kr/hwpml/2016/section"
HC10_NS = "http://www.hancom.co.kr/hwpml/2016/core"
HH10_NS = "http://www.hancom.co.kr/hwpml/2016/head"
+
+OPF_NS = "http://www.idpf.org/2007/opf/"
+XML_NS = "http://www.w3.org/XML/1998/namespace"
+
+# Hancom Office emits this broad namespace surface on HWPML document roots and
+# may treat declarations that generic XML serializers consider optional as part
+# of the package compatibility contract. Keep section/header roots close to the
+# shape Hancom writes so read-modify-save roundtrips do not look tampered with.
+HWPML_COMPAT_ROOT_NAMESPACES = {
+ "ha": "http://www.hancom.co.kr/hwpml/2011/app",
+ "hp": HP_NS,
+ "hp10": HP10_NS,
+ "hs": HS_NS,
+ "hc": HC_NS,
+ "hh": HH_NS,
+ "hhs": "http://www.hancom.co.kr/hwpml/2011/history",
+ "hm": "http://www.hancom.co.kr/hwpml/2011/master-page",
+ "hpf": "http://www.hancom.co.kr/schema/2011/hpf",
+ "dc": "http://purl.org/dc/elements/1.1/",
+ "opf": OPF_NS,
+ "ooxmlchart": "http://www.hancom.co.kr/hwpml/2016/ooxmlchart",
+ "hwpunitchar": "http://www.hancom.co.kr/hwpml/2016/HwpUnitChar",
+ "epub": "http://www.idpf.org/2007/ops",
+ "config": "urn:oasis:names:tc:opendocument:xmlns:config:1.0",
+}
diff --git a/src/hwpx/tools/archive_cli.py b/src/hwpx/tools/archive_cli.py
index 9947348..790f14d 100644
--- a/src/hwpx/tools/archive_cli.py
+++ b/src/hwpx/tools/archive_cli.py
@@ -14,6 +14,8 @@
from lxml import etree # type: ignore[reportAttributeAccessIssue]
+from ..opc.relationships import is_header_part_name, is_section_part_name
+from ..oxml.namespaces import HWPML_COMPAT_ROOT_NAMESPACES
from .package_validator import validate_package
_XML_SUFFIXES = (".xml", ".hpf")
@@ -92,6 +94,30 @@ def _format_xml_bytes(payload: bytes) -> bytes:
)
+def _normalize_hwpml_compat_root(rel_path: str, payload: bytes) -> bytes:
+ if not (is_section_part_name(rel_path) or is_header_part_name(rel_path)):
+ return payload
+ try:
+ root = etree.fromstring(payload)
+ except etree.XMLSyntaxError:
+ return payload
+ if not (root.tag.endswith("}sec") or root.tag.endswith("}head")):
+ return payload
+
+ wrapped = etree.Element(root.tag, nsmap=HWPML_COMPAT_ROOT_NAMESPACES)
+ wrapped.attrib.update(root.attrib)
+ wrapped.text = root.text
+ wrapped.tail = root.tail
+ for child in root:
+ wrapped.append(child)
+ return etree.tostring(
+ wrapped,
+ encoding="UTF-8",
+ xml_declaration=True,
+ standalone=True,
+ )
+
+
def _iter_file_entries(archive: ZipFile) -> tuple[ArchiveEntryInfo, ...]:
entries: list[ArchiveEntryInfo] = []
for info in archive.infolist():
@@ -261,7 +287,11 @@ def pack_hwpx(
compress_type = compress_types.get(rel_path, ZIP_DEFLATED)
if compress_type != ZIP_STORED:
compress_type = ZIP_DEFLATED
- archive.write(root / rel_path, rel_path, compress_type=compress_type)
+ payload = _normalize_hwpml_compat_root(
+ rel_path,
+ (root / rel_path).read_bytes(),
+ )
+ archive.writestr(rel_path, payload, compress_type=compress_type)
_summarize_pack_validation(tmp_path)
os.replace(tmp_path, destination)
diff --git a/src/hwpx/tools/package_validator.py b/src/hwpx/tools/package_validator.py
index b10d71f..af4c8bf 100644
--- a/src/hwpx/tools/package_validator.py
+++ b/src/hwpx/tools/package_validator.py
@@ -3,14 +3,19 @@
import argparse
import io
+import re
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from pathlib import Path, PurePosixPath
from typing import BinaryIO, Literal, Sequence
from zipfile import ZIP_STORED, BadZipFile, ZipFile
+from lxml import etree as LET # type: ignore[reportMissingImports]
+
+from ..oxml.namespaces import HWPML_COMPAT_ROOT_NAMESPACES
from ..opc.relationships import (
MAIN_ROOTFILE_MEDIA_TYPE,
+ is_header_part_name,
is_section_part_name,
parse_container_rootfiles,
parse_manifest_relationships,
@@ -23,6 +28,9 @@
HEADER_PATH = "Contents/header.xml"
VERSION_PATH = "version.xml"
+_XML_DECLARATION_RE = re.compile(br"^<\?xml\s+([^?]*?)\?>", re.IGNORECASE)
+_STANDALONE_YES_RE = re.compile(br"\bstandalone\s*=\s*(['\"])yes\1", re.IGNORECASE)
+
IssueLevel = Literal["error", "warning"]
__all__ = [
@@ -83,6 +91,57 @@ def _parse_xml(payload: bytes) -> ET.Element:
raise ValueError(f"malformed XML: {exc}") from exc
+def _root_declared_namespaces(payload: bytes) -> dict[str, str]:
+ try:
+ root = LET.fromstring(payload)
+ except LET.XMLSyntaxError:
+ return {}
+ return {"" if prefix is None else prefix: uri for prefix, uri in root.nsmap.items() if uri}
+
+
+def _has_standalone_yes_declaration(payload: bytes) -> bool:
+ stripped = payload.lstrip()
+ if stripped.startswith(b"\xef\xbb\xbf"):
+ stripped = stripped[3:]
+ match = _XML_DECLARATION_RE.match(stripped)
+ return bool(match and _STANDALONE_YES_RE.search(match.group(1)))
+
+
+def _check_hwpml_compat_root(
+ issues: list[PackageValidationIssue],
+ part_name: str,
+ payload: bytes,
+ root: ET.Element,
+) -> None:
+ if not (
+ is_section_part_name(part_name)
+ or is_header_part_name(part_name)
+ or part_name == HEADER_PATH
+ ):
+ return
+ if not (root.tag.endswith("}sec") or root.tag.endswith("}head")):
+ return
+ if not _has_standalone_yes_declaration(payload):
+ _error(
+ issues,
+ part_name,
+ 'missing XML declaration with standalone="yes"',
+ )
+ declared = _root_declared_namespaces(payload)
+ missing = [
+ prefix
+ for prefix, uri in HWPML_COMPAT_ROOT_NAMESPACES.items()
+ if declared.get(prefix) != uri
+ ]
+ if missing:
+ _error(
+ issues,
+ part_name,
+ "missing Hancom-compatible HWPML root namespace declarations: "
+ + ", ".join(missing),
+ )
+
+
def _error(issues: list[PackageValidationIssue], part_name: str, message: str) -> None:
issues.append(PackageValidationIssue(part_name, message, "error"))
@@ -172,7 +231,9 @@ def validate_package(source: str | Path | bytes | BinaryIO) -> PackageValidation
_error(issues, name, "unable to read entry for XML parsing")
continue
try:
- xml_roots[name] = _parse_xml(payload)
+ root = _parse_xml(payload)
+ xml_roots[name] = root
+ _check_hwpml_compat_root(issues, name, payload, root)
except ValueError as exc:
_error(issues, name, str(exc))
diff --git a/tests/test_gap_closure_tools.py b/tests/test_gap_closure_tools.py
index 6713c75..4fdd061 100644
--- a/tests/test_gap_closure_tools.py
+++ b/tests/test_gap_closure_tools.py
@@ -13,6 +13,7 @@
from hwpx import HwpxDocument
from hwpx.opc.package import HwpxPackage
from hwpx.opc.relationships import MAIN_ROOTFILE_MEDIA_TYPE, resolve_part_name
+from hwpx.oxml.namespaces import HWPML_COMPAT_ROOT_NAMESPACES
from hwpx.tools import archive_cli
from hwpx.tools.archive_cli import pack_hwpx, unpack_hwpx
from hwpx.tools.package_validator import validate_package
@@ -22,10 +23,19 @@
_REPO_ROOT = Path(__file__).resolve().parents[1]
_MIMETYPE = b"application/hwp+zip"
_VERSION_XML = b''
+
+
+def _hwpml_root_namespace_attrs() -> str:
+ return " ".join(
+ f'xmlns:{prefix}="{uri}"'
+ for prefix, uri in HWPML_COMPAT_ROOT_NAMESPACES.items()
+ )
+
+
_HEADER_XML = (
- b''
- b''
-)
+ ''
+ f""
+).encode("utf-8")
_MASTER_PAGE_XML = (
b''
b''
@@ -86,9 +96,8 @@ def _build_manifest_xml(
def _build_section_xml(text: str) -> bytes:
return (
- ''
- ''
+ ''
+ f""
''
f'{text}'
""
@@ -107,6 +116,19 @@ def _zip_parts(parts: list[tuple[str, bytes]]) -> bytes:
return buffer.getvalue()
+def _replace_zip_part(package_bytes: bytes, part_name: str, payload: bytes) -> bytes:
+ buffer = io.BytesIO()
+ with ZipFile(io.BytesIO(package_bytes), "r") as source:
+ with ZipFile(buffer, "w", compression=ZIP_DEFLATED) as archive:
+ for info in source.infolist():
+ replacement = payload if info.filename == part_name else source.read(info.filename)
+ if info.filename == "mimetype":
+ archive.writestr(info.filename, replacement, compress_type=ZIP_STORED)
+ else:
+ archive.writestr(info.filename, replacement)
+ return buffer.getvalue()
+
+
def _build_manual_package(
*,
manifest_path: str = "Contents/content.hpf",
@@ -205,6 +227,43 @@ def test_package_validator_accepts_valid_document() -> None:
assert "Contents/header.xml" in report.checked_parts
+def test_add_paragraph_roundtrip_preserves_section_root_compat_metadata() -> None:
+ package_bytes, paths = _build_manual_package(text="Original paragraph")
+
+ document = HwpxDocument.open(package_bytes)
+ document.add_paragraph("Added paragraph")
+ roundtrip = document.to_bytes()
+
+ with ZipFile(io.BytesIO(roundtrip), "r") as archive:
+ section_xml = archive.read(paths["section"])
+
+ declaration = section_xml.split(b"?>", 1)[0]
+ assert b"standalone='yes'" in declaration or b'standalone="yes"' in declaration
+ for prefix, uri in HWPML_COMPAT_ROOT_NAMESPACES.items():
+ expected = f'xmlns:{prefix}="{uri}"'.encode("utf-8")
+ assert expected in section_xml
+ assert validate_package(roundtrip).ok
+
+
+def test_package_validator_rejects_section_root_metadata_regression() -> None:
+ package_bytes, paths = _build_manual_package(text="Original paragraph")
+ regressed_section_xml = (
+ b''
+ b''
+ b''
+ b'Regressed'
+ b""
+ )
+ regressed = _replace_zip_part(package_bytes, paths["section"], regressed_section_xml)
+
+ report = validate_package(regressed)
+
+ assert not report.ok
+ assert any("standalone" in issue.message for issue in report.errors)
+ assert any("root namespace declarations" in issue.message for issue in report.errors)
+
+
def test_package_validator_reports_missing_mimetype() -> None:
package_bytes, _ = _build_manual_package()
with ZipFile(io.BytesIO(package_bytes), "r") as source:
diff --git a/tests/test_opc_package.py b/tests/test_opc_package.py
index 6425166..6ca39c4 100644
--- a/tests/test_opc_package.py
+++ b/tests/test_opc_package.py
@@ -79,3 +79,32 @@ def test_save_preserves_expected_compress_type_per_entry() -> None:
assert infos[0].compress_type == ZIP_STORED
for info in infos[1:]:
assert info.compress_type == ZIP_DEFLATED
+
+
+def test_save_preserves_existing_archive_order_and_entry_metadata() -> None:
+ buffer = io.BytesIO()
+ with ZipFile(buffer, "w", compression=ZIP_DEFLATED) as archive:
+ archive.writestr("mimetype", _MIMETYPE, compress_type=ZIP_STORED)
+ archive.writestr("version.xml", _VERSION_XML, compress_type=ZIP_STORED)
+ archive.writestr("Contents/header.xml", _HEADER_XML)
+ archive.writestr("Contents/content.hpf", _MANIFEST_XML)
+ archive.writestr("META-INF/container.xml", _CONTAINER_XML)
+
+ source_bytes = buffer.getvalue()
+ with ZipFile(io.BytesIO(source_bytes), "r") as archive:
+ original_metadata = [
+ (info.filename, info.compress_type, info.create_system, info.external_attr)
+ for info in archive.infolist()
+ ]
+
+ package = HwpxPackage.open(source_bytes)
+ package.write("Contents/header.xml", _HEADER_XML + b"")
+
+ output = package.save()
+ with ZipFile(io.BytesIO(output), "r") as archive:
+ roundtrip_metadata = [
+ (info.filename, info.compress_type, info.create_system, info.external_attr)
+ for info in archive.infolist()
+ ]
+
+ assert roundtrip_metadata == original_metadata