diff --git a/src/hwpx/opc/package.py b/src/hwpx/opc/package.py index 08dc0b9..90424b4 100644 --- a/src/hwpx/opc/package.py +++ b/src/hwpx/opc/package.py @@ -9,7 +9,7 @@ import tempfile from dataclasses import dataclass from pathlib import Path, PurePosixPath -from typing import BinaryIO, Iterable, Iterator, Mapping, MutableMapping +from typing import BinaryIO, Iterable, Iterator, Mapping, MutableMapping, Sequence from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile, ZipInfo from lxml import etree # type: ignore[reportAttributeAccessIssue] @@ -133,11 +133,16 @@ def __init__( rootfiles: Iterable[RootFile], version_info: VersionInfo, mimetype: str, + *, + zip_infos: Mapping[str, ZipInfo] | None = None, + zip_order: Sequence[str] | None = None, ) -> None: self._files = files self._rootfiles = list(rootfiles) self._version = version_info self._mimetype = mimetype + self._zip_infos = dict(zip_infos or {}) + self._zip_order = list(zip_order or files.keys()) self._manifest_tree: etree._Element | None = None self._spine_cache: list[str] | None = None self._section_paths_cache: list[str] | None = None @@ -156,14 +161,24 @@ def open(cls, pkg_file: str | Path | bytes | bytearray | BinaryIO) -> HwpxPackag stream = pkg_file with ZipFile(stream, "r") as zf: - files = {info.filename: zf.read(info.filename) for info in zf.infolist()} + infos = [info for info in zf.infolist() if not info.is_dir()] + files = {info.filename: zf.read(info.filename) for info in infos} + zip_infos = {info.filename: info for info in infos} + zip_order = [info.filename for info in infos] logger.debug("HWPX 패키지 파일 목록 %d개를 로드했습니다.", len(files)) if cls.MIMETYPE_PATH not in files: raise HwpxStructureError("HWPX package is missing the mandatory 'mimetype' file.") mimetype = files[cls.MIMETYPE_PATH].decode("utf-8") rootfiles = cls._parse_container(files.get(cls.CONTAINER_PATH)) version_info = cls._parse_version(files.get(cls.VERSION_PATH)) - package = cls(files, rootfiles, version_info, mimetype) + package = 
cls( + files, + rootfiles, + version_info, + mimetype, + zip_infos=zip_infos, + zip_order=zip_order, + ) return package @staticmethod @@ -243,6 +258,8 @@ def write(self, path: str, data: bytes | str) -> None: elif norm_path == self.VERSION_PATH: pending_version = self._parse_version(data) self._files[norm_path] = data + if norm_path not in self._zip_order: + self._zip_order.append(norm_path) if norm_path == self.MIMETYPE_PATH: self._mimetype = mimetype elif norm_path == self.CONTAINER_PATH: @@ -263,6 +280,8 @@ def delete(self, path: str) -> None: "Cannot remove mandatory files ('mimetype', 'container.xml', 'version.xml')." ) del self._files[norm_path] + self._zip_infos.pop(norm_path, None) + self._zip_order = [name for name in self._zip_order if name != norm_path] self._invalidate_caches(norm_path) self._validate_structure() @@ -553,15 +572,47 @@ def _save_to_zip(self, pkg_file: str | Path | BinaryIO) -> None: def _write_archive(self, zf: ZipFile) -> None: self._write_mimetype(zf) - for name in sorted(self._files): - if name == self.MIMETYPE_PATH: - continue + written = {self.MIMETYPE_PATH} + ordered_names = [ + name + for name in self._zip_order + if name != self.MIMETYPE_PATH and name in self._files + ] + new_names = sorted( + name for name in self._files if name not in written and name not in ordered_names + ) + for name in [*ordered_names, *new_names]: self._write_zip_entry(zf, name, self._files[name], ZIP_DEFLATED) - - @staticmethod - def _write_zip_entry(zf: ZipFile, path: str, payload: bytes, compress_type: int) -> None: - info = ZipInfo(path) - info.compress_type = compress_type + written.add(name) + + def _zip_info_for_write(self, path: str, compress_type: int) -> ZipInfo: + original = self._zip_infos.get(path) + if original is None: + info = ZipInfo(path) + info.compress_type = compress_type + return info + + info = ZipInfo(path, original.date_time) + info.compress_type = original.compress_type + info.comment = original.comment + info.extra = 
def _serialize_xml(element: ET.Element) -> bytes:
    """Return a UTF-8 encoded XML document for *element*.

    Elements whose tag equals ``_HS + "sec"`` or ``_HH + "head"`` are
    re-rooted on a new element that declares every namespace listed in
    ``HWPML_COMPAT_ROOT_NAMESPACES`` and are emitted with an XML declaration
    that carries the ``standalone`` flag.  Every other element is serialized
    directly with a plain XML declaration.
    """
    if element.tag not in {_HS + "sec", _HH + "head"}:
        return ET.tostring(element, encoding="utf-8", xml_declaration=True)

    # Only compat roots need the declaration-free intermediate form, so it is
    # produced here instead of up front for every element.
    xml_bytes = ET.tostring(element, encoding="utf-8", xml_declaration=False)
    root = LET.fromstring(xml_bytes)
    wrapped = LET.Element(root.tag, nsmap=HWPML_COMPAT_ROOT_NAMESPACES)
    wrapped.attrib.update(root.attrib)
    wrapped.text = root.text
    wrapped.tail = root.tail
    # Appending re-parents each child, so walk a snapshot instead of the
    # element that is being emptied.
    for child in list(root):
        wrapped.append(child)
    return LET.tostring(
        wrapped,
        encoding="UTF-8",
        xml_declaration=True,
        standalone=True,
    )
def _normalize_hwpml_compat_root(rel_path: str, payload: bytes) -> bytes:
    """Re-root a section/header payload on the compat namespace set.

    *payload* is returned unchanged when *rel_path* is neither a section nor
    a header part name, when it cannot be parsed as XML, or when the parsed
    root tag does not end in ``}sec`` or ``}head``.  Otherwise the root's
    attributes, text, tail and children are transferred to a new element
    created with ``HWPML_COMPAT_ROOT_NAMESPACES`` as its namespace map, and
    that element is serialized with a ``standalone`` XML declaration.
    """
    targets_hwpml_part = is_section_part_name(rel_path) or is_header_part_name(rel_path)
    if not targets_hwpml_part:
        return payload

    try:
        parsed_root = etree.fromstring(payload)
    except etree.XMLSyntaxError:
        # Unparseable payloads are passed through untouched.
        return payload

    if not parsed_root.tag.endswith(("}sec", "}head")):
        return payload

    compat_root = etree.Element(parsed_root.tag, nsmap=HWPML_COMPAT_ROOT_NAMESPACES)
    compat_root.attrib.update(parsed_root.attrib)
    compat_root.text, compat_root.tail = parsed_root.text, parsed_root.tail
    for node in parsed_root:
        compat_root.append(node)

    serialize_options = {
        "encoding": "UTF-8",
        "xml_declaration": True,
        "standalone": True,
    }
    return etree.tostring(compat_root, **serialize_options)
def _root_declared_namespaces(payload: bytes) -> dict[str, str]:
    """Return the prefix-to-URI map exposed by the parsed root of *payload*.

    The default namespace is keyed by the empty string and entries with an
    empty URI are dropped.  An empty dict is returned when *payload* fails to
    parse.
    """
    try:
        parsed_root = LET.fromstring(payload)
    except LET.XMLSyntaxError:
        return {}

    declared: dict[str, str] = {}
    for prefix, uri in parsed_root.nsmap.items():
        if not uri:
            continue
        declared[prefix if prefix is not None else ""] = uri
    return declared


def _has_standalone_yes_declaration(payload: bytes) -> bool:
    """Report whether *payload* opens with an XML declaration marked standalone "yes".

    Leading whitespace and then a UTF-8 byte-order mark are skipped before
    the declaration is matched.
    """
    head = payload.lstrip()
    if head[:3] == b"\xef\xbb\xbf":
        head = head[3:]
    declaration = _XML_DECLARATION_RE.match(head)
    if declaration is None:
        return False
    return _STANDALONE_YES_RE.search(declaration.group(1)) is not None


def _check_hwpml_compat_root(
    issues: list[PackageValidationIssue],
    part_name: str,
    payload: bytes,
    root: ET.Element,
) -> None:
    """Append errors to *issues* when a section/header root lacks compat metadata.

    Nothing is checked unless *part_name* is a section part, a header part or
    ``HEADER_PATH``, and the tag of *root* ends in ``}sec`` or ``}head``.  For
    such parts an error is recorded when the standalone XML declaration is
    absent, and another when any prefix in ``HWPML_COMPAT_ROOT_NAMESPACES``
    is not declared on the root with the expected URI.
    """
    is_hwpml_part = (
        is_section_part_name(part_name)
        or is_header_part_name(part_name)
        or part_name == HEADER_PATH
    )
    if not is_hwpml_part:
        return
    if not root.tag.endswith(("}sec", "}head")):
        return

    if not _has_standalone_yes_declaration(payload):
        _error(issues, part_name, 'missing XML declaration with standalone="yes"')

    declared = _root_declared_namespaces(payload)
    missing: list[str] = []
    for prefix, uri in HWPML_COMPAT_ROOT_NAMESPACES.items():
        # A prefix bound to a different URI counts as missing as well.
        if declared.get(prefix) != uri:
            missing.append(prefix)
    if not missing:
        return

    message = (
        "missing Hancom-compatible HWPML root namespace declarations: "
        + ", ".join(missing)
    )
    _error(issues, part_name, message)


def _error(issues: list[PackageValidationIssue], part_name: str, message: str) -> None:
    """Append an ``"error"``-level issue for *part_name* to *issues*."""
    issue = PackageValidationIssue(part_name, message, "error")
    issues.append(issue)
b/tests/test_gap_closure_tools.py index 6713c75..4fdd061 100644 --- a/tests/test_gap_closure_tools.py +++ b/tests/test_gap_closure_tools.py @@ -13,6 +13,7 @@ from hwpx import HwpxDocument from hwpx.opc.package import HwpxPackage from hwpx.opc.relationships import MAIN_ROOTFILE_MEDIA_TYPE, resolve_part_name +from hwpx.oxml.namespaces import HWPML_COMPAT_ROOT_NAMESPACES from hwpx.tools import archive_cli from hwpx.tools.archive_cli import pack_hwpx, unpack_hwpx from hwpx.tools.package_validator import validate_package @@ -22,10 +23,19 @@ _REPO_ROOT = Path(__file__).resolve().parents[1] _MIMETYPE = b"application/hwp+zip" _VERSION_XML = b'' + + +def _hwpml_root_namespace_attrs() -> str: + return " ".join( + f'xmlns:{prefix}="{uri}"' + for prefix, uri in HWPML_COMPAT_ROOT_NAMESPACES.items() + ) + + _HEADER_XML = ( - b'' - b'' -) + '' + f"" +).encode("utf-8") _MASTER_PAGE_XML = ( b'' b'' @@ -86,9 +96,8 @@ def _build_manifest_xml( def _build_section_xml(text: str) -> bytes: return ( - '' - '' + '' + f"" '' f'{text}' "" @@ -107,6 +116,19 @@ def _zip_parts(parts: list[tuple[str, bytes]]) -> bytes: return buffer.getvalue() +def _replace_zip_part(package_bytes: bytes, part_name: str, payload: bytes) -> bytes: + buffer = io.BytesIO() + with ZipFile(io.BytesIO(package_bytes), "r") as source: + with ZipFile(buffer, "w", compression=ZIP_DEFLATED) as archive: + for info in source.infolist(): + replacement = payload if info.filename == part_name else source.read(info.filename) + if info.filename == "mimetype": + archive.writestr(info.filename, replacement, compress_type=ZIP_STORED) + else: + archive.writestr(info.filename, replacement) + return buffer.getvalue() + + def _build_manual_package( *, manifest_path: str = "Contents/content.hpf", @@ -205,6 +227,43 @@ def test_package_validator_accepts_valid_document() -> None: assert "Contents/header.xml" in report.checked_parts +def test_add_paragraph_roundtrip_preserves_section_root_compat_metadata() -> None: + package_bytes, paths = 
def test_save_preserves_existing_archive_order_and_entry_metadata() -> None:
    """Saving a modified package keeps entry order and per-entry ZIP metadata.

    A source archive is built by hand, opened as an ``HwpxPackage``, one part
    is rewritten, and the saved archive's entry list is compared against the
    source's on filename, compression type, ``create_system`` and
    ``external_attr``.
    """
    buffer = io.BytesIO()
    with ZipFile(buffer, "w", compression=ZIP_DEFLATED) as archive:
        # The entries are deliberately not in sorted-name order, and
        # ``version.xml`` is stored uncompressed, so a save that re-sorted
        # names or forced deflate on it would change the listing.
        archive.writestr("mimetype", _MIMETYPE, compress_type=ZIP_STORED)
        archive.writestr("version.xml", _VERSION_XML, compress_type=ZIP_STORED)
        archive.writestr("Contents/header.xml", _HEADER_XML)
        archive.writestr("Contents/content.hpf", _MANIFEST_XML)
        archive.writestr("META-INF/container.xml", _CONTAINER_XML)

    source_bytes = buffer.getvalue()
    with ZipFile(io.BytesIO(source_bytes), "r") as archive:
        # Baseline: one tuple per entry, in archive listing order.
        original_metadata = [
            (info.filename, info.compress_type, info.create_system, info.external_attr)
            for info in archive.infolist()
        ]

    package = HwpxPackage.open(source_bytes)
    # Route an existing part through ``write`` before saving.
    # NOTE(review): the appended suffix is an empty bytes literal as seen
    # here — confirm that is the intended fixture content.
    package.write("Contents/header.xml", _HEADER_XML + b"")

    output = package.save()
    with ZipFile(io.BytesIO(output), "r") as archive:
        roundtrip_metadata = [
            (info.filename, info.compress_type, info.create_system, info.external_attr)
            for info in archive.infolist()
        ]

    # List equality covers both the order of entries and each entry's fields.
    assert roundtrip_metadata == original_metadata