From 0729051b803b54ee3cfd15be15abc47a8c04ded2 Mon Sep 17 00:00:00 2001
From: thisismypassport <thisismypassport@noemail.com>
Date: Wed, 12 Feb 2025 12:29:57 +0200
Subject: [PATCH] fix and improve shrinkotron support - improving compression
 rates and allowing dumping uncompressed pods, etc. (contains algorithms to
 read/write the pxu format)

---
 README.md        |   2 +-
 picotron_cart.py |  26 +++--
 picotron_fs.py   | 285 ++++++++++++++++++++++++++++++++++++++++++-----
 shrinko.py       |  14 ++-
 4 files changed, 289 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index e04d7ce..d46c54c 100644
--- a/README.md
+++ b/README.md
@@ -1014,5 +1014,5 @@ Cart manipulation features:
 
 Notes:
 * Shrinkotron assumes calls to `include` are used to include other unmodified lua files. If this is not the case, minify may break even under `--minify-safe-only`
-* Currently, Shrinkotron does not touch data files (gfx/sfx/etc). It may play with their compression in the future, however.
+* Shrinkotron repacks all POD files for better compression. (There are options to change this - `--uncompress-pods` and `--keep-pod-compression`)
 * As Picotron evolves, there might be new globals or table keys that Shrinkotron isn't aware of. You can report such cases and use [`--preserve`](#preserving-identifiers-across-the-entire-cart) meanwhile.
diff --git a/picotron_cart.py b/picotron_cart.py
index d209621..38c8f0d 100644
--- a/picotron_cart.py
+++ b/picotron_cart.py
@@ -369,16 +369,18 @@ def preview_order_key(pair):
     # we prefer to sort p64 files for better visibility of code, e.g. in the webapp's preview
     # (this is NOT what picotron does currently, hopefully doesn't matter)
     dirname, filename = str_split_last(pair[0], "/")
-    if filename == k_p64_main_path:
-        order = 0
+    if filename == "": # directory itself must be first
+        order = -3
+    elif filename == k_p64_main_path:
+        order = -2
     elif filename.endswith(".lua"):
-        order = 1
+        order = -1
     elif filename == k_label_file:
-        order = 3
+        order = 1
     elif filename.startswith("."):
-        order = 4
-    else:
         order = 2
+    else:
+        order = 0
     return (dirname, order, filename)
 
 def write_cart64_to_source(cart, avoid_base64=False, **opts):
@@ -530,7 +532,7 @@ def filter_cart64(cart, sections):
     for path in to_delete:
         del cart.files[path]
 
-def preproc_cart64(cart, delete_meta):
+def preproc_cart64(cart, delete_meta=None, uncompress_pods=False, keep_pod_compression=False, need_pod_compression=False):
     if delete_meta:
         to_delete = []
         
@@ -546,6 +548,16 @@ def preproc_cart64(cart, delete_meta):
         
         for path in to_delete:
             del cart.files[path]
+    
+    if not keep_pod_compression:
+        for path, file in cart.files.items():
+            if not file.is_raw and not file.is_dir:
+                if uncompress_pods:
+                    file.set_payload(file.payload, compress=False, use_pxu=False)
+                elif need_pod_compression:
+                    file.set_payload(file.payload, compress=True, use_pxu=True)
+                else:
+                    file.set_payload(file.payload, compress=False, use_pxu=True)
 
 def merge_cart64(dest, src, sections):
     glob = Cart64Glob(sections) if e(sections) else None
diff --git a/picotron_fs.py b/picotron_fs.py
index 961f349..25be0f6 100644
--- a/picotron_fs.py
+++ b/picotron_fs.py
@@ -1,24 +1,33 @@
 from utils import *
 from pico_defs import Language, encode_luastr, decode_luastr
+from pico_export import lz4_compress, lz4_uncompress
+from pico_compress import update_mtf
+
+# (note - picotron pods have nothing to do with pico8 pods)
 
 k_pod = b"pod"
 k_pod_str = k_pod.decode()
 k_pod_prefix_str = k_pod_str + ","
 k_pod_format = b"pod_format"
-k_pod_format_str = k_pod_format.decode()
+k_pod_prefix_strs = (k_pod_format.decode(), "pod_type")
 k_pod_raw_format = k_pod_format + b"=\"raw\""
 k_meta_prefix = b"--[["
 k_meta_pod_prefix = k_meta_prefix + k_pod
 k_meta_pod_raw_prefix = k_meta_prefix + k_pod_raw_format
 k_meta_suffix = b"]]"
 
-class UserData(Struct):
+class UserData(Tuple):
+    """Represents a picotron userdata"""
     type = width = height = data = ...
 
-def parse_pod(pod):
+def parse_pod(pod, ud_handler=None):
+    """Parses a picotron pod from a readable string"""
     src = Source("<pod>", pod)
     tokens, token_errors = tokenize(src, lang=Language.picotron)
-    root, parse_errors = parse(src, tokens, lang=Language.picotron, for_expr=True)
+    if tokens:
+        root, parse_errors = parse(src, tokens, lang=Language.picotron, for_expr=True)
+    else:
+        root, parse_errors = None, []
     
     value_errors = []
     def add_error(msg, node):
@@ -36,15 +45,21 @@ def node_to_value(node):
                 return None
         elif node.type == NodeType.unary_op and node.op == "-" and node.child.type == NodeType.const and node.child.token.type == TokenType.number:
             return -node.child.token.parsed_value
-        elif node.type == NodeType.call and node.func.type == NodeType.var and node.func.name == "userdata" and len(node.args) in (3, 4):
-            type = node_to_value(node.args[0])
-            width = node_to_value(node.args[1])
-            height = node_to_value(node.args[2]) if len(node.args) == 4 else 0
-            data = node_to_value(node.args[-1])
-            if isinstance(type, str) and isinstance(width, int) and isinstance(height, int) and isinstance(data, str):
-                return UserData(type, width, height, data)
-            else:
-                add_error(f"unknown userdata params: {type}, {width}, {height}, {data}")
+
+        elif node.type == NodeType.call and node.func.type == NodeType.var and node.func.name == "userdata":
+            ud_args = tuple(node_to_value(arg) for arg in node.args)
+            # textual form: userdata(type, width, [height,] data) - height is taken as 0 when omitted
+            if len(ud_args) in (3, 4):
+                type, width, data = ud_args[0], ud_args[1], ud_args[-1]
+                height = ud_args[2] if len(ud_args) == 4 else 0
+                if isinstance(type, str) and isinstance(width, int) and isinstance(height, int) and isinstance(data, str):
+                    return UserData(type, width, height, data)
+            # an argument-less userdata() is a placeholder that ud_handler (when given) resolves to a UserData
+            if len(ud_args) == 0 and ud_handler and (userdata := ud_handler()):
+                return userdata
+
+            add_error(f"unknown userdata params: {ud_args}", node)
+
         elif node.type == NodeType.table:
             table = {}
             index = 1
@@ -75,13 +90,15 @@ def node_to_value(node):
     return value
 
 def parse_meta_pod(pod):
+    """Parses a picotron pod as it appears in a file's metadata"""
     if pod == k_pod_str:
         return {}
 
     pod = str_remove_prefix(pod, k_pod_prefix_str)
     return parse_pod("{" + pod + "}")
 
-def format_pod(value):
+def format_pod(value, ud_handler=None):
+    """Formats a picotron pod into a readable string"""
     if value is None:
         return "nil"
     elif value is False:
@@ -92,24 +109,29 @@ def format_pod(value):
         return format_luanum(value, base=10)
     elif isinstance(value, str):
         return format_string_literal(value, long=False, quote='"')
+
     elif isinstance(value, UserData):
-        # TODO: pxu (though not usable in meta, anyway)
         type, width, height, data = format_pod(value.type), format_pod(value.width), format_pod(value.height), format_pod(value.data)
         if value.height:
-            return f"userdata({type},{width},{height},{data})"
+            result = f"userdata({type},{width},{height},{data})"
         else:
-            return f"userdata({type},{width},{data})"
+            result = f"userdata({type},{width},{data})"
+        
+        if ud_handler and ud_handler(value, result):
+            result = "\0" # to allow unambiguously finding it in the result
+        return result
+
     elif isinstance(value, dict):
         index = 1
         parts = []
         for key, child in value.items():
             if key == index:
-                parts.append(format_pod(child))
+                parts.append(format_pod(child, ud_handler))
                 index += 1
-            elif is_identifier(key, Language.picotron):
-                parts.append(f"{key}={format_pod(child)}")
+            elif isinstance(key, str) and is_identifier(key, Language.picotron):
+                parts.append(f"{key}={format_pod(child, ud_handler)}")
             else:
-                parts.append(f"[{format_pod(key)}]={format_pod(child)}")
+                parts.append(f"[{format_pod(key, ud_handler)}]={format_pod(child, ud_handler)}")
         return "{" + ",".join(parts) + "}"
     else:
         throw(f"invalid pod value {value}")
@@ -125,17 +147,209 @@ def escape_meta(pod):
     return pod
 
 def format_meta_pod(value):
-    if k_pod_format_str in value: # put it first
-        prefix = f"{k_pod_format_str}={format_pod(value[k_pod_format_str])}"
-        value = value.copy()
-        del value[k_pod_format_str]
+    """Formats a picotron pod as it should appear in a file's metadata"""
+    for pod_prefix_str in k_pod_prefix_strs:
+        if pod_prefix_str in value: # put it first
+            prefix = f"{pod_prefix_str}={format_pod(value[pod_prefix_str])}"
+            value = value.copy()
+            del value[pod_prefix_str]
+            break
     else:
         prefix = k_pod_str
     
-    rest = escape_meta(format_pod(value)[1:-1])
-    return f"{prefix},{rest}" if rest else prefix
+    rest = format_pod(value)[1:-1]
+    return escape_meta(f"{prefix},{rest}" if rest else prefix)
+
+k_lz4_prefix = b"lz4\0"
+k_pxu_prefix = b"pxu\0"
+
+class PxuFlags(Bitmask):  # flags held in the u16 that follows the pxu magic (k_pxu_prefix)
+    unk_type = 0x3  # meaning unknown; read_pxu rejects blobs where this isn't 3 and write_pxu always sets it
+    has_height = 0x40  # a height field follows the width
+    long_size = 0x800  # width/height are stored as u32 instead of u8
+    compress = 0x2000  # read_pxu only supports blobs with this set; write_pxu always sets it
+
+def read_pxu(data, idx):
+    """Reads one blob in the picotron userdata compression format 'pxu', starting at position idx of data, into a UserData.
+    Returns a (UserData, reader position after the blob) pair; unsupported flags or bit widths are reported via check/throw."""
+    with BinaryReader(BytesIO(data)) as r:
+        r.setpos(idx)
+        check(r.bytes(4) == k_pxu_prefix, "wrong pxu header")
+        flags = PxuFlags(r.u16())
+        if not flags.compress or flags.unk_type != 3:
+            throw(f"unsupported pxu flags: {flags}")
+        # width/height are u32 when long_size is set, else u8; a missing height counts as 1 when sizing the output
+        width = r.u32() if flags.long_size else r.u8()
+        height = (r.u32() if flags.long_size else r.u8()) if flags.has_height else 1
+        size = width * height
+        # each token byte holds a slot index in its low `bits` bits and (run length - 1) in the remaining high bits
+        bits = r.u8()
+        check(bits == 4, "unsupported pxu bits") # TODO - allow more?
+        mask = (1 << bits) - 1
+        ext_count = 1 << (8 - bits)
+
+        # the general idea behind the complexity is that repeated pixels can
+        # take up spots from low-valued pixels.
+
+        data = bytearray()  # decoded bytes (rebinds the parameter; the reader above already wraps the input)
+        mapping = [i for i in range(mask)]  # slot index -> byte value, initially the identity
+        mtf = [i for i in range(mask)]  # slot indices as ordered by update_mtf; literals overwrite the slot in mtf[-1]
+
+        while len(data) < size:
+            b = r.u8()
+            
+            index = b & mask
+            if index == mask:  # escape index: a literal byte follows and takes over the slot in mtf[-1]
+                value = r.u8()
+                mapping[mtf[-1]] = value
+            
+            else:
+                update_mtf(mtf, mtf.index(index), index)
+                value = mapping[index]
+            
+            count = 1 + (b >> bits)
+            if count == ext_count:  # maximal in-token count: extension bytes follow, up to and including the first one != 0xff
+                while True:
+                    c = r.u8()
+                    count += c
+                    if c != 0xff:
+                        break
+            
+            for i in range(count):
+                data.append(value)
+
+        hexdata = "".join(f"{b:02x}" for b in data)
+        return UserData("u8", width, height if flags.has_height else 0, hexdata), r.pos()
+
+def read_pod(value):
+    """Reads a picotron pod from possibly compressed bytes.
+    An lz4 container (k_lz4_prefix) is unpacked first; each pxu blob is then swapped for 'userdata()' and restored during parsing."""
+    if value.startswith(k_lz4_prefix):
+        with BinaryReader(BytesIO(value)) as r:
+            r.addpos(4)  # skip the lz4 magic
+            size = r.u32()  # number of compressed bytes that follow the header
+            _unc_size = r.u32()  # second header field (write_pod stores the uncompressed length here); unused
+            value = lz4_uncompress(r.bytes(size))
+
+    pxu_i = 0
+    userdatas = None
+    while True:
+        pxu_i = value.find(k_pxu_prefix, pxu_i)
+        if pxu_i < 0:
+            break
+        
+        userdatas = userdatas or deque()
+        userdata, end_i = read_pxu(value, pxu_i)
+        value = str_replace_between(value, pxu_i, end_i, b"userdata()")
+        userdatas.append(userdata)
+    # passed to parse_pod as ud_handler: hands the decoded blobs back in the order they were found
+    def handle_userdata():
+        if userdatas:
+            return userdatas.popleft()
+
+    return parse_pod(decode_luastr(value), handle_userdata)
+
+def write_pxu(ud):
+    """Writes userdata via the picotron userdata compression format 'pxu'; returns the bytes, or None unless ud.type is 'u8'."""
+    if ud.type != "u8":
+        return None
+    
+    with BinaryWriter() as w:
+        flags = PxuFlags.unk_type | PxuFlags.compress
+        if ud.height:
+            flags |= PxuFlags.has_height
+        if ud.width >= 0x100 or ud.height >= 0x100:
+            flags |= PxuFlags.long_size
+
+        w.bytes(k_pxu_prefix)
+        w.u16(int(flags))
+        (w.u32 if flags.long_size else w.u8)(ud.width)
+        if flags.has_height:
+            (w.u32 if flags.long_size else w.u8)(ud.height)
+        # ud.data is a hex string, decoded here two digits at a time
+        data = bytearray()
+        try:
+            for i in range(0, len(ud.data), 2):
+                data.append(int(ud.data[i:i+2], 16))
+        except ValueError:
+            throw("invalid userdata encountered")
+
+        bits = 4 # could try other values, but picotron itself never does?
+        w.u8(bits)
+        mask = (1 << bits) - 1
+        ext_count = 1 << (8 - bits)
+        
+        mapping = [i for i in range(mask)]  # slot index -> byte value, initially the identity (same state as in read_pxu)
+        mtf = [i for i in range(mask)]  # slot indices as ordered by update_mtf; literals overwrite the slot in mtf[-1]
+
+        i = 0
+        while i < len(data):
+            count = 1  # length of the run of equal bytes that starts at i
+            value = data[i]
+            i += 1
+            while i < len(data) and data[i] == value:
+                count += 1
+                i += 1
+
+            index = list_find(mapping, value)
+            if index < 0:  # value has no slot: emit the escape index plus the literal, and give it the slot in mtf[-1]
+                index = mask
+                mapping[mtf[-1]] = value
+
+            else:
+                update_mtf(mtf, mtf.index(index), index)
+            
+            w.u8(index | ((min(count, ext_count) - 1) << bits))  # low bits: slot index; high bits: run length - 1, capped
+            if index == mask:
+                w.u8(value)
+            
+            if count >= ext_count:  # the rest of the run goes into extension bytes, ending with one below 0xff
+                count -= ext_count
+                while count >= 0xff:
+                    w.u8(0xff)
+                    count -= 0xff
+                w.u8(count)
+
+        return w.f.getvalue()
+
+def write_pod(pod, compress=True, use_pxu=True):
+    """Writes a picotron pod into optionally compressed bytes.
+    With use_pxu, userdata is embedded as pxu when write_pxu yields a shorter encoding; with compress, the result is wrapped in an lz4 container."""
+    pxu_datas = None
+    def handle_userdata(ud, str_data):
+        nonlocal pxu_datas
+        if use_pxu:
+            pxu_data = write_pxu(ud)
+            if pxu_data and len(pxu_data) < len(str_data):  # only use pxu when it is shorter than the textual form
+                pxu_datas = pxu_datas or deque()
+                pxu_datas.append(pxu_data)
+                return True  # makes format_pod emit a "\0" placeholder instead of the text
+
+    value = encode_luastr(format_pod(pod, handle_userdata))
+
+    pxu_i = 0
+    while pxu_datas:
+        pxu_i = value.find(0, pxu_i)  # next NUL placeholder left by format_pod
+        assert pxu_i >= 0
+
+        pxu_data = pxu_datas.popleft()
+        value = str_replace_at(value, pxu_i, 1, pxu_data)
+        pxu_i += len(pxu_data)  # continue after the inserted blob, which itself contains NUL bytes
+
+    if compress:  # lz4 container: magic, u32 compressed length, u32 uncompressed length, compressed bytes
+        with BinaryWriter() as w:
+            compressed = lz4_compress(value)
+            w.bytes(k_lz4_prefix)
+            w.u32(len(compressed))
+            w.u32(len(value))
+            w.bytes(compressed)
+            value = w.f.getvalue()
+    
+    return value
 
 class PicotronFile:
+    """A picotron file or directory in its filesystem - files contain metadata & payload"""
+
     def __init__(m, data, line=0):
         m.data = data
         m.line = line
@@ -196,6 +410,23 @@ def raw_payload(m, value):
         else:
             m.data = value
 
+    @property
+    def payload(m):  # the file's content: the raw payload for raw files, otherwise the pod decoded by read_pod
+        if m.is_raw:
+            return m.raw_payload
+        else:
+            return read_pod(m.raw_payload)
+    
+    @payload.setter
+    def payload(m, value):  # same as set_payload(value) with its default options
+        m.set_payload(value)
+
+    def set_payload(m, value, compress=True, use_pxu=True):  # compress/use_pxu only matter for non-raw (pod) files
+        if m.is_raw:
+            m.raw_payload = value  # raw files store the value unchanged
+        else:
+            m.raw_payload = write_pod(value, compress=compress, use_pxu=use_pxu)  # pods are re-encoded on every set
+
     is_dir = False
 
 class PicotronDir(PicotronFile):
diff --git a/shrinko.py b/shrinko.py
index 1f41e20..d168b85 100644
--- a/shrinko.py
+++ b/shrinko.py
@@ -168,6 +168,8 @@ def create_parser():
                                 help=f"specify a {sections_desc} that contain lua code to process (default: *.lua)")
             pgroup.add_argument("--delete-meta", type=SplitBySeps, action="extend",
                                 help=f"specify a {sections_desc} to delete metadata of (default: * if minifying unsafely, else none)")
+            pgroup.add_argument("--keep-pod-compression", action="store_true", help="keep compression of all pod files as-is")
+            pgroup.add_argument("--uncompress-pods", action="store_true", help="uncompress all pod files to plain text")
             pgroup.add_argument("--list", action="store_true", help="list all files inside the cart")
             pgroup.add_argument("--filter", type=SplitBySeps, action="extend", help=f"specify a {sections_desc} to keep in the output")
             pgroup.add_argument("--insert", nargs='+', action="append", metavar=(f"INPUT [FSPATH] [FILES_FILTER]", ""),
@@ -175,7 +177,8 @@ def create_parser():
             pgroup.add_argument("--extract", nargs='+', action="append", metavar=(f"FSPATH [OUTPUT]", ""),
                                 help=f"extract the specified file or directory from FSPATH to OUTPUT ")
         else:
-            pgroup.set_defaults(code_sections=None, delete_meta=None, filter=None, insert=None, extract=None)
+            pgroup.set_defaults(code_sections=None, delete_meta=None, uncompress_pods=None, keep_pod_compression=None,
+                                filter=None, insert=None, extract=None)
         
         pgroup.add_argument("--merge", nargs='+', action="append", metavar=(f"INPUT {sections_meta} [FORMAT]", ""),
                             help=f"merge {sections_str} from the specified INPUT file, where {sections_meta} is a {sections_desc}")
@@ -511,8 +514,13 @@ def handle_processing(args, main_cart, extra_carts):
         
             if args.filter:
                 filter_cart_func(cart, args.filter)
-            if args.delete_meta:
-                preproc_cart_func(cart, delete_meta=args.delete_meta)
+
+            if is_picotron:
+                preproc_cart_func(cart, delete_meta=args.delete_meta,
+                                  keep_pod_compression=args.keep_pod_compression,
+                                  uncompress_pods=args.uncompress_pods,
+                                  # binary formats are already compressed, so pod compression just hurts
+                                  need_pod_compression=args.format and args.format.is_src)
 
             src = CartSourceCls(cart, args.code_sections)