From de914516f1bf473af8bfb91d2992eefcbddf3717 Mon Sep 17 00:00:00 2001 From: "Bernhard M. Wiedemann" Date: Sun, 4 Oct 2015 11:35:27 +0200 Subject: [PATCH 1/4] xz magic is actually 6 bytes long --- xdelta3/xdelta3-main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdelta3/xdelta3-main.h b/xdelta3/xdelta3-main.h index 018a6971..df6deb44 100644 --- a/xdelta3/xdelta3-main.h +++ b/xdelta3/xdelta3-main.h @@ -296,7 +296,7 @@ static main_extcomp extcomp_types[] = { "compress", "-c", "uncompress", "-c", "Z", "\037\235", 2, 0 }, /* Xz is lzma with a magic number http://tukaani.org/xz/format.html */ - { "xz", "-c", "xz", "-dc", "Y", "\xfd\x37\x7a\x58\x5a\x00", 2, 0 }, + { "xz", "-c", "xz", "-dc", "Y", "\xfd\x37\x7a\x58\x5a\x00", 6, 0 }, }; static int main_input (xd3_cmd cmd, main_file *ifile, From 8150f15f140bee5b157498dccc007ef08bf8f1b5 Mon Sep 17 00:00:00 2001 From: "Bernhard M. Wiedemann" Date: Sun, 4 Oct 2015 11:36:57 +0200 Subject: [PATCH 2/4] use gzip -n option for compression to omit the timestamp and thus produce reproducible compression output which has a chance to match the original --- xdelta3/xdelta3-main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdelta3/xdelta3-main.h b/xdelta3/xdelta3-main.h index df6deb44..f2aec44e 100644 --- a/xdelta3/xdelta3-main.h +++ b/xdelta3/xdelta3-main.h @@ -292,7 +292,7 @@ static xd3_stream *merge_stream = NULL; static main_extcomp extcomp_types[] = { { "bzip2", "-c", "bzip2", "-dc", "B", "BZh", 3, 0 }, - { "gzip", "-c", "gzip", "-dc", "G", "\037\213", 2, 0 }, + { "gzip", "-cn", "gzip", "-dc", "G", "\037\213", 2, 0 }, { "compress", "-c", "uncompress", "-c", "Z", "\037\235", 2, 0 }, /* Xz is lzma with a magic number http://tukaani.org/xz/format.html */ From 56e5f9ecc53af9ff3b50b00557b2cfd6a1468215 Mon Sep 17 00:00:00 2001 From: "Bernhard M. Wiedemann" Date: Sun, 4 Oct 2015 14:42:04 +0000 Subject: [PATCH 3/4] add compression-level detection logic --- xdelta3/xdelta3-internal.h | 1 + xdelta3/xdelta3-main.h | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/xdelta3/xdelta3-internal.h b/xdelta3/xdelta3-internal.h index 35de56bd..5fa3d82b 100644 --- a/xdelta3/xdelta3-internal.h +++ b/xdelta3/xdelta3-internal.h @@ -87,6 +87,7 @@ struct _main_file const char *realname; /* File name or /dev/stdin, * /dev/stdout, /dev/stderr. */ const main_extcomp *compressor; /* External compression struct. */ + int compression_level; /* 0..9 or -1 if not detected */ int flags; /* RD_FIRST, RD_NONEXTERNAL, ... */ xoff_t nread; /* for input position */ xoff_t nwrite; /* for output position */ diff --git a/xdelta3/xdelta3-main.h b/xdelta3/xdelta3-main.h index f2aec44e..71978365 100644 --- a/xdelta3/xdelta3-main.h +++ b/xdelta3/xdelta3-main.h @@ -207,6 +207,8 @@ struct _main_extcomp const char *magic; usize_t magic_size; int flags; +/* function for detecting compression level of input file */ + int (*detect_func)(uint8_t *data, int len); }; /* Merge state: */ @@ -291,12 +293,12 @@ static xd3_stream *merge_stream = NULL; * false just so the program knows the mapping of IDENT->NAME. */ static main_extcomp extcomp_types[] = { - { "bzip2", "-c", "bzip2", "-dc", "B", "BZh", 3, 0 }, - { "gzip", "-cn", "gzip", "-dc", "G", "\037\213", 2, 0 }, - { "compress", "-c", "uncompress", "-c", "Z", "\037\235", 2, 0 }, + { "bzip2", "-c", "bzip2", "-dc", "B", "BZh", 3, 0, NULL }, + { "gzip", "-cn", "gzip", "-dc", "G", "\037\213", 2, 0, NULL }, + { "compress", "-c", "uncompress", "-c", "Z", "\037\235", 2, 0, NULL }, /* Xz is lzma with a magic number http://tukaani.org/xz/format.html */ - { "xz", "-c", "xz", "-dc", "Y", "\xfd\x37\x7a\x58\x5a\x00", 6, 0 }, + { "xz", "-c", "xz", "-dc", "Y", "\xfd\x37\x7a\x58\x5a\x00", 6, 0, NULL }, }; static int main_input (xd3_cmd cmd, main_file *ifile, @@ -2453,7 +2455,12 @@ main_secondary_decompress_check (main_file *file, if (memcmp (check_buf, decomp->magic, decomp->magic_size) == 0) { + int compression_level = -1; decompressor = decomp; + if(decomp->detect_func) { + compression_level = decomp->detect_func(check_buf, try_read); + } + file->compression_level = compression_level; break; } } From 355d06d3f1e8b9e30f033f9caa39b1816a05aa3e Mon Sep 17 00:00:00 2001 From: "Bernhard M. Wiedemann" Date: Sun, 4 Oct 2015 16:37:20 +0000 Subject: [PATCH 4/4] detect xz, bz2 and gz compression level this can help to get closer to identical files after applying deltas --- xdelta3/xdelta3-compr.h | 42 +++++++++++++++++++++++++++++++++++++++++ xdelta3/xdelta3-main.h | 7 ++++--- 2 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 xdelta3/xdelta3-compr.h diff --git a/xdelta3/xdelta3-compr.h b/xdelta3/xdelta3-compr.h new file mode 100644 index 00000000..746cd838 --- /dev/null +++ b/xdelta3/xdelta3-compr.h @@ -0,0 +1,42 @@ +int gz_detect_func(uint8_t *data, int len) +{ + uint8_t flags; + if(len < 9) return -1; + flags = data[8]; + switch(flags) { + case 2: return 9; /* maximum compr */ + case 4: return 1; /* minimum compr */ + } + return 6; /* could be 2..8 but 6 is default */ +} + +int bz2_detect_func(uint8_t *data, int len) +{ + if(len < 4) return -1; + return data[3]&0xf; +} + +int xz_detect_func(uint8_t *data, int len) +{ + int offs; + int dict_size; + /* there might be extra headers which need to be skipped */ + for( offs = 14; offs < 26; offs++) { + if(offs+2 >= len) return -1; + if(data[offs+0] != 0x21) continue; /* LZMA filter */ + if(data[offs+1] != 0x01) continue; /* Size of Filter Properties: 1 byte */ + dict_size = data[offs+2]; + switch(dict_size) { + case 12: return 0; + case 16: return 1; + case 18: return 2; + case 20: return 3; /* could also be 4 */ + case 22: return 6; /* could also be 5 but 6 is the default so the guess is correct in 99% of cases */ + case 24: return 7; + case 26: return 8; + case 28: return 9; + default: return -1; /* not guessable */ + } + } + return -1; +} diff --git a/xdelta3/xdelta3-main.h b/xdelta3/xdelta3-main.h index 71978365..f2572fd6 100644 --- a/xdelta3/xdelta3-main.h +++ b/xdelta3/xdelta3-main.h @@ -289,16 +289,17 @@ static xd3_stream *recode_stream = NULL; /* merge_stream is used by merge commands for storing the source encoding */ static xd3_stream *merge_stream = NULL; +#include "xdelta3-compr.h" /* This array of compressor types is compiled even if EXTERNAL_COMPRESSION is * false just so the program knows the mapping of IDENT->NAME. */ static main_extcomp extcomp_types[] = { - { "bzip2", "-c", "bzip2", "-dc", "B", "BZh", 3, 0, NULL }, - { "gzip", "-cn", "gzip", "-dc", "G", "\037\213", 2, 0, NULL }, + { "bzip2", "-c", "bzip2", "-dc", "B", "BZh", 3, 0, bz2_detect_func }, + { "gzip", "-cn", "gzip", "-dc", "G", "\037\213", 2, 0, gz_detect_func }, { "compress", "-c", "uncompress", "-c", "Z", "\037\235", 2, 0, NULL }, /* Xz is lzma with a magic number http://tukaani.org/xz/format.html */ - { "xz", "-c", "xz", "-dc", "Y", "\xfd\x37\x7a\x58\x5a\x00", 6, 0, NULL }, + { "xz", "-c", "xz", "-dc", "Y", "\xfd\x37\x7a\x58\x5a\x00", 6, 0, xz_detect_func }, }; static int main_input (xd3_cmd cmd, main_file *ifile,