deflate_compress: rewind blocks in near-optimal compressor
The block splitting algorithm works by examining successive chunks of
data and ending the block when a chunk differs significantly from the
rest of the block.  Currently, the data chunk where the change is
detected is included in the block, which is suboptimal -- after all, we
know that it's different from the rest of the block.  Better results
could be achieved by ending the block just before the chunk.

Implement this in the near-optimal compressor.  This slightly improves
its compression ratio.

Note: I also tested an implementation of this for the lazy compressor.
It improves compression ratio too, but it doesn't seem worthwhile there
from a performance and complexity standpoint.
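
To illustrate the idea in isolation, here is a minimal sketch of the rewind logic; it is not the actual libdeflate code. The start of the chunk currently being examined is remembered, and when that chunk is found to differ from the rest of the block, the block is ended just before it so that the differing data starts the next block instead. The helpers chunk_differs() and emit_block(), and the constant CHUNK_LEN, are hypothetical placeholders for the real block-split heuristic and block output code.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define CHUNK_LEN 512

/* Placeholder for the real block-split heuristic ("does this chunk differ?"). */
static bool chunk_differs(const unsigned char *chunk, size_t len)
{
	(void)chunk; (void)len;
	return false;	/* stub: never signals a difference */
}

/* Placeholder for compressing and flushing one DEFLATE block. */
static void emit_block(const unsigned char *begin, size_t len)
{
	(void)begin;
	printf("block of %zu bytes\n", len);
}

/*
 * Scan the input in fixed-size chunks.  When a chunk differs from the rest
 * of the current block, end the block *before* that chunk (the "rewind"),
 * so the differing data begins the next block instead of being appended to
 * the old one.
 */
static void split_into_blocks(const unsigned char *in, size_t in_nbytes)
{
	const unsigned char *block_begin = in;
	const unsigned char *in_next = in;
	const unsigned char *in_end = in + in_nbytes;

	while (in_next < in_end) {
		size_t chunk_len = (size_t)(in_end - in_next);

		if (chunk_len > CHUNK_LEN)
			chunk_len = CHUNK_LEN;
		if (in_next > block_begin &&
		    chunk_differs(in_next, chunk_len)) {
			/* End the block just before the differing chunk. */
			emit_block(block_begin, in_next - block_begin);
			block_begin = in_next;
		}
		in_next += chunk_len;
	}
	if (in_end > block_begin)
		emit_block(block_begin, in_end - block_begin);
}

int main(void)
{
	static const unsigned char data[4096];

	split_into_blocks(data, sizeof(data));
	return 0;
}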
ebiggers committed Jan 6, 2022
1 parent 5f4da4b commit 55f9f70
223 changes: 168 additions & 55 deletions lib/deflate_compress.c
@@ -592,6 +592,7 @@ struct libdeflate_compressor {
* greedy parse, gathered during matchfinding. This is
* used for setting the initial symbol costs.
*/
u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];

unsigned num_optim_passes;
@@ -2164,14 +2165,22 @@ do_end_block_check(struct block_split_stats *stats, u32 block_length)
return false;
}

static forceinline bool
ready_to_check_block(const struct block_split_stats *stats,
const u8 *in_block_begin, const u8 *in_next,
const u8 *in_end)
{
return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK
&& in_next - in_block_begin >= MIN_BLOCK_LENGTH
&& in_end - in_next >= MIN_BLOCK_LENGTH;
}

static forceinline bool
should_end_block(struct block_split_stats *stats,
const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
{
/* Ready to check block split statistics? */
if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
in_next - in_block_begin < MIN_BLOCK_LENGTH ||
in_end - in_next < MIN_BLOCK_LENGTH)
/* Ready to try to end the block (again)? */
if (!ready_to_check_block(stats, in_block_begin, in_next, in_end))
return false;

return do_end_block_check(stats, in_next - in_block_begin);
@@ -2330,11 +2339,12 @@ recalculate_min_match_len(const struct deflate_freqs *freqs,
}

static forceinline const u8 *
choose_max_block_end(const u8 *in_next, const u8 *in_end, size_t soft_max_len)
choose_max_block_end(const u8 *in_block_begin, const u8 *in_end,
size_t soft_max_len)
{
if (in_end - in_next < soft_max_len + MIN_BLOCK_LENGTH)
if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH)
return in_end;
return in_next + soft_max_len;
return in_block_begin + soft_max_len;
}

/*
@@ -2981,17 +2991,21 @@ static const struct {
*/
static void
deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
u32 block_length,
const u8 *block_begin, u32 block_length,
u32 *lit_cost, u32 *len_sym_cost)
{
unsigned num_used_literals = 0;
u32 literal_freq = block_length;
u32 match_freq = 0;
u32 cutoff;
unsigned i;
u32 i;

/* Calculate the number of distinct literals that exist in the data. */
memset(c->freqs.litlen, 0,
DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */
for (i = 0; i < block_length; i++)
c->freqs.litlen[block_begin[i]]++;
for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
if (c->freqs.litlen[i] > cutoff)
num_used_literals++;
@@ -3258,7 +3272,8 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
* as the costs.
*/
static void
deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
deflate_optimize_block(struct libdeflate_compressor *c,
const u8 *block_begin, u32 block_length,
const struct lz_match *cache_ptr, bool is_first_block,
bool is_final_block)
{
@@ -3275,11 +3290,8 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;

/* Make sure the literal/match statistics are up to date. */
merge_new_observations(&c->split_stats);

/* Set the initial costs. */
deflate_choose_default_litlen_costs(c, block_length,
deflate_choose_default_litlen_costs(c, block_begin, block_length,
&lit_cost, &len_sym_cost);
if (is_first_block)
deflate_set_default_costs(c, lit_cost, len_sym_cost);
@@ -3308,31 +3320,49 @@ deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
}

static void
deflate_near_optimal_begin_block(struct libdeflate_compressor *c,
bool is_first_block)
deflate_near_optimal_init_stats(struct libdeflate_compressor *c)
{
int i;
init_block_split_stats(&c->split_stats);
memset(c->p.n.new_match_len_freqs, 0,
sizeof(c->p.n.new_match_len_freqs));
memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
}

if (!is_first_block) {
/*
* Save some literal/match statistics from the previous block so
* that deflate_adjust_costs() will be able to decide how much
* the current block differs from the previous one.
*/
for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
c->p.n.prev_observations[i] =
c->split_stats.observations[i];
}
c->p.n.prev_num_observations = c->split_stats.num_observations;
static void
deflate_near_optimal_merge_stats(struct libdeflate_compressor *c)
{
unsigned i;

merge_new_observations(&c->split_stats);
for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i];
c->p.n.new_match_len_freqs[i] = 0;
}
init_block_split_stats(&c->split_stats);
}

/*
* During matchfinding, we keep track of approximate literal and match
* length frequencies for the purpose of setting the initial costs.
*/
memset(c->freqs.litlen, 0,
DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
/*
* Save some literal/match statistics from the previous block so that
* deflate_adjust_costs() will be able to decide how much the current block
* differs from the previous one.
*/
static void
deflate_near_optimal_save_stats(struct libdeflate_compressor *c)
{
int i;

for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
c->p.n.prev_observations[i] = c->split_stats.observations[i];
c->p.n.prev_num_observations = c->split_stats.num_observations;
}

static void
deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c)
{
int i;

for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
c->split_stats.observations[i] = 0;
c->split_stats.num_observations = 0;
memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
}

@@ -3355,30 +3385,37 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
u8 * restrict out, size_t out_nbytes_avail)
{
const u8 *in_next = in;
const u8 *in_block_begin = in_next;
const u8 *in_end = in_next + in_nbytes;
struct deflate_output_bitstream os;
const u8 *in_cur_base = in_next;
const u8 *in_next_slide =
in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
unsigned max_len = DEFLATE_MAX_MATCH_LEN;
unsigned nice_len = MIN(c->nice_match_length, max_len);
struct lz_match *cache_ptr = c->p.n.match_cache;
u32 next_hashes[2] = {0, 0};

deflate_init_output(&os, out, out_nbytes_avail);
bt_matchfinder_init(&c->p.n.bt_mf);
deflate_near_optimal_init_stats(c);

do {
/* Starting a new DEFLATE block */

struct lz_match *cache_ptr = c->p.n.match_cache;
const u8 * const in_block_begin = in_next;
const u8 * const in_max_block_end = choose_max_block_end(
in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH);
const u8 *prev_end_block_check = NULL;
bool change_detected = false;
const u8 *next_observation = in_next;
unsigned min_len;

deflate_near_optimal_begin_block(c, in_block_begin == in);
min_len = calculate_min_match_len(in_next,
/*
* Use the minimum match length heuristic to improve the
* literal/match statistics gathered during matchfinding.
* However, the actual near-optimal parse won't respect min_len,
* as it can accurately assess the costs of different matches.
*/
min_len = calculate_min_match_len(in_block_begin,
in_max_block_end - in_next,
c->max_search_depth);

@@ -3390,7 +3427,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
* (2) Match cache may overflow.
* (3) Block split heuristic says to split now.
*/
do {
for (;;) {
struct lz_match *matches;
unsigned best_len;
size_t remaining = in_end - in_next;
@@ -3436,13 +3473,12 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
if (cache_ptr > matches)
best_len = cache_ptr[-1].length;
}
c->freqs.litlen[*in_next]++;
if (in_next >= next_observation) {
if (best_len >= min_len) {
observe_match(&c->split_stats,
best_len);
next_observation = in_next + best_len;
c->p.n.match_len_freqs[best_len]++;
c->p.n.new_match_len_freqs[best_len]++;
} else {
observe_literal(&c->split_stats,
*in_next);
@@ -3495,24 +3531,101 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
}
cache_ptr->length = 0;
cache_ptr->offset = *in_next;
c->freqs.litlen[*in_next]++;
in_next++;
cache_ptr++;
} while (--best_len);
}
} while (in_next < in_max_block_end &&
cache_ptr < &c->p.n.match_cache[MATCH_CACHE_LENGTH] &&
!should_end_block(&c->split_stats,
in_block_begin, in_next, in_end));
/* Maximum block length or end of input reached? */
if (in_next >= in_max_block_end)
break;
/* Match cache overflowed? */
if (cache_ptr >=
&c->p.n.match_cache[MATCH_CACHE_LENGTH])
break;
/* Not ready to try to end the block (again)? */
if (!ready_to_check_block(&c->split_stats,
in_block_begin, in_next,
in_end))
continue;
/* Check if it would be worthwhile to end the block. */
if (do_end_block_check(&c->split_stats,
in_next - in_block_begin)) {
change_detected = true;
break;
}
/* Ending the block doesn't seem worthwhile here. */
deflate_near_optimal_merge_stats(c);
prev_end_block_check = in_next;
}
/*
* All the matches for this block have been cached. Now choose
* the sequence of items to output and flush the block.
* the precise end of the block and the sequence of items to
* output to represent it, then flush the block.
*/
deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
in_block_begin == in, in_next == in_end);
deflate_flush_block(c, &os, in_block_begin,
in_next - in_block_begin,
NULL, in_next == in_end);
if (change_detected && prev_end_block_check != NULL) {
/*
* The block is being ended because a recent chunk of
* data differs from the rest of the block. We could
* end the block at 'in_next' like the greedy and lazy
* compressors do, but that's not ideal since it would
* include the differing chunk in the block. The
* near-optimal compressor has time to do a better job.
* Therefore, we rewind to just before the chunk, and
* output a block that only goes up to there.
*
* We then set things up to correctly start the next
* block, considering that some work has already been
* done on it (some matches found and stats gathered).
*/
struct lz_match *orig_cache_ptr = cache_ptr;
const u8 *in_block_end = prev_end_block_check;
u32 block_length = in_block_end - in_block_begin;
bool is_first = (in_block_begin == in);
bool is_final = false;
u32 num_bytes_to_rewind = in_next - in_block_end;
size_t cache_len_rewound;

/* Rewind the match cache. */
do {
cache_ptr--;
cache_ptr -= cache_ptr->length;
} while (--num_bytes_to_rewind);
cache_len_rewound = orig_cache_ptr - cache_ptr;

deflate_optimize_block(c, in_block_begin, block_length,
cache_ptr, is_first, is_final);
deflate_flush_block(c, &os, in_block_begin,
block_length, NULL, is_final);
memmove(c->p.n.match_cache, cache_ptr,
cache_len_rewound * sizeof(*cache_ptr));
cache_ptr = &c->p.n.match_cache[cache_len_rewound];
deflate_near_optimal_save_stats(c);
/*
* Clear the stats for the just-flushed block, leaving
* just the stats for the beginning of the next block.
*/
deflate_near_optimal_clear_old_stats(c);
in_block_begin = in_block_end;
} else {
/*
* The block is being ended for a reason other than a
* differing data chunk being detected. Don't rewind at
* all; just end the block at the current position.
*/
u32 block_length = in_next - in_block_begin;
bool is_first = (in_block_begin == in);
bool is_final = (in_next == in_end);

deflate_near_optimal_merge_stats(c);
deflate_optimize_block(c, in_block_begin, block_length,
cache_ptr, is_first, is_final);
deflate_flush_block(c, &os, in_block_begin,
block_length, NULL, is_final);
cache_ptr = &c->p.n.match_cache[0];
deflate_near_optimal_save_stats(c);
deflate_near_optimal_init_stats(c);
in_block_begin = in_next;
}
} while (in_next != in_end);

return deflate_flush_output(&os);
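
For reference, the compression-ratio effect of this change can be checked with a small harness like the one below, which is not part of this commit: build it against libdeflate from before and after the change and compare the printed sizes. Level 12 is used because the highest compression levels select the near-optimal compressor; error handling is kept minimal for brevity.

#include <stdio.h>
#include <stdlib.h>
#include <libdeflate.h>

int main(int argc, char **argv)
{
	FILE *f;
	long in_nbytes;
	void *in, *out;
	size_t out_nbytes_avail, out_nbytes;
	struct libdeflate_compressor *c;

	if (argc != 2) {
		fprintf(stderr, "Usage: %s FILE\n", argv[0]);
		return 1;
	}
	f = fopen(argv[1], "rb");
	if (f == NULL)
		return 1;
	fseek(f, 0, SEEK_END);
	in_nbytes = ftell(f);
	rewind(f);
	in = malloc(in_nbytes);
	if (in == NULL || fread(in, 1, in_nbytes, f) != (size_t)in_nbytes)
		return 1;
	fclose(f);

	/* Level 12 uses the near-optimal compressor. */
	c = libdeflate_alloc_compressor(12);
	out_nbytes_avail = libdeflate_deflate_compress_bound(c, in_nbytes);
	out = malloc(out_nbytes_avail);
	out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
						 out, out_nbytes_avail);
	printf("%ld => %zu bytes\n", in_nbytes, out_nbytes);

	libdeflate_free_compressor(c);
	free(in);
	free(out);
	return 0;
}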
