Conversation
Owner
Author
|
I found that pre-hash hurts the compression ratio. The following patch can improve the compression ratio for LZ4 HC:
Details
diff --git a/src/block/compress_hc/hash_chain.rs b/src/block/compress_hc/hash_chain.rs
index a84230f..865312c 100644
--- a/src/block/compress_hc/hash_chain.rs
+++ b/src/block/compress_hc/hash_chain.rs
@@ -487,19 +487,6 @@ impl HashTableHCU32 {
self.dictionary[hash] as usize
}
- /// Set dictionary slot at hash index.
- #[inline]
- fn set_dictionary_at(&mut self, hash: usize, pos: usize) {
- self.dictionary[hash] = pos as u32;
- }
-
- /// Set chain value at position
- #[inline]
- fn set_chain(&mut self, pos: usize, delta: u16) {
- let chain_index = pos & self.chain_mask();
- self.chain_table[chain_index] = delta;
- }
-
/// Insert hashes for all positions up to the given local offset.
/// Positions stored in the hash table are absolute (`local_pos + stream_offset`).
#[inline]
@@ -739,39 +726,6 @@ pub(super) fn find_longer_hash_chain_match(
}
}
-/// Update the hash chain for the first match found at `cur`.
-///
-/// This mirrors the pre-hash step from the LZ4 HC reference: positions inside the
-/// first accepted match are inserted eagerly so later searches can skip ahead.
-fn prehash_first_match(
- hash_table: &mut HashTableHCU32,
- input: &[u8],
- cur_absolute: usize,
- stream_offset: usize,
- first_match_length: usize,
- delta: usize,
-) {
- let mut hash_pos = cur_absolute;
- let end_pos = cur_absolute + first_match_length - 3;
-
- while hash_pos < end_pos - delta {
- hash_table.set_chain(hash_pos, delta as u16);
- hash_pos += 1;
- }
-
- loop {
- hash_table.set_chain(hash_pos, delta as u16);
- let local_hash_pos = hash_pos - stream_offset;
- hash_table.set_dictionary_at(get_hash_at(input, local_hash_pos), hash_pos);
- hash_pos += 1;
- if hash_pos >= end_pos {
- break;
- }
- }
-
- hash_table.next_to_update = end_pos;
-}
-
/// Insert `cur` into the hash/chain tables, then search the chain for the
/// longest match starting at `cur`.
///
@@ -795,8 +749,6 @@ fn find_best_hash_chain_match(
match_length: 0,
candidate: 0,
};
- let mut first_match_delta = 0usize;
- let mut first_match_length = 0usize;
let cur_absolute = cur + stream_offset;
let ext_dict_stream_offset = stream_offset - ext_dict.len();
@@ -805,7 +757,7 @@ fn find_best_hash_chain_match(
let mut candidate = hash_table.get_dictionary_at(get_hash_at(input, cur));
- for attempt in 0..hash_table.max_attempts {
+ for _ in 0..hash_table.max_attempts {
if !hash_table.in_range(candidate, cur_absolute) {
break;
}
@@ -836,28 +788,12 @@ fn find_best_hash_chain_match(
best_match.match_length = match_length as u32;
}
- if attempt == 0 && match_length > 0 {
- first_match_length = match_length;
- first_match_delta = cur_absolute - candidate;
- }
-
let Some(next_candidate) = hash_table.advance(candidate, cur_absolute) else {
break;
};
candidate = next_candidate;
}
- if first_match_length != 0 {
- prehash_first_match(
- hash_table,
- input,
- cur_absolute,
- stream_offset,
- first_match_length,
- first_match_delta,
- );
- }
-
if best_match.match_length == 0 {
None
} else {
@@ -1165,13 +1101,14 @@ pub(super) fn compress_hash_chain_internal(
// Do not extend matches into the last `LAST_LITERALS` bytes (they are literals).
let match_limit = input_end - LAST_LITERALS;
- let mut cur = input_pos + 1;
+ // Match C's LZ4HC main loop: start at block start and scan through `mflimit` inclusive.
+ let mut cur = input_pos;
let mut literal_start = input_pos;
let mut previous_match;
let mut current_match;
let mut next_match;
- while cur < end_pos_check {
+ while cur <= end_pos_check {
let Some(found_match) = find_best_hash_chain_match(
hash_table,
input,
diff --git a/src/block/compress_hc/tests.rs b/src/block/compress_hc/tests.rs
index 07f8a06..9bd9fc6 100644
--- a/src/block/compress_hc/tests.rs
+++ b/src/block/compress_hc/tests.rs
@@ -257,8 +257,8 @@ fn test_compressed_sizes_exact() {
// (level, html_like, json_like, code_like)
let expected: &[(u8, usize, usize, usize)] = &[
(1, 16_350, 16_183, 6_246),
- (4, 15_620, 16_198, 6_071),
- (9, 15_509, 15_698, 5_985),
+ (4, 15_642, 16_363, 6_081),
+ (9, 15_111, 15_482, 5_984),
(10, 15_153, 15_102, 5_990),
(12, 15_100, 15_083, 5_979),
]; |
- cursor_pos → cur, literal_anchor_pos → literal_start
- candidate_absolute_position → candidate, absolute_byte_offset → cur_absolute
- reference_local_position → candidate_local
- external_dictionary → ext_dict, external_dictionary_stream_offset → ext_dict_stream_offset
- match_extension_end_pos → match_limit, max_main_cursor_pos → end_pos_check
- MIN_MATCH → MINMATCH (use shared constant from mod.rs)
- MIN_BYTES_FROM_CURSOR_TO_BLOCK_END → MFLIMIT (use shared constant)
- Shorten verbose names in opt parser and mid compressor
- Ignore 10MB tests for faster iteration
The same ext_dict candidate matching logic (boundary-crossing reads, min-match check, forward count) was duplicated across 3 search methods in HashTableHCU32. Extract into a shared helper function.
- Extract in_range() and advance() helpers to replace repeated chain-validity checks - Use early continue/break to reduce indentation depth - Shorten local variable names in insert_and_find_wider_match
…loops
Replace the hard-to-follow break true/break false pattern in the
match0/match1/match2/match3 lazy evaluation with labeled loops ('lazy
and 'resolve), making control flow explicit and adding comments
explaining each branch.
…ctions Move add_hash4/add_hash8 to methods on HashTableMid and resolve_candidate to a standalone resolve_mid_candidate function, eliminating nested fn definitions and reducing parameter passing.
Remove test_lz4mid_debug (println-based debug test). Make Match, HashTableHCU32, and HashTableMid private since they're only used within compress_hc.rs. Restrict HcLevelParams fields to pub(crate).
Align compress_hc.rs naming with compress.rs conventions where the earlier match position is consistently called 'candidate'.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Based on the PR from @yujincheng08 #209 with some changes on top
closes #21
closes #165