From cdcaa0ccb416d48a0689839cfdf78faaf67bf8a9 Mon Sep 17 00:00:00 2001 From: Sebastian Deorowicz Date: Fri, 11 Oct 2024 15:20:14 +0200 Subject: [PATCH] Feature/faster decompression (#6) --- src/defs.h | 4 +-- src/parser.h | 18 ++++++++++--- src/seq_reservoir.h | 66 ++++++++++++++++++++++++++++++++++----------- 3 files changed, 68 insertions(+), 20 deletions(-) diff --git a/src/defs.h b/src/defs.h index c555f52..845a577 100644 --- a/src/defs.h +++ b/src/defs.h @@ -15,8 +15,8 @@ #include #include "params.h" -const std::string LZ_ANI_VER = "lz-ani 1.2.0"; -const std::string LZ_ANI_DATE = "2024-10-09"; +const std::string LZ_ANI_VER = "lz-ani 1.2.1"; +const std::string LZ_ANI_DATE = "2024-10-11"; const std::string LZ_ANI_AUTHORS = "Sebastian Deorowicz, Adam Gudys"; const std::string LZ_ANI_INFO = LZ_ANI_VER + " (" + LZ_ANI_DATE + ") by " + LZ_ANI_AUTHORS; diff --git a/src/parser.h b/src/parser.h index 4fea926..0e4f08f 100644 --- a/src/parser.h +++ b/src/parser.h @@ -30,6 +30,8 @@ class CParser CParams params; + seq_t seq_working; + seq_t seq_ref; seq_t seq_data; uint32_t n_ref_seqs = 0; @@ -59,21 +61,31 @@ class CParser void append(seq_t& seq, const seq_view& sv, const uint8_t allowed_N, const uint8_t forbidden_N) { - for (uint32_t i = 0; i < sv.size(); ++i) +/* for (uint32_t i = 0; i < sv.size(); ++i) { auto c = sv[i]; if (c == forbidden_N) seq.emplace_back(allowed_N); else seq.emplace_back(c); - } + }*/ + + auto prev_size = seq.size(); + seq.resize(prev_size + sv.size()); + sv.unpack(seq.data() + prev_size); + replace(seq.begin() + prev_size, seq.end(), forbidden_N, allowed_N); } void append_rc(seq_t& seq, const seq_view& sv, const uint8_t allowed_N, const uint8_t forbidden_N) { + seq.reserve(seq.size() + sv.size()); + + seq_working.resize(sv.size()); + sv.unpack(seq_working.data()); + for (uint32_t i = 0; i < sv.size(); ++i) { - auto c = sv[sv.size() - 1 - i]; + auto c = seq_working[seq_working.size() - 1 - i]; if (c == forbidden_N) seq.emplace_back(allowed_N); else if (c < code_N) diff --git a/src/seq_reservoir.h b/src/seq_reservoir.h index 2f88c01..c77fe76 100644 --- a/src/seq_reservoir.h +++ b/src/seq_reservoir.h @@ -33,6 +33,25 @@ class seq_view const uint32_t len; const internal_packing_t internal_packing; + static inline uint8_t triples[256][4]; + + struct _si + { + _si() + { + for(uint8_t i = 0; i < 6; ++i) + for(uint8_t j = 0; j < 6; ++j) + for (uint8_t k = 0; k < 6; ++k) + { + auto idx = 36 * i + 6 * j + k; + triples[idx][0] = i; + triples[idx][1] = j; + triples[idx][2] = k; + triples[idx][3] = 0; + } + } + } static inline _init; + public: seq_view(const uint8_t *data = 0, const uint32_t len = 0, internal_packing_t internal_packing = internal_packing_t::none) : data(data), @@ -90,21 +109,21 @@ class seq_view if (len & 1) dest[len / 2] = src[len - 1] << 4; - } + }*/ - static void unpack2(uint8_t* dest, uint8_t* src, uint32_t len) + void unpack2(uint8_t* dest) const { for (uint32_t i = 0; i < len / 2; ++i) { - dest[2 * i] = src[i] >> 4; - dest[2 * i + 1] = src[i] & 0xf; + dest[2 * i] = data[i] >> 4; + dest[2 * i + 1] = data[i] & 0xf; } if (len & 1) - dest[len - 1] = src[len / 2] >> 4; + dest[len - 1] = data[len / 2] >> 4; } - static void pack3(uint8_t* dest, uint8_t* src, uint32_t len) +/* static void pack3(uint8_t* dest, uint8_t* src, uint32_t len) { for (uint32_t i = 0; i < len / 3; ++i) dest[i] = 36 * src[3 * i] + 6 * src[3 * i + 1] + src[3 * i + 2]; @@ -121,32 +140,49 @@ class seq_view // Nothing break; } - } + }*/ - static void unpack3(uint8_t * dest, uint8_t * src, uint32_t len) + void unpack3(uint8_t *dest) const { uint32_t len_div_3 = len / 3; for (uint32_t i = 0; i < len_div_3; ++i) { - dest[3 * i] = src[i] / 36; - dest[3 * i + 1] = src[i] / 6 - 6 * dest[3 * i]; - dest[3 * i + 2] = src[i] - 36 * dest[3 * i] - 6 * dest[3 * i + 1]; + dest[3 * i] = triples[data[i]][0]; + dest[3 * i + 1] = triples[data[i]][1]; + dest[3 * i + 2] = triples[data[i]][2]; } switch (len % 3) { case 2: - dest[len - 2] = src[len_div_3] / 6; - dest[len - 1] = src[len_div_3] - 6 * dest[len - 2]; + dest[len - 2] = triples[data[len_div_3]][0]; + dest[len - 1] = triples[data[len_div_3]][1]; break; case 1: - dest[len - 1] = src[len_div_3]; + dest[len - 1] = triples[data[len_div_3]][0]; + break; case 0: // Nothing break; } - }*/ + } + + void unpack(uint8_t* dest) const + { + switch (internal_packing) + { + case internal_packing_t::none: + copy_n(data, len, dest); + break; + case internal_packing_t::two_in_byte: + unpack2(dest); + break; + case internal_packing_t::three_in_byte: + unpack3(dest); + break; + } + } }; class CSeqReservoir