Faster K-mer counting on longer sequences #41

AntonOresten · 2024-09-11T10:22:30Z

I've been playing around with some optimizations to remove branching in the counting loop, and this one in particular is faster on longer sequences, but not shorter sequences due to an added constant time for uncounting the k-mers ends of the sequence.

function VectorizedKmers.countkmers!(
    kmer_vector::KmerVector{4,K,<:Real},
    sequence::SeqOrView{<:NucleicAcidAlphabet{2}};
    reset::Bool = true,
) where K
    reset && VectorizedKmers.zeros!(kmer_vector)
    kmer_vector_values = kmer_vector.values
    mask = one(UInt) << 2K - 1
    start, stop = sequence isa LongSubSeq ? (sequence.part.start, sequence.part.stop) : (1, length(sequence))
    data_start, data_stop = cld(start, 32), cld(stop, 32)
    data_ints = @view sequence.data[data_start:data_stop]

    if length(sequence) < 1000
        # old method (with conditionals)
        first_count_index = K + start - 1
        kmer_int = zero(UInt)
        i = 32 * (data_start - 1)
        @inbounds for data_int in data_ints
            for j in 0:2:63
                i += 1
                i > stop && break
                kmer_int = (kmer_int << 2) & mask | (data_int >> j) & 0b11
                kmer_vector_values[kmer_int + 1] += first_count_index <= i
            end
        end
    else
        # new method (with uncounting in constant time loops, no conditionals in main loop)
        kmer_int = zero(UInt)
        for data_int in data_ints
            for j in 0:2:62
                kmer_int = (kmer_int << 2) & mask | (data_int >> j) & 0b11
                @inbounds kmer_vector_values[kmer_int + 1] += 1
            end
        end

        # uncount kmers at stop
        i = 32data_stop
        for data_int in @view data_ints[end:-1:max(end-1,begin)]
            for j in 62:-2:0
                i -= 1
                i >= stop - K + 1 || break
                kmer_int = (kmer_int >> 2) | (data_int >> j << (2K - 2)) & mask
                @inbounds kmer_vector_values[kmer_int + 1] -= 1
            end
        end

        # uncount kmers at start
        i = 32(data_start - 1)
        kmer_int = zero(UInt)
        for data_int in @view data_ints[1:min(2,end)]
            for j in 0:2:62
                i += 1
                i < start || break
                kmer_int = (kmer_int << 2) & mask | ((data_int >> j) & 0b11)
                @inbounds kmer_vector_values[kmer_int + 1] -= 1
            end
        end
    end

    return kmer_vector
end

Benchmarks

Setup:

julia> short_seq, long_seq = LongDNA{2}(randdnaseq(100)), LongDNA{2}(randdnaseq(1000000));

Old method

julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int,4^1))), $short_seq)
BenchmarkTools.Trial: 10000 samples with 976 evaluations.
 Range (min … max):  71.824 ns … 132.275 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     72.746 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   75.439 ns ±   8.824 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▆▆▃▁▂▁▁                                                     ▁
  █████████▇▇▇▆▆█▇▆▅▆▆▅▄▅▅▅▂▅▄▃▃▄▄▅▄▅▅▄▅▄▅▅▆▆▅▆▆▇▇▇▇▇▇████▇▇▆▆ █
  71.8 ns       Histogram: log(frequency) by time       111 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int,4^1))), $long_seq)
BenchmarkTools.Trial: 6929 samples with 1 evaluation.
 Range (min … max):  628.600 μs …   2.590 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     635.800 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   713.545 μs ± 170.188 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▃▄▂▂▃▁    ▁▁                                                 ▁
  █████████████████▇▇▆▆▆▇▇▇▇▇▆▇▇▇██▇▇▇▇▇▆▇▇▆▆▇▇▆▇▆▆▆▇▆▆▆▅▆▄▅▅▄▅ █
  629 μs        Histogram: log(frequency) by time       1.33 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

New method

julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int,4^1))), $short_seq)
BenchmarkTools.Trial: 10000 samples with 746 evaluations.
 Range (min … max):  168.499 ns … 795.308 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     173.257 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   184.903 ns ±  29.125 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▆█▅▇▃▂▃▁▁▁  ▁                  ▁▁▁▁▁▁▂▂▂▁▁▂▁▁                 ▂
  ██████████████▇█▇▇█▇▇▇▇█▇▇██████████████████████▇▇▇▆▇▇▄▆▆▅▄▅▅ █
  168 ns        Histogram: log(frequency) by time        265 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int,4^1))), $long_seq)
BenchmarkTools.Trial: 7951 samples with 1 evaluation.
 Range (min … max):  554.600 μs …   2.898 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     557.400 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   621.315 μs ± 161.282 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▂▂▁▁▁                                                        ▁
  ████████████▇▇▇▆▇▇▇▆▆▇▇▆▇▇▇▇▇▆▇▇▇▇██▇▇▇▆▆▇▇▇▆▆▆▆▆▅▆▅▅▆▅▄▆▄▅▄▄ █
  555 μs        Histogram: log(frequency) by time        1.2 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

Hybrid method

julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int,4^1))), $short_seq)
BenchmarkTools.Trial: 10000 samples with 972 evaluations.
 Range (min … max):  76.132 ns …  1.399 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     85.185 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   93.007 ns ± 31.409 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▄▅▄▅▆▄▄▃▃▃▂▂▃▃▃▃▄▃▃▃▃▃▃▂▁▁▁▁                               ▂
  ███████████████████████████████▇█▆▆▇▆▆▇▅▆▅▆▅▄▆▅▅▄▄▅▅▄▃▅▁▄▄▄ █
  76.1 ns      Histogram: log(frequency) by time       188 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int,4^1))), $long_seq)
BenchmarkTools.Trial: 8574 samples with 1 evaluation.
 Range (min … max):  523.600 μs …   4.261 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     532.800 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   575.878 μs ± 113.757 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▇▄▃▂▂▂▁▁▁▁▂▁▁▁   ▁▁▁                                         ▁
  ████████████████████████▇█▆▆▇▇▇▇▆▇▇▇▇█▇▇▇▆▇▇▆▇▆▆▆▆▆▄▅▅▅▄▅▅▄▃▅ █
  524 μs        Histogram: log(frequency) by time        977 μs <

 Memory estimate: 0 bytes, allocs estimate: 0.

The threshold is kinda iffy, but 1000 seems like a sweetspot. The break can be removed in favor of adding some boolean expression instead to make it more GPU-friendly.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Faster K-mer counting on longer sequences #41

Faster K-mer counting on longer sequences #41

AntonOresten commented Sep 11, 2024 •

edited

Loading

Faster K-mer counting on longer sequences #41

Faster K-mer counting on longer sequences #41

Comments

AntonOresten commented Sep 11, 2024 • edited Loading

Benchmarks

AntonOresten commented Sep 11, 2024 •

edited

Loading