You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I've been playing around with some optimizations to remove branching in the counting loop. This one in particular is faster on longer sequences, but slower on shorter sequences because of the added constant-time cost of uncounting the k-mers at the ends of the sequence.
"""
    VectorizedKmers.countkmers!(kmer_vector, sequence; reset=true)

Count the `K`-mers of a 2-bit encoded nucleotide `sequence` into `kmer_vector.values`
and return `kmer_vector`.

A k-mer is encoded as an unsigned integer with two bits per base (oldest base in the
most significant position), and the count for that k-mer lives at index `kmer_int + 1`.

# Keywords
- `reset::Bool=true`: when `true`, `VectorizedKmers.zeros!(kmer_vector)` is called before
  counting; when `false`, counts are accumulated on top of the existing values.

# Implementation notes
Bases are read straight from `sequence.data` as 2-bit fields at bit offsets `0:2:62`,
i.e. 32 bases per word (assumes the words are 64-bit — TODO confirm against the
sequence type's storage).

Two strategies are used, selected by `length(sequence)`:

- shorter than 1000 bases: a single pass with a per-base bounds test;
- otherwise: every base of every touched data word is counted unconditionally, and the
  surplus k-mers at both edges are subtracted afterwards in two short fix-up loops whose
  length does not depend on the sequence length (at most `31 + K - 1` iterations each).
"""
function VectorizedKmers.countkmers!(
    kmer_vector::KmerVector{4,K,<:Real},
    sequence::SeqOrView{<:NucleicAcidAlphabet{2}};
    reset::Bool=true,
) where K
    reset && VectorizedKmers.zeros!(kmer_vector)
    kmer_vector_values = kmer_vector.values

    # Keeps the lowest 2K bits, i.e. the K most recent bases.
    mask = (one(UInt) << 2K) - 1

    # First and last base position of `sequence` inside the underlying data.
    start, stop = if sequence isa LongSubSeq
        (sequence.part.start, sequence.part.stop)
    else
        (1, length(sequence))
    end

    # Data words that contain positions `start` and `stop` (32 bases per word).
    data_start, data_stop = cld(start, 32), cld(stop, 32)
    data_ints = @view sequence.data[data_start:data_stop]

    # Positions of the first and last base covered by `data_ints`.
    first_index = 32 * (data_start - 1) + 1
    last_index = 32 * data_stop

    if length(sequence) < 1000
        # Old method: one pass, with a conditional per base.
        first_count_index = K + start - 1  # first position at which a full k-mer ends
        kmer_int = zero(UInt)
        i = first_index - 1
        @inbounds for data_int in data_ints
            for j in 0:2:62
                i += 1
                i > stop && break
                kmer_int = (kmer_int << 2) & mask | (data_int >> j) & 0b11
                # Adding a Bool counts only k-mers that lie fully inside start:stop.
                kmer_vector_values[kmer_int + 1] += (first_count_index <= i)
            end
        end
    else
        # New method: no conditionals in the main loop. Every position covered by
        # `data_ints` is counted, with `kmer_int` starting from zero.
        kmer_int = zero(UInt)
        for data_int in data_ints
            for j in 0:2:62
                kmer_int = (kmer_int << 2) & mask | (data_int >> j) & 0b11
                @inbounds kmer_vector_values[kmer_int + 1] += 1
            end
        end

        # 2-bit code of the base at absolute position `i`.
        nucleotide(i) =
            (data_ints[cld(i, 32) - data_start + 1] >> (2 * ((i - 1) & 31))) & 0b11

        # Uncount k-mers at stop: those ending at stop+1:last_index.
        # Rolling starts K-1 bases before stop+1 so that the first uncounted k-mer is
        # complete; if that would reach before `first_index`, rolling starts there
        # from zero, which reproduces exactly what the main loop counted.
        kmer_int = zero(UInt)
        for i in max(first_index, stop - K + 2):last_index
            kmer_int = (kmer_int << 2) & mask | nucleotide(i)
            kmer_vector_values[kmer_int + 1] -= (i > stop)
        end

        # Uncount k-mers at start: those ending at first_index:(start + K - 2).
        # This covers k-mers ending before `start` as well as the K-1 partial k-mers
        # that end inside the sequence but begin before `start`.
        # The two fix-up ranges cannot overlap here because this branch only runs for
        # sequences of at least 1000 bases.
        kmer_int = zero(UInt)
        for i in first_index:min(start + K - 2, last_index)
            kmer_int = (kmer_int << 2) & mask | nucleotide(i)
            kmer_vector_values[kmer_int + 1] -= 1
        end
    end

    return kmer_vector
end
Old method
julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int, 4^1))), $short_seq)
BenchmarkTools.Trial:10000 samples with 976 evaluations.
Range (min … max):71.824 ns …132.275 ns ┊ GC (min … max):0.00%…0.00%
Time (median):72.746 ns ┊ GC (median):0.00%
Time (mean ± σ):75.439 ns ±8.824 ns ┊ GC (mean ± σ):0.00%±0.00%
█▆▆▃▁▂▁▁ ▁
█████████▇▇▇▆▆█▇▆▅▆▆▅▄▅▅▅▂▅▄▃▃▄▄▅▄▅▅▄▅▄▅▅▆▆▅▆▆▇▇▇▇▇▇████▇▇▆▆ █
71.8 ns Histogram:log(frequency) by time 111 ns <
Memory estimate:0 bytes, allocs estimate:0.
julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int, 4^1))), $long_seq)
BenchmarkTools.Trial:6929 samples with 1 evaluation.
Range (min … max):628.600 μs …2.590 ms ┊ GC (min … max):0.00%…0.00%
Time (median):635.800 μs ┊ GC (median):0.00%
Time (mean ± σ):713.545 μs ±170.188 μs ┊ GC (mean ± σ):0.00%±0.00%
█▃▄▂▂▃▁ ▁▁ ▁
█████████████████▇▇▆▆▆▇▇▇▇▇▆▇▇▇██▇▇▇▇▇▆▇▇▆▆▇▇▆▇▆▆▆▇▆▆▆▅▆▄▅▅▄▅ █
629 μs Histogram:log(frequency) by time 1.33 ms <
Memory estimate:0 bytes, allocs estimate:0.
New method
julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int, 4^1))), $short_seq)
BenchmarkTools.Trial:10000 samples with 746 evaluations.
Range (min … max):168.499 ns …795.308 ns ┊ GC (min … max):0.00%…0.00%
Time (median):173.257 ns ┊ GC (median):0.00%
Time (mean ± σ):184.903 ns ±29.125 ns ┊ GC (mean ± σ):0.00%±0.00%
▆█▅▇▃▂▃▁▁▁ ▁ ▁▁▁▁▁▁▂▂▂▁▁▂▁▁ ▂
██████████████▇█▇▇█▇▇▇▇█▇▇██████████████████████▇▇▇▆▇▇▄▆▆▅▄▅▅ █
168 ns Histogram:log(frequency) by time 265 ns <
Memory estimate:0 bytes, allocs estimate:0.
julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int, 4^1))), $long_seq)
BenchmarkTools.Trial:7951 samples with 1 evaluation.
Range (min … max):554.600 μs …2.898 ms ┊ GC (min … max):0.00%…0.00%
Time (median):557.400 μs ┊ GC (median):0.00%
Time (mean ± σ):621.315 μs ±161.282 μs ┊ GC (mean ± σ):0.00%±0.00%
█▂▂▁▁▁ ▁
████████████▇▇▇▆▇▇▇▆▆▇▇▆▇▇▇▇▇▆▇▇▇▇██▇▇▇▆▆▇▇▇▆▆▆▆▆▅▆▅▅▆▅▄▆▄▅▄▄ █
555 μs Histogram:log(frequency) by time 1.2 ms <
Memory estimate:0 bytes, allocs estimate:0.
Hybrid method
julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int, 4^1))), $short_seq)
BenchmarkTools.Trial:10000 samples with 972 evaluations.
Range (min … max):76.132 ns …1.399 μs ┊ GC (min … max):0.00%…0.00%
Time (median):85.185 ns ┊ GC (median):0.00%
Time (mean ± σ):93.007 ns ±31.409 ns ┊ GC (mean ± σ):0.00%±0.00%
█▄▅▄▅▆▄▄▃▃▃▂▂▃▃▃▃▄▃▃▃▃▃▃▂▁▁▁▁ ▂
███████████████████████████████▇█▆▆▇▆▆▇▅▆▅▆▅▄▆▅▅▄▄▅▅▄▃▅▁▄▄▄ █
76.1 ns Histogram:log(frequency) by time 188 ns <
Memory estimate:0 bytes, allocs estimate:0.
julia> @benchmark countkmers!($(KmerVector{4,1}(zeros(Int, 4^1))), $long_seq)
BenchmarkTools.Trial:8574 samples with 1 evaluation.
Range (min … max):523.600 μs …4.261 ms ┊ GC (min … max):0.00%…0.00%
Time (median):532.800 μs ┊ GC (median):0.00%
Time (mean ± σ):575.878 μs ±113.757 μs ┊ GC (mean ± σ):0.00%±0.00%
█▇▄▃▂▂▂▁▁▁▁▂▁▁▁ ▁▁▁ ▁
████████████████████████▇█▆▆▇▇▇▇▆▇▇▇▇█▇▇▇▆▇▇▆▇▆▆▆▆▆▄▅▅▅▄▅▅▄▃▅ █
524 μs Histogram:log(frequency) by time 977 μs <
Memory estimate:0 bytes, allocs estimate:0.
The threshold is somewhat arbitrary, but 1000 seems to be a sweet spot. The `break` can be replaced with a boolean expression to make the code more GPU-friendly.
The text was updated successfully, but these errors were encountered:
I've been playing around with some optimizations to remove branching in the counting loop. This one in particular is faster on longer sequences, but slower on shorter sequences because of the added constant-time cost of uncounting the k-mers at the ends of the sequence.
Benchmarks
Setup:
Old method
New method
Hybrid method
The threshold is somewhat arbitrary, but 1000 seems to be a sweet spot. The `break` can be replaced with a boolean expression to make the code more GPU-friendly.
The text was updated successfully, but these errors were encountered: