diff --git a/Project.toml b/Project.toml index c982d88..c419713 100644 --- a/Project.toml +++ b/Project.toml @@ -1,29 +1,21 @@ name = "AssigningSecondaryStructure" uuid = "8ed43e74-60fb-4e11-99b9-91deed37aef7" authors = ["Anton Oresten ", "Shintaro Minami"] -version = "0.3.2" +version = "0.3.3" [deps] +Backboner = "9ac9c2a2-1cfe-46d3-b3fd-6fa470ea56a7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -PDBTools = "e29189f1-7114-4dbd-93d0-c5673a921a58" PaddedViews = "5432bcbf-9aad-5242-b902-cca2824c8663" -[weakdeps] -Backboner = "9ac9c2a2-1cfe-46d3-b3fd-6fa470ea56a7" - -[extensions] -BackbonerExt = "Backboner" - [compat] Backboner = "0.9" LinearAlgebra = "1" -PDBTools = "1" PaddedViews = "0.5" julia = "^1.7" [extras] -Backboner = "9ac9c2a2-1cfe-46d3-b3fd-6fa470ea56a7" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "Backboner"] +test = ["Test"] diff --git a/ext/BackbonerExt.jl b/ext/BackbonerExt.jl deleted file mode 100644 index 3580634..0000000 --- a/ext/BackbonerExt.jl +++ /dev/null @@ -1,14 +0,0 @@ -module BackbonerExt - -import AssigningSecondaryStructure as ASS - -import Backboner - -function ASS.assign_secondary_structure(backbones::Vector{<:Backboner.Backbone{T}}) where T - ncaco_arrays = Array{T, 3}[cat(reshape(b.coords, 3, 3, :), reshape(Backboner.Protein.oxygen_coords(b), 3, 1, :), dims=2) for b in backbones] - ASS.assign_secondary_structure(ncaco_arrays) -end - -ASS.assign_secondary_structure(backbone::Backboner.Backbone) = ASS.assign_secondary_structure([backbone])[1] - -end \ No newline at end of file diff --git a/src/AssigningSecondaryStructure.jl b/src/AssigningSecondaryStructure.jl index f68a005..842158f 100644 --- a/src/AssigningSecondaryStructure.jl +++ b/src/AssigningSecondaryStructure.jl @@ -1,8 +1,8 @@ module AssigningSecondaryStructure include("utils.jl") +include("hydrogen.jl") include("dssp.jl") include("assign.jl") -include("pdb.jl") end diff --git a/src/assign.jl b/src/assign.jl index 34fadd8..3395378 100644 --- a/src/assign.jl +++ b/src/assign.jl @@ -27,3 +27,12 @@ function assign_secondary_structure(coords_chains::Vector{<:AbstractArray{T, 3}} return code_vectors_by_chain end + +import Backboner, Backboner.Protein +import Backboner.Protein: ncaco_coords, readpdb + +assign_secondary_structure(chains::Vector{Protein.Chain}) = assign_secondary_structure(ncaco_coords.(chains)) + +assign_secondary_structure(chain::Protein.Chain) = assign_secondary_structure([chain])[1] + +assign_secondary_structure(filename::AbstractString) = assign_secondary_structure(readpdb(filename)) \ No newline at end of file diff --git a/src/dssp.jl b/src/dssp.jl index 4367268..9f6490f 100644 --- a/src/dssp.jl +++ b/src/dssp.jl @@ -14,65 +14,10 @@ function _unfold(a::AbstractArray, window::Int, axis::Int) return _moveaxis(unfolded, axis, ndims(unfolded)) end -function _get_hydrogen_positions(coord::AbstractArray{T, 3}) where T <: Real - vec_cn = coord[2:end, 1, :] .- coord[1:end-1, 3, :] - vec_cn ./= mapslices(norm, vec_cn, dims=2) - vec_can = coord[2:end, 1, :] .- coord[2:end, 2, :] - vec_can ./= mapslices(norm, vec_can, dims=2) - vec_nh = vec_cn .+ vec_can - vec_nh ./= mapslices(norm, vec_nh, dims=2) - return coord[2:end, 1, :] .+ 1.01 .* vec_nh -end - -function _get_hbond_map( - coord::AbstractArray{T, 3}; - cutoff::Float64 = DEFAULT_CUTOFF, - margin::Float64 = DEFAULT_MARGIN, -) where T <: Real - residue_count, atoms_per_residue, _ = size(coord) - @assert atoms_per_residue == 4 - - cpos = coord[1:end-1, 3, :] - opos = coord[1:end-1, 4, :] - npos = coord[2:end, 1, :] - hpos = _get_hydrogen_positions(coord) - - cmap = repeat(reshape(cpos, 1, :, 3), outer=(residue_count-1, 1, 1)) - omap = repeat(reshape(opos, 1, :, 3), outer=(residue_count-1, 1, 1)) - nmap = repeat(reshape(npos, :, 1, 3), outer=(1, residue_count-1, 1)) - hmap = repeat(reshape(hpos, :, 1, 3), outer=(1, residue_count-1, 1)) - - d_on = dropdims(sqrt.(sum(abs2.(omap .- nmap), dims=3)), dims=3) - d_ch = dropdims(sqrt.(sum(abs2.(cmap .- hmap), dims=3)), dims=3) - d_oh = dropdims(sqrt.(sum(abs2.(omap .- hmap), dims=3)), dims=3) - d_cn = dropdims(sqrt.(sum(abs2.(cmap .- nmap), dims=3)), dims=3) - - arr = Q1Q2_F .* (1.0 ./ d_on .+ 1.0 ./ d_ch .- 1.0 ./ d_oh .- 1.0 ./ d_cn) - - e = _pad(0.0, arr, (1,0), (0,1)) - - local_mask = trues(residue_count, residue_count) - for i in 1:residue_count - local_mask[i, i] = false - if i > 1 - local_mask[i, i-1] = false - end - if i > 2 - local_mask[i, i-2] = false - end - end - - hbond_map = clamp.(cutoff - margin .- e, -margin, margin) - hbond_map .= (sin.(hbond_map ./ margin .* π ./ 2) .+ 1.0) ./ 2.0 - hbond_map .*= local_mask - - return hbond_map -end - # not differentiable like the PyDSSP version cause we use bitwise operators function dssp(coords::AbstractArray{T, 3}) where T - @assert size(coords, 1) == 3 - @assert size(coords, 2) == 4 + size(coords, 1) == 3 || throw(DimensionMismatch("Expected 3 coordinates per atom, got $(size(coords, 1))")) + size(coords, 2) == 4 || throw(DimensionMismatch("Expected 4 atoms per residue, got $(size(coords, 2))")) N = size(coords, 3) if N < 6 @@ -81,7 +26,7 @@ function dssp(coords::AbstractArray{T, 3}) where T coords = permutedims(coords, (3, 2, 1)) - hbmap = _get_hbond_map(coords) + hbmap = get_hbond_map(coords) hbmap = permutedims(hbmap, (2, 1)) # Rearrange to "i:C=O, j:N-H" form # Identify turn 3, 4, 5 diff --git a/src/hydrogen.jl b/src/hydrogen.jl new file mode 100644 index 0000000..0c68742 --- /dev/null +++ b/src/hydrogen.jl @@ -0,0 +1,54 @@ +function get_hydrogen_positions(coords::AbstractArray{T, 3}) where T <: Real + vec_cn = coords[2:end, 1, :] .- coords[1:end-1, 3, :] + vec_cn ./= mapslices(norm, vec_cn, dims=2) + vec_can = coords[2:end, 1, :] .- coords[2:end, 2, :] + vec_can ./= mapslices(norm, vec_can, dims=2) + vec_nh = vec_cn .+ vec_can + vec_nh ./= mapslices(norm, vec_nh, dims=2) + return coords[2:end, 1, :] .+ 1.01 .* vec_nh +end + +function get_hbond_map( + coords::AbstractArray{T, 3}; + cutoff::Float64 = DEFAULT_CUTOFF, + margin::Float64 = DEFAULT_MARGIN, +) where T <: Real + residue_count, atoms_per_residue, _ = size(coords) + atoms_per_residue == 4 || throw(DimensionMismatch("Expected 4 atoms per residue, got $atoms_per_residue")) + + cpos = coords[1:end-1, 3, :] + opos = coords[1:end-1, 4, :] + npos = coords[2:end, 1, :] + hpos = get_hydrogen_positions(coords) + + cmap = repeat(reshape(cpos, 1, :, 3), outer=(residue_count-1, 1, 1)) + omap = repeat(reshape(opos, 1, :, 3), outer=(residue_count-1, 1, 1)) + nmap = repeat(reshape(npos, :, 1, 3), outer=(1, residue_count-1, 1)) + hmap = repeat(reshape(hpos, :, 1, 3), outer=(1, residue_count-1, 1)) + + d_on = dropdims(sqrt.(sum(abs2.(omap .- nmap), dims=3)), dims=3) + d_ch = dropdims(sqrt.(sum(abs2.(cmap .- hmap), dims=3)), dims=3) + d_oh = dropdims(sqrt.(sum(abs2.(omap .- hmap), dims=3)), dims=3) + d_cn = dropdims(sqrt.(sum(abs2.(cmap .- nmap), dims=3)), dims=3) + + arr = Q1Q2_F .* (1.0 ./ d_on .+ 1.0 ./ d_ch .- 1.0 ./ d_oh .- 1.0 ./ d_cn) + + e = _pad(0.0, arr, (1,0), (0,1)) + + local_mask = trues(residue_count, residue_count) + for i in 1:residue_count + local_mask[i, i] = false + if i > 1 + local_mask[i, i-1] = false + end + if i > 2 + local_mask[i, i-2] = false + end + end + + hbond_map = clamp.(cutoff - margin .- e, -margin, margin) + hbond_map .= (sin.(hbond_map .* (π / 2 / margin)) .+ 1.0) ./ 2.0 + hbond_map .*= local_mask + + return hbond_map +end \ No newline at end of file diff --git a/src/pdb.jl b/src/pdb.jl deleted file mode 100644 index ea9bcff..0000000 --- a/src/pdb.jl +++ /dev/null @@ -1,41 +0,0 @@ -export load_pdb_chains - -import PDBTools - -function collect_residues(atoms::Vector{PDBTools.Atom}) - residues = Vector{PDBTools.Atom}[] - i = 1 - while i <= length(atoms) - 3 # Ensure there are at least four atoms left to process - # Check if the next four atoms are N, CA, C, O in order - if atoms[i].name == "N" && atoms[i+1].name == "CA" && atoms[i+2].name == "C" && atoms[i+3].name == "O" && - all(==(PDBTools.resnum(atoms[i])), PDBTools.resnum.(atoms[i+1:i+3])) - push!(residues, atoms[i:i+3]) - i += 4 - else - i += 1 - end - end - return residues -end - -function chain_coords(id::AbstractString, atoms::Vector{PDBTools.Atom}) - chain_atoms = filter(a -> PDBTools.chain(a) == id, atoms) - residues = collect_residues(chain_atoms) - coords = zeros(Float32, (3, 4, length(residues))) - for (i, residue) in enumerate(residues) - for (j, atom) in enumerate(residue) - coords[:, j, i] = [atom.x, atom.y, atom.z] - end - end - return coords -end - -function load_pdb_chains(filename::AbstractString) - atoms = PDBTools.readPDB(filename) - filter!(a -> a.name in ["N", "CA", "C", "O"], atoms) - ids = unique(PDBTools.chain.(atoms)) - chains = [chain_coords(id, atoms) for id in ids] - return chains -end - -assign_secondary_structure(filename::AbstractString) = assign_secondary_structure(load_pdb_chains(filename)) \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index 04ff522..2750bae 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,7 +1,7 @@ # These functions come from numpy and were used to port the code from python to julia. function _pad(x::T, arr::AbstractArray{T, N}, paddings::Vararg{Tuple{Int, Int}, N}) where {T, N} - @assert ndims(arr) == length(paddings) + ndims(arr) == length(paddings) || throw(DimensionMismatch("Number of paddings must match the number of dimensions of the array")) new_size = Int[] offsets = UnitRange{Int}[] for (n, (a,b)) in zip(size(arr), paddings) diff --git a/test/BackbonerExt.jl b/test/BackbonerExt.jl deleted file mode 100644 index 77adbb6..0000000 --- a/test/BackbonerExt.jl +++ /dev/null @@ -1,10 +0,0 @@ -using Backboner - -@testset "BackbonerExt.jl" begin - - backbones = [chain.backbone for chain in Protein.readpdb("data/1ZAK.pdb")] - - @test length(assign_secondary_structure(backbones[1])) == 220 - @test length.(assign_secondary_structure(backbones)) == [220, 220] - -end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 1c2e954..5d91d6c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,30 +1,16 @@ using AssigningSecondaryStructure using Test +import AssigningSecondaryStructure as ASS + ss_composition(ss::Vector{Int}) = [count(==(code), ss) for code in 1:3] @testset "AssigningSecondaryStructure.jl" begin - @testset "io.jl" begin - - @testset "1ASS" begin - backbone = load_pdb_chains("data/1ASS.pdb") - @test length(backbone) == 1 - @test size.(backbone, 3) == [152] - end - - @testset "1ZAK" begin - backbone = load_pdb_chains("data/1ZAK.pdb") - @test length(backbone) == 2 - @test size.(backbone, 3) == [220, 220] - end - - end - @testset "DSSP" begin @testset "dssp" begin - coords = load_pdb_chains("data/1ASS.pdb")[1] + coords = ASS.ncaco_coords.(ASS.readpdb("data/1ASS.pdb"))[1] @test AssigningSecondaryStructure.dssp(coords[:, :, 35:39]) == [1, 1, 1, 1, 1] # minimum helix length is 4 @test AssigningSecondaryStructure.dssp(coords[:, :, 35:40]) == [1, 2, 2, 2, 2, 1] # ends don't count towards helix length :shrug: @test AssigningSecondaryStructure.dssp(coords[:, :, 35:41]) == [1, 2, 2, 2, 2, 2, 1] @@ -44,6 +30,4 @@ ss_composition(ss::Vector{Int}) = [count(==(code), ss) for code in 1:3] end - include("BackbonerExt.jl") - end