diff --git a/Project.toml b/Project.toml index 2110d3f..29f0549 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Backboner" uuid = "9ac9c2a2-1cfe-46d3-b3fd-6fa470ea56a7" authors = ["Anton Oresten "] -version = "0.9.5" +version = "0.9.6" [deps] BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" diff --git a/src/protein/atom.jl b/src/protein/atom.jl new file mode 100644 index 0000000..274106b --- /dev/null +++ b/src/protein/atom.jl @@ -0,0 +1,16 @@ +export Atom + +struct Atom + name::String + coords::AbstractVector{<:Real} + + function Atom(name::AbstractString, coords::AbstractVector{<:Real}) + length(coords) == 3 || throw(ArgumentError("coords must be a 3-vector")) + return new(strip(name), coords) + end +end + +Atom(atom::BioStructures.Atom) = Atom(atom.name, atom.coords) + +Base.summary(atom::Atom) = "Atom $(atom.name) at $(atom.coords)" +Base.show(io::IO, atom::Atom) = print(io, summary(atom)) \ No newline at end of file diff --git a/src/protein/chain.jl b/src/protein/chain.jl index 6e0fb3a..254b007 100644 --- a/src/protein/chain.jl +++ b/src/protein/chain.jl @@ -10,7 +10,7 @@ export phi_angles, psi_angles, omega_angles A `Chain` represents a chain of a protein, and is a vector of `Residue`s, which are instantiated from indexing the chain. ## Fields -- `id::AbstractString`: A string identifier (usually a single letter). +- `id::String`: A string identifier (usually a single letter). - `backbone::Backbone`: A backbone with a length divisible by 3, to ensure 3 atoms per residue (N, Ca, C). - `modelnum::Int`: The model number of the chain. - `resnums::Vector{Int}`: storing the residue numbers. @@ -22,8 +22,9 @@ struct Chain <: AbstractVector{Residue} backbone::Backbone modelnum::Int resnums::Vector{Int} - aavector::Vector{Char} # TODO: switch to BioStructures.LongAA (from BioSequences) + aavector::Vector{Char} ssvector::Vector{Char} + residue_atoms_dict::Dict{Int, Vector{Atom}} function Chain( backbone::Backbone; @@ -32,6 +33,7 @@ struct Chain <: AbstractVector{Residue} resnums::Vector{Int} = collect(1:length(backbone) ÷ 3), aavector::Vector{Char} = fill('G', length(backbone) ÷ 3), ssvector::Union{Vector{Char}, Vector{<:Integer}} = fill(' ', length(backbone) ÷ 3), + residue_atoms_dict::Dict{Int, Vector{Atom}} = Dict{Int, Vector{Atom}}(i => Atom[] for i in resnums), ) L, r = divrem(length(backbone), 3) iszero(r) || throw(ArgumentError("backbone must have a length divisible by 3")) @@ -39,7 +41,9 @@ struct Chain <: AbstractVector{Residue} length(aavector) == L || throw(ArgumentError("length of aavector must be equal to length of backbone divided by 3")) length(ssvector) == L || throw(ArgumentError("length of ssvector must be equal to length of backbone divided by 3")) ssvector isa Vector{<:Integer} && (ssvector = get.(('-', 'H', 'E'), ssvector, ' ')) - return new(id, backbone, modelnum, resnums, aavector, ssvector) + chain = new(id, backbone, modelnum, resnums, aavector, ssvector, residue_atoms_dict) + assign_missing_backbone_atoms!(chain) + return chain end Chain(id::AbstractString, backbone::Backbone; kwargs...) = Chain(backbone; id=id, kwargs...) @@ -48,7 +52,7 @@ end @inline Base.:(==)(chain1::Chain, chain2::Chain) = chain1.id == chain2.id && chain1.backbone == chain2.backbone && chain1.ssvector == chain2.ssvector @inline Base.length(chain::Chain) = length(chain.backbone) ÷ 3 @inline Base.size(chain::Chain) = Tuple(length(chain)) -@inline Base.getindex(chain::Chain, i::Integer) = Residue(chain.resnums[i], chain.aavector[i], chain.ssvector[i]) +@inline Base.getindex(chain::Chain, i::Integer) = Residue(chain.resnums[i], chain.residue_atoms_dict[chain.resnums[i]], chain.aavector[i], chain.ssvector[i]) Base.summary(chain::Chain) = "Chain $(chain.id) with $(length(chain)) residue$(length(chain) == 1 ? "" : "s")" Base.show(io::IO, chain::Chain) = print(io, summary(chain)) @@ -60,7 +64,17 @@ has_assigned_ss(ssvector::Vector{Char}) = all(!=(' '), ssvector) has_assigned_ss(chain::Chain) = has_assigned_ss(chain.ssvector) has_assigned_ss(protein::AbstractVector{Chain}) = all(has_assigned_ss, protein) -Backboner.is_knotted(chain::Chain) = is_knotted(chain.backbone[2:3:end]) +Backboner.is_knotted(chain::Chain) = is_knotted(@view(chain.backbone[2:3:end])) + +function assign_missing_backbone_atoms!(chain::Chain) + reshaped_backbone = reshape(chain.backbone.coords, 3, 3, :) + for (residue_backbone_coords, residue) in zip(eachslice(reshaped_backbone, dims=3), chain) + atom_names = map(atom -> atom.name, residue.atoms) + for (name, coords) in Iterators.reverse(zip(BACKBONE_ATOM_NAMES, eachcol(residue_backbone_coords))) + !any(==(name), atom_names) && insert!(residue.atoms, 1, Atom(name, coords)) + end + end +end # oxygen_coords function in src/protein/oxygen.jl # TODO: hydrogen_coords diff --git a/src/protein/oxygen.jl b/src/protein/oxygen.jl index 2913f3c..9559653 100644 --- a/src/protein/oxygen.jl +++ b/src/protein/oxygen.jl @@ -19,17 +19,17 @@ end const IDEALIZED_OXYGEN_VECTOR = [-0.672, -1.034, 0.003] -function estimate_oxygen_position( +function _estimate_oxygen_position( CA::AbstractVector{T}, C::AbstractVector{T}, next_N::AbstractVector{T}, ) where T <: Real R = get_rotation_matrix(CA, C, next_N) - return R' \ IDEALIZED_OXYGEN_VECTOR + C # inverse of R' * (oxygen_pos - C) + return R * IDEALIZED_OXYGEN_VECTOR + C # inverse of R' * (oxygen_pos - C) end # last oxygen is put on the same plane as the last residue triangle # 1.2 angstroms away from the C atom # with a 120 degree angle between the C-Ca and C-O bonds -function get_last_oxygen( +function _estimate_oxygen_position_last( N_pos::V, Ca_pos::V, C_pos::V, ) where V <: AbstractVector{T} where T <: Real angle = 2π/3 @@ -38,29 +38,34 @@ function get_last_oxygen( w = cross(v, Ca_pos - N_pos) u = cross(w, v) O_pos = C_pos + normalize(u)*bond_length*cos(angle - 0.5π) - normalize(v)*bond_length*sin(angle - 0.5π) - return O_pos end -function oxygen_coords(backbone::Backbone) - T = eltype(eltype(backbone)) - L = length(backbone) ÷ 3 - - CAs = eachcol(alphacarbon_coords(backbone)) - Cs = eachcol(carbonyl_coords(backbone)) - next_Ns = eachcol(@view nitrogen_coords(backbone)[:, 2:end]) - oxygen_coords = zeros(T, 3, L) - for (i, (CA, C, next_N)) in enumerate(zip(CAs, Cs, next_Ns)) - oxygen_coords[:, i] = estimate_oxygen_position(CA, C, next_N) +function estimate_oxygen_position(chain::Chain, i::Int) + 1 <= i <= length(chain) || throw(ArgumentError("Index $i out of bounds for chain of length $(length(chain))")) + chain[i].atoms + if i == length(chain) + N = chain.backbone[3*(i-1)+1] + CA = chain.backbone[3*(i-1)+2] + C = chain.backbone[3*(i-1)+3] + return _estimate_oxygen_position_last(N, CA, C) + else + CA = chain.backbone[3*(i-1)+2] + C = chain.backbone[3*(i-1)+3] + next_N = chain.backbone[3*(i-1)+4] + return _estimate_oxygen_position(CA, C, next_N) end - oxygen_coords[:, end] = get_last_oxygen(eachcol(backbone[end-2:end])...) +end - return oxygen_coords +function get_oxygen(chain::Chain, i::Int) + residue = chain[i] + k = findfirst(==("O") ∘ (atom -> atom.name), residue.atoms) + isnothing(k) && return estimate_oxygen_position(chain, i) + return residue.atoms[k].coords end """ oxygen_coords(chain::Chain) - oxygen_coords(backbone::Backbone) Add oxygen atoms to the backbone of a protein, turning the coordinate array from size 3x3xL to 3x4xL-1, where L is the length of the backbone. @@ -83,6 +88,14 @@ julia> oxygen_coords(chains["A"]) # returns the estimated position of oxygen ato The `oxygen_coords` function finds the oxygen atoms to the backbone using idealized geometry, and oxygens atom will on average deviate [0.05 Å](https://github.com/MurrellGroup/Backboner.jl/blob/main/test/protein/oxygen.jl) from original PDB positions. Moreover, the last oxygen atom is essentially given a random (although deterministic) orientation, as that information is lost when the backbone is reduced to 3 atoms, and there's no next nitrogen atom to compare with. """ -oxygen_coords(chain::Chain) = oxygen_coords(chain.backbone) +function oxygen_coords(chain::Chain) + return stack(get_oxygen(chain, i) for i in 1:length(chain)) +end + +function assign_missing_oxygens!(chain::Chain) + for (i, residue) in enumerate(chain) + !any(==("O") ∘ (atom -> atom.name), residue.atoms) && insert!(residue.atoms, 4, Atom("O", estimate_oxygen_position(chain, i))) + end +end ncaco_coords(chain::Chain) = cat(reshape(chain.backbone.coords, 3, 3, :), reshape(oxygen_coords(chain), 3, 1, :), dims=2) \ No newline at end of file diff --git a/src/protein/pdb.jl b/src/protein/pdb.jl index ded8f32..56cd3ff 100644 --- a/src/protein/pdb.jl +++ b/src/protein/pdb.jl @@ -17,7 +17,7 @@ backboneselector(res::BioStructures.AbstractResidue) = getresidues(chain::BioStructures.Chain, res_selector) = BioStructures.collectresidues(chain, res_selector) -atomcoords(res, atom_name)::Vector = BioStructures.collectatoms(res, atom -> BioStructures.atomnameselector(atom, [atom_name])) |> only |> BioStructures.coords +atomcoords(res, atom_name) = BioStructures.collectatoms(res, atom -> BioStructures.atomnameselector(atom, [atom_name])) |> only |> BioStructures.coords backbonecoords(res::BioStructures.AbstractResidue)::Matrix = stack(AT -> atomcoords(res, AT), BACKBONE_ATOM_NAMES) @@ -27,13 +27,22 @@ protein_backbone(chain::BioStructures.Chain, res_selector) = Backboner.Backbone( aminoacid_sequence(residues::Vector{<:BioStructures.AbstractResidue}) = map(oneletter_resname, residues) aminoacid_sequence(chain::BioStructures.Chain, res_selector) = aminoacid_sequence(getresidues(chain, res_selector)) +function get_residue_atoms_dict(residues::Vector{<:BioStructures.AbstractResidue}) + atoms = Dict{Int, Vector{Atom}}() + for res in residues + append!(get!(atoms, res.number, Atom[]), Atom.(BioStructures.collectatoms(res, BioStructures.standardselector))) + end + return atoms +end + function Protein.Chain(residues::Vector{<:BioStructures.AbstractResidue}) id = only(unique(BioStructures.chainid.(residues))) backbone = protein_backbone(residues) aavector = aminoacid_sequence(residues) resnums = BioStructures.resnumber.(residues) modelnum = only(unique(BioStructures.modelnumber.(residues))) - return Protein.Chain(id, backbone; resnums, aavector, modelnum) + residue_atoms_dict = get_residue_atoms_dict(residues) + return Protein.Chain(id, backbone; resnums, aavector, modelnum, residue_atoms_dict) end function Protein.Chain(chain::BioStructures.Chain; res_selector=backboneselector) @@ -62,7 +71,6 @@ function readpdb(pdbfile::String) return chains(struc) end - import PDBTools """ @@ -71,30 +79,34 @@ import PDBTools Write a protein (represented as a `Vector{Protein.Chain}`s) to a PDB file. """ function writepdb(protein::Vector{Chain}, filename, header=:auto, footer=:auto) - atoms = PDBTools.Atom[] + all_atoms = PDBTools.Atom[] index = 0 residue_index = 0 for chain in protein - coords = ncaco_coords(chain) - for (residue, res_coords) in zip(chain, eachslice(coords, dims=3)) + residue_backbone_coords = reshape(chain.backbone.coords, 3, 3, :) + assign_missing_oxygens!(chain) + for (i, residue) in enumerate(chain) resname = get(threeletter_aa_names, residue.aa, "XXX") residue_index += 1 - for (name, atomcoords) in zip(["N", "CA", "C", "O"], eachcol(res_coords)) + residue_atoms = [ + [Atom(name, coords) for (name, coords) in zip(BACKBONE_ATOM_NAMES, eachcol(view(residue_backbone_coords, :, :, i)))]; + filter(a -> !(a.name in BACKBONE_ATOM_NAMES), residue.atoms) + ] + for atom in residue_atoms index += 1 - atom = PDBTools.Atom( + push!(all_atoms, PDBTools.Atom( index = index, - name = name, + name = atom.name, resname = resname, chain = chain.id, resnum = residue.num, residue = residue_index, - x = atomcoords[1], - y = atomcoords[2], - z = atomcoords[3], - ) - push!(atoms, atom) + x = atom.coords[1], + y = atom.coords[2], + z = atom.coords[3], + )) end end end - PDBTools.writePDB(atoms, filename, header=header, footer=footer) + PDBTools.writePDB(all_atoms, filename, header=header, footer=footer) end diff --git a/src/protein/protein.jl b/src/protein/protein.jl index 0de8816..a243b99 100644 --- a/src/protein/protein.jl +++ b/src/protein/protein.jl @@ -3,6 +3,7 @@ module Protein using ..Backboner import BioStructures +include("atom.jl") include("residue.jl") include("chain.jl") include("oxygen.jl") diff --git a/src/protein/residue.jl b/src/protein/residue.jl index 7dc5ade..2fdf899 100644 --- a/src/protein/residue.jl +++ b/src/protein/residue.jl @@ -2,19 +2,20 @@ export Residue const threeletter_aa_names = Dict{Char, String}([Char(v) => k for (k, v) in BioStructures.threeletter_to_aa]) -struct Residue +struct Residue #<: AbstractVector{Atom} num::Integer + atoms::Vector{Atom} aa::Char ss::Char - function Residue(num::Integer, aa::Char='G', ss::Char=' ') - return new(num, aa, ss) + function Residue(num::Integer, atoms::Vector{Atom}, aa::Char='G', ss::Char=' ') + return new(num, atoms, aa, ss) end end function Base.show(io::IO, residue::Residue) num = residue.num aa = get(threeletter_aa_names, residue.aa, "XXX") - ss = residue.ss - print(io, "Residue $num $aa $ss") + ss = residue.ss == ' ' ? "" : "($(residue.ss))" + print(io, "Residue $num $aa with $(length(residue.atoms)) atom$(length(residue.atoms) == 1 ? "" : "s") $ss") end \ No newline at end of file diff --git a/test/protein/chain.jl b/test/protein/chain.jl index 17ae617..ec93707 100644 --- a/test/protein/chain.jl +++ b/test/protein/chain.jl @@ -16,7 +16,7 @@ protein = [chain] @test protein["A"] == protein[:A] == protein[1] - @test chain[1] == Residue(1, 'G', ' ') + @test chain[1] == Residue(1, chain.residue_atoms_dict[1], 'G', ' ') @test summary(chain) == "Chain A with 5 residues" diff --git a/test/protein/protein.jl b/test/protein/protein.jl index 42c7262..65f916f 100644 --- a/test/protein/protein.jl +++ b/test/protein/protein.jl @@ -4,5 +4,4 @@ include("residue.jl") include("chain.jl") include("oxygen.jl") include("pdb.jl") -include("rotations.jl") include("idealization.jl") \ No newline at end of file diff --git a/test/protein/residue.jl b/test/protein/residue.jl index 90d0147..c9d2888 100644 --- a/test/protein/residue.jl +++ b/test/protein/residue.jl @@ -1,7 +1,7 @@ @testset "residue.jl" begin - residue = Residue(1, 'A', 'H') + residue = Residue(1, Atom[], 'A', 'H') io = IOBuffer() show(io, residue) - @test String(take!(io)) == "Residue 1 ALA H" + @test String(take!(io)) == "Residue 1 ALA with 0 atoms (H)" end \ No newline at end of file diff --git a/test/protein/rotations.jl b/test/protein/rotations.jl deleted file mode 100644 index f7ceeb7..0000000 --- a/test/protein/rotations.jl +++ /dev/null @@ -1,19 +0,0 @@ -@testset "rotations.jl" begin - - @testset "Backbone constructors" begin - locations = Float64[0; 0; 0;;] - - # 90 degree rotation around x axis - quatrots = [cos(π/2/2); sin(π/2/2); 0; 0;;] - rot_matrices = Float64[1 0 0; 0 0 -1; 0 1 0;;;] - - #@test locs_and_rots_to_backbone(locations, quatrots) ≈ locs_and_rots_to_backbone(locations, rot_matrices) - end - - @testset "locs_and_rots" begin - protein = Protein.readpdb("data/1ASS.pdb") - backbone = protein["A"].backbone - #@test locs_and_rots_to_backbone(backbone_to_locs_and_rots(backbone)...) ≈ backbone - end - -end \ No newline at end of file