Skip to content

Commit

Permalink
Return ints instead of chars
Browse files Browse the repository at this point in the history
  • Loading branch information
AntonOresten committed Jan 8, 2024
1 parent 31c895e commit 7ac76e5
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 79 deletions.
8 changes: 4 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "AssigningSecondaryStructure"
uuid = "8ed43e74-60fb-4e11-99b9-91deed37aef7"
authors = ["Shintaro Minami, Anton Oresten"]
version = "0.2.1"
authors = ["Anton Oresten, Shintaro Minami"]
version = "0.3.0"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Expand All @@ -10,8 +10,8 @@ PaddedViews = "5432bcbf-9aad-5242-b902-cca2824c8663"

[compat]
LinearAlgebra = "1"
PDBTools = "^0.15"
PaddedViews = "^0.5"
PDBTools = "0.15"
PaddedViews = "0.5"
julia = "^1.7"

[extras]
Expand Down
2 changes: 1 addition & 1 deletion src/AssigningSecondaryStructure.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module AssigningSecondaryStructure

include("utils.jl")
include("dssp.jl")
include("io.jl")
include("assign.jl")
include("pdb.jl")

end
27 changes: 7 additions & 20 deletions src/assign.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,21 @@ Given a vector of chains, each represented as a 3-dimensional array of size 3x4x
- The first dimension corresponds to the x, y, and z coordinates of the atoms.
- The second dimension represents the atom type, ordered as N, CA, C, and O.
- The third dimension specifies the residue number in the chain.
Returns a vector of vectors of integers, each of which is the secondary structure assignment
for the corresponding chain and their respective residues. The integers are encoded as follows:
- `1`: coil/loop. Neither helix nor strand.
- `2`: 4-turn helix (α-helix). Minimum length 4 residues.
- `3`: extended strand in parallel and/or anti-parallel β-sheet conformation. Min length 2 residues.
"""
function assign_secondary_structure(coords_chains::Vector{<:AbstractArray{T, 3}}) where T
lengths = size.(coords_chains, 3)

coords = cat(coords_chains..., dims=3)
num_vector = dssp(coords)
code_vector = [NUM_TO_SS_CODE[num] for num in num_vector]

cum_indices = cumsum(lengths)
code_vectors_by_chain = [code_vector[get(cum_indices, n-1, 0)+1:cum_indices[n]] for n in 1:length(lengths)]

clean_secondary_structure!.(code_vectors_by_chain)
code_vectors_by_chain = [num_vector[get(cum_indices, n-1, 0)+1:cum_indices[n]] for n in 1:length(lengths)]

return code_vectors_by_chain
end

"""
assign_secondary_structure(filename)
Returns a vector of vectors of integers, each of which is the secondary structure assignment
for the corresponding chain and their respective residues.
The integers are assigned as follows:
- '-': coil/loop. Neither helix nor strand.
- 'H': 4-turn helix (α helix). Minimum length 4 residues.
- 'E': extended strand in parallel and/or anti-parallel β-sheet conformation. Min length 2 residues.
"""
function assign_secondary_structure(filename::String)
chains = load_pdb_backbone_coords(filename)
return assign_secondary_structure(chains)
end
20 changes: 10 additions & 10 deletions src/io.jl → src/pdb.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import PDBTools
export load_pdb_chains

export load_pdb_backbone_coords
import PDBTools

function collect_ncaco(atoms::Vector{PDBTools.Atom})
function collect_residues(atoms::Vector{PDBTools.Atom})
residues = Vector{PDBTools.Atom}[]
i = 1
while i <= length(atoms) - 3 # Ensure there are at least four atoms left to process
Expand All @@ -20,7 +20,7 @@ end

function chain_coords(id::AbstractString, atoms::Vector{PDBTools.Atom})
chain_atoms = filter(a -> PDBTools.chain(a) == id, atoms)
residues = collect_ncaco(chain_atoms)
residues = collect_residues(chain_atoms)
coords = zeros(Float32, (3, 4, length(residues)))
for (i, residue) in enumerate(residues)
for (j, atom) in enumerate(residue)
Expand All @@ -30,15 +30,15 @@ function chain_coords(id::AbstractString, atoms::Vector{PDBTools.Atom})
return coords
end

"""
load_pdb_backbone_coords(filename::String)
Assumes that each residue starts with four atoms: N, CA, C, O.
"""
function load_pdb_backbone_coords(filename::String)
function load_pdb_chains(filename::AbstractString)
atoms = PDBTools.readPDB(filename)
filter!(a -> a.name in ["N", "CA", "C", "O"], atoms)
ids = unique(PDBTools.chain.(atoms))
chains = [chain_coords(id, atoms) for id in ids]
return chains
end

function AssigningSecondaryStructure.assign_secondary_structure(filename::AbstractString)
chains = load_pdb_chains(filename)
return AssigningSecondaryStructure.assign_secondary_structure(chains)
end
36 changes: 0 additions & 36 deletions src/utils.jl
Original file line number Diff line number Diff line change
@@ -1,41 +1,5 @@
# These functions come from numpy and were used to port the code from python to julia.

const NUM_TO_SS_CODE = Dict(
1 => '-',
2 => 'H',
3 => 'E',
)

const MIN_HELIX_LENGTH = 4
const MIN_STRAND_LENGTH = 2

function clean_secondary_structure!(ss_vector::Vector{Char})
n = length(ss_vector)
i = 1

while i <= n
current_structure = ss_vector[i]
start = i

while i <= n && ss_vector[i] == current_structure
i += 1
end
segment_end = i - 1
segment_length = segment_end - start + 1

for (code, max_len) in [('H', MIN_HELIX_LENGTH), ('E', MIN_STRAND_LENGTH)]
if current_structure == code && segment_length < max_len
for j in start:segment_end
ss_vector[j] = '-'
end
end
end
end

return ss_vector
end


function _pad(x::T, arr::AbstractArray{T, N}, paddings::Vararg{Tuple{Int, Int}, N}) where {T, N}
@assert ndims(arr) == length(paddings)
new_size = Int[]
Expand Down
16 changes: 8 additions & 8 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
using AssigningSecondaryStructure
using Test

ss_composition(ss::Vector{Char}) = [count(==(code), ss) for code in ['-', 'H', 'E']]
ss_composition(ss::Vector{Int}) = [count(==(code), ss) for code in 1:3]

@testset "AssigningSecondaryStructure.jl" begin

@testset "io.jl" begin

@testset "1ASS" begin
backbone = load_pdb_backbone_coords("data/1ASS.pdb")
backbone = load_pdb_chains("data/1ASS.pdb")
@test length(backbone) == 1
@test size.(backbone, 3) == [152]
end

@testset "1ZAK" begin
backbone = load_pdb_backbone_coords("data/1ZAK.pdb")
backbone = load_pdb_chains("data/1ZAK.pdb")
@test length(backbone) == 2
@test size.(backbone, 3) == [220, 220]
end
Expand All @@ -24,20 +24,20 @@ ss_composition(ss::Vector{Char}) = [count(==(code), ss) for code in ['-', 'H', '
@testset "DSSP" begin

@testset "dssp" begin
coords = load_pdb_backbone_coords("data/1ASS.pdb")[1]
coords = load_pdb_chains("data/1ASS.pdb")[1]
@test AssigningSecondaryStructure.dssp(coords[:, :, 35:39]) == [1, 1, 1, 1, 1] # minimum helix length is 4
@test AssigningSecondaryStructure.dssp(coords[:, :, 35:40]) == [1, 2, 2, 2, 2, 1] # ends don't count towards helix length :shrug:
@test AssigningSecondaryStructure.dssp(coords[:, :, 35:41]) == [1, 2, 2, 2, 2, 2, 1]
end

@testset "1ASS" begin
ss = assign_secondary_structure("data/1ASS.pdb")
ss = assign_secondary_structure(load_pdb_chains("data/1ASS.pdb"))
@test length(ss) == 1
@test ss_composition.(ss) == [[63, 53, 36]]
@test ss_composition.(ss) == [[60, 53, 39]]
end

@testset "1ZAK" begin
ss = assign_secondary_structure("data/1ZAK.pdb")
ss = assign_secondary_structure(load_pdb_chains("data/1ZAK.pdb"))
@test length(ss) == 2
@test ss_composition.(ss) == [[72, 116, 32], [72, 116, 32]]
end
Expand Down

0 comments on commit 7ac76e5

Please sign in to comment.