From 7ac76e506b416c417ca5f54a5cba63e3f64457ee Mon Sep 17 00:00:00 2001 From: anton083 Date: Mon, 8 Jan 2024 20:38:09 +0100 Subject: [PATCH] Return ints instead of chars --- Project.toml | 8 +++---- src/AssigningSecondaryStructure.jl | 2 +- src/assign.jl | 27 ++++++---------------- src/{io.jl => pdb.jl} | 20 ++++++++--------- src/utils.jl | 36 ------------------------------ test/runtests.jl | 16 ++++++------- 6 files changed, 30 insertions(+), 79 deletions(-) rename src/{io.jl => pdb.jl} (74%) diff --git a/Project.toml b/Project.toml index 8ee9c90..ff0ed7b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AssigningSecondaryStructure" uuid = "8ed43e74-60fb-4e11-99b9-91deed37aef7" -authors = ["Shintaro Minami, Anton Oresten"] -version = "0.2.1" +authors = ["Anton Oresten, Shintaro Minami"] +version = "0.3.0" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -10,8 +10,8 @@ PaddedViews = "5432bcbf-9aad-5242-b902-cca2824c8663" [compat] LinearAlgebra = "1" -PDBTools = "^0.15" -PaddedViews = "^0.5" +PDBTools = "0.15" +PaddedViews = "0.5" julia = "^1.7" [extras] diff --git a/src/AssigningSecondaryStructure.jl b/src/AssigningSecondaryStructure.jl index cd8d7e3..f68a005 100644 --- a/src/AssigningSecondaryStructure.jl +++ b/src/AssigningSecondaryStructure.jl @@ -2,7 +2,7 @@ module AssigningSecondaryStructure include("utils.jl") include("dssp.jl") -include("io.jl") include("assign.jl") +include("pdb.jl") end diff --git a/src/assign.jl b/src/assign.jl index 5273c36..34fadd8 100644 --- a/src/assign.jl +++ b/src/assign.jl @@ -9,34 +9,21 @@ Given a vector of chains, each represented as a 3-dimensional array of size 3x4x - The first dimension corresponds to the x, y, and z coordinates of the atoms. - The second dimension represents the atom type, ordered as N, CA, C, and O. - The third dimension specifies the residue number in the chain. + +Returns a vector of vectors of integers, each of which is the secondary structure assignment +for the corresponding chain and their respective residues. The integers are encoded as follows: +- `1`: coil/loop. Neither helix nor strand. +- `2`: 4-turn helix (α-helix). Minimum length 4 residues. +- `3`: extended strand in parallel and/or anti-parallel β-sheet conformation. Min length 2 residues. """ function assign_secondary_structure(coords_chains::Vector{<:AbstractArray{T, 3}}) where T lengths = size.(coords_chains, 3) coords = cat(coords_chains..., dims=3) num_vector = dssp(coords) - code_vector = [NUM_TO_SS_CODE[num] for num in num_vector] cum_indices = cumsum(lengths) - code_vectors_by_chain = [code_vector[get(cum_indices, n-1, 0)+1:cum_indices[n]] for n in 1:length(lengths)] - - clean_secondary_structure!.(code_vectors_by_chain) + code_vectors_by_chain = [num_vector[get(cum_indices, n-1, 0)+1:cum_indices[n]] for n in 1:length(lengths)] return code_vectors_by_chain end - -""" - assign_secondary_structure(filename) - -Returns a vector of vectors of integers, each of which is the secondary structure assignment -for the corresponding chain and their respective residues. - -The integers are assigned as follows: -- '-': coil/loop. Neither helix nor strand. -- 'H': 4-turn helix (α helix). Minimum length 4 residues. -- 'E': extended strand in parallel and/or anti-parallel β-sheet conformation. Min length 2 residues. -""" -function assign_secondary_structure(filename::String) - chains = load_pdb_backbone_coords(filename) - return assign_secondary_structure(chains) -end \ No newline at end of file diff --git a/src/io.jl b/src/pdb.jl similarity index 74% rename from src/io.jl rename to src/pdb.jl index b85fc5e..daab9bf 100644 --- a/src/io.jl +++ b/src/pdb.jl @@ -1,8 +1,8 @@ -import PDBTools +export load_pdb_chains -export load_pdb_backbone_coords +import PDBTools -function collect_ncaco(atoms::Vector{PDBTools.Atom}) +function collect_residues(atoms::Vector{PDBTools.Atom}) residues = Vector{PDBTools.Atom}[] i = 1 while i <= length(atoms) - 3 # Ensure there are at least four atoms left to process @@ -20,7 +20,7 @@ end function chain_coords(id::AbstractString, atoms::Vector{PDBTools.Atom}) chain_atoms = filter(a -> PDBTools.chain(a) == id, atoms) - residues = collect_ncaco(chain_atoms) + residues = collect_residues(chain_atoms) coords = zeros(Float32, (3, 4, length(residues))) for (i, residue) in enumerate(residues) for (j, atom) in enumerate(residue) @@ -30,15 +30,15 @@ function chain_coords(id::AbstractString, atoms::Vector{PDBTools.Atom}) return coords end -""" - load_pdb_backbone_coords(filename::String) - -Assumes that each residue starts with four atoms: N, CA, C, O. -""" -function load_pdb_backbone_coords(filename::String) +function load_pdb_chains(filename::AbstractString) atoms = PDBTools.readPDB(filename) filter!(a -> a.name in ["N", "CA", "C", "O"], atoms) ids = unique(PDBTools.chain.(atoms)) chains = [chain_coords(id, atoms) for id in ids] return chains +end + +function AssigningSecondaryStructure.assign_secondary_structure(filename::AbstractString) + chains = load_pdb_chains(filename) + return AssigningSecondaryStructure.assign_secondary_structure(chains) end \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index 6041ea1..04ff522 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,41 +1,5 @@ # These functions come from numpy and were used to port the code from python to julia. -const NUM_TO_SS_CODE = Dict( - 1 => '-', - 2 => 'H', - 3 => 'E', -) - -const MIN_HELIX_LENGTH = 4 -const MIN_STRAND_LENGTH = 2 - -function clean_secondary_structure!(ss_vector::Vector{Char}) - n = length(ss_vector) - i = 1 - - while i <= n - current_structure = ss_vector[i] - start = i - - while i <= n && ss_vector[i] == current_structure - i += 1 - end - segment_end = i - 1 - segment_length = segment_end - start + 1 - - for (code, max_len) in [('H', MIN_HELIX_LENGTH), ('E', MIN_STRAND_LENGTH)] - if current_structure == code && segment_length < max_len - for j in start:segment_end - ss_vector[j] = '-' - end - end - end - end - - return ss_vector -end - - function _pad(x::T, arr::AbstractArray{T, N}, paddings::Vararg{Tuple{Int, Int}, N}) where {T, N} @assert ndims(arr) == length(paddings) new_size = Int[] diff --git a/test/runtests.jl b/test/runtests.jl index 3e27a58..4465045 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,20 +1,20 @@ using AssigningSecondaryStructure using Test -ss_composition(ss::Vector{Char}) = [count(==(code), ss) for code in ['-', 'H', 'E']] +ss_composition(ss::Vector{Int}) = [count(==(code), ss) for code in 1:3] @testset "AssigningSecondaryStructure.jl" begin @testset "io.jl" begin @testset "1ASS" begin - backbone = load_pdb_backbone_coords("data/1ASS.pdb") + backbone = load_pdb_chains("data/1ASS.pdb") @test length(backbone) == 1 @test size.(backbone, 3) == [152] end @testset "1ZAK" begin - backbone = load_pdb_backbone_coords("data/1ZAK.pdb") + backbone = load_pdb_chains("data/1ZAK.pdb") @test length(backbone) == 2 @test size.(backbone, 3) == [220, 220] end @@ -24,20 +24,20 @@ ss_composition(ss::Vector{Char}) = [count(==(code), ss) for code in ['-', 'H', ' @testset "DSSP" begin @testset "dssp" begin - coords = load_pdb_backbone_coords("data/1ASS.pdb")[1] + coords = load_pdb_chains("data/1ASS.pdb")[1] @test AssigningSecondaryStructure.dssp(coords[:, :, 35:39]) == [1, 1, 1, 1, 1] # minimum helix length is 4 @test AssigningSecondaryStructure.dssp(coords[:, :, 35:40]) == [1, 2, 2, 2, 2, 1] # ends don't count towards helix length :shrug: @test AssigningSecondaryStructure.dssp(coords[:, :, 35:41]) == [1, 2, 2, 2, 2, 2, 1] end @testset "1ASS" begin - ss = assign_secondary_structure("data/1ASS.pdb") + ss = assign_secondary_structure(load_pdb_chains("data/1ASS.pdb")) @test length(ss) == 1 - @test ss_composition.(ss) == [[63, 53, 36]] + @test ss_composition.(ss) == [[60, 53, 39]] end - + @testset "1ZAK" begin - ss = assign_secondary_structure("data/1ZAK.pdb") + ss = assign_secondary_structure(load_pdb_chains("data/1ZAK.pdb")) @test length(ss) == 2 @test ss_composition.(ss) == [[72, 116, 32], [72, 116, 32]] end