-
Notifications
You must be signed in to change notification settings - Fork 0
/
aa_comp_plotter.py
126 lines (105 loc) · 3.24 KB
/
aa_comp_plotter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 21 18:18:30 2021
This script takes an excel (xlsx) file as an input and generates a plot of amino
acid usage frequency in IDPs. The dataset was obtained from the Disprot human
IDP database. Prior to importing here, the excel file was sorted such that the
longest IDR sequence was first (but this was mostly because I'm not good at pandas).
NB: any entries that contained sequences of less than 30 residues were omitted
from this analysis (per the convention for what length defines an IDP/IDR).
The output text file from this script ("disprot_AAs.txt") and the folded protein
output data ("pdb_AAs.txt") are used by the script "idp_AA_enrinchment.py"
to calculate the fold enrichment of AAs in IDP sequences compared to folded
protein sequences.
@author: emeryusher
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# excel sheet was pre-sorted such that the entry with the longest IDR was first
# NB: you can skip this step and sort via pandas instead if you are so inclined
disprot_df = pd.read_excel('disprot_Hsapiens_v2.xlsx', usecols = ['acc', 'disprot_id', 'region_sequence'])
# get the unique uniprot IDs from the list; this can be used later
acc_nos = np.array(disprot_df['acc'].unique())
# remove the lines that are duplicates, but preserve the first instance (which contains the longest IDR sequence)
clean_df = disprot_df.drop_duplicates(subset=['acc'])
# make variables for each amino acid that we will iteratively add to while counting
# the appearnce of each residue over all of the entries
A = 0
C = 0
D = 0
E = 0
F = 0
G = 0
H = 0
I = 0
K = 0
L = 0
M = 0
N = 0
P = 0
Q = 0
R = 0
S = 0
T = 0
V = 0
W = 0
Y = 0
aa_strings = clean_df['region_sequence'].unique()
aa_lab = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
for i in range(0, len(aa_strings)):
for x in aa_strings[i]:
if x == 'A':
A = A + 1
elif x == 'C':
C = C + 1
elif x == 'D':
D = D + 1
elif x == 'E':
E = E + 1
elif x == 'F':
F = F + 1
elif x == 'G':
G = G + 1
elif x == 'H':
H = H + 1
elif x == 'I':
I = I + 1
elif x == 'K':
K = K + 1
elif x == 'L':
L = L + 1
elif x == 'M':
M = M + 1
elif x == 'N':
N = N + 1
elif x == 'P':
P = P + 1
elif x == 'Q':
Q = Q + 1
elif x == 'R':
R = R + 1
elif x == 'S':
S = S + 1
elif x == 'T':
T = T + 1
elif x == 'V':
V = V + 1
elif x == 'W':
W = W + 1
elif x == 'Y':
Y = Y + 1
aa = np.array([A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y])
total = sum(aa)
freq = aa / total
# save amino acid counts in text file
np.savetxt('disprot_AAs.txt', aa, fmt = '%s')
## plotting stuff ##
ypos = np.arange(len(aa_lab))
plt.bar(ypos, freq, color = 'grey', edgecolor = 'white', lw = 1)
plt.xticks(ypos, aa_lab)
plt.xlabel('amino acid')
plt.ylabel('frequency')
#plt.savefig('disprot_AAs.ps', format = 'ps', dpi = 300)
plt.show()