-
Notifications
You must be signed in to change notification settings - Fork 0
/
group_and_classify.py
94 lines (84 loc) · 3.44 KB
/
group_and_classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 5 16:43:43 2013
@author: jl
"""
from Bio import SeqIO
from functions import *
threads = 7 #This number should be the same as the number of cores used in the extractor.py
records = []
records_orig = []
#Gather the .fastq and .fa files generated by the different cores and put them together
i=1
while i<=threads:
infile='output/output_'+str(i)+'.fa'
infile2='output/output_'+str(i)+'.fastq'
for record in SeqIO.parse(infile, "fasta"):
records.append(record)
for record in SeqIO.parse(infile2, "fastq"):
records_orig.append(record)
print i
i+=1
SeqIO.write(records, "output/output_all.fasta", "fasta") #Export all the cut reads to a fasta file
SeqIO.write(records_orig, "output/output_all.fastq", "fastq") #Export all the original filtered reads to a fastq file
#for each read, look at dictionary and redirect it to the proper file
letters= map(chr, range(65, 73)) #=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
#Generate dictionary structure for individuals and locus
records={} #cut reads in fasta
records2={} #original filtered reads in fastq
for letter in letters:
i = 1
while i < 13:
individual=letter+format(i)
records[individual]={}
records2[individual]={}
i+=1
#Read output (cut fasta) and classify it into individual and locus dictionary structure
infile='output/output_all.fasta'
for record in SeqIO.parse(infile, "fasta"):
this_record=record.id.split('_')
if this_record[1] in records[this_record[0]]:
records[this_record[0]][this_record[1]].append(record)
else:
records[this_record[0]][this_record[1]]=[]
records[this_record[0]][this_record[1]].append(record)
#Read output (original filtered fastq) and classify it into individual and locus dictionary structure
infile='output/output_all.fastq'
for record in SeqIO.parse(infile, "fastq"):
this_record=record.id.split('_')
if this_record[1] in records2[this_record[0]]:
records2[this_record[0]][this_record[1]].append(record)
else:
records2[this_record[0]][this_record[1]]=[]
records2[this_record[0]][this_record[1]].append(record)
try:
os.system('mkdir output')
except:
pass
#We will parse the dictionary structure of records, navigating individuals and locus
for indiv in records:
try:
os.system('mkdir output/'+indiv) #Folder for individual
except:
pass
for locus in records[indiv]:
try:
os.system('mkdir output/'+indiv+'/'+locus) #Folder for locus
except:
pass
try:
#Write output and run mira assembly
SeqIO.write(records[indiv][locus], "output/"+indiv+'/'+locus+'/'+indiv+'_'+locus+'_output.fa', "fasta")
SeqIO.write(records2[indiv][locus], "output/"+indiv+'/'+locus+'/'+indiv+'_'+locus+'_output.fastq', "fastq")
if len(records[indiv][locus])>=5:
#mira_assembly(indiv,locus)
#cluster_and_align(indiv,locus,0.95)#individual, locus and clustering percentage
f = open('output/'+indiv+'/'+locus+'/'+indiv+'_'+locus+'_lengths.txt','w')
for record in records[indiv][locus]:
print>>f, record.id+'\t'+record.id.split('_')[3]
f.close()
R_plot(indiv,locus) #Generate R code
else:
print 'Not enough reads here', indiv, locus
except:
print 'No reads here', indiv, locus