forked from skoren/ArrowGrid
-
Notifications
You must be signed in to change notification settings - Fork 0
/
subset.sh
executable file
·137 lines (114 loc) · 4 KB
/
subset.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env bash
######################################################################
# PUBLIC DOMAIN NOTICE
#
# This software is "United States Government Work" under the terms of the United
# States Copyright Act. It was written as part of the authors' official duties
# for the United States Government and thus cannot be copyrighted. This software
# is freely available to the public for use without a copyright
# notice. Restrictions cannot be placed on its present or future use.
#
# Although all reasonable efforts have been taken to ensure the accuracy and
# reliability of the software and associated data, the National Human Genome
# Research Institute (NHGRI), National Institutes of Health (NIH) and the
# U.S. Government do not and cannot warrant the performance or results that may
# be obtained by using this software or data. NHGRI, NIH and the U.S. Government
# disclaim all warranties as to performance, merchantability or fitness for any
# particular purpose.
#
# Please cite the authors in any work or product based on this material.
######################################################################
SCRIPT_PATH=`cat scripts`
if [ -e `pwd`/CONFIG ]; then
CONFIG=`pwd`/CONFIG
else
CONFIG=${SCRIPT_PATH}/CONFIG
fi
. "${CONFIG}"
if [ ! -z "${LD_ADDITION}" ]; then
export LD_LIBRARY_PATH=$LD_ADDITION:$LD_LIBRARY_PATH
fi
wrk=`pwd`
syst=`uname -s`
arch=`uname -m`
name=`uname -n`
if [ "$arch" = "x86_64" ] ; then
arch="amd64"
fi
if [ $GRID == "SGE" ]; then
baseid=$SGE_TASK_ID
offset=$1
cores=$NSLOTS
elif [ $GRID == "LSF" ]; then
baseid=$LSB_JOBINDEX
offset=$1
#LSB_MCPU_HOSTS=blade18-1-2.gsc.wustl.edu 8
cores=$(echo ${LSB_MCPU_HOSTS} | awk '{print $2}')
elif [ $GRID == "SLURM" ]; then
baseid=$SLURM_ARRAY_TASK_ID
offset=$1
cores=$SLURM_CPUS_PER_TASK
fi
if [ x$baseid = x -o x$baseid = xundefined -o x$baseid = x0 ]; then
baseid=$1
offset=0
cores=`grep -c ^processor /proc/cpuinfo`
fi
if [ x$offset = x ]; then
offset=0
fi
jobid=`expr $baseid + $offset`
if test x$jobid = x; then
echo Error: I need SGE_TASK_ID set, or a job index on the command line
exit 1
fi
echo Running job $jobid based on command line options.
line=`cat filterfofn |head -n $jobid |tail -n 1`
prefix=`cat prefix`
reference=`cat asm`
whitelist=`cat whitelist`
if [ -e seq/$prefix.$jobid.filtered.bam ]; then
echo "Already done"
exit
fi
IS_BAM=`echo $line |grep ".bam$" |wc -l |awk '{print $1}'`
if [ $IS_BAM -eq 0 ]; then
# not a bam input convert
echo "Not a bam input, assuming P6-C4 chemistry, converting to BAM"
fofn=`echo $line |awk '{print $1}'`
`find $fofn*.bax.h5 > $prefix.$jobid.fofn`
bax2bam -f $prefix.$jobid.fofn -o $prefix.$jobid
line="$prefix.$jobid.subreads.bam"
fi
echo "Subsetting $prefix using files in $whitelist"
# get list of read names in this file
samtools view $line | awk '{print $1}' > $jobid.readnames
java -cp $SCRIPT_PATH:. SubFile $jobid.readnames $whitelist 1 > $jobid.filterednames
# subset in chunks to avoid making arguments too long
split -d -a 5 -l 5000 $jobid.filterednames $jobid.split.
dataset create $jobid.xml $line
for file in `ls $jobid.split.*`; do
ARGS=`cat $file |awk '{print "qname="$2}' |tr '\n' ' '`
ID=`echo $file |awk -F "." '{print $NF}'`
# now we can subset
dataset filter $jobid.xml $jobid.$ID.filter.xml $ARGS
done
# now we create the subsets in parallel
ls $jobid.split.* |awk -F "." '{print $NF}' |parallel --tmpdir `pwd` -P $cores dataset --log-file $jobid.{1}.log consolidate --noTmp $jobid.{1}.filter.xml $jobid.{1}.filtered.bam $jobid.{1}.final.xml
# and merge
dataset create $jobid.filtered.xml $jobid.*.filtered.bam
dataset consolidate --noTmp $jobid.filtered.xml seq/$prefix.$jobid.filtered.bam seq/$prefix.$jobid.filtered.xml
#clean up
rm -f $jobid.xml
rm -f $jobid.fasta
rm -f $jobid.readnames
rm -f $jobid.filter.xml
rm -f $jobid.final.xml
rm -f $jobid.split*
rm -f $jobid.*.xml
rm -f $jobid.*.filtered.bam*
rm -f $jobid.*.log
if [ $IS_BAM -eq 0 ]; then
# removing converted bam file
rm -f $line
fi