-
Notifications
You must be signed in to change notification settings - Fork 3
/
longReadValidate
executable file
·112 lines (94 loc) · 2.26 KB
/
longReadValidate
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
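############################################################
# Program: longReadValidate
# Author :
# Purpose: Compare a BEDPE input file against a PacBio file and
#          a Moleculo file with pairToPair, and print the input
#          records with two count columns appended (PacBio-based
#          first, then Moleculo-based; 0 where nothing joined).
############################################################
## BEGIN SCRIPT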
# Print the help text: the invocation line followed by one line per option.
# Outputs: writes the help text to stdout.
usage()
{
  printf '%s\n' \
    "usage: $0 OPTIONS" \
    "OPTIONS:" \
    "-h Show this message" \
    "-i BEDPE input file name" \
    "-s slop (default: 0)" \
    "-m Moleculo file" \
    "-p PacBio file" \
    "-k keep temporary files"
}
# A missing (or empty) first argument means there is nothing to do:
# print the help text and stop, leaving usage's status as the exit status.
case "$1" in
  "")
    usage
    exit
    ;;
esac
# Directory containing this script; the helper programs used further down
# (filter_pb.py, zjoin_empty) are invoked from it.
BIN_DIR=$(dirname -- "$0")

# Defaults for every command-line setting.
PB=       # -p: PacBio file
MO=       # -m: Moleculo file
INPUT=    # -i: BEDPE input file
SLOP=0    # -s: slop value handed to pairToPair
KEEP=0    # -k: set to 1 to keep the temporary files

#######################################
# Parse the command-line options.
# Globals:   PB, MO, INPUT, SLOP, KEEP (written)
# Arguments: the script's argument list ("$@")
# Returns:   0 once all options are consumed. Exits the script with
#            status 1 after printing usage for -h, for an unknown
#            option, or for an option that is missing its argument.
#######################################
parse_options()
{
  # Local OPTIND so the function always starts at the first argument.
  local OPTION OPTARG
  local OPTIND=1

  while getopts "hm:p:i:s:k" OPTION
  do
    case "$OPTION" in
      h)
        usage
        exit 1
        ;;
      m)
        MO=$OPTARG
        ;;
      p)
        PB=$OPTARG
        ;;
      i)
        INPUT=$OPTARG
        ;;
      s)
        SLOP=$OPTARG
        ;;
      k)
        KEEP=1
        ;;
      *)
        # getopts reports bad options / missing arguments as '?'.
        # Exit non-zero so callers can detect the usage error.
        usage
        exit 1
        ;;
    esac
  done
}

# Check options passed in.
parse_options "$@"
# Both long-read files must be existing regular files.
if [[ ! -f "$PB" ]] || [[ ! -f "$MO" ]]
then
  usage
  echo -e "Error: -p and -m are required\n"
  exit 1
fi

# The BEDPE input is read several times and its name is the prefix of the
# temporary files, so it must be an existing regular file as well. Without
# this check an omitted -i leaves the later `head` reading from stdin.
if [[ ! -f "$INPUT" ]]
then
  usage
  echo -e "Error: -i is required\n"
  exit 1
fi
# calculate the number of columns in the input file
# (whitespace-separated fields on its first line)
NCOL=$(head -n 1 "$INPUT" | awk '{ print NF }')

# Temporary files, all named after the input file and the slop value.
pb_tmp="$INPUT.p.slop$SLOP.tmp"
pb_sorted="$pb_tmp.sort"
mo_tmp="$INPUT.m.slop$SLOP.tmp"

#######################################
# Turn a pairToPair result file into per-group counts.
# Keeps the rows whose 18th and 29th fields are equal, drops the first
# 18 columns, sorts numerically on the (new) 7th column and lets groupBy
# count rows per distinct value of columns 1-6.
# Arguments: $1 - pairToPair result file
# Outputs:   groupBy output on stdout
#######################################
count_matches()
{
  awk '$18==$29' "$1" \
    | cut -f 19- \
    | sort -k7,7n \
    | groupBy -g 1,2,3,4,5,6 -c 1 -o count -full
}

# PacBio: pairToPair against the input, with exact duplicate lines removed.
pairToPair -type both -is -slop "$SLOP" -a "$PB" -b "$INPUT" \
  | awk '! array[$0]++' \
  > "$pb_tmp"

# Sort from field 25 onwards and run the result through filter_pb.py;
# the filtered output replaces the unsorted PacBio temp file.
sort -k 25 "$pb_tmp" > "$pb_sorted"
"$BIN_DIR/filter_pb.py" < "$pb_sorted" > "$pb_tmp"

# Moleculo: same pairToPair + de-duplication (no filter_pb.py step).
pairToPair -type both -is -slop "$SLOP" -a "$MO" -b "$INPUT" \
  | awk '! array[$0]++' \
  > "$mo_tmp"

# Join the PacBio counts onto the input on column 7 and keep the input
# columns plus one count column; then do the same with the Moleculo counts.
# After each join a trailing "NA" is replaced by 0.
count_matches "$pb_tmp" \
  | "$BIN_DIR/zjoin_empty" -r -a "$INPUT" -b stdin -1 7 -2 7 \
  | cut -f "-${NCOL},$((NCOL + NCOL + 1))" \
  | awk '{ if ($NF=="NA") { $NF=0 } print $0 }' OFS="\t" \
  | "$BIN_DIR/zjoin_empty" -r -1 7 -2 7 -a stdin -b <(count_matches "$mo_tmp") \
  | cut -f "-$((NCOL + 1)),$((NCOL + NCOL + 2))" \
  | awk '{ if ($NF=="NA") { $NF=0 } print $0 }' OFS="\t"
# The result goes to stdout; the redirect below is disabled.
# > $INPUT.slop$SLOP.val
#######################################
# Delete the three temporary files unless -k was given.
# Globals:   KEEP, INPUT, SLOP (read)
# Returns:   0 when the files are kept, otherwise the status of rm
#######################################
remove_temp_files()
{
  if [[ "$KEEP" -eq 0 ]]
  then
    # Quoted so paths with spaces or glob characters are removed intact;
    # "--" keeps a name starting with "-" from being read as an option.
    rm -- \
      "$INPUT.p.slop$SLOP.tmp" \
      "$INPUT.m.slop$SLOP.tmp" \
      "$INPUT.p.slop$SLOP.tmp.sort"
  fi
}
remove_temp_files