-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_gbk.sh
executable file
·154 lines (135 loc) · 3.37 KB
/
run_gbk.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/bin/bash
# Copyright 2023 Matthieu Barba
# This program is free software under AGPLv3 license
# License terms are in the LICENSE file, or at <http://www.gnu.org/licenses/>.
# Default settings; each one can be overridden by the matching option.
# Directory holding the input files (-i); empty means "not provided".
DIR=''
# Base name used for the generated files (-n).
NAME='syntebase'
# Blocks tolerance (-p).
BLOCKS_TOLERANCE='2'
# Optional settings
descrip=''  # database description (-N)
author=''   # database authors (-A)
JOBS='1'    # number of parallel jobs (-j)
#######################################
# Print the command-line help and terminate the script.
# Arguments:
#   $1 - optional error message; when given it is printed first, in brackets.
# Outputs:
#   Plain help goes to stdout; an error message and its help go to stderr.
# Returns:
#   Never returns: exits 0 for plain help, 1 when an error message was given.
#######################################
function usage {
  local status=0
  if [ -n "${1:-}" ]; then
    echo "[ $1 ]" >&2
    status=1
  fi
  local help
  # With -d '' read consumes the whole here-document and then reports
  # end-of-input as a failure, hence the "|| true".
  read -r -d '' help << '_EOF_' || true
Files:
-i <path> : path to the gbk files directory
-n <str> : name of the database
Optional:
-p <num> : blocks tolerance (default: 2)
-N <str> : database description
-A <str> : database authors
-j <num> : number of threads (default: 1)
_EOF_
  # Show the name this script was invoked as, not a hard-coded one.
  if [ "$status" -eq 0 ]; then
    printf 'usage: %s -i path/to/gbk/files/ -n dbname\n%s\n' "${0##*/}" "$help"
  else
    printf 'usage: %s -i path/to/gbk/files/ -n dbname\n%s\n' "${0##*/}" "$help" >&2
  fi
  exit "$status"
}
export -f usage
####################################################
# Run with all parameters
# Parse the command line into the settings variables.
# getopts reports unknown options and missing option arguments itself and
# then yields "?", which lands in the catch-all arm and aborts through usage
# instead of silently continuing with a half-parsed command line.
while getopts "i:n:p:N:A:j:h" option; do
  case "$option" in
    i) DIR=$OPTARG ;;
    n) NAME=$OPTARG ;;
    p) BLOCKS_TOLERANCE=$OPTARG ;;
    N) descrip=$OPTARG ;;
    A) author=$OPTARG ;;
    j) JOBS=$OPTARG ;;
    h) usage ;;
    *) usage "Invalid option or missing option argument" ;;
  esac
done
# Options passed with an empty value (e.g. -j "") fall back to their defaults.
author=${author:-}
descrip=${descrip:-}
JOBS=${JOBS:-1}
BLOCKS_TOLERANCE=${BLOCKS_TOLERANCE:-2}
# The input directory is mandatory and must already exist.
if [ -z "$DIR" ]; then usage "Gbk directory needed (-i)"; fi
if [ ! -d "$DIR" ]; then usage "Gbk directory not found: $DIR"; fi
# Resolve to an absolute path. Quoted so that paths containing whitespace
# survive; abort on failure because an empty DIR would turn the working
# directory ($DIR/temp) into /temp.
DIR=$(realpath -- "$DIR") || exit 1
# Write a timestamped message to stderr.
# Arguments:
#   $1 - message text
# Outputs:
#   "[YYYY-MM-DD HH:MM:SS] <message>" on stderr; nothing on stdout.
echo_log() {
  local stamp
  stamp=$(date +'%F %T')
  printf '[%s] %s\n' "$stamp" "$1" >&2
}
# Work inside a scratch directory below the input directory.
# ${DIR:?} aborts instead of silently targeting /temp when DIR is empty.
WORK_DIR=${DIR:?input directory (DIR) is not set}/temp
mkdir -p -- "$WORK_DIR" || exit 1
# Delete any previous files with the same name.
# The cd must succeed first: the rm below uses paths relative to the current
# directory and would otherwise delete files somewhere else.
cd -- "$WORK_DIR" || exit 1
rm -f -- ./*.tmp ./*.faa*
# Link every entry of DIR whose name contains a dot into the work directory.
# -f replaces links left behind by a previous run instead of failing on them.
ln -sf -- "$DIR"/*.* "$WORK_DIR"/
# Check usable files
# Count every candidate exactly once: a name matching two patterns (for
# instance genome.gbk.txt) must not satisfy the "at least two files" rule on
# its own. Matching on the names directly also avoids parsing ls output.
n_files=0
for candidate in *; do
  case "$candidate" in
    *.gb* | *.dat | *.txt | *.embl) n_files=$((n_files + 1)) ;;
  esac
done
if [ "$n_files" -lt 2 ]; then
  echo_log "Not enough files to build a Synteruptor database ($n_files). Verify the extension used"
  exit 1
fi
# Remove special characters from file names
# NOTE(review): the s/// arguments assume the Perl-expression flavour of
# `rename` (not the util-linux "rename from to files" one) - confirm on the
# target system.
# First pass: every space in a dotted file name becomes an underscore.
rename 's/ /_/g' *.*
# Second pass: drop every character that is not an ASCII letter, a digit,
# "_", "-" or ".".
rename 's/[^A-Za-z0-9_\-.]+//g' *.*
# Announce the run: database name and number of input files counted above.
echo_log "Begin Migenis database creation for $NAME with $n_files files"
# extglob has to be enabled before the +( ... ) pattern below is evaluated.
shopt -s extglob
# Names in the current directory matching .gb*, .dat, .txt or .embl, as
# printed by ls (one whitespace-separated list in a plain string).
list=`ls +(*.gb*|*.dat|*.txt|*.embl)`
# Extract one fasta file per input file
echo_log "Extract fasta files"
# Runs gbk_parser.pl once per name in $list, at most $JOBS at a time.
# $list relies on word splitting to become separate arguments.
# {} is the input name and {.} the same name without its last extension
# (GNU parallel replacement strings), so x.gbk is written to x.faa.
parallel --jobs $JOBS gbk_parser.pl -i {} -f {.}.faa ::: $list
# Check that all files have sequences
# Every *.faa file in the current directory must contain at least one line
# with ">". All offending files are reported before the script aborts.
empty_fasta="0"
for fasta in *.faa; do
  # Without any match the pattern stays literal: nothing to check then.
  # (-L keeps dangling links in the check so that they are reported.)
  [ -e "$fasta" ] || [ -L "$fasta" ] || continue
  # grep -q fails both when no line matches and when the file is unreadable.
  if ! grep -q '>' -- "$fasta" 2> /dev/null; then
    echo "Fasta file $fasta has no sequences."
    empty_fasta=1
  fi
done
if [ "$empty_fasta" == "1" ]; then
  echo "Some fasta files had no sequences. Check the input files."
  exit 1
fi
echo_log "Blast all vs all"
# File names are quoted throughout so that a database name containing
# whitespace does not split them (an unquoted redirect target would fail
# with "ambiguous redirect").
BLAST_FILE="${NAME}_blast.txt.tmp"
rm -f -- "$BLAST_FILE"
blaster_local.sh -n "$JOBS" >&2 || exit 1
# Merge every *.blast file of the working directory into one file.
# NOTE(review): presumably these are the files written by blaster_local.sh -
# confirm. Abort when nothing could be merged rather than going on with an
# empty or partial blast file.
cat -- *.blast > "$BLAST_FILE" || exit 1
echo_log "Prepare genes data"
# Output names derive from the database name; they are quoted wherever they
# are used so that a name containing whitespace stays a single argument.
GENES_FILE="${NAME}_genes.txt.tmp"
GENOMES_FILE="${NAME}_genomes.txt.tmp"
BLASTDB="${NAME}.faa"
# Optional "-p <num>" argument for run_migenis.sh. Kept as a plain string
# because it is expanded unquoted on the run_migenis.sh command line.
OPTP=""
if [ -n "$BLOCKS_TOLERANCE" ]; then
  OPTP="-p $BLOCKS_TOLERANCE"
fi
# The quoted pattern list reaches gbk_parser.pl verbatim: the shell does not
# expand it.
gbk_parser.pl -i "*.gb* *.dat *.txt *.embl" -o "$GENES_FILE" -g "$GENOMES_FILE" -f "$BLASTDB" || exit 1
# Run the breaks search
echo_log "Search for breaks and create the database"
DATABASE_FILE="${NAME}.sqlite"
rm -f -- "$DATABASE_FILE"
migenis_log="${NAME}.error.log"
# $OPTP is deliberately left unquoted: it is either empty or "-p <num>" and
# has to split into separate words.
# shellcheck disable=SC2086
run_migenis.sh -i "$BLAST_FILE" -g "$GENES_FILE" -d "$DATABASE_FILE" -G "$GENOMES_FILE" $OPTP -A "$author" -N "$descrip" -E "$migenis_log" >&2
# Save the status right away: $? is overwritten by every later command, and
# $! is the PID of the last background job, not an exit status.
migenis_status=$?
if [ "$migenis_status" -eq 0 ]; then
  # Only announce a result once its copy really succeeded.
  if ! cp -- "$WORK_DIR/$DATABASE_FILE" "$DIR/"; then
    echo_log "Error: could not copy $DATABASE_FILE to $DIR"
    exit 1
  fi
  echo_log "Database created: $DIR/$DATABASE_FILE"
  # The work-dir blast database can be a link to the very file in $DIR (the
  # link step links every dotted name found there); cp refuses to copy a
  # file onto itself, so that case counts as already in place.
  if [ ! "$WORK_DIR/$BLASTDB" -ef "$DIR/$BLASTDB" ]; then
    if ! cp -- "$WORK_DIR/$BLASTDB" "$DIR/"; then
      echo_log "Error: could not copy $BLASTDB to $DIR"
      exit 1
    fi
  fi
  echo_log "Blast database also created: $DIR/$BLASTDB"
else
  cat -- "$migenis_log"
  echo_log "Error: run_migenis.sh failed at some point (error $migenis_status)"
  exit 1
fi