-
Notifications
You must be signed in to change notification settings - Fork 0
Tony Notes: Miscellaneous
Tony E Lewis edited this page Mar 15, 2017
·
2 revisions
- /export/people/ucbctnl/gemma_stuff/dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 (`$compass_dbXdb_executable`)
- /export/people/ucbctnl/gemma_stuff/dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed (`$compass_executable`)
$compass_params = "-g 0.50001";
Extensions:
- `.faa` - FASTA file
- `.aln` - mafft aligned sequence file
- `.prof` - COMPASS profile file
cp dfx/dfx_pfam1/tools/mafft-6.864-without-extensions/core/mafft /dev/shm/
rsync -av dfx/dfx_pfam1/tools/mafft-6.864-without-extensions/binaries/ /dev/shm/mafft_binaries_dir/
setenv MAFFT_BINARIES /dev/shm/mafft_binaries_dir
Just copy the sequence files with only one sequence:
ls -1v GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | xargs wc -l | awk '$1 <= 2 {print $2}' | grep -Po '\d+\.faa$' | sed 's/.faa//g' | xargs -P 8 -I VAR cp GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/VAR.faa GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln
Use high-quality alignment for S90s with 1 < N <= 200 sequences (i.e. more than 2 and at most 400 lines, assuming two lines per sequence in the .faa files):
ls -1v GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | xargs wc -l | awk '$1 > 2 && $1 <= 400 {print $2}' | grep -Po '\d+\.faa$' | sed 's/.faa//g' | xargs -P 8 -I VAR /bin/tcsh -c "/dev/shm/mafft --amino --anysymbol --localpair --maxiterate 1000 --quiet GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/VAR.faa > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln"
Use low-quality alignment for S90s with N > 200 sequences (i.e. more than 400 lines, assuming two lines per sequence in the .faa files):
ls -1v GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | xargs wc -l | awk ' $1 > 400 {print $2}' | grep -Po '\d+\.faa$' | sed 's/.faa//g' | xargs -P 8 -I VAR /bin/tcsh -c "/dev/shm/mafft --amino --anysymbol --parttree --retree 1 --quiet GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/VAR.faa > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln"
cp dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed /dev/shm/
cp dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 /dev/shm/
ls -1v GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/ | sed 's/.aln//g' | xargs -P 8 -I VAR /dev/shm/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln -j /dev/shm/small.faa -p1 GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.prof -p2 /dev/shm/VAR.small.prof
/dev/shm/compass_db1Xdb2_241 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/13019.prof -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/13593.prof
( dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -p1 /tmp/compass.profile.2.20.100.10.cluster_5.part_1.prof -p2 /tmp/compass.profile.2.20.100.10.cluster_5.part_2.prof > /tmp/compass.profile.2.20.100.10.cluster_5.stdout ) >& /tmp/compass.profile.2.20.100.10.cluster_5.stderr
profile.pl
cd /export/people/ucbctnl/gemma_stuff/
( dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -p1 /tmp/compass.profile.2.20.100.10.cluster_5.part_1.prof -p2 /tmp/compass.profile.2.20.100.10.cluster_5.part_2.prof > /tmp/compass.profile.2.20.100.10.cluster_5.stdout ) >& /tmp/compass.profile.2.20.100.10.cluster_5.stderr
( dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/6.faa -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/6.faa -p1 /tmp/compass.profile.2.20.100.10.cluster_6.part_1.prof -p2 /tmp/compass.profile.2.20.100.10.cluster_6.part_2.prof > /tmp/compass.profile.2.20.100.10.cluster_6.stdout ) >& /tmp/compass.profile.2.20.100.10.cluster_6.stderr
dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -p1 /tmp/test.p1
( dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 -i /tmp/compass.profile.2.20.100.10.cluster_5.part_1.prof -j /tmp/compass.profile.2.20.100.10.cluster_6.part_1.prof > /tmp/compass.profile.2.20.100.10.5_vs_6.stdout ) >& /tmp/compass.profile.2.20.100.10.5_vs_6.stderr
GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/
echo '>A;A;' | tr ';' '\n' > /dev/shm/small.faa
cp /export/people/ucbctnl/gemma_stuff/dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed /dev/shm/
Time a single COMPASS profile run:
cp /export/people/ucbctnl/gemma_stuff/GeMMA_folders_and_datasets/v4.1_dataset/3.40.33.10/starting-clusters/5866.faa /dev/shm/
/usr/bin/time /dev/shm/compass_wp_245_fixed -g 0.50001 -i /dev/shm/5866.faa -j /dev/shm/5866.faa -p1 /dev/shm/out.prof -p2 /dev/shm/5866.2.prof
Clusters worth processing
13019 versus 13593 should have an E-value of 7.33e-13
12528 has 600 sequences
GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/12528.faa
GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/13019.faa
GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/13593.faa
6352 versus 6553 should have an E-value of 3.70e-58
5866 has 181 sequences
GeMMA_folders_and_datasets/v4.1_dataset/3.40.33.10/starting-clusters/6352.faa
GeMMA_folders_and_datasets/v4.1_dataset/3.40.33.10/starting-clusters/6553.faa
GeMMA_folders_and_datasets/v4.1_dataset/3.40.33.10/starting-clusters/5866.faa
ls -1 starting-clusters/ | grep .faa | grep -Po '\d+' | sort -g > tony_starting_cluster_nums.txt
awk '{print $3}' < *.trace > tony_new_cluster_ids
awk '{print $1 " " $2}' < *.trace | grep -Fwvf tony_new_cluster_ids > tony_pure_starting_pairs
awk '{print $1 " " $2}' < *.trace | grep -Fwvf tony_new_cluster_ids | grep -Po '\d+' | sort -gu > tony_clusters_in_pure_starting_pairs
ls -1v starting-clusters/* | grep -Fwf tony_clusters_in_pure_starting_pairs | xargs wc -l | grep -v total | sort -g | tail -n 1
GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/12528.faa GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/13019.faa GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/13593.faa
Be aware: calling `mk_compass_db` multiple times appends to the existing library rather than replacing it, so remove the old library file before rebuilding.
rm -f /dev/shm/compass_db_310 /dev/shm/compass_db_310.len
./mk_compass_db_310 -g 0.50001 -i /dev/shm/aln_list -o /dev/shm/compass_db_310
rm -f /dev/shm/compass_db_310.len
wget "ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-*-x64-linux.tar.gz"
tar -zxvf ncbi-blast-2.6.0+-x64-linux.tar.gz
rm -f ncbi-blast-2.6.0+-x64-linux.tar.gz
wc -l GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | awk '$1 <= 2 {print $2}' | sort -V | xargs cat > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.fa
wc -l GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | awk '$1 <= 2 {print $2}' | sort -V | xargs grep '>' GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.fa | sed 's/.faa:>/ /g' | grep -Po '\d+\s.*' > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.mapping.txt
wc -l GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | awk '$1 <= 2 {print $2}' | grep -Po '\b\d+\.faa$' | grep -Po '\d+' | sort -g | xargs -I VAR echo GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.aln_list
./mk_compass_db -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.aln_list -o GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db
rm -f GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db.len
dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db
dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db | grep -P '\b(Ali\d:|Evalue = ).*' -o | perl -p -e 'if ($_ =~ /aln/) { chomp ($_); }' | sed 's/Evalue =//g' | sed 's/.aln//g' | sed 's/Ali[12]: GeMMA_folders_and_datasets\/v4.1_dataset\/2.20.100.10\/tony_data\///g' | awk '{print $1 " " $2 " " $3}' > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_results.txt
ncbi-blast-*/bin/makeblastdb -in GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.fa -dbtype prot -out GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.blast_db
mkdir -p GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons_blast_results
wc -l GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | awk '$1 <= 2 {print $2}' | grep -Po '\b\d+\.faa$' | sed 's/\.faa$//g' | sort -V | xargs -P 8 -I VAR ncbi-blast-*/bin/blastp -dbsize 100000 -query GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/VAR.faa -db GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.blast_db -outfmt '6 qseqid sseqid pident length slen qlen' -out GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons_blast_results/VAR.blast_results.txt -max_target_seqs 100000000
rm -f compass_db_310.len
./mk_compass_db_310 -g 0.50001 -i /dev/shm/aln_list -o /dev/shm/compass_db_310