Skip to content

Tony Notes: Miscellaneous

Tony E Lewis edited this page Mar 15, 2017 · 2 revisions

Notes

  • /export/people/ucbctnl/gemma_stuff/dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 ( $compass_dbXdb_executable )
  • /export/people/ucbctnl/gemma_stuff/dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed ( $compass_executable )

$compass_params = "-g 0.50001";

Extensions:

  • .faa - FASTA file
  • .aln - MAFFT-aligned sequence file
  • .prof - COMPASS profile file

mafft

Preparation

cp dfx/dfx_pfam1/tools/mafft-6.864-without-extensions/core/mafft /dev/shm/
rsync -av dfx/dfx_pfam1/tools/mafft-6.864-without-extensions/binaries/ /dev/shm/mafft_binaries_dir/
setenv MAFFT_BINARIES /dev/shm/mafft_binaries_dir

single-sequences

Just copy the sequence files that contain only one sequence (i.e. at most two lines) to the corresponding .aln files:

ls -1v GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | xargs wc -l | awk '$1 <= 2              {print $2}' | grep -Po '\d+\.faa$' | sed 's/.faa//g' | xargs -P 8 -I VAR cp GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/VAR.faa GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln

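Use high-quality alignment for S90s with 1 < N <= 200 sequences (the command filters on line count, 2 < lines <= 400, i.e. it assumes two lines per sequence in the .faa files):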

ls -1v GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | xargs wc -l | awk '$1  > 2 && $1 <= 400 {print $2}' | grep -Po '\d+\.faa$' | sed 's/.faa//g' | xargs -P 8 -I VAR /bin/tcsh -c "/dev/shm/mafft --amino --anysymbol --localpair --maxiterate 1000 --quiet GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/VAR.faa > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln"

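Use low-quality alignment for S90s with N > 200 sequences (the command filters on line count, lines > 400, i.e. it assumes two lines per sequence in the .faa files):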

ls -1v GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | xargs wc -l | awk '           $1 >  400 {print $2}' | grep -Po '\d+\.faa$' | sed 's/.faa//g' | xargs -P 8 -I VAR /bin/tcsh -c "/dev/shm/mafft --amino --anysymbol --parttree  --retree     1    --quiet GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/VAR.faa > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln"

compass model building

Preparation

cp dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed /dev/shm/
cp dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 /dev/shm/

Build models

ls -1v GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/ | sed 's/.aln//g' | xargs -P 8 -I VAR /dev/shm/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln -j /dev/shm/small.faa -p1 GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.prof -p2 /dev/shm/VAR.small.prof
/dev/shm/compass_db1Xdb2_241 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/13019.prof -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/13593.prof

Misc

( dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -p1 /tmp/compass.profile.2.20.100.10.cluster_5.part_1.prof -p2 /tmp/compass.profile.2.20.100.10.cluster_5.part_2.prof > /tmp/compass.profile.2.20.100.10.cluster_5.stdout ) >& /tmp/compass.profile.2.20.100.10.cluster_5.stderr

profile.pl
cd /export/people/ucbctnl/gemma_stuff/

( dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -p1 /tmp/compass.profile.2.20.100.10.cluster_5.part_1.prof -p2 /tmp/compass.profile.2.20.100.10.cluster_5.part_2.prof > /tmp/compass.profile.2.20.100.10.cluster_5.stdout ) >& /tmp/compass.profile.2.20.100.10.cluster_5.stderr

( dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/6.faa -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/6.faa -p1 /tmp/compass.profile.2.20.100.10.cluster_6.part_1.prof -p2 /tmp/compass.profile.2.20.100.10.cluster_6.part_2.prof > /tmp/compass.profile.2.20.100.10.cluster_6.stdout ) >& /tmp/compass.profile.2.20.100.10.cluster_6.stderr

dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/5.faa -p1 /tmp/test.p1

( dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 -i /tmp/compass.profile.2.20.100.10.cluster_5.part_1.prof -j /tmp/compass.profile.2.20.100.10.cluster_6.part_1.prof > /tmp/compass.profile.2.20.100.10.5_vs_6.stdout ) >& /tmp/compass.profile.2.20.100.10.5_vs_6.stderr




GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/



echo '>A;A;' | tr ';' '\n' > /dev/shm/small.faa
cp /export/people/ucbctnl/gemma_stuff/dfx/dfx_pfam1/tools/compass/compass_wp_245_fixed /dev/shm/

Time a single COMPASS run
cp /export/people/ucbctnl/gemma_stuff/GeMMA_folders_and_datasets/v4.1_dataset/3.40.33.10/starting-clusters/5866.faa /dev/shm/
/usr/bin/time /dev/shm/compass_wp_245_fixed -g 0.50001 -i /dev/shm/5866.faa -j /dev/shm/5866.faa -p1 /dev/shm/out.prof -p2 /dev/shm/5866.2.prof



Clusters worth processing
13019 versus 13593 should have evalue 7.33e-13
12528 has 600 sequences
GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/12528.faa
GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/13019.faa
GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/13593.faa

6352 versus 6553 should have evalue 3.70e-58
5866 has 181 sequences
GeMMA_folders_and_datasets/v4.1_dataset/3.40.33.10/starting-clusters/6352.faa
GeMMA_folders_and_datasets/v4.1_dataset/3.40.33.10/starting-clusters/6553.faa
GeMMA_folders_and_datasets/v4.1_dataset/3.40.33.10/starting-clusters/5866.faa


ls -1 starting-clusters/ | grep .faa | grep -Po '\d+' | sort -g > tony_starting_cluster_nums.txt
awk '{print $3}' < *.trace > tony_new_cluster_ids
awk '{print $1 " " $2}' < *.trace | grep -Fwvf tony_new_cluster_ids > tony_pure_starting_pairs
awk '{print $1 " " $2}' < *.trace | grep -Fwvf tony_new_cluster_ids | grep -Po '\d+' | sort -gu > tony_clusters_in_pure_starting_pairs
ls -1v starting-clusters/* | grep -Fwf tony_clusters_in_pure_starting_pairs | xargs wc -l | grep -v total | sort -g | tail -n 1


GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/12528.faa GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/13019.faa GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/13593.faa

COMPASS 3.1 bits

Be aware: calling mk_compass_db multiple times appends to the existing library rather than replacing it, so delete the old library files before rebuilding.

rm -f /dev/shm/compass_db_310 /dev/shm/compass_db_310.len 
./mk_compass_db_310 -g 0.50001 -i /dev/shm/aln_list -o /dev/shm/compass_db_310
rm -f /dev/shm/compass_db_310.len 

Compare BLAST sequence identity vs COMPASS E-value for 2.20.100.10 singletons

wget "ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-*-x64-linux.tar.gz"
tar -zxvf ncbi-blast-2.6.0+-x64-linux.tar.gz
rm -f ncbi-blast-2.6.0+-x64-linux.tar.gz
wc -l GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | awk '$1 <= 2 {print $2}' | sort -V | xargs cat > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.fa
wc -l GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | awk '$1 <= 2 {print $2}' | sort -V | xargs grep '>' GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.fa | sed 's/.faa:>/ /g' | grep -Po '\d+\s.*' > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.mapping.txt

wc -l GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | awk '$1 <= 2 {print $2}' | grep -Po '\b\d+\.faa$' | grep -Po '\d+' | sort -g | xargs -I VAR echo GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/tony_data/VAR.aln > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.aln_list

./mk_compass_db -g 0.50001 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.aln_list -o GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db
rm -f GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db.len
dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db

dfx/dfx_pfam1/tools/compass/compass_db1Xdb2_241 -i GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db -j GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_prof_db | grep -P '\b(Ali\d:|Evalue = ).*' -o | perl -p -e 'if ($_ =~ /aln/) { chomp ($_); }' | sed 's/Evalue =//g' | sed 's/.aln//g' | sed 's/Ali[12]: GeMMA_folders_and_datasets\/v4.1_dataset\/2.20.100.10\/tony_data\///g' | awk '{print $1 " " $2 " " $3}' > GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.compass_results.txt

ncbi-blast-*/bin/makeblastdb -in GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.fa -dbtype prot -out GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.blast_db
mkdir -p GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons_blast_results
wc -l GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/*.faa | awk '$1 <= 2 {print $2}' | grep -Po '\b\d+\.faa$' | sed 's/\.faa$//g' | sort -V | xargs -P 8 -I VAR ncbi-blast-*/bin/blastp -dbsize 100000 -query GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/starting-clusters/VAR.faa -db GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons.blast_db -outfmt '6 qseqid sseqid pident length slen qlen' -out GeMMA_folders_and_datasets/v4.1_dataset/2.20.100.10/all_singletons_blast_results/VAR.blast_results.txt -max_target_seqs 100000000
rm -f compass_db_310.len 
./mk_compass_db_310 -g 0.50001 -i /dev/shm/aln_list -o /dev/shm/compass_db_310