diff --git a/.gitignore b/.gitignore index f05e383..495cea6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,16 @@ __pycache__ .snakemake .ipynb_checkpoints +.sourcetrail data tools sandbox jupyter *.log *.aux -publications/astronaut-telomeres-paper -publications/methods-paper/figures/*-.png -publications/methods-paper/figures/*/*-.png -publications/methods-paper/figures/*.svg -publications/methods-paper/figures/*/*.svg +assets/paper/figures/*.png +assets/paper/figures/*/*.png +assets/paper/figures/*.svg +assets/paper/figures/*/*.svg +*.odt +Snakefile diff --git a/assets/M19947.hmm b/assets/M19947.hmm deleted file mode 100644 index 0193192..0000000 --- a/assets/M19947.hmm +++ /dev/null @@ -1,809 +0,0 @@ -HMMER3/f [3.2.1 | June 2018] -NAME M19947 -LENG 262 -MAXL 411 -ALPH DNA -RF no -MM no -CONS yes -CS no -MAP yes -DATE Fri May 17 13:25:39 2019 -NSEQ 1 -EFFN 1.000000 -CKSUM 349386442 -STATS LOCAL MSV -14.3697 0.70754 -STATS LOCAL VITERBI -15.7278 0.70754 -STATS LOCAL FORWARD -4.3609 0.70754 -HMM A C G T - m->m m->i m->d i->m i->i d->m d->d - COMPO 1.52003 1.94148 0.99245 1.31999 - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 0.00000 * - 1 1.91091 1.78233 1.95844 0.61114 1 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 2 1.91091 1.78233 1.95844 0.61114 2 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 3 1.91091 1.78233 1.95844 0.61114 3 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 4 0.56139 2.08489 1.86185 1.89791 4 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 5 1.97267 2.25870 0.46443 2.05625 5 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 6 1.97267 2.25870 0.46443 2.05625 6 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 
1.09861 0.40547 - 7 1.97267 2.25870 0.46443 2.05625 7 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 8 1.91091 1.78233 1.95844 0.61114 8 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 9 1.91091 1.78233 1.95844 0.61114 9 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 10 0.56139 2.08489 1.86185 1.89791 10 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 11 1.97267 2.25870 0.46443 2.05625 11 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 12 1.97267 2.25870 0.46443 2.05625 12 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 13 1.97267 2.25870 0.46443 2.05625 13 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 14 1.91091 1.78233 1.95844 0.61114 14 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 15 1.91091 1.78233 1.95844 0.61114 15 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 16 0.56139 2.08489 1.86185 1.89791 16 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 17 1.97267 2.25870 0.46443 2.05625 17 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 18 1.97267 2.25870 0.46443 2.05625 18 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 19 1.97267 2.25870 0.46443 2.05625 19 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 20 1.91091 1.78233 1.95844 0.61114 20 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 21 1.91091 1.78233 1.95844 
0.61114 21 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 22 0.56139 2.08489 1.86185 1.89791 22 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 23 1.97267 2.25870 0.46443 2.05625 23 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 24 1.97267 2.25870 0.46443 2.05625 24 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 25 1.97267 2.25870 0.46443 2.05625 25 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 26 1.91091 1.78233 1.95844 0.61114 26 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 27 1.91091 1.78233 1.95844 0.61114 27 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 28 0.56139 2.08489 1.86185 1.89791 28 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 29 1.97267 2.25870 0.46443 2.05625 29 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 30 1.97267 2.25870 0.46443 2.05625 30 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 31 1.97267 2.25870 0.46443 2.05625 31 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 32 1.91091 1.78233 1.95844 0.61114 32 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 33 1.91091 1.78233 1.95844 0.61114 33 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 34 0.56139 2.08489 1.86185 1.89791 34 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 35 1.97267 2.25870 0.46443 2.05625 35 g - - - - 1.38629 1.38629 
1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 36 1.97267 2.25870 0.46443 2.05625 36 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 37 1.97267 2.25870 0.46443 2.05625 37 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 38 1.91091 1.78233 1.95844 0.61114 38 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 39 1.91091 1.78233 1.95844 0.61114 39 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 40 0.56139 2.08489 1.86185 1.89791 40 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 41 1.97267 2.25870 0.46443 2.05625 41 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 42 1.97267 2.25870 0.46443 2.05625 42 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 43 1.97267 2.25870 0.46443 2.05625 43 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 44 1.91091 1.78233 1.95844 0.61114 44 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 45 1.91091 1.78233 1.95844 0.61114 45 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 46 0.56139 2.08489 1.86185 1.89791 46 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 47 1.97267 2.25870 0.46443 2.05625 47 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 48 1.97267 2.25870 0.46443 2.05625 48 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 49 1.97267 2.25870 0.46443 2.05625 49 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 
3.46574 1.46634 0.26236 1.09861 0.40547 - 50 1.91091 1.78233 1.95844 0.61114 50 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 51 1.91091 1.78233 1.95844 0.61114 51 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 52 0.56139 2.08489 1.86185 1.89791 52 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 53 1.97267 2.25870 0.46443 2.05625 53 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 54 1.97267 2.25870 0.46443 2.05625 54 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 55 1.97267 2.25870 0.46443 2.05625 55 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 56 1.91091 1.78233 1.95844 0.61114 56 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 57 1.91091 1.78233 1.95844 0.61114 57 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 58 0.56139 2.08489 1.86185 1.89791 58 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 59 1.97267 2.25870 0.46443 2.05625 59 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 60 1.97267 2.25870 0.46443 2.05625 60 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 61 1.97267 2.25870 0.46443 2.05625 61 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 62 1.91091 1.78233 1.95844 0.61114 62 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 63 1.91091 1.78233 1.95844 0.61114 63 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 
- 64 0.56139 2.08489 1.86185 1.89791 64 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 65 1.97267 2.25870 0.46443 2.05625 65 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 66 1.97267 2.25870 0.46443 2.05625 66 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 67 1.97267 2.25870 0.46443 2.05625 67 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 68 1.91091 1.78233 1.95844 0.61114 68 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 69 1.91091 1.78233 1.95844 0.61114 69 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 70 0.56139 2.08489 1.86185 1.89791 70 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 71 1.97267 2.25870 0.46443 2.05625 71 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 72 1.97267 2.25870 0.46443 2.05625 72 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 73 1.97267 2.25870 0.46443 2.05625 73 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 74 1.91091 1.78233 1.95844 0.61114 74 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 75 1.91091 1.78233 1.95844 0.61114 75 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 76 0.56139 2.08489 1.86185 1.89791 76 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 77 1.97267 2.25870 0.46443 2.05625 77 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 78 1.97267 2.25870 0.46443 2.05625 78 
g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 79 1.97267 2.25870 0.46443 2.05625 79 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 80 0.56139 2.08489 1.86185 1.89791 80 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 81 1.97267 2.25870 0.46443 2.05625 81 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 82 1.91091 1.78233 1.95844 0.61114 82 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 83 0.56139 2.08489 1.86185 1.89791 83 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 84 1.97267 2.25870 0.46443 2.05625 84 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 85 1.97267 2.25870 0.46443 2.05625 85 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 86 1.97267 2.25870 0.46443 2.05625 86 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 87 1.91091 1.78233 1.95844 0.61114 87 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 88 1.91091 1.78233 1.95844 0.61114 88 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 89 0.56139 2.08489 1.86185 1.89791 89 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 90 1.97267 2.25870 0.46443 2.05625 90 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 91 1.97267 2.25870 0.46443 2.05625 91 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 92 1.97267 2.25870 0.46443 2.05625 92 g - - - - 1.38629 1.38629 1.38629 
1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 93 1.91091 1.78233 1.95844 0.61114 93 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 94 1.91091 1.78233 1.95844 0.61114 94 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 95 0.56139 2.08489 1.86185 1.89791 95 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 96 1.97267 2.25870 0.46443 2.05625 96 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 97 1.97267 2.25870 0.46443 2.05625 97 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 98 1.97267 2.25870 0.46443 2.05625 98 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 99 1.91091 1.78233 1.95844 0.61114 99 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 100 1.91091 1.78233 1.95844 0.61114 100 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 101 0.56139 2.08489 1.86185 1.89791 101 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 102 1.97267 2.25870 0.46443 2.05625 102 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 103 1.97267 2.25870 0.46443 2.05625 103 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 104 1.97267 2.25870 0.46443 2.05625 104 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 105 1.91091 1.78233 1.95844 0.61114 105 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 106 1.91091 1.78233 1.95844 0.61114 106 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 
3.46574 1.46634 0.26236 1.09861 0.40547 - 107 0.56139 2.08489 1.86185 1.89791 107 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 108 1.97267 2.25870 0.46443 2.05625 108 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 109 1.97267 2.25870 0.46443 2.05625 109 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 110 1.97267 2.25870 0.46443 2.05625 110 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 111 1.91091 1.78233 1.95844 0.61114 111 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 112 1.91091 1.78233 1.95844 0.61114 112 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 113 0.56139 2.08489 1.86185 1.89791 113 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 114 1.97267 2.25870 0.46443 2.05625 114 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 115 1.97267 2.25870 0.46443 2.05625 115 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 116 1.97267 2.25870 0.46443 2.05625 116 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 117 1.91091 1.78233 1.95844 0.61114 117 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 118 1.91091 1.78233 1.95844 0.61114 118 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 119 0.56139 2.08489 1.86185 1.89791 119 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 120 1.97267 2.25870 0.46443 2.05625 120 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 
1.46634 0.26236 1.09861 0.40547 - 121 1.97267 2.25870 0.46443 2.05625 121 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 122 1.97267 2.25870 0.46443 2.05625 122 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 123 1.91091 1.78233 1.95844 0.61114 123 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 124 1.91091 1.78233 1.95844 0.61114 124 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 125 0.56139 2.08489 1.86185 1.89791 125 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 126 1.97267 2.25870 0.46443 2.05625 126 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 127 1.97267 2.25870 0.46443 2.05625 127 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 128 1.97267 2.25870 0.46443 2.05625 128 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 129 1.91091 1.78233 1.95844 0.61114 129 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 130 1.91091 1.78233 1.95844 0.61114 130 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 131 0.56139 2.08489 1.86185 1.89791 131 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 132 1.97267 2.25870 0.46443 2.05625 132 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 133 1.97267 2.25870 0.46443 2.05625 133 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 134 1.97267 2.25870 0.46443 2.05625 134 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 
0.26236 1.09861 0.40547 - 135 1.91091 1.78233 1.95844 0.61114 135 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 136 1.91091 1.78233 1.95844 0.61114 136 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 137 0.56139 2.08489 1.86185 1.89791 137 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 138 1.97267 2.25870 0.46443 2.05625 138 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 139 1.97267 2.25870 0.46443 2.05625 139 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 140 1.97267 2.25870 0.46443 2.05625 140 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 141 1.91091 1.78233 1.95844 0.61114 141 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 142 1.91091 1.78233 1.95844 0.61114 142 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 143 0.56139 2.08489 1.86185 1.89791 143 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 144 1.97267 2.25870 0.46443 2.05625 144 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 145 1.97267 2.25870 0.46443 2.05625 145 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 146 1.97267 2.25870 0.46443 2.05625 146 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 147 1.91091 1.78233 1.95844 0.61114 147 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 148 1.91091 1.78233 1.95844 0.61114 148 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 
1.09861 0.40547 - 149 0.56139 2.08489 1.86185 1.89791 149 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 150 1.97267 2.25870 0.46443 2.05625 150 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 151 1.97267 2.25870 0.46443 2.05625 151 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 152 1.97267 2.25870 0.46443 2.05625 152 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 153 1.91091 1.78233 1.95844 0.61114 153 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 154 1.91091 1.78233 1.95844 0.61114 154 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 155 0.56139 2.08489 1.86185 1.89791 155 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 156 1.97267 2.25870 0.46443 2.05625 156 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 157 1.97267 2.25870 0.46443 2.05625 157 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 158 1.97267 2.25870 0.46443 2.05625 158 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 159 1.91091 1.78233 1.95844 0.61114 159 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 160 1.91091 1.78233 1.95844 0.61114 160 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 161 0.56139 2.08489 1.86185 1.89791 161 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 162 1.97267 2.25870 0.46443 2.05625 162 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 
0.40547 - 163 1.97267 2.25870 0.46443 2.05625 163 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 164 1.97267 2.25870 0.46443 2.05625 164 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 165 1.91091 1.78233 1.95844 0.61114 165 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 166 1.91091 1.78233 1.95844 0.61114 166 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 167 0.56139 2.08489 1.86185 1.89791 167 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 168 1.97267 2.25870 0.46443 2.05625 168 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 169 1.97267 2.25870 0.46443 2.05625 169 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 170 1.97267 2.25870 0.46443 2.05625 170 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 171 1.91091 1.78233 1.95844 0.61114 171 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 172 1.91091 1.78233 1.95844 0.61114 172 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 173 0.56139 2.08489 1.86185 1.89791 173 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 174 1.97267 2.25870 0.46443 2.05625 174 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 175 1.97267 2.25870 0.46443 2.05625 175 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 176 1.97267 2.25870 0.46443 2.05625 176 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 
177 1.91091 1.78233 1.95844 0.61114 177 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 178 1.91091 1.78233 1.95844 0.61114 178 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 179 0.56139 2.08489 1.86185 1.89791 179 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 180 1.97267 2.25870 0.46443 2.05625 180 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 181 1.97267 2.25870 0.46443 2.05625 181 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 182 1.97267 2.25870 0.46443 2.05625 182 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 183 1.91091 1.78233 1.95844 0.61114 183 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 184 1.91091 1.78233 1.95844 0.61114 184 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 185 0.56139 2.08489 1.86185 1.89791 185 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 186 1.97267 2.25870 0.46443 2.05625 186 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 187 1.97267 2.25870 0.46443 2.05625 187 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 188 1.97267 2.25870 0.46443 2.05625 188 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 189 1.91091 1.78233 1.95844 0.61114 189 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 190 1.91091 1.78233 1.95844 0.61114 190 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 191 0.56139 
2.08489 1.86185 1.89791 191 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 192 1.97267 2.25870 0.46443 2.05625 192 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 193 1.97267 2.25870 0.46443 2.05625 193 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 194 1.97267 2.25870 0.46443 2.05625 194 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 195 1.91091 1.78233 1.95844 0.61114 195 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 196 1.91091 1.78233 1.95844 0.61114 196 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 197 0.56139 2.08489 1.86185 1.89791 197 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 198 1.97267 2.25870 0.46443 2.05625 198 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 199 1.97267 2.25870 0.46443 2.05625 199 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 200 1.97267 2.25870 0.46443 2.05625 200 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 201 1.91091 1.78233 1.95844 0.61114 201 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 202 1.91091 1.78233 1.95844 0.61114 202 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 203 0.56139 2.08489 1.86185 1.89791 203 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 204 1.97267 2.25870 0.46443 2.05625 204 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 205 1.97267 2.25870 
0.46443 2.05625 205 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 206 1.97267 2.25870 0.46443 2.05625 206 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 207 1.91091 1.78233 1.95844 0.61114 207 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 208 1.91091 1.78233 1.95844 0.61114 208 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 209 0.56139 2.08489 1.86185 1.89791 209 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 210 1.97267 2.25870 0.46443 2.05625 210 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 211 1.97267 2.25870 0.46443 2.05625 211 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 212 1.97267 2.25870 0.46443 2.05625 212 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 213 1.91091 1.78233 1.95844 0.61114 213 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 214 1.91091 1.78233 1.95844 0.61114 214 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 215 0.56139 2.08489 1.86185 1.89791 215 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 216 1.97267 2.25870 0.46443 2.05625 216 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 217 1.97267 2.25870 0.46443 2.05625 217 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 218 1.97267 2.25870 0.46443 2.05625 218 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 219 1.91091 1.78233 1.95844 
0.61114 219 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 220 1.91091 1.78233 1.95844 0.61114 220 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 221 0.56139 2.08489 1.86185 1.89791 221 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 222 1.97267 2.25870 0.46443 2.05625 222 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 223 1.97267 2.25870 0.46443 2.05625 223 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 224 1.97267 2.25870 0.46443 2.05625 224 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 225 1.91091 1.78233 1.95844 0.61114 225 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 226 1.91091 1.78233 1.95844 0.61114 226 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 227 0.56139 2.08489 1.86185 1.89791 227 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 228 1.97267 2.25870 0.46443 2.05625 228 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 229 1.97267 2.25870 0.46443 2.05625 229 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 230 1.97267 2.25870 0.46443 2.05625 230 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 231 1.91091 1.78233 1.95844 0.61114 231 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 232 1.91091 1.78233 1.95844 0.61114 232 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 233 0.56139 2.08489 1.86185 1.89791 233 
a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 234 1.97267 2.25870 0.46443 2.05625 234 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 235 1.97267 2.25870 0.46443 2.05625 235 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 236 1.97267 2.25870 0.46443 2.05625 236 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 237 1.91091 1.78233 1.95844 0.61114 237 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 238 1.91091 1.78233 1.95844 0.61114 238 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 239 0.56139 2.08489 1.86185 1.89791 239 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 240 1.97267 2.25870 0.46443 2.05625 240 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 241 1.97267 2.25870 0.46443 2.05625 241 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 242 1.97267 2.25870 0.46443 2.05625 242 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 243 1.91091 1.78233 1.95844 0.61114 243 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 244 1.97267 2.25870 0.46443 2.05625 244 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 245 1.91091 1.78233 1.95844 0.61114 245 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 246 1.91091 1.78233 1.95844 0.61114 246 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 247 0.56139 2.08489 1.86185 1.89791 247 a - - - - 
1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 248 1.97267 2.25870 0.46443 2.05625 248 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 249 1.97267 2.25870 0.46443 2.05625 249 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 250 1.97267 2.25870 0.46443 2.05625 250 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 251 1.91091 1.78233 1.95844 0.61114 251 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 252 1.91091 1.78233 1.95844 0.61114 252 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 253 0.56139 2.08489 1.86185 1.89791 253 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 254 1.97267 2.25870 0.46443 2.05625 254 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 255 1.97267 2.25870 0.46443 2.05625 255 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 256 1.97267 2.25870 0.46443 2.05625 256 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 257 1.91091 1.78233 1.95844 0.61114 257 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 258 1.91091 1.78233 1.95844 0.61114 258 t - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 259 0.56139 2.08489 1.86185 1.89791 259 a - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 260 1.97267 2.25870 0.46443 2.05625 260 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 261 1.97267 2.25870 0.46443 2.05625 261 g - - - - 1.38629 
1.38629 1.38629 1.38629 - 0.06454 3.46574 3.46574 1.46634 0.26236 1.09861 0.40547 - 262 1.97267 2.25870 0.46443 2.05625 262 g - - - - 1.38629 1.38629 1.38629 1.38629 - 0.03279 3.43399 * 1.46634 0.26236 0.00000 * -// diff --git a/assets/densityplot-haplotypes.png b/assets/densityplot-haplotypes.png deleted file mode 100644 index 58a60b1..0000000 Binary files a/assets/densityplot-haplotypes.png and /dev/null differ diff --git a/tools/generate-hg38ext.py b/assets/generate-hg38ext.py similarity index 60% rename from tools/generate-hg38ext.py rename to assets/generate-hg38ext.py index 9622e93..f96a0c1 100755 --- a/tools/generate-hg38ext.py +++ b/assets/generate-hg38ext.py @@ -7,16 +7,31 @@ from contextlib import contextmanager from binascii import hexlify from gzip import open as gzopen -from tqdm import tqdm from itertools import chain from textwrap import fill +from zlib import decompress +from base64 import decodebytes + +try: + from tqdm import tqdm +except ModuleNotFoundError: + def tqdm(it, desc=None, *args, **kwargs): + if desc is not None: + print(desc, end="...\n", file=stderr) + return it USAGE = """usage: {0} --local hg38.fasta stong2014.fasta - generate hg38ext.fa from local files + generate hg38ext.fa from local files and output to stdout {0} --remote - download appropriate assemblies and generate hg38ext.fa + download appropriate assemblies, generate hg38ext.fa, and output to stdout +{0} --ecx + output the edgeCase indeX (hg38ext.fa.ecx) to stdout + +NOTE! This tool writes uncompressed data (FASTA or ECX) to stdout. 
+You should pipe it into a file, for example: +{0} --remote > hg38ext.fa """ NCBI_FTP_DIR = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405" @@ -49,6 +64,39 @@ COMPLEMENTS = dict(zip(ALPHABET, reversed(ALPHABET))) COMPLEMENT_PATTERN = compile(r'|'.join(COMPLEMENTS.keys())) +HG38EXT_ECX_GZ = b'\n'.join([ +b'eJyVWEtvIzcMPk/+Ro9FAFFPsos9F8X2WmD3ZLhGdhMkcba2u2j/fUk9JvOwOG6AUMSY/CRqOB8p' +b'/fT1Zf/t/Mvr/vy82x8Pj2+nj95Q/PD17fT8EYHsh8tpf7i03yA69Hc/PRwvp3+H03H/+jB8fzvL' +b'/88wHB5Pb69v5zd++Lp/Ou7K7zLB8P30xOrhZX8+D+e3v0+Hh+Hl6fg8/PmyPzy/PJ0vd0bcYQDD' +b'f1lCeZCFLGkIw2SVw+M3h8P9cH+XTezS0RahOWYTt3R0RWiO2cQvHX0RmmM2CUvHUITmmE3iELNj' +b'bI6xCM0xm6TljKkIzTGb4HJGLEJzzCa0nJGK0ByzCZhVBpgq1RwoRrBcLzSpOhejVQqBrVJ1Lkac' +b'Rnlq8S8K1OdFqhDFyHcgfJUqRDHi1EoNIk0gQpUqRDGKq12IVarOxSit9j9VqToXI1zNjFWqztnI' +b'muXM1lSpOuecszAEU+YuI9SnRaoUUmaR1Alt9eEdwlapQmTTz8vgPxehemb4L0vPL0WoniXjBuuR' +b'fPQM1DS3JF3XQfCFX623gC5YapozS/btIYRKv4TWhRCoadEsabiHECsPk7HGM8SoxSUf9xAqEUIg' +b'5wIlN2p+SZM9hMqI4INJGF0ctbTkyx5CpUZw6JxP/DE0DZfE2UFwjSHBuYRJXmfT3JpBeyCNKcHx' +b'+mPMIEVzaybtgTTG5Plt5FygptU1zBi1B9I4E8C74B1vcdNozak9kMaaYCKiS5ldikZrVu2BNN4E' +b'w7kJgNQ0Mmte7YE0akRnffIemmLXBNmDaASJxkbej9CUuKbJHkQxoiFg5JfKfF2VVJ8XqUJQY1rP' +b'6QkxNaUx8IRvOxDeNE6NkYjQNcWvGbcH0Zg1GDToefKq0JpxexC2Mi5TjTNIYdTiknl7CK5wbEgW' +b'kucEqEpc8m/Pv2Qfv4NAiGWgCfPmLpvJ+/T0cHl83R93r/vL4bH5Px0f/uGW/M6HwrLAyVCkm3Cv' +b'jsFriLUZBTQpS5w0o5veSetIb1w/tubUurz+8uU3Tr4Ng7Q+9TaMYFrLyuUrSz9h3q2dCK1LBeNB' +b'QLxtfFufbwHYjd71xijc2MWGFLKMM87dXId/bwBtPjKU4jY2gJsAYezjYibNCHOC2gSIlUMScQ0W' +b'GRurFHnjTqR5Z4GTHuO9syhYbgsLxx4DrA3oRs1PeowbsWjabeDYbeCs29Cx+ERnJh2Hk1xtWlhl' +b'voIyL/chUtOSuZJ8CtCs5HtiamtauJJ+CpCbFluDQE2z82K7CeSnpc5TbMq81G3ChLHccaFNzI5V' +b'oSspufXqY5wWPnnhVQmzwre5pjQpfkkqV1XirPhtwuB7AbRceWDU7KQA3hhZ2U+7+/V3TNxzhh+w' +b'279cBk+B6Vykn6WBXCnxFyzXSwWK8+3+LrX2kD9SbmdFgu7G87JbPSHvPv1mkzQBdXLk75NPKHkI' +b'03ZvZSiwbroaK7D2vXnkz1P4uGl2inbFHcQ9Z/IfxzJZ8rIlpwPvsWdgkW7W+a13xAuIrz/3TtRr' 
+b'NyduYXDfLw8vO7gPxnzawY5fDe8fJ+qK+8IghhmgvWNrOKp8/ROv4/B2lmNXRaqruA6UkiA1NoYo' +b'xwGIYBT/EkkUP25olUhmzKtGQtdxUkBMWeIET40HJVXRVEuucS5Lr/iXeCRXkZMolYXsrgU0vtjt' +b'kNB2oPjAVeXVFOtEJXmDzQE8OKEQGXSQHBrK94JSH5V3teBcNbTQgUp8dBOBM0A9MEmjekai1XfU' +b'gyhhBXHl3fhrvRT5lj3JH7NvBl027k4LELuglijKJybD9B6mskwnyJxb9fBgPRMwJLKjpuGUSFGu' +b'Xs1gb410drzQIuWy0gWNSPw684ATUDVSklSjaim3TYGDjKOWFJwcKQk384bQrZHODgD9SFslJN+H' +b'dk7OuHkIE2g9XslBqubgxJbknFc1p+CUeKWMkLTzNyfx/Miivtyk4MqlEZTBznD1eHMmYluCczE3' +b'FKOGKlQJWQoOhwT+5pA7jcH2DoDcufbmcWQSl8o8xBsakOvTcP+bJ+p0OWDZmaMuI/2fafJuSQQC' +b'L9XkShw/Ojs2r1P6FjkNmk/bIAgyhHWp6e+Kz9itQUKud8ZJj1iUK1VrHbnLEFxs8OZMmZ9i9bij' +b'BsxHrFCG+fF4I+qUkSdXgXwa5aVVBVSoGnXMEDiEW4Oe3QbpIZOCmuQ95yFMUPV4IScn1GUAgk/S' +b'z48aKUg1XMoIMMRbw51dX6nhcrr1UflXWwY3Qd0IN2ckVAfI5wbiHrhppCCVcMFmBOmJb87p+dWO' +b'HnFQgJEMl+I8pBnwRtA5IUdjMtai874pQYWqUUuV/A80XjlL', b'']) + def revcomp(sequence): """Reverse-complement a sequence""" @@ -112,7 +160,7 @@ def parser_iterator(filename, to_revcomp, desc="Parsing", entry_filter=lambda e: def generate_hg38ext(hg38, stong2014): - """Generate hg38ext from the hg38 and stong2014 FASTA files""" + """Generate hg38ext from the hg38 and stong2014 FASTA files, write to stdout""" subhaps = {mask.format(STONG_INFIX) for mask in STONG2014_SUBHAP_MASKS} to_revcomp = {mask.format(STONG_INFIX) for mask in TO_REVCOMP_MASKS} hg38_iterator = parser_iterator(hg38, to_revcomp, desc="Parsing reference") @@ -125,6 +173,11 @@ def generate_hg38ext(hg38, stong2014): return 0 +def output_ecx(): + """Write the hg38ext ECX to stdout""" + print(decompress(decodebytes(HG38EXT_ECX_GZ)).decode().rstrip("\n")) + + if __name__ == "__main__": # interpret command-line arguments and dispatch to subroutines: if (len(argv) == 2) and (argv[1] == "--remote"): @@ -133,6 +186,8 @@ def generate_hg38ext(hg38, stong2014): returncode = generate_hg38ext(hg38, stong2014) elif (len(argv) == 4) and (argv[1] == "--local"): returncode = generate_hg38ext(hg38=argv[2], stong2014=argv[3]) + 
elif (len(argv) == 2) and (argv[1] == "--ecx"): + returncode = output_ecx() else: print(USAGE.format(__file__).rstrip(), file=stderr) returncode = 1 diff --git a/assets/haplotypes-example.png b/assets/haplotypes-example.png new file mode 100644 index 0000000..0969f78 Binary files /dev/null and b/assets/haplotypes-example.png differ diff --git a/assets/hg38ext.fa.ecx b/assets/hg38ext.fa.ecx index e03bcf2..c6eb662 100644 --- a/assets/hg38ext.fa.ecx +++ b/assets/hg38ext.fa.ecx @@ -1,74 +1,74 @@ -#flags:ucsc_mask_anchor=4096;fork=8192;tract_anchor=16384 -#entry rname pos pos+1 chromosome ucsc_rname flag prime class source link blacklist -0 chr1 10000 10001 chr1 chr1 4096 5 ucsc_mask_anchor hg38 - - -1 chr2 10000 10001 chr2 chr2 4096 5 ucsc_mask_anchor hg38 - - -2 chr3 10000 10001 chr3 chr3 4096 5 ucsc_mask_anchor hg38 - - -3 chr4 10000 10001 chr4 chr4 4096 5 ucsc_mask_anchor hg38 - - -4 chr5 10000 10001 chr5 chr5 4096 5 ucsc_mask_anchor hg38 - - -5 chr6 60000 60001 chr6 chr6 4096 5 ucsc_mask_anchor hg38 - - -6 chr7 10000 10001 chr7 chr7 4096 5 ucsc_mask_anchor hg38 - - -7 chr8 60000 60001 chr8 chr8 4096 5 ucsc_mask_anchor hg38 - - -8 chr9 10000 10001 chr9 chr9 4096 5 ucsc_mask_anchor hg38 - - -9 chr10 10000 10001 chr10 chr10 4096 5 ucsc_mask_anchor hg38 - - -10 chr11 60000 60001 chr11 chr11 4096 5 ucsc_mask_anchor hg38 - - -11 chr12 10000 10001 chr12 chr12 4096 5 ucsc_mask_anchor hg38 - - -12 chr13 16000000 16000001 chr13 chr13 4096 5 ucsc_mask_anchor hg38 - - -13 chr14 16000000 16000001 chr14 chr14 4096 5 ucsc_mask_anchor hg38 - - -14 chr15 17000000 17000001 chr15 chr15 4096 5 ucsc_mask_anchor hg38 - - -15 chr16 10000 10001 chr16 chr16 4096 5 ucsc_mask_anchor hg38 - - -16 chr17 60000 60001 chr17 chr17 4096 5 ucsc_mask_anchor hg38 - - -17 chr18 10000 10001 chr18 chr18 4096 5 ucsc_mask_anchor hg38 - - -18 chr20 60000 60001 chr20 chr20 4096 5 ucsc_mask_anchor hg38 - - -19 chr21 5010000 5010001 chr21 chr21 4096 5 ucsc_mask_anchor hg38 - - -20 chr22 10510000 10510001 chr22 
chr22 4096 5 ucsc_mask_anchor hg38 - - -21 chrX 10000 10001 chrX chrX 4096 5 ucsc_mask_anchor hg38 - - -22 chrY 10000 10001 chrY chrY 4096 5 ucsc_mask_anchor hg38 - - -23 chr1 248946422 248946423 chr1 chr1 4096 3 ucsc_mask_anchor hg38 - - -24 chr2 242183529 242183530 chr2 chr2 4096 3 ucsc_mask_anchor hg38 - - -25 chr3 198235559 198235560 chr3 chr3 4096 3 ucsc_mask_anchor hg38 - - -26 chr4 190204555 190204556 chr4 chr4 4096 3 ucsc_mask_anchor hg38 - - -27 chr7 159335973 159335974 chr7 chr7 4096 3 ucsc_mask_anchor hg38 - - -28 chr8 145078636 145078637 chr8 chr8 4096 3 ucsc_mask_anchor hg38 - - -29 chr9 138334717 138334718 chr9 chr9 4096 3 ucsc_mask_anchor hg38 - - -30 chr10 133787422 133787423 chr10 chr10 4096 3 ucsc_mask_anchor hg38 - - -31 chr11 135076622 135076623 chr11 chr11 4096 3 ucsc_mask_anchor hg38 - - -32 chr12 133265309 133265310 chr12 chr12 4096 3 ucsc_mask_anchor hg38 - - -33 chr13 114354328 114354329 chr13 chr13 4096 3 ucsc_mask_anchor hg38 - - -34 chr14 106883718 106883719 chr14 chr14 4096 3 ucsc_mask_anchor hg38 - - -35 chr15 101981189 101981190 chr15 chr15 4096 3 ucsc_mask_anchor hg38 - - -36 chr17 83247441 83247442 chr17 chr17 4096 3 ucsc_mask_anchor hg38 - - -37 chr18 80263285 80263286 chr18 chr18 4096 3 ucsc_mask_anchor hg38 - - -38 chr19 58607616 58607617 chr19 chr19 4096 3 ucsc_mask_anchor hg38 - - -39 chr20 64334167 64334168 chr20 chr20 4096 3 ucsc_mask_anchor hg38 - - -40 chr21 46699983 46699984 chr21 chr21 4096 3 ucsc_mask_anchor hg38 - - -41 chr22 50808468 50808469 chr22 chr22 4096 3 ucsc_mask_anchor hg38 - - -42 chrX 156030895 156030896 chrX chrX 4096 3 ucsc_mask_anchor hg38 - - -43 chrY 57217415 57217416 chrY chrY 4096 3 ucsc_mask_anchor hg38 - - -44 chr1 585988 585989 chr1 chr1 16384 5 riethman_match hg38 - dup? 
+#flags:mask_anchor=4096;fork=8192;tract_anchor=16384 +#entry rname pos pos+1 chromosome main_rname flag prime class source link blacklist +0 chr1 10000 10001 chr1 chr1 4096 5 mask_anchor hg38 - - +1 chr2 10000 10001 chr2 chr2 4096 5 mask_anchor hg38 - - +2 chr3 10000 10001 chr3 chr3 4096 5 mask_anchor hg38 - - +3 chr4 10000 10001 chr4 chr4 4096 5 mask_anchor hg38 - - +4 chr5 10000 10001 chr5 chr5 4096 5 mask_anchor hg38 - - +5 chr6 60000 60001 chr6 chr6 4096 5 mask_anchor hg38 - - +6 chr7 10000 10001 chr7 chr7 4096 5 mask_anchor hg38 - - +7 chr8 60000 60001 chr8 chr8 4096 5 mask_anchor hg38 - - +8 chr9 10000 10001 chr9 chr9 4096 5 mask_anchor hg38 - - +9 chr10 10000 10001 chr10 chr10 4096 5 mask_anchor hg38 - - +10 chr11 60000 60001 chr11 chr11 4096 5 mask_anchor hg38 - - +11 chr12 10000 10001 chr12 chr12 4096 5 mask_anchor hg38 - - +12 chr13 16000000 16000001 chr13 chr13 4096 5 mask_anchor hg38 - - +13 chr14 16000000 16000001 chr14 chr14 4096 5 mask_anchor hg38 - - +14 chr15 17000000 17000001 chr15 chr15 4096 5 mask_anchor hg38 - - +15 chr16 10000 10001 chr16 chr16 4096 5 mask_anchor hg38 - - +16 chr17 60000 60001 chr17 chr17 4096 5 mask_anchor hg38 - - +17 chr18 10000 10001 chr18 chr18 4096 5 mask_anchor hg38 - - +18 chr20 60000 60001 chr20 chr20 4096 5 mask_anchor hg38 - - +19 chr21 5010000 5010001 chr21 chr21 4096 5 mask_anchor hg38 - - +20 chr22 10510000 10510001 chr22 chr22 4096 5 mask_anchor hg38 - - +21 chrX 10000 10001 chrX chrX 4096 5 mask_anchor hg38 - - +22 chrY 10000 10001 chrY chrY 4096 5 mask_anchor hg38 - - +23 chr1 248946422 248946423 chr1 chr1 4096 3 mask_anchor hg38 - - +24 chr2 242183529 242183530 chr2 chr2 4096 3 mask_anchor hg38 - - +25 chr3 198235559 198235560 chr3 chr3 4096 3 mask_anchor hg38 - - +26 chr4 190204555 190204556 chr4 chr4 4096 3 mask_anchor hg38 - - +27 chr7 159335973 159335974 chr7 chr7 4096 3 mask_anchor hg38 - - +28 chr8 145078636 145078637 chr8 chr8 4096 3 mask_anchor hg38 - - +29 chr9 138334717 138334718 chr9 chr9 4096 3 
mask_anchor hg38 - - +30 chr10 133787422 133787423 chr10 chr10 4096 3 mask_anchor hg38 - - +31 chr11 135076622 135076623 chr11 chr11 4096 3 mask_anchor hg38 - - +32 chr12 133265309 133265310 chr12 chr12 4096 3 mask_anchor hg38 - - +33 chr13 114354328 114354329 chr13 chr13 4096 3 mask_anchor hg38 - - +34 chr14 106883718 106883719 chr14 chr14 4096 3 mask_anchor hg38 - - +35 chr15 101981189 101981190 chr15 chr15 4096 3 mask_anchor hg38 - - +36 chr17 83247441 83247442 chr17 chr17 4096 3 mask_anchor hg38 - - +37 chr18 80263285 80263286 chr18 chr18 4096 3 mask_anchor hg38 - - +38 chr19 58607616 58607617 chr19 chr19 4096 3 mask_anchor hg38 - - +39 chr20 64334167 64334168 chr20 chr20 4096 3 mask_anchor hg38 - - +40 chr21 46699983 46699984 chr21 chr21 4096 3 mask_anchor hg38 - - +41 chr22 50808468 50808469 chr22 chr22 4096 3 mask_anchor hg38 - - +42 chrX 156030895 156030896 chrX chrX 4096 3 mask_anchor hg38 - - +43 chrY 57217415 57217416 chrY chrY 4096 3 mask_anchor hg38 - - +44 chr1 585988 585989 chr1 chr1 16384 5 riethman_match hg38 - inexact 45 chr2 10262 10263 chr2 chr2 16384 5 riethman_match hg38 - - 46 chr5 11807 11808 chr5 chr5 16384 5 riethman_match hg38 - - -47 chr6 60000 60001 chr6 chr6 16384 5 riethman_match hg38 - - -48 chr7 10232 10233 chr7 chr7 16384 5 riethman_match hg38 - - -49 chr8 60000 60001 chr8 chr8 16384 5 riethman_match hg38 - - +47 chr6 60000 60001 chr6 chr6 16384 5 riethman_match hg38 - inexact +48 chr7 10232 10233 chr7 chr7 16384 5 riethman_match hg38 - inexact +49 chr8 60000 60001 chr8 chr8 16384 5 riethman_match hg38 - inexact 50 chr9 10353 10354 chr9 chr9 16384 5 riethman_match hg38 - - 51 chr10 10419 10420 chr10 chr10 16384 5 riethman_match hg38 - - -52 chr11 60000 60001 chr11 chr11 16384 5 riethman_match hg38 - - +52 chr11 60000 60001 chr11 chr11 16384 5 riethman_match hg38 - inexact 53 chr12 10575 10576 chr12 chr12 16384 5 riethman_match hg38 - - 54 chr16 10027 10028 chr16 chr16 16384 5 riethman_match hg38 - - 55 chr18 10615 10616 chr18 chr18 
16384 5 riethman_match hg38 - - -56 chr20 79359 79360 chr20 chr20 16384 5 riethman_match hg38 - dup? -57 chr3 198235558 198235559 chr3 chr3 16384 3 riethman_match hg38 - - -58 chr4 190122583 190122584 chr4 chr4 16384 3 riethman_match hg38 - dup? +56 chr20 79359 79360 chr20 chr20 16384 5 riethman_match hg38 - inexact +57 chr3 198235558 198235559 chr3 chr3 16384 3 riethman_match hg38 - inexact +58 chr4 190122583 190122584 chr4 chr4 16384 3 riethman_match hg38 - inexact 59 chr7 159335873 159335874 chr7 chr7 16384 3 riethman_match hg38 - - 60 chr8 145073354 145073355 chr8 chr8 16384 3 riethman_match hg38 - - 61 chr11 135076569 135076570 chr11 chr11 16384 3 riethman_match hg38 - - 62 chr12 133264944 133264945 chr12 chr12 16384 3 riethman_match hg38 - - 63 chr15 101980819 101980820 chr15 chr15 16384 3 riethman_match hg38 - - 64 chr19 58607496 58607497 chr19 chr19 16384 3 riethman_match hg38 - - -65 chr20 64286708 64286709 chr20 chr20 16384 3 riethman_match hg38 - dup? +65 chr20 64286708 64286709 chr20 chr20 16384 3 riethman_match hg38 - inexact 66 chr21 46699874 46699875 chr21 chr21 16384 3 riethman_match hg38 - - 67 chr22 50807895 50807896 chr22 chr22 16384 3 riethman_match hg38 - - -68 chrX 156029891 156029892 chrX chrX 16384 3 riethman_match hg38 - - +68 chrX 156029891 156029892 chrX chrX 16384 3 riethman_match hg38 - inexact 69 chr12_GL877875v1_alt 49533 49534 chr12 chr12 8192 5 fork hg38 70 - 70 chr12 55530 55531 chr12 chr12 8192 5 fork hg38 69 - 71 chr14_KI270846v1_alt 825824 825825 chr14 chr14_KI270846v1_alt 8192 3 fork hg38 72 - @@ -93,7 +93,7 @@ 90 2qtel_1-500K_1_12_12_rc 499999 500000 chr2 chr2 16384 3 tel_fork riethman2014 - - 91 2qtel_1-500K_1_12_12_rc 468937 468938 chr2 chr2 8192 3 tel_fork riethman2014 92 - 92 chr2 242152486 242152487 chr2 chr2 8192 3 tel_fork hg38 91 - -93 9qtel_1-500K_1_12_12_rc 499999 500000 chr9 chr9 16384 3 tel_fork riethman2014 - - +93 9qtel_1-500K_1_12_12_rc 499999 500000 chr9 chr9 16384 3 tel_fork riethman2014 - inexact 94 
9qtel_1-500K_1_12_12_rc 433984 433985 chr9 chr9 8192 3 tel_fork riethman2014 95 - 95 chr9 138192962 138192963 chr9 chr9 8192 3 tel_fork hg38 94 - 96 10qtel_1-500K_1_12_12_rc 499999 500000 chr10 chr10 16384 3 tel_fork riethman2014 - - diff --git a/publications/methods-paper/COPYRIGHT b/assets/paper/COPYRIGHT similarity index 86% rename from publications/methods-paper/COPYRIGHT rename to assets/paper/COPYRIGHT index e045b5c..73255e5 100644 --- a/publications/methods-paper/COPYRIGHT +++ b/assets/paper/COPYRIGHT @@ -6,5 +6,5 @@ unless otherwise noted, are protected by U.S. and International copyright laws. Reproduction and distribution, with or without modification, of these files without a written permission of the authors is prohibited. -© 2020 Kirill Grigorev, Jonathan Foox, Christopher E. Mason +© 2021 Kirill Grigorev, Jonathan Foox, Christopher E. Mason Institute for Computational Biomedicine, Weill Cornell Medicine diff --git a/assets/paper/Snakefile b/assets/paper/Snakefile new file mode 100644 index 0000000..44b4122 --- /dev/null +++ b/assets/paper/Snakefile @@ -0,0 +1,82 @@ +from pandas import read_fwf +from io import StringIO + + +HG38EXT_ECX = "data/references/hg38/hg38ext.fa.ecx" +DATA_DIR = "data/datasets/2021" +MIN_MAP_OVERLAP = 500 +MIN_SUBTELOMERE_OVERLAP = 3000 +MIN_TELOMERE_OVERLAP = 3000 +MAX_READ_LENGTH = 100000 +TARGET = "tract_anchor" +N_MOTIFS_TO_PLOT = 3 +MIN_CHROM_COVERAGE = 25 +SMALLEST_P_VALUE = 5e-324 + +DATASETS = read_fwf(StringIO(str.strip(""" +group subject dataset priority +NA12878 HG001 11kb 1 +AshkenazimTrio HG002 10kb 1 +AshkenazimTrio HG002 15kb 1 +AshkenazimTrio HG002 15kb_20kb 1 +AshkenazimTrio HG003 15kb 1 +AshkenazimTrio HG003 15kb_20kb 2 +AshkenazimTrio HG004 15kb 1 +AshkenazimTrio HG004 15kb_21kb 1 +ChineseTrio HG005 11kb 1 +ChineseTrio HG006 15kb_20kb 1 +ChineseTrio HG006 hifi_google 1 +ChineseTrio HG007 15kb_20kb 1 +ChineseTrio HG007 hifi_google 1 +"""))) + +"""Fully cannibalized datasets: +AshkenazimTrio HG004 15kb_20kb 
+""" + +DATASETS["subject_pacbio_path"] = DATASETS.apply( + lambda row: "{}/PacBio/{}/{}".format(DATA_DIR, *row[:2]), axis=1, +) +DATASETS["dataset_pacbio_path"] = DATASETS.apply( + lambda row: "{}/PacBio/{}/{}/{}".format(DATA_DIR, *row[:3]), axis=1, +) + +wildcard_constraints: + group="[^/]+", subject="[^/]+", dataset="[^/]+", name="[^/]+", kind="[^/]+", + arm="[pq]_arm", + + +def get_sam_flags(arm, target=None): + if target: + if arm == "p_arm": + return "-f '{}' -F is_q".format(target) + elif arm == "q_arm": + return "-f is_q -f '{}'".format(target) + else: + raise ValueError("arm", arm) + else: + if arm == "p_arm": + return "-F is_q" + elif arm == "q_arm": + return "-f is_q" + else: + raise ValueError("arm", arm) + + +include: "assets/paper/snakefiles/longread-motifs.snake" +include: "assets/paper/snakefiles/shortread-support.snake" +include: "assets/paper/snakefiles/shortread-motifs.snake" +include: "assets/paper/snakefiles/bonferroni.snake" +include: "assets/paper/snakefiles/densityplots.snake" +include: "assets/paper/snakefiles/kmerscanner-all.snake" +include: "assets/paper/snakefiles/levenshtein.snake" + + +rule all: + input: + rules.densityplot_all.input, + rules.telbam_support_all.input, + rules.kmerscanner_all_motifs_all_subjects.input, + rules.kmerscanner_for_haploplots.input, + rules.levenshtein_all.input, + rules.repeatfinder_all_shortread.input, diff --git a/publications/methods-paper/environment-paper.yaml b/assets/paper/environment-paper.yaml similarity index 100% rename from publications/methods-paper/environment-paper.yaml rename to assets/paper/environment-paper.yaml diff --git a/assets/paper/figures/Figure_1.pdf b/assets/paper/figures/Figure_1.pdf new file mode 100644 index 0000000..31b4190 Binary files /dev/null and b/assets/paper/figures/Figure_1.pdf differ diff --git a/assets/paper/figures/Figure_2.pdf b/assets/paper/figures/Figure_2.pdf new file mode 100644 index 0000000..35548f9 Binary files /dev/null and 
b/assets/paper/figures/Figure_2.pdf differ diff --git a/assets/paper/figures/Figure_2.tex b/assets/paper/figures/Figure_2.tex new file mode 100644 index 0000000..01d69bd --- /dev/null +++ b/assets/paper/figures/Figure_2.tex @@ -0,0 +1,15 @@ +\documentclass{article} +\usepackage[paperheight=840pt,paperwidth=700pt,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\begin{document} +\begin{textblock}{13}(0.300,0.000)\includegraphics{Figure_2/densityplot-p_arm.pdf}\end{textblock} +\begin{textblock}{13}(-0.05,0.200)\LARGE{(A)}\end{textblock} +\begin{textblock}{13}(0.300,5.800)\includegraphics{Figure_2/densityplot-q_arm.pdf}\end{textblock} +\begin{textblock}{13}(-0.05,6.000)\LARGE{(B)}\end{textblock} +\begin{textblock}{13}(7.000,1.100)\includegraphics[width=2.30in,keepaspectratio]{Figure_2/densityplot-legend.pdf}\end{textblock} +\end{document} diff --git a/assets/paper/figures/Figure_2/densityplot-legend.pdf b/assets/paper/figures/Figure_2/densityplot-legend.pdf new file mode 100644 index 0000000..1310db7 Binary files /dev/null and b/assets/paper/figures/Figure_2/densityplot-legend.pdf differ diff --git a/assets/paper/figures/Figure_2/densityplot-p_arm.pdf b/assets/paper/figures/Figure_2/densityplot-p_arm.pdf new file mode 100644 index 0000000..38d85a6 Binary files /dev/null and b/assets/paper/figures/Figure_2/densityplot-p_arm.pdf differ diff --git a/assets/paper/figures/Figure_2/densityplot-q_arm.pdf b/assets/paper/figures/Figure_2/densityplot-q_arm.pdf new file mode 100644 index 0000000..a5d7bca Binary files /dev/null and b/assets/paper/figures/Figure_2/densityplot-q_arm.pdf differ diff --git a/assets/paper/figures/Figure_3.pdf b/assets/paper/figures/Figure_3.pdf new file mode 100644 index 0000000..cd54003 Binary files /dev/null and b/assets/paper/figures/Figure_3.pdf differ diff --git 
a/assets/paper/figures/Figure_4.pdf b/assets/paper/figures/Figure_4.pdf new file mode 100644 index 0000000..46579cb Binary files /dev/null and b/assets/paper/figures/Figure_4.pdf differ diff --git a/assets/paper/figures/Figure_4.tex b/assets/paper/figures/Figure_4.tex new file mode 100644 index 0000000..aa98cc0 --- /dev/null +++ b/assets/paper/figures/Figure_4.tex @@ -0,0 +1,20 @@ +\documentclass{article} +\usepackage[paperheight=14.724in,paperwidth=12.108in,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\begin{document} +\begin{textblock}{13}(-0.200,0.000)\includegraphics{Figure_4/chr2.pdf}\end{textblock} +\begin{textblock}{13}(-0.200,1.727)\includegraphics{Figure_4/3ptel_1-500K_1_12_12.pdf}\end{textblock} +\begin{textblock}{13}(-0.200,3.288)\includegraphics{Figure_4/4ptel_1-500K_1_12_12.pdf}\end{textblock} +\begin{textblock}{13}(-0.200,7.862)\includegraphics{Figure_4/chr5.pdf}\end{textblock} +\begin{textblock}{13}(-0.200,9.006)\includegraphics{Figure_4/chr9.pdf}\end{textblock} +\begin{textblock}{13}(-0.200,10.067)\includegraphics{Figure_4/chr12.pdf}\end{textblock} +\begin{textblock}{13}(-0.200,11.322)\includegraphics{Figure_4/17ptel_1_500K_1_12_12.pdf}\end{textblock} +\begin{textblock}{13}(7.750,0) +\includegraphics[width=4.000in,keepaspectratio]{Figure_4/legend.pdf} +\end{textblock} +\end{document} diff --git a/assets/paper/figures/Figure_4/17ptel_1_500K_1_12_12.pdf b/assets/paper/figures/Figure_4/17ptel_1_500K_1_12_12.pdf new file mode 100644 index 0000000..edb2997 Binary files /dev/null and b/assets/paper/figures/Figure_4/17ptel_1_500K_1_12_12.pdf differ diff --git a/assets/paper/figures/Figure_4/3ptel_1-500K_1_12_12.pdf b/assets/paper/figures/Figure_4/3ptel_1-500K_1_12_12.pdf new file mode 100644 index 0000000..9358b53 Binary files /dev/null and b/assets/paper/figures/Figure_4/3ptel_1-500K_1_12_12.pdf 
differ diff --git a/assets/paper/figures/Figure_4/4ptel_1-500K_1_12_12.pdf b/assets/paper/figures/Figure_4/4ptel_1-500K_1_12_12.pdf new file mode 100644 index 0000000..332f433 Binary files /dev/null and b/assets/paper/figures/Figure_4/4ptel_1-500K_1_12_12.pdf differ diff --git a/assets/paper/figures/Figure_4/chr12.pdf b/assets/paper/figures/Figure_4/chr12.pdf new file mode 100644 index 0000000..4f83e16 Binary files /dev/null and b/assets/paper/figures/Figure_4/chr12.pdf differ diff --git a/assets/paper/figures/Figure_4/chr2.pdf b/assets/paper/figures/Figure_4/chr2.pdf new file mode 100644 index 0000000..cfacff9 Binary files /dev/null and b/assets/paper/figures/Figure_4/chr2.pdf differ diff --git a/assets/paper/figures/Figure_4/chr5.pdf b/assets/paper/figures/Figure_4/chr5.pdf new file mode 100644 index 0000000..7ba1b71 Binary files /dev/null and b/assets/paper/figures/Figure_4/chr5.pdf differ diff --git a/assets/paper/figures/Figure_4/chr9.pdf b/assets/paper/figures/Figure_4/chr9.pdf new file mode 100644 index 0000000..7c82834 Binary files /dev/null and b/assets/paper/figures/Figure_4/chr9.pdf differ diff --git a/assets/paper/figures/Figure_4/legend.pdf b/assets/paper/figures/Figure_4/legend.pdf new file mode 100644 index 0000000..3faf3f7 Binary files /dev/null and b/assets/paper/figures/Figure_4/legend.pdf differ diff --git a/assets/paper/figures/Figure_5.pdf b/assets/paper/figures/Figure_5.pdf new file mode 100644 index 0000000..07daf62 Binary files /dev/null and b/assets/paper/figures/Figure_5.pdf differ diff --git a/assets/paper/figures/Figure_5.tex b/assets/paper/figures/Figure_5.tex new file mode 100644 index 0000000..aba3140 --- /dev/null +++ b/assets/paper/figures/Figure_5.tex @@ -0,0 +1,20 @@ +\documentclass{article} +\usepackage[paperheight=12.891in,paperwidth=10.236in,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + 
\setlength{\TPVertModule}{1in} +\begin{document} +\begin{textblock}{13}(0.175,0.000)\includegraphics{Figure_5/chr7.pdf}\end{textblock} +\begin{textblock}{13}(0.869,2.394)\includegraphics{Figure_5/chr8.pdf}\end{textblock} +\begin{textblock}{13}(-0.200,3.857)\includegraphics{Figure_5/chr11.pdf}\end{textblock} +\begin{textblock}{13}(0.772,6.098)\includegraphics{Figure_5/chr12.pdf}\end{textblock} +\begin{textblock}{13}(0.578,7.631)\includegraphics{Figure_5/14qtel_1-500K_1_12_12_rc.pdf}\end{textblock} +\begin{textblock}{13}(1.411,9.3)\includegraphics{Figure_5/chr15.pdf}\end{textblock} +\begin{textblock}{13}(0.356,10.530)\includegraphics{Figure_5/18qtel_1-500K_1_12_12_rc.pdf}\end{textblock} +\begin{textblock}{13}(6.456,0) +\includegraphics[width=3.600in,keepaspectratio]{Figure_5/legend.pdf} +\end{textblock} +\end{document} diff --git a/assets/paper/figures/Figure_5/14qtel_1-500K_1_12_12_rc.pdf b/assets/paper/figures/Figure_5/14qtel_1-500K_1_12_12_rc.pdf new file mode 100644 index 0000000..61076fe Binary files /dev/null and b/assets/paper/figures/Figure_5/14qtel_1-500K_1_12_12_rc.pdf differ diff --git a/assets/paper/figures/Figure_5/18qtel_1-500K_1_12_12_rc.pdf b/assets/paper/figures/Figure_5/18qtel_1-500K_1_12_12_rc.pdf new file mode 100644 index 0000000..049ae15 Binary files /dev/null and b/assets/paper/figures/Figure_5/18qtel_1-500K_1_12_12_rc.pdf differ diff --git a/assets/paper/figures/Figure_5/chr11.pdf b/assets/paper/figures/Figure_5/chr11.pdf new file mode 100644 index 0000000..bcd174d Binary files /dev/null and b/assets/paper/figures/Figure_5/chr11.pdf differ diff --git a/assets/paper/figures/Figure_5/chr12.pdf b/assets/paper/figures/Figure_5/chr12.pdf new file mode 100644 index 0000000..a3462c5 Binary files /dev/null and b/assets/paper/figures/Figure_5/chr12.pdf differ diff --git a/assets/paper/figures/Figure_5/chr15.pdf b/assets/paper/figures/Figure_5/chr15.pdf new file mode 100644 index 0000000..23b817c Binary files /dev/null and 
b/assets/paper/figures/Figure_5/chr15.pdf differ diff --git a/assets/paper/figures/Figure_5/chr7.pdf b/assets/paper/figures/Figure_5/chr7.pdf new file mode 100644 index 0000000..726301f Binary files /dev/null and b/assets/paper/figures/Figure_5/chr7.pdf differ diff --git a/assets/paper/figures/Figure_5/chr8.pdf b/assets/paper/figures/Figure_5/chr8.pdf new file mode 100644 index 0000000..7b14322 Binary files /dev/null and b/assets/paper/figures/Figure_5/chr8.pdf differ diff --git a/assets/paper/figures/Figure_5/legend.pdf b/assets/paper/figures/Figure_5/legend.pdf new file mode 100644 index 0000000..88efe36 Binary files /dev/null and b/assets/paper/figures/Figure_5/legend.pdf differ diff --git a/assets/paper/figures/Figure_R1.pdf b/assets/paper/figures/Figure_R1.pdf new file mode 100644 index 0000000..e478a22 Binary files /dev/null and b/assets/paper/figures/Figure_R1.pdf differ diff --git a/assets/paper/figures/Figure_S1-nolegend.pdf b/assets/paper/figures/Figure_S1-nolegend.pdf new file mode 100644 index 0000000..a8a5edf Binary files /dev/null and b/assets/paper/figures/Figure_S1-nolegend.pdf differ diff --git a/assets/paper/figures/Figure_S1-nolegend.tex b/assets/paper/figures/Figure_S1-nolegend.tex new file mode 100644 index 0000000..aff8319 --- /dev/null +++ b/assets/paper/figures/Figure_S1-nolegend.tex @@ -0,0 +1,20 @@ +\documentclass{article} +\usepackage[paperheight=4.45in,paperwidth=13in,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} + +\begin{document} + +\begin{textblock}{13}(.35, 1.61) \includegraphics{Figure_S1/p_arm-support.png} \end{textblock} +\begin{textblock}{13}(-.1, 1.71) \LARGE{(A)} \end{textblock} + +\begin{textblock}{13}(6.5, 0.00) \includegraphics{Figure_S1/q_arm-support.png} \end{textblock} +\begin{textblock}{13}(7.7, 0.25) \LARGE{(B)} \end{textblock} + +\begin{textblock}{13}(.45, 0.30) 
\includegraphics[width=2.300in,keepaspectratio]{Figure_S1/support-legend.pdf} \end{textblock} + +\end{document} diff --git a/assets/paper/figures/Figure_S1/p_arm-support.png b/assets/paper/figures/Figure_S1/p_arm-support.png new file mode 100644 index 0000000..5cde501 Binary files /dev/null and b/assets/paper/figures/Figure_S1/p_arm-support.png differ diff --git a/assets/paper/figures/Figure_S1/q_arm-support.png b/assets/paper/figures/Figure_S1/q_arm-support.png new file mode 100644 index 0000000..9f71ab6 Binary files /dev/null and b/assets/paper/figures/Figure_S1/q_arm-support.png differ diff --git a/assets/paper/figures/Figure_S1/support-legend.pdf b/assets/paper/figures/Figure_S1/support-legend.pdf new file mode 100644 index 0000000..2c71ea7 Binary files /dev/null and b/assets/paper/figures/Figure_S1/support-legend.pdf differ diff --git a/assets/paper/figures/Figure_S2-nolegend.pdf b/assets/paper/figures/Figure_S2-nolegend.pdf new file mode 100644 index 0000000..5817ffd Binary files /dev/null and b/assets/paper/figures/Figure_S2-nolegend.pdf differ diff --git a/assets/paper/figures/Figure_S2-nolegend.tex b/assets/paper/figures/Figure_S2-nolegend.tex new file mode 100644 index 0000000..4c5bc25 --- /dev/null +++ b/assets/paper/figures/Figure_S2-nolegend.tex @@ -0,0 +1,38 @@ +\documentclass{article} +\usepackage[paperheight=1255pt,paperwidth=1220pt,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage{ulem} + \renewcommand{\ULdepth}{7pt} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\begin{document} + +\begin{textblock}{13}(0.300,0.000)\includegraphics{Figure_S2/HG001/densityplot-p_arm.pdf}\end{textblock} +\begin{textblock}{13}(-0.05,0.200)\LARGE{(A)}\end{textblock} +\begin{textblock}{13}(-0.05,0.550)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG001\hspace*{140pt}}}}\end{textblock} + 
+\begin{textblock}{13}(9.000,0.000)\includegraphics{Figure_S2/HG003/densityplot-p_arm.pdf}\end{textblock} +\begin{textblock}{13}(8.650,0.200)\LARGE{(B)}\end{textblock} +\begin{textblock}{13}(8.650,0.550)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG003\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(0.300,5.800)\includegraphics{Figure_S2/HG004/densityplot-p_arm.pdf}\end{textblock} +\begin{textblock}{13}(-0.05,6.000)\LARGE{(C)}\end{textblock} +\begin{textblock}{13}(-0.05,6.350)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG004\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(9.000,5.800)\includegraphics{Figure_S2/HG005/densityplot-p_arm.pdf}\end{textblock} +\begin{textblock}{13}(8.650,6.000)\LARGE{(D)}\end{textblock} +\begin{textblock}{13}(8.650,6.350)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG005\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(0.300,11.60)\includegraphics{Figure_S2/HG006/densityplot-p_arm.pdf}\end{textblock} +\begin{textblock}{13}(-0.05,11.80)\LARGE{(E)}\end{textblock} +\begin{textblock}{13}(-0.05,12.15)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG006\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(9.000,11.60)\includegraphics{Figure_S2/HG007/densityplot-p_arm.pdf}\end{textblock} +\begin{textblock}{13}(8.650,11.80)\LARGE{(F)}\end{textblock} +\begin{textblock}{13}(8.650,12.15)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG007\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(7.000,1.000)\includegraphics[width=1.60in,keepaspectratio]{Figure_S2/densityplot-p_arm-legend.pdf}\end{textblock} +\end{document} diff --git a/assets/paper/figures/Figure_S2/HG001/densityplot-p_arm.pdf b/assets/paper/figures/Figure_S2/HG001/densityplot-p_arm.pdf new file mode 100644 index 0000000..7d4ae8c Binary files /dev/null and b/assets/paper/figures/Figure_S2/HG001/densityplot-p_arm.pdf differ diff --git a/assets/paper/figures/Figure_S2/HG003/densityplot-p_arm.pdf b/assets/paper/figures/Figure_S2/HG003/densityplot-p_arm.pdf 
new file mode 100644 index 0000000..394e4f2 Binary files /dev/null and b/assets/paper/figures/Figure_S2/HG003/densityplot-p_arm.pdf differ diff --git a/assets/paper/figures/Figure_S2/HG004/densityplot-p_arm.pdf b/assets/paper/figures/Figure_S2/HG004/densityplot-p_arm.pdf new file mode 100644 index 0000000..ed0a22c Binary files /dev/null and b/assets/paper/figures/Figure_S2/HG004/densityplot-p_arm.pdf differ diff --git a/assets/paper/figures/Figure_S2/HG005/densityplot-p_arm.pdf b/assets/paper/figures/Figure_S2/HG005/densityplot-p_arm.pdf new file mode 100644 index 0000000..970fbb0 Binary files /dev/null and b/assets/paper/figures/Figure_S2/HG005/densityplot-p_arm.pdf differ diff --git a/assets/paper/figures/Figure_S2/HG006/densityplot-p_arm.pdf b/assets/paper/figures/Figure_S2/HG006/densityplot-p_arm.pdf new file mode 100644 index 0000000..3d52058 Binary files /dev/null and b/assets/paper/figures/Figure_S2/HG006/densityplot-p_arm.pdf differ diff --git a/assets/paper/figures/Figure_S2/HG007/densityplot-p_arm.pdf b/assets/paper/figures/Figure_S2/HG007/densityplot-p_arm.pdf new file mode 100644 index 0000000..617fb09 Binary files /dev/null and b/assets/paper/figures/Figure_S2/HG007/densityplot-p_arm.pdf differ diff --git a/assets/paper/figures/Figure_S2/densityplot-p_arm-legend.pdf b/assets/paper/figures/Figure_S2/densityplot-p_arm-legend.pdf new file mode 100644 index 0000000..ac6f84d Binary files /dev/null and b/assets/paper/figures/Figure_S2/densityplot-p_arm-legend.pdf differ diff --git a/assets/paper/figures/Figure_S3-nolegend.pdf b/assets/paper/figures/Figure_S3-nolegend.pdf new file mode 100644 index 0000000..07f5ea6 Binary files /dev/null and b/assets/paper/figures/Figure_S3-nolegend.pdf differ diff --git a/assets/paper/figures/Figure_S3-nolegend.tex b/assets/paper/figures/Figure_S3-nolegend.tex new file mode 100644 index 0000000..c5b9885 --- /dev/null +++ b/assets/paper/figures/Figure_S3-nolegend.tex @@ -0,0 +1,38 @@ +\documentclass{article} 
+\usepackage[paperheight=1255pt,paperwidth=1220pt,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage{ulem} + \renewcommand{\ULdepth}{7pt} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\begin{document} + +\begin{textblock}{13}(0.300,0.000)\includegraphics{Figure_S3/HG001/densityplot-q_arm.pdf}\end{textblock} +\begin{textblock}{13}(-0.05,0.200)\LARGE{(A)}\end{textblock} +\begin{textblock}{13}(-0.05,0.550)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG001\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(9.000,0.000)\includegraphics{Figure_S3/HG003/densityplot-q_arm.pdf}\end{textblock} +\begin{textblock}{13}(8.650,0.200)\LARGE{(B)}\end{textblock} +\begin{textblock}{13}(8.650,0.550)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG003\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(0.300,5.800)\includegraphics{Figure_S3/HG004/densityplot-q_arm.pdf}\end{textblock} +\begin{textblock}{13}(-0.05,6.000)\LARGE{(C)}\end{textblock} +\begin{textblock}{13}(-0.05,6.350)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG004\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(9.000,5.800)\includegraphics{Figure_S3/HG005/densityplot-q_arm.pdf}\end{textblock} +\begin{textblock}{13}(8.650,6.000)\LARGE{(D)}\end{textblock} +\begin{textblock}{13}(8.650,6.350)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG005\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(0.300,11.60)\includegraphics{Figure_S3/HG006/densityplot-q_arm.pdf}\end{textblock} +\begin{textblock}{13}(-0.05,11.80)\LARGE{(E)}\end{textblock} +\begin{textblock}{13}(-0.05,12.15)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG006\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(9.000,11.60)\includegraphics{Figure_S3/HG007/densityplot-q_arm.pdf}\end{textblock} +\begin{textblock}{13}(8.650,11.80)\LARGE{(F)}\end{textblock} 
+\begin{textblock}{13}(8.650,12.15)\rotatebox{90}{\Large{\uline{\hspace*{140pt}HG007\hspace*{140pt}}}}\end{textblock} + +\begin{textblock}{13}(0.900,0.150)\includegraphics[width=1.60in,keepaspectratio]{Figure_S3/densityplot-q_arm-legend.pdf}\end{textblock} +\end{document} diff --git a/assets/paper/figures/Figure_S3/HG001/densityplot-q_arm.pdf b/assets/paper/figures/Figure_S3/HG001/densityplot-q_arm.pdf new file mode 100644 index 0000000..36cdc06 Binary files /dev/null and b/assets/paper/figures/Figure_S3/HG001/densityplot-q_arm.pdf differ diff --git a/assets/paper/figures/Figure_S3/HG003/densityplot-q_arm.pdf b/assets/paper/figures/Figure_S3/HG003/densityplot-q_arm.pdf new file mode 100644 index 0000000..b7f52b7 Binary files /dev/null and b/assets/paper/figures/Figure_S3/HG003/densityplot-q_arm.pdf differ diff --git a/assets/paper/figures/Figure_S3/HG004/densityplot-q_arm.pdf b/assets/paper/figures/Figure_S3/HG004/densityplot-q_arm.pdf new file mode 100644 index 0000000..f0fe690 Binary files /dev/null and b/assets/paper/figures/Figure_S3/HG004/densityplot-q_arm.pdf differ diff --git a/assets/paper/figures/Figure_S3/HG005/densityplot-q_arm.pdf b/assets/paper/figures/Figure_S3/HG005/densityplot-q_arm.pdf new file mode 100644 index 0000000..eda202c Binary files /dev/null and b/assets/paper/figures/Figure_S3/HG005/densityplot-q_arm.pdf differ diff --git a/assets/paper/figures/Figure_S3/HG006/densityplot-q_arm.pdf b/assets/paper/figures/Figure_S3/HG006/densityplot-q_arm.pdf new file mode 100644 index 0000000..1225c5d Binary files /dev/null and b/assets/paper/figures/Figure_S3/HG006/densityplot-q_arm.pdf differ diff --git a/assets/paper/figures/Figure_S3/HG007/densityplot-q_arm.pdf b/assets/paper/figures/Figure_S3/HG007/densityplot-q_arm.pdf new file mode 100644 index 0000000..5382c12 Binary files /dev/null and b/assets/paper/figures/Figure_S3/HG007/densityplot-q_arm.pdf differ diff --git a/assets/paper/figures/Figure_S3/densityplot-q_arm-legend.pdf 
b/assets/paper/figures/Figure_S3/densityplot-q_arm-legend.pdf new file mode 100644 index 0000000..588bd84 Binary files /dev/null and b/assets/paper/figures/Figure_S3/densityplot-q_arm-legend.pdf differ diff --git a/assets/paper/figures/Figure_S4-nolegend.pdf b/assets/paper/figures/Figure_S4-nolegend.pdf new file mode 100644 index 0000000..78d6e69 Binary files /dev/null and b/assets/paper/figures/Figure_S4-nolegend.pdf differ diff --git a/assets/paper/figures/Figure_S4-nolegend.tex b/assets/paper/figures/Figure_S4-nolegend.tex new file mode 100644 index 0000000..7ba2099 --- /dev/null +++ b/assets/paper/figures/Figure_S4-nolegend.tex @@ -0,0 +1,67 @@ +\documentclass{article} +\usepackage[paperheight=15.038in,paperwidth=11.375in,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage{ulem} + \renewcommand{\ULdepth}{7pt} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\makeatletter + \newcommand*{\textoverline}[1]{$\overline{\hbox{#1\vphantom{\"A}}}\m@th$} + \makeatother +\begin{document} +\begin{textblock}{13}(0,0.050)\includegraphics{./Figure_S4/chr2-HG002.pdf}\end{textblock} +\begin{textblock}{13}(0,1.112)\includegraphics{./Figure_S4/chr2-HG003.pdf}\end{textblock} +\begin{textblock}{13}(0,1.243)\includegraphics{./Figure_S4/chr2-HG004.pdf}\end{textblock} +\begin{textblock}{13}(0,1.402)\includegraphics{./Figure_S4/chr2-HG005.pdf}\end{textblock} +\begin{textblock}{13}(-0.11,1.477)\includegraphics{./Figure_S4/chr2-HG007.pdf}\end{textblock} +\begin{textblock}{13}(0,1.821)\includegraphics{./Figure_S4/3ptel_1-500K_1_12_12-HG001.pdf}\end{textblock} +\begin{textblock}{13}(0,1.897)\includegraphics{./Figure_S4/3ptel_1-500K_1_12_12-HG002.pdf}\end{textblock} +\begin{textblock}{13}(0,2.444)\includegraphics{./Figure_S4/3ptel_1-500K_1_12_12-HG003.pdf}\end{textblock} +\begin{textblock}{13}(0,2.603)\includegraphics{./Figure_S4/3ptel_1-500K_1_12_12-HG004.pdf}\end{textblock} 
+\begin{textblock}{13}(0,2.818)\includegraphics{./Figure_S4/3ptel_1-500K_1_12_12-HG006.pdf}\end{textblock} +\begin{textblock}{13}(-0.11,2.963)\includegraphics{./Figure_S4/3ptel_1-500K_1_12_12-HG007.pdf}\end{textblock} +\begin{textblock}{13}(0,3.473)\includegraphics{./Figure_S4/4ptel_1-500K_1_12_12-HG001.pdf}\end{textblock} +\begin{textblock}{13}(0,3.841)\includegraphics{./Figure_S4/4ptel_1-500K_1_12_12-HG002.pdf}\end{textblock} +\begin{textblock}{13}(0,5.958)\includegraphics{./Figure_S4/4ptel_1-500K_1_12_12-HG003.pdf}\end{textblock} +\begin{textblock}{13}(0,6.714)\includegraphics{./Figure_S4/4ptel_1-500K_1_12_12-HG004.pdf}\end{textblock} +\begin{textblock}{13}(0,7.109)\includegraphics{./Figure_S4/4ptel_1-500K_1_12_12-HG005.pdf}\end{textblock} +\begin{textblock}{13}(0,7.587)\includegraphics{./Figure_S4/4ptel_1-500K_1_12_12-HG006.pdf}\end{textblock} +\begin{textblock}{13}(-0.11,7.774)\includegraphics{./Figure_S4/4ptel_1-500K_1_12_12-HG007.pdf}\end{textblock} +\begin{textblock}{13}(0,8.173)\includegraphics{./Figure_S4/chr5-HG001.pdf}\end{textblock} +\begin{textblock}{13}(0,8.249)\includegraphics{./Figure_S4/chr5-HG002.pdf}\end{textblock} +\begin{textblock}{13}(0,8.797)\includegraphics{./Figure_S4/chr5-HG003.pdf}\end{textblock} +\begin{textblock}{13}(0,8.928)\includegraphics{./Figure_S4/chr5-HG005.pdf}\end{textblock} +\begin{textblock}{13}(-0.11,9.017)\includegraphics{./Figure_S4/chr5-HG006.pdf}\end{textblock} +\begin{textblock}{13}(0,9.375)\includegraphics{./Figure_S4/chr9-HG001.pdf}\end{textblock} +\begin{textblock}{13}(0,9.617)\includegraphics{./Figure_S4/chr9-HG002.pdf}\end{textblock} +\begin{textblock}{13}(0,9.957)\includegraphics{./Figure_S4/chr9-HG003.pdf}\end{textblock} +\begin{textblock}{13}(0,10.018)\includegraphics{./Figure_S4/chr9-HG004.pdf}\end{textblock} +\begin{textblock}{13}(-0.11,10.080)\includegraphics{./Figure_S4/chr9-HG005.pdf}\end{textblock} +\begin{textblock}{13}(0,10.438)\includegraphics{./Figure_S4/chr12-HG001.pdf}\end{textblock} 
+\begin{textblock}{13}(0,10.527)\includegraphics{./Figure_S4/chr12-HG002.pdf}\end{textblock} +\begin{textblock}{13}(0,10.742)\includegraphics{./Figure_S4/chr12-HG003.pdf}\end{textblock} +\begin{textblock}{13}(0,10.942)\includegraphics{./Figure_S4/chr12-HG004.pdf}\end{textblock} +\begin{textblock}{13}(0,11.073)\includegraphics{./Figure_S4/chr12-HG005.pdf}\end{textblock} +\begin{textblock}{13}(0,11.163)\includegraphics{./Figure_S4/chr12-HG006.pdf}\end{textblock} +\begin{textblock}{13}(-0.11,11.308)\includegraphics{./Figure_S4/chr12-HG007.pdf}\end{textblock} +\begin{textblock}{13}(0,11.679)\includegraphics{./Figure_S4/17ptel_1_500K_1_12_12-HG001.pdf}\end{textblock} +\begin{textblock}{13}(0,11.755)\includegraphics{./Figure_S4/17ptel_1_500K_1_12_12-HG002.pdf}\end{textblock} +\begin{textblock}{13}(0,12.622)\includegraphics{./Figure_S4/17ptel_1_500K_1_12_12-HG003.pdf}\end{textblock} +\begin{textblock}{13}(0,13.114)\includegraphics{./Figure_S4/17ptel_1_500K_1_12_12-HG004.pdf}\end{textblock} +\begin{textblock}{13}(0,13.454)\includegraphics{./Figure_S4/17ptel_1_500K_1_12_12-HG005.pdf}\end{textblock} +\begin{textblock}{13}(0,13.696)\includegraphics{./Figure_S4/17ptel_1_500K_1_12_12-HG006.pdf}\end{textblock} +\begin{textblock}{13}(-0.11,14.063)\includegraphics{./Figure_S4/17ptel_1_500K_1_12_12-HG007.pdf}\end{textblock} +\begin{textblock}{13}(6.465148181818182,0.47)\rotatebox{90}{\Large{\textoverline{\hspace*{7.8pt}2p (chr2)\hspace*{7.8pt}}}}\end{textblock} +\begin{textblock}{13}(6.4024209090909095,1.8211146)\rotatebox{90}{\Large{\textoverline{\hspace*{7.8pt}3p (3ptel\_1...)\hspace*{7.8pt}}}}\end{textblock} +\begin{textblock}{13}(8.531310909090909,3.4733371999999996)\rotatebox{90}{\Large{\textoverline{\hspace*{117.0pt}4p (4ptel\_1...)\hspace*{117.0pt}}}}\end{textblock} +\begin{textblock}{13}(6.365148181818182,8.1733342)\rotatebox{90}{\Large{\textoverline{\hspace*{3.9pt}5p (chr5)\hspace*{3.9pt}}}}\end{textblock} 
+\begin{textblock}{13}(6.037378181818182,9.375001199999998)\rotatebox{90}{\Large{\textoverline{\hspace*{0.0pt}9p (chr9)\hspace*{0.0pt}}}}\end{textblock} +\begin{textblock}{13}(5.9,10.437778599999998)\rotatebox{90}{\Large{\textoverline{\hspace*{0.0pt}12p (chr12)\hspace*{0.0pt}}}}\end{textblock} +\begin{textblock}{13}(6.8,11.679445399999999)\rotatebox{90}{\Large{\textoverline{\hspace*{50.699999999999996pt}17p (17ptel\_1...)\hspace*{50.699999999999996pt}}}}\end{textblock} +\begin{textblock}{13}(7.070,.4) +\includegraphics[width=4.100in,keepaspectratio]{Figure_4/legend.pdf} +\end{textblock} +\end{document} diff --git a/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG001.pdf b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG001.pdf new file mode 100644 index 0000000..dee0405 Binary files /dev/null and b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG002.pdf b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG002.pdf new file mode 100644 index 0000000..346ecae Binary files /dev/null and b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG003.pdf b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG003.pdf new file mode 100644 index 0000000..1d93c75 Binary files /dev/null and b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG004.pdf b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG004.pdf new file mode 100644 index 0000000..81d2450 Binary files /dev/null and b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG005.pdf b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG005.pdf new file mode 100644 index 0000000..879013b Binary files /dev/null and 
b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG006.pdf b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG006.pdf new file mode 100644 index 0000000..a1d9791 Binary files /dev/null and b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG007.pdf b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG007.pdf new file mode 100644 index 0000000..9a658a2 Binary files /dev/null and b/assets/paper/figures/Figure_S4/17ptel_1_500K_1_12_12-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG001.pdf b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG001.pdf new file mode 100644 index 0000000..310b6c2 Binary files /dev/null and b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG002.pdf b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG002.pdf new file mode 100644 index 0000000..2734cf5 Binary files /dev/null and b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG003.pdf b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG003.pdf new file mode 100644 index 0000000..8781bb2 Binary files /dev/null and b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG004.pdf b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG004.pdf new file mode 100644 index 0000000..c3410d3 Binary files /dev/null and b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG006.pdf b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG006.pdf new file mode 100644 index 0000000..87913f3 Binary files /dev/null and 
b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG007.pdf b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG007.pdf new file mode 100644 index 0000000..cc7de37 Binary files /dev/null and b/assets/paper/figures/Figure_S4/3ptel_1-500K_1_12_12-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG001.pdf b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG001.pdf new file mode 100644 index 0000000..35b2346 Binary files /dev/null and b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG002.pdf b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG002.pdf new file mode 100644 index 0000000..fb66129 Binary files /dev/null and b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG003.pdf b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG003.pdf new file mode 100644 index 0000000..97c9f9c Binary files /dev/null and b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG004.pdf b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG004.pdf new file mode 100644 index 0000000..7cf77a9 Binary files /dev/null and b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG005.pdf b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG005.pdf new file mode 100644 index 0000000..fd78c2b Binary files /dev/null and b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG006.pdf b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG006.pdf new file mode 100644 index 0000000..d2176d5 Binary files /dev/null and 
b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG007.pdf b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG007.pdf new file mode 100644 index 0000000..b3f1ac4 Binary files /dev/null and b/assets/paper/figures/Figure_S4/4ptel_1-500K_1_12_12-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr12-HG001.pdf b/assets/paper/figures/Figure_S4/chr12-HG001.pdf new file mode 100644 index 0000000..abb3bef Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr12-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr12-HG002.pdf b/assets/paper/figures/Figure_S4/chr12-HG002.pdf new file mode 100644 index 0000000..dd553f8 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr12-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr12-HG003.pdf b/assets/paper/figures/Figure_S4/chr12-HG003.pdf new file mode 100644 index 0000000..6f0a5b7 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr12-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr12-HG004.pdf b/assets/paper/figures/Figure_S4/chr12-HG004.pdf new file mode 100644 index 0000000..9399a23 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr12-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr12-HG005.pdf b/assets/paper/figures/Figure_S4/chr12-HG005.pdf new file mode 100644 index 0000000..0c8a670 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr12-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr12-HG006.pdf b/assets/paper/figures/Figure_S4/chr12-HG006.pdf new file mode 100644 index 0000000..e0bb080 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr12-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr12-HG007.pdf b/assets/paper/figures/Figure_S4/chr12-HG007.pdf new file mode 100644 index 0000000..38c8fa1 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr12-HG007.pdf 
differ diff --git a/assets/paper/figures/Figure_S4/chr2-HG002.pdf b/assets/paper/figures/Figure_S4/chr2-HG002.pdf new file mode 100644 index 0000000..7f49c5f Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr2-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr2-HG003.pdf b/assets/paper/figures/Figure_S4/chr2-HG003.pdf new file mode 100644 index 0000000..26bdb7c Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr2-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr2-HG004.pdf b/assets/paper/figures/Figure_S4/chr2-HG004.pdf new file mode 100644 index 0000000..a06c081 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr2-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr2-HG005.pdf b/assets/paper/figures/Figure_S4/chr2-HG005.pdf new file mode 100644 index 0000000..69b54cd Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr2-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr2-HG007.pdf b/assets/paper/figures/Figure_S4/chr2-HG007.pdf new file mode 100644 index 0000000..58d51f9 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr2-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr5-HG001.pdf b/assets/paper/figures/Figure_S4/chr5-HG001.pdf new file mode 100644 index 0000000..2fd6d93 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr5-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr5-HG002.pdf b/assets/paper/figures/Figure_S4/chr5-HG002.pdf new file mode 100644 index 0000000..0a3e4b5 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr5-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr5-HG003.pdf b/assets/paper/figures/Figure_S4/chr5-HG003.pdf new file mode 100644 index 0000000..260da08 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr5-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr5-HG005.pdf b/assets/paper/figures/Figure_S4/chr5-HG005.pdf new file mode 100644 
index 0000000..fd2c86a Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr5-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr5-HG006.pdf b/assets/paper/figures/Figure_S4/chr5-HG006.pdf new file mode 100644 index 0000000..b326273 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr5-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr9-HG001.pdf b/assets/paper/figures/Figure_S4/chr9-HG001.pdf new file mode 100644 index 0000000..059cfd3 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr9-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr9-HG002.pdf b/assets/paper/figures/Figure_S4/chr9-HG002.pdf new file mode 100644 index 0000000..855eead Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr9-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr9-HG003.pdf b/assets/paper/figures/Figure_S4/chr9-HG003.pdf new file mode 100644 index 0000000..747a332 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr9-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr9-HG004.pdf b/assets/paper/figures/Figure_S4/chr9-HG004.pdf new file mode 100644 index 0000000..3a2d9d6 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr9-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S4/chr9-HG005.pdf b/assets/paper/figures/Figure_S4/chr9-HG005.pdf new file mode 100644 index 0000000..8dcf5c7 Binary files /dev/null and b/assets/paper/figures/Figure_S4/chr9-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S5-nolegend.pdf b/assets/paper/figures/Figure_S5-nolegend.pdf new file mode 100644 index 0000000..d75b29c Binary files /dev/null and b/assets/paper/figures/Figure_S5-nolegend.pdf differ diff --git a/assets/paper/figures/Figure_S5-nolegend.tex b/assets/paper/figures/Figure_S5-nolegend.tex new file mode 100644 index 0000000..4cc24e5 --- /dev/null +++ b/assets/paper/figures/Figure_S5-nolegend.tex @@ -0,0 +1,68 @@ +\documentclass{article} 
+\usepackage[paperheight=13.423in,paperwidth=10.514in,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage{ulem} + \renewcommand{\ULdepth}{7pt} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\begin{document} +\begin{textblock}{13}(0.0,0.050)\includegraphics{./Figure_S5/chr7-HG001.pdf}\end{textblock} +\begin{textblock}{13}(0.8194499999999998,0.598)\includegraphics{./Figure_S5/chr7-HG002.pdf}\end{textblock} +\begin{textblock}{13}(1.76389,1.493)\includegraphics{./Figure_S5/chr7-HG003.pdf}\end{textblock} +\begin{textblock}{13}(1.73611,1.693)\includegraphics{./Figure_S5/chr7-HG004.pdf}\end{textblock} +\begin{textblock}{13}(1.8333300000000001,1.908)\includegraphics{./Figure_S5/chr7-HG006.pdf}\end{textblock} +\begin{textblock}{13}(1.7844499999999999,2.053)\includegraphics{./Figure_S5/chr7-HG007.pdf}\end{textblock} +\begin{textblock}{13}(1.88889,2.522)\includegraphics{./Figure_S5/chr8-HG001.pdf}\end{textblock} +\begin{textblock}{13}(1.1944499999999998,2.639)\includegraphics{./Figure_S5/chr8-HG002.pdf}\end{textblock} +\begin{textblock}{13}(1.86111,3.256)\includegraphics{./Figure_S5/chr8-HG003.pdf}\end{textblock} +\begin{textblock}{13}(1.8135599999999998,3.387)\includegraphics{./Figure_S5/chr8-HG004.pdf}\end{textblock} +\begin{textblock}{13}(1.9305599999999998,3.546)\includegraphics{./Figure_S5/chr8-HG005.pdf}\end{textblock} +\begin{textblock}{13}(1.8333300000000001,3.622)\includegraphics{./Figure_S5/chr8-HG006.pdf}\end{textblock} +\begin{textblock}{13}(1.965,3.767)\includegraphics{./Figure_S5/chr8-HG007.pdf}\end{textblock} +\begin{textblock}{13}(1.9166699999999999,4.111)\includegraphics{./Figure_S5/chr11-HG001.pdf}\end{textblock} +\begin{textblock}{13}(1.0416699999999999,4.200)\includegraphics{./Figure_S5/chr11-HG002.pdf}\end{textblock} +\begin{textblock}{13}(1.4166699999999999,4.928)\includegraphics{./Figure_S5/chr11-HG003.pdf}\end{textblock} 
+\begin{textblock}{13}(1.73611,5.379)\includegraphics{./Figure_S5/chr11-HG004.pdf}\end{textblock} +\begin{textblock}{13}(1.8125599999999998,5.593)\includegraphics{./Figure_S5/chr11-HG005.pdf}\end{textblock} +\begin{textblock}{13}(1.8125599999999998,5.752)\includegraphics{./Figure_S5/chr11-HG006.pdf}\end{textblock} +\begin{textblock}{13}(1.6872200000000002,5.911)\includegraphics{./Figure_S5/chr11-HG007.pdf}\end{textblock} +\begin{textblock}{13}(1.88889,6.449)\includegraphics{./Figure_S5/chr12-HG001.pdf}\end{textblock} +\begin{textblock}{13}(1.22222,6.567)\includegraphics{./Figure_S5/chr12-HG002.pdf}\end{textblock} +\begin{textblock}{13}(1.9166699999999999,7.170)\includegraphics{./Figure_S5/chr12-HG003.pdf}\end{textblock} +\begin{textblock}{13}(1.88889,7.259)\includegraphics{./Figure_S5/chr12-HG004.pdf}\end{textblock} +\begin{textblock}{13}(1.88889,7.377)\includegraphics{./Figure_S5/chr12-HG005.pdf}\end{textblock} +\begin{textblock}{13}(1.88889,7.494)\includegraphics{./Figure_S5/chr12-HG006.pdf}\end{textblock} +\begin{textblock}{13}(1.715,7.611)\includegraphics{./Figure_S5/chr12-HG007.pdf}\end{textblock} +\begin{textblock}{13}(1.9305599999999998,8.136)\includegraphics{./Figure_S5/14qtel_1-500K_1_12_12_rc-HG001.pdf}\end{textblock} +\begin{textblock}{13}(1.0694499999999998,8.211)\includegraphics{./Figure_S5/14qtel_1-500K_1_12_12_rc-HG002.pdf}\end{textblock} +\begin{textblock}{13}(1.5416699999999999,8.926)\includegraphics{./Figure_S5/14qtel_1-500K_1_12_12_rc-HG003.pdf}\end{textblock} +\begin{textblock}{13}(1.9166699999999999,9.293)\includegraphics{./Figure_S5/14qtel_1-500K_1_12_12_rc-HG004.pdf}\end{textblock} +\begin{textblock}{13}(1.9335599999999998,9.382)\includegraphics{./Figure_S5/14qtel_1-500K_1_12_12_rc-HG005.pdf}\end{textblock} +\begin{textblock}{13}(1.89189,9.458)\includegraphics{./Figure_S5/14qtel_1-500K_1_12_12_rc-HG006.pdf}\end{textblock} 
+\begin{textblock}{13}(1.9124499999999999,9.575)\includegraphics{./Figure_S5/14qtel_1-500K_1_12_12_rc-HG007.pdf}\end{textblock} +\begin{textblock}{13}(1.5416699999999999,9.947)\includegraphics{./Figure_S5/chr15-HG001.pdf}\end{textblock} +\begin{textblock}{13}(1.86111,10.314)\includegraphics{./Figure_S5/chr15-HG002.pdf}\end{textblock} +\begin{textblock}{13}(1.9583300000000001,10.445)\includegraphics{./Figure_S5/chr15-HG003.pdf}\end{textblock} +\begin{textblock}{13}(1.9166699999999999,10.507)\includegraphics{./Figure_S5/chr15-HG004.pdf}\end{textblock} +\begin{textblock}{13}(1.73978,10.596)\includegraphics{./Figure_S5/chr15-HG005.pdf}\end{textblock} +\begin{textblock}{13}(1.9305599999999998,11.107)\includegraphics{./Figure_S5/18qtel_1-500K_1_12_12_rc-HG001.pdf}\end{textblock} +\begin{textblock}{13}(0.75,11.182)\includegraphics{./Figure_S5/18qtel_1-500K_1_12_12_rc-HG002.pdf}\end{textblock} +\begin{textblock}{13}(1.6666699999999999,12.133)\includegraphics{./Figure_S5/18qtel_1-500K_1_12_12_rc-HG003.pdf}\end{textblock} +\begin{textblock}{13}(1.88889,12.403)\includegraphics{./Figure_S5/18qtel_1-500K_1_12_12_rc-HG004.pdf}\end{textblock} +\begin{textblock}{13}(1.9583300000000001,12.520)\includegraphics{./Figure_S5/18qtel_1-500K_1_12_12_rc-HG005.pdf}\end{textblock} +\begin{textblock}{13}(1.8333300000000001,12.582)\includegraphics{./Figure_S5/18qtel_1-500K_1_12_12_rc-HG006.pdf}\end{textblock} +\begin{textblock}{13}(1.9372200000000002,12.727)\includegraphics{./Figure_S5/18qtel_1-500K_1_12_12_rc-HG007.pdf}\end{textblock} +\begin{textblock}{13}(0.5194499999999997,0.47)\rotatebox{90}{\Large{\uline{\hspace*{35.1pt}7q (chr7)\hspace*{35.1pt}}}}\end{textblock} +\begin{textblock}{13}(0.8944499999999997,2.521667)\rotatebox{90}{\Large{\uline{\hspace*{19.5pt}8q (chr8)\hspace*{19.5pt}}}}\end{textblock} +\begin{textblock}{13}(0.7416699999999998,4.1105558)\rotatebox{90}{\Large{\uline{\hspace*{39.0pt}11q (chr11)\hspace*{39.0pt}}}}\end{textblock} 
+\begin{textblock}{13}(0.92222,6.4494442)\rotatebox{90}{\Large{\uline{\hspace*{15.6pt}12q (chr12)\hspace*{15.6pt}}}}\end{textblock} +\begin{textblock}{13}(0.7694499999999997,8.135554399999998)\rotatebox{90}{\Large{\uline{\hspace*{7.8pt}14q (14qtel\_1...)\hspace*{7.8pt}}}}\end{textblock} +\begin{textblock}{13}(1.2416699999999998,9.946665199999998)\rotatebox{90}{\Large{\uline{\hspace*{0.0pt}15q (chr15)\hspace*{0.0pt}}}}\end{textblock} +\begin{textblock}{13}(0.45,11.106665299999998)\rotatebox{90}{\Large{\uline{\hspace*{11.7pt}18q (18qtel\_1...)\hspace*{11.7pt}}}}\end{textblock} +\begin{textblock}{13}(5.789,0) +\includegraphics[width=4.500in,keepaspectratio]{Figure_5/legend.pdf} +\end{textblock} +\end{document} diff --git a/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG001.pdf b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG001.pdf new file mode 100644 index 0000000..e40f4cd Binary files /dev/null and b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG002.pdf b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG002.pdf new file mode 100644 index 0000000..fc64872 Binary files /dev/null and b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG003.pdf b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG003.pdf new file mode 100644 index 0000000..f721af2 Binary files /dev/null and b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG004.pdf b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG004.pdf new file mode 100644 index 0000000..cc293fb Binary files /dev/null and b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG005.pdf 
b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG005.pdf new file mode 100644 index 0000000..2ab2f23 Binary files /dev/null and b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG006.pdf b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG006.pdf new file mode 100644 index 0000000..5bdc4b0 Binary files /dev/null and b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG007.pdf b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG007.pdf new file mode 100644 index 0000000..51af01a Binary files /dev/null and b/assets/paper/figures/Figure_S5/14qtel_1-500K_1_12_12_rc-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG001.pdf b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG001.pdf new file mode 100644 index 0000000..e53e874 Binary files /dev/null and b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG002.pdf b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG002.pdf new file mode 100644 index 0000000..c609f7f Binary files /dev/null and b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG003.pdf b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG003.pdf new file mode 100644 index 0000000..1934121 Binary files /dev/null and b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG004.pdf b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG004.pdf new file mode 100644 index 0000000..6a7d551 Binary files /dev/null and b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG004.pdf differ diff --git 
a/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG005.pdf b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG005.pdf new file mode 100644 index 0000000..cf232f6 Binary files /dev/null and b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG006.pdf b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG006.pdf new file mode 100644 index 0000000..0a59373 Binary files /dev/null and b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG006.pdf differ diff --git a/publications/methods-paper/figures/entropy.pdf b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG007.pdf similarity index 51% rename from publications/methods-paper/figures/entropy.pdf rename to assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG007.pdf index 135f9ba..ea370a0 100644 Binary files a/publications/methods-paper/figures/entropy.pdf and b/assets/paper/figures/Figure_S5/18qtel_1-500K_1_12_12_rc-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr11-HG001.pdf b/assets/paper/figures/Figure_S5/chr11-HG001.pdf new file mode 100644 index 0000000..7955c1e Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr11-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr11-HG002.pdf b/assets/paper/figures/Figure_S5/chr11-HG002.pdf new file mode 100644 index 0000000..0faad9c Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr11-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr11-HG003.pdf b/assets/paper/figures/Figure_S5/chr11-HG003.pdf new file mode 100644 index 0000000..11f1bb9 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr11-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr11-HG004.pdf b/assets/paper/figures/Figure_S5/chr11-HG004.pdf new file mode 100644 index 0000000..9ceec98 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr11-HG004.pdf differ diff --git 
a/assets/paper/figures/Figure_S5/chr11-HG005.pdf b/assets/paper/figures/Figure_S5/chr11-HG005.pdf new file mode 100644 index 0000000..0ce5679 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr11-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr11-HG006.pdf b/assets/paper/figures/Figure_S5/chr11-HG006.pdf new file mode 100644 index 0000000..5987a85 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr11-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr11-HG007.pdf b/assets/paper/figures/Figure_S5/chr11-HG007.pdf new file mode 100644 index 0000000..5d350ef Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr11-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr12-HG001.pdf b/assets/paper/figures/Figure_S5/chr12-HG001.pdf new file mode 100644 index 0000000..8c0df86 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr12-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr12-HG002.pdf b/assets/paper/figures/Figure_S5/chr12-HG002.pdf new file mode 100644 index 0000000..e192025 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr12-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr12-HG003.pdf b/assets/paper/figures/Figure_S5/chr12-HG003.pdf new file mode 100644 index 0000000..94d15ca Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr12-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr12-HG004.pdf b/assets/paper/figures/Figure_S5/chr12-HG004.pdf new file mode 100644 index 0000000..a4b4a0d Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr12-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr12-HG005.pdf b/assets/paper/figures/Figure_S5/chr12-HG005.pdf new file mode 100644 index 0000000..6adf76b Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr12-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr12-HG006.pdf b/assets/paper/figures/Figure_S5/chr12-HG006.pdf new file mode 
100644 index 0000000..b9ba460 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr12-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr12-HG007.pdf b/assets/paper/figures/Figure_S5/chr12-HG007.pdf new file mode 100644 index 0000000..a3eb636 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr12-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr15-HG001.pdf b/assets/paper/figures/Figure_S5/chr15-HG001.pdf new file mode 100644 index 0000000..a626ce7 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr15-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr15-HG002.pdf b/assets/paper/figures/Figure_S5/chr15-HG002.pdf new file mode 100644 index 0000000..4e12b1c Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr15-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr15-HG003.pdf b/assets/paper/figures/Figure_S5/chr15-HG003.pdf new file mode 100644 index 0000000..cea4a19 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr15-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr15-HG004.pdf b/assets/paper/figures/Figure_S5/chr15-HG004.pdf new file mode 100644 index 0000000..9deacf3 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr15-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr15-HG005.pdf b/assets/paper/figures/Figure_S5/chr15-HG005.pdf new file mode 100644 index 0000000..6c5852d Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr15-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr7-HG001.pdf b/assets/paper/figures/Figure_S5/chr7-HG001.pdf new file mode 100644 index 0000000..563505a Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr7-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr7-HG002.pdf b/assets/paper/figures/Figure_S5/chr7-HG002.pdf new file mode 100644 index 0000000..705e5e1 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr7-HG002.pdf differ diff 
--git a/assets/paper/figures/Figure_S5/chr7-HG003.pdf b/assets/paper/figures/Figure_S5/chr7-HG003.pdf new file mode 100644 index 0000000..be9fe6e Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr7-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr7-HG004.pdf b/assets/paper/figures/Figure_S5/chr7-HG004.pdf new file mode 100644 index 0000000..7727c1c Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr7-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr7-HG006.pdf b/assets/paper/figures/Figure_S5/chr7-HG006.pdf new file mode 100644 index 0000000..57ae88d Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr7-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr7-HG007.pdf b/assets/paper/figures/Figure_S5/chr7-HG007.pdf new file mode 100644 index 0000000..2479ab2 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr7-HG007.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr8-HG001.pdf b/assets/paper/figures/Figure_S5/chr8-HG001.pdf new file mode 100644 index 0000000..e5b4cd6 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr8-HG001.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr8-HG002.pdf b/assets/paper/figures/Figure_S5/chr8-HG002.pdf new file mode 100644 index 0000000..1b1b1d1 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr8-HG002.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr8-HG003.pdf b/assets/paper/figures/Figure_S5/chr8-HG003.pdf new file mode 100644 index 0000000..98aeaf3 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr8-HG003.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr8-HG004.pdf b/assets/paper/figures/Figure_S5/chr8-HG004.pdf new file mode 100644 index 0000000..f9bcd37 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr8-HG004.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr8-HG005.pdf b/assets/paper/figures/Figure_S5/chr8-HG005.pdf new file mode 100644 index 
0000000..15e0002 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr8-HG005.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr8-HG006.pdf b/assets/paper/figures/Figure_S5/chr8-HG006.pdf new file mode 100644 index 0000000..d6a2456 Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr8-HG006.pdf differ diff --git a/assets/paper/figures/Figure_S5/chr8-HG007.pdf b/assets/paper/figures/Figure_S5/chr8-HG007.pdf new file mode 100644 index 0000000..780087f Binary files /dev/null and b/assets/paper/figures/Figure_S5/chr8-HG007.pdf differ diff --git a/assets/paper/figures/Supplemental_Fig_S1.pdf b/assets/paper/figures/Supplemental_Fig_S1.pdf new file mode 100644 index 0000000..2bb3704 Binary files /dev/null and b/assets/paper/figures/Supplemental_Fig_S1.pdf differ diff --git a/assets/paper/figures/Supplemental_Fig_S1.tex b/assets/paper/figures/Supplemental_Fig_S1.tex new file mode 100644 index 0000000..8f63c48 --- /dev/null +++ b/assets/paper/figures/Supplemental_Fig_S1.tex @@ -0,0 +1,22 @@ +\documentclass{article} +\usepackage[paperheight=5in,paperwidth=11in,margin=.2in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage[T1]{fontenc} +\usepackage{graphicx} + +\begin{document} +\pagenumbering{gobble} +\begin{samepage} + +\noindent \textbf{Supplemental Figure S1.} +Positions supported by short Illumina reads in the telomeric candidate long-read sequences on \textbf{(A)} \textit{p} arms and \textbf{(B)} \textit{q} arms of datasets HG001 through HG007 (sequences from all datasets are plotted together). +Long reads are plotted in blue, and positions supported by short reads are marked in green. +Genomic coordinates are given in Kbp, relative to the boundary of the annotated telomeric tract. +Note that, due to the multi-Kbp scale, individual small supported/unsupported regions may not be visible. + +\begin{figure}[h!] 
\centering +\includegraphics[width=\textwidth,keepaspectratio]{Figure_S1-nolegend.pdf} +\end{figure} + +\end{samepage} +\end{document} diff --git a/assets/paper/figures/Supplemental_Fig_S2.pdf b/assets/paper/figures/Supplemental_Fig_S2.pdf new file mode 100644 index 0000000..227a156 Binary files /dev/null and b/assets/paper/figures/Supplemental_Fig_S2.pdf differ diff --git a/assets/paper/figures/Supplemental_Fig_S2.tex b/assets/paper/figures/Supplemental_Fig_S2.tex new file mode 100644 index 0000000..8bccdec --- /dev/null +++ b/assets/paper/figures/Supplemental_Fig_S2.tex @@ -0,0 +1,28 @@ +\documentclass{article} +\usepackage[paperheight=10.75in,paperwidth=9.5in,margin=.2in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage[T1]{fontenc} +\usepackage{graphicx} + +\begin{document} +\pagenumbering{gobble} + +\begin{samepage} +\noindent \textbf{Supplemental Figure S2.} +Densities of the top two enriched motifs at ends of chromosomal \textit{p} arms of datasets +\textbf{(A)} HG001, +\textbf{(B)} HG003, +\textbf{(C)} HG004, +\textbf{(D)} HG005, +\textbf{(E)} HG006, and +\textbf{(F)} HG007. +Only the arms covered by at least 25 reads across all datasets are displayed. +Genomic coordinates are given in Kbp. +Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. + +\begin{figure}[ht!] 
\centering +\includegraphics[width=\textwidth,keepaspectratio]{Figure_S2-nolegend.pdf} +\end{figure} +\end{samepage} + +\end{document} diff --git a/assets/paper/figures/Supplemental_Fig_S3.pdf b/assets/paper/figures/Supplemental_Fig_S3.pdf new file mode 100644 index 0000000..8527d86 Binary files /dev/null and b/assets/paper/figures/Supplemental_Fig_S3.pdf differ diff --git a/assets/paper/figures/Supplemental_Fig_S3.tex b/assets/paper/figures/Supplemental_Fig_S3.tex new file mode 100644 index 0000000..f82e8ab --- /dev/null +++ b/assets/paper/figures/Supplemental_Fig_S3.tex @@ -0,0 +1,28 @@ +\documentclass{article} +\usepackage[paperheight=10.75in,paperwidth=9.5in,margin=.2in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage[T1]{fontenc} +\usepackage{graphicx} + +\begin{document} +\pagenumbering{gobble} + +\begin{samepage} +\noindent \textbf{Supplemental Figure S3.} +Densities of the top three enriched motifs at ends of chromosomal \textit{q} arms of datasets +\textbf{(A)} HG001, +\textbf{(B)} HG003, +\textbf{(C)} HG004, +\textbf{(D)} HG005, +\textbf{(E)} HG006, and +\textbf{(F)} HG007. +Only the arms covered by at least 25 reads across all datasets are displayed. +Genomic coordinates are given in Kbp. +Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. + +\begin{figure}[ht!] 
\centering +\includegraphics[width=\textwidth,keepaspectratio]{Figure_S3-nolegend.pdf} +\end{figure} +\end{samepage} + +\end{document} diff --git a/assets/paper/figures/Supplemental_Fig_S4.pdf b/assets/paper/figures/Supplemental_Fig_S4.pdf new file mode 100644 index 0000000..2260871 Binary files /dev/null and b/assets/paper/figures/Supplemental_Fig_S4.pdf differ diff --git a/assets/paper/figures/Supplemental_Fig_S4.tex b/assets/paper/figures/Supplemental_Fig_S4.tex new file mode 100644 index 0000000..23148ca --- /dev/null +++ b/assets/paper/figures/Supplemental_Fig_S4.tex @@ -0,0 +1,20 @@ +\documentclass{article} +\usepackage[paperwidth=7.85in,paperheight=11in,margin=.2in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage[T1]{fontenc} +\usepackage{graphicx} + +\begin{document} +\pagenumbering{gobble} +\begin{samepage} + +\noindent \textbf{Supplemental Figure S4.} +Clustering of reads on each chromosomal \textit{p} arm by pairwise Levenshtein distance, for each subject (HG001 through HG007), and densities of the top two enriched motifs along each read in the telomeric region. +Genomic coordinates are given in Kbp, relative to the boundary of the annotated telomeric tract. + +\begin{figure}[h!] 
\centering +\includegraphics[width=\textwidth,keepaspectratio]{Figure_S4-nolegend.pdf} +\end{figure} + +\end{samepage} +\end{document} diff --git a/assets/paper/figures/Supplemental_Fig_S5.pdf b/assets/paper/figures/Supplemental_Fig_S5.pdf new file mode 100644 index 0000000..bdaebf1 Binary files /dev/null and b/assets/paper/figures/Supplemental_Fig_S5.pdf differ diff --git a/assets/paper/figures/Supplemental_Fig_S5.tex b/assets/paper/figures/Supplemental_Fig_S5.tex new file mode 100644 index 0000000..a83ef0d --- /dev/null +++ b/assets/paper/figures/Supplemental_Fig_S5.tex @@ -0,0 +1,20 @@ +\documentclass{article} +\usepackage[paperwidth=8in,paperheight=11in,margin=.2in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage[T1]{fontenc} +\usepackage{graphicx} + +\begin{document} +\pagenumbering{gobble} +\begin{samepage} + +\noindent \textbf{Supplemental Figure S5.} +Clustering of reads on each chromosomal \textit{q} arm by pairwise Levenshtein distance, for each subject (HG001 through HG007), and densities of the top three enriched motifs along each read in the telomeric region. +Genomic coordinates are given in Kbp, relative to the boundary of the annotated telomeric tract. + +\begin{figure}[h!] 
\centering +\includegraphics[width=\textwidth,keepaspectratio]{Figure_S5-nolegend.pdf} +\end{figure} + +\end{samepage} +\end{document} diff --git a/assets/paper/figures/make-figure-4-tex.py b/assets/paper/figures/make-figure-4-tex.py new file mode 100755 index 0000000..8085245 --- /dev/null +++ b/assets/paper/figures/make-figure-4-tex.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +from sys import argv +from tqdm import tqdm +from subprocess import check_output +from re import search +from collections import OrderedDict + + +PDFS = [ + "Figure_4/chr2.pdf", + "Figure_4/3ptel_1-500K_1_12_12.pdf", + "Figure_4/4ptel_1-500K_1_12_12.pdf", + "Figure_4/chr5.pdf", + "Figure_4/chr9.pdf", + "Figure_4/chr12.pdf", + "Figure_4/17ptel_1_500K_1_12_12.pdf", +] + + +TEX_LEGEND = r'''\begin{textblock}{13}($X,$Y) +\includegraphics[width=$WIDTHin,keepaspectratio]{Figure_4/legend.pdf} +\end{textblock}''' +LEGEND_WIDTH = 4 +EXTRA_MARGIN = 1.5 +RIGHT_SHIFT = .2 +UPWARD_SHIFT = .12 + + +TEX_HEADER = r'''\documentclass{article} +\usepackage[paperheight=$HEIGHTin,paperwidth=$WIDTHin,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\begin{document}''' + +TEX_IMAGE = r'\begin{textblock}{13}($X,$Y)\includegraphics{$PDF}\end{textblock}' + +TEX_FOOTER = r'\end{document}' + + +def get_pdf_sizes(pdfs): + for pdf in tqdm(pdfs, desc="Identifying sizes"): + imagemagick_bytes = check_output(["identify", "-verbose", pdf]) + for line in imagemagick_bytes.decode().split("\n"): + if line.strip().startswith("Print size:"): + matcher = search(r'([0-9.]+)x([0-9.]+)', line) + if matcher: + w, h = float(matcher.group(1)), float(matcher.group(2)) + yield pdf, (w, h) + break + else: + yield pdf, (None, None) + + +def main(argv): + pdf_sizes = OrderedDict(get_pdf_sizes(PDFS)) + combined_width = ( + max(width for width, _ in pdf_sizes.values()) - EXTRA_MARGIN * .9 + 
) + combined_height = ( + sum(height for _, height in pdf_sizes.values()) - + UPWARD_SHIFT * (len(pdf_sizes) - 1) + ) + header = (TEX_HEADER + .replace("$WIDTH", format(combined_width+EXTRA_MARGIN, ".3f")) + .replace("$HEIGHT", format(combined_height, ".3f")) + ) + print(header) + y = 0 + for pdf, (width, height) in pdf_sizes.items(): + section = (TEX_IMAGE + .replace("$X", format(-RIGHT_SHIFT, ".3f")) + .replace("$Y", format(y, ".3f")) + .replace("$PDF", pdf) + ) + print(section) + y += height - UPWARD_SHIFT + legend_section = (TEX_LEGEND + .replace("$WIDTH", format(LEGEND_WIDTH, ".3f")) + .replace("$X", format(7.75, ".3f")) + .replace("$Y", "0") + ) + print(legend_section) + print(TEX_FOOTER) + return 0 + + +if __name__ == "__main__": + returncode = main(argv) + exit(returncode) diff --git a/assets/paper/figures/make-figure-5-tex.py b/assets/paper/figures/make-figure-5-tex.py new file mode 100755 index 0000000..11a6e92 --- /dev/null +++ b/assets/paper/figures/make-figure-5-tex.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +from sys import argv +from tqdm import tqdm +from subprocess import check_output +from re import search +from collections import OrderedDict + + +PDFS = [ + "Figure_5/chr7.pdf", + "Figure_5/chr8.pdf", + "Figure_5/chr11.pdf", + "Figure_5/chr12.pdf", + "Figure_5/14qtel_1-500K_1_12_12_rc.pdf", + "Figure_5/chr15.pdf", + "Figure_5/18qtel_1-500K_1_12_12_rc.pdf", +] + + +TEX_LEGEND = r'''\begin{textblock}{13}($X,$Y) +\includegraphics[width=$WIDTHin,keepaspectratio]{Figure_5/legend.pdf} +\end{textblock}''' +LEGEND_WIDTH = 3.6 +EXTRA_MARGIN = 1.5 +LEFT_SHIFT = .2 +UPWARD_SHIFT = .12 + + +TEX_HEADER = r'''\documentclass{article} +\usepackage[paperheight=$HEIGHTin,paperwidth=$WIDTHin,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\begin{document}''' + +TEX_IMAGE = 
r'\begin{textblock}{13}($X,$Y)\includegraphics{$PDF}\end{textblock}' + +TEX_FOOTER = r'\end{document}' + + +def get_pdf_sizes(pdfs): + for pdf in tqdm(pdfs, desc="Identifying sizes"): + imagemagick_bytes = check_output(["identify", "-verbose", pdf]) + for line in imagemagick_bytes.decode().split("\n"): + if line.strip().startswith("Print size:"): + matcher = search(r'([0-9.]+)x([0-9.]+)', line) + if matcher: + w, h = float(matcher.group(1)), float(matcher.group(2)) + yield pdf, (w, h) + break + else: + yield pdf, (None, None) + + +def main(argv): + pdf_sizes = OrderedDict(get_pdf_sizes(PDFS)) + combined_width = max(width for width, _ in pdf_sizes.values()) + combined_height = ( + sum(height for _, height in pdf_sizes.values()) - + UPWARD_SHIFT * (len(pdf_sizes) - 1) + ) + header = (TEX_HEADER + .replace("$WIDTH", format(combined_width+EXTRA_MARGIN, ".3f")) + .replace("$HEIGHT", format(combined_height, ".3f")) + ) + print(header) + y = 0 + for pdf, (width, height) in pdf_sizes.items(): + section = (TEX_IMAGE + .replace("$X", format(combined_width-width-LEFT_SHIFT, ".3f")) + .replace("$Y", format(y, ".3f")) + .replace("$PDF", pdf) + ) + print(section) + y += height - UPWARD_SHIFT + legend_section = (TEX_LEGEND + .replace("$WIDTH", format(LEGEND_WIDTH, ".3f")) + .replace("$X", format( + combined_width+EXTRA_MARGIN-LEGEND_WIDTH*1.05, ".3f", + )) + .replace("$Y", "0") + ) + print(legend_section) + print(TEX_FOOTER) + return 0 + + +if __name__ == "__main__": + returncode = main(argv) + exit(returncode) diff --git a/assets/paper/figures/make-figure-S4-tex.py b/assets/paper/figures/make-figure-S4-tex.py new file mode 100755 index 0000000..c21c418 --- /dev/null +++ b/assets/paper/figures/make-figure-S4-tex.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +from sys import argv +from os import path +from tqdm import tqdm +from subprocess import check_output +from re import search +from collections import OrderedDict, defaultdict + + +SOURCE_DIR = "./Figure_S4" +SUBJECTS = 
[f"HG00{i}" for i in range(1, 8)] + +CHROMS = [ + "chr2", + "3ptel_1-500K_1_12_12", + "4ptel_1-500K_1_12_12", + "chr5", + "chr9", + "chr12", + "17ptel_1_500K_1_12_12", +] + +pad = lambda s, n: "\hspace*{"+str(n*3.9)+"pt}"+s+"\hspace*{"+str(n*3.9)+"pt}" +ADJUSTED_NAMES = { + "chr2": pad("2p (chr2)", 2), + "3ptel_1-500K_1_12_12": pad("3p (3ptel_1...)", 2), + "4ptel_1-500K_1_12_12": pad("4p (4ptel_1...)", 30), + "chr5": pad("5p (chr5)", 1), + "chr9": pad("9p (chr9)", 0), + "chr12": pad("12p (chr12)", 0), + "17ptel_1_500K_1_12_12": pad("17p (17ptel_1...)", 13), +} + +TEX_LEGEND = r'''\begin{textblock}{13}($X,$Y) +\includegraphics[width=$WIDTHin,keepaspectratio]{Figure_4/legend.pdf} +\end{textblock}''' +LEGEND_WIDTH = 4.1 +EXTRA_MARGIN = 3 +Y_MINOR_SKIP = .020 +Y_MAJOR_SKIP = .08 + +TEX_HEADER = r'''\documentclass{article} +\usepackage[paperheight=$HEIGHTin,paperwidth=$WIDTHin,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage{ulem} + \renewcommand{\ULdepth}{7pt} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\makeatletter + \newcommand*{\textoverline}[1]{$\overline{\hbox{#1\vphantom{\"A}}}\m@th$} + \makeatother +\begin{document}''' + +TEX_IMAGE = r'\begin{textblock}{13}($X,$Y)\includegraphics{$PDF}\end{textblock}' +TEX_YLABEL = r'\begin{textblock}{13}($X,$Y)\rotatebox{90}{\Large{\textoverline{$T}}}\end{textblock}' + +TEX_FOOTER = r'\end{document}' + + +def get_pdfs(source_dir, chroms, subjects): + for chrom in chroms: + for subject in subjects: + pdf = f"{source_dir}/{chrom}-{subject}.pdf" + if path.isfile(pdf): + yield pdf, (chrom, subject) + + +def get_pdf_sizes(pdfs): + for pdf in tqdm(pdfs, desc="Identifying sizes"): + imagemagick_bytes = check_output(["identify", "-verbose", pdf]) + for line in imagemagick_bytes.decode().split("\n"): + if line.strip().startswith("Print size:"): + matcher = search(r'([0-9.]+)x([0-9.]+)', line) + if matcher: + w, h 
= float(matcher.group(1)), float(matcher.group(2)) + yield pdf, (w, h) + break + else: + yield pdf, (None, None) + + +def get_ylabels(pdfs, pdf_sizes, combined_width): + chrom2xs, chrom2ys = defaultdict(list), defaultdict(list) + y = .05 + for pdf, (chrom, subject) in pdfs.items(): + chrom2ys[chrom].append(y if (y != .05) else .47) + width, height = pdf_sizes[pdf] + if "chr2-HG001" not in pdf: + chrom2xs[chrom].append(width+len(pdf)/22) + y += height + ( + Y_MAJOR_SKIP if ( + (subject == "HG007") or + ((chrom == "chr5") and (subject == "HG006")) or + ((chrom == "chr9") and (subject == "HG005")) + ) + else Y_MINOR_SKIP + ) + for chrom, xs in chrom2xs.items(): + x = min(xs) - .3 + y = min(chrom2ys[chrom]) + yield x, y, ADJUSTED_NAMES[chrom].replace("_", r'\_') + + +def main(argv): + pdfs = OrderedDict(get_pdfs(SOURCE_DIR, CHROMS, SUBJECTS)) + pdf_sizes = OrderedDict(get_pdf_sizes(pdfs)) + combined_width = max(width for width, _ in pdf_sizes.values()) + combined_height = ( + sum(height+Y_MINOR_SKIP for _, height in pdf_sizes.values()) + + (Y_MAJOR_SKIP-Y_MINOR_SKIP) * len(CHROMS) + ) + print((TEX_HEADER + .replace("$WIDTH", format(combined_width+EXTRA_MARGIN, ".3f")) + .replace("$HEIGHT", format(combined_height, ".3f")) + )) + y = .05 + for pdf, (chrom, subject) in pdfs.items(): + width, height = pdf_sizes[pdf] + print((TEX_IMAGE + .replace("$X", str( + (-.11 if (subject=="HG007") else 0) + )) + .replace("$Y", format(y, ".3f")).replace("$PDF", pdf) + )) + y += height + ( + Y_MAJOR_SKIP if ( + (subject == "HG007") or + ((chrom == "chr5") and (subject == "HG006")) or + ((chrom == "chr9") and (subject == "HG005")) + ) + else Y_MINOR_SKIP + ) + for x, y, ylabel in get_ylabels(pdfs, pdf_sizes, combined_width): + print((TEX_YLABEL + .replace("$X", str(x)).replace("$Y", str(y)).replace("$T", ylabel) + )) + print((TEX_LEGEND + .replace("$WIDTH", format(LEGEND_WIDTH, ".3f")) + .replace("$X", format( + combined_width+EXTRA_MARGIN-LEGEND_WIDTH*1.05, ".3f", + )) + .replace("$Y", 
".4") + )) + print(TEX_FOOTER) + return 0 + + +if __name__ == "__main__": + returncode = main(argv) + exit(returncode) diff --git a/assets/paper/figures/make-figure-S5-tex.py b/assets/paper/figures/make-figure-S5-tex.py new file mode 100755 index 0000000..4334aa6 --- /dev/null +++ b/assets/paper/figures/make-figure-S5-tex.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python +from sys import argv +from os import path +from tqdm import tqdm +from subprocess import check_output +from re import search +from collections import OrderedDict, defaultdict + + +SOURCE_DIR = "./Figure_S5" +SUBJECTS = [f"HG00{i}" for i in range(1, 8)] + +CHROMS = [ + "chr7", + "chr8", + "chr11", + "chr12", + "14qtel_1-500K_1_12_12_rc", + "chr15", + "18qtel_1-500K_1_12_12_rc", +] + +pad = lambda s, n: "\hspace*{"+str(n*3.9)+"pt}"+s+"\hspace*{"+str(n*3.9)+"pt}" +ADJUSTED_NAMES = { + "chr7": pad("7q (chr7)", 9), + "chr8": pad("8q (chr8)", 5), + "chr11": pad("11q (chr11)", 10), + "chr12": pad("12q (chr12)", 4), + "14qtel_1-500K_1_12_12_rc": pad("14q (14qtel_1...)", 2), + "chr15": pad("15q (chr15)", 0), + "18qtel_1-500K_1_12_12_rc": pad("18q (18qtel_1...)", 3), +} + +TEX_LEGEND = r'''\begin{textblock}{13}($X,$Y) +\includegraphics[width=$WIDTHin,keepaspectratio]{Figure_5/legend.pdf} +\end{textblock}''' +LEGEND_WIDTH = 4.5 +EXTRA_MARGIN = 3 +Y_MINOR_SKIP = .020 +Y_MAJOR_SKIP = .08 + +TEX_HEADER = r'''\documentclass{article} +\usepackage[paperheight=$HEIGHTin,paperwidth=$WIDTHin,margin=0in]{geometry} +\usepackage[sfdefault]{roboto} +\usepackage{graphicx} +\usepackage{tikz} +\usepackage{ulem} + \renewcommand{\ULdepth}{7pt} +\usepackage[absolute,overlay]{textpos} + \setlength{\TPHorizModule}{1in} + \setlength{\TPVertModule}{1in} +\begin{document}''' + +TEX_IMAGE = r'\begin{textblock}{13}($X,$Y)\includegraphics{$PDF}\end{textblock}' +TEX_YLABEL = r'\begin{textblock}{13}($X,$Y)\rotatebox{90}{\Large{\uline{$T}}}\end{textblock}' + +TEX_FOOTER = r'\end{document}' + + +def get_pdfs(source_dir, chroms, subjects): + for 
chrom in chroms: + for subject in subjects: + pdf = f"{source_dir}/{chrom}-{subject}.pdf" + if path.isfile(pdf): + yield pdf, (chrom, subject) + + +def get_pdf_sizes(pdfs): + for pdf in tqdm(pdfs, desc="Identifying sizes"): + imagemagick_bytes = check_output(["identify", "-verbose", pdf]) + for line in imagemagick_bytes.decode().split("\n"): + if line.strip().startswith("Print size:"): + matcher = search(r'([0-9.]+)x([0-9.]+)', line) + if matcher: + w, h = float(matcher.group(1)), float(matcher.group(2)) + yield pdf, (w, h) + break + else: + yield pdf, (None, None) + + +def get_ylabels(pdfs, pdf_sizes, combined_width): + chrom2xs, chrom2ys = defaultdict(list), defaultdict(list) + y = .05 + for pdf, (chrom, subject) in pdfs.items(): + chrom2ys[chrom].append(y if (y != .05) else .47) + width, height = pdf_sizes[pdf] + if "chr7-HG001" not in pdf: + chrom2xs[chrom].append(combined_width-width) + y += height + ( + Y_MAJOR_SKIP + if ((subject == "HG007") or ((chrom == "chr15") and (subject == "HG005"))) + else Y_MINOR_SKIP + ) + for chrom, xs in chrom2xs.items(): + x = min(xs) - .3 + y = min(chrom2ys[chrom]) + yield x, y, ADJUSTED_NAMES[chrom].replace("_", r'\_') + + +def main(argv): + pdfs = OrderedDict(get_pdfs(SOURCE_DIR, CHROMS, SUBJECTS)) + pdf_sizes = OrderedDict(get_pdf_sizes(pdfs)) + combined_width = max(width for width, _ in pdf_sizes.values()) + combined_height = ( + sum(height+Y_MINOR_SKIP for _, height in pdf_sizes.values()) + + (Y_MAJOR_SKIP-Y_MINOR_SKIP) * len(CHROMS) + ) + print((TEX_HEADER + .replace("$WIDTH", format(combined_width+EXTRA_MARGIN, ".3f")) + .replace("$HEIGHT", format(combined_height, ".3f")) + )) + y = .05 + for pdf, (chrom, subject) in pdfs.items(): + width, height = pdf_sizes[pdf] + print((TEX_IMAGE + .replace("$X", str( + combined_width-width+(.09 if (subject=="HG007") else 0) + )) + .replace("$Y", format(y, ".3f")).replace("$PDF", pdf) + )) + y += height + ( + Y_MAJOR_SKIP + if ((subject == "HG007") or ((chrom == "chr15") and (subject 
== "HG005"))) + else Y_MINOR_SKIP + ) + for x, y, ylabel in get_ylabels(pdfs, pdf_sizes, combined_width): + print((TEX_YLABEL + .replace("$X", str(x)).replace("$Y", str(y)).replace("$T", ylabel) + )) + print((TEX_LEGEND + .replace("$WIDTH", format(LEGEND_WIDTH, ".3f")) + .replace("$X", format( + combined_width+EXTRA_MARGIN-LEGEND_WIDTH*1.05, ".3f", + )) + .replace("$Y", "0") + )) + print(TEX_FOOTER) + return 0 + + +if __name__ == "__main__": + returncode = main(argv) + exit(returncode) diff --git a/assets/paper/files/Supplemental_File_S1.txt b/assets/paper/files/Supplemental_File_S1.txt new file mode 100644 index 0000000..c6eb662 --- /dev/null +++ b/assets/paper/files/Supplemental_File_S1.txt @@ -0,0 +1,119 @@ +#flags:mask_anchor=4096;fork=8192;tract_anchor=16384 +#entry rname pos pos+1 chromosome main_rname flag prime class source link blacklist +0 chr1 10000 10001 chr1 chr1 4096 5 mask_anchor hg38 - - +1 chr2 10000 10001 chr2 chr2 4096 5 mask_anchor hg38 - - +2 chr3 10000 10001 chr3 chr3 4096 5 mask_anchor hg38 - - +3 chr4 10000 10001 chr4 chr4 4096 5 mask_anchor hg38 - - +4 chr5 10000 10001 chr5 chr5 4096 5 mask_anchor hg38 - - +5 chr6 60000 60001 chr6 chr6 4096 5 mask_anchor hg38 - - +6 chr7 10000 10001 chr7 chr7 4096 5 mask_anchor hg38 - - +7 chr8 60000 60001 chr8 chr8 4096 5 mask_anchor hg38 - - +8 chr9 10000 10001 chr9 chr9 4096 5 mask_anchor hg38 - - +9 chr10 10000 10001 chr10 chr10 4096 5 mask_anchor hg38 - - +10 chr11 60000 60001 chr11 chr11 4096 5 mask_anchor hg38 - - +11 chr12 10000 10001 chr12 chr12 4096 5 mask_anchor hg38 - - +12 chr13 16000000 16000001 chr13 chr13 4096 5 mask_anchor hg38 - - +13 chr14 16000000 16000001 chr14 chr14 4096 5 mask_anchor hg38 - - +14 chr15 17000000 17000001 chr15 chr15 4096 5 mask_anchor hg38 - - +15 chr16 10000 10001 chr16 chr16 4096 5 mask_anchor hg38 - - +16 chr17 60000 60001 chr17 chr17 4096 5 mask_anchor hg38 - - +17 chr18 10000 10001 chr18 chr18 4096 5 mask_anchor hg38 - - +18 chr20 60000 60001 chr20 chr20 4096 5 
mask_anchor hg38 - - +19 chr21 5010000 5010001 chr21 chr21 4096 5 mask_anchor hg38 - - +20 chr22 10510000 10510001 chr22 chr22 4096 5 mask_anchor hg38 - - +21 chrX 10000 10001 chrX chrX 4096 5 mask_anchor hg38 - - +22 chrY 10000 10001 chrY chrY 4096 5 mask_anchor hg38 - - +23 chr1 248946422 248946423 chr1 chr1 4096 3 mask_anchor hg38 - - +24 chr2 242183529 242183530 chr2 chr2 4096 3 mask_anchor hg38 - - +25 chr3 198235559 198235560 chr3 chr3 4096 3 mask_anchor hg38 - - +26 chr4 190204555 190204556 chr4 chr4 4096 3 mask_anchor hg38 - - +27 chr7 159335973 159335974 chr7 chr7 4096 3 mask_anchor hg38 - - +28 chr8 145078636 145078637 chr8 chr8 4096 3 mask_anchor hg38 - - +29 chr9 138334717 138334718 chr9 chr9 4096 3 mask_anchor hg38 - - +30 chr10 133787422 133787423 chr10 chr10 4096 3 mask_anchor hg38 - - +31 chr11 135076622 135076623 chr11 chr11 4096 3 mask_anchor hg38 - - +32 chr12 133265309 133265310 chr12 chr12 4096 3 mask_anchor hg38 - - +33 chr13 114354328 114354329 chr13 chr13 4096 3 mask_anchor hg38 - - +34 chr14 106883718 106883719 chr14 chr14 4096 3 mask_anchor hg38 - - +35 chr15 101981189 101981190 chr15 chr15 4096 3 mask_anchor hg38 - - +36 chr17 83247441 83247442 chr17 chr17 4096 3 mask_anchor hg38 - - +37 chr18 80263285 80263286 chr18 chr18 4096 3 mask_anchor hg38 - - +38 chr19 58607616 58607617 chr19 chr19 4096 3 mask_anchor hg38 - - +39 chr20 64334167 64334168 chr20 chr20 4096 3 mask_anchor hg38 - - +40 chr21 46699983 46699984 chr21 chr21 4096 3 mask_anchor hg38 - - +41 chr22 50808468 50808469 chr22 chr22 4096 3 mask_anchor hg38 - - +42 chrX 156030895 156030896 chrX chrX 4096 3 mask_anchor hg38 - - +43 chrY 57217415 57217416 chrY chrY 4096 3 mask_anchor hg38 - - +44 chr1 585988 585989 chr1 chr1 16384 5 riethman_match hg38 - inexact +45 chr2 10262 10263 chr2 chr2 16384 5 riethman_match hg38 - - +46 chr5 11807 11808 chr5 chr5 16384 5 riethman_match hg38 - - +47 chr6 60000 60001 chr6 chr6 16384 5 riethman_match hg38 - inexact +48 chr7 10232 10233 chr7 chr7 
16384 5 riethman_match hg38 - inexact +49 chr8 60000 60001 chr8 chr8 16384 5 riethman_match hg38 - inexact +50 chr9 10353 10354 chr9 chr9 16384 5 riethman_match hg38 - - +51 chr10 10419 10420 chr10 chr10 16384 5 riethman_match hg38 - - +52 chr11 60000 60001 chr11 chr11 16384 5 riethman_match hg38 - inexact +53 chr12 10575 10576 chr12 chr12 16384 5 riethman_match hg38 - - +54 chr16 10027 10028 chr16 chr16 16384 5 riethman_match hg38 - - +55 chr18 10615 10616 chr18 chr18 16384 5 riethman_match hg38 - - +56 chr20 79359 79360 chr20 chr20 16384 5 riethman_match hg38 - inexact +57 chr3 198235558 198235559 chr3 chr3 16384 3 riethman_match hg38 - inexact +58 chr4 190122583 190122584 chr4 chr4 16384 3 riethman_match hg38 - inexact +59 chr7 159335873 159335874 chr7 chr7 16384 3 riethman_match hg38 - - +60 chr8 145073354 145073355 chr8 chr8 16384 3 riethman_match hg38 - - +61 chr11 135076569 135076570 chr11 chr11 16384 3 riethman_match hg38 - - +62 chr12 133264944 133264945 chr12 chr12 16384 3 riethman_match hg38 - - +63 chr15 101980819 101980820 chr15 chr15 16384 3 riethman_match hg38 - - +64 chr19 58607496 58607497 chr19 chr19 16384 3 riethman_match hg38 - - +65 chr20 64286708 64286709 chr20 chr20 16384 3 riethman_match hg38 - inexact +66 chr21 46699874 46699875 chr21 chr21 16384 3 riethman_match hg38 - - +67 chr22 50807895 50807896 chr22 chr22 16384 3 riethman_match hg38 - - +68 chrX 156029891 156029892 chrX chrX 16384 3 riethman_match hg38 - inexact +69 chr12_GL877875v1_alt 49533 49534 chr12 chr12 8192 5 fork hg38 70 - +70 chr12 55530 55531 chr12 chr12 8192 5 fork hg38 69 - +71 chr14_KI270846v1_alt 825824 825825 chr14 chr14_KI270846v1_alt 8192 3 fork hg38 72 - +72 chr14 106358001 106358002 chr14 chr14 8192 3 fork hg38 71 - +73 chrUn_KI270745v1_rc 15472 15473 chr17 chr17 8192 5 fork hg38 74 - +74 chr17 60000 60001 chr17 chr17 8192 5 fork hg38 73 - +75 3ptel_1-500K_1_12_12 0 1 chr3 chr3 16384 5 tel_fork riethman2014 - - +76 3ptel_1-500K_1_12_12 25 26 chr3 chr3 8192 5 
tel_fork riethman2014 77 - +77 chr3 11609 11610 chr3 chr3 8192 5 tel_fork hg38 76 - +78 4ptel_1-500K_1_12_12 0 1 chr4 chr4 16384 5 tel_fork riethman2014 - - +79 4ptel_1-500K_1_12_12 75887 75888 chr4 chr4 8192 5 tel_fork riethman2014 80 - +80 chr4 86153 86154 chr4 chr4 8192 5 tel_fork hg38 79 - +81 17ptel_1_500K_1_12_12 0 1 chr17 chr17 16384 5 tel_fork riethman2014 - - +82 17ptel_1_500K_1_12_12 30912 30913 chr17 chr17 8192 5 tel_fork riethman2014 83 - +83 chr17 141322 141323 chr17 chr17 8192 5 tel_fork hg38 82 - +84 19ptel_1-500K_1_12_12 0 1 chr19 chr19 16384 5 tel_fork riethman2014 - - +85 19ptel_1-500K_1_12_12 7837 7838 chr19 chr19 8192 5 tel_fork riethman2014 86 - +86 chr19 60000 60001 chr19 chr19 8192 5 tel_fork hg38 85 - +87 1qtel_1-500K_1_12_12_rc 499999 500000 chr1 chr1 16384 3 tel_fork riethman2014 - - +88 1qtel_1-500K_1_12_12_rc 299612 299613 chr1 chr1 8192 3 tel_fork riethman2014 89 - +89 chr1 248751792 248751793 chr1 chr1 8192 3 tel_fork hg38 88 - +90 2qtel_1-500K_1_12_12_rc 499999 500000 chr2 chr2 16384 3 tel_fork riethman2014 - - +91 2qtel_1-500K_1_12_12_rc 468937 468938 chr2 chr2 8192 3 tel_fork riethman2014 92 - +92 chr2 242152486 242152487 chr2 chr2 8192 3 tel_fork hg38 91 - +93 9qtel_1-500K_1_12_12_rc 499999 500000 chr9 chr9 16384 3 tel_fork riethman2014 - inexact +94 9qtel_1-500K_1_12_12_rc 433984 433985 chr9 chr9 8192 3 tel_fork riethman2014 95 - +95 chr9 138192962 138192963 chr9 chr9 8192 3 tel_fork hg38 94 - +96 10qtel_1-500K_1_12_12_rc 499999 500000 chr10 chr10 16384 3 tel_fork riethman2014 - - +97 10qtel_1-500K_1_12_12_rc 476621 476622 chr10 chr10 8192 3 tel_fork riethman2014 98 - +98 chr10 133687787 133687788 chr10 chr10 8192 3 tel_fork hg38 97 - +99 14qtel_1-500K_1_12_12_rc 499999 500000 chr14 chr14_KI270846v1_alt 16384 3 tel_fork riethman2014 - - +100 14qtel_1-500K_1_12_12_rc 390725 390726 chr14 chr14_KI270846v1_alt 8192 3 tel_fork riethman2014 101 - +101 chr14_KI270846v1_alt 1219238 1219239 chr14 chr14_KI270846v1_alt 8192 3 tel_fork hg38 
100 - +102 17qtel_1-500K_1_12_12v2_rc 499999 500000 chr17 chr17 16384 3 tel_fork riethman2014 - - +103 17qtel_1-500K_1_12_12v2_rc 353184 353185 chr17 chr17 8192 3 tel_fork riethman2014 104 - +104 chr17 83090312 83090313 chr17 chr17 8192 3 tel_fork hg38 103 - +105 18qtel_1-500K_1_12_12_rc 499999 500000 chr18 chr18 16384 3 tel_fork riethman2014 - - +106 18qtel_1-500K_1_12_12_rc 495705 495706 chr18 chr18 8192 3 tel_fork riethman2014 107 - +107 chr18 80258300 80258301 chr18 chr18 8192 3 tel_fork hg38 106 - +108 5qtel_1-500K_1_12_12_rc 499999 500000 chr5 chr5 16384 3 tel_fork riethman2014 - - +109 5qtel_1-500K_1_12_12_rc 497184 497185 chr5 chr5 8192 3 tel_fork riethman2014 110 - +110 chr5 181478258 181478259 chr5 chr5 8192 3 tel_fork hg38 109 - +111 6qtel_1-500K_1_12_12_rc 499999 500000 chr6 chr6 16384 3 tel_fork riethman2014 - - +112 6qtel_1-500K_1_12_12_rc 491122 491123 chr6 chr6 8192 3 tel_fork riethman2014 113 - +113 chr6 170745978 170745979 chr6 chr6 8192 3 tel_fork hg38 112 - +114 16qtel_1-500K_1_12_12_rc 499999 500000 chr16 chr16 16384 3 tel_fork riethman2014 - - +115 16qtel_1-500K_1_12_12_rc 489086 489087 chr16 chr16 8192 3 tel_fork riethman2014 116 - +116 chr16 90228344 90228345 chr16 chr16 8192 3 tel_fork hg38 115 - diff --git a/assets/paper/files/Supplemental_File_S2.txt b/assets/paper/files/Supplemental_File_S2.txt new file mode 100644 index 0000000..f96a0c1 --- /dev/null +++ b/assets/paper/files/Supplemental_File_S2.txt @@ -0,0 +1,194 @@ +#!/usr/bin/env python +from sys import argv, stderr +from re import compile +from tempfile import TemporaryDirectory +from os import path +from urllib.request import urlretrieve +from contextlib import contextmanager +from binascii import hexlify +from gzip import open as gzopen +from itertools import chain +from textwrap import fill +from zlib import decompress +from base64 import decodebytes + +try: + from tqdm import tqdm +except ModuleNotFoundError: + def tqdm(it, desc=None, *args, **kwargs): + if desc is not None: + 
print(desc, end="...\n", file=stderr) + return it + + +USAGE = """usage: +{0} --local hg38.fasta stong2014.fasta + generate hg38ext.fa from local files and output to stdout +{0} --remote + download appropriate assemblies, generate hg38ext.fa, and output to stdout +{0} --ecx + output the edgeCase indeX (hg38ext.fa.ecx) to stdout + +NOTE! This tool writes uncompressed data (FASTA or ECX) to stdout. +You should pipe it into a file, for example: +{0} --remote > hg38ext.fa +""" + +NCBI_FTP_DIR = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405" +HG38_RELEASE = "GCA_000001405.28_GRCh38.p13" +HG38_VERSION = "GRCh38_major_release_seqs_for_alignment_pipelines" +HG38_FASTAGZ = "GCA_000001405.15_GRCh38_full_analysis_set.fna.gz" + +CSHLP_URLDIR = "https://genome.cshlp.org/content/suppl/2014/04/16" +CSHLP_DOIDIR = "gr.166983.113.DC1" +CSHLP_SUPPFN = "Supplemental_FileS1.txt" + +HG38_URL = "/".join([NCBI_FTP_DIR, HG38_RELEASE, HG38_VERSION, HG38_FASTAGZ]) +STONG2014_URL = "/".join([CSHLP_URLDIR, CSHLP_DOIDIR, CSHLP_SUPPFN]) + +STONG_INFIX = "500K_1_12_12" +STONG2014_SUBHAP_MASKS = { + "1qtel_1-{}", "2qtel_1-{}", "3ptel_1-{}", "4ptel_1-{}", "5qtel_1-{}", + "6qtel_1-{}", "9qtel_1-{}", "10qtel_1-{}", "14qtel_1-{}", "16qtel_1-{}", + "17ptel_1_{}", "17qtel_1-{}v2", "18qtel_1-{}", "19ptel_1-{}" +} + +REVCOMP_POSTFIX = "_rc" +TO_REVCOMP_MASKS = { + "1qtel_1-{}", "2qtel_1-{}", "5qtel_1-{}", "6qtel_1-{}", "9qtel_1-{}", + "10qtel_1-{}", "14qtel_1-{}", "16qtel_1-{}", "17qtel_1-{}v2", "18qtel_1-{}", + "chrUn_KI270745v1" +} + +ALPHABET = list("AaCcNnnNgGtT") +COMPLEMENTS = dict(zip(ALPHABET, reversed(ALPHABET))) +COMPLEMENT_PATTERN = compile(r'|'.join(COMPLEMENTS.keys())) + +HG38EXT_ECX_GZ = b'\n'.join([ +b'eJyVWEtvIzcMPk/+Ro9FAFFPsos9F8X2WmD3ZLhGdhMkcba2u2j/fUk9JvOwOG6AUMSY/CRqOB8p' +b'/fT1Zf/t/Mvr/vy82x8Pj2+nj95Q/PD17fT8EYHsh8tpf7i03yA69Hc/PRwvp3+H03H/+jB8fzvL' +b'/88wHB5Pb69v5zd++Lp/Ou7K7zLB8P30xOrhZX8+D+e3v0+Hh+Hl6fg8/PmyPzy/PJ0vd0bcYQDD' 
+b'f1lCeZCFLGkIw2SVw+M3h8P9cH+XTezS0RahOWYTt3R0RWiO2cQvHX0RmmM2CUvHUITmmE3iELNj' +b'bI6xCM0xm6TljKkIzTGb4HJGLEJzzCa0nJGK0ByzCZhVBpgq1RwoRrBcLzSpOhejVQqBrVJ1Lkac' +b'Rnlq8S8K1OdFqhDFyHcgfJUqRDHi1EoNIk0gQpUqRDGKq12IVarOxSit9j9VqToXI1zNjFWqztnI' +b'muXM1lSpOuecszAEU+YuI9SnRaoUUmaR1Alt9eEdwlapQmTTz8vgPxehemb4L0vPL0WoniXjBuuR' +b'fPQM1DS3JF3XQfCFX623gC5YapozS/btIYRKv4TWhRCoadEsabiHECsPk7HGM8SoxSUf9xAqEUIg' +b'5wIlN2p+SZM9hMqI4INJGF0ctbTkyx5CpUZw6JxP/DE0DZfE2UFwjSHBuYRJXmfT3JpBeyCNKcHx' +b'+mPMIEVzaybtgTTG5Plt5FygptU1zBi1B9I4E8C74B1vcdNozak9kMaaYCKiS5ldikZrVu2BNN4E' +b'w7kJgNQ0Mmte7YE0akRnffIemmLXBNmDaASJxkbej9CUuKbJHkQxoiFg5JfKfF2VVJ8XqUJQY1rP' +b'6QkxNaUx8IRvOxDeNE6NkYjQNcWvGbcH0Zg1GDToefKq0JpxexC2Mi5TjTNIYdTiknl7CK5wbEgW' +b'kucEqEpc8m/Pv2Qfv4NAiGWgCfPmLpvJ+/T0cHl83R93r/vL4bH5Px0f/uGW/M6HwrLAyVCkm3Cv' +b'jsFriLUZBTQpS5w0o5veSetIb1w/tubUurz+8uU3Tr4Ng7Q+9TaMYFrLyuUrSz9h3q2dCK1LBeNB' +b'QLxtfFufbwHYjd71xijc2MWGFLKMM87dXId/bwBtPjKU4jY2gJsAYezjYibNCHOC2gSIlUMScQ0W' +b'GRurFHnjTqR5Z4GTHuO9syhYbgsLxx4DrA3oRs1PeowbsWjabeDYbeCs29Cx+ERnJh2Hk1xtWlhl' +b'voIyL/chUtOSuZJ8CtCs5HtiamtauJJ+CpCbFluDQE2z82K7CeSnpc5TbMq81G3ChLHccaFNzI5V' +b'oSspufXqY5wWPnnhVQmzwre5pjQpfkkqV1XirPhtwuB7AbRceWDU7KQA3hhZ2U+7+/V3TNxzhh+w' +b'279cBk+B6Vykn6WBXCnxFyzXSwWK8+3+LrX2kD9SbmdFgu7G87JbPSHvPv1mkzQBdXLk75NPKHkI' +b'03ZvZSiwbroaK7D2vXnkz1P4uGl2inbFHcQ9Z/IfxzJZ8rIlpwPvsWdgkW7W+a13xAuIrz/3TtRr' +b'NyduYXDfLw8vO7gPxnzawY5fDe8fJ+qK+8IghhmgvWNrOKp8/ROv4/B2lmNXRaqruA6UkiA1NoYo' +b'xwGIYBT/EkkUP25olUhmzKtGQtdxUkBMWeIET40HJVXRVEuucS5Lr/iXeCRXkZMolYXsrgU0vtjt' +b'kNB2oPjAVeXVFOtEJXmDzQE8OKEQGXSQHBrK94JSH5V3teBcNbTQgUp8dBOBM0A9MEmjekai1XfU' +b'gyhhBXHl3fhrvRT5lj3JH7NvBl027k4LELuglijKJybD9B6mskwnyJxb9fBgPRMwJLKjpuGUSFGu' +b'Xs1gb410drzQIuWy0gWNSPw684ATUDVSklSjaim3TYGDjKOWFJwcKQk384bQrZHODgD9SFslJN+H' +b'dk7OuHkIE2g9XslBqubgxJbknFc1p+CUeKWMkLTzNyfx/Miivtyk4MqlEZTBznD1eHMmYluCczE3' +b'FKOGKlQJWQoOhwT+5pA7jcH2DoDcufbmcWQSl8o8xBsakOvTcP+bJ+p0OWDZmaMuI/2fafJuSQQC' 
+b'L9XkShw/Ojs2r1P6FjkNmk/bIAgyhHWp6e+Kz9itQUKud8ZJj1iUK1VrHbnLEFxs8OZMmZ9i9bij' +b'BsxHrFCG+fF4I+qUkSdXgXwa5aVVBVSoGnXMEDiEW4Oe3QbpIZOCmuQ95yFMUPV4IScn1GUAgk/S' +b'z48aKUg1XMoIMMRbw51dX6nhcrr1UflXWwY3Qd0IN2ckVAfI5wbiHrhppCCVcMFmBOmJb87p+dWO' +b'HnFQgJEMl+I8pBnwRtA5IUdjMtai874pQYWqUUuV/A80XjlL', b'']) + + +def revcomp(sequence): + """Reverse-complement a sequence""" + matcher = lambda match: COMPLEMENTS[match.group()] + return COMPLEMENT_PATTERN.sub(matcher, sequence[::-1]) + + +def retrieve_hook(count, blocksize, totalsize): + """Progress bar hook for urlretrieve""" + if totalsize > 0: + percent = min(count * blocksize * 100 / totalsize, 100) + print("{:.2f}% downloaded".format(percent), end="\r", file=stderr) + else: + megabytes = count * blocksize / 1e6 + print("{:.3f}Mb downloaded".format(megabytes), end="\r", file=stderr) + + +def download_assemblies(workdir): + """Download hg38 and stong2014 assemblies from appropriate URLs""" + for url, fn in [(HG38_URL, "hg38.fa.gz"), (STONG2014_URL, "stong2014.fa")]: + local_fn = path.join(workdir, fn) + print("Downloading into {} from {}".format(local_fn, url), file=stderr) + urlretrieve(url=url, filename=local_fn, reporthook=retrieve_hook) + yield local_fn + + +@contextmanager +def open_fasta(filename): + """Open FASTA with 'open' if plaintext, 'gzip.open' if gzipped""" + with open(filename, mode="rb") as bytes_handle: + is_gzipped = (hexlify(bytes_handle.read(2)) == b"1f8b") + if is_gzipped: + yield gzopen(filename, mode="rt") + else: + yield open(filename, mode="rt") + + +def parser_iterator(filename, to_revcomp, desc="Parsing", entry_filter=lambda e:True): + """Parse FASTA and iterate over converted entries""" + entry_accepted, entry_needs_revcomp, lines_to_revcomp = False, False, [] + with open_fasta(filename) as fasta: + for line in tqdm(map(str.strip, fasta), desc=desc, unit="line"): + if line.startswith(">"): + if lines_to_revcomp: # generated from previous entry + previous_entry_sequence = 
revcomp("".join(lines_to_revcomp)) + yield fill(previous_entry_sequence) + lines_to_revcomp = [] + name = line[1:].split()[0] + entry_accepted = entry_filter(name) + entry_needs_revcomp = (name in to_revcomp) + if entry_accepted: + if entry_needs_revcomp: + yield ">" + name + REVCOMP_POSTFIX + else: + yield ">" + name + elif entry_accepted: + if entry_needs_revcomp: + lines_to_revcomp.append(line) + else: + yield line + + +def generate_hg38ext(hg38, stong2014): + """Generate hg38ext from the hg38 and stong2014 FASTA files, write to stdout""" + subhaps = {mask.format(STONG_INFIX) for mask in STONG2014_SUBHAP_MASKS} + to_revcomp = {mask.format(STONG_INFIX) for mask in TO_REVCOMP_MASKS} + hg38_iterator = parser_iterator(hg38, to_revcomp, desc="Parsing reference") + stong2014_iterator = parser_iterator( + stong2014, to_revcomp, desc="Parsing subtelomeres", + entry_filter=lambda name: (name in subhaps) + ) + for line in chain(hg38_iterator, stong2014_iterator): + print(line) + return 0 + + +def output_ecx(): + """Write the hg38ext ECX to stdout""" + print(decompress(decodebytes(HG38EXT_ECX_GZ)).decode().rstrip("\n")) + + +if __name__ == "__main__": + # interpret command-line arguments and dispatch to subroutines: + if (len(argv) == 2) and (argv[1] == "--remote"): + with TemporaryDirectory() as workdir: + hg38, stong2014 = download_assemblies(workdir) + returncode = generate_hg38ext(hg38, stong2014) + elif (len(argv) == 4) and (argv[1] == "--local"): + returncode = generate_hg38ext(hg38=argv[2], stong2014=argv[3]) + elif (len(argv) == 2) and (argv[1] == "--ecx"): + returncode = output_ecx() + else: + print(USAGE.format(__file__).rstrip(), file=stderr) + returncode = 1 + exit(returncode) diff --git a/assets/paper/jupyter/Figure-1-pipeline.ipynb b/assets/paper/jupyter/Figure-1-pipeline.ipynb new file mode 100644 index 0000000..d30a060 --- /dev/null +++ b/assets/paper/jupyter/Figure-1-pipeline.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools import lru_cache\n", + "from pysam import AlignmentFile" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "ecx = pd.read_csv(\"../../hg38ext.fa.ecx\", sep=\"\\t\", skiprows=1, escapechar=\"#\") \\\n", + " .query(\"flag==16384\").query(\"blacklist=='-'\") \\\n", + " .drop(columns=[\"entry\", \"pos+1\", \"main_rname\", \"flag\", \"link\", \"blacklist\", \"class\"])\n", + "\n", + "ecx[\"sorter\"] = ecx[\"chromosome\"].apply(lambda c: 999 if c == \"chrX\" else int(c[3:]))\n", + "ecx = ecx.sort_values(by=\"sorter\").drop(columns=\"sorter\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def load_bam(filename):\n", + " p_arm, q_arm = defaultdict(list), defaultdict(list)\n", + " with AlignmentFile(filename) as bam:\n", + " for entry in bam:\n", + " if (entry.flag & 0x4000 == 0x4000) and (entry.seq is not None): # tract_anchor\n", + " entry_stats = [\n", + " entry.reference_start, entry.query_alignment_start,\n", + " entry.reference_end, entry.query_alignment_end,\n", + " len(entry.seq)\n", + " ]\n", + " if entry.flag & 0x8000 == 0x8000: # is_q\n", + " q_arm[entry.reference_name].append(entry_stats)\n", + " else:\n", + " p_arm[entry.reference_name].append(entry_stats)\n", + " return p_arm, q_arm" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def 
get_chroms(ecx, p_arm, q_arm, blacklist=[]):\n", + " chroms = []\n", + " for c in ecx[\"chromosome\"]:\n", + " if ecx.loc[ecx[\"chromosome\"]==c, \"rname\"].iloc[0] in (set(p_arm) | set(q_arm)):\n", + " if (c not in chroms) and (c not in blacklist):\n", + " chroms.append(c)\n", + " return chroms" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def get_reads(refdata, x_arm, prime):\n", + " _x = refdata.loc[refdata[\"prime\"]==prime, \"rname\"]\n", + " if len(_x):\n", + " if x_arm[_x.iloc[0]]:\n", + " return _x.iloc[0], x_arm[_x.iloc[0]]\n", + " return None, []" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "Y_FACTOR = 2" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def init_figure(chroms, p_arm, q_arm, r=15000, rin=-20000, rout=12000, ymin=-19, b=-13, m=-9, t=-4, lw=.75, annot=True):\n", + " maxcov = max(\n", + " max(len(get_reads(ecx[ecx[\"chromosome\"]==chrom], p_arm, 5)[1]), len(get_reads(ecx[ecx[\"chromosome\"]==chrom], q_arm, 3)[1]))\n", + " for chrom in chroms\n", + " )\n", + " n_chroms = len(chroms)\n", + " figure, axs = subplots(\n", + " figsize=(15, (maxcov-ymin)*n_chroms*Y_FACTOR/40), gridspec_kw=dict(wspace=0, width_ratios=(7, 3, 7)),\n", + " ncols=3, nrows=n_chroms, squeeze=False, sharey=True,\n", + " )\n", + " for j, row in enumerate(axs):\n", + " for i, ax in enumerate(row):\n", + " order = i - 1\n", + " if i % 2 == 0:\n", + " ax.set(xlim=[rin*order, rout*order][::order], ylim=(ymin, maxcov*Y_FACTOR))\n", + " ax.set(xticklabels=[int(t/1000) for t in ax.get_xticks()])\n", + " else:\n", + " ax.set(xlim=[-r, r], ylim=(ymin, maxcov*Y_FACTOR), xticks=[0], xticklabels=[\"...\"], yticks=[])\n", + " ax.spines[\"bottom\"].set_linestyle((0, (3, 10, 1, 10, 1, 10)))\n", + " for spine in [\"left\", \"top\", \"right\"]:\n", + " ax.spines[spine].set_visible(False)\n", + " if 
i == 1:\n", + " if j == axs.shape[0]-1:\n", + " ax.text(x=0, y=ymin*2.4, s=\"Position in Kbp, relative to boundaries of telomeric tracts\", fontsize=13, ha=\"center\")\n", + " if annot:\n", + " ax.text(x=0, y=5, s=\"centromere\", fontsize=12, ha=\"center\")\n", + " else:\n", + " if annot:\n", + " sub_ha, tel_ha = [\"right\", \"left\"][::order]\n", + " ax.text(x=-order*r/20, y=m, s=\"subtelomere\", fontsize=12, ha=sub_ha, va=\"center\")\n", + " ax.text(x=order*r/20, y=m, s=\"telomere\", fontsize=12, ha=tel_ha, va=\"center\")\n", + " xw = r/3\n", + " for order in (1, -1):\n", + " row[1].plot([-xw, xw], [t, b][::order], color=\"black\", lw=lw)\n", + " row[1].fill_between([xw*order, 0], [b, m], [t, m], color=\"#E0E0E0\")\n", + " for i in range(1, 8):\n", + " row[1].plot([(xw+xw*i/3)*order]*2, [b, t], color=\"gray\", lw=3)\n", + " for ax, order in zip((row[0], row[2]), (1, -1)):\n", + " ax.plot([0, 0], [b, t], lw=.75, color=\"black\")\n", + " for y in (t, b):\n", + " ax.plot([0, -.95*rin*order], [y, y], lw=lw, color=\"black\")\n", + " ax.plot([0, -.95*rout*order], [y, y], lw=lw, color=\"black\", ls=\"--\")\n", + " ax.fill_between([0, -.95*rin*order], [b, b], [t, t], color=\"#E0E0E0\")\n", + " ax.fill_between([0, -.95*rout*order], [b, b], [t, t], color=\"#EEF0FF\")\n", + " if annot:\n", + " row[0].text(x=-12000, y=0, s=\"$p$ arm (5')\", fontsize=12, ha=\"left\")\n", + " row[2].text(x=+12000, y=0, s=\"$q$ arm (3')\", fontsize=12, ha=\"right\")\n", + " return figure, axs" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_pileup(chrom, p_arm, q_arm, ecx, pax, qax, ymin=-19):\n", + " refdata = ecx[ecx[\"chromosome\"]==chrom]\n", + " p_ref, p_reads = get_reads(refdata, p_arm, 5)\n", + " q_ref, q_reads = get_reads(refdata, q_arm, 3)\n", + " if p_reads:\n", + " anchor_pos = refdata.loc[refdata[\"prime\"]==5, \"pos\"].iloc[0]\n", + " plottables = []\n", + " for rs, qs, re, qe, sl in p_reads:\n", + " xe = re 
+ (sl - qe) - anchor_pos\n", + " xs = xe - sl\n", + " plottables.append([xs, xe])\n", + " for i, (xs, xe) in enumerate(sorted(plottables, key=lambda x:x[1], reverse=True)):\n", + " j = i * Y_FACTOR\n", + " pax.plot([xs, 0], [j, j], color=\"darkblue\", lw=.75*Y_FACTOR)\n", + " pax.plot([0, xe], [j, j], color=\"gray\", lw=.75*Y_FACTOR)\n", + " if q_reads:\n", + " anchor_pos = refdata.loc[refdata[\"prime\"]==3, \"pos\"].iloc[0]\n", + " for i, (rs, qs, re, qe, sl) in enumerate(sorted(q_reads)):\n", + " j = i * Y_FACTOR\n", + " xs = rs - anchor_pos\n", + " xe = xs + (sl - qs)\n", + " qax.plot([xs, 0], [j, j], color=\"gray\", lw=.75*Y_FACTOR)\n", + " qax.plot([0, xe], [j, j], color=\"darkblue\", lw=.75*Y_FACTOR)\n", + " for ax, reads in zip((pax, qax), (p_reads, q_reads)):\n", + " ax.plot([0, 0], [ymin, len(reads)*Y_FACTOR], color=\"#EE3333\", lw=2, ls=\"--\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib.patches import Rectangle\n", + "from matplotlib.pyplot import close\n", + "\n", + "def add_custom_legend(ax):\n", + " ax.add_patch(Rectangle((-11500, 40), 16500, 53, facecolor=\"none\", edgecolor=\"black\", lw=.5, clip_on=False))\n", + " ax.plot([-5000, 3500], [75, 75], lw=3, color=\"#BBB\", clip_on=False)\n", + " ax.plot([-10300, -5000], [75, 75], lw=3, color=\"darkblue\", clip_on=False)\n", + " ax.plot([-4900, -4900], [72, 78], lw=2, color=\"#EE3333\", clip_on=False)\n", + " ax.text(x=-9000, y=81, s=\"candidate read\", clip_on=False)\n", + " ax.plot([-10300, -5000], [50, 50], lw=3, color=\"darkblue\", clip_on=False)\n", + " ax.plot([-4900, -4900], [47, 53], lw=2, color=\"#EE3333\", clip_on=False)\n", + " ax.text(x=-9000, y=56, s=\"telomeric sequence\", clip_on=False)\n", + "\n", + "p_arm, q_arm = load_bam(\"../../../data/datasets/2021/PacBio/AshkenazimTrio/HG002/tailpuller.bam\")\n", + "chroms = get_chroms(ecx, p_arm, q_arm)\n", + "chroms = [\"chr12\"]\n", + "\n", + "Y_FACTOR = 2.5\n", 
+ "\n", + "figure, axs = init_figure(chroms, p_arm, q_arm, annot=True)\n", + "for chrom, (pax, cax, qax) in zip(chroms, axs):\n", + " plot_pileup(chrom, p_arm, q_arm, ecx, pax, qax)\n", + " for ax in pax, qax:\n", + " ax.text(x=0, y=-31, s=\"$tract\\_anchor$\", va=\"top\", ha=\"center\", fontsize=8)\n", + " add_custom_legend(pax)\n", + "\n", + "figure.savefig(\"Figure_1.pdf\", bbox_inches=\"tight\")\n", + "close(figure)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Figure-3-entropy.ipynb b/assets/paper/jupyter/Figure-3-entropy.ipynb new file mode 100644 index 0000000..7c8c717 --- /dev/null +++ b/assets/paper/jupyter/Figure-3-entropy.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice, chain\n", + "from functools import lru_cache" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from edgecaselib.formats import load_kmerscan, EmptyKmerscanError\n", + "from scipy.stats import entropy\n", + "from os import path\n", + "from pickle import load, dump\n", + "from gzip import 
open as gzopen" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SUBJECT_TO_TRIO = {\n", + " \"HG001\": \"NA12878\",\n", + " \"HG002\": \"AshkenazimTrio\", \"HG003\": \"AshkenazimTrio\", \"HG004\": \"AshkenazimTrio\",\n", + " \"HG005\": \"ChineseTrio\", \"HG006\": \"ChineseTrio\", \"HG007\": \"ChineseTrio\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = \"../../../data/datasets/2021\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KMERSCANS_PKL = f\"{DATA_DIR}/PacBio/kmerscans-for-entropy.pkl.gz\"\n", + "\n", + "if path.isfile(KMERSCANS_PKL):\n", + " with gzopen(KMERSCANS_PKL, mode=\"rb\") as pkl:\n", + " KMERSCANS = load(pkl)\n", + "else:\n", + " KMERSCANS = {}\n", + " for subject, trio in SUBJECT_TO_TRIO.items():\n", + " try:\n", + " ksp = load_kmerscan(\n", + " f\"{DATA_DIR}/PacBio/{trio}/{subject}/kmerscanner-all-p_arm.dat.gz\",\n", + " gzipped=True, samfilters=[[\"tract_anchor\"], [\"is_q\"], 0],\n", + " bin_size=10,\n", + " )\n", + " except EmptyKmerscanError:\n", + " ksp = {}\n", + " try:\n", + " ksq = load_kmerscan(\n", + " f\"{DATA_DIR}/PacBio/{trio}/{subject}/kmerscanner-all-q_arm.dat.gz\",\n", + " gzipped=True, samfilters=[[\"is_q\", \"tract_anchor\"], 0, 0],\n", + " bin_size=10,\n", + " )\n", + " except EmptyKmerscanError:\n", + " ksq = {}\n", + " KMERSCANS[subject] = (ksp, ksq)\n", + " with gzopen(KMERSCANS_PKL, mode=\"wb\") as pkl:\n", + " dump(KMERSCANS, pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_per_read_modes(bdf):\n", + " return (\n", + " bdf.groupby(\"name\")\n", + " .apply(lambda block: block.set_index(\"motif\").iloc[:,8:].idxmax(axis=0))\n", + " .dropna(how=\"all\", axis=1)\n", + " )" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_entropies(bdf):\n", + " per_read_modes = get_per_read_modes(bdf)\n", + " N = len(per_read_modes.melt().value.dropna().unique())\n", + " return pd.DataFrame({\n", + " \"entropy\": per_read_modes.apply(lambda c: entropy(c.value_counts())) / np.log(N),\n", + " \"coverage\": (~per_read_modes.isnull()).sum(axis=0),\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ENTROPIES_DICT = {\n", + " subject: pd.concat(\n", + " calculate_entropies(bdf)\n", + " for bdf in tqdm(\n", + " chain(ksp.values(), ksq.values()),\n", + " desc=subject, unit=\"arm\",\n", + " total=len(ksp)+len(ksq),\n", + " )\n", + " )\n", + " for subject, (ksp, ksq) in KMERSCANS.items()\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def weighted_quantile(points, weights, q):\n", + " if q == 1:\n", + " return max(points)\n", + " elif q > 1:\n", + " return ValueError\n", + " else:\n", + " indsort = np.argsort(points.values)\n", + " spoints, sweights = points.values[indsort], weights.values[indsort]\n", + " sn = np.cumsum(sweights)\n", + " pn = (sn - sweights / 2) / sn[-1]\n", + " return np.interp(q, pn, spoints)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_entropies(entropies_dict, downsample_viz=1, scale=1, cmap_2d=\"Greys\", vmin_2d=-.009, cmap_levels=\"gist_heat_r\", vmin_levels=-.003):\n", + " width_ratios = [.75, 2.5] * len(entropies_dict)\n", + " height_ratios = [5, .5]\n", + " figure, axs = subplots(\n", + " figsize=(sum(width_ratios)*scale, sum(height_ratios)*scale),\n", + " ncols=len(width_ratios), nrows=len(height_ratios), squeeze=False,\n", + " gridspec_kw=dict(width_ratios=width_ratios, wspace=0, height_ratios=height_ratios, hspace=0),\n", + " )\n", + " for i, (subject, 
entropies) in tqdm(enumerate(entropies_dict.items()), desc=\"Plotting\", total=len(entropies_dict)):\n", + " maxcov = entropies[\"coverage\"].max()\n", + " ax2d = axs[0,i*2+1]\n", + " kde2d_kws=dict(data=entropies[::downsample_viz], x=\"coverage\", y=\"entropy\", cut=0, ax=ax2d)\n", + " sns.kdeplot(**kde2d_kws, levels=50, fill=True, cmap=cmap_2d, vmin=vmin_2d, vmax=1.5)\n", + " sns.kdeplot(**kde2d_kws, levels=10, fill=False, cmap=cmap_levels, vmin=vmin_levels, vmax=1.75)\n", + " ax2d.set(xlabel=None, ylabel=None, ylim=(0, 1), xticks=[], yticks=[])\n", + " ax2d.set_title(subject, fontsize=13)\n", + " xoffset = maxcov * (.45 if subject == \"HG006\" else .95) # not sure why it gets misaligned for HG006 otherwise...\n", + " ax2d.text(x=xoffset, y=.95, ha=\"right\", va=\"top\", s=\"Weighted\\npercentiles\\nof entropy:\")\n", + " for q, y, desc in zip([.25, .5, .75, 1], [.6, .67, .74, .81], [\"25th (Q1)\", \"50th (Q2)\", \"75th (Q3)\", \"100th\"]):\n", + " qval = weighted_quantile(entropies[\"entropy\"], entropies[\"coverage\"]-1, q)\n", + " color = \"green\" if (q == .5) else \"#777\"\n", + " ax2d.text(x=xoffset, y=y-.05, ha=\"right\", va=\"top\", s=f\"{desc}: {qval:.2f}\", color=color, fontsize=13)\n", + " ax_e = axs[0,i*2]\n", + " sns.kdeplot(data=entropies, y=\"entropy\", cut=0, ax=ax_e, shade=True, color=\"#555\")\n", + " ax_e.set(\n", + " xlim=(ax_e.get_xlim()[1]*1.25, 0),\n", + " ylim=(0, 1),\n", + " xticks=[], yticks=[], xlabel=None, ylabel=None,\n", + " )\n", + " for spine in \"top\", \"bottom\":\n", + " ax_e.spines[spine].set_visible(False)\n", + " ax_c = axs[1,i*2+1]\n", + " sns.kdeplot(data=entropies, x=\"coverage\", cut=0, ax=ax_c, shade=True, color=\"#555\")\n", + " ax_c.set(\n", + " xlim=(1, maxcov), ylim=(ax_c.get_ylim()[1]*1.2, 0),\n", + " xticks=[1, maxcov], xlabel=None, yticks=[], ylabel=None,\n", + " )\n", + " ax_c.text(x=maxcov/2, y=ax_c.get_ylim()[0]*1.22, s=\"coverage\", ha=\"center\", va=\"top\", fontsize=13)\n", + " for spine in \"left\", 
\"right\":\n", + " ax_c.spines[spine].set_visible(False)\n", + " axs[1,i*2].set(frame_on=False, xticks=[], yticks=[], xlim=(0, 1), ylim=(0, 1))\n", + " axs[0,0].set(yticks=[0, 1])\n", + " axs[0,0].text(x=axs[0,0].get_xlim()[0]*1.12, y=.5, s=\"entropy\", rotation=90, ha=\"right\", va=\"center\", fontsize=13)\n", + " return figure\n", + "\n", + "figure = plot_entropies(ENTROPIES_DICT, downsample_viz=1, vmin_2d=-.015, vmin_levels=-.05, scale=.82)\n", + "figure.savefig(\"Figure_3.pdf\", bbox_inches=\"tight\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Figure-4-haplotypes-p_arm.ipynb b/assets/paper/jupyter/Figure-4-haplotypes-p_arm.ipynb new file mode 100644 index 0000000..3f469d8 --- /dev/null +++ b/assets/paper/jupyter/Figure-4-haplotypes-p_arm.ipynb @@ -0,0 +1,816 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc, rc_context, close\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools import lru_cache\n", + "from argparse import Namespace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.spatial.distance import squareform\n", + "from scipy.cluster.hierarchy import dendrogram, fcluster, linkage\n", + "from sklearn.metrics import 
silhouette_score\n", + "from matplotlib.gridspec import GridSpec\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.cluster.hierarchy import cophenet\n", + "from scipy.stats import pearsonr, wilcoxon" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from edgecaselib.formats import load_index, load_kmerscan\n", + "from edgecaselib.densityplot import interpret_arguments\n", + "from edgecaselib.util import natsorted_chromosomes\n", + "from pickle import dump, load\n", + "from os import path\n", + "from tempfile import NamedTemporaryFile\n", + "from subprocess import check_output, CalledProcessError\n", + "from pysam import AlignmentFile\n", + "from scipy.stats import chi2_contingency\n", + "from statsmodels.stats.multitest import multipletests\n", + "from scipy.ndimage import uniform_filter1d\n", + "from matplotlib.patches import FancyArrowPatch, Rectangle\n", + "from matplotlib.lines import Line2D" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def wilcoxon_dropna(df, a, b):\n", + " dfnona = df[[a, b]].dropna()\n", + " try:\n", + " yes = sum(dfnona[a] < dfnona[b])\n", + " no = sum(dfnona[a] > dfnona[b])\n", + " p = wilcoxon(dfnona[a], dfnona[b])[1]\n", + " return yes, no, p\n", + " except ValueError:\n", + " return np.nan, np.nan, np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SAMFILTERS = [[\"tract_anchor\"], [\"is_q\"], 0]\n", + "ecx = load_index(\"../../hg38ext.fa.ecx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SUBJECT_TO_TRIO = {\n", + " \"HG001\": \"NA12878\",\n", + " \"HG002\": \"AshkenazimTrio\", \"HG003\": \"AshkenazimTrio\", \"HG004\": \"AshkenazimTrio\",\n", + " \"HG005\": 
\"ChineseTrio\", \"HG006\": \"ChineseTrio\", \"HG007\": \"ChineseTrio\",\n", + "}\n", + "P_CHROMS = [\"chr2\", \"3ptel_1-500K_1_12_12\", \"4ptel_1-500K_1_12_12\", \"chr5\", \"chr9\", \"chr12\", \"17ptel_1_500K_1_12_12\"]\n", + "MAXLEN = 1450\n", + "DATA_DIR = \"../../../data/datasets/2021\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def squarify(narrowform):\n", + " triu_fillna = narrowform.pivot(index=\"qname1\", columns=\"qname2\", values=\"relative_ld\").fillna(0)\n", + " return triu_fillna.T + triu_fillna\n", + "\n", + "distances_narrowform = pd.read_csv(f\"{DATA_DIR}/PacBio/haplotypes/levenshtein-p_arm.tsv\", sep=\"\\t\", escapechar=\"#\")\n", + "\n", + "RAW_GLOBAL_LDS = {\n", + " rname: squarify(distances_narrowform[distances_narrowform[\"rname\"]==rname].drop(columns=\"rname\"))\n", + " for rname in distances_narrowform[\"rname\"].drop_duplicates()\n", + " if rname in P_CHROMS\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KMERSCANNER_PKL = f\"{DATA_DIR}/PacBio/kmerscanner-p_arm.pkl\"\n", + "KMERSCANNER_DAT = f\"{DATA_DIR}/PacBio/kmerscanner-p_arm.dat.gz\"\n", + "\n", + "if path.isfile(KMERSCANNER_PKL):\n", + " with open(KMERSCANNER_PKL, mode=\"rb\") as pkl:\n", + " DENSITIES = load(pkl)\n", + "else:\n", + " DENSITIES = load_kmerscan(KMERSCANNER_DAT, True, SAMFILTERS, 10)\n", + " with open(KMERSCANNER_PKL, mode=\"wb\") as pkl:\n", + " dump(DENSITIES, pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class GridFig():\n", + "\n", + " def __init__(self, width_ratios, height_ratios, scale=1):\n", + " self.figure, _ = subplots(\n", + " figsize=(sum(width_ratios)*scale, sum(height_ratios)*scale),\n", + " ncols=0, nrows=0,\n", + " )\n", + " self.gs = GridSpec(\n", + " ncols=len(width_ratios), wspace=0, width_ratios=width_ratios, \n", + " 
nrows=len(height_ratios), hspace=0, height_ratios=height_ratios,\n", + " figure=self.figure,\n", + " )\n", + " \n", + " def subplot(self, gridspec_slice, aspect=\"auto\", frame=False):\n", + " ax = self.figure.add_subplot(gridspec_slice, aspect=aspect)\n", + " if frame is False:\n", + " ax.set(frame_on=False)\n", + " else:\n", + " for spine in {\"top\", \"right\", \"bottom\", \"left\"} - set(frame):\n", + " ax.spines[spine].set_visible(False)\n", + " return ax" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_dendrogram(Z, gf):\n", + " ax = gf.subplot(gf.gs[0,10])\n", + " with rc_context({\"lines.linewidth\": .5}):\n", + " dendrogram(\n", + " Z, orientation=\"left\",\n", + " link_color_func=lambda x: \"black\", ax=ax,\n", + " )\n", + " ax.set(\n", + " xticks=[], xlabel=None,\n", + " yticks=[], ylabel=None,\n", + " xlim=ax.get_xlim()[::-1],\n", + " ylim=ax.get_ylim()[::-1],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib.colors import PowerNorm\n", + "\n", + "def plot_heatmap(data2d, gf, cmap=\"gray_r\", vmax=.15):\n", + " ax = gf.subplot(gf.gs[0,9])\n", + " sns.heatmap(data2d, cmap=cmap, cbar=False, vmin=0, vmax=vmax, ax=ax, norm=PowerNorm(gamma=.5))\n", + " ax.set(xticks=[], yticks=[], xlabel=None, ylabel=None, xlim=ax.get_xlim()[::-1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def cluster(lds, metric=\"correlation\", method=\"ward\"):\n", + " Z = linkage(squareform(lds), metric=metric, method=method, optimal_ordering=False)\n", + " leaves = dendrogram(Z, no_plot=True)[\"leaves\"]\n", + " data2d = lds.iloc[leaves, leaves].copy()\n", + " dispatcher = pd.DataFrame(index=data2d.index)\n", + " dispatcher.index.name = \"read\"\n", + " to_subject = dispatcher.index.map(lambda s: s.split(\":\")[1])\n", + " for subject in 
sorted(to_subject.drop_duplicates()):\n", + " dispatcher[subject] = (to_subject==subject)\n", + " return Z, data2d, dispatcher" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_plottable_density_section(densities, chrom, motif, data2d, ecx):\n", + " chrom_densities = densities[chrom]\n", + " if motif is None:\n", + " by_motif = chrom_densities[chrom_densities[\"motif\"]==\"CCCTAA\"]\n", + " else:\n", + " by_motif = chrom_densities[chrom_densities[\"motif\"]==motif]\n", + " section = by_motif.set_index(\"name\").reindex(data2d.index).iloc[:,8:].copy()\n", + " if motif is None:\n", + " section = (~section.isnull()).astype(int) / 3\n", + " section.columns = section.columns.astype(int)\n", + " anchor = ecx.loc[\n", + " (ecx[\"rname\"]==chrom) & (ecx[\"flag\"]==0x4000) & (ecx[\"prime\"]==5),\n", + " \"pos\",\n", + " ].iloc[0]\n", + " return section[[c for c in section.columns if c<=anchor]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_absentees(lds, densities, chrom, ecx):\n", + " raw_section = get_plottable_density_section(densities, chrom, \"CCCTAA\", lds, ecx)\n", + " nulls = raw_section.isnull().all(axis=1)\n", + " return nulls[nulls].index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def section_to_RGB(ps, color, alpha_factor=1.2):\n", + " return np.transpose(\n", + " np.array([\n", + " np.full_like(ps, color[0]),\n", + " np.full_like(ps, color[1]),\n", + " np.full_like(ps, color[2]),\n", + " np.clip(ps*alpha_factor, a_min=None, a_max=1),\n", + " ]),\n", + " axes=(1, 2, 0),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def draw_fancy_arrow(\n", + " y, start, end, ax, lw=.25,\n", + " csty=\"angle3,angleA=45,angleB=-45\",\n", + " asty=\"Simple, tail_width=.25, 
head_width=2, head_length=3\"\n", + "):\n", + " ax.add_patch(FancyArrowPatch(\n", + " (start, y), (end, y),\n", + " connectionstyle=csty,\n", + " arrowstyle=asty,\n", + " lw=lw, color=\"#888\", clip_on=False,\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "POPULATION_COLORS = {\n", + " \"HG001\": \"black\",\n", + " \"HG002\": \"green\", \"HG003\": \"green\", \"HG004\": \"green\",\n", + " \"HG005\": \"steelblue\", \"HG006\": \"steelblue\", \"HG007\": \"steelblue\",\n", + "}\n", + "\n", + "def plot_subjects(dispatcher, gf, s=10):\n", + " for i, subject in enumerate(sorted(SUBJECT_TO_TRIO)):\n", + " sax = gf.subplot(gf.gs[0,1+i])\n", + " sax.plot([0, 0], [0, len(dispatcher)], lw=.5, color=\"#888\")\n", + " if subject in dispatcher:\n", + " truthiness = dispatcher[subject].reset_index(drop=True)\n", + " positions = truthiness[truthiness].index\n", + " for x in [-.1, 0, .1]:\n", + " sax.scatter(x=[x]*len(positions), y=positions, marker=\"_\", s=s, color=POPULATION_COLORS[subject])\n", + " sax.set(\n", + " xticks=[0], xticklabels=[subject+\" \"],\n", + " yticks=[], xlabel=None, ylabel=None,\n", + " xlim=(-.5, .5),\n", + " ylim=(len(dispatcher), -1),\n", + " )\n", + " for tick in sax.get_xticklabels():\n", + " tick.set_rotation(90)\n", + " if subject in {\"HG002\", \"HG005\"}:\n", + " draw_fancy_arrow(len(dispatcher), 1, 0, sax)\n", + " draw_fancy_arrow(len(dispatcher), 2, 0, sax)\n", + " sax.tick_params(axis=\"both\", which=\"both\", length=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "IMSHOW_PALETTE = {\n", + " None: [.7, .7, .7],\n", + " \"CCCTAA\": [.0, .4, .1],\n", + " \"CCCTCA\": [1, 1, 0],\n", + " \"CCCCTAA\": [.5, .9, 1],\n", + "}\n", + "\n", + "def plot_densities(densities, chrom, data2d, ecx, gf, extent, bin_size=100):\n", + " ax = gf.subplot(gf.gs[0,0])\n", + " for motif, color in IMSHOW_PALETTE.items():\n", + " 
ps = get_plottable_density_section(densities, chrom, motif, data2d, ecx).values\n", + " breakat = MAXLEN // 100\n", + " if ps.shape[1] < MAXLEN:\n", + " ps = np.pad(ps, ((0, 0), (MAXLEN-ps.shape[1], 0)))\n", + " elif ps.shape[1] > MAXLEN:\n", + " ps = ps[:,-MAXLEN:]\n", + " pa = section_to_RGB(np.clip(uniform_filter1d(ps[:,::-1], 5, 1)[:,::-1], a_min=0.0, a_max=1.0), color, 2)\n", + " ax.imshow(pa, extent=extent, interpolation=\"nearest\")\n", + " ticklabels=(-np.linspace(MAXLEN//100, 0, MAXLEN//100+1).astype(int)).astype(str)\n", + " fullaxislen = len(ticklabels)\n", + " ticklabels = ticklabels[-breakat-1:]\n", + " xmin, xmax = extent[:2]\n", + " ax.set(\n", + " xticks=np.linspace(xmin, xmax, MAXLEN//100+1)[-breakat-1:],\n", + " xticklabels=ticklabels,\n", + " xlabel=\"Kbp of telomeric tract\",\n", + " yticks=[], ylabel=None,\n", + " )\n", + " ax.tick_params(axis=\"both\", which=\"both\", length=0)\n", + " ax.tick_params(axis=\"x\", which=\"both\", length=3)\n", + " ax.axhline(0, 1-(breakat+1)/fullaxislen, 1, lw=1, c=\"black\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@lru_cache(maxsize=None)\n", + "def convname(cn):\n", + " match = re.search(r'^\\d+', cn)\n", + " if match:\n", + " return match.group() + \"p\"\n", + " else:\n", + " return cn.split(\"chr\")[1] + \"p\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def process_lds(raw_global_lds, chrom, densities, ecx, no_plot=False, scale=.2):\n", + " lds = raw_global_lds[chrom].copy()\n", + " absentees = get_absentees(lds, densities, chrom, ecx)\n", + " lds.drop(index=absentees, columns=absentees, inplace=True)\n", + " Z, data2d, dispatcher = cluster(lds, metric=\"euclidean\", method=\"ward\")\n", + " if no_plot:\n", + " gf = None\n", + " else:\n", + " h = 6*len(lds)/50\n", + " w = 30\n", + " gf = GridFig([w]+[.85]*7+[.3,h,h/3], [h], scale=scale)\n", + " plot_dendrogram(Z, 
gf=gf)\n", + " plot_heatmap(data2d, gf=gf)\n", + " plot_subjects(dispatcher, gf=gf, s=7)\n", + " plot_densities(densities, chrom, data2d, ecx, gf=gf, extent=[0,w,0,h])\n", + " if len(chrom) > 11:\n", + " name = \"{} ({})\".format(convname(chrom), chrom[:6]+\"…\")\n", + " else:\n", + " name = \"{} ({})\".format(convname(chrom), chrom)\n", + " gf.figure.get_axes()[-10].set_ylabel(name, fontsize=13)\n", + " gf.figure.get_axes()[-10].yaxis.set_label_position(\"right\")\n", + " return lds, Z, data2d, dispatcher, gf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def cophenetic_correlation(lds, Z):\n", + " r, p = pearsonr(squareform(lds), cophenet(Z))\n", + " return r, max(p, 5e-324) # p-value of zero is just a rounding issue" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fixup_labels(gf, chrom):\n", + " if chrom == \"chr2\":\n", + " gf.figure.get_axes()[1].set_title(\"Pairwise relative\\nLevenshtein distances\", fontsize=13, loc=\"left\")\n", + " gf.figure.get_axes()[5].set_title(\"Subjects\", fontsize=13)\n", + " gf.figure.get_axes()[9].set_title(\"Motif densities\", loc=\"right\", fontsize=13)\n", + " if chrom != \"17ptel_1_500K_1_12_12\":\n", + " for ax in gf.figure.get_axes()[:-1]:\n", + " ax.set(xticklabels=[], xlabel=None)\n", + " gf.figure.get_axes()[-1].set(xlabel=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "read_to_category = lambda dispatcher, pos: pd.Series(\n", + " index=dispatcher.index,\n", + " data=dispatcher.index.map(lambda s: s.split(\":\")[pos])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def count_subtrees(dispatcher):\n", + " read_to_trio = read_to_category(dispatcher, 0)\n", + " running_trio = None\n", + " trio_runs = defaultdict(int)\n", + " run = 0\n", 
+ " for trio in read_to_trio:\n", + " if trio != running_trio:\n", + " if run:\n", + " trio_runs[running_trio] += 1\n", + " running_trio, run = trio, 1\n", + " else:\n", + " run += 1\n", + " if run:\n", + " trio_runs[running_trio] += 1\n", + " return trio_runs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reindex_to = lambda row, rtc, cat: row.reindex(rtc[rtc==cat].index).dropna()\n", + "reindex_in = lambda row, rtc: row.reindex(rtc[rtc==rtc[row.name]].index).dropna()\n", + "reindex_out = lambda row, rtc: row.reindex(rtc[rtc!=rtc[row.name]].index).dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_closest_distances(lds, dispatcher):\n", + " read_to_trio = read_to_category(dispatcher, 0)\n", + " read_to_subject = read_to_category(dispatcher, 1)\n", + " return lds.apply(\n", + " lambda row: pd.Series({\n", + " \"subject\": reindex_in(row, read_to_subject).drop(index=row.name).min(),\n", + " \"trio\": reindex_in(reindex_out(row, read_to_subject), read_to_trio).min(),\n", + " \"outgroup\": reindex_out(row, read_to_trio).min(),\n", + " }),\n", + " axis=1,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def subject_to_subject_lds(lds, dispatcher, a, b):\n", + " read_to_subject = read_to_category(dispatcher, 1)\n", + " return lds.loc[read_to_subject[read_to_subject==a].index, read_to_subject[read_to_subject==b].index]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_closest_family_distances(lds, dispatcher, **kwargs):\n", + " target, test, control = list(kwargs)\n", + " return pd.DataFrame({\n", + " f\"{test} to {target}\": subject_to_subject_lds(lds, dispatcher, kwargs[test], kwargs[target]).min(axis=1),\n", + " f\"{test} to {control}\": subject_to_subject_lds(lds, 
dispatcher, kwargs[test], kwargs[control]).min(axis=1),\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def process_distances(lds, dispatcher):\n", + " closest_distances = get_closest_distances(lds, dispatcher)\n", + " ashkenazim_from_father = get_closest_family_distances(lds, dispatcher, son=\"HG002\", father=\"HG003\", mother=\"HG004\")\n", + " ashkenazim_from_mother = get_closest_family_distances(lds, dispatcher, son=\"HG002\", mother=\"HG004\", father=\"HG003\")\n", + " chinese_from_father = get_closest_family_distances(lds, dispatcher, son=\"HG005\", father=\"HG006\", mother=\"HG007\")\n", + " chinese_from_mother = get_closest_family_distances(lds, dispatcher, son=\"HG005\", mother=\"HG007\", father=\"HG006\")\n", + " return (\n", + " closest_distances,\n", + " ashkenazim_from_father, ashkenazim_from_mother,\n", + " chinese_from_father, chinese_from_mother,\n", + " *wilcoxon_dropna(closest_distances, \"subject\", \"trio\"),\n", + " *wilcoxon_dropna(closest_distances, \"subject\", \"outgroup\"),\n", + " *wilcoxon_dropna(closest_distances, \"trio\", \"outgroup\"),\n", + " *wilcoxon_dropna(ashkenazim_from_father, \"father to son\", \"father to mother\"),\n", + " *wilcoxon_dropna(ashkenazim_from_mother, \"mother to son\", \"mother to father\"),\n", + " *wilcoxon_dropna(chinese_from_father, \"father to son\", \"father to mother\"),\n", + " *wilcoxon_dropna(chinese_from_mother, \"mother to son\", \"mother to father\"),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "stats = pd.DataFrame(columns=[\n", + " \"NA12878\", \"AshkenazimTrio\", \"ChineseTrio\",\n", + " \"cr\", \"cp\",\n", + " \"s2t_p\", \"s2o_p\", \"t2o_p\", \"aff_p\", \"afm_p\", \"cff_p\", \"cfm_p\",\n", + "])\n", + "\n", + "cd_list, aff_list, afm_list, cff_list, cfm_list = [], [], [], [], []\n", + "NO_PLOT = False\n", + "\n", + 
"for chrom in tqdm(RAW_GLOBAL_LDS):\n", + " try:\n", + " lds, Z, data2d, dispatcher, gf = process_lds(RAW_GLOBAL_LDS, chrom, DENSITIES, ecx, no_plot=NO_PLOT, scale=.2)\n", + " except ValueError: # too few observations\n", + " continue\n", + " try:\n", + " cr, cp = cophenetic_correlation(lds, Z)\n", + " except ValueError: # too few observations\n", + " cr, cp = np.nan, np.nan\n", + " if not NO_PLOT:\n", + " fixup_labels(gf, chrom)\n", + " gf.figure.savefig(\n", + " f\"{DATA_DIR}/PacBio/haplotypes/clusters-p_arm/\"+chrom+\".pdf\", bbox_inches=\"tight\",\n", + " )\n", + " close(gf.figure)\n", + " cd, aff, afm, cff, cfm, _, _, s2t_p, _, _, s2o_p, _, _, t2o_p, _, _, aff_p, _, _, afm_p, _, _, cff_p, _, _, cfm_p = (\n", + " process_distances(lds, dispatcher)\n", + " )\n", + " cd_list.append(cd)\n", + " aff_list.append(aff)\n", + " afm_list.append(afm)\n", + " cff_list.append(cff)\n", + " cfm_list.append(cfm)\n", + " stats.loc[chrom] = [\n", + " np.nan, np.nan, np.nan,\n", + " cr, cp,\n", + " s2t_p, s2o_p, t2o_p, aff_p, afm_p, cff_p, cfm_p,\n", + " ]\n", + " for trio, subtree_count in count_subtrees(dispatcher).items():\n", + " stats.loc[chrom, trio] = subtree_count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "legend, axs = subplots(figsize=(6.7, 2.5), ncols=4, gridspec_kw=dict(width_ratios=(4.2, 4.5, 1, 1), wspace=.1))\n", + "\n", + "axs[3].imshow(np.vstack([np.linspace(0, 1, 256)**.5]).T, cmap=\"Greys_r\", aspect=\"auto\")\n", + "axs[3].set(xticks=[], yticks=[0, 255])\n", + "axs[3].set_yticklabels([\"$\\geq{}0.15$\", \"0\"], fontsize=14)\n", + "axs[3].text(x=-.2, y=128, s=\"Relative\\ndistance\\n\", rotation=90, ha=\"right\", va=\"center\", fontsize=17)\n", + "\n", + "axs[2].set(frame_on=False, xticks=[], yticks=[])\n", + "\n", + "for x, subject in enumerate(sorted(SUBJECT_TO_TRIO)):\n", + " axs[1].plot([x, x], [0, 1], color=\"#888\", lw=1.5)\n", + " axs[1].scatter([x]*4, 
np.linspace(.1, .7, 4)+x/30, color=POPULATION_COLORS[subject], marker=\"_\", s=125)\n", + "\n", + "axs[1].set(xlim=(-4.5, 8.5), xticks=[], yticks=[])\n", + "twiny = axs[1].twiny()\n", + "twiny.set(xlim=(-4.5, 8.5), xticks=[])\n", + "for tick in twiny.get_xticklabels():\n", + " tick.set_rotation(80)\n", + "twiny.tick_params(axis=\"both\", which=\"both\", length=0)\n", + "for spine in \"top\", \"bottom\", \"right\", \"left\":\n", + " axs[1].spines[spine].set_visible(False)\n", + " twiny.spines[spine].set_visible(False)\n", + "axs[1].text(x=-2, y=.5, s=\"Assignment of\\nreads to subjects\", rotation=90, fontsize=16, ha=\"center\", va=\"center\")\n", + "axs[1].text(x=-1.5, y=1.1, s=\"/ populations\", rotation=90, ha=\"center\", va=\"bottom\", fontsize=16)\n", + "\n", + "csty1 = \"angle3,angleA=80,angleB=-60\"\n", + "csty2 = \"angle3,angleA=60,angleB=-70\"\n", + "asty = \"Simple, tail_width=.25, head_width=7, head_length=5\"\n", + "draw_fancy_arrow(-0.02, 2.1, 1, axs[1], lw=1, csty=csty1)\n", + "draw_fancy_arrow(-0.02, 3.1, 1, axs[1], lw=1, asty=asty, csty=csty2)\n", + "draw_fancy_arrow(-0.02, 5.1, 4, axs[1], lw=1, csty=csty1)\n", + "draw_fancy_arrow(-0.02, 6.1, 4, axs[1], lw=1, asty=asty, csty=csty2)\n", + "\n", + "axs[1].text(x=2.95, y=-.35, s=\"child{}parent\\nrelatedness\".format(chr(0x2190)), va=\"center\", ha=\"center\", fontsize=16)\n", + "line = Line2D((2.5, 4.1), (-.25, -.12), lw=1, ls=\"--\", color=\"#888\")\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "line = Line2D((2.5, 2.1), (-.25, -.16), lw=1, ls=\"--\", color=\"#888\")\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "\n", + "line = Line2D((-.2, .2), (1.1, 1.1), lw=4, color=POPULATION_COLORS[\"HG001\"])\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "axs[1].text(x=.7, y=1.1, s=\" Utah\", ha=\"center\", va=\"bottom\", rotation=50, fontsize=17, color=POPULATION_COLORS[\"HG001\"])\n", + "\n", + "line = Line2D((.8, 3.2), (1.1, 1.1), lw=4, 
color=POPULATION_COLORS[\"HG002\"])\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "axs[1].text(x=4.8, y=1.1, s=\" Ashkenazim\", ha=\"center\", va=\"bottom\", rotation=50, fontsize=17, color=POPULATION_COLORS[\"HG002\"])\n", + "\n", + "line = Line2D((3.8, 6.2), (1.1, 1.1), lw=4, color=POPULATION_COLORS[\"HG005\"])\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "axs[1].text(x=6.4, y=1.1, s=\" Chinese\", ha=\"center\", va=\"bottom\", rotation=50, fontsize=17, color=POPULATION_COLORS[\"HG005\"])\n", + "\n", + "INCLUDE_FIVEMER = False\n", + "\n", + "axs[0].add_patch(Rectangle((0,3), 1.5, .65, facecolor=\"#119933\", edgecolor=\"black\"))\n", + "axs[0].text(x=1.75, y=3.25, s=\"CCCTAA\", fontsize=15, va=\"center\")\n", + "axs[0].add_patch(Rectangle((0,2), 1.5, .65, facecolor=\"#88DFEF\", edgecolor=\"black\"))\n", + "axs[0].text(x=1.75, y=2.25, s=\"CCCCTAA\", fontsize=15, va=\"center\")\n", + "axs[0].add_patch(Rectangle((0,1), 1.5, .65, facecolor=\"#DDDDDD\", edgecolor=\"black\"))\n", + "axs[0].text(x=1.75, y=1.25, s=\"background\", fontsize=13, va=\"center\")\n", + "axs[0].set(frame_on=False)\n", + "axs[0].add_patch(Rectangle((-.4, .6), 5.3, 3.45, facecolor=\"none\", edgecolor=\"black\", lw=1, clip_on=False))\n", + "axs[0].set(xticks=[], yticks=[], xlim=(-.4, 4.8), ylim=(-0.7, 4.4))\n", + "axs[0].set_title(\"Motif densities\", fontsize=17)\n", + "\n", + "legend.add_artist(Rectangle((.08, -.2), .87, 1.73, edgecolor=\"black\", facecolor=\"none\"))\n", + "\n", + "legend.savefig(f\"{DATA_DIR}/PacBio/haplotypes/clusters-p_arm/legend.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PRINT_NS = False\n", + "\n", + "if PRINT_NS:\n", + " format_pval = lambda p: \"ns\" if (p >= .05) else (\"<1.0e-300\" if (p < 1e-300) else format(p, \".1e\"))\n", + "else:\n", + " format_pval = lambda p: format(p, \".2f\") if (p >= .05) else (\"<1.0e-300\" if (p < 
1e-300) else format(p, \".1e\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "st = stats.reindex(natsorted_chromosomes(stats.index)).copy()\n", + "st.index.name = \"contig\"\n", + "tc = st.iloc[:,:3].values.flatten()\n", + "tc = tc[~np.isnan(tc)]\n", + "print(\"Max subtree count:\", stats.iloc[:,:3].max(axis=0).sort_values().iloc[[-1]].to_string())\n", + "print(\"Median subtree count:\", np.median(tc))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "st = stats.reindex(natsorted_chromosomes(stats.index)).copy()\n", + "st.index.name = \"contig\"\n", + "st = st.iloc[:,3:].reset_index()\n", + "st.insert(loc=0, column=\"chromosome\", value=st[\"contig\"].apply(convname))\n", + "\n", + "coph = st.iloc[:,:4].copy()\n", + "coph[\"cp\"] = multipletests(coph[\"cp\"], method=\"bonferroni\")[1]\n", + "coph[\"r\"] = coph[\"cr\"].apply(lambda r: format(r, \".2f\"))\n", + "coph[\"p\"] = coph[\"cp\"].apply(format_pval)\n", + "coph.drop(columns=[\"cr\", \"cp\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"for_wilcoxon-p_arm.pkl\", mode=\"wb\") as pkl:\n", + " P = Namespace(cd_list=cd_list, aff_list=aff_list, afm_list=afm_list, cff_list=cff_list, cfm_list=cfm_list)\n", + " dump(P, pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cd_all = pd.concat(cd_list)\n", + "print(len(cd_all[cd_all[\"subject\"]>cd_all[\"outgroup\"]]), len(cd_all[cd_all[\"subject\"]>cd_all[\"outgroup\"]])/3729)\n", + "inter_reads = cd_all[cd_all[\"subject\"]>=cd_all[\"outgroup\"]*2].index\n", + "inter_dispatcher = pd.DataFrame(index=inter_reads, data={\"subject\": inter_reads.map(lambda s: s.split(\":\")[1])})\n", + "print(len(inter_dispatcher), len(inter_dispatcher)/3729)\n", + 
"inter_dispatcher[\"subject\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inter_dispatcher[\"chromosome\"] = np.nan\n", + "inter_dispatcher[\"rname\"] = np.nan\n", + "\n", + "for chrom, lds in RAW_GLOBAL_LDS.items():\n", + " for name in lds.index:\n", + " if name in inter_dispatcher.index:\n", + " inter_dispatcher.loc[name, \"rname\"] = chrom\n", + "inter_dispatcher[\"chromosome\"] = inter_dispatcher[\"rname\"].apply(convname)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inter_counts = inter_dispatcher.groupby([\"subject\", \"chromosome\"], as_index=False).count().pivot(\n", + " index=\"subject\", columns=\"chromosome\", values=\"rname\",\n", + ")\n", + "inter_counts = inter_counts[natsorted_chromosomes(inter_counts.columns)].applymap(lambda x: \"\" if np.isnan(x) else str(int(x)))\n", + "inter_counts" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Figure-5-haplotypes-q_arm.ipynb b/assets/paper/jupyter/Figure-5-haplotypes-q_arm.ipynb new file mode 100644 index 0000000..dd96ec7 --- /dev/null +++ b/assets/paper/jupyter/Figure-5-haplotypes-q_arm.ipynb @@ -0,0 +1,809 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc, rc_context, close\n", + "from tqdm import tqdm\n", + "from venn import 
venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools import lru_cache\n", + "from argparse import Namespace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.spatial.distance import squareform\n", + "from scipy.cluster.hierarchy import dendrogram, fcluster, linkage\n", + "from sklearn.metrics import silhouette_score\n", + "from matplotlib.gridspec import GridSpec\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.cluster.hierarchy import cophenet\n", + "from scipy.stats import pearsonr, wilcoxon" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from edgecaselib.formats import load_index, load_kmerscan\n", + "from edgecaselib.densityplot import interpret_arguments\n", + "from edgecaselib.util import natsorted_chromosomes\n", + "from pickle import dump, load\n", + "from os import path\n", + "from tempfile import NamedTemporaryFile\n", + "from subprocess import check_output, CalledProcessError\n", + "from pysam import AlignmentFile\n", + "from scipy.stats import chi2_contingency\n", + "from statsmodels.stats.multitest import multipletests\n", + "from scipy.ndimage import uniform_filter1d\n", + "from matplotlib.patches import FancyArrowPatch, Rectangle\n", + "from matplotlib.lines import Line2D" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def wilcoxon_dropna(df, a, b):\n", + " dfnona = df[[a, b]].dropna()\n", + " try:\n", + " yes = sum(dfnona[a] < dfnona[b])\n", + " no = sum(dfnona[a] > dfnona[b])\n", + " p = wilcoxon(dfnona[a], dfnona[b])[1]\n", + " return yes, no, p\n", + " except ValueError:\n", + " return np.nan, np.nan, np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "SAMFILTERS = [[\"is_q\", \"tract_anchor\"], 0, 0]\n", + "ecx = load_index(\"../../hg38ext.fa.ecx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SUBJECT_TO_TRIO = {\n", + " \"HG001\": \"NA12878\",\n", + " \"HG002\": \"AshkenazimTrio\", \"HG003\": \"AshkenazimTrio\", \"HG004\": \"AshkenazimTrio\",\n", + " \"HG005\": \"ChineseTrio\", \"HG006\": \"ChineseTrio\", \"HG007\": \"ChineseTrio\",\n", + "}\n", + "Q_CHROMS = [\"chr7\", \"chr8\", \"chr11\", \"chr12\", \"14qtel_1-500K_1_12_12_rc\", \"chr15\", \"18qtel_1-500K_1_12_12_rc\"]\n", + "MAXLEN = 1600\n", + "DATA_DIR = \"../../../data/datasets/2021\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def squarify(narrowform):\n", + " triu_fillna = narrowform.pivot(index=\"qname1\", columns=\"qname2\", values=\"relative_ld\").fillna(0)\n", + " return triu_fillna.T + triu_fillna\n", + "\n", + "distances_narrowform = pd.read_csv(f\"{DATA_DIR}/PacBio/haplotypes/levenshtein-q_arm.tsv\", sep=\"\\t\", escapechar=\"#\")\n", + "\n", + "RAW_GLOBAL_LDS = {\n", + " rname: squarify(distances_narrowform[distances_narrowform[\"rname\"]==rname].drop(columns=\"rname\"))\n", + " for rname in distances_narrowform[\"rname\"].drop_duplicates()\n", + " if rname in Q_CHROMS\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "KMERSCANNER_PKL = f\"{DATA_DIR}/PacBio/kmerscanner-q_arm.pkl\"\n", + "KMERSCANNER_DAT = f\"{DATA_DIR}/PacBio/kmerscanner-q_arm.dat.gz\"\n", + "\n", + "if path.isfile(KMERSCANNER_PKL):\n", + " with open(KMERSCANNER_PKL, mode=\"rb\") as pkl:\n", + " DENSITIES = load(pkl)\n", + "else:\n", + " DENSITIES = load_kmerscan(KMERSCANNER_DAT, True, SAMFILTERS, 10)\n", + " with open(KMERSCANNER_PKL, mode=\"wb\") as pkl:\n", + " dump(DENSITIES, pkl)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class GridFig():\n", + "\n", + " def __init__(self, width_ratios, height_ratios, scale=1):\n", + " self.figure, _ = subplots(\n", + " figsize=(sum(width_ratios)*scale, sum(height_ratios)*scale),\n", + " ncols=0, nrows=0,\n", + " )\n", + " self.gs = GridSpec(\n", + " ncols=len(width_ratios), wspace=0, width_ratios=width_ratios, \n", + " nrows=len(height_ratios), hspace=0, height_ratios=height_ratios,\n", + " figure=self.figure,\n", + " )\n", + " \n", + " def subplot(self, gridspec_slice, aspect=\"auto\", frame=False):\n", + " ax = self.figure.add_subplot(gridspec_slice, aspect=aspect)\n", + " if frame is False:\n", + " ax.set(frame_on=False)\n", + " else:\n", + " for spine in {\"top\", \"right\", \"bottom\", \"left\"} - set(frame):\n", + " ax.spines[spine].set_visible(False)\n", + " return ax" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_dendrogram(Z, gf):\n", + " ax = gf.subplot(gf.gs[0,0])\n", + " with rc_context({\"lines.linewidth\": .5}):\n", + " dendrogram(\n", + " Z, orientation=\"left\",\n", + " link_color_func=lambda x: \"black\", ax=ax,\n", + " )\n", + " ax.set(\n", + " xticks=[], xlabel=None,\n", + " yticks=[], ylabel=None,\n", + " ylim=ax.get_ylim()[::-1],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_heatmap(data2d, gf, cmap=\"gray_r\", vmax=.15):\n", + " ax = gf.subplot(gf.gs[0,1])\n", + " sns.heatmap(data2d, cmap=cmap, cbar=False, vmin=0, vmax=vmax, ax=ax)\n", + " ax.set(xticks=[], yticks=[], xlabel=None, ylabel=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def cluster(lds, metric=\"correlation\", method=\"ward\"):\n", + " Z = linkage(squareform(lds), metric=metric, method=method, optimal_ordering=False)\n", + " 
leaves = dendrogram(Z, no_plot=True)[\"leaves\"]\n", + " data2d = lds.iloc[leaves, leaves].copy()\n", + " dispatcher = pd.DataFrame(index=data2d.index)\n", + " dispatcher.index.name = \"read\"\n", + " to_subject = dispatcher.index.map(lambda s: s.split(\":\")[1])\n", + " for subject in sorted(to_subject.drop_duplicates()):\n", + " dispatcher[subject] = (to_subject==subject)\n", + " return Z, data2d, dispatcher" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_plottable_density_section(densities, chrom, motif, data2d, ecx):\n", + " chrom_densities = densities[chrom]\n", + " if motif is None:\n", + " by_motif = chrom_densities[chrom_densities[\"motif\"]==\"TTAGGG\"]\n", + " else:\n", + " by_motif = chrom_densities[chrom_densities[\"motif\"]==motif]\n", + " section = by_motif.set_index(\"name\").reindex(data2d.index).iloc[:,8:].copy()\n", + " if motif is None:\n", + " section = (~section.isnull()).astype(int) / 3\n", + " section.columns = section.columns.astype(int)\n", + " anchor = ecx.loc[\n", + " (ecx[\"rname\"]==chrom) & (ecx[\"flag\"]==0x4000) & (ecx[\"prime\"]==3),\n", + " \"pos\",\n", + " ].iloc[0]\n", + " return section[[c for c in section.columns if c>=anchor]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_absentees(lds, densities, chrom, ecx):\n", + " raw_section = get_plottable_density_section(densities, chrom, \"TTAGGG\", lds, ecx)\n", + " nulls = raw_section.isnull().all(axis=1)\n", + " return nulls[nulls].index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def section_to_RGB(ps, color, alpha_factor=1.2):\n", + " return np.transpose(\n", + " np.array([\n", + " np.full_like(ps, color[0]),\n", + " np.full_like(ps, color[1]),\n", + " np.full_like(ps, color[2]),\n", + " np.clip(ps*alpha_factor, a_min=None, a_max=1),\n", + " ]),\n", + " 
axes=(1, 2, 0),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def draw_fancy_arrow(\n", + " y, start, end, ax, lw=.25,\n", + " csty=\"angle3,angleA=45,angleB=-45\",\n", + " asty=\"Simple, tail_width=.25, head_width=2, head_length=3\"\n", + "):\n", + " ax.add_patch(FancyArrowPatch(\n", + " (start, y), (end, y),\n", + " connectionstyle=csty,\n", + " arrowstyle=asty,\n", + " lw=lw, color=\"#888\", clip_on=False,\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "POPULATION_COLORS = {\n", + " \"HG001\": \"black\",\n", + " \"HG002\": \"green\", \"HG003\": \"green\", \"HG004\": \"green\",\n", + " \"HG005\": \"steelblue\", \"HG006\": \"steelblue\", \"HG007\": \"steelblue\",\n", + "}\n", + "\n", + "def plot_subjects(dispatcher, gf, s=10):\n", + " for i, subject in enumerate(sorted(SUBJECT_TO_TRIO)):\n", + " sax = gf.subplot(gf.gs[0,i+3])\n", + " sax.plot([0, 0], [0, len(dispatcher)], lw=.5, color=\"#888\")\n", + " if subject in dispatcher:\n", + " truthiness = dispatcher[subject].reset_index(drop=True)\n", + " positions = truthiness[truthiness].index\n", + " for x in [-.1, 0, .1]:\n", + " sax.scatter(x=[x]*len(positions), y=positions, marker=\"_\", s=s, color=POPULATION_COLORS[subject])\n", + " sax.set(\n", + " xticks=[0], xticklabels=[subject+\" \"],\n", + " yticks=[], xlabel=None, ylabel=None,\n", + " xlim=(-.5, .5),\n", + " ylim=(len(dispatcher), -1),\n", + " )\n", + " for tick in sax.get_xticklabels():\n", + " tick.set_rotation(90)\n", + " if subject in {\"HG002\", \"HG005\"}:\n", + " draw_fancy_arrow(len(dispatcher), 1, 0, sax)\n", + " draw_fancy_arrow(len(dispatcher), 2, 0, sax)\n", + " sax.tick_params(axis=\"both\", which=\"both\", length=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "IMSHOW_PALETTE = {\n", + " None: [.7, .7, .7],\n", + " 
\"TTAGGG\": [.0, .4, .1],\n", + " \"TGAGGG\": [1, 1, 0],\n", + " \"TTAGGGG\": [.5, .9, 1],\n", + "}\n", + "\n", + "def plot_densities(densities, chrom, data2d, ecx, gf, extent, bin_size=100):\n", + " ax = gf.subplot(gf.gs[0,-1])\n", + " for motif, color in IMSHOW_PALETTE.items():\n", + " ps = get_plottable_density_section(densities, chrom, motif, data2d, ecx).values\n", + " breakat = MAXLEN // 100\n", + " orig_len = ps.shape[1]\n", + " if ps.shape[1] < MAXLEN:\n", + " ps = np.pad(ps, ((0, 0), (0, MAXLEN-ps.shape[1])))\n", + " elif ps.shape[1] > MAXLEN:\n", + " ps = ps[:,:MAXLEN]\n", + " pa = section_to_RGB(np.clip(uniform_filter1d(ps, 5, 1), a_min=0.0, a_max=1.0), color, 1.5)\n", + " ax.imshow(pa, extent=extent, interpolation=\"nearest\")\n", + " ticklabels=np.linspace(0, MAXLEN//100, MAXLEN//100+1).astype(int).astype(str)\n", + " fullaxislen = len(ticklabels)\n", + " ticklabels = ticklabels[:breakat+1]\n", + " xmin, xmax = extent[:2]\n", + " ax.set(\n", + " xticks=np.linspace(xmin, xmax, MAXLEN//100+1)[:breakat+1],\n", + " xticklabels=ticklabels,\n", + " xlabel=\"Kbp of telomeric tract\",\n", + " yticks=[], ylabel=None,\n", + " )\n", + " ax.tick_params(axis=\"both\", which=\"both\", length=0)\n", + " ax.tick_params(axis=\"x\", which=\"both\", length=3)\n", + " ax.axhline(0, 0, (breakat+1)/fullaxislen, lw=1, c=\"black\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@lru_cache(maxsize=None)\n", + "def convname(cn):\n", + " match = re.search(r'^\\d+', cn)\n", + " if match:\n", + " return match.group() + \"q\"\n", + " else:\n", + " return cn.split(\"chr\")[1] + \"q\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def process_lds(raw_global_lds, chrom, densities, ecx, no_plot=False, scale=.2):\n", + " lds = raw_global_lds[chrom].copy()\n", + " absentees = get_absentees(lds, densities, chrom, ecx)\n", + " lds.drop(index=absentees, 
columns=absentees, inplace=True)\n", + " Z, data2d, dispatcher = cluster(lds, metric=\"euclidean\", method=\"ward\")\n", + " if no_plot:\n", + " gf = None\n", + " else:\n", + " h = 6*len(lds)/50\n", + " w = 30\n", + " gf = GridFig([h/3,h,.3]+[.85]*7+[w], [h], scale=scale)\n", + " plot_dendrogram(Z, gf=gf)\n", + " plot_heatmap(data2d, gf=gf)\n", + " plot_subjects(dispatcher, gf=gf, s=7)\n", + " plot_densities(densities, chrom, data2d, ecx, gf=gf, extent=[0,w,0,h])\n", + " if len(chrom) > 11:\n", + " name = \"{} ({})\".format(convname(chrom), chrom[:6]+\"…\")\n", + " else:\n", + " name = \"{} ({})\".format(convname(chrom), chrom)\n", + " gf.figure.get_axes()[0].set_ylabel(name, fontsize=13)\n", + " return lds, Z, data2d, dispatcher, gf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def cophenetic_correlation(lds, Z):\n", + " r, p = pearsonr(squareform(lds), cophenet(Z))\n", + " return r, max(p, 5e-324) # p-value of zero is just a rounding issue" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def fixup_labels(gf, chrom):\n", + " if chrom == \"chr7\":\n", + " gf.figure.get_axes()[1].set_title(\"Pairwise relative\\nLevenshtein distances\", fontsize=13, loc=\"right\")\n", + " gf.figure.get_axes()[5].set_title(\"Subjects\", fontsize=13)\n", + " gf.figure.get_axes()[-1].set_title(\"Motif densities\", loc=\"left\", fontsize=13)\n", + " if chrom != \"18qtel_1-500K_1_12_12_rc\":\n", + " for ax in gf.figure.get_axes()[:-1]:\n", + " ax.set(xticklabels=[], xlabel=None)\n", + " gf.figure.get_axes()[-1].set(xlabel=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "read_to_category = lambda dispatcher, pos: pd.Series(\n", + " index=dispatcher.index,\n", + " data=dispatcher.index.map(lambda s: s.split(\":\")[pos])\n", + ")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def count_subtrees(dispatcher):\n", + " read_to_trio = read_to_category(dispatcher, 0)\n", + " running_trio = None\n", + " trio_runs = defaultdict(int)\n", + " run = 0\n", + " for trio in read_to_trio:\n", + " if trio != running_trio:\n", + " if run:\n", + " trio_runs[running_trio] += 1\n", + " running_trio, run = trio, 1\n", + " else:\n", + " run += 1\n", + " if run:\n", + " trio_runs[running_trio] += 1\n", + " return trio_runs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reindex_to = lambda row, rtc, cat: row.reindex(rtc[rtc==cat].index).dropna()\n", + "reindex_in = lambda row, rtc: row.reindex(rtc[rtc==rtc[row.name]].index).dropna()\n", + "reindex_out = lambda row, rtc: row.reindex(rtc[rtc!=rtc[row.name]].index).dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_closest_distances(lds, dispatcher):\n", + " read_to_trio = read_to_category(dispatcher, 0)\n", + " read_to_subject = read_to_category(dispatcher, 1)\n", + " return lds.apply(\n", + " lambda row: pd.Series({\n", + " \"subject\": reindex_in(row, read_to_subject).drop(index=row.name).min(),\n", + " \"trio\": reindex_in(reindex_out(row, read_to_subject), read_to_trio).min(),\n", + " \"outgroup\": reindex_out(row, read_to_trio).min(),\n", + " }),\n", + " axis=1,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def subject_to_subject_lds(lds, dispatcher, a, b):\n", + " read_to_subject = read_to_category(dispatcher, 1)\n", + " return lds.loc[read_to_subject[read_to_subject==a].index, read_to_subject[read_to_subject==b].index]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_closest_family_distances(lds, dispatcher, **kwargs):\n", + " 
target, test, control = list(kwargs)\n", + " return pd.DataFrame({\n", + " f\"{test} to {target}\": subject_to_subject_lds(lds, dispatcher, kwargs[test], kwargs[target]).min(axis=1),\n", + " f\"{test} to {control}\": subject_to_subject_lds(lds, dispatcher, kwargs[test], kwargs[control]).min(axis=1),\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def process_distances(lds, dispatcher):\n", + " closest_distances = get_closest_distances(lds, dispatcher)\n", + " ashkenazim_from_father = get_closest_family_distances(lds, dispatcher, son=\"HG002\", father=\"HG003\", mother=\"HG004\")\n", + " ashkenazim_from_mother = get_closest_family_distances(lds, dispatcher, son=\"HG002\", mother=\"HG004\", father=\"HG003\")\n", + " chinese_from_father = get_closest_family_distances(lds, dispatcher, son=\"HG005\", father=\"HG006\", mother=\"HG007\")\n", + " chinese_from_mother = get_closest_family_distances(lds, dispatcher, son=\"HG005\", mother=\"HG007\", father=\"HG006\")\n", + " return (\n", + " closest_distances,\n", + " ashkenazim_from_father, ashkenazim_from_mother,\n", + " chinese_from_father, chinese_from_mother,\n", + " *wilcoxon_dropna(closest_distances, \"subject\", \"trio\"),\n", + " *wilcoxon_dropna(closest_distances, \"subject\", \"outgroup\"),\n", + " *wilcoxon_dropna(closest_distances, \"trio\", \"outgroup\"),\n", + " *wilcoxon_dropna(ashkenazim_from_father, \"father to son\", \"father to mother\"),\n", + " *wilcoxon_dropna(ashkenazim_from_mother, \"mother to son\", \"mother to father\"),\n", + " *wilcoxon_dropna(chinese_from_father, \"father to son\", \"father to mother\"),\n", + " *wilcoxon_dropna(chinese_from_mother, \"mother to son\", \"mother to father\"),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "stats = pd.DataFrame(columns=[\n", + " \"NA12878\", \"AshkenazimTrio\", 
\"ChineseTrio\",\n", + " \"cr\", \"cp\",\n", + " \"s2t_p\", \"s2o_p\", \"t2o_p\", \"aff_p\", \"afm_p\", \"cff_p\", \"cfm_p\",\n", + "])\n", + "\n", + "cd_list, aff_list, afm_list, cff_list, cfm_list = [], [], [], [], []\n", + "NO_PLOT = False\n", + "\n", + "for chrom in tqdm(RAW_GLOBAL_LDS):\n", + " try:\n", + " lds, Z, data2d, dispatcher, gf = process_lds(RAW_GLOBAL_LDS, chrom, DENSITIES, ecx, no_plot=NO_PLOT, scale=.2)\n", + " except ValueError: # too few observations\n", + " continue\n", + " try:\n", + " cr, cp = cophenetic_correlation(lds, Z)\n", + " except ValueError: # too few observations\n", + " cr, cp = np.nan, np.nan\n", + " if not NO_PLOT:\n", + " fixup_labels(gf, chrom)\n", + " gf.figure.savefig(\n", + " f\"{DATA_DIR}/PacBio/haplotypes/clusters-q_arm/\"+chrom+\".pdf\", bbox_inches=\"tight\",\n", + " )\n", + " close(gf.figure)\n", + " cd, aff, afm, cff, cfm, _, _, s2t_p, _, _, s2o_p, _, _, t2o_p, _, _, aff_p, _, _, afm_p, _, _, cff_p, _, _, cfm_p = (\n", + " process_distances(lds, dispatcher)\n", + " )\n", + " cd_list.append(cd)\n", + " aff_list.append(aff)\n", + " afm_list.append(afm)\n", + " cff_list.append(cff)\n", + " cfm_list.append(cfm)\n", + " stats.loc[chrom] = [\n", + " np.nan, np.nan, np.nan,\n", + " cr, cp,\n", + " s2t_p, s2o_p, t2o_p, aff_p, afm_p, cff_p, cfm_p,\n", + " ]\n", + " for trio, subtree_count in count_subtrees(dispatcher).items():\n", + " stats.loc[chrom, trio] = subtree_count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "legend, axs = subplots(figsize=(6.7, 2.5), ncols=3, gridspec_kw=dict(width_ratios=(1, 4.5, 3.4), wspace=0))\n", + "\n", + "axs[0].imshow(np.vstack([np.linspace(0, 1, 256)]).T, cmap=\"Greys_r\", aspect=\"auto\")\n", + "axs[0].set(xticks=[], yticks=[0, 255])\n", + "axs[0].set_yticklabels([\"$\\geq{}0.15$\", \"0\"], fontsize=14)\n", + "axs[0].text(x=-.2, y=128, s=\"Relative\\ndistance\\n\", rotation=90, ha=\"right\", 
va=\"center\", fontsize=17)\n", + "\n", + "for x, subject in enumerate(sorted(SUBJECT_TO_TRIO)):\n", + " axs[1].plot([x, x], [0, 1], color=\"#888\", lw=1.5)\n", + " axs[1].scatter([x]*4, np.linspace(.1, .7, 4)+x/30, color=POPULATION_COLORS[subject], marker=\"_\", s=125)\n", + "\n", + "axs[1].set(xlim=(-4.5, 8.5), xticks=[], yticks=[])\n", + "twiny = axs[1].twiny()\n", + "twiny.set(xlim=(-4.5, 8.5), xticks=[])\n", + "for tick in twiny.get_xticklabels():\n", + " tick.set_rotation(80)\n", + "twiny.tick_params(axis=\"both\", which=\"both\", length=0)\n", + "for spine in \"top\", \"bottom\":\n", + " axs[1].spines[spine].set_visible(False)\n", + " twiny.spines[spine].set_visible(False)\n", + "axs[1].text(x=-2, y=.5, s=\"Assignment of\\nreads to subjects\", rotation=90, fontsize=16, ha=\"center\", va=\"center\")\n", + "axs[1].text(x=-1.5, y=1.1, s=\"/ populations\", rotation=90, ha=\"center\", va=\"bottom\", fontsize=16)\n", + "\n", + "csty1 = \"angle3,angleA=80,angleB=-60\"\n", + "csty2 = \"angle3,angleA=60,angleB=-70\"\n", + "asty = \"Simple, tail_width=.25, head_width=7, head_length=5\"\n", + "draw_fancy_arrow(-0.02, 2.1, 1, axs[1], lw=1, csty=csty1)\n", + "draw_fancy_arrow(-0.02, 3.1, 1, axs[1], lw=1, asty=asty, csty=csty2)\n", + "draw_fancy_arrow(-0.02, 5.1, 4, axs[1], lw=1, csty=csty1)\n", + "draw_fancy_arrow(-0.02, 6.1, 4, axs[1], lw=1, asty=asty, csty=csty2)\n", + "\n", + "axs[1].text(x=2.95, y=-.35, s=\"child{}parent\\nrelatedness\".format(chr(0x2190)), va=\"center\", ha=\"center\", fontsize=16)\n", + "line = Line2D((2.5, 4.1), (-.25, -.12), lw=1, ls=\"--\", color=\"#888\")\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "line = Line2D((2.5, 2.1), (-.25, -.16), lw=1, ls=\"--\", color=\"#888\")\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "\n", + "line = Line2D((-.2, .2), (1.1, 1.1), lw=4, color=POPULATION_COLORS[\"HG001\"])\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "axs[1].text(x=.4, y=1.1, s=\" 
Utah\", ha=\"center\", va=\"bottom\", rotation=50, fontsize=17, color=POPULATION_COLORS[\"HG001\"])\n", + "\n", + "line = Line2D((.8, 3.2), (1.1, 1.1), lw=4, color=POPULATION_COLORS[\"HG002\"])\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "axs[1].text(x=3.7, y=1.1, s=\" Ashkenazim\", ha=\"center\", va=\"bottom\", rotation=50, fontsize=17, color=POPULATION_COLORS[\"HG002\"])\n", + "\n", + "line = Line2D((3.8, 6.2), (1.1, 1.1), lw=4, color=POPULATION_COLORS[\"HG005\"])\n", + "line.set_clip_on(False)\n", + "axs[1].add_line(line)\n", + "axs[1].text(x=6, y=1.1, s=\" Chinese\", ha=\"center\", va=\"bottom\", rotation=50, fontsize=17, color=POPULATION_COLORS[\"HG005\"])\n", + "\n", + "axs[2].add_patch(Rectangle((0,3), 1.5, .65, facecolor=\"#119933\", edgecolor=\"black\"))\n", + "axs[2].text(x=1.75, y=3.25, s=\"TTAGGG\", fontsize=15, va=\"center\")\n", + "axs[2].add_patch(Rectangle((0,2), 1.5, .65, facecolor=\"#EEDD77\", edgecolor=\"black\"))\n", + "axs[2].text(x=1.75, y=2.25, s=\"TGAGGG\", fontsize=15, va=\"center\")\n", + "axs[2].add_patch(Rectangle((0,1), 1.5, .65, facecolor=\"#88DFEF\", edgecolor=\"black\"))\n", + "axs[2].text(x=1.75, y=1.25, s=\"TTAGGGG\", fontsize=15, va=\"center\")\n", + "axs[2].add_patch(Rectangle((0,0), 1.5, .65, facecolor=\"#DDDDDD\", edgecolor=\"black\"))\n", + "axs[2].text(x=1.75, y=0.25, s=\"background\", fontsize=13, va=\"center\")\n", + "axs[2].set(xticks=[], yticks=[], xlim=(-.4, 4.8), ylim=(-0.7, 4.4))\n", + "axs[2].set_title(\"Motif densities\", fontsize=17)\n", + "\n", + "legend.add_artist(Rectangle((.0, -.2), .94, 1.73, edgecolor=\"black\", facecolor=\"none\"))\n", + "\n", + "legend.savefig(f\"{DATA_DIR}/PacBio/haplotypes/clusters-q_arm/legend.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PRINT_NS = False\n", + "\n", + "if PRINT_NS:\n", + " format_pval = lambda p: \"ns\" if (p >= .05) else (\"<1.0e-300\" if (p < 
1e-300) else format(p, \".1e\"))\n", + "else:\n", + " format_pval = lambda p: format(p, \".2f\") if (p >= .05) else (\"<1.0e-300\" if (p < 1e-300) else format(p, \".1e\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "st = stats.reindex(natsorted_chromosomes(stats.index)).copy()\n", + "st.index.name = \"contig\"\n", + "tc = st.iloc[:,:3].values.flatten()\n", + "tc = tc[~np.isnan(tc)]\n", + "print(\"Max subtree count:\", stats.iloc[:,:3].max(axis=0).sort_values().iloc[[-1]].to_string())\n", + "print(\"Median subtree count:\", np.median(tc))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "st = stats.reindex(natsorted_chromosomes(stats.index)).copy()\n", + "st.index.name = \"contig\"\n", + "st = st.iloc[:,3:].reset_index()\n", + "st.insert(loc=0, column=\"chromosome\", value=st[\"contig\"].apply(convname))\n", + "\n", + "coph = st.iloc[:,:4].copy()\n", + "coph[\"cp\"] = multipletests(coph[\"cp\"], method=\"bonferroni\")[1]\n", + "coph[\"r\"] = coph[\"cr\"].apply(lambda r: format(r, \".2f\"))\n", + "coph[\"p\"] = coph[\"cp\"].apply(format_pval)\n", + "coph.drop(columns=[\"cr\", \"cp\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"for_wilcoxon-q_arm.pkl\", mode=\"wb\") as pkl:\n", + " Q = Namespace(cd_list=cd_list, aff_list=aff_list, afm_list=afm_list, cff_list=cff_list, cfm_list=cfm_list)\n", + " dump(Q, pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cd_all = pd.concat(cd_list)\n", + "print(len(cd_all[cd_all[\"subject\"]>cd_all[\"outgroup\"]]), len(cd_all[cd_all[\"subject\"]>cd_all[\"outgroup\"]])/3729)\n", + "inter_reads = cd_all[cd_all[\"subject\"]>=cd_all[\"outgroup\"]*2].index\n", + "inter_dispatcher = pd.DataFrame(index=inter_reads, data={\"subject\": inter_reads.map(lambda 
s: s.split(\":\")[1])})\n", + "print(len(inter_dispatcher), len(inter_dispatcher)/3729)\n", + "inter_dispatcher[\"subject\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inter_dispatcher[\"chromosome\"] = np.nan\n", + "inter_dispatcher[\"rname\"] = np.nan\n", + "\n", + "for chrom, lds in RAW_GLOBAL_LDS.items():\n", + " for name in lds.index:\n", + " if name in inter_dispatcher.index:\n", + " inter_dispatcher.loc[name, \"rname\"] = chrom\n", + "inter_dispatcher[\"chromosome\"] = inter_dispatcher[\"rname\"].apply(convname)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inter_counts = inter_dispatcher.groupby([\"subject\", \"chromosome\"], as_index=False).count().pivot(\n", + " index=\"subject\", columns=\"chromosome\", values=\"rname\",\n", + ")\n", + "inter_counts = inter_counts[natsorted_chromosomes(inter_counts.columns)].applymap(lambda x: \"\" if np.isnan(x) else str(int(x)))\n", + "inter_counts" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Figure-S1-shortread-support.ipynb b/assets/paper/jupyter/Figure-S1-shortread-support.ipynb new file mode 100644 index 0000000..21eeffe --- /dev/null +++ b/assets/paper/jupyter/Figure-S1-shortread-support.ipynb @@ -0,0 +1,429 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot 
import subplots, style, rc\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools import lru_cache" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "from pysam import AlignmentFile\n", + "from edgecaselib.formats import load_index\n", + "from edgecaselib.util import natsorted_chromosomes\n", + "from pickle import dump, load\n", + "from os import path\n", + "\n", + "DATA_DIR = \"../../../data/datasets/2021\"\n", + "ecx = load_index(\"../../hg38ext.fa.ecx\")\n", + "\n", + "Q_CHROMS = [\"chr7\", \"chr8\", \"chr11\", \"chr12\", \"14qtel_1-500K_1_12_12_rc\", \"chr15\", \"18qtel_1-500K_1_12_12_rc\"]\n", + "P_CHROMS = [\"chr2\", \"3ptel_1-500K_1_12_12\", \"4ptel_1-500K_1_12_12\", \"chr5\", \"chr9\", \"chr12\", \"17ptel_1_500K_1_12_12\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def load_support(filename, is_q):\n", + " support = defaultdict(set)\n", + " with AlignmentFile(filename) as bam:\n", + " for entry in bam:\n", + " if entry.reference_name.endswith(\"/q\") == is_q:\n", + " support[entry.reference_name[:-2]] |= {\n", + " rp for qp, rp in entry.get_aligned_pairs()\n", + " if (qp is not None) and (rp is not None)\n", + " }\n", + " return support" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def load_chopper(filename, is_q):\n", + " chopper = {}\n", + " with AlignmentFile(filename) as bam:\n", + " for entry in bam:\n", + " if entry.flag & 0x4000 == 0x4000:\n", + " if (entry.flag & 0x8000 == 0x8000) == is_q:\n", + " chopper[entry.qname] = (entry.reference_name, entry.reference_start, len(entry.seq))\n", + " return 
chopper" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "def load_puller(filename, is_q, support):\n", + " puller = {}\n", + " with AlignmentFile(filename) as bam:\n", + " for entry in bam:\n", + " if entry.qname in support:\n", + " if (entry.flag & 0x4000 == 0x4000):# and (entry.flag & 3844 == 0):\n", + " if (entry.flag & 0x8000 == 0x8000) == is_q:\n", + " if is_q:\n", + " puller[entry.qname] = (\n", + " entry.reference_name, entry.reference_start, len(entry.seq),\n", + " )\n", + " else:\n", + " puller[entry.qname] = (\n", + " entry.reference_name,\n", + " entry.reference_start-entry.query_alignment_start,\n", + " len(entry.seq),\n", + " )\n", + " return puller" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "SUBJECT_TO_TRIO = {\n", + " \"HG001\": \"NA12878\",\n", + " \"HG002\": \"AshkenazimTrio\", \"HG003\": \"AshkenazimTrio\", \"HG004\": \"AshkenazimTrio\",\n", + " \"HG005\": \"ChineseTrio\", \"HG006\": \"ChineseTrio\", \"HG007\": \"ChineseTrio\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [17:42<00:00, 151.78s/it]\n" + ] + } + ], + "source": [ + "Q_SUPPORT_PKL = f\"{DATA_DIR}/PacBio/q_support.pkl\"\n", + "Q_CHOPPER_PKL = f\"{DATA_DIR}/PacBio/q_chopper.pkl\"\n", + "Q_PULLER_PKL = f\"{DATA_DIR}/PacBio/q_puller.pkl\"\n", + "\n", + "if path.isfile(Q_SUPPORT_PKL) and path.isfile(Q_CHOPPER_PKL) and path.isfile(Q_PULLER_PKL):\n", + " with open(Q_SUPPORT_PKL, mode=\"rb\") as pkl:\n", + " q_support = load(pkl)\n", + " with open(Q_CHOPPER_PKL, mode=\"rb\") as pkl:\n", + " q_chopper = load(pkl)\n", + " with open(Q_PULLER_PKL, mode=\"rb\") as pkl:\n", + " q_puller = load(pkl)\n", + "else:\n", + " q_support, q_chopper, q_puller = {}, {}, {}\n", + " for subject, trio in tqdm(SUBJECT_TO_TRIO.items(), 
total=len(SUBJECT_TO_TRIO)):\n", + " support = load_support(f\"{DATA_DIR}/PacBio/{trio}/{subject}/telbam2tailchopper.bam\", is_q=True)\n", + " chopper = load_chopper(f\"{DATA_DIR}/PacBio/{trio}/{subject}/tailchopper.bam\", is_q=True)\n", + " puller = load_puller(f\"{DATA_DIR}/PacBio/{trio}/{subject}/tailpuller.bam\", is_q=True, support=support)\n", + " q_support.update(support)\n", + " q_chopper.update(chopper)\n", + " q_puller.update(puller)\n", + " with open(Q_SUPPORT_PKL, mode=\"wb\") as pkl:\n", + " dump(q_support, pkl)\n", + " with open(Q_CHOPPER_PKL, mode=\"wb\") as pkl:\n", + " dump(q_chopper, pkl)\n", + " with open(Q_PULLER_PKL, mode=\"wb\") as pkl:\n", + " dump(q_puller, pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [16:56<00:00, 145.24s/it]\n" + ] + } + ], + "source": [ + "P_SUPPORT_PKL = f\"{DATA_DIR}/PacBio/p_support.pkl\"\n", + "P_CHOPPER_PKL = f\"{DATA_DIR}/PacBio/p_chopper.pkl\"\n", + "P_PULLER_PKL = f\"{DATA_DIR}/PacBio/p_puller.pkl\"\n", + "\n", + "if path.isfile(P_SUPPORT_PKL) and path.isfile(P_CHOPPER_PKL) and path.isfile(P_PULLER_PKL):\n", + " with open(P_SUPPORT_PKL, mode=\"rb\") as pkl:\n", + " p_support = load(pkl)\n", + " with open(P_CHOPPER_PKL, mode=\"rb\") as pkl:\n", + " p_chopper = load(pkl)\n", + " with open(P_PULLER_PKL, mode=\"rb\") as pkl:\n", + " p_puller = load(pkl)\n", + "else:\n", + " p_support, p_chopper, p_puller = {}, {}, {}\n", + " for subject, trio in tqdm(SUBJECT_TO_TRIO.items(), total=len(SUBJECT_TO_TRIO)):\n", + " support = load_support(f\"{DATA_DIR}/PacBio/{trio}/{subject}/telbam2tailchopper.bam\", is_q=False)\n", + " chopper = load_chopper(f\"{DATA_DIR}/PacBio/{trio}/{subject}/tailchopper.bam\", is_q=False)\n", + " puller = load_puller(f\"{DATA_DIR}/PacBio/{trio}/{subject}/tailpuller.bam\", is_q=False, support=support)\n", + " p_support.update(support)\n", + " 
p_chopper.update(chopper)\n", + " p_puller.update(puller)\n", + " with open(P_SUPPORT_PKL, mode=\"wb\") as pkl:\n", + " dump(p_support, pkl)\n", + " with open(P_CHOPPER_PKL, mode=\"wb\") as pkl:\n", + " dump(p_chopper, pkl)\n", + " with open(P_PULLER_PKL, mode=\"wb\") as pkl:\n", + " dump(p_puller, pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def get_counts(chopper, min_count=50):\n", + " counts = defaultdict(int)\n", + " for _, (chrom, *_) in chopper.items():\n", + " counts[chrom] += 1\n", + " return {chrom: max(counts[chrom], 50) for chrom in natsorted_chromosomes(counts)}" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "def scatter_one_q_read(ecx, chrom, prime, positions, start, length, ax, i):\n", + " tract_anchor = ecx.loc[(ecx[\"rname\"]==chrom) & (ecx[\"flag\"]==0x4000) & (ecx[\"prime\"]==prime), \"pos\"].iloc[0]\n", + " sorted_positions = [p for p in sorted(positions)]\n", + " non_covered_positions = [\n", + " p for p in range(max(0, start-tract_anchor), start-tract_anchor+length)\n", + " if p not in sorted_positions\n", + " ]\n", + " ax.scatter(non_covered_positions, [i]*len(non_covered_positions), color=\"darkblue\", marker=\".\", s=.5)\n", + " ax.scatter(sorted_positions, [i]*len(sorted_positions), color=\"lightgreen\", marker=\".\", s=.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "def scatter_one_p_read(ecx, chrom, prime, positions, start, length, ax, i):\n", + " tract_anchor = ecx.loc[(ecx[\"rname\"]==chrom) & (ecx[\"flag\"]==0x4000) & (ecx[\"prime\"]==prime), \"pos\"].iloc[0]\n", + " sorted_positions = np.array(sorted(positions))\n", + " shifted_positions = sorted_positions + start - tract_anchor\n", + " set_shifted_positions = set(shifted_positions)\n", + " shifted_non_covered_positions = np.array([\n", + " p for p in range(start - 
tract_anchor, min(0, start + length - tract_anchor))\n", + " if p not in set_shifted_positions\n", + " ])\n", + " ax.scatter(shifted_non_covered_positions, [i]*len(shifted_non_covered_positions), color=\"darkblue\", marker=\".\", s=.5)\n", + " ax.scatter(shifted_positions, [i]*len(shifted_positions), color=\"lightgreen\", marker=\".\", s=.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "def plot_support(support, puller, chopper, ecx, is_q, chroms, min_count=50, factor=.005):\n", + " counts = {c: i for c, i in get_counts(chopper, min_count=min_count).items() if c in chroms}\n", + " figure, axs = subplots(\n", + " figsize=(7*.75, .75*(sum(counts.values())+len(counts)*20)*factor),\n", + " ncols=1, nrows=len(counts),\n", + " sharex=True,\n", + " gridspec_kw=dict(\n", + " wspace=0, hspace=.4,\n", + " height_ratios=counts.values(),\n", + " ),\n", + " )\n", + " chrom2ax = dict(zip(counts, axs))\n", + " ax2i = defaultdict(int)\n", + " prime = 3 if is_q else 5\n", + " for name in tqdm(set(support) & set(puller) & set(chopper)):\n", + " positions = support[name]\n", + " chrom, start, length = puller[name]\n", + " if chrom not in chroms:\n", + " continue\n", + " ax = chrom2ax[chrom]\n", + " i = ax2i[ax]\n", + " if is_q:\n", + " scatter_one_q_read(ecx, chrom, prime, positions, start, length, ax, i)\n", + " else:\n", + " scatter_one_p_read(ecx, chrom, prime, positions, start, length, ax, i)\n", + " ax2i[ax] += 1\n", + " for chrom, ax in chrom2ax.items():\n", + " if chrom not in chroms:\n", + " continue\n", + " if is_q:\n", + " ax.text(x=0, y=0, s=chrom+\" \", ha=\"right\", va=\"bottom\")\n", + " ax.set(\n", + " yticks=[], ylabel=None,\n", + " xlim=(0, ax.get_xlim()[1]), ylim=(0, max(ax2i[ax], min_count)+20),\n", + " )\n", + " ax.spines[\"right\"].set_visible(False)\n", + " else:\n", + " ax.text(x=0, y=0, s=\" \"+chrom, ha=\"left\", va=\"bottom\")\n", + " ax.set(\n", + " yticks=[], 
ylabel=None,\n", + " xlim=(ax.get_xlim()[0], 0), ylim=(0, max(ax2i[ax], min_count)+20),\n", + " )\n", + " ax.spines[\"left\"].set_visible(False)\n", + " ax.spines[\"top\"].set_visible(False)\n", + " figure.get_axes()[0].spines[\"top\"].set_visible(False)\n", + " for ax in figure.get_axes()[:-1]:\n", + " ax.tick_params(axis=\"x\", colors=\"white\")\n", + " ticks = figure.get_axes()[-1].get_xticks()\n", + " figure.get_axes()[-1].set(\n", + " xticklabels=(ticks / 1000).astype(int).astype(str),\n", + " xlabel=\"Kbp of telomeric tract\",\n", + " )\n", + " return figure" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 637/637 [00:15<00:00, 42.20it/s]\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "p_figure = plot_support(p_support, p_puller, p_chopper, ecx, chroms=P_CHROMS, is_q=False)\n", + "p_figure.savefig(\"Figure SXA, support.png\", dpi=300, bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 541/541 [01:42<00:00, 5.28it/s]\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "q_figure = plot_support(q_support, q_puller, q_chopper, ecx, chroms=Q_CHROMS, is_q=True, factor=.01)\n", + "q_figure.savefig(\"Figure SXB, support.png\", dpi=300, bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "p arm: 0.89\n", + "q arm: 0.95\n" + ] + } + ], + "source": [ + "def get_numerical_support(support, chopper):\n", + " numerical_support = pd.DataFrame(\n", + " data=[\n", + " [len(positions), chopper[name][2], len(positions)/chopper[name][2]]\n", + " for name, positions in support.items()\n", + " ],\n", + " columns=[\"supported\", \"length\", \"fraction\"],\n", + " )\n", + " return numerical_support\n", + "\n", + "print(\"p arm: {:.2f}\".format(get_numerical_support(p_support, p_chopper)[\"fraction\"].median()))\n", + "print(\"q arm: {:.2f}\".format(get_numerical_support(q_support, q_chopper)[\"fraction\"].median()))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Figure-S4-haplotypes-constrained-p_arm.ipynb b/assets/paper/jupyter/Figure-S4-haplotypes-constrained-p_arm.ipynb new file mode 100644 index 0000000..10ac80f --- /dev/null +++ b/assets/paper/jupyter/Figure-S4-haplotypes-constrained-p_arm.ipynb @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import 
seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc, rc_context, close\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools import lru_cache\n", + "from argparse import Namespace" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.spatial.distance import squareform\n", + "from scipy.cluster.hierarchy import dendrogram, fcluster, linkage\n", + "from sklearn.metrics import silhouette_score\n", + "from matplotlib.gridspec import GridSpec\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.cluster.hierarchy import cophenet\n", + "from scipy.stats import pearsonr, wilcoxon" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from edgecaselib.formats import load_index, load_kmerscan\n", + "from edgecaselib.densityplot import interpret_arguments\n", + "from edgecaselib.util import natsorted_chromosomes\n", + "from pickle import dump, load\n", + "from os import path\n", + "from tempfile import NamedTemporaryFile\n", + "from subprocess import check_output, CalledProcessError\n", + "from pysam import AlignmentFile\n", + "from scipy.stats import chi2_contingency\n", + "from statsmodels.stats.multitest import multipletests\n", + "from scipy.ndimage import uniform_filter1d\n", + "from matplotlib.patches import FancyArrowPatch, Rectangle\n", + "from matplotlib.lines import Line2D" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "SAMFILTERS = [[\"tract_anchor\"], [\"is_q\"], 0]\n", + "ecx = load_index(\"../../hg38ext.fa.ecx\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": 
[ + "SUBJECT_TO_TRIO = {\n", + " \"HG001\": \"NA12878\",\n", + " \"HG002\": \"AshkenazimTrio\", \"HG003\": \"AshkenazimTrio\", \"HG004\": \"AshkenazimTrio\",\n", + " \"HG005\": \"ChineseTrio\", \"HG006\": \"ChineseTrio\", \"HG007\": \"ChineseTrio\",\n", + "}\n", + "P_CHROMS = [\"chr2\", \"3ptel_1-500K_1_12_12\", \"4ptel_1-500K_1_12_12\", \"chr5\", \"chr9\", \"chr12\", \"17ptel_1_500K_1_12_12\"]\n", + "MAXLEN = 1450\n", + "DATA_DIR = \"../../../data/datasets/2021\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def squarify(narrowform):\n", + " triu_fillna = narrowform.pivot(index=\"qname1\", columns=\"qname2\", values=\"relative_ld\").fillna(0)\n", + " return triu_fillna.T + triu_fillna\n", + "\n", + "distances_narrowform = pd.read_csv(f\"{DATA_DIR}/PacBio/haplotypes/levenshtein-p_arm.tsv\", sep=\"\\t\", escapechar=\"#\")\n", + "\n", + "RAW_GLOBAL_LDS = {\n", + " rname: squarify(distances_narrowform[distances_narrowform[\"rname\"]==rname].drop(columns=\"rname\"))\n", + " for rname in distances_narrowform[\"rname\"].drop_duplicates()\n", + " if rname in P_CHROMS\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "KMERSCANNER_PKL = f\"{DATA_DIR}/PacBio/kmerscanner-p_arm.pkl\"\n", + "KMERSCANNER_DAT = f\"{DATA_DIR}/PacBio/kmerscanner-p_arm.dat.gz\"\n", + "\n", + "if path.isfile(KMERSCANNER_PKL):\n", + " with open(KMERSCANNER_PKL, mode=\"rb\") as pkl:\n", + " DENSITIES = load(pkl)\n", + "else:\n", + " DENSITIES = load_kmerscan(KMERSCANNER_DAT, True, SAMFILTERS, 10)\n", + " with open(KMERSCANNER_PKL, mode=\"wb\") as pkl:\n", + " dump(DENSITIES, pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "class GridFig():\n", + "\n", + " def __init__(self, width_ratios, height_ratios, scale=1):\n", + " self.figure, _ = subplots(\n", + " figsize=(sum(width_ratios)*scale, 
sum(height_ratios)*scale),\n", + " ncols=0, nrows=0,\n", + " )\n", + " self.gs = GridSpec(\n", + " ncols=len(width_ratios), wspace=0, width_ratios=width_ratios, \n", + " nrows=len(height_ratios), hspace=0, height_ratios=height_ratios,\n", + " figure=self.figure,\n", + " )\n", + " \n", + " def subplot(self, gridspec_slice, aspect=\"auto\", frame=False):\n", + " ax = self.figure.add_subplot(gridspec_slice, aspect=aspect)\n", + " if frame is False:\n", + " ax.set(frame_on=False)\n", + " else:\n", + " for spine in {\"top\", \"right\", \"bottom\", \"left\"} - set(frame):\n", + " ax.spines[spine].set_visible(False)\n", + " return ax" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_dendrogram(Z, gf):\n", + " ax = gf.subplot(gf.gs[0,10])\n", + " with rc_context({\"lines.linewidth\": .5}):\n", + " dendrogram(\n", + " Z, orientation=\"left\",\n", + " link_color_func=lambda x: \"black\", ax=ax,\n", + " )\n", + " ax.set(\n", + " xticks=[], xlabel=None,\n", + " yticks=[], ylabel=None,\n", + " xlim=ax.get_xlim()[::-1],\n", + " ylim=ax.get_ylim()[::-1],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib.colors import PowerNorm\n", + "\n", + "def plot_heatmap(data2d, gf, cmap=\"gray_r\", vmax=.15):\n", + " ax = gf.subplot(gf.gs[0,9])\n", + " sns.heatmap(data2d, cmap=cmap, cbar=False, vmin=0, vmax=vmax, ax=ax, norm=PowerNorm(gamma=.5))\n", + " ax.set(xticks=[], yticks=[], xlabel=None, ylabel=None, xlim=ax.get_xlim()[::-1])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def cluster(lds, metric=\"correlation\", method=\"ward\"):\n", + " Z = linkage(squareform(lds), metric=metric, method=method, optimal_ordering=False)\n", + " leaves = dendrogram(Z, no_plot=True)[\"leaves\"]\n", + " data2d = lds.iloc[leaves, leaves].copy()\n", + " dispatcher = 
pd.DataFrame(index=data2d.index)\n", + " dispatcher.index.name = \"read\"\n", + " to_subject = dispatcher.index.map(lambda s: s.split(\":\")[1])\n", + " for subject in sorted(to_subject.drop_duplicates()):\n", + " dispatcher[subject] = (to_subject==subject)\n", + " return Z, data2d, dispatcher" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def get_plottable_density_section(densities, chrom, motif, data2d, ecx):\n", + " chrom_densities = densities[chrom]\n", + " if motif is None:\n", + " by_motif = chrom_densities[chrom_densities[\"motif\"]==\"CCCTAA\"]\n", + " else:\n", + " by_motif = chrom_densities[chrom_densities[\"motif\"]==motif]\n", + " section = by_motif.set_index(\"name\").reindex(data2d.index).iloc[:,8:].copy()\n", + " if motif is None:\n", + " section = (~section.isnull()).astype(int) / 3\n", + " section.columns = section.columns.astype(int)\n", + " anchor = ecx.loc[\n", + " (ecx[\"rname\"]==chrom) & (ecx[\"flag\"]==0x4000) & (ecx[\"prime\"]==5),\n", + " \"pos\",\n", + " ].iloc[0]\n", + " return section[[c for c in section.columns if c<=anchor]]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def get_absentees(lds, densities, chrom, ecx):\n", + " raw_section = get_plottable_density_section(densities, chrom, \"CCCTAA\", lds, ecx)\n", + " nulls = raw_section.isnull().all(axis=1)\n", + " return nulls[nulls].index" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def section_to_RGB(ps, color, alpha_factor=1.2):\n", + " return np.transpose(\n", + " np.array([\n", + " np.full_like(ps, color[0]),\n", + " np.full_like(ps, color[1]),\n", + " np.full_like(ps, color[2]),\n", + " np.clip(ps*alpha_factor, a_min=None, a_max=1),\n", + " ]),\n", + " axes=(1, 2, 0),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + 
"source": [ + "def draw_fancy_arrow(\n", + " y, start, end, ax, lw=.25,\n", + " csty=\"angle3,angleA=45,angleB=-45\",\n", + " asty=\"Simple, tail_width=.25, head_width=2, head_length=3\"\n", + "):\n", + " ax.add_patch(FancyArrowPatch(\n", + " (start, y), (end, y),\n", + " connectionstyle=csty,\n", + " arrowstyle=asty,\n", + " lw=lw, color=\"#888\", clip_on=False,\n", + " ))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "POPULATION_COLORS = {\n", + " \"HG001\": \"black\",\n", + " \"HG002\": \"green\", \"HG003\": \"green\", \"HG004\": \"green\",\n", + " \"HG005\": \"steelblue\", \"HG006\": \"steelblue\", \"HG007\": \"steelblue\",\n", + "}\n", + "\n", + "def plot_subjects(dispatcher, gf, s=10, arrows=True):\n", + " for i, subject in enumerate(sorted(SUBJECT_TO_TRIO)):\n", + " sax = gf.subplot(gf.gs[0,1+i])\n", + " sax.plot([0, 0], [0, len(dispatcher)], lw=.5, color=\"#888\")\n", + " if subject in dispatcher:\n", + " truthiness = dispatcher[subject].reset_index(drop=True)\n", + " positions = truthiness[truthiness].index\n", + " for x in [-.1, 0, .1]:\n", + " sax.scatter(x=[x]*len(positions), y=positions, marker=\"_\", s=s, color=POPULATION_COLORS[subject])\n", + " sax.set(\n", + " xticks=[0], xticklabels=[subject+\" \"],\n", + " yticks=[], xlabel=None, ylabel=None,\n", + " xlim=(-.5, .5),\n", + " ylim=(len(dispatcher), -1),\n", + " )\n", + " for tick in sax.get_xticklabels():\n", + " tick.set_rotation(90)\n", + " if arrows and (subject in {\"HG002\", \"HG005\"}):\n", + " draw_fancy_arrow(len(dispatcher), 1, 0, sax)\n", + " draw_fancy_arrow(len(dispatcher), 2, 0, sax)\n", + " sax.tick_params(axis=\"both\", which=\"both\", length=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "IMSHOW_PALETTE = {\n", + " None: [.7, .7, .7],\n", + " \"CCCTAA\": [.0, .4, .1],\n", + " \"CCCTCA\": [1, 1, 0],\n", + " \"CCCCTAA\": [.5, .9, 1],\n", + "}\n", + 
"\n", + "def plot_densities(densities, chrom, data2d, ecx, gf, extent, bin_size=100):\n", + " ax = gf.subplot(gf.gs[0,0])\n", + " for motif, color in IMSHOW_PALETTE.items():\n", + " ps = get_plottable_density_section(densities, chrom, motif, data2d, ecx).values\n", + " breakat = MAXLEN // 100\n", + " if ps.shape[1] < MAXLEN:\n", + " ps = np.pad(ps, ((0, 0), (MAXLEN-ps.shape[1], 0)))\n", + " elif ps.shape[1] > MAXLEN:\n", + " ps = ps[:,-MAXLEN:]\n", + " pa = section_to_RGB(np.clip(uniform_filter1d(ps[:,::-1], 5, 1)[:,::-1], a_min=0.0, a_max=1.0), color, 2)\n", + " ax.imshow(pa, extent=extent, interpolation=\"nearest\")\n", + " ticklabels=(-np.linspace(MAXLEN//100, 0, MAXLEN//100+1).astype(int)).astype(str)\n", + " fullaxislen = len(ticklabels)\n", + " ticklabels = ticklabels[-breakat-1:]\n", + " xmin, xmax = extent[:2]\n", + " ax.set(\n", + " xticks=np.linspace(xmin, xmax, MAXLEN//100+1)[-breakat-1:],\n", + " xticklabels=ticklabels,\n", + " xlabel=\"Kbp of telomeric tract\",\n", + " yticks=[], ylabel=None,\n", + " )\n", + " ax.tick_params(axis=\"both\", which=\"both\", length=0)\n", + " ax.tick_params(axis=\"x\", which=\"both\", length=3)\n", + " if (subject == \"HG007\") or ((chrom == \"chr5\") and (subject == \"HG006\")) or ((chrom == \"chr9\") and (subject == \"HG005\")):\n", + " ax.axhline(0, 1-(breakat+1)/fullaxislen, 1, lw=1, c=\"black\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "@lru_cache(maxsize=None)\n", + "def convname(cn):\n", + " match = re.search(r'^\\d+', cn)\n", + " if match:\n", + " return match.group() + \"p\"\n", + " else:\n", + " return cn.split(\"chr\")[1] + \"p\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def fixup_labels(gf, chrom, subject):\n", + " if (chrom == \"chr2\") and (subject == \"HG002\"):\n", + " gf.figure.get_axes()[1].set_title(\"Pairwise relative\\nLevenshtein distances\", fontsize=13, 
loc=\"left\")\n", + " gf.figure.get_axes()[5].set_title(\"Subjects\", fontsize=13)\n", + " gf.figure.get_axes()[9].set_title(\"Motif densities\", loc=\"right\", fontsize=13)\n", + " if (chrom != \"17ptel_1_500K_1_12_12\") or (subject != \"HG007\"):\n", + " for ax in gf.figure.get_axes()[:-1]:\n", + " ax.set(xticklabels=[], xlabel=None)\n", + " gf.figure.get_axes()[-1].set(xlabel=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:34<00:00, 4.90s/it]\n" + ] + } + ], + "source": [ + "for chrom in tqdm(RAW_GLOBAL_LDS):\n", + " lds = RAW_GLOBAL_LDS[chrom]\n", + " absentees = get_absentees(lds, DENSITIES, chrom, ecx)\n", + " lds.drop(index=absentees, columns=absentees, inplace=True)\n", + " constrainer = pd.DataFrame(\n", + " index=lds.index, columns=[\"subject\"],\n", + " data=lds.index.map(lambda s: s.split(\":\")[1]).values\n", + " ).sort_values(by=\"subject\")\n", + " for subject in constrainer[\"subject\"].drop_duplicates():\n", + " try:\n", + " subject_index = constrainer[constrainer[\"subject\"]==subject].index\n", + " subject_lds = lds.loc[subject_index, subject_index]\n", + " subject_Z = linkage(squareform(subject_lds), metric=\"euclidean\", method=\"ward\", optimal_ordering=False)\n", + " subject_leaves = dendrogram(subject_Z, no_plot=True)[\"leaves\"]\n", + " subject_data2d = subject_lds.iloc[subject_leaves, subject_leaves]\n", + " dispatcher = pd.DataFrame(index=subject_index)\n", + " dispatcher.index.name = \"read\"\n", + " to_subject = dispatcher.index.map(lambda s: s.split(\":\")[1])\n", + " for subject in sorted(to_subject.drop_duplicates()):\n", + " dispatcher[subject] = (to_subject==subject)\n", + " h = 6*len(subject_lds)/50\n", + " w = 30\n", + " gf = GridFig([w]+[.8]*7+[.3,h,h/3], [h], scale=.2)\n", + " plot_dendrogram(subject_Z, gf=gf)\n", + " plot_heatmap(subject_data2d, gf=gf)\n", + " 
plot_subjects(dispatcher, gf=gf, s=7, arrows=(\n", + " (subject == \"HG007\") or ((chrom == \"chr5\") and (subject == \"HG006\")) or\n", + " ((chrom == \"chr9\") and (subject == \"HG005\"))\n", + " ))\n", + " plot_densities(DENSITIES, chrom, subject_data2d, ecx, gf=gf, extent=[0,w,0,h])\n", + " fixup_labels(gf, chrom, subject)\n", + " if ((chrom == \"chr5\") and (subject == \"HG006\")) or ((chrom == \"chr9\") and (subject == \"HG005\")):\n", + " pass\n", + " elif subject != \"HG007\":\n", + " gf.figure.get_axes()[-1].set(xticks=[])\n", + " except ValueError: # too few observations\n", + " continue\n", + " gf.figure.savefig(\n", + " f\"{DATA_DIR}/PacBio/haplotypes/clusters-p_arm/constrained/\"+chrom+\"-\"+subject+\".pdf\",\n", + " bbox_inches=\"tight\", pad_inches=0,\n", + " )\n", + " close(gf.figure)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Figure-S5-haplotypes-constrained-q_arm.ipynb b/assets/paper/jupyter/Figure-S5-haplotypes-constrained-q_arm.ipynb new file mode 100644 index 0000000..afa922b --- /dev/null +++ b/assets/paper/jupyter/Figure-S5-haplotypes-constrained-q_arm.ipynb @@ -0,0 +1,469 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc, rc_context, close\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools 
import lru_cache\n", + "from argparse import Namespace" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.spatial.distance import squareform\n", + "from scipy.cluster.hierarchy import dendrogram, fcluster, linkage\n", + "from sklearn.metrics import silhouette_score\n", + "from matplotlib.gridspec import GridSpec\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.cluster.hierarchy import cophenet\n", + "from scipy.stats import pearsonr, wilcoxon" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from edgecaselib.formats import load_index, load_kmerscan\n", + "from edgecaselib.densityplot import interpret_arguments\n", + "from edgecaselib.util import natsorted_chromosomes\n", + "from pickle import dump, load\n", + "from os import path\n", + "from tempfile import NamedTemporaryFile\n", + "from subprocess import check_output, CalledProcessError\n", + "from pysam import AlignmentFile\n", + "from scipy.stats import chi2_contingency\n", + "from statsmodels.stats.multitest import multipletests\n", + "from scipy.ndimage import uniform_filter1d\n", + "from matplotlib.patches import FancyArrowPatch, Rectangle\n", + "from matplotlib.lines import Line2D" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "SAMFILTERS = [[\"is_q\", \"tract_anchor\"], 0, 0]\n", + "ecx = load_index(\"../../hg38ext.fa.ecx\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "SUBJECT_TO_TRIO = {\n", + " \"HG001\": \"NA12878\",\n", + " \"HG002\": \"AshkenazimTrio\", \"HG003\": \"AshkenazimTrio\", \"HG004\": \"AshkenazimTrio\",\n", + " \"HG005\": \"ChineseTrio\", \"HG006\": \"ChineseTrio\", \"HG007\": \"ChineseTrio\",\n", + "}\n", + "Q_CHROMS = 
[\"chr7\", \"chr8\", \"chr11\", \"chr12\", \"14qtel_1-500K_1_12_12_rc\", \"chr15\", \"18qtel_1-500K_1_12_12_rc\"]\n", + "MAXLEN = 1600\n", + "DATA_DIR = \"../../../data/datasets/2021\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def squarify(narrowform):\n", + " triu_fillna = narrowform.pivot(index=\"qname1\", columns=\"qname2\", values=\"relative_ld\").fillna(0)\n", + " return triu_fillna.T + triu_fillna\n", + "\n", + "distances_narrowform = pd.read_csv(f\"{DATA_DIR}/PacBio/haplotypes/levenshtein-q_arm.tsv\", sep=\"\\t\", escapechar=\"#\")\n", + "\n", + "RAW_GLOBAL_LDS = {\n", + " rname: squarify(distances_narrowform[distances_narrowform[\"rname\"]==rname].drop(columns=\"rname\"))\n", + " for rname in distances_narrowform[\"rname\"].drop_duplicates()\n", + " if rname in Q_CHROMS\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "KMERSCANNER_PKL = f\"{DATA_DIR}/PacBio/kmerscanner-q_arm.pkl\"\n", + "KMERSCANNER_DAT = f\"{DATA_DIR}/PacBio/kmerscanner-q_arm.dat.gz\"\n", + "\n", + "if path.isfile(KMERSCANNER_PKL):\n", + " with open(KMERSCANNER_PKL, mode=\"rb\") as pkl:\n", + " DENSITIES = load(pkl)\n", + "else:\n", + " DENSITIES = load_kmerscan(KMERSCANNER_DAT, True, SAMFILTERS, 10)\n", + " with open(KMERSCANNER_PKL, mode=\"wb\") as pkl:\n", + " dump(DENSITIES, pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "class GridFig():\n", + "\n", + " def __init__(self, width_ratios, height_ratios, scale=1):\n", + " self.figure, _ = subplots(\n", + " figsize=(sum(width_ratios)*scale, sum(height_ratios)*scale),\n", + " ncols=0, nrows=0,\n", + " )\n", + " self.gs = GridSpec(\n", + " ncols=len(width_ratios), wspace=0, width_ratios=width_ratios, \n", + " nrows=len(height_ratios), hspace=0, height_ratios=height_ratios,\n", + " figure=self.figure,\n", + " )\n", + " \n", + " def 
subplot(self, gridspec_slice, aspect=\"auto\", frame=False):\n", + " ax = self.figure.add_subplot(gridspec_slice, aspect=aspect)\n", + " if frame is False:\n", + " ax.set(frame_on=False)\n", + " else:\n", + " for spine in {\"top\", \"right\", \"bottom\", \"left\"} - set(frame):\n", + " ax.spines[spine].set_visible(False)\n", + " return ax" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_dendrogram(Z, gf):\n", + " ax = gf.subplot(gf.gs[0,0])\n", + " with rc_context({\"lines.linewidth\": .5}):\n", + " dendrogram(\n", + " Z, orientation=\"left\",\n", + " link_color_func=lambda x: \"black\", ax=ax,\n", + " )\n", + " ax.set(\n", + " xticks=[], xlabel=None,\n", + " yticks=[], ylabel=None,\n", + " ylim=ax.get_ylim()[::-1],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_heatmap(data2d, gf, cmap=\"gray_r\", vmax=.15):\n", + " ax = gf.subplot(gf.gs[0,1])\n", + " sns.heatmap(data2d, cmap=cmap, cbar=False, vmin=0, vmax=vmax, ax=ax)\n", + " ax.set(xticks=[], yticks=[], xlabel=None, ylabel=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def cluster(lds, metric=\"correlation\", method=\"ward\"):\n", + " Z = linkage(squareform(lds), metric=metric, method=method, optimal_ordering=False)\n", + " leaves = dendrogram(Z, no_plot=True)[\"leaves\"]\n", + " data2d = lds.iloc[leaves, leaves].copy()\n", + " dispatcher = pd.DataFrame(index=data2d.index)\n", + " dispatcher.index.name = \"read\"\n", + " to_subject = dispatcher.index.map(lambda s: s.split(\":\")[1])\n", + " for subject in sorted(to_subject.drop_duplicates()):\n", + " dispatcher[subject] = (to_subject==subject)\n", + " return Z, data2d, dispatcher" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def 
get_plottable_density_section(densities, chrom, motif, data2d, ecx):\n", + " chrom_densities = densities[chrom]\n", + " if motif is None:\n", + " by_motif = chrom_densities[chrom_densities[\"motif\"]==\"TTAGGG\"]\n", + " else:\n", + " by_motif = chrom_densities[chrom_densities[\"motif\"]==motif]\n", + " section = by_motif.set_index(\"name\").reindex(data2d.index).iloc[:,8:].copy()\n", + " if motif is None:\n", + " section = (~section.isnull()).astype(int) / 3\n", + " section.columns = section.columns.astype(int)\n", + " anchor = ecx.loc[\n", + " (ecx[\"rname\"]==chrom) & (ecx[\"flag\"]==0x4000) & (ecx[\"prime\"]==3),\n", + " \"pos\",\n", + " ].iloc[0]\n", + " return section[[c for c in section.columns if c>=anchor]]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def get_absentees(lds, densities, chrom, ecx):\n", + " raw_section = get_plottable_density_section(densities, chrom, \"TTAGGG\", lds, ecx)\n", + " nulls = raw_section.isnull().all(axis=1)\n", + " return nulls[nulls].index" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def section_to_RGB(ps, color, alpha_factor=1.2):\n", + " return np.transpose(\n", + " np.array([\n", + " np.full_like(ps, color[0]),\n", + " np.full_like(ps, color[1]),\n", + " np.full_like(ps, color[2]),\n", + " np.clip(ps*alpha_factor, a_min=None, a_max=1),\n", + " ]),\n", + " axes=(1, 2, 0),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def draw_fancy_arrow(\n", + " y, start, end, ax, lw=.25,\n", + " csty=\"angle3,angleA=45,angleB=-45\",\n", + " asty=\"Simple, tail_width=.25, head_width=2, head_length=3\"\n", + "):\n", + " ax.add_patch(FancyArrowPatch(\n", + " (start, y), (end, y),\n", + " connectionstyle=csty,\n", + " arrowstyle=asty,\n", + " lw=lw, color=\"#888\", clip_on=False,\n", + " ))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "POPULATION_COLORS = {\n", + " \"HG001\": \"black\",\n", + " \"HG002\": \"green\", \"HG003\": \"green\", \"HG004\": \"green\",\n", + " \"HG005\": \"steelblue\", \"HG006\": \"steelblue\", \"HG007\": \"steelblue\",\n", + "}\n", + "\n", + "def plot_subjects(dispatcher, gf, s=10, arrows=True):\n", + " for i, subject in enumerate(sorted(SUBJECT_TO_TRIO)):\n", + " sax = gf.subplot(gf.gs[0,i+3])\n", + " sax.plot([0, 0], [0, len(dispatcher)], lw=.5, color=\"#888\")\n", + " if subject in dispatcher:\n", + " truthiness = dispatcher[subject].reset_index(drop=True)\n", + " positions = truthiness[truthiness].index\n", + " for x in [-.1, 0, .1]:\n", + " sax.scatter(x=[x]*len(positions), y=positions, marker=\"_\", s=s, color=POPULATION_COLORS[subject])\n", + " sax.set(\n", + " xticks=[0], xticklabels=[subject+\" \"],\n", + " yticks=[], xlabel=None, ylabel=None,\n", + " xlim=(-.5, .5),\n", + " ylim=(len(dispatcher), -1),\n", + " )\n", + " for tick in sax.get_xticklabels():\n", + " tick.set_rotation(90)\n", + " if arrows and (subject in {\"HG002\", \"HG005\"}):\n", + " draw_fancy_arrow(len(dispatcher), 1, 0, sax)\n", + " draw_fancy_arrow(len(dispatcher), 2, 0, sax)\n", + " sax.tick_params(axis=\"both\", which=\"both\", length=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "IMSHOW_PALETTE = {\n", + " None: [.7, .7, .7],\n", + " \"TTAGGG\": [.0, .4, .1],\n", + " \"TGAGGG\": [1, 1, 0],\n", + " \"TTAGGGG\": [.5, .9, 1],\n", + "}\n", + "\n", + "def plot_densities(densities, chrom, data2d, ecx, gf, extent, subject, bin_size=100):\n", + " ax = gf.subplot(gf.gs[0,-1])\n", + " for motif, color in IMSHOW_PALETTE.items():\n", + " ps = get_plottable_density_section(densities, chrom, motif, data2d, ecx).values\n", + " breakat = MAXLEN // 100\n", + " orig_len = ps.shape[1]\n", + " if ps.shape[1] < MAXLEN:\n", + " ps = np.pad(ps, ((0, 0), (0, 
MAXLEN-ps.shape[1])))\n", + " elif ps.shape[1] > MAXLEN:\n", + " ps = ps[:,:MAXLEN]\n", + " pa = section_to_RGB(np.clip(uniform_filter1d(ps, 5, 1), a_min=0.0, a_max=1.0), color, 2)\n", + " ax.imshow(pa, extent=extent, interpolation=\"nearest\")\n", + " ticklabels=np.linspace(0, MAXLEN//100, MAXLEN//100+1).astype(int).astype(str)\n", + " fullaxislen = len(ticklabels)\n", + " ticklabels = ticklabels[:breakat+1]\n", + " xmin, xmax = extent[:2]\n", + " ax.set(\n", + " xticks=np.linspace(xmin, xmax, MAXLEN//100+1)[:breakat+1],\n", + " xticklabels=ticklabels,\n", + " xlabel=\"Kbp of telomeric tract\",\n", + " yticks=[], ylabel=None,\n", + " )\n", + " ax.tick_params(axis=\"both\", which=\"both\", length=0)\n", + " ax.tick_params(axis=\"x\", which=\"both\", length=3)\n", + " if (subject == \"HG007\") or ((chrom == \"chr15\") and (subject == \"HG005\")):\n", + " ax.axhline(0, 0, (breakat+1)/fullaxislen, lw=1, c=\"black\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "@lru_cache(maxsize=None)\n", + "def convname(cn):\n", + " match = re.search(r'^\\d+', cn)\n", + " if match:\n", + " return match.group() + \"q\"\n", + " else:\n", + " return cn.split(\"chr\")[1] + \"q\"" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "def fixup_labels(gf, chrom, subject):\n", + " if (chrom == \"chr7\") and (subject == \"HG001\"):\n", + " gf.figure.get_axes()[1].set_title(\"Pairwise relative\\nLevenshtein distances\", fontsize=13, loc=\"right\")\n", + " gf.figure.get_axes()[5].set_title(\"Subjects\", fontsize=13)\n", + " gf.figure.get_axes()[-1].set_title(\"Motif densities\", loc=\"left\", fontsize=13)\n", + " if (chrom != \"18qtel_1-500K_1_12_12_rc\") or (subject != \"HG007\"):\n", + " for ax in gf.figure.get_axes()[:-1]:\n", + " ax.set(xticklabels=[], xlabel=None)\n", + " gf.figure.get_axes()[-1].set(xlabel=None)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 34, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:40<00:00, 5.72s/it]\n" + ] + } + ], + "source": [ + "for chrom in tqdm(RAW_GLOBAL_LDS):\n", + " lds = RAW_GLOBAL_LDS[chrom]\n", + " absentees = get_absentees(lds, DENSITIES, chrom, ecx)\n", + " lds.drop(index=absentees, columns=absentees, inplace=True)\n", + " constrainer = pd.DataFrame(\n", + " index=lds.index, columns=[\"subject\"],\n", + " data=lds.index.map(lambda s: s.split(\":\")[1]).values\n", + " ).sort_values(by=\"subject\")\n", + " for subject in constrainer[\"subject\"].drop_duplicates():\n", + " try:\n", + " subject_index = constrainer[constrainer[\"subject\"]==subject].index\n", + " subject_lds = lds.loc[subject_index, subject_index]\n", + " subject_Z = linkage(squareform(subject_lds), metric=\"euclidean\", method=\"ward\", optimal_ordering=False)\n", + " subject_leaves = dendrogram(subject_Z, no_plot=True)[\"leaves\"]\n", + " subject_data2d = subject_lds.iloc[subject_leaves, subject_leaves]\n", + " dispatcher = pd.DataFrame(index=subject_index)\n", + " dispatcher.index.name = \"read\"\n", + " to_subject = dispatcher.index.map(lambda s: s.split(\":\")[1])\n", + " for subject in sorted(to_subject.drop_duplicates()):\n", + " dispatcher[subject] = (to_subject==subject)\n", + " h = 6*len(subject_lds)/50\n", + " w = 30\n", + " gf = GridFig([h/3,h,.3]+[.8]*7+[w], [h], scale=.2)\n", + " plot_dendrogram(subject_Z, gf=gf)\n", + " plot_heatmap(subject_data2d, gf=gf)\n", + " plot_subjects(dispatcher, gf=gf, s=7, arrows=(\n", + " (subject==\"HG007\") or ((chrom == \"chr15\") and (subject == \"HG005\"))\n", + " ))\n", + " plot_densities(DENSITIES, chrom, subject_data2d, ecx, subject=subject, gf=gf, extent=[0,w,0,h])\n", + " fixup_labels(gf, chrom, subject)\n", + " if (chrom == \"chr15\") and (subject == \"HG005\"):\n", + " pass\n", + " elif subject != \"HG007\":\n", + " 
gf.figure.get_axes()[-1].set(xticks=[])\n", + " except ValueError: # too few observations\n", + " continue\n", + " gf.figure.savefig(\n", + " f\"{DATA_DIR}/PacBio/haplotypes/clusters-q_arm/constrained/\"+chrom+\"-\"+subject+\".pdf\",\n", + " bbox_inches=\"tight\", pad_inches=0,\n", + " )\n", + " close(gf.figure)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Reviewer-response-ONT.ipynb b/assets/paper/jupyter/Reviewer-response-ONT.ipynb new file mode 100644 index 0000000..123e9a8 --- /dev/null +++ b/assets/paper/jupyter/Reviewer-response-ONT.ipynb @@ -0,0 +1,512 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools import lru_cache" + ] + }, + { + "cell_type": "code", + "execution_count": 465, + "metadata": {}, + "outputs": [], + "source": [ + "from edgecaselib.repeatfinder import lowest_alpha_inversion as lai, custom_alpha_inversion as cai\n", + "from edgecaselib.util import revcomp\n", + "from regex import finditer, IGNORECASE\n", + "from matplotlib.patches import Rectangle\n", + "from scipy.stats import mannwhitneyu, wilcoxon\n", + "from itertools import product\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 276, + "metadata": {}, + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3/3 [00:02<00:00, 1.28it/s]\n" + ] + } + ], + "source": [ + "from pysam import AlignmentFile\n", + "from glob import glob\n", + "\n", + "READ_NAMES = {\"p\": defaultdict(set), \"q\": defaultdict(set)}\n", + "for filename in tqdm(glob(\"ONT/AshkenazimTrio/HG002/guppy-V*tailpuller.bam\")):\n", + " version = re.search(r'\\d.\\d.\\d', filename).group()\n", + " with AlignmentFile(filename) as bam:\n", + " for entry in bam:\n", + " if entry.flag & 3844 == 0:\n", + " if entry.flag & 0x4000 == 0x4000:\n", + " arm = \"q\" if (entry.flag & 0x8000 == 0x8000) else \"p\"\n", + " READ_NAMES[arm][version].add(entry.qname)\n", + "\n", + "COMMON_Q_NAMES = set.intersection(*READ_NAMES[\"q\"].values())\n", + "VERSIONS = sorted(READ_NAMES[\"q\"].keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "figure, axs = subplots(figsize=(12, 6), ncols=2)\n", + "for arm, ax in zip(\"pq\", axs):\n", + " venn(READ_NAMES[arm], ax=ax)\n", + " ax.set(title=f\"{arm} arm\")\n", + "\n", + "_ = figure.suptitle(\"Telomeric candidate reads w/ different versions of Guppy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 256, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3/3 [00:00<00:00, 373.42it/s]\n" + ] + } + ], + "source": [ + "KMER_COUNTS = pd.concat(\n", + " objs=[\n", + " pd.read_csv(filename, sep=\"\\t\", names=[\"kmer\", re.search(r'\\d.\\d.\\d', filename).group()], index_col=0)\n", + " for filename in tqdm(sorted(glob(\"ONT/AshkenazimTrio/HG002/guppy-V*tailchopper.tsv\")))\n", + " ],\n", + " axis=1, sort=False,\n", + ")\n", + "\n", + "KMER_FRAQS = KMER_COUNTS / KMER_COUNTS.sum()\n", + "KMER_FRAQS[\"identity\"] = KMER_FRAQS.index.map(lambda m: max(cai(revcomp(cai(m))), cai(m)))\n", + "KMER_FRAQS = KMER_FRAQS.groupby(\"identity\").sum()\n", + "KMER_FRAQS.index.name = \"kmer\"" + ] + }, + { + "cell_type": "code", + "execution_count": 483, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
2.3.43.2.43.4.5
kmer
TTAGGG0.0924810.0154900.092932
GAAAAA0.0091490.0103950.009628
GAAGAA0.0045320.0078840.006332
TTGGGG0.0130050.0049000.011142
TTTTTG0.0079150.0083950.008087
\n", + "
" + ], + "text/plain": [ + " 2.3.4 3.2.4 3.4.5\n", + "kmer \n", + "TTAGGG 0.092481 0.015490 0.092932\n", + "GAAAAA 0.009149 0.010395 0.009628\n", + "GAAGAA 0.004532 0.007884 0.006332\n", + "TTGGGG 0.013005 0.004900 0.011142\n", + "TTTTTG 0.007915 0.008395 0.008087" + ] + }, + "execution_count": 483, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "TARGET_KMERS = set.union(*(\n", + " set(KMER_FRAQS[version].sort_values(ascending=False)[:4].index)\n", + " for version in KMER_COUNTS.columns\n", + "))\n", + "\n", + "TARGET_FRAQS = KMER_FRAQS.loc[TARGET_KMERS]\n", + "TARGET_KMERS_ORDERED = list(TARGET_FRAQS.index)\n", + "TARGET_FRAQS" + ] + }, + { + "cell_type": "code", + "execution_count": 484, + "metadata": {}, + "outputs": [], + "source": [ + "id2m = defaultdict(set)\n", + "for m in KMER_COUNTS.index:\n", + " id2m[max(cai(revcomp(cai(m))), cai(m))].add(m)" + ] + }, + { + "cell_type": "code", + "execution_count": 485, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3/3 [00:04<00:00, 1.45s/it]\n" + ] + } + ], + "source": [ + "per_read_fraqs = {\n", + " m: pd.DataFrame(columns=READ_NAMES[\"q\"].keys(), dtype=float)\n", + " for m in TARGET_KMERS\n", + "}\n", + "\n", + "for filename in tqdm(glob(\"ONT/AshkenazimTrio/HG002/guppy-V*tailchopper.bam\")):\n", + " version = re.search(r'\\d.\\d.\\d', filename).group()\n", + " with AlignmentFile(filename) as bam:\n", + " for entry in bam:\n", + " if entry.flag & 3844 == 0:\n", + " if entry.flag & 0xC000 == 0xC000:\n", + " for m in TARGET_KMERS:\n", + " m_count = sum(\n", + " 1 for _ in finditer(r'|'.join(id2m[m]), entry.seq, overlapped=True)\n", + " )\n", + " per_read_fraqs[m].loc[entry.qname, version] = m_count / len(entry.seq)" + ] + }, + { + "cell_type": "code", + "execution_count": 486, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "converted_list = []\n", + "\n", + "for m in TARGET_KMERS:\n", + " 
narrow = per_read_fraqs[m].melt(var_name=\"version\", value_name=\"hits per base\").dropna()\n", + " narrow[\"kmer\"] = m\n", + " converted_list.append(narrow)\n", + "\n", + "fraq_dists = pd.concat(converted_list, sort=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 487, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "nv, nk = len(VERSIONS), len(TARGET_KMERS)\n", + "#dispatcher = fraq_dists[[\"version\", \"kmer\"]].drop_duplicates().sort_values(by=\"version\").reset_index(drop=True)\n", + "\n", + "figure, ax = subplots(figsize=(nv*(nk+4)/4, 6))\n", + "x = -1\n", + "ticks, titles, ticklabels = [], [], []\n", + "\n", + "KMER_COLORS = {\n", + " \"TTAGGG\": \"#117733\",\n", + " \"GAAAAA\": \"#332288\",\n", + " \"GAAGAA\": \"#D01000\",\n", + " \"TTGGGG\": \"#AA4499\",\n", + " \"TGGGCC\": \"orange\",\n", + " \"TGAGGG\": \"#DDCC77\",\n", + " \"TTTTAA\": \"purple\",\n", + " \"TTTTTG\": \"#999999\",\n", + "}\n", + "\n", + "for i, (version, kmer) in enumerate(product(VERSIONS, TARGET_KMERS_ORDERED)):\n", + " if i % nk == 0:\n", + " x += 2\n", + " else:\n", + " x += 1\n", + " titles.append(version)\n", + " ticks.append(x)\n", + " ticklabels.append(kmer)\n", + " data = fraq_dists.loc[(fraq_dists[\"version\"]==version) & (fraq_dists[\"kmer\"]==kmer), \"hits per base\"]\n", + " ax.add_patch(Rectangle(\n", + " (x-.45, 0), .9, .16, facecolor=\"#00000008\", clip_on=False,\n", + " ))\n", + " ax.scatter(x=np.random.normal(x, .1, len(data)), y=data, color=KMER_COLORS[kmer], s=2)\n", + " parts = ax.boxplot(\n", + " data, positions=[x], widths=[.9], showfliers=False,\n", + " medianprops=dict(color=\"black\", alpha=.3),\n", + " )\n", + " xd, yd = parts[\"boxes\"][0].get_xdata(), parts[\"boxes\"][0].get_ydata()\n", + " ax.add_patch(Rectangle(\n", + " (xd.min(), yd.min()), xd.max()-xd.min(), yd.max()-yd.min(),\n", + " facecolor=\"#FFFFFF99\",\n", + " ))\n", + "\n", + "ax.set(xticks=ticks, xticklabels=ticklabels, ylim=(-.005, .155))\n", + "for tick in ax.get_xticklabels():\n", + " tick.set(rotation=70, fontname=\"Monospace\")\n", + "\n", + "for spine in \"top\", \"right\":\n", + " ax.spines[spine].set(visible=False)\n", + "\n", + "for version, x in zip(VERSIONS, [(nk+1)/2, 
(nk+1)/2+nk+1, (nk+1)/2+(nk+1)*2]):\n", + " ax.text(x=x, y=.165, s=f\"Guppy {version}\", ha=\"center\", va=\"bottom\")\n", + " ax.plot([x-nk/2, x+nk/2], [.16, .16], color=\"gray\", lw=1, ls=\"--\", clip_on=False)\n", + "\n", + "ax.set(xlim=(0, ax.get_xlim()[1]), ylabel=\"hits per base\")\n", + "figure.savefig(\"Figure SY, ONT.pdf\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": 429, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5.669624079052662e-66" + ] + }, + "execution_count": 429, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_, ttaggg_p1 = mannwhitneyu(\n", + " fraq_dists.query(\"(version=='3.2.4') & (kmer=='TTAGGG')\")[\"hits per base\"],\n", + " fraq_dists.query(\"(version=='2.3.4') & (kmer=='TTAGGG')\")[\"hits per base\"],\n", + " alternative=\"less\",\n", + ")\n", + "ttaggg_p1 # TTAGGG in 3.2.4 is less frequent than in 2.3.4" + ] + }, + { + "cell_type": "code", + "execution_count": 430, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.4171992340962484e-64" + ] + }, + "execution_count": 430, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_, ttaggg_p2 = mannwhitneyu(\n", + " fraq_dists.query(\"(version=='3.2.4') & (kmer=='TTAGGG')\")[\"hits per base\"],\n", + " fraq_dists.query(\"(version=='3.4.5') & (kmer=='TTAGGG')\")[\"hits per base\"],\n", + " alternative=\"less\",\n", + ")\n", + "ttaggg_p2 # TTAGGG in 3.2.4 is less frequent than in 3.4.5" + ] + }, + { + "cell_type": "code", + "execution_count": 499, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4.1888904848499314e-50" + ] + }, + "execution_count": 499, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_, gaagaa_p1 = mannwhitneyu(\n", + " fraq_dists.query(\"(version=='2.3.4') & (kmer=='GAAGAA')\")[\"hits per base\"],\n", + " fraq_dists.query(\"(version=='3.2.4') & (kmer=='GAAGAA')\")[\"hits per base\"],\n", + " 
alternative=\"less\",\n", + ")\n", + "\n", + "gaagaa_p1 # GAAGAA in 3.2.4 is more frequent than in 2.3.4" + ] + }, + { + "cell_type": "code", + "execution_count": 501, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.345919150777346e-43" + ] + }, + "execution_count": 501, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "_, gaagaa_p2 = mannwhitneyu(\n", + " fraq_dists.query(\"(version=='3.4.5') & (kmer=='GAAGAA')\")[\"hits per base\"],\n", + " fraq_dists.query(\"(version=='3.2.4') & (kmer=='GAAGAA')\")[\"hits per base\"],\n", + " alternative=\"less\",\n", + ")\n", + "\n", + "gaagaa_p2 # GAAGAA in 3.2.4 is more frequent than in 3.4.5" + ] + }, + { + "cell_type": "code", + "execution_count": 504, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2.26784963e-65, 9.66879694e-64, 1.67555619e-49, 5.38367660e-43])" + ] + }, + "execution_count": 504, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from statsmodels.stats.multitest import multipletests\n", + "multipletests([ttaggg_p1, ttaggg_p2, gaagaa_p1, gaagaa_p2], method=\"bonferroni\")[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Jellyfish three versions for 6-mers, select five top 6-mers.\n", + "\n", + "Not only did TTAGGG occur only about once every 10 bases in the \"better\" guppies, but:\n", + "* TTAGGG was significantly less frequent in Guppy 3.2.4 than in the two other versions (p=2.3e-65 and p=9.7e-64, respectively)\n", + "* GAAGAA was significantly more frequent (p=1.7e-49, p=5.4e-43)\n", + "\n", + "While it looks like 3.2.4 is just the odd one out, the fact that the output of the model changes so drastically and seemingly arbitrarily between different versions of the basecaller indicates that it is too early to trust these outputs for telomere analyses."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Table-1-repeatfinder.ipynb b/assets/paper/jupyter/Table-1-repeatfinder.ipynb new file mode 100644 index 0000000..8ce1980 --- /dev/null +++ b/assets/paper/jupyter/Table-1-repeatfinder.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools import lru_cache" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "TEX_HEADER = r'\\begin{samepage} \\begin{table}[h!] \\small'\n", + "TEX_FOOTER = \"\\\\caption{}\\n\\\\label{}\\n\\\\end{table}\\n\\\\end{samepage}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{samepage} \\begin{table}[h!] \\small\n", + "\\begin{tabular}{lllllllllllllllll}\n", + "\\hline\n", + "arm & monomer & \\multicolumn{7}{l}{fraction\\_explained} & \\multicolumn{7}{l}{score} & p\\_adjusted \\\\\n", + " & . & HG001 & HG002 & HG003 & HG004 & HG005 & HG006 & HG007 & HG001 & HG002 & HG003 & HG004 & HG005 & HG006 & HG007 & . 
\\\\\n", + "\\hline\n", + " q & TTAGGG & 74.5 & 82.5 & 80.1 & 81.7 & 75.7 & 77.5 & 62.2 & 0.6295 & 0.7126 & 0.6255 & 0.6497 & 0.6113 & 0.5988 & 0.4550 & 9.51e-113 \\\\\n", + " q & TTGGGG & 2.5 & 3.4 & 2.8 & 2.8 & 2.4 & 3.1 & 6.6 & 0.0158 & 0.0229 & 0.0175 & 0.0179 & 0.0155 & 0.0197 & 0.0434 & 4.04e-58 \\\\\n", + " q & TTAGGGG & 4.6 & 4.8 & 7.2 & 6.0 & 5.1 & 7.6 & 9.0 & 0.0152 & 0.0166 & 0.0200 & 0.0163 & 0.0161 & 0.0232 & 0.0279 & 4.22e-110 \\\\\n", + " q & TGAGGG & 1.9 & 2.5 & 1.7 & 2.0 & 3.6 & 2.9 & 4.1 & 0.0128 & 0.0162 & 0.0102 & 0.0129 & 0.0230 & 0.0184 & 0.0265 & 1.15e-47 \\\\\n", + " q & TTCGGG & 1.2 & 0.5 & 0.7 & 0.4 & 1.4 & 1.1 & 2.5 & 0.0080 & 0.0034 & 0.0043 & 0.0025 & 0.0095 & 0.0077 & 0.0168 & 7.68e-46 \\\\\n", + " q & TTAGGGTTAGGGG & 3.0 & 3.3 & 6.3 & 5.4 & 3.7 & 6.0 & 6.5 & 0.0043 & 0.0050 & 0.0090 & 0.0073 & 0.0053 & 0.0083 & 0.0092 & 2.76e-102 \\\\\n", + " q & TCAGGG & 0.9 & 0.7 & 1.1 & 1.0 & 1.1 & 0.8 & 1.4 & 0.0065 & 0.0044 & 0.0078 & 0.0069 & 0.0082 & 0.0058 & 0.0087 & 1.22e-24 \\\\\n", + " q & TTAGG & 1.8 & 1.6 & 3.4 & 4.2 & 2.0 & 3.2 & 1.9 & 0.0048 & 0.0041 & 0.0092 & 0.0110 & 0.0052 & 0.0084 & 0.0049 & 4.60e-94 \\\\\n", + " q & TAGGG & 2.3 & 1.9 & 3.1 & 3.0 & 2.8 & 3.2 & 2.4 & 0.0050 & 0.0039 & 0.0067 & 0.0063 & 0.0058 & 0.0067 & 0.0048 & 5.75e-91 \\\\\n", + " q & TTAGGTTAGGG & 2.7 & 2.6 & 5.2 & 6.5 & 2.8 & 4.9 & 2.5 & 0.0037 & 0.0034 & 0.0069 & 0.0088 & 0.0037 & 0.0065 & 0.0033 & 1.97e-89 \\\\\n", + " q & TAGGGC & 0.5 & 0.4 & 0.6 & 0.6 & 0.8 & 0.2 & 1.3 & 0.0039 & 0.0032 & 0.0047 & 0.0047 & 0.0060 & 0.0014 & 0.0099 & 5.64e-42 \\\\\n", + " q & TTTAGGG & 1.5 & 1.5 & 1.4 & 1.4 & 1.4 & 2.2 & 2.5 & 0.0048 & 0.0039 & 0.0029 & 0.0028 & 0.0034 & 0.0055 & 0.0058 & 2.32e-79 \\\\\n", + " q & TAGGGG & 0.7 & 0.9 & 0.6 & 0.9 & 0.7 & 0.6 & 1.2 & 0.0035 & 0.0051 & 0.0028 & 0.0044 & 0.0034 & 0.0025 & 0.0060 & 2.68e-42 \\\\\n", + " q & TAGGGTTAGGG & 3.1 & 2.6 & 3.9 & 4.0 & 3.5 & 3.8 & 2.9 & 0.0036 & 0.0031 & 0.0041 & 0.0041 & 0.0041 & 0.0040 & 0.0035 & 
1.45e-84 \\\\\n", + " q & TTAAGGG & 0.8 & 1.2 & 1.1 & 0.8 & 1.0 & 1.2 & 1.3 & 0.0022 & 0.0030 & 0.0032 & 0.0021 & 0.0029 & 0.0034 & 0.0032 & 4.87e-70 \\\\\n", + " q & TTGGG & 1.4 & 0.9 & 1.9 & 1.7 & 1.8 & 1.9 & 1.4 & 0.0022 & 0.0013 & 0.0032 & 0.0026 & 0.0028 & 0.0028 & 0.0022 & 3.17e-70 \\\\\n", + " q & TTAGGGTTTAGGG & 1.2 & 1.4 & 1.4 & 1.5 & 1.3 & 2.0 & 2.3 & 0.0011 & 0.0017 & 0.0013 & 0.0014 & 0.0016 & 0.0021 & 0.0033 & 5.17e-68 \\\\\n", + " q & TTGGGTTAGGG & 1.7 & 1.0 & 2.1 & 1.9 & 1.9 & 2.0 & 1.1 & 0.0012 & 0.0007 & 0.0013 & 0.0014 & 0.0015 & 0.0014 & 0.0008 & 1.75e-53 \\\\\n", + " q & TTAGGGTTAAGGG & 0.5 & 1.0 & 0.9 & 0.5 & 0.7 & 0.7 & 1.0 & 0.0005 & 0.0020 & 0.0009 & 0.0004 & 0.0006 & 0.0009 & 0.0007 & 1.03e-50 \\\\\n", + " p & CCCTAA & 21.5 & 36.3 & 19.9 & 17.1 & 32.0 & 16.9 & 11.6 & 0.1687 & 0.3113 & 0.1491 & 0.1258 & 0.2639 & 0.1255 & 0.0831 & 9.51e-113 \\\\\n", + " p & CCCCAA & 1.5 & 1.6 & 1.4 & 1.1 & 1.8 & 1.1 & 1.4 & 0.0100 & 0.0104 & 0.0087 & 0.0073 & 0.0120 & 0.0073 & 0.0093 & 1.05e-73 \\\\\n", + " p & CCCCTAA & 2.3 & 2.4 & 1.9 & 2.0 & 2.2 & 1.9 & 1.9 & 0.0075 & 0.0075 & 0.0054 & 0.0059 & 0.0067 & 0.0056 & 0.0061 & 9.17e-109 \\\\\n", + " p & CCCCTAACCCTAA & 1.8 & 2.0 & 1.6 & 1.6 & 2.0 & 1.6 & 1.3 & 0.0029 & 0.0031 & 0.0023 & 0.0023 & 0.0029 & 0.0023 & 0.0022 & 1.46e-97 \\\\\n", + " p & GGCGCA & 2.1 & 1.8 & 1.4 & 1.1 & 1.6 & 1.4 & 1.1 & 0.0028 & 0.0023 & 0.0019 & 0.0014 & 0.0022 & 0.0020 & 0.0016 & 2.35e-27 \\\\\n", + " p & CCGCG & 1.1 & 0.8 & 0.7 & 0.5 & 0.8 & 0.9 & 0.9 & 0.0028 & 0.0020 & 0.0018 & 0.0013 & 0.0021 & 0.0022 & 0.0021 & 4.35e-100 \\\\\n", + " p & CCCTA & 0.9 & 1.1 & 1.0 & 0.9 & 1.2 & 0.8 & 0.5 & 0.0020 & 0.0021 & 0.0022 & 0.0019 & 0.0026 & 0.0015 & 0.0010 & 2.38e-98 \\\\\n", + " p & CCTAA & 0.8 & 1.0 & 0.9 & 0.9 & 0.6 & 0.6 & 0.4 & 0.0020 & 0.0026 & 0.0023 & 0.0023 & 0.0016 & 0.0016 & 0.0010 & 5.75e-100 \\\\\n", + " p & CCCTAACCTAA & 1.1 & 1.6 & 1.3 & 1.2 & 0.9 & 0.9 & 0.5 & 0.0015 & 0.0021 & 0.0017 & 0.0016 & 0.0012 & 0.0012 & 0.0007 & 
1.47e-80 \\\\\n", + " p & CCCTACCCTAA & 1.1 & 1.3 & 1.2 & 0.9 & 1.6 & 0.9 & 0.5 & 0.0012 & 0.0020 & 0.0012 & 0.0011 & 0.0021 & 0.0010 & 0.0007 & 6.67e-77 \\\\\n", + "\\hline\n", + "\\end{tabular}\n", + "\\caption{}\n", + "\\label{}\n", + "\\end{table}\n" + ] + } + ], + "source": [ + "def convert_tsv(filename, arm):\n", + " tsv = pd.read_csv(filename, sep=\"\\t\", escapechar=\"#\", header=[0,1])\n", + " tsv.columns = pd.MultiIndex.from_tuples([\n", + " (top, bottom if ((bottom != \"monomer\") and (not bottom.startswith(\"Unnamed\"))) else \".\")\n", + " for top, bottom in list(tsv.columns)\n", + " ])\n", + " tsv = tsv[(tsv.iloc[:,1:8]>=.01).any(axis=1)]\n", + " for i in range(1, 8):\n", + " tsv.iloc[:,i] = tsv.iloc[:,i].apply(lambda x: \"<0.1\" if x < .0005 else format(100*x, \".1f\"))\n", + " for i in range(8, 15):\n", + " tsv.iloc[:,i] = tsv.iloc[:,i].apply(lambda x: format(x, \".4f\"))\n", + " tsv.iloc[:,15] = tsv.iloc[:,15].apply(lambda x: format(x, \".2e\"))\n", + " tsv.insert(loc=0, column=(\"Arm\", \"\"), value=arm)\n", + " return tsv\n", + "\n", + "preformatted = pd.concat(\n", + " objs=[convert_tsv(\"PacBio/repeatfinder-q_arm.tsv\", arm=\"q\"), convert_tsv(\"PacBio/repeatfinder-p_arm.tsv\", arm=\"p\")],\n", + " axis=0, sort=False,\n", + ")\n", + "\n", + "with open(\"Table-1-repeatfinder.tex\", mode=\"wt\") as tex:\n", + " print(\n", + " TEX_HEADER,\n", + " preformatted.to_latex(index=False).rstrip(\"\\n\")\n", + " .replace(r'\\toprule', r'\\hline')\n", + " .replace(r'\\midrule', r'\\hline')\n", + " .replace(r'\\bottomrule', r'\\hline'),\n", + " TEX_FOOTER,\n", + " sep=\"\\n\", file=tex,\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Table-3-readtests.ipynb b/assets/paper/jupyter/Table-3-readtests.ipynb new file mode 100644 index 0000000..e7bdece --- /dev/null +++ b/assets/paper/jupyter/Table-3-readtests.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from itertools import count, islice\n", + "from functools import lru_cache" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "style.use([\"seaborn-poster\", \"seaborn-whitegrid\"])\n", + "rc(\"axes\", linewidth=1, edgecolor=\"black\")\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from pickle import load\n", + "\n", + "with open(\"for_wilcoxon-p_arm.pkl\", mode=\"rb\") as pkl:\n", + " P = load(pkl)\n", + "with open(\"for_wilcoxon-q_arm.pkl\", mode=\"rb\") as pkl:\n", + " Q = load(pkl)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import wilcoxon\n", + "\n", + "def wilcoxon_dropna(df, a, b):\n", + " dfnona = df[[a, b]].dropna()\n", + " try:\n", + " yes = sum(dfnona[a] < dfnona[b])\n", + " no = sum(dfnona[a] > dfnona[b])\n", + " p = wilcoxon(dfnona[a], dfnona[b])[1]\n", + " return yes, no, p\n", + " except ValueError:\n", + " return np.nan, np.nan, np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from statsmodels.stats.multitest import multipletests\n", + "\n", + "overall = 
pd.DataFrame(\n", + " index=['s2t_p', 's2o_p', 't2o_p', 'aff_p', 'afm_p', 'cff_p', 'cfm_p'],\n", + " columns=[\"p\"],\n", + " data=[\n", + " wilcoxon_dropna(pd.concat(P.cd_list + Q.cd_list), \"subject\", \"trio\")[2],\n", + " wilcoxon_dropna(pd.concat(P.cd_list + Q.cd_list), \"subject\", \"outgroup\")[2],\n", + " wilcoxon_dropna(pd.concat(P.cd_list + Q.cd_list), \"trio\", \"outgroup\")[2],\n", + " wilcoxon_dropna(pd.concat(P.aff_list + Q.aff_list), \"father to son\", \"father to mother\")[2],\n", + " wilcoxon_dropna(pd.concat(P.afm_list + Q.afm_list), \"mother to son\", \"mother to father\")[2],\n", + " wilcoxon_dropna(pd.concat(P.cff_list + Q.cff_list), \"father to son\", \"father to mother\")[2],\n", + " wilcoxon_dropna(pd.concat(P.cfm_list + Q.cfm_list), \"mother to son\", \"mother to father\")[2],\n", + " ],\n", + ")\n", + "\n", + "overall[\"p_adjusted\"] = multipletests(overall[\"p\"], method=\"bonferroni\")[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pp_adjustedp_adjusted_formatted
s2t_p6.043497e-574.230448e-564.2e-56
s2o_p1.088936e-1077.622554e-1077.6e-107
t2o_p3.205007e-412.243505e-402.2e-40
aff_p5.317692e-123.722384e-113.7e-11
afm_p1.608129e-011.000000e+001.00
cff_p4.794396e-033.356077e-023.4e-02
cfm_p3.224968e-022.257478e-010.23
\n", + "
" + ], + "text/plain": [ + " p p_adjusted p_adjusted_formatted\n", + "s2t_p 6.043497e-57 4.230448e-56 4.2e-56\n", + "s2o_p 1.088936e-107 7.622554e-107 7.6e-107\n", + "t2o_p 3.205007e-41 2.243505e-40 2.2e-40\n", + "aff_p 5.317692e-12 3.722384e-11 3.7e-11\n", + "afm_p 1.608129e-01 1.000000e+00 1.00\n", + "cff_p 4.794396e-03 3.356077e-02 3.4e-02\n", + "cfm_p 3.224968e-02 2.257478e-01 0.23" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PRINT_NS = False\n", + "\n", + "if PRINT_NS:\n", + " format_pval = lambda p: \"ns\" if (p >= .05) else (\"<1.0e-300\" if (p < 1e-300) else format(p, \".1e\"))\n", + "else:\n", + " format_pval = lambda p: format(p, \".2f\") if (p >= .05) else (\"<1.0e-300\" if (p < 1e-300) else format(p, \".1e\"))\n", + "\n", + "overall[\"p_adjusted_formatted\"] = overall[\"p_adjusted\"].apply(format_pval)\n", + "overall" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/Table-S1-mapping-statistics.ipynb b/assets/paper/jupyter/Table-S1-mapping-statistics.ipynb new file mode 100644 index 0000000..cf49cc6 --- /dev/null +++ b/assets/paper/jupyter/Table-S1-mapping-statistics.ipynb @@ -0,0 +1,672 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import re\n", + "from matplotlib.pyplot import subplots, style, rc\n", + "from tqdm import tqdm\n", + "from venn import venn, pseudovenn\n", + "from collections import defaultdict\n", + "from 
itertools import count, islice\n", + "from functools import lru_cache\n", + "from pysam import AlignmentFile" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "SUBJECT_TO_TRIO = {\n", + " \"HG001\": \"NA12878\",\n", + " \"HG002\": \"AshkenazimTrio\", \"HG003\": \"AshkenazimTrio\", \"HG004\": \"AshkenazimTrio\",\n", + " \"HG005\": \"ChineseTrio\", \"HG006\": \"ChineseTrio\", \"HG007\": \"ChineseTrio\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "ecx = pd.read_csv(\"assets/hg38ext.fa.ecx\", sep=\"\\t\", skiprows=1, escapechar=\"#\") \\\n", + " .query(\"flag==16384\").query(\"blacklist=='-'\") \\\n", + " .drop(columns=[\"entry\", \"pos+1\", \"main_rname\", \"flag\", \"link\", \"blacklist\", \"class\"])\n", + "\n", + "ecx[\"sorter\"] = ecx[\"chromosome\"].apply(lambda c: 999 if c == \"chrX\" else int(c[3:]))\n", + "ecx = ecx.sort_values(by=\"sorter\").drop(columns=\"sorter\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def load_bam(filename):\n", + " p_arm, q_arm = defaultdict(list), defaultdict(list)\n", + " with AlignmentFile(filename) as bam:\n", + " for entry in bam:\n", + " if (entry.flag & 0x4000 == 0x4000) and (entry.seq is not None) and (entry.flag & 3844 == 0): # tract_anchor\n", + " entry_stats = [\n", + " entry.reference_start, entry.query_alignment_start,\n", + " entry.reference_end, entry.query_alignment_end,\n", + " len(entry.seq)\n", + " ]\n", + " if entry.flag & 0x8000 == 0x8000: # is_q\n", + " q_arm[entry.reference_name].append(entry_stats)\n", + " else:\n", + " p_arm[entry.reference_name].append(entry_stats)\n", + " return p_arm, q_arm" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + 
"outputs": [], + "source": [ + "def arm_to_counts(arm, name):\n", + " return pd.Series({rname: len(reads) for rname, reads in arm.items()}, name=name)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "p_counts, q_counts = [], []\n", + "\n", + "for subject, trio in SUBJECT_TO_TRIO.items():\n", + " p_arm, q_arm = load_bam(f\"PacBio/{trio}/{subject}/tailpuller.bam\")\n", + " p_counts.append(arm_to_counts(p_arm, subject))\n", + " q_counts.append(arm_to_counts(q_arm, subject))\n", + "\n", + "raw_p_counts = pd.concat(p_counts, axis=1, sort=False).fillna(0).astype(int)\n", + "raw_p_counts.index.name = \"rname\"\n", + "raw_p_counts.reset_index(inplace=True)\n", + "\n", + "raw_q_counts = pd.concat(q_counts, axis=1, sort=False).fillna(0).astype(int)\n", + "raw_q_counts.index.name = \"rname\"\n", + "raw_q_counts.reset_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HG001HG002HG003HG004HG005HG006HG007total
armchromosomername
pchr10chr102121525431
chr12chr121021168616683
chr16chr168201203025
chr1717ptel_1_500K_1_12_12091713829
chr18chr1834944622213668321
chr1919ptel_1-500K_1_12_12070020514
chr2chr21063182561211145
chr5chr518611412111031157
chr8chr800040037
chr9chr9926171645973204
total91295113111531442091016
qchr1010qtel_1-500K_1_12_12_rc0200015017
chr11chr1124633520162350231
chr12chr1219584627201815203
chr1414qtel_1-500K_1_12_12_rc1467311191313158
chr15chr1551362631418054319
chr1616qtel_1-500K_1_12_12_rc066802830
chr1717qtel_1-500K_1_12_12v2_rc161218221
chr1818qtel_1-500K_1_12_12_rc1157211191818145
chr19chr198113724271797
chr22qtel_1-500K_1_12_12_rc00001001
chr21chr21541575237524042434
chr22chr2251383352106657361
chr3chr300030104
chr55qtel_1-500K_1_12_12_rc68154104753560102598
chr66qtel_1-500K_1_12_12_rc451214745394949395
chr7chr719722721162325203
chr8chr822743222202138229
chrXchrX431072216362336283
total38411194963883294875263729
total47514146094993826317354745
\n", + "
" + ], + "text/plain": [ + " HG001 HG002 HG003 HG004 \\\n", + "arm chromosome rname \n", + "p chr10 chr10 2 12 1 5 \n", + " chr12 chr12 10 21 16 8 \n", + " chr16 chr16 8 2 0 12 \n", + " chr17 17ptel_1_500K_1_12_12 0 9 1 7 \n", + " chr18 chr18 34 94 46 22 \n", + " chr19 19ptel_1-500K_1_12_12 0 7 0 0 \n", + " chr2 chr2 10 63 18 25 \n", + " chr5 chr5 18 61 14 12 \n", + " chr8 chr8 0 0 0 4 \n", + " chr9 chr9 9 26 17 16 \n", + " total 91 295 113 111 \n", + "q chr10 10qtel_1-500K_1_12_12_rc 0 2 0 0 \n", + " chr11 chr11 24 63 35 20 \n", + " chr12 chr12 19 58 46 27 \n", + " chr14 14qtel_1-500K_1_12_12_rc 14 67 31 11 \n", + " chr15 chr15 51 36 26 31 \n", + " chr16 16qtel_1-500K_1_12_12_rc 0 6 6 8 \n", + " chr17 17qtel_1-500K_1_12_12v2_rc 1 6 1 2 \n", + " chr18 18qtel_1-500K_1_12_12_rc 11 57 21 11 \n", + " chr19 chr19 8 1 13 7 \n", + " chr2 2qtel_1-500K_1_12_12_rc 0 0 0 0 \n", + " chr21 chr21 54 157 52 37 \n", + " chr22 chr22 5 138 33 52 \n", + " chr3 chr3 0 0 0 3 \n", + " chr5 5qtel_1-500K_1_12_12_rc 68 154 104 75 \n", + " chr6 6qtel_1-500K_1_12_12_rc 45 121 47 45 \n", + " chr7 chr7 19 72 27 21 \n", + " chr8 chr8 22 74 32 22 \n", + " chrX chrX 43 107 22 16 \n", + " total 384 1119 496 388 \n", + "total 475 1414 609 499 \n", + "\n", + " HG005 HG006 HG007 total \n", + "arm chromosome rname \n", + "p chr10 chr10 2 5 4 31 \n", + " chr12 chr12 6 16 6 83 \n", + " chr16 chr16 0 3 0 25 \n", + " chr17 17ptel_1_500K_1_12_12 1 3 8 29 \n", + " chr18 chr18 21 36 68 321 \n", + " chr19 19ptel_1-500K_1_12_12 2 0 5 14 \n", + " chr2 chr2 6 12 11 145 \n", + " chr5 chr5 11 10 31 157 \n", + " chr8 chr8 0 0 3 7 \n", + " chr9 chr9 4 59 73 204 \n", + " total 53 144 209 1016 \n", + "q chr10 10qtel_1-500K_1_12_12_rc 0 15 0 17 \n", + " chr11 chr11 16 23 50 231 \n", + " chr12 chr12 20 18 15 203 \n", + " chr14 14qtel_1-500K_1_12_12_rc 9 13 13 158 \n", + " chr15 chr15 41 80 54 319 \n", + " chr16 16qtel_1-500K_1_12_12_rc 0 2 8 30 \n", + " chr17 17qtel_1-500K_1_12_12v2_rc 1 8 2 21 \n", + " chr18 
18qtel_1-500K_1_12_12_rc 9 18 18 145 \n", + " chr19 chr19 24 27 17 97 \n", + " chr2 2qtel_1-500K_1_12_12_rc 1 0 0 1 \n", + " chr21 chr21 52 40 42 434 \n", + " chr22 chr22 10 66 57 361 \n", + " chr3 chr3 0 1 0 4 \n", + " chr5 5qtel_1-500K_1_12_12_rc 35 60 102 598 \n", + " chr6 6qtel_1-500K_1_12_12_rc 39 49 49 395 \n", + " chr7 chr7 16 23 25 203 \n", + " chr8 chr8 20 21 38 229 \n", + " chrX chrX 36 23 36 283 \n", + " total 329 487 526 3729 \n", + "total 382 631 735 4745 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p_counts = pd.merge(raw_p_counts, ecx.loc[ecx[\"prime\"]==5, [\"rname\", \"chromosome\"]]).sort_values(by=\"chromosome\")\n", + "p_counts[\"arm\"] = \"p\"\n", + "p_counts = p_counts.set_index([\"arm\", \"chromosome\", \"rname\"])\n", + "\n", + "q_counts = pd.merge(raw_q_counts, ecx.loc[ecx[\"prime\"]==3, [\"rname\", \"chromosome\"]]).sort_values(by=\"chromosome\")\n", + "q_counts[\"arm\"] = \"q\"\n", + "q_counts = q_counts.set_index([\"arm\", \"chromosome\", \"rname\"])\n", + "\n", + "assert len(p_counts) == len(raw_p_counts)\n", + "assert len(q_counts) == len(raw_q_counts)\n", + "\n", + "p_counts.loc[(\"p\", \"total\", \"\")] = p_counts.sum()\n", + "q_counts.loc[(\"q\", \"total\", \"\")] = q_counts.sum()\n", + "\n", + "counts = pd.concat([p_counts, q_counts])\n", + "counts.loc[(\"total\", \"\", \"\")] = p_counts.loc[(\"p\", \"total\", \"\")] + q_counts.loc[(\"q\", \"total\", \"\")]\n", + "\n", + "counts[\"total\"] = counts.sum(axis=1)\n", + "counts" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "counts.to_csv(\"Table-S1-coverage.tsv\", sep=\"\\t\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/assets/paper/jupyter/edgecaselib b/assets/paper/jupyter/edgecaselib new file mode 120000 index 0000000..a0f5762 --- /dev/null +++ b/assets/paper/jupyter/edgecaselib @@ -0,0 +1 @@ +../../../edgecaselib \ No newline at end of file diff --git a/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx b/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx new file mode 100644 index 0000000..e82c98a Binary files /dev/null and b/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx differ diff --git a/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.pdf b/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.pdf new file mode 100644 index 0000000..3fecf4f Binary files /dev/null and b/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.pdf differ diff --git a/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.tex b/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.tex new file mode 100644 index 0000000..8e55039 --- /dev/null +++ b/assets/paper/manuscript/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.tex @@ -0,0 +1,676 @@ +\documentclass{article} + +\usepackage[tmargin=.9in, bmargin=.9in, lmargin=1in, rmargin=1in]{geometry} +\usepackage{bookmark, wrapfig, enumitem, pdflscape, hyphenat} +\usepackage[labelfont=bf]{caption} +\usepackage{lmodern} +\usepackage[sfdefault]{roboto} +\usepackage[T1]{fontenc} + +\usepackage{graphicx} + \makeatletter % tex.stackexchange.com/a/28565 + \setlength{\@fptop}{0pt} + \setlength{\@fpbot}{0pt plus 1fil} + \makeatother + +\usepackage[absolute, 
overlay]{textpos} + \setlength{\TPHorizModule}{1mm} + \setlength{\TPVertModule}{1mm} + +\usepackage{xcolor} + \definecolor{WCM}{RGB}{172,31,44} % AC1E2C + +\usepackage{hyperref} \hypersetup{ + colorlinks=true, linkcolor={blue!65!black}, + citecolor={blue!65!black}, urlcolor={blue!50!black}, + pdfpagelayout=OneColumn, pdfstartview={XYZ null null 1.25}, + bookmarksnumbered=true, bookmarksopen=true, bookmarksopenlevel=3 +} + +% Genome Research citation style: +% https://github.com/khyox/GR + \RequirePackage[ + backend=bibtex,maxcitenames=2,maxbibnames=10, + style=authoryear,sorting=nyt,sortlocale=auto, + uniquename=init,maxitems=2,giveninits=true,terseinits=true, + dashed=false,doi=false,isbn=false,url=false, + % uniquelist=false, % Avoid disambiguation adding a 2n surname + ]{biblatex} + \DeclareNameAlias{sortname}{family-given} % for the bibliography section + \renewcommand*{\revsdnamepunct}{} % No commas between last and first names + \DeclareFieldFormat*{title}{#1} % Do not quote the title + \renewbibmacro*{date+extradate}{ % Avoid year inside parenthesis + \iffieldundef{labelyear} + {} + {\printtext[]{\printlabeldateextra} + \newunit} + }% + \DeclareBibliographyDriver{book}{% + \printnames{author}% + \finentry} + \DeclareBibliographyDriver{article}{% + \printnames{author}% + \newunit% + \usebibmacro{date+extradate}% + \printfield{title}\printfield{titleaddon}% + \newunit% + \printfield{journaltitle}% + \space%?\newunit% + \printfield{volume}% + \printfield{pages}% + \usebibmacro{doi} + \usebibmacro{finentry}} + \DeclareFieldFormat[article]{journaltitle}{\mkbibemph{#1}} % italic title + \DeclareFieldFormat[article]{volume}{\mkbibbold{#1}\addcolon\addspace} % -- + \DeclareFieldFormat[article]{pages}{#1} + \newbibmacro*{doi}{% + \iffieldequalstr{journaltitle}{BioRxiv}{ % DOI only for bioRxiv + \newunit + \printfield{doi}}{} + } + \DeclareBibliographyDriver{book}{% + \printnames{author}% + \newunit% + \usebibmacro{date+extradate}% + 
\usebibmacro{chaptertitle+booktitle}% + \newunit + \ifnameundef{editor}{}{\printtext[parens]{\usebibmacro{byeditor+others}}} % Print (editors) + \newunit + \usebibmacro{chapter+pages}% + \newunit + \usebibmacro{publisher+location}% + \usebibmacro{finentry}} + \DeclareFieldFormat[book]{booktitle}{\mkbibemph{#1}} % italic journal title + \newbibmacro{chaptertitle+booktitle}{% + \iffieldundef{title} + {\printfield{booktitle}} + {\printfield{title}\newunit\bibstring{in}\addspace\printfield{booktitle}} + } + \newbibmacro*{publisher+location}{% + \printlist{publisher} + \iflistundef{location} + {} + {\addcomma\newunit\printlist{location}} + \newunit + } + \DeclareBibliographyAlias{inbook}{book} + +% https://tex.stackexchange.com/a/371909 + \renewcommand*{\finalnamedelim}{\addcomma\addspace}% + +% https://tex.stackexchange.com/a/359430 + \DeclareDelimFormat[parencite]{finalnamedelim}{% + \ifnumgreater{\value{liststop}}{2}{\finalandcomma}{}% + \addspace\bibstring{and}\space} + +%\usepackage[natbib=true,backend=bibtex,style=authoryear]{biblatex} + \addbibresource{references.bib} + +\newcommand{\beginsupplement}{ +% bytesizebio.net/2013/03/11/adding-supplementary-tables-and-figures-in-latex + \newpage + \setcounter{page}{1} + \renewcommand{\thepage}{S-\arabic{page}} + \setcounter{table}{0} + \renewcommand{\thetable}{S\arabic{table}} + \setcounter{figure}{0} + \renewcommand{\thefigure}{S\arabic{figure}} + } + +\usepackage{setspace} + +\begin{document} + +\begin{center} + \Large{\textbf{Haplotype Diversity and Sequence Heterogeneity of Human Telomeres}} + \\~\\ + \small{ + Kirill Grigorev\textsuperscript{1,2 \#}, + Jonathan Foox\textsuperscript{1,2,3 \#}, + Daniela Bezdan\textsuperscript{1,2,3}, + Daniel Butler\textsuperscript{1}, + Jared J. Luxton\textsuperscript{4,5}, + Jake Reed\textsuperscript{1}, + \\%rem + Miles J. McKenna\textsuperscript{4,5}, + Lynn Taylor\textsuperscript{4,5}, + Kerry A. George\textsuperscript{4,5}, + Cem Meydan\textsuperscript{1,2,3}, + Susan M. 
Bailey\textsuperscript{4,5 *}, + Christopher E. Mason\textsuperscript{1,2,3,6 *} + } +\end{center} + +\small{ \noindent + \textsuperscript{1} Department of Physiology and Biophysics, Weill Cornell Medicine, New York, New York, USA + \\ + \textsuperscript{2} The HRH Prince Alwaleed Bin Talal Bin Abdulaziz Alsaud Institute for Computational Biomedicine, \\ + \textcolor{white}{\textsuperscript{2}} Weill Cornell Medicine, New York, New York, USA + \\ + \textsuperscript{3} The Feil Family Brain and Mind Research Institute, New York, New York, USA + \\ + \textsuperscript{4} Department of Environmental and Radiological Health Sciences, Colorado State University, Fort Collins, CO + \\ + \textsuperscript{5} Cell and Molecular Biology Program, Colorado State University, Fort Collins, CO + \\ + \textsuperscript{6} The WorldQuant Initiative for Quantitative Prediction, Weill Cornell Medicine, New York, NY, USA + \\ + \textsuperscript{\#} Co-first authors + \\ + \textsuperscript{*} Corresponding authors. Send correspondence to S.M.B. (susan.bailey@colostate.edu) \\%rem + \textcolor{white}{\textsuperscript{*}} and C.E.M. (chm2042@med.cornell.edu) +} + +\normalsize +\doublespacing + +\section*{Abstract} \addcontentsline{toc}{section}{Abstract} + Telomeres are regions of repetitive nucleotide sequences capping the ends of eukaryotic chromosomes that protect against deterioration, + and whose lengths can be correlated with age and adverse health risk factors. + Yet, given their length and repetitive nature, + telomeric regions are not easily reconstructed from short-read sequencing, + making telomere sequencing, mapping, and variant resolution difficult problems. + Recently, long-read sequencing, with read lengths measuring in hundreds of Kbp, + has made it possible to routinely read into telomeric regions and inspect their sequence structure. 
+ Here, we describe a framework for + extracting telomeric reads from whole genome single-molecule sequencing experiments, + including \textit{de novo} identification of telomere repeat motifs and repeat types, + and also describe their sequence variation. + We find that + long, complex telomeric stretches can be accurately captured with long-read sequencing, + observe extensive sequence heterogeneity of human telomeres, + discover and localize non-canonical motifs (both previously reported, as well as novel), + confirm the presence of the non-canonical motifs in short read sequencing experiments, + and report the first motif composition maps of human telomeric haplotypes across three distinct ancestries + (Ashkenazim, Chinese, and Utah) and two trios on a multi-Kbp scale. + +\section*{Keywords} \addcontentsline{toc}{section}{Keywords} + Telomere, telomeric haplotypes, long-read sequencing, telomere sequence heterogeneity + +\pagebreak +%\singlespacing +%\tableofcontents +\doublespacing + +\section*{Introduction} \addcontentsline{toc}{section}{Introduction} + Telomeres are the functional ends of human chromosomes that naturally shorten with cell division, and thus with age \parencite{teloaging}. + Telomere length is also influenced by a variety of lifestyle factors and environmental exposures + (e.g., stress, exercise, air pollution, radiation) \parencite{teloeffects}. + While human telomeres are known to consist largely of a conserved six-nucleotide repeat (TTAGGG) \parencite{moyzis}, + several studies have identified variations of this motif in proximal telomeric regions + \parencite{telovars1989,telovars1999,telovars2018,telovars2019}. 
+ However, such studies were performed with oligonucleotide hybridization, PCR, immunoprecipitation, and short-read sequencing, + requiring prior assumptions about specific target motifs, custom sample preparation, and targeted sequencing, + and therefore preventing \textit{de novo} identification of motif variants and their localization. + Thus, long-range maps of telomeric sequence variation in the human genome are still incomplete, preliminary \parencite{shafin}, + or have only been completed for a single genome \parencite{jain,t2t}. + Therefore, completing maps of telomeres and providing new tools for such research \parencite{nurk} can provide new insight into + telomere biology and enable novel approaches to analyze the effects of aging, environment, and health status \parencite{telovars2018} + on telomere sequence and length. + \\~\\ + To improve our understanding of telomere sequence structure and variation, we developed \textit{edgeCase}, + a scalable framework for alignment and \textit{de novo} telomeric motif discovery + from human whole genome long-read sequencing experiments. + We have validated these methods using + Genome in a Bottle \parencite{giab} single-molecule real-time (SMRT) sequencing datasets + generated with Pacific Biosciences circular consensus sequencing (PacBio CCS) \parencite{pacbio,pacbioccs}, + and short-read Illumina \parencite{illumina} and 10x Genomics [Chromium] (www.10xgenomics.com) datasets, + as well as with healthy donor peripheral blood mononuclear cells (PBMCs). + These results provide evidence for + multiple novel, non-canonical telomeric repeats, + resolution of multiple chromosome-specific haplotypes with SMRT sequencing, + and a new method for long-range characterization of the structure of telomeric sequences. 
+ +\section*{Results} \addcontentsline{toc}{section}{Results} + +\subsection*{A telomere-annotated reference genome enables recovery of telomeric reads from human long-read whole genome sequencing datasets} +\addcontentsline{toc}{subsection}{A telomere-annotated reference genome enables recovery of telomeric reads from human long-read whole genome sequencing datasets} + We first constructed an extended reference genome, \textit{hg38ext}, + that combines chromosome sequences of the \textit{hg38} reference genome \parencite{grch38,hg38} + and human subtelomeric assemblies \parencite{riethman2014}, + resulting in a reference set annotated with boundaries of subtelomeric and telomeric tracts. + % (see \hyperref[sec:methods]{Materials and Methods}). + The layout of this reference set is available in \textbf{Supplemental File S1}, + and the set itself can be reproduced with a script available as \textbf{Supplemental File S2}. + We then aligned to it PacBio CCS reads of seven Genome in a Bottle [GIAB] \parencite{giab} human subjects (HG001 through HG007) + from three different ancestries (Ashkenazim, Chinese, and Utah), + which included two son/father/mother trios (\textbf{Supplemental Table S1}). + In total, we observed reads uniquely mapping to the ends of chromosomes and extending into telomeric regions + on 9 \textit{p} arms and 14 \textit{q} arms, + with 43\textendash{}285 such reads on the \textit{p} arms + and 34\textendash{}250 on the \textit{q} arms + (\textbf{Supplemental Table S2}). + Portions of reads contained in the telomeric regions were extracted for further analysis (\textbf{\autoref{fig:schematic_alignment}}). + \begin{figure}[h!] 
\centering % \vspace{5mm} + \includegraphics[height=.75\textheight,width=\textwidth,keepaspectratio]{../figures/Figure_1.pdf} + \caption{ + \small Mapping of candidate telomeric reads, illustrated with reads from the HG002 dataset aligning to \mbox{Chromosome 12.} + The chromosome is displayed schematically, centered around the centromere. + Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. + Coordinates are given in Kbp, relative to the positions of the telomeric tract boundaries. + Statistics for all chromosomes of all seven datasets are provided in \textbf{Supplemental Table S2}. + } + \label{fig:schematic_alignment} + \end{figure} + +\subsection*{Telomeric long reads contain variations of the canonical motif} +\addcontentsline{toc}{subsection}{Telomeric long reads contain variations of the canonical motif} + We then performed \textit{de novo} repeat discovery in the telomeric sequences for motifs of lengths 4 through 16, + and identified motifs in repeat contexts that are statistically enriched in the seven datasets. + The majority of motifs were either the canonical TTAGGG / CCCTAA, + its variations (e.g., T\underline{G}AGGG), + or a duplet of variants, such as TTAGGGTTA\underline{G}GGG + (\textbf{\autoref{tab:repeatfinder}}). + CG-rich motifs were also observed on the \textit{q} arms. + The top enriched motif (TTAGGG / CCCTAA) explained + 27.0\%\textendash{}76.9\% of the telomeric repeat content on the \textit{p} arms + and 49.1\%\textendash{}80.1\% on the \textit{q} arms, + while motifs T\underline{G}AGGG and TTA\underline{G}GGG explained up to 8.0\% and 6.6\% of the repeat content overall, respectively. 
+ \\~\\ + We next visualized the locations of the top three enriched motifs and their reverse complements + on the chromosomal ends of the HG002 dataset (\textbf{\autoref{fig:hg002_densityplot}}), + as it provided the deepest coverage among the assessed datasets (\textbf{Supplemental Table S2}); + plots for the other six datasets are available as \textbf{Supplemental Figs. S2} and \textbf{S3}. + Only the chromosomal arms cumulatively covered by at least 25 reads across datasets were plotted. + These data showed that the overwhelming majority of the telomeric regions were represented by the canonical repeats, + but also novel, chromosome-specific repeat motif patterns could be observed, + and they were enriched for the proximal end of the telomere; + these data also illustrated the positions of the repeat-rich portions of the genomes in relation to the + known subtelomere-telomere boundaries, including deletions/insertions (4p, 8q) + and an apparent extension of the 17p subtelomere (see \hyperref[sec:discussion]{Discussion}). + \\~\\ % \noindent + To discern if the sequence mapping, read length, or overall coverage had any effect on the discovery or enrichment of these motifs, + the motif entropies were examined as a function of their location within reads and coverage across the telomere tracks. + When the locations of different motifs were examined within any 10 bp window across the length of the long reads, + the entropy data showed consistency among reads and across samples (\textbf{\autoref{fig:entropy}}). + Indeed, the coverage-weighted median of normalized Shannon entropy was 0.07 for one dataset and 0.00 for the other six, + while most non-zero values were contained only in the top quartile, + i.e., between the 75th and the 100th percentile, + indicating that locations of the variations are colinear among reads. + \input{../tables/Table_1.tex} % tab:repeatfinder + \begin{figure}[h!] 
\centering + \includegraphics[height=.88\textheight,width=\textwidth,keepaspectratio]{../figures/Figure_2.pdf} + \caption{ + \small Densities of the top three enriched motifs at ends of chromosomal \textbf{(A)} \textit{p} arms + and \textbf{(B)} \textit{q} arms of the HG002 dataset. + \textit{Background} represents the remaining sequence content (non-repeating sequence and not significantly enriched motifs). + Reads are shown aligned to the contigs in the \textit{hg38ext} reference set, and genomic coordinates are given in Kbp. + Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. + } + \label{fig:hg002_densityplot} + \end{figure} + \begin{figure}[h!] \centering + \includegraphics[height=.75\textheight,width=\textwidth,keepaspectratio]{../figures/Figure_3.pdf} + \caption{ + \small Distribution of motif entropies in 10 bp windows of candidate PacBio CCS reads aligning to the same chromosomal arms + in GIAB datasets HG001 through HG007, + with respect to per-window coverage, + and the coverage-weighted percentiles of the entropy values. + } + \label{fig:entropy} + \end{figure} + +\subsection*{Short-read sequencing validates motif variations observed in long reads} +\addcontentsline{toc}{subsection}{Short-read sequencing validates motif variations observed in long reads} + We next validated these findings using short-read sequencing in two ways. + First, we extracted telomeric candidate reads with \textit{Telomerecat} \parencite{telomerecat} + from respective GIAB Illumina datasets, + and found that they supported a definitive majority of the long-read telomeric candidates, with + a median 89\% of the \textit{p} arm sequence + and a median 95\% of the \textit{q} arm sequence supported + (\textbf{Supplemental Fig. S1}).
+ Second, we confirmed 13 of the enriched motifs in independently generated human short-read and linked-read genomic datasets + from donated PBMCs, with the same three motifs being the most enriched (\textbf{Supplemental Table S3}). + +\subsection*{Long-read sequencing uncovers a variety of human telomeric haplotypes} +\addcontentsline{toc}{subsection}{Long-read sequencing uncovers a variety of human telomeric haplotypes} + While reads agreed on colinearity of motifs, evidenced by low entropy, + rarer non-zero entropy values could be attributable both to sequencing errors + and to structural variations within the same subject's dataset. + To investigate the latter possibility, + we clustered reads on each arm of each subject by relative pairwise Levenshtein distances \parencite{levenshtein} + and found that hierarchical clustering described read similarity well, + resulting in high cophenetic correlation between the dendrograms and the pairwise distance matrices + (\textbf{\autoref{tab:cophenetic}}), + and in visible structure (\textbf{\autoref{fig:haplotypes_p}}, \textbf{\autoref{fig:haplotypes_q}}). + % \eject \pdfpageheight=40in \pdfpagewidth=22in %remall + \begin{figure}[h!] \centering % \vspace{5mm} %remall + \includegraphics[height=.88\textheight,width=\textwidth,keepaspectratio]{../figures/Figure_4.pdf} %remall + \caption{ %remall + \small Clustering of reads by relative pairwise Levenshtein distances (unitless measure) %remall + on each chromosomal \mbox{\textit{p} arm} of datasets HG001 through HG007, %remall + and densities of the top enriched motifs along each read. %remall + Each horizontal line represents an individual read; %remall + genomic coordinates are given in Kbp, relative to the positions of the telomeric tract boundaries. %remall + Only the chromosomal arms cumulatively covered by at least 25 reads are displayed. 
%remall + } %remall + \label{fig:haplotypes_p} %remall + \end{figure} %remall + % \eject \pdfpageheight=11in \pdfpagewidth=8.5in + % \eject \pdfpageheight=40in \pdfpagewidth=22in %remall + \begin{figure}[h!] \centering % \vspace{5mm} %remall + \includegraphics[height=.88\textheight,width=\textwidth,keepaspectratio]{../figures/Figure_5.pdf} %remall + \caption{ %remall + \small Clustering of reads by relative pairwise Levenshtein distances (unitless measure) %remall + on each chromosomal \mbox{\textit{q} arm} of datasets HG001 through HG007, %remall + and densities of the top enriched motifs along each read. %remall + Each horizontal line represents an individual read; %remall + genomic coordinates are given in Kbp, relative to the positions of the telomeric tract boundaries. %remall + Only the chromosomal arms cumulatively covered by at least 25 reads are displayed. %remall + } %remall + \label{fig:haplotypes_q} %remall + \end{figure} %remall + % \eject \pdfpageheight=11in \pdfpagewidth=8.5in + \input{../tables/Table_2.tex} % tab:cophenetic + ~\\~\\ + In this complex clustering, subject- and population-specific variation was evident and quantifiable via relative Levenshtein distances + (\textbf{\autoref{tab:haptests}}; see \hyperref[sec:methods]{Materials and Methods}): + overall, + telomeric reads within a subject were more similar than within a population + (adjusted Wilcoxon signed-rank test \textit{p} = 4.2e-56), + and telomeric reads within a population were more similar than between populations + (\textit{p} = 2.2e-40). + ~\\ + \input{../tables/Table_3.tex} + ~\\ + Importantly, however, this was true for most, but not all reads; + 13.8\% of all assessed reads (165 out of 1,192) contributed to interpopulation similarity; + these reads were twice as close to reads from a different population as they were to any reads of their own subjects.
+ This trend is observable on \textbf{\autoref{fig:haplotypes_p}} and \textbf{\autoref{fig:haplotypes_q}}, + with subjects' and populations' reads interspersed across multiple clusters. + Therefore, the captured reads reflected spectra of haplotypes, + generally describing subject- and population-specific similarities, + but including a sizable component that described interpopulation similarity. + Paternal inheritance of variation was also observed: + each father's telomeric reads were more similar to their son's than to the mother's reads + in both the Ashkenazim and the Chinese trios. + +\section*{Discussion} \addcontentsline{toc}{section}{Discussion} \label{sec:discussion} + Repeat-rich, low-complexity regions of the human genome such as telomeres + have been historically recalcitrant to full mapping and annotation \parencite{miga2015}, + mainly due to the alignment challenge they pose and to the read lengths required to span such areas \parencite{ngslowcomplexity}. + The advent of long-read, single-molecule methods (third generation sequencing) + has provided new opportunities to map the sequence composition of a previously ``dark'' area of the human genome, + enabling research into the sequence composition and length dynamics \parencite{luxton2020} of telomeres. + Our results reaffirm + that the canonical repeat (TTAGGG) is certainly the most dominant motif found within telomeres, + but also reveal a surprising diversity of repeat variations, + which are confirmed by both short and long-read sequencing technologies. + This diversity of repeat sequence includes previously reported variants, + as well as novel motifs that are characterized not only by nucleotide substitutions, + but also insertions, deletions, and even motif pairing.
+ Interestingly, repeat patterns were chromosome-specific, with different non-canonical repeats being pronounced on different chromosomes, + such as TGAGGG on 12q and TTAGGGG on 15q, which may be correlated with particular biological pathways \parencite{telovars2019}. + Apart from these variations, + CG-rich motifs were identified in telomeric regions of \textit{q} arms, + consistent with previously reported findings \parencite{cpg}. + Moreover, while short read sequencing is capable of identifying such variants, + it alone cannot reveal the relative locations of these motifs within telomeres, + as repetitive short reads can + neither be aligned outside of the reference genome + nor provide enough overlap variability to be assembled \textit{de novo}. + Long SMRT reads, on the other hand, + can be anchored to known subtelomeric sequences of the human genome and extend into the previously unmapped telomeric area. + Furthermore, in contrast to previously published research that utilized targeted sequencing + \parencite{telovars1989,telovars1999,telovars2018,telovars2019}, + the method described here allows identification of multiple enriched motifs and their localization + \textit{de novo}, without any bias introduced by prior knowledge about the sequence of target motifs. + These results also highlight the need for better subtelomeric and telomeric annotations in the human genome: + the canonical motif was present on the \textit{q} arm of Chromosome 8 + only 2\textendash{}3 Kbp beyond the annotated boundary in all datasets; + the candidate reads on the \textit{p} arm of Chromosome 17 represented TTAGGG-rich and non-TTAGGG-rich haplotypes, + indicating that in multiple subjects and ancestries there exists an extension of the 17p subtelomere. + Strikingly, for example, the Ashkenazim son (HG002) provided only non-TTAGGG-rich 17p reads, while both + the father (HG003) and the mother (HG004) had a mixture of apparently telomeric and non-telomeric 17p reads.
+ This supports previous findings \parencite{riethman2020} + that the existing assemblies do not provide completely accurate subtelomeric annotation, + and suggests that methods described herein could help to resolve these areas of reference genomes. + \\~\\ + We observed PacBio CCS reads reaching up to 16 Kbp beyond the known regions of the genome, + and resolving the underlying sequence with fidelity, + as measured both by the entropy of motif assignment and by pairwise Levenshtein distances + between the reads belonging to the same chromosomal arms. + While short reads also provided support for non-canonical motifs, + the overlap between the short and the long reads was substantial, but not complete, + which can be explained by the necessary bias towards the canonical motif during the selection of short reads. + Therefore, telomeric regions with higher content of non-canonical repeats are less likely to be identified through the use of short reads, + and so, long reads appear to be more suitable for this purpose as well. + \\~\\ + The identified variations in long range contexts elucidate + subject-specific, trio- and population-specific similarities of telomeric sequences, + as well as a level of interpopulation similarity, + and thus provide a new means of haplotype mapping and reveal the existence and motif composition of haplotype spectra + on a multi-Kbp scale. + Interpopulation similarity, as well as paternal inheritance of variation, + provided evidence that the observed haplotypes could not be attributed to per-dataset batch effects. + The lengths of PacBio CCS reads allowed resolution of uniquely mapping reads only on 23 chromosomal arms, + and coverage of different arms was uneven. + As such, numbers of captured telomeric reads and levels of observed similarity varied from subject to subject; + in particular, maternal inheritance of haplotypes could not be determined, in contrast to statistically significant paternal inheritance. 
+ This calls for more sequencing experiments aimed to reconstruct the full picture of this variation. + Clustering on a per-subject basis concealed interpopulation similarity, + but underscored intra-subject variation (\textbf{Supplemental Figs. S4} and \textbf{S5}), + suggesting coexistence of two or more telomeric haplotypes per chromosomal arm within each subject. + Given that the reference DNA for the subjects HG001 through HG007 + was extracted from growths of B lymphoblastoid cell lines, + this suggests that as B cells undergo maturation, + distinct clones may gain distinct variations in their telomeric sequence. + This opens up avenues of investigation into the haplotypic variation among not only immune cells, + but also different cell types overall, + and provides a new opportunity to map, quantify, and characterize a previously unrecognized form of human genetic variation. + +\section*{Materials and Methods} \addcontentsline{toc}{section}{Materials and Methods} \label{sec:methods} + +\subsection*{The extended reference genome} +\addcontentsline{toc}{subsection}{The extended reference genome} + We constructed the extended reference genome by performing an all-to-all alignment + of all contigs in the \textit{hg38} reference genome \parencite{grch38,hg38} + and the subtelomeric assemblies \parencite{riethman2014} + with \textit{minimap2} \parencite{minimap} using three settings + for assembly-to-reference mapping (\textit{asm5}, \textit{asm10}, \textit{asm20}). + Forty subtelomeric contigs mapped to ends of \textit{hg38} chromosomes with a mapping quality of 60, + one (XpYptel) mapped with the quality of 0 and was discarded; + one (14qtel) mapped to the ALT version of Chromosome 14 (chr14\_KI270846v1\_alt) with the quality of 52, + which, in turn, mapped to the main contig of Chromosome 14 (chr14) with the quality of 60. 
+ These data and the exact match and mismatch coordinates were used to create a combined reference (\textit{hg38ext}) + in which subtelomeric contigs informed the locations of the boundaries of the telomeric tracts (\textit{tract\_anchor}). + Such contigs that mapped fully within \textit{hg38} chromosomes resulted in \textit{tract\_anchor} annotations + directly on those \textit{hg38} chromosomes; + partially mapping contigs were considered as forking from the \textit{hg38} sequence and were similarly annotated by themselves. + For the purposes of capturing candidate reads that uniquely align to subtelomere-telomere boundaries, + subtelomeric contigs which were not previously assembled as extending completely up to the start of the telomere, + and/or were not precisely localized in relation to the reference genome, + such as 1p, 6p, 7p, 8p, 11p, 20p, 3q, 4q, 20q, and Xq \parencite{riethman2014,riethman2020}, + were masked prior to downstream analyses. + +\subsection*{Detection of telomeric sequences in long-read datasets} +\addcontentsline{toc}{subsection}{Detection of telomeric sequences in long-read datasets} + Seven subjects were selected for the analysis. + The first individual (NA12878/HG001) came from the pilot genome of the HapMap project \parencite{HG001}, + while the other six, + including + the Ashkenazim Jewish Trio (son: NA24385/HG002, father: NA24149/HG003, mother: NA24143/HG004) + and the Chinese Trio \\%rem + (\mbox{son: NA24631/HG005}, father: NA24694/HG006, mother: NA24695/HG007), + are members of the Personal Genome Project, + whose genomes are consented for commercial redistribution and reidentification \parencite{HG00X}. + These subjects are referred to throughout as HG001 through HG007, respectively. 
+ \\~\\ + Multiple Genome in a Bottle \parencite{giab} PacBio CCS \parencite{pacbio,pacbioccs} datasets were available and combined for each subject, + with mean coverages of individual datasets ranging from $\sim$21x to $\sim$69x (\textbf{Supplemental Table S1}). + We mapped these reads to \textit{hg38ext} with \textit{minimap2}, allowing secondary mappings, + and selected reads that mapped to either end of any chromosome, + having an at least 500 bp portion of their sequence mapped to the reference contig and a portion extending beyond the reference + (soft- or hard-clipped in the alignment file). + As each such read can map to multiple subtelomeres due to paralogy, + we considered such multiple mappings and only retained the reads that mapped to a unique subtelomere; + furthermore, out of these candidates, we only selected the ones overlapping the subtelomere and the telomere by at least 3Kbp. + Sequences past the \textit{tract\_anchor} marker were extracted from the reads that had this marker within their mapped portion + (from the 5' end to the marker on \textit{p} arms and from the marker to the 3' end on \textit{q} arms, + accounting for forward and reverse mappings; \textbf{\autoref{fig:schematic_alignment}}). + +\subsection*{Evaluation of telomeric content in short- and linked-read datasets} +\addcontentsline{toc}{subsection}{Evaluation of telomeric content in short- and linked-read datasets} + To evaluate the concordance of telomeric reads captured by long- and short-read technologies, + we extracted candidate telomeric reads from GIAB Illumina datasets for each subject (\textbf{Supplemental Table S1}) + with \textit{Telomerecat} \parencite{telomerecat}, + and mapped the short reads back onto the candidate long reads from the same subject's dataset with \textit{minimap2}, + again allowing all secondary mappings. + Then, we calculated the fractions of each long read that were supported by the short reads that aligned to them.
+ \\~\\ + To evaluate sequence motifs in independent samples collected from human subjects (as opposed to reference cell lines), + we analyzed four whole-genome Illumina datasets (mean coverage $\sim$104x) + and three linked-read 10x datasets (mean coverage $\sim$28x) for one individual at different timepoints, + and one additional linked-read 10x dataset (coverage $\sim$47x) for another individual. + These data were originally obtained from astronaut subjects for an unrelated space biology experiment, + and the blood samples were collected from the subjects as described in the study \parencite{twins_study}. + For each sample, 1.2ng of sorted immune cell input was aliquoted for + TruSeq PCR-free WGS (short-read) and standard Chromium 10x whole genome (linked-read) preparation respectively, + and sequenced across one S4 flow cell on an Illumina NovaSeq 6000. + From these datasets, candidate telomeric short reads were selected using Telomerecat \parencite{telomerecat}. + +\subsection*{Identification of repeat content} +\addcontentsline{toc}{subsection}{Identification of repeat content} + Overrepresentation of motifs of lengths $k \in [4 .. 16]$ was tested within the candidate telomeric regions of PacBio CCS reads, + as well as in the candidate reads from independently generated Illumina and 10x Chromium datasets. + To target motifs in repeat contexts, + doubled sequences (for example, \textit{k}-mer ACGTACGT for motif ACGT) were counted with \textit{jellyfish} \parencite{jellyfish}, + and counts of \textit{k}-mers synonymous with respect to circular shifts (for example, ACGTACGT and CGTACGTA) were summed together. + For each such \textit{k}-mer, + Fisher's exact test was performed to determine whether its count is significant + on the background of counts of other \textit{k}-mers of the same length.
+ Briefly, we considered \textit{k}-mers with counts higher than 1.5 interquartile range above the third quartile of the distribution + as potentially classifiable, + and a $ 2 \times{} 2 $ contingency matrix $ C $ for the test was constructed as follows: + row 0 contained counts of potentially classifiable \textit{k}-mers, + row 1 contained counts of remaining (non-classifiable) \textit{k}-mers, + columns 0 and 1 contained counts of single and remaining (background) \textit{k}-mers, respectively, + i.e.: + $ C_{0,0} = $ {\rmfamily count of target \textit{k}-mer}, + $ C_{0,1} = $ {\rmfamily sum of counts of other potentially classifiable \textit{k}-mers}, + $ C_{1,0} = $ {\rmfamily median count of \textit{k}-mer}, + $ C_{1,1} = $ {\rmfamily sum of counts of other non-classifiable \textit{k}-mers}. + The resultant \textit{p}-values for each motif among the samples were combined using the Mudholkar-George method \parencite{george} + within each technology (PacBio CCS, Illumina, 10x Genomics), + and the Bonferroni multiple testing correction was applied. + Motifs in the long-read datasets for which \textit{k}-mers yielded \textit{p}-values below the cutoff of 0.05 were reported. + As even doubled sequences (such as ACGTACGT for motif ACGT) can partially overlap at the boundaries of repeat contexts, + we quantified their presence in the telomeric reads in two distinct ways. + Consider a sequence such as TTAGGG(\underline{TTAGTTAG})GGTTA: + the inner (TTAG)x2 repeat can be explained by the repeats of the canonical motif extending into it from either side; + the middle part of a similar sequence with a bigger number of the repeats of the 4-mer, TTAGGG\underline{TTAG(TTAGTTAG)TTAG}GGTTA, + can only be explained by the repeats of said 4-mer. + On the one hand, the maximum fraction of the sequence that can be explained by any one motif is a useful metric, + and it was calculated and reported. 
+ On the other hand, the fraction of the \textit{k}-mers attributable to a specific motif \textendash{} and not to any others \textendash{} + elucidates the extent of deviation from the background repeat context, + and identifies motifs that most affect the sequence structure; + it was calculated as well and reported as each motif's score. + Additionally, motifs that were significantly enriched in the datasets produced by all three technologies (PacBio, Illumina, 10x), + with respect to reverse-complemented equivalence, were reported. + +\subsection*{Evaluation of sequence concordance in telomeric long reads} +\addcontentsline{toc}{subsection}{Evaluation of sequence concordance in telomeric long reads} + As telomeric reads contain long low-complexity regions and present an alignment challenge, + we evaluated concordance of their sequences without realignment of their portions that extended past the reference sequence. + To that end, for all reads mapping to the same chromosomal arm, + we calculated densities of each identified motif in a rolling window starting from the innermost mapped position of each entire read. + To evaluate whether the reads on the same arm agree on the positions of different motifs, + for each read, we calculated motif densities in 10 bp windows with 10 bp smoothing to buffer insertions and deletions. + For each window in each read, + the motif with the highest density was selected to represent that window. + Then, normalized Shannon entropy among all reads was calculated in each window as $ S = \frac{ - \sum_{i} \; ( p_{i} \ln p_{i} )}{\ln N} $, + where $ p_{i} $ is the frequency of each motif in the window and $ N $ is the number of motifs \parencite{hepc_entropy}. + The value of normalized entropy was a metric bounded by $ [ 0, 1 ] $, + with $ 0 $ describing perfect agreement and $ 1 $ describing maximum randomness.
+ As coverage of windows drops off towards the distal end of the alignment, + lower covered windows have less chance to produce entropy; + we calculated percentiles of entropy as weighted by coverage minus one + (thus prioritizing higher covered windows, and removing windows with the coverage of one and no entropy from the calculation). + For motif visualization, + we performed 1000 rounds of bootstrap of the calculated density values, + this time in 100 bp rolling windows to accommodate the scale of multi-Kbp plots, + and selected the lower and the upper bounds of the 95\% confidence interval of bootstrap. + +\subsection*{Identification of telomeric haplotypic variation} +\addcontentsline{toc}{subsection}{Identification of telomeric haplotypic variation} + Within groups of reads mapping to each chromosomal arm, all relative pairwise Levenshtein distances were calculated. + In short, Levenshtein distance is a string metric defined as the edit distance between two strings (sequences), + equal to the minimum number of single-character insertions, deletions, and substitutions + required to make these sequences identical \parencite{levenshtein}. + For each pair of reads, this metric was calculated and represented absolute edit distance; + the relative distance was then computed as the absolute distance divided by the length of the overlap, + to normalize for the variation of such lengths. + Pairwise relative distances were then clustered using Ward's method via the Euclidean metric, + resulting in a hierarchical structure describing the extents of similarity among reads. + To quantify how accurately hierarchical clustering described this similarity, + cophenetic distances \parencite{cophenetic} between the hierarchies (dendrograms) and the distance matrices were calculated, + and their Pearson correlation coefficients and Bonferroni-corrected \textit{p}-values were reported.
+ \\~\\ + We then traversed the distance matrices, + and for each read, tracked the closest reads by category: + closest reads from the same subject, + from the same trio (population), + and from the outgroup (other populations). + For the Ashkenazim and the Chinese trios, we also tracked the closest reads + between the parents + and between each parent and the child. + Thus, for each read, we determined whether it locally clustered + within its own category + (for example, with other reads of the same subject, or with other reads from the same population) + or in a different one + (for example, with other reads of a different population), + and the value of the distances that drove either clustering. + Performing the Wilcoxon signed-rank test on these values between the two categories + provided us with \textit{p}-values that, after a Bonferroni correction, + described whether reads tended to cluster in their own category or in a different one. + Additionally, we identified the minority of reads that did not follow the overall trend, + and quantified the extent to which they did so + (such as the reads that contributed to interpopulation similarity). + +\section*{Data access} \addcontentsline{toc}{section}{Data access} +Healthy donor DNA came from a previous study [The NASA Twins Study] \parencite{twins_study}. +The NASA Life Sciences Data Archive (LSDA) is the repository for all human and animal research data, including the whole genome Illumina and 10x Chromium sequencing datasets from subjects aboard the ISS that were used in this study. +These datasets are protected by the terms of the Weill Cornell Medicine Institutional Review Board (IRB) and can be made available to be shared upon request.
+LSDA has a public facing portal where data requests can be initiated (\href{https://lsda.jsc.nasa.gov/Request/dataRequestFAQ}{lsda.jsc.nasa.gov/Request/dataRequestFAQ}); +the LSDA team provides the appropriate processes, tools, and secure infrastructure for archival of experimental data and dissemination while complying with applicable rules, regulations, policies, and procedures governing the management and archival of sensitive data and information. +The LSDA team enables data and information dissemination to the public or to authorized personnel either by providing public access to information or via an approved request process for information and data from the LSDA in accordance with NASA Human Research Program and JSC Institutional Review Board direction. +\\~\\ +%\section*{Availability and implementation} \addcontentsline{toc}{section}{Availability and implementation} +The software for identification of telomeric reads, \textit{de novo} discovery of repeat motifs, haplotype inference and motif density visualization was implemented in Python and is freely available at \\%rem +\href{https://github.com/lankycyril/edgecase}{github.com/lankycyril/edgecase}, +as well as \textbf{Supplemental File S3}. + +\section*{Acknowledgements} \addcontentsline{toc}{section}{Acknowledgements} +We would like to thank +the Epigenomics Core Facility at Weill Cornell Medicine, +the Scientific Computing Unit (SCU), +XSEDE Supercomputing Resources, +as well as +the STARR grants I9-A9-071, I13-0052, +The Vallee Foundation, +The WorldQuant Foundation, +The Pershing Square Sohn Cancer Research Alliance, +NASA (NNX14AH51G, NNX14AB02G, NNX17AB26G), +The National Institutes of Health (R01MH117406, \\%rem +R01NS076465, R01CA249054, R01AI151059, P01HD067244, P01CA214274), +TRISH (NNX16AO69A:0107, \\%rem +NNX16AO69A:0061), +the LLS (9238-16, Mak, MCL-982, Chen-Kiang), +and +the NSF (1840275). + +\section*{Author contributions} \addcontentsline{toc}{section}{Author contributions} +S.M.B. 
and C.E.M. conceived the study. +K.G., J.F., and C.E.M. developed the framework and analyzed the data. +D.Bu., J.J.L., M.J.M., L.T., and K.A.G. participated in collection and processing of the ISS samples. +D.Be., D.Bu., J.J.L, J.R., and C.M. analyzed the data. +All authors edited the manuscript. + +\section*{Competing interests} \addcontentsline{toc}{section}{Competing interests} +The authors declare no relevant conflict of interest, although C.E.M. is a Co-Founder of Onegevity. + +\section*{References} \addcontentsline{toc}{section}{References} +\begingroup \raggedright \singlespacing \printbibliography[heading=none] \endgroup + +\end{document} diff --git a/assets/paper/manuscript/makefile b/assets/paper/manuscript/makefile new file mode 100644 index 0000000..f7636c1 --- /dev/null +++ b/assets/paper/manuscript/makefile @@ -0,0 +1,29 @@ +all: haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.pdf +submission: ../figures/Figure_1.png ../figures/Figure_2.png ../figures/Figure_3.png ../figures/Figure_4.png ../figures/Figure_5.png +submission: haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.odt + +%.pdf: %.tex + pdflatex $< + bibtex $(basename $(@F)) + pdflatex $< + pdflatex $< + rm -f $(basename $(@F)).aux $(basename $(@F)).bbl $(basename $(@F)).blg + rm -f $(basename $(@F))-blx.bib $(basename $(@F)).log + rm -f $(basename $(@F)).toc $(basename $(@F)).run.xml + rm -f $(basename $(@F)).bcf + +%.png: %.pdf + convert -density 300 $< $@ + +%.odt: %.tex + cd .. 
&& ./tools/tex2office -i manuscript/references.bib figures tables --svg --rem-dot-dot manuscript/$< + +clean: + rm -f haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.aux + rm -f haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.bbl + rm -f haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-blx.bib + rm -f haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.blg + rm -f haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.log + rm -f haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.odt + rm -f haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.pdf + rm -f haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.run.xml diff --git a/publications/methods-paper/references.bib b/assets/paper/manuscript/references.bib similarity index 64% rename from publications/methods-paper/references.bib rename to assets/paper/manuscript/references.bib index 1efd932..06f0bd0 100644 --- a/publications/methods-paper/references.bib +++ b/assets/paper/manuscript/references.bib @@ -1,10 +1,12 @@ @article {teloaging, doi = {10.1152/physrev.00026.2007}, - year = {2008}, - month = apr, + url = {https://doi.org/10.1152%2Fphysrev.00026.2007}, + year = 2008, + month = {apr}, publisher = {American Physiological Society}, volume = {88}, number = {2}, + pages = {557--579}, author = {Geraldine Aubert and Peter M. 
Lansdorp}, title = {Telomeres and Aging}, journal = {Physiological Reviews} @@ -12,34 +14,40 @@ @article {teloaging @article {teloeffects, doi = {10.1097/mco.0b013e32834121b1}, - year = {2011}, - month = jan, + url = {https://doi.org/10.1097%2Fmco.0b013e32834121b1}, + year = 2011, + month = {jan}, publisher = {Ovid Technologies (Wolters Kluwer Health)}, volume = {14}, number = {1}, + pages = {28--34}, author = {Masood A Shammas}, - title = {Telomeres, lifestyle, cancer, and aging}, + title = {Telomeres, lifestyle, cancer, and aging}, journal = {Current Opinion in Clinical Nutrition and Metabolic Care} } @article {telovars1989, doi = {10.1093/nar/17.12.4611}, - year = {1989}, + url = {https://doi.org/10.1093%2Fnar%2F17.12.4611}, + year = 1989, publisher = {Oxford University Press ({OUP})}, volume = {17}, number = {12}, + pages = {4611--4627}, author = {Robin C Allshire and Maureen Dempster and Nicholas D. Hastie}, title = {Human telomeres contain at least three types of G{\textendash}rich repeat distributed non-randomly}, - journal = {Nucleic Acids Research} + journal = {Nucl Acids Res} } @article {telovars1999, doi = {10.1093/hmg/8.9.1637}, - year = {1999}, - month = sep, + url = {https://doi.org/10.1093%2Fhmg%2F8.9.1637}, + year = 1999, + month = {sep}, publisher = {Oxford University Press ({OUP})}, volume = {8}, number = {9}, + pages = {1637--1646}, author = {J. Coleman and D. M. Baird and N. J. 
Royle}, title = {The Plasticity of Human Telomeres Demonstrated by a Hypervariable Telomere Repeat Array That Is Located on Some Copies of 16p and 16q}, journal = {Human Molecular Genetics} @@ -47,11 +55,13 @@ @article {telovars1999 @article {telovars2018, doi = {10.1093/nar/gky297}, - year = {2018}, - month = apr, + url = {https://doi.org/10.1093%2Fnar%2Fgky297}, + year = 2018, + month = {apr}, publisher = {Oxford University Press ({OUP})}, volume = {46}, number = {10}, + pages = {4903--4918}, author = {Michael Lee and Erdahl T Teber and Oliver Holmes and Katia Nones and Ann-Marie Patch and Rebecca A Dagg and Loretta M~S Lau and Joyce H Lee and Christine E Napier and Jonathan W Arthur and Sean M Grimmond and Nicholas K Hayward and Peter A Johansson and Graham J Mann and Richard A Scolyer and James S Wilmott and Roger R Reddel and John V Pearson and Nicola Waddell and Hilda A Pickett}, title = {Telomere sequence content can be used to determine {ALT} activity in tumours}, journal = {Nucleic Acids Research} @@ -59,11 +69,13 @@ @article {telovars2018 @article {telovars2019, doi = {10.1093/nar/gky1289}, - year = {2019}, - month = jan, + url = {https://doi.org/10.1093%2Fnar%2Fgky1289}, + year = 2019, + month = {jan}, publisher = {Oxford University Press ({OUP})}, volume = {47}, number = {4}, + pages = {1896--1907}, author = {Alina Bluhm and Nikenza Viceconte and Fudong Li and Grishma Rane and Sandra Ritz and Suman Wang and Michal Levin and Yunyu Shi and Dennis Kappei and Falk Butter}, title = {{ZBTB}10 binds the telomeric variant repeat {TTGGGG} and interacts with {TRF}2}, journal = {Nucleic Acids Research} @@ -71,23 +83,27 @@ @article {telovars2019 @article {giab, doi = {10.1038/s41587-019-0074-6}, - year = {2019}, - month = apr, + url = {https://doi.org/10.1038%2Fs41587-019-0074-6}, + year = 2019, + month = {apr}, publisher = {Springer Science and Business Media {LLC}}, volume = {37}, number = {5}, + pages = {561--566}, author = {Justin M. 
Zook and Jennifer McDaniel and Nathan D. Olson and Justin Wagner and Hemang Parikh and Haynes Heaton and Sean A. Irvine and Len Trigg and Rebecca Truty and Cory Y. McLean and Francisco M. De La Vega and Chunlin Xiao and Stephen Sherry and Marc Salit}, title = {An open resource for accurately benchmarking small variant and reference calls}, - journal = {Nature Biotechnology} + journal = {Nat Biotechnol} } @article {pacbio, doi = {10.1126/science.1162986}, - year = {2009}, - month = jan, + url = {https://doi.org/10.1126%2Fscience.1162986}, + year = 2009, + month = {jan}, publisher = {American Association for the Advancement of Science ({AAAS})}, volume = {323}, number = {5910}, + pages = {133--138}, author = {J. Eid and A. Fehr and J. Gray and K. Luong and J. Lyle and G. Otto and P. Peluso and D. Rank and P. Baybayan and B. Bettman and A. Bibillo and K. Bjornson and B. Chaudhuri and F. Christians and R. Cicero and S. Clark and R. Dalal and A. deWinter and J. Dixon and M. Foquet and A. Gaertner and P. Hardenbol and C. Heiner and K. Hester and D. Holden and G. Kearns and X. Kong and R. Kuse and Y. Lacroix and S. Lin and P. Lundquist and C. Ma and P. Marks and M. Maxham and D. Murphy and I. Park and T. Pham and M. Phillips and J. Roy and R. Sebra and G. Shen and J. Sorenson and A. Tomaney and K. Travers and M. Trulson and J. Vieceli and J. Wegener and D. Wu and A. Yang and D. Zaccarin and P. Zhao and F. Zhong and J. Korlach and S. 
Turner}, title = {Real-Time {DNA} Sequencing from Single Polymerase Molecules}, journal = {Science} @@ -95,11 +111,13 @@ @article {pacbio @article {pacbioccs, doi = {10.1093/nar/gky066}, - year = {2018}, - month = feb, + url = {https://doi.org/10.1093%2Fnar%2Fgky066}, + year = 2018, + month = {feb}, publisher = {Oxford University Press ({OUP})}, volume = {46}, number = {5}, + pages = {2159--2168}, author = {Simon Ardui and Adam Ameur and Joris R Vermeesch and Matthew S Hestand}, title = {Single molecule real-time ({SMRT}) sequencing comes of age: applications and utilities for medical diagnostics}, journal = {Nucleic Acids Research} @@ -107,11 +125,13 @@ @article {pacbioccs @article {illumina, doi = {10.1038/nature07517}, - year = {2008}, - month = nov, + url = {https://doi.org/10.1038%2Fnature07517}, + year = 2008, + month = {nov}, publisher = {Springer Science and Business Media {LLC}}, volume = {456}, number = {7218}, + pages = {53--59}, author = {David R. Bentley and Shankar Balasubramanian and Harold P. Swerdlow and Geoffrey P. Smith and John Milton and Clive G. Brown and Kevin P. Hall and Dirk J. Evers and Colin L. Barnes and Helen R. Bignell and Jonathan M. Boutell and Jason Bryant and Richard J. Carter and R. Keira Cheetham and Anthony J. Cox and Darren J. Ellis and Michael R. Flatbush and Niall A. Gormley and Sean J. Humphray and Leslie J. Irving and Mirian S. Karbelashvili and Scott M. Kirk and Heng Li and Xiaohai Liu and Klaus S. Maisinger and Lisa J. Murray and Bojan Obradovic and Tobias Ost and Michael L. Parkinson and Mark R. Pratt and Isabelle M. J. Rasolonjatovo and Mark T. Reed and Roberto Rigatti and Chiara Rodighiero and Mark T. Ross and Andrea Sabot and Subramanian V. Sankar and Aylwyn Scally and Gary P. Schroth and Mark E. Smith and Vincent P. Smith and Anastassia Spiridou and Peta E. Torrance and Svilen S. Tzonev and Eric H. Vermaas and Klaudia Walter and Xiaolin Wu and Lu Zhang and Mohammed D. Alam and Carole Anastasi and Ify C. 
Aniebo and David M. D. Bailey and Iain R. Bancarz and Saibal Banerjee and Selena G. Barbour and Primo A. Baybayan and Vincent A. Benoit and Kevin F. Benson and Claire Bevis and Phillip J. Black and Asha Boodhun and Joe S. Brennan and John A. Bridgham and Rob C. Brown and Andrew A. Brown and Dale H. Buermann and Abass A. Bundu and James C. Burrows and Nigel P. Carter and Nestor Castillo and Maria Chiara E. Catenazzi and Simon Chang and R. Neil Cooley and Natasha R. Crake and Olubunmi O. Dada and Konstantinos D. Diakoumakos and Belen Dominguez-Fernandez and David J. Earnshaw and Ugonna C. Egbujor and David W. Elmore and Sergey S. Etchin and Mark R. Ewan and Milan Fedurco and Louise J. Fraser and Karin V. Fuentes Fajardo and W. Scott Furey and David George and Kimberley J. Gietzen and Colin P. Goddard and George S. Golda and Philip A. Granieri and David E. Green and David L. Gustafson and Nancy F. Hansen and Kevin Harnish and Christian D. Haudenschild and Narinder I. Heyer and Matthew M. Hims and Johnny T. Ho and Adrian M. Horgan and Katya Hoschler and Steve Hurwitz and Denis V. Ivanov and Maria Q. Johnson and Terena James and T. A. Huw Jones and Gyoung-Dong Kang and Tzvetana H. Kerelska and Alan D. Kersey and Irina Khrebtukova and Alex P. Kindwall and Zoya Kingsbury and Paula I. Kokko-Gonzales and Anil Kumar and Marc A. Laurent and Cynthia T. Lawley and Sarah E. Lee and Xavier Lee and Arnold K. Liao and Jennifer A. Loch and Mitch Lok and Shujun Luo and Radhika M. Mammen and John W. Martin and Patrick G. McCauley and Paul McNitt and Parul Mehta and Keith W. Moon and Joe W. Mullens and Taksina Newington and Zemin Ning and Bee Ling Ng and Sonia M. Novo and Michael J. O'Neill and Mark A. Osborne and Andrew Osnowski and Omead Ostadan and Lambros L. Paraschos and Lea Pickering and Andrew C. Pike and Alger C. Pike and D. Chris Pinkard and Daniel P. Pliskin and Joe Podhasky and Victor J. Quijano and Come Raczy and Vicki H. Rae and Stephen R. 
Rawlings and Ana Chiva Rodriguez and Phyllida M. Roe and John Rogers and Maria C. Rogert Bacigalupo and Nikolai Romanov and Anthony Romieu and Rithy K. Roth and Natalie J. Rourke and Silke T. Ruediger and Eli Rusman and Raquel M. Sanches-Kuiper and Martin R. Schenker and Josefina M. Seoane and Richard J. Shaw and Mitch K. Shiver and Steven W. Short and Ning L. Sizto and Johannes P. Sluis and Melanie A. Smith and Jean Ernest Sohna Sohna and Eric J. Spence and Kim Stevens and Neil Sutton and Lukasz Szajkowski and Carolyn L. Tregidgo and Gerardo Turcatti and Stephanie vandeVondele and Yuli Verhovsky and Selene M. Virk and Suzanne Wakelin and Gregory C. Walcott and Jingwen Wang and Graham J. Worsley and Juying Yan and Ling Yau and Mike Zuerlein and Jane Rogers and James C. Mullikin and Matthew E. Hurles and Nick J. McCooke and John S. West and Frank L. Oaks and Peter L. Lundberg and David Klenerman and Richard Durbin and Anthony J. Smith}, title = {Accurate whole human genome sequencing using reversible terminator chemistry}, journal = {Nature} @@ -119,34 +139,40 @@ @article {illumina @article {hg38, doi = {10.1038/35057062}, - year = {2001}, - month = feb, + url = {https://doi.org/10.1038%2F35057062}, + year = 2001, + month = {feb}, publisher = {Springer Science and Business Media {LLC}}, volume = {409}, number = {6822}, + pages = {860--921}, title = {Initial sequencing and analysis of the human genome}, journal = {Nature} } @article {grch38, doi = {10.1101/gr.213611.116}, - year = {2017}, - month = apr, + url = {https://doi.org/10.1101%2Fgr.213611.116}, + year = 2017, + month = {apr}, publisher = {Cold Spring Harbor Laboratory}, volume = {27}, number = {5}, + pages = {849--864}, author = {Valerie A. Schneider and Tina Graves-Lindsay and Kerstin Howe and Nathan Bouk and Hsiu-Chuan Chen and Paul A. Kitts and Terence D. Murphy and Kim D. Pruitt and Fran{\c{c}}oise Thibaud-Nissen and Derek Albracht and Robert S. 
Fulton and Milinn Kremitzki and Vincent Magrini and Chris Markovic and Sean McGrath and Karyn Meltz Steinberg and Kate Auger and William Chow and Joanna Collins and Glenn Harden and Timothy Hubbard and Sarah Pelan and Jared T. Simpson and Glen Threadgold and James Torrance and Jonathan M. Wood and Laura Clarke and Sergey Koren and Matthew Boitano and Paul Peluso and Heng Li and Chen-Shan Chin and Adam M. Phillippy and Richard Durbin and Richard K. Wilson and Paul Flicek and Evan E. Eichler and Deanna M. Church}, title = {Evaluation of {GRCh}38 and de novo haploid genome assemblies demonstrates the enduring quality of the reference assembly}, - journal = {Genome Research} + journal = {Genome Res.} } @article {riethman2014, doi = {10.1101/gr.166983.113}, - year = {2014}, - month = mar, + url = {https://doi.org/10.1101%2Fgr.166983.113}, + year = 2014, + month = {mar}, publisher = {Cold Spring Harbor Laboratory}, volume = {24}, number = {6}, + pages = {1039--1050}, author = {N. Stong and Z. Deng and R. Gupta and S. Hu and S. Paul and A. K. Weiner and E. E. Eichler and T. Graves and C. C. Fronick and L. Courtney and R. K. Wilson and P. M. Lieberman and R. V. Davuluri and H. Riethman}, title = {Subtelomeric {CTCF} and cohesin binding site organization using improved subtelomere assemblies and a novel annotation pipeline}, journal = {Genome Research} @@ -154,122 +180,126 @@ @article {riethman2014 @article {minimap, doi = {10.1093/bioinformatics/bty191}, - year = {2018}, - month = may, + url = {https://doi.org/10.1093%2Fbioinformatics%2Fbty191}, + year = 2018, + month = {may}, publisher = {Oxford University Press ({OUP})}, volume = {34}, number = {18}, + pages = {3094--3100}, author = {Heng Li}, + editor = {Inanc Birol}, title = {Minimap2: pairwise alignment for nucleotide sequences}, journal = {Bioinformatics} } @article {t2t, - doi = {10.1101/735928}, - year = {2019}, - month = aug, - publisher = {Cold Spring Harbor Laboratory}, - author = {Karen H. 
Miga and Sergey Koren and Arang Rhie and Mitchell R. Vollger and Ariel Gershman and Andrey Bzikadze and Shelise Brooks and Edmund Howe and David Porubsky and Glennis A. Logsdon and Valerie A. Schneider and Tamara Potapova and Jonathan Wood and William Chow and Joel Armstrong and Jeanne Fredrickson and Evgenia Pak and Kristof Tigyi and Milinn Kremitzki and Christopher Markovic and Valerie Maduro and Amalia Dutra and Gerard G. Bouffard and Alexander M. Chang and Nancy F. Hansen and Fran{\c{c}}oisen Thibaud-Nissen and Anthony D. Schmitt and Jon-Matthew Belton and Siddarth Selvaraj and Megan Y. Dennis and Daniela C. Soto and Ruta Sahasrabudhe and Gulhan Kaya and Josh Quick and Nicholas J. Loman and Nadine Holmes and Matthew Loose and Urvashi Surti and Rosa ana Risques and Tina A. Graves Lindsay and Robert Fulton and Ira Hall and Benedict Paten and Kerstin Howe and Winston Timp and Alice Young and James C. Mullikin and Pavel A. Pevzner and Jennifer L. Gerton and Beth A. Sullivan and Evan E. Eichler and Adam M. Phillippy}, - title = {Telomere-to-telomere assembly of a complete human X chromosome} + doi = {10.1038/s41586-020-2547-7}, + url = {https://doi.org/10.1038/s41586-020-2547-7}, + year = {2020}, + month = jul, + publisher = {Springer Science and Business Media {LLC}}, + volume = {585}, + number = {7823}, + pages = {79--84}, + author = {Karen H. Miga and Sergey Koren and Arang Rhie and Mitchell R. Vollger and Ariel Gershman and Andrey Bzikadze and Shelise Brooks and Edmund Howe and David Porubsky and Glennis A. Logsdon and Valerie A. Schneider and Tamara Potapova and Jonathan Wood and William Chow and Joel Armstrong and Jeanne Fredrickson and Evgenia Pak and Kristof Tigyi and Milinn Kremitzki and Christopher Markovic and Valerie Maduro and Amalia Dutra and Gerard G. Bouffard and Alexander M. Chang and Nancy F. Hansen and Amy B. Wilfert and Fran{\c{c}}oise Thibaud-Nissen and Anthony D. Schmitt and Jon-Matthew Belton and Siddarth Selvaraj and Megan Y. 
Dennis and Daniela C. Soto and Ruta Sahasrabudhe and Gulhan Kaya and Josh Quick and Nicholas J. Loman and Nadine Holmes and Matthew Loose and Urvashi Surti and Rosa ana Risques and Tina A. Graves Lindsay and Robert Fulton and Ira Hall and Benedict Paten and Kerstin Howe and Winston Timp and Alice Young and James C. Mullikin and Pavel A. Pevzner and Jennifer L. Gerton and Beth A. Sullivan and Evan E. Eichler and Adam M. Phillippy}, + title = {Telomere-to-telomere assembly of a complete human X chromosome}, + journal = {Nature} } @article {jellyfish, doi = {10.1093/bioinformatics/btr011}, - year = {2011}, - month = jan, + url = {https://doi.org/10.1093%2Fbioinformatics%2Fbtr011}, + year = 2011, + month = {jan}, publisher = {Oxford University Press ({OUP})}, volume = {27}, number = {6}, + pages = {764--770}, author = {Guillaume Mar{\c{c}}ais and Carl Kingsford}, - title = {A fast, lock-free approach for efficient parallel counting of occurrences of k-mers}, + title = {A fast, lock-free approach for efficient parallel counting of occurrences of k-mers}, journal = {Bioinformatics} } -@inproceedings {levenshtein, - title = {Binary codes capable of correcting deletions, insertions, and reversals}, - author = {Levenshtein, Vladimir I}, - booktitle = {Soviet physics doklady}, - volume = {10}, - number = {8}, - year = {1966} -} - @article {moyzis, doi = {10.1073/pnas.85.18.6622}, - year = {1988}, - month = sep, + url = {https://doi.org/10.1073%2Fpnas.85.18.6622}, + year = 1988, + month = {sep}, publisher = {Proceedings of the National Academy of Sciences}, volume = {85}, number = {18}, + pages = {6622--6626}, author = {R. K. Moyzis and J. M. Buckingham and L. S. Cram and M. Dani and L. L. Deaven and M. D. Jones and J. Meyne and R. L. Ratliff and J. R. 
Wu}, - title = {A highly conserved repetitive {DNA} sequence, ({TTAGGG})n, present at the telomeres of human chromosomes.}, + title = {A highly conserved repetitive {DNA} sequence, ({TTAGGG})n, present at the telomeres of human chromosomes.}, journal = {Proceedings of the National Academy of Sciences} } @article {telomerecat, doi = {10.1038/s41598-017-14403-y}, - year = {2018}, - month = jan, + url = {https://doi.org/10.1038%2Fs41598-017-14403-y}, + year = 2018, + month = {jan}, publisher = {Springer Science and Business Media {LLC}}, volume = {8}, number = {1}, + pages = {1--17}, author = {James H. R. Farmery and and Mike L. Smith and Andy G. Lynch}, title = {Telomerecat: A ploidy-agnostic method for estimating telomere length from whole genome sequencing data}, - journal = {Scientific Reports} + journal = {Sci Rep} } @article {miga2015, doi = {10.1007/s10577-015-9488-2}, - year = {2015}, - month = sep, + url = {https://doi.org/10.1007%2Fs10577-015-9488-2}, + year = 2015, + month = {sep}, publisher = {Springer Science and Business Media {LLC}}, volume = {23}, number = {3}, + pages = {421--426}, author = {Karen H. Miga}, title = {Completing the human genome: the progress and challenge of satellite {DNA} assembly}, - journal = {Chromosome Research} + journal = {Chromosome Res} } @article {ngslowcomplexity, doi = {10.1038/nrg3117}, - year = {2011}, - month = nov, + url = {https://doi.org/10.1038%2Fnrg3117}, + year = 2011, + month = {nov}, publisher = {Springer Science and Business Media {LLC}}, volume = {13}, number = {1}, + pages = {36--46}, author = {Todd J. Treangen and Steven L. 
Salzberg}, title = {Repetitive {DNA} and next-generation sequencing: computational challenges and solutions}, - journal = {Nature Reviews Genetics} + journal = {Nat Rev Genet} } @article {bic, doi = {10.1214/aos/1176344136}, - year = {1978}, - month = mar, + url = {https://doi.org/10.1214%2Faos%2F1176344136}, + year = 1978, + month = {mar}, publisher = {Institute of Mathematical Statistics}, volume = {6}, number = {2}, + pages = {461--464}, author = {Gideon Schwarz}, title = {Estimating the Dimension of a Model}, - journal = {The Annals of Statistics} -} - -@book {silhouette, - doi = {10.1002/9780470316801}, - year = {1990}, - month = mar, - publisher = {John Wiley {\&} Sons, Inc.}, - editor = {Leonard Kaufman and Peter J. Rousseeuw}, - title = {Finding Groups in Data} + journal = {Ann. Statist.} } @article {hepc_entropy, doi = {10.1086/508889}, - year = {2006}, - month = dec, + url = {https://doi.org/10.1086%2F508889}, + year = 2006, + month = {dec}, publisher = {Oxford University Press ({OUP})}, volume = {194}, number = {11}, + pages = {1529--1536}, author = {Claudia Minosse and Silvia Calcaterra and Isabella Abbate and Marina Selleri and Maria~S. Zaniratti and Maria~R. 
Capobianchi}, title = {Possible Compartmentalization of Hepatitis C Viral Replication in the Genital Tract of {HIV}-1{\textendash}Coinfected Women}, journal = {The Journal of Infectious Diseases} @@ -277,43 +307,50 @@ @article {hepc_entropy @article {HG001, doi = {10.1038/nature02168}, - year = {2003}, - month = dec, + url = {https://doi.org/10.1038%2Fnature02168}, + year = 2003, + month = {dec}, publisher = {Springer Science and Business Media {LLC}}, volume = {426}, number = {6968}, + pages = {789--796}, title = {The International {HapMap} Project}, journal = {Nature} } @article {HG00X, doi = {10.1038/sdata.2016.25}, - year = {2016}, - month = jun, + url = {https://doi.org/10.1038%2Fsdata.2016.25}, + year = 2016, + month = {jun}, publisher = {Springer Science and Business Media {LLC}}, volume = {3}, number = {1}, + pages = {1--26}, author = {Justin M. Zook and David Catoe and Jennifer McDaniel and Lindsay Vang and Noah Spies and Arend Sidow and Ziming Weng and Yuling Liu and Christopher E. Mason and Noah Alexander and Elizabeth Henaff and Alexa B.R. McIntyre and Dhruva Chandramohan and Feng Chen and Erich Jaeger and Ali Moshrefi and Khoa Pham and William Stedman and Tiffany Liang and Michael Saghbini and Zeljko Dzakula and Alex Hastie and Han Cao and Gintaras Deikus and Eric Schadt and Robert Sebra and Ali Bashir and Rebecca M. Truty and Christopher C. Chang and Natali Gulbahce and Keyan Zhao and Srinka Ghosh and Fiona Hyland and Yutao Fu and Mark Chaisson and Chunlin Xiao and Jonathan Trow and Stephen T. Sherry and Alexander W. Zaranek and Madeleine Ball and Jason Bobe and Preston Estep and George M. Church and Patrick Marks and Sofia Kyriazopoulou-Panagiotopoulou and Grace X.Y. Zheng and Michael Schnall-Levin and Heather S. Ordonez and Patrice A. 
Mudivarti and Kristina Giorda and Ying Sheng and Karoline Bjarnesdatter Rypdal and Marc Salit}, title = {Extensive sequencing of seven human genomes to characterize benchmark reference materials}, - journal = {Scientific Data} + journal = {Sci Data} } @article {cpg, doi = {10.1261/rna.1748309}, - year = {2009}, - month = oct, + url = {https://doi.org/10.1261%2Frna.1748309}, + year = 2009, + month = {oct}, publisher = {Cold Spring Harbor Laboratory}, volume = {15}, number = {12}, + pages = {2186--2194}, author = {S. G. Nergadze and B. O. Farnung and H. Wischnewski and L. Khoriauli and V. Vitelli and R. Chawla and E. Giulotto and C. M. Azzalin}, title = {{CpG}-island promoters drive transcription of human telomeres}, - journal = {{RNA}} + journal = {RNA}} } @article {george, doi = {10.1007/bf02056895}, - year = {1983}, - month = dec, + url = {https://doi.org/10.1007%2Fbf02056895}, + year = 1983, + month = {dec}, publisher = {Springer Science and Business Media {LLC}}, volume = {30}, number = {1}, @@ -323,9 +360,126 @@ @article {george journal = {Metrika} } -@online { 10x, +@article {luxton2020, + doi = {10.1016/j.celrep.2020.108435}, + url = {https://doi.org/10.1016/j.celrep.2020.108435}, + year = {2020}, + month = dec, + publisher = {Elsevier {BV}}, + volume = {33}, + number = {10}, + pages = {108435}, + author = {Jared J. Luxton and Miles J. McKenna and Lynn E. Taylor and Kerry A. George and Sara R. Zwart and Brian E. Crucian and Viktor R. Drel and Francine E. Garrett-Bakelman and Matthew J. Mackay and Daniel Butler and Jonathan Foox and Kirill Grigorev and Daniela Bezdan and Cem Meydan and Scott M. Smith and Kumar Sharma and Christopher E. Mason and Susan M. 
Bailey}, + title = {Temporal Telomere and {DNA} Damage Responses in the Space Radiation Environment}, + journal = {Cell Reports} +} + +@article {cophenetic, + doi = {10.2307/1217208}, + url = {https://doi.org/10.2307%2F1217208}, + year = 1962, + month = {feb}, + publisher = {Wiley}, + volume = {11}, + number = {2}, + pages = {33--40}, + author = {Robert R. Sokal and F. James Rohlf}, + title = {The comparison of dendrograms by objective methods}, + journal = {Taxon} +} + +@book {silhouette, + doi = {10.1002/9780470316801}, + url = {https://doi.org/10.1002%2F9780470316801}, + year = 1990, + month = {mar}, + publisher = {John Wiley {\&} Sons, Inc.}, + editor = {Leonard Kaufman and Peter J. Rousseeuw}, + title = {Finding groups in data: an introduction to cluster analysis} +} + +@inproceedings {levenshtein, + title = {Binary codes capable of correcting deletions, insertions, and reversals}, + author = {Levenshtein, Vladimir I}, + booktitle = {Soviet physics doklady}, + volume = {10}, + number = {8}, + pages = {707--710}, + year = {1966} +} + +@online {10x, author = {10x Genomics}, title = {Resolving Biology to Advance Human Health}, url = {https://www.10xgenomics.com/}, urldate = {2020-04-28} } + +@article {twins_study, + doi = {10.1126/science.aau8650}, + title = {The NASA Twins Study: A multidimensional analysis of a year-long human spaceflight}, + author = {Garrett-Bakelman, Francine E and Darshi, Manjula and Green, Stefan J and Gur, Ruben C and Lin, Ling and Macias, Brandon R and McKenna, Miles J and Meydan, Cem and Mishra, Tejaswini and Nasrini, Jad and others}, + journal = {Science}, + volume = {364}, + number = {6436}, + pages = {144}, + year = {2019}, + publisher = {American Association for the Advancement of Science} +} + +@article {shafin, + doi = {10.1038/s41587-020-0503-6}, + url = {https://doi.org/10.1038/s41587-020-0503-6}, + year = {2020}, + month = may, + publisher = {Springer Science and Business Media {LLC}}, + volume = {38}, + number = {9}, + pages = 
{1044--1053}, + author = {Kishwar Shafin and Trevor Pesout and Ryan Lorig-Roach and Marina Haukness and Hugh E. Olsen and Colleen Bosworth and Joel Armstrong and Kristof Tigyi and Nicholas Maurer and Sergey Koren and Fritz J. Sedlazeck and Tobias Marschall and Simon Mayes and Vania Costa and Justin M. Zook and Kelvin J. Liu and Duncan Kilburn and Melanie Sorensen and Katy M. Munson and Mitchell R. Vollger and Jean Monlong and Erik Garrison and Evan E. Eichler and Sofie Salama and David Haussler and Richard E. Green and Mark Akeson and Adam Phillippy and Karen H. Miga and Paolo Carnevali and Miten Jain and Benedict Paten}, + title = {Nanopore sequencing and the Shasta toolkit enable efficient de novo assembly of eleven human genomes}, + journal = {Nature Biotechnology} +} + +@article {jain, + doi = {10.1038/nbt.4060}, + url = {https://doi.org/10.1038/nbt.4060}, + year = {2018}, + month = jan, + publisher = {Springer Science and Business Media {LLC}}, + volume = {36}, + number = {4}, + pages = {338--345}, + author = {Miten Jain and Sergey Koren and Karen H Miga and Josh Quick and Arthur C Rand and Thomas A Sasani and John R Tyson and Andrew D Beggs and Alexander T Dilthey and Ian T Fiddes and Sunir Malla and Hannah Marriott and Tom Nieto and Justin O{\textquotesingle}Grady and Hugh E Olsen and Brent S Pedersen and Arang Rhie and Hollian Richardson and Aaron R Quinlan and Terrance P Snutch and Louise Tee and Benedict Paten and Adam M Phillippy and Jared T Simpson and Nicholas J Loman and Matthew Loose}, + title = {Nanopore sequencing and assembly of a human genome with ultra-long reads}, + journal = {Nature Biotechnology} +} + +@article {nurk, + doi = {10.1101/gr.263566.120}, + url = {https://doi.org/10.1101/gr.263566.120}, + year = {2020}, + month = aug, + publisher = {Cold Spring Harbor Laboratory}, + volume = {30}, + number = {9}, + pages = {1291--1305}, + author = {Sergey Nurk and Brian P. Walenz and Arang Rhie and Mitchell R. Vollger and Glennis A. 
Logsdon and Robert Grothe and Karen H. Miga and Evan E. Eichler and Adam M. Phillippy and Sergey Koren}, + title = {{HiCanu}: accurate assembly of segmental duplications, satellites, and allelic variants from high-fidelity long reads}, + journal = {Genome Research} +} + +@article {riethman2020, + doi = {10.1371/journal.pgen.1008347}, + url = {https://doi.org/10.1371/journal.pgen.1008347}, + year = {2020}, + month = jan, + publisher = {Public Library of Science ({PLoS})}, + volume = {16}, + number = {1}, + pages = {e1008347}, + author = {Eleanor Young and Heba Z. Abid and Pui-Yan Kwok and Harold Riethman and Ming Xiao}, + editor = {Nancy Maizels}, + title = {Comprehensive Analysis of Human Subtelomeres by Whole Genome Mapping}, + journal = {{PLOS} Genetics} +} diff --git a/assets/paper/manuscript/response-to-reviewers-2021-01.docx b/assets/paper/manuscript/response-to-reviewers-2021-01.docx new file mode 100644 index 0000000..587ecf1 Binary files /dev/null and b/assets/paper/manuscript/response-to-reviewers-2021-01.docx differ diff --git a/assets/paper/manuscript/response-to-reviewers-2021-02.docx b/assets/paper/manuscript/response-to-reviewers-2021-02.docx new file mode 100644 index 0000000..9a6b7b4 Binary files /dev/null and b/assets/paper/manuscript/response-to-reviewers-2021-02.docx differ diff --git a/assets/paper/snakefiles/bonferroni.snake b/assets/paper/snakefiles/bonferroni.snake new file mode 100644 index 0000000..ce0be24 --- /dev/null +++ b/assets/paper/snakefiles/bonferroni.snake @@ -0,0 +1,42 @@ +from pandas import concat, MultiIndex +from statsmodels.stats.multitest import multipletests +from re import sub + + +def load_multilevel_repeatfinder(tsv): + rf = read_csv(tsv, sep="\t", comment="#", header=[0,1]) + rf.columns = MultiIndex.from_tuples(( + [("#monomer", "#monomer")] + [ + (a, "" if b.startswith("Unnamed") else b) + for (a, b) in list(rf.columns)[1:] + ] + )) + return rf + + +rule adjust_all_repeatfinder: + input: + 
DATA_DIR+"/PacBio/repeatfinder-p_arm-unadjusted.tsv", + DATA_DIR+"/PacBio/repeatfinder-q_arm-unadjusted.tsv", + DATA_DIR+"/NASA/10X/repeatfinder-unadjusted.tsv", + DATA_DIR+"/NASA/Illumina/repeatfinder-unadjusted.tsv", + output: + DATA_DIR+"/PacBio/repeatfinder-p_arm.tsv", + DATA_DIR+"/PacBio/repeatfinder-q_arm.tsv", + DATA_DIR+"/NASA/10X/repeatfinder.tsv", + DATA_DIR+"/NASA/Illumina/repeatfinder.tsv", + run: + pvals = concat([ + load_multilevel_repeatfinder(tsv)["p"] for tsv in input + ]) + p_adjusted = multipletests(pvals, method="bonferroni")[1] + bonferroni_lookup = {p: padj for p, padj in zip(pvals, p_adjusted)} + for tsv in input: + rf = load_multilevel_repeatfinder(tsv) + rf["p_adjusted"] = rf["p"].map(bonferroni_lookup) + rf_filtered = rf[rf["p_adjusted"]<.05].drop(columns="p").copy() + rf_filtered["mean"] = rf_filtered["score"].mean(axis=1) + rf_sorted = rf_filtered.sort_values(by="mean", ascending=False) + rf_sorted.drop(columns="mean").to_csv( + sub(r'-unadjusted', "", tsv), sep="\t", index=False, + ) diff --git a/assets/paper/snakefiles/densityplots.snake b/assets/paper/snakefiles/densityplots.snake new file mode 100644 index 0000000..401c04e --- /dev/null +++ b/assets/paper/snakefiles/densityplots.snake @@ -0,0 +1,160 @@ +from pandas import read_csv +from io import StringIO +from subprocess import check_output +from gzip import open as gzopen +from matplotlib.pyplot import subplots, switch_backend + + +rule plottable_repeats: + input: + p_arm=DATA_DIR+"/PacBio/repeatfinder-p_arm.tsv", + q_arm=DATA_DIR+"/PacBio/repeatfinder-q_arm.tsv", + output: + p_arm=DATA_DIR+"/PacBio/repeatfinder-p_arm-plottable.tsv", + q_arm=DATA_DIR+"/PacBio/repeatfinder-q_arm-plottable.tsv", + params: + n=N_MOTIFS_TO_PLOT, + run: + for arm in "p_arm", "q_arm": + rf = read_csv( + getattr(input, arm), sep="\t", + skiprows=1, usecols=[0]+list(range(8,15)), + ) + top_rf = rf.rename(columns={"#monomer": "#motif"})[:params.n].copy() + top_rf["score"] = ( + 
top_rf.drop(columns=["#motif", "p_adjusted"], errors="ignore") + .mean(axis=1) + ) + top_rf[["#motif", "score"]].to_csv( + getattr(output, arm), sep="\t", index=False, + ) + + +rule read_counts: + input: bam=f"{DATA_DIR}/PacBio/tailpuller.bam", + output: tsv=DATA_DIR+"/PacBio/counts-{arm}.tsv", + run: + if wildcards.arm == "p_arm": + flags = "-f0x4000 -F0x8000" + elif wildcards.arm == "q_arm": + flags = "-f0x4000 -f0x8000" + else: + raise ValueError(f"arm=='{wildcards.arm}'") + cmd = " | ".join([ + f"samtools view {flags} {input.bam}", r"tr ':' '\t'", + "cut -f2,6", "sort", "uniq -c", + r"sed -E 's/^\s*//g; s/\s+/\t/g'" + ]) + counts_narrow = read_csv( + StringIO(check_output(cmd, shell=True, universal_newlines=True)), + sep="\t", names=["count", "subject", "rname"], + ) + counts = counts_narrow.pivot( + index="rname", columns="subject", values="count", + ) + counts.columns.name = None + counts["total"] = counts.sum(axis=1) + counts.fillna(0).astype(int).to_csv(output.tsv, sep="\t") + + +rule kmerscanner_unfiltered: + input: + bam=DATA_DIR+"/PacBio/{group}/{subject}/tailpuller.bam", + tsv=DATA_DIR+"/PacBio/repeatfinder-{arm}-plottable.tsv", + output: + dat=DATA_DIR+"/PacBio/{group}/{subject}/kmerscanner-{arm}-unfiltered.dat.gz", + run: + flags = get_sam_flags(wildcards.arm, target="tract_anchor") + shell(""" + ./edgecase kmerscanner --motif-file {input.tsv} -b 10 \ + {flags} {input.bam} | gzip -2 > {output.dat} + """) + + +rule kmerscanner_filtered: + input: + dat=DATA_DIR+"/PacBio/{group}/{subject}/kmerscanner-{arm}-unfiltered.dat.gz", + tsv=DATA_DIR+"/PacBio/counts-{arm}.tsv", + output: + dat=DATA_DIR+"/PacBio/{group}/{subject}/kmerscanner-{arm}.dat.gz", + params: + min_reads=MIN_CHROM_COVERAGE, + run: + counts = read_csv(input.tsv, sep="\t", index_col=0) + raw_densities = read_csv(input.dat, sep="\t") + chromosomes_to_keep = ( + set(raw_densities["chrom"].drop_duplicates()) & + set(counts[counts["total"]>=params.min_reads].index) + ) + filtered_densities = 
raw_densities[ + raw_densities["chrom"].isin(chromosomes_to_keep) + ] + filtered_densities.to_csv( + output.dat, compression="gzip", sep="\t", index=False, + ) + + +rule densityplot: + input: + dat=DATA_DIR+"/PacBio/{group}/{subject}/kmerscanner-{arm}.dat.gz", + tsv=DATA_DIR+"/PacBio/counts-{arm}.tsv", + output: + pdf=DATA_DIR+"/PacBio/{group}/{subject}/densityplot-{arm}.{outfmt}", + params: + min_reads=MIN_CHROM_COVERAGE, + run: + counts = read_csv(input.tsv, sep="\t", index_col=0) + chroms_to_plot = ",".join( + counts[counts["total"]>=params.min_reads].index + ) + with gzopen(input.dat, mode="rt") as dat: + for i, _ in enumerate(dat): + if i > 0: + kmerscan_empty = False + break + else: + kmerscan_empty = True + if wildcards.outfmt == "pdf": + if wildcards.arm == "p_arm": + options = "--palette 'paper|legend=density'" + else: + options = "--palette paper" + elif wildcards.outfmt == "pkl": + options = "--palette 'paper|legend=full' --plot-coverage" + else: + raise ValueError(f"outfmt={wildcards.outfmt}") + if kmerscan_empty: + switch_backend("Agg") + subplots()[0].savefig(output.pdf, bbox_inches="tight") + else: + flags = get_sam_flags(wildcards.arm, target="tract_anchor") + shell(""" + ./edgecase densityplot \ + -x {HG38EXT_ECX} --title ' ' {flags} -b 100 {options} \ + --chroms-to-plot '{chroms_to_plot}' \ + -z {input.dat} --outfmt {wildcards.outfmt} > {output.pdf} + """) + + +rule densityplot_all: + input: + p_arm_pdf=[ + DATA_DIR+f"/PacBio/{group}/{subject}/densityplot-p_arm.pdf" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + ], + q_arm_pdf=[ + DATA_DIR+f"/PacBio/{group}/{subject}/densityplot-q_arm.pdf" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + ], + p_arm_pkl=[ + DATA_DIR+f"/PacBio/{group}/{subject}/densityplot-p_arm.pkl" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + ], + q_arm_pkl=[ + 
DATA_DIR+f"/PacBio/{group}/{subject}/densityplot-q_arm.pkl" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + ], diff --git a/assets/paper/snakefiles/kmerscanner-all.snake b/assets/paper/snakefiles/kmerscanner-all.snake new file mode 100644 index 0000000..fc8b0bf --- /dev/null +++ b/assets/paper/snakefiles/kmerscanner-all.snake @@ -0,0 +1,37 @@ +rule repeatfinder_for_all_motifs: + input: tsv=DATA_DIR+"/PacBio/repeatfinder-{arm}.tsv", + output: tsv=temp(DATA_DIR+"/PacBio/repeatfinder-all-{arm}.tsv"), + run: + with open(input.tsv, mode="rt") as tsv_in: + with open(output.tsv, mode="wt") as tsv_out: + for i, line in enumerate(map(str.strip, tsv_in)): + if i > 0: + print("\t".join(line.split("\t")[:2]), file=tsv_out) + + +rule kmerscanner_all_motifs: + input: + bam=DATA_DIR+"/PacBio/{group}/{subject}/tailpuller.bam", + tsv=DATA_DIR+"/PacBio/repeatfinder-all-{arm}.tsv", + output: + dat=DATA_DIR+"/PacBio/{group}/{subject}/kmerscanner-all-{arm}.dat.gz", + run: + flags = get_sam_flags(wildcards.arm, target="tract_anchor") + shell(""" + ./edgecase kmerscanner --motif-file {input.tsv} -b 10 \ + {flags} {input.bam} | gzip -2 > {output.dat} + """) + + +rule kmerscanner_all_motifs_all_subjects: + input: + p_arm=[ + DATA_DIR+f"/PacBio/{group}/{subject}/kmerscanner-all-p_arm.dat.gz" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + ], + q_arm=[ + DATA_DIR+f"/PacBio/{group}/{subject}/kmerscanner-all-q_arm.dat.gz" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + ], diff --git a/assets/paper/snakefiles/levenshtein.snake b/assets/paper/snakefiles/levenshtein.snake new file mode 100644 index 0000000..beeaf67 --- /dev/null +++ b/assets/paper/snakefiles/levenshtein.snake @@ -0,0 +1,83 @@ +rule levenshtein: + input: bam=DATA_DIR+"/PacBio/{group}/{subject}/tailpuller.bam", + output: tsv=DATA_DIR+"/PacBio/{group}/{subject}/haplotypes/levenshtein-{arm}.tsv", + 
threads: 48, + run: + flags = get_sam_flags(wildcards.arm, target="tract_anchor") + shell(""" + ./edgecase levenshtein -j {threads} {flags} \ + {input.bam} > {output.tsv} + """) + + +def trio_bam_input(w): + return { + subject: f"{DATA_DIR}/PacBio/{w.group}/{subject}/tail{w.kind}.bam" + for subject in set(DATASETS.loc[DATASETS["group"]==w.group, "subject"]) + } + +rule trio_bam: + input: unpack(trio_bam_input), + output: bam=DATA_DIR+"/PacBio/{group}/tail{kind}.bam", + params: sam=DATA_DIR+"/PacBio/{group}/tail{kind}.sam", flags="-f0x4000", + run: + first_bam = next(iter(input)) + shell("samtools view -H {first_bam} > {params.sam}") + for subject, bam in input.items(): + shell(""" + samtools view {params.flags} {bam} \ + | sed -E 's/^/{subject}:/g' >> {params.sam} + """) + shell("samtools view -bh {params.sam} > {output.bam}") + shell("rm {params.sam}") + + +def all_bam_input(w): + return { + group: f"{DATA_DIR}/PacBio/{group}/tail{w.kind}.bam" + for group in set(DATASETS["group"]) + } +rule all_bam: + input: unpack(all_bam_input), + output: bam=DATA_DIR+"/PacBio/tail{kind}.bam", + params: sam=DATA_DIR+"/PacBio/tail{kind}.sam", flags="-f0x4000", + run: + first_bam = next(iter(input)) + shell("samtools view -H {first_bam} > {params.sam}") + for group, bam in input.items(): + shell(""" + samtools view {params.flags} {bam} \ + | sed -E 's/^/{group}:/g' >> {params.sam} + """) + shell("samtools view -bh {params.sam} > {output.bam}") + shell("rm {params.sam}") + + +rule kmerscanner_for_haploplots: + input: + bam=f"{DATA_DIR}/PacBio/tailpuller.bam", + tsv=f"{DATA_DIR}/PacBio/repeatfinder-q_arm-plottable.tsv", + output: + dat=f"{DATA_DIR}/PacBio/kmerscanner-q_arm.dat.gz", + shell: """ + ./edgecase kmerscanner --motif-file {input.tsv} -b 10 \ + -f0x4000 -f0x8000 {input.bam} | gzip -2 > {output.dat} + """ + + +rule levenshtein_across_populations: + input: bam=DATA_DIR+"/PacBio/tailpuller.bam", + output: tsv=DATA_DIR+"/PacBio/haplotypes/levenshtein-{arm}.tsv", + threads: 96, 
+ run: + flags = get_sam_flags(wildcards.arm, target="tract_anchor") + shell(""" + ./edgecase levenshtein -j {threads} {flags} \ + {input.bam} > {output.tsv} + """) + + +rule levenshtein_all: + input: + p_arm=DATA_DIR+"/PacBio/haplotypes/levenshtein-p_arm.tsv", + q_arm=DATA_DIR+"/PacBio/haplotypes/levenshtein-q_arm.tsv", diff --git a/assets/paper/snakefiles/longread-motifs.snake b/assets/paper/snakefiles/longread-motifs.snake new file mode 100644 index 0000000..ac6e4a0 --- /dev/null +++ b/assets/paper/snakefiles/longread-motifs.snake @@ -0,0 +1,138 @@ +from pandas import read_csv, concat +from pysam import AlignmentFile +from re import sub, escape +from scipy.stats import combine_pvalues + + +make_pvals_worse = lambda p: p if p != 0 else SMALLEST_P_VALUE + + +rule tailpuller: + input: + bam=DATA_DIR+"/PacBio/{group}/{subject}/{dataset}/wgs.bam", + bai=DATA_DIR+"/PacBio/{group}/{subject}/{dataset}/wgs.bam.bai", + output: + bam=DATA_DIR+"/PacBio/{group}/{subject}/{dataset}/tailpuller.bam", + params: + max_read_length=MAX_READ_LENGTH, + min_map_overlap=MIN_MAP_OVERLAP, + min_subtelomere_overlap=MIN_SUBTELOMERE_OVERLAP, + min_telomere_overlap=MIN_TELOMERE_OVERLAP, + target=TARGET, + samtools_flags="-f0x4000", + shell: """ + ./edgecase tailpuller -x {HG38EXT_ECX} -t {params.target} \ + -M {params.max_read_length} \ + --min-map-overlap {params.min_map_overlap} \ + --min-subtelomere-overlap {params.min_subtelomere_overlap} \ + --min-telomere-overlap {params.min_telomere_overlap} \ + {input.bam} \ + | samtools view -bh {params.samtools_flags} > {output.bam} + """ + +rule tailchopper: + input: + bam=DATA_DIR+"/PacBio/{group}/{subject}/{dataset}/tailpuller.bam", + output: + bam=DATA_DIR+"/PacBio/{group}/{subject}/{dataset}/tailchopper.bam", + params: + target=TARGET, + shell: """ + ./edgecase tailchopper -x {HG38EXT_ECX} \ + -f {params.target} -t cigar {input.bam} \ + | samtools view -bh > {output.bam} + """ + +def bam_combiner(w): + return [f"{pacbio_path}/{w.name}.bam" for 
pacbio_path in DATASETS.loc[ + (DATASETS["group"]==w.group) & (DATASETS["subject"]==w.subject), + "dataset_pacbio_path", + ]] + +rule combined_bam: + input: bams=bam_combiner, + output: bam=DATA_DIR+"/PacBio/{group}/{subject}/{name}.bam", + params: sam=DATA_DIR+"/PacBio/{group}/{subject}/{name}.sam", + run: + shell("samtools view -H {input.bams[0]} > {params.sam}") + visited_entries = set() + selection = DATASETS.loc[ + (DATASETS["subject"]==wildcards.subject), + ["priority", "dataset", "dataset_pacbio_path"], + ] + iterator = selection.sort_values(by="priority").iterrows() + for _, (_, dataset, dataset_pacbio_path) in iterator: + filename = f"{dataset_pacbio_path}/{wildcards.name}.bam" + with AlignmentFile(filename) as bam: + with open(params.sam, mode="at") as sam: + for entry in bam: + entry_identifier = "{} {} {}".format( + entry.qname, entry.seq, entry.qual, + ) + if entry_identifier not in visited_entries: + visited_entries.add(entry_identifier) + modified_entry = "{}:{}".format( + dataset, entry.tostring(), + ) + print(modified_entry, file=sam) + shell("samtools view -bh {params.sam} > {output.bam}") + shell("rm {params.sam}") + + +rule repeatfinder_pacbio: + input: bam=DATA_DIR+"/PacBio/{group}/{subject}/tailchopper.bam", + output: tsv=DATA_DIR+"/PacBio/{group}/{subject}/repeatfinder-{arm}.tsv", + params: s="8G", min_k=4, max_k=16, max_p_adjusted=1.1, + threads: 24, + run: + flags = get_sam_flags(wildcards.arm) + shell(""" + ./edgecase repeatfinder -j {threads} -s {params.s} \ + -m {params.min_k} -M {params.max_k} -P {params.max_p_adjusted} \ + {flags} {input.bam} > {output.tsv} + """) + + +rule repeatfinder_all_pacbio: + input: + p_arm=[ + DATA_DIR+f"/PacBio/{group}/{subject}/repeatfinder-p_arm.tsv" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + ], + q_arm=[ + DATA_DIR+f"/PacBio/{group}/{subject}/repeatfinder-q_arm.tsv" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + 
], + output: + p_arm=DATA_DIR+"/PacBio/repeatfinder-p_arm-unadjusted.tsv", + q_arm=DATA_DIR+"/PacBio/repeatfinder-q_arm-unadjusted.tsv", + run: + pivot_p = dict(index="#monomer", columns="subject", values="p") + pivot_s = dict(index="#monomer", columns="subject", values="score") + pivot_f = dict( + index="#monomer", columns="subject", values="fraction_explained", + ) + for arm in "p_arm", "q_arm": + arm_rf_as_list = [] + for tsv in getattr(input, arm): + sample_rf = read_csv(tsv, sep="\t", usecols=(0, 3, 4, 5)) + rpath = sub(r'^'+escape(DATA_DIR)+r'/PacBio/', "", tsv) + sample_rf["group"], sample_rf["subject"], *_ = rpath.split("/") + arm_rf_as_list.append(sample_rf) + arm_rf = concat(arm_rf_as_list) + arm_rf["p"] = arm_rf["p"].apply(make_pvals_worse) + arm_rf_by_p = arm_rf[pivot_p.values()].pivot(**pivot_p).fillna(1) + arm_rf_by_s = arm_rf[pivot_s.values()].pivot(**pivot_s).fillna(0) + arm_rf_by_f = arm_rf[pivot_f.values()].pivot(**pivot_f).fillna(0) + assert (arm_rf_by_p.index == arm_rf_by_f.index).all() + assert (arm_rf_by_p.index == arm_rf_by_s.index).all() + arm_rf_combined = concat( + {"score": arm_rf_by_s, "fraction_explained": arm_rf_by_f}, + axis=1, + ) + arm_rf_combined = arm_rf_combined[sorted(arm_rf_combined.columns)] + mg = lambda row: combine_pvalues(row, method="mudholkar_george")[1] + arm_rf_combined["p"] = arm_rf_by_p.apply(mg, axis=1).fillna(1) + arm_rf_combined.to_csv(getattr(output, arm), sep="\t") diff --git a/assets/paper/snakefiles/shortread-motifs.snake b/assets/paper/snakefiles/shortread-motifs.snake new file mode 100644 index 0000000..290bcef --- /dev/null +++ b/assets/paper/snakefiles/shortread-motifs.snake @@ -0,0 +1,70 @@ +make_pvals_worse = lambda p: p if p != 0 else SMALLEST_P_VALUE + + +rule repeatfinder_illumina: + input: fq=DATA_DIR+"/NASA/Illumina/{subject}/telbam.fa", + output: tsv=DATA_DIR+"/NASA/Illumina/{subject}/repeatfinder.tsv", + params: s="32G", min_k=4, max_k=16, max_p_adjusted=1.1, + threads: 12, + run: + shell(""" + 
./edgecase repeatfinder -j {threads} -s {params.s} \ + -m {params.min_k} -M {params.max_k} -P {params.max_p_adjusted} \ + -C --fmt fastx {input.fq} > {output.tsv} + """) + + +rule repeatfinder_chromium: + input: bam=DATA_DIR+"/NASA/10X/{subject}/telbam.bam", + output: tsv=DATA_DIR+"/NASA/10X/{subject}/repeatfinder.tsv", + params: s="32G", min_k=4, max_k=16, max_p_adjusted=1.1, + threads: 12, + run: + shell(""" + ./edgecase repeatfinder -j {threads} -s {params.s} \ + -m {params.min_k} -M {params.max_k} -P {params.max_p_adjusted} \ + -C {input.bam} > {output.tsv} + """) + + +rule repeatfinder_all_shortread: + input: + chromium=[ + DATA_DIR+f"/NASA/10X/Subject_{sid}/repeatfinder.tsv" + for sid in ["1_1", "1_2", "1_3", "2"] + ], + illumina=[ + DATA_DIR+f"/NASA/Illumina/{subject}/repeatfinder.tsv" + for subject in list("ABCD") + ], + output: + chromium=DATA_DIR+"/NASA/10X/repeatfinder-unadjusted.tsv", + illumina=DATA_DIR+"/NASA/Illumina/repeatfinder-unadjusted.tsv", + run: + pivot_p = dict(index="#monomer", columns="subject", values="p") + pivot_s = dict(index="#monomer", columns="subject", values="score") + pivot_f = dict( + index="#monomer", columns="subject", values="fraction_explained", + ) + for kind in "chromium", "illumina": + rf_as_list = [] + for tsv in getattr(input, kind): + kd = escape({"chromium": "10X", "illumina": "Illumina"}[kind]) + sample_rf = read_csv(tsv, sep="\t", usecols=(0, 3, 4, 5)) + rpath = sub(r'^'+escape(DATA_DIR)+r'/NASA/'+kd+r'/', "", tsv) + sample_rf["subject"], *_ = rpath.split("/") + rf_as_list.append(sample_rf) + rf = concat(rf_as_list) + rf["p"] = rf["p"].apply(make_pvals_worse) + rf_by_p = rf[pivot_p.values()].pivot(**pivot_p).fillna(1) + rf_by_s = rf[pivot_s.values()].pivot(**pivot_s).fillna(0) + rf_by_f = rf[pivot_f.values()].pivot(**pivot_f) + assert (rf_by_p.index == rf_by_f.index).all() + assert (rf_by_p.index == rf_by_s.index).all() + rf_combined = concat( + {"score": rf_by_s, "fraction_explained": rf_by_f}, axis=1, + ) + 
rf_combined = rf_combined[sorted(rf_combined.columns)] + mg = lambda row: combine_pvalues(row, method="mudholkar_george")[1] + rf_combined["p"] = rf_by_p.apply(mg, axis=1).fillna(1) + rf_combined.to_csv(getattr(output, kind), sep="\t") diff --git a/assets/paper/snakefiles/shortread-support.snake b/assets/paper/snakefiles/shortread-support.snake new file mode 100644 index 0000000..61d82f6 --- /dev/null +++ b/assets/paper/snakefiles/shortread-support.snake @@ -0,0 +1,63 @@ +from pysam import AlignmentFile +from gzip import open as gzopen + + +rule bam_to_fastq: + input: bam="{prefix}/telbam.bam" + output: fq="{prefix}/telbam.fq.gz" + run: + names = set() + with AlignmentFile(input.bam) as bam, gzopen(output.fq, mode="wt") as fq: + for entry in bam: + if entry.qname not in names: + names.add(entry.qname) + read_repr = "@{}\n{}\n+\n{}".format( + entry.qname, entry.query_sequence, + "".join(map(lambda x:chr(x+33), entry.query_qualities)), + ) + print(read_repr, file=fq) + + +rule tailchopper_to_fasta: + input: + bam="{prefix}/tailchopper.bam" + output: + fa=temp("{prefix}/tailchopper.fa"), + fai=temp("{prefix}/tailchopper.fa.fai"), + run: + names = set() + with AlignmentFile(input.bam) as bam, open(output.fa, mode="wt") as fa: + for entry in bam: + if entry.flag & 0x4000 == 0x4000: + if entry.qname not in names: + names.add(entry.qname) + arm = "q" if (entry.flag & 0x8000 == 0x8000) else "p" + read_repr = ">{}/{}\n{}".format( + entry.qname, arm, entry.query_sequence, + ) + print(read_repr, file=fa) + shell("samtools faidx {output.fa}") + + +rule telbam_support: + input: + tailchopper=DATA_DIR+"/PacBio/{group}/{subject}/tailchopper.fa", + telbam_fq=DATA_DIR+"/Illumina/{group}/{subject}/telbam.fq.gz", + output: + bam=DATA_DIR+"/PacBio/{group}/{subject}/telbam2tailchopper.bam", + threads: 48, + params: n_multimap=1000000, + shell: """ + minimap2 -t {threads} -ax sr \ + --secondary=yes -p0 -N{params.n_multimap} \ + {input.tailchopper} {input.telbam_fq} \ + | samtools view 
-F4 -bh > {output.bam} + """ + +rule telbam_support_all: + input: + bams=[ + DATA_DIR+f"/PacBio/{group}/{subject}/telbam2tailchopper.bam" + for _, (group, subject) + in DATASETS[["group", "subject"]].drop_duplicates().iterrows() + ], diff --git a/assets/paper/tables/Supplemental_Table_S1.tsv b/assets/paper/tables/Supplemental_Table_S1.tsv new file mode 100644 index 0000000..74dee20 --- /dev/null +++ b/assets/paper/tables/Supplemental_Table_S1.tsv @@ -0,0 +1,16 @@ +# Description of GIAB datasets used +Subject Ethnicity Status within trio GIAB PacBio CCS datasets used Average coverage GIAB Illumina datasets used Average coverage +NA12878/HG001 Utah/Mormon None PacBio_SequelII_CCS_11kb ~29x NIST_NA12878_HG001_HiSeq_300x ~300x +NA24385/HG002 Ashkenazim Jewish Son PacBio_CCS_10kb ~30x NIST_HiSeq_HG002_Homogeneity-10953946 ~300x + PacBio_CCS_15kb ~32x + PacBio_CCS_15kb_20kb_chemistry2 ~56x +NA24149/HG003 Ashkenazim Jewish Father PacBio_CCS_Google_15kb ~22x NIST_HiSeq_HG003_Homogeneity-12389378 ~300x + PacBio_CCS_15kb_20kb_chemistry2 ~68x +NA24143/HG004 Ashkenazim Jewish Mother PacBio_CCS_Google_15kb ~21x NIST_HiSeq_HG004_Homogeneity-14572558 ~300x + PacBio_CCS_15kb_20kb_chemistry2 ~69x + PacBio_CCS_HudsonAlpha_15kb_21kb ~48x +NA24631/HG005 Chinese Son PacBio_SequelII_CCS_11kb ~32x HG005_NA24631_son_HiSeq_300x ~300x +NA24694/HG006 Chinese Father PacBio_CCS_15kb_20kb_chemistry2 ~46x NA24694_Father_HiSeq100x ~100x + PacBio_HiFi_Google ~27x +NA24695/HG007 Chinese Mother PacBio_CCS_15kb_20kb_chemistry2 ~46x NA24695_Mother_HiSeq100x ~100x + PacBio_HiFi_Google ~22x diff --git a/assets/paper/tables/Supplemental_Table_S2.tsv b/assets/paper/tables/Supplemental_Table_S2.tsv new file mode 100644 index 0000000..1d7cc65 --- /dev/null +++ b/assets/paper/tables/Supplemental_Table_S2.tsv @@ -0,0 +1,28 @@ +# The number of telomeric reads on each arm identified in GIAB PacBio CCS datasets +Telomere Reference contig HG001 HG002 HG003 HG004 HG005 HG006 HG007 TOTAL +2p chr2 1 34 6 8 3 1 2 55 
+3p 3ptel_1-500K_1_12_12 3 29 8 11 0 7 11 69 +4p 4ptel_1-500K_1_12_12 19 116 41 21 25 9 5 236 +5p chr5 3 29 6 0 4 3 1 46 +9p chr9 12 18 2 2 3 1 0 38 +12p chr12 4 11 10 6 4 7 4 46 +16p chr16 0 0 1 7 0 1 2 11 +17p 17ptel_1_500K_1_12_12 3 47 26 18 12 19 18 143 +19p 19ptel_1-500K_1_12_12 0 1 0 0 0 0 0 1 + p TOTAL 45 285 100 73 51 48 43 645 +1q 1qtel_1-500K_1_12_12_rc 0 0 2 1 0 0 1 4 +2q 2qtel_1-500K_1_12_12_rc 0 0 1 0 0 0 0 1 +5q 5qtel_1-500K_1_12_12_rc 0 0 0 0 1 0 0 1 +6q 6qtel_1-500K_1_12_12_rc 0 0 0 1 0 0 0 1 +7q chr7 6 48 10 11 1 7 9 92 +8q chr8 5 33 6 8 3 7 2 64 +10q 10qtel_1-500K_1_12_12_rc 0 0 0 1 0 0 0 1 +11q chr11 4 39 24 11 8 8 13 107 +12q chr12 5 32 4 5 5 5 12 68 +14q 14qtel_1-500K_1_12_12_rc 3 38 19 4 3 5 4 76 +15q chr15 19 6 2 4 11 0 0 42 +16q 16qtel_1-500K_1_12_12_rc 0 1 0 0 0 1 0 2 +17q 17qtel_1-500K_1_12_12v2_rc 0 2 0 0 0 1 0 3 +18q 18qtel_1-500K_1_12_12_rc 3 51 14 5 2 7 3 85 + q TOTAL 45 250 82 51 34 41 44 547 +TOTAL 90 535 182 124 85 89 87 1192 diff --git a/assets/paper/tables/Supplemental_Table_S3.tsv b/assets/paper/tables/Supplemental_Table_S3.tsv new file mode 100644 index 0000000..53a0c97 --- /dev/null +++ b/assets/paper/tables/Supplemental_Table_S3.tsv @@ -0,0 +1,17 @@ +# Significantly enriched repeating motifs in telomeric candidate reads in short-read sequencing experiments, +# subset to motifs also observed in PacBio telomeric reads, with respect to reverse-complemented equivalence. 
+Motif Illumina datasets 10X datasets + Median score Adjusted p-value Median score Adjusted p-value +TTAGGG 0.29907 1.55e-65 0.46171 1.55e-65 +TGAGGG 0.00748 1.55e-65 0.01852 1.55e-65 +TTAGGGG 0.00301 1.55e-65 0.00502 1.55e-65 +TTCGGG 0.00254 8.74e-63 0.00212 9.66e-43 +TTTAGGG 0.00062 1.43e-54 0.00088 2.27e-57 +TAGGG 0.00062 1.55e-65 0.00102 1.55e-65 +TTAGGGTTAGGGG 0.00053 2.32e-51 0.00102 1.55e-65 +TTGGG 0.00050 9.35e-65 0.00099 1.55e-65 +TTAGGGTAGGG 0.00031 9.44e-40 0.00050 3.36e-65 +TTAGG 0.00024 1.81e-34 0.00021 2.46e-55 +TTAGGTTAGGG 0.00007 4.44e-18 0.00009 1.74e-47 +TTAGGGGG 0.00004 2.26e-14 0.00006 9.61e-40 +TGGG NA ns 0.00079 1.55e-65 diff --git a/assets/paper/tables/Table_1.tex b/assets/paper/tables/Table_1.tex new file mode 100644 index 0000000..6f33d4a --- /dev/null +++ b/assets/paper/tables/Table_1.tex @@ -0,0 +1,41 @@ +\begin{landscape} \begin{samepage} \begin{table}[h!] \scriptsize \begin{tabular}{lllllllllllllllll} +\hline +\textbf{Arm} & \textbf{Motif} & \multicolumn{7}{l}{\textbf{Percentage of sequence explainable by motif, \%}} & \multicolumn{7}{l}{\textbf{Score}} & \textbf{Combined adjusted} \\ +\textbf{} & \textbf{} & \textbf{HG001} & \textbf{HG002} & \textbf{HG003} & \textbf{HG004} & \textbf{HG005} & \textbf{HG006} & \textbf{HG007} & \textbf{HG001} & \textbf{HG002} & \textbf{HG003} & \textbf{HG004} & \textbf{HG005} & \textbf{HG006} & \textbf{HG007} & \textbf{p value} \\ +\hline +p & CCCTAA & 76.9 & 67.9 & 56.4 & 60.0 & 64.4 & 41.3 & 27.0 & 0.6395 & 0.6149 & 0.4514 & 0.4678 & 0.5387 & 0.3257 & 0.1935 & 9.33e-113 \\ +\textbf{} & CCCCTAA & 4.1 & 4.3 & 4.1 & 4.6 & 3.6 & 3.3 & 3.8 & 0.0113 & 0.0109 & 0.0096 & 0.0118 & 0.0092 & 0.0087 & 0.0116 & 6.41e-92 \\ +\textbf{} & CCTAA & 2.0 & 1.6 & 2.6 & 3.4 & 1.4 & 1.6 & 1.0 & 0.0051 & 0.0043 & 0.0067 & 0.0091 & 0.0036 & 0.0041 & 0.0025 & 8.55e-74 \\ +\textbf{} & CCCCTAACCCTAA & 3.5 & 2.4 & 3.9 & 4.3 & 3.2 & 3.3 & 2.9 & 0.0051 & 0.0030 & 0.0051 & 0.0056 & 0.0043 & 0.0044 & 0.0039 & 5.93e-83 \\ +\textbf{} & 
CCCTA & 2.5 & 1.2 & 2.2 & 2.7 & 2.4 & 1.6 & 1.1 & 0.0053 & 0.0027 & 0.0047 & 0.0058 & 0.0055 & 0.0033 & 0.0023 & 1.23e-67 \\ +\textbf{} & CCCGAA & 1.3 & 0.4 & 0.3 & 0.6 & 0.3 & 0.3 & 1.0 & 0.0085 & 0.0029 & 0.0019 & 0.0045 & 0.0022 & 0.0020 & 0.0068 & 1.36e-21 \\ +\textbf{} & CCCTAACCTAA & 3.0 & 2.7 & 4.1 & 5.0 & 2.3 & 2.5 & 1.2 & 0.0043 & 0.0036 & 0.0055 & 0.0067 & 0.0031 & 0.0032 & 0.0016 & 2.72e-77 \\ +\textbf{} & CCCTACCCTAA & 3.4 & 1.7 & 2.8 & 3.1 & 2.8 & 2.1 & 1.2 & 0.0038 & 0.0016 & 0.0028 & 0.0033 & 0.0034 & 0.0021 & 0.0014 & 7.27e-65 \\ +\textbf{} & CCCTAAA & 1.2 & 0.9 & 0.8 & 1.1 & 0.7 & 0.7 & 0.9 & 0.0034 & 0.0019 & 0.0012 & 0.0024 & 0.0011 & 0.0011 & 0.0018 & 8.79e-46 \\ +\textbf{} & CCCAA & 1.5 & 0.4 & 1.2 & 1.3 & 1.3 & 0.9 & 1.0 & 0.0018 & 0.0005 & 0.0018 & 0.0018 & 0.0019 & 0.0013 & 0.0017 & 1.37e-43 \\ +\textbf{} & CCCA & 0.4 & 0.1 & 0.4 & 0.4 & 0.5 & 0.3 & 0.3 & 0.0006 & 0.0001 & 0.0008 & 0.0007 & 0.0009 & 0.0005 & 0.0006 & 1.36e-75 \\ +\textbf{} & CCCCCTAA & 0.2 & 0.1 & 0.4 & 0.4 & 0.1 & 0.4 & 0.5 & 0.0003 & 0.0002 & 0.0005 & 0.0005 & 0.0002 & 0.0008 & 0.0011 & 2.08e-67 \\ +q & TTAGGG & 51.7 & 79.9 & 79.6 & 74.8 & 75.3 & 80.1 & 49.1 & 0.4217 & 0.7194 & 0.6298 & 0.6008 & 0.6043 & 0.6391 & 0.3471 & 8.16e-112 \\ +\textbf{} & TGAGGG & 1.1 & 2.5 & 0.7 & 1.4 & 4.2 & 2.5 & 8.0 & 0.0066 & 0.0150 & 0.0033 & 0.0081 & 0.0248 & 0.0148 & 0.0476 & 2.12e-56 \\ +\textbf{} & TTAGGGG & 4.6 & 3.2 & 6.5 & 5.2 & 3.5 & 5.7 & 6.6 & 0.0147 & 0.0099 & 0.0172 & 0.0126 & 0.0100 & 0.0149 & 0.0213 & 8.85e-100 \\ +\textbf{} & TTAGG & 1.2 & 1.5 & 3.8 & 3.8 & 1.9 & 4.0 & 1.9 & 0.0031 & 0.0038 & 0.0104 & 0.0101 & 0.0048 & 0.0106 & 0.0051 & 3.62e-94 \\ +\textbf{} & TTAGGGTTAGGGG & 3.3 & 2.5 & 6.4 & 5.4 & 3.0 & 6.0 & 4.6 & 0.0049 & 0.0037 & 0.0083 & 0.0073 & 0.0039 & 0.0077 & 0.0062 & 1.62e-85 \\ +\textbf{} & TTAGGTTAGGG & 1.6 & 2.4 & 5.8 & 6.1 & 2.8 & 5.9 & 2.3 & 0.0022 & 0.0032 & 0.0078 & 0.0081 & 0.0038 & 0.0081 & 0.0029 & 5.54e-74 \\ +\textbf{} & TAGGG & 1.9 & 1.4 & 2.9 & 2.3 & 
3.1 & 3.1 & 1.8 & 0.0036 & 0.0027 & 0.0062 & 0.0048 & 0.0065 & 0.0065 & 0.0037 & 8.14e-86 \\ +\textbf{} & TAGGGTTAGGG & 2.8 & 2.0 & 3.7 & 3.0 & 3.8 & 4.0 & 2.2 & 0.0040 & 0.0019 & 0.0034 & 0.0029 & 0.0048 & 0.0042 & 0.0028 & 1.29e-64 \\ +\textbf{} & TTTAGGG & 1.0 & 0.8 & 1.3 & 1.0 & 1.1 & 1.3 & 1.3 & 0.0030 & 0.0018 & 0.0026 & 0.0015 & 0.0031 & 0.0026 & 0.0029 & 1.12e-49 \\ +\textbf{} & TTGGG & 1.1 & 0.5 & 1.8 & 1.7 & 2.5 & 1.5 & 1.0 & 0.0015 & 0.0008 & 0.0029 & 0.0025 & 0.0040 & 0.0021 & 0.0016 & 4.72e-69 \\ +\textbf{} & GCGGC & 0.5 & 0.7 & 0.9 & 0.8 & 0.9 & 0.9 & 0.6 & 0.0012 & 0.0018 & 0.0022 & 0.0020 & 0.0022 & 0.0021 & 0.0015 & 6.16e-64 \\ +\textbf{} & TTGGGTTAGGG & 1.4 & 0.5 & 2.0 & 2.0 & 2.6 & 1.8 & 0.8 & 0.0011 & 0.0003 & 0.0013 & 0.0018 & 0.0022 & 0.0013 & 0.0007 & 9.24e-30 \\ +\textbf{} & TTAGGGTTTAGGG & 0.7 & 0.9 & 1.6 & 1.4 & 1.1 & 1.4 & 1.3 & 0.0006 & 0.0008 & 0.0016 & 0.0013 & 0.0009 & 0.0011 & 0.0011 & 2.26e-35 \\ +\textbf{} & TGGG & 0.3 & 0.1 & 0.5 & 0.7 & 1.1 & 0.5 & 0.4 & 0.0004 & 0.0002 & 0.0009 & 0.0013 & 0.0024 & 0.0008 & 0.0007 & 9.59e-75 \\ +\hline +\end{tabular} +\caption{ + \small Significantly enriched repeating motifs in telomeric regions of GIAB datasets HG001 through HG007. + See \hyperref[sec:methods]{Materials and Methods} for the definition of \textit{score}. +} +\label{tab:repeatfinder} +\end{table} +\end{samepage} +\end{landscape} diff --git a/assets/paper/tables/Table_2.tex b/assets/paper/tables/Table_2.tex new file mode 100644 index 0000000..fb1f796 --- /dev/null +++ b/assets/paper/tables/Table_2.tex @@ -0,0 +1,28 @@ +\begin{samepage} \begin{table}[h!] 
\small \begin{tabular}{llll} +\hline +\textbf{Telomere} & \textbf{Reference contig} & \multicolumn{2}{l}{\textbf{Cophenetic correlation}} \\ +\textbf{} & \textbf{} & \textbf{r} & \textbf{p} \\ +\hline +2p & chr2 & 0.631 & 6.8e-165 \\ +3p & 3ptel\_1-500K\_1\_12\_12 & 0.607 & 1.4e-235 \\ +4p & 4ptel\_1-500K\_1\_12\_12 & 0.490 & <1.0e-300 \\ +5p & chr5 & 0.760 & 2.4e-194 \\ +9p & chr9 & 0.734 & 7.3e-119 \\ +12p & chr12 & 0.783 & 2.5e-214 \\ +17p & 17ptel\_1\_500K\_1\_12\_12 & 0.937 & <1.0e-300 \\ +7q & chr7 & 0.838 & <1.0e-300 \\ +8q & chr8 & 0.928 & <1.0e-300 \\ +11q & chr11 & 0.630 & <1.0e-300 \\ +12q & chr12 & 0.881 & <1.0e-300 \\ +14q & 14qtel\_1-500K\_1\_12\_12\_rc & 0.842 & <1.0e-300 \\ +15q & chr15 & 0.915 & <1.0e-300 \\ +18q & 18qtel\_1-500K\_1\_12\_12\_rc & 0.682 & <1.0e-300 \\ +\hline +\end{tabular} +\caption{ + \small Measures of cophenetic correlation (Pearson's \textit{r} and adjusted \textit{p}-value) + between the hierarchical clustering and the pairwise distance matrix on each chromosomal arm. +} +\label{tab:cophenetic} +\end{table} +\end{samepage} diff --git a/assets/paper/tables/Table_3.tex b/assets/paper/tables/Table_3.tex new file mode 100644 index 0000000..690f3f8 --- /dev/null +++ b/assets/paper/tables/Table_3.tex @@ -0,0 +1,24 @@ +\begin{samepage} \begin{table}[h!] 
\small \begin{tabular}{ll} +\hline +\textbf{Comparison} & \textbf{Adjusted p-value} \\ +\hline +A subject's reads are closer to each other than to other subjects' reads in the trio & 4.2e-56 \\ +A subject's reads are closer to each other than to subjects' reads in other populations & 7.6e-107 \\ +Reads within a population are closer to each other than to reads in other populations & 2.2e-40 \\ +Ashkenazim trio: & \textbf{} \\ +\hspace{.5cm} Father's reads are closer to son's reads than to mother's reads & 3.1e-11 \\ +\hspace{.5cm} Mother's reads are closer to son's reads than to father's reads & ns (1.00) \\ +Chinese trio: & \textbf{} \\ +\hspace{.5cm} Father's reads are closer to son's reads than to mother's reads & 3.4e-02 \\ +\hspace{.5cm} Mother's reads are closer to son's reads than to father's reads & ns (0.23) \\ +\hline +\end{tabular} +\caption{ + \small Adjusted \textit{p}-values of the Wilcoxon signed-rank tests on relative Levenshtein distances. + For each read among all telomeric reads on each arm, + closest distances to groups of reads described in the \textit{Comparison} column are compared + (see \hyperref[sec:methods]{Materials and Methods}). 
+} +\label{tab:haptests} +\end{table} +\end{samepage} diff --git a/assets/paper/tools/preformat-table.py b/assets/paper/tools/preformat-table.py new file mode 100755 index 0000000..1c7abe7 --- /dev/null +++ b/assets/paper/tools/preformat-table.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +from sys import argv +from pandas import read_csv +from subprocess import Popen, PIPE +from itertools import cycle, tee + + +def get_header_height(filename): + with open(filename, mode="rt") as handle: + for i, line in enumerate(handle): + if line[0] != "#": + break + return i + + +def RunCounter(iterable): + previous_value = None + current_count = 1 + for value in iterable: + if previous_value is None: + previous_value = value + elif value == previous_value: + current_count += 1 + else: + yield (previous_value, current_count) + current_count = 1 + previous_value = value + yield (previous_value, current_count) + + +def init_table(ncols): + return ( + r'\begin{samepage} \begin{table}[h!] \small \begin{tabular}{' + + "l"*ncols + "}\n\\hline" + ) + + +def preformat_header(tsv, level): + name_counts = RunCounter(tsv.columns.get_level_values(level)) + for name, count in name_counts: + realname = ( + "" if name.startswith("Unnamed:") else name.replace("_", r'\_') + ) + if count == 1: + yield r'\textbf{'+realname+"}" + else: + yield r'\multicolumn{'+str(count)+r'}{l}{\textbf{'+realname+"}}" + + +def preformat_row(row, factors_tee, fmts_tee): + for label, value in row.items(): + is_p = False + if isinstance(label, (tuple, list)): + if len(set(label) & {"p", "p_adjusted"}): + is_p = True + elif label in {"p", "p_adjusted"}: + is_p = True + if is_p: + try: + float_value = float(value) + except: + yield value.replace("_", r'\_') + else: + if float(value) == 0.0: + yield r'<1e-300' + elif float(value) == 1.0: + yield "1.00" + else: + yield format(float(value), ".2e") + else: + try: + float_value = float(value) + except ValueError: + yield value.replace("_", r'\_') + else: + yield 
format(float_value*next(factors_tee), next(fmts_tee)) + + +def columnize(list_of_iterables): + raw_rows = [] + for iterable in list_of_iterables: + if isinstance(iterable, str) and (iterable == r'\hline'): + raw_rows.append(iterable) + else: + raw_rows.append("\t&\t".join(iterable)+" \\\\\n") + column_t = Popen( + ["column", "-t", "-s\t"], stdin=PIPE, stdout=PIPE, + universal_newlines=True, + ) + return column_t.communicate(input="\n".join(raw_rows))[0].rstrip("\n") + + +def finish_table(): + return "\n".join([ + r'\hline', r'\end{tabular}', r'\caption{}', + r'\label{}', r'\end{table}', r'\end{samepage}', + ]) + + +def main(filename, factors, fmts): + header_height = get_header_height(filename) + tsv = read_csv( + filename, sep="\t", escapechar="#", + header=list(range(header_height)), + dtype=str, + ) + print(init_table(ncols=tsv.shape[1])) + print(columnize(( + [preformat_header(tsv, n) for n in range(header_height)] + + [r'\hline'] + [ + preformat_row(row, tee(factors)[0], tee(fmts)[0]) + for _, row in tsv.iterrows() + ] + ))) + print(finish_table()) + return 0 + + +if __name__ == "__main__": + if len(argv) > 3: + factors = list(float(f) for f in argv[2].split(";")) + fmts = argv[3].split(";") + elif len(argv) > 2: + if argv[2] == "Table_1": + factors = [100]*7 + [1]*7 + fmts = [".1f"]*7 + [".4f"]*7 + else: + factors = list(float(f) for f in argv[2].split(";")) + fmts = cycle([".6f"]) + else: + factors = cycle([1]) + fmts = cycle([".6f"]) + returncode = main(argv[1], factors, fmts) + exit(returncode) diff --git a/publications/methods-paper/tex2office b/assets/paper/tools/tex2office similarity index 92% rename from publications/methods-paper/tex2office rename to assets/paper/tools/tex2office index abc3d88..79ffbf7 100755 --- a/publications/methods-paper/tex2office +++ b/assets/paper/tools/tex2office @@ -58,6 +58,10 @@ ARG_RULES = { help="lines to replace with {}".format(PXFONTS), nargs="+", metavar="P", default="\\usepackage[sfdefault]{roboto}" ), + 
("--rem-dot-dot", "--rdd",) : dict( + help="remove all instances of '../' from the LaTeX source", + action="store_true", + ), ("-d", "--debug",): dict(help="debug mode", action="store_true"), } @@ -97,17 +101,22 @@ def parse_and_validate_args(parser): return args, TempDir -def prepare_source(tex, workdir, replace_fonts, debug, basename="source"): +def prepare_source(tex, workdir, replace_fonts, rem_dot_dot, debug, basename="source"): """Creates a copy of the source LaTeX file and modifies it for improved compatibility""" with open(tex, mode="rt") as tex_in: with open(path.join(workdir, basename+".tex"), mode="wt") as tex_out: for line in map(str.strip, tex_in): if line in replace_fonts: line = PXFONTS - elif search(r'^\s*\\\\%rem\s*$', line): - continue - elif search(r'\\\\%rem', line): - line = sub(r'\\\\%rem', " ", line) + else: + if search(r'^\s*\\\\%rem\s*$', line): + continue + elif search(r'%remall', line): + continue + elif search(r'\\\\%rem', line): + line = sub(r'\\\\%rem', " ", line) + if rem_dot_dot and search(r'\.\.\/', line): + line = sub(r'\.\.\/', "", line) line = sub(r'\\textcolor{white}\S*', "", line) print(line, file=tex_out) return basename @@ -180,7 +189,11 @@ def to_base64(soup, workdir, debug): for img in soup.find_all("img"): extension = img["src"].split(".")[-1].lower() if extension in BASE64_PREFIXES: - with open(path.join(workdir, img["src"]), mode="rb") as img_bytes: + if path.exists(img["src"]): + img_path = img["src"] + else: + img_path = path.join(workdir, img["src"]) + with open(img_path, mode="rb") as img_bytes: b64 = b64encode(img_bytes.read()) img["src"] = "data:image/{};base64,{}".format( BASE64_PREFIXES[extension], b64.decode(), @@ -307,11 +320,11 @@ def main(args, TempDir, prefix="converted"): """Dispatches data and arguments to subroutines""" with TempDir() as workdir: basename = prepare_source( - args.tex, workdir, args.replace_fonts, args.debug + args.tex, workdir, args.replace_fonts, args.rem_dot_dot, args.debug, ) if 
args.include: for filename in args.include: - mklink(filename, workdir) + mklink(filename.split("/")[0], workdir) xhlatex(basename, args.include, workdir, args.debug) soup = simplify_html(basename, workdir, args.debug) soup_with_PIC_fix = adjust_PICs( diff --git a/edgecase b/edgecase index 21b07c0..7ce2f18 100755 --- a/edgecase +++ b/edgecase @@ -18,11 +18,9 @@ Commmands (): kmerscanner perform scan of known kmers/motifs densityplot visualize densities of candidate motifs -Combined "pipeline" commands: - basic-pipeline-longread extract candidate long reads, discover motifs, plot - Development area: - levenshtein cluster long reads by edit distance + entropy calculate motif entropy among long reads + levenshtein calculate pairwise edit distance among long reads """ diff --git a/edgecaselib/__init__.py b/edgecaselib/__init__.py index ff3d4be..5674fa5 100644 --- a/edgecaselib/__init__.py +++ b/edgecaselib/__init__.py @@ -1,2 +1,2 @@ -from . import tailpuller, levenshtein, tailchopper, repeatfinder, kmerscanner -from . import densityplot, basic_pipeline_longread +from . import tailpuller, tailchopper, repeatfinder, kmerscanner, densityplot +from . 
import entropy, levenshtein diff --git a/edgecaselib/assets.py b/edgecaselib/assets.py deleted file mode 100644 index a190707..0000000 --- a/edgecaselib/assets.py +++ /dev/null @@ -1 +0,0 @@ -M19947_HMMGZ = b'+,^C)Jk=m*!X$];:MUq?\'ZBJ;.Jb%RHM[U-p*$Z&-/48WE[8U1KH(,+QT`*5L\\KEg\\2ZK$SfJV+oD_O1E4pVjjh\'R4m^RLi=POph\\iW+KqY8.EGWjsHjDt:kZAbE0b;AHcFK9SHhn,8:7s+`)o`qZ\\5LMmp^FUE?:YR>BXVBSpGWoB^GnFedB)_aE*/_dZr8\'(jh5;<#[+D;/Z,e8<0@cO4^--55G-(X=*fY[LMr8jkobM[4@[#-mq>ME:#SmW\\$4\'Z:_03>L8"ItLOmUZ"*F2CRfg@`\'GXipJ.>eBtVXo\\kE#/LWPr#s%"uomuE4Q65?"7ib#%Y=\'49"@+UZ"2/M9VJ?U.XoXrn0k@j"d*\'Eo0,p#;)FLA@/R`XckQG1c"8&lTd)unY?"\\,f%V3/rOQeek0099Llp.H6%%49qnh)ZK&p+GqkrOIGFWf.UL.p3M9\\1.;>mKS)-_QfK>5jAC;+i%A0(\'@Ip3(l#bFNP#^0P*M!eQ\\\'YQ9?86*LHHc5@*t!.ohhDus;h)7i3N+Lj04"u;`\'A3*jg01YF+LJA6hC^bo^e&tT]cm8[i\'*b@u"9^*)?m*o)6`-%W*\'+:MW;sV2#(!6K]i52^WIWGHK@;8709q)p.ZinZ+k%l*09q)p.R5YPJR0fk09q)p.c>Eki,f=Q94ac_#%Tc)aE.X594OW]#1-"QTU,l094OW]"ur/.31U*BdMV2E!L_4M17\\I5i,lfnF90b3@%`heA@c0iF90b3@%`hed*#giQN>IV@%`he3*e3t>laLP)!GN4R4]@4>lsXR)b4LTUZ0)h[&YF9r"p.DUlV@@30$[/tq1K4"]_$\\RC!oQ&HaJ*CnTGWEGW[Hi#C?q-#-bX`=C5d3!L"u;`\'A3+_gcoK";$UAYb#Yq$c,X[muS;q#IMF[sT1XJbu17^123X+`D1=/YtkbU&]9*OOU1=/[%7Z&nD8.c$hV)$Cj"YuW(A3\'nXV)$Cj"mDd58REg@Wk7!b2!2g2/V[FWr"[f-hAu*Lt1*]5!r)Mp];t]WWi5>6,k60,Mj8,fgoD%k<:N129"`_!O!!' 
diff --git a/edgecaselib/basic_pipeline_longread.py b/edgecaselib/basic_pipeline_longread.py deleted file mode 100644 index 730d85f..0000000 --- a/edgecaselib/basic_pipeline_longread.py +++ /dev/null @@ -1,136 +0,0 @@ -from os import path, walk -from numpy import inf -from edgecaselib import tailpuller, tailchopper, repeatfinder -from edgecaselib import kmerscanner, densityplot -from edgecaselib.formats import EmptyKmerscanError -from gzip import open as gzopen - -__doc__ = """edgeCase basic pipeline: select reads, find enriched motifs, plot - -Usage: {0} basic-pipeline-longread -x filename -o dirname [--force] [-j integer] - {1} [-m integer] [-n integer] [--target targetspec] - {1} [-q integer] [--min-k integer] [--max-k integer] - {1} [--jellyfish filename] [--jellyfish-hash-size string] - {1} [--max-p-adjusted float] [--window-size integer] - {1} [--n-boot integer] [--palette palettespec] - {1} [--title string] - -Output (in --output-dir): - * tailpuller.sam candidate reads - * tailchopper.sam telomeric regions - * repeatfinder-p_arm.tsv overrepresented motifs on the p arm - * repeatfinder-q_arm.tsv overrepresented motifs on the q arm - * kmerscanner-p_arm.dat.gz motif densities on the p arm - * kmerscanner-q_arm.dat.gz motif densities on the q arm - * densityplot-p_arm.pdf plot of motif densities on the p arm - * densityplot-q_arm.pdf plot of motif densities on the q arm - -Positional arguments: - name of input BAM/SAM file; must have a .bai index - -Required options: - -x, --index [filename] location of the reference .ecx index - -o, --output-dir [dirname] name of the output directory (must exist) - -Options: - --force force overwrite files in --output-dir - -j, --jobs [integer] number of parallel jobs (for jellyfish and kmerscanner) [default: 1] - -m, --max-read-length [integer] maximum read length to consider when selecting lookup regions - -n, --max-motifs [integer] maximum number of motifs to report [default: 4] - --target [targetspec] an ECX flag for 
heads/tails [default: tract_anchor] - --min-k [integer] smallest target repeat length [default: 4] - --max-k [integer] largest target repeat length [default: 16] - --max-p-adjusted [float] cutoff adjusted p-value [default: .05] - --jellyfish [filename] jellyfish binary (unless in $PATH) - --jellyfish-hash-size [string] jellyfish initial hash size [default: 2G] - --window-size [integer] size of the window (for kmerscanner and densityplot) [default: 100] - --n-boot [integer] number of bootstrap iterations for plotting [default: 1000] - --palette [palettespec] custom palette for plotting motifs - --title [string] title for the density plots - -Input filtering options: - -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] -""" - -__docopt_converters__ = [ - lambda jobs: int(jobs), - lambda max_read_length: - inf if (max_read_length is None) else int(max_read_length), - lambda max_motifs: int(max_motifs), - lambda min_k: int(min_k), - lambda max_k: int(max_k), - lambda max_p_adjusted: float(max_p_adjusted), - lambda window_size: int(window_size), - lambda n_boot: int(n_boot), - lambda min_quality: None if (min_quality is None) else int(min_quality), -] - -__docopt_tests__ = { - lambda bam: - path.isfile(bam + ".bai"): "BAM index ({}.bai) not found", - lambda output_dir: - path.isdir(output_dir): "{} does not exist or is not a directory", - lambda output_dir, force: - force or (len(next(walk(output_dir))[2]) == 0): - "{} already has files in it", - lambda max_read_length: - max_read_length > 0: "--max-read-length below 0", - lambda target: - target in {"ucsc_mask_anchor", "fork", "tract_anchor"}: - "unknown value of --target", - lambda min_k, max_k: - 0 < min_k < max_k: "not satisfied: 0 < m < M", -} - - -def main(bam, index, output_dir, jobs, max_read_length, max_motifs, target, min_k, max_k, max_p_adjusted, jellyfish, jellyfish_hash_size, window_size, n_boot, palette, title, min_quality, **kwargs): - """basic pipeline: select reads, 
find enriched motifs, plot""" - get_filename = lambda fn: path.join(output_dir, fn) - tailpuller_sam = get_filename("tailpuller.sam") - with open(tailpuller_sam, mode="wt") as sam: - tailpuller.main( - bam, index, flags=0, flag_filter=3844, - min_quality=min_quality, max_read_length=max_read_length, file=sam, - ) - tailchopper_sam = get_filename("tailchopper.sam") - with open(tailchopper_sam, mode="wt") as sam: - tailchopper.main( - tailpuller_sam, index, target=target, flags=target, - flag_filter=3844, min_quality=min_quality, file=sam, - ) - for arm, f, F in [("p", target, "is_q|3840"), ("q", target+"|is_q", 3840)]: - repeatfinder_tsv = get_filename("repeatfinder-{}_arm.tsv".format(arm)) - with open(repeatfinder_tsv, mode="wt") as tsv: - repeatfinder.main( - tailchopper_sam, fmt="sam", - flags=f, flag_filter=F, collapse_reverse_complement=False, - min_quality=min_quality, min_k=min_k, max_k=max_k, - max_motifs=max_motifs, max_p_adjusted=max_p_adjusted, - no_context=False, jellyfish=jellyfish, min_repeats=2, - jellyfish_hash_size=jellyfish_hash_size, jobs=jobs, file=tsv, - ) - kmerscanner_dat = get_filename("kmerscanner-{}_arm.dat.gz".format(arm)) - with gzopen(kmerscanner_dat, mode="wt") as dat: - kmerscanner.main( - tailpuller_sam, fmt="sam", flags=f, flag_filter=F, - min_quality=min_quality, motif_file=repeatfinder_tsv, - head_test=None, tail_test=None, cutoff=None, num_reads=None, - window_size=window_size, jobs=jobs, file=dat, - ) - densityplot_pdf = get_filename("densityplot-{}_arm.pdf".format(arm)) - with open(densityplot_pdf, mode="wb") as pdf: - try: - if title is None: - plot_title = None - else: - plot_title = "{}, {} arm".format(title, arm) - densityplot.main( - kmerscanner_dat, gzipped=True, index=index, flags=f, - flag_filter=F, min_quality=min_quality, - bin_size=window_size, n_boot=n_boot, exploded=False, - zoomed_in=False, palette=palette, title=plot_title, - file=pdf, - ) - except EmptyKmerscanError: - pass - return 0 diff --git 
a/edgecaselib/densityplot.py b/edgecaselib/densityplot.py index bbb958a..7ee919f 100644 --- a/edgecaselib/densityplot.py +++ b/edgecaselib/densityplot.py @@ -1,27 +1,27 @@ from sys import stdout from edgecaselib.formats import load_index, load_kmerscan from edgecaselib.formats import FLAG_COLORS, explain_sam_flags, interpret_flags -from edgecaselib.formats import DEFAULT_MOTIF_COLORS, PAPER_PALETTE, BGCOLOR +from edgecaselib.formats import TOL_COLORSCHEME, PAPER_PALETTE, BGCOLOR from collections import OrderedDict from edgecaselib.util import natsorted_chromosomes, progressbar, motif_revcomp -from matplotlib.pyplot import subplots, rc_context, switch_backend -from matplotlib.backends.backend_pdf import PdfPages -from matplotlib.patches import Rectangle, Patch -from numpy import clip +from matplotlib.pyplot import subplots, switch_backend, cm +from matplotlib.patches import Patch +from numpy import clip, arange from seaborn import lineplot -from itertools import count -from os import path, getenv +from os import path from pandas import concat from re import search +from pickle import dump +buffer = getattr(stdout, "buffer", stdout) __doc__ = """edgeCase densityplot: visualization of motif densities -Usage: {0} densityplot -x filename [-b integer] [-e] [--zoomed-in] +Usage: {0} densityplot -x filename [-b integer] [--plot-coverage] {1} [--palette palettespec] [--title string] - {1} [--n-boot integer] - {1} [-f flagspec] [-F flagspec] [-q integer] - {1} [-z] + {1} [--n-boot integer] [--chroms-to-plot string] + {1} [-f flagspec]... [-F flagspec]... 
[-q integer] + {1} [--figwidth-inches float] [--outfmt string] [-z] Output: PDF file with motif densities visualized along chromosomal ends @@ -33,13 +33,15 @@ -x, --index [filename] location of the reference .ecx index Options: - -z, --gzipped input is gzipped (must specify if any of -qfF present - -b, --bin-size [integer] size of each bin in bp for visualization speedup [default: 100] + -z, --gzipped input is gzipped (must specify if any of -qfF present) + -b, --bin-size [integer] size of each bin in bp (overrides bin size in ) --n-boot [integer] number of bootstrap iterations for 95% confidence intervals [default: 1000] - -e, --exploded plot each read separately - --zoomed-in plot taller traces, cut off pre-anchor regions --palette [palettespec] custom palette for plotting motifs --title [string] figure title (defaults to input filename) + --chroms-to-plot [string] if set, plot chromosomes from this comma-separated list unconditionally + --plot-coverage plot coverage by telomeric reads on each arm + --figwidth-inches [float] width of figure in inches [default: 13] + --outfmt [string] output format (pdf, pkl) [default: pdf] Input filtering options: -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] @@ -48,12 +50,16 @@ """ __docopt_converters__ = [ - lambda bin_size: int(bin_size), + lambda bin_size: None if bin_size is None else int(bin_size), lambda n_boot: int(n_boot), + lambda figwidth_inches: float(figwidth_inches), ] - -DENSITYPLOT_FIGWIDTH = 17 +__docopt_tests__ = { + lambda outfmt: + outfmt in {"pdf", "pkl"}: + "--outfmt must be either 'pdf' or 'pkl'", +} def simplify_axes(ax, keep=(), keep_scientific=False): @@ -65,35 +71,13 @@ def simplify_axes(ax, keep=(), keep_scientific=False): ax.ticklabel_format(useOffset=False, style="plain") -def motif_subplots(nreads, chrom, max_mapq): - """Prepare figure with subplots for each read""" - page, axs = subplots( - ncols=2, nrows=nreads, squeeze=False, - 
figsize=(DENSITYPLOT_FIGWIDTH, nreads*2/3), sharey=True, frameon=False, - gridspec_kw={"hspace": 0, "wspace": 0, "width_ratios": (15, 1)}, - ) - # remove subplot borders: - for ax in axs.flatten(): - simplify_axes(ax) - # remove xticks for all reads except bottom one: - for ax in axs[:-1, 0]: - ax.set(xticks=[]) - # annotate chromosome and MAPQ: - axs[-1, 0].set(xlabel="Chromosome {}".format(chrom)) - meta_twiny = axs[0, 1].twiny() - meta_twiny.set(title="MAPQ", xlim=(0, max_mapq)) - simplify_axes(meta_twiny) - axs[-1, 1].set(xticks=[]) - return page, axs - - -def chromosome_subplots(nrows, zoomed_in=False): +def chromosome_subplots(nrows, figwidth_inches, plot_coverage=False): """Prepare figure with subplots for each chromosome end""" - if zoomed_in: - figsize=(DENSITYPLOT_FIGWIDTH*.6, nrows*3*.65) - hspace = .35 + if plot_coverage: + figsize = (figwidth_inches*.7, nrows*2.2*.7*figwidth_inches/15) + hspace = .3 else: - figsize=(DENSITYPLOT_FIGWIDTH*.7, nrows*1.5*.7) + figsize = (figwidth_inches*.7, nrows*1.5*.7*figwidth_inches/15) hspace = .7 figure, axs = subplots( figsize=figsize, gridspec_kw={"hspace": hspace}, @@ -104,117 +88,31 @@ def chromosome_subplots(nrows, zoomed_in=False): return figure, axs -def plot_read_motif_densities(read_data, trace_ax, legend=False): - """Plot traces for motif densities of one read""" - trace_data = read_data.iloc[:,9:].T - trace_data.columns = read_data["motif"] - try: - lineplot(data=trace_data, ax=trace_ax, legend=legend, dashes=False) - except: - pass - if legend: - trace_ax.legend(loc="lower left", bbox_to_anchor=(0, 1.4)) - return trace_data - - -def highlight_mapped_region(read_data, trace_data, name, trace_ax): - """Plot a rectangle around mapped region of read""" - leftmost_map = read_data["pos"].iloc[0] - read_flag = read_data["flag"].iloc[0] - map_length = ( - trace_data.dropna().index.max() - - read_data["clip_3prime"].iloc[0] - - leftmost_map - ) - trace_ax.add_patch( - Rectangle( - (leftmost_map, -.1), map_length, 
1.2, - edgecolor="gray", facecolor="none", - ) - ) - trace_ax.text( - leftmost_map+map_length/2, 1, - name + "\n" + explain_sam_flags(read_flag), - verticalalignment="top", horizontalalignment="center", - ) - - -def plot_read_metadata(read_data, max_mapq, meta_ax): - """Annotate additional SAM information for read""" - mapq = read_data["mapq"].iloc[0] - meta_ax.add_patch( - Rectangle( - (0, .2), mapq, .6, - edgecolor="cornflowerblue", facecolor="cornflowerblue", - ) - ) - meta_ax.set(xlim=(0, max_mapq)) - - -def chromosome_exploded_motif_plot(binned_density_dataframe, ecx, chrom, max_mapq, title, samfilters): - """Render figure with all motif densities of all reads mapping to one chromosome""" - names = binned_density_dataframe["name"].drop_duplicates() - page, axs = motif_subplots(len(names), chrom, max_mapq) - pos_range = binned_density_dataframe.columns[9:] - for i, name, (trace_ax, meta_ax) in zip(count(), names, axs): - read_data = binned_density_dataframe[ - binned_density_dataframe["name"]==name - ] - legend = "full" if (i==0) else False - trace_data = plot_read_motif_densities(read_data, trace_ax, legend) - highlight_mapped_region(read_data, trace_data, name, trace_ax) - trace_ax.set( - xlim=(pos_range.min(), pos_range.max()), - ylim=(-.2, 1.2), yticks=[], - ) - plot_read_metadata(read_data, max_mapq, meta_ax) - plottable_flags = ecx.loc[ecx["rname"]==chrom, ["pos", "flag"]] - for _, pos, flag in plottable_flags.itertuples(): - trace_ax.axvline( - pos, -.2, 1.2, ls=":", lw=4, c=FLAG_COLORS[flag], alpha=.4, - ) - axs[0, 0].set( - title="{}\n-f={} -F={} -q={}".format(title, *samfilters), - ) - return page - - -def make_decorated_densities_iterator(densities): +def make_decorated_densities_iterator(densities, chroms_to_plot=None): """Order chromosomes and wrap with progress bar""" - sorted_chromosomes = natsorted_chromosomes(densities.keys()) + if chroms_to_plot: + sorted_chromosomes = natsorted_chromosomes( + set(densities.keys()) | 
set(chroms_to_plot.split(",")) + ) + else: + sorted_chromosomes = natsorted_chromosomes(densities.keys()) sorted_densities_iterator = ( - (chrom, densities[chrom]) for chrom in sorted_chromosomes + (chrom, densities.get(chrom)) for chrom in sorted_chromosomes ) - return progressbar( - sorted_densities_iterator, total=len(densities), + decorated_densities_iterator = progressbar( + sorted_densities_iterator, total=len(sorted_chromosomes), desc="Plotting", unit="chromosome", ) - - -def plot_exploded_densities(densities, ecx, title, samfilters, file=stdout.buffer): - """Plot binned densities as line plots, one read at a time""" - max_mapq = max(d["mapq"].max() for d in densities.values()) - decorated_densities_iterator = make_decorated_densities_iterator(densities) - with rc_context({"figure.max_open_warning": len(densities)+2}): - with PdfPages(file) as pdf: - for chrom, binned_density_dataframe in decorated_densities_iterator: - page = chromosome_exploded_motif_plot( - binned_density_dataframe, ecx, chrom, max_mapq, - title, samfilters, - ) - pdf.savefig(page, bbox_inches="tight") + return decorated_densities_iterator, len(sorted_chromosomes) def stack_motif_densities(binned_density_dataframe): """Prepare densities for plotting the stacked area chart; return normal order but stack from bottom""" - if "abundance" in binned_density_dataframe.columns: - motif_order = binned_density_dataframe[["motif", "abundance"]] \ - .drop_duplicates().sort_values(by="abundance") \ - ["motif"].drop_duplicates().values - else: - motif_order = binned_density_dataframe[["motif", "total_count"]] \ - .drop_duplicates().sort_values(by="total_count") \ - ["motif"].drop_duplicates().values + motif_order = ( + binned_density_dataframe[["motif", "score"]] + .drop_duplicates().sort_values(by="score")["motif"] + .drop_duplicates().values + ) skinny_bdf = binned_density_dataframe[ ["name", "motif"] + list(binned_density_dataframe.columns[9:]) ] @@ -236,8 +134,12 @@ def 
stack_motif_densities(binned_density_dataframe): def get_motif_data(plottable_df, motif): """Extract condensed averaged data for one motif (for fill_between)""" - return plottable_df[plottable_df["motif"]==motif].dropna(how="any") \ - .groupby(["motif", "position"], as_index=False).mean() + return ( + plottable_df[plottable_df["motif"]==motif] + .dropna(how="any") + .groupby(["motif", "position"], as_index=False) + .mean() + ) def shorten_chrom_name(chrom): @@ -277,7 +179,7 @@ def fill_area(plottable_df, i, updated_palette, ax): ) -def coverage_plot(plottable_df, motif_count, is_q, ax, y_offset=.1): +def coverage_plot(plottable_df, motif_count, is_q, ax, y_offset=.025): """Plot read coverage under area chart""" covered_positions = plottable_df.loc[ ~plottable_df["density"].isnull(), "position" @@ -289,43 +191,37 @@ def coverage_plot(plottable_df, motif_count, is_q, ax, y_offset=.1): coverage_df["viz_coverage"] = 1 + y_offset + .01 * clip( coverage_df["coverage"], a_min=1, a_max=50, ) - ax.plot( - [coverage_df["position"].min(), coverage_df["position"].max()], - [coverage_df["viz_coverage"].max()]*2, ls="--", color="gray" - ) - if is_q: - x, ha = coverage_df["position"].max(), "left" - ticks_mask, reads_label = " {}", "\n\nreads" - else: - x, ha = coverage_df["position"].min(), "right" - ticks_mask, reads_label = "{} ", "reads\n\n" - ax.text( - x=x, y=coverage_df["viz_coverage"].max(), va="center", ha=ha, - s=ticks_mask.format(coverage_df["coverage"].max()) + max_viz_coverage = coverage_df["viz_coverage"].max() + poly, = ax.fill( + coverage_df["position"], coverage_df["viz_coverage"], color="none", ) - ax.text(x=x, y=1+y_offset, va="center", ha=ha, s=ticks_mask.format(1)) - ax.text( - x=x, y=.5*(1+y_offset+coverage_df["viz_coverage"].max()), - va="center", ha=ha, rotation=90, s=reads_label, + gradient = arange( + coverage_df["coverage"].min(), coverage_df["coverage"].max(), .1, ) - ax.fill_between( - x=coverage_df["position"], y1=1+y_offset, - 
y2=coverage_df["viz_coverage"], step="pre", color="gray", alpha=.4, + img = ax.imshow( + gradient.reshape(gradient.size, 1), aspect="auto", origin="lower", + cmap=cm.bone, alpha=.6, vmin=-10, vmax=60, extent=[ + coverage_df["position"].min(), + coverage_df["position"].max(), + coverage_df["viz_coverage"].min(), + coverage_df["viz_coverage"].max(), + ], ) - return coverage_df["viz_coverage"].max() + img.set_clip_path(poly) + return max_viz_coverage def generate_updated_palette(palette, motif_order): """Given user palette and motifs in density file, generate palette that satisfies both""" updated_palette = OrderedDict() if palette is None: - if len(motif_order) > len(DEFAULT_MOTIF_COLORS): + if len(motif_order) > len(TOL_COLORSCHEME): error_msk = "Cannot plot over {} motifs with default palette; {}" raise ValueError(error_msk.format( - len(DEFAULT_MOTIF_COLORS), "please provide a custom --palette", + len(TOL_COLORSCHEME), "please provide a custom --palette", )) else: - for motif, color in zip(motif_order, DEFAULT_MOTIF_COLORS): + for motif, color in zip(motif_order, TOL_COLORSCHEME.values()): updated_palette[motif] = color updated_motif_order = motif_order else: @@ -339,7 +235,21 @@ def generate_updated_palette(palette, motif_order): return updated_palette, updated_motif_order -def plot_combined_density(binned_density_dataframe, n_boot, ecx, title, palette, target_anchor, is_q, display_chrom_name, ecx_chrom_name, zoomed_in, ax): +def plot_anchors(ecx_chrom_name, ecx, target_anchor, is_q, ax): + """Plot anchor positions over densities""" + indexer = ( + (ecx["rname"]==ecx_chrom_name) & (ecx["prime"]==(3 if is_q else 5)) & + (ecx["flag"]==interpret_flags(target_anchor)) + ) + plottable_flags = ecx.loc[indexer, ["pos", "flag"]] + for _, pos, flag in plottable_flags.itertuples(): + ax.plot( + [pos, pos], [0, 1], ls="--", lw=3, c=FLAG_COLORS[flag], alpha=.4, + ) + return ecx.loc[indexer, "pos"].min(), ecx.loc[indexer, "pos"].max() + + +def 
plot_combined_density(binned_density_dataframe, n_boot, ecx, title, palette, target_anchor, is_q, ecx_chrom_name, plot_coverage, ax): """Plot stacked area charts with bootstrapped CIs""" plottable_df, motif_order = stack_motif_densities(binned_density_dataframe) updated_palette, updated_motif_order = generate_updated_palette( @@ -356,28 +266,23 @@ def plot_combined_density(binned_density_dataframe, n_boot, ecx, title, palette, x=plottable_df["position"].drop_duplicates().sort_values(), y=1, legend=False, color="black", linewidth=1, alpha=1, ax=ax, ) - if zoomed_in: + if plot_coverage: ymax = coverage_plot(plottable_df, len(motif_order), is_q, ax) else: ymax = 1 - indexer = ( - (ecx["rname"]==ecx_chrom_name) & (ecx["prime"]==(3 if is_q else 5)) & - (ecx["flag"]==interpret_flags(target_anchor)) - ) - plottable_flags = ecx.loc[indexer, ["pos", "flag"]] - for _, pos, flag in plottable_flags.itertuples(): - ax.axvline(pos, 0, 1/ymax, ls=":", lw=4, c=FLAG_COLORS[flag], alpha=.4) position_values = plottable_df["position"].values ax.set(xlim=(position_values.min(), position_values.max())) - ax.set(xlabel="", ylim=(0, ymax), ylabel=display_chrom_name, yticks=[]) + ax.set(xlabel="", ylim=(0, ymax), ylabel="", yticks=[]) return updated_palette -def align_subplots(ax2chrom, ecx, target_anchor, is_q, unit_adjustment=1e6): +def align_subplots(ax2chrom, ax2ylabel, ecx, target_anchor, is_q, unit_adjustment=None, unit_fmt=",.0f"): """Modify xlim of related axes to make their scales match""" prime = 3 if is_q else 5 anchor_positions, left_spans, right_spans = {}, {}, {} + ymax = 0 for ax, chrom in ax2chrom.items(): + ymax = max(ymax, ax.get_ylim()[1]) indexer = ( (ecx["rname"]==chrom) & (ecx["prime"]==prime) & (ecx["flag"]==interpret_flags(target_anchor)) @@ -392,63 +297,59 @@ def align_subplots(ax2chrom, ecx, target_anchor, is_q, unit_adjustment=1e6): else: error_msk = "{} not found on {} prime for {} in ECX" raise ValueError(error_msk.format(target_anchor, prime, chrom)) - 
max_left_span = str(getenv("PAPER_LEFT_SPAN")) - if max_left_span.isdigit(): - max_left_span = int(max_left_span) - else: - max_left_span = max(left_spans.values()) - max_right_span = str(getenv("PAPER_RIGHT_SPAN")) - if max_right_span.isdigit(): - max_right_span = int(max_right_span) - else: - max_right_span = max(right_spans.values()) for ax, chrom in ax2chrom.items(): - ax.set(xlim=( - anchor_positions[ax] - max_left_span, - anchor_positions[ax] + max_right_span, - )) + ax.set( + xlim=( + anchor_positions[ax] - max(left_spans.values()), + anchor_positions[ax] + max(right_spans.values()), + ), + ylim=(0, ymax), + ) + xmin, xmax = ax.get_xlim() + ax.text( + x=xmin-(xmax-xmin)/100, y=.5, transform=ax.transData, + s=ax2ylabel[ax], ha="right", va="center", rotation=90, + ) if unit_adjustment: ax.set( xticklabels=[ - "{:.3f}".format(int(xt) / unit_adjustment) + format(int(xt) / unit_adjustment, unit_fmt) for xt in ax.get_xticks().tolist() ] ) -def add_legend(updated_palette, ax, exploded, is_q): +def add_motifs_legend(updated_palette, ax, is_q): """Add custom legend""" - if exploded: - raise NotImplementedError("Custom legend for exploded density plot") + handles = [ + Patch(label=motif, fc=color, ec="black", hatch=None, alpha=.7) + for motif, color in reversed(updated_palette.items()) + ][::-1] + background_handle = [Patch( + label="background", fc=BGCOLOR, ec="black", hatch=None, alpha=.7, + )] + if is_q: + loc="upper left" else: - handles = [ - Patch(label=motif, fc=color, ec="black", hatch=None, alpha=.7) - for motif, color in reversed(updated_palette.items()) - ][::-1] - background_handle = [ - Patch(label="other", fc=BGCOLOR, ec="black", hatch=None, alpha=.7) - ] - if is_q: - loc="upper left" - else: - loc="upper right" - ax.legend(handles=handles+background_handle, loc=loc, framealpha=.9) - ax.set(zorder=float("inf")) + loc="upper right" + ax.legend(handles=handles+background_handle, loc=loc, framealpha=.9) + ax.set(zorder=float("inf")) -def 
plot_density_scale(ax): - bar_position = 1 + .002 * DENSITYPLOT_FIGWIDTH - tick_width = .0003 * DENSITYPLOT_FIGWIDTH +def plot_density_scale(ax, figwidth_inches): + bar_position = 1 + .002 * figwidth_inches + tick_width = .0004 * figwidth_inches + ymax = ax.get_ylim()[1] xtrace = [ bar_position+tick_width, bar_position, bar_position, bar_position+tick_width, ] ax.plot( - xtrace, [1, 1, 0, 0], transform=ax.transAxes, + xtrace, [1/ymax, 1/ymax, 0, 0], transform=ax.transAxes, color="black", linewidth=1, clip_on=False, ) ax.text( - x=bar_position+tick_width, y=1, transform=ax.transAxes, + x=bar_position+tick_width, y=1/ymax, transform=ax.transAxes, s=" 100%", ha="left", va="center", ) ax.text( @@ -456,64 +357,81 @@ def plot_density_scale(ax): s=" 0%", ha="left", va="center", ) ax.text( - x=bar_position-tick_width, y=.5, transform=ax.transAxes, rotation=90, + x=bar_position-tick_width, y=.5/ymax, transform=ax.transAxes, s="density", ha="right", va="center", + rotation=90, size=figwidth_inches*2/3, ) -def plot_densities(densities, n_boot, ecx, title, palette, legend, target_anchor, is_q, zoomed_in, file=stdout.buffer): +def format_chrom(chrom): + """Get printable options based on value of `chrom`""" + ecx_chrom_name, comment, *_ = chrom.split(":", 1) + [None] + short_chrom_name = shorten_chrom_name(ecx_chrom_name) + if comment: + display_chrom_name = short_chrom_name + "\n" + comment + else: + display_chrom_name = short_chrom_name + return ecx_chrom_name, short_chrom_name, display_chrom_name + + +def plot_densities(densities, n_boot, ecx, title, palette, motifs_legend, density_legend, target_anchor, is_q, chroms_to_plot, figwidth_inches, plot_coverage, unit="Kbp"): """Plot binned densities as bootstrapped line plots, combined per chromosome""" - decorated_densities_iterator = make_decorated_densities_iterator(densities) - switch_backend("Agg") - figure, axs = chromosome_subplots(len(densities), zoomed_in) - ax2chrom = {} + decorated_densities_iterator, n_axes = 
make_decorated_densities_iterator( + densities, chroms_to_plot, + ) + figure, axs = chromosome_subplots(n_axes, figwidth_inches, plot_coverage) + ax2chrom, ax2ylabel = {}, {} for (chrom, bdf), ax in zip(decorated_densities_iterator, axs[:, 0]): - ecx_chrom_name, comment, *_ = chrom.split(":", 1) + [None] - if zoomed_in: - short_chrom_name = ecx_chrom_name - else: - short_chrom_name = shorten_chrom_name(ecx_chrom_name) - if comment: - display_chrom_name = short_chrom_name + "\n" + comment - else: - display_chrom_name = short_chrom_name - updated_palette = plot_combined_density( - bdf, n_boot, ecx, title, palette, target_anchor, is_q, ax=ax, - zoomed_in=zoomed_in, display_chrom_name=display_chrom_name, - ecx_chrom_name=ecx_chrom_name, - ) - ax2chrom[ax] = ecx_chrom_name - align_subplots(ax2chrom, ecx, target_anchor, is_q) - if legend: - add_legend( - updated_palette, axs[0, 0], exploded=False, is_q=is_q, + ecx_chrom_name, short_chrom_name, display_chrom_name = format_chrom( + chrom, ) - plot_density_scale(axs[0, 0]) - if title: - axs[0, 0].set(title=title) - axs[-1, 0].set(xlabel="Mbp") - figure.savefig(file, bbox_inches="tight", format="pdf") + if bdf is None: + ax.set(xlabel="", ylim=(0, 1), ylabel=display_chrom_name, yticks=[]) + else: + updated_palette = plot_combined_density( + bdf, n_boot, ecx, title, palette, target_anchor, is_q, ax=ax, + ecx_chrom_name=ecx_chrom_name, plot_coverage=plot_coverage, + ) + xlim = plot_anchors(ecx_chrom_name, ecx, target_anchor, is_q, ax) + if bdf is None: + ax.set(xlim=(xlim[0]-1, xlim[1]+1)) + ax2chrom[ax], ax2ylabel[ax] = ecx_chrom_name, display_chrom_name + try: + unit_adjustment = {"Kbp": 1e3, "Mbp": 1e6, "bp": None}[unit] + unit_fmt = {"Kbp": ",.0f", "Mbp": ",.3f", "bp": ",.0f"}[unit] + except KeyError: + raise ValueError("unit", unit) + align_subplots( + ax2chrom, ax2ylabel, ecx, target_anchor, is_q, + unit_adjustment=unit_adjustment, unit_fmt=unit_fmt, + ) + if motifs_legend: + add_motifs_legend(updated_palette, axs[0, 0], 
is_q=is_q) + if density_legend: + plot_density_scale(axs[0, 0], figwidth_inches) + axs[0, 0].set(title=title) + axs[-1, 0].set(xlabel=unit) + return figure def interpret_target(samfilters): - """For non-exploded densityplots, infer which arm to plot and which anchor to center around""" - flags2set = lambda f: set(explain_sam_flags(interpret_flags(f)).split("|")) - potential_target_anchors = {"tract_anchor", "ucsc_mask_anchor", "fork"} + """Infer which arm to plot and which anchor to center around""" + flags2set = lambda f: set(explain_sam_flags(interpret_flags(f))) flags, flag_filter, _ = samfilters if "is_q" in (flags2set(flags) - flags2set(flag_filter)): is_q = True elif "is_q" in (flags2set(flag_filter) - flags2set(flags)): is_q = False else: - raise NotImplementedError("non-exploded on both arms") + raise NotImplementedError("Input contains reads on both arms") target_anchors = ( - potential_target_anchors & + {"tract_anchor", "mask_anchor", "fork"} & (flags2set(flags) - flags2set(flag_filter)) ) if len(target_anchors) == 1: target_anchor = target_anchors.pop() else: - error_message = "non-exploded without a single identifiable anchor" + error_message = f"Multiple anchors to plot: {sorted(target_anchor)}" raise NotImplementedError(error_message) return is_q, target_anchor @@ -531,30 +449,29 @@ def generate_paper_palette(paper_palette, is_q): raise NotImplementedError("--palette 'paper' for unknown arm") -def interpret_arguments(palette, exploded, zoomed_in, samfilters, title, dat): +def interpret_arguments(palette, chroms_to_plot, samfilters, title, outfmt, dat): """Parse and check arguments""" - if exploded: - if zoomed_in: - raise NotImplementedError("--exploded with --zoomed-in") - target_anchor, is_q = None, None - else: - is_q, target_anchor = interpret_target(samfilters) + is_q, target_anchor = interpret_target(samfilters) + PAPER_PALETTE_AS_PASSED_ARGS = { + "paper", "paper|legend=none", "paper|legend=full", + "paper|legend=density", 
"paper|legend=motifs", + } if palette is None: - legend = not zoomed_in - elif palette in {"paper|legend=False", "paper|legend=True", "paper"}: - legend = "False" not in palette + motifs_legend, density_legend = True, True + elif palette in PAPER_PALETTE_AS_PASSED_ARGS: + full_legend = "legend=full" in palette + motifs_legend = ("legend=motifs" in palette) or full_legend + density_legend = ("legend=density" in palette) or full_legend palette = generate_paper_palette(PAPER_PALETTE, is_q) else: - if exploded: - raise NotImplementedError("--palette with --exploded") interpreted_palette = OrderedDict() - legend = not zoomed_in + motifs_legend, density_legend = True, True for palette_field in palette.split("|"): if palette_field.startswith("legend="): - if palette_field[7:].lower() == "true": - legend = True - elif palette_field[7:].lower() == "false": - legend = False + spec = palette_field[7:] + if spec in {"full", "motifs", "density", "none"}: + motifs_legend = spec in {"full", "motifs"} + density_legend = spec in {"full", "density"} else: raise ValueError("Uknown syntax: " + palette_field) elif "=" in palette_field: @@ -562,27 +479,31 @@ def interpret_arguments(palette, exploded, zoomed_in, samfilters, title, dat): interpreted_palette[motif] = color else: raise ValueError("Uknown syntax: " + palette_field) - if interpreted_palette: - palette = interpreted_palette - else: - palette = None - return target_anchor, is_q, palette, legend, (title or path.split(dat)[-1]) + palette = interpreted_palette or None + return ( + target_anchor, is_q, palette, motifs_legend, density_legend, + (title or path.split(dat)[-1]), + ) -def main(dat, gzipped, index, flags, flag_filter, min_quality, bin_size, n_boot, exploded, zoomed_in, palette, title, file=stdout.buffer, **kwargs): +def main(dat, index, gzipped, flags, flag_filter, min_quality, bin_size, n_boot, palette, title, chroms_to_plot, plot_coverage, outfmt, figwidth_inches, file=buffer, **kwargs): """Dispatch data to 
subroutines""" samfilters = [flags, flag_filter, min_quality] - target_anchor, is_q, palette, legend, title = interpret_arguments( - palette, exploded, zoomed_in, samfilters, title, dat, + _intrg = interpret_arguments( + palette, chroms_to_plot, samfilters, title, outfmt, dat, ) + target_anchor, is_q, palette, motifs_legend, density_legend, title = _intrg ecx = load_index(index) densities = load_kmerscan(dat, gzipped, samfilters, bin_size) - if exploded: - plot_exploded_densities( - densities, ecx, title, samfilters, file, - ) + switch_backend("Agg") + figure = plot_densities( + densities, n_boot, ecx, title, palette, motifs_legend, density_legend, + target_anchor, is_q, chroms_to_plot, figwidth_inches, plot_coverage, + ) + if outfmt == "pdf": + figure.savefig(file, bbox_inches="tight", format="pdf") + elif outfmt == "pkl": + dump(figure, file) else: - plot_densities( - densities, n_boot, ecx, title, palette, legend, target_anchor, - is_q, zoomed_in, file, - ) + raise ValueError(f"--outfmt={outfmt}") + return 0 diff --git a/edgecaselib/entropy.py b/edgecaselib/entropy.py new file mode 100644 index 0000000..39c4119 --- /dev/null +++ b/edgecaselib/entropy.py @@ -0,0 +1,80 @@ +from sys import stdout +from edgecaselib.formats import load_kmerscan +from pandas import DataFrame, concat +from scipy.stats import entropy +from numpy import log, argsort, cumsum, interp +from edgecaselib.util import progressbar +from itertools import chain + + +__doc__ = """edgeCase entropy: calculation of motif entropy among reads + +Usage: {0} entropy [-b integer] [-f flagspec]... [-F flagspec]... [-q integer] + {1} [-z] ... 
+ +Output: + TSV file of entropy values and read coverage per bin, + with coverage-weighted quantiles of entropy in a comment on first line + +Positional arguments: + name of input kmerscanner file(s) + +Options: + -z, --gzipped input is gzipped (must specify if any of -qfF present) + -b, --bin-size [integer] size of each bin in bp (overrides bin size in ) + +Input filtering options: + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] + -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] +""" + +__docopt_converters__ = [ + lambda bin_size: None if bin_size is None else int(bin_size), +] + + +def calculate_entropies(bdf): + """Calculate entropies per binned density dataframe""" + per_read_modes = ( + bdf.groupby("name") + .apply(lambda block: block.set_index("motif").iloc[:,8:].idxmax(axis=0)) + .dropna(how="all", axis=1) + ) + N = len(per_read_modes.melt().value.dropna().unique()) + return DataFrame({ + "#entropy": ( + per_read_modes.apply(lambda c: entropy(c.value_counts())) / log(N) + ), + "coverage": (~per_read_modes.isnull()).sum(axis=0), + }) + + +def weighted_quantile(points, weights, q): + """Calculate quantile `q` of `points`, weighted by `weights`""" + indsort = argsort(points.values) + spoints, sweights = points.values[indsort], weights.values[indsort] + sn = cumsum(sweights) + pn = (sn - sweights / 2) / sn[-1] + return interp(q, pn, spoints) + + +def main(dat, gzipped, flags, flag_filter, min_quality, bin_size, file=stdout, **kwargs): + """Dispatch data to subroutines""" + samfilters = [flags, flag_filter, min_quality] + kmerscans = [load_kmerscan(fn, gzipped, samfilters, bin_size) for fn in dat] + entropies = concat( + calculate_entropies(bdf) for bdf in progressbar( + chain(*(ks.values() for ks in kmerscans)), + desc="Calculating entropies", unit="arm", + total=sum(len(ks) for 
ks in kmerscans), + ) + ) + quantiles = { + q: weighted_quantile( + entropies["#entropy"], entropies["coverage"]-1, q/100, + ) + for q in progressbar(range(5, 101, 5), desc="Calculating quantiles") + } + print("#"+",".join(f"q{k}={v}" for k, v in quantiles.items()), file=file) + entropies.to_csv(file, sep="\t", index=False) diff --git a/edgecaselib/formats.py b/edgecaselib/formats.py index b4938e9..d02c8d4 100644 --- a/edgecaselib/formats.py +++ b/edgecaselib/formats.py @@ -6,6 +6,7 @@ from numpy import nan, unique from pandas import read_csv, merge, concat, DataFrame from gzip import open as gzopen +from re import search from tempfile import TemporaryDirectory from os import path from edgecaselib.util import progressbar @@ -16,58 +17,82 @@ ALL_SAM_FLAGS = [ "paired", "mapped_proper_pair", "unmapped", "mate_unmapped", "rev", "mate_rev", "1stmate", "2ndmate", "secondary", "qcfail", "pcrdup", "supp", - "ucsc_mask_anchor", "fork", "tract_anchor", "is_q" + "mask_anchor", "fork", "tract_anchor", "is_q" ] FLAG_COLORS = {0x1000: "gray", 0x2000: "blueviolet", 0x4000: "red"} -DEFAULT_MOTIF_COLORS = [ - "#117733", "#88CCEE", "#AA4499", "#DDCC77", "#332288", "#882255", - "#44AA99", "#CC6677", "#EEEEEE", -] +TOL_COLORSCHEME = OrderedDict([ + ("green", "#117733"), + ("yellow", "#DDCC77"), + ("cyan", "#88DDFF"), + ("magenta", "#AA4499"), + ("blue", "#332288"), + ("red", "#882255"), + ("teal", "#44AA99"), + ("pink", "#CC6677"), + ("gray", "#EEEEEE"), +]) BGCOLOR = "#BBBBCA" PAPER_PALETTE = OrderedDict([ - ("TTAGGG", "#117733"), ("TTGGGG", "#AA4499"), ("TTAGGGG", "#332288"), - ("TGAGGG", "#DDCC77"), ("TCAGGG", "#44AA99"), ("TTAGGGTTAGGGG", "#EEEEEE"), - ("CGCGG", "#88CCEE"), + ("TTAGGG", TOL_COLORSCHEME["green"]), + ("TGAGGG", TOL_COLORSCHEME["yellow"]), + ("TTAGGGG", TOL_COLORSCHEME["cyan"]), + ("TTAGG", TOL_COLORSCHEME["magenta"]), + ("TTAGGGTTAGGGG", TOL_COLORSCHEME["blue"]), + ("TTGGGG", TOL_COLORSCHEME["red"]), + ("TCAGGG", TOL_COLORSCHEME["teal"]), + ("CGCGG", 
TOL_COLORSCHEME["pink"]), ]) PAPER_PALETTE_RC = OrderedDict([ - ("CCCTAA", "#117733"), ("CCCCAA", "#AA4499"), ("CCCCTAA", "#332288"), - ("CCCTCA", "#DDCC77"), ("CCCTGA", "#44AA99"), ("CCCCTAACCCTAA", "#EEEEEE"), - ("CCGCG", "#88CCEE"), + ("CCCTAA", TOL_COLORSCHEME["green"]), + ("CCCTCA", TOL_COLORSCHEME["yellow"]), + ("CCCCTAA", TOL_COLORSCHEME["cyan"]), + ("CCTAA", TOL_COLORSCHEME["magenta"]), + ("CCCCTAACCCTAA", TOL_COLORSCHEME["blue"]), + ("CCCCAA", TOL_COLORSCHEME["red"]), + ("CCCTGA", TOL_COLORSCHEME["teal"]), + ("CCGCG", TOL_COLORSCHEME["pink"]), ]) +KMERSCANNER_INCONSISTENT_NUMBER_OF_MOTIFS = ( + "Inconsistent number of motifs in DAT; plotting of reads " + + "identified de novo with kmerscanner is not implemented" +) + class EmptyKmerscanError(ValueError): """Raised when supplied kmerscanner file is empty""" pass -def explain_sam_flags(flag, sep="|"): - """Convert an integer flag into string""" - return sep.join(ALL_SAM_FLAGS[i] for i in range(16) if flag & 2**i != 0) +def explain_sam_flags(flag): + """Convert an integer flag into list of identifiers""" + return [ALL_SAM_FLAGS[i] for i in range(16) if flag & 2**i != 0] def interpret_flags(flags): """If flags are not a decimal number, assume strings and convert to number""" - if isinstance(flags, int) or flags.isdigit(): - return int(flags) - elif not isinstance(flags, str): - raise ValueError("Unknown flags: {}".format(repr(flags))) - elif flags[:2] == "0x": - return int(flags, 16) - elif flags[:2] == "0b": - return int(flags, 2) - elif "|" in flags: - flag_set = set(map(interpret_flags, flags.split("|"))) - return reduce(__or__, flag_set | {0}) - elif flags in ALL_SAM_FLAGS: - return 2**ALL_SAM_FLAGS.index(flags) + if isinstance(flags, int): + return flags + elif isinstance(flags, str): + if flags.isdigit(): + return int(flags) + if flags[:2] == "0x": + return int(flags, 16) + elif flags[:2] == "0b": + return int(flags, 2) + elif flags in ALL_SAM_FLAGS: + return 2**ALL_SAM_FLAGS.index(flags) + else: + raise 
ValueError("Unknown flag(s): {}".format(repr(flags))) + elif isinstance(flags, (tuple, set, list)): + return reduce(__or__, set(map(interpret_flags, flags)) | {0}) else: - raise ValueError("Unknown flags: {}".format(repr(flags))) + raise ValueError("Unknown flag(s): {}".format(repr(flags))) def entry_filters_ok(entry_flag, entry_mapq, integer_samfilters): @@ -85,14 +110,13 @@ def entry_filters_ok(entry_flag, entry_mapq, integer_samfilters): ) -def filter_and_read_tsv(dat, gzipped, samfilters): +def filter_and_read_tsv(dat, gzipped, integer_samfilters): """If filters supplied, subset DAT first, then read with pandas""" number_retained = 0 if gzipped: opener = gzopen else: opener = open - integer_samfilters = list(map(interpret_flags, samfilters)) with opener(dat, mode="rt") as dat_handle: with TemporaryDirectory() as tempdir: datflt_name = path.join(tempdir, "dat.gz") @@ -173,24 +197,29 @@ def are_motifs_consistent(raw_densities): return True -def load_kmerscan(dat, gzipped, samfilters, bin_size, no_align=False, each_once=True): +def load_kmerscan(dat, gzipped, samfilters, bin_size=None, no_align=False, each_once=True): """Load densities from dat file, split into dataframes per chromosome""" - if not any(samfilters): # all zero / None + integer_samfilters = list(map(interpret_flags, samfilters)) + if not any(integer_samfilters): # all zero / None print("Loading DAT...", file=stderr, flush=True) raw_densities = read_csv(dat, sep="\t", escapechar="#") else: - raw_densities = filter_and_read_tsv(dat, gzipped, samfilters) + raw_densities = filter_and_read_tsv(dat, gzipped, integer_samfilters) if len(raw_densities) == 0: raise EmptyKmerscanError if not are_motifs_consistent(raw_densities): - raise NotImplementedError( - "Inconsistent number of motifs in DAT; plotting of reads " + - "identified de novo with kmerscanner is not implemented" - ) + raise NotImplementedError(KMERSCANNER_INCONSISTENT_NUMBER_OF_MOTIFS) + bin_size_data = raw_densities.columns[-1] + 
raw_densities.rename(columns={bin_size_data: "density"}, inplace=True) + if bin_size is None: + bin_size_matcher = search(r'[0-9]+$', bin_size_data) + if bin_size_matcher: + bin_size = int(bin_size_matcher.group()) + else: + raise ValueError("No bin size in DAT, user must specify") if each_once: - raw_densities["length"] = raw_densities["density"].apply( - lambda d: d.count(",")+1 - ) + count_commas = lambda d: d.count(",")+1 + raw_densities["length"] = raw_densities["density"].apply(count_commas) groups = raw_densities[["name", "motif", "length"]].groupby( ["name", "motif"], as_index=False, ).max() diff --git a/edgecaselib/kmerscanner.py b/edgecaselib/kmerscanner.py index cecaf9a..ca2af5e 100644 --- a/edgecaselib/kmerscanner.py +++ b/edgecaselib/kmerscanner.py @@ -1,7 +1,7 @@ -from sys import stdout, stderr -from regex import compile, IGNORECASE +from sys import stdout from numpy import zeros, array, cumsum, nan from multiprocessing import Pool +from edgecaselib.util import get_circular_pattern from edgecaselib.formats import filter_bam from edgecaselib.tailchopper import get_cigar_clip_length from pysam import AlignmentFile, FastxFile @@ -15,9 +15,8 @@ __doc__ = """edgeCase kmerscanner: calculation of motif densities Usage: {0} kmerscanner [-j integer] --motif-file filename - {1} [-w integer] [-n integer] - {1} [-c float] [--head-test integer] [--tail-test integer] - {1} [-f flagspec] [-F flagspec] [-q integer] + {1} [-b integer] [-n integer] + {1} [-f flagspec]... [-F flagspec]... 
[-q integer] {1} [--fmt string] Output: @@ -31,12 +30,9 @@ Options: --fmt sam|fastx format of input file [default: sam] - -w, --window-size [integer] size of the rolling window [default: 100] + -b, --bin-size [integer] size of the rolling window [default: 10] -n, --num-reads [integer] expected number of reads in input (for progress display) -j, --jobs [integer] number of jobs to run in parallel [default: 1] - -c, --cutoff [float] use hard cutoff for density - --head-test [integer] length of head to use for density filter (with --cutoff) - --tail-test [integer] length of tail to use for density filter (with --cutoff) Input filtering options: -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] @@ -45,57 +41,22 @@ """ __docopt_converters__ = [ - lambda window_size: int(window_size), + lambda bin_size: int(bin_size), lambda num_reads: None if (num_reads is None) else int(num_reads), lambda jobs: int(jobs), - lambda cutoff: None if (cutoff is None) else float(cutoff), - lambda head_test: None if (head_test is None) else int(head_test), - lambda tail_test: None if (tail_test is None) else int(tail_test), lambda min_quality: None if (min_quality is None) else int(min_quality), ] DAT_HEADER = [ - "#name", "flag", "chrom", "pos", "mapq", "motif", "abundance", - "clip_5prime", "clip_3prime", "density", + "#name", "flag", "chrom", "pos", "mapq", "motif", "score", + "clip_5prime", "clip_3prime", ] -def get_circular_pattern(motif, repeats=2): - """Convert motif into circular regex pattern (e.g., r'TCGA|CGAT|GATC|ATCG' for TCGA)""" - atom_pattern = compile(r'[ACGT.]|\[[ACGT]+\]', flags=IGNORECASE) - atoms = atom_pattern.findall(motif) - if "".join(atoms) != motif: - raise ValueError("Could not parse motif: {}".format(motif)) - repeated_inversions = { - "".join(atoms[i:] + atoms[:i]) * repeats - for i in range(len(atoms)) - } - return compile(r'|'.join(repeated_inversions), flags=IGNORECASE) - - -def get_edge_density(entry, pattern, 
head_test, tail_test): - """Calculate density of pattern in head_test or tail_test of read""" - if (entry.query_sequence is None) or (len(entry.query_sequence) == 0): - return 0 - if head_test: - subsequence = entry.query_sequence[:head_test] - elif tail_test: - subsequence = entry.query_sequence[-tail_test:] - pattern_matches = pattern.findall(subsequence, overlapped=True) - return len(pattern_matches) / len(subsequence) - - -def calculate_density(entry, pattern, cutoff, window_size, head_test, tail_test, positions_accounted_for): +def calculate_density(entry, pattern, bin_size, positions_accounted_for): """Calculate density of pattern hits in a rolling window along given read""" - if cutoff: # if cutoff specified, filter by hard cutoff - edge_density = get_edge_density( - entry, pattern, head_test, tail_test, - ) - passes_filter = (edge_density > cutoff) - else: # otherwise, allow all that have data - passes_filter = (entry.query_sequence is not None) - if passes_filter: # calculations will make sense + if entry.query_sequence is not None: # calculations will make sense read_length = len(entry.query_sequence) canvas = zeros(read_length, dtype=bool) pattern_positions = array([ @@ -105,25 +66,24 @@ def calculate_density(entry, pattern, cutoff, window_size, head_test, tail_test, ]) if len(pattern_positions): canvas[pattern_positions] = True - if read_length <= window_size: # use one window: + if read_length <= bin_size: # use one window: density_array = (canvas.sum(axis=0) / read_length).reshape(1) else: # use rolling window: roller = cumsum(canvas, axis=0) - roller[window_size:] = roller[window_size:] - roller[:-window_size] - density_array = roller[window_size-1:] / window_size + roller[bin_size:] = roller[bin_size:] - roller[:-bin_size] + density_array = roller[bin_size-1:] / bin_size return True, density_array, pattern_positions else: # effectively skip return False, zeros(1), array([]) -def calculate_density_of_patterns(entry, motif_patterns, cutoff, 
window_size, head_test, tail_test): +def calculate_density_of_patterns(entry, motif_patterns, bin_size): """Calculate density of hits of each pattern in a rolling window along given read""" entry_set = [] positions_accounted_for = set() for motif, pattern in motif_patterns.items(): passes_filter, density_array, pattern_positions = calculate_density( - entry, pattern, cutoff, window_size, head_test, tail_test, - positions_accounted_for=positions_accounted_for, + entry, pattern, bin_size, positions_accounted_for, ) if passes_filter: entry_set.append([entry, motif, density_array]) @@ -132,7 +92,7 @@ def calculate_density_of_patterns(entry, motif_patterns, cutoff, window_size, he return entry_set -def pattern_scanner(entry_iterator, fmt, samfilters, motif_patterns, cutoff, window_size, head_test, tail_test, num_reads, jobs): +def pattern_scanner(entry_iterator, fmt, samfilters, motif_patterns, bin_size, num_reads, jobs): """Calculate density of pattern hits in a rolling window along each read""" if fmt == "sam": filtered_iterator = filter_bam(entry_iterator, samfilters) @@ -158,8 +118,7 @@ def pattern_scanner(entry_iterator, fmt, samfilters, motif_patterns, cutoff, win # imap_unordered() only accepts single-argument functions: density_calculator = partial( calculate_density_of_patterns, motif_patterns=motif_patterns, - window_size=window_size, head_test=head_test, tail_test=tail_test, - cutoff=cutoff, + bin_size=bin_size, ) # lazy multiprocess evaluation: read_density_iterator = pool.imap_unordered( @@ -172,7 +131,7 @@ def pattern_scanner(entry_iterator, fmt, samfilters, motif_patterns, cutoff, win ) -def interpret_arguments(fmt, head_test, tail_test, cutoff, motif_file): +def interpret_arguments(fmt, motif_file): """Parse and check arguments""" if fmt == "sam": manager = AlignmentFile @@ -180,51 +139,37 @@ def interpret_arguments(fmt, head_test, tail_test, cutoff, motif_file): manager = FastxFile else: raise ValueError("--fmt can only be 'sam' or 'fastx'") - if 
(head_test is not None) and (tail_test is not None): - raise ValueError("Can only specify one of --head-test, --tail-test") - elif (cutoff is not None) and (head_test is None) and (tail_test is None): - raise ValueError("--cutoff has no effect without a head/tail test") - elif (head_test is not None) or (tail_test is not None): - if cutoff is None: - message = "Warning: head/tail test has no effect without --cutoff" - print(message, file=stderr) - elif (head_test is None) and (tail_test is None) and (cutoff is None): - message = ( - "Warning: no head/tail testing options selected; regardless of " + - "the number of jobs (-j/--jobs), this will likely be " + - "bottlenecked by disk writing speeds" - ) - print(message, file=stderr) motif_data = read_csv(motif_file, sep="\t", escapechar="#") + if "motif" not in motif_data.columns: + if "monomer" in motif_data.columns: + motif_data = motif_data.rename(columns={"monomer": "motif"}) + else: + raise KeyError("No motif column found in motif file") if "length" not in motif_data.columns: motif_data["length"] = motif_data["motif"].apply(lambda m: len(m)) motif_data = motif_data.sort_values(by="length", ascending=False) motif_patterns = OrderedDict([ [motif, get_circular_pattern(motif)] for motif in motif_data["motif"] ]) - if "abundance" in motif_data.columns: - total_abundance = dict(zip( - motif_data["motif"], motif_data["abundance"] + if "score" in motif_data.columns: + scores = dict(zip( + motif_data["motif"], motif_data["score"] )) else: - total_abundance = {m: nan for m in motif_data["motif"]} - return manager, motif_patterns, total_abundance + scores = {m: nan for m in motif_data["motif"]} + return manager, motif_patterns, scores -def main(sequencefile, fmt, flags, flag_filter, min_quality, motif_file, head_test, tail_test, cutoff, window_size, num_reads, jobs=1, file=stdout, **kwargs): +def main(sequencefile, fmt, flags, flag_filter, min_quality, motif_file, bin_size, num_reads, jobs=1, file=stdout, **kwargs): # parse 
and check arguments: - manager, motif_patterns, total_abundance = interpret_arguments( - fmt, head_test, tail_test, cutoff, motif_file, - ) - print(*DAT_HEADER, sep="\t", file=file) + manager, motif_patterns, scores = interpret_arguments(fmt, motif_file) + print(*DAT_HEADER, f"b={bin_size}", sep="\t", file=file) # scan fastq for target motif queries, parallelizing on reads: with manager(sequencefile) as entry_iterator: scanner = pattern_scanner( entry_iterator, motif_patterns=motif_patterns, samfilters=[flags, flag_filter, min_quality], - fmt=fmt, window_size=window_size, - head_test=head_test, tail_test=tail_test, - cutoff=cutoff, num_reads=num_reads, jobs=jobs, + fmt=fmt, bin_size=bin_size, num_reads=num_reads, jobs=jobs, ) # output densities of reads that pass filter: for entry_set in scanner: @@ -233,9 +178,10 @@ def main(sequencefile, fmt, flags, flag_filter, min_quality, motif_file, head_te meta_fields = [ entry.query_name, entry.flag, entry.reference_name, entry.reference_start, entry.mapping_quality, - motif, total_abundance[motif], + motif, scores[motif], get_cigar_clip_length(entry, 5), get_cigar_clip_length(entry, 3), ] print(*meta_fields, sep="\t", end="\t", file=file) print(*density_array, sep=",", file=file) + return 0 diff --git a/edgecaselib/levenshtein.py b/edgecaselib/levenshtein.py index 55a8664..4e6902a 100644 --- a/edgecaselib/levenshtein.py +++ b/edgecaselib/levenshtein.py @@ -1,48 +1,31 @@ -from sys import stdout, stderr -from warnings import filterwarnings, resetwarnings +from sys import stdout from numba import njit from pysam import AlignmentFile from edgecaselib.formats import filter_bam -from numpy import zeros, array, uint32, uint8, log, nan, pi, isnan, allclose -from numpy import linspace, vstack, concatenate, unique, tile, triu +from numpy import zeros, array, uint32, uint8 +from concurrent.futures import ThreadPoolExecutor, as_completed +from itertools import combinations_with_replacement from collections import defaultdict from 
edgecaselib.util import progressbar -from pandas import Series, DataFrame, read_csv, merge -from matplotlib.pyplot import switch_backend -from matplotlib.patches import Rectangle -from matplotlib import __version__ as matplotlib_version -from seaborn import clustermap -from scipy.cluster.hierarchy import fcluster -from sklearn.metrics import silhouette_score -from scipy.stats import mannwhitneyu -from os import path -from statsmodels.stats.multitest import multipletests -from glob import glob -from re import search __warning__ = """The `levenshtein` subprogram is in development! Pairwise distance computation is O(n^2) and is not suited for large scale experiments.""" -__doc__ = """edgeCase levenshtein: clustering of telomeric reads by distance +__doc__ = """edgeCase levenshtein: pairwise edit distance among telomeric reads -Usage: {0} levenshtein [-m integer] [-o dirname] [--kmerscanner-file filename] - {1} [-f flagspec] [-F flagspec] [-q integer] +Usage: {0} levenshtein [-f flagspec]... [-F flagspec]... 
[-q integer] + {1} [-j integer] Output: - * TSV-formatted file with statistics describing identified clusters; - * optionally (with --output-dir set) PDF files with clustermaps; - * optionally (with --output-dir and --kmerscanner-file set) kmerscanner - files for each identified clustering + TSV-formatted file with computed relative pairwise distances (per-arm) Positional arguments: - name of input BAM/SAM file or directory with precomputed distances + name of input BAM/SAM file Options: - -m, --min-cluster-size [integer] minimum cluster size to consider [default: 5] - -o, --output-dir [dirname] output directory for clustermaps and per-haplotype SAM files - --kmerscanner-file [filename] kmerscanner file (optional, for use with --output-dir) + -j, --jobs [integer] number of jobs to run in parallel [default: 1] Input filtering options: -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] @@ -53,15 +36,10 @@ __docopt_converters__ = [ lambda min_quality: None if (min_quality is None) else int(min_quality), + lambda jobs: int(jobs), ] -CLUSTERMAP_FIGSIZE = (10, 10) -CLUSTERMAP_CMAP = "viridis_r" -CLUSTERMAP_VMAX = .15 -LOG2PI1 = log(2 * pi) + 1 - - def load_bam_as_dict(alignment, samfilters): """Load BAM entries as dictionary: chrom -> qname -> (mappos, int8array)""" dna2int = lambda seq: ((array(list(seq.upper())).view(uint32) - 2) >> 1) & 3 @@ -92,8 +70,10 @@ def ld(v, w): return dp[m, n] -def get_relative_read_ld(sra, A, srb, B, return_bases=False): +def get_relative_read_ld(aname, bname, adata, bdata): """Calculate relative levenshtein distance between overlapping parts of two reads""" + sra, A = adata + srb, B = bdata if sra < srb: _A, _B = A[srb-sra:], B elif sra > srb: @@ -102,302 +82,40 @@ def get_relative_read_ld(sra, A, srb, B, return_bases=False): _A, _B = A, B overlap_length = min(len(_A), len(_B)) _A, _B = _A[:overlap_length], _B[:overlap_length] - if overlap_length > 0: - distance = ld(_A, _B) / overlap_length - if 
return_bases: - bases = concatenate([_A, _B]) - else: - bases = zeros(shape=0) + if (_A == _B).all(): + return aname, bname, 0 + elif overlap_length > 0: + return aname, bname, ld(_A, _B) / overlap_length else: - distance = 1 - bases = zeros(shape=0) - return distance, overlap_length, bases + return aname, bname, 1 -def calculate_chromosome_lds(chrom, entries): +def calculate_chromosome_lds(chrom, entries, jobs): """Calculate pairwise relative levenshtein distances between all reads mapping to one chromosome""" - lds = DataFrame( - data=nan, columns=sorted(entries.keys()), index=sorted(entries.keys()), - ) - read_iterator = progressbar(entries.items(), desc=chrom, unit="read") - for aname, (sra, A) in read_iterator: - for bname, (srb, B) in entries.items(): - if aname == bname: - distance = 0 - elif not isnan(lds.loc[aname, bname]): - distance = lds.loc[bname, aname] - else: - distance, *_ = get_relative_read_ld(sra, A, srb, B) - lds.loc[aname, bname] = distance - return lds.fillna(1) - - -def generate_clustermap(lds, metric="euclidean", method="ward", cmap=CLUSTERMAP_CMAP, vmax=CLUSTERMAP_VMAX): - """Generate clustermap of pairwise levenshtein distances between reads mapping to one chromosome""" - try: - cm = clustermap( - data=lds, metric=metric, method=method, - cmap=cmap, vmin=0, vmax=vmax, figsize=CLUSTERMAP_FIGSIZE, - ) - except (ValueError, ModuleNotFoundError): - return None - else: - cm.cax.set(visible=False) - cm.ax_heatmap.set(xticks=[], yticks=[]) - return cm - - -def loglikelihood(*, n, m, f, v, k): - """Calculate log likelihood for arbitrary cluster""" - if allclose(v, 0): - return 0 - else: - return m * (log(m) - log(n) - 0.5 * (f * log(v) + LOG2PI1)) + 0.5 * k - - -def cluster_loglikelihood(*, cluster, dataset_size, n_features, k): - """Calculate log likelihood for subcluster of a cluster""" - return loglikelihood( - n=dataset_size, m=cluster.shape[0], f=n_features, v=cluster.var(), k=k, - ) - - -def information_criterion(lds, labels, kind): - 
"""Calculate AIC or BIC for clustering""" - n, f = lds.shape - unique_labels = unique(labels) - k = len(unique_labels) - if kind == "AIC": - penalty = - k * (f + 1) / 2 * 2 - elif kind == "BIC": - penalty = - k * (f + 1) / 2 * log(n) - else: - raise ValueError("`kind` must be 'AIC' or 'BIC'") - return penalty + sum( - cluster_loglikelihood( - cluster=lds.loc[labels==label].values, - dataset_size=n, n_features=f, k=k, - ) - for label in unique_labels - ) - - -def get_mwu_pval(lds, ingroup_visual_indexer): - """Calculate Mann-Whitney U p-value between within-cluster and out-of-cluster levenshtein distances""" - ingroup_indexer = triu(ingroup_visual_indexer, k=1) - outgroup_indexer = triu(~ingroup_visual_indexer, k=1) - ingroup = lds.mask(~ingroup_indexer).values.flatten() - outgroup = lds.mask(~outgroup_indexer).values.flatten() - u, p = mannwhitneyu( - ingroup[~isnan(ingroup)], outgroup[~isnan(outgroup)], - alternative="less", - ) - return p - - -def get_clusters(lds, linkage, min_cluster_size): - """Find two major clusters; so far, only works if there is not more than one outlier read""" - bic2k, k2silh, labels = {}, {}, {} - for k in range(2, len(lds)): - labels[k] = fcluster(linkage, k, criterion="maxclust") - label_counts = dict(zip(*unique(labels[k], return_counts=True))) - if len(label_counts) > 1: - if sorted(label_counts.values())[-2] >= min_cluster_size: - bic2k[information_criterion(lds, labels[k], "BIC")] = k - k2silh[k] = silhouette_score(lds, labels[k]) - labels[k] = array([ - label if (label_counts[label]>=min_cluster_size) else nan - for label in labels[k] - ]) - if len(bic2k) == 0: - return None, 0, nan, nan - else: - best_k = bic2k[max(bic2k)] - best_silh, best_labels = k2silh[best_k], labels[best_k] - if isnan(best_labels).any(): - min_label = min(label for label in best_labels if not isnan(label)) - best_labels = array([ - nan if isnan(label) else label-min_label+1 - for label in best_labels - ]) - ingroup_axis = tile(best_labels, 
(len(best_labels), 1)) - ingroup_visual_indexer = (ingroup_axis == ingroup_axis.T) - pval = get_mwu_pval(lds, ingroup_visual_indexer) - used_k = best_labels[~isnan(best_labels)].max() - return best_labels, used_k, best_silh, pval - - -def warn_about_unsupported_hierarchy(chrom): - """Throw warning if impossible to satisfy clustering conditions""" - msg = "({}): not implemented: complex clustering hierarchy or too few reads" - print("\rWarning", msg.format(chrom), file=stderr) - - -def generate_kmerscanner_file(kmerscanner_file, names, labels, output_dir, chrom): - """Annotate chromosomes for reads from different haplotypes""" - kmerscanner_dat = read_csv(kmerscanner_file, sep="\t") - name_to_label = DataFrame( - data=[list(names), list(labels)], index=["#name", "label"] - ).T - haplo_dat = merge( - kmerscanner_dat, name_to_label.dropna(), on="#name", how="inner", - ) - haplo_dat["chrom"] = haplo_dat.apply( - lambda row: "{}:haplotype {}".format(row["chrom"], int(row["label"])), - axis=1, - ) - haplo_dat.drop(columns="label").to_csv( - path.join(output_dir, chrom+".dat.gz"), compression="gzip", - sep="\t", index=False, - ) - - -def generate_report(report_rows, adj="bonferroni"): - """Convert raw report to DataFrame and calculate adjusted p-values""" - report = DataFrame( - data=report_rows, - columns=["#chrom", "cluster_count", "silhouette_score", "p"], - ) - report["cluster_count"] = report["cluster_count"].astype(int) - report["p_adjusted"] = multipletests(report["p"], method=adj)[1] - return report - - -def draw_square(start, end, ax): - """Draw fancy frame from (start, start) to (end, end)""" - xy = (start, start) - width = end - start + 1 - ax.add_patch(Rectangle( - xy, width, width, fill=False, lw=5, ec="white", clip_on=False, - )) - ax.add_patch(Rectangle( - xy, width, width, fill=False, lw=1, ec="black", clip_on=False, - )) - - -def apply_mask(cm, labels): - """Draw rectangles around within-cluster pairings""" - labeled_names = Series(index=cm.data.index, 
data=labels) - ordered_labeled_names = labeled_names.loc[cm.data2d.index].to_frame( - name="label", - ) - ordered_labeled_names["position"] = range(len(ordered_labeled_names)) - labels_groupby = ordered_labeled_names.groupby("label") - starts, ends = ( - labels_groupby.min().iloc[:,0], - labels_groupby.max().iloc[:,0], - ) - for start, end in zip(starts, ends): - draw_square(start, end, cm.ax_heatmap) - - -def generate_pdf(cm, silh_score, labels, output_dir, chrom, cmap=CLUSTERMAP_CMAP, vmax=CLUSTERMAP_VMAX): - """Annotate clustermap figure and save to file""" - cm.ax_col_dendrogram.clear() - cm.ax_col_dendrogram.imshow( - vstack([linspace(0, 1, 256)]*2), aspect="auto", cmap=cmap, - ) - gp = cm.ax_col_dendrogram.get_position() - cm.ax_col_dendrogram.set( - position=[ - gp.x0*.25+gp.x1*.75, gp.y0*.5+gp.y1*.46, gp.x1*.1-gp.x0*.1, .015 - ], - zorder=float("inf"), - ) - cm.ax_col_dendrogram.text( - x=-672, y=.8, va="center", ha="left", fontsize=19, s="Distance: 0", - ) - cm.ax_col_dendrogram.text( - x=272, y=.8, va="center", ha="left", fontsize=19, s="{}+".format(vmax), - ) - cm.ax_col_dendrogram.set_axis_off() - silh_text = "N/A" if isnan(silh_score) else "{:.3f}".format(silh_score) - cm.ax_col_dendrogram.text( - x=-672, y=3, va="top", ha="left", fontsize=19, - s="Silhouette score: "+silh_text, - ) - cm.ax_col_dendrogram.text( - x=-672, y=-4.9, va="top", ha="left", fontsize=19, s=chrom, - ) - if matplotlib_version == "3.1.1": # stackoverflow.com/a/58165593 - bottom, top = cm.ax_heatmap.get_ylim() - cm.ax_heatmap.set_ylim(bottom + .5, top - .5) - apply_mask(cm, labels) - filename = path.join(output_dir, chrom+".pdf") - cm.fig.savefig(filename, bbox_inches="tight") - - -def hide_stats_warnings(state=True): - """Prevent known harmless warnings from being printed to stderr""" - if state: - filterwarnings("ignore", message="invalid value encountered") - filterwarnings( - "ignore", - message="looks suspiciously like an uncondensed distance matrix", + with 
ThreadPoolExecutor(max_workers=jobs) as pool: + workers = [ + pool.submit( + get_relative_read_ld, + aname, bname, entries[aname], entries[bname], + ) + for aname, bname in combinations_with_replacement( + sorted(entries.keys()), r=2, + ) + ] + iterator = progressbar( + as_completed(workers), desc=chrom, unit="pair", total=len(workers), ) - else: - resetwarnings() + for worker in iterator: + yield worker.result() -def process_levenshtein_input(sequencedata, samfilters, output_dir): - """Iterate over chromosomes and return pairwise levenshtein distances for reads mapped to them""" - if path.isfile(sequencedata): - with AlignmentFile(sequencedata) as alignment: - bam_dict = load_bam_as_dict(alignment, samfilters) - for chrom, entries in bam_dict.items(): - lds = calculate_chromosome_lds(chrom, entries) - if output_dir: - lds.to_csv(path.join(output_dir, chrom+"-matrix.tsv"), sep="\t") - yield chrom, lds - elif path.isdir(sequencedata): - tsv_iterator = progressbar( - glob(path.join(sequencedata, "*-matrix.tsv")), - desc="Clustering", unit="chromsome", +def main(sequencefile, flags, flag_filter, min_quality, jobs=1, file=stdout, **kwargs): + print("#rname", "qname1", "qname2", "relative_ld", sep="\t", file=file) + with AlignmentFile(sequencefile) as alignment: + bam_dict = load_bam_as_dict( + alignment, samfilters=[flags, flag_filter, min_quality], ) - for tsv in tsv_iterator: - chrom_matcher = search(r'([^/]+)-matrix\.tsv', tsv) - if chrom_matcher: - chrom = chrom_matcher.group(1) - lds = read_csv(tsv, sep="\t", index_col=0) - is_lds_square = (lds.shape[0] == lds.shape[1]) - if (not is_lds_square) or (lds.index!=lds.columns).any(): - msg_mask = "({}): malformed matrix? 
Skipping" - print("Warning", msg_mask.format(chrom), file=stderr) - else: - yield chrom, lds - else: - raise IOError("Unknown type of input") - - -def main(sequencedata, min_cluster_size, kmerscanner_file, output_dir, flags, flag_filter, min_quality, jobs=1, file=stdout, **kwargs): - switch_backend("pdf") - hide_stats_warnings(True) - report_rows = [] - input_iterator = process_levenshtein_input( - sequencedata, [flags, flag_filter, min_quality], output_dir, - ) - for chrom, lds in input_iterator: - cm = generate_clustermap(lds) - if cm is not None: - labels, k, silh_score, pval = get_clusters( - lds, cm.dendrogram_row.linkage, min_cluster_size, - ) - if labels is not None: - if output_dir: - generate_pdf( - cm, silh_score, labels, output_dir, chrom, - ) - if kmerscanner_file: - generate_kmerscanner_file( - kmerscanner_file, lds.index, labels, - output_dir, chrom, - ) - else: - warn_about_unsupported_hierarchy(chrom) - else: - k, silh_score, pval = 0, nan, nan - warn_about_unsupported_hierarchy(chrom) - report_rows.append([chrom, k, silh_score, pval]) - report = generate_report(report_rows) - print(report.to_csv(sep="\t", index=False, na_rep="NA")) - hide_stats_warnings(False) + for chrom, entries in bam_dict.items(): + ld_iterator = calculate_chromosome_lds(chrom, entries, jobs) + for qname1, qname2, relative_ld in ld_iterator: + print(chrom, qname1, qname2, relative_ld, sep="\t", file=file) diff --git a/edgecaselib/repeatfinder.py b/edgecaselib/repeatfinder.py index ca18e79..253562b 100644 --- a/edgecaselib/repeatfinder.py +++ b/edgecaselib/repeatfinder.py @@ -1,9 +1,10 @@ from sys import stdout from tempfile import TemporaryDirectory from pysam import AlignmentFile, FastxFile -from re import finditer, IGNORECASE from os import path from edgecaselib.util import get_executable, progressbar, revcomp +from edgecaselib.util import get_circular_pattern +from concurrent.futures import ThreadPoolExecutor, as_completed from edgecaselib.formats import filter_bam from 
functools import lru_cache from subprocess import check_output @@ -17,8 +18,8 @@ Usage: {0} repeatfinder [-m integer] [-M integer] [-r integer] [-P float] {1} [--jellyfish filename] [--jellyfish-hash-size string] - {1} [-n integer] [-j integer] [-f flagspec] [-F flagspec] - {1} [-q integer] [--fmt string] + {1} [-n integer] [-j integer] [-q integer] + {1} [-f flagspec]... [-F flagspec]... [--fmt string] {1} [--collapse-reverse-complement] Output: @@ -43,6 +44,11 @@ -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] + +Notes: + * Depending on the aligner used, MAPQ of secondary reads may have been set to + zero regardless of real mapping quality; use this filtering option with + caution. """ __docopt_converters__ = [ @@ -51,6 +57,7 @@ lambda max_k: int(max_k), lambda min_repeats: int(min_repeats), lambda max_p_adjusted: float(max_p_adjusted), + lambda max_motifs: None if (max_motifs is None) else int(max_motifs), lambda jobs: int(jobs), ] @@ -60,6 +67,13 @@ } +REPORT_COLUMNS = [ + "monomer", "motif", "length", "score", "fraction_explained", + "p", "p_adjusted", +] +REPORT_COLUMNS_ESCAPED = ["#"+REPORT_COLUMNS[0]] + REPORT_COLUMNS[1:] + + def interpret_args(fmt, jellyfish): """Parse and check arguments""" if fmt == "sam": @@ -74,25 +88,14 @@ def interpret_args(fmt, jellyfish): def convert_input(bam, manager, tempdir, samfilters): """Convert BAM to fasta; count bases""" fasta = path.join(tempdir, "input.fa") - base_count = 0 with manager(bam) as alignment, open(fasta, mode="wt") as fasta_handle: for entry in filter_bam(alignment, samfilters, "SAM/BAM -> FASTA"): entry_str = ">{}\n{}".format(entry.qname, entry.query_sequence) - base_count += len(entry.query_sequence) print(entry_str, file=fasta_handle) - return fasta, base_count - - -def 
count_fastx_bases(sequencefile, pattern=r'[acgt]', flags=IGNORECASE, desc="Counting input bases"): - """Count bases in FASTX file""" - with FastxFile(sequencefile) as fastx: - return sum( - sum(1 for _ in finditer(pattern, entry.sequence, flags=flags)) - for entry in progressbar(fastx, desc=desc, unit="read") - ) + return fasta -def find_repeats(sequencefile, min_k, max_k, min_repeats, base_count, jellyfish, jellyfish_hash_size, collapse_reverse_complement, jobs, tempdir): +def find_repeats(sequencefile, min_k, max_k, min_repeats, jellyfish, jellyfish_hash_size, collapse_reverse_complement, jobs, tempdir): """Find all repeats in sequencefile""" per_k_reports = [] k_iterator = progressbar( @@ -120,7 +123,6 @@ def find_repeats(sequencefile, min_k, max_k, min_repeats, base_count, jellyfish, ) k_report = k_report[repeats_indexer] k_report["kmer"] = k_report["kmer"].apply(lambda kmer:kmer[:k]) - k_report["abundance"] = k_report["count"] / base_count k_report["length"] = k per_k_reports.append(k_report) return concat(per_k_reports, axis=0) @@ -181,7 +183,7 @@ def get_motifs_fisher(single_length_report, collapse_reverse_complement=False): lowest_collapsed_revcomp_alpha_inversion if collapse_reverse_complement else lowest_alpha_inversion ) - fishery_groupby = fishery[["motif", "count", "abundance"]].groupby( + fishery_groupby = fishery[["motif", "count"]].groupby( "motif", as_index=False, ) fishery = fishery_groupby.sum() @@ -219,7 +221,7 @@ def analyze_repeats(full_report, collapse_reverse_complement=False, adj="bonferr ]) candidates["p_adjusted"] = multipletests(candidates["p"], method=adj)[1] return candidates[ - ["motif", "length", "count", "abundance", "p", "p_adjusted"] + ["motif", "length", "count", "p", "p_adjusted"] ] @@ -244,7 +246,7 @@ def coerce_and_filter_report(analysis, max_p_adjusted): if len(synonym_data): synonyms_to_keep.add( synonym_data.sort_values( - by="abundance", ascending=False + by="count", ascending=False ).iloc[0, 0] ) synonyms_to_remove = ( @@ 
-256,6 +258,46 @@ def coerce_and_filter_report(analysis, max_p_adjusted): ].copy() +def explain_report(filtered_analysis, sequencefile, min_repeats, jobs=1): + """Calculate fraction of reads explainable by each motif""" + explained_analysis = filtered_analysis.copy() + explained_analysis["bases_explained"], total_bases = 0.0, 0 + with FastxFile(sequencefile) as fastx: + def get_number_of_masked_positions(sequence, motifs): + n_masked_positions_per_motif = {} + for motif in motifs: + positions_to_mask = set() + motifs_pattern = get_circular_pattern( + motif, repeats=min_repeats, + ) + matcher = motifs_pattern.finditer(sequence, overlapped=True) + for match in matcher: + positions_to_mask |= set(range(match.start(), match.end())) + n_masked_positions_per_motif[motif] = len(positions_to_mask) + return n_masked_positions_per_motif, len(sequence) + with ThreadPoolExecutor(max_workers=jobs) as pool: + workers = [ + pool.submit( + get_number_of_masked_positions, entry.sequence, + set(filtered_analysis["motif"]), + ) + for entry in fastx + ] + iterator = progressbar( + as_completed(workers), total=len(workers), + desc="Calculating fractions", unit="read", + ) + for worker in iterator: + n_masked_positions_per_motif, total_seq_bases = worker.result() + for motif, n_pos in n_masked_positions_per_motif.items(): + indexer = ( + explained_analysis["motif"]==motif, "bases_explained", + ) + explained_analysis.loc[indexer] += n_pos + total_bases += total_seq_bases + return explained_analysis, total_bases + + def coerce_to_monomer(motif, min_k): """Coerce motif to monomer, e.g. 
TATA -> TA, CAT -> CAT; this can be used to find functionally synonymous entries too""" n = len(motif) @@ -268,23 +310,23 @@ def coerce_to_monomer(motif, min_k): return motif -def format_analysis(filtered_analysis, min_k, max_motifs): +def format_analysis(explained_analysis, min_k, max_motifs, total_bases): """Make dataframe prettier""" - filtered_analysis["motif"] = filtered_analysis["motif"].apply( + explained_analysis["motif"] = explained_analysis["motif"].apply( custom_alpha_inversion, ) - filtered_analysis["monomer"] = filtered_analysis["motif"].apply( + explained_analysis["monomer"] = explained_analysis["motif"].apply( lambda motif: coerce_to_monomer(motif, min_k=min_k), ) - formatted_analysis = filtered_analysis.sort_values( - by=["abundance", "p_adjusted"], ascending=[False, True], + formatted_analysis = explained_analysis.sort_values( + by=["count", "p_adjusted"], ascending=[False, True], ) - formatted_analysis = formatted_analysis[ - ["monomer", "motif", "length", "count", "abundance", "p", "p_adjusted"] - ] - formatted_analysis.columns = [ - "#monomer", "motif", "length", "count", "abundance", "p", "p_adjusted", - ] + formatted_analysis["score"], formatted_analysis["fraction_explained"] = ( + formatted_analysis["count"] / total_bases, + formatted_analysis["bases_explained"] / total_bases + ) + formatted_analysis = formatted_analysis[REPORT_COLUMNS] + formatted_analysis.columns = REPORT_COLUMNS_ESCAPED if max_motifs is None: return formatted_analysis else: @@ -297,26 +339,25 @@ def main(sequencefile, fmt, flags, flag_filter, min_quality, min_k, max_k, min_r with TemporaryDirectory() as tempdir: if manager == AlignmentFile: # will need to convert SAM to fastx samfilters = [flags, flag_filter, min_quality] - sequencefile, base_count = convert_input( + sequencefile = convert_input( sequencefile, manager, tempdir, samfilters, ) - else: - base_count = count_fastx_bases(sequencefile) full_report = find_repeats( - sequencefile, min_k, max_k, min_repeats, 
base_count, - jellyfish, jellyfish_hash_size, collapse_reverse_complement, - jobs, tempdir, + sequencefile, min_k, max_k, min_repeats, jellyfish, + jellyfish_hash_size, collapse_reverse_complement, jobs, tempdir, ) - if full_report is None: - columns = [ - "#monomer", "motif", "length", "count", "abundance", - "p", "p_adjusted", - ] - print(*columns, sep="\t", file=file) - else: - analysis = analyze_repeats(full_report, collapse_reverse_complement) - filtered_analysis = coerce_and_filter_report(analysis, max_p_adjusted) - formatted_analysis = format_analysis( - filtered_analysis, min_k, max_motifs, - ) - formatted_analysis.to_csv(file, sep="\t", index=False) + if full_report is None: + print(*REPORT_COLUMNS_ESCAPED, sep="\t", file=file) + else: + analysis = analyze_repeats(full_report, collapse_reverse_complement) + filtered_analysis = coerce_and_filter_report( + analysis, max_p_adjusted, + ) + explained_analysis, total_bases = explain_report( + filtered_analysis, sequencefile, min_repeats, jobs=jobs, + ) + formatted_analysis = format_analysis( + explained_analysis, min_k, max_motifs, total_bases, + ) + formatted_analysis.to_csv(file, sep="\t", index=False) + return 0 diff --git a/edgecaselib/tailchopper.py b/edgecaselib/tailchopper.py index b358337..c282dee 100644 --- a/edgecaselib/tailchopper.py +++ b/edgecaselib/tailchopper.py @@ -1,5 +1,6 @@ from sys import stdout, stderr from pysam import AlignmentFile +from numpy import array, where, errstate from re import search, split from edgecaselib.formats import load_index, filter_bam, interpret_flags from edgecaselib.util import progressbar @@ -7,8 +8,8 @@ __doc__ = """edgeCase tailchopper: selection of overhanging heads/tails of reads -Usage: {0} tailchopper -x filename [-t targetspec] [-f flagspec] [-F flagspec] - {1} [-q integer] +Usage: {0} tailchopper -x filename [-t targetspec] + {1} [-f flagspec]... [-F flagspec]... 
[-q integer] Output: SAM-formatted file with tails of candidate reads overhanging anchors defined @@ -27,6 +28,11 @@ -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] + +Notes: + * Depending on the aligner used, MAPQ of secondary reads may have been set to + zero regardless of real mapping quality; use this filtering option with + caution. """ __docopt_converters__ = [ @@ -36,7 +42,7 @@ __docopt_tests__ = { lambda target: - target in {"ucsc_mask_anchor", "fork", "tract_anchor", "cigar"}: + target in {"mask_anchor", "fork", "tract_anchor", "cigar"}: "unknown value of --target", } @@ -58,8 +64,9 @@ def get_cigar_clip_length(entry, prime): ) -def update_aligned_segment(entry, map_pos, start=None, end=None): +def update_aligned_segment(entry, ref_map_pos, bounds): """Update sequence, cigar, quality string in-place""" + start, end = bounds if (end is not None) and (start is not None) and (end < start): start, end = end, start qualities_substring = entry.query_qualities[start:end] @@ -68,8 +75,8 @@ def update_aligned_segment(entry, map_pos, start=None, end=None): entry.cigarstring = str(len(entry.query_sequence)) + "S" else: entry.cigarstring = None - if map_pos is not None: - entry.reference_start += map_pos + if ref_map_pos is not None: + entry.reference_start = ref_map_pos entry.query_qualities = qualities_substring @@ -82,7 +89,7 @@ def cigar_chopper(entry, ecx, integer_target): else: cigar_clip = search(r'^(\d+[SH])+', entry.cigarstring) if not cigar_clip: - update_aligned_segment(entry, None, 0, 0) + update_aligned_segment(entry, None, (0, 0)) error = "No clipped sequence" else: clip_length = sum( @@ -91,55 +98,35 @@ def cigar_chopper(entry, ecx, integer_target): ) if clip_length > 0: if is_q: - map_pos = entry.reference_end - 1 - 
update_aligned_segment(entry, map_pos, -clip_length, None) + update_aligned_segment( + entry, entry.reference_end-1, (-clip_length, None), + ) else: - map_pos = entry.reference_start - update_aligned_segment(entry, map_pos, None, clip_length) + update_aligned_segment( + entry, entry.reference_start, (None, clip_length), + ) else: - update_aligned_segment(entry, None, 0, 0) + update_aligned_segment(entry, None, (0, 0)) error = "No clipped sequence" return entry, error def find_map_and_cut_positions(entry, anchor_pos, is_q): - """Find closest mapping position within `relax_radius` of anchor""" - positions = entry.get_reference_positions(full_length=True) + """Find closest mapping position to anchor""" + ref_positions = array( + entry.get_reference_positions(full_length=True), # [read_pos->ref_pos] + dtype=float, + ) try: - cut_pos = positions.index(anchor_pos) if is_q: - return cut_pos, cut_pos - else: - return entry.reference_start, cut_pos - except ValueError: - if entry.reference_start <= anchor_pos < entry.reference_end: - ref_map_length = entry.reference_end - 1 - entry.reference_start - try: - read_map_length = ( - positions.index(entry.reference_end-1) - - positions.index(entry.reference_start) - ) - except ValueError: - raise IndexError("SAM error? 
Last mapped position not on read") - ref_anchor_distance = anchor_pos - entry.reference_start - cut_pos = int(round( - ref_anchor_distance * read_map_length / ref_map_length - )) - if is_q: - return cut_pos, cut_pos - else: - return entry.reference_start, cut_pos - elif is_q: - if entry.reference_start >= anchor_pos: - return entry.reference_start, entry.reference_start - elif entry.reference_end < anchor_pos: - raise ValueError("Anchor position beyond mapped portion") + read_cut_pos = where(ref_positions>=anchor_pos)[0][0] else: - if entry.reference_end < anchor_pos: - return entry.reference_start, entry.reference_end - 1 - elif entry.reference_start > anchor_pos: - raise ValueError("Anchor position beyond mapped portion") - raise ValueError("Satisfactory cutting position not found") + read_cut_pos = where(ref_positions<=anchor_pos)[0][-1] + except IndexError: + ref_map_pos, read_cut_pos = None, None + else: + ref_map_pos = ref_positions[read_cut_pos].astype(int) + return ref_map_pos, read_cut_pos def relative_chopper(entry, ecx, integer_target): @@ -155,22 +142,16 @@ def relative_chopper(entry, ecx, integer_target): if len(anchor_positions) > 1: raise ValueError("Ambiguous index entry: {}".format(anchor_positions)) elif len(anchor_positions) == 0: - update_aligned_segment(entry, None, 0, 0) + update_aligned_segment(entry, None, (0, 0)) error = "No anchor data in index" else: - anchor_pos = anchor_positions.iloc[0] - try: - map_pos, cut_pos = find_map_and_cut_positions( - entry, anchor_pos, is_q, - ) - except ValueError as e: - update_aligned_segment(entry, None, 0, 0) - error = str(e) + ref_map_pos, read_cut_pos = find_map_and_cut_positions( + entry, anchor_positions.iloc[0], is_q, + ) + if is_q: + update_aligned_segment(entry, ref_map_pos, (read_cut_pos, None)) else: - if is_q: - update_aligned_segment(entry, map_pos, cut_pos, None) - else: - update_aligned_segment(entry, map_pos, None, -cut_pos) + update_aligned_segment(entry, ref_map_pos, (None, read_cut_pos)) 
return entry, error @@ -188,20 +169,23 @@ def main(bam, index, flags, flag_filter, min_quality, target, file=stdout, **kwa filter_bam(alignment, [flags, flag_filter, min_quality]), desc="Chopping", unit="read", ) - for entry in bam_iterator: - if entry.query_sequence: - chopped_entry, error = chopper( - entry, ecx, integer_target, - ) - if chopped_entry.query_sequence: - print(chopped_entry.to_string(), file=file) - else: - n_skipped += 1 + with errstate(invalid="ignore"): + for entry in bam_iterator: + if entry.query_sequence: + chopped_entry, error = chopper( + entry, ecx, integer_target, + ) + if chopped_entry.query_sequence: + print(chopped_entry.to_string(), file=file) + else: + n_skipped += 1 if n_skipped: - print(n_skipped, "reads skipped", file=stderr) - warnings = [ + msg_mask = "Skipped {} reads to be safe (unsure where to chop)" + print(msg_mask.format(n_skipped), file=stderr) + warning = [ "WARNING: Read mapping positions were adjusted and retained;", " this is needed to comply with the SAM spec.", - " Do not use these positions for analyses outside of edgeCase!" 
+ " Do not use these positions for analyses outside of edgeCase!", ] - print("\n".join(warnings), file=stderr) + print("\n".join(warning), file=stderr) + return 0 diff --git a/edgecaselib/tailpuller.py b/edgecaselib/tailpuller.py index 4b58b90..69cc8c9 100644 --- a/edgecaselib/tailpuller.py +++ b/edgecaselib/tailpuller.py @@ -1,6 +1,6 @@ -from sys import stdout +from sys import stdout, stderr from os import path -from edgecaselib.formats import load_index, filter_bam +from edgecaselib.formats import interpret_flags, load_index, filter_bam from pysam import AlignmentFile from functools import reduce from operator import __or__ @@ -8,29 +8,56 @@ from edgecaselib.util import progressbar from itertools import chain from numpy import isnan, inf +from pandas import DataFrame, merge +from collections import defaultdict __doc__ = """edgeCase tailpuller: selection of candidate telomeric long reads -Usage: {0} tailpuller -x filename [-f flagspec] [-F flagspec] [-q integer] - {1} [-m integer] +Usage: {0} tailpuller -x filename [-t targetspec]... + {1} [-M integer] [--min-map-overlap integer] + {1} [-m integer] [--min-telomere-overlap integer] + {1} [--output-ambiguous-reads string] + {1} [-f flagspec]... [-F flagspec]... 
[-q integer] Output: SAM-formatted file with reads overhanging anchors defined in index Positional arguments: - name of input BAM/SAM file; must have a .bai index + name of input BAM/SAM file; must have a .bai index Required options: - -x, --index [filename] location of the reference .ecx index + -x, --index [filename] location of the reference .ecx index Options: - -m, --max-read-length [integer] maximum read length to consider when selecting lookup regions + -t, --target [targetspec] target reads overlapping these features (ECX flags) [default: tract_anchor] + -M, --max-read-length [integer] maximum read length to consider when selecting lookup regions * + --min-map-overlap [integer] minimum overlap of reference to consider read as mapped [default: 1] ** + -m, --min-subtelomere-overlap [integer] minimum overlap of subtelomere to consider read as candidate [default: 1] *** + --min-telomere-overlap [integer] minimum overlap of telomere to consider read as candidate [default: 1] *** + --output-ambiguous-reads [string] which ambiguously mapping reads to retain (none, all, longest-overlap) [default: none] Input filtering options: - -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] - -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] - -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] **** + -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] ***** + +Notes: + * Suggested value of --max-read-length for PacBio HiFi: 30000; + if not specified, will assume +infinity (will be slow). 
+ ** Suggested value of --min-map-overlap for PacBio HiFi: 500; + *** Suggested value of --min-(sub)telomere-overlap for PacBio HiFi: 3000; + **** It is recommended to include secondary and supplementary reads (i.e., + leave the -F flag as default [0]), because: + **** edgeCase determines unambiguously mapped reads on its own; aligners + assign the 'supplementary' flag to multi-mapping reads arbitrarily, + and removing such supplementary reads upstream may lead to loss of + information in telomeric regions; + **** edgeCase will discard chimeric reads in terminal regions if + information about supplementary alignments is present. +***** Depending on the aligner used, MAPQ of secondary reads may have been set + to zero regardless of real mapping quality; use this filtering option with + caution. """ __docopt_converters__ = [ @@ -38,6 +65,12 @@ None if (min_quality is None) else int(min_quality), lambda max_read_length: inf if (max_read_length is None) else int(max_read_length), + lambda min_map_overlap: + 1 if (min_map_overlap is None) else int(min_map_overlap), + lambda min_subtelomere_overlap: + 1 if (min_subtelomere_overlap is None) else int(min_subtelomere_overlap), + lambda min_telomere_overlap: + None if (min_telomere_overlap is None) else int(min_telomere_overlap), ] __docopt_tests__ = { @@ -45,6 +78,12 @@ path.isfile(bam + ".bai"): "BAM index (.bai) not found", lambda max_read_length: max_read_length > 0: "--max-read-length below 0", + lambda target: + set(target) <= {"mask_anchor", "fork", "tract_anchor"}: + "unknown value of --target", + lambda output_ambiguous_reads: + output_ambiguous_reads in {"none", "all", "longest-overlap"}: + "unknown value of --output-ambiguous-reads", } @@ -74,23 +113,24 @@ def updated_entry(entry, flags, is_q=False): return new_entry -def filter_entries(bam_data, ecxfd, samfilters): +def filter_entries(bam_data, ecxfd, targets, samfilters, min_map_overlap): """Only pass reads extending past regions specified in the ECX""" for 
entry in filter_bam(bam_data, samfilters): if entry.reference_name in ecxfd: - # find positions of start and end of read relative to reference: - p_pos = get_terminal_pos(entry, cigarpos=0) - q_pos = get_terminal_pos(entry, cigarpos=-1) - # collect ECX flags where anchor is to right of read start: - ecx_t_p5 = ecxfd[entry.reference_name][5] - p_flags = set(ecx_t_p5.loc[ecx_t_p5["pos"]>=p_pos, "flag"]) - if p_flags: - yield updated_entry(entry, p_flags) - # collect ECX flags where anchor is to left of read end - ecx_t_p3 = ecxfd[entry.reference_name][3] - q_flags = set(ecx_t_p3.loc[ecx_t_p3["pos"]= min_map_overlap: + # find positions of start and end of read relative to reference: + p_pos = get_terminal_pos(entry, cigarpos=0) + q_pos = get_terminal_pos(entry, cigarpos=-1) + # collect ECX flags where anchor is to right of read start: + ecx_t_p5 = ecxfd[entry.reference_name][5] + p_flags = set(ecx_t_p5.loc[ecx_t_p5["pos"]>=p_pos, "flag"]) + if targets & p_flags: + yield updated_entry(entry, p_flags) + # collect ECX flags where anchor is to left of read end + ecx_t_p3 = ecxfd[entry.reference_name][3] + q_flags = set(ecx_t_p3.loc[ecx_t_p3["pos"]= min_telomere_overlap: + if entry.reference_length >= min_subtelomere_overlap: + if entry.mapq >= min_quality: # enforce quality on second pass + print(entry.to_string(), file=file) + if entry.seq is None: + n_orphaned_entries += 1 + if n_orphaned_entries == 0: + return 0 + else: + warning = [ + f"CRITICAL: {n_orphaned_entries} entries have no sequence data;", + " this will cause information loss downstream.", + " Please re-submit a SAM/BAM with sequences reported for ", + " all alignments.", + ] + print("\n".join(warning), file=stderr) + return 1 diff --git a/edgecaselib/util.py b/edgecaselib/util.py index 1ffa3d5..9605c26 100644 --- a/edgecaselib/util.py +++ b/edgecaselib/util.py @@ -1,5 +1,5 @@ from sys import stderr -from regex import compile +from regex import compile, IGNORECASE as REGEX_IGNORECASE from re import split, 
search, IGNORECASE from shutil import which from os import path, access, X_OK @@ -7,10 +7,6 @@ from tqdm import tqdm -MAINCHROMS_ENSEMBL = {str(i) for i in range(1, 23)} | {"X", "Y"} -MAINCHROMS_UCSC = {"chr" + s for s in MAINCHROMS_ENSEMBL} -MAINCHROMS_T2T = {"chrX_fixedBionanoSV_centromereV3"} - ALPHABET = list("ACGT") COMPLEMENTS = dict(zip(ALPHABET, reversed(ALPHABET))) COMPLEMENT_PATTERN = compile(r'|'.join(COMPLEMENTS.keys())) @@ -54,6 +50,19 @@ def motif_revcomp(motif, ignorecase=True): raise ValueError("Unsupported character(s) in motif: {}".format(motif)) +def get_circular_pattern(motif, repeats=2): + """Convert motif into circular regex pattern (e.g., r'TCGA|CGAT|GATC|ATCG' for TCGA)""" + atom_pattern = compile(r'[ACGT.]|\[[ACGT]+\]', flags=REGEX_IGNORECASE) + atoms = atom_pattern.findall(motif) + if "".join(atoms) != motif: + raise ValueError("Could not parse motif: {}".format(motif)) + repeated_inversions = { + "".join(atoms[i:] + atoms[:i]) * repeats + for i in range(len(atoms)) + } + return compile(r'|'.join(repeated_inversions), flags=REGEX_IGNORECASE) + + def chromosome_natsort(chrom): """Natural order sorting that undestands chr1, 4, chr10, chr14_K*, 7ptel etc""" keyoder = [] diff --git a/publications/methods-paper/Snakefile b/publications/methods-paper/Snakefile deleted file mode 100644 index 2291027..0000000 --- a/publications/methods-paper/Snakefile +++ /dev/null @@ -1,772 +0,0 @@ -from pandas import read_csv, concat, DataFrame, Series, merge -from re import search -from statsmodels.stats.multitest import multipletests -from itertools import product, count, chain -from numpy import array, arange, fromiter, full_like, uint32, concatenate, nan -from numpy import sort, ceil, argsort, median, cumsum, interp -from numpy import log as log2 -from glob import glob -from os import path, getcwd -from sys import path as sys_path -from matplotlib.pyplot import switch_backend, subplots -from seaborn import lineplot, heatmap -from pysam import AlignmentFile, 
FastxFile -from gzip import open as gzopen -from collections import defaultdict, OrderedDict -from scipy.stats import combine_pvalues, entropy -from functools import reduce, partial - -sys_path.insert(0, getcwd()) -from edgecaselib.formats import load_kmerscan, PAPER_PALETTE, PAPER_PALETTE_RC -from edgecaselib.util import progressbar, natsorted_chromosomes, revcomp -from edgecaselib.repeatfinder import lowest_collapsed_revcomp_alpha_inversion -from edgecaselib.repeatfinder import custom_alpha_inversion - - -PACBIO_NAME_TO_SAMPLE = { - "HG001": "HG001/HG001.RTG", "HG002": "HG002/HG002.10kb+15kb", - "HG005": "HG005/HG005.10x" -} -PAPER_PALETTES = {"p_arm": PAPER_PALETTE_RC, "q_arm": PAPER_PALETTE} -IS_Q_TRACT_ANCHOR, TRACT_ANCHOR, IS_Q_3844 = 49152, 16384, 36612 -ENTROPY_W = 10 - - -rule all: - input: - giab_full=expand( - "data/datasets/GIAB/PacBio/{sample}-densityplot-{arm}.pdf", - sample=PACBIO_NAME_TO_SAMPLE.values(), arm=["p_arm", "q_arm"] - ), - chm_full=expand( - "data/datasets/T2T/PacBio/sorted_pbhifi_t2t_CHM13hTERT-densityplot-{arm}.pdf", - arm=["p_arm", "q_arm"] - ), - giab_haplotypes=expand( - "data/datasets/GIAB/PacBio/{sample}-levenshtein-{arm}-densityplots.tar", - sample=PACBIO_NAME_TO_SAMPLE.values(), arm=["p_arm", "q_arm"] - ), - chm_haplotypes=expand( - "data/datasets/T2T/PacBio/sorted_pbhifi_t2t_CHM13hTERT-levenshtein-{arm}-densityplots.tar", - arm=["p_arm", "q_arm"] - ), - shell: "rm -f {input.giab_haplotypes} {input.chm_haplotypes} || :" - - -rule telbam_to_fastq: - """Extracts reads in FASTQ format from telbams for: mapping to long reads; shortread-only analyses. 
Telbams come from previous analyses.""" - input: bam="{prefix}-telbam.bam" - output: fq="{prefix}-telbam.fq.gz" - run: - namecounts, processed = defaultdict(lambda: count(1)), set() - get_name_repr = lambda en: "{}/{}".format(en, next(namecounts[en])) - with AlignmentFile(input.bam) as bam, gzopen(output.fq, "wt") as fq: - for entry in bam: - if (entry.qname, entry.seq) not in processed: - processed.add((entry.qname, entry.seq)) - entry_repr = "@{}\n{}\n+\n{}".format( - get_name_repr(entry.qname), entry.seq, entry.qual - ) - print(entry_repr, file=fq) - - -rule tailchopper_faidx: - """Creates a FASTA index (FAI) for mapping telbam FASTQs to long reads. Tailchopper BAMs come from previous analyses.""" - input: bam="{prefix}-tailchopper.bam" - output: fa="{prefix}-tailchopper.fa", fai="{prefix}-tailchopper.fa.fai" - shell: """ - samtools view -F3840 {input.bam} \ - | bioawk -c sam '{{print ">"$qname; print $seq}}' > {output.fa} - samtools faidx {output.fa} - """ - - -def map_telbam_to_tailchopper_input(w): - fq_mask = "data/datasets/{}/Illumina/{}/{}-telbam.fq.gz" - fq = fq_mask.format(w.group, w.sample, w.sample) - fa_mask = "data/datasets/{}/PacBio/{}-tailchopper.fa" - fa = fa_mask.format(w.group, PACBIO_NAME_TO_SAMPLE[w.sample]) - return dict(fq=fq, fa=fa, fai=fa+".fai") - - -rule map_telbam_to_tailchopper: - """Maps telbam FASTQs (from telbam_to_fastq) to tailchopper FASTAs (from tailchopper_faidx)""" - input: unpack(map_telbam_to_tailchopper_input) - output: bam="data/datasets/{group}/Illumina/{sample}/{sample}-telbam2tailchopper.bam" - threads: 12 - shell: """ - n_multimap=1000000 - minimap2 -t {threads} -ax sr \ - --secondary=yes -p0 -N$n_multimap {input.fa} {input.fq} \ - | samtools view -bh > {output.bam} - """ # n_multimap=$(cat {input.fai} | wc -l) - - -def telbam_support_input(w): - ref_mask = "data/datasets/{}/PacBio/{}-tailchopper.bam" - ref = ref_mask.format(w.group, PACBIO_NAME_TO_SAMPLE[w.sample]) - bam_mask = 
"data/datasets/{}/Illumina/{}/{}-telbam2tailchopper.bam" - bam = bam_mask.format(w.group, w.sample, w.sample) - return dict(ref=ref, bam=bam) - - -rule telbam_support: - """Outputs stretches of long reads that are mapped by short telbam reads (from map_telbam_to_tailchopper)""" - input: - unpack(telbam_support_input) - output: - sam="data/datasets/{group}/Illumina/{sample}/{sample}-telbam-support.sam" - params: - cigar_regex=r'(^[0-9A-LN-Z]*[A-LN-Z]|^)([0-9]+M)[0-9A-LN-Z]*$', - min_match=50 - run: - ref_covpos = defaultdict(set) - with AlignmentFile(input.bam) as bam: - for entry in progressbar(bam, desc="telbam input"): - cigar_match = search(params.cigar_regex, str(entry.cigarstring)) - if cigar_match: - start, end = entry.reference_start, entry.reference_end - if end - start >= params.min_match: - if end - start == int(cigar_match.group(2)[:-1]): - covpos = set(range(start, end)) - ref_covpos[entry.reference_name] |= covpos - with AlignmentFile(input.ref) as ref, open(output.sam, "wt") as sam: - print(str(ref.header).rstrip("\n"), file=sam) - for ref_entry in progressbar(ref, desc="bam output"): - covpos = ref_covpos[ref_entry.qname] - if covpos: - ref_seq_array = fromiter(ref_entry.seq, dtype="= 0] - else: - support_columns = [c for c in arm_support.columns if c < 0] - arm_support = arm_support[support_columns] - else: - raise NotImplementedError("Fix known issue with tailchopper mappos!") - steps = range(int(ceil(arm_support.shape[1]/mu))) - blocks = (arm_support.iloc[:,i*mu:i*mu+mu].mean(axis=1) for i in steps) - row_order = arm_support.isnull().sum(axis=1).sort_values().index - mean_arm_support = concat(blocks, axis=1).loc[row_order] - heatmap(mean_arm_support, cmap="summer_r", cbar=False, ax=ax) - twinx = ax.twinx() - lineplot_kws = dict( - x=mean_arm_support.columns, y=mean_arm_support.sum(axis=0), ax=twinx - ) - lineplot(**lineplot_kws, color="white", lw=5, alpha=.5) - lineplot(**lineplot_kws, color="#E10000", lw=1.5) - twinx.set(ylim=(0, 
len(arm_support))) - offset = arm_support.columns[::mu].to_series().abs().min() - get_mu_label = lambda t: arm_support.columns[::mu][int(t)] + offset - check_mu_label = lambda t: get_mu_label(t) % mu_visible < mu_visible / 5 - xticklabels = [ - get_mu_label(xt.get_text()) if check_mu_label(xt.get_text()) else "" - for xt in ax.get_xticklabels() - ] - ax.set(xticklabels=xticklabels, yticks=[]) - return twinx, arm_support - - -def redecorate_support_and_save(axs, twinxs, chroms, figure, pdf, text_kws=dict(x=.02, y=.9, va="top", ha="left")): - ymax = max(max(ax.get_ylim()) for ax in chain(axs.flatten(), twinxs)) - for ax in chain(axs.flatten(), twinxs): - ax.set(ylim=(0, ymax)) - for i, chrom in enumerate(chroms): - axs[i,0].text(**text_kws, s=chrom, transform=axs[i,0].transAxes) - for ax in chain(axs[:,0], axs[:-1,1], twinxs[:-1]): - ax.set(yticks=[]) - for ax in axs[:-1,:].flatten(): - ax.set(xticks=[]) - figure.savefig(pdf, bbox_inches="tight") - - -def dna2bool(seq): - return array(list(seq.upper())).view(uint32) & 7 != 6 - - -def write_support_stats(telbam2tailchopper, telbam_fq, supported_bases, total_bases, coverages, txt): - with AlignmentFile(telbam2tailchopper) as tb2tc: - supporting_telbams = len(set( - e.qname for e in progressbar(tb2tc, desc="Calculating stats") - if e.flag & 4 == 0 - )) - with FastxFile(telbam_fq) as fq: - total_telbams = sum(1 for _ in fq) - p_coverage = concatenate(coverages[False]) - q_coverage = concatenate(coverages[True]) - with open(txt, mode="wt") as txt_handle: - print("telbam_used", end="\t", file=txt_handle) - print(supporting_telbams, total_telbams, sep="\t", file=txt_handle) - print("bases_supported", end="\t", file=txt_handle) - print(supported_bases, total_bases, sep="\t", file=txt_handle) - print("median_coverage_[p,q]", end="\t", file=txt_handle) - print(median(p_coverage), median(q_coverage), sep="\t", file=txt_handle) - print("mean_coverage_[p,q]", end="\t", file=txt_handle) - print(p_coverage.mean(), 
q_coverage.mean(), sep="\t", file=txt_handle) - - -rule telbam_support_coverage: - """Calculates and visualizes telbam support of long reads (from telbam_support)""" - input: - telbam_fq="data/datasets/{group}/Illumina/{sample}/{sample}-telbam.fq.gz", - telbam2tailchopper="data/datasets/{group}/Illumina/{sample}/{sample}-telbam2tailchopper.bam", - tb_support="data/datasets/{group}/Illumina/{sample}/{sample}-telbam-support.sam" - output: - pdf="data/datasets/{group}/Illumina/{sample}/{sample}-telbam-support.pdf", - txt="data/datasets/{group}/Illumina/{sample}/{sample}-telbam-support.txt" - params: - fix_tailchopper_mappos=True, - subplots_kws=dict(ncols=2, sharey=True, squeeze=False, gridspec_kw=dict(hspace=0, wspace=0)) - run: - supported_bases, total_bases = 0, 0 - with AlignmentFile(input.tb_support) as tb_support: - entry_support_as_list = [] - for entry in filter(lambda e: e.flag & 3840 == 0, tb_support): - bool_seq = dna2bool(entry.seq) - total_bases += len(bool_seq) - supported_bases += sum(bool_seq) - meta = [entry.reference_name, entry.flag & 0x8000 == 0x8000] - entry_support_as_list.append(Series( - name=entry.qname, data=meta+bool_seq.tolist(), - index=reindex_entry(entry, params.fix_tailchopper_mappos) - )) - entry_support = concat(entry_support_as_list, axis=1, sort=True).T - support_cols = ["chrom", "is_q"] + sorted(entry_support.columns[2:]) - entry_support = entry_support[support_cols] - chroms = natsorted_chromosomes(entry_support["chrom"].drop_duplicates()) - switch_backend("pdf") - twinxs, figure, axs = [], *subplots( - nrows=len(chroms), figsize=(14, len(chroms)), **params.subplots_kws - ) - coverages = {False: [], True: []} - for i, chrom in enumerate(progressbar(chroms, unit="chromosome")): - for j, is_q in enumerate([False, True]): - twinx, arm_support = visualize_telbam_coverage( - entry_support, chrom=chrom, is_q=is_q, ax=axs[i,j], - fix_tailchopper_mappos=params.fix_tailchopper_mappos - ) - twinxs.append(twinx) - if arm_support is not None: 
- coverages[is_q].append(arm_support.sum(axis=0).values) - redecorate_support_and_save(axs, twinxs, chroms, figure, output.pdf) - write_support_stats( - input.telbam2tailchopper, input.telbam_fq, - supported_bases, total_bases, coverages, output.txt - ) - - -def repeatfinder_input(w): - if w.group == "GIAB": - sample = "/".join([w.sample.split("/")[0], w.sample.split("/")[0]]) - technology, postfix = "Illumina", "telbam-support.sam" - else: - sample, technology, postfix = w.sample, "PacBio", "tailchopper.bam" - return dict(sam="data/datasets/{}/{}/{}-{}".format( - w.group, technology, sample, postfix - )) - - -rule repeatfinder: - """Runs edgecase repeatfinder on any PacBio sample. Outputs of relevance to the paper: - GIAB: PacBio: HG001.RTG-repeatfinder-p_arm.tsv - GIAB: PacBio: HG001.RTG-repeatfinder-q_arm.tsv - GIAB: PacBio: HG002.10kb+15kb-repeatfinder-p_arm.tsv - GIAB: PacBio: HG002.10kb+15kb-repeatfinder-q_arm.tsv - GIAB: PacBio: HG005.10x-repeatfinder-p_arm.tsv - GIAB: PacBio: HG005.10x-repeatfinder-q_arm.tsv""" - input: - unpack(repeatfinder_input) - output: - p_arm="data/datasets/{group}/PacBio/{sample}-repeatfinder-p_arm.tsv", - q_arm="data/datasets/{group}/PacBio/{sample}-repeatfinder-q_arm.tsv" - threads: 4 - shell: """ - ./edgecase repeatfinder -j {threads} -f 'tract_anchor' -F 'is_q|3840' \ - -m 4 -M 16 -P 1.1 {input.sam} > {output.p_arm} - ./edgecase repeatfinder -j {threads} -f 'is_q|tract_anchor' -F 3840 \ - -m 4 -M 16 -P 1.1 {input.sam} > {output.q_arm} - """ - - -def load_repeatfinder(unfiltered_filenames, arm): - sample_reports, subjects = [], [] - for tsv in sorted(f for f in unfiltered_filenames if arm in f): - sample_report = read_csv(tsv, sep="\t", usecols=(0, 4, 5), index_col=0) - subject = search(r'HG00[0-9]', tsv).group() - subjects.append(subject) - sample_report.columns = [ - subject+" "+c for c in list(sample_report.columns) - ] - sample_reports.append(sample_report) - raw_report = concat(sample_reports, axis=1, sort=False) - 
raw_report["arm"] = arm - return raw_report, subjects - - -rule giab_repeats: - """Combines GIAB repeatfinder results (from repeatfinder) into one table, calculates Mudholkar-George combined p-values. - Outputs of relevance to the paper: - GIAB: PacBio: repeatfinder-paper-p_arm.tsv - GIAB: PacBio: repeatfinder-paper-q_arm.tsv""" - input: - giab=expand( - "data/datasets/GIAB/PacBio/{sample}-repeatfinder-{arm}.tsv", - sample=["HG001/HG001.RTG", "HG002/HG002.10kb+15kb", "HG005/HG005.10x"], - arm=["p_arm", "q_arm"] - ) - output: - giab_merged_p_arm=temp("data/datasets/GIAB/PacBio/repeatfinder-paper-p_arm-unadjusted.tsv"), - giab_merged_q_arm=temp("data/datasets/GIAB/PacBio/repeatfinder-paper-q_arm-unadjusted.tsv"), - run: - p_arm_report, p_arm_subjects = load_repeatfinder(input.giab, "p_arm") - q_arm_report, q_arm_subjects = load_repeatfinder(input.giab, "q_arm") - assert set(p_arm_subjects) == set(q_arm_subjects) - subjects = p_arm_subjects - merged_report = concat([p_arm_report, q_arm_report], axis=0).dropna() - p_ilocs = list(range(1, len(input.giab), 2)) - merged_report["mgpval"] = merged_report.iloc[:,p_ilocs].fillna(1).apply( - lambda r: combine_pvalues(r, method="mudholkar_george")[1], axis=1, - ) - merged_report.index.name = "#motif" - merged_report = merged_report[sorted( - c for c in merged_report.columns if not c.endswith("p") - )] - subset_report = lambda df, arm: df[df["arm"]==arm].drop(columns="arm") - subset_report(merged_report, "p_arm").to_csv( - output.giab_merged_p_arm, sep="\t" - ) - subset_report(merged_report, "q_arm").to_csv( - output.giab_merged_q_arm, sep="\t" - ) - - -rule plottable_repeats: - """Selects top repeats for plotting. 
Of relevance to the paper: - GIAB: PacBio: repeatfinder-plottable-p_arm.tsv - GIAB: PacBio: repeatfinder-plottable-q_arm.tsv""" - input: - giab=expand("data/datasets/GIAB/PacBio/repeatfinder-paper-{arm}.tsv", arm=["p_arm", "q_arm"]), - chm=expand("data/datasets/T2T/PacBio/sorted_pbhifi_t2t_CHM13hTERT-repeatfinder-{arm}.tsv", arm=["p_arm", "q_arm"]) - output: - giab=expand("data/datasets/GIAB/PacBio/repeatfinder-plottable-{arm}.tsv", arm=["p_arm", "q_arm"]), - chm=expand("data/datasets/T2T/PacBio/sorted_pbhifi_t2t_CHM13hTERT-repeatfinder-plottable-{arm}.tsv", arm=["p_arm", "q_arm"]) - params: - max_motifs=5, min_abundance={"p_arm": .002, "q_arm": .01} - run: - for group, arm in product(["GIAB", "T2T"], ["p_arm", "q_arm"]): - matches = lambda tsv: (group in tsv) and (arm in tsv) - full_tsv = next(filter(matches, input)) - full_report = read_csv(full_tsv, sep="\t", index_col=0) - full_report = full_report[( - c for c in full_report.columns if "abundance" in c - )] - report_medians = full_report.median(axis=1).rename("abundance") - report_medians.index.name = "#motif" - report_medians_df = report_medians.to_frame() - filtered_report_medians = report_medians_df[:params.max_motifs][ - report_medians_df["abundance"]>=params.min_abundance[arm] - ] - small_tsv = next(filter(matches, output)) - filtered_report_medians.to_csv(small_tsv, sep="\t") - - -def kmerscanner_input(w): - bam_mask = "{}/{}/PacBio/{}-tailpuller.bam" - bam = bam_mask.format(w.prefix, w.group, w.sample) - if w.group == "GIAB": - tsv_mask = "{}/{}/PacBio/repeatfinder-plottable-{}.tsv" - tsv = tsv_mask.format(w.prefix, w.group, w.arm) - else: - tsv_mask = "{}/{}/PacBio/{}-repeatfinder-plottable-{}.tsv" - tsv = tsv_mask.format(w.prefix, w.group, w.sample, w.arm) - return dict(bam=bam, tsv=tsv) - - -def get_sam_flags(arm): - if arm == "q_arm": - shell_flags = "-f {} -F 3844".format(IS_Q_TRACT_ANCHOR) - samfilters = [IS_Q_TRACT_ANCHOR, 3844, 0] - elif arm == "p_arm": - shell_flags = "-f {} -F 
{}".format(TRACT_ANCHOR, IS_Q_3844) - samfilters = [TRACT_ANCHOR, IS_Q_3844, 0] - else: - raise ValueError("Unknown `arm`: '{}'".format(arm)) - return shell_flags, samfilters - - -rule kmerscanner: - """Runs kmerscanner for only the plottable repeats (from plottable_repeats). Outputs of relevance to the paper: - GIAB: PacBio: HG001.RTG-kmerscanner-plottable-p_arm-temp.dat.gz - GIAB: PacBio: HG001.RTG-kmerscanner-plottable-q_arm-temp.dat.gz - GIAB: PacBio: HG002.10kb+15kb-kmerscanner-plottable-p_arm-temp.dat.gz - GIAB: PacBio: HG002.10kb+15kb-kmerscanner-plottable-q_arm-temp.dat.gz - GIAB: PacBio: HG005.10x-kmerscanner-plottable-p_arm-temp.dat.gz - GIAB: PacBio: HG005.10x-kmerscanner-plottable-q_arm-temp.dat.gz""" - input: unpack(kmerscanner_input) - output: dat=temp("{prefix}/{group}/PacBio/{sample}-kmerscanner-plottable-{arm}-temp.dat.gz") - run: - shell_flags, _ = get_sam_flags(wildcards.arm) - shell(""" - ./edgecase kmerscanner {shell_flags} --motif-file {input.tsv} \ - {input.bam} | gzip -2 > {output.dat} - """) - - -rule kmerscanner_filtered: - """Filters kmerscanner outputs (from kmerscanner) to only the chromosome arms that are covered enough. 
Of relevance to the paper: - GIAB: PacBio: HG001.RTG-kmerscanner-plottable-p_arm.dat.gz - GIAB: PacBio: HG001.RTG-kmerscanner-plottable-q_arm.dat.gz - GIAB: PacBio: HG002.10kb+15kb-kmerscanner-plottable-p_arm.dat.gz - GIAB: PacBio: HG002.10kb+15kb-kmerscanner-plottable-q_arm.dat.gz - GIAB: PacBio: HG005.10x-kmerscanner-plottable-p_arm.dat.gz - GIAB: PacBio: HG005.10x-kmerscanner-plottable-q_arm.dat.gz""" - input: dat="{prefix}/{group}/PacBio/{sample}-kmerscanner-plottable-{arm}-temp.dat.gz" - output: dat="{prefix}/{group}/PacBio/{sample}-kmerscanner-plottable-{arm}.dat.gz" - params: min_reads=5 - run: - raw_densities = read_csv(input.dat, sep="\t") - chromosome_counter = raw_densities[["#name", "chrom"]].drop_duplicates() - chromosome_counts = chromosome_counter["chrom"].value_counts() - indexer = (chromosome_counts>=params.min_reads) - chromosomes_to_keep = chromosome_counts[indexer].index - filtered_densities = raw_densities[ - raw_densities["chrom"].isin(chromosomes_to_keep) - ] - filtered_densities.to_csv( - output.dat, compression="gzip", sep="\t", index=False - ) - - -rule densityplot: - """Plots density plots for each of the kmer scan files (from kmerscanner_filtered)""" - input: dat="{prefix}/{group}/PacBio/{sample}-kmerscanner-plottable-{arm}.dat.gz" - output: pdf="{prefix}/{group}/PacBio/{sample}-densityplot-{arm}.pdf" - params: index="assets/hg38ext.fa.ecx" - run: - shell_flags, _ = get_sam_flags(wildcards.arm) - shell(""" - ./edgecase densityplot --palette paper -x {params.index} \ - --title ' ' {shell_flags} -z {input.dat} > {output.pdf} - """) - - -rule levenshtein: - """Runs Levenshtein-based clustering for all longread samples""" - input: bam="{prefix}-tailpuller.bam", tsv="{prefix}-kmerscanner-plottable-{arm}.dat.gz" - output: tsv="{prefix}-levenshtein-{arm}.tsv" - params: directory="{prefix}-levenshtein-{arm}" - run: - shell_flags, _ = get_sam_flags(wildcards.arm) - shell(""" - mkdir {params.directory} 2>/dev/null || : - ./edgecase levenshtein 
{shell_flags} --kmerscanner-file {input.tsv} \ - --output-dir {params.directory} {input.bam} \ - > {output.tsv} - """) - - -def haplotype_densityplots_input(w): - tsv_mask = "data/datasets/{}/{}-levenshtein-{}.tsv" - tsv = tsv_mask.format(w.group, w.prefix, w.arm) - dat_mask = "data/datasets/{}/{}-levenshtein-{}/*dat.gz" - dats = glob(dat_mask.format(w.group, w.prefix, w.arm)) - return [tsv] + dats - - -rule haplotype_densityplots: - """Plots density plots for each of the haplotype kmer scan files (from levenshtein)""" - input: - haplotype_densityplots_input - output: - outdir=directory("data/datasets/{group,[^/]+}/{prefix}-levenshtein-{arm}-densityplots"), - tar="data/datasets/{group,[^/]+}/{prefix}-levenshtein-{arm}-densityplots.tar" - params: - index="assets/hg38ext.fa.ecx", - plot_span={ - "GIAB": { - "p_arm": "PAPER_LEFT_SPAN=12000 PAPER_RIGHT_SPAN=500", - "q_arm": "PAPER_LEFT_SPAN=1000 PAPER_RIGHT_SPAN=11000" - }, - "T2T": { - "p_arm": "PAPER_LEFT_SPAN=3000 PAPER_RIGHT_SPAN=500", - "q_arm": "PAPER_LEFT_SPAN=500 PAPER_RIGHT_SPAN=4000" - } - } - run: - shell("mkdir {output.outdir} 2>/dev/null || :") - for filename in input: - matcher = search(r'([^/]+)\.dat\.gz', filename) - if matcher: - pdf = path.join(output.outdir, matcher.group(1) + ".pdf") - shell_flags, _ = get_sam_flags(wildcards.arm) - env_directive = params.plot_span[wildcards.group][wildcards.arm] - shell(""" - {env_directive} ./edgecase densityplot \ - --palette 'paper|legend=False' -x {params.index} \ - {shell_flags} --zoomed-in -z {filename} > {pdf} - """) - shell("tar cf {output.tar} {output.outdir}") - - -def weighted_quantile(points, weights, q): - indsort = argsort(points.values) - spoints, sweights = points.values[indsort], weights.values[indsort] - sn = cumsum(sweights) - pn = (sn - sweights / 2) / sn[-1] - if isinstance(q, float): - return interp(q, pn, spoints) - else: - return [(_q, interp(_q, pn, spoints)) for _q in q] - - -rule kmerscan_for_entropy: - """Performs kmerscan with a 
different window size (ENTROPY_W) for further calculation of entropy""" - input: - bam="data/datasets/GIAB/PacBio/{sample}-tailpuller.bam", - tsv="data/datasets/GIAB/PacBio/repeatfinder-paper-{arm}.tsv" - output: - dat="data/datasets/GIAB/PacBio/{sample}-entropy/kmerscanner-{arm}.dat.gz" - run: - shell_flags, _ = get_sam_flags(wildcards.arm) - shell(""" - ./edgecase kmerscanner -w {ENTROPY_W} {shell_flags} \ - --motif-file {input.tsv} {input.bam} | gzip -2 > {output.dat} - """) - - -def safe_idxmax(column): - if column.max() > 0: - return column.idxmax() - else: - return nan - - -rule giab_entropy: - """Performs entropy calculation""" - input: dat="data/datasets/GIAB/PacBio/{sample}-entropy/kmerscanner-{arm}.dat.gz" - output: tsv="data/datasets/GIAB/PacBio/{sample}-entropy/entropies-{arm}.tsv.gz" - run: - _, samfilters = get_sam_flags(wildcards.arm) - ks = load_kmerscan( - input.dat, gzipped=True, samfilters=samfilters, bin_size=ENTROPY_W - ) - entropy_stats_list = [] - desc = wildcards.sample + " entropy" - for bdf in progressbar(ks.values(), desc=desc): - per_read_modes = bdf.groupby("name").apply( - lambda block: block.set_index("motif").iloc[:,8:].apply( - safe_idxmax, axis=0 - ) - ) - filtered_prm = per_read_modes.dropna(how="all", axis=1) - coverage = (~filtered_prm.isnull()).sum(axis=0) - max_motifs = len(bdf["motif"].drop_duplicates()) - max_entropy = log2(coverage.apply(lambda c: min(c, max_motifs))) - raw_entropies = per_read_modes.apply( - lambda column: entropy(column.value_counts()) - ) - entropies = raw_entropies / max_entropy - entropy_stats_list.append( - DataFrame({"entropy": entropies, "coverage": coverage}) - ) - entropy_stats = concat(entropy_stats_list, axis=0) - entropy_stats.dropna(how="any", axis=0).to_csv( - output.tsv, sep="\t", index=False - ) - - -rule giab_entropies: - """Combines entropies for multiple samples into one dataframe""" - input: - tsvs=expand( - "data/datasets/GIAB/PacBio/{sample}-entropy/entropies-{arm}.tsv.gz", - 
sample=PACBIO_NAME_TO_SAMPLE.values(), arm=["p_arm", "q_arm"] - ) - output: - tsv="data/datasets/GIAB/PacBio/entropy.tsv" - run: - entropies = concat([ - read_csv(tsv, sep="\t") for tsv in input.tsvs - ], axis=0) - entropies.to_csv(output.tsv, sep="\t", index=False) - qrange = arange(0, 1.01, .01) - qstats = [entropies["entropy"].quantile(q) for q in qrange] - wqstats = weighted_quantile( - entropies["entropy"], entropies["coverage"], qrange - ) - for (q, wqval), qval in zip(wqstats, qstats): - print("{:03d}\t{:.6f}\t{:.6f}".format(int(q*100), qval, wqval)) - - -rule telbam_to_fasta: - """Extracts reads in FASTA format from telbams for shortread-only repeatfinder""" - input: bam="data/datasets/NASA/{technology}/{subject}/{subject}-telbam.bam" - output: fa="data/datasets/NASA/{technology}/{subject}/{subject}-telbam.fa" - threads: 4 - shell: """ - samtools collate -@{threads} \ - -o {input.bam}-collated {input.bam} {input.bam}-collation-temp - samtools fasta -n -F3844 {input.bam}-collated > {output.fa} - rm -f {input.bam}-collated - """ - - -rule repeatfinder_shortread: - """Runs repeatfinder in --collapse-reverse-complement mode on short read data""" - input: fa="data/datasets/NASA/{technology}/{subject}/{subject}-telbam.fa" - output: tsv="data/datasets/NASA/{technology}/{subject}/{subject}-telbam-repeatfinder.tsv" - threads: 4 - shell: """ - ./edgecase repeatfinder -j {threads} -C -s 8G \ - -m 4 -M 16 -P 1.1 --fmt fastx {input.fa} > {output.tsv} - """ - - -SHORTREAD_10X_SAMPLES = [ - "Subject_1_1", "Subject_1_2", "Subject_1_3", "Subject_2", -] -SHORTREAD_ILLUMINA_SAMPLES = [ - "A", "B", "C", "D", -] - - -def shortread_repeatfinder_combined_input(w): - if w.technology == "10X": - mask = "data/datasets/NASA/10X/{}/{}-telbam-repeatfinder.tsv" - samples = SHORTREAD_10X_SAMPLES - elif w.technology == "Illumina": - mask = "data/datasets/NASA/Illumina/{}/{}-telbam-repeatfinder.tsv" - samples = SHORTREAD_ILLUMINA_SAMPLES - else: - raise 
ValueError("technology='{}'".format(w.technology)) - return [mask.format(sample, sample) for sample in samples] - - -rule shortread_repeatfinder_combined: - """Combine shortread repeatfinder results per-technology (RNA-seq [TODO], Illumina, 10X)""" - input: shortread_repeatfinder_combined_input - output: tsv="data/datasets/NASA/{technology}/{technology}-repeatfinder-unadjusted.tsv" - run: - rfs = [ - read_csv(tsv, sep="\t", usecols=(0, 4, 5), escapechar="#").rename( - columns={ - "abundance": tsv.split("/")[-1].split("-")[0], - "p": tsv.split("/")[-1].split("-")[0]+" p", - } - ) - for tsv in input - ] - merged_report = concat( - [rf.set_index("monomer") for rf in rfs], axis=1, - ).dropna() - merged_report["mgpval"] = merged_report.iloc[:,1::2].fillna(1).apply( - lambda r: combine_pvalues(r, method="mudholkar_george")[1], axis=1, - ) - #george = merged_report.iloc[:,p_ilocs].fillna(1).apply( - # lambda r: combine_pvalues(r, method="mudholkar_george")[1], axis=1, - #) - #merged_report["mgp_adjusted"] = Series( - # data=multipletests(george, method="bonferroni")[1], - # index=george.index, - #) - #merged_report = merged_report[merged_report["mgp_adjusted"]<.05] - merged_report.index.name = "#motif" - merged_report = merged_report[sorted( - c for c in merged_report.columns if not c.endswith("p") - )] - merged_report.to_csv(output.tsv, sep="\t") - - -rule all_platform_repeatfinder_adjusted: - input: - giab_p="data/datasets/GIAB/PacBio/repeatfinder-paper-p_arm-unadjusted.tsv", - giab_q="data/datasets/GIAB/PacBio/repeatfinder-paper-q_arm-unadjusted.tsv", - illumina="data/datasets/NASA/Illumina/Illumina-repeatfinder-unadjusted.tsv", - chromium="data/datasets/NASA/10X/10X-repeatfinder-unadjusted.tsv", - output: - giab_p="data/datasets/GIAB/PacBio/repeatfinder-paper-p_arm.tsv", - giab_q="data/datasets/GIAB/PacBio/repeatfinder-paper-q_arm.tsv", - illumina="data/datasets/NASA/Illumina/Illumina-repeatfinder.tsv", - chromium="data/datasets/NASA/10X/10X-repeatfinder.tsv", - run: 
- pvals = concat( - [read_csv(tsv, sep="\t")["mgpval"] for tsv in input], axis=0, - ) - p_adjusted = multipletests(pvals, method="bonferroni")[1] - bonferroni_lookup = {p: padj for p, padj in zip(pvals, p_adjusted)} - for dataset, tsv in input.items(): - rf = read_csv(tsv, sep="\t") - rf["p_adjusted"] = rf["mgpval"].map(bonferroni_lookup) - rf = rf.drop(columns="mgpval") - rf["#motif"] = rf["#motif"].apply( # fix old repeatfinder motifs - lambda m: m*int(ceil(4/len(m))) if len(m) < 4 else m - ) - rf = rf[rf["p_adjusted"]<.05] - rf.to_csv(output[dataset], sep="\t", index=False) - - -rule all_platform_repeatfinder_intersected: - input: - giab_p="data/datasets/GIAB/PacBio/repeatfinder-paper-p_arm.tsv", - giab_q="data/datasets/GIAB/PacBio/repeatfinder-paper-q_arm.tsv", - illumina="data/datasets/NASA/Illumina/Illumina-repeatfinder.tsv", - chromium="data/datasets/NASA/10X/10X-repeatfinder.tsv", - output: - tsv="data/datasets/NASA/repeatfinder-intersected-with-GIAB.tsv" - run: - get_cmc = lambda tsv: set(read_csv(tsv, sep="\t")["#motif"].apply( - lowest_collapsed_revcomp_alpha_inversion - )) - common_motifs_collapsed = ( - (get_cmc(input.giab_p) | get_cmc(input.giab_q)) & - get_cmc(input.illumina) & - get_cmc(input.chromium) - ) - def resubset(tsv, name): - rf = read_csv(tsv, sep="\t", index_col=0) - rf.index = rf.index.map(lowest_collapsed_revcomp_alpha_inversion) - rf = rf.reindex(common_motifs_collapsed) - rf["median"] = rf.iloc[:,:-1].median(axis=1) - rf = rf.iloc[:,[-1,-2]] - rf.columns = [name+" median abundance", name+" adjusted p-value"] - rf.index = rf.index.map(revcomp).map(custom_alpha_inversion) - return rf - shortread_rf = concat([ - resubset(input.illumina, "illumina"), - resubset(input.chromium, "chromium") - ], - axis=1 - ) - shortread_rf.sort_values( - by="chromium median abundance", ascending=False, - ).to_csv( - output.tsv, sep="\t", - ) - - -# Retained for documentation/historical purposes: -# include: 
"../../publications/methods-paper/snakefiles/old-shortread.snake" -# include: "../../publications/methods-paper/snakefiles/bootstraps.snake" diff --git a/publications/methods-paper/csl/springer-basic-brackets.csl b/publications/methods-paper/csl/springer-basic-brackets.csl deleted file mode 100644 index f82942b..0000000 --- a/publications/methods-paper/csl/springer-basic-brackets.csl +++ /dev/null @@ -1,188 +0,0 @@ - - diff --git a/publications/methods-paper/figures/HG001-densityplots.pdf b/publications/methods-paper/figures/HG001-densityplots.pdf deleted file mode 100644 index 8ae56a2..0000000 Binary files a/publications/methods-paper/figures/HG001-densityplots.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/HG001-densityplots.tex b/publications/methods-paper/figures/HG001-densityplots.tex deleted file mode 100644 index afcfcaa..0000000 --- a/publications/methods-paper/figures/HG001-densityplots.tex +++ /dev/null @@ -1,18 +0,0 @@ -\documentclass{article} -\usepackage[paperheight=13.20in,paperwidth=10.4in,margin=0in]{geometry} -\usepackage[sfdefault]{roboto} -\usepackage{graphicx} -\usepackage{tikz} -\usepackage[absolute,overlay]{textpos} - \setlength{\TPHorizModule}{1in} - \setlength{\TPVertModule}{1in} - -\begin{document} - -\begin{textblock}{13}(-0.0, 0.20) \LARGE{(A)} \end{textblock} -\begin{textblock}{13}(-0.2, 0.40) \includegraphics{assets/HG001-densityplot-p_arm.pdf} \end{textblock} - -\begin{textblock}{13}(-0.0, 3.40) \LARGE{(B)} \end{textblock} -\begin{textblock}{13}(-0.2, 3.80) \includegraphics{assets/HG001-densityplot-q_arm.pdf} \end{textblock} - -\end{document} diff --git a/publications/methods-paper/figures/HG002-alignment.pdf b/publications/methods-paper/figures/HG002-alignment.pdf deleted file mode 100644 index ec629a2..0000000 Binary files a/publications/methods-paper/figures/HG002-alignment.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/HG002-densityplot-p_arm.pdf 
b/publications/methods-paper/figures/HG002-densityplot-p_arm.pdf deleted file mode 100644 index 2be761d..0000000 Binary files a/publications/methods-paper/figures/HG002-densityplot-p_arm.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/HG002-densityplot-q_arm.pdf b/publications/methods-paper/figures/HG002-densityplot-q_arm.pdf deleted file mode 100644 index d8dac5c..0000000 Binary files a/publications/methods-paper/figures/HG002-densityplot-q_arm.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/HG002-levenshtein-densityplots.pdf b/publications/methods-paper/figures/HG002-levenshtein-densityplots.pdf deleted file mode 100644 index 424bcb5..0000000 Binary files a/publications/methods-paper/figures/HG002-levenshtein-densityplots.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/HG002-levenshtein-densityplots.tex b/publications/methods-paper/figures/HG002-levenshtein-densityplots.tex deleted file mode 100644 index 9310815..0000000 --- a/publications/methods-paper/figures/HG002-levenshtein-densityplots.tex +++ /dev/null @@ -1,23 +0,0 @@ -\documentclass{article} -\usepackage[paperheight=21.65in,paperwidth=17.88in,margin=0in]{geometry} -\usepackage{graphicx} -\usepackage{tikz} -\usepackage[absolute,overlay]{textpos} - \setlength{\TPHorizModule}{1in} - \setlength{\TPVertModule}{1in} - -\begin{document} - -\begin{textblock}{13}(-0.3, 0.10) \includegraphics[width=5in]{assets/HG002-q_arm-levenshtein-clustermaps/chr7.pdf} \end{textblock} -\begin{textblock}{13}(4.70, 0.30) \includegraphics[height=5in]{assets/HG002-q_arm-levenshtein-densityplots/chr7.pdf} \end{textblock} - -\begin{textblock}{13}(-0.3, 5.60) \includegraphics[width=5in]{assets/HG002-q_arm-levenshtein-clustermaps/chr11.pdf} \end{textblock} -\begin{textblock}{13}(4.70, 5.54) \includegraphics[height=5in]{assets/HG002-q_arm-levenshtein-densityplots/chr11.pdf} \end{textblock} - -\begin{textblock}{13}(-0.3, 11.10) 
\includegraphics[width=5in]{assets/HG002-q_arm-levenshtein-clustermaps/chr15.pdf} \end{textblock} -\begin{textblock}{13}(4.70, 11.03) \includegraphics[height=5in]{assets/HG002-q_arm-levenshtein-densityplots/chr15.pdf} \end{textblock} - -\begin{textblock}{13}(-0.3, 16.60) \includegraphics[width=5in]{assets/HG002-q_arm-levenshtein-clustermaps/chr22.pdf} \end{textblock} -\begin{textblock}{13}(4.70, 16.54) \includegraphics[height=5in]{assets/HG002-q_arm-levenshtein-densityplots/chr22.pdf} \end{textblock} - -\end{document} diff --git a/publications/methods-paper/figures/HG005-densityplots.pdf b/publications/methods-paper/figures/HG005-densityplots.pdf deleted file mode 100644 index 6f6ecef..0000000 Binary files a/publications/methods-paper/figures/HG005-densityplots.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/HG005-densityplots.tex b/publications/methods-paper/figures/HG005-densityplots.tex deleted file mode 100644 index 5abed5b..0000000 --- a/publications/methods-paper/figures/HG005-densityplots.tex +++ /dev/null @@ -1,18 +0,0 @@ -\documentclass{article} -\usepackage[paperheight=13.85in,paperwidth=10.4in,margin=0in]{geometry} -\usepackage[sfdefault]{roboto} -\usepackage{graphicx} -\usepackage{tikz} -\usepackage[absolute,overlay]{textpos} - \setlength{\TPHorizModule}{1in} - \setlength{\TPVertModule}{1in} - -\begin{document} - -\begin{textblock}{13}(-0.0, 0.20) \LARGE{(A)} \end{textblock} -\begin{textblock}{13}(-0.2, 0.40) \includegraphics{assets/HG005-densityplot-p_arm.pdf} \end{textblock} - -\begin{textblock}{13}(-0.0, 2.70) \LARGE{(B)} \end{textblock} -\begin{textblock}{13}(-0.2, 2.90) \includegraphics{assets/HG005-densityplot-q_arm.pdf} \end{textblock} - -\end{document} diff --git a/publications/methods-paper/figures/HG00X-alignments.pdf b/publications/methods-paper/figures/HG00X-alignments.pdf deleted file mode 100644 index e8ef649..0000000 Binary files a/publications/methods-paper/figures/HG00X-alignments.pdf and /dev/null differ diff 
--git a/publications/methods-paper/figures/HG00X-alignments.tex b/publications/methods-paper/figures/HG00X-alignments.tex deleted file mode 100644 index ece6cea..0000000 --- a/publications/methods-paper/figures/HG00X-alignments.tex +++ /dev/null @@ -1,18 +0,0 @@ -\documentclass{article} -\usepackage[paperheight=19.80in,paperwidth=17.3in,margin=0in]{geometry} -\usepackage[sfdefault]{roboto} -\usepackage{graphicx} -\usepackage{tikz} -\usepackage[absolute,overlay]{textpos} - \setlength{\TPHorizModule}{1in} - \setlength{\TPVertModule}{1in} - -\begin{document} - -\begin{textblock}{13}(-0.0, 0.20) \Huge{(A)} \end{textblock} -\begin{textblock}{13}(-0.2, 0.94) \includegraphics{assets/HG001-alignment.pdf} \end{textblock} - -\begin{textblock}{13}(8.5, -0.30) \includegraphics{assets/HG005-alignment.pdf} \end{textblock} -\begin{textblock}{13}(8.5, 0.20) \Huge{(B)} \end{textblock} - -\end{document} diff --git a/publications/methods-paper/figures/assets/HG001-alignment.pdf b/publications/methods-paper/figures/assets/HG001-alignment.pdf deleted file mode 100644 index e2d7eb9..0000000 Binary files a/publications/methods-paper/figures/assets/HG001-alignment.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG001-densityplot-p_arm.pdf b/publications/methods-paper/figures/assets/HG001-densityplot-p_arm.pdf deleted file mode 100644 index 0af42fb..0000000 Binary files a/publications/methods-paper/figures/assets/HG001-densityplot-p_arm.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG001-densityplot-q_arm.pdf b/publications/methods-paper/figures/assets/HG001-densityplot-q_arm.pdf deleted file mode 100644 index 2810af9..0000000 Binary files a/publications/methods-paper/figures/assets/HG001-densityplot-q_arm.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr11.pdf b/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr11.pdf 
deleted file mode 100644 index 9e38a20..0000000 Binary files a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr11.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr15.pdf b/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr15.pdf deleted file mode 100644 index 913a0cb..0000000 Binary files a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr15.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr22.pdf b/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr22.pdf deleted file mode 100644 index 91edb26..0000000 Binary files a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr22.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr7.pdf b/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr7.pdf deleted file mode 100644 index 9b985ff..0000000 Binary files a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-clustermaps/chr7.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr11.pdf b/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr11.pdf deleted file mode 100644 index d2ec91d..0000000 Binary files a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr11.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr15.pdf b/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr15.pdf deleted file mode 100644 index 94b0f86..0000000 Binary files a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr15.pdf and /dev/null differ diff 
--git a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr22.pdf b/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr22.pdf deleted file mode 100644 index b0a84ee..0000000 Binary files a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr22.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr7.pdf b/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr7.pdf deleted file mode 100644 index b81094f..0000000 Binary files a/publications/methods-paper/figures/assets/HG002-q_arm-levenshtein-densityplots/chr7.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG005-alignment.pdf b/publications/methods-paper/figures/assets/HG005-alignment.pdf deleted file mode 100644 index ca23f28..0000000 Binary files a/publications/methods-paper/figures/assets/HG005-alignment.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG005-densityplot-p_arm.pdf b/publications/methods-paper/figures/assets/HG005-densityplot-p_arm.pdf deleted file mode 100644 index 8ea5659..0000000 Binary files a/publications/methods-paper/figures/assets/HG005-densityplot-p_arm.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/assets/HG005-densityplot-q_arm.pdf b/publications/methods-paper/figures/assets/HG005-densityplot-q_arm.pdf deleted file mode 100644 index 61f3881..0000000 Binary files a/publications/methods-paper/figures/assets/HG005-densityplot-q_arm.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/threemotifp/HG001-densityplots-threemotifp.pdf b/publications/methods-paper/figures/threemotifp/HG001-densityplots-threemotifp.pdf deleted file mode 100644 index 8b89d00..0000000 Binary files a/publications/methods-paper/figures/threemotifp/HG001-densityplots-threemotifp.pdf and /dev/null differ diff --git 
a/publications/methods-paper/figures/threemotifp/HG002-densityplot-p_arm-threemotifp.pdf b/publications/methods-paper/figures/threemotifp/HG002-densityplot-p_arm-threemotifp.pdf deleted file mode 100644 index 89c46f0..0000000 Binary files a/publications/methods-paper/figures/threemotifp/HG002-densityplot-p_arm-threemotifp.pdf and /dev/null differ diff --git a/publications/methods-paper/figures/threemotifp/HG005-densityplots-threemotifp.pdf b/publications/methods-paper/figures/threemotifp/HG005-densityplots-threemotifp.pdf deleted file mode 100644 index adb1d17..0000000 Binary files a/publications/methods-paper/figures/threemotifp/HG005-densityplots-threemotifp.pdf and /dev/null differ diff --git a/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.tex b/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.tex deleted file mode 100644 index 0e55548..0000000 --- a/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.tex +++ /dev/null @@ -1,140 +0,0 @@ -\documentclass{article} - -\usepackage[tmargin=.9in, bmargin=.9in, lmargin=1in, rmargin=1in]{geometry} -\usepackage{bookmark, wrapfig, enumitem, pdflscape, hyphenat} -\usepackage[labelfont=bf]{caption} -\usepackage{lmodern} -\usepackage[sfdefault]{roboto} -\usepackage[T1]{fontenc} -\usepackage{setspace} - -\usepackage{graphicx} - \makeatletter % tex.stackexchange.com/a/28565 - \setlength{\@fptop}{0pt} - \setlength{\@fpbot}{0pt plus 1fil} - \makeatother - -\usepackage[absolute, overlay]{textpos} - \setlength{\TPHorizModule}{1mm} - \setlength{\TPVertModule}{1mm} - -\usepackage{xcolor} - \definecolor{WCM}{RGB}{172,31,44} % AC1E2C - -\usepackage{hyperref} \hypersetup{ - colorlinks=true, linkcolor={blue!65!black}, - citecolor={blue!65!black}, urlcolor={blue!50!black}, - pdfpagelayout=OneColumn, pdfstartview={XYZ null null 1.25}, - bookmarksnumbered=true, 
bookmarksopen=true, bookmarksopenlevel=3 -} - -\usepackage[backend=bibtex, style=authoryear]{biblatex} - \addbibresource{references.bib} - % https://tex.stackexchange.com/a/134065 : - \renewcommand*{\nameyeardelim}{\addcomma\space} - % https://tex.stackexchange.com/a/404787 : - \usepackage{xpatch} - \DeclareNameAlias{sortname}{last-first} % general changes - \renewcommand*{\bibinitdelim}{} - \renewbibmacro*{in:}{\iffieldequalstr{entrytype}{inproceedings}{\printtext{\bibstring{in}\addspace}}{}} - \csletcs{abx@macro@publisher+location+date@orig}{abx@macro@publisher+location+date} % changes for "book" - \renewbibmacro*{publisher+location+date}{\printtext[parens]{\usebibmacro{publisher+location+date@orig}}} - \DeclareFieldFormat[book]{title}{#1\printunit{\addspace}} - \DeclareFieldFormat[inproceedings]{title}{#1\isdot} % changes for "inproceedings" - \DeclareFieldFormat{booktitle}{#1\addcomma} - \xpatchbibmacro{byeditor+others} - {\usebibmacro{byeditor+othersstrg} \setunit{\addspace} \printnames[byeditor]{editor} \clearname{editor}} - {\printnames[byeditor]{editor} \clearname{editor} \addcomma\addspace \bibstring{editor} \setunit{\addspace}}{}{} - \DeclareFieldFormat[article]{title}{#1} % changes in "article" - \DeclareFieldFormat[article]{journaltitle}{#1\isdot} - \DeclareFieldFormat[article]{volume}{\textit{#1}} - \DeclareFieldFormat[article]{pages}{#1} - % https://en.wikibooks.org/wiki/LaTeX/Macros#New_commands - \newcommand{\citep}[1]{(\cite{#1})} - -\newcommand{\beginsupplement}{ -% bytesizebio.net/2013/03/11/adding-supplementary-tables-and-figures-in-latex - %\newpage - %\setcounter{page}{1} - %\renewcommand{\thepage}{S-\arabic{page}} - \pagenumbering{gobble} - \setcounter{table}{0} - \renewcommand{\thetable}{S\arabic{table}} - \setcounter{figure}{0} - \renewcommand{\thefigure}{S\arabic{figure}} - } -\begin{document} - -\beginsupplement - -\section*{Supplemental Information} - -\subsection*{Supplemental figures} \addcontentsline{toc}{subsection}{Supplemental 
figures} \label{sec:supp_figs} - -\begin{figure}[ht!] \centering -\includegraphics[height=.65\textheight,width=\textwidth,keepaspectratio]{figures/HG00X-alignments.pdf} -\caption{ - Mapping of candidate telomeric PacBio CCS reads from datasets (A) HG001 and (B) HG005. - Chromosomes are displayed schematically, centered around the centromere, with only the arms shown to which candidate reads aligned. - Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. - Coordinates are given in bp, relative to the positions of the telomeric tract boundaries. - Relates to: \textbf{Figure 1}. -} -\label{fig:hg00x_alignments} -\end{figure} -\clearpage \pagebreak - -\begin{figure}[ht!] \centering -\includegraphics[height=.9\textheight,width=\textwidth,keepaspectratio]{figures/threemotifp/HG002-densityplot-p_arm-threemotifp.pdf} -\caption{ - Densities of top three enriched motifs (contributing to at least 0.5\% of the repeat content) at ends of chromosomal \textit{p} arms of the HG002 dataset. - Only the arms covered by at least 20 reads are displayed. - Genomic coordinates are given in Mbp. - Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. - Relates to: \textbf{Figure 2}, \textbf{Table 1}. -} -\label{fig:hg002_densityplot_p_arm} -\end{figure} -\clearpage \pagebreak - -\begin{figure}[ht!] \centering -\includegraphics[height=.95\textheight,width=\textwidth,keepaspectratio]{figures/threemotifp/HG001-densityplots-threemotifp.pdf} -\caption{ - Motif densities at ends of chromosomal (A) \textit{p} and (B) \textit{q} arms of the HG001 dataset. - Only the arms covered by at least 20 reads are displayed. - Genomic coordinates are given in Mbp. - Relates to: \textbf{Figure 2}, \textbf{Table 1}. -} -\label{fig:hg001_densityplots} -\end{figure} -\clearpage \pagebreak - -\begin{figure}[ht!] 
\centering -\includegraphics[height=.95\textheight,width=\textwidth,keepaspectratio]{figures/threemotifp/HG005-densityplots-threemotifp.pdf} -\caption{ - Motif densities at ends of chromosomal (A) \textit{p} and (B) \textit{q} arms of the HG005 dataset. - Only the arms covered by at least 20 reads are displayed. - Genomic coordinates are given in Mbp. - Relates to: \textbf{Figure 2}, \textbf{Table 1}. -} -\label{fig:hg005_densityplots} -\end{figure} -\clearpage \pagebreak - -\begin{figure}[ht!] \centering -\includegraphics[height=.95\textheight,width=.65\textwidth,keepaspectratio]{figures/entropy.pdf} -\caption{ - Distribution of motif entropies in 10 bp windows of candidate PacBio CCS reads aligning to the same chromosomal arms in GIAB datasets HG001, HG002, and HG005. - Red solid lines denote the position of the median (0.000 in all three datasets), and red dashed lines denote the 3rd quartile (0.166, 0.074, and 0.211, respectively). - Relates to: \textbf{STAR Methods, Evaluation of sequence concordance in telomeric long reads}. 
-} -\label{fig:entropy} -\end{figure} -\clearpage \pagebreak - -\subsection*{Supplemental tables} \addcontentsline{toc}{subsection}{Supplemental tables} -\input{tables/telomeric-read-counts.tex} -\input{tables/shortread-repeatfinder.tex} -\input{tables/HG002-haplotype-assignment.tex} - -\end{document} diff --git a/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.pdf b/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.pdf deleted file mode 100644 index 357c5d8..0000000 Binary files a/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.pdf and /dev/null differ diff --git a/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.tex b/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.tex deleted file mode 100644 index 544a7e5..0000000 --- a/publications/methods-paper/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.tex +++ /dev/null @@ -1,404 +0,0 @@ -\documentclass{article} - -\usepackage[tmargin=.9in, bmargin=.9in, lmargin=1in, rmargin=1in]{geometry} -\usepackage{bookmark, wrapfig, enumitem, pdflscape, hyphenat} -\usepackage[labelfont=bf]{caption} -\usepackage{lmodern} -\usepackage[sfdefault]{roboto} -\usepackage[T1]{fontenc} - -\usepackage{graphicx} - \makeatletter % tex.stackexchange.com/a/28565 - \setlength{\@fptop}{0pt} - \setlength{\@fpbot}{0pt plus 1fil} - \makeatother - -\usepackage[absolute, overlay]{textpos} - \setlength{\TPHorizModule}{1mm} - \setlength{\TPVertModule}{1mm} - -\usepackage{xcolor} - \definecolor{WCM}{RGB}{172,31,44} % AC1E2C - -\usepackage{hyperref} \hypersetup{ - colorlinks=true, linkcolor={blue!65!black}, - citecolor={blue!65!black}, urlcolor={blue!50!black}, - pdfpagelayout=OneColumn, pdfstartview={XYZ null null 1.25}, - bookmarksnumbered=true, bookmarksopen=true, bookmarksopenlevel=3 -} - -\usepackage[backend=bibtex, 
style=nature]{biblatex} - \addbibresource{references.bib} - -\newcommand{\beginsupplement}{ -% bytesizebio.net/2013/03/11/adding-supplementary-tables-and-figures-in-latex - \newpage - \setcounter{page}{1} - \renewcommand{\thepage}{S-\arabic{page}} - \setcounter{table}{0} - \renewcommand{\thetable}{S\arabic{table}} - \setcounter{figure}{0} - \renewcommand{\thefigure}{S\arabic{figure}} - } - -\usepackage{setspace} - -\begin{document} - -\begin{center} - \Large{\textbf{Haplotype Diversity and Sequence Heterogeneity of Human Telomeres}} - \\~\\ - \small{ - Kirill Grigorev\textsuperscript{1,2 \#}, - Jonathan Foox\textsuperscript{1,2,3 \#}, - Daniela Bezdan\textsuperscript{1,2,3}, - Daniel Butler\textsuperscript{1}, - Jared J. Luxton\textsuperscript{4,5}, - Jake Reed\textsuperscript{1}, - \\%rem - Miles J. McKenna\textsuperscript{4,5}, - Lynn Taylor\textsuperscript{4,5}, - Kerry A. George\textsuperscript{4,5}, - Cem Meydan\textsuperscript{1,2,3}, - Susan M. Bailey\textsuperscript{4,5 *}, - Christopher E. Mason\textsuperscript{1,2,3,6 *} - } -\end{center} - -\small{ \noindent - \textsuperscript{1} Department of Physiology and Biophysics, Weill Cornell Medicine, New York, New York, USA - \\ - \textsuperscript{2} The HRH Prince Alwaleed Bin Talal Bin Abdulaziz Alsaud Institute for Computational Biomedicine, \\ - \textcolor{white}{\textsuperscript{2}} Weill Cornell Medicine, New York, New York, USA - \\ - \textsuperscript{3} The Feil Family Brain and Mind Research Institute, New York, New York, USA - \\ - \textsuperscript{4} Department of Environmental and Radiological Health Sciences, Colorado State University, Fort Collins, CO - \\ - \textsuperscript{5} Cell and Molecular Biology Program, Colorado State University, Fort Collins, CO - \\ - \textsuperscript{6} The WorldQuant Initiative for Quantitative Prediction, Weill Cornell Medicine, New York, NY, USA - \\ - \textsuperscript{\#} Co-first authors - \\ - \textsuperscript{*} Corresponding authors. 
Send correspondence to S.M.B. (susan.bailey@colostate.edu) \\%rem - \textcolor{white}{\textsuperscript{*}} and C.E.M. (chm2042@med.cornell.edu) -} - -\normalsize -\doublespacing - -\section*{Abstract} \addcontentsline{toc}{section}{Abstract} -Telomeres are regions of repetitive nucleotide sequences capping the ends of eukaryotic chromosomes that protect against deterioration, whose lengths can be correlated with age and disease risk factors. -Given their length and repetitive nature, telomeric regions are not easily reconstructed from short read sequencing, making telomere sequence resolution a very costly and generally intractable problem. -Recently, long-read sequencing, with read lengths measuring in hundreds of Kbp, has made it possible to routinely read into telomeric regions and inspect their structure. -Here, we describe a framework for extracting telomeric reads from single-molecule sequencing experiments, describing their sequence variation and motifs, and for haplotype inference. -We find that -long telomeric stretches can be accurately captured with long-read sequencing, -observe extensive sequence heterogeneity of human telomeres, -discover and localize non-canonical motifs (both previously reported as well as novel), -confirm the presence of the non-canonical motifs in short read sequencing experiments, -and report the first motif composition maps of human telomeric diplotypes on a multi-Kbp scale. - -\pagebreak -%\singlespacing -%\tableofcontents -\doublespacing - -\section*{Introduction} \addcontentsline{toc}{section}{Introduction} -Telomeres are the functional ends of human chromosomes that naturally shorten with mitosis and age \cite{teloaging}, whose lengths can also be influenced by disease and environmental exposures (e.g., radiation, pollution, exercise, cancers) \cite{teloeffects}. 
-While human telomeres are known to consist largely of a conserved six-nucleotide repeat (TTAGGG) \cite{moyzis}, several studies have identified variations of this motif in proximal telomeric regions \cite{telovars1989,telovars1999,telovars2018,telovars2019}. -However, such studies were performed with oligonucleotide hybridization, PCR, immunoprecipitation, and short read sequencing, resulting in discovery, but not localization, of motif variants. -Thus, long-range maps of telomeric sequence variation in the human genome are still lacking. -Such maps can provide insight into telomere biology and enable novel approaches to analyze the effects of aging, disease, and environment on telomere structure and length. -\\~\\ -To improve our understanding of telomere structure and sequence variation, we developed \textit{edgeCase}, a framework for alignment, motif discovery, and haplotype inference from human telomeric reads. -We have validated these methods using Genome in a Bottle \cite{giab} single-molecule real-time (SMRT) sequencing datasets generated with Pacific Biosciences circular consensus sequencing (PacBio CCS) \cite{pacbio,pacbioccs} and short read Illumina \cite{illumina} and 10X Genomics (Chromium) \cite{10x} datasets. -These results provide evidence for multiple novel, non-canonical telomeric repeats, resolution of chromosome-specific diplotypes with SMRT sequencing, and a new method for long-range characterization of the structure of telomeric sequences. - -\section*{Results} \addcontentsline{toc}{section}{Results} - -\subsection*{Telomeric reads are present in human long-read whole genome sequencing datasets} -\addcontentsline{toc}{subsection}{Telomeric reads are present in human long-read whole genome sequencing datasets} -We aligned PacBio CCS reads of three Genome in a Bottle (GIAB) human subjects (HG001, HG002, and HG005) to a combination of the human reference genome and human subtelomeric assemblies (see \hyperref[sec:methods]{Materials and Methods}). 
-In total, we observed reads mapping to the ends of chromosomes and extending past them into telomeric regions on 9 \textit{p} arms and 17 \textit{q} arms, with 256 such reads ($\sim$10x mean coverage) in the HG001 dataset, 570 ($\sim$22x) in HG002, and 241 ($\sim$9x) in HG005. -\autoref{fig:hg002_alignment} schematically represents the alignment of such reads in the HG002 dataset; alignment plots for the other two datasets are available as a supplementary figure (\autoref{fig:hg00x_alignments}), and full mapping statistics are available \mbox{in \autoref{tab:telomeric_read_counts}}. -Illumina reads from matching GIAB datasets supported 70.8\%, 63.3\%, and 82.7\% of the candidate PacBio CCS sequence, % 69.2\% combined -providing average coverages of $\sim$5x, $\sim$9x, and $\sim$6x, respectively, by sequences supported by both technologies. - -\subsection*{Telomeric reads contain variations of the canonical motif} -\addcontentsline{toc}{subsection}{Telomeric reads contain variations of the canonical motif} -We performed \textit{de novo} repeat discovery in the supported regions for motifs of lengths 4 through 16 and identified motifs in repeat contexts that are statistically enriched in the three datasets. -The majority of motifs were either the canonical TTAGGG / CCCTAA, its variation (e.g., TT\underline{G}GGG / CCC\underline{C}AA), or a duplet of variants, such as TTAGGGTTA\underline{G}GGG (\autoref{tab:repeatfinder_full}). -CG-rich motifs were also observed on the \textit{p} arms. -The top enriched motif (TTAGGG / CCCTAA) explained 43.3\%\textendash{}54.4\% of the telomeric repeat content on the \textit{q} arms, and 10.0\%\textendash{}22.7\% on the \textit{p} arms, while overall, four motifs on the \textit{q} arms and three motifs on the \textit{p} arms each explained at least 0.5\% of the repeat content. 
-These top motifs, as well as 15 less enriched ones, were confirmed in independently generated human short read and linked-read genomic datasets (\hyperref[sec:supp_methods]{Supplementary methods}, \autoref{tab:shortread_repeatfinder}). -% ?? out of ?? (??.?\%) of these motifs were confirmed in independently generated human linked-read, short read genomic, and short read transcriptomic datasets (\hyperref[sec:supp_methods]{Supplementary methods}, \autoref{tab:shortread_repeatfinder}). -\autoref{fig:hg002_densityplot_q_arm} visualizes the locations of the top four enriched motifs on the \textit{q} arm of the HG002 dataset; only the arms covered by at least 20 reads are displayed. -Plots for other datasets and arms are available as supplementary figures: \autoref{fig:hg002_densityplot_p_arm} visualizes the top three motifs on the \textit{p} arm of the HG002 dataset, \autoref{fig:hg001_densityplots} and \autoref{fig:hg005_densityplots} visualize datasets HG001 and HG005 respectively. -Long reads on each arm agreed on the locations of different motifs within any given 10 bp window (the median of normalized Shannon entropy was 0.000 for all data, and the 3rd quartile was 0.166, 0.074, and 0.211 for the three datasets, respectively, \autoref{fig:entropy}), indicating that locations of the variations are colinear among reads and are not a result of sequencing errors. - -\subsection*{Long-read sequencing resolves human telomeric haplotypes} -\addcontentsline{toc}{subsection}{Long-read sequencing resolves human telomeric haplotypes} -Sequences of telomeric reads clustered by relative pairwise Levenshtein distances \cite{levenshtein} with varying levels of heterogeneity depending on the dataset and the chromosomal arm to which they belonged. 
-We examined the \textit{q} arms of the HG002 dataset to investigate this heterogeneity, as they provided the deepest coverage (\autoref{tab:telomeric_read_counts}), and found that, on 12 out of the 15 arms, reads clustered into two prominent groups per arm when maximizing the Bayesian information criterion \cite{bic} (see \hyperref[sec:methods]{Materials and Methods}). -Pairwise distances between the reads within these clusters were significantly lower than those for out-of-cluster pairings, implying that distinct telomeric haplotypes are present. -To quantify the differences between putative haplotypes, we calculated silhouette scores \cite{silhouette} for these clusterings (\autoref{tab:levenshtein-q_arm}), and generated motif density plots for the four chromosome arms with the highest such scores to visualize the differences in haplotypes (\autoref{fig:levenshtein_q_arm}). - -\section*{Discussion} \addcontentsline{toc}{section}{Discussion} -Repeat-rich, low-complexity regions of the human genome such as telomeres have been historically recalcitrant to full mapping and annotation \cite{miga2015}, mainly due to the alignment challenge they pose and to the read lengths required to span such areas \cite{ngslowcomplexity}. -The advent of long-read, single-molecule methods (third generation sequencing) has provided new opportunities to map the sequence composition of a previously "dark" area of the human genome. -These results reaffirm that the canonical repeat (TTAGGG) is certainly the most dominant type of motif in telomeres, but also reveal a surprising diversity of repeat variations, which are confirmed by both short and long-read sequencing technologies. -This diversity of repeats includes previously reported variants, as well as novel motifs that are characterized not only by nucleotide substitutions, but also insertions, deletions, and even motif pairing. 
-Apart from these variations, CG-rich motifs were identified in telomeric regions of \textit{p} arms, consistent with previously reported findings \cite{cpg}. -Moreover, while short read sequencing is able to identify such variants, it alone cannot reveal the relative locations of these motifs within telomeres, as repetitive short reads can neither be aligned outside of the reference genome nor provide enough overlap variability to be assembled \textit{de novo}. -Long SMRT reads, on the other hand, can be anchored to known subtelomeric sequences of the human genome and extend into the previously unmapped telomeric area. -These results also highlight the need of better subtelomeric and telomeric annotations in the human genome. -Four of the 40 subtelomeric assemblies \cite{riethman2014} were homologous to regions in the reference genome far within the respective chromosomes (up to 586 Kbp into the reference sequence), and the canonical motif was present on the \textit{q} arm of chr8 only after 2\textendash{}3Kbp past the annotated boundary in all datasets, suggesting that the existing assemblies do not provide a completely accurate telomeric annotation, and that methods described herein could help to resolve these areas of reference genomes. -\\~\\ -We observed PacBio CCS reads reaching up to 13 Kbp beyond the known regions of the genome, and resolving the underlying sequence with reasonable fidelity \textendash{} even without support from short reads, \textendash{} both measured by the entropy of motif assignment and by pairwise Levenshtein distances between the reads belonging to the same chromosomal arms. -While short reads also provided support for all of the reported motifs, the overlap between the short and the long reads was substantial, but not complete, which can be explained by the necessary bias towards the canonical motif during the selection of short reads. 
-Therefore, telomeric regions with higher content of non-canonical repeats are less likely to be identified through the use of short reads, and instead, long reads appear to be more suitable for this purpose as well. -The identified variations in long range contexts enable clustering of SMRT reads into distinct haplotypes at ends of chromosomes, and thus provide a new means of diplotype mapping and reveal the existence and motif composition of such diplotypes on a multi-Kbp scale. - -\section*{Materials and Methods} \addcontentsline{toc}{section}{Materials and Methods} \label{sec:methods} - -\subsection*{The extended reference genome} -\addcontentsline{toc}{subsection}{The extended reference genome} -We constructed the extended reference genome by performing an all-to-all alignment of all contigs in the \textit{hg38} reference genome \cite{grch38,hg38} and the subtelomeric assemblies \cite{riethman2014} with \textit{minimap2} \cite{minimap} using three settings for assembly-to-reference mapping (\textit{asm5}, \textit{asm10}, \textit{asm20}). -Forty subtelomeric contigs mapped to ends of \textit{hg38} chromosomes with a mapping quality of 60, one (XpYptel) mapped with the quality of 0 and was discarded; one (14qtel) mapped to the ALT version of chr14 (chr14\_KI270846v1\_alt) with the quality of 52, which, in turn, mapped to the main chr14 chromosome with the quality of 60. -%Finally, an ALT version of chr12 (chr12\_GL877875v1\_alt) mapped to chr12 and an unplaced chrUn\_KI270745v1 to chr17, both with the quality of 60. -These data and the exact match and mismatch coordinates were used to create a combined reference (\textit{hg38ext}) in which subtelomeric contigs informed the locations of the boundaries of the telomeric tracts (\textit{tract\_anchor}). 
-Such contigs that mapped fully within \textit{hg38} chromosomes resulted in \textit{tract\_anchor} annotations directly on those \textit{hg38} chromosomes; partially mapping contigs were considered as forking from the \textit{hg38} sequence and were similarly annotated by themselves. - -\subsection*{Selection of telomeric reads and identification of repeat content} -\addcontentsline{toc}{subsection}{Selection of telomeric reads and identification of repeat content} -Three subjects were selected for the analysis. -The first individual (NA12878/HG001) came from the pilot genome of the HapMap project \cite{HG001}, while the other two, including the son from the Ashkenazi Jewish Trio (NA24385/HG002) and the son from the Chinese Trio (NA24631/HG005), are members of the Personal Genome Project, whose genomes are consented for commercial redistribution and reidentification \cite{HG00X}. -These subjects are referred to hereafter as HG001, HG002, and HG005, respectively. -\\~\\ -For subjects HG001 and HG005, Genome in a Bottle \cite{giab} PacBio\_SequelII\_CCS\_11kb datasets were used (one dataset per each subject). -For subject HG002, a combination of two sequencing experiments was analyzed (PacBio\_CCS\_10kb and PacBio\_CCS\_15kb). -The mean coverage was $\sim$29x, $\sim$58x, and $\sim$32x for subjects HG001, HG002, and HG005, respectively. -Reads were mapped to \textit{hg38ext} with \textit{minimap2}, and reads that mapped to either end of either chromosome and overlapped the boundary of its telomeric tract were selected for further analysis. -These reads had a portion of their sequence mapped to the reference contig and a portion extending beyond the reference (soft- or hard-clipped in the alignment file). 
-Sequences past the \textit{tract\_anchor} marker were extracted from the reads that had this marker within their mapped portion (from the 5' end to the marker on \textit{p} arms and from the marker to the 3' end on \textit{q} arms, accounting for forward and reverse mappings). -To identify regions of the telomeres that are fully supported by both short and long reads, we extracted candidate telomeric reads from GIAB Illumina datasets - (NIST\_NA12878\_HG001\_HiSeq\_300x, - NIST\_HiSeq\_HG002\_Homogeneity-10953946, - HG005\_NA24631\_son\_HiSeq\_300x; - all three $\sim$300x coverage) -with \textit{Telomerecat} \cite{telomerecat}, and selected those that mapped perfectly with \textit{minimap2} (at least a 50bp-long exact match without insertions or deletions, allowing all secondary mappings) to the telomeric regions of the PacBio CCS candidates from the same subject's dataset. -\\~\\ -Within the regions supported by both PacBio CCS and Illumina candidate reads, overrepresentation of motifs of lengths $k \subset [4 .. 16]$ was tested. -To target motifs in repeat contexts, doubled sequences (for example, \textit{k}-mer ACGTACGT for motif ACGT) were counted with \textit{jellyfish} \cite{jellyfish}, and counts of \textit{k}-mers synonymous with respect to circular shifts (for example, ACGTACGT and CGTACGTA) were summed together. -For each such \textit{k}-mer, Fisher's exact test was performed to determine whether its count is significant on the background of counts of other \textit{k}-mers of the same length. 
-Briefly, we considered \textit{k}-mers with counts higher than 1.5 interquartile range above the third quartile of the distribution as potentially classifiable, and a $2\times{}2$ contingency matrix $ C $ for the test was constructed as follows: -row 0 contained counts of potentially classifiable \textit{k}-mers, -row 1 contained counts of remaining (non-classifiable) \textit{k}-mers, -columns 0 and 1 contained counts of single and remaining (background) \textit{k}-mers, respectively, -i.e.: -$ C_{0,0} = $ {\rmfamily count of target \textit{k}-mer}, -$ C_{0,1} = $ {\rmfamily sum of counts of other potentially classifiable \textit{k}-mers}, -$ C_{1,0} = $ {\rmfamily median count of \textit{k}-mer}, -$ C_{1,1} = $ {\rmfamily sum of counts of other non-classifiable \textit{k}-mers}. -The resultant \textit{p}-values for each motif among the samples were combined using the Mudholkar-George method \cite{george}, the Bonferroni multiple testing correction was applied, and motifs for which \textit{k}-mers yielded \textit{p}-values below the cutoff of 0.05 were reported. -\\~\\ -As telomeric reads contain long low-complexity regions and present an alignment challenge, we evaluated concordance of their sequences without realignment of their portions that extended past the reference sequence. -To that end, for all reads mapping to the same chromosomal arm, we calculated densities of each motif in a rolling window starting from the innermost mapped position. -To evaluate whether the reads on the same arm agree on the positions of different motifs, for each read, we calculated motif densities in 10 bp windows with 10 bp smoothing to buffer insertions and deletions. -For each window in a read, the motif with the highest density was selected to represent that window. 
-Then, normalized Shannon entropy among all reads was calculated in each window as $ S = \frac{ - \sum_{i} \; ( p_{i} ln p_{i} )}{ln N} $, where $ p_{i} $ is the frequency of each motif in the window and $ N $ is the number of motifs \cite{hepc_entropy}. -The value of normalized entropy was a metric bounded by $ [ 0, 1 ] $, with $ 0 $ describing perfect agreement and $ 1 $ describing maximum randomness. -For visualization, we performed 1000 rounds of bootstrap of the calculated density values in 100 bp rolling windows, and selected the lower and the upper bounds of the 95\% confidence interval of bootstrap. -Of note, several chromosome arms had the \textit{tract\_anchor} position further away from the end of the contig than others ($\sim$79\textendash{}586 Kbp into the chromosome sequence), and the reads mapping to these arms did not contain these motifs, suggesting that either their subtelomeric annotations were incorrect or large insertions or duplications were present in the reference genome; in light of this, reads mapping to the \mbox{\textit{p} arm} of chr1, the \textit{q} arm of chr4, and both arms of chr20 were removed from the study, and the analysis was repeated. - -\subsection*{Extraction of telomeric haplotypes} -\addcontentsline{toc}{subsection}{Extraction of telomeric haplotypes} -Within groups of reads mapping to each chromosome arm, all relative pairwise Levenshtein distances were calculated. -In short, to calculate the absolute distance between each pair of reads, the sequences in the overlapping positions of the reads were extracted; the distance then equaled the minimum number of single-character insertions, deletions, and substitutions required to make these sequences identical. -The relative distance was computed as the absolute distance divided by the length of the overlap. -Relative distances were then clustered using Ward's method via the Euclidean metric. 
-The optimal number of clusters was determined by maximizing the Bayesian information criterion \cite{bic}, allowing for no more than one outlier and at least five reads per cluster, and silhouette scores for these clusterings were calculated. -Briefly, as previously described \cite{silhouette}, a silhouette score of a clustering was computed as the mean value of silhouette coefficients of all entries, which, in turn, equaled $ (b - a) \over{max(a, b)} $ where $ a = $ {\rmfamily mean intra-cluster distance} and $ b = $ {\rmfamily mean nearest-cluster distance} for an entry. -Levenshtein distances of all within-cluster pairings and of all out-of-cluster pairings were compared using the one-tailed Mann-Whitney U test; \textit{p}-values were adjusted with the Bonferroni correction. -Distinct clusters of reads within the same chromosome arm (adjusted Mann-Whitney U \textit{p}-value below 0.05) were reported as putative haplotypes. -As the HG002 dataset was combined from two sequencing experiments, we investigated the provenance of reads in these haplotypes; reads from both sequencing experiments contributed to each haplotype with an average $\sim$1:2 ratio (\autoref{tab:hg002_haplotype_assignment}). - -\section*{Availability and implementation} \addcontentsline{toc}{section}{Availability and implementation} -The software for identification of telomeric reads, \textit{de novo} discovery of repeat motifs, haplotype inference and motif density visualization was implemented in Python and is freely available at \\%rem -\href{https://github.com/lankycyril/edgecase}{github.com/lankycyril/edgecase}. 
- -\section*{Acknowledgements} \addcontentsline{toc}{section}{Acknowledgements} -We would like to thank -the Epigenomics Core Facility at Weill Cornell Medicine, -the Scientific Computing Unit (SCU), -XSEDE Supercomputing Resources, -as well as -the STARR grants I9-A9-071, I13-0052, -The Vallee Foundation, -The WorldQuant Foundation, -The Pershing Square Sohn Cancer Research Alliance, -NASA (NNX14AH51G, NNX14AB02G, NNX17AB26G), -The National Institutes of Health (R01MH117406, \\%rem -R01NS076465, R01CA249054, R01AI151059, P01HD067244, P01CA214274), -TRISH (NNX16AO69A:0107, \\%rem -NNX16AO69A:0061), -the LLS (9238-16, Mak, MCL-982, Chen-Kiang), -and -the NSF (1840275). - -\section*{Author contributions} \addcontentsline{toc}{section}{Author contributions} -S.M.B. and C.E.M. conceived the study. -K.G., J.F., and C.E.M. developed the framework and analyzed the data. -D.Bu., J.J.L., M.J.M., L.T., and K.A.G. participated in collection and processing of the ISS samples. -D.Be., D.Bu., J.J.L, J.R., and C.M. analyzed the data. -All authors edited the manuscript. - -\section*{Competing interests} \addcontentsline{toc}{section}{Competing interests} -The authors declare no relevant conflict of interest. - -\section*{References} \addcontentsline{toc}{section}{References} -\begingroup \raggedright \singlespacing \printbibliography[heading=none] \endgroup - -\pagebreak -\section*{Figures} \addcontentsline{toc}{section}{Figures} - -\begin{figure}[h!] \centering -\includegraphics[height=.85\textheight,width=\textwidth,keepaspectratio]{figures/HG002-alignment.pdf} -\caption{ - Mapping of candidate telomeric PacBio CCS reads from the HG002 dataset. - Chromosomes are displayed schematically, centered around the centromere, with only the arms shown to which candidate reads aligned. - Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. - Coordinates are given in bp, relative to the positions of the telomeric tract boundaries. 
-} -\label{fig:hg002_alignment} -\end{figure} -\clearpage \pagebreak - -%\addcontentsline{toc}{subsection}{\autoref{fig:hg002_densityplot_q_arm}} -\begin{figure}[h!] \centering -\includegraphics[height=\textheight,width=\textwidth,keepaspectratio]{figures/HG002-densityplot-q_arm.pdf} -\caption{ - Motif densities at ends of chromosomal \textit{q} arms of the HG002 dataset. - Only the arms covered by at least 20 reads are displayed. - Shaded boxes span the mapped regions of the genome. - Motif densities are plotted as stacked area charts; ribbons surrounding area boundaries represent the 95\% confidence interval of bootstrap. - Top four enriched motifs (contributing to at least 0.5\% of the repeat content) are plotted in color; pale tinted areas represent the density of any other motifs and non-repeating sequences (absence of enriched motifs). - Absolute genomic coordinates are given in Mbp on the specific reference contigs the reads mapped to (for example, for chr5, reads mapped to the 500 Kbp-long subtelomeric assembly \mbox{5qtel\_1-500K\_1\_12\_12}). - Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. -} -\label{fig:hg002_densityplot_q_arm} -\end{figure} -\clearpage \pagebreak - -%\addcontentsline{toc}{subsection}{\autoref{fig:levenshtein_q_arm}} -\begin{figure}[ht!] \centering -\includegraphics[height=.9\textheight,width=\textwidth,keepaspectratio]{figures/HG002-levenshtein-densityplots.pdf} -\caption{ - Clustering of reads into haplotypes based on relative pairwise Levenshtein distances on four representative chromosomal \textit{q} arms in the HG002 dataset, and densities of top enriched motifs in each haplotype. - Genomic coordinates are given in Mbp. - Read coverage of each haplotype is annotated above the density plot. 
-} -\label{fig:levenshtein_q_arm} -\end{figure} -\clearpage \pagebreak - -\pagebreak -\section*{Tables} \addcontentsline{toc}{section}{Tables} - -\input{tables/repeatfinder-full.tex} -\input{tables/HG002-levenshtein-q_arm.tex} - -\beginsupplement - -\singlespacing -\addcontentsline{toc}{section}{Supplementary materials} -\begin{center} - \LARGE{\textbf{Haplotype Diversity and Sequence Heterogeneity of Human Telomeres}} - \\~\\ - \small{Kirill Grigorev, Jonathan Foox \textit{et al.}} -\section*{Supplementary materials} -\end{center} -\doublespacing - -\subsection*{Supplementary methods} \addcontentsline{toc}{subsection}{Supplementary methods} \label{sec:supp_methods} - -To test the presence of non-canonical repeat motifs in datasets generated by technologies other than SMRT, -we generated four whole-genome Illumina datasets (mean coverage $\sim$104x) -and three linked-read 10X datasets (mean coverage $\sim$28x) -for one individual at different timepoints aboard the International Space Station (ISS), -and one additional linked-read 10X dataset (coverage $\sim$47x) for another individual aboard the ISS. -From these datasets, candidate telomeric short reads were selected using \textit{Telomerecat} \cite{telomerecat}, -and enriched repeated motifs were discovered \textit{de novo} with the method described in \hyperref[sec:methods]{Materials and Methods}. -\textit{p}-values were combined with the Mudholkar-George method \cite{george} within each technology (Illumina, 10X), -and the Bonferroni multiple testing correction was applied -(note: the Bonferroni correction was applied simultaneously to the \textit{p}-values for the motifs in PacBio reads (\autoref{tab:repeatfinder_full}) and for the motifs in short and linked reads). 
-Motifs that were significantly enriched (adjusted \textit{p}-value below the cutoff of 0.05) in the datasets produced by all three technologies (PacBio, Illumina, 10X), with respect to reverse-complemented equivalence, were reported (\autoref{tab:shortread_repeatfinder}). - -\pagebreak - -\subsection*{Supplementary figures} \addcontentsline{toc}{subsection}{Supplementary figures} \label{sec:supp_figs} - -\begin{figure}[ht!] \centering -\includegraphics[height=.65\textheight,width=\textwidth,keepaspectratio]{figures/HG00X-alignments.pdf} -\caption{ - Mapping of candidate telomeric PacBio CCS reads from datasets (A) HG001 and (B) HG005. - Chromosomes are displayed schematically, centered around the centromere, with only the arms shown to which candidate reads aligned. - Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. - Coordinates are given in bp, relative to the positions of the telomeric tract boundaries. -} -\label{fig:hg00x_alignments} -\end{figure} -\clearpage \pagebreak - -\begin{figure}[ht!] \centering -\includegraphics[height=.9\textheight,width=\textwidth,keepaspectratio]{figures/threemotifp/HG002-densityplot-p_arm-threemotifp.pdf} -\caption{ - Densities of top three enriched motifs (contributing to at least 0.5\% of the repeat content) at ends of chromosomal \textit{p} arms of the HG002 dataset. - Only the arms covered by at least 20 reads are displayed. - Genomic coordinates are given in Mbp. - Vertical red dashed lines denote the position of the boundary of the annotated telomeric tract. -} -\label{fig:hg002_densityplot_p_arm} -\end{figure} -\clearpage \pagebreak - -\begin{figure}[ht!] \centering -\includegraphics[height=.95\textheight,width=\textwidth,keepaspectratio]{figures/threemotifp/HG001-densityplots-threemotifp.pdf} -\caption{ - Motif densities at ends of chromosomal (A) \textit{p} and (B) \textit{q} arms of the HG001 dataset. - Only the arms covered by at least 20 reads are displayed. 
- Genomic coordinates are given in Mbp. -} -\label{fig:hg001_densityplots} -\end{figure} -\clearpage \pagebreak - -\begin{figure}[ht!] \centering -\includegraphics[height=.95\textheight,width=\textwidth,keepaspectratio]{figures/threemotifp/HG005-densityplots-threemotifp.pdf} -\caption{ - Motif densities at ends of chromosomal (A) \textit{p} and (B) \textit{q} arms of the HG005 dataset. - Only the arms covered by at least 20 reads are displayed. - Genomic coordinates are given in Mbp. -} -\label{fig:hg005_densityplots} -\end{figure} -\clearpage \pagebreak - -\begin{figure}[ht!] \centering -\includegraphics[height=.95\textheight,width=.65\textwidth,keepaspectratio]{figures/entropy.pdf} -\caption{ - Distribution of motif entropies in 10 bp windows of candidate PacBio CCS reads aligning to the same chromosomal arms in GIAB datasets HG001, HG002, and HG005. - Red solid lines denote the position of the median (0.000 in all three datasets), and red dashed lines denote the 3rd quartile (0.166, 0.074, and 0.211, respectively). 
-} -\label{fig:entropy} -\end{figure} -\clearpage \pagebreak - -\subsection*{Supplementary tables} \addcontentsline{toc}{subsection}{Supplementary tables} -\input{tables/telomeric-read-counts.tex} -\input{tables/shortread-repeatfinder.tex} -\input{tables/HG002-haplotype-assignment.tex} - -\end{document} diff --git a/publications/methods-paper/heterogeneity-of-telomeres-revealed-by-long-read-sequencing.docx b/publications/methods-paper/heterogeneity-of-telomeres-revealed-by-long-read-sequencing.docx deleted file mode 100644 index 8577b22..0000000 Binary files a/publications/methods-paper/heterogeneity-of-telomeres-revealed-by-long-read-sequencing.docx and /dev/null differ diff --git a/publications/methods-paper/heterogeneity-of-telomeres-revealed-by-long-read-sequencing.odt b/publications/methods-paper/heterogeneity-of-telomeres-revealed-by-long-read-sequencing.odt deleted file mode 100644 index 107c320..0000000 Binary files a/publications/methods-paper/heterogeneity-of-telomeres-revealed-by-long-read-sequencing.odt and /dev/null differ diff --git a/publications/methods-paper/makefile b/publications/methods-paper/makefile deleted file mode 100644 index 1e66e7f..0000000 --- a/publications/methods-paper/makefile +++ /dev/null @@ -1,13 +0,0 @@ -all: heterogeneity-of-telomeres-revealed-by-long-read-sequencing.pdf - -%.pdf: %.tex - pdflatex $< - bibtex $(basename $<) - pdflatex $< - pdflatex $< - rm -f $(basename $<).aux $(basename $<).bbl $(basename $<).blg - rm -f $(basename $<)-blx.bib $(basename $<).log $(basename $<).run.xml - rm -f $(basename $<).toc - -%.odt: %.tex - ./tex2office -i references.bib figures tables --svg heterogeneity-of-telomeres-revealed-by-long-read-sequencing.tex diff --git a/publications/methods-paper/snakefiles/bootstraps.snake b/publications/methods-paper/snakefiles/bootstraps.snake deleted file mode 100644 index 54f53cf..0000000 --- a/publications/methods-paper/snakefiles/bootstraps.snake +++ /dev/null @@ -1,146 +0,0 @@ -from numpy import 
sort, trim_zeros -from numpy.random import choice -from seaborn import kdeplot - - -BOOTSTRAP_W = 10 - - -rule kmerscan_for_bootstrap: - """Performs kmerscan with a different window size (BOOTSTRAP_W) for further bootstrapping. - Note: for visualization in densityplots, we ended up relying on seaborn's internal bootstrapping procedures, which are identical - to the ones outlined here.""" - input: - bam="data/datasets/GIAB/PacBio/{sample}-tailpuller.bam", - tsv="data/datasets/GIAB/PacBio/repeatfinder-paper-{arm}.tsv" - output: - dat="data/datasets/GIAB/PacBio/{sample}-bootstrap/kmerscanner-{arm}.dat.gz" - run: - shell_flags, _ = get_sam_flags(wildcards.arm) - shell(""" - ./edgecase kmerscanner -w {BOOTSTRAP_W} {shell_flags} \ - --motif-file {input.tsv} {input.bam} | gzip -2 > {output.dat} - """) - - -def bootstrap_margin(bdf, n_boot, ci_f, dropna=True): - """stackoverflow.com/a/38667081""" - bdf_wide = bdf.set_index("name").iloc[:,8:].copy() - N = len(bdf_wide) - boot_means = concat( - objs=[bdf_wide.iloc[choice(N, N)].mean(axis=0) for _ in range(n_boot)], - axis=1 - ) - sorted_boot_means = sort(boot_means.values, axis=1) - bdf_wide_mean = bdf_wide.mean(axis=0).values - low = sorted_boot_means[:,int(n_boot*(1-ci_f)/2)] - high = sorted_boot_means[:,int(n_boot*(1+ci_f)/2)] - weight = (~bdf_wide.isnull()).sum(axis=0) - margins = DataFrame( - data=array([bdf_wide_mean, (high-low)/2, weight]).T, - columns=["mean", "margin", "weight"], - index=bdf_wide.columns - ) - if dropna: - return margins.dropna(how="any", axis=0) - else: - return margins - - -rule bootstrap_per_sample: - """Performs bootstrapping""" - input: dat="data/datasets/GIAB/PacBio/{sample}-bootstrap/kmerscanner-{arm}.dat.gz" - output: tsv="data/datasets/GIAB/PacBio/{sample}-bootstrap/bootstrap-{arm}.tsv.gz" - params: n_boot=1000, ci_f=.95 - run: - _, samfilters = get_sam_flags(wildcards.arm) - ks = load_kmerscan( - input.dat, gzipped=True, samfilters=samfilters, bin_size=BOOTSTRAP_W - ) - with 
gzopen(output.tsv, mode="wt") as tsv: - header = ["mean", "margin", "weight", "chrom", "motif", "arm"] - print(*header, sep="\t", file=tsv) - for chrom, bdf in progressbar(ks.items(), desc="Bootstrapping"): - for motif in bdf["motif"].drop_duplicates(): - margins = bootstrap_margin( - bdf[bdf["motif"]==motif], params.n_boot, params.ci_f - ) - margins["chrom"], margins["motif"], margins["arm"] = ( - chrom, motif, wildcards.arm[0] - ) - margins_repr = margins.to_csv( - sep="\t", index=False, header=False - ) - print(margins_repr.rstrip(), file=tsv) - - -def filter_margins(margins, arm, zeros_action="keep"): - """Not used""" - if zeros_action == "keep": - margins_filtered = margins - elif zeros_action == "trim": - margins_groupby = margins.groupby( - ["chrom", "motif", "arm"], as_index=False - ) - margins_filtered = margins_groupby.apply( - lambda block: block.loc[trim_zeros(block["mean"]).index,:] - ) - elif zeros_action == "remove": - margins_filtered = margins[margins["mean"]!=0] - else: - raise ValueError("Unknown `zeros_action`: '{}'".format(zeros_action)) - return margins_filtered[ - margins_filtered["arm"]==arm - ].dropna(how="any", axis=0) - - -rule bootstrap_combined: - """Combines bootstraps for multiple samples into one dataframe; not used in the paper""" - input: - tsvs=expand( - "data/datasets/GIAB/PacBio/{sample}-bootstrap/bootstrap-{arm}.tsv.gz", - sample=PACBIO_NAME_TO_SAMPLE.values(), arm=["p_arm", "q_arm"] - ) - output: - tsv="data/datasets/GIAB/PacBio/bootstraps.tsv.gz" - run: - bootstraps = [ - read_csv(tsv, sep="\t").dropna(how="any", axis=0) - for tsv in input.tsvs - ] - concat(bootstraps, axis=0).to_csv( - output.tsv, compression="gzip", sep="\t", index=False - ) - - -rule bootstrap_median: - """Visualizes values of bootstrap median; not used in the paper""" - input: tsv="data/datasets/GIAB/PacBio/bootstraps.tsv.gz" - output: pdf="data/datasets/GIAB/PacBio/bootstraps.pdf" - params: weighted=False - run: - margins = read_csv(input.tsv, sep="\t") 
- if params.weighted: - med, q98 = ( - weighted_quantile( - margins["margin"].values, margins["weight"].values, q - ) - for q in [.5, .98] - ) - else: - med, q98 = (margins["margin"].quantile(q) for q in [.5, .98]) - print(med, q98) - switch_backend("pdf") - figure, ax = subplots(figsize=(6, 2.4)) - kdeplot( - data=margins["margin"], color="darkblue", shade=True, - bw=.0035, cut=0, - ax=ax, legend=False - ) - ax.set( - xlim=(0, .5), xlabel="95% margin of error of bootstrap", - ylim=(0, ax.get_ylim()[1]), - yticks=[], ylabel="kernel density estimate" - ) - ax.plot([q98, q98], [0, .32], lw=2, ls=":", color="#F01000") - figure.savefig(output.pdf, bbox_inches="tight") diff --git a/publications/methods-paper/snakefiles/old-shortread.snake b/publications/methods-paper/snakefiles/old-shortread.snake deleted file mode 100644 index 4dbd3d3..0000000 --- a/publications/methods-paper/snakefiles/old-shortread.snake +++ /dev/null @@ -1,81 +0,0 @@ -OLD_SHORTREAD_NASA_SAMPLES = [ - "10X/Subject_1_1/Subject_1_1_Longranger_2_1_4_phased_possorted-telomerecat", - "10X/Subject_1_2/Subject_1_2_Longranger_2_1_4_phased_possorted-telomerecat", - "10X/Subject_1_3/Subject_1_3_Longranger_2_1_4_phased_possorted-telomerecat", - "10X/Subject_2/Subject_2_Longranger_2_1_4_phased_possorted-telomerecat", - "Illumina/A/A-telomerecat", "Illumina/B/B-telomerecat", - "Illumina/C/C-telomerecat", "Illumina/D/D-telomerecat", - "Illumina/PINE_MAR15_01/PINE_MAR15_01-telomerecat", - "Illumina/PINE_MAR15_02/PINE_MAR15_02-telomerecat", - "RNASeq/reads_out_unfiltered", -] - -OLD_SHORTREAD_SAMPLES = [ - "NASA/" + sample for sample in OLD_SHORTREAD_NASA_SAMPLES -] - - -rule old_shortread_fasta: - input: bam="data/datasets/{sample}.bam" - output: fa="data/datasets/{sample}.fa" - shell: """ - samtools view -F3844 {input.bam} \ - | bioawk -c sam '{{print ">"$qname; print $seq}}' \ - > {output.fa} - """ - - -rule old_shortread_repeatfinder: - input: fa="data/datasets/{sample}.fa" - output: 
tsv="data/datasets/{sample}-repeatfinder.tsv" - threads: 4 - shell: """ - ./edgecase repeatfinder -j {threads} \ - -m 4 -M 16 -P 1.1 --fmt fastx {input.fa} > {output.tsv} - """ - - -rule old_shortread_repeatfinder_combined: - input: - tsvs=expand( - "data/datasets/{sample}-repeatfinder.tsv", - sample=OLD_SHORTREAD_SAMPLES - ) - output: - tsv="data/datasets/shortread-combined-repeatfinder.tsv" - params: - adjust=False, alpha=1.1 - run: - rfs = OrderedDict([( - tsv.split("/")[4].split("-")[0], - read_csv(tsv, sep="\t", usecols=(0, 4, 5)) - ) for tsv in input.tsvs - ]) - for name, rf in rfs.items(): - rf.columns = ["monomer", name, name + " p-value"] - # fix old repeatfinder's issue with 1-3-mers reported too short: - rf["monomer"] = rf["monomer"].apply(lambda m: m*int(ceil(4/len(m)))) - if params.adjust: # adjust together - rf_combined = reduce(partial(merge, on="monomer"), rfs.values()) - p_locs = [c for c in rf_combined.columns if c.endswith(" p-value")] - pvals = rf_combined[p_locs].fillna(1).values.flatten() - flattened_p_adjusted = multipletests(pvals, method="bonferroni")[1] - p_adjusted = DataFrame( - data=flattened_p_adjusted.reshape(-1, len(p_locs)), - index=rf_combined.index, - columns=[c+"-adjusted" for c in p_locs], - ) - rf_adjusted = concat([rf_combined, p_adjusted], axis=1) - rf_adjusted = rf_adjusted[(p_adjusted\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namechromhaplotypesample
0m54238_180701_114913/17892208/ccs18qtel_1-500K_1_12_12_rc2HG002.10kb
1m54316_180719_115850/24379831/ccs18qtel_1-500K_1_12_12_rc2HG002.10kb
2m54315_180714_142044/68878424/ccs18qtel_1-500K_1_12_12_rc1HG002.10kb
\n", - "" - ], - "text/plain": [ - " name chrom haplotype \\\n", - "0 m54238_180701_114913/17892208/ccs 18qtel_1-500K_1_12_12_rc 2 \n", - "1 m54316_180719_115850/24379831/ccs 18qtel_1-500K_1_12_12_rc 2 \n", - "2 m54315_180714_142044/68878424/ccs 18qtel_1-500K_1_12_12_rc 1 \n", - "\n", - " sample \n", - "0 HG002.10kb \n", - "1 HG002.10kb \n", - "2 HG002.10kb " - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.merge(kmerscan, sample_assignment)[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "assigner = pd.merge(kmerscan, sample_assignment).drop(columns=\"name\")\n", - "assigner[\"count\"] = 1\n", - "calculator = assigner.groupby([\"chrom\", \"haplotype\", \"sample\"], as_index=False).sum()\n", - "h1 = calculator.query(\"haplotype=='1'\").pivot(index=\"chrom\", columns=\"sample\", values=\"count\").reset_index()\n", - "h1[\"haplotype\"] = 1\n", - "h2 = calculator.query(\"haplotype=='2'\").pivot(index=\"chrom\", columns=\"sample\", values=\"count\").reset_index()\n", - "h2[\"haplotype\"] = 2\n", - "\n", - "assignment = pd.concat([h1, h2], axis=0).fillna(0)\n", - "assignment.columns.name = None\n", - "assignment[\"HG002.10kb\"] = assignment[\"HG002.10kb\"].astype(int)\n", - "assignment[\"HG002.15kb\"] = assignment[\"HG002.15kb\"].astype(int)\n", - "assignment = assignment.sort_values(by=[\"chrom\", \"haplotype\"]).iloc[:,[0,3,1,2]]\n", - "assignment.to_csv(\"HG002.10kb+15kb-haplotype-assignment.tsv\", sep=\"\\t\", index=False)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.1" - } - }, - "nbformat": 4, - 
"nbformat_minor": 2 -} diff --git a/publications/methods-paper/tables/repeatfinder-full.tex b/publications/methods-paper/tables/repeatfinder-full.tex deleted file mode 100644 index 3dc16de..0000000 --- a/publications/methods-paper/tables/repeatfinder-full.tex +++ /dev/null @@ -1,38 +0,0 @@ -\begin{samepage} \begin{table}[h!] \small \begin{tabular}{llllll} -\hline -\textbf{Motif} & \textbf{Arm} & \multicolumn{3}{l}{\textbf{Abundance}} & \textbf{Combined adjusted p-value} \\ -\textbf{} & \textbf{} & \textbf{HG001} & \textbf{HG002} & \textbf{HG005} & \textbf{} \\ -\hline -TTAGGG & q & 0.506392 & 0.432620 & 0.544289 & 0.00e+0 \\ -TGAGGG & q & 0.011527 & 0.017107 & 0.025117 & 8.33e-46 \\ -TTGGGG & q & 0.014520 & 0.021062 & 0.015897 & 2.13e-45 \\ -TTAGGGG & q & 0.014502 & 0.013954 & 0.015123 & 0.00e+0 \\ -TAGGG & q & 0.004637 & 0.003448 & 0.005159 & 4.46e-32 \\ -TTAGG & q & 0.004529 & 0.003182 & 0.004487 & 1.04e-30 \\ -TTTAGGG & q & 0.004508 & 0.003721 & 0.003078 & 7.95e-33 \\ -TTAGGGTTAGGGG & q & 0.004357 & 0.003782 & 0.004980 & 1.44e-38 \\ -TTAGGTTAGGG & q & 0.003650 & 0.002806 & 0.003548 & 4.97e-36 \\ -TAGGGTTAGGG & q & 0.003597 & 0.003075 & 0.004034 & 4.60e-37 \\ -TTGGG & q & 0.002112 & 0.000966 & 0.002253 & 3.79e-12 \\ -TTAAGGG & q & 0.001827 & 0.002942 & 0.002856 & 8.68e-26 \\ -TTGGGTTAGGG & q & 0.001317 & 0.000530 & 0.001532 & 3.67e-20 \\ -TTAGGGTTTAGGG & q & 0.001104 & 0.001442 & 0.001486 & 1.66e-22 \\ -TTAGGGTTAAGGG & q & 0.000490 & 0.001690 & 0.000495 & 1.00e-15 \\ -TTAGGGGG & q & 0.000358 & 0.001613 & 0.000526 & 0.00e+0 \\ -TTAGGGTTGTTAGGG & q & 0.000272 & 0.000500 & 0.000438 & 2.13e-28 \\ -TTAGAGGG & q & 0.000146 & 0.000046 & 0.000103 & 4.72e-3 \\ -TTGGGGTTGGGGG & q & 0.000073 & 0.000095 & 0.000075 & 2.35e-7 \\ -TGGTTAGGGTTAGGG & q & 0.000045 & 0.000208 & 0.000164 & 1.62e-12 \\ -CCCTAA & p & 0.099720 & 0.226981 & 0.188396 & 0.00e+0 \\ -CCCCAA & p & 0.010279 & 0.010869 & 0.007581 & 5.65e-31 \\ -CCCCTAA & p & 0.007083 & 0.007903 & 0.006217 & 8.83e-47 \\ -CCGCG 
& p & 0.002097 & 0.001710 & 0.002713 & 9.93e-40 \\ -CCCG & p & 0.000703 & 0.000330 & 0.000493 & 2.09e-31 \\ -GGGG & p & 0.000502 & 0.000122 & 0.000202 & 2.42e-22 \\ -TTTT & p & 0.000183 & 0.000109 & 0.000153 & 9.52e-17 \\ -\hline -\end{tabular} -\caption{Significantly enriched repeating motifs in telomeric regions of GIAB datasets HG001, HG002, and HG005.} -\label{tab:repeatfinder_full} -\end{table} -\end{samepage} diff --git a/publications/methods-paper/tables/shortread-repeatfinder.tex b/publications/methods-paper/tables/shortread-repeatfinder.tex deleted file mode 100644 index b8cad97..0000000 --- a/publications/methods-paper/tables/shortread-repeatfinder.tex +++ /dev/null @@ -1,33 +0,0 @@ -\begin{samepage} \begin{table}[h!] \small \begin{tabular}{lllll} -\hline -\textbf{Motif} & \multicolumn{2}{l}{\textbf{Illumina datasets}} & \multicolumn{2}{l}{\textbf{10X datasets}} \\ -\textbf{} & \textbf{Median abundance} & \textbf{Adjusted p-value} & \textbf{Median abundance} & \textbf{Adjusted p-value} \\ -\hline -TTAGGG & 0.299068 & 0.00e+0 & 0.461711 & 0.00e+0 \\ -TGAGGG & 0.007484 & 0.00e+0 & 0.018524 & 0.00e+0 \\ -TTGGGG & 0.002495 & 0.00e+0 & 0.007190 & 0.00e+0 \\ -GGGG & 0.020347 & 0.00e+0 & 0.006080 & 0.00e+0 \\ -TTAGGGG & 0.003007 & 0.00e+0 & 0.005024 & 0.00e+0 \\ -TTTT & 0.001294 & 0.00e+0 & 0.001490 & 0.00e+0 \\ -TTAAGGG & 0.000664 & 1.39e-55 & 0.001124 & 1.58e-59 \\ -TTAGGGGTTAGGG & 0.000533 & 1.04e-51 & 0.001020 & 0.00e+0 \\ -TAGGG & 0.000619 & 0.00e+0 & 0.001020 & 0.00e+0 \\ -TTGGG & 0.000500 & 0.00e+0 & 0.000989 & 0.00e+0 \\ -TTTAGGG & 0.000622 & 6.40e-55 & 0.000884 & 1.02e-57 \\ -TAGGGTTAGGG & 0.000312 & 4.24e-40 & 0.000503 & 0.00e+0 \\ -TTAGGGTTTAGGG & 0.000176 & 4.41e-38 & 0.000284 & 6.22e-59 \\ -TTAGGGTTAAGGG & 0.000145 & 6.63e-36 & 0.000264 & 4.15e-57 \\ -TTAGG & 0.000241 & 8.13e-35 & 0.000213 & 1.10e-55 \\ -TTGGGTTAGGG & 0.000127 & 4.47e-28 & 0.000178 & 3.34e-56 \\ -TTAGGGTTAGG & 0.000066 & 1.99e-18 & 0.000092 & 7.82e-48 \\ -TTAGGGGG & 0.000039 & 
1.02e-14 & 0.000062 & 4.31e-40 \\ -TTAGGGTTGTTAGGG & 0.000035 & 4.64e-09 & 0.000061 & 4.65e-57 \\ -TTAGAGGG & 0.000036 & 5.44e-13 & 0.000053 & 2.66e-36 \\ -TTGGGGTTGGGGG & 0.000002 & 4.51e-13 & 0.000014 & 5.84e-21 \\ -TTAGGGTGGTTAGGG & 0.000007 & 5.39e-06 & 0.000013 & 5.42e-38 \\ -\hline -\end{tabular} -\caption{Significantly enriched repeating motifs in telomeric candidate reads in short-read sequencing experiments, subset to motifs also observed in PacBio telomeric reads, with respect to reverse-complement equivalence.} -\label{tab:shortread_repeatfinder} -\end{table} -\end{samepage} diff --git a/publications/methods-paper/tables/telomeric-read-counts.tex b/publications/methods-paper/tables/telomeric-read-counts.tex deleted file mode 100644 index ef6ace6..0000000 --- a/publications/methods-paper/tables/telomeric-read-counts.tex +++ /dev/null @@ -1,36 +0,0 @@ -\begin{samepage} \begin{table}[h!] \begin{tabular}{llllll} -\hline -\textbf{Chromosome} & \textbf{Reference contig} & \textbf{Arm} & \textbf{HG001} & \textbf{HG002} & \textbf{HG005} \\ -\hline -chr2 & 2qtel\_1-500K\_1\_12\_12\_rc & q & 0 & 0 & 1 \\ -chr2 & chr2 & p & 5 & 16 & 3 \\ -chr5 & 5qtel\_1-500K\_1\_12\_12\_rc & q & 42 & 53 & 23 \\ -chr5 & chr5 & p & 4 & 15 & 5 \\ -chr6 & 6qtel\_1-500K\_1\_12\_12\_rc & q & 31 & 49 & 29 \\ -chr7 & chr7 & q & 8 & 32 & 10 \\ -chr8 & chr8 & q & 14 & 35 & 14 \\ -chr9 & chr9 & p & 6 & 6 & 0 \\ -chr10 & 10qtel\_1-500K\_1\_12\_12\_rc & q & 0 & 1 & 0 \\ -chr10 & chr10 & p & 1 & 2 & 1 \\ -chr11 & chr11 & q & 11 & 31 & 9 \\ -chr12 & chr12 & q & 10 & 27 & 18 \\ -chr12 & chr12 & p & 4 & 5 & 3 \\ -chr14 & 14qtel\_1-500K\_1\_12\_12\_rc & q & 8 & 26 & 6 \\ -chr15 & chr15 & q & 25 & 21 & 26 \\ -chr16 & 16qtel\_1-500K\_1\_12\_12\_rc & q & 0 & 2 & 0 \\ -chr16 & chr16 & p & 1 & 0 & 0 \\ -chr17 & 17qtel\_1-500K\_1\_12\_12v2\_rc & q & 0 & 4 & 0 \\ -chr17 & 17ptel\_1\_500K\_1\_12\_12 & p & 0 & 1 & 1 \\ -chr18 & 18qtel\_1-500K\_1\_12\_12\_rc & q & 4 & 26 & 6 \\ -chr18 & chr18 & p & 11 & 35 & 
7 \\ -chr19 & 19ptel\_1-500K\_1\_12\_12 & p & 0 & 1 & 1 \\ -chr19 & chr19 & q & 6 & 0 & 16 \\ -chr21 & chr21 & q & 35 & 77 & 35 \\ -chr22 & chr22 & q & 2 & 51 & 5 \\ -chrX & chrX & q & 28 & 54 & 22 \\ -\hline -\end{tabular} -\caption{The number of telomeric reads on each arm identified in GIAB PacBio CCS datasets HG001, HG002, and HG005. Relates to: \textbf{Figure 1}, \textbf{Figure S1}.} -\label{tab:telomeric_read_counts} -\end{table} -\end{samepage} diff --git a/publications/methods-paper/versions/20200430/figures-20200430.zip b/publications/methods-paper/versions/20200430/figures-20200430.zip deleted file mode 100644 index 63bcd82..0000000 Binary files a/publications/methods-paper/versions/20200430/figures-20200430.zip and /dev/null differ diff --git a/publications/methods-paper/versions/20200430/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200430.docx b/publications/methods-paper/versions/20200430/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200430.docx deleted file mode 100644 index 8577b22..0000000 Binary files a/publications/methods-paper/versions/20200430/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200430.docx and /dev/null differ diff --git a/publications/methods-paper/versions/20200430/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200430.pdf b/publications/methods-paper/versions/20200430/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200430.pdf deleted file mode 100644 index 5353e4a..0000000 Binary files a/publications/methods-paper/versions/20200430/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200430.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200514/figures-20200514.zip b/publications/methods-paper/versions/20200514/figures-20200514.zip deleted file mode 100644 index 63bcd82..0000000 Binary files a/publications/methods-paper/versions/20200514/figures-20200514.zip and /dev/null differ diff --git 
a/publications/methods-paper/versions/20200514/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200514-KG SB KG.docx b/publications/methods-paper/versions/20200514/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200514-KG SB KG.docx deleted file mode 100644 index d5e1571..0000000 Binary files a/publications/methods-paper/versions/20200514/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200514-KG SB KG.docx and /dev/null differ diff --git a/publications/methods-paper/versions/20200514/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200514-KG.docx b/publications/methods-paper/versions/20200514/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200514-KG.docx deleted file mode 100644 index f7a7581..0000000 Binary files a/publications/methods-paper/versions/20200514/heterogeneity-of-telomeres-revealed-by-long-read-sequencing-20200514-KG.docx and /dev/null differ diff --git a/publications/methods-paper/versions/20200609/figures/Figure 1.pdf b/publications/methods-paper/versions/20200609/figures/Figure 1.pdf deleted file mode 100644 index ec629a2..0000000 Binary files a/publications/methods-paper/versions/20200609/figures/Figure 1.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200609/figures/Figure 2.pdf b/publications/methods-paper/versions/20200609/figures/Figure 2.pdf deleted file mode 100644 index d8dac5c..0000000 Binary files a/publications/methods-paper/versions/20200609/figures/Figure 2.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200609/figures/Figure 3.pdf b/publications/methods-paper/versions/20200609/figures/Figure 3.pdf deleted file mode 100644 index 424bcb5..0000000 Binary files a/publications/methods-paper/versions/20200609/figures/Figure 3.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx 
b/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx deleted file mode 100644 index a0b9484..0000000 Binary files a/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx and /dev/null differ diff --git a/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf b/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf deleted file mode 100644 index acdd539..0000000 Binary files a/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx b/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx deleted file mode 100644 index 8a592ee..0000000 Binary files a/publications/methods-paper/versions/20200609/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx and /dev/null differ diff --git a/publications/methods-paper/versions/20200630/figures/Figure 1.pdf b/publications/methods-paper/versions/20200630/figures/Figure 1.pdf deleted file mode 100644 index ec629a2..0000000 Binary files a/publications/methods-paper/versions/20200630/figures/Figure 1.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200630/figures/Figure 2.pdf b/publications/methods-paper/versions/20200630/figures/Figure 2.pdf deleted file mode 100644 index d8dac5c..0000000 Binary files a/publications/methods-paper/versions/20200630/figures/Figure 2.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200630/figures/Figure 3.pdf b/publications/methods-paper/versions/20200630/figures/Figure 3.pdf 
deleted file mode 100644 index 424bcb5..0000000 Binary files a/publications/methods-paper/versions/20200630/figures/Figure 3.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx b/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx deleted file mode 100644 index 2eb1fd0..0000000 Binary files a/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx and /dev/null differ diff --git a/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf b/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf deleted file mode 100644 index acdd539..0000000 Binary files a/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx b/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx deleted file mode 100644 index b881a70..0000000 Binary files a/publications/methods-paper/versions/20200630/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx and /dev/null differ diff --git a/publications/methods-paper/versions/20200701/figures/Figure 1.pdf b/publications/methods-paper/versions/20200701/figures/Figure 1.pdf deleted file mode 100644 index ec629a2..0000000 Binary files a/publications/methods-paper/versions/20200701/figures/Figure 1.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200701/figures/Figure 2.pdf b/publications/methods-paper/versions/20200701/figures/Figure 2.pdf deleted file 
mode 100644 index d8dac5c..0000000 Binary files a/publications/methods-paper/versions/20200701/figures/Figure 2.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200701/figures/Figure 3.pdf b/publications/methods-paper/versions/20200701/figures/Figure 3.pdf deleted file mode 100644 index 424bcb5..0000000 Binary files a/publications/methods-paper/versions/20200701/figures/Figure 3.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx b/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx deleted file mode 100644 index 2eb1fd0..0000000 Binary files a/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-KRT.docx and /dev/null differ diff --git a/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf b/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf deleted file mode 100644 index acdd539..0000000 Binary files a/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres-SUPPLEMENTAL.pdf and /dev/null differ diff --git a/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx b/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx deleted file mode 100644 index 291ee41..0000000 Binary files a/publications/methods-paper/versions/20200701/haplotype-diversity-and-sequence-heterogeneity-of-human-telomeres.docx and /dev/null differ diff --git a/publications/poster-2019/COPYRIGHT b/publications/poster-2019/COPYRIGHT deleted file mode 100644 index 7adfd21..0000000 --- a/publications/poster-2019/COPYRIGHT 
+++ /dev/null @@ -1,10 +0,0 @@ -COPYRIGHT NOTICE -================ - -The files in this directory and all its subdirectories in this repository, -unless otherwise noted, are protected by U.S. and International copyright laws. -Reproduction and distribution, with or without modification, of these files -without a written permission of the authors is prohibited. - -© 2019–2020 Kirill Grigorev, Jonathan Foox, Christopher E. Mason -Institute for Computational Biomedicine, Weill Cornell Medicine diff --git a/publications/poster-2019/baposter.cls b/publications/poster-2019/baposter.cls deleted file mode 100644 index 965c4aa..0000000 --- a/publications/poster-2019/baposter.cls +++ /dev/null @@ -1,1094 +0,0 @@ -%% -%% This is file `baposter.cls' -%% -%% A relatively comfortable latex class to produce posters with a grid based -%% layout. It comes with a number of combinable styles and is (maybe only for -%% the writer) easy to extend, as all the graphics is based on pgf. -%% -%% It is meant to be used with pdftex, but might also work with pslatex if you -%% are not interested in things like transparency. -%% -%% Copyright (C) 2007-2011 Brian Amberg -%% Copyright (C) 2011 Reinhold Kainhofer -%% -%% 29. September 2011: -%% - Finally fixed confusion with paper size handling and landscape. This required seperate handling of papersizes -%% known to the geometry package and other packages. -%% 26. September 2011: -%% - Reverted drawing of faded borders to manual method, as the current result does not work with evince, -%% and produced spurious colored boxes with okular and acroread. -%% - Added one more example due to popular request -%% 16. September 2011: -%% - Removed nag package, such that it compiles on older systems -%% - Added more examples -%% 15. September 2011: -%% - Merged fork by (rk) back into mainline, essentially taking over all the great improvements that Reinhold Kainhofer coded. -%% We call this version 2, as it breaks the interface in some small points. 
Essentially watch out for this: -%% - no/yes keys are now false/true -%% - the shade-lr etc. keys have the hypen removed, and are now called shadelr etc. -% - Added more examples, which are all adapted to work with the current version -%% 27. August 2011 (rk): -%% - Completely factored out all drawing commands to handler functions, assigned via choice keys -%% - Added cornerradius, boxheaderheight, boxpadding options -%% - Added missing roundedright -%% 26. August 2011 (rk): -%% - Turned headerbox cmd into posterbox environment (allows verbatim) -%% - pass several options to packages -%% - added debug cls option to reduce default output -%% - rewrote several loops and ifthenelse -%% - Use boolean and choice keys for all options -%% - Changed all choice options to set a global function to -%% \baposter@OPTION@OPTIONVALUE, which are a functions that execute the -%% option's setting (drawing routine, color setting for tikz, etc.) -%% - Add a5paper -%% 07. April 2011: Fixed paper size handling -%% - Fixed Paper Size handling, you can now specify all page sizes using paperwidth=, paperheight= class options. -%% - Added rudimentary documentation of the available keys. -%% 11. December 2010: Fixed "Empty Second Page" bug -%% - A second empty page was introduced when a document ended with -%% \end{poster} -%% \end{document} -%% This was fixed thanks to a suggestion by Martin Aulbach -%% - Added -%% \thispagestyle{empty} -%% to suppress page numbers, which became visible when the margins are large. -%% The underlying problem of page-size handling has not yet been solved. -%% 30. September 2010: Border Bugfixes -%% - Correct background color with textborder=rounded-small, Thanks to Ke Chen for the bugreport and patch -%% - Correctly draw rectangular text boxes, Thanks to abenkst for the bugreport and fix -%% 10. 
June 2010: Added option to set the number of columns -%% - added a class option to set the number of columns -%% - columns=5: sets the number of columns to 5, possible values are 1..6, default is 3 in portrait and 4 in landscape format -%% 29. April 2009: Incorporated Patches by Arne Henningsen -%% - added some class options -%% - a4shrink: shrink the paper to A4 size (for printing drafts or handouts) -%% - movebody=Xpt: move the text/poster body Xpt to the right -%% (or to the left if Xpt is negative), -%% e.g. for manually centering the poster on the page -%% - showframe: use the "showframe" option of the "geometry" package -%% - a0paper (default): set paper size to A0 -%% - archE: set paper size to Arch E -%% - setting "background" can be "none" now (otherwise the "showframe" -%% option has no effect) -%% - the page number has been removed (it was mostly not visible before) -%% - the "margin=" option works now -%% 04. December 2008 -%% - Mainly an update to the landscape example -%% 14. November 2008 -%% - Actually center the title when eyecatcher is used. -%% 04. November 2008 -%% - Fixed bug with eyecatcher not working. -%% 26. June 2008 -%% - Fixed bug with plain background mode. -%% 14. June 2008 -%% - Support for portrait/landscape switching. -%% - Some smaller bugfixes. -%% 01. June 2007 -%% - First version released. -%% -%% Use this class with pdflatex -%% -%% I have confirmed that this package works with -%% - texlive 2007 and -%% - miktex 2.7 -%% -%% It does not seem to work with -%% - miktex 2.2 -%% - some old versions of tetex -%% -%% -%% TODO: -%% -) Rename backgrounds back to shaded-tb shade-lr -%% -) Rename textborder back to rounded-small (consistent naming needed!) -%% -) Rename headershade back to shade-lr, shade-tb, shade-tb-inverse -%% -) Rename headershape back to small-rounded -%% -) Option value consistency (small-rounded vs. 
rounded-small; missing ones) -%% -) Rename \baposterHeaderSetShade, \baposterHeaderDrawText to include @ and verb -%% -%% -%% Licence: GPL -\ProvidesClass{baposter}[2011/11/26 v2.0 baposter class] -\NeedsTeXFormat{LaTeX2e}[1995/06/01] -\LoadClass{article} -\typeout{baposter: Brian Amberg, 2007, 2008, 2009, 2010, 2011 | http://www.brian-amberg.de/uni/poster/} -\typeout{baposter: Reinhold Kainhofer, 2011 | http://reinhold.kainhofer.com/} - -%% Define lengths only once on inclusion, such that we can make multiple posters -\newlength{\baposter@basepaperwidth} -\newlength{\baposter@basepaperheight} -\newlength{\baposter@basemargin} -\newlength{\baposter@finalpaperwidth} -\newlength{\baposter@finalpaperheight} -\newlength{\baposter@finalmargin} -\newlength{\headerheight}% -\newlength{\colwidth}% -\newlength{\colheight}% -\newlength{\baposter@@colspacing}% -\newlength{\baposter@box@@cornerradius}% -\newlength{\baposter@box@@boxheaderheight}% -\newlength{\baposter@box@@boxpadding}% -\newlength{\boxstartx}% -\newlength{\boxstarty}% -\newlength{\boxwidth}% -\newlength{\boxheight}% -\newlength{\baposter@titleimage@left@width}% -\newlength{\baposter@titleimage@right@width}% -\newlength{\baposter@titleimage@textwidth}% -\newbox\baposter@box@content% -\newbox\baposter@titleimage@left% -\newbox\baposter@titleimage@title% -\newbox\baposter@titleimage@right% - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Packages -%------------------------------------------------------------------------------- -% The only ``weird'' dependency of this package is pgf. All the rest should be -% installed on any decent system. 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\typeout{Use Packages} -\RequirePackage{xkeyval} -\RequirePackage{calc} -\RequirePackage[cmyk]{xcolor} -\RequirePackage{tikz} -\RequirePackage{pgf} -\RequirePackage{ifthen} -\RequirePackage[T1]{fontenc} -%\RequirePackage[l2tabu, orthodox]{nag} -\usetikzlibrary{decorations} -\usetikzlibrary{fadings} -\usetikzlibrary{snakes} -\usetikzlibrary{calc} - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Settings -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% Choose a smaller value for larger fonts -\newcommand{\baposter@fontscale}{0.292} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Zoom -%------------------------------------------------------------------------------- -% We scale the page from fontscale * papersize up to papersize -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - -%% Paper sizes -\newif\if@landscape -\newif\if@geometryKnowsThisSize -\DeclareOptionX{landscape}{\@landscapetrue} -\DeclareOptionX{portrait}{} - -\newcommand{\baposter@setfinalpapersize}[2]{% -\if@geometryKnowsThisSize - \setlength{\baposter@finalpaperwidth}{#1}% - \setlength{\baposter@finalpaperheight}{#2}% -\else -\if@landscape -% Transpose length, if geometry does not handle the papersize based on the key - \setlength{\baposter@finalpaperwidth}{#2}% - \setlength{\baposter@finalpaperheight}{#1}% -\else - \setlength{\baposter@finalpaperwidth}{#1}% - \setlength{\baposter@finalpaperheight}{#2}% -\fi -\fi -} - -% Default paperwidth and paperheight = a0paper -\DeclareOptionX{paperwidth}[841mm]{\setlength{\baposter@finalpaperwidth}{#1}} -\DeclareOptionX{paperheight}[1189mm]{\setlength{\baposter@finalpaperheight}{#1}} -\DeclareOptionX{archA} { \baposter@setfinalpapersize{9in}{12in}}% -\DeclareOptionX{archB} { \baposter@setfinalpapersize{12in}{18in}}% 
-\DeclareOptionX{archC} { \baposter@setfinalpapersize{18in}{24in}}% -\DeclareOptionX{archD} { \baposter@setfinalpapersize{24in}{36in}}% -\DeclareOptionX{archE} { \baposter@setfinalpapersize{36in}{48in}}% -\DeclareOptionX{archE1} { \baposter@setfinalpapersize{30in}{42in}}% -\DeclareOptionX{archE2} { \baposter@setfinalpapersize{26in}{38in}}% -\DeclareOptionX{archE3} { \baposter@setfinalpapersize{27in}{39in}}% -\DeclareOptionX{a0paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{841mm}{1189mm}}%g -\DeclareOptionX{a1paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{594mm}{841mm}}%g -\DeclareOptionX{a2paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{420mm}{594mm}}%g -\DeclareOptionX{a3paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{297mm}{420mm}}%g -\DeclareOptionX{a4paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{210mm}{297mm}}%g -\DeclareOptionX{a5paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{148mm}{210mm}}%g -\DeclareOptionX{a6paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{105mm}{148mm}}%g -\DeclareOptionX{b0paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{1000mm}{1414mm}}%g -\DeclareOptionX{b1paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{707mm}{1000mm}}%g -\DeclareOptionX{b2paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{500mm}{707mm}}%g -\DeclareOptionX{b3paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{353mm}{500mm}}%g -\DeclareOptionX{b4paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{250mm}{353mm}}%g -\DeclareOptionX{b5paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{176mm}{250mm}}%g -\DeclareOptionX{b6paper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{125mm}{176mm}}%g -\DeclareOptionX{ansiapaper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{8.5in}{11in}}% -\DeclareOptionX{ansibpaper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{11in}{17in}}% 
-\DeclareOptionX{ansicpaper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{17in}{22in}}% -\DeclareOptionX{ansidpaper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{22in}{34in}}% -\DeclareOptionX{ansiepaper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{34in}{44in}}% -\DeclareOptionX{letterpaper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{8.5in}{11in}}% -\DeclareOptionX{legalpaper} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{8.5in}{14in}}% -\DeclareOptionX{executivepaper}{\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{7.25in}{10.5in}}% -\DeclareOptionX{screen} {\@geometryKnowsThisSizetrue\baposter@setfinalpapersize{225mm}{180mm}}%g - -% Margin -\setlength{\baposter@finalmargin}{1.5cm} -\DeclareOptionX{fontscale}[0.292]{\renewcommand{\baposter@fontscale}{#1}} -\DeclareOptionX{margin} [1.5cm]{\setlength{\baposter@finalmargin}{#1}} - -% move text/poster body to the right (or to the left if negative) -\newlength{\baposter@movebody} -\setlength{\baposter@movebody}{0cm} -\DeclareOptionX{movebody}[0cm]{\setlength{\baposter@movebody}{#1}} - - -\newif\if@debug -\DeclareOptionX{debug}{\@debugtrue} -%% Will be passed on to other packages (xcolor and geometry), still we don't want unused warnings -\DeclareOptionX{table}{} -\DeclareOptionX{showframe}{} - -\ProcessOptionsX - -\if@debug -\newcommand{\debug}[1]{\typeout{#1}} -\else -\newcommand{\debug}[1]{} -\fi - - - -\setlength{\baposter@basepaperwidth} {\baposter@fontscale\baposter@finalpaperwidth } -\setlength{\baposter@basepaperheight}{\baposter@fontscale\baposter@finalpaperheight} -\setlength{\baposter@basemargin} {\baposter@fontscale\baposter@finalmargin} -\newlength{\baposter@basemarginright} -\setlength{\baposter@basemarginright}{\baposter@basemargin} -\addtolength{\baposter@basemarginright}{-\baposter@fontscale\baposter@movebody} -\newlength{\baposter@basemarginleft} -\setlength{\baposter@basemarginleft}{\baposter@basemargin} 
-\addtolength{\baposter@basemarginleft}{\baposter@fontscale\baposter@movebody} - -\typeout{Paperwidth=\the\baposter@finalpaperwidth} -\typeout{Paperheight=\the\baposter@finalpaperheight} -\typeout{BasePaperwidth=\the\baposter@basepaperwidth} -\typeout{BasePaperheight=\the\baposter@basepaperheight} -\usepackage[ - paperwidth=\baposter@basepaperwidth, - paperheight=\baposter@basepaperheight, - tmargin=\baposter@basemargin, - bmargin=\baposter@basemargin, - lmargin=\baposter@basemarginleft, - rmargin=\baposter@basemarginright, - ]{geometry} - -\usepackage{pgfpages} -\if@landscape -\if@geometryKnowsThisSize -\pgfpagesuselayout{resize to}[physical paper width=\baposter@finalpaperheight,physical paper height=\baposter@finalpaperwidth] -\else -\pgfpagesuselayout{resize to}[physical paper width=\baposter@finalpaperwidth,physical paper height=\baposter@finalpaperheight] -\fi -\else -\pgfpagesuselayout{resize to}[physical paper width=\baposter@finalpaperwidth,physical paper height=\baposter@finalpaperheight] -\fi - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Default functions for borders/backgrounds -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -%% These functions will all be redefined from the actual option values. In -%% particular, they will be set to \baposter@optionname@optionvalue, which -%% should do the actual work / setting for that particular optionvalue. 
- -\newcommand{\baposterPosterDrawBackground}[2]{} % Draw poster background - -\newcommand{\baposterBoxGetShape}{} % Returns path of text box shape -\newcommand{\baposterBoxDrawBackground}[2]{} % Draw bg of boxes -\newcommand{\baposterBoxDrawBorder}[1]{} % Draw border of individual boxes - -\newcommand{\baposterHeaderGetShape}{} % Returns path of text box shape -\newcommand{\baposterHeaderSetShade}[3]{} % Set bg style for box headers -\newcommand{\baposterHeaderDrawBackground}[3]{} % Draw background of box header -\newcommand{\baposterHeaderDrawBorder}[1]{} % Draw border of box header -\newcommand{\baposterHeaderDrawText}[1]{} % Draw text inside box header - -\newcommand{\@@previousbox}{notset} % stores the previously processed box for below=auto - -% Function to set a user-defined background -\newcommand{\baposter@backgroundCmd}{\error{No background command defined. Use \background{...} to define background}} -\newcommand{\background}[1]{\renewcommand{\baposter@backgroundCmd}{#1}} - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Handle poster and box options -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\debug{Handling keys} - -%% -%% POSTER OPTIONS -%% -%% Store all poster options in variables of the form \baposter@option -%% choose-keys also store the index in \baposter@optionnr -%% choose-keys typically also assign a function -\definecolor{baposter@silver}{cmyk}{0,0,0,0.7} -\define@boolkey[ba]{poster}[baposter@]{grid} [false] {} -\define@boolkey[ba]{poster}[baposter@]{eyecatcher} [true] {} -\define@cmdkey [ba]{poster}[baposter@]{headerheight} [0.1\textheight]{} -\define@cmdkey [ba]{poster}[baposter@]{columns} [{}] {} -\define@cmdkey [ba]{poster}[baposter@]{colspacing} [1em] {} -\define@cmdkey [ba]{poster}[baposter@]{bgColorOne} [baposter@silver]{} -\define@cmdkey [ba]{poster}[baposter@]{bgColorTwo} [green] {} - -% background can be one of: shadeLR, shadeTB, plain, user, none 
-\define@choicekey*+[ba]{poster}{background}% - [\baposter@background\baposter@backgroundnr]% - {shadeLR, shadeTB, plain, user, none} [plain] {% - \debug{Poster background: \baposter@background} - \renewcommand{\baposterPosterDrawBackground}[2]{ - \csname baposter@background@\baposter@background\endcsname{##1}{##2}} -}{ - \PackageWarning{baposter}{Unknown background `\baposter@background' (use - shadeLR, shadeTB, plain, none, or user). If user is used, you also - have to define \background{...}.} - \renewcommand{\baposterPosterDrawBackground}[2]{\baposter@background@none{##1}{##2}} -} - - -%% -%% BOX OPTIONS -%% -\define@cmdkey[ba]{posterbox}[baposter@box@]{cornerradius} [1em] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{boxheaderheight} [2em] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{boxpadding} [0.5em] {} - - -% textborder can be one of: none, bars, coils, triangles, rectangle, rounded, -% roundedleft, roundedsmall, faded; UNIMPLEMENTED: roundedright -\edef\baposter@box@textborder@validvalues{none,bars,coils,triangles,rectangle,rounded,roundedleft,roundedsmall,faded} -\define@choicekey*+[ba]{posterbox}{textborder}% - [\baposter@box@textborder\baposter@box@textbordernr]% - {none,bars,coils,triangles,rectangle,rounded,roundedleft,roundedright,roundedsmall,faded} [rectangle] {% - \debug{Text border: \baposter@box@textborder} - \renewcommand{\baposterBoxGetShape}{ - \csname baposter@box@boxshape@\baposter@box@textborder\endcsname} - \renewcommand{\baposterBoxDrawBorder}[1]{ - \csname baposter@box@drawborder@\baposter@box@textborder\endcsname{##1}} -}{ - \PackageWarning{baposter}{Unknown text-border style `\baposter@box@textborder'. 
- Edit your file to choose a valid option (\baposter@box@textborder@validvalues).} - \renewcommand{\baposterBoxGetShape}{\baposter@boxshape@rectangle} - \renewcommand{\baposterBoxDrawBorder}[1]{\baposter@drawborder@rectangle{##1}} -} - -% boxshade can be one of: shadeLR, shadeTB, plain, none -\define@choicekey*+[ba]{posterbox}{boxshade}% - [\baposter@box@boxshade\baposter@box@boxshadenr]% - {shadelr,shadetb,plain,none} [none] {% - \debug{Box shade: \baposter@box@boxshade} - \renewcommand{\baposterBoxDrawBackground}[2]{ - \csname baposter@box@drawbackground@\baposter@box@boxshade\endcsname{##1}{##2}} -}{ - \PackageWarning{baposter}{Unknown boxshade style `\baposter@boxshade'. - Edit your file to choose a valid option.} - \renewcommand{\baposterBoxDrawBackground}[2]{\baposter@box@drawbackground@none{##1}{##2}} -} - -% headershade can be one of: shade-lr, shade-tb, shade-tb-inverse, plain -\define@choicekey*+[ba]{posterbox}{headershade}% - [\baposter@box@headershade\baposter@box@headershadenr]% - {shadelr, shadetb, shadetbinverse, plain} [shadelr] {% - \debug{Header shade: \baposter@box@headershade} - \renewcommand{\baposterHeaderSetShade}[3]{ - \csname baposter@box@headershade@\baposter@box@headershade\endcsname{##1}{##2}{##3}} -}{ - \PackageWarning{baposter}{Unknown headershade style `\baposter@box@headershade'. 
- Edit your file to choose a valid option.} - \renewcommand{\baposterHeaderSetShade}[3]{\baposter@box@headershade@none{##1}{##2}{##3}} -} - -% headershape can be one of: rectangle, rounded, smallrounded, roundedleft, roundedright -\define@choicekey*+[ba]{posterbox}{headershape}% - [\baposter@box@headershape\baposter@box@headershapenr]% - {rectangle,rounded,smallrounded,roundedleft,roundedright} [roundedright] {% - \debug{Header shape: \baposter@box@headershape} - \renewcommand{\baposterHeaderGetShape}{ - \csname baposter@box@headershape@\baposter@box@headershape\endcsname} - \renewcommand{\baposterHeaderDrawText}[1]{ - \csname baposter@box@headerdrawtext@\baposter@box@headershape\endcsname{##1}} - \renewcommand{\baposterHeaderDrawBorder}[1]{ - \csname baposter@box@headerdrawborder@\baposter@box@headershape\endcsname{##1}} -}{ - \PackageWarning{baposter}{Unknown headershape style `\baposter@headershape'. - Edit your file to choose a valid option.} - \renewcommand{\baposterHeaderGetShape}{\baposter@box@headershape@rectangle} - \renewcommand{\baposterHeaderDrawText}[1]{\baposter@box@headerdrawtext@rectangle{##1}} - \renewcommand{\baposterHeaderDrawBorder}[1]{\baposter@box@headerdrawborder@rectangle{##1}} -} - -% headerborder can be one of: open, closed, none -\define@choicekey*+[ba]{posterbox}{headerborder}% - [\baposter@box@headerborder\baposter@box@headerbordernr]% - {open,closed,none} [open] {% - \debug{Header border: \baposter@box@headerborder} -% \renewcommand{\baposterHeaderBorder}{ -% \csname baposter@headerborder@\baposter@box@headerborder\endcsname} -}{ - \PackageWarning{baposter}{Unknown headerborder style `\baposter@headerborder'. 
- Edit your file to choose a valid option.} -% \renewcommand{\baposterHeaderBorder}{\baposter@box@headerborder@rectangle} -} - - -\define@cmdkey[ba]{posterbox}[baposter@box@]{borderColor} [yellow] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{headerColorOne} [red] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{headerColorTwo} [brown] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{headerFontColor} [black] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{boxColorOne} [magenta] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{boxColorTwo} [cyan] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{headerfont} [\scshape\Large] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{textfont} [{}] {} - -\define@cmdkey[ba]{posterbox}[baposter@box@]{linewidth} [2pt] {} - -\define@cmdkey[ba]{posterbox}[baposter@box@]{below} [notset]{} -\define@cmdkey[ba]{posterbox}[baposter@box@]{above} [notset]{} -\define@cmdkey[ba]{posterbox}[baposter@box@]{aligned}[notset]{} -\define@cmdkey[ba]{posterbox}[baposter@box@]{bottomaligned}[notset]{} -\define@cmdkey[ba]{posterbox}[baposter@box@]{column} [0] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{row} [0] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{span} [1] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{height} [auto] {} -\define@cmdkey[ba]{posterbox}[baposter@box@]{name} [noname]{} - -% Set some default values, the poster and posterbox environments can override: -\setkeys[ba]{poster}{ - % Debug grid - grid=false, - % Is there an eyecatcher image - eyecatcher=true, - columns={}, - % Colours - bgColorOne=baposter@silver, - bgColorTwo=green, - % - colspacing=1em, - headerheight=0.1\textheight, - background=shadeLR, -}{} -\setkeys[ba]{posterbox}{ - % Position - column=0,row=0,span=1, - below=notset,above=notset, - bottomaligned=notset, - aligned=notset, - height=auto, - % Name - name=noname, - % Box design: border: - linewidth=2pt, - borderColor=yellow, - cornerradius=1em, - % text box: - textfont={}, - boxshade=plain, - 
boxColorOne=magenta, - boxColorTwo=cyan, - textborder=faded, - boxpadding=0.5em, - % header - headerfont=\scshape\Large,% or headerfont=\color{white}\textsf\textbf - headerFontColor=black, - headerColorOne=red, - headerColorTwo=brown, - headershape=rectangle, - headershade=shadeLR, - headerborder=none, - boxheaderheight=2em, -}{} - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Background options and functions (one function for each possible value) -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\newcommand{\baposter@background@shadelr}[2]{ - \debug{BAPOSTER: Using shade left right background.} - \begin{tikzpicture}[remember picture,overlay]% - \shade [shading=axis,left color=#1,right color=#2] (current page.north west) - rectangle(current page.south east); - \end{tikzpicture}% -} -\newcommand{\baposter@background@shadetb}[2]{ - \debug{BAPOSTER: Using shade top to bottom background.} - \begin{tikzpicture}[remember picture,overlay]% - \shade [shading=axis,top color=#1,bottom color=#2] (current page.north west) - rectangle(current page.south east); - \end{tikzpicture}% -} -\newcommand{\baposter@background@plain}[2]{ - \debug{BAPOSTER: Using plain background.} - \begin{tikzpicture}[remember picture,overlay]% - \fill [fill=#1] (current page.north west) rectangle(current page.south east); - \end{tikzpicture}% -} -\newcommand{\baposter@background@user}[2]{ - \debug{BAPOSTER: Using user background.} - \baposter@backgroundCmd% -} -\newcommand{\baposter@background@none}[2]{ - \debug{BAPOSTER: Using no background.} -} - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Return shape path of text box (depending on the box shape) -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\newcommand{\baposter@box@boxshape@none}{\baposter@box@boxshape@rectangle} -\newcommand{\baposter@box@boxshape@bars}{ - (\baposter@box@name tnw) -- 
(\baposter@box@name sw) % - (\baposter@box@name se) -- (\baposter@box@name tne) -} -\newcommand{\baposter@box@boxshape@coils}{\baposter@box@boxshape@bars} -\newcommand{\baposter@box@boxshape@triangles}{\baposter@box@boxshape@bars} -\newcommand{\baposter@box@boxshape@rectangle}{ - (\baposter@box@name tnw) -- (\baposter@box@name sw) -- % - (\baposter@box@name se) -- (\baposter@box@name tne)% -} -\newcommand{\baposter@box@boxshape@faded}{ - (\baposter@box@name tnw) -- (\baposter@box@name sw) % - (\baposter@box@name tne) -- (\baposter@box@name se) - } -\newcommand{\baposter@box@boxshape@rounded}{ - [rc] \baposter@box@boxshape@rectangle% -} -\newcommand{\baposter@box@boxshape@roundedsmall}{ - [src] \baposter@box@boxshape@rectangle -} -\newcommand{\baposter@box@boxshape@roundedleft}{ - (\baposter@box@name tnw) {[rc]-- (\baposter@box@name sw)} -- % - (\baposter@box@name se) -- (\baposter@box@name tne)% -} -\newcommand{\baposter@box@boxshape@roundedright}{ - (\baposter@box@name tnw) -- (\baposter@box@name sw) {[rc]-- % - (\baposter@box@name se)} -- (\baposter@box@name tne)% -} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Draw box background (one function for each possible value) -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% These functions take no arguments -\newcommand{\baposter@box@drawbackground@none}[2]{ - \tikzstyle{box colors}=[] -} -\newcommand{\baposter@box@drawbackground@plain}[2]{ - \tikzstyle{box colors}=[fill=#1] - \fill[box colors] \baposterBoxGetShape; -} -\newcommand{\baposter@box@drawbackground@shadelr}[2]{ - \tikzstyle{box colors}=[shading=axis, left color=#1, right color=#2]% - \fill[box colors] \baposterBoxGetShape; -} -\newcommand{\baposter@box@drawbackground@shadetb}[2]{ - \tikzstyle{box colors}=[shading=axis, top color=#1, bottom color=#2]% - \fill[box colors] \baposterBoxGetShape; -} - - - 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Draw box border -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% These functions take two arguments: borderColor -\newcommand{\baposter@box@drawborder@none}[1]{} -\newcommand{\baposter@box@drawborder@bars}[1]{ - \draw[color=#1] \baposterBoxGetShape;% -} -\newcommand{\baposter@box@drawborder@coils}[1]{ - \draw[color=#1,segment amplitude=0.35em,segment length=0.4em,snake=coil] \baposterBoxGetShape;% -} -\newcommand{\baposter@box@drawborder@triangles}[1]{ - \draw[color=#1,segment amplitude=0.2em,segment length=0.4em,snake=triangles] \baposterBoxGetShape;% -} -\newcommand{\baposter@box@drawborder@rectangle}[1]{ - \draw[color=#1] \baposterBoxGetShape;% -} -\newcommand{\baposter@box@drawborder@rounded}[1]{ - \draw[color=#1] \baposterBoxGetShape;% -} -\newcommand{\baposter@box@drawborder@roundedleft}[1]{ - \draw[color=#1] \baposterBoxGetShape;% -} -\newcommand{\baposter@box@drawborder@roundedright}[1]{ - \draw[color=#1] \baposterBoxGetShape;% -} -\newcommand{\baposter@box@drawborder@faded}[1]{ - % This is the right way to do it, but it does not work with evince, and has problems during printing, so instead we do - %\draw[color=#1,path fading=south] \baposterBoxGetShape;% - % this - \foreach \x in {0,1,...,90} \draw[color=#1!\x] ($(\baposter@box@name tnw)!{(100-\x)/100}!(\baposter@box@name sw)$) -- ($(\baposter@box@name tnw)!{(100-(\x+10))/100}!(\baposter@box@name sw)$);% - \foreach \x in {0,1,...,90} \draw[color=#1!\x] ($(\baposter@box@name tne)!{(100-\x)/100}!(\baposter@box@name se)$) -- ($(\baposter@box@name tne)!{(100-(\x+10))/100}!(\baposter@box@name se)$);% -} -\newcommand{\baposter@box@drawborder@roundedsmall}[1]{ - \draw[color=#1] \baposterBoxGetShape;% -} - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Return shape path of text box (depending on the box shape) 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% These functions take no arguments -% TODO: For headerborder==none, use (\baposter@box@name outer tnw) instead! -\newcommand{\baposter@box@headershape@rectangle}{% - (\baposter@box@name tnw) -- (\baposter@box@name nw) -- % - (\baposter@box@name ne) -- (\baposter@box@name tne)% -} -\newcommand{\baposter@box@headershape@smallrounded}{% - (\baposter@box@name tnw) {[src] -- (\baposter@box@name nw) -- % - (\baposter@box@name ne)} -- (\baposter@box@name tne)% -} -\newcommand{\baposter@box@headershape@roundedright}{% - (\baposter@box@name tnw) -- (\baposter@box@name nw) {[rc] -- % - (\baposter@box@name ne)} -- (\baposter@box@name tne)% -} -\newcommand{\baposter@box@headershape@roundedleft}{% - (\baposter@box@name tnw) {[rc]-- (\baposter@box@name nw)} -- % - (\baposter@box@name ne) -- (\baposter@box@name tne)% -} -\newcommand{\baposter@box@headershape@rounded}{% - (\baposter@box@name tnw) {[rc] -- (\baposter@box@name nw) -- % - (\baposter@box@name ne) } -- (\baposter@box@name tne)% -} - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Header text drawing (one function for each possible value of headershape) -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% These functions take one argument: the header text -\newcommand{\baposter@box@headerdrawtext@rectangle}[1]{ - \path (\baposter@box@name nw) +(0em,-0.5\baposter@box@@boxheaderheight) node[anchor=west,inner sep=0.4em] {#1};% -} -\newcommand{\baposter@box@headerdrawtext@smallrounded}[1]{ - \path (\baposter@box@name nw) +(0.5\boxwidth,-0.5\baposter@box@@boxheaderheight) node[anchor=center] {#1};% -} -\newcommand{\baposter@box@headerdrawtext@roundedright}[1]{ - \path (\baposter@box@name nw) +(0em,-0.5\baposter@box@@boxheaderheight)% - node[anchor=west,inner sep=0.4em,text depth=0.0em] {#1};% -} -\newcommand{\baposter@box@headerdrawtext@roundedleft}[1]{ - \path 
(\baposter@box@name nw) +(0em,-0.5\baposter@box@@boxheaderheight)% - node[anchor=west,inner sep=0.4em] {#1};% -} -\newcommand{\baposter@box@headerdrawtext@rounded}[1]{ - \path (\baposter@box@name nw) +(0.5\boxwidth,-0.5\baposter@box@@boxheaderheight) node[anchor=center] {#1};% -} - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% Header shade options and functions (one function for each possible value) -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -% These functions take two arguments: headerColorOne, headerColorTwo and borderColor -\newcommand{\baposter@box@headershade@shadelr}[3]{ - \debug{Header-Shade: Shade Left - Right} - \tikzstyle{header colors}=[% - color=#3,% - shading=axis,% - left color=#1,% - right color=#2% - ]% -} -\newcommand{\baposter@box@headershade@shadetb}[3]{ - \debug{Header-Shade: Shade Top - Bottom} - \tikzstyle{header colors}=[% - color=#3,% - shading=axis,% - top color=#1,% - bottom color=#2% - ]% -} -\newcommand{\baposter@box@headershade@shadetbinverse}[3]{ - \tikzstyle{header colors}=[% - top color=#1!75!#2,% - bottom color=#2!100!#1,% - shading angle=20% - ]% - \colorlet{baposterHeaderFontColor}{white}% -} -\newcommand{\baposter@box@headershade@plain}[3]{ - \debug{Header-Shade: Plain} - \tikzstyle{header colors}=[% - color=#3,% - fill=#1% - ]% -} -\newcommand{\baposter@box@headershade@none}[3]{ - \debug{Header-Shade: none} - \tikzstyle{header colors}=[] -} - - - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%%% The main poster environment -%%% \begin{baposter}{settings}{Eye Catcher}{Title}{Author}{University Logo} -%%%----------------------------------------------------------------------------- -%%% The settings are -%%% - grid=true,[false]:Show grid to help with alignment -%%% - colspacing=0.7em: Column spacing -%%% - columns=4: number of columns (default 4 in landscape and 3 in portrait format) (maximum number is 6) 
-%%% - color=[orange]: xcolor color definition used as the main color of the poster -%%% - colortwo=[white]: The other color for gradient based layouts -%%% - textborder=none,bars,coils,triangles,rectangle,rounded,roundedsmall,roundedleft,roundedright,[faded] -%%% The style of the box around the text area -%%% - headerborder=none,closed,open -%%% No extra border around box header, full border around box header or border that is open below. -%%% - headershape=rectangle,rounded,roundedleft,roundedright -%%% Shape of the box-header region -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\newenvironment{poster}[5]{% - \thispagestyle{empty}% Suppress Page Number - \debug{Poster Starts}% - % This setkeys call parses all provided options and depending on the option - % value, assigns different handler functions to the \baposter(Box|Header)* - % functions. Once that is done, we don't have to care about particular - % values for border, shading, etc. All we have to do is call the - % handler functions and let them do their job. - % This also allows the user to override the poster-wide defaults on a per-box - % basis. - \setkeys[ba]{posterbox,poster}{#1}% -% - % TODO: Move all those assignments to the key macros! 
- % Parse Keys% - \colorlet{bgColorOne}{\baposter@bgColorOne} - \colorlet{bgColorTwo}{\baposter@bgColorTwo} -% - %% Boxes% - \setlength{\headerheight}{\baposter@headerheight}% - \setlength{\colheight}{\textheight-\baposter@headerheight}% - \renewcommand{\@@previousbox}{notset} - - \debug{Format}% - % Set default for columns if unset (4 for landscape, 3 for portrait) - \ifthenelse{\equal{\baposter@columns}{}}{% - \renewcommand{\baposter@columns}{\if@landscape4\else3\fi}% - }{} -% - \debug{Columns: \baposter@columns}% - \setlength{\baposter@@colspacing}{\baposter@colspacing}% - \setlength{\colwidth}{\textwidth}% - \addtolength{\colwidth}{\baposter@@colspacing*(1-\baposter@columns)}% - \ifcase\baposter@columns\relax - \error{You need to have at least one column!} - \or % 1 - \setlength{\colwidth}{\colwidth}% - \or % 2 - \setlength{\colwidth}{0.5\colwidth}% - \or % 3 - \setlength{\colwidth}{0.3333333333333\colwidth}% - \or % 4 - \setlength{\colwidth}{0.25\colwidth}% - \or % 5 - \setlength{\colwidth}{0.2\colwidth}% - \or % 6 - \setlength{\colwidth}{0.16666666666\colwidth}% - \else % >6 - \error{You do not want so many columns} - \fi -% - \newcommand{\baposter@reference}{north west}% -% - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - % A box with a header and some content. The basic unit of the poster% - %---------------------------------------------------------------------------% - % Each box has a name and can be placed absolutely or relatively.% - % The only inconvenience is that you can only specify a relative position % - % towards an already declared box. 
So if you have a box attached to the % - % bottom, one to the top and a third one which should be inbetween, you % - % have to specify the top and bottom boxes before you specify the middle % - % box.% - %% - % below= name of other node% - % above= name of other node% - % aligned=name of other node% - % bottomaligned=name of other node% - % column= [0] % - % row= [0] % - % span= [1] % - % height= ,[auto]% - % name= [noname]% - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - - % Backward-compatibility definition (\headerbox command uses posterbox env): - \newcommand{\headerbox}[3]{% - \begin{posterbox}[##2]{##1} - ##3 - \end{posterbox} - } - - \newenvironment{posterbox}[2][]{% - \debug{Poster box options: ##1}% - % Override the poster-wide defaults on a per-box basis - \setkeys[ba]{posterbox}{##1}% -% - \def\baposter@box@title{##2} -% - \colorlet{borderColor}{\baposter@box@borderColor} - \colorlet{headerColorOne}{\baposter@box@headerColorOne} - \colorlet{headerColorTwo}{\baposter@box@headerColorTwo} - \colorlet{headerFontColor}{\baposter@box@headerFontColor} - \colorlet{boxColorOne}{\baposter@box@boxColorOne} - \colorlet{boxColorTwo}{\baposter@box@boxColorTwo} -% - \setlength{\baposter@box@@cornerradius}{\baposter@box@cornerradius}% - \setlength{\baposter@box@@boxheaderheight}{\baposter@box@boxheaderheight}% - \setlength{\baposter@box@@boxpadding}{\baposter@box@boxpadding}% - - - %% The columns is always given absolute - % boxstartx = column * colwidth + column * colspacing - \setlength{\boxstartx}{(\colwidth+\baposter@@colspacing)*\baposter@box@column}% -% - %% The width is gvien absolute - % Box Width = span * colwidth + (span-1) * colspacing - \setlength{\boxwidth}{\baposter@box@span\colwidth} % - \addtolength{\boxwidth}{\baposter@@colspacing*(\baposter@box@span-1)}% -% - %% Measure the content of the box% - \setbox\baposter@box@content=\hbox\bgroup% - \begin{pgfinterruptpicture}% - 
\begin{minipage}[t]{\boxwidth-\baposter@box@@boxpadding*2}% - \baposter@box@textfont\bgroup% - }% End of posterbox preamble - %%% HERE COME THE ACTUAL CONTENTS OF THE HEADERBOX ENVIRONMENT - {% posterbox handling after contents (i.e. drawing everything) - \egroup% - \end{minipage}% - \end{pgfinterruptpicture}% - \egroup% - \setlength{\boxheight}{\ht\baposter@box@content}% - \addtolength{\boxheight}{\dp\baposter@box@content}% - \addtolength{\boxheight}{\baposter@box@@boxheaderheight} % Header% - \addtolength{\boxheight}{2\baposter@box@@boxpadding} % Inner Sep -% - \ifthenelse{\equal{\baposter@box@height}{bottom}}{% - }{\ifthenelse{\equal{\baposter@box@height}{auto}}{% - }{ % Neither auto nor bottom% - \setlength{\boxheight}{\baposter@box@height\colheight}% - }}% -% - %% Determine the box position% - \debug{Setting Coordinates}% - \debug{Upper Right}% - \debug{\baposter@box@name}% -% - %%% Upper Right Corner% - % if below=auto, set it to the previous box - % TODO: We should generalize this to the previous box of the used column, - % currently we use the previous box, which might be in a different column - \ifthenelse{\equal{\baposter@box@below}{auto}}{% - \edef\baposter@box@below{\@@previousbox} - \debug{Box \baposter@box@name has below=auto, placing it below box \baposter@box@below.} - }{} - \xdef\@@previousbox{\baposter@box@name} - - \ifthenelse{\not\equal{\baposter@box@below}{notset} }{% - %% Below% - \debug{Below}% - \path[shape=coordinate] (\boxstartx,0pt |- \baposter@box@below se) ++(0pt,-\baposter@@colspacing) coordinate(\baposter@box@name nw);% - }{% - \ifthenelse{\not\equal{\baposter@box@aligned}{notset} }{% - %% Aligned% - \debug{Aligned: \baposter@box@aligned}% - \path[shape=coordinate] (\boxstartx,0pt |- \baposter@box@aligned nw) coordinate(\baposter@box@name nw);% - }{% - %% Fixed% - \debug{Fixed}% - \setlength{\boxstarty}{\baposter@box@row\colheight}% - \path[shape=coordinate] (\boxstartx,\colheight-\boxstarty) coordinate(\baposter@box@name nw);% - }}% 
-% - %% Lower Left Corner% - \debug{Lower Left}% - \ifthenelse{\equal{\baposter@box@above}{bottom}}{% - %% Above = Bottom% - \debug{Above bottom}% - \ifthenelse{\equal{\baposter@box@below}{notset} \and \equal{\baposter@box@aligned}{notset}}{% - \path[shape=coordinate] (\boxstartx,\boxheight) coordinate(\baposter@box@name nw);% - }{}% - \path[shape=coordinate] (\boxstartx+\boxwidth,0pt) coordinate(\baposter@box@name se);% - }{\ifthenelse{\not \equal{\baposter@box@bottomaligned}{notset}}{% - \path[shape=coordinate] (\boxstartx+\boxwidth,0pt |- \baposter@box@bottomaligned se) coordinate(\baposter@box@name se);% - }{{\ifthenelse{\not \equal{\baposter@box@above}{notset}}{% - %% Above = Node% - \path[shape=coordinate] (\boxstartx+\boxwidth,0pt |- \baposter@box@above nw) +(0pt,\baposter@@colspacing) coordinate(\baposter@box@name se);% - }{% - %% Above = notset% - \debug{Above=not set}% - \ifthenelse{\equal{\baposter@box@height}{bottom}}{% - %% height=bottom% - \debug{height=bottom}% - \path[shape=coordinate] (\boxstartx+\boxwidth,0pt) coordinate(\baposter@box@name se);% - }{ %% height=auto or fixed% - \debug{height=auto or fixed}% - \path[shape=coordinate] (\baposter@box@name nw) ++(\boxwidth,-\boxheight) coordinate(\baposter@box@name se);% - }}}}}% -% - % - % Set coordinates relative to nw,se% - \debug{Fixing Coordinates}% - \path[shape=coordinate]% - (\baposter@box@name nw) +(0pt,-\baposter@box@@boxheaderheight) coordinate(\baposter@box@name tnw)% - (\baposter@box@name nw |- \baposter@box@name se) coordinate(\baposter@box@name sw)% - (\baposter@box@name se |- \baposter@box@name nw) coordinate(\baposter@box@name ne)% - (\baposter@box@name ne) +(0pt,-\baposter@box@@boxheaderheight) coordinate(\baposter@box@name tne)% -% - (\baposter@box@name nw) +(-0.025em,0pt) coordinate(\baposter@box@name outer nw)% - (\baposter@box@name tnw) +(-0.025em,0pt) coordinate(\baposter@box@name outer tnw)% - (\baposter@box@name sw) +(-0.025em,0pt) coordinate(\baposter@box@name outer sw)% -% - 
(\baposter@box@name ne) +( 0.025em,0pt) coordinate(\baposter@box@name outer ne)% - (\baposter@box@name tne) +( 0.025em,0pt) coordinate(\baposter@box@name outer tne)% - (\baposter@box@name se) +( 0.025em,0pt) coordinate(\baposter@box@name outer se);% -% - %% Setting the bg colors of the box header - \baposterHeaderSetShade{headerColorOne}{headerColorTwo}{borderColor} -% - \tikzstyle{rc}=[rounded corners=\baposter@box@@cornerradius];% - \tikzstyle{src}=[rounded corners=0.5em];% -% - - %% Now that everything is set up, draw the actual box, with bg and header - \begin{scope}[line width=\baposter@box@linewidth] - %% Header% - \debug{Header}% - \debug{Header-Shape: \baposter@box@headershape, header-border: \baposter@box@headerborder (\baposter@box@headerbordernr)}% - % TODO: Also turn this last ifcase construct into a handler function - % We only need to determine (fill|shade)(draw|)... -% \baposterHeaderDrawBackground{bgColorOne}{bgColorTwo}{borderColor} -% \baposterHeaderDrawBorder{borderColor} - \ifcase\baposter@box@headerbordernr\relax% - % open - \ifthenelse{\equal{\baposter@box@headershade}{plain}}{ - \filldraw [style=header colors] \baposterHeaderGetShape;% - }{ - \shadedraw [style=header colors] \baposterHeaderGetShape;% - } - \or - % closed - \ifthenelse{\equal{\baposter@box@headershade}{plain}}{ - \filldraw [style=header colors] \baposterHeaderGetShape -- cycle;% - }{ - \shadedraw [style=header colors] \baposterHeaderGetShape -- cycle;% - } - \or - % none - \ifthenelse{\equal{\baposter@box@headershade}{plain}}{ - \fill [style=header colors] \baposterHeaderGetShape;% - }{ - \shade [style=header colors] \baposterHeaderGetShape;% - } - \fi - % - %% Draw the text inside the box header: - \baposterHeaderDrawText{\color{headerFontColor}\baposter@box@headerfont{\baposter@box@title}};% - % - %% Text borders (border around boxes) - \debug{Poster boxes}% - % First set box shade - \baposterBoxDrawBackground{boxColorOne}{boxColorTwo} - \baposterBoxDrawBorder{borderColor} - 
%% - %% Text Box% - \debug{Drawing Text}% - \path (\baposter@box@name tnw) node(text) [anchor=north west, - outer sep=-0.000em,text width=\boxwidth-2\baposter@box@@boxpadding,inner sep=\baposter@box@@boxpadding, - text justified] {\usebox{\baposter@box@content}};% - \end{scope} - % - % Finally store the box name as the previous box for the next call -% \xdef\@@previousbox{\baposter@box@name}% - }% END of posterbox definition -% - %% Poster Background% - \baposterPosterDrawBackground{bgColorOne}{bgColorTwo}% - %% Poster header/title - \hspace{-1.5em}% - \begin{tikzpicture}[inner sep=0pt,outer sep=0pt,line width=0.05em]% - \useasboundingbox (0em,0em) rectangle(\textwidth,\textheight);% - \path[shape=coordinate]% - (0pt,\colheight) coordinate(north west) (\textwidth,\colheight) coordinate(north east)% - (0pt,0pt) coordinate(south west) (\textwidth,0pt) coordinate(south east);% -% - \ifbaposter@eyecatcher% Has eye catcher - \debug{Eyecatcher found!} - \setbox\baposter@titleimage@left=\hbox{#2}% - \else% Has no eye catcher% - \setbox\baposter@titleimage@left=\hbox{}% - \fi% - \setlength{\baposter@titleimage@left@width}{\wd\baposter@titleimage@left}% - \setbox\baposter@titleimage@right=\hbox{#5}% - \setlength{\baposter@titleimage@right@width}{\wd\baposter@titleimage@right}% - \setlength{\baposter@titleimage@textwidth}{\textwidth}% - \addtolength{\baposter@titleimage@textwidth}{-\baposter@titleimage@left@width}% - \addtolength{\baposter@titleimage@textwidth}{-\baposter@titleimage@right@width}% - - \debug{#3} - % - % - % % Draw Header% - \draw (north west) +(0em,1em+0.5\headerheight) node(image)[anchor=west] { {\usebox{\baposter@titleimage@left }} };% - \draw (north east) +(0em,1em+0.5\headerheight) node(logo) [anchor=east] { {\usebox{\baposter@titleimage@right}} };% - % - \ifbaposter@eyecatcher% Has eye catcher% - \draw (image.east) node(title)[anchor=west,text width=\baposter@titleimage@textwidth]{% - \begin{minipage}{\baposter@titleimage@textwidth}% - \begin{center}% - 
\textbf{\Huge #3}\\% - {\Large #4}% - \end{center}% - \end{minipage} - };% - \else% Has no eye catcher - \draw (image.east) node(title)[anchor=west] { {\begin{minipage}{\baposter@titleimage@textwidth}{\bfseries\Huge #3}\\{\Large #4}\end{minipage}} };% - \fi - }% END poster begin -% The body - {% BEGIN poster end - % The end, draw gridlines if neccesary - \ifbaposter@grid - \newdimen{\gridpos} - \pgfmathsetmacro{\z}{\baposter@columns-1} - \foreach \y in {0,...,\z} - { - \setlength{\gridpos}{\y\colwidth+\y\baposter@@colspacing} - \draw[draw=green,draw opacity=0.7] (\gridpos,0pt) -- (\gridpos,\colheight) - (\gridpos+\colwidth,0pt) -- (\gridpos+\colwidth,\colheight);% - } - % Horizontal lines, every 0.1: - %% Explicitly list all percentages, because with {0.0, 0.1, ..., 1.0} we - %% get rounding errors in the displayed numbers! - \foreach \y in {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0} - \draw[draw=green,draw opacity=0.7] (0pt,\colheight-\y\colheight) -- - (\textwidth,\colheight-\y\colheight) node[anchor=west] {\y};% - \fi% - \end{tikzpicture}% - % \xkvview{} - \par - }% END poster end diff --git a/publications/poster-2019/figures/edgecase.graphml b/publications/poster-2019/figures/edgecase.graphml deleted file mode 100644 index de978b0..0000000 --- a/publications/poster-2019/figures/edgecase.graphml +++ /dev/null @@ -1,211 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - Mapping - - -Alignment of reads to the -extended hg38 reference - - - - - - - - - - - - - Tailpuller - - -Identification of candidate -reads overlapping the -annotated anchors - - - - - - - - - - - - Tailchopper - - -Selection of subsequences of -the candidate reads that extend -past the annotated anchors - - - - - - - - - - - - MEME - - -Discovery of characteristic -motifs in the candidate -telomeric subsequences - - - - - - - - - - - - Kmerscanner - - -Calculation of motif density -in a rolling window -along each read - - - - - - - - - - - - Densityplot - - -Visualization of 
motif density -along the candidate reads - - - - - - - - - - - - Assembler - - -Local de novo assembly -of telomeric haplotypes - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/publications/poster-2019/figures/edgecase.pdf b/publications/poster-2019/figures/edgecase.pdf deleted file mode 100644 index 6b935ab..0000000 Binary files a/publications/poster-2019/figures/edgecase.pdf and /dev/null differ diff --git a/publications/poster-2019/figures/giab-pacbio-densityplot.pdf b/publications/poster-2019/figures/giab-pacbio-densityplot.pdf deleted file mode 100644 index f672526..0000000 Binary files a/publications/poster-2019/figures/giab-pacbio-densityplot.pdf and /dev/null differ diff --git a/publications/poster-2019/figures/giab-pacbio-meme-top.pdf b/publications/poster-2019/figures/giab-pacbio-meme-top.pdf deleted file mode 100644 index fc28fae..0000000 Binary files a/publications/poster-2019/figures/giab-pacbio-meme-top.pdf and /dev/null differ diff --git a/publications/poster-2019/figures/hg38ext.png b/publications/poster-2019/figures/hg38ext.png deleted file mode 100644 index a7b907d..0000000 Binary files a/publications/poster-2019/figures/hg38ext.png and /dev/null differ diff --git a/publications/poster-2019/figures/logo.pdf b/publications/poster-2019/figures/logo.pdf deleted file mode 100644 index e324c8c..0000000 Binary files a/publications/poster-2019/figures/logo.pdf and /dev/null differ diff --git a/publications/poster-2019/figures/poster-graph.pdf b/publications/poster-2019/figures/poster-graph.pdf deleted file mode 100644 index 93443d4..0000000 Binary files a/publications/poster-2019/figures/poster-graph.pdf and /dev/null differ diff --git a/publications/poster-2019/figures/riethman.png b/publications/poster-2019/figures/riethman.png deleted file mode 100644 index 1435fe1..0000000 Binary files 
a/publications/poster-2019/figures/riethman.png and /dev/null differ diff --git a/publications/poster-2019/figures/t2t.png b/publications/poster-2019/figures/t2t.png deleted file mode 100644 index 18a5015..0000000 Binary files a/publications/poster-2019/figures/t2t.png and /dev/null differ diff --git a/publications/poster-2019/figures/twin-telomeres-wetlab.png b/publications/poster-2019/figures/twin-telomeres-wetlab.png deleted file mode 100644 index d2f5a20..0000000 Binary files a/publications/poster-2019/figures/twin-telomeres-wetlab.png and /dev/null differ diff --git a/publications/poster-2019/figures/twins-flipflop-densityplot.pdf b/publications/poster-2019/figures/twins-flipflop-densityplot.pdf deleted file mode 100644 index 8682b29..0000000 Binary files a/publications/poster-2019/figures/twins-flipflop-densityplot.pdf and /dev/null differ diff --git a/publications/poster-2019/figures/twins-noflipflop-densityplot.pdf b/publications/poster-2019/figures/twins-noflipflop-densityplot.pdf deleted file mode 100644 index 6590a6e..0000000 Binary files a/publications/poster-2019/figures/twins-noflipflop-densityplot.pdf and /dev/null differ diff --git a/publications/poster-2019/figures/unimizers.png b/publications/poster-2019/figures/unimizers.png deleted file mode 100644 index 25c422d..0000000 Binary files a/publications/poster-2019/figures/unimizers.png and /dev/null differ diff --git a/publications/poster-2019/grigorev-poster.pdf b/publications/poster-2019/grigorev-poster.pdf deleted file mode 100644 index 29b9959..0000000 Binary files a/publications/poster-2019/grigorev-poster.pdf and /dev/null differ diff --git a/publications/poster-2019/grigorev-poster.tex b/publications/poster-2019/grigorev-poster.tex deleted file mode 100644 index b9feda9..0000000 --- a/publications/poster-2019/grigorev-poster.tex +++ /dev/null @@ -1,247 +0,0 @@ -\documentclass[paperwidth=42in,paperheight=47.75in,portrait]{baposter} - -\usepackage[font=small,labelfont=bf]{caption} % Required 
for specifying captions to tables and figures -\usepackage{booktabs} % Horizontal rules in tables -\usepackage{relsize} % Used for making text smaller in some places -\usepackage[sfdefault,light]{roboto} -\usepackage[backend=bibtex,style=nature]{biblatex} -\addbibresource{references.bib} - -\graphicspath{{figures/}} % Directory in which figures are stored -\definecolor{bordercol}{RGB}{100,40,40} -\definecolor{boxcolor}{RGB}{255,255,255} -\definecolor{headerfontcol}{RGB}{179,27,27} -\definecolor{graybox}{RGB}{220,220,220} -\definecolor{greenbox}{RGB}{210,245,210} -\definecolor{greenbox_adj}{RGB}{215,235,215} -\definecolor{bluebox}{RGB}{230,245,255} -\definecolor{bluebox_adj}{RGB}{230,240,250} - -\begin{document} - -\begin{poster}{ - columns=4, - borderColor=bordercol,headerColorOne=boxcolor,headerColorTwo=boxcolor,headerFontColor=headerfontcol,boxColorOne=boxcolor, - background=none,boxshade=plain, - headershape=rectangle,textborder=none,headerborder=none, - headerfont=\Large\sf\bf, - grid=false,linewidth=1 -} -{\includegraphics[scale=1.7]{logo.pdf}}{\sf\bf Long read sequencing and local graph assembly \\ reveal heterogeneity of telomeres} { - \vspace{.4em} Grigorev K, Foox J, Bezdan D, Butler D, Mason C \\ - {\small Institute for Computational Biomedicine, Weill Cornell Medicine} -} - - -\headerbox{Abstract}{name=abstract,column=0,row=0,span=1,boxColorOne=graybox,headerColorOne=graybox,headerColorTwo=graybox} -{\small - Telomeres are regions of repetitive nucleotide sequences capping eukaryotic chromosomes that protect the ends of chromosomes from deterioration. - Telomeres are known to generally shorten after each cell replication, eventually blocking somatic cell division and preventing genomic instability. - As such, their length is an important marker in senescence, where it can inversely correlate with a subject’s age, and in cancers, where both - telomere shortening and unrestricted elongation have been suggested as risk factors. 
- - Given their length and repetitive nature, telomeric regions cannot be veritably assembled from reads of kilobase-order lengths (Sanger sequencing) - or sub-1Kbp reads (second-generation sequencing), making telomere resolution a very costly and generally intractable problem. - Recently, with third-generation technologies like SMRT and nanopore sequencing attaining read lengths on the order of tens and hundreds kilobase - pairs, with the longest reads reported as 2Mbp, it became possible to routinely read into the telomeric regions and inspect their structure - and length. - - We describe a framework for extracting telomeric reads from third-generation sequencing experiments and describing their sequence content and - prevalent motifs. We find that human telomeric sequences exhibit surprising heterogeneity, suggesting the possibility of localization of previously - reported non-canonical motifs as well as novel sequences. - - We also propose a local graph assembly algorithm capable of describing the haplotypic diversity of telomeres. Given the lower complexity of such - reads, established methods for long read overlap and assembly that rely on MinHash sketches and minimizers are unsuitable for this problem and fail - to detect most correct overlaps when compared to the computationally prohibitive, but mathematically correct Smith-Waterman alignment. - We implement a modified method relying on \textit{unimizers} (minimizers occurring once within a given read) that improves overlaps and reduces - complexity of the assembly graph, and locally assemble branching telomeric sequences using the computationally efficient A-Bruijn structure. 
-} - - -\headerbox{References}{name=references,column=0,below=abstract} -{ - \begingroup - \raggedright - \AtNextBibliography{\footnotesize} - \printbibliography[heading=none] - \endgroup -} - - -\headerbox{Acknowledgements}{name=acknowledgements,column=0,below=references,boxColorOne=graybox,headerColorOne=graybox,headerColorTwo=graybox} -{\small - Susan Bailey, Jared Luxton \\ (Colorado State University) -} - - -\headerbox{}{name=copyright,column=0,below=acknowledgements,above=bottom,boxheaderheight=0mm} -{\scriptsize - \vspace{.3in} - \textcopyright{} 2019 Weill Cornell Medicine -} - - -\headerbox{Background}{name=telomere_sequencing,span=1,column=1,row=0,boxColorOne=graybox,headerColorOne=graybox,headerColorTwo=graybox} -{ - \begin{center} - \includegraphics[width=\linewidth]{t2t.png} - \captionof{figure}{ - Telomere-to-telomere sequencing of the human X chromosome \\ - (adapted from \textit{Miga et al., 2019} \cite{t2t}). - } - \end{center} -} - - -\headerbox{}{name=riethman,span=1,column=2,row=0,boxheaderheight=0mm,boxColorOne=graybox,headerColorOne=graybox,headerColorTwo=graybox} -{ - \begin{center} - \includegraphics[width=\linewidth]{riethman.png} - \captionof{figure}{ - Sequencing of human subtelomeres (adapted from - \textit{Stong et al., 2014} \cite{riethman2014}). - } - \end{center} -} - - -\headerbox{}{name=wetlab_telomeres,span=1,column=3,row=0,boxheaderheight=0mm,boxColorOne=graybox,headerColorOne=graybox,headerColorTwo=graybox} -{ - \begin{center} - \includegraphics[width=\linewidth]{twin-telomeres-wetlab.png} - \captionof{figure}{ - Elongation of telomeres observed \\ - during space flight in the NASA Twins Study \cite{nasatwins}. - } - \end{center} -} - - -\headerbox{Methods}{name=hg38ext,span=1,column=1,below=telomere_sequencing} -{ - \begin{center} - \includegraphics[width=\linewidth]{hg38ext.png} - \captionof{figure}{ - Construction of the extended reference. 
Subtelomeric - sequences were mapped to the hg38 reference and added to the - index according to their relationship with the chromosomes. \\ - Annotated: hard-masked (unsequenced) regions (gray dashed line), - boundaries between the subtelomere and the telomere (green), - origins of divergent sequences (red). - } - \end{center} -} - - -\headerbox{}{name=pipeline,span=2,column=2,below=telomere_sequencing,boxheaderheight=0mm} -{ - \begin{center} - \includegraphics[width=\linewidth]{edgecase.pdf} - \captionof{figure}{ - The edgeCase pipeline\hfill{} - } - \end{center} -} - - -\headerbox{Results}{name=giab_pb,span=1,column=1,below=hg38ext,boxColorOne=bluebox,headerColorOne=bluebox_adj,headerColorTwo=bluebox_adj} -{ - \begin{center} - \includegraphics[width=\linewidth]{giab-pacbio-densityplot.pdf} - \captionof{figure}{ - Distribution of major motifs in a subset of telomeric reads - from the SMRT hg002 Genome In a Bottle \cite{giab} sample. \\ - The density plots suggest that two haplotypes are present, with - a shorter and a longer distance between the non-canonical TTGGGG - runs. The similarity of overall distributions suggests that the - deviation from the canonical motif is unlikely to be due to - random sequencing errors. \\ - Local assembly is required to describe these haplotypes. - } - \end{center} -} - - -\headerbox{}{name=no_flipflop,span=1,column=2,below=hg38ext,boxheaderheight=0mm,boxColorOne=bluebox} -{ - \begin{center} - \includegraphics[width=\linewidth]{twins-noflipflop-densityplot.pdf} - \captionof{figure}{ - Distribution of major motifs in a subset of telomeric reads from - an in-house nanopore sequencing dataset, basecalled with - \textit{guppy} without the flip-flop model. \\ - The offending motif (orange) is GAA. 
- } - \end{center} -} - - -\headerbox{}{name=flipflop,span=1,column=3,below=hg38ext,boxheaderheight=0mm,boxColorOne=bluebox} -{ - \begin{center} - \includegraphics[width=\linewidth]{twins-flipflop-densityplot.pdf} - \captionof{figure}{ - Distribution of major motifs in a subset of telomeric reads from - an in-house nanopore sequencing dataset, basecalled with - \textit{guppy} \textbf{with} the flip-flop model. - The main offending motif (orange) is CCAGG. The additional - offending motif (purple) is TTAAAA. The same motifs are present - in Genome In a Bottle nanopore data. - } - \end{center} -} - - -\headerbox{}{name=meme,span=1,column=1,below=giab_pb,above=bottom,boxheaderheight=0mm,boxColorOne=bluebox} -{ - \begin{center} - \includegraphics[width=\linewidth]{giab-pacbio-meme-top.pdf} - \captionof{figure}{ - Motifs in segments of SMRT reads extending past the annotated - subtelomere. \\ - Motif discovery was performed with MEME \cite{meme}. - } - \end{center} -} - - -\headerbox{Further work}{name=unimizers,span=1,column=2,below=no_flipflop,above=bottom} -{ - \begin{center} - \includegraphics[width=\linewidth]{unimizers.png} - \captionof{figure}{ - Comparison between \textit{minimizers} (gray) and - \textit{\textbf{unimizers}} (blue), both with \textit{k}=16 and - \textit{w}=11. - \\ - Unimizers are minimizers that occur only once per given read. - The same values of \textit{k} and \textit{w} in this figure are - used for a strict comparison; more permissive values result in - a bigger number of unimizers, especially useful for - low-compexity regions. - \\ - Arrays of unimizers have an advantage of - \textit{\textbf{directionality}} compared to minimizers and - can identify overlaps of low-complexity reads \\ - while minimizers fail to do so. 
- } - \end{center} -} - - -\headerbox{}{name=unimizers,span=1,column=3,below=flipflop,above=bottom,boxheaderheight=0mm} -{ - \vspace*{-.03in} - \begin{center} - \includegraphics[angle=90,origin=c,width=\linewidth]{poster-graph.pdf} - \captionof{figure}{ - The directionality of unimizers allows to more easily construct - de Bruijn-like graphs (A-Bruijn graphs originally introduced in - a different framework \cite{abruijn2004}). - } - \end{center} -} - - -\end{poster} -\end{document} diff --git a/publications/poster-2019/makefile b/publications/poster-2019/makefile deleted file mode 100644 index 28e097f..0000000 --- a/publications/poster-2019/makefile +++ /dev/null @@ -1,17 +0,0 @@ -all: grigorev-poster.pdf - -%.aux: %.tex baposter.cls references.bib - pdflatex $< - -%.bbl: %.aux references.bib - bibtex $(basename $<) - -%.pdf: %.tex %.bbl - rm -f $(basename $<)-blx.bib $(basename $<).blg $(basename $<).run.xml - rm -f $(basename $<).aux $(basename $<).bcf - rm -f $(basename $<).toc $(basename $<).out - pdflatex $< - pdflatex $< - rm -f $(basename $<)-blx.bib $(basename $<).blg $(basename $<).run.xml - rm -f $(basename $<).aux $(basename $<).bcf - rm -f $(basename $<).toc $(basename $<).out diff --git a/publications/poster-2019/references.bib b/publications/poster-2019/references.bib deleted file mode 100644 index 5167650..0000000 --- a/publications/poster-2019/references.bib +++ /dev/null @@ -1,69 +0,0 @@ -@article {t2t, - doi = {10.1101/735928}, - year = {2019}, - month = aug, - publisher = {Cold Spring Harbor Laboratory}, - author = {Karen H. Miga and Sergey Koren and Arang Rhie and Mitchell R. Vollger and Ariel Gershman and Andrey Bzikadze and Shelise Brooks and Edmund Howe and David Porubsky and Glennis A. Logsdon and Valerie A. Schneider and Tamara Potapova and Jonathan Wood and William Chow and Joel Armstrong and Jeanne Fredrickson and Evgenia Pak and Kristof Tigyi and Milinn Kremitzki and Christopher Markovic and Valerie Maduro and Amalia Dutra and Gerard G. 
Bouffard and Alexander M. Chang and Nancy F. Hansen and Fran{\c{c}}oisen Thibaud-Nissen and Anthony D. Schmitt and Jon-Matthew Belton and Siddarth Selvaraj and Megan Y. Dennis and Daniela C. Soto and Ruta Sahasrabudhe and Gulhan Kaya and Josh Quick and Nicholas J. Loman and Nadine Holmes and Matthew Loose and Urvashi Surti and Rosa ana Risques and Tina A. Graves Lindsay and Robert Fulton and Ira Hall and Benedict Paten and Kerstin Howe and Winston Timp and Alice Young and James C. Mullikin and Pavel A. Pevzner and Jennifer L. Gerton and Beth A. Sullivan and Evan E. Eichler and Adam M. Phillippy}, - title = {Telomere-to-telomere assembly of a complete human X chromosome} -} - -@article {riethman2014, - doi = {10.1101/gr.166983.113}, - year = {2014}, - month = mar, - publisher = {Cold Spring Harbor Laboratory}, - volume = {24}, - number = {6}, - author = {N. Stong and Z. Deng and R. Gupta and S. Hu and S. Paul and A. K. Weiner and E. E. Eichler and T. Graves and C. C. Fronick and L. Courtney and R. K. Wilson and P. M. Lieberman and R. V. Davuluri and H. Riethman}, - title = {Subtelomeric {CTCF} and cohesin binding site organization using improved subtelomere assemblies and a novel annotation pipeline}, - journal = {Genome Research} -} - -@article {nasatwins, - author = {Garrett-Bakelman, Francine E. and Darshi, Manjula and Green, Stefan J. and Gur, Ruben C. and Lin, Ling and Macias, Brandon R. and McKenna, Miles J. and Meydan, Cem and Mishra, Tejaswini and Nasrini, Jad and Piening, Brian D. and Rizzardi, Lindsay F. and Sharma, Kumar and Siamwala, Jamila H. and Taylor, Lynn and Vitaterna, Martha Hotz and Afkarian, Maryam and Afshinnekoo, Ebrahim and Ahadi, Sara and Ambati, Aditya and Arya, Maneesh and Bezdan, Daniela and Callahan, Colin M. and Chen, Songjie and Choi, Augustine M. K. and Chlipala, George E. and Contrepois, K{\'e}vin and Covington, Marisa and Crucian, Brian E. and De Vivo, Immaculata and Dinges, David F. and Ebert, Douglas J. and Feinberg, Jason I. 
and Gandara, Jorge A. and George, Kerry A. and Goutsias, John and Grills, George S. and Hargens, Alan R. and Heer, Martina and Hillary, Ryan P. and Hoofnagle, Andrew N. and Hook, Vivian Y. H. and Jenkinson, Garrett and Jiang, Peng and Keshavarzian, Ali and Laurie, Steven S. and Lee-McMullen, Brittany and Lumpkins, Sarah B. and MacKay, Matthew and Maienschein-Cline, Mark G. and Melnick, Ari M. and Moore, Tyler M. and Nakahira, Kiichi and Patel, Hemal H. and Pietrzyk, Robert and Rao, Varsha and Saito, Rintaro and Salins, Denis N. and Schilling, Jan M. and Sears, Dorothy D. and Sheridan, Caroline K. and Stenger, Michael B. and Tryggvadottir, Rakel and Urban, Alexander E. and Vaisar, Tomas and Van Espen, Benjamin and Zhang, Jing and Ziegler, Michael G. and Zwart, Sara R. and Charles, John B. and Kundrot, Craig E. and Scott, Graham B. I. and Bailey, Susan M. and Basner, Mathias and Feinberg, Andrew P. and Lee, Stuart M. C. and Mason, Christopher E. and Mignot, Emmanuel and Rana, Brinda K. and Smith, Scott M. and Snyder, Michael P. and Turek, Fred W.}, - title = {The NASA Twins Study: A multidimensional analysis of a year-long human spaceflight}, - volume = {364}, - number = {6436}, - elocation-id = {eaau8650}, - year = {2019}, - doi = {10.1126/science.aau8650}, - publisher = {American Association for the Advancement of Science}, - abstract = {Space is the final frontier for understanding how extreme environments affect human physiology. Following twin astronauts, one of which spent a year-long mission on the International Space Station, Garrett-Bakelman et al. examined molecular and physiological traits that may be affected by time in space (see the Perspective by L{\"o}brich and Jeggo). Sequencing the components of whole blood revealed that the length of telomeres, which is important to maintain in dividing cells and may be related to human aging, changed substantially during space flight and again upon return to Earth. 
Coupled with changes in DNA methylation in immune cells and cardiovascular and cognitive effects, this study provides a basis to assess the hazards of long-term space habitation.Science, this issue p. eaau8650; see also p. 127INTRODUCTIONTo date, 559 humans have been flown into space, but long-duration (\>300 days) missions are rare (n = 8 total). Long-duration missions that will take humans to Mars and beyond are planned by public and private entities for the 2020s and 2030s; therefore, comprehensive studies are needed now to assess the impact of long-duration spaceflight on the human body, brain, and overall physiology. The space environment is made harsh and challenging by multiple factors, including confinement, isolation, and exposure to environmental stressors such as microgravity, radiation, and noise. The selection of one of a pair of monozygotic (identical) twin astronauts for NASA{\textquoteright}s first 1-year mission enabled us to compare the impact of the spaceflight environment on one twin to the simultaneous impact of the Earth environment on a genetically matched subject.RATIONALEThe known impacts of the spaceflight environment on human health and performance, physiology, and cellular and molecular processes are numerous and include bone density loss, effects on cognitive performance, microbial shifts, and alterations in gene regulation. However, previous studies collected very limited data, did not integrate simultaneous effects on multiple systems and data types in the same subject, or were restricted to 6-month missions. Measurement of the same variables in an astronaut on a year-long mission and in his Earth-bound twin indicated the biological measures that might be used to determine the effects of spaceflight. 
Presented here is an integrated longitudinal, multidimensional description of the effects of a 340-day mission onboard the International Space Station.RESULTSPhysiological, telomeric, transcriptomic, epigenetic, proteomic, metabolomic, immune, microbiomic, cardiovascular, vision-related, and cognitive data were collected over 25 months. Some biological functions were not significantly affected by spaceflight, including the immune response (T cell receptor repertoire) to the first test of a vaccination in flight. However, significant changes in multiple data types were observed in association with the spaceflight period; the majority of these eventually returned to a preflight state within the time period of the study. These included changes in telomere length, gene regulation measured in both epigenetic and transcriptional data, gut microbiome composition, body weight, carotid artery dimensions, subfoveal choroidal thickness and peripapillary total retinal thickness, and serum metabolites. In addition, some factors were significantly affected by the stress of returning to Earth, including inflammation cytokines and immune response gene networks, as well as cognitive performance. For a few measures, persistent changes were observed even after 6 months on Earth, including some genes{\textquoteright} expression levels, increased DNA damage from chromosomal inversions, increased numbers of short telomeres, and attenuated cognitive function.CONCLUSIONGiven that the majority of the biological and human health variables remained stable, or returned to baseline, after a 340-day space mission, these data suggest that human health can be mostly sustained over this duration of spaceflight. The persistence of the molecular changes (e.g., gene expression) and the extrapolation of the identified risk factors for longer missions (\>1 year) remain estimates and should be demonstrated with these measures in future astronauts. 
Finally, changes described in this study highlight pathways and mechanisms that may be vulnerable to spaceflight and may require safeguards for longer space missions; thus, they serve as a guide for targeted countermeasures or monitoring during future missions.Multidimensional, longitudinal assays of the NASA Twins Study.(Left and middle) Genetically identical twin subjects (ground and flight) were characterized across 10 generalized biomedical modalities before (preflight), during (inflight), and after flight (postflight) for a total of 25 months (circles indicate time points at which data were collected). (Right) Data were integrated to guide biomedical metrics across various {\textquotedblleft}-omes{\textquotedblright} for future missions (concentric circles indicate, from inner to outer, cytokines, proteome, transcriptome, and methylome).To understand the health impact of long-duration spaceflight, one identical twin astronaut was monitored before, during, and after a 1-year mission onboard the International Space Station; his twin served as a genetically matched ground control. Longitudinal assessments identified spaceflight-specific changes, including decreased body mass, telomere elongation, genome instability, carotid artery distension and increased intima-media thickness, altered ocular structure, transcriptional and metabolic changes, DNA methylation changes in immune and oxidative stress{\textendash}related pathways, gastrointestinal microbiota alterations, and some cognitive decline postflight. Although average telomere length, global gene expression, and microbiome changes returned to near preflight levels within 6 months after return to Earth, increased numbers of short telomeres were observed and expression of some genes was still disrupted. 
These multiomic, molecular, physiological, and behavioral datasets provide a valuable roadmap of the putative health risks for future human spaceflight.}, - journal = {Science} -} - -@article {giab, - doi = {10.1038/sdata.2016.25}, - year = {2016}, - month = jun, - publisher = {Springer Science and Business Media {LLC}}, - volume = {3}, - number = {1}, - author = {Justin M. Zook and David Catoe and Jennifer McDaniel and Lindsay Vang and Noah Spies and Arend Sidow and Ziming Weng and Yuling Liu and Christopher E. Mason and Noah Alexander and Elizabeth Henaff and Alexa B.R. McIntyre and Dhruva Chandramohan and Feng Chen and Erich Jaeger and Ali Moshrefi and Khoa Pham and William Stedman and Tiffany Liang and Michael Saghbini and Zeljko Dzakula and Alex Hastie and Han Cao and Gintaras Deikus and Eric Schadt and Robert Sebra and Ali Bashir and Rebecca M. Truty and Christopher C. Chang and Natali Gulbahce and Keyan Zhao and Srinka Ghosh and Fiona Hyland and Yutao Fu and Mark Chaisson and Chunlin Xiao and Jonathan Trow and Stephen T. Sherry and Alexander W. Zaranek and Madeleine Ball and Jason Bobe and Preston Estep and George M. Church and Patrick Marks and Sofia Kyriazopoulou-Panagiotopoulou and Grace X.Y. Zheng and Michael Schnall-Levin and Heather S. Ordonez and Patrice A. Mudivarti and Kristina Giorda and Ying Sheng and Karoline Bjarnesdatter Rypdal and Marc Salit}, - title = {Extensive sequencing of seven human genomes to characterize benchmark reference materials}, - journal = {Scientific Data} -} - -@article {meme, - doi = {10.1093/nar/gkp335}, - year = {2009}, - month = may, - publisher = {Oxford University Press ({OUP})}, - volume = {37}, - number = {Web Server}, - author = {T. L. Bailey and M. Boden and F. A. Buske and M. Frith and C. E. Grant and L. Clementi and J. Ren and W. W. Li and W. S. 
Noble}, - title = {{MEME} {SUITE}: tools for motif discovery and searching}, - journal = {Nucleic Acids Research} -} - -@article {abruijn2004, - doi = {10.1101/gr.2395204}, - year = {2004}, - month = sep, - publisher = {Cold Spring Harbor Laboratory}, - volume = {14}, - number = {9}, - author = {P. A. Pevzner}, - title = {De Novo Repeat Classification and Fragment Assembly}, - journal = {Genome Research} -} diff --git a/readme.md b/readme.md index 0cbcf33..33e5dcb 100644 --- a/readme.md +++ b/readme.md @@ -5,79 +5,49 @@ edgeCase from long-read single-molecule whole genome sequencing datasets. Associated preprint: https://www.biorxiv.org/content/10.1101/2020.01.31.929307v1 -![densityplot_sample](assets/densityplot-haplotypes.png?raw=true "densityplot example") +![haplotypes_example](assets/haplotypes-example.png?raw=true "haplotypes example") ## Installation -#### With Conda (preferred): +### Obtaining code + +The code can either be downloaded from the +[releases](https://github.com/LankyCyril/edgecase/releases) page, +or cloned with git: `git clone https://github.com/LankyCyril/edgecase` + +### Environment setup + +#### With Conda (preferred) ```{sh} -$ git clone https://github.com/LankyCyril/edgecase $ cd edgecase $ conda env create --name edgecase --file environment.yaml $ conda activate edgecase $ ./edgecase ``` -#### By manually installing dependencies: +#### By manually installing dependencies ```{sh} -$ git clone https://github.com/LankyCyril/edgecase $ cd edgecase $ pip install numpy scipy scikit-learn statsmodels numba $ pip install pandas matplotlib seaborn tqdm regex pysam $ ./edgecase ``` -## Version history - -#### 2020-08-25 - -* Interface updates: - * all subroutines: - * removed flawed option `--flags-any` (`-g`) - * tailchopper: - * fixed the error that led to CIGAR strings being dropped - * kmerscanner: - * can now accept both BAM and Fastx input (option `--fmt`) - * repeatfinder: - * accepts option `--collapse-reverse-complement` (`-C`), which 
works - similarly to option `-C` of jellyfish (count reverse complement motifs - together) - * better coerces motifs in the output into human-friendly inversions - * densityplot: - * accepts option `--n-boot` to specify number of bootstrap rounds - for plotting confidence intervals - * basic-pipeline-longread: - * a new subroutine that runs all invididual subroutines in order -* Refactoring and internal updates: - * all subroutines: - * switched from `argparse` to `docopt` - * improved code style - * kmerscanner: - * function `get_circular_pattern()` accepts parameter `repeats` - (currently it is always set to 2) -* Other updates: - * paper drafts -* TODO: - * update README - * kmerscanner: remove obsolete options (`--head-test`, `--tail-test`, - `--cutoff`) and associated warning messages ## Input data and formats -### The extended reference genome and BAM files +### The extended reference genome *edgeCase* works with SAM/BAM files aligned to a reference that is annotated with known subtelomeric regions and uses reads anchored to the outermost ends -of subtelomeres (5' on the *p* arm, 3' on the *q* arm). For a BAM file -*dataset.bam* aligned to *ref.fa*, it needs several files: +of subtelomeres (5' on the *p* arm, 3' on the *q* arm). ``` -dataset.bam.bai: a BAI index; create with "samtools index dataset.bam" -ref.fa.fai: a FAI index; create with "samtools faidx ref.fa" -ref.fa.ecx: an index containing annotations of subtelomere-telomere boundareis +ref.fa.fai: a FAI index; create with "samtools faidx ref.fa" +ref.fa.ecx: an index containing annotations of subtelomere-telomere boundaries ``` *ref.fa.ecx*, a.k.a. the edgeCase indeX, describes anchors of interest in the @@ -89,10 +59,23 @@ Specifically, as described in the bioRxiv preprint, the human reference can be constructed from the hg38/GRCh38 reference genome and subtelomeric assemblies published by [Stong et al., 2014](https://dx.doi.org/10.1101%2Fgr.166983.113). 
To generate this reference, which we call "extended", or *hg38ext*, run -`tools/generate-hg38ext.py --remote > hg38ext.fa`. +`assets/generate-hg38ext.py --remote > hg38ext.fa`. + +### Alignment files + +We recommend using *minimap2* to generate BAM files for edgeCase. Another option +is *winnowmap*, but it has not been sufficiently tested yet. +**NB**: currently, it is imperative to supply a BAM file where secondary +alignment entries have read sequences. For example, *minimap2* creates BAMs in +this format with the use of the `-Y` switch. +We plan to implement a workaround for this requirement in the future. -### Custom SAM flags +BAM files must also be indexed (i.e., have a `.bai` file created with +`samtools index`). + + +## Custom SAM flags *edgeCase* extends the zoo of SAM flags with four of its own. The full table of flag names: @@ -112,46 +95,52 @@ secondary | 256 | 0x0100 | SAM specification flag qcfail | 512 | 0x0200 | SAM specification flag pcrdup | 1024 | 0x0400 | SAM specification flag supp | 2048 | 0x0800 | SAM specification flag -ucsc_mask_anchor | 4096 | 0x1000 | edgeCase-specific flag; added during pipeline +mask_anchor | 4096 | 0x1000 | edgeCase-specific flag; added during pipeline fork | 8192 | 0x2000 | edgeCase-specific flag; added during pipeline tract_anchor | 16384 | 0x4000 | edgeCase-specific flag; added during pipeline is_q | 32768 | 0x8000 | edgeCase-specific flag; added during pipeline +**NB**: these flags are unused in the SAM specification and should not clash with +anything. `samtools view` can correctly subset using these flags. + *Note:* All edgeCase routines that allow flag filtering recognize both the -numeric flag format (such as 3844) and the "human-readable" format such as "rev" -or "is_q|paired". Combinations are also understood, for example, "3844|is_q". +numeric flag format (such as 3844) and the "human-readable" format such as "rev". +Combinations are also understood, for example, "-F 3844 -F is_q". 
+ +*Note:* In the future, custom SAM flags may be superseded with tags. +A backwards compatibility layer will be provided (i.e., arguments like "-f fork" +or "-F 16384" will still work but interpret and produce appropriate tags). ## The edgeCase pipeline ``` -usage: ./edgecase [-h] {tailpuller,tailchopper,repeatfinder,kmerscanner,levenshtein,densityplot} ... - -positional arguments: - tailpuller select overhanging reads - tailchopper get overhanging heads/tails of reads - repeatfinder discover enriched repeats in candidate sequences - kmerscanner perform scan of known kmers/motifs - levenshtein cluster reads by edit distance - densityplot visualize densities of candidate reads +Usage: ./edgecase [-h | --help] + ./edgecase [...] + +Commands (): + tailpuller select overhanging long reads + tailchopper get overhanging heads/tails of long reads + repeatfinder discover enriched repeats in candidate sequences + kmerscanner perform scan of known kmers/motifs + densityplot visualize densities of candidate motifs + +Development area: + entropy calculate motif entropy among long reads + levenshtein calculate pairwise edit distance among long reads ``` +All commands output their results to stdout; you must pipe them into other +commands or into the destination file. This applies even to outputs in PDF and +PKL formats. -### tailpuller - -``` -usage: ./edgecase tailpuller --index X [options] bam > sam +**NB**: Depending on the aligner used upstream, MAPQ of secondary reads may have +been set to zero regardless of real mapping quality; use this filtering option +with caution. 
**This warning applies to all edgeCase subroutines that accept +the `-q` filtering flag.** -positional arguments: - bam name of input BAM/SAM file -optional arguments: - -x X, --index X location of the reference .ecx index (REQUIRED) - -f f, --flags f process only entries with all these sam flags present (default: 0) - -F F, --flag-filter F process only entries with none of these sam flags present (default: 0) - -q Q, --min-quality Q process only entries with MAPQ >= Q (default: 0) - -m M, --max-read-length M max read length to consider when selecting lookup regions (default: None) -``` +### tailpuller Outputs a subset SAM file that contains only the reads that overhang anchors defined in the ECX. If the read overhangs the mask anchor, the 4096 SAM flag is @@ -159,184 +148,262 @@ added; for forks, 8192 is added; for telomeric tracts, 16384. For reads on the *q* arm (i.e., on the 3' end), the 32768 flag is added (see above for the full list and the explanation of flags). -**NB**: these flags are unused in the SAM specification and should not clash with -anything. `samtools view` can correctly subset using these flags. +``` +Usage: ./edgecase tailpuller -x filename [-t targetspec]... + [-M integer] [--min-map-overlap integer] + [-m integer] [--min-telomere-overlap integer] + [--output-ambiguous-reads string] + [-f flagspec]... [-F flagspec]... 
[-q integer] + +Required options: + -x, --index [filename] location of the reference .ecx index + +Options: + -t, --target [targetspec] target reads overlapping these features (ECX flags) [default: tract_anchor] + -M, --max-read-length [integer] maximum read length to consider when selecting lookup regions + --min-map-overlap [integer] minimum overlap of reference to consider read as mapped [default: 1] + -m, --min-subtelomere-overlap [integer] minimum overlap of subtelomere to consider read as candidate [default: 1] + --min-telomere-overlap [integer] minimum overlap of telomere to consider read as candidate [default: 1] + --output-ambiguous-reads [string] which ambiguously mapping reads to retain (none, all, longest-overlap) [default: none] + +Input filtering options: + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] + -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] +``` Suggestions: -* use `-F 3844` to skip secondary, supplementary and QC-fail alignments; -* pipe the output through `samtools view -bh -` to compress on the fly; -* supplying `--max-read-length` drastically improves wall time if reads are -significantly shorter than chromosomes. +* It is recommended to include secondary and supplementary reads (i.e., leave + the -F flag as default [0]), because: + * edgeCase determines unambiguously mapped reads on its own; aligners + assign the 'supplementary' flag to multi-mapping reads arbitrarily, and + removing such supplementary reads upstream may lead to loss of information + in telomeric regions; + * edgeCase will discard chimeric reads in terminal regions if information + about supplementary alignments is present. 
+* Supplying `--max-read-length` drastically improves wall time if reads are + significantly shorter than chromosomes; for PacBio HiFi (CCS) it is suggested + to use the value of 30000. If the value is not specified, edgeCase will + assume *infinity*, and will have to go over the entire content of the BAM file. +* Suggested value of --min-map-overlap for PacBio HiFi: 500. +* Suggested value of --min-(sub)telomere-overlap for PacBio HiFi: 3000. +* Pipe the output through `samtools view -bh -` to compress on the fly. ### tailchopper +Truncates reads in the tailpuller file either to soft/hard-clipped ends (when +--target is "cigar"), or to sequences extending past given anchor (when +--target is "tract_anchor", "fork", or "mask_anchor"). + +Outputs a SAM file with overhanging tails of candidate reads. + ``` -usage: ./edgecase tailchopper --index X [options] bam > fasta +Usage: ./edgecase tailchopper -x filename [-t targetspec] + [-f flagspec]... [-F flagspec]... [-q integer] + +Required options: + -x, --index [filename] location of the reference .ecx index -positional arguments: - bam name of input BAM/SAM file +Options: + -t, --target [targetspec] an ECX flag (cut relative to reference) or 'cigar' [default: tract_anchor] -optional arguments: - -x X, --index X location of the reference .ecx index (REQUIRED) - -f f, --flags f process only entries with all these sam flags present (default: 0) - -F F, --flag-filter F process only entries with none of these sam flags present (default: 0) - -q Q, --min-quality Q process only entries with MAPQ >= Q (default: 0) - -t ?, --target ? 
an ECX flag (cut relative to reference) or 'cigar' (default: tract_anchor) +Input filtering options: + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] + -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] ``` -Truncates reads in the tailpuller file either to soft/hard-clipped ends (when ---target is "cigar"), or to sequences extending past given anchor (when ---target is "tract_anchor", "fork", or "ucsc_mask_anchor"). +**NB**: tailchopper outputs a SAM file with unmapped reads (sets the 0x0004 bit +in the flag), but *retains the original mapping position*; do *not* use this +value for downstream analyses unless you know exactly what you are after. -**NB**: outputs a SAM file with unmapped reads (sets the 0x0004 bit in the -flag), but *retains the original mapping position*; do *not* use this value for -downstream analyses unless you know exactly what you are after. +*Suggestion*: pipe the output through `samtools view -bh -` to compress on the +fly. ### repeatfinder -``` -usage: ./edgecase repeatfinder [options] sequencefile > tsv - -positional arguments: - sequencefile name of input SAM/BAM/FASTA/FASTQ file - -optional arguments: - -f f, --flags f process only entries with all these sam flags present (default: 0) - -F F, --flag-filter F process only entries with none of these sam flags present (default: 0) - -q Q, --min-quality Q process only entries with MAPQ >= Q (default: 0) - --fmt ? format of input file(s) (default: sam) - -m ?, --min-k ? smallest target repeat length (default: 4) - -M ?, --max-k ? largest target repeat length (default: 16) - -r R, --min-repeats R minimum number of consecutive repeats (default: 2) - -n ?, --max-motifs ? maximum number of motifs to report (default: None) - -P ?, --max-p-adjusted ? cutoff adjusted p-value (default: 0.05) - --jellyfish ? 
jellyfish binary (unless in $PATH) (default: None) - --jellyfish-hash-size ? jellyfish initial hash size (default: 2G) - -j J, --jobs J number of jellyfish jobs (parallel threads) (default: 1) -``` +Expects the SAM/BAM file from `tailchopper` as input; however, will also work +on any SAM/BAM file as well as Fasta/Fastq files. Performs Fisher's exact tests on *k*-mer counts to identify significantly -enriched repeating motifs of lengths from *--min-k* to *--max-k* in the input +enriched repeating motifs of lengths from `--min-k` to `--max-k` in the input file. Relies on [jellyfish](http://www.genome.umd.edu/jellyfish.html) to count -*k*-mers. If *edgeCase* has been installed with the Conda method (by creating -an environment from *environment.yaml*), *jellyfish* is already installed and no +*k*-mers. If edgeCase has been installed with the Conda method (by creating +an environment from `environment.yaml`), `jellyfish` is already installed and no special action is needed. Otherwise, it needs to be installed manually and, if -not in $PATH, supplied with the *--jellyfish* option. +not in `$PATH`, supplied with the `--jellyfish` option. + +Outputs a TSV file with columns: +`monomer motif length score fraction_explained p p_adjusted` + +``` +Usage: ./edgecase repeatfinder [-m integer] [-M integer] [-r integer] [-P float] + [--jellyfish filename] [--jellyfish-hash-size string] + [-n integer] [-j integer] [-q integer] + [-f flagspec]... [-F flagspec]... 
[--fmt string] + [--collapse-reverse-complement] + +Options: + --fmt sam|fastx format of input file [default: sam] + -m, --min-k [integer] smallest target repeat length [default: 4] + -M, --max-k [integer] largest target repeat length [default: 16] + -r, --min-repeats [integer] minimum number of consecutive repeats [default: 2] + -P, --max-p-adjusted [float] cutoff adjusted p-value [default: .05] + --jellyfish [filename] jellyfish binary (unless in $PATH) + -s, --jellyfish-hash-size [string] jellyfish initial hash size [default: 2G] + -n, --max-motifs [integer] maximum number of motifs to report + -j, --jobs [integer] number of jellyfish jobs (parallel threads) [default: 1] + -C, --collapse-reverse-complement collapse counts of reverse complement motifs + +Input filtering options: + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] + -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] +``` ### kmerscanner +Expects the SAM/BAM file from `tailpuller` as input; however, will also work +on any SAM/BAM file as well as Fasta/Fastq files. + +Expects the TSV file from `repeatfinder` provided as the `--motif-file` option; +however, one may supply an arbitrary tab-separated file where the first field of +each line is a motif (except for lines starting with "#" which are treated as +comments). + +In a rolling window along each read in a BAM file, calculates densities of given +motifs and outputs a tab-separated DAT file with columns: +`name flag chrom pos mapq motif score clip_5prime clip_3prime b=N`, +where the last column name contains the value of `--bin-size`, and the column +itself lists all density values along rolling windows for a given motif. + +*Note*: it is recommended to pipe the output through `gzip`, as these files +are quite verbose and easily compressible. 
In the future, we plan to implement +a more space-efficient (and backwards-compatible) format. + ``` -usage: ./edgecase kmerscanner --motif-file M [options] bam > dat - -positional arguments: - bam name of input SAM/BAM file - -optional arguments: - --motif-file M file with repeated motif sequences (REQUIRED) - -f f, --flags f process only entries with all these sam flags present (default: 0) - -F F, --flag-filter F process only entries with none of these sam flags present (default: 0) - -q Q, --min-quality Q process only entries with MAPQ >= Q (default: 0) - -w W, --window-size W size of the rolling window (default: 100) - --head-test H length of head to use for density filter (if specified) (default: None) - --tail-test T length of tail to use for density filter (if specified) (default: None) - -c C, --cutoff C use hard cutoff for density (default: None) - -j J, --jobs J number of jobs to run in parallel (default: 1) +Usage: ./edgecase kmerscanner [-j integer] --motif-file filename + [-b integer] [-n integer] + [-f flagspec]... [-F flagspec]... [-q integer] + [--fmt string] + +Required options: + --motif-file [filename] file with repeated motif sequences (output of `repeatfinder`) + +Options: + --fmt sam|fastx format of input file [default: sam] + -b, --bin-size [integer] size of the rolling window [default: 10] + -n, --num-reads [integer] expected number of reads in input (for progress display) + -j, --jobs [integer] number of jobs to run in parallel [default: 1] + +Input filtering options: + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] + -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] ``` -In a rolling window along each read in a BAM file, calculates densities of given -motifs and outputs a DAT file. 
-Optionally filters input by terminal density (outputs data only for reads -exceeding density cutoff). By default, outputs data for all input reads. -*--motif-file* is usually the output of *repeatfinder*, but can be an arbitrary -tab-separated file where the first field of each line is a motif (except for -lines starting with "#" which are treated as comments). -**NB**: it is possible to run *kmerscanner* on an entire WGS BAM and look for -reads that pass tests encoded by *--head-test*, *--tail-test*, and *--cutoff*, -but this use case is experimental and is discouraged. Generally, *kmerscanner* -is intended to be used downstream of *tailpuller/tailchopper* and *repeatfinder* -to calculate densities of identified motifs in telomeric candidate reads. In -this case, *--head-test*, *--tail-test*, and *--cutoff* should be omitted. +### densityplot +Expects the DAT file from `kmerscanner` as input; +visualizes the density of motifs on each chromosomal arm in the regions covered +by candidate reads. -### levenshtein +The value of `--palette` can be either none (in which case the maximum of nine +motifs can be plotted with default colors), "paper", "paper|legend=full", +"paper|legend=density", or "paper|legend=motifs" (in which case motifs known +from research can be plotted with custom colors, matching the colors in the +figures in the paper), or a chained key-value sequence of "motif=color" and +"legend=spec", where "spec" is one of "none", "full", "density", "motifs". +For example: `"TTAGGG=green|TGAGGG=#D01000|legend=full"`. + +Annotates the anchors from the ECX with dashed lines: +* mask_anchor == gray, +* fork == blueviolet, +* tract_anchor == red. + +Outputs a PDF file (writes it to stdout; you must pipe the output into a file). +Alternatively, can output a Python pickle file (with `--outfmt=pkl`). 
``` -usage: ./edgecase levenshtein [options] sequencedata > tsv - -positional arguments: - sequencedata name of input BAM/SAM file or directory with precomputed distances - -optional arguments: - -f f, --flags f process only entries with all these sam flags present (default: 0) - -F F, --flag-filter F process only entries with none of these sam flags present (default: 0) - -q Q, --min-quality Q process only entries with MAPQ >= Q (default: 0) - --kmerscanner-file ? kmerscanner file (optional, for use with --output-dir) - --min-cluster-size ? minimum cluster size to consider (default: 5) - -o ?, --output-dir ? output directory for clustermaps and per-haplotype SAM files (default: None) +Usage: ./edgecase densityplot -x filename [-b integer] [--plot-coverage] + [--palette palettespec] [--title string] + [--n-boot integer] [--chroms-to-plot string] + [-f flagspec]... [-F flagspec]... [-q integer] + [--figwidth-inches float] [--outfmt string] [-z] + +Required options: + -x, --index [filename] location of the reference .ecx index + +Options: + -z, --gzipped input is gzipped (must specify if any of -qfF present) + -b, --bin-size [integer] size of each bin in bp (overrides bin size in ) + --n-boot [integer] number of bootstrap iterations for 95% confidence intervals [default: 1000] + --palette [palettespec] custom palette for plotting motifs + --title [string] figure title (defaults to input filename) + --chroms-to-plot [string] if set, plot chromosomes from this comma-separated list unconditionally + --plot-coverage plot coverage by telomeric reads on each arm + --figwidth-inches [float] width of figure in inches [default: 13] + --outfmt [string] output format (pdf, pkl) [default: pdf] + +Input filtering options: + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] + -q, --min-quality [integer] process only entries with this MAPQ or 
higher [default: 0] ``` -For each chromosome arm in the BAM/SAM file, clusters the reads that align there -by their relative pairwise edit distance (Levenshtein distance) and decides on -the best number of clusters by maximizing the Bayesian information criterion. -If more than one cluster is identified, performs a Mann-Whitney U one-tailed -test on all intra-cluster distances vs. all inter-cluster distances. -If the input is a directory with precomputed matrices (files matching mask -*${chromosome_name}-matrix.tsv*), uses these values to cluster and compute -*p*-values (skips the actual step of distance computation). -If *--kmerscanner-file* is provided, generates kmerscanner files for read -clusters (haplotypes) on each arm where more than one such cluster is -detected. - -**NB**: this is an experimental module, and the maximum number of outliers is -hard-coded as 1. This worked for the datasets analyzed in the bioRxiv preprint, -but the number of outliers may have to be adjusted for other datasets. -**NB**: this algorithm scales quadratically with the number of input reads and -is computationally infeasible for large datasets. +### entropy -### densityplot +Expects the DAT file from `kmerscanner` as input; can accept multiple DAT files +at once. + +Calculates entropy values of motif assignments per window among reads, +and outputs a TSV file with columns: +`entropy coverage`. 
``` -usage: ./edgecase densityplot --index X [options] dat > pdf - -positional arguments: - dat input density file - -optional arguments: - -x X, --index X location of the reference .ecx index (REQUIRED) - -f f, --flags f process only entries with all these sam flags present (default: 0) - -F F, --flag-filter F process only entries with none of these sam flags present (default: 0) - -q Q, --min-quality Q process only entries with MAPQ >= Q (default: 0) - -z, --gzipped input is gzipped (must specify if any of -qfF present) (default: False) - -b B, --bin-size B size of each bin in bp for visualization speedup (default: 100) - --zoomed-in plot taller traces, cut off pre-anchor regions (default: False) - --palette ? custom palette for plotting motifs (default: None) - -e, --exploded plot each read separately (default: False) - --title T figure title (defaults to input filename) (default: None) +Usage: ./edgecase entropy [-b integer] [-f flagspec]... [-F flagspec]... [-q integer] + [-z] ... + +Options: + -z, --gzipped input is gzipped (must specify if any of -qfF present) + -b, --bin-size [integer] size of each bin in bp (overrides bin size in ) + +Input filtering options: + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] + -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] ``` -Visualizes the density of motifs on each chromosomal arm in the regions covered -by candidate reads, binning the values by windows of *--bin-size*. 
-The value of *--palette* can be either none (in which case the maximum of nine -motifs can be plotted with default colors), "paper" or "paper|legend=False" (in -which case motifs known from research can be plotted with custom colors, -matching the colors in the figures in the bioRxiv preprint), or a chained -key-value sequence of "motif=color" and "legend=boolean", for example: -"TTAGGG=green|TGAGGG=#D01000|legend=True". -Options *--exploded* and *--title* are deprecated. - -Option *--zoomed-in* plots taller figures, discards non-telomeric regions, and -visualizes read coverage above each plot. With this option, two custom "debug" -environment variables can be passed to *densityplot* that specify how much -of the surrounding reference coordinates should be included: *PAPER_LEFT_SPAN* -and *PAPER_RIGHT_SPAN*. -Annotates the anchors from the ECX with dashed lines: -* ucsc_mask_anchor == gray, -* fork == blueviolet, -* tract_anchor == red. +### levenshtein + +Expects the SAM/BAM file from `tailpuller` as input. + +Calculates pairwise relative edit distance (Levenshtein distance) for all pairs +of reads mapping to each chromosomal arm in the input SAM/BAM file. +Outputs a TSV file with columns: +`rname qname1 qname2 relative_ld`, +where `rname` is the name of the chromosome, `qnameN` is the name of a read in +the pair, and `relative_ld` is the distance. + +**NB**: this algorithm scales quadratically with the number of input reads and +is computationally infeasible for large datasets. + +``` +Usage: ./edgecase levenshtein [-f flagspec]... [-F flagspec]... 
[-q integer] + [-j integer] + +Options: + -j, --jobs [integer] number of jobs to run in parallel [default: 1] + +Input filtering options: + -f, --flags [flagspec] process only entries with all these sam flags present [default: 0] + -F, --flag-filter [flagspec] process only entries with none of these sam flags present [default: 0] + -q, --min-quality [integer] process only entries with this MAPQ or higher [default: 0] +``` diff --git a/sandbox/kmers-per-read/interpret-kmer-counter.py b/sandbox/kmers-per-read/interpret-kmer-counter.py deleted file mode 100755 index e9fee38..0000000 --- a/sandbox/kmers-per-read/interpret-kmer-counter.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python -from sys import argv, stdout -from gzip import open as gzopen -from functools import lru_cache -from collections import defaultdict -from tqdm import tqdm -from pandas import DataFrame - - -""" Generate the input gzipped file with: -./kmer-counter --double-count-palindromes --k 12 --fasta ${INPUT_FILE} \ -| cut -f2 | sed 's/$/ ;/g' | tr ' ' '\n' \ -| awk -F':' '{if (($1==";") || (substr($1,1,6)==substr($1,7))) {print}}' \ -| gzip -3 > ${OUTPUT_FILE} -""" - - -@lru_cache(maxsize=None) -def lowest_alpha_inversion(kmer): - return min(kmer[i:]+kmer[:i] for i in range(len(kmer))) - - -@lru_cache(maxsize=None) -def get_motif_identity(kmer, min_repeats=2): - lai = lowest_alpha_inversion(kmer) - motif = lai[:int(len(lai)/min_repeats)] - if motif * min_repeats == lai: - return motif - - -def main(raw_kmercount_filename): - motif_count_database, counts = [], defaultdict(int) - with gzopen(raw_kmercount_filename, mode="rt") as raw_kc: - for line in tqdm(map(str.strip, raw_kc), total=3506715): - if line == ";": - motif_count_database.append(counts) - counts = defaultdict(int) - else: - stat = line.split(":") - if len(stat) == 2: - counts[get_motif_identity(stat[0])] += int(stat[1]) - DataFrame(motif_count_database).to_csv(stdout, sep="\t", index=False) - - -if __name__ == "__main__": - 
exit(main(argv[1]) or 0) diff --git a/sandbox/kmers-per-read/kmer-counter/kmer-counter b/sandbox/kmers-per-read/kmer-counter/kmer-counter deleted file mode 100755 index 53129f5..0000000 Binary files a/sandbox/kmers-per-read/kmer-counter/kmer-counter and /dev/null differ diff --git a/tools/masker.py b/tools/masker.py deleted file mode 100644 index 49e6de9..0000000 --- a/tools/masker.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python -from sys import argv -from pysam import AlignmentFile -from regex import finditer, IGNORECASE -from itertools import count -from os import path - - -def get_circular_motif(motif): - return r'|'.join({motif[i:]+motif[:i] for i in range(len(motif))}) - - -def get_motifs_pattern(motif_description): - if path.isfile(motif_description): - with open(motif_description, mode="rt") as motif_handle: - return r'|'.join({ - get_circular_motif(line.split()[0]*2) - for line in motif_handle if line[0] != "#" - }) - else: - return r'|'.join({ - get_circular_motif(kmer*2) - for kmer in motif_description.split("|") - }) - -def iterate_bam(filename, f, F): - with AlignmentFile(filename) as bam: - for entry in bam: - if (entry.flag & f == f) and (entry.flag & F == 0) and (entry.seq): - yield entry - - -def cut_chunks(seq, positions_to_mask, minlen): - prev_pos, start, end = -1, 0, 0 - for pos in range(len(seq)): - if pos not in positions_to_mask: - if pos == prev_pos + 1: - end = pos - else: - start = pos - prev_pos = pos - else: - if start < end: - if end - start + 1 >= minlen: - yield seq[start:end+1] - start = end + 1 - - -def main(filename, motif_description, action="mask", f=49152, F=3840, minlen=22): - minlen = int(minlen) - motifs_pattern = get_motifs_pattern(motif_description) - cid = count(1) - total_masked, total_bases = 0, 0 - for entry in iterate_bam(filename, int(f), int(F)): - positions_to_mask = set() - matcher = finditer( - motifs_pattern, entry.seq, overlapped=True, flags=IGNORECASE - ) - for match in matcher: - positions_to_mask |= 
set(range(match.start(), match.end())) - if action == "cut": - for chunk in cut_chunks(entry.seq, positions_to_mask, minlen): - print(">{}\n{}".format(next(cid), chunk)) - elif action == "measure": - positions_to_keep = set(range(len(entry.seq)+1)) - positions_to_mask - for chunk in cut_chunks(entry.seq, positions_to_keep, minlen): - print(len(chunk)) - elif action == "mask": - raise NotImplementedError("`action` 'mask'") - elif action == "count": - print("{}\t{}".format( - len(positions_to_mask), len(entry.seq) - len(positions_to_mask) - )) - elif action == "total-fraction": - total_masked += len(positions_to_mask) - total_bases += len(entry.seq) - else: - raise ValueError("Unknown action: '{}'".format(action)) - if action == "total-fraction": - print(total_masked, total_bases, total_masked/total_bases, sep="\t") - return 0 - - -if __name__ == "__main__": - returncode = main(*argv[1:]) - exit(returncode)