diff --git a/Assembly.cpp b/Assembly.cpp index 3335655..2a25122 100644 --- a/Assembly.cpp +++ b/Assembly.cpp @@ -12,6 +12,7 @@ #include "kthread.h" #include "rcut.h" #include "kalloc.h" +#include "ecovlp.h" void ha_get_candidates_interface(ha_abuf_t *ab, int64_t rid, UC_Read *ucr, overlap_region_alloc *overlap_list, overlap_region_alloc *overlap_list_hp, Candidates_list *cl, double bw_thres, int max_n_chain, int keep_whole_chain, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* chain_idx, ma_hit_t_alloc* paf, ma_hit_t_alloc* rev_paf, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp); @@ -595,9 +596,9 @@ static void worker_ovec(void *data, long i, int tid) { ha_ovec_buf_t *b = ((ha_ovec_buf_t**)data)[tid]; int fully_cov, abnormal; - // if(i != 33) return; + // if(i != 12578) return; // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); - // if (memcmp("m64012_190920_173625/88015004/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // if (memcmp("7897e875-76e5-42c8-bc37-94b370c4cc8d", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); // } else { // return; @@ -606,6 +607,9 @@ static void worker_ovec(void *data, long i, int tid) ha_get_candidates_interface(b->ab, i, &b->self_read, &b->olist, &b->olist_hp, &b->clist, 0.02, asm_opt.max_n_chain, 1, NULL/**&(b->k_flag)**/, &b->r_buf, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), &(b->tmp_region), NULL, &(b->sp)); + // prt_chain(&b->olist); + // return; + clear_Cigar_record(&b->cigar1); clear_Round2_alignment(&b->round2); @@ -967,6 +971,88 @@ void prt_dbg_rs(FILE *fp, Debug_reads* x, uint64_t round) } +void ha_ec(int64_t round) +{ + int i, hom_cov, het_cov, r_out = 0; + ec_ovec_buf_t *b = NULL; + ha_ecsave_buf_t *e = NULL; + ha_flt_tab_hp = ha_idx_hp = NULL; + + if((ha_idx == NULL)&&(asm_opt.flag & HA_F_VERBOSE_GFA)&&(round == asm_opt.number_of_round - 1)) r_out = 1; + + if(asm_opt.required_read_name) init_Debug_reads(&R_INF_FLAG, asm_opt.required_read_name); // for debugging only + // overlap and correct reads + b = gen_ec_ovec_buf_t(asm_opt.thread_num, 0, (round == asm_opt.number_of_round - 1)); + // CALLOC(b, asm_opt.thread_num); + // for (i = 0; i < asm_opt.thread_num; ++i) + // b[i] = ha_ovec_init(0, (round == asm_opt.number_of_round - 1),0); + + + if(ha_idx) hom_cov = asm_opt.hom_cov; + if(ha_idx == NULL) ha_idx = ha_pt_gen(&asm_opt, ha_flt_tab, round == 0? 0 : 1, 0, &R_INF, &hom_cov, &het_cov); // build the index + ///debug_adapter(&asm_opt, &R_INF); + if (round == 0 && ha_flt_tab == 0) // then asm_opt.hom_cov hasn't been updated + ha_opt_update_cov(&asm_opt, hom_cov); + het_cnt = NULL; + if(round == asm_opt.number_of_round-1 && asm_opt.is_dbg_het_cnt) CALLOC(het_cnt, R_INF.total_reads); + + /** + // fprintf(stderr, "[M::%s-start]\n", __func__); + if (asm_opt.required_read_name) + kt_for(asm_opt.thread_num, worker_ovec_related_reads, b, R_INF.total_reads); + else + kt_for(asm_opt.thread_num, worker_ovec, b, R_INF.total_reads);///debug_for_fix + // fprintf(stderr, "[M::%s-end]\n", __func__); + **/ + cal_ec_multiple(b, asm_opt.thread_num, R_INF.total_reads); + // kt_for(asm_opt.thread_num, worker_hap_ec, b, R_INF.total_reads);///debug_for_fix + + if (r_out) write_pt_index(ha_flt_tab, ha_idx, &R_INF, &asm_opt, asm_opt.output_file_name); + ha_pt_destroy(ha_idx); + ha_idx = NULL; + + if(het_cnt) { + print_het_cnt_log(het_cnt); free(het_cnt); het_cnt = NULL; + } + + /** + // collect statistics + for (i = 0; i < asm_opt.thread_num; ++i) { + asm_opt.num_bases += b[i]->num_read_base; + asm_opt.num_corrected_bases += b[i]->num_correct_base; + asm_opt.num_recorrected_bases += b[i]->num_recorrect_base; + asm_opt.mem_buf += ha_ovec_mem(b[i], NULL); + ha_ovec_destroy(b[i]); + } + free(b); + **/ + destroy_ec_ovec_buf_t(b); + exit(1); + + if (asm_opt.required_read_name) prt_dbg_rs(R_INF_FLAG.fp_r0, &R_INF_FLAG, 0); // for debugging only + + // save corrected reads to R_INF + CALLOC(e, asm_opt.thread_num); + for (i = 0; i < asm_opt.thread_num; ++i) { + init_UC_Read(&e[i].g_read); + e[i].first_round_read_size = e[i].second_round_read_size = 50000; + CALLOC(e[i].first_round_read, e[i].first_round_read_size); + CALLOC(e[i].second_round_read, e[i].second_round_read_size); + } + kt_for(asm_opt.thread_num, worker_ec_save, e, R_INF.total_reads); + for (i = 0; i < asm_opt.thread_num; ++i) { + destory_UC_Read(&e[i].g_read); + free(e[i].first_round_read); + free(e[i].second_round_read); + } + free(e); + + if (asm_opt.required_read_name) prt_dbg_rs(R_INF_FLAG.fp_r1, &R_INF_FLAG, 1); // for debugging only + if (asm_opt.required_read_name) destory_Debug_reads(&R_INF_FLAG), exit(0); // for debugging only + ///debug_print_pob_regions(); +} + + void ha_overlap_and_correct(int round) { int i, hom_cov, het_cov, r_out = 0; @@ -992,12 +1078,15 @@ void ha_overlap_and_correct(int round) het_cnt = NULL; if(round == asm_opt.number_of_round-1 && asm_opt.is_dbg_het_cnt) CALLOC(het_cnt, R_INF.total_reads); // fprintf(stderr, "[M::%s-start]\n", __func__); + // double tt0 = yak_realtime_0(); if (asm_opt.required_read_name) kt_for(asm_opt.thread_num, worker_ovec_related_reads, b, R_INF.total_reads); else kt_for(asm_opt.thread_num, worker_ovec, b, R_INF.total_reads);///debug_for_fix // fprintf(stderr, "[M::%s-end]\n", __func__); - + // fprintf(stderr, "[M::%s::%.3f] ==> chaining\n", __func__, yak_realtime_0()-tt0); + // exit(1); + if (r_out) write_pt_index(ha_flt_tab, ha_idx, &R_INF, &asm_opt, asm_opt.output_file_name); ha_pt_destroy(ha_idx); ha_idx = NULL; @@ -1953,7 +2042,8 @@ int ha_assemble(void) assert(asm_opt.number_of_round > 0); for (r = ha_idx?asm_opt.number_of_round-1:0; r < asm_opt.number_of_round; ++r) { ha_opt_reset_to_round(&asm_opt, r); // this update asm_opt.roundID and a few other fields - ha_overlap_and_correct(r); + // ha_overlap_and_correct(r); + ha_ec(r); fprintf(stderr, "[M::%s::%.3f*%.2f@%.3fGB] ==> corrected reads for round %d\n", __func__, yak_realtime(), yak_cpu_usage(), yak_peakrss_in_gb(), r + 1); fprintf(stderr, "[M::%s] # bases: %lld; # corrected bases: %lld; # recorrected bases: %lld\n", __func__, diff --git a/Correct.cpp b/Correct.cpp index 34c4450..e624b13 100644 --- a/Correct.cpp +++ b/Correct.cpp @@ -1859,7 +1859,7 @@ inline void generate_cigar(char* path, int path_length, window_list *idx, window return; } - ///0 is match, 1 is mismatch, 2 is up, 3 is left + ///0 is match, 1 is mismatch, 2 is up (more y), 3 is left (more x) int32_t i = 0, pre_cl = 0, trem_p = -1; char pre_c = 5; for (i = 0; i < path_length; i++) { if(path[i] == 1) { @@ -3447,9 +3447,30 @@ inline void recalcate_window_advance(overlap_region_alloc* overlap_list, All_rea // fprintf(stderr, "[M::%s::j->%lld] utg%.6dl(%c), align_length::%u, overlap_length::%lld\n", __func__, // j, (int32_t)z->y_id + 1, "+-"[z->y_pos_strand], z->align_length, overlap_length); // } - // if(y_id == 4) { - // fprintf(stderr, "[M::%s::idx->%lld::] z::x_pos_s->%u, z::x_pos_e->%u, ovl->%lld, aln->%u\n", - // __func__, j, z->x_pos_s, z->x_pos_e, overlap_length, z->align_length); + // if(y_id == 24128) { + // fprintf(stderr, "[M::%s::idx->%lld::] x::[%u, %u), y::[%u, %u), ovl->%lld, aln->%u\n", + // __func__, j, z->x_pos_s, z->x_pos_e+1, z->y_pos_s, z->y_pos_e+1, overlap_length, z->align_length); + // radix_sort_window_list_xs_srt(z->w_list.a, z->w_list.a + z->w_list.n); a_nw = z->w_list.n; + // int64_t ss, ee; + // for (i = 0, ss = ee = -2; i < a_nw; i++) { + // p = &(z->w_list.a[i]); + // if(p->x_start == ee) { + // ee = p->x_end + 1; + // } else { + // if(ee > 0) { + // fprintf(stderr, "[M::%s::] x::[%ld, %ld)\n", __func__, ss, ee); + // } + // ss = p->x_start; ee = p->x_end + 1; + // } + // } + // if(ee > 0) { + // fprintf(stderr, "[M::%s::] x::[%ld, %ld)\n", __func__, ss, ee); + // } + + // for (i = 0; i < a_nw; i++) { + // p = &(z->w_list.a[i]); if(p->y_end == -1) continue; + // fprintf(stderr, "[M::%s::] x::[%d, %d), y::[%d, %d), error::%d\n", __func__, p->x_start, p->x_end + 1, p->y_start, p->y_end + 1, p->error); + // } // } ///debug_scan_cigar(&(overlap_list->list[j])); @@ -3992,7 +4013,7 @@ void push_wcigar(window_list *idx, window_list_alloc *res, bit_extz_t *exz) } inline uint32_t aln_wlst_adv_exz(overlap_region *z, All_reads *rref, hpc_t *hpc_g, -const ul_idx_t *uref, char *qstr, char *tstr, bit_extz_t *exz, +const ul_idx_t *uref, char *qstr, char *tstr, bit_extz_t *exz, uint32_t max_err, uint32_t rev, uint32_t id, int64_t qs, int64_t qe, int64_t t_s, int64_t block_s, double e_rate, uint32_t is_cigar) { @@ -4003,9 +4024,9 @@ double e_rate, uint32_t is_cigar) ///1. this window has a large number of differences ///2. DP does not start from the right offset if(rref) { - thres = double_error_threshold(get_init_err_thres(ql, e_rate, block_s, THRESHOLD), ql); + thres = double_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); } else { - thres = double_ul_error_threshold(get_init_err_thres(ql, e_rate, block_s, THRESHOLD_MAX_SIZE), ql); + thres = double_ul_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); } aln_l = ql + (thres << 1); if(hpc_g) t_tot_l = hpc_len(*hpc_g, id); @@ -8665,8 +8686,21 @@ void generate_haplotypes_naive_advance(haplotype_evdience_alloc* hap, overlap_re } +void prt_sub_read(char *str, uint64_t str_l, uint64_t site, uint64_t win) +{ + uint64_t k, s, e; + s = ((site >= win)?(site-win):(0)); + e = (((site+win+1) <= str_l)?(site+win+1):(str_l)); + fprintf(stderr, "[M::%s-site::%lu] [%lu, %lu)\n", __func__, site, s, e); + + for (k = s; k < site; k++) fprintf(stderr, "%c", str[k]); + fprintf(stderr, "[%c]", str[k++]); + for (; k < e; k++) fprintf(stderr, "%c", str[k]); + + fprintf(stderr, "\n"); +} -void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_region_alloc* overlap_list, double up, void *km) +void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_region_alloc* overlap_list, double up, UC_Read* g_read, void *km) { // fprintf(stderr, "[M::%s::] Done\n", __func__); if(hap->length == 0) return; @@ -8698,6 +8732,18 @@ void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_regio hap->snp_stat.n = m_snp_stat; hap->length = m_list; if(hap->snp_stat.n == 0 || hap->length == 0) return; + // for (k = 0; k < hap->snp_stat.n; k++) { + // s = &(hap->snp_stat.a[k]); if(s->site != 1502) continue; + // fprintf(stderr, "[M::%s] site::%u, occ_0::%u, occ_1::%u, occ_2::%u, is_homopolymer::%u\n", __func__, s->site, s->occ_0, s->occ_1, s->occ_2, s->is_homopolymer); + // prt_sub_read(g_read->seq, g_read->length, s->site, 25); + // for (i = 0; i < overlap_list->length; i++) { + // if(/**overlap_list->list[i].is_match == 1 &&**/ overlap_list->list[i].x_pos_s <= s->site && s->site <= overlap_list->list[i].x_pos_e) { + // fprintf(stderr, "%.*s\tis_match::%u\tid::%u\n", (int)Get_NAME_LENGTH(R_INF, overlap_list->list[i].y_id), Get_NAME(R_INF, overlap_list->list[i].y_id), overlap_list->list[i].is_match, overlap_list->list[i].y_id); + // } + // } + // } + + hap->snp_srt.n = 0; radix_sort_haplotype_evdience_id_srt(hap->list, hap->list + hap->length); for (k = 1, l = 0; k <= hap->length; ++k) { @@ -8709,6 +8755,20 @@ void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_regio if(s->occ_0 < 2 || s->occ_1 < 2) continue; if(s->occ_0 >= asm_opt.s_hap_cov && s->occ_1 >= asm_opt.infor_cov) o++;///allels must be real } + // if(overlap_list->list[hap->list[l].overlapID].y_id == 24128) { + // fprintf(stderr, "[M::%s-id::%u] o->%lu(%c)\n", __func__, overlap_list->list[hap->list[l].overlapID].y_id, o, "+-"[overlap_list->list[hap->list[l].overlapID].y_pos_strand]); + // for (i = l, o = 0; i < k; i++) { + // if(hap->list[i].type!=1) continue;///mismatch + // s = &(hap->snp_stat.a[hap->list[i].overlapSite]); + // assert(s->site == hap->list[i].site); + // if(s->occ_0 < 2 || s->occ_1 < 2) continue; + // if(s->occ_0 >= asm_opt.s_hap_cov && s->occ_1 >= asm_opt.infor_cov) { + // // o++;///allels must be real + // fprintf(stderr, "[M::%s] site::%u, occ_0::%u, occ_1::%u, occ_2::%u, is_homopolymer::%u\n", __func__, s->site, s->occ_0, s->occ_1, s->occ_1, s->is_homopolymer); + // prt_sub_read(g_read->seq, g_read->length, s->site, 50); + // } + // } + // } if(o > 0) { o = ((uint32_t)-1) - o; o <<= 32; o += l; @@ -9398,7 +9458,7 @@ void partition_overlaps_advance(overlap_region_alloc* overlap_list, All_reads* R hap->length = m; // generate_haplotypes_naive_advance(hap, overlap_list, NULL); - generate_haplotypes_naive_HiFi(hap, overlap_list, 0.04, NULL); + generate_haplotypes_naive_HiFi(hap, overlap_list, 0.04, g_read, NULL); // generate_haplotypes_DP(hap, overlap_list, R_INF, g_read->length, force_repeat); // generate_haplotypes_naive(hap, overlap_list, R_INF, g_read->length, force_repeat); @@ -11367,7 +11427,7 @@ char *qstr, char *tstr, bit_extz_t *exz, uint32_t rev, uint32_t id) ///ts do not have aux_beg, while te has uint32_t push_wlst_exz(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, overlap_region* ol, - char* qstr, char *tstr, bit_extz_t *exz, + char* qstr, char *tstr, bit_extz_t *exz, uint32_t max_err, int64_t qs, int64_t qe, int64_t ts, int64_t te, int64_t tl, int64_t aux_beg, int64_t aux_end, double e_rate, int64_t block_s, uint32_t sec_check, double ovlp_cut, int64_t force_aln, void *km) { @@ -11384,7 +11444,7 @@ uint32_t push_wlst_exz(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, over w_s = w_e + 1; get_win_id_by_s(ol, w_s, block_s, &w_e); // x_start = w_s; x_end = w_e; - if(aln_wlst_adv_exz(ol, rref, hpc_g, uref, qstr, tstr, exz, + if(aln_wlst_adv_exz(ol, rref, hpc_g, uref, qstr, tstr, exz, max_err, ol->y_pos_strand, ol->y_id, w_s, w_e, toff, block_s, e_rate, 0)) { toff = ol->w_list.a[ol->w_list.n-1].y_end + 1; } else { @@ -11402,7 +11462,7 @@ uint32_t push_wlst_exz(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, over w_e = w_s - 1; get_win_id_by_e(ol, w_e, block_s, &w_s); ys = toff+1-(w_e+1-w_s); // x_start = w_s; x_end = w_e; x_len = x_end + 1 - x_start; - if((ys >= 0) && aln_wlst_adv_exz(ol, rref, hpc_g, uref, qstr, tstr, exz, + if((ys >= 0) && aln_wlst_adv_exz(ol, rref, hpc_g, uref, qstr, tstr, exz, max_err, ol->y_pos_strand, ol->y_id, w_s, w_e, ys, block_s, e_rate, 1)) { toff = ol->w_list.a[ol->w_list.n-1].y_start - 1; } else { @@ -11428,6 +11488,70 @@ uint32_t push_wlst_exz(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, over return 1; } +#define pass_qovlp(o, a, r) (((a)>0)&&((o)*(r)<=(a))) + +///ts do not have aux_beg, while te has +uint32_t push_hc_wlst_exz(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, overlap_region* ol, + char* qstr, char *tstr, bit_extz_t *exz, uint32_t max_err, + int64_t qs, int64_t qe, int64_t ts, int64_t te, int64_t tl, + int64_t aux_beg, int64_t aux_end, double e_rate, int64_t block_s, double ovlp_cut, int64_t force_aln, void *km) +{ + + window_list p, t, *a; int64_t w_e, w_s, ce = qs - 1, cs = ol->x_pos_s, toff, ovl, ualn, aln, ys; + uint64_t a_n, k; + p.x_start = qs; p.x_end = qe; p.y_start = ts; p.y_end = te; p.error = exz->err; + p.extra_begin = aux_beg; p.extra_end = aux_end; p.error_threshold = exz->thre; p.cidx = p.clen = 0; + if(ol->w_list.n > 0) { //utilize the the end pos of pre-window in forward + w_e = ol->w_list.a[ol->w_list.n-1].x_end; + toff = ol->w_list.a[ol->w_list.n-1].y_end + 1; + while ((w_e < ce) && (toff < tl)) { + w_s = w_e + 1; + get_win_id_by_s(ol, w_s, block_s, &w_e); + // x_start = w_s; x_end = w_e; + if(aln_wlst_adv_exz(ol, rref, hpc_g, uref, qstr, tstr, exz, max_err, + ol->y_pos_strand, ol->y_id, w_s, w_e, toff, block_s, e_rate, 0)) { + toff = ol->w_list.a[ol->w_list.n-1].y_end + 1; + } else { + break; + } + } + cs = ol->w_list.a[ol->w_list.n-1].x_end + 1; + } + ///utilize the the start pos of next window in backward + a_n = ol->w_list.n; w_s = qs; + if(w_s > cs) { + gen_backtrace_adv_exz(&p, ol, rref, hpc_g, uref, qstr, tstr, exz, ol->y_pos_strand, ol->y_id); + toff = p.y_start - 1; + while (w_s > cs) { + w_e = w_s - 1; + get_win_id_by_e(ol, w_e, block_s, &w_s); ys = toff+1-(w_e+1-w_s); + // x_start = w_s; x_end = w_e; x_len = x_end + 1 - x_start; + if((ys >= 0) && aln_wlst_adv_exz(ol, rref, hpc_g, uref, qstr, tstr, exz, max_err, + ol->y_pos_strand, ol->y_id, w_s, w_e, ys, block_s, e_rate, 1)) { + toff = ol->w_list.a[ol->w_list.n-1].y_start - 1; + } else { + break; + } + } + } + + ol->align_length += qe + 1 - qs; + ovl = ol->x_pos_e+1-ol->x_pos_s; ualn = (qe + 1 - ol->x_pos_s) - ol->align_length; aln = ovl-ualn; + // if((!force_aln) && (!simi_pass(ovl, aln, 0, ovlp_cut, &e_rate)) && (!simi_pass(ovl, aln, sec_check, ovlp_cut, NULL))) { + if((!force_aln) && (!pass_qovlp(ovl, aln, ovlp_cut))) { + kv_push(window_list, ol->w_list, p); + return 0; + } + + if(ol->w_list.n > a_n) { + a = ol->w_list.a + a_n; a_n = ol->w_list.n - a_n; toff = a_n; a_n >>=1; + for (k = 0; k < a_n; k++) { + t = a[k]; a[k] = a[toff-1-k]; a[toff-1-k] = t; + } + } + kv_push(window_list, ol->w_list, p); + return 1; +} uint32_t align_ul_ed_post_extz(overlap_region *z, const ul_idx_t *uref, hpc_t *hpc_g, char* qstr, char *tstr, bit_extz_t *exz, double e_rate, int64_t w_l, double ovlp_cut, int64_t force_aln, void *km) { @@ -11463,7 +11587,7 @@ uint32_t align_ul_ed_post_extz(overlap_region *z, const ul_idx_t *uref, hpc_t *h // assert(exz->err <= exz->thre); // fprintf(stderr, "[M::%s] exz->err::%d\n", __func__, exz->err); ///t_s do not have aux_beg, while t_s + t_end (aka, te) has - if(!push_wlst_exz(uref, hpc_g, NULL, z, qstr, tstr, exz, q_s, q_e, t_s, t_s + exz->pe, + if(!push_wlst_exz(uref, hpc_g, NULL, z, qstr, tstr, exz, THRESHOLD_MAX_SIZE, q_s, q_e, t_s, t_s + exz->pe, t_tot_l, aux_beg, aux_end, e_rate, w_l, sec_check, ovlp_cut, force_aln, km)) { return 0; } @@ -11479,9 +11603,63 @@ uint32_t align_ul_ed_post_extz(overlap_region *z, const ul_idx_t *uref, hpc_t *h return 1; } + +uint32_t align_hc_ed_post_extz(overlap_region *z, All_reads *rref, char* qstr, char *tstr, bit_extz_t *exz, double e_rate, int64_t w_l, double ovlp_cut, int64_t force_aln, void *km) +{ + int64_t q_s, q_e, nw, k, q_l, t_l, t_tot_l, aux_beg, aux_end, t_s, thre, aln_l, t_pri_l; + char *q_string, *t_string; + z->w_list.n = 0; z->is_match = 0; z->align_length = 0; + nw = get_num_wins(z->x_pos_s, z->x_pos_e+1, w_l); + get_win_se_by_normalize_xs(z, (z->x_pos_s/w_l)*w_l, w_l, &q_s, &q_e); + for (k = 0; k < nw; k++) { + aux_beg = aux_end = 0; q_l = 1 + q_e - q_s; + thre = q_l*e_rate; thre = Adjust_Threshold(thre, q_l); + if(thre > THRESHOLD_MAX_SIZE) thre = THRESHOLD_MAX_SIZE; + ///offset of y + t_s = (q_s - z->x_pos_s) + z->y_pos_s; + t_s += y_start_offset(q_s, &(z->f_cigar)); + + aln_l = q_l + (thre<<1); t_tot_l = Get_READ_LENGTH((*rref), z->y_id); + if(init_waln(thre, t_s, t_tot_l, aln_l, &aux_beg, &aux_end, &t_s, &t_pri_l)) { + q_string = qstr+q_s; + recover_UC_Read_sub_region(tstr, t_s, t_pri_l, z->y_pos_strand, rref, z->y_id); t_string = tstr; + // t_string = return_str_seq_exz(tstr, t_s, t_pri_l, z->y_pos_strand, hpc_g, uref, z->y_id); + + t_l = t_pri_l; + // t_end = Reserve_Banded_BPM(t_string, aln_l, q_string, q_l, thre, &error); + ed_band_cal_semi_64_w_absent_diag(t_string, t_l, q_string, q_l, thre, aux_beg, exz); + + // if(z->x_id == 5569 && z->y_id == 5557 && q_s == 10075 && q_e == 10849) { + // fprintf(stderr, "\n[M::%s::semi::t_s->%ld::t_pri_l->%ld::aux_beg->%ld::aux_end->%ld::thre->%ld] exz->ps::%d, exz->pe::%d, exz->ts::%d, exz->te::%d, exz->err::%d, exz->cigar.n::%d, thre::%ld\n", + // __func__, t_s, t_pri_l, aux_beg, aux_end, thre, exz->ps, exz->pe, exz->ts, exz->te, exz->err, (int32_t)exz->cigar.n, thre); + // fprintf(stderr, "[tstr::len->%ld] %.*s\n", t_l, (int32_t)t_l, t_string); + // fprintf(stderr, "[qstr::len->%ld] %.*s\n", q_l, (int32_t)q_l, q_string); + // } + if (is_align(*exz)) { + // ed_band_cal_semi_64_w(t_string, aln_l, q_string, q_l, thre, exz); + // assert(exz->err <= exz->thre); + // fprintf(stderr, "[M::%s] exz->err::%d\n", __func__, exz->err); + ///t_s do not have aux_beg, while t_s + t_end (aka, te) has + if(!push_hc_wlst_exz(NULL, NULL, rref, z, qstr, tstr, exz, THRESHOLD_MAX_SIZE, q_s, q_e, t_s, t_s + exz->pe, + t_tot_l, aux_beg, aux_end, e_rate, w_l, ovlp_cut, force_aln, km)) { + return 0; + } + // append_window_list(z, q_s, q_e, t_s, t_s + t_end, error, aux_beg, aux_end, thre, w_l, km); + } + } + q_s = q_e + 1; q_e = q_s + w_l - 1; + if(q_e >= (int64_t)z->x_pos_e) q_e = z->x_pos_e; + } + + // if((!force_aln) && (!simi_pass(z->x_pos_e+1-z->x_pos_s, z->align_length, 0, ovlp_cut, &e_rate)) && + // (!simi_pass(z->x_pos_e+1-z->x_pos_s, z->align_length, sec_check, ovlp_cut, NULL))) return 0; + if((!force_aln) && (!pass_qovlp(z->x_pos_e+1-z->x_pos_s, z->align_length, ovlp_cut))) return 0; + return 1; +} + inline uint32_t ed_cut(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char *qstr, char *tstr, uint32_t rev, uint32_t id, -int64_t qs, int64_t qe, int64_t t_s, int64_t block_s, double e_rate, +int64_t qs, int64_t qe, int64_t t_s, int64_t block_s, double e_rate, int64_t max_err, uint32_t aln_dir, int64_t* r_err, int64_t* qoff, int64_t* toff, int64_t* aln_qlen) { (*aln_qlen) = 0; (*r_err) = INT32_MAX; @@ -11494,9 +11672,9 @@ uint32_t aln_dir, int64_t* r_err, int64_t* qoff, int64_t* toff, int64_t* aln_qle ///1. this window has a large number of differences ///2. DP does not start from the right offset if(rref) { - thres = double_error_threshold(get_init_err_thres(ql, e_rate, block_s, THRESHOLD), ql); + thres = double_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); } else { - thres = double_ul_error_threshold(get_init_err_thres(ql, e_rate, block_s, THRESHOLD_MAX_SIZE), ql); + thres = double_ul_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); } aln_l = ql + (thres << 1); @@ -11566,14 +11744,14 @@ int64_t qs, int64_t qe, int64_t pk) else if(tb[1] == -1 && tb[0] != -1) tb[1] = tb[0]; if(tb[0] != -1) { - if(!ed_cut(uref, hpc_g, rref, qstr, tstr, rev, id, qs, qe, tb[0], block_s, e_rate, + if(!ed_cut(uref, hpc_g, rref, qstr, tstr, rev, id, qs, qe, tb[0], block_s, e_rate, ((rref)?THRESHOLD:THRESHOLD_MAX_SIZE), 0, &(di[0]), NULL, NULL, &(al[0]))) { di[0] = ql; al[0] = 0; } } if(tb[1] != -1) { - if(!ed_cut(uref, hpc_g, rref, qstr, tstr, rev, id, qs, qe, tb[1], block_s, e_rate, + if(!ed_cut(uref, hpc_g, rref, qstr, tstr, rev, id, qs, qe, tb[1], block_s, e_rate, ((rref)?THRESHOLD:THRESHOLD_MAX_SIZE), 1, &(di[1]), NULL, NULL, &(al[1]))) { di[1] = ql; al[1] = 0; } @@ -11602,7 +11780,7 @@ int64_t qs, int64_t qe, int64_t pk) int64_t gen_extend_err_0_exz(overlap_region *z, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, -char *tstr, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double e_rate, +char *tstr, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double e_rate, int64_t max_err, int64_t qs, int64_t qe, int64_t pk) { int64_t tot_e = 0, ts, di[2], al[2], tb[2], an = z->w_list.n; double rr; @@ -11636,14 +11814,14 @@ int64_t qs, int64_t qe, int64_t pk) else if(tb[1] == -1 && tb[0] != -1) tb[1] = tb[0]; if(tb[0] != -1) { - if(!ed_cut(uref, hpc_g, rref, qstr, tstr, rev, id, qs, qe, tb[0], block_s, e_rate, + if(!ed_cut(uref, hpc_g, rref, qstr, tstr, rev, id, qs, qe, tb[0], block_s, e_rate, max_err, 0, &(di[0]), NULL, NULL, &(al[0]))) { di[0] = ql; al[0] = 0; } } if(tb[1] != -1) { - if(!ed_cut(uref, hpc_g, rref, qstr, tstr, rev, id, qs, qe, tb[1], block_s, e_rate, + if(!ed_cut(uref, hpc_g, rref, qstr, tstr, rev, id, qs, qe, tb[1], block_s, e_rate, max_err, 1, &(di[1]), NULL, NULL, &(al[1]))) { di[1] = ql; al[1] = 0; } @@ -11715,11 +11893,11 @@ char *tstr, char *tstr_1, Correct_dumy* dumy, uint64_t *v_idx, int64_t block_s, } double gen_extend_err_exz(overlap_region *z, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, -char *tstr, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double ovlp_cut, double e_rate, double e_max, int64_t *r_e) +char *tstr, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double ovlp_cut, double e_rate, double e_max, int64_t max_err, int64_t sec_check, int64_t *r_e) { int64_t ovl, k, ce, an = z->w_list.n, tot_l, tot_e, ws, we, ql; ovl = z->x_pos_e+1-z->x_pos_s; if(r_e) (*r_e) = INT64_MAX; - if(!simi_pass(ovl, z->align_length, 0, ovlp_cut, &e_rate)) return DBL_MAX; + if((sec_check) && (!simi_pass(ovl, z->align_length, 0, ovlp_cut, &e_rate))) return DBL_MAX; // nw = get_num_wins(z->x_pos_s, z->x_pos_e+1, block_s); // for (k = 0; k < an; k++) { // if(z->w_list.a[k].clen) z->w_list.a[k].y_end -= z->w_list.a[k].extra_begin; @@ -11736,7 +11914,7 @@ char *tstr, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double ovlp_cut, ws = we+1; get_win_id_by_s(z, ws, block_s, &we); ql = we+1-ws; tot_l += ql; - tot_e += gen_extend_err_0_exz(z, uref, hpc_g, rref, qstr, tstr, exz, v_idx, block_s, e_rate, ws, we, k); + tot_e += gen_extend_err_0_exz(z, uref, hpc_g, rref, qstr, tstr, exz, v_idx, block_s, e_rate, max_err, ws, we, k); if((e_max > 0) && (tot_e > (ovl*e_max))) return DBL_MAX; } ce = z->w_list.a[k].x_start-1; @@ -11749,7 +11927,7 @@ char *tstr, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double ovlp_cut, ws = we+1; get_win_id_by_s(z, ws, block_s, &we); ql = we+1-ws; tot_l += ql; - tot_e += gen_extend_err_0_exz(z, uref, hpc_g, rref, qstr, tstr, exz, v_idx, block_s, e_rate, ws, we, k); + tot_e += gen_extend_err_0_exz(z, uref, hpc_g, rref, qstr, tstr, exz, v_idx, block_s, e_rate, max_err, ws, we, k); if((e_max > 0) && (tot_e > (ovl*e_max))) return DBL_MAX; } } @@ -13653,13 +13831,64 @@ int64_t cal_estimate_err(overlap_region *z, int64_t wl, int64_t qs, int64_t qe, tot += ((double)z->w_list.a[k].error)*((double)ovlp)/((double)(we-ws)); } } - tot += ((qe-qs)-cov_l)*e_rate; + tot += ((qe-qs)-cov_l)*e_rate; + return tot; +} + +int64_t cal_estimate_err_hc(overlap_region *z, int64_t wl, int64_t qs, int64_t qe, int64_t ts, int64_t te, double e_rate, int64_t *exact) +{ + int64_t k, ws, we, wid, os, oe, ovlp, tot, cov_l, est = (qe-qs)*e_rate, wn = z->w_list.n, exa = 1, ots, ote, q[2], t[2]; + if(exact) (*exact) = 0; + if(!wn) return est; + if(qs < z->x_pos_s) qs = z->x_pos_s; + if(qe > z->x_pos_e+1) qe = z->x_pos_e+1; + ws = qs/wl; ws *= wl; wid = get_win_id_by_s(z, ws, wl, NULL); + if(wid>=wn) wid = wn-1; + for (k = wid;k < wn && qs > z->w_list.a[k].x_end; k++); if(k == wn) return est; + ///qs <= z->w_list.a[k].x_end + for (;k>=0 && qs < z->w_list.a[k].x_start; k--); if(k < 0) k = 0; + ///qs >= z->w_list.a[k].x_start + + for (tot = cov_l = 0, ots = ote = -1; k < wn && z->w_list.a[k].x_start < qe; k++) { + if(z->w_list.a[k].y_end == -1) continue; + ws = z->w_list.a[k].x_start; we = z->w_list.a[k].x_end+1; + os = MAX(qs, ws); oe = MIN(qe, we); + ovlp = ((oe>os)? (oe-os):0); + if(!ovlp) continue; + cov_l += ovlp; + + if(ovlp == (we-ws)) { + tot += z->w_list.a[k].error; + } else { + tot += ((double)z->w_list.a[k].error)*((double)ovlp)/((double)(we-ws)); + } + if(z->w_list.a[k].error > 0) exa = 0; + if(exa) { + q[0] = os - ws; q[1] = we - oe; + we = z->w_list.a[k].y_end+1; ws = we - (z->w_list.a[k].x_end+1-z->w_list.a[k].x_start); + os = MAX(ts, ws); oe = MIN(te, we); + ovlp = ((oe>os)? (oe-os):0); + t[0] = os - ws; t[1] = we - oe; + if((ovlp) && (q[0] == t[0]) && (t[0] == t[0])) { + if(ote == -1) { + ots = os; ote = oe; + } else if(ote == os) { + ote = oe; + } else { + exa = 0; + } + } else { + exa = 0; + } + } + } + tot += ((qe-qs)-cov_l)*e_rate; + if((exact) && exa && (((qe-qs) == cov_l))) { + if(((qe-qs) == (ote - ots)) && (ots == ts) && (ote == te)) (*exact) = 1; + } return tot; } -#define UC_Read_resize(v, s) do {\ - if ((v).size<(s)) {REALLOC((v).seq,(s));(v).size=(s);}\ - } while (0) ///[s, e); [ps, pe) inline char *retrieve_str_seq_exz(UC_Read *tu, int64_t s, int64_t l, @@ -13811,8 +14040,10 @@ void ref_cigar_check(char* qstr, UC_Read *tu, const ul_idx_t *uref, hpc_t *hpc_g ez->cigar.a[k]&(0x3fff), (ez->cigar.a[k]>>14)); } - fprintf(stderr, "[M::%s::l->%ld] s::%ld, pstr::%.*s\n", __func__, tl, bps, (int32_t)tl, t); - fprintf(stderr, "[M::%s::l->%ld] s::%ld, tstr::%.*s\n", __func__, ql, bts, (int32_t)ql, q); + // fprintf(stderr, "[M::%s::l->%ld] s::%ld, pstr::%.*s\n", __func__, tl, bps, (int32_t)tl, t); + // fprintf(stderr, "[M::%s::l->%ld] s::%ld, tstr::%.*s\n", __func__, ql, bts, (int32_t)ql, q); + fprintf(stderr, "[M::%s::l->%ld] s::%ld\n", __func__, tl, bps); + fprintf(stderr, "[M::%s::l->%ld] s::%ld\n", __func__, ql, bts); exit(1); } ez->ps = bps; ez->ts = bts; @@ -14277,6 +14508,107 @@ int64_t estimate_err, overlap_region *aux_o) } +void set_exact_exz(bit_extz_t *exz, int64_t qs, int64_t qe, int64_t ts, int64_t te) +{ + clear_align(*exz); exz->thre = 0; exz->cigar.n = 0; + exz->err = 0; push_trace(&(exz->cigar), 0, qe - qs); + exz->pl = te - ts; exz->ps = 0; exz->pe = exz->pl-1; + exz->tl = qe - qs; exz->ts = 0; exz->te = exz->tl-1; + exz->ps += ts; exz->pe += ts; + exz->ts += qs; exz->te += qs; +} + + +int64_t hc_aln_exz_adv_hc(overlap_region *z, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, +char* qstr, UC_Read *tu, int64_t qs, int64_t qe, int64_t ts, int64_t te, int64_t mode, int64_t wl, +bit_extz_t *exz, int64_t q_tot, double e_rate, int64_t maxl, int64_t maxe, int64_t force_l, +int64_t estimate_err, overlap_region *aux_o) +{ + clear_align(*exz); exz->thre = 0; + if(((ts == -1) && (te == -1))) mode = 3;///set to semi-global + int64_t thre, ql = qe - qs, thre0, pts = -1, pte = -1, pthre = -1, full = 0; + if(ql == 0 && (te-ts) == 0) return 1; + if((ql <= 0) || (te-ts) <= 0) return 0; + if(estimate_err < 0) estimate_err = cal_estimate_err_hc(z, wl, qs, qe, ts, te, e_rate, &full); + + + + // if(ql <= 16) { + if(estimate_err == 0) { + if(full) { + // if(!cal_exact_exz(z, uref, hpc_g, rref, exz, qstr, tu, qs, qe, ts, te, &pts, &pte, q_tot, mode)) { + // fprintf(stderr, "[M::%s::ql::%ld::%c] xid::%d, yid::%d, qs::[%ld, %ld), ts::[%ld, %ld), mode::%ld, est_err::%ld, e_rate::%f, maxe::%ld\n", + // __func__, ql, "+-"[z->y_pos_strand], z->x_id, z->y_id, qs, qe, ts, te, mode, estimate_err, e_rate, maxe); + // exit(1); + // } + set_exact_exz(exz, qs, qe, ts, te); push_alnw(aux_o, exz); + return 1; + } else if(cal_exact_exz(z, uref, hpc_g, rref, exz, qstr, tu, qs, qe, ts, te, &pts, &pte, q_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::0(+)\n", exz->err, exz->thre); + push_alnw(aux_o, exz); + return 1; + } + } + + if(ql <= maxl && (estimate_err>>1) <= maxe) { + thre = scale_ed_thre(estimate_err, maxe); + if(cal_exz_infi_adv(z, uref, hpc_g, rref, exz, qstr, tu, qs, qe, ts, te, &pts, &pte, thre, &pthre, q_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(+)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + + thre0 = thre; thre = ql*e_rate; thre = scale_ed_thre(thre, maxe); + if(thre > thre0) { + if(cal_exz_infi_adv(z, uref, hpc_g, rref, exz, qstr, tu, qs, qe, ts, te, &pts, &pte, thre, &pthre, q_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(-)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + } + + thre0 = thre; thre <<= 1; thre = scale_ed_thre(thre, maxe); + if(thre > thre0) { + if(cal_exz_infi_adv(z, uref, hpc_g, rref, exz, qstr, tu, qs, qe, ts, te, &pts, &pte, thre, &pthre, q_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(-)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + } + + thre0 = thre; thre = ql*0.51; thre = scale_ed_thre(thre, maxe); + if(thre > thre0) { + if(cal_exz_infi_adv(z, uref, hpc_g, rref, exz, qstr, tu, qs, qe, ts, te, &pts, &pte, thre, &pthre, q_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(*)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + } + + if(ql <= force_l) { + thre = maxe; + if(cal_exz_infi_adv(z, uref, hpc_g, rref, exz, qstr, tu, qs, qe, ts, te, &pts, &pte, thre, &pthre, q_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(*)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + } + } + // fprintf(stderr, ", err::%d, thre::%d\n", INT32_MAX, exz->thre); + // if(mode == 0) { + // fprintf(stderr, "[M::%s::] pstr::%.*s\n", __func__, (int32_t)tu->length, tu->seq); + // fprintf(stderr, "[M::%s::] tstr::%.*s\n", __func__, (int32_t)(qe-qs), qstr+qs); + // } + return 0; + +} + void prt_k_mer_hit(k_mer_hit *ch_a, int64_t ch_n) { int64_t k; @@ -14935,7 +15267,7 @@ int64_t gen_single_khit(Candidates_list *cl, int64_t ch_n, int64_t h_khit, int64 ///[qs, qe) && [ts, te) int64_t gen_win_chain(overlap_region *z, Candidates_list *cl, int64_t qs, int64_t qe, int64_t ts, int64_t te, int64_t wl, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, -int64_t ql, int64_t tl, double e_rate, int64_t h_khit, int64_t mode, int64_t rid) +int64_t ql, int64_t tl, double e_rate, int64_t h_khit, int64_t mode, int64_t rid, int64_t is_accurate) { assert(mode < 3); int64_t k, ws, we, os, oe, wsk, rcn = cl->length, ncn, occ = 0, ovlp, wn = z->w_list.n; asg16_v ez; uint32_t w = 1; @@ -14979,7 +15311,7 @@ int64_t ql, int64_t tl, double e_rate, int64_t h_khit, int64_t mode, int64_t rid ncn = cl->length; cl->length = rcn; k_mer_hit *ch_a = cl->list + rcn; int64_t ch_n0 = ncn - rcn, ch_n; int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; - set_lchain_dp_op(0, h_khit, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + set_lchain_dp_op(is_accurate, h_khit, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); max_dis = MAX_SIN_L>>1; // for (k = 0; k < ch_n0; k++) { // assert(debug_k_mer_hit_retrive(&(ch_a[k]), hpc_g, rref, uref, qstr, tu, z->y_id, z->y_pos_strand)); @@ -15010,7 +15342,7 @@ int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) ts = aux_o->w_list.a[aux_i].y_start; te = aux_o->w_list.a[aux_i].y_end+1; if(qe - qs < FORCE_SIN_L || te - ts < FORCE_SIN_L) return; mode = aux_o->w_list.a[aux_i].error_threshold; - ch_n = gen_win_chain(z, cl, qs, qe, ts, te, wl, uref, hpc_g, rref, qstr, tu, exz, ql, tl, e_rate, h_khit, mode, rid); + ch_n = gen_win_chain(z, cl, qs, qe, ts, te, wl, uref, hpc_g, rref, qstr, tu, exz, ql, tl, e_rate, h_khit, mode, rid, 0); ch_a = cl->list + rcn; if(ch_n) { idx.ts = idx.te = (uint32_t)-1; idx.qs = 0; idx.qe = ql; todo = 1; @@ -15077,7 +15409,21 @@ void debug_overlap_region(overlap_region *au, char* qstr, UC_Read *tu, const ul_ for (k = 0; k < wn; k++) { assert((k<=0)||((au->w_list.a[k].x_start>au->w_list.a[k-1].x_end) &&(au->w_list.a[k].y_start>au->w_list.a[k-1].y_end))); + // if(!((k<=0)||((au->w_list.a[k].x_start>au->w_list.a[k-1].x_end) + // &&(au->w_list.a[k].y_start>au->w_list.a[k-1].y_end)))) { + // if(k > 0) { + // fprintf(stderr, "\n[M::%s::(k-1)->%ld] x::[%u, %u], y::[%u, %u]\n", + // __func__, k-1, au->w_list.a[k-1].x_start, au->w_list.a[k-1].x_end, au->w_list.a[k-1].y_start, au->w_list.a[k-1].y_end); + // fprintf(stderr, "[M::%s::(k**)->%ld] x::[%u, %u], y::[%u, %u]\n", + // __func__, k, au->w_list.a[k].x_start, au->w_list.a[k].x_end, au->w_list.a[k].y_start, au->w_list.a[k].y_end); + // } + + // } assert(au->w_list.a[k].x_end>au->w_list.a[k].x_start); + // if(!(au->w_list.a[k].y_end>au->w_list.a[k].y_start)) { + // fprintf(stderr, "[M::%s::(k**)->%ld] x::[%u, %u], y::[%u, %u]\n", + // __func__, k, au->w_list.a[k].x_start, au->w_list.a[k].x_end, au->w_list.a[k].y_start, au->w_list.a[k].y_end); + // } assert(au->w_list.a[k].y_end>au->w_list.a[k].y_start); if(is_ualn_win(au->w_list.a[k]) || is_est_aln(au->w_list.a[k])) continue; ez.cigar.a = au->w_list.c.a + au->w_list.a[k].cidx; @@ -15264,19 +15610,103 @@ bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, int64_t tl, u } } -void cigar_gen_by_chain_adv_local(overlap_region *z, Candidates_list *cl, ul_ov_t *ov, int64_t on, uint64_t wl, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, -UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, uint64_t rid, int64_t h_khit) + +void hc_ovlp_base_direct(overlap_region *z, k_mer_hit *ch_a, int64_t ch_n, int64_t wl, All_reads *rref, char* qstr, UC_Read *tu, +bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, int64_t tl, uint64_t rid) { - if(on <= 0) return; - int64_t ch_idx = z->shared_seed, ch_n; - int64_t i, tl, id = z->y_id, m; - k_mer_hit *ch_a = cl->list + ch_idx; - if(hpc_g) tl = hpc_len(*hpc_g, id); - else if(uref) tl = uref->ug->u.a[id].len; - else tl = Get_READ_LENGTH((*rref), id); - for (i = ch_idx; i < cl->length && cl->list[i].readID == cl->list[ch_idx].readID; i++); ch_n = i-ch_idx; - - // fprintf(stderr, "[M::%s::rid->%ld] utg%.6dl(%c), z::[%u, %u)\n", + int64_t i, l, mode, q[2], t[2], qr, tr, is_done, zn; + + if(z->non_homopolymer_errors == 0 && z->w_list.n) { + zn = z->w_list.n; + for (i = 1; i < zn; i++) { + if((z->w_list.a[i].error == 0 && z->w_list.a[i-1].error == 0) && (z->w_list.a[i].x_start == z->w_list.a[i-1].x_end + 1) && + (z->w_list.a[i].y_end == (z->w_list.a[i-1].y_end + (z->w_list.a[i].x_end-z->w_list.a[i-1].x_end)))) { + continue; + } + break; + } + if(i >= zn) { + q[0] = z->w_list.a[0].x_start; q[1] = z->w_list.a[z->w_list.n-1].x_end; + t[1] = z->w_list.a[z->w_list.n-1].y_end; t[0] = z->w_list.a[0].y_end - (z->w_list.a[0].x_end-z->w_list.a[0].x_start); + + if(q[0] <= t[0]) { + t[0] -= q[0]; q[0] = 0; + } else { + q[0] -= t[0]; t[0] = 0; + } + + qr = ql-q[1]-1; tr = tl-t[1]-1; + if(qr <= tr) { + q[1] = ql-1; t[1] += qr; + } else { + t[1] = tl-1; q[1] += tr; + } + + if(q[0] == z->w_list.a[0].x_start && q[1] == z->w_list.a[z->w_list.n-1].x_end) { + // fprintf(stderr, "[M::%s::%u->%u::%c] ovlp::%u, w_list.n::%u\n", __func__, z->x_id, z->y_id+1, "+-"[z->y_pos_strand], z->x_pos_e+1-z->x_pos_s, (uint32_t)z->w_list.n); + set_exact_exz(exz, q[0], q[1] + 1, t[0], t[1] + 1); push_alnw(aux_o, exz); + return; + } + } + } + + for (l = -1, i = 0; i <= ch_n; i++) { + q[0] = q[1] = t[0] = t[1] = mode = -1; is_done = 0; + if(l >= 0) { + q[0] = ch_a[l].self_offset; t[0] = ch_a[l].offset; + } else { + q[0] = 0; + } + + if(i < ch_n) { + q[1] = ch_a[i].self_offset; t[1] = ch_a[i].offset; + } else { + q[1] = ql; + } + + if((t[0] != -1) && (t[1] != -1)) { + mode = 0;//global + } else if((t[0] != -1) && (t[1] == -1)) { + mode = 1;///forward extension + } else if((t[0] == -1) && (t[1] != -1)) { + mode = 2;///backward extension + } else { + mode = 3;///no primary hit within [ibeg, iend] + } + + // if(z->x_id == 57 && z->y_id == 2175) { + if(mode == 1 || mode == 2) adjust_ext_offset(&(q[0]), &(q[1]), &(t[0]), &(t[1]), ql, tl, 0, mode); + // fprintf(stderr, "#[M::%s::] utg%.6dl(%c), q::[%ld, %ld), t::[%ld, %ld), mode::%ld\n", + // __func__, (int32_t)z->y_id+1, "+-"[z->y_pos_strand], q[0], q[1], t[0], t[1], mode); + // } + is_done = hc_aln_exz_adv_hc(z, NULL, NULL, rref, qstr, tu, q[0], q[1], t[0], t[1], mode, wl, exz, ql, e_rate, + MAX_SIN_L, MAX_SIN_E, FORCE_SIN_L, -1, aux_o); + + // if(z->x_id == 57 && z->y_id == 2175) { + // fprintf(stderr, "-is_done::%ld[M::%s::] utg%.6dl(%c), q::[%ld, %ld), t::[%ld, %ld), mode::%ld, ch_n::%ld\n", + // is_done, __func__, (int32_t)z->y_id+1, "+-"[z->y_pos_strand], q[0], q[1], t[0], t[1], mode, ch_n); + // } + + if(!is_done) {///postprocess + push_unmap_alnw(aux_o, q[0], q[1]-1, t[0], t[1]-1, mode); + } + l = i; + } +} + +void cigar_gen_by_chain_adv_local(overlap_region *z, Candidates_list *cl, ul_ov_t *ov, int64_t on, uint64_t wl, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, +UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, uint64_t rid, int64_t h_khit) +{ + if(on <= 0) return; + int64_t ch_idx = z->shared_seed, ch_n; + int64_t i, tl, id = z->y_id, m; + k_mer_hit *ch_a = cl->list + ch_idx; + if(hpc_g) tl = hpc_len(*hpc_g, id); + else if(uref) tl = uref->ug->u.a[id].len; + else tl = Get_READ_LENGTH((*rref), id); + for (i = ch_idx; i < cl->length && cl->list[i].readID == cl->list[ch_idx].readID; i++); ch_n = i-ch_idx; + + // fprintf(stderr, "[M::%s::rid->%ld] utg%.6dl(%c), z::[%u, %u)\n", // __func__, rid, (int32_t)z->y_id+1, "+-"[z->y_pos_strand], z->x_pos_s, z->x_pos_e+1); on = fusion_chain_ovlp(z, ch_a, ch_n, ov, on, wl, ql, tl); aux_o->w_list.n = aux_o->w_list.c.n = 0; @@ -15335,18 +15765,159 @@ UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, // } } +void rechain_aln_hc(overlap_region *z, Candidates_list *cl, overlap_region *aux_o, int64_t aux_i, int64_t wl, +const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, double e_rate, +int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) +{ + int64_t rcn = cl->length, ch_n, qs, qe, ts, te, mode, an0, an, todo; + k_mer_hit *ch_a; uint8_t q[2], t[2]; ///ul_ov_t idx; + ///[qs, qe) && [ts, te) + qs = aux_o->w_list.a[aux_i].x_start; qe = aux_o->w_list.a[aux_i].x_end+1; + ts = aux_o->w_list.a[aux_i].y_start; te = aux_o->w_list.a[aux_i].y_end+1; + if(qe - qs < FORCE_SIN_L || te - ts < FORCE_SIN_L) return; + mode = aux_o->w_list.a[aux_i].error_threshold; + ch_n = gen_win_chain(z, cl, qs, qe, ts, te, wl, uref, hpc_g, rref, qstr, tu, exz, ql, tl, e_rate, h_khit, mode, rid, 1); + ch_a = cl->list + rcn; + if(ch_n) { + todo = 1; ///idx.ts = idx.te = (uint32_t)-1; idx.qs = 0; idx.qe = ql; + if(mode == 0) {//global + // idx.qn = 0; idx.tn = ch_n - 1; + // idx.qs = ch_a[idx.qn].self_offset; + // idx.ts = ch_a[idx.qn].offset; + // idx.qe = ch_a[idx.tn].self_offset; + // idx.te = ch_a[idx.tn].offset; + assert(ch_a[0].self_offset == qs && ch_a[0].offset == ts); + assert(ch_a[ch_n-1].self_offset == qe && ch_a[ch_n-1].offset == te); + if(ch_n <= 2) todo = 0; + } else if(mode == 1) {//forward ext + // idx.qn = 0; idx.tn = ch_n; + // idx.qs = ch_a[idx.qn].self_offset; + // idx.ts = ch_a[idx.qn].offset; + // idx.qe = ql; + assert(ch_a[0].self_offset == qs && ch_a[0].offset == ts); + if(ch_n <= 1) todo = 0; + } else if(mode == 2) {///backward ext + // idx.qn = (uint32_t)-1; idx.tn = ch_n-1; + // idx.qs = 0; + // idx.qe = ch_a[idx.tn].self_offset; + // idx.te = ch_a[idx.tn].offset; + assert(ch_a[ch_n-1].self_offset == qe && ch_a[ch_n-1].offset == te); + if(ch_n <= 1) todo = 0; + } + if(todo) { + an0 = aux_o->w_list.n; + // if(z->x_id == 29033 && z->y_id == 21307) { + // fprintf(stderr, "[M::%s]\tan0::%ld\tq::[%u,\t%u)\tt::[%u,\t%u)\tlw::%u\trw::%u\n", __func__, an0, + // idx.qs, idx.qe, idx.ts, idx.te, idx.qn, idx.tn); + // } + // ovlp_base_aln(z, ch_a, ch_n, &idx, wl, uref, hpc_g, rref, qstr, tu, exz, aux_o, e_rate, ql, tl, (uint64_t)-1); + hc_ovlp_base_direct(z, ch_a, ch_n, wl, rref, qstr, tu, exz, aux_o, e_rate, ql, tl, (uint64_t)-1); + an = aux_o->w_list.n; q[0] = q[1] = t[0] = t[1] = 0; todo = 0; + // if(z->x_id == 29033 && z->y_id == 21307) { + // fprintf(stderr, "[M::%s]\tan::%ld\n", __func__, an); + // } + // fprintf(stderr, "[M::%s::] awn0::%ld, awn::%lu\n", __func__, an0, an); + ///old unaligned window could be replaced by the new aligned window + if((an == (an0 + 1)) && (!(is_ualn_win(aux_o->w_list.a[an-1])))) { + if(aux_o->w_list.a[aux_i].x_start == aux_o->w_list.a[an-1].x_start) q[0] = 1; + if(aux_o->w_list.a[aux_i].x_end == aux_o->w_list.a[an-1].x_end) q[1] = 1; + if(aux_o->w_list.a[aux_i].y_start == aux_o->w_list.a[an-1].y_start) t[0] = 1; + if(aux_o->w_list.a[aux_i].y_end == aux_o->w_list.a[an-1].y_end) t[1] = 1; + if((mode == 0) && q[0] && q[1] && t[0] && t[1]) todo = 1; + if((mode == 1) && q[0] && t[0]) todo = 1; + if((mode == 2) && q[1] && t[1]) todo = 1; + if(todo) { + aux_o->w_list.a[aux_i] = aux_o->w_list.a[an-1]; aux_o->w_list.n--; + } + } + // if(an > an0) {///should always > 0 as there are unmapped windows + // } + // aux_o->w_list.n = an0; + } + } + cl->length = rcn;///must reset!!!! +} + +uint64_t gen_hc_fast_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, uint64_t rid, int64_t h_khit) +{ + int64_t ch_idx = z->shared_seed, ch_n; + int64_t i, tl, id = z->y_id, m, tot_e, aln; + k_mer_hit *ch_a = cl->list + ch_idx; + tl = Get_READ_LENGTH((*rref), id); + for (i = ch_idx; i < cl->length && cl->list[i].readID == cl->list[ch_idx].readID; i++); ch_n = i-ch_idx; + if(ch_n <= 0) return 0; + + + // fprintf(stderr, "[M::%s::rid->%ld] utg%.6dl(%c), z::[%u, %u)\n", + // __func__, rid, (int32_t)z->y_id+1, "+-"[z->y_pos_strand], z->x_pos_s, z->x_pos_e+1); + aux_o->w_list.n = aux_o->w_list.c.n = 0; + aux_o->y_id = z->y_id; aux_o->y_pos_strand = z->y_pos_strand; + aux_o->x_pos_s = z->x_pos_s; aux_o->x_pos_e = z->x_pos_e; + aux_o->y_pos_s = z->y_pos_s; aux_o->y_pos_e = z->y_pos_e; + + hc_ovlp_base_direct(z, ch_a, ch_n, wl, rref, qstr, tu, exz, aux_o, e_rate, ql, tl, rid); + + + int64_t aux_n = aux_o->w_list.n; + for (i = 0; i < aux_n; i++) { + if(!(is_ualn_win(aux_o->w_list.a[i]))) continue; + // if((aux_o->w_list.a[i].x_end+1-aux_o->w_list.a[i].x_start) <= FORCE_CNS_L) { + // fprintf(stderr, "[aln::-i->%ld::ql->%d] q::[%d, %d), t::[%d, %d), err::%d, clen::%u, mode::%d\n", i, + // aux_o->w_list.a[i].x_end+1-aux_o->w_list.a[i].x_start, + // aux_o->w_list.a[i].x_start, aux_o->w_list.a[i].x_end+1, + // aux_o->w_list.a[i].y_start, aux_o->w_list.a[i].y_end+1, + // aux_o->w_list.a[i].error, aux_o->w_list.a[i].clen, aux_o->w_list.a[i].error_threshold); + // } + //will overwrite ch_a; does not matter + rechain_aln_hc(z, cl, aux_o, i, wl, NULL, NULL, rref, qstr, tu, exz, e_rate, ql, tl, h_khit, rid); + } + + if(((int64_t)aux_o->w_list.n) > aux_n) { + for (i = m = 0; i < ((int64_t)aux_o->w_list.n); i++) { + if((i < aux_n) && (is_ualn_win(aux_o->w_list.a[i]))) continue; + aux_o->w_list.a[m++] = aux_o->w_list.a[i]; + } + aux_o->w_list.n = m; + radix_sort_window_list_xs_srt(aux_o->w_list.a, aux_o->w_list.a+aux_o->w_list.n); + } + + ///update z by aux_o + update_overlap_region(z, aux_o, ql, tl); + + aux_n = z->w_list.n; + for (i = tot_e = aln = 0; i < aux_n; i++) { + if(is_ualn_win(z->w_list.a[i])) { + tot_e += z->w_list.a[i].x_end + 1 - z->w_list.a[i].x_start; + } else { + tot_e += z->w_list.a[i].error; aln += z->w_list.a[i].x_end + 1 - z->w_list.a[i].x_start; + } + } + // fprintf(stderr, "[M::%s::%u->%u::%c] ovlp::%u, aln::%ld, tot_e::%ld, w_list.n::%u, ch_n::%ld\n", + // __func__, z->x_id, z->y_id+1, "+-"[z->y_pos_strand], z->x_pos_e+1-z->x_pos_s, aln, tot_e, (uint32_t)z->w_list.n, ch_n); + + // debug_overlap_region(aux_o, qstr, tu, NULL, NULL, rref); + + + // ch_a = cl->list + ch_idx; //update + // for (i = 0; i < wn; i++) z->w_list.a[i].clen = 0;///clean cigar + // if(on > 1) { + // fprintf(stderr, "[M::%s::] rid::%lu, on::%ld\n", __func__, rid, on); + // } + // if(z->y_id == 126) prt_k_mer_hit(ch_a, ch_n); + // for (i = ch_i = 0; i < on; i++) { + // assert((i<=0)||(ov[i].qs > ov[i-1].qe)); + // ov[i].sec = 16;///do not know the aln type + // ch_i = sub_base_aln(z, dp, ch_a, ch_n, pe, ov[i].qs, ov[i].qe, wl, uref, hpc_g, rref, qstr, tu, exz, e_rate, ql, tl, ch_i, rid); + // pe = ov[i].qe; + // } + + return 1; +} + #define gen_err_unaligned(xl, yl) (((xl)<=FORCE_SIN_L)?(MAX((xl), (yl))):MAX((MIN((xl), (yl))), ((xl*0.51)+1))) -#define ovlp_id(x) ((x).tn) -#define ovlp_min_wid(x) ((x).ts) -#define ovlp_max_wid(x) ((x).te) -#define ovlp_cur_wid(x) ((x).qn) -#define ovlp_cur_xoff(x) ((x).qs) -#define ovlp_cur_coff(x) ((x).qe) -#define ovlp_bd(x) ((x).sec) - int64_t retrieve_cigar_err_debug(bit_extz_t *ez, int64_t s, int64_t e) { if(!ez->cigar.n) return 0; @@ -15880,6 +16451,87 @@ int64_t extract_sub_cigar_err_rr(overlap_region *z, int64_t s, int64_t e, ul_ov_ return err; } +///[s, e) +int64_t extract_sub_cigar_hc(overlap_region *z, All_reads *rref, haplotype_evdience_alloc* hp, char* qstr, UC_Read* tu, int64_t s, int64_t e, ul_ov_t *p, int64_t set_f, uint8_t *f, uint8_t occ_thres) +{ + int64_t wk = ovlp_cur_wid(*p), xk = ovlp_cur_xoff(*p), yk = ovlp_cur_yoff(*p), ck = ovlp_cur_coff(*p), os, oe, t; + bit_extz_t ez; int64_t bd = ovlp_bd(*p), s0, e0; char *ystr; + s0 = ((int64_t)(z->w_list.a[wk].x_start)) + bd; + e0 = ((int64_t)(z->w_list.a[wk].x_end)) + 1 - bd; + if(s < s0) s = s0; if(e > e0) e = e0;///exclude boundary + if(s >= e) return -1; + os = MAX(s, s0); oe = MIN(e, e0); + if(oe <= os) return -1; + + set_bit_extz_t(ez, (*z), wk); + if(!ez.cigar.n) return -1; + int64_t cn = ez.cigar.n, op; int64_t ws, we, ovlp, xk0, yk0, ck0; haplotype_evdience ev; + xk0 = xk; yk0 = yk; ck0 = ck; ///for assertion + if((ck < 0) || (ck > cn)) {//(*ck) == cn is allowed + ck = 0; xk = ez.ts; yk = ez.ps; + } + + while (ck > 0 && xk > s) {///x -> t; y -> p + --ck; + op = ez.cigar.a[ck]>>14; + if(op!=2) xk -= (ez.cigar.a[ck]&(0x3fff)); + if(op!=3) yk -= (ez.cigar.a[ck]&(0x3fff)); + } + + if(set_f) { + ovlp_cur_ylen(*p) = 0; + xk0 = xk; yk0 = yk; ck0 = ck; + } else { + assert(xk0 == xk); assert(yk0 == yk); assert(ck0 == ck); + UC_Read_resize(*tu, ovlp_cur_ylen(*p)); ystr = tu->seq; + recover_UC_Read_sub_region(ystr, yk0, ovlp_cur_ylen(*p), z->y_pos_strand, rref, z->y_id); + } + + //some cigar will span s or e + while (ck < cn && xk < e) {//[s, e) + ws = xk; + op = ez.cigar.a[ck]>>14; + if(op!=2) xk += (ez.cigar.a[ck]&(0x3fff)); + if(op!=3) yk += (ez.cigar.a[ck]&(0x3fff)); + ck++; we = xk; + if(op != 0 && op != 1) continue;///only collect match/snp + + os = MAX(s, ws); oe = MIN(e, we); + ovlp = ((oe>os)? (oe-os):0); + if(!ovlp) continue; + + if(set_f) { + if(op == 1) { + for (t = os; t < oe; t++) { + f[t-s] = ((f[t-s]<=126)?(f[t-s]+1):(127)); + } + } + } else { + if(op == 1 || op == 0) { + for (t = os; t < oe; t++) { + if(f[t-s] > occ_thres) { + ev.misBase = ystr[t-xk+yk-yk0]; + ev.overlapID = ovlp_id(*p); + ev.site = t; + ev.overlapSite = t-xk+yk; + ev.type = op; + ev.cov = 1; + addHaplotypeEvdience(hp, &ev, NULL); + } + } + } + } + } + + if(set_f) { + ovlp_cur_xoff(*p) = xk0; ovlp_cur_yoff(*p) = yk0; ovlp_cur_coff(*p) = ck0; ovlp_cur_ylen(*p) = yk - yk0; + } else { + ovlp_cur_xoff(*p) = xk; ovlp_cur_yoff(*p) = yk; ovlp_cur_coff(*p) = ck; ovlp_cur_ylen(*p) = 0; + } + + return 1; +} + uint64_t is_mask_ov(mask_ul_ov_t *mk, uint64_t *bes_id, uint64_t bes_n, uint64_t sec_id) { @@ -15998,6 +16650,22 @@ uint64_t gen_region_phase_robust_rr(overlap_region* ol, uint64_t *id_a, uint64_t return id_n; } +uint64_t hc_phase_robust_rr(overlap_region* ol, All_reads *rref, haplotype_evdience_alloc* hp, char* qstr, UC_Read* tu, uint64_t *id_a, uint64_t id_n, uint64_t s, uint64_t e, ul_ov_t *c_idx, int64_t set_f, uint8_t occ_thres) +{ + uint64_t k, q[2], rr = 0, os, oe; ul_ov_t *p; overlap_region *z; + for (k = 0; k < id_n; k++) { + p = &(c_idx[id_a[k]]); z = &(ol[ovlp_id(*p)]); + q[0] = z->w_list.a[ovlp_cur_wid(*p)].x_start+ovlp_bd(*p); + q[1] = z->w_list.a[ovlp_cur_wid(*p)].x_end+1-ovlp_bd(*p); + if(q[1] <= e) rr = 1; + os = MAX(q[0], s); oe = MIN(q[1], e); + if(oe > os) { + extract_sub_cigar_hc(z, rref, hp, qstr, tu, os, oe, p, set_f, hp->flag + os - s, occ_thres); + } + } + return rr; +} + int64_t infer_rovlp(ul_ov_t *li, ul_ov_t *lj, uc_block_t *bi, uc_block_t *bj, All_reads *ridx, ma_ug_t *ug) { @@ -16454,6 +17122,185 @@ void rphase_rr(overlap_region_alloc* ol, const ul_idx_t *uref, const ug_opt_t *u } } +void debug_inter(overlap_region_alloc* ol, kv_ul_ov_t *c_idx, uint64_t *idx, int64_t idx_n, uint64_t *res, int64_t res_n, int64_t s, int64_t e) +{ + ul_ov_t *cp; int64_t q[2], a_n = 0, i, k = 0, os, oe; + for (i = 0; i < idx_n; i++) { + cp = &(c_idx->a[(uint32_t)idx[i]]); + q[0] = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + os = MAX(q[0], s); oe = MIN(q[1], e); + if(oe > os) { + a_n++; assert(((uint32_t)idx[i]) == res[k++]); + } + } + // if(a_n != res_n) { + // fprintf(stderr, "[M::%s] a_n::%ld\tres_n::%ld\ts::%ld\te::%ld\n", __func__, a_n, res_n, s, e); + // } + assert(a_n == res_n); +} + +void debug_snp_site(overlap_region* ol, All_reads *rref, UC_Read *qu, haplotype_evdience *a, int64_t a_n) +{ + int64_t k; char ystr; + for (k = 0; k < a_n; k++) { + recover_UC_Read_sub_region(&ystr, a[k].overlapSite, 1, ol[a[k].overlapID].y_pos_strand, rref, ol[a[k].overlapID].y_id); + // if(a[k].misBase == ystr) fprintf(stderr, "[M::%s] misBase::%c\tystr::%c\n", __func__, a[k].misBase, ystr); + assert(a[k].misBase == ystr); + if(a[k].type == 0) { + assert(qu->seq[a[k].site] == ystr); + } else { + assert(qu->seq[a[k].site] != ystr); + } + } + +} + +void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_alloc* hp, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, int64_t wl, int64_t ql, uint8_t occ_thres) +{ + int64_t on = ol->length, k, i, zwn, q[2]; + uint64_t m, l0, wi, wl0, si, ei, fi; overlap_region *z; ul_ov_t *cp; + kv_resize(uint64_t, *idx, (ol->length)); + kv_resize(ul_ov_t, *c_idx, ol->length); + + for (k = idx->n = c_idx->n = 0; k < on; k++) { + z = &(ol->list[k]); zwn = z->w_list.n; + // z->align_length = 0; z->overlapLen = (uint32_t)-1; z->non_homopolymer_errors = 0; + if(!zwn) continue; + for (i = 0; i < zwn; i++) { + if(is_ualn_win(z->w_list.a[i])) continue; + q[0] = z->w_list.a[i].x_start; q[1] = z->w_list.a[i].x_end; + q[0] += bd; q[1] -= bd; + if(q[1] >= q[0]) { + m = ((uint64_t)q[0]); m <<= 32; + m += c_idx->n; kv_push(uint64_t, *idx, m); + + kv_pushp(ul_ov_t, *c_idx, &cp); + ovlp_id(*cp) = k; ///ovlp id + // ovlp_min_wid(*cp) = i; ///beg id of windows + // ovlp_max_wid(*cp) = i; ///end id of windows + ovlp_cur_wid(*cp) = i; ///cur id of windows + ovlp_cur_xoff(*cp) = z->w_list.a[i].x_start; ///cur xpos + ovlp_cur_yoff(*cp) = z->w_list.a[i].y_start; ///cur xpos + ovlp_cur_ylen(*cp) = 0; + ovlp_cur_coff(*cp) = 0; ///cur cigar off in cur window + ovlp_bd(*cp) = bd; + } + } + } + + int64_t srt_n = idx->n, s, e, t, os, oe, rm_n, rr; i = 0; + radix_sort_bc64(idx->a, idx->a+idx->n); + for (k = 1, i = 0; k < srt_n; k++) { + if (k == srt_n || (idx->a[k]>>32) != (idx->a[i]>>32)) { + if(k - i > 1) { + for (t = i; t < k; t++) { + cp = &(c_idx->a[(uint32_t)idx->a[t]]); + // s = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + // assert(s == (int64_t)(idx->a[i]>>32)); + m = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + m <<= 32; m += ((uint32_t)idx->a[t]); idx->a[t] = m; + // fprintf(stderr, "[M::%s] s::%ld\tsi::%lu\n", __func__, s, (idx->a[i]>>32)); + } + radix_sort_bc64(idx->a + i, idx->a + k); + } + i = k; + } + } + + ResizeInitHaplotypeEvdience(hp); + ///fprintf(stderr, "[M::%s] ******\n", __func__); + i = 0; s = 0; e = wl; e = ((e<=ql)?e:ql); rr = 0; + for (; s < ql; ) { + if(rr) { + // rr = 0; + for (m = rm_n = srt_n; m < idx->n; m++) { + cp = &(c_idx->a[idx->a[m]]); + q[0] = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + os = MAX(q[0], s); oe = MIN(q[1], e); + if(oe > os) { + idx->a[rm_n++] = idx->a[m]; + // if(q[1] <= e) rr = 1; + } + } + idx->n = rm_n; + } + + for (; i < srt_n; ++i) { + cp = &(c_idx->a[(uint32_t)idx->a[i]]); + q[0] = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + if(q[0] >= e) break; + os = MAX(q[0], s); oe = MIN(q[1], e); + if(oe > os) { + kv_push(uint64_t, *idx, ((uint32_t)idx->a[i])); + // if(q[1] <= e) rr = 1; + } + } + + // fprintf(stderr, "[M::%s] s::%ld, e::%ld, srt_n::%ld, idx->n::%ld\n", __func__, s, e, srt_n, (int64_t)idx->n); + // debug_inter(ol, c_idx, idx->a, srt_n, idx->a + srt_n, idx->n - srt_n, s, e); + l0 = hp->length; + rr = hc_phase_robust_rr(ol->list, rref, hp, qu->seq, tu, idx->a + srt_n, idx->n - srt_n, s, e, c_idx->a, 1, occ_thres); + for (wi = fi = ei = 0, si = ((uint64_t)-1), wl0 = e - s; wi < wl0; wi++) { + if(hp->flag[wi] > 0) { + if(hp->flag[wi] > occ_thres) { + fi = 1; hp->nn_snp++; + } + ei = wi + 1; if(si == ((uint64_t)-1)) si = wi; + } + } + + if(fi) { + rr = hc_phase_robust_rr(ol->list, rref, hp, qu->seq, tu, idx->a + srt_n, idx->n - srt_n, s, e, c_idx->a, 0, occ_thres); + if(hp->length > l0) radix_sort_haplotype_evdience_srt(hp->list + l0, hp->list + hp->length); + } + + if(ei > si) memset(hp->flag + si, 0, (ei-si)*sizeof((*(hp->flag)))); + + + s += wl; e += wl; e = ((e<=ql)?e:ql); + } + + // debug_snp_site(ol->list, rref, qu, hp->list, hp->length); + + + SetSnpMatrix(hp, &(hp->nn_snp), &(ol->length), 0, NULL); srt_n = hp->length; + for (k = 1, i = t = 0; k <= srt_n; ++k) { + if (k == srt_n || hp->list[k].site != hp->list[i].site) { + t += insert_snp_ee(hp, hp->list+i, k-i, hp->list+t, qu, NULL); + i = k; + } + } + hp->length = t; + + // generate_haplotypes_naive_advance(hap, overlap_list, NULL); + generate_haplotypes_naive_HiFi(hp, ol, 0.04, qu, NULL); + // generate_haplotypes_DP(hap, overlap_list, R_INF, g_read->length, force_repeat); + // generate_haplotypes_naive(hap, overlap_list, R_INF, g_read->length, force_repeat); + + // lable_large_indels(overlap_list, g_read->length, dumy, asm_opt.max_ov_diff_ec); + + + // for (i = k = 0, dp = old_dp = 0, beg = 0, end = -1; i < srt_n; ++i) {///[beg, end) but coordinates in idx is [, ] + // ///if idx->a.a[] is qe + // old_dp = dp; + // if ((idx->a[i]>>32)&1) { + // --dp; end = (idx->a[i]>>33)+1; + // }else { + // //meet a new overlap; the overlaps are pushed by the x_pos_s + // ++dp; end = (idx->a[i]>>33); + // kv_push(uint64_t, *idx, ((uint32_t)idx->a[i])); + // } + // if((end > beg) && (old_dp >= 2)) { + // idx->n = srt_n + gen_region_phase_robust_rr(ol->list, idx->a+srt_n, idx->n-srt_n, beg, end, old_dp, c_idx->a, buf, mk); + // } + // beg = end; + // } + +} + int64_t gen_aln_ul_ov_t(int64_t in_id, int64_t tl, overlap_region *in, ul_ov_t *ou) { int64_t zwn; memset(ou, 0, sizeof((*ou))); @@ -18588,7 +19435,7 @@ void ul_lalign(overlap_region_alloc* ol, Candidates_list *cl, const ul_idx_t *ur double e_max = err*1.5, rr; int64_t re; for (i = k = 0; i < ol->length; i++) { z = &(ol->list[i]); ovl = z->x_pos_e + 1 - z->x_pos_s; - rr = gen_extend_err_exz(z, uref, NULL, NULL, qu->seq, tu->seq, exz, v_idx?v_idx->a.a:NULL, w.window_length, -1, err, (e_max+0.000001), &re); + rr = gen_extend_err_exz(z, uref, NULL, NULL, qu->seq, tu->seq, exz, v_idx?v_idx->a.a:NULL, w.window_length, -1, err, (e_max+0.000001), THRESHOLD_MAX_SIZE, 1, &re); z->is_match = 0;///must be here; @@ -21261,6 +22108,281 @@ void ul_rid_lalign_adv(overlap_region_alloc* ol, Candidates_list *cl, const ul_i } } + +uint64_t gen_hc_fast_cigar(overlap_region *z, Candidates_list *cl, All_reads *rref, int64_t wl, char *qstr, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, int64_t rid, int64_t khit) +{ + return_t_chain(z, cl); + gen_hc_fast_cigar0(z, cl, wl, rref, qstr, tu, exz, aux_o, e_rate, ql, rid, khit); + return 1; +} + +void append_cigar(window_list *idx, window_list_alloc *res, uint16_t c, uint32_t l) +{ + if(l <= 0) return; + uint16_t c0 = (uint16_t)-1; uint32_t l0 = 0; + ///last item of old cigar + if(idx->clen > 0) { + c0 = (res->c.a[res->c.n-1]>>14); + l0 = (res->c.a[res->c.n-1]&(0x3fff)); + } + ///first item of new cigar + if(c0 == c) {l += l0; res->c.n--; idx->clen--;} + + push_trace(((asg16_v *)(&(res->c))), c, l); + idx->clen = res->c.n-idx->cidx; +} + +uint16_t adjust_gap(window_list *idx, window_list_alloc *res, char *pstr, char *tstr, int64_t pi, int64_t ti, uint16_t op0, asg16_v* buf) +{ + if(idx->clen == 0) { + append_cigar(idx, res, op0, 1); + return 0;///no move + } + if(op0 != 2 && op0 != 3) return 0;///no move + if(op0 == 2) {///more p -> y + ti--; + } else if(op0 == 3) {///more t -> x + pi--; + } + + uint16_t *ca = res->c.a+idx->cidx; + int64_t ci = idx->clen; int64_t op, cl, k, l[2]; uint16_t p, ff; + for (ci--, buf->n = ff = 0; ci >= 0; ci--) { + op = ca[ci]>>14; cl = (ca[ci]&(0x3fff)); + if(op == 2 || op == 3) { + p = op0; p <<= 14; p += 1; kv_push(uint16_t, *buf, p); + l[0] = cl; + p = op; p <<= 14; p += l[0]; kv_push(uint16_t, *buf, p); + break; + } + for (k = cl-1, l[0] = l[1] = 0; k >= 0; k--, pi--, ti--) { + if((op == 0) && (pstr[pi] != (tstr[ti]))) break; + if(op == 1) assert(pstr[pi] != (tstr[ti])); + } + l[1] = cl - k - 1; l[0] = k + 1; + if(l[1] > 0) { + p = op; p <<= 14; p += l[1]; kv_push(uint16_t, *buf, p); ff = 1; + } + + if(l[0] > 0) { + p = op0; p <<= 14; p += 1; kv_push(uint16_t, *buf, p); + } + + if(l[0] > 0) { + p = op; p <<= 14; p += l[0]; kv_push(uint16_t, *buf, p); + } + + // fprintf(stderr, "[M::%s] ci::%ld, l[0]::%ld, l[1]::%ld\n", __func__, ci, l[0], l[1]); + + if(l[0] > 0) break; + } + + // fprintf(stderr, "[M::%s] ff::%u, ci::%ld, buf->n::%lu\n", __func__, ff, ci, (uint64_t)buf->n); + + if(!ff) {///no move + append_cigar(idx, res, op0, 1); + return 0;///no move + } else if(ci >= 0) { + // if(!(idx->cidx + idx->clen == res->c.n)) { + // fprintf(stderr, "[M::%s] idx->cidx::%lu, idx->clen::%lu, res->c.n::%lu\n", __func__, (uint64_t)idx->cidx, (uint64_t)idx->clen, (uint64_t)res->c.n); + // } + assert(idx->cidx + idx->clen == res->c.n); + idx->clen = ci; res->c.n = idx->cidx + idx->clen; + for (k = ((int64_t)buf->n)-1; k >= 0; k--) { + op = buf->a[k]>>14; cl = (buf->a[k]&(0x3fff)); + append_cigar(idx, res, op, cl); + } + } else { + idx->clen = 0; res->c.n = idx->cidx + idx->clen; + append_cigar(idx, res, op0, 1); + for (k = ((int64_t)buf->n)-1; k >= 0; k--) { + op = buf->a[k]>>14; cl = (buf->a[k]&(0x3fff)); + append_cigar(idx, res, op, cl); + } + } + return 1;///no move +} + +uint16_t ajust_end_cigar(window_list *idx, window_list_alloc *res) +{ + uint16_t *ca = res->c.a+idx->cidx, p, rr = 0; int64_t ci, cn = idx->clen, op, cl; + if(cn <= 0) return rr; + for (ci = 0; ci < cn; ci++) { + op = ca[ci]>>14; cl = (ca[ci]&(0x3fff)); + if(op != 1) break; + p = 3; p <<= 14; p += cl; ca[ci] = p; idx->y_start += cl; rr = 1; + } + + for (ci = cn-1; ci >= 0; ci--) { + op = ca[ci]>>14; cl = (ca[ci]&(0x3fff)); + if(op != 1) break; + p = 3; p <<= 14; p += cl; ca[ci] = p; idx->y_end -= cl; rr = 1; + } + + // if(rr) fprintf(stderr, "[M::%s] rr::%u\n", __func__, rr); + + return rr; +} + +uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_reads *rref, char *tseq, UC_Read *pu, asg16_v* buf) +{ + // if(z->y_id == 24 && z->x_id == 25) { + // fprintf(stderr, "\n[M::%s] x_id::%u, y_id::%u, x::[%u, %u), y::[%u, %u)\n", __func__, z->x_id, z->y_id, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); + // fprintf(stderr, "[M::%s] wid::%u, x::[%d, %d), y::[%d, %d), err::%d\n", __func__, wid, z->w_list.a[wid].x_start, z->w_list.a[wid].x_end + 1, z->w_list.a[wid].y_start, z->w_list.a[wid].y_end + 1, z->w_list.a[wid].error); + // } + if(is_ualn_win((z->w_list.a[wid]))) { + kv_push(window_list, aux->w_list, (z->w_list.a[wid])); + return 0;///no move + } + + window_list *p = NULL; + bit_extz_t ez; set_bit_extz_t(ez, (*z), wid); + kv_pushp(window_list, aux->w_list, &p); + p->x_start = ez.ts; p->x_end = ez.te; + p->y_start = ez.ps; p->y_end = ez.pe; + p->error_threshold = 0; p->error = ez.err;///single round of alignment cannot have INT16_MAX errors + p->cidx = aux->w_list.c.n; p->clen = 0; ///aux->w_list.c.n += ez.cigar.n; + + if(ez.err == 0) { + push_wcigar(p, &(aux->w_list), &ez); + return 0;///no move + } + + int64_t pi = ez.ps, ti = ez.ts/**, err = 0**/, cl, op, k, rr = 0; uint64_t ci = 0, mm = 0; char *pseq = NULL; + for (ci = mm = 0; ci < ez.cigar.n; ci++) { + op = ez.cigar.a[ci]>>14; ///cl = (ez.cigar.a[ci]&(0x3fff)); + if(op < 2) { + mm = 1; + } else if(mm) { + break; + } + // if(op == 2 || op == 3) break; + } + if(ci >= ez.cigar.n) { + push_wcigar(p, &(aux->w_list), &ez); + if(ajust_end_cigar(p, &(aux->w_list))) rr = 1; + return rr; + } + + if(z->y_pos_strand) { + recover_UC_Read_RC(pu, rref, aux->y_id); + } else { + recover_UC_Read(pu, rref, aux->y_id); + } + pseq = pu->seq; + kv_resize(uint16_t, aux->w_list.c, (aux->w_list.c.n + ez.cigar.n)); + + for (ci = mm = 0; ci < ez.cigar.n; ci++) { + op = ez.cigar.a[ci]>>14; cl = (ez.cigar.a[ci]&(0x3fff)); + // if(z->y_id == 24 && z->x_id == 25) { + // fprintf(stderr, "[M::%s] op::%ld, cl::%ld, pi::%ld, ti::%ld\n", __func__, op, cl, pi, ti); + // fprintf(stderr, "[M::%s] p->cidx::%lu, p->clen::%lu, cc->n::%lu\n", __func__, (uint64_t)p->cidx, (uint64_t)p->clen, (uint64_t)aux->w_list.c.n); + // } + + if(op < 2) { + append_cigar(p, &(aux->w_list), op, cl); + pi+=cl; ti+=cl; mm = 1; + } else { + if(mm == 0) { + append_cigar(p, &(aux->w_list), op, cl); + if(op == 2) {///more p -> y + pi+=cl; + } else if(op == 3) {///more t -> x + ti+=cl; + } + } else { + for (k = 0; k < cl; k++) { + if(adjust_gap(p, &(aux->w_list), pseq, tseq, pi, ti, op, buf)) rr = 1; + if(op == 2) {///more p -> y + pi++; + } else if(op == 3) {///more t -> x + ti++; + } + } + } + } + } + + // if(rr) fprintf(stderr, "[M::%s] rr::%ld\n", __func__, rr); + + if(ajust_end_cigar(p, &(aux->w_list))) rr = 1; + return rr; +} + +void reassign_gaps(overlap_region *z, overlap_region *aux, All_reads *rref, UC_Read* qu, UC_Read* tu, int64_t ql, asg16_v* buf) +{ + // if(z->y_id != 24 || z->x_id != 25) return; + if(z->non_homopolymer_errors == 0) return; + aux->w_list.n = aux->w_list.c.n = 0; + aux->y_id = z->y_id; aux->y_pos_strand = z->y_pos_strand; + aux->x_pos_s = z->x_pos_s; aux->x_pos_e = z->x_pos_e; + aux->y_pos_s = z->y_pos_s; aux->y_pos_e = z->y_pos_e; + + int64_t k, z_n = z->w_list.n, rr = 0; + for (k = 0; k < z_n; k++) { + if(move_wins(z, k, aux, rref, qu->seq, tu, buf)) rr = 1; + } + ///update z by aux_o + if(rr) update_overlap_region(z, aux, ql, Get_READ_LENGTH((*rref), z->y_id)); + + + // fprintf(stderr, "[M::%s] rr::%ld\n", __func__, rr); + ///debug + // bit_extz_t ez; + // if(z->y_pos_strand) { + // recover_UC_Read_RC(tu, rref, z->y_id); + // } else { + // recover_UC_Read(tu, rref, z->y_id); + // } + // for (k = 0; k < z_n; k++) { + // if(is_ualn_win((z->w_list.a[k]))) continue; + // set_bit_extz_t(ez, (*z), k); + // if(!cigar_check(tu->seq, qu->seq, &ez)) { + // fprintf(stderr, "\n[M::%s] x_id::%u, y_id::%u, x::[%u, %u), y::[%u, %u)\n", __func__, z->x_id, z->y_id, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); + // exit(1); + // } + // } +} + +void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf) +{ + uint64_t i, bs, k, ql = qu->length; Window_Pool w; double err, e_max, rr; int64_t re; + overlap_region t; overlap_region *z; //asg64_v iidx, buf, buf1; + ol->mapped_overlaps_length = 0; + if(ol->length <= 0) return; + + ///base alignment + err = e_rate; e_max = err * 1.5; + init_Window_Pool(&w, ql, wl, (int)(1.0/err)); + bs = (w.window_length)+(THRESHOLD_MAX_SIZE<<1)+1; + resize_UC_Read(tu, bs<<1); + // fprintf(stderr, "[M::%s] window_length::%lld\n", __func__, w.window_length); + + for (i = k = 0; i < ol->length; i++) { + z = &(ol->list[i]); z->shared_seed = z->non_homopolymer_errors;///for index + + if(!align_hc_ed_post_extz(z, rref, qu->seq, tu->seq, exz, err, w.window_length, OVERLAP_THRESHOLD_HIFI_FILTER, 0, NULL)) continue; + + rr = gen_extend_err_exz(z, NULL, NULL, rref, qu->seq, tu->seq, exz, NULL, w.window_length, -1, err, (e_max+0.000001), THRESHOLD_MAX_SIZE, 0, &re); + z->is_match = 0; + if (rr > err) continue; + z->non_homopolymer_errors = re; + + if(!gen_hc_fast_cigar(z, cl, rref, w.window_length, qu->seq, tu, exz, aux_o, e_rate, ql, rid, khit)) continue; + + reassign_gaps(z, aux_o, rref, qu, tu, ql, buf); + + if(k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + z = &(ol->list[k++]); z->is_match = 1; z->non_homopolymer_errors = re; + } + ol->length = k; + if(ol->length <= 0) return; +} + /** void ul_raw_lalign_adv(overlap_region_alloc* ol, Candidates_list *cl, const ul_idx_t *uref, All_reads *rdb, const ug_opt_t *uopt, char *qstr, uint64_t ql, UC_Read* qu, UC_Read* tu, Correct_dumy* dumy, bit_extz_t *exz, haplotype_evdience_alloc* hap, @@ -21371,7 +22493,7 @@ void ug_lalign(overlap_region_alloc* ol, Candidates_list *cl, const ul_idx_t *ur double e_max = err*1.5, rr; int64_t re; for (i = k = 0; i < ol->length; i++) { z = &(ol->list[i]); ovl = z->x_pos_e + 1 - z->x_pos_s; - rr = gen_extend_err_exz(z, uref, NULL, NULL, in/**qu->seq**/, tu->seq, exz, NULL/**v_idx?v_idx->a.a:NULL**/, w.window_length, -1, err, (e_max+0.000001), &re); + rr = gen_extend_err_exz(z, uref, NULL, NULL, in/**qu->seq**/, tu->seq, exz, NULL/**v_idx?v_idx->a.a:NULL**/, w.window_length, -1, err, (e_max+0.000001), THRESHOLD_MAX_SIZE, 1, &re); z->is_match = 0;///must be here; // if(z->x_id == 57 && z->y_id == 2175) { diff --git a/Correct.h b/Correct.h index d56e6f2..5556a34 100644 --- a/Correct.h +++ b/Correct.h @@ -1388,4 +1388,21 @@ int64_t get_rid_backward_cigar_err(rtrace_iter *it, ul_ov_t *aln, kv_rtrace_t *t const ul_idx_t *uref, char* qstr, UC_Read *tu, overlap_region_alloc *ol, overlap_region *o, bit_extz_t *exz, double e_rate, int64_t qs); +void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf); +void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_alloc* hp, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, int64_t wl, int64_t ql, uint8_t occ_thres); + +#define ovlp_id(x) ((x).tn) +#define ovlp_min_wid(x) ((x).ts) +#define ovlp_max_wid(x) ((x).te) +#define ovlp_cur_wid(x) ((x).qn) +#define ovlp_cur_xoff(x) ((x).qs) +#define ovlp_cur_yoff(x) ((x).ts) +#define ovlp_cur_ylen(x) ((x).te) +#define ovlp_cur_coff(x) ((x).qe) +#define ovlp_bd(x) ((x).sec) + +#define UC_Read_resize(v, s) do {\ + if ((v).size<(s)) {REALLOC((v).seq,(s));(v).size=(s);}\ + } while (0) + #endif diff --git a/Hash_Table.cpp b/Hash_Table.cpp index 0626305..58c70bb 100644 --- a/Hash_Table.cpp +++ b/Hash_Table.cpp @@ -1986,6 +1986,252 @@ uint64_t lchain_qdp_mcopy(Candidates_list *cl, int64_t a_idx, int64_t a_n, int64 return cL; } +void quick_ck_lchain(k_mer_hit* a, int64_t a_n, int64_t xl, int64_t yl, double chn_pen_gap, double chn_pen_skip, double bw_rate, +int64_t *p, int64_t *t, int32_t *f, int32_t *ii, int64_t *plus, int64_t *msc, int64_t *msc_i, int64_t *movl, int64_t *si, int64_t *ei) +{ + if(a_n <= 0) return; + int64_t l, k, is_srt = 1, z; k_mer_hit *ai, *aj; + int64_t dq, dr, dd, dg, q_span, sc, csc, ddt; + int64_t plus0, msc0, msc_i0, movl0; double lin_pen, a_pen; + + *plus = 0; *msc = *msc_i = INT32_MIN; *movl = INT32_MAX; *si = 0; *ei = a_n; + + for (k = 1, l = 0; k <= a_n; k++) { + if(k == a_n || a[k].strand != a[l].strand) { + t[k-1] = 0; ii[k-1] = 0; + if(is_srt) { + plus0 = 0; msc0 = msc_i0 = INT32_MIN; movl0 = INT32_MAX; ddt = 0; + + + p[l] = -1; f[l] = a[l].cnt&(0xffu); + if(f[l] >= msc0) {msc0 = f[l]; msc_i0 = l;}///difference + if(f[l] < plus0) plus0 = f[l]; + + + for (z = l + 1; z < k; z++) { + ///roughly same to comput_sc_ch(&a[z], &a[z-1]) + ai = &a[z]; aj = &a[z-1]; + dq = (int64_t)(ai->self_offset) - (int64_t)(aj->self_offset); + if(dq <= 0) break; + dr = (int64_t)(ai->offset) - (int64_t)(aj->offset); + if(dr <= 0) break; + dd = dr > dq? dr - dq : dq - dr;//gap + if((dd > 16) && (dd > cal_bw(&(a[z]), &(a[z-1]), bw_rate, xl, yl))) break; + dg = dr < dq? dr : dq;//len + q_span = ai->cnt&(0xffu); + sc = q_span < dg? q_span : dg; + sc = normal_w(sc, ((int32_t)(ai->cnt>>8))); + if (dd || (dg > q_span && dg > 0)) { + lin_pen = (chn_pen_gap*(double)dd); + a_pen = ((double)(sc))*((((double)dd)/((double)dg))/bw_rate); + if(lin_pen > a_pen) lin_pen = a_pen; + lin_pen += (chn_pen_skip*(double)dg); + sc -= (int32_t)lin_pen; + } + + sc += f[z-1]; csc = a[z].cnt&(0xffu); if(sc < csc) break; + p[z] = z - 1; f[z] = sc; ddt += dd; + + if(f[z] >= msc0) {msc0 = f[z]; msc_i0 = z;}///difference + if(f[z] < plus0) plus0 = f[z]; + } + + if((z >= k) && (msc_i0 == (k - 1))) { + if((k - l >= 2) && (ddt > 16) && (ddt > cal_bw(&(a[k-1]), &(a[l]), bw_rate, xl, yl))) msc_i0 = INT32_MIN; + if(msc_i0 == (k - 1)) { + if(msc0 >= (*msc)) { + movl0 = get_chainLen(a[msc_i0].self_offset, a[msc_i0].self_offset, xl, a[msc_i0].offset, a[msc_i0].offset, yl); + if(msc0 > (*msc) || movl0 < (*movl)) { + *msc = msc0; *msc_i = msc_i0; *movl = movl0; + } + } + if(plus0 < (*plus)) *plus = plus0; + if((*ei) > k) { + (*si) = k; + } else { + (*ei) = l; + } + } + } + } + l = k; is_srt = 1; + } else { + if((a[k].self_offset <= a[k-1].self_offset) || (a[k].offset <= a[k-1].offset)) is_srt = 0; + t[k-1] = 0; ii[k-1] = 0; + } + } +} + + +uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, int64_t des_idx, + Chain_Data* dp, overlap_region_alloc* res, int64_t max_skip, int64_t max_iter, + int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, + uint32_t xid, int64_t xl, int64_t yl, int64_t quick_check, uint32_t apend_be, + int64_t gen_cigar, int64_t enable_mcopy, double mcopy_rate, int64_t mcopy_khit_cutoff, + int64_t khit_n) +{ + if(a_n <= 0) return 0; + int64_t *p, *t, max_f, n_skip, st, max_j, end_j, sc, msc, msc_i, max_ii, ovl, movl, plus = 0, min_sc, ch_n, si, ei; + int32_t *f, max, tmp, *ii; int64_t i, k, j, cL = 0; k_mer_hit* a; k_mer_hit* des; k_mer_hit *swap; overlap_region *z; + resize_Chain_Data(dp, a_n, NULL); ch_n = 1; // int64_t bw; bw = ((xl < yl)?xl:yl); bw *= bw_rate; + t = dp->tmp; f = dp->score; p = dp->pre; ii = dp->occ; + + a = cl->list + a_idx; des = cl->list + des_idx; + // if(a_n && a[0].readID == 0) { + // fprintf(stderr, "---[M::%s::utg%.6dl::%c]\n", + // __func__, (int32_t)a[0].readID+1, "+-"[a[0].strand]); + // } + if(quick_check) { + quick_ck_lchain(a, a_n, xl, yl, chn_pen_gap, chn_pen_skip, bw_rate, p, t, f, ii, &plus, &msc, &msc_i, &movl, &si, &ei); + } else { + msc = msc_i = INT32_MIN; movl = INT32_MAX; plus = 0; si = 0; ei = a_n; + memset(t, 0, (a_n*sizeof((*t)))); + } + + for (i = st = si, max_ii = -1; i < ei; ++i) { + max_f = a[i].cnt&(0xffu); + n_skip = 0; max_j = end_j = -1; + if ((i-st) > max_iter) st = i-max_iter; + while (a[i].strand != a[st].strand) ++st; + + for (j = i - 1; j >= st; --j) { + sc = comput_sc_ch(&a[i], &a[j], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl); + if (sc == INT32_MIN) continue; + sc += f[j]; + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) { + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + end_j = j; + + if ((max_ii<0) || (a[i].self_offset>a[max_ii].self_offset+max_dis) || (a[i].strand!=a[max_ii].strand)) { + max = INT32_MIN; max_ii = -1; + for (j=i-1; (j>=st) && (a[i].self_offset<=max_dis+a[j].self_offset)&&(a[i].strand==a[j].strand); --j) { + if (max < f[j]) { + max = f[j], max_ii = j; + } + } + } + + if ((max_ii >= 0) && (max_ii < end_j) && (a[i].strand == a[max_ii].strand)) {///just have a try with a[i]<->a[max_ii] + tmp = comput_sc_ch(&a[i], &a[max_ii], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl); + if (tmp != INT32_MIN && max_f < tmp + f[max_ii]) + max_f = tmp + f[max_ii], max_j = max_ii; + } + f[i] = max_f; p[i] = max_j; + if ((max_ii < 0) || ((a[i].self_offset<=max_dis+a[max_ii].self_offset)&&(a[i].strand==a[max_ii].strand)&&(f[max_ii]= msc) { + ovl = get_chainLen(a[i].self_offset, a[i].self_offset, xl, a[i].offset, a[i].offset, yl); + if(f[i] > msc || ovl < movl) { + msc = f[i]; msc_i = i; movl = ovl; + } + } + if(f[i] < plus) plus = f[i]; + ii[i] = 0;///for mcopy, not here + // if(a_n && a[0].readID == 0) { + // fprintf(stderr, "i::%ld[M::%s::utg%.6dl::%c] x::%u, y::%u, st::%ld, max_ii::%ld, f[i]::%d, p[i]::%ld, msc_i::%ld, msc::%ld, movl::%ld\n", + // i, __func__, (int32_t)a[i].readID+1, "+-"[a[i].strand], + // a[i].self_offset, a[i].offset, st, max_ii, f[i], p[i], msc_i, msc, movl); + // } + } + + for (i = msc_i, cL = 0; i >= 0; i = p[i]) { ii[i] = 1; t[cL++] = i;}///label the best chain + if((movl < xl) && enable_mcopy/**(movl < yl)**/) { + if(cL >= mcopy_khit_cutoff) {///if there are too few k-mers, disable mcopy + msc -= plus; min_sc = msc*mcopy_rate/**0.2**/; ii[msc_i] = 0; + for (i = ch_n = 0; i < a_n; ++i) {///make all f[] positive + f[i] -= plus; if(i >= ch_n) t[i] = 0; + if((!(ii[i])) && (f[i] >= min_sc)) { + t[ch_n] = ((uint64_t)f[i])<<32; t[ch_n] += (i<<1); ch_n++; + } + } + if(ch_n > 1) { + int64_t n_v, n_v0, ni, n_u, n_u0 = res->length; + radix_sort_hc64i(t, t + ch_n); + for (k = ch_n-1, n_v = n_u = 0; k >= 0; --k) { + n_v0 = n_v; + for (i = ((uint32_t)t[k])>>1; i >= 0 && (t[i]&1) == 0; ) { + ii[n_v++] = i; t[i] |= 1; i = p[i]; + } + if(n_v0 == n_v) continue; + sc = (i<0?(t[k]>>32):((t[k]>>32)-f[i])); + if(sc >= min_sc) { + kv_pushp_ol(overlap_region, (*res), &z); + push_ovlp_chain_qgen(z, xid, xl, yl, sc+plus, &(a[ii[n_v-1]]), &(a[ii[n_v0]])); + ///mcopy_khit_cutoff <= 1: disable the mcopy_khit_cutoff filtering, for the realignment + if((mcopy_khit_cutoff <= 1) || ((z->x_pos_e+1-z->x_pos_s) <= (movl<<2))) { + z->align_length = n_v-n_v0; z->x_id = n_v0; + n_u++; + } else {///non-best is too long + res->length--; n_v = n_v0; + } + } else { + n_v = n_v0; + } + } + + if(n_u > 1) ks_introsort_or_sss(n_u, res->list + n_u0); + res->length = n_u0 + filter_non_ovlp_xchains(res->list + n_u0, n_u, &n_v); + n_u = res->length; + if(n_u > n_u0 + 1) { + kv_resize_cl(k_mer_hit, (*cl), (n_v+cl->length)); + a = cl->list + a_idx; des = cl->list + des_idx; swap = cl->list + cl->length; + for (k = n_u0, i = n_v0 = n_v = 0; k < n_u; k++) { + z = &(res->list[k]); + z->non_homopolymer_errors = des_idx + i; + n_v0 = z->x_id; ni = z->align_length; + for (j = 0; j < ni; j++, i++) { + ///k0 + (ni - j - 1) + swap[i] = a[ii[n_v0 + (ni- j - 1)]]; + swap[i].readID = k; + } + z->x_id = xid; + if(gen_cigar) gen_fake_cigar(&(z->f_cigar), z, apend_be, swap+i-ni, ni); + if(!khit_n) z->align_length = 0; + } + memcpy(des, swap, i*sizeof((*swap))); //assert(i == ch_n); + + // fprintf(stderr, "[M::%s::msc->%ld] msc_k_hits::%u, cL::%ld, min_sc::%ld, best_sc::%ld, n_u0_sc::%d, mcopy_rate::%f, # chains::%ld\n", + // __func__, msc, res->list[n_u0].align_length, cL, min_sc, msc+plus, res->list[n_u0].shared_seed, + // mcopy_rate, n_u-n_u0); + } else if(n_u == n_u0 + 1) { + z = &(res->list[n_u0]); k = n_u0; i = 0; + z->non_homopolymer_errors = des_idx + i; + n_v0 = z->x_id; ni = z->align_length; + for (j = 0; j < ni; j++, i++) { + ///k0 + (ni - j - 1) + des[i] = a[ii[n_v0 + (ni- j - 1)]]; + des[i].readID = k; + } + z->x_id = xid; + if(gen_cigar) gen_fake_cigar(&(z->f_cigar), z, apend_be, des+i-ni, ni); + if(!khit_n) z->align_length = 0; + } + return i; + } else { + msc += plus; i = msc_i; cL = 0; + while (i >= 0) {t[cL++] = i; i = p[i];} + } + } + } + ///a[] has been sorted by self_offset + // i = msc_i; cL = 0; + // while (i >= 0) {t[cL++] = i; i = p[i];} + kv_pushp_ol(overlap_region, (*res), &z); + push_ovlp_chain_qgen(z, xid, xl, yl, msc, &(a[t[cL-1]]), &(a[t[0]])); + for (i = 0; i < cL; i++) {des[i] = a[t[cL-i-1]]; des[i].readID = res->length-1;} + z->non_homopolymer_errors = des_idx; + if(gen_cigar) gen_fake_cigar(&(z->f_cigar), z, apend_be, des, cL); + if(khit_n) z->align_length = cL; + return cL; +} #define rev_khit(an, xl, yl) do { \ diff --git a/Hash_Table.h b/Hash_Table.h index e6f05c9..8ab6142 100644 --- a/Hash_Table.h +++ b/Hash_Table.h @@ -8,6 +8,7 @@ #define WINDOW 375 #define WINDOW_BOUNDARY 375 +#define WINDOW_HC 775 ///for one side, the first or last WINDOW_UNCORRECT_SINGLE_SIDE_BOUNDARY bases should not be corrected #define WINDOW_UNCORRECT_SINGLE_SIDE_BOUNDARY 25 #define THRESHOLD 15 @@ -236,4 +237,11 @@ uint64_t lchain_qdp_mcopy(Candidates_list *cl, int64_t a_idx, int64_t a_n, int64 int64_t gen_cigar, int64_t enable_mcopy, double mcopy_rate, int64_t mcopy_khit_cutoff, int64_t khit_n); +uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, int64_t des_idx, + Chain_Data* dp, overlap_region_alloc* res, int64_t max_skip, int64_t max_iter, + int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, + uint32_t xid, int64_t xl, int64_t yl, int64_t quick_check, uint32_t apend_be, + int64_t gen_cigar, int64_t enable_mcopy, double mcopy_rate, int64_t mcopy_khit_cutoff, + int64_t khit_n); + #endif diff --git a/Levenshtein_distance.h b/Levenshtein_distance.h index 592f1b5..612a9b9 100644 --- a/Levenshtein_distance.h +++ b/Levenshtein_distance.h @@ -548,6 +548,24 @@ inline int32_t pop_trace_back(asg16_v *res, int32_t i, uint16_t *c, uint32_t *le return i; } +inline void push_trace_bp(asg16_v *res, uint16_t c, uint16_t b, uint32_t len, uint32_t is_append) +{ + uint16_t p; + if((is_append) && (res->n)) { + + } + + + + c <<= 14; + while (len >= (0x3fff)) { + p = (c + (0x3fff)); kv_push(uint16_t, *res, p); len -= (0x3fff); + } + if(len) { + p = (c + len); kv_push(uint16_t, *res, p); + } +} + ///511 -> 16 64-bits // #define MAX_E 511 // #define MAX_L 2500 @@ -601,7 +619,7 @@ inline uint32_t cigar_check(char *pstr, char *tstr, bit_extz_t *ez) if(c == 0) { for (k=0;(klength = ab->n_a; } + +void minimizers_qgen0(ha_abuf_t *ab, char* rs, int64_t rl, uint64_t mz_w, uint64_t mz_k, Candidates_list *cl, kvec_t_u8_warp* k_flag, +void *ha_flt_tab, ha_pt_t *ha_idx, All_reads* rdb, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ) +{ + // fprintf(stderr, "+[M::%s]\n", __func__); + uint64_t i, k, l, max_cnt = UINT32_MAX, min_cnt = 0; int n, j; ha_mz1_t *z; seed1_t *s; + if(high_occ) { + max_cnt = (*high_occ); + if(max_cnt < 2) max_cnt = 2; + } + if(low_occ) { + min_cnt = (*low_occ); + if(min_cnt < 2) min_cnt = 2; + } + clear_Candidates_list(cl); ab->mz.n = 0, ab->n_a = 0; + + // get the list of anchors + mz1_ha_sketch(rs, rl, mz_w, mz_k, 0, !(asm_opt.flag & HA_F_NO_HPC), &ab->mz, ha_flt_tab, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL); + + // minimizer of queried read + if (ab->mz.m > ab->old_mz_m) { + ab->old_mz_m = ab->mz.m; + REALLOC(ab->seed, ab->old_mz_m); + } + + for (i = 0, ab->n_a = 0; i < ab->mz.n; ++i) { + + ab->seed[i].a = ha_pt_get(ha_idx, ab->mz.a[i].x, &n); + ab->seed[i].n = n; + ab->n_a += n; + } + + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + REALLOC(ab->a, ab->m_a); + } + + for (i = 0, k = 0; i < ab->mz.n; ++i) { + ///z is one of the minimizer + z = &ab->mz.a[i]; s = &ab->seed[i]; + for (j = 0; j < s->n; ++j) { + const ha_idxpos_t *y = &s->a[j]; + anchor1_t *an = &ab->a[k++]; + uint8_t rev = z->rev == y->rev? 0 : 1; + an->other_off = rev?((uint32_t)-1)-1-(y->pos+1-y->span):y->pos; + an->self_off = z->pos; + ///an->cnt: cnt<<8|span + an->cnt = s->n; if(an->cnt > ((uint32_t)(0xffffffu))) an->cnt = 0xffffffu; + an->cnt <<= 8; an->cnt |= ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->self_off; + } + } + + // copy over to _cl_ + if (ab->m_a >= (uint64_t)cl->size) { + cl->size = ab->m_a; + REALLOC(cl->list, cl->size); + } + + k_mer_hit *p; uint64_t tid = (uint64_t)-1, tl = (uint64_t)-1; + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); + for (k = 1, l = 0; k <= ab->n_a; ++k) { + if (k == ab->n_a || ab->a[k].srt != ab->a[l].srt) { + if (k-l>1) radix_sort_ha_an3(ab->a+l, ab->a+k); + if((ab->a[l].srt>>33)!=tid) { + tid = ab->a[l].srt>>33; + tl = Get_READ_LENGTH((*rdb), tid); + // tl = rdb?Get_READ_LENGTH((*rdb), tid):udb->ug->u.a[tid].len; + } + for (i = l; i < k; i++) { + p = &cl->list[i]; + p->readID = ab->a[i].srt>>33; + p->strand = (ab->a[i].srt>>32)&1; + if(!(p->strand)) { + p->offset = ab->a[i].other_off; + } else { + p->offset = ((uint32_t)-1)-ab->a[i].other_off; + p->offset = tl-p->offset; + } + p->self_offset = ab->a[i].self_off; + if(((ab->a[i].cnt>>8) < max_cnt) && ((ab->a[i].cnt>>8) > min_cnt)){ + p->cnt = 1; + } else if((ab->a[i].cnt>>8) <= min_cnt) { + p->cnt = 2; + } else{ + p->cnt = 1 + (((ab->a[i].cnt>>8) + (max_cnt<<1) - 1)/(max_cnt<<1)); + p->cnt = pow(p->cnt, 1.1); + } + if(p->cnt > ((uint32_t)(0xffffffu))) p->cnt = 0xffffffu; + p->cnt <<= 8; p->cnt |= (((uint32_t)(0xffu))&(ab->a[i].cnt)); + } + l = k; + } + } + cl->length = ab->n_a; +} + void gen_pair_chain(ha_abufl_t *ab, uint64_t rid, st_mt_t *tid, uint64_t tid_n, ha_mzl_t *in, uint64_t in_n, ha_mzl_t *idx, int64_t idx_n, uint64_t mzl_cutoff) { if(!tid_n) return; @@ -1615,6 +1712,155 @@ void lchain_qgen_mcopy(Candidates_list* cl, overlap_region_alloc* ol, uint32_t r for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; } +void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, All_reads* rdb, + uint32_t apend_be, uint64_t max_n_chain, int64_t max_skip, int64_t max_iter, + int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, int64_t quick_check, + uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, st_mt_t *sp) +{ + // fprintf(stderr, "+[M::%s]\n", __func__); + uint64_t i, k, l, m, cn = cl->length, yid, ol0, lch; overlap_region *r, t; ///srt = 0 + clear_overlap_region_alloc(ol); + + for (l = 0, k = 1, m = 0, lch = 0; k <= cn; k++) { + if((k == cn) || (cl->list[k].readID != cl->list[l].readID)) { + if(cl->list[l].readID != rid) { + yid = cl->list[l].readID; ol0 = ol->length; + m += lchain_qdp_mcopy_fast(cl, l, k-l, m, &(cl->chainDP), ol, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_rate, + rid, rl, Get_READ_LENGTH((*rdb), yid), quick_check, apend_be, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, 1); + if((chain_cutoff >= 2) && (!lch)) { + for (i = ol0; (ilength) && (!lch); i++) { + if(ol->list[i].align_length < chain_cutoff) lch = 1; + } + } + } + l = k; + } + } + cl->length = m; + + // for (k = 0; k < ol->length; k++) { + // fprintf(stderr, "---[M::%s::utg%.6dl] q[%d, %d), t[%d, %d), khit_off::%u\n", __func__, + // (int32_t)ol->list[k].y_id+1, ol->list[k].x_pos_s, ol->list[k].x_pos_e+1, + // ol->list[k].y_pos_s, ol->list[k].y_pos_e+1, ol->list[k].non_homopolymer_errors); + // } + + k = ol->length; + if (ol->length > max_n_chain) { + int32_t w, n[4], s[4]; + n[0] = n[1] = n[2] = n[3] = 0, s[0] = s[1] = s[2] = s[3] = 0; + ks_introsort_or_ss(ol->length, ol->list); + for (i = 0; i < ol->length; ++i) { + r = &(ol->list[i]); + w = ha_ov_type(r, rl); + ++n[w]; + if (((uint64_t)n[w]) == max_n_chain) s[w] = r->shared_seed; + } + if (s[0] > 0 || s[1] > 0 || s[2] > 0 || s[3] > 0) { + // n[0] = n[1] = n[2] = n[3] = 0; + for (i = 0, k = 0, lch = 0; i < ol->length; ++i) { + r = &(ol->list[i]); + w = ha_ov_type(r, rl); + // ++n[w]; + // if (((int)n[w] <= max_n_chain) || (r->shared_seed >= s[w] && s[w] >= (asm_opt.k_mer_length<<1))) { + if (r->shared_seed >= s[w]) { + if (k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + if(ol->list[k].align_length < chain_cutoff) lch = 1; + ++k; + } + } + ol->length = k; + } + } + + ks_introsort_or_xs(ol->length, ol->list); + if(lch) { + //@brief r485 + uint64_t zs, ze, rs, re, ob, os, oe, ocn, pp, kn, ms, me; int64_t osc; + for (i = l = 0, cn = cl->length; i < ol->length; ++i) { + if(ol->list[i].align_length < chain_cutoff) { + zs = ol->list[i].x_pos_s; ze = ol->list[i].x_pos_e + 1; + ob = (ze - zs)*OFL; if(ob < 16) ob = 16; + osc = ol->list[i].shared_seed*CH_SC; + ocn = ol->list[i].align_length<length) && (ze > ol->list[k].x_pos_s); k++) { + if(ol->list[k].align_length < chain_cutoff) continue; + if(ol->list[k].align_length < ocn) continue; + if(ol->list[k].shared_seed < osc) continue; + rs = ol->list[k].x_pos_s; re = ol->list[k].x_pos_e + 1; + os = ((rs>=zs)?rs:zs); oe = ((re<=ze)?re:ze); + if((oe > os) && (oe - os) >= ob) { + m = ol->list[k].non_homopolymer_errors; + pp = cl->list[m].readID; kn = 0; + for (; (m < cn) && (cl->list[m].readID == pp) && (kn < ocn); m++) { + me = cl->list[m].self_offset; ms = me - (cl->list[m].cnt&(0xffu)); + if((ms >= os) && (me <= oe)) kn++; + } + if(kn >= ocn) break; + } + } + if((k < ol->length) && (ze > ol->list[k].x_pos_s)) continue; + } + if (l != i) { + t = ol->list[l]; + ol->list[l] = ol->list[i]; + ol->list[i] = t; + } + l++; + } + // fprintf(stderr, "+[M::%s] rid::%u, ol->length0::%lu, ol->length1::%lu\n", __func__, rid, ol->length, l); + ol->length = l; + + + /** + //@brief r484 + for (i = sp->n = 0; i < ol->length; ++i) { + if(ol->list[i].align_length < chain_cutoff) continue; + os = ol->list[i].x_pos_s; oe = ol->list[i].x_pos_e + 1; + if((sp->n) && (((uint32_t)sp->a[sp->n-1]) >= os)) { + if(oe > ((uint32_t)sp->a[sp->n-1])) { + oe = oe - ((uint32_t)sp->a[sp->n-1]); + sp->a[sp->n-1] += oe; + } + } else { + os = (os<<32)|oe; kv_push(uint64_t, *sp, os); + } + } + + for (i = k = 0; i < ol->length; ++i) { + if(ol->list[i].align_length < chain_cutoff) {///ol has been sorted by x_pos_s + r = &(ol->list[i]); rs = r->x_pos_s; re = r->x_pos_e + 1; + rl = re - rs; ovl = 0; + for (m = 0; (m < sp->n) && (re > (sp->a[m]>>32)); m++) { + os = ((rs>=(sp->a[m]>>32))? rs:(sp->a[m]>>32)); + oe = ((re<=((uint32_t)sp->a[m]))? re:((uint32_t)sp->a[m])); + if(oe > os) { + ovl += (oe - os); if(ovl >= (rl*0.95)) break; + } + } + if(ovl >= (rl*0.95)) continue; + } + if (k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + ol->list[k++].align_length = 0; + } + // fprintf(stderr, "+[M::%s] ol->length0::%lu, ol->length1::%lu\n", __func__, ol->length, k); + ol->length = k; + **/ + } + /**else { + for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; + } + **/ + for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; +} + inline uint64_t special_lchain(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, All_reads* rdb, const ul_idx_t *udb, uint32_t apend_be, int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, int64_t quick_check, uint32_t gen_off, double mcopy_rate, uint32_t mcopy_khit_cut, @@ -1802,6 +2048,21 @@ void ul_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t lchain_qgen_mcopy(cl, overlap_list, rid, rl, NULL, uref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); } +void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut) +{ + extern void *ha_flt_tab; + extern ha_pt_t *ha_idx; + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + // minimizers_gen(ab, rs, rl, mz_w, mz_k, cl, k_flag, ha_flt_tab, ha_idx, dbg_ct, sp, high_occ, low_occ); + minimizers_qgen0(ab, rs, rl, mz_w, mz_k, cl, k_flag, ha_flt_tab, ha_idx, rref, dbg_ct, sp, high_occ, low_occ); + // lchain_gen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); + // lchain_qgen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); + ///no need to sort here, overlap_list has been sorted at lchain_gen + lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); +} + int64_t ug_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, double bw_thres_sec, int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, double mcopy_rate, uint32_t mcopy_khit_cut, uint32_t is_hpc, ha_mzl_t *res, uint64_t res_n, ha_mzl_t *idx, uint64_t idx_n, uint64_t mzl_cutoff, uint64_t chain_cutoff, kv_u_trans_t *kov) diff --git a/ecovlp.cpp b/ecovlp.cpp new file mode 100644 index 0000000..d3f0176 --- /dev/null +++ b/ecovlp.cpp @@ -0,0 +1,860 @@ +#include +#include +#include +#include "Correct.h" +#include "Process_Read.h" +#include "ecovlp.h" +#include "kthread.h" +#include "htab.h" +#define HA_KMER_GOOD_RATIO 0.333 +#define E_KHIT 31 + +#define generic_key(x) (x) +KRADIX_SORT_INIT(ec16, uint16_t, generic_key, 2) +KRADIX_SORT_INIT(ec32, uint32_t, generic_key, 4) +KRADIX_SORT_INIT(ec64, uint64_t, generic_key, 8) + +void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); + + + +ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n, uint32_t is_final, uint32_t save_ov) +{ + uint32_t k; ec_ovec_buf_t0 *z = NULL; + ec_ovec_buf_t *p = NULL; CALLOC(p, 1); + p->n = n; CALLOC(p->a, p->n); + for (k = 0; k < p->n; k++) { + z = &(p->a[k]); + z->is_final = !!is_final; z->save_ov = !!save_ov; + init_UC_Read(&z->self_read); + init_UC_Read(&z->ovlp_read); + init_Candidates_list(&z->clist); + init_overlap_region_alloc(&z->olist); + + init_fake_cigar(&(z->tmp.f_cigar)); + memset(&(z->tmp.w_list), 0, sizeof(z->tmp.w_list)); + CALLOC(z->tmp.w_list.a, 1); z->tmp.w_list.n = z->tmp.w_list.m = 1; + + // kv_init(z->b_buf.a); + kv_init(z->r_buf.a); + kv_init(z->k_flag.a); + kv_init(z->sp); + kv_init(z->pidx); + kv_init(z->v64); + kv_init(z->v32); + kv_init(z->v16); + init_bit_extz_t(&(z->exz), 31); + + z->ab = ha_abuf_init(); + if (!z->is_final) { + init_Cigar_record(&z->cigar); + // init_Graph(&b->POA_Graph); + // init_Graph(&b->DAGCon); + init_Correct_dumy(&z->correct); + InitHaplotypeEvdience(&z->hap); + } + } + + return p; +} + +void destroy_cns_gfa(cns_gfa *p) +{ + size_t k; + for (k = 0; k < p->m; k++) { + kv_destroy(p->a[k].in); + kv_destroy(p->a[k].ou); + } + free(p->a); +} + +void destroy_ec_ovec_buf_t(ec_ovec_buf_t *p) +{ + uint32_t k; ec_ovec_buf_t0 *z = NULL; + for (k = 0; k < p->n; k++) { + z = &(p->a[k]); + destory_UC_Read(&z->self_read); + destory_UC_Read(&z->ovlp_read); + destory_Candidates_list(&z->clist); + destory_overlap_region_alloc(&z->olist); + + destory_fake_cigar(&(z->tmp.f_cigar)); + free(z->tmp.w_list.a); free(z->tmp.w_list.c.a); + + kv_destroy(z->r_buf.a); + kv_destroy(z->k_flag.a); + kv_destroy(z->sp); + kv_destroy(z->pidx); + kv_destroy(z->v64); + kv_destroy(z->v32); + kv_destroy(z->v16); + destroy_bit_extz_t(&(z->exz)); + + ha_abuf_destroy(z->ab); + if (!z->is_final) { + destory_Cigar_record(&z->cigar); + // destory_Graph(&b->POA_Graph); + // destory_Graph(&b->DAGCon); + destory_Correct_dumy(&z->correct); + destoryHaplotypeEvdience(&z->hap); + } + destroy_cns_gfa(&(z->cns)); + + asm_opt.num_bases += z->num_read_base; + asm_opt.num_corrected_bases += z->num_correct_base; + asm_opt.num_recorrected_bases += z->num_recorrect_base; + // asm_opt.mem_buf += ha_ovec_mem(b[i], NULL); + } + free(p->a); free(p); + + fprintf(stderr, "[M::%s-chains] #->%lld\n", __func__, asm_opt.num_bases); + fprintf(stderr, "[M::%s-passed-chains-0] #->%lld\n", __func__, asm_opt.num_corrected_bases); + fprintf(stderr, "[M::%s-cis-chains-1] #->%lld\n", __func__, asm_opt.num_recorrected_bases); +} + +void prt_chain(overlap_region_alloc *o) +{ + uint64_t k; + for (k = 0; k < o->length; k++) { + fprintf(stderr, "[M::%s]\t#%u\tlen::%lu\t%u\t%u\t%c\t#%u\tlen::%lu\t%u\t%u\tsc::%d\taln::%u\terr::%u\n", __func__, o->list[k].x_id, Get_READ_LENGTH(R_INF, o->list[k].x_id), o->list[k].x_pos_s, o->list[k].x_pos_e+1, "+-"[o->list[k].y_pos_strand], + o->list[k].y_id, Get_READ_LENGTH(R_INF, o->list[k].y_id), o->list[k].y_pos_s, o->list[k].y_pos_e+1, o->list[k].shared_seed, o->list[k].align_length, o->list[k].non_homopolymer_errors); + } +} + +overlap_region *fetch_aux_ovlp(overlap_region_alloc* ol) /// exactly same to gen_aux_ovlp +{ + if (ol->length + 1 > ol->size) { + uint64_t sl = ol->size; + ol->size = ol->length + 1; + kroundup64(ol->size); + REALLOC(ol->list, ol->size); + /// need to set new space to be 0 + memset(ol->list + sl, 0, sizeof(overlap_region)*(ol->size - sl)); + } + return &(ol->list[ol->length+1]); +} + +typedef struct { + ul_ov_t *c_idx; + asg64_v *idx; + int64_t i, i0, srt_n, rr; + uint64_t mms, mme; +} cc_idx_t; + + +///[s, e) +int64_t extract_sub_cigar_mm(overlap_region *z, int64_t s, int64_t e, ul_ov_t *p, uint64_t *ct) +{ + int64_t wk = ovlp_cur_wid(*p), xk = ovlp_cur_xoff(*p), yk = ovlp_cur_yoff(*p), ck = ovlp_cur_coff(*p), os, oe, t; + bit_extz_t ez; int64_t bd = ovlp_bd(*p), s0, e0; + s0 = ((int64_t)(z->w_list.a[wk].x_start)) + bd; + e0 = ((int64_t)(z->w_list.a[wk].x_end)) + 1 - bd; + if(s < s0) s = s0; if(e > e0) e = e0;///exclude boundary + if(s >= e) return -1; + os = MAX(s, s0); oe = MIN(e, e0); + if(oe <= os) return -1; + + set_bit_extz_t(ez, (*z), wk); + if(!ez.cigar.n) return -1; + int64_t cn = ez.cigar.n, op; int64_t ws, we, ovlp; + if((ck < 0) || (ck > cn)) {//(*ck) == cn is allowed + ck = 0; xk = ez.ts; yk = ez.ps; + } + + while (ck > 0 && xk >= s) {///x -> t; y -> p; first insertion and then match/mismatch + --ck; + op = ez.cigar.a[ck]>>14; + if(op!=2) xk -= (ez.cigar.a[ck]&(0x3fff)); + if(op!=3) yk -= (ez.cigar.a[ck]&(0x3fff)); + } + + //some cigar will span s or e + while (ck < cn && xk < e) {//[s, e) + ws = xk; + op = ez.cigar.a[ck]>>14; + ///op == 3: -> x; op == 2: -> y; + if(op!=2) xk += (ez.cigar.a[ck]&(0x3fff)); + if(op!=3) yk += (ez.cigar.a[ck]&(0x3fff)); + ck++; we = xk; + + os = MAX(s, ws); oe = MIN(e, we); + ovlp = ((oe>os)? (oe-os):0); + if(op != 2) { + if(!ovlp) continue; + } else {///ws == we + if(ws < s || ws >= e) continue; + } + + + if(op == 0) { + for (t = os + 1; t < oe; t++) { + ct[(t-s)<<1]++; ct[(t-s)<<1] += ((uint64_t)(0x100000000)); + ct[((t-s)<<1)+1]++; ct[((t-s)<<1)+1] += ((uint64_t)(0x100000000)); + } + + t = os; + if(t < oe) { + ct[(t-s)<<1]++; ct[(t-s)<<1] += ((uint64_t)(0x100000000)); + if(os > ws) { + ct[((t-s)<<1)+1]++; ct[((t-s)<<1)+1] += ((uint64_t)(0x100000000)); + } + } + } else if(op!=2) { + for (t = os + 1; t < oe; t++) { + ct[(t-s)<<1]++; + ct[((t-s)<<1)+1]++; + } + + t = os; + if(t < oe) { + ct[(t-s)<<1]++; + if(os > ws) { + ct[((t-s)<<1)+1]++; + } + } + } else { + ct[((ws-s)<<1)+1]++; ///ct[((ws-s)<<1)+1] += ((uint64_t)(0x100000000)); + } + } + + ovlp_cur_xoff(*p) = xk; ovlp_cur_yoff(*p) = yk; ovlp_cur_coff(*p) = ck; ovlp_cur_ylen(*p) = 0; + + return 1; +} + +#define simp_vote_len 6 + +///[s, e) +uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int64_t e, UC_Read* tu, ul_ov_t *p) +{ + int64_t wk = ovlp_cur_wid(*p), xk = ovlp_cur_xoff(*p), yk = ovlp_cur_yoff(*p), ck = ovlp_cur_coff(*p), os, oe, ol; + bit_extz_t ez; int64_t bd = ovlp_bd(*p), s0, e0, ii[2], it[2]; uint32_t res = (uint32_t)-1; + s0 = ((int64_t)(z->w_list.a[wk].x_start)) + bd; + e0 = ((int64_t)(z->w_list.a[wk].x_end)) + 1 - bd; + if(s < s0) s = s0; if(e > e0) e = e0;///exclude boundary + if(s > e) return -1;///it is possible s == e + os = MAX(s, s0); oe = MIN(e, e0); + if(oe < os) return -1;///it is possible os == oe + + set_bit_extz_t(ez, (*z), wk); + if(!ez.cigar.n) return -1; + int64_t cn = ez.cigar.n; uint16_t op; int64_t ws, we, wts, wte, ovlp, cc = 0, cci; + if((ck < 0) || (ck > cn)) {//(*ck) == cn is allowed + ck = 0; xk = ez.ts; yk = ez.ps; + } + + while (ck > 0 && xk >= s) {///x -> t; y -> p; first insertion and then match/mismatch + --ck; + op = ez.cigar.a[ck]>>14; + if(op!=2) xk -= (ez.cigar.a[ck]&(0x3fff)); + if(op!=3) yk -= (ez.cigar.a[ck]&(0x3fff)); + } + + char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + //some cigar will span s or e + ii[0] = ii[1] = it[0] = it[1] = -1; res = cc = 0; + while (ck < cn && xk < e) {//[s, e) + ws = xk; wts = yk; + op = ez.cigar.a[ck]>>14; ol = (ez.cigar.a[ck]&(0x3fff)); + ///op == 3: -> x; op == 2: -> y; + if(op!=2) xk += ol; + if(op!=3) yk += ol; + ck++; we = xk; wte = yk; + + // if(s == 10480) { + // fprintf(stderr, "[%ld, %ld)\t%c\n", ws, we, cm[op]); + // } + + os = MAX(s, ws); oe = MIN(e, we); + ovlp = ((oe>os)? (oe-os):0); + if(op != 2) { + if(!ovlp) continue; + } else {///ws == we + if(ws < s || ws >= e) continue; + } + + + + + if(ii[0] == -1) { + ii[0] = os; + if(op < 2) { + it[0] = os - ws + wts; + } else {///op == 2: more y; p == 3: more x + it[0] = wts; + } + } + + ii[1] = oe; + if(op < 2) { + it[1] = oe - ws + wts; + } else {///op == 2: more y; p == 3: more x + it[1] = wte; + } + + + + + if(op != 2) ol = oe-os; + cc += ol; + fprintf(stderr, "%ld%c", ol, cm[op]); + if(cc <= simp_vote_len) { + for (cci = 0; cci < ol; cci++) { + res <<= 2; res |= op; + } + } + } + + while (ck < cn && xk <= e) {//[s, e) + ws = xk; wts = yk; + op = ez.cigar.a[ck]>>14; ol = (ez.cigar.a[ck]&(0x3fff)); + if(op != 2) break; + yk += (ez.cigar.a[ck]&(0x3fff)); + ck++; we = xk; wte = yk; + if(ws >= s && ws <= e) { + + if(ii[0] == -1) { + ii[0] = os; it[0] = wts; + } + ii[1] = oe; it[1] = wte; + + cc += ol; + fprintf(stderr, "%ld%c", ol, cm[op]); + if(cc <= simp_vote_len) { + for (cci = 0; cci < ol; cci++) { + res <<= 2; res |= op; + } + } + } + } + + fprintf(stderr, "\tx::[%ld, %ld)\ty::[%ld, %ld)\tcc::%ld\n", ii[0], ii[1], it[0], it[1], cc); + if((cc <= simp_vote_len) + && (ii[1] >= ii[0]) && (ii[1] - ii[0] <= simp_vote_len) + && (it[1] >= it[0]) && (it[1] - it[0] <= simp_vote_len)) { + // ii[0] = ii[0] - s; ii[1] = e - ii[1]; + if((ii[0] == s) && (ii[1] == e)) { + op = cc; op <<= 12; res |= op; + + char *ystr = NULL; res <<= 16; cc = it[1] - it[0]; op = 0; + if(cc > 0) { + UC_Read_resize(*tu, (it[1] - it[0])); ystr = tu->seq; + recover_UC_Read_sub_region(ystr, it[0], (it[1] - it[0]), z->y_pos_strand, rref, z->y_id); + + for (cci = 0; cci < cc; cci++) { + op <<= 2; op |= seq_nt6_table[(uint32_t)(ystr[cci])]; + } + } + res |= op; + + op = it[1] - it[0]; op <<= 12; res |= op; + } else { + res = (uint32_t)-1; + } + } else { + res = (uint32_t)-1; + } + + + ovlp_cur_xoff(*p) = xk; ovlp_cur_yoff(*p) = yk; ovlp_cur_coff(*p) = ck; ovlp_cur_ylen(*p) = 0; + + return res; +} + + +uint64_t iter_cc_idx_t(overlap_region* ol, cc_idx_t *z, int64_t s, int64_t e, uint64_t is_reduce, uint64_t is_insert, uint64_t **ra) +{ + int64_t rm_n, q[2], os, oe; ul_ov_t *cp; uint64_t m; *ra = NULL; + + // if(s == 15816 && e == 15819) { + // fprintf(stderr, "[M::%s] is_reduce::%lu\n", __func__, is_reduce); + // } + + if(is_reduce) { + for (m = rm_n = z->srt_n; m < z->idx->n; m++) { + cp = &(z->c_idx[z->idx->a[m]]); + // if(s == 15816 && e == 15819) { + // fprintf(stderr, "-0-[M::%s] ii::%lu, ii0::%ld\n", __func__, z->idx->a[m], z->i0); + // } + + q[0] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + os = MAX(q[0], s); oe = MIN(q[1], e); + if((oe > os) || ((is_insert) && (s == e) && (s >= q[0]) && (s <= q[1]))) { + z->idx->a[rm_n++] = z->idx->a[m]; + } + } + z->idx->n = rm_n; + } + + for (; z->i < z->srt_n; ++z->i) { + cp = &(z->c_idx[(uint32_t)z->idx->a[z->i]]); + // if(s == 15816 && e == 15819) { + // fprintf(stderr, "-1-[M::%s] ii::%u, ii0::%ld\n", __func__, (uint32_t)z->idx->a[z->i], z->i0); + // } + q[0] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + if(q[0] > e) break; + if((!is_insert) && (q[0] >= e)) break; + os = MAX(q[0], s); oe = MIN(q[1], e); + if((oe > os) || ((is_insert) && (s == e) && (s >= q[0]) && (s <= q[1]))) { + kv_push(uint64_t, *(z->idx), ((uint32_t)z->idx->a[z->i])); + } + } + + (*ra) = z->idx->a + z->srt_n; + return z->idx->n - z->srt_n; +} + +void debug_inter0(overlap_region* ol, ul_ov_t *c_idx, uint64_t *idx, int64_t idx_n, uint64_t *res, int64_t res_n, int64_t s, int64_t e, uint64_t is_insert, uint64_t is_hard_check, const char *cmd) +{ + ul_ov_t *cp; int64_t q[2], a_n = 0, i, k = 0, os, oe; + for (i = 0; i < idx_n; i++) { + cp = &(c_idx[(uint32_t)idx[i]]); + q[0] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + + // fprintf(stderr, "%s[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\terr::%d\toerr::%u\n", cmd, __func__, ol[ovlp_id(*cp)].y_id, (int)Get_NAME_LENGTH(R_INF, ol[ovlp_id(*cp)].y_id), Get_NAME(R_INF, ol[ovlp_id(*cp)].y_id), + // ovlp_cur_wid(*cp), ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start, ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1, ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].error, ol[ovlp_id(*cp)].non_homopolymer_errors); + + os = MAX(q[0], s); oe = MIN(q[1], e); + if((oe > os) || ((is_insert) && (s == e) && (s >= q[0]) && (s <= q[1]))) { + a_n++; + // if(!(((uint32_t)idx[i]) == res[k])) { + // fprintf(stderr, "[M::%s] a_n::%ld\tres_n::%ld\ts::%ld\te::%ld\ti::%ld\tk::%ld\n", __func__, a_n, res_n, s, e, i, k); + // } + if(is_hard_check) { + assert(((uint32_t)idx[i]) == res[k++]); + } else { + for (; (k < res_n) && (((uint32_t)idx[i]) != res[k]); k++); + assert(k < res_n); + } + } + } + // if(a_n != res_n) { + // fprintf(stderr, "[M::%s] a_n::%ld\tres_n::%ld\ts::%ld\te::%ld\tidx_n::%ld\n", __func__, a_n, res_n, s, e, idx_n); + // } + if(is_hard_check) { + assert(a_n == res_n); + } else { + assert(a_n <= res_n); + } +} + +void prt_cigar0(uint64_t in, int64_t len) +{ + int64_t k; uint64_t mp; + char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + for (k = 0; k < len; k++) { + mp = len - 1 - k; mp <<= 1; + fprintf(stderr, "%c", cm[(in >> mp)&3]); + } + fprintf(stderr, "\n"); +} + +void prt_bp0(uint64_t in, int64_t len) +{ + int64_t k; uint64_t mp; + char cm[4]; cm[0] = 'A'; cm[1] = 'C'; cm[2] = 'G'; cm[3] = 'T'; + for (k = 0; k < len; k++) { + mp = len - 1 - k; mp <<= 1; + fprintf(stderr, "%c", cm[(in >> mp)&3]); + } + fprintf(stderr, "\n"); +} + +uint64_t cns_gen0(overlap_region* ol, All_reads *rref, uint64_t s, uint64_t e, UC_Read* tu, cc_idx_t *idx, uint64_t occ_tot, double occ_max, asg32_v* b32, uint32_t *rc) +{ + if(e > s + simp_vote_len) return 0;///too long + + uint64_t *id_a = NULL, id_n, an = 0, oc[2]; b32->n = 0; uint32_t m, *a = NULL; + id_n = iter_cc_idx_t(ol, idx, s, e, idx->rr, ((s==e)?1:0), &id_a); + // debug_inter0(ol, idx->c_idx, idx->idx->a + idx->i0, idx->srt_n - idx->i0, id_a, id_n, s, e, ((s==e)?1:0), 0, "-1-"); + uint64_t k, l, q[2], os, oe; ul_ov_t *p; overlap_region *z; idx->rr = 0; + fprintf(stderr, "[M::%s] [%lu, %lu) id_n::%lu\n", __func__, s, e, id_n); + for (k = 0; k < id_n; k++) { + p = &(idx->c_idx[id_a[k]]); z = &(ol[ovlp_id(*p)]); + q[0] = z->w_list.a[ovlp_cur_wid(*p)].x_start+ovlp_bd(*p); + q[1] = z->w_list.a[ovlp_cur_wid(*p)].x_end+1-ovlp_bd(*p); + + // fprintf(stderr, "[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\terr::%d\toerr::%u\n", __func__, ol[ovlp_id(*p)].y_id, (int)Get_NAME_LENGTH(R_INF, ol[ovlp_id(*p)].y_id), Get_NAME(R_INF, ol[ovlp_id(*p)].y_id), + // ovlp_cur_wid(*p), ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_start, ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_end+1, ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].error, ol[ovlp_id(*p)].non_homopolymer_errors); + + if(q[1] <= e) idx->rr = 1; + os = MAX(q[0], s); oe = MIN(q[1], e); + if((oe > os) || ((s == e) && (s >= q[0]) && (s <= q[1]))) { + // if(oe >= os) { + ///[-4-][-12-][-4-][-12-] + ///[cigar_len][cigar][base_len][base] + m = extract_sub_cigar_ii(z, rref, os, oe, tu, p); an++; + if(m != ((uint32_t)-1)) {///no gap in both sides + kv_push(uint32_t, *b32, m); + } + } + } + + oc[0] = b32->n; oc[1] = an + 1; //+1 for the reference read + fprintf(stderr, "-0-[M::%s] oc[0]::%lu, oc[1]::%lu\n", __func__, oc[0], oc[1]); + if(((oc[0] > (oc[1]*occ_max)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) { + radix_sort_ec32(b32->a, b32->a+b32->n); an = 0; + for (k = 1, l = 0; k <= b32->n; ++k) { + if (k == b32->n || b32->a[k] != b32->a[l]) { + if(k - l > an) { + an = k - l; a = b32->a + l; + } + l = k; + } + } + oc[0] = an; + fprintf(stderr, "-1-[M::%s] oc[0]::%lu, oc[1]::%lu\n", __func__, oc[0], oc[1]); + if(((oc[0] > (oc[1]*occ_max)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) { + (*rc) = a[0]; + // prt_cigar0((a[0]<<4)>>20, a[0]>>28); + // prt_bp0((a[0]<<20)>>20, (a[0]<<16)>>28); + return 1; + } + } + + return 0; +} + +void push_correct0(window_list *idx, window_list_alloc *res, uint32_t len0, uint32_t rc) +{ + if(len0 != ((uint32_t)-1)) { + ; + } else if(rc != ((uint32_t)-1)) { + uint32_t cc = (rc<<4)>>20, cn = rc>>28, ck = 0, cs, cp; + uint32_t bc = (rc<<20)>>20, bn = (rc<<16)>>28, bk = 0, bs, bp; + for (ck = 0; ck < cn; ck++) { + cs = (cn-1-ck)<<1; cp = (cc>>cs)&3; + + bp = (uint32_t)-1; + if(cp != 3) { + bs = (bn-1-bk)<<1; bp = (bc>>bs)&3; + bk++; + } + push_trace_bp(((asg16_v *)(&(res->c))), cp, bp, 1, ((idx->clen>0)?1:0)); + + fprintf(stderr, "%c", cm[(in >> mp)&3]); + } + } +} + +void push_cns_anchor(overlap_region* ol, All_reads *rref, uint64_t s, uint64_t e, UC_Read* tu, cc_idx_t *idx, overlap_region *aux_o, uint64_t is_tail, uint64_t occ_tot, double occ_max, asg32_v* b32) +{ + if((!is_tail) && (s >= e)) return;//if s >= e && is_tail = 1, gen the cns of the last a few bases -> s = e = ql + fprintf(stderr, "\n[M::%s] [%lu, %lu)\n", __func__, s, e); + window_list *p = NULL; uint64_t e0 = 0; uint32_t rc; + if(aux_o->w_list.n > 0) { + p = &(aux_o->w_list.a[aux_o->w_list.n-1]); + e0 = p->x_end+1; + ///make sure e > s + } + assert(s >= e0); + if((((!is_tail) && (s > 0)) || ((is_tail) && (s > e0))) + && (cns_gen0(ol, rref, e0, s, tu, idx, occ_tot, occ_max, b32, &rc))) {///CNS in between + push_correct0(p, &(aux_o->w_list), (uint32_t)-1, rc); + } else { + + } + + kv_pushp(window_list, aux_o->w_list, &p); + p->x_start = s; p->x_end = e-1; + // p->y_start = exz->ps; p->y_end = exz->pe; +} + +uint64_t wcns_vote(overlap_region* ol, All_reads *rref, char* qstr, UC_Read* tu, uint64_t *id_a, uint64_t id_n, uint64_t s, uint64_t e, ul_ov_t *c_idx, cc_idx_t *occ, uint64_t occ_tot, double occ_exact, overlap_region *aux_o, asg32_v* b32) +{ + uint64_t k, q[2], rr = 0, os, oe, wl, oc[2], fI; ul_ov_t *p, *gp; overlap_region *z; + // uint64_t *ct = occ->idx->a;///occ->idx->a[0, wl<<1) + for (k = 0; k < id_n; k++) { + p = &(c_idx[id_a[k]]); z = &(ol[ovlp_id(*p)]); + q[0] = z->w_list.a[ovlp_cur_wid(*p)].x_start+ovlp_bd(*p); + q[1] = z->w_list.a[ovlp_cur_wid(*p)].x_end+1-ovlp_bd(*p); + if(q[1] <= e) rr = 1; + os = MAX(q[0], s); oe = MIN(q[1], e); + if(oe > os) { + ///prepare for CNS + gp = &(occ->c_idx[id_a[k]]); + ovlp_cur_xoff(*gp) = ovlp_cur_xoff(*p); ovlp_cur_yoff(*gp) = ovlp_cur_yoff(*p); ovlp_cur_coff(*gp) = ovlp_cur_coff(*p); ovlp_cur_ylen(*gp) = ovlp_cur_ylen(*p); + // assert(ovlp_cur_wid(*p) == ovlp_cur_wid(*gp)); + // assert(ovlp_id(*p) == ovlp_id(*gp)); + // fprintf(stderr, "[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\tos::%lu\toe::%lu\n", __func__, ol[ovlp_id(*p)].y_id, (int)Get_NAME_LENGTH(R_INF, ol[ovlp_id(*p)].y_id), Get_NAME(R_INF, ol[ovlp_id(*p)].y_id), + // ovlp_cur_wid(*p), ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_start, ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_end+1, os, oe); + extract_sub_cigar_mm(z, os, oe, p, occ->idx->a + os - s); + } + } + + wl = e - s; + os = occ->mms; oe = occ->mme; + + // fprintf(stderr, "[M::%s] s::%lu\te::%lu\n", __func__, s, e); + + for (k = 0; k < wl; k++) { + //+1 for the reference read + oc[0] = (occ->idx->a[(k<<1)]>>32) + 1; + oc[1] = ((uint32_t)occ->idx->a[(k<<1)]) + 1; + // fprintf(stderr, "-0-p::%lu\toc[0]::%lu\toc[1]::%lu\tgoc[0]::%lu\tgoc[1]::%u\n", s + k, oc[0], oc[1], (ct[(k<<1)+1]>>32) + 1, ((uint32_t)ct[(k<<1)+1]) + 1); + // if(oc[1] < occ_tot || oc[0] <= 1) { + // ct[(k<<1)] = ct[(k<<1)+1] = 0; + // continue; + // } + // fprintf(stderr, "-1-p::%lu\toc[0]::%lu\toc[1]::%lu\n", s + k, oc[0], oc[1]); + + if((oc[0] > (oc[1]*occ_exact)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1)) { + ///note: there might be insertions at q[k-1, k], insead if q[k, k+1] + fI = 1; + ///make sure there is no insertion + //+1 for the reference read + oc[0] = (occ->idx->a[(k<<1)+1]>>32) + 1; + oc[1] = ((uint32_t)occ->idx->a[(k<<1)+1]) + 1; + if(((oc[0] > (oc[1]*occ_exact)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) fI = 0; + if(fI) { + // fprintf(stderr, "-1-p::%lu\toc[0]::%lu\toc[1]::%u\tgoc[0]::%lu\tgoc[1]::%u\n", s + k, (occ->idx->a[(k<<1)]>>32) + 1, ((uint32_t)occ->idx->a[(k<<1)]) + 1, (occ->idx->a[(k<<1)+1]>>32) + 1, ((uint32_t)occ->idx->a[(k<<1)+1]) + 1); + if(oe > os && os != ((uint64_t)-1)) {///push previous intervals + push_cns_anchor(ol, rref, os, oe, tu, occ, aux_o, 0, occ_tot, occ_exact, b32); + } + os = oe = (uint64_t)-1; + } + + //+1 for the reference read + oc[0] = (occ->idx->a[(k<<1)]>>32) + 1; + oc[1] = ((uint32_t)occ->idx->a[(k<<1)]) + 1; + if((s+k) == oe) { + oe++; + } else { + if(oe > os && os != ((uint64_t)-1)) {///push previous intervals + push_cns_anchor(ol, rref, os, oe, tu, occ, aux_o, 0, occ_tot, occ_exact, b32); + } + os = s+k; oe = s+k+1; + } + } else { + // fprintf(stderr, "-2-p::%lu\toc[0]::%lu\toc[1]::%u\tgoc[0]::%lu\tgoc[1]::%u\n", s + k, (occ->idx->a[(k<<1)]>>32) + 1, ((uint32_t)occ->idx->a[(k<<1)]) + 1, (occ->idx->a[(k<<1)+1]>>32) + 1, ((uint32_t)occ->idx->a[(k<<1)+1]) + 1); + if(oe > os && os != ((uint64_t)-1)) {///push previous intervals + push_cns_anchor(ol, rref, os, oe, tu, occ, aux_o, 0, occ_tot, occ_exact, b32); + } + os = oe = (uint64_t)-1; + } + occ->idx->a[(k<<1)] = occ->idx->a[(k<<1)+1] = 0; + } + + occ->mms = occ->mme = (uint64_t)-1; + if(oe > os && os != ((uint64_t)-1)) { + occ->mms = os; occ->mme = oe; + } + return rr; +} + + + +void print_debug_ovlp_cigar(overlap_region_alloc* ol, asg64_v* idx, kv_ul_ov_t *c_idx) +{ + uint64_t k, ci; uint32_t cl; ul_ov_t *cp; bit_extz_t ez; uint16_t c; char cm[4]; + cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + for (k = 0; k < idx->n; k++) { + cp = &(c_idx->a[(uint32_t)idx->a[k]]); + fprintf(stderr, "**********[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\terr::%d\toerr::%u**********\n", __func__, ol->list[ovlp_id(*cp)].y_id, (int)Get_NAME_LENGTH(R_INF, ol->list[ovlp_id(*cp)].y_id), Get_NAME(R_INF, ol->list[ovlp_id(*cp)].y_id), + ovlp_cur_wid(*cp), ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start, ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1, ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].error, ol->list[ovlp_id(*cp)].non_homopolymer_errors); + set_bit_extz_t(ez, ol->list[ovlp_id(*cp)], ovlp_cur_wid(*cp)); ci = 0; + while (ci < ez.cigar.n) { + ci = pop_trace(&(ez.cigar), ci, &c, &cl); + fprintf(stderr, "%u%c", cl, cm[c]); + } + fprintf(stderr, "\n"); + } +} + +void wcns_gen(overlap_region_alloc* ol, All_reads *rref, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, uint64_t wl, int64_t ql, uint64_t occ_tot, double occ_exact, overlap_region *aux_o, asg32_v* b32) +{ + int64_t on = ol->length, k, i, zwn, q[2]; + uint64_t m, *ra, rn; overlap_region *z; ul_ov_t *cp; + + for (k = idx->n = c_idx->n = 0; k < on; k++) { + z = &(ol->list[k]); zwn = z->w_list.n; + if((!zwn) || (z->is_match != 1)) continue; + for (i = 0; i < zwn; i++) { + if(is_ualn_win(z->w_list.a[i])) continue; + q[0] = z->w_list.a[i].x_start; q[1] = z->w_list.a[i].x_end; + q[0] += bd; q[1] -= bd; + if(q[1] >= q[0]) { + m = ((uint64_t)q[0]); m <<= 32; + m += c_idx->n; kv_push(uint64_t, *idx, m); + + kv_pushp(ul_ov_t, *c_idx, &cp); + ovlp_id(*cp) = k; ///ovlp id + // ovlp_min_wid(*cp) = i; ///beg id of windows + // ovlp_max_wid(*cp) = i; ///end id of windows + ovlp_cur_wid(*cp) = i; ///cur id of windows + ovlp_cur_xoff(*cp) = z->w_list.a[i].x_start; ///cur xpos + ovlp_cur_yoff(*cp) = z->w_list.a[i].y_start; ///cur xpos + ovlp_cur_ylen(*cp) = 0; + ovlp_cur_coff(*cp) = 0; ///cur cigar off in cur window + ovlp_bd(*cp) = bd; + } + } + } + + int64_t srt_n = idx->n, s, e, t, rr; i = 0; + radix_sort_ec64(idx->a, idx->a+idx->n); + for (k = 1, i = 0; k < srt_n; k++) { + if (k == srt_n || (idx->a[k]>>32) != (idx->a[i]>>32)) { + if(k - i > 1) { + for (t = i; t < k; t++) { + cp = &(c_idx->a[(uint32_t)idx->a[t]]); + // s = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + // assert(s == (int64_t)(idx->a[i]>>32)); + m = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + m <<= 32; m += ((uint32_t)idx->a[t]); idx->a[t] = m; + // fprintf(stderr, "[M::%s] s::%ld\tsi::%lu\n", __func__, s, (idx->a[i]>>32)); + } + radix_sort_ec64(idx->a + i, idx->a + k); + } + i = k; + } + } + + print_debug_ovlp_cigar(ol, idx, c_idx); + + ///second index + kv_resize(ul_ov_t, *c_idx, (c_idx->n<<1)); + ul_ov_t *idx_a = NULL, *idx_b = NULL; + idx_a = c_idx->a; idx_b = c_idx->a; + memcpy(idx_b, idx_a, c_idx->n * (sizeof((*(idx_a))))); + + kv_resize(uint64_t, *buf, ((wl<<1) + idx->n)); buf->n = ((wl<<1) + idx->n); + memcpy(buf->a + (wl<<1), idx->a, idx->n * (sizeof((*(idx->a))))); + memset(buf->a, 0, (wl<<1)*(sizeof((*(idx->a))))); + + cc_idx_t ii_a, ii_b; memset(&ii_a, 0, sizeof(ii_a)); memset(&ii_b, 0, sizeof(ii_b)); + + ii_a.c_idx = idx_a; ii_a.idx = idx; ii_a.i = ii_a.i0 = 0; ii_a.srt_n = ii_a.idx->n; ii_a.mms = ii_a.mme = (uint64_t)-1; + ii_b.c_idx = idx_b; ii_b.idx = buf; ii_b.i = ii_b.i0 = (wl<<1); ii_b.srt_n = ii_b.idx->n; ii_b.mms = ii_b.mme = (uint64_t)-1; + + s = 0; e = wl; e = ((e<=ql)?e:ql); rr = 0; + aux_o->w_list.n = aux_o->w_list.c.n = 0; ///for cigar + for (; s < ql; ) { + rn = iter_cc_idx_t(ol->list, &ii_a, s, e, rr, 0, &ra); + // debug_inter0(ol->list, ii_a.c_idx, ii_a.idx->a + ii_a.i0, ii_a.srt_n - ii_a.i0, ra, rn, s, e, 0, 1, "-0-"); + rr = wcns_vote(ol->list, rref, qu->seq, tu, ra, rn, s, e, ii_a.c_idx, &ii_b, occ_tot, occ_exact, aux_o, b32); + s += wl; e += wl; e = ((e<=ql)?e:ql); + } + + if(ii_b.mme > ii_b.mms && ii_b.mms != (uint64_t)-1) { + push_cns_anchor(ol->list, rref, ii_b.mms, ii_b.mme, tu, &ii_b, aux_o, 0, occ_tot, occ_exact, b32); + } + + push_cns_anchor(ol->list, rref, ql, ql, tu, &ii_b, aux_o, 1, occ_tot, occ_exact, b32); +} + +static void worker_hap_ec(void *data, long i, int tid) +{ + ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); + uint32_t high_occ = asm_opt.hom_cov * (2.0 - HA_KMER_GOOD_RATIO); + uint32_t low_occ = asm_opt.hom_cov * HA_KMER_GOOD_RATIO; + overlap_region *aux_o = NULL; asg64_v buf0; + + // if (memcmp("m64012_190920_173625/7210046/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); + // } else { + // return; + // } + + if(i != 596/**1024**/) return; + + recover_UC_Read(&b->self_read, &R_INF, i); + + h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, 0.02, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, 2, UINT32_MAX); + + b->num_read_base += b->olist.length; + aux_o = fetch_aux_ovlp(&b->olist);///must be here + + gen_hc_r_alin(&b->olist, &b->clist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, aux_o, asm_opt.max_ov_diff_ec, WINDOW_HC, i, E_KHIT/**asm_opt.k_mer_length**/, 1, &b->v16); + + // fprintf(stderr, "\n[M::%s] rid::%ld\t%.*s\tlen::%lld\tocc::%lu\n", __func__, i, (int)Get_NAME_LENGTH(R_INF, i), + // Get_NAME(R_INF, i), b->self_read.length, b->olist.length); + + b->num_correct_base += b->olist.length; + + copy_asg_arr(buf0, b->sp); + rphase_hc(&b->olist, &R_INF, &b->hap, &b->self_read, &b->ovlp_read, &b->pidx, &b->v64, &buf0, 0, WINDOW_MAX_SIZE, b->self_read.length, 1); + copy_asg_arr(b->sp, buf0); + + copy_asg_arr(buf0, b->sp); + wcns_gen(&b->olist, &R_INF, &b->self_read, &b->ovlp_read, &b->pidx, &b->v64, &buf0, 0, 512, b->self_read.length, 3, 0.500001, aux_o, &b->v32); + copy_asg_arr(b->sp, buf0); + + + uint32_t k; + for (k = 0; k < b->olist.length; k++) { + if(b->olist.list[k].is_match == 1) b->num_recorrect_base++; + } + + // exit(1); + + + + // prt_chain(&b->olist); + + // ul_map_lchain(b->abl, (uint32_t)-1, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->clist, s->opt->bw_thres, + // s->opt->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2/**0.75**/, 2, 3); + + /** + int fully_cov, abnormal; + // if(i != 12578) return; + // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); + // if (memcmp("7897e875-76e5-42c8-bc37-94b370c4cc8d", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); + // } else { + // return; + // } + + ha_get_candidates_interface(b->ab, i, &b->self_read, &b->olist, &b->olist_hp, &b->clist, + 0.02, asm_opt.max_n_chain, 1, NULL, &b->r_buf, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), &(b->tmp_region), NULL, &(b->sp)); + + clear_Cigar_record(&b->cigar1); + clear_Round2_alignment(&b->round2); + + correct_overlap(&b->olist, &R_INF, &b->self_read, &b->correct, &b->ovlp_read, &b->POA_Graph, &b->DAGCon, + &b->cigar1, &b->hap, &b->round2, &b->r_buf, &(b->tmp_region.w_list), 0, 1, &fully_cov, &abnormal); + + b->num_read_base += b->self_read.length; + b->num_correct_base += b->correct.corrected_base; + b->num_recorrect_base += b->round2.dumy.corrected_base; + + push_cigar(R_INF.cigars, i, &b->cigar1); + push_cigar(R_INF.second_round_cigar, i, &b->round2.cigar); + + R_INF.paf[i].is_fully_corrected = 0; + if (fully_cov) { + if (get_cigar_errors(&b->cigar1) == 0 && get_cigar_errors(&b->round2.cigar) == 0) + R_INF.paf[i].is_fully_corrected = 1; + } + R_INF.paf[i].is_abnormal = abnormal; + + R_INF.trio_flag[i] = AMBIGU; + + ///need to be fixed in r305 + // if(ha_idx_hp == NULL) + // { + // R_INF.trio_flag[i] += collect_hp_regions(&b->olist, &R_INF, &(b->k_flag), RESEED_HP_RATE, Get_READ_LENGTH(R_INF, i), NULL); + // } + + if (R_INF.trio_flag[i] != AMBIGU || b->save_ov) { + int is_rev = (asm_opt.number_of_round % 2 == 0); + push_overlaps(&(R_INF.paf[i]), &b->olist, 1, &R_INF, is_rev); + push_overlaps(&(R_INF.reverse_paf[i]), &b->olist, 2, &R_INF, is_rev); + } + + if(het_cnt) het_cnt[i] = get_het_cnt(&b->hap); + // fprintf(stderr, "[M::%s-end] rid->%ld\n", __func__, i); + **/ +} + +void cal_ec_multiple(ec_ovec_buf_t *b, uint64_t n_thre, uint64_t n_a) +{ + double tt0 = yak_realtime_0(); + kt_for(n_thre, worker_hap_ec, b, n_a);///debug_for_fix + fprintf(stderr, "[M::%s-reads] #->%lu\n", __func__, n_a); + fprintf(stderr, "[M::%s::%.3f] ==> chaining\n", __func__, yak_realtime_0()-tt0); +} \ No newline at end of file diff --git a/ecovlp.h b/ecovlp.h new file mode 100644 index 0000000..2b30733 --- /dev/null +++ b/ecovlp.h @@ -0,0 +1,64 @@ +#ifndef __ECOVLP_PARSER__ +#define __ECOVLP_PARSER__ + +#define __STDC_LIMIT_MACROS +#include +#include "Hash_Table.h" +#include "Process_Read.h" + +typedef struct { + uint32_t v:31, f:1; + uint32_t sc; +} cns_arc; +typedef struct {size_t n, m; cns_arc *a;} cns_arc_v; + +typedef struct { + // uint16_t c:2, t:2, f:1, sc:3; + uint32_t c:2, f:1, sc:29; + cns_arc_v in, ou; +}cns_t; + +typedef struct { + size_t n, m; + cns_t *a; + uint32_t si, ei; +}cns_gfa; + +typedef struct { + int is_final, save_ov; + // chaining and overlapping related buffers + UC_Read self_read, ovlp_read; + Candidates_list clist; + overlap_region_alloc olist; + overlap_region tmp; + ha_abuf_t *ab; + // error correction related buffers + int64_t num_read_base, num_correct_base, num_recorrect_base; + Cigar_record cigar; + Correct_dumy correct; + haplotype_evdience_alloc hap; + bit_extz_t exz; + // asg32_v v32; + kv_ul_ov_t pidx; + asg64_v v64; + asg32_v v32; + asg16_v v16; + + kvec_t_u64_warp r_buf; + kvec_t_u8_warp k_flag; + st_mt_t sp; + cns_gfa cns; + +} ec_ovec_buf_t0; + +typedef struct { + ec_ovec_buf_t0 *a; + uint32_t n; +} ec_ovec_buf_t; + +ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n, uint32_t is_final, uint32_t save_ov); +void destroy_ec_ovec_buf_t(ec_ovec_buf_t *p); +void cal_ec_multiple(ec_ovec_buf_t *b, uint64_t n_thre, uint64_t n_a); +void prt_chain(overlap_region_alloc *o); + +#endif \ No newline at end of file diff --git a/htab.h b/htab.h index b501690..9f7c83d 100644 --- a/htab.h +++ b/htab.h @@ -108,6 +108,7 @@ uint64_t ha_abufl_mem(const ha_abufl_t *ab); double yak_cputime(void); void yak_reset_realtime(void); +double yak_realtime_0(void); double yak_realtime(void); long yak_peakrss(void); double yak_peakrss_in_gb(void); diff --git a/sys.cpp b/sys.cpp index 87b5ac2..8dbe050 100644 --- a/sys.cpp +++ b/sys.cpp @@ -31,6 +31,12 @@ double yak_realtime(void) return yak_realtime_core() - yak_realtime0; } +double yak_realtime_0(void) +{ + return yak_realtime_core(); +} + + long yak_peakrss(void) { struct rusage r;