diff --git a/Assembly.cpp b/Assembly.cpp index 2a25122..aa500f7 100644 --- a/Assembly.cpp +++ b/Assembly.cpp @@ -970,86 +970,57 @@ void prt_dbg_rs(FILE *fp, Debug_reads* x, uint64_t round) destory_UC_Read(&g_read); } - -void ha_ec(int64_t round) +void ha_ec(int64_t round, int num_pround, int des_idx, uint64_t *tot_b, uint64_t *tot_e) { - int i, hom_cov, het_cov, r_out = 0; - ec_ovec_buf_t *b = NULL; - ha_ecsave_buf_t *e = NULL; - ha_flt_tab_hp = ha_idx_hp = NULL; + int hom_cov, het_cov, r_out = 0; + ha_flt_tab_hp = ha_idx_hp = NULL; (*tot_b) = (*tot_e) = 0; if((ha_idx == NULL)&&(asm_opt.flag & HA_F_VERBOSE_GFA)&&(round == asm_opt.number_of_round - 1)) r_out = 1; if(asm_opt.required_read_name) init_Debug_reads(&R_INF_FLAG, asm_opt.required_read_name); // for debugging only - // overlap and correct reads - b = gen_ec_ovec_buf_t(asm_opt.thread_num, 0, (round == asm_opt.number_of_round - 1)); - // CALLOC(b, asm_opt.thread_num); - // for (i = 0; i < asm_opt.thread_num; ++i) - // b[i] = ha_ovec_init(0, (round == asm_opt.number_of_round - 1),0); - if(ha_idx) hom_cov = asm_opt.hom_cov; - if(ha_idx == NULL) ha_idx = ha_pt_gen(&asm_opt, ha_flt_tab, round == 0? 0 : 1, 0, &R_INF, &hom_cov, &het_cov); // build the index + if(ha_idx == NULL) { + ha_idx = ha_pt_gen(&asm_opt, ha_flt_tab, round == 0? 
0 : 1, 0, &R_INF, &hom_cov, &het_cov); // build the index + asm_opt.hom_cov = hom_cov; asm_opt.het_cov = het_cov; + } ///debug_adapter(&asm_opt, &R_INF); if (round == 0 && ha_flt_tab == 0) // then asm_opt.hom_cov hasn't been updated ha_opt_update_cov(&asm_opt, hom_cov); het_cnt = NULL; if(round == asm_opt.number_of_round-1 && asm_opt.is_dbg_het_cnt) CALLOC(het_cnt, R_INF.total_reads); - /** - // fprintf(stderr, "[M::%s-start]\n", __func__); - if (asm_opt.required_read_name) - kt_for(asm_opt.thread_num, worker_ovec_related_reads, b, R_INF.total_reads); - else - kt_for(asm_opt.thread_num, worker_ovec, b, R_INF.total_reads);///debug_for_fix - // fprintf(stderr, "[M::%s-end]\n", __func__); - **/ - cal_ec_multiple(b, asm_opt.thread_num, R_INF.total_reads); - // kt_for(asm_opt.thread_num, worker_hap_ec, b, R_INF.total_reads);///debug_for_fix - if (r_out) write_pt_index(ha_flt_tab, ha_idx, &R_INF, &asm_opt, asm_opt.output_file_name); - ha_pt_destroy(ha_idx); - ha_idx = NULL; + + cal_ec_r(asm_opt.thread_num, round, num_pround, R_INF.total_reads, (round == (asm_opt.number_of_round-1))?1:0, tot_b, tot_e); + + // exit(1); + + // if (r_out) write_pt_index(ha_flt_tab, ha_idx, &R_INF, &asm_opt, asm_opt.output_file_name); + if(des_idx) { + ha_pt_destroy(ha_idx); ha_idx = NULL; + } + if(het_cnt) { print_het_cnt_log(het_cnt); free(het_cnt); het_cnt = NULL; } - /** - // collect statistics - for (i = 0; i < asm_opt.thread_num; ++i) { - asm_opt.num_bases += b[i]->num_read_base; - asm_opt.num_corrected_bases += b[i]->num_correct_base; - asm_opt.num_recorrected_bases += b[i]->num_recorrect_base; - asm_opt.mem_buf += ha_ovec_mem(b[i], NULL); - ha_ovec_destroy(b[i]); - } - free(b); - **/ - destroy_ec_ovec_buf_t(b); - exit(1); + // exit(1); + if (asm_opt.required_read_name) prt_dbg_rs(R_INF_FLAG.fp_r0, &R_INF_FLAG, 0); // for debugging only // save corrected reads to R_INF - CALLOC(e, asm_opt.thread_num); - for (i = 0; i < asm_opt.thread_num; ++i) { - init_UC_Read(&e[i].g_read); - 
e[i].first_round_read_size = e[i].second_round_read_size = 50000; - CALLOC(e[i].first_round_read, e[i].first_round_read_size); - CALLOC(e[i].second_round_read, e[i].second_round_read_size); - } - kt_for(asm_opt.thread_num, worker_ec_save, e, R_INF.total_reads); - for (i = 0; i < asm_opt.thread_num; ++i) { - destory_UC_Read(&e[i].g_read); - free(e[i].first_round_read); - free(e[i].second_round_read); - } - free(e); + // sl_ec_r(asm_opt.thread_num, R_INF.total_reads); if (asm_opt.required_read_name) prt_dbg_rs(R_INF_FLAG.fp_r1, &R_INF_FLAG, 1); // for debugging only if (asm_opt.required_read_name) destory_Debug_reads(&R_INF_FLAG), exit(0); // for debugging only ///debug_print_pob_regions(); + + // Output_corrected_reads(); + + // exit(1); } @@ -1927,6 +1898,25 @@ void ha_overlap_final(void) asm_opt.het_cov = het_cov; } +void ha_ec_ff(int renew_idx) +{ + int hom_cov, het_cov; + ha_flt_tab_hp = ha_idx_hp = NULL; + + if(ha_idx && renew_idx) { + ha_pt_destroy(ha_idx); ha_idx = NULL; + } + + if(!ha_idx) { + ha_idx = ha_pt_gen(&asm_opt, ha_flt_tab, 1, 0, &R_INF, &hom_cov, &het_cov); // build the index + asm_opt.hom_cov = hom_cov; asm_opt.het_cov = het_cov; + } + + cal_ov_r(asm_opt.thread_num, R_INF.total_reads, renew_idx); + + ha_pt_destroy(ha_idx); ha_idx = NULL; +} + static void worker_ov_utg(void *data, long i, int tid) { ha_ovec_buf_t *b = ((ha_ovec_buf_t**)data)[tid]; @@ -2016,7 +2006,7 @@ int ha_assemble(void) // debug_mc_gg_t(MC_NAME, 0, 0); // quick_debug_phasing(MC_NAME); extern void ha_extract_print_list(const All_reads *rs, int n_rounds, const char *o); - int r, hom_cov = -1, ovlp_loaded = 0; + int r, hom_cov = -1, ovlp_loaded = 0; uint64_t tot_b, tot_e; if (asm_opt.load_index_from_disk && load_all_data_from_disk(&R_INF.paf, &R_INF.reverse_paf, asm_opt.output_file_name)) { ovlp_loaded = 1; fprintf(stderr, "[M::%s::%.3f*%.2f] ==> loaded corrected reads and overlaps from disk\n", __func__, yak_realtime(), yak_cpu_usage()); @@ -2042,24 +2032,29 @@ int 
ha_assemble(void) assert(asm_opt.number_of_round > 0); for (r = ha_idx?asm_opt.number_of_round-1:0; r < asm_opt.number_of_round; ++r) { ha_opt_reset_to_round(&asm_opt, r); // this update asm_opt.roundID and a few other fields + tot_b = tot_e = 0; // ha_overlap_and_correct(r); - ha_ec(r); + ha_ec(r, asm_opt.number_of_pround, (r corrected reads for round %d\n", __func__, yak_realtime(), yak_cpu_usage(), yak_peakrss_in_gb(), r + 1); - fprintf(stderr, "[M::%s] # bases: %lld; # corrected bases: %lld; # recorrected bases: %lld\n", __func__, - asm_opt.num_bases, asm_opt.num_corrected_bases, asm_opt.num_recorrected_bases); - fprintf(stderr, "[M::%s] size of buffer: %.3fGB\n", __func__, asm_opt.mem_buf / 1073741824.0); + fprintf(stderr, "[M::%s] # bases: %lu; # corrected bases: %lu\n", __func__, tot_b, tot_e); + // fprintf(stderr, "[M::%s] # bases: %lld; # corrected bases: %lld; # recorrected bases: %lld\n", __func__, + // asm_opt.num_bases, asm_opt.num_corrected_bases, asm_opt.num_recorrected_bases); + // fprintf(stderr, "[M::%s] size of buffer: %.3fGB\n", __func__, asm_opt.mem_buf / 1073741824.0); } if (asm_opt.flag & HA_F_WRITE_EC) Output_corrected_reads(); // overlap between corrected reads ha_opt_reset_to_round(&asm_opt, asm_opt.number_of_round); - ha_overlap_final(); - fprintf(stderr, "[M::%s::%.3f*%.2f@%.3fGB] ==> found overlaps for the final round\n", __func__, yak_realtime(), - yak_cpu_usage(), yak_peakrss_in_gb()); - ha_print_ovlp_stat(R_INF.paf, R_INF.reverse_paf, R_INF.total_reads); + // ha_overlap_final(); + ha_ec_ff(1/**0**/); + fprintf(stderr, "[M::%s::%.3f*%.2f@%.3fGB] ==> found overlaps for the final round\n", __func__, yak_realtime(), yak_cpu_usage(), yak_peakrss_in_gb()); + // fprintf(stderr, "\n[M::%s::%.3f*%.2f@%.3fGB] ==> found overlaps for the final round\n", __func__, yak_realtime(), yak_cpu_usage(), yak_peakrss_in_gb()); + // ha_print_ovlp_stat(R_INF.paf, R_INF.reverse_paf, R_INF.total_reads); ha_ft_destroy(ha_flt_tab); if (asm_opt.flag & 
HA_F_WRITE_PAF) Output_PAF(); ha_triobin(&asm_opt); + + // exit(1); } if(ovlp_loaded == 2) ovlp_loaded = 0; ha_opt_update_cov_min(&asm_opt, asm_opt.hom_cov, MIN_N_CHAIN); diff --git a/CommandLines.cpp b/CommandLines.cpp index 41c0bf9..a1ef00a 100644 --- a/CommandLines.cpp +++ b/CommandLines.cpp @@ -18,8 +18,8 @@ static ko_longopt_t long_options[] = { { "write-paf", ko_no_argument, 302 }, { "write-ec", ko_no_argument, 303 }, { "skip-triobin", ko_no_argument, 304 }, - { "max-od-ec", ko_no_argument, 305 }, - { "max-od-final", ko_no_argument, 306 }, + { "max-od-ec", ko_required_argument, 305 }, + { "max-od-final", ko_required_argument, 306 }, { "ex-list", ko_required_argument, 307 }, { "ex-iter", ko_required_argument, 308 }, { "hom-cov", ko_required_argument, 309 }, @@ -249,6 +249,7 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->load_index_from_disk = 1; asm_opt->write_index_to_disk = 1; asm_opt->number_of_round = 3; + asm_opt->number_of_pround = 0/**3**/; asm_opt->adapterLen = 0; asm_opt->clean_round = 4; ///asm_opt->small_pop_bubble_size = 100000; diff --git a/CommandLines.h b/CommandLines.h index e8b9b9d..894beae 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -5,7 +5,7 @@ #include #include -#define HA_VERSION "0.19.9-r616" +#define HA_VERSION "0.20.0-r631" #define VERBOSE 0 @@ -74,6 +74,7 @@ typedef struct { int load_index_from_disk; int write_index_to_disk; int number_of_round; + int number_of_pround; int adapterLen; int clean_round; int roundID; diff --git a/Correct.cpp b/Correct.cpp index e624b13..3fb7bfa 100644 --- a/Correct.cpp +++ b/Correct.cpp @@ -2459,6 +2459,40 @@ int64_t *ts_r, int64_t *aux_beg_r, int64_t *aux_end_r) return 0; } +///cannot use tstr in-place +inline int recal_boundary_non_retrieve_exz(char* qstr, char* tstr, int64_t t_tot_l, +int64_t ql0, int64_t tl0, int64_t thres, +int64_t toff, int64_t ts0, int64_t te0, int64_t err0, +int64_t tid, uint32_t rev, bit_extz_t *exz, +int64_t *ts_r, int64_t *aux_beg_r, int64_t *aux_end_r) +{ + int64_t 
ts, tl, aux_beg, aux_end, t_pri_l, aln_l = ql0 + (thres << 1); + char *q_string = qstr, *t_string; + + if(ts0 == 0) {//left boundary + ts = toff; + } else if((te0 + 1) == tl0) {//right boundary + ts = toff + te0 - ql0 + 1; + } else { + return 0; + } + if(!init_waln(thres, ts, t_tot_l, aln_l, &aux_beg, &aux_end, &ts, &t_pri_l)) return 0; + if(ts == toff && tl0 == t_pri_l) return 0;//unchanged, make no sense + + tl = t_pri_l; t_string = tstr + ts; + + clear_align(*exz); + ed_band_cal_semi_64_w_absent_diag_trace(t_string, tl, q_string, ql0, thres, aux_beg, exz); + + if(is_align(*exz) && exz->err < err0) { + (*aux_beg_r) = aux_beg; + (*aux_end_r) = aux_end; + (*ts_r) = ts; + return 1; + } + return 0; +} + inline char *update_des_str(char *des, int64_t s, int64_t pri_l, uint8_t rev, All_reads *rref, hpc_t *hpc_g, const ul_idx_t *uref, int64_t id, int64_t aux_beg, int64_t aux_end, char *src) { @@ -4088,6 +4122,70 @@ double e_rate, uint32_t is_cigar) return 0; } +inline uint32_t aln_wlst_adv_non_retrieve_exz(overlap_region *z, char *qstr, char *tstr, int64_t t_tot_l, bit_extz_t *exz, uint32_t max_err, +uint32_t rev, uint32_t id, int64_t qs, int64_t qe, int64_t t_s, int64_t block_s, double e_rate, uint32_t is_cigar) +{ + int64_t ql, tl, aln_l; window_list *p = NULL; ///int r_ts = 0, t_end; + int64_t aux_beg, aux_end, t_pri_l; int64_t thres; char *q_string, *t_string; + ql = qe + 1 - qs; + ///there are two potiential reasons for unmatched window: + ///1. this window has a large number of differences + ///2. 
DP does not start from the right offset + thres = double_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); + + aln_l = ql + (thres << 1); + + if(!init_waln(thres, t_s, t_tot_l, aln_l, &aux_beg, &aux_end, &t_s, &t_pri_l)) return 0; + if(t_pri_l + thres < ql) return 0; + + q_string = qstr + qs; t_string = tstr + t_s; + + tl = t_pri_l; + if(is_cigar) { + clear_align(*exz); + ed_band_cal_semi_64_w_absent_diag_trace(t_string, tl, q_string, ql, thres, aux_beg, exz); + } else { + ed_band_cal_semi_64_w_absent_diag(t_string, tl, q_string, ql, thres, aux_beg, exz); exz->ps = 0; + } + + // if(id == 40 && qs == 79670 && qe == 79824) { + // fprintf(stderr, "\n[M::%s::semi] exz->ps::%d, exz->pe::%d, exz->ts::%d, exz->te::%d, exz->err::%d, exz->cigar.n::%d\n", + // __func__, exz->ps, exz->pe, exz->ts, exz->te, exz->err, (int32_t)exz->cigar.n); + // } + + if(is_align(*exz)) { + kv_pushp(window_list, z->w_list, &p); + p->x_start = qs; p->x_end = qe; ///must set x_start/x_end here + p->y_start = t_s + exz->ps;///difference + p->y_end = t_s + exz->pe; + p->error = exz->err; + p->cidx = p->clen = 0; + if(is_cigar) { + push_wcigar(p, &(z->w_list), exz); + ///this condition is always wrong + ///in best case, r_ts = threshold, t_end = aln_l - thres - 1 + if ((((exz->pe+1) == tl) || (exz->ps == 0)) && (exz->err > 0)) { + if(recal_boundary_non_retrieve_exz(q_string, tstr, t_tot_l, ql, tl, thres, t_s, exz->ps, exz->pe, + exz->err, id, rev, exz, &t_s, &aux_beg, &aux_end)) { + //update cigar + z->w_list.c.n = p->cidx; push_wcigar(p, &(z->w_list), exz); + + p->y_start = t_s + exz->ps;///difference + p->y_end = t_s + exz->pe; + p->error = exz->err; + } + } + } + + p->extra_begin = aux_beg; + p->extra_end = aux_end; + p->error_threshold = thres; + z->align_length += ql; + return 1; + } + return 0; +} + inline uint32_t aln_wlst(overlap_region *z, All_reads *rref, const ul_idx_t *uref, UC_Read* g_read, Correct_dumy* dumy, int32_t y_strand, int32_t y_id, int64_t x_start, 
int64_t x_end, long long y_start, int64_t block_s, double e_rate, int32_t is_cigar) { @@ -8700,6 +8798,40 @@ void prt_sub_read(char *str, uint64_t str_l, uint64_t site, uint64_t win) fprintf(stderr, "\n"); } +void prt_sub_cigar(overlap_region* z, uint64_t str_l, uint64_t site, uint64_t win) +{ + uint64_t k, s, e, ci, xi, yi, ws, we, os, oe, ovlp; uint32_t cl; uint16_t c; bit_extz_t ez; char cm[4]; + cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + s = ((site >= win)?(site-win):(0)); + e = (((site+win+1) <= str_l)?(site+win+1):(str_l)); + + fprintf(stderr, "[M::%s-site::%lu] [%lu, %lu)\n", __func__, site, s, e); + + for (k = 0; k < z->w_list.n; k++) { + set_bit_extz_t(ez, (*z), k); ci = 0; xi = ez.ts; yi = ez.ps; + while (ci < ez.cigar.n) { + ws = xi; + ci = pop_trace(&(ez.cigar), ci, &c, &cl); + if(c!=2) xi += cl; + if(c!=3) yi += cl; + we = xi; + + os = MAX(s, ws); oe = MIN(e, we); + ovlp = ((oe>os)? (oe-os):0); + + if(c != 2) { + if(!ovlp) continue; + } else {///ws == we + if(ws < s || ws >= e) continue; + ovlp = cl; + } + + fprintf(stderr, "%lu%c[%lu,%lu)", ovlp, cm[c], os, oe); + } + fprintf(stderr, "\n"); + } +} + void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_region_alloc* overlap_list, double up, UC_Read* g_read, void *km) { // fprintf(stderr, "[M::%s::] Done\n", __func__); @@ -8755,7 +8887,8 @@ void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_regio if(s->occ_0 < 2 || s->occ_1 < 2) continue; if(s->occ_0 >= asm_opt.s_hap_cov && s->occ_1 >= asm_opt.infor_cov) o++;///allels must be real } - // if(overlap_list->list[hap->list[l].overlapID].y_id == 24128) { + + // if(overlap_list->list[hap->list[l].overlapID].y_id == 4290) { // fprintf(stderr, "[M::%s-id::%u] o->%lu(%c)\n", __func__, overlap_list->list[hap->list[l].overlapID].y_id, o, "+-"[overlap_list->list[hap->list[l].overlapID].y_pos_strand]); // for (i = l, o = 0; i < k; i++) { // if(hap->list[i].type!=1) continue;///mismatch @@ -8768,7 +8901,31 @@ 
void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_regio // prt_sub_read(g_read->seq, g_read->length, s->site, 50); // } // } + // } + // else { + // for (i = l, o = 0; i < k; i++) { + // if(hap->list[i].type!=0) continue;///mismatch + // s = &(hap->snp_stat.a[hap->list[i].overlapSite]); + // assert(s->site == hap->list[i].site); + // if(s->occ_0 < 2 || s->occ_1 < 2) continue; + // if(s->occ_0 >= asm_opt.s_hap_cov && s->occ_1 >= asm_opt.infor_cov) { + // if(s->site == 7878 || s->site == 9682) { + + // fprintf(stderr, "***[M::%s] site::%u, rid::%u\t%.*s\n", __func__, s->site, overlap_list->list[hap->list[l].overlapID].y_id, (int)Get_NAME_LENGTH(R_INF, overlap_list->list[hap->list[l].overlapID].y_id), + // Get_NAME(R_INF, overlap_list->list[hap->list[l].overlapID].y_id)); + // prt_sub_cigar(&(overlap_list->list[hap->list[l].overlapID]), g_read->length, s->site, 50); + // } + // // o++;///allels must be real + // // fprintf(stderr, "[M::%s] site::%u, occ_0::%u, occ_1::%u, occ_2::%u, is_homopolymer::%u\n", __func__, s->site, s->occ_0, s->occ_1, s->occ_1, s->is_homopolymer); + // // prt_sub_read(g_read->seq, g_read->length, s->site, 50); + // } + // } // } + + // if(overlap_list->list[hap->list[l].overlapID].y_id == 317 || overlap_list->list[hap->list[l].overlapID].y_id == 287) { + // fprintf(stderr, "***0***[M::%s-id::%u] o->%lu\n", __func__, overlap_list->list[hap->list[l].overlapID].y_id, o); + // } + if(o > 0) { o = ((uint32_t)-1) - o; o <<= 32; o += l; @@ -8789,6 +8946,9 @@ void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_regio if(s->occ_0 < 2 || s->occ_1 < 2) continue; if(s->occ_0 >= asm_opt.s_hap_cov && s->occ_1 >= asm_opt.infor_cov) o++; } + // if(overlap_list->list[hap->list[l].overlapID].y_id == 317 || overlap_list->list[hap->list[l].overlapID].y_id == 287) { + // fprintf(stderr, "***1***[M::%s-id::%u] o->%lu\n", __func__, overlap_list->list[hap->list[l].overlapID].y_id, o); + // } if(o == 0) continue; ii = 
hap->list[l].overlapID; @@ -8864,6 +9024,9 @@ void generate_haplotypes_naive_HiFi(haplotype_evdience_alloc* hap, overlap_regio if(!km) kv_push(uint64_t, hap->snp_srt, hap->list[i].overlapSite); else kv_push_km(km, uint64_t, hap->snp_srt, hap->list[i].overlapSite); } + // if(overlap_list->list[hap->list[l].overlapID].y_id == 317 || overlap_list->list[hap->list[l].overlapID].y_id == 287) { + // fprintf(stderr, "***2***[M::%s-id::%u] o->%lu\n", __func__, overlap_list->list[hap->list[l].overlapID].y_id, o); + // } hap->snp_srt.n -= o; ///there are at least two variants at one read if(o>=(overlap_list->list[hap->list[l].overlapID].align_length*up)) { @@ -11424,6 +11587,74 @@ char *qstr, char *tstr, bit_extz_t *exz, uint32_t rev, uint32_t id) return 0; } +inline uint32_t gen_backtrace_non_retrieve_adv_exz(window_list *p, overlap_region *z, char *qstr, char *tstr, int64_t t_tot_l, bit_extz_t *exz, uint32_t rev, uint32_t id) +{ + if(p->error < 0 || p->y_end < 0) return 0; + int64_t qs, qe, ql, tl, aln_l, t_pri_l, thres, ts; + int64_t aux_beg, aux_end; + char *q_string, *t_string; + ///there is no problem for x + qs = p->x_start; qe = p->x_end; ql = qe + 1 - qs; + thres = p->error_threshold; aln_l = ql + (thres<<1); + + ///y_start is the real y_start + ///for the window with cigar, y_start has already reduced extra_begin + ts = p->y_start; aux_beg = p->extra_begin; aux_end = p->extra_end; + if(aux_end >= 0) { + t_pri_l = aln_l - aux_beg - aux_end; + } else { + t_pri_l = ts + aln_l - aux_beg; if(t_pri_l > t_tot_l) t_pri_l = t_tot_l; + t_pri_l = t_pri_l - ts; + } + + q_string = qstr + qs; t_string = tstr + ts; tl = t_pri_l; + + exz->ts = 0; exz->te = p->x_end-p->x_start; exz->tl = ql; + exz->ps = -1; exz->pe = p->y_end-p->y_start; exz->pl = tl; + exz->err = p->error; exz->thre = p->error_threshold; + // clear_align(*exz); + ed_band_cal_semi_64_w_absent_diag_trace(t_string, tl, q_string, ql, thres, aux_beg, exz); + // if(id == 178 && p->x_start == 86800 && p->x_end == 86807) { 
+ // fprintf(stderr, "\n[M::%s::semi] exz->ps::%d, exz->pe::%d, exz->ts::%d, exz->te::%d, exz->err::%d, exz->cigar.n::%d\n", + // __func__, exz->ps, exz->pe, exz->ts, exz->te, exz->err, (int32_t)exz->cigar.n); + // fprintf(stderr, "[M::%s::semi] p->y_start::%d, p->y_end::%d, p->x_start::%d, p->x_end::%d, p->error::%d\n", + // __func__, p->y_start, p->y_end, p->x_start, p->x_end, p->error); + // if(is_align(*exz)) { + // prt_cigar(exz->cigar.a, exz->cigar.n); + // fprintf(stderr, "[tstr] %.*s\n", exz->pe+1-exz->ps, t_string+exz->ps); + // fprintf(stderr, "[qstr] %.*s\n", exz->te+1-exz->ts, q_string+exz->ts); + // } + // } + // assert(is_align(*exz)); + // assert(cigar_check(t_string, q_string, exz)); + + + if(is_align(*exz)) { + p->y_start = ts + exz->ps;///difference + p->y_end = ts + exz->pe; + p->error = exz->err; + push_wcigar(p, &(z->w_list), exz); + ///this condition is always wrong + ///in best case, r_ts = threshold, t_end = aln_l - thres - 1 + if ((((exz->pe+1) == tl) || (exz->ps == 0)) && (exz->err > 0)) { + if(recal_boundary_non_retrieve_exz(q_string, tstr, t_tot_l, ql, tl, thres, ts, exz->ps, exz->pe, + exz->err, id, rev, exz, &ts, &aux_beg, &aux_end)) { + //update cigar + z->w_list.c.n = p->cidx; push_wcigar(p, &(z->w_list), exz); + + p->y_start = ts + exz->ps;///difference + p->y_end = ts + exz->pe; + p->error = exz->err; + } + } + p->extra_begin = aux_beg; + p->extra_end = aux_end; + return 1; + } + p->error = -1; + return 0; +} + ///ts do not have aux_beg, while te has uint32_t push_wlst_exz(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, overlap_region* ol, @@ -11553,6 +11784,68 @@ uint32_t push_hc_wlst_exz(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, o return 1; } + +///ts do not have aux_beg, while te has +uint32_t push_hc_wlst_non_retrieve_exz(overlap_region* ol, char* qa, int64_t ql, char *ta, int64_t tl, bit_extz_t *exz, uint32_t max_err, + int64_t qs, int64_t qe, int64_t ts, int64_t te, int64_t aux_beg, int64_t aux_end, double 
e_rate, int64_t block_s, double ovlp_cut, int64_t force_aln, void *km) +{ + + window_list p, t, *a; int64_t w_e, w_s, ce = qs - 1, cs = ol->x_pos_s, toff, ovl, ualn, aln, ys; + uint64_t a_n, k; + p.x_start = qs; p.x_end = qe; p.y_start = ts; p.y_end = te; p.error = exz->err; + p.extra_begin = aux_beg; p.extra_end = aux_end; p.error_threshold = exz->thre; p.cidx = p.clen = 0; + if(ol->w_list.n > 0) { //utilize the the end pos of pre-window in forward + w_e = ol->w_list.a[ol->w_list.n-1].x_end; + toff = ol->w_list.a[ol->w_list.n-1].y_end + 1; + while ((w_e < ce) && (toff < tl)) { + w_s = w_e + 1; + get_win_id_by_s(ol, w_s, block_s, &w_e); + // x_start = w_s; x_end = w_e; + if(aln_wlst_adv_non_retrieve_exz(ol, qa, ta, tl, exz, max_err, + ol->y_pos_strand, ol->y_id, w_s, w_e, toff, block_s, e_rate, 0)) { + toff = ol->w_list.a[ol->w_list.n-1].y_end + 1; + } else { + break; + } + } + cs = ol->w_list.a[ol->w_list.n-1].x_end + 1; + } + ///utilize the the start pos of next window in backward + a_n = ol->w_list.n; w_s = qs; + if(w_s > cs) { + gen_backtrace_non_retrieve_adv_exz(&p, ol, qa, ta, tl, exz, ol->y_pos_strand, ol->y_id); + toff = p.y_start - 1; + while (w_s > cs) { + w_e = w_s - 1; + get_win_id_by_e(ol, w_e, block_s, &w_s); ys = toff+1-(w_e+1-w_s); + // x_start = w_s; x_end = w_e; x_len = x_end + 1 - x_start; + if((ys >= 0) && aln_wlst_adv_non_retrieve_exz(ol, qa, ta, tl, exz, max_err, + ol->y_pos_strand, ol->y_id, w_s, w_e, ys, block_s, e_rate, 1)) { + toff = ol->w_list.a[ol->w_list.n-1].y_start - 1; + } else { + break; + } + } + } + + ol->align_length += qe + 1 - qs; + ovl = ol->x_pos_e+1-ol->x_pos_s; ualn = (qe + 1 - ol->x_pos_s) - ol->align_length; aln = ovl-ualn; + // if((!force_aln) && (!simi_pass(ovl, aln, 0, ovlp_cut, &e_rate)) && (!simi_pass(ovl, aln, sec_check, ovlp_cut, NULL))) { + if((!force_aln) && (!pass_qovlp(ovl, aln, ovlp_cut))) { + kv_push(window_list, ol->w_list, p); + return 0; + } + + if(ol->w_list.n > a_n) { + a = ol->w_list.a + a_n; a_n = 
ol->w_list.n - a_n; toff = a_n; a_n >>=1; + for (k = 0; k < a_n; k++) { + t = a[k]; a[k] = a[toff-1-k]; a[toff-1-k] = t; + } + } + kv_push(window_list, ol->w_list, p); + return 1; +} + uint32_t align_ul_ed_post_extz(overlap_region *z, const ul_idx_t *uref, hpc_t *hpc_g, char* qstr, char *tstr, bit_extz_t *exz, double e_rate, int64_t w_l, double ovlp_cut, int64_t force_aln, void *km) { int64_t q_s, q_e, nw, k, q_l, t_l, t_tot_l, sec_check = (uref&&(!hpc_g))?1:0; @@ -11611,6 +11904,9 @@ uint32_t align_hc_ed_post_extz(overlap_region *z, All_reads *rref, char* qstr, c z->w_list.n = 0; z->is_match = 0; z->align_length = 0; nw = get_num_wins(z->x_pos_s, z->x_pos_e+1, w_l); get_win_se_by_normalize_xs(z, (z->x_pos_s/w_l)*w_l, w_l, &q_s, &q_e); + // if(z->x_id == 19350 && z->y_id == 19324) { + // fprintf(stderr, "-z-[M::%s] tid::%u(%c)\tq::[%u,%u)\tt::[%u,%u)\n", __func__, z->y_id, "+-"[z->y_pos_strand], z->x_pos_s, z->x_pos_e+1, z->y_pos_s, z->y_pos_e+1); + // } for (k = 0; k < nw; k++) { aux_beg = aux_end = 0; q_l = 1 + q_e - q_s; thre = q_l*e_rate; thre = Adjust_Threshold(thre, q_l); @@ -11638,14 +11934,21 @@ uint32_t align_hc_ed_post_extz(overlap_region *z, All_reads *rref, char* qstr, c if (is_align(*exz)) { // ed_band_cal_semi_64_w(t_string, aln_l, q_string, q_l, thre, exz); // assert(exz->err <= exz->thre); - // fprintf(stderr, "[M::%s] exz->err::%d\n", __func__, exz->err); + // if(z->x_id == 19350 && z->y_id == 19324) { + // fprintf(stderr, "+[M::%s]\tq::[%ld,%ld)\tt::[%ld,%ld)\texz->err::%d\n", __func__, q_s, q_e + 1, t_s, t_s + exz->pe + 1, exz->err); + // } ///t_s do not have aux_beg, while t_s + t_end (aka, te) has if(!push_hc_wlst_exz(NULL, NULL, rref, z, qstr, tstr, exz, THRESHOLD_MAX_SIZE, q_s, q_e, t_s, t_s + exz->pe, t_tot_l, aux_beg, aux_end, e_rate, w_l, ovlp_cut, force_aln, km)) { return 0; } // append_window_list(z, q_s, q_e, t_s, t_s + t_end, error, aux_beg, aux_end, thre, w_l, km); - } + } + // else { + // if(z->x_id == 19350 && z->y_id == 19324) { + 
// fprintf(stderr, "-[M::%s]\tq::[%ld,%ld)\tt::[%ld,%ld)\texz->err::%d\n", __func__, q_s, q_e + 1, t_s, t_s + exz->pe + 1, exz->err); + // } + // } } q_s = q_e + 1; q_e = q_s + w_l - 1; if(q_e >= (int64_t)z->x_pos_e) q_e = z->x_pos_e; @@ -11657,23 +11960,72 @@ uint32_t align_hc_ed_post_extz(overlap_region *z, All_reads *rref, char* qstr, c return 1; } -inline uint32_t ed_cut(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, -char *qstr, char *tstr, uint32_t rev, uint32_t id, -int64_t qs, int64_t qe, int64_t t_s, int64_t block_s, double e_rate, int64_t max_err, -uint32_t aln_dir, int64_t* r_err, int64_t* qoff, int64_t* toff, int64_t* aln_qlen) +uint32_t align_hc_ed_post_non_retrieve_extz(overlap_region *z, char* qstr, int64_t ql, char *tstr, int64_t tl, bit_extz_t *exz, double e_rate, int64_t w_l, double ovlp_cut, int64_t force_aln, void *km) { - (*aln_qlen) = 0; (*r_err) = INT32_MAX; - if(qoff) (*qoff) = -1; if(toff) (*toff) = -1; - int64_t ql, aln_l, t_tot_l, aux_beg, aux_end, t_pri_l, thres; - char *q_string, *t_string; unsigned int error; int t_end, q_end; - - ql = qe + 1 - qs; - ///there are two potiential reasons for unmatched window: - ///1. this window has a large number of differences - ///2. 
DP does not start from the right offset - if(rref) { - thres = double_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); - } else { + int64_t q_s, q_e, nw, k, q_l, t_l, aux_beg, aux_end, t_s, thre, aln_l, t_pri_l; + char *q_string, *t_string; + z->w_list.n = 0; z->is_match = 0; z->align_length = 0; + nw = get_num_wins(z->x_pos_s, z->x_pos_e+1, w_l); + get_win_se_by_normalize_xs(z, (z->x_pos_s/w_l)*w_l, w_l, &q_s, &q_e); + for (k = 0; k < nw; k++) { + aux_beg = aux_end = 0; q_l = 1 + q_e - q_s; + thre = q_l*e_rate; thre = Adjust_Threshold(thre, q_l); + if(thre > THRESHOLD_MAX_SIZE) thre = THRESHOLD_MAX_SIZE; + ///offset of y + t_s = (q_s - z->x_pos_s) + z->y_pos_s; + t_s += y_start_offset(q_s, &(z->f_cigar)); + + aln_l = q_l + (thre<<1); + if(init_waln(thre, t_s, tl, aln_l, &aux_beg, &aux_end, &t_s, &t_pri_l)) { + q_string = qstr+q_s; t_string = tstr+t_s; t_l = t_pri_l; + + ed_band_cal_semi_64_w_absent_diag(t_string, t_l, q_string, q_l, thre, aux_beg, exz); + + // if(z->x_id == 5569 && z->y_id == 5557 && q_s == 10075 && q_e == 10849) { + // fprintf(stderr, "\n[M::%s::semi::t_s->%ld::t_pri_l->%ld::aux_beg->%ld::aux_end->%ld::thre->%ld] exz->ps::%d, exz->pe::%d, exz->ts::%d, exz->te::%d, exz->err::%d, exz->cigar.n::%d, thre::%ld\n", + // __func__, t_s, t_pri_l, aux_beg, aux_end, thre, exz->ps, exz->pe, exz->ts, exz->te, exz->err, (int32_t)exz->cigar.n, thre); + // fprintf(stderr, "[tstr::len->%ld] %.*s\n", t_l, (int32_t)t_l, t_string); + // fprintf(stderr, "[qstr::len->%ld] %.*s\n", q_l, (int32_t)q_l, q_string); + // } + if (is_align(*exz)) { + // ed_band_cal_semi_64_w(t_string, aln_l, q_string, q_l, thre, exz); + // assert(exz->err <= exz->thre); + // fprintf(stderr, "[M::%s] exz->err::%d\n", __func__, exz->err); + ///t_s do not have aux_beg, while t_s + t_end (aka, te) has + if(!push_hc_wlst_non_retrieve_exz(z, qstr, ql, tstr, tl, exz, THRESHOLD_MAX_SIZE, q_s, q_e, t_s, t_s + exz->pe, + aux_beg, aux_end, e_rate, w_l, ovlp_cut, force_aln, km)) { + 
return 0; + } + // append_window_list(z, q_s, q_e, t_s, t_s + t_end, error, aux_beg, aux_end, thre, w_l, km); + } + } + q_s = q_e + 1; q_e = q_s + w_l - 1; + if(q_e >= (int64_t)z->x_pos_e) q_e = z->x_pos_e; + } + + // if((!force_aln) && (!simi_pass(z->x_pos_e+1-z->x_pos_s, z->align_length, 0, ovlp_cut, &e_rate)) && + // (!simi_pass(z->x_pos_e+1-z->x_pos_s, z->align_length, sec_check, ovlp_cut, NULL))) return 0; + if((!force_aln) && (!pass_qovlp(z->x_pos_e+1-z->x_pos_s, z->align_length, ovlp_cut))) return 0; + return 1; +} + +inline uint32_t ed_cut(const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, +char *qstr, char *tstr, uint32_t rev, uint32_t id, +int64_t qs, int64_t qe, int64_t t_s, int64_t block_s, double e_rate, int64_t max_err, +uint32_t aln_dir, int64_t* r_err, int64_t* qoff, int64_t* toff, int64_t* aln_qlen) +{ + (*aln_qlen) = 0; (*r_err) = INT32_MAX; + if(qoff) (*qoff) = -1; if(toff) (*toff) = -1; + int64_t ql, aln_l, t_tot_l, aux_beg, aux_end, t_pri_l, thres; + char *q_string, *t_string; unsigned int error; int t_end, q_end; + + ql = qe + 1 - qs; + ///there are two potiential reasons for unmatched window: + ///1. this window has a large number of differences + ///2. 
DP does not start from the right offset + if(rref) { + thres = double_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); + } else { thres = double_ul_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); } @@ -11709,6 +12061,45 @@ uint32_t aln_dir, int64_t* r_err, int64_t* qoff, int64_t* toff, int64_t* aln_qle return 1; } +inline uint32_t ed_non_retrieve_cut(char *qstr, char *tstr, int64_t t_tot_l, uint32_t rev, uint32_t id, +int64_t qs, int64_t qe, int64_t t_s, int64_t block_s, double e_rate, int64_t max_err, +uint32_t aln_dir, int64_t* r_err, int64_t* qoff, int64_t* toff, int64_t* aln_qlen) +{ + (*aln_qlen) = 0; (*r_err) = INT32_MAX; + if(qoff) (*qoff) = -1; if(toff) (*toff) = -1; + int64_t ql, aln_l, aux_beg, aux_end, t_pri_l, thres; + char *q_string, *t_string; unsigned int error; int t_end, q_end; + + ql = qe + 1 - qs; + ///there are two potiential reasons for unmatched window: + ///1. this window has a large number of differences + ///2. DP does not start from the right offset + thres = double_error_threshold(get_init_err_thres(ql, e_rate, block_s, max_err), ql); + + aln_l = ql + (thres << 1); + + if(!init_waln(thres, t_s, t_tot_l, aln_l, &aux_beg, &aux_end, &t_s, &t_pri_l)) return 0; + // if(t_pri_l + thres < ql) return 0; + + q_string = qstr + qs; t_string = tstr + t_s; + + // if(id == 6) { + // fprintf(stderr, "-[M::%s::aln_dir->%u] qs->%ld, ts->%ld, thres->%ld, aux_beg->%ld, aux_end->%ld, t_pri_l->%ld\n", + // __func__, aln_dir, qs, t_s, thres, aux_beg, aux_end, t_pri_l); + // } + if(aln_dir == 0) { + Reserve_Banded_BPM_Extension(t_string, aln_l, q_string, ql, thres, &error, &t_end, &q_end); + } else { + Reserve_Banded_BPM_Extension_REV(t_string, aln_l, q_string, ql, thres, &error, &t_end, &q_end); + } + + if(t_end != -1 && q_end != -1) (*aln_qlen) = (aln_dir?(ql-q_end):(q_end+1)); + if(qoff) (*qoff) = q_end; if(toff) (*toff) = t_end; (*r_err) = error; + + if((*aln_qlen) == 0) return 0; + return 1; +} + int64_t 
gen_extend_err_0(overlap_region *z, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, char *tstr, char *tstr_1, Correct_dumy* dumy, uint64_t *v_idx, int64_t block_s, double e_rate, int64_t qs, int64_t qe, int64_t pk) @@ -11848,6 +12239,69 @@ int64_t qs, int64_t qe, int64_t pk) return tot_e; } +int64_t gen_extend_err_0_non_retrieve_exz(overlap_region *z, char* qstr, char *tstr, int64_t tl, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double e_rate, int64_t max_err, int64_t qs, int64_t qe, int64_t pk) +{ + int64_t tot_e = 0, ts, di[2], al[2], tb[2], an = z->w_list.n; double rr; + int64_t id = z->y_id, rev = z->y_pos_strand, ql = qe + 1 - qs; + ///check if there are some windows that cannot be algined by any overlaps/unitigs + ///if no, it is likely that the UL read itself has issues + + ts = (qs - z->x_pos_s) + z->y_pos_s; ts += y_start_offset(qs, &(z->f_cigar)); + + di[0] = di[1] = al[0] = al[1] = 0; tb[0] = tb[1] = -1; + if((pk > 0) && (qs == (z->w_list.a[pk].x_end + 1))) { + if(z->w_list.a[pk].clen == 0) {///do not have cigar + gen_backtrace_non_retrieve_adv_exz(&(z->w_list.a[pk]), z, qstr, tstr, tl, exz, rev, id); + } + tb[0] = z->w_list.a[pk].y_end + 1; + } + + if(((pk+1) < an) && ((qe+1) == (z->w_list.a[pk+1].x_start))) { + if(z->w_list.a[pk+1].clen == 0) {///do not have cigar + gen_backtrace_non_retrieve_adv_exz(&(z->w_list.a[pk+1]), z, qstr, tstr, tl, exz, rev, id); + } + tb[1] = z->w_list.a[pk+1].y_start-ql; + } + + if(tb[0] == -1 && tb[1] == -1) tb[0] = tb[1] = ts; + else if(tb[0] == -1 && tb[1] != -1) tb[0] = tb[1]; + else if(tb[1] == -1 && tb[0] != -1) tb[1] = tb[0]; + + if(tb[0] != -1) { + if(!ed_non_retrieve_cut(qstr, tstr, tl, rev, id, qs, qe, tb[0], block_s, e_rate, max_err, + 0, &(di[0]), NULL, NULL, &(al[0]))) { + di[0] = ql; al[0] = 0; + } + } + + if(tb[1] != -1) { + if(!ed_non_retrieve_cut(qstr, tstr, tl, rev, id, qs, qe, tb[1], block_s, e_rate, max_err, + 1, &(di[1]), NULL, NULL, &(al[1]))) { + di[1] = ql; al[1] = 0; + } + } 
+ + if(al[0] && al[1]) {///matched in both sides + if((al[0] + al[1]) <= ql) { + tot_e += di[0] + di[1] + ql - (al[0] + al[1]); + } else { + rr = ((double)ql)/((double)(al[0] + al[1])); + tot_e += (di[0] + di[1])*rr; + } + } else if((!al[0]) && (!al[1])) {//failed + tot_e += ql; + } else if(al[0]) { + tot_e += di[0] + (ql - al[0]); + }else if(al[1]) { + tot_e += di[1] + (ql - al[1]); + } + // if(z->y_id == 6) { + // fprintf(stderr, "-[M::%s] qs->%ld, ts->%ld, tb[0]->%ld, tb[1]->%ld, di[0]->%ld, di[1]->%ld, al[0]->%ld, al[1]->%ld, block_s->%ld, e_rate->%f\n", __func__, + // qs, ts, tb[0], tb[1], di[0], di[1], al[0], al[1], block_s, e_rate); + // } + return tot_e; +} + double gen_extend_err(overlap_region *z, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, char *tstr, char *tstr_1, Correct_dumy* dumy, uint64_t *v_idx, int64_t block_s, double ovlp_cut, double e_rate, double e_max, int64_t *r_e) { @@ -11936,6 +12390,49 @@ char *tstr, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double ovlp_cut, return (double)(tot_e)/(double)(tot_l); } +double gen_extend_err_non_retrieve_exz(overlap_region *z, char* qstr, char *tstr, int64_t tl, bit_extz_t *exz, uint64_t *v_idx, int64_t block_s, double ovlp_cut, double e_rate, double e_max, int64_t max_err, int64_t sec_check, int64_t *r_e) +{ + int64_t ovl, k, ce, an = z->w_list.n, tot_l, tot_e, ws, we, ql; + ovl = z->x_pos_e+1-z->x_pos_s; if(r_e) (*r_e) = INT64_MAX; + if((sec_check) && (!simi_pass(ovl, z->align_length, 0, ovlp_cut, &e_rate))) return DBL_MAX; + // nw = get_num_wins(z->x_pos_s, z->x_pos_e+1, block_s); + // for (k = 0; k < an; k++) { + // if(z->w_list.a[k].clen) z->w_list.a[k].y_end -= z->w_list.a[k].extra_begin; + // } + + tot_l = tot_e = 0; + for (k = an-1, ce = z->x_pos_e; k >= 0; k--) { + // assert(k == 0 || z->w_list.a[k].x_end > z->w_list.a[k-1].x_start);//sorted + tot_l += z->w_list.a[k].x_end + 1 - z->w_list.a[k].x_start; + tot_e += z->w_list.a[k].error;///matched window + + we = 
z->w_list.a[k].x_end; + while (we < ce) { + ws = we+1; + get_win_id_by_s(z, ws, block_s, &we); + ql = we+1-ws; tot_l += ql; + tot_e += gen_extend_err_0_non_retrieve_exz(z, qstr, tstr, tl, exz, v_idx, block_s, e_rate, max_err, ws, we, k); + if((e_max > 0) && (tot_e > (ovl*e_max))) return DBL_MAX; + } + ce = z->w_list.a[k].x_start-1; + if((e_max > 0) && (tot_e > (ovl*e_max))) return DBL_MAX; + } + + if(ce >= ((int64_t)z->x_pos_s)) { + we = ((int64_t)z->x_pos_s)-1; + while (we < ce) { + ws = we+1; + get_win_id_by_s(z, ws, block_s, &we); + ql = we+1-ws; tot_l += ql; + tot_e += gen_extend_err_0_non_retrieve_exz(z, qstr, tstr, tl, exz, v_idx, block_s, e_rate, max_err, ws, we, k); + if((e_max > 0) && (tot_e > (ovl*e_max))) return DBL_MAX; + } + } + + assert(tot_l == ovl); if(r_e) (*r_e) = tot_e; + return (double)(tot_e)/(double)(tot_l); +} + void push_anchors(window_list *z, window_list_alloc *zidx, asg64_v *anchor, uint64_t *qhp, int64_t qhp_l, int64_t *qhp_k, uint32_t mcl) { @@ -12832,6 +13329,23 @@ int64_t qs, int64_t qe, int64_t thre, int64_t *ts, int64_t *te, int64_t *aux_beg } +///[qs, qe) +int64_t update_semi_non_retrieve_coord(overlap_region *z, int64_t qs, int64_t qe, int64_t thre, int64_t t_tot_l, int64_t *ts, int64_t *te, int64_t *aux_beg) +{ + int64_t ql = qe - qs, aln_l, aux_end, tl; + (*ts) = (qs - z->x_pos_s) + z->y_pos_s; + (*ts) += y_start_offset(qs, &(z->f_cigar)); + aln_l = ql + (thre<<1); + + if(!init_waln(thre, (*ts), t_tot_l, aln_l, aux_beg, &aux_end, ts, &tl)) { + (*ts) = (*te) = (*aux_beg) = -1; + return 0; + } + (*te) = (*ts) + tl; + return 1; +} + + void adjust_ext_offset(int64_t *qs, int64_t *qe, int64_t *ts, int64_t *te, int64_t ql, int64_t tl, int64_t thre, int64_t mode) { int64_t qoff, toff; @@ -14107,6 +14621,56 @@ int64_t *pts, int64_t *pte, int64_t thre, int64_t *pthre, int64_t q_tot_l, int64 } +int64_t cal_exz_infi_non_retrieve_adv(overlap_region *z, bit_extz_t *exz, char* qstr, char* tstr, int64_t qs, int64_t qe, int64_t ts, int64_t te, 
int64_t thre, int64_t *pthre, int64_t q_tot_l, int64_t t_tot_l, int64_t mode) +{ + clear_align(*exz); + int64_t aux_beg = 0, ql, tl, dd; char *q_string, *t_string; + ql = qe - qs; tl = te - ts; dd = MAX(ql, tl); + + if(mode == 3) { + update_semi_non_retrieve_coord(z, qs, qe, ((thre>dd)?dd:thre), t_tot_l, &ts, &te, &aux_beg); + } else if(mode == 1 || mode == 2) { + adjust_ext_offset(&qs, &qe, &ts, &te, q_tot_l, t_tot_l, ((thre>dd)?dd:thre), mode); + } + + if((qe > qs) && (te > ts) && (ts != -1) && (te != -1)) { + ql = qe - qs; tl = te - ts; + dd = MAX(ql, tl); + if(thre > dd) thre = dd; + if(thre <= (*pthre)) return 0; + (*pthre) = thre; + + q_string = qstr + qs; t_string = tstr + ts; + + if(mode == 0) { //global + cal_exz_global(t_string, tl, q_string, ql, thre, exz); + } else if(mode == 1) {///forward extension + cal_exz_extension_0(t_string, tl, q_string, ql, thre, exz); + } else if(mode == 2) {///backward extension + cal_exz_extension_1(t_string, tl, q_string, ql, thre, exz); + } else if(mode == 3) {//semi-global + cal_exz_semi(t_string, tl, q_string, ql, thre, aux_beg, exz); + } + + if(is_align(*exz)) { + // cigar_check(t_string, q_string, exz); + // if(mode == 1) { + // fprintf(stderr, "\n[M::%s::ql::%ld] qs::%ld, qe::%ld, ts::%ld, te::%ld, mode::%ld, err::%d, thre::%d, exz_q[%d, %d], exz_t[%d, %d]\n", + // __func__, ql, qs, qe, ts, te, mode, exz->err, exz->thre, exz->ts, exz->te, exz->ps, exz->pe); + // fprintf(stderr, "[M::%s::] pstr::%.*s\n", __func__, (int32_t)tl, t_string); + // fprintf(stderr, "[M::%s::] tstr::%.*s\n", __func__, (int32_t)ql, q_string); + // fprintf(stderr, "[M::%s::] exz->cigar.n::%d\n", __func__, (int32_t)exz->cigar.n); + // } + exz->ps += ts; exz->pe += ts; + exz->ts += qs; exz->te += qs; + return 1; + } + return 0; + } + return 0; +} + + int64_t cal_exact_exz(overlap_region *z, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, bit_extz_t *exz, char* qstr, UC_Read *tu, int64_t qs, int64_t qe, int64_t ts, int64_t te, int64_t *pts, 
int64_t *pte, int64_t q_tot_l, int64_t mode) @@ -14151,9 +14715,50 @@ int64_t *pts, int64_t *pte, int64_t q_tot_l, int64_t mode) return 1; } -//[qmin, qmax) && [tmin, tmax) -void adjust_ext_offset_fixed_t(int64_t *qs, int64_t *qe, int64_t *ts, int64_t *te, -int64_t qmin, int64_t qmax, int64_t tmin, int64_t tmax, int64_t thre, int64_t mode) +int64_t cal_exact_non_retrieve_exz(overlap_region *z, bit_extz_t *exz, char* qstr, char* tstr, int64_t qs, int64_t qe, int64_t ts, int64_t te, +int64_t q_tot_l, int64_t t_tot_l, int64_t mode) +{ + clear_align(*exz); exz->thre = 0; exz->cigar.n = 0; + int64_t ql, tl; + char *q_string, *t_string; ql = qe - qs; + // if(hpc_g) t_tot_l = hpc_len(*hpc_g, id); + // else if(uref) t_tot_l = uref->ug->u.a[id].len; + // else t_tot_l = Get_READ_LENGTH((*rref), id); + + if(mode == 3) {//semi + ts = (qs - z->x_pos_s) + z->y_pos_s; ts += y_start_offset(qs, &(z->f_cigar)); + te = ts + ql; + } else if(mode == 1) {///forward extension + te = ts + ql; + } else if(mode == 2) {///backward extension + ts = te - ql; + } + if(ts < 0) ts = 0; + if(ts > t_tot_l) ts = t_tot_l; + if(te > t_tot_l) te = t_tot_l; + ql = qe - qs; tl = te - ts; + if(ql != tl) return 0; + + q_string = qstr + qs; t_string = tstr + ts; + + if(memcmp(q_string, t_string, ql)) return 0; + exz->err = 0; push_trace(&(exz->cigar), 0, ql); + exz->pl = tl; exz->ps = 0; exz->pe = tl-1; + exz->tl = ql; exz->ts = 0; exz->te = ql-1; + // cigar_check(t_string, q_string, exz); + // if(!cigar_check(t_string, q_string, exz)) { + // fprintf(stderr, "[M::%s::] cigar_n::%d\n", __func__, (int32_t)exz->cigar.n); + // fprintf(stderr, "[M::%s::] pstr::%.*s\n", __func__, (int32_t)tl, t_string); + // fprintf(stderr, "[M::%s::] tstr::%.*s\n", __func__, (int32_t)ql, q_string); + // } + exz->ps += ts; exz->pe += ts; + exz->ts += qs; exz->te += qs; + return 1; +} + +//[qmin, qmax) && [tmin, tmax) +void adjust_ext_offset_fixed_t(int64_t *qs, int64_t *qe, int64_t *ts, int64_t *te, +int64_t qmin, int64_t qmax, 
int64_t tmin, int64_t tmax, int64_t thre, int64_t mode) { int64_t qoff, toff; if(mode == 1) {///forward extension @@ -14609,6 +15214,94 @@ int64_t estimate_err, overlap_region *aux_o) } +int64_t hc_aln_exz_non_retrieve_adv_hc(overlap_region *z, char* qstr, char* tstr, int64_t qs, int64_t qe, int64_t ts, int64_t te, int64_t mode, int64_t wl, +bit_extz_t *exz, int64_t q_tot, int64_t t_tot, double e_rate, int64_t maxl, int64_t maxe, int64_t force_l, int64_t estimate_err, overlap_region *aux_o) +{ + clear_align(*exz); exz->thre = 0; + if(((ts == -1) && (te == -1))) mode = 3;///set to semi-global + int64_t thre, ql = qe - qs, thre0, pthre = -1, full = 0; + if(ql == 0 && (te-ts) == 0) return 1; + if((ql <= 0) || (te-ts) <= 0) return 0; + if(estimate_err < 0) estimate_err = cal_estimate_err_hc(z, wl, qs, qe, ts, te, e_rate, &full); + + + + // if(ql <= 16) { + if(estimate_err == 0) { + if(full) { + // if(!cal_exact_exz(z, uref, hpc_g, rref, exz, qstr, tu, qs, qe, ts, te, &pts, &pte, q_tot, mode)) { + // fprintf(stderr, "[M::%s::ql::%ld::%c] xid::%d, yid::%d, qs::[%ld, %ld), ts::[%ld, %ld), mode::%ld, est_err::%ld, e_rate::%f, maxe::%ld\n", + // __func__, ql, "+-"[z->y_pos_strand], z->x_id, z->y_id, qs, qe, ts, te, mode, estimate_err, e_rate, maxe); + // exit(1); + // } + set_exact_exz(exz, qs, qe, ts, te); push_alnw(aux_o, exz); + return 1; + } else if(cal_exact_non_retrieve_exz(z, exz, qstr, tstr, qs, qe, ts, te, q_tot, t_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::0(+)\n", exz->err, exz->thre); + push_alnw(aux_o, exz); + return 1; + } + } + + if(ql <= maxl && (estimate_err>>1) <= maxe) { + thre = scale_ed_thre(estimate_err, maxe); + if(cal_exz_infi_non_retrieve_adv(z, exz, qstr, tstr, qs, qe, ts, te, thre, &pthre, q_tot, t_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, 
scale::%ld(+)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + + thre0 = thre; thre = ql*e_rate; thre = scale_ed_thre(thre, maxe); + if(thre > thre0) { + if(cal_exz_infi_non_retrieve_adv(z, exz, qstr, tstr, qs, qe, ts, te, thre, &pthre, q_tot, t_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(-)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + } + + thre0 = thre; thre <<= 1; thre = scale_ed_thre(thre, maxe); + if(thre > thre0) { + if(cal_exz_infi_non_retrieve_adv(z, exz, qstr, tstr, qs, qe, ts, te, thre, &pthre, q_tot, t_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(-)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + } + + thre0 = thre; thre = ql*0.51; thre = scale_ed_thre(thre, maxe); + if(thre > thre0) { + if(cal_exz_infi_non_retrieve_adv(z, exz, qstr, tstr, qs, qe, ts, te, thre, &pthre, q_tot, t_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(*)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + } + + if(ql <= force_l) { + thre = maxe; + if(cal_exz_infi_non_retrieve_adv(z, exz, qstr, tstr, qs, qe, ts, te, thre, &pthre, q_tot, t_tot, mode)) { + // ref_cigar_check(qstr, tu, uref, hpc_g, rref, z->y_id, z->y_pos_strand, exz); + // fprintf(stderr, ", err::%d, thre::%d, scale::%ld(*)\n", exz->err, exz->thre, thre); + push_alnw(aux_o, exz); + return 1; + } + } + } + // fprintf(stderr, ", err::%d, thre::%d\n", INT32_MAX, exz->thre); + // if(mode == 0) { + // fprintf(stderr, "[M::%s::] pstr::%.*s\n", __func__, (int32_t)tu->length, tu->seq); + // fprintf(stderr, "[M::%s::] tstr::%.*s\n", __func__, (int32_t)(qe-qs), qstr+qs); + // } + return 0; + +} + void 
prt_k_mer_hit(k_mer_hit *ch_a, int64_t ch_n) { int64_t k; @@ -15296,7 +15989,73 @@ int64_t ql, int64_t tl, double e_rate, int64_t h_khit, int64_t mode, int64_t rid os = MAX(qs, ws); oe = MIN(qe, we); ovlp = ((oe>os)? (oe-os):0); if(!ovlp) continue; if(!(z->w_list.a[k].clen)) { - gen_backtrace_adv_exz(&(z->w_list.a[k]), z, NULL, NULL, uref, qstr, tu->seq, exz, z->y_pos_strand, z->y_id); + gen_backtrace_adv_exz(&(z->w_list.a[k]), z, rref, hpc_g, uref, qstr, tu->seq, exz, z->y_pos_strand, z->y_id); + } + ez.a = z->w_list.c.a + z->w_list.a[k].cidx; + ez.n = ez.m = z->w_list.a[k].clen; + occ += extract_exact_cigar(&ez, z->w_list.a[k].y_start, z->w_list.a[k].x_start, ts, te, qs, qe, cl, 10, wl, h_khit); + } + ///global or backward + if(mode == 0 || mode == 2) push_khit(cl, qe-1, te-1, 0, 0, &w); + // fprintf(stderr, "[M::%s::] rcn::%ld, cl->length::%lld\n", __func__, rcn, cl->length); + if(!occ) { + cl->length = rcn; return 0; + } + ncn = cl->length; cl->length = rcn; + k_mer_hit *ch_a = cl->list + rcn; int64_t ch_n0 = ncn - rcn, ch_n; + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, h_khit, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + max_dis = MAX_SIN_L>>1; + // for (k = 0; k < ch_n0; k++) { + // assert(debug_k_mer_hit_retrive(&(ch_a[k]), hpc_g, rref, uref, qstr, tu, z->y_id, z->y_pos_strand)); + // } + ch_n = lchain_qdp_fix(ch_a, ch_n0, &(cl->chainDP), max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, + e_rate, ql, tl, 1, ((mode==0)||(mode==1))?1:0, ((mode==0)||(mode==2))?1:0); + for (k = occ = 0; k < ch_n; k++) { + ch_a[k] = ch_a[cl->chainDP.tmp[k]]; + if((ch_a[k].cnt&(0xffu))) occ++; + // assert(debug_k_mer_hit_retrive(&(ch_a[k]), hpc_g, rref, uref, qstr, tu, z->y_id, z->y_pos_strand)); + } + // fprintf(stderr, "[M::%s::] ch_n0::%ld, ch_n::%ld, mode::%ld, ql::%ld, tl::%ld, occ::%ld\n", + // __func__, ch_n0, ch_n, mode, qe-qs, te-ts, occ); + if(occ <= 0) return 
0; + ch_n = gen_single_khit(cl, ch_n, h_khit, mode, qs, qe, ts, te, max_skip, max_iter, rid); + return ch_n; +} + + +///[qs, qe) && [ts, te) +int64_t gen_win_non_retrieve_chain(overlap_region *z, Candidates_list *cl, int64_t qs, int64_t qe, int64_t ts, int64_t te, +int64_t wl, char* qstr, char* tstr, bit_extz_t *exz, int64_t ql, int64_t tl, double e_rate, int64_t h_khit, int64_t mode, int64_t rid, int64_t is_accurate) +{ + assert(mode < 3); + int64_t k, ws, we, os, oe, wsk, rcn = cl->length, ncn, occ = 0, ovlp, wn = z->w_list.n; asg16_v ez; uint32_t w = 1; + ws = qs; if(ws < z->x_pos_s) ws = z->x_pos_s; + we = qe-1; if(we > z->x_pos_e) we = z->x_pos_e; + wsk = get_win_id_by_s(z, ((ws/wl)*wl), wl, NULL); + // if(rid == 7) { + // fprintf(stderr, "[M::%s::]\tutg%.6u%c\t%u\t%u\t%c\tutg%.6u%c\t%u\t%u\tq::[%ld,%ld)\tw::[%ld,%ld]\twn::%ld\twsk::%ld\n", __func__, + // z->x_id+1, "lc"[uref->ug->u.a[z->x_id].circ], + // z->x_pos_s, z->x_pos_e+1, "+-"[z->y_pos_strand], + // z->y_id+1, "lc"[uref->ug->u.a[z->y_id].circ], + // z->y_pos_s, z->y_pos_e+1, qs, qe, ws, we, wn, wsk); + // } + // assert((ws>=z->w_list.a[wsk].x_start) && (ws<=z->w_list.a[wsk].x_end)); + // wek = get_win_id_by_e(z, ((we/wl)*wl), wl, NULL); + // assert((we>=z->w_list.a[wek].x_start) && (we<=z->w_list.a[wek].x_end)); + for(wsk=((wskz->w_list.a[wsk].x_end; wsk++); + for(wsk=((wsk=0 && qsw_list.a[wsk].x_start; wsk--); + if(wsk < 0) wsk = 0; ///qs >= z->w_list.a[wsk].x_start && qs <= z->w_list.a[wsk].x_end + ///global or forward + if(mode == 0 || mode == 1) push_khit(cl, qs, ts, 0, 0, &w); + //[ws, we] && [wsk, wek]; [qs, qe) && [ts, te) + for (k = wsk; kw_list.a[k].x_startw_list.a[k].y_end == -1) continue; + ws = z->w_list.a[k].x_start; we = z->w_list.a[k].x_end+1; + os = MAX(qs, ws); oe = MIN(qe, we); ovlp = ((oe>os)? 
(oe-os):0); + if(!ovlp) continue; + if(!(z->w_list.a[k].clen)) { + gen_backtrace_non_retrieve_adv_exz(&(z->w_list.a[k]), z, qstr, tstr, tl, exz, z->y_pos_strand, z->y_id); } ez.a = z->w_list.c.a + z->w_list.a[k].cidx; ez.n = ez.m = z->w_list.a[k].clen; @@ -15331,6 +16090,7 @@ int64_t ql, int64_t tl, double e_rate, int64_t h_khit, int64_t mode, int64_t rid } + void rechain_aln(overlap_region *z, Candidates_list *cl, overlap_region *aux_o, int64_t aux_i, int64_t wl, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, double e_rate, int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) @@ -15694,6 +16454,87 @@ bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, int64_t tl, u } } +void hc_ovlp_base_non_retrieve_direct(overlap_region *z, k_mer_hit *ch_a, int64_t ch_n, int64_t wl, char* qstr, char* tstr, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, int64_t tl, uint64_t rid) +{ + int64_t i, l, mode, q[2], t[2], qr, tr, is_done, zn; + + if(z->non_homopolymer_errors == 0 && z->w_list.n) { + zn = z->w_list.n; + for (i = 1; i < zn; i++) { + if((z->w_list.a[i].error == 0 && z->w_list.a[i-1].error == 0) && (z->w_list.a[i].x_start == z->w_list.a[i-1].x_end + 1) && + (z->w_list.a[i].y_end == (z->w_list.a[i-1].y_end + (z->w_list.a[i].x_end-z->w_list.a[i-1].x_end)))) { + continue; + } + break; + } + if(i >= zn) { + q[0] = z->w_list.a[0].x_start; q[1] = z->w_list.a[z->w_list.n-1].x_end; + t[1] = z->w_list.a[z->w_list.n-1].y_end; t[0] = z->w_list.a[0].y_end - (z->w_list.a[0].x_end-z->w_list.a[0].x_start); + + if(q[0] <= t[0]) { + t[0] -= q[0]; q[0] = 0; + } else { + q[0] -= t[0]; t[0] = 0; + } + + qr = ql-q[1]-1; tr = tl-t[1]-1; + if(qr <= tr) { + q[1] = ql-1; t[1] += qr; + } else { + t[1] = tl-1; q[1] += tr; + } + + if(q[0] == z->w_list.a[0].x_start && q[1] == z->w_list.a[z->w_list.n-1].x_end) { + // fprintf(stderr, "[M::%s::%u->%u::%c] ovlp::%u, w_list.n::%u\n", __func__, z->x_id, z->y_id+1, 
"+-"[z->y_pos_strand], z->x_pos_e+1-z->x_pos_s, (uint32_t)z->w_list.n); + set_exact_exz(exz, q[0], q[1] + 1, t[0], t[1] + 1); push_alnw(aux_o, exz); + return; + } + } + } + + for (l = -1, i = 0; i <= ch_n; i++) { + q[0] = q[1] = t[0] = t[1] = mode = -1; is_done = 0; + if(l >= 0) { + q[0] = ch_a[l].self_offset; t[0] = ch_a[l].offset; + } else { + q[0] = 0; + } + + if(i < ch_n) { + q[1] = ch_a[i].self_offset; t[1] = ch_a[i].offset; + } else { + q[1] = ql; + } + + if((t[0] != -1) && (t[1] != -1)) { + mode = 0;//global + } else if((t[0] != -1) && (t[1] == -1)) { + mode = 1;///forward extension + } else if((t[0] == -1) && (t[1] != -1)) { + mode = 2;///backward extension + } else { + mode = 3;///no primary hit within [ibeg, iend] + } + + // if(z->x_id == 57 && z->y_id == 2175) { + if(mode == 1 || mode == 2) adjust_ext_offset(&(q[0]), &(q[1]), &(t[0]), &(t[1]), ql, tl, 0, mode); + // fprintf(stderr, "#[M::%s::] utg%.6dl(%c), q::[%ld, %ld), t::[%ld, %ld), mode::%ld\n", + // __func__, (int32_t)z->y_id+1, "+-"[z->y_pos_strand], q[0], q[1], t[0], t[1], mode); + // } + is_done = hc_aln_exz_non_retrieve_adv_hc(z, qstr, tstr, q[0], q[1], t[0], t[1], mode, wl, exz, ql, tl, e_rate, MAX_SIN_L, MAX_SIN_E, FORCE_SIN_L, -1, aux_o); + + // if(z->x_id == 57 && z->y_id == 2175) { + // fprintf(stderr, "-is_done::%ld[M::%s::] utg%.6dl(%c), q::[%ld, %ld), t::[%ld, %ld), mode::%ld, ch_n::%ld\n", + // is_done, __func__, (int32_t)z->y_id+1, "+-"[z->y_pos_strand], q[0], q[1], t[0], t[1], mode, ch_n); + // } + + if(!is_done) {///postprocess + push_unmap_alnw(aux_o, q[0], q[1]-1, t[0], t[1]-1, mode); + } + l = i; + } +} + void cigar_gen_by_chain_adv_local(overlap_region *z, Candidates_list *cl, ul_ov_t *ov, int64_t on, uint64_t wl, const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, uint64_t rid, int64_t h_khit) { @@ -15741,33 +16582,104 @@ UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double 
e_rate, int64_t ql, if((i < aux_n) && (is_ualn_win(aux_o->w_list.a[i]))) continue; aux_o->w_list.a[m++] = aux_o->w_list.a[i]; } - aux_o->w_list.n = m; - radix_sort_window_list_xs_srt(aux_o->w_list.a, aux_o->w_list.a+aux_o->w_list.n); + aux_o->w_list.n = m; + radix_sort_window_list_xs_srt(aux_o->w_list.a, aux_o->w_list.a+aux_o->w_list.n); + } + + ///update z by aux_o + update_overlap_region(z, aux_o, ql, tl); + + // debug_overlap_region(aux_o, qstr, tu, uref, hpc_g, rref); + + + // ch_a = cl->list + ch_idx; //update + // for (i = 0; i < wn; i++) z->w_list.a[i].clen = 0;///clean cigar + // if(on > 1) { + // fprintf(stderr, "[M::%s::] rid::%lu, on::%ld\n", __func__, rid, on); + // } + // if(z->y_id == 126) prt_k_mer_hit(ch_a, ch_n); + // for (i = ch_i = 0; i < on; i++) { + // assert((i<=0)||(ov[i].qs > ov[i-1].qe)); + // ov[i].sec = 16;///do not know the aln type + // ch_i = sub_base_aln(z, dp, ch_a, ch_n, pe, ov[i].qs, ov[i].qe, wl, uref, hpc_g, rref, qstr, tu, exz, e_rate, ql, tl, ch_i, rid); + // pe = ov[i].qe; + // } +} + +void rechain_aln_hc(overlap_region *z, Candidates_list *cl, overlap_region *aux_o, int64_t aux_i, int64_t wl, +All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, double e_rate, int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) +{ + int64_t rcn = cl->length, ch_n, qs, qe, ts, te, mode, an0, an, todo; + k_mer_hit *ch_a; uint8_t q[2], t[2]; ///ul_ov_t idx; + ///[qs, qe) && [ts, te) + qs = aux_o->w_list.a[aux_i].x_start; qe = aux_o->w_list.a[aux_i].x_end+1; + ts = aux_o->w_list.a[aux_i].y_start; te = aux_o->w_list.a[aux_i].y_end+1; + if(qe - qs < FORCE_SIN_L || te - ts < FORCE_SIN_L) return; + mode = aux_o->w_list.a[aux_i].error_threshold; + ch_n = gen_win_chain(z, cl, qs, qe, ts, te, wl, NULL, NULL, rref, qstr, tu, exz, ql, tl, e_rate, h_khit, mode, rid, 1); + ch_a = cl->list + rcn; + if(ch_n) { + todo = 1; ///idx.ts = idx.te = (uint32_t)-1; idx.qs = 0; idx.qe = ql; + if(mode == 0) {//global + // idx.qn = 0; idx.tn = ch_n - 1; + // 
idx.qs = ch_a[idx.qn].self_offset; + // idx.ts = ch_a[idx.qn].offset; + // idx.qe = ch_a[idx.tn].self_offset; + // idx.te = ch_a[idx.tn].offset; + assert(ch_a[0].self_offset == qs && ch_a[0].offset == ts); + assert(ch_a[ch_n-1].self_offset == qe && ch_a[ch_n-1].offset == te); + if(ch_n <= 2) todo = 0; + } else if(mode == 1) {//forward ext + // idx.qn = 0; idx.tn = ch_n; + // idx.qs = ch_a[idx.qn].self_offset; + // idx.ts = ch_a[idx.qn].offset; + // idx.qe = ql; + assert(ch_a[0].self_offset == qs && ch_a[0].offset == ts); + if(ch_n <= 1) todo = 0; + } else if(mode == 2) {///backward ext + // idx.qn = (uint32_t)-1; idx.tn = ch_n-1; + // idx.qs = 0; + // idx.qe = ch_a[idx.tn].self_offset; + // idx.te = ch_a[idx.tn].offset; + assert(ch_a[ch_n-1].self_offset == qe && ch_a[ch_n-1].offset == te); + if(ch_n <= 1) todo = 0; + } + if(todo) { + an0 = aux_o->w_list.n; + // if(z->x_id == 29033 && z->y_id == 21307) { + // fprintf(stderr, "[M::%s]\tan0::%ld\tq::[%u,\t%u)\tt::[%u,\t%u)\tlw::%u\trw::%u\n", __func__, an0, + // idx.qs, idx.qe, idx.ts, idx.te, idx.qn, idx.tn); + // } + // ovlp_base_aln(z, ch_a, ch_n, &idx, wl, uref, hpc_g, rref, qstr, tu, exz, aux_o, e_rate, ql, tl, (uint64_t)-1); + hc_ovlp_base_direct(z, ch_a, ch_n, wl, rref, qstr, tu, exz, aux_o, e_rate, ql, tl, (uint64_t)-1); + an = aux_o->w_list.n; q[0] = q[1] = t[0] = t[1] = 0; todo = 0; + // if(z->x_id == 29033 && z->y_id == 21307) { + // fprintf(stderr, "[M::%s]\tan::%ld\n", __func__, an); + // } + // fprintf(stderr, "[M::%s::] awn0::%ld, awn::%lu\n", __func__, an0, an); + ///old unaligned window could be replaced by the new aligned window + if((an == (an0 + 1)) && (!(is_ualn_win(aux_o->w_list.a[an-1])))) { + if(aux_o->w_list.a[aux_i].x_start == aux_o->w_list.a[an-1].x_start) q[0] = 1; + if(aux_o->w_list.a[aux_i].x_end == aux_o->w_list.a[an-1].x_end) q[1] = 1; + if(aux_o->w_list.a[aux_i].y_start == aux_o->w_list.a[an-1].y_start) t[0] = 1; + if(aux_o->w_list.a[aux_i].y_end == aux_o->w_list.a[an-1].y_end) t[1] = 
1; + if((mode == 0) && q[0] && q[1] && t[0] && t[1]) todo = 1; + if((mode == 1) && q[0] && t[0]) todo = 1; + if((mode == 2) && q[1] && t[1]) todo = 1; + if(todo) { + aux_o->w_list.a[aux_i] = aux_o->w_list.a[an-1]; aux_o->w_list.n--; + } + } + // if(an > an0) {///should always > 0 as there are unmapped windows + // } + // aux_o->w_list.n = an0; + } } - - ///update z by aux_o - update_overlap_region(z, aux_o, ql, tl); - - // debug_overlap_region(aux_o, qstr, tu, uref, hpc_g, rref); - - - // ch_a = cl->list + ch_idx; //update - // for (i = 0; i < wn; i++) z->w_list.a[i].clen = 0;///clean cigar - // if(on > 1) { - // fprintf(stderr, "[M::%s::] rid::%lu, on::%ld\n", __func__, rid, on); - // } - // if(z->y_id == 126) prt_k_mer_hit(ch_a, ch_n); - // for (i = ch_i = 0; i < on; i++) { - // assert((i<=0)||(ov[i].qs > ov[i-1].qe)); - // ov[i].sec = 16;///do not know the aln type - // ch_i = sub_base_aln(z, dp, ch_a, ch_n, pe, ov[i].qs, ov[i].qe, wl, uref, hpc_g, rref, qstr, tu, exz, e_rate, ql, tl, ch_i, rid); - // pe = ov[i].qe; - // } + cl->length = rcn;///must reset!!!! 
} -void rechain_aln_hc(overlap_region *z, Candidates_list *cl, overlap_region *aux_o, int64_t aux_i, int64_t wl, -const ul_idx_t *uref, hpc_t *hpc_g, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, double e_rate, -int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) +void rechain_aln_non_retrieve_hc(overlap_region *z, Candidates_list *cl, overlap_region *aux_o, int64_t aux_i, int64_t wl, +char* qstr, char* tstr, bit_extz_t *exz, double e_rate, int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) { int64_t rcn = cl->length, ch_n, qs, qe, ts, te, mode, an0, an, todo; k_mer_hit *ch_a; uint8_t q[2], t[2]; ///ul_ov_t idx; @@ -15776,7 +16688,7 @@ int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) ts = aux_o->w_list.a[aux_i].y_start; te = aux_o->w_list.a[aux_i].y_end+1; if(qe - qs < FORCE_SIN_L || te - ts < FORCE_SIN_L) return; mode = aux_o->w_list.a[aux_i].error_threshold; - ch_n = gen_win_chain(z, cl, qs, qe, ts, te, wl, uref, hpc_g, rref, qstr, tu, exz, ql, tl, e_rate, h_khit, mode, rid, 1); + ch_n = gen_win_non_retrieve_chain(z, cl, qs, qe, ts, te, wl, qstr, tstr, exz, ql, tl, e_rate, h_khit, mode, rid, 1); ch_a = cl->list + rcn; if(ch_n) { todo = 1; ///idx.ts = idx.te = (uint32_t)-1; idx.qs = 0; idx.qe = ql; @@ -15811,7 +16723,7 @@ int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) // idx.qs, idx.qe, idx.ts, idx.te, idx.qn, idx.tn); // } // ovlp_base_aln(z, ch_a, ch_n, &idx, wl, uref, hpc_g, rref, qstr, tu, exz, aux_o, e_rate, ql, tl, (uint64_t)-1); - hc_ovlp_base_direct(z, ch_a, ch_n, wl, rref, qstr, tu, exz, aux_o, e_rate, ql, tl, (uint64_t)-1); + hc_ovlp_base_non_retrieve_direct(z, ch_a, ch_n, wl, qstr, tstr, exz, aux_o, e_rate, ql, tl, (uint64_t)-1); an = aux_o->w_list.n; q[0] = q[1] = t[0] = t[1] = 0; todo = 0; // if(z->x_id == 29033 && z->y_id == 21307) { // fprintf(stderr, "[M::%s]\tan::%ld\n", __func__, an); @@ -15838,7 +16750,7 @@ int64_t ql, int64_t tl, int64_t h_khit, int64_t rid) cl->length = rcn;///must reset!!!! 
} -uint64_t gen_hc_fast_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, uint64_t rid, int64_t h_khit) +uint64_t gen_hc_fast_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, uint64_t rid, int64_t h_khit, int64_t *re) { int64_t ch_idx = z->shared_seed, ch_n; int64_t i, tl, id = z->y_id, m, tot_e, aln; @@ -15869,7 +16781,7 @@ uint64_t gen_hc_fast_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, // aux_o->w_list.a[i].error, aux_o->w_list.a[i].clen, aux_o->w_list.a[i].error_threshold); // } //will overwrite ch_a; does not matter - rechain_aln_hc(z, cl, aux_o, i, wl, NULL, NULL, rref, qstr, tu, exz, e_rate, ql, tl, h_khit, rid); + rechain_aln_hc(z, cl, aux_o, i, wl, rref, qstr, tu, exz, e_rate, ql, tl, h_khit, rid); } if(((int64_t)aux_o->w_list.n) > aux_n) { @@ -15892,6 +16804,7 @@ uint64_t gen_hc_fast_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, tot_e += z->w_list.a[i].error; aln += z->w_list.a[i].x_end + 1 - z->w_list.a[i].x_start; } } + *re = tot_e; // fprintf(stderr, "[M::%s::%u->%u::%c] ovlp::%u, aln::%ld, tot_e::%ld, w_list.n::%u, ch_n::%ld\n", // __func__, z->x_id, z->y_id+1, "+-"[z->y_pos_strand], z->x_pos_e+1-z->x_pos_s, aln, tot_e, (uint32_t)z->w_list.n, ch_n); @@ -15915,6 +16828,82 @@ uint64_t gen_hc_fast_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, } +uint64_t gen_hc_fast_non_retrieve_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, char* qstr, int64_t ql, char* tstr, int64_t tl, bit_extz_t *exz, overlap_region *aux_o, double e_rate, uint64_t rid, int64_t h_khit) +{ + int64_t ch_idx = z->shared_seed, ch_n; + int64_t i, m, tot_e, aln; + k_mer_hit *ch_a = cl->list + ch_idx; + for (i = ch_idx; i < cl->length && cl->list[i].readID == cl->list[ch_idx].readID; i++); ch_n = 
i-ch_idx; + if(ch_n <= 0) return 0; + + + // fprintf(stderr, "[M::%s::rid->%ld] utg%.6dl(%c), z::[%u, %u)\n", + // __func__, rid, (int32_t)z->y_id+1, "+-"[z->y_pos_strand], z->x_pos_s, z->x_pos_e+1); + aux_o->w_list.n = aux_o->w_list.c.n = 0; + aux_o->y_id = z->y_id; aux_o->y_pos_strand = z->y_pos_strand; + aux_o->x_pos_s = z->x_pos_s; aux_o->x_pos_e = z->x_pos_e; + aux_o->y_pos_s = z->y_pos_s; aux_o->y_pos_e = z->y_pos_e; + + hc_ovlp_base_non_retrieve_direct(z, ch_a, ch_n, wl, qstr, tstr, exz, aux_o, e_rate, ql, tl, rid); + + + int64_t aux_n = aux_o->w_list.n; + for (i = 0; i < aux_n; i++) { + if(!(is_ualn_win(aux_o->w_list.a[i]))) continue; + // if((aux_o->w_list.a[i].x_end+1-aux_o->w_list.a[i].x_start) <= FORCE_CNS_L) { + // fprintf(stderr, "[aln::-i->%ld::ql->%d] q::[%d, %d), t::[%d, %d), err::%d, clen::%u, mode::%d\n", i, + // aux_o->w_list.a[i].x_end+1-aux_o->w_list.a[i].x_start, + // aux_o->w_list.a[i].x_start, aux_o->w_list.a[i].x_end+1, + // aux_o->w_list.a[i].y_start, aux_o->w_list.a[i].y_end+1, + // aux_o->w_list.a[i].error, aux_o->w_list.a[i].clen, aux_o->w_list.a[i].error_threshold); + // } + //will overwrite ch_a; does not matter + rechain_aln_non_retrieve_hc(z, cl, aux_o, i, wl, qstr, tstr, exz, e_rate, ql, tl, h_khit, rid); + } + + if(((int64_t)aux_o->w_list.n) > aux_n) { + for (i = m = 0; i < ((int64_t)aux_o->w_list.n); i++) { + if((i < aux_n) && (is_ualn_win(aux_o->w_list.a[i]))) continue; + aux_o->w_list.a[m++] = aux_o->w_list.a[i]; + } + aux_o->w_list.n = m; + radix_sort_window_list_xs_srt(aux_o->w_list.a, aux_o->w_list.a+aux_o->w_list.n); + } + + ///update z by aux_o + update_overlap_region(z, aux_o, ql, tl); + + aux_n = z->w_list.n; + for (i = tot_e = aln = 0; i < aux_n; i++) { + if(is_ualn_win(z->w_list.a[i])) { + tot_e += z->w_list.a[i].x_end + 1 - z->w_list.a[i].x_start; + } else { + tot_e += z->w_list.a[i].error; aln += z->w_list.a[i].x_end + 1 - z->w_list.a[i].x_start; + } + } + // fprintf(stderr, "[M::%s::%u->%u::%c] ovlp::%u, aln::%ld, 
tot_e::%ld, w_list.n::%u, ch_n::%ld\n", + // __func__, z->x_id, z->y_id+1, "+-"[z->y_pos_strand], z->x_pos_e+1-z->x_pos_s, aln, tot_e, (uint32_t)z->w_list.n, ch_n); + + // debug_overlap_region(aux_o, qstr, tu, NULL, NULL, rref); + + + // ch_a = cl->list + ch_idx; //update + // for (i = 0; i < wn; i++) z->w_list.a[i].clen = 0;///clean cigar + // if(on > 1) { + // fprintf(stderr, "[M::%s::] rid::%lu, on::%ld\n", __func__, rid, on); + // } + // if(z->y_id == 126) prt_k_mer_hit(ch_a, ch_n); + // for (i = ch_i = 0; i < on; i++) { + // assert((i<=0)||(ov[i].qs > ov[i-1].qe)); + // ov[i].sec = 16;///do not know the aln type + // ch_i = sub_base_aln(z, dp, ch_a, ch_n, pe, ov[i].qs, ov[i].qe, wl, uref, hpc_g, rref, qstr, tu, exz, e_rate, ql, tl, ch_i, rid); + // pe = ov[i].qe; + // } + + + return 1; +} + #define gen_err_unaligned(xl, yl) (((xl)<=FORCE_SIN_L)?(MAX((xl), (yl))):MAX((MIN((xl), (yl))), ((xl*0.51)+1))) @@ -16452,7 +17441,7 @@ int64_t extract_sub_cigar_err_rr(overlap_region *z, int64_t s, int64_t e, ul_ov_ } ///[s, e) -int64_t extract_sub_cigar_hc(overlap_region *z, All_reads *rref, haplotype_evdience_alloc* hp, char* qstr, UC_Read* tu, int64_t s, int64_t e, ul_ov_t *p, int64_t set_f, uint8_t *f, uint8_t occ_thres) +int64_t extract_sub_cigar_hc(overlap_region *z, All_reads *rref, haplotype_evdience_alloc* hp, char* qstr, UC_Read* tu, int64_t s, int64_t e, ul_ov_t *p, int64_t set_f, uint8_t *f, uint8_t occ_thres/**, uint8_t is_dbg**/) { int64_t wk = ovlp_cur_wid(*p), xk = ovlp_cur_xoff(*p), yk = ovlp_cur_yoff(*p), ck = ovlp_cur_coff(*p), os, oe, t; bit_extz_t ez; int64_t bd = ovlp_bd(*p), s0, e0; char *ystr; @@ -16484,9 +17473,11 @@ int64_t extract_sub_cigar_hc(overlap_region *z, All_reads *rref, haplotype_evdie } else { assert(xk0 == xk); assert(yk0 == yk); assert(ck0 == ck); UC_Read_resize(*tu, ovlp_cur_ylen(*p)); ystr = tu->seq; + // if(is_dbg) fprintf(stderr, "[M::%s] set_f::%ld, yk0::%ld, ovlp_cur_ylen::%u, ylen::%lu\n", __func__, set_f, yk0, 
ovlp_cur_ylen(*p), Get_READ_LENGTH((*rref), z->y_id)); recover_UC_Read_sub_region(ystr, yk0, ovlp_cur_ylen(*p), z->y_pos_strand, rref, z->y_id); } + // if(is_dbg) fprintf(stderr, "---0---[M::%s] set_f::%ld, yk::%ld\n", __func__, set_f, yk); //some cigar will span s or e while (ck < cn && xk < e) {//[s, e) ws = xk; @@ -16522,6 +17513,7 @@ int64_t extract_sub_cigar_hc(overlap_region *z, All_reads *rref, haplotype_evdie } } } + // if(is_dbg) fprintf(stderr, "---1---[M::%s] set_f::%ld, yk::%ld\n", __func__, set_f, yk); if(set_f) { ovlp_cur_xoff(*p) = xk0; ovlp_cur_yoff(*p) = yk0; ovlp_cur_coff(*p) = ck0; ovlp_cur_ylen(*p) = yk - yk0; @@ -16650,7 +17642,7 @@ uint64_t gen_region_phase_robust_rr(overlap_region* ol, uint64_t *id_a, uint64_t return id_n; } -uint64_t hc_phase_robust_rr(overlap_region* ol, All_reads *rref, haplotype_evdience_alloc* hp, char* qstr, UC_Read* tu, uint64_t *id_a, uint64_t id_n, uint64_t s, uint64_t e, ul_ov_t *c_idx, int64_t set_f, uint8_t occ_thres) +uint64_t hc_phase_robust_rr(overlap_region* ol, All_reads *rref, haplotype_evdience_alloc* hp, char* qstr, UC_Read* tu, uint64_t *id_a, uint64_t id_n, uint64_t s, uint64_t e, ul_ov_t *c_idx, int64_t set_f, uint8_t occ_thres/**, uint8_t is_dbg**/) { uint64_t k, q[2], rr = 0, os, oe; ul_ov_t *p; overlap_region *z; for (k = 0; k < id_n; k++) { @@ -16660,7 +17652,8 @@ uint64_t hc_phase_robust_rr(overlap_region* ol, All_reads *rref, haplotype_evdie if(q[1] <= e) rr = 1; os = MAX(q[0], s); oe = MIN(q[1], e); if(oe > os) { - extract_sub_cigar_hc(z, rref, hp, qstr, tu, os, oe, p, set_f, hp->flag + os - s, occ_thres); + // if(is_dbg) fprintf(stderr, "[M::%s]\ttn::%u\t%c\to::[%lu,\t%lu)\n", __func__, z->y_id, "+-"[z->y_pos_strand], os, oe); + extract_sub_cigar_hc(z, rref, hp, qstr, tu, os, oe, p, set_f, hp->flag + os - s, occ_thres/**, is_dbg**/); } } return rr; @@ -17156,7 +18149,7 @@ void debug_snp_site(overlap_region* ol, All_reads *rref, UC_Read *qu, haplotype_ } -void rphase_hc(overlap_region_alloc* ol, 
All_reads *rref, haplotype_evdience_alloc* hp, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, int64_t wl, int64_t ql, uint8_t occ_thres) +void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_alloc* hp, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, int64_t wl, int64_t ql, uint8_t occ_thres/**, uint8_t is_dbg**/, uint64_t rid) { int64_t on = ol->length, k, i, zwn, q[2]; uint64_t m, l0, wi, wl0, si, ei, fi; overlap_region *z; ul_ov_t *cp; @@ -17186,6 +18179,10 @@ void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_all ovlp_cur_coff(*cp) = 0; ///cur cigar off in cur window ovlp_bd(*cp) = bd; } + // if(rid == 19350) { + // fprintf(stderr, "#########[M::%s] tid::%u\t%.*s\twid::%ld\tq::[%u, %u)\terr::%d\toerr::%u#########\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), + // i, z->w_list.a[i].x_start, z->w_list.a[i].x_end+1, z->w_list.a[i].error, z->non_homopolymer_errors); + // } } } @@ -17209,9 +18206,10 @@ void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_all } ResizeInitHaplotypeEvdience(hp); - ///fprintf(stderr, "[M::%s] ******\n", __func__); + // if(is_dbg) fprintf(stderr, "[M::%s] ******\n", __func__); i = 0; s = 0; e = wl; e = ((e<=ql)?e:ql); rr = 0; for (; s < ql; ) { + // if(is_dbg) fprintf(stderr, "-0-[M::%s]\ts::%ld\te::%ld\n", __func__, s, e); if(rr) { // rr = 0; for (m = rm_n = srt_n; m < idx->n; m++) { @@ -17242,7 +18240,8 @@ void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_all // fprintf(stderr, "[M::%s] s::%ld, e::%ld, srt_n::%ld, idx->n::%ld\n", __func__, s, e, srt_n, (int64_t)idx->n); // debug_inter(ol, c_idx, idx->a, srt_n, idx->a + srt_n, idx->n - srt_n, s, e); l0 = hp->length; - rr = hc_phase_robust_rr(ol->list, rref, hp, qu->seq, tu, idx->a + srt_n, idx->n - srt_n, s, e, c_idx->a, 1, occ_thres); + // if(is_dbg) fprintf(stderr, 
"-1-[M::%s]\ts::%ld\te::%ld\n", __func__, s, e); + rr = hc_phase_robust_rr(ol->list, rref, hp, qu->seq, tu, idx->a + srt_n, idx->n - srt_n, s, e, c_idx->a, 1, occ_thres/**, is_dbg**/); for (wi = fi = ei = 0, si = ((uint64_t)-1), wl0 = e - s; wi < wl0; wi++) { if(hp->flag[wi] > 0) { if(hp->flag[wi] > occ_thres) { @@ -17253,12 +18252,13 @@ void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_all } if(fi) { - rr = hc_phase_robust_rr(ol->list, rref, hp, qu->seq, tu, idx->a + srt_n, idx->n - srt_n, s, e, c_idx->a, 0, occ_thres); + // if(is_dbg) fprintf(stderr, "-2-[M::%s]\ts::%ld\te::%ld\n", __func__, s, e); + rr = hc_phase_robust_rr(ol->list, rref, hp, qu->seq, tu, idx->a + srt_n, idx->n - srt_n, s, e, c_idx->a, 0, occ_thres/**, is_dbg**/); if(hp->length > l0) radix_sort_haplotype_evdience_srt(hp->list + l0, hp->list + hp->length); } if(ei > si) memset(hp->flag + si, 0, (ei-si)*sizeof((*(hp->flag)))); - + // if(is_dbg) fprintf(stderr, "-3-[M::%s]\ts::%ld\te::%ld\n", __func__, s, e); s += wl; e += wl; e = ((e<=ql)?e:ql); } @@ -22109,10 +23109,17 @@ void ul_rid_lalign_adv(overlap_region_alloc* ol, Candidates_list *cl, const ul_i } -uint64_t gen_hc_fast_cigar(overlap_region *z, Candidates_list *cl, All_reads *rref, int64_t wl, char *qstr, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, int64_t rid, int64_t khit) +uint64_t gen_hc_fast_cigar(overlap_region *z, Candidates_list *cl, All_reads *rref, int64_t wl, char *qstr, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, int64_t rid, int64_t khit, int64_t *re) +{ + return_t_chain(z, cl); + gen_hc_fast_cigar0(z, cl, wl, rref, qstr, tu, exz, aux_o, e_rate, ql, rid, khit, re); + return 1; +} + +uint64_t gen_hc_fast_non_retrieve_cigar(overlap_region *z, Candidates_list *cl, int64_t wl, char *qstr, int64_t ql, char *tstr, int64_t tl, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t rid, int64_t khit) { return_t_chain(z, cl); - 
gen_hc_fast_cigar0(z, cl, wl, rref, qstr, tu, exz, aux_o, e_rate, ql, rid, khit); + gen_hc_fast_non_retrieve_cigar0(z, cl, wl, qstr, ql, tstr, tl, exz, aux_o, e_rate, rid, khit); return 1; } @@ -22132,8 +23139,10 @@ void append_cigar(window_list *idx, window_list_alloc *res, uint16_t c, uint32_t idx->clen = res->c.n-idx->cidx; } -uint16_t adjust_gap(window_list *idx, window_list_alloc *res, char *pstr, char *tstr, int64_t pi, int64_t ti, uint16_t op0, asg16_v* buf) +uint16_t adjust_gap(window_list *idx, window_list_alloc *res, char *pstr, char *tstr, int64_t pi, int64_t ti, uint16_t op0, asg16_v* buf, int64_t *rd_err) { + (*rd_err) = 0; + if(idx->clen == 0) { append_cigar(idx, res, op0, 1); return 0;///no move @@ -22145,31 +23154,62 @@ uint16_t adjust_gap(window_list *idx, window_list_alloc *res, char *pstr, char * pi--; } + // if(z->y_id == 3199 && z->x_id == 3196) { + // fprintf(stderr, "\n[M::%s]\tqi::%ld\tti::%ld\n", __func__, ti, pi); + // } + uint16_t *ca = res->c.a+idx->cidx; int64_t ci = idx->clen; int64_t op, cl, k, l[2]; uint16_t p, ff; for (ci--, buf->n = ff = 0; ci >= 0; ci--) { op = ca[ci]>>14; cl = (ca[ci]&(0x3fff)); + // if(z->y_id == 3199 && z->x_id == 3196) { + // fprintf(stderr, "[M::%s]\tqi::%ld\tti::%ld\tci::%ld\tcl::%ld\top::%ld\n", __func__, ti, pi, ci, cl, op); + // } if(op == 2 || op == 3) { p = op0; p <<= 14; p += 1; kv_push(uint16_t, *buf, p); l[0] = cl; p = op; p <<= 14; p += l[0]; kv_push(uint16_t, *buf, p); break; - } - for (k = cl-1, l[0] = l[1] = 0; k >= 0; k--, pi--, ti--) { - if((op == 0) && (pstr[pi] != (tstr[ti]))) break; - if(op == 1) assert(pstr[pi] != (tstr[ti])); - } - l[1] = cl - k - 1; l[0] = k + 1; - if(l[1] > 0) { - p = op; p <<= 14; p += l[1]; kv_push(uint16_t, *buf, p); ff = 1; - } + } else if(op == 0) { + for (k = cl-1, l[0] = l[1] = 0; k >= 0; k--, pi--, ti--) { + if(pstr[pi] != (tstr[ti])) break; + // if(op == 1) { + // if(!(pstr[pi] != (tstr[ti]))) { + // fprintf(stderr, "[M::%s] xid::%u, yid::%u, qi::%ld, 
ti::%ld\n", __func__, z->x_id, z->y_id, ti, pi); + // } + // assert(pstr[pi] != (tstr[ti])); + // } + } + l[1] = cl - k - 1; l[0] = k + 1; + if(l[1] > 0) { + p = op; p <<= 14; p += l[1]; kv_push(uint16_t, *buf, p); ff = 1; + } - if(l[0] > 0) { - p = op0; p <<= 14; p += 1; kv_push(uint16_t, *buf, p); - } + if(l[0] > 0) { + p = op0; p <<= 14; p += 1; kv_push(uint16_t, *buf, p); + } - if(l[0] > 0) { - p = op; p <<= 14; p += l[0]; kv_push(uint16_t, *buf, p); + if(l[0] > 0) { + p = op; p <<= 14; p += l[0]; kv_push(uint16_t, *buf, p); + } + } else {///op == 1; it is possible since cigar is not optimal + for (k = cl-1, l[0] = cl, l[1] = 0; k >= 0; k--, pi--, ti--) { + if(pstr[pi] == (tstr[ti])) { + l[1] = l[0] - k - 1; + l[0] = k; + + if(l[1] > 0) { + p = op; p <<= 14; p += l[1]; kv_push(uint16_t, *buf, p); ff = 1;///push unmatch + } + p = 0; p <<= 14; p += 1; kv_push(uint16_t, *buf, p);///push match + (*rd_err)++; + } + } + + if(l[0] > 0) { + p = op; p <<= 14; p += l[0]; kv_push(uint16_t, *buf, p); + } + l[0] = 0; } // fprintf(stderr, "[M::%s] ci::%ld, l[0]::%ld, l[1]::%ld\n", __func__, ci, l[0], l[1]); @@ -22224,9 +23264,9 @@ uint16_t ajust_end_cigar(window_list *idx, window_list_alloc *res) return rr; } -uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_reads *rref, char *tseq, UC_Read *pu, asg16_v* buf) +uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_reads *rref, char *tstr, char *pstr, UC_Read *pu, asg16_v* buf, int64_t *tot_re) { - // if(z->y_id == 24 && z->x_id == 25) { + // if(z->y_id == 3199 && z->x_id == 3196) { // fprintf(stderr, "\n[M::%s] x_id::%u, y_id::%u, x::[%u, %u), y::[%u, %u)\n", __func__, z->x_id, z->y_id, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); // fprintf(stderr, "[M::%s] wid::%u, x::[%d, %d), y::[%d, %d), err::%d\n", __func__, wid, z->w_list.a[wid].x_start, z->w_list.a[wid].x_end + 1, z->w_list.a[wid].y_start, z->w_list.a[wid].y_end + 1, z->w_list.a[wid].error); // } @@ 
-22235,7 +23275,7 @@ uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_rea return 0;///no move } - window_list *p = NULL; + window_list *p = NULL; char *tseq = tstr, *pseq = pstr; bit_extz_t ez; set_bit_extz_t(ez, (*z), wid); kv_pushp(window_list, aux->w_list, &p); p->x_start = ez.ts; p->x_end = ez.te; @@ -22248,7 +23288,28 @@ uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_rea return 0;///no move } - int64_t pi = ez.ps, ti = ez.ts/**, err = 0**/, cl, op, k, rr = 0; uint64_t ci = 0, mm = 0; char *pseq = NULL; + int64_t pi = ez.ps, ti = ez.ts/**, err = 0**/, cl, op, k, rr = 0, re; uint64_t ci = 0, mm = 0; + // char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + // if(z->y_id == 3199 && z->x_id == 3196) { + // fprintf(stderr, "******\n"); + // for (ci = 0; ci < ez.cigar.n; ci++) { + // op = ez.cigar.a[ci]>>14; cl = (ez.cigar.a[ci]&(0x3fff)); + // fprintf(stderr, "%ld%c(q::[%ld,%ld))(t::[%ld,%ld))(ck::%ld)\n", cl, cm[op], ti, ti + (((op<2)||(op==3))?(cl):(0)), pi, pi + (((op<2)||(op==2))?(cl):(0)), ci); + + // if(op < 2) { + // pi+=cl; ti+=cl; + // } else { + // if(op == 2) {///more p -> y + // pi+=cl; + // } else if(op == 3) {///more t -> x + // ti+=cl; + // } + // } + // } + // pi = ez.ps; ti = ez.ts; + // fprintf(stderr, "******\n"); + // } + for (ci = mm = 0; ci < ez.cigar.n; ci++) { op = ez.cigar.a[ci]>>14; ///cl = (ez.cigar.a[ci]&(0x3fff)); if(op < 2) { @@ -22264,12 +23325,15 @@ uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_rea return rr; } - if(z->y_pos_strand) { - recover_UC_Read_RC(pu, rref, aux->y_id); - } else { - recover_UC_Read(pu, rref, aux->y_id); + if(!pseq) { + if(z->y_pos_strand) { + recover_UC_Read_RC(pu, rref, aux->y_id); + } else { + recover_UC_Read(pu, rref, aux->y_id); + } + pseq = pu->seq; } - pseq = pu->seq; + kv_resize(uint16_t, aux->w_list.c, (aux->w_list.c.n + ez.cigar.n)); for (ci = mm = 0; ci < ez.cigar.n; ci++) { @@ -22278,7 +23342,10 @@ 
uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_rea // fprintf(stderr, "[M::%s] op::%ld, cl::%ld, pi::%ld, ti::%ld\n", __func__, op, cl, pi, ti); // fprintf(stderr, "[M::%s] p->cidx::%lu, p->clen::%lu, cc->n::%lu\n", __func__, (uint64_t)p->cidx, (uint64_t)p->clen, (uint64_t)aux->w_list.c.n); // } - + // if(z->y_id == 3199 && z->x_id == 3196) { + // fprintf(stderr, "%ld%c(q::[%ld,%ld))(t::[%ld,%ld))(ck::%ld)\tmm::%lu\n", cl, cm[op], ti, ti + (((op<2)||(op==3))?(cl):(0)), pi, pi + (((op<2)||(op==2))?(cl):(0)), ci, mm); + // } + if(op < 2) { append_cigar(p, &(aux->w_list), op, cl); pi+=cl; ti+=cl; mm = 1; @@ -22292,7 +23359,12 @@ uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_rea } } else { for (k = 0; k < cl; k++) { - if(adjust_gap(p, &(aux->w_list), pseq, tseq, pi, ti, op, buf)) rr = 1; + // if(z->y_id == 3199 && z->x_id == 3196) { + // fprintf(stderr, "+++k::%ld\tqi::%ld\tti::%ld\n", k, ti, pi); + // } + if(adjust_gap(p, &(aux->w_list), pseq, tseq, pi, ti, op, buf, &re)) { + rr = 1; p->error -= re; (*tot_re) += re; + } if(op == 2) {///more p -> y pi++; } else if(op == 3) {///more t -> x @@ -22309,7 +23381,7 @@ uint16_t move_wins(overlap_region *z, uint32_t wid, overlap_region *aux, All_rea return rr; } -void reassign_gaps(overlap_region *z, overlap_region *aux, All_reads *rref, UC_Read* qu, UC_Read* tu, int64_t ql, asg16_v* buf) +void reassign_gaps(overlap_region *z, overlap_region *aux, char* qstr, int64_t ql, char* tstr, int64_t tl, All_reads *rref, UC_Read* tu, asg16_v* buf) { // if(z->y_id != 24 || z->x_id != 25) return; if(z->non_homopolymer_errors == 0) return; @@ -22318,29 +23390,38 @@ void reassign_gaps(overlap_region *z, overlap_region *aux, All_reads *rref, UC_R aux->x_pos_s = z->x_pos_s; aux->x_pos_e = z->x_pos_e; aux->y_pos_s = z->y_pos_s; aux->y_pos_e = z->y_pos_e; - int64_t k, z_n = z->w_list.n, rr = 0; + int64_t k, z_n = z->w_list.n, rr = 0, re = 0; for (k = 0; k < z_n; k++) { - 
if(move_wins(z, k, aux, rref, qu->seq, tu, buf)) rr = 1; + if(move_wins(z, k, aux, rref, qstr, tstr, tu, buf, &re)) rr = 1; } ///update z by aux_o - if(rr) update_overlap_region(z, aux, ql, Get_READ_LENGTH((*rref), z->y_id)); - - - // fprintf(stderr, "[M::%s] rr::%ld\n", __func__, rr); - ///debug - // bit_extz_t ez; - // if(z->y_pos_strand) { - // recover_UC_Read_RC(tu, rref, z->y_id); - // } else { - // recover_UC_Read(tu, rref, z->y_id); - // } - // for (k = 0; k < z_n; k++) { - // if(is_ualn_win((z->w_list.a[k]))) continue; - // set_bit_extz_t(ez, (*z), k); - // if(!cigar_check(tu->seq, qu->seq, &ez)) { - // fprintf(stderr, "\n[M::%s] x_id::%u, y_id::%u, x::[%u, %u), y::[%u, %u)\n", __func__, z->x_id, z->y_id, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); - // exit(1); - // } + if(rr) update_overlap_region(z, aux, ql, ((rref)?(Get_READ_LENGTH((*rref), z->y_id)):(tl))); + z->non_homopolymer_errors -= re; + + // if(z->y_id == 1) { + // fprintf(stderr, "[M::%s] rr::%ld\tx_id::%u\ty_id::%u\tx::[%u, %u)\ty::[%u, %u)\tz_n::%ld\n", + // __func__, rr, z->x_id, z->y_id, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1, z_n); + // fprintf(stderr, "qstr(%lld)::%.*s\n", qu->length, (int32_t)(qu->length), qu->seq); + // fprintf(stderr, "tstr(%lld)::%.*s\n", tu->length, (int32_t)(tu->length), tu->seq); + // ///debug + + // if(z->y_pos_strand) { + // recover_UC_Read_RC(tu, rref, z->y_id); + // } else { + // recover_UC_Read(tu, rref, z->y_id); + // } + // if(tstr) { + // bit_extz_t ez; + // for (k = 0; k < z_n; k++) { + // if(is_ualn_win((z->w_list.a[k]))) continue; + // set_bit_extz_t(ez, (*z), k); + // if(!cigar_check(tstr, qstr, &ez)) { + // fprintf(stderr, "\n[M::%s] x_id::%u, y_id::%u, x::[%u, %u), y::[%u, %u)\n", __func__, z->x_id, z->y_id, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); + // exit(1); + // } + // } + // } + // } } @@ -22350,6 +23431,7 @@ void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rre 
overlap_region t; overlap_region *z; //asg64_v iidx, buf, buf1; ol->mapped_overlaps_length = 0; if(ol->length <= 0) return; + // if(ol->length && ol->list[0].x_id == 19350) e_rate = 0.1; ///base alignment err = e_rate; e_max = err * 1.5; @@ -22360,29 +23442,134 @@ void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rre for (i = k = 0; i < ol->length; i++) { z = &(ol->list[i]); z->shared_seed = z->non_homopolymer_errors;///for index + + // if(z->x_id == 19350 && z->y_id == 19324) fprintf(stderr, "-z-[M::%s] tid::%u\t%.*s\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id)); if(!align_hc_ed_post_extz(z, rref, qu->seq, tu->seq, exz, err, w.window_length, OVERLAP_THRESHOLD_HIFI_FILTER, 0, NULL)) continue; + // if(z->x_id == 19350 && z->y_id == 19324) fprintf(stderr, "-m-[M::%s] tid::%u\t%.*s\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id)); + rr = gen_extend_err_exz(z, NULL, NULL, rref, qu->seq, tu->seq, exz, NULL, w.window_length, -1, err, (e_max+0.000001), THRESHOLD_MAX_SIZE, 0, &re); z->is_match = 0; + + // if(z->x_id == 19350 && z->y_id == 19324) fprintf(stderr, "-0-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re); + if (rr > err) continue; z->non_homopolymer_errors = re; - if(!gen_hc_fast_cigar(z, cl, rref, w.window_length, qu->seq, tu, exz, aux_o, e_rate, ql, rid, khit)) continue; + if(!gen_hc_fast_cigar(z, cl, rref, w.window_length, qu->seq, tu, exz, aux_o, e_rate, ql, rid, khit, &re)) continue; + + // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re); + + reassign_gaps(z, aux_o, qu->seq, ql, NULL, -1, rref, tu, buf); + + // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-2-[M::%s] tid::%u\t%.*s\trr::%f\tre::%u\n", __func__, z->y_id, 
(int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, z->non_homopolymer_errors); + + // if(z->x_id == 19350) fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\trr::%f\terr::%u\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, z->non_homopolymer_errors); + + if(k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + z = &(ol->list[k++]); z->is_match = 1; ///z->non_homopolymer_errors = re; + z->strong = z->without_large_indel = 0; + } + ol->length = k; + if(ol->length <= 0) return; +} + + +void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf) +{ + uint64_t i, bs, k, ql = qu->length; Window_Pool w; double err, e_max, rr; int64_t re; + overlap_region t; overlap_region *z; //asg64_v iidx, buf, buf1; + ol->mapped_overlaps_length = 0; + if(ol->length <= 0) return; + // if(ol->length && ol->list[0].x_id == 19350) e_rate = 0.1; + + ///base alignment + err = e_rate; e_max = err * 1.5; + init_Window_Pool(&w, ql, wl, (int)(1.0/err)); + bs = (w.window_length)+(THRESHOLD_MAX_SIZE<<1)+1; + resize_UC_Read(tu, bs<<1); + // fprintf(stderr, "[M::%s] window_length::%lld\n", __func__, w.window_length); + + for (i = k = 0; i < ol->length; i++) { + z = &(ol->list[i]); + if(z->is_match != 1) { + z->shared_seed = z->non_homopolymer_errors;///for index + + // if(z->x_id == 19350 && z->y_id == 19324) fprintf(stderr, "-z-[M::%s] tid::%u\t%.*s\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id)); + + if(!align_hc_ed_post_extz(z, rref, qu->seq, tu->seq, exz, err, w.window_length, OVERLAP_THRESHOLD_HIFI_FILTER, 0, NULL)) continue; + + // if(z->x_id == 19350 && z->y_id == 19324) fprintf(stderr, "-m-[M::%s] tid::%u\t%.*s\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id)); + 
+ rr = gen_extend_err_exz(z, NULL, NULL, rref, qu->seq, tu->seq, exz, NULL, w.window_length, -1, err, (e_max+0.000001), THRESHOLD_MAX_SIZE, 0, &re); + z->is_match = 0; + + // if(z->x_id == 19350 && z->y_id == 19324) fprintf(stderr, "-0-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re); - reassign_gaps(z, aux_o, rref, qu, tu, ql, buf); + if (rr > err) continue; + z->non_homopolymer_errors = re; + + if(!gen_hc_fast_cigar(z, cl, rref, w.window_length, qu->seq, tu, exz, aux_o, e_rate, ql, rid, khit, &re)) continue; + + // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re); + + reassign_gaps(z, aux_o, qu->seq, ql, NULL, -1, rref, tu, buf); + } + + // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-2-[M::%s] tid::%u\t%.*s\trr::%f\tre::%u\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, z->non_homopolymer_errors); + + // if(z->x_id == 19350) fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\trr::%f\terr::%u\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, z->non_homopolymer_errors); if(k != i) { t = ol->list[k]; ol->list[k] = ol->list[i]; ol->list[i] = t; } - z = &(ol->list[k++]); z->is_match = 1; z->non_homopolymer_errors = re; + z = &(ol->list[k++]); z->is_match = 1; ///z->non_homopolymer_errors = re; + z->strong = z->without_large_indel = 0; } ol->length = k; if(ol->length <= 0) return; } + +uint64_t gen_hc_r_alin_re(overlap_region* z, Candidates_list *cl, char* qstr, uint64_t ql, char* tstr, uint64_t tl, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf) +{ + double err, e_max, rr; int64_t re; + + err = e_rate; e_max = err * 1.5; + + z->shared_seed = z->non_homopolymer_errors;///for index 
+ + if(!align_hc_ed_post_non_retrieve_extz(z, qstr, ql, tstr, tl, exz, err, wl, OVERLAP_THRESHOLD_HIFI_FILTER, 0, NULL)) return 0; + + rr = gen_extend_err_non_retrieve_exz(z, qstr, tstr, tl, exz, NULL, wl, -1, err, (e_max+0.000001), THRESHOLD_MAX_SIZE, 0, &re); + z->is_match = 0; + + if (rr > err) return 0; + z->non_homopolymer_errors = re; + + // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-0-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re); + + if(!gen_hc_fast_non_retrieve_cigar(z, cl, wl, qstr, ql, tstr, tl, exz, aux_o, e_rate, rid, khit)) return 0; + + // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re); + + reassign_gaps(z, aux_o, qstr, ql, tstr, tl, NULL, NULL, buf); + + // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-2-[M::%s] tid::%u\t%.*s\trr::%f\tre::%u\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, z->non_homopolymer_errors); + + z->is_match = 1; ///z->non_homopolymer_errors = re; + z->strong = z->without_large_indel = 0; + + return 1; +} + /** void ul_raw_lalign_adv(overlap_region_alloc* ol, Candidates_list *cl, const ul_idx_t *uref, All_reads *rdb, const ug_opt_t *uopt, char *qstr, uint64_t ql, UC_Read* qu, UC_Read* tu, Correct_dumy* dumy, bit_extz_t *exz, haplotype_evdience_alloc* hap, diff --git a/Correct.h b/Correct.h index 5556a34..9cd4335 100644 --- a/Correct.h +++ b/Correct.h @@ -1389,7 +1389,13 @@ const ul_idx_t *uref, char* qstr, UC_Read *tu, overlap_region_alloc *ol, overlap bit_extz_t *exz, double e_rate, int64_t qs); void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf); 
-void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_alloc* hp, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, int64_t wl, int64_t ql, uint8_t occ_thres); +void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf); +uint64_t gen_hc_r_alin_re(overlap_region* z, Candidates_list *cl, char* qstr, uint64_t ql, char* tstr, uint64_t tl, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf); +void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_alloc* hp, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, int64_t wl, int64_t ql, uint8_t occ_thres/**, uint8_t is_dbg**/, uint64_t rid); +void set_exact_exz(bit_extz_t *exz, int64_t qs, int64_t qe, int64_t ts, int64_t te); +void push_alnw(overlap_region *aux_o, bit_extz_t *exz); +void cal_exz_global(char *pstr, int32_t pn, char *tstr, int32_t tn, int32_t thre, bit_extz_t *ez); + #define ovlp_id(x) ((x).tn) #define ovlp_min_wid(x) ((x).ts) @@ -1400,9 +1406,5 @@ void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_all #define ovlp_cur_ylen(x) ((x).te) #define ovlp_cur_coff(x) ((x).qe) #define ovlp_bd(x) ((x).sec) - -#define UC_Read_resize(v, s) do {\ - if ((v).size<(s)) {REALLOC((v).seq,(s));(v).size=(s);}\ - } while (0) #endif diff --git a/Hash_Table.cpp b/Hash_Table.cpp index 58c70bb..885027a 100644 --- a/Hash_Table.cpp +++ b/Hash_Table.cpp @@ -1512,6 +1512,34 @@ inline int32_t comput_sc_ch(const k_mer_hit *ai, const k_mer_hit *aj, double bw_ return sc; } +inline int32_t comput_sc_ch_ec(const k_mer_hit *ai, const k_mer_hit *aj, double bw_rate, double chn_pen_gap, double chn_pen_skip, int64_t sl, int64_t ol) +{ + ///ai is the suffix of aj 
+ int32_t dq, dr, dd, dg, q_span, sc; + dq = (int64_t)(ai->self_offset) - (int64_t)(aj->self_offset); + if(dq <= 0) return INT32_MIN; + dr = (int64_t)(ai->offset) - (int64_t)(aj->offset); + if(dr <= 0) return INT32_MIN; + dd = dr > dq? dr - dq : dq - dr;//gap + if((dd > 16) && (dd > cal_bw(ai, aj, bw_rate, sl, ol))) return INT32_MIN; + dg = dr < dq? dr : dq;//len + q_span = ai->cnt&(0xffu); + sc = q_span < dg? q_span : dg; + sc = normal_w(sc, ((int32_t)(ai->cnt>>8))); + if (dd || (dg > q_span && dg > 0)) { + double lin_pen, a_pen; + lin_pen = (chn_pen_gap*(double)dd); + a_pen = ((double)(sc))*((((double)dd)/((double)dg))/bw_rate); + ///for long gap + // if(lin_pen > a_pen) lin_pen = a_pen; + if(dd < 4) lin_pen = ((lin_pen > a_pen)?(a_pen):(lin_pen)); + else lin_pen = ((lin_pen < a_pen)?(a_pen):(lin_pen)); + lin_pen += (chn_pen_skip*(double)dg); + sc -= (int32_t)lin_pen; + } + return sc; +} + inline int32_t comput_sc_ff(const k_mer_hit *ai, const k_mer_hit *aj, double bw_rate, double chn_pen_gap, double chn_pen_skip, int64_t sl, int64_t ol) { ///ai is the suffix of aj @@ -1999,6 +2027,9 @@ int64_t *p, int64_t *t, int32_t *f, int32_t *ii, int64_t *plus, int64_t *msc, in for (k = 1, l = 0; k <= a_n; k++) { if(k == a_n || a[k].strand != a[l].strand) { t[k-1] = 0; ii[k-1] = 0; + // if(a_n && a[0].readID == 3125488) { + // fprintf(stderr, "[M::%s::] ii::[%ld,%ld)(%c), is_srt::%ld, chn_pen_gap::%f, chn_pen_skip::%f, bw_rate::%f\n", __func__, l, k, "+-"[a[l].strand], is_srt, chn_pen_gap, chn_pen_skip, bw_rate); + // } if(is_srt) { plus0 = 0; msc0 = msc_i0 = INT32_MIN; movl0 = INT32_MAX; ddt = 0; @@ -2016,6 +2047,9 @@ int64_t *p, int64_t *t, int32_t *f, int32_t *ii, int64_t *plus, int64_t *msc, in dr = (int64_t)(ai->offset) - (int64_t)(aj->offset); if(dr <= 0) break; dd = dr > dq? dr - dq : dq - dr;//gap + // if(a_n && a[0].readID == 3125488) { + // fprintf(stderr, "%ld,", dd); + // } if((dd > 16) && (dd > cal_bw(&(a[z]), &(a[z-1]), bw_rate, xl, yl))) break; dg = dr < dq? 
dr : dq;//len q_span = ai->cnt&(0xffu); @@ -2024,7 +2058,10 @@ int64_t *p, int64_t *t, int32_t *f, int32_t *ii, int64_t *plus, int64_t *msc, in if (dd || (dg > q_span && dg > 0)) { lin_pen = (chn_pen_gap*(double)dd); a_pen = ((double)(sc))*((((double)dd)/((double)dg))/bw_rate); - if(lin_pen > a_pen) lin_pen = a_pen; + ///for long gap + // if(lin_pen > a_pen) lin_pen = a_pen; + if(dd < 4) lin_pen = ((lin_pen > a_pen)?(a_pen):(lin_pen)); + else lin_pen = ((lin_pen < a_pen)?(a_pen):(lin_pen)); lin_pen += (chn_pen_skip*(double)dg); sc -= (int32_t)lin_pen; } @@ -2036,6 +2073,10 @@ int64_t *p, int64_t *t, int32_t *f, int32_t *ii, int64_t *plus, int64_t *msc, in if(f[z] < plus0) plus0 = f[z]; } + // if(a_n && a[0].readID == 3125488) { + // fprintf(stderr, "\n"); + // fprintf(stderr, "[M::%s::] msc0::%ld, msc_i0::%ld, (%c)\n", __func__, msc0, msc_i0, "+-"[a[l].strand]); + // } if((z >= k) && (msc_i0 == (k - 1))) { if((k - l >= 2) && (ddt > 16) && (ddt > cal_bw(&(a[k-1]), &(a[l]), bw_rate, xl, yl))) msc_i0 = INT32_MIN; if(msc_i0 == (k - 1)) { @@ -2087,7 +2128,9 @@ uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, msc = msc_i = INT32_MIN; movl = INT32_MAX; plus = 0; si = 0; ei = a_n; memset(t, 0, (a_n*sizeof((*t)))); } - + // if(a_n && a[0].readID == 3125488) { + // fprintf(stderr, "[M::%s::] si::%ld, ei::%ld, a_n::%ld\n", __func__, si, ei, a_n); + // } for (i = st = si, max_ii = -1; i < ei; ++i) { max_f = a[i].cnt&(0xffu); n_skip = 0; max_j = end_j = -1; @@ -2095,7 +2138,7 @@ uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, while (a[i].strand != a[st].strand) ++st; for (j = i - 1; j >= st; --j) { - sc = comput_sc_ch(&a[i], &a[j], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl); + sc = comput_sc_ch_ec(&a[i], &a[j], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl); if (sc == INT32_MIN) continue; sc += f[j]; if (sc > max_f) { @@ -2119,7 +2162,7 @@ uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, 
} if ((max_ii >= 0) && (max_ii < end_j) && (a[i].strand == a[max_ii].strand)) {///just have a try with a[i]<->a[max_ii] - tmp = comput_sc_ch(&a[i], &a[max_ii], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl); + tmp = comput_sc_ch_ec(&a[i], &a[max_ii], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl); if (tmp != INT32_MIN && max_f < tmp + f[max_ii]) max_f = tmp + f[max_ii], max_j = max_ii; } diff --git a/Hash_Table.h b/Hash_Table.h index 8ab6142..9b1f946 100644 --- a/Hash_Table.h +++ b/Hash_Table.h @@ -9,6 +9,7 @@ #define WINDOW 375 #define WINDOW_BOUNDARY 375 #define WINDOW_HC 775 +#define WINDOW_HC_FAST 512 ///for one side, the first or last WINDOW_UNCORRECT_SINGLE_SIDE_BOUNDARY bases should not be corrected #define WINDOW_UNCORRECT_SINGLE_SIDE_BOUNDARY 25 #define THRESHOLD 15 diff --git a/Levenshtein_distance.h b/Levenshtein_distance.h index 612a9b9..a7c6ec7 100644 --- a/Levenshtein_distance.h +++ b/Levenshtein_distance.h @@ -548,24 +548,201 @@ inline int32_t pop_trace_back(asg16_v *res, int32_t i, uint16_t *c, uint32_t *le return i; } +///compact functions +#define pop_trac_bpc(in, rc, rb, rl) do { \ + (rc) = ((in)>>14);\ + if((rc) == 1 || (rc) == 2) {(rb) = (((in)>>12)&3); (rl) = ((in)&(0xfff));}\ + else {(rl) = ((in)&(0x3fff));}\ + } while (0) + inline void push_trace_bp(asg16_v *res, uint16_t c, uint16_t b, uint32_t len, uint32_t is_append) { - uint16_t p; + uint16_t p, c0, b0, len0, mm; if((is_append) && (res->n)) { - + b0 = b; + pop_trac_bpc(res->a[res->n-1], c0, b0, len0); + if((c == c0) && (b == b0)) { + res->n--; len += len0; + } } + mm = (0x3fff); c0 = c; c <<= 14; + if(c0 == 1 || c0 == 2) { + mm = (0xfff); c += ((b&3) << 12); + } + + while (len >= mm) { + p = (c + mm); kv_push(uint16_t, *res, p); len -= mm; + } + // fprintf(stderr, "[M::%s] c::%u, len::%u\n", __func__, c, len); + if(len) { + p = (c + len); kv_push(uint16_t, *res, p); + } +} + +inline uint32_t pop_trace_bp(asg16_v *res, uint32_t i, uint16_t *c, uint16_t *b, uint32_t *len) +{ + (*c) = 
(res->a[i]>>14); + if((*c) == 1 || (*c) == 2) { + (*b) = ((res->a[i]>>12)&3); + (*len) = (res->a[i]&(0xfff)); + } else { + (*b) = (uint16_t)-1; + (*len) = (res->a[i]&(0x3fff)); + } + + uint32_t sl; uint16_t sb; + for (i++; (i < res->n) && ((*c) == (res->a[i]>>14)); i++) { + if((*c) == 1 || (*c) == 2) { + sb = ((res->a[i]>>12)&3); sl = (res->a[i]&(0xfff)); + } else { + sb = (uint16_t)-1; sl = (res->a[i]&(0x3fff)); + } + if((*b) != sb) break; + (*len) += sl; + } + return i; +} + +inline int64_t pop_trace_bp_rev(asg16_v *res, int64_t i, uint16_t *c, uint16_t *b, uint32_t *len) +{ + (*c) = (res->a[i]>>14); + if((*c) == 1 || (*c) == 2) { + (*b) = ((res->a[i]>>12)&3); + (*len) = (res->a[i]&(0xfff)); + } else { + (*b) = (uint16_t)-1; + (*len) = (res->a[i]&(0x3fff)); + } + + uint32_t sl; uint16_t sb; + for (i--; (i >= 0) && ((*c) == (res->a[i]>>14)); i--) { + if((*c) == 1 || (*c) == 2) { + sb = ((res->a[i]>>12)&3); sl = (res->a[i]&(0xfff)); + } else { + sb = (uint16_t)-1; sl = (res->a[i]&(0x3fff)); + } + if((*b) != sb) break; + (*len) += sl; + } + return i; +} + +///full functions +#define pop_trac_bpc_f(in, rc, rbq, rbt, rl) do { \ + (rc) = ((in)>>14);\ + if((rc) == 2 || (rc) == 3) {(rbt) = (((in)>>12)&3); (rl) = ((in)&(0xfff));}\ + else if((rc) == 1) {(rbt) = (((in)>>12)&3); (rbq) = (((in)>>10)&3); (rl) = ((in)&(0x3ff));}\ + else {(rl) = ((in)&(0x3fff));}\ + } while (0) + +inline void push_trace_bp_f(asg16_v *res, uint16_t c, uint16_t bq, uint16_t bt, uint32_t len, uint32_t is_append) +{ + uint16_t p, c0 = c, bq0, bt0, len0, mm; + if(c == 3) { + bt = bq; bq = (uint16_t)-1; + } + if((is_append) && (res->n)) { + bq0 = bq; bt0 = bt; + pop_trac_bpc_f(res->a[res->n-1], c0, bq0, bt0, len0); + if((c == c0) && (bq == bq0) && (bt == bt0)) { + res->n--; len += len0; + } + } + + + c0 = c; c <<= 14; + if(c0 == 2 || c0 == 3) { + mm = (0xfff); c += ((bt&3) << 12); + } else if(c0 == 1) { + mm = (0x3ff); c += ((bt&3) << 12); c += ((bq&3) << 10); + } else { + mm = (0x3fff); + } + - c <<= 
14; - while (len >= (0x3fff)) { - p = (c + (0x3fff)); kv_push(uint16_t, *res, p); len -= (0x3fff); + while (len >= mm) { + p = (c + mm); kv_push(uint16_t, *res, p); len -= mm; } + // fprintf(stderr, "[M::%s] c::%u, len::%u\n", __func__, c, len); if(len) { p = (c + len); kv_push(uint16_t, *res, p); } } +inline uint32_t pop_trace_bp_f(asg16_v *res, uint32_t i, uint16_t *c, uint16_t *bq, uint16_t *bt, uint32_t *len) +{ + (*c) = (res->a[i]>>14); (*bq) = (*bt) = (uint16_t)-1; + if((*c) == 2 || (*c) == 3) { + (*bt) = ((res->a[i]>>12)&3); + (*len) = (res->a[i]&(0xfff)); + } else if((*c) == 1) { + (*bt) = ((res->a[i]>>12)&3); + (*bq) = ((res->a[i]>>10)&3); + (*len) = (res->a[i]&(0x3ff)); + } else { + (*len) = (res->a[i]&(0x3fff)); + } + + uint32_t sl; uint16_t sbq, sbt; + for (i++; (i < res->n) && ((*c) == (res->a[i]>>14)); i++) { + sbq = sbt = (uint16_t)-1; + if((*c) == 2 || (*c) == 3) { + sbt = ((res->a[i]>>12)&3); + sl = (res->a[i]&(0xfff)); + } else if((*c) == 1) { + sbt = ((res->a[i]>>12)&3); + sbq = ((res->a[i]>>10)&3); + sl = (res->a[i]&(0x3ff)); + } else { + sl = (res->a[i]&(0x3fff)); + } + if((*bq) != sbq || (*bt) != sbt) break; + (*len) += sl; + } + if((*c) == 3) { + (*bq) = (*bt); (*bt) = (uint16_t)-1; + } + return i; +} + +inline int64_t pop_trace_bp_rev_f(asg16_v *res, int64_t i, uint16_t *c, uint16_t *bq, uint16_t *bt, uint32_t *len) +{ + (*c) = (res->a[i]>>14); (*bq) = (*bt) = (uint16_t)-1; + if((*c) == 2 || (*c) == 3) { + (*bt) = ((res->a[i]>>12)&3); + (*len) = (res->a[i]&(0xfff)); + } else if((*c) == 1) { + (*bt) = ((res->a[i]>>12)&3); + (*bq) = ((res->a[i]>>10)&3); + (*len) = (res->a[i]&(0x3ff)); + } else { + (*len) = (res->a[i]&(0x3fff)); + } + + uint32_t sl; uint16_t sbq, sbt; + for (i--; (i >= 0) && ((*c) == (res->a[i]>>14)); i--) { + sbq = sbt = (uint16_t)-1; + if((*c) == 2 || (*c) == 3) { + sbt = ((res->a[i]>>12)&3); + sl = (res->a[i]&(0xfff)); + } else if((*c) == 1) { + sbt = ((res->a[i]>>12)&3); + sbq = ((res->a[i]>>10)&3); + sl = 
(res->a[i]&(0x3ff)); + } else { + sl = (res->a[i]&(0x3fff)); + } + if((*bq) != sbq || (*bt) != sbt) break; + (*len) += sl; + } + if((*c) == 3) { + (*bq) = (*bt); (*bt) = (uint16_t)-1; + } + return i; +} + ///511 -> 16 64-bits // #define MAX_E 511 // #define MAX_L 2500 diff --git a/Overlaps.h b/Overlaps.h index 6a866d5..24699f2 100644 --- a/Overlaps.h +++ b/Overlaps.h @@ -1242,4 +1242,8 @@ uint64_t trans_sec_cut0(kv_u_trans_t *ta, asg64_v *srt, uint32_t id, double sec_ void clean_u_trans_t_idx_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, ma_hit_t_alloc* src, ug_rid_cov_t *in); void gen_ug_rid_cov_t_by_ovlp(kv_u_trans_t *ta, ug_rid_cov_t *cc); +#define UC_Read_resize(v, s) do {\ + if ((v).size<(s)) {REALLOC((v).seq,(s));(v).size=(s);}\ + } while (0) + #endif diff --git a/Process_Read.h b/Process_Read.h index a0f3693..4711b24 100644 --- a/Process_Read.h +++ b/Process_Read.h @@ -107,6 +107,8 @@ typedef struct #define CHAIN_MATCH 1 #define CHAIN_UNMATCH 0.334 +#define NEC 1 + typedef struct { uint64_t** N_site; diff --git a/anchor.cpp b/anchor.cpp index 7918b8b..0cc0e28 100644 --- a/anchor.cpp +++ b/anchor.cpp @@ -6,6 +6,7 @@ #include "Hash_Table.h" #include "kalloc.h" #include "Overlaps.h" +#include "Levenshtein_distance.h" #define HA_KMER_GOOD_RATIO 0.333 #define OFL 0.95 @@ -40,6 +41,9 @@ KSORT_INIT(or_occ, overlap_region, oreg_occ_lt) #define oreg_id_lt(a, b) ((a).y_id < (b).y_id) KSORT_INIT(or_id, overlap_region, oreg_id_lt) +#define ha_mz1_t_key(p) ((p).x) +KRADIX_SORT_INIT(ha_mz1_v_srt, ha_mz1_t, ha_mz1_t_key, member_size(ha_mz1_t, x)) + typedef struct { int n; const ha_idxpos_t *a; @@ -1076,6 +1080,207 @@ void *ha_flt_tab, ha_pt_t *ha_idx, All_reads* rdb, kvec_t_u64_warp* dbg_ct, st_m cl->length = ab->n_a; } +void minimizers_qgen0_amz(ha_abuf_t *ab, char* rs, int64_t rl, uint64_t mz_w, uint64_t mz_k, Candidates_list *cl, kvec_t_u8_warp* k_flag, +void *ha_flt_tab, ha_pt_t *ha_idx, All_reads* rdb, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t 
*high_occ, uint32_t *low_occ) +{ + // fprintf(stderr, "+[M::%s]\n", __func__); + uint64_t i, k, l, max_cnt = UINT32_MAX, min_cnt = 0; int n, j; ha_mz1_t *z; seed1_t *s; + if(high_occ) { + max_cnt = (*high_occ); + if(max_cnt < 2) max_cnt = 2; + } + if(low_occ) { + min_cnt = (*low_occ); + if(min_cnt < 2) min_cnt = 2; + } + clear_Candidates_list(cl); ab->mz.n = 0, ab->n_a = 0; + + // get the list of anchors + mz1_ha_sketch(rs, rl, mz_w, mz_k, 0, !(asm_opt.flag & HA_F_NO_HPC), &ab->mz, ha_flt_tab, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL); + + // minimizer of queried read + if (ab->mz.m > ab->old_mz_m) { + ab->old_mz_m = ab->mz.m; + REALLOC(ab->seed, ab->old_mz_m); + } + + for (i = 0, ab->n_a = 0; i < ab->mz.n; ++i) { + + ab->seed[i].a = ha_pt_get(ha_idx, ab->mz.a[i].x, &n); + ab->seed[i].n = n; + ab->n_a += n; + } + + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + REALLOC(ab->a, ab->m_a); + } + + for (i = 0, k = 0; i < ab->mz.n; ++i) { + ///z is one of the minimizer + z = &ab->mz.a[i]; s = &ab->seed[i]; + for (j = 0; j < s->n; ++j) { + const ha_idxpos_t *y = &s->a[j]; + anchor1_t *an = &ab->a[k++]; + uint8_t rev = z->rev == y->rev? 
0 : 1; + an->other_off = rev?((uint32_t)-1)-1-(y->pos+1-y->span):y->pos; + an->self_off = z->pos; + ///an->cnt: cnt<<8|span + an->cnt = s->n; if(an->cnt > ((uint32_t)(0xffffffu))) an->cnt = 0xffffffu; + an->cnt <<= 8; an->cnt |= ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->self_off; + } + } + + // copy over to _cl_ + if (ab->m_a >= (uint64_t)cl->size) { + cl->size = ab->m_a; + REALLOC(cl->list, cl->size); + } + + k_mer_hit *p; uint64_t tid = (uint64_t)-1, tl = (uint64_t)-1; + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); + for (k = 1, l = 0; k <= ab->n_a; ++k) { + if (k == ab->n_a || ab->a[k].srt != ab->a[l].srt) { + if (k-l>1) radix_sort_ha_an3(ab->a+l, ab->a+k); + if((ab->a[l].srt>>33)!=tid) { + tid = ab->a[l].srt>>33; + tl = Get_READ_LENGTH((*rdb), tid); + // tl = rdb?Get_READ_LENGTH((*rdb), tid):udb->ug->u.a[tid].len; + } + for (i = l; i < k; i++) { + p = &cl->list[i]; + p->readID = ab->a[i].srt>>33; + p->strand = (ab->a[i].srt>>32)&1; + if(!(p->strand)) { + p->offset = ab->a[i].other_off; + } else { + p->offset = ((uint32_t)-1)-ab->a[i].other_off; + p->offset = tl-p->offset; + } + p->self_offset = ab->a[i].self_off; + if(((ab->a[i].cnt>>8) < max_cnt) && ((ab->a[i].cnt>>8) > min_cnt)){ + p->cnt = 1; + } else if((ab->a[i].cnt>>8) <= min_cnt) { + p->cnt = 2; + } else{ + p->cnt = 1 + (((ab->a[i].cnt>>8) + (max_cnt<<1) - 1)/(max_cnt<<1)); + p->cnt = pow(p->cnt, 1.1); + } + if(p->cnt > ((uint32_t)(0xffffffu))) p->cnt = 0xffffffu; + p->cnt <<= 8; p->cnt |= (((uint32_t)(0xffu))&(ab->a[i].cnt)); + } + l = k; + } + } + cl->length = ab->n_a; +} + + +uint64_t lchain_qgen_mcopy_fast_re0(ha_abuf_t *ab, ha_pt_t *ha_idx, ha_mz1_t *ra, uint64_t rn, ha_mz1_t *qa, uint64_t qn, uint64_t qid, Candidates_list *cl, uint32_t *high_occ, uint32_t *low_occ) +{ + // fprintf(stderr, "+[M::%s]\n", __func__); + uint64_t i, k, l, max_cnt = UINT32_MAX, min_cnt = 0, ri, qi, sn; anchor1_t *an; k_mer_hit *p; + 
if(high_occ) { + max_cnt = (*high_occ); + if(max_cnt < 2) max_cnt = 2; + } + if(low_occ) { + min_cnt = (*low_occ); + if(min_cnt < 2) min_cnt = 2; + } + // clear_Candidates_list(cl); + + ///first try + for (k = 1, l = i = ab->n_a = 0; k <= rn; ++k) { + if (k == rn || ra[k].x != ra[l].x) { + for (; i < qn && qa[i].x < ra[l].x; i++); + if(i < qn && qa[i].x == ra[l].x) { + sn = 0; + if(ab->n_a < ab->m_a) sn = ab->seed[l].n;///ha_pt_cnt(ha_idx, ra[l].x); + + for (qi = i; qi < qn && qa[qi].x == ra[l].x; qi++) { + for (ri = l; ri < k; ri++) { + if(qa[qi].rev != ra[ri].rev) continue; + if(ab->n_a < ab->m_a) { + an = &(ab->a[ab->n_a++]); + + an->other_off = qa[qi].pos; + an->self_off = ra[ri].pos; + ///an->cnt: cnt<<8|span + an->cnt = sn; if(an->cnt > ((uint32_t)(0xffffffu))) an->cnt = 0xffffffu; + an->cnt <<= 8; an->cnt |= ((ra[ri].span <= ((uint32_t)(0xffu)))?ra[ri].span:((uint32_t)(0xffu))); + an->srt = (((uint64_t)(an->self_off))<<32)|((uint64_t)(an->other_off)); + } else { + ab->n_a++; + } + } + } + } + l = k; + } + } + + + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; REALLOC(ab->a, ab->m_a); + + for (k = 1, l = i = ab->n_a = 0; k <= rn; ++k) { + if (k == rn || ra[k].x != ra[l].x) { + for (; i < qn && qa[i].x < ra[l].x; i++); + if(i < qn && qa[i].x == ra[l].x) { + sn = ab->seed[l].n;///ha_pt_cnt(ha_idx, ra[l].x); + + for (qi = i; qi < qn && qa[qi].x == ra[l].x; qi++) { + for (ri = l; ri < k; ri++) { + if(qa[qi].rev != ra[ri].rev) continue; + + an = &(ab->a[ab->n_a++]); + + an->other_off = qa[qi].pos; + an->self_off = ra[ri].pos; + ///an->cnt: cnt<<8|span + an->cnt = sn; if(an->cnt > ((uint32_t)(0xffffffu))) an->cnt = 0xffffffu; + an->cnt <<= 8; an->cnt |= ((ra[ri].span <= ((uint32_t)(0xffu)))?ra[ri].span:((uint32_t)(0xffu))); + an->srt = (((uint64_t)(an->self_off))<<32)|((uint64_t)(an->other_off)); + } + } + } + l = k; + } + } + } + + // copy over to _cl_ + sn = ab->n_a + cl->length; + if (sn > (uint64_t)cl->size) { + cl->size = sn; + REALLOC(cl->list, cl->size); + } 
+ + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); + for (i = 0; i < ab->n_a; i++) { + p = &cl->list[cl->length++]; + p->readID = qid; + p->strand = 0; + p->offset = ab->a[i].other_off; + p->self_offset = ab->a[i].self_off; + + if(((ab->a[i].cnt>>8) < max_cnt) && ((ab->a[i].cnt>>8) > min_cnt)){ + p->cnt = 1; + } else if((ab->a[i].cnt>>8) <= min_cnt) { + p->cnt = 2; + } else{ + p->cnt = 1 + (((ab->a[i].cnt>>8) + (max_cnt<<1) - 1)/(max_cnt<<1)); + p->cnt = pow(p->cnt, 1.1); + } + if(p->cnt > ((uint32_t)(0xffffffu))) p->cnt = 0xffffffu; + p->cnt <<= 8; p->cnt |= (((uint32_t)(0xffu))&(ab->a[i].cnt)); + } + // cl->length = ab->n_a; + return ab->n_a; +} + void gen_pair_chain(ha_abufl_t *ab, uint64_t rid, st_mt_t *tid, uint64_t tid_n, ha_mzl_t *in, uint64_t in_n, ha_mzl_t *idx, int64_t idx_n, uint64_t mzl_cutoff) { if(!tid_n) return; @@ -1717,7 +1922,7 @@ void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint3 int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, int64_t quick_check, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, st_mt_t *sp) { - // fprintf(stderr, "+[M::%s]\n", __func__); + // fprintf(stderr, "+[M::%s] chain_cutoff::%u\n", __func__, chain_cutoff); uint64_t i, k, l, m, cn = cl->length, yid, ol0, lch; overlap_region *r, t; ///srt = 0 clear_overlap_region_alloc(ol); @@ -1813,52 +2018,22 @@ void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint3 } // fprintf(stderr, "+[M::%s] rid::%u, ol->length0::%lu, ol->length1::%lu\n", __func__, rid, ol->length, l); ol->length = l; + } + for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; +} - /** - //@brief r484 - for (i = sp->n = 0; i < ol->length; ++i) { - if(ol->list[i].align_length < chain_cutoff) continue; - os = ol->list[i].x_pos_s; oe = ol->list[i].x_pos_e + 1; - if((sp->n) && (((uint32_t)sp->a[sp->n-1]) >= os)) { - if(oe > ((uint32_t)sp->a[sp->n-1])) { - oe = oe - 
((uint32_t)sp->a[sp->n-1]); - sp->a[sp->n-1] += oe; - } - } else { - os = (os<<32)|oe; kv_push(uint64_t, *sp, os); - } - } +void lchain_qgen_mcopy_fast_re1(Candidates_list* cl, uint32_t cl_beg, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, uint64_t tl, + uint32_t apend_be, int64_t max_skip, int64_t max_iter, + int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, int64_t quick_check, + uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut) +{ + uint64_t cn = cl->length, m = cl_beg; + if(cl_beg >= cn) return; - for (i = k = 0; i < ol->length; ++i) { - if(ol->list[i].align_length < chain_cutoff) {///ol has been sorted by x_pos_s - r = &(ol->list[i]); rs = r->x_pos_s; re = r->x_pos_e + 1; - rl = re - rs; ovl = 0; - for (m = 0; (m < sp->n) && (re > (sp->a[m]>>32)); m++) { - os = ((rs>=(sp->a[m]>>32))? rs:(sp->a[m]>>32)); - oe = ((re<=((uint32_t)sp->a[m]))? re:((uint32_t)sp->a[m])); - if(oe > os) { - ovl += (oe - os); if(ovl >= (rl*0.95)) break; - } - } - if(ovl >= (rl*0.95)) continue; - } - if (k != i) { - t = ol->list[k]; - ol->list[k] = ol->list[i]; - ol->list[i] = t; - } - ol->list[k++].align_length = 0; - } - // fprintf(stderr, "+[M::%s] ol->length0::%lu, ol->length1::%lu\n", __func__, ol->length, k); - ol->length = k; - **/ - } - /**else { - for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; - } - **/ - for (i = 0; i < ol->length; ++i) ol->list[i].align_length = 0; + m += lchain_qdp_mcopy_fast(cl, cl_beg, cn - cl_beg, m, &(cl->chainDP), ol, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_rate, + rid, rl, tl, quick_check, apend_be, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, 1); + cl->length = m; } inline uint64_t special_lchain(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, All_reads* rdb, @@ -2063,6 +2238,1291 @@ void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, 
apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); } +void h_ec_lchain_amz(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut) +{ + extern void *ha_flt_tab; + extern ha_pt_t *ha_idx; + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + // minimizers_gen(ab, rs, rl, mz_w, mz_k, cl, k_flag, ha_flt_tab, ha_idx, dbg_ct, sp, high_occ, low_occ); + minimizers_qgen0_amz(ab, rs, rl, mz_w, mz_k, cl, k_flag, ha_flt_tab, ha_idx, rref, dbg_ct, sp, high_occ, low_occ); + // lchain_gen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); + // lchain_qgen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); + ///no need to sort here, overlap_list has been sorted at lchain_gen + lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); +} + +uint64_t recalu_minimizer0(char *s, uint64_t len, uint64_t is_hpc, int64_t mz_k, uint64_t mz_h, tiny_queue_t *tq, uint64_t *rpos, uint64_t *rspan) +{ + uint64_t k, l, shift1 = mz_k - 1, mask = (1ULL<front = tq->count = 0; + for 
(k = 1, l = 0, mz_l = mz_span = 0; k <= len; ++k) { + if (k == len || seq_nt4_table[(uint8_t)s[k]] != seq_nt4_table[(uint8_t)s[l]]) { + c = seq_nt4_table[(uint8_t)s[l]]; + if(c < 4) { + kmer[0] = (kmer[0] << 1 | (c&1)) & mask;/**forward k-mer**/ + kmer[1] = (kmer[1] << 1 | (c>>1)) & mask; + kmer[2] = kmer[2] >> 1 | (uint64_t)(1 - (c&1)) << shift1; /**reverse k-mer**/ + kmer[3] = kmer[3] >> 1 | (uint64_t)(1 - (c>>1)) << shift1; + if (kmer[1] == kmer[3]) {l = k; continue;} /** skip "symmetric k-mers" as we don't know it strand; l must still move to the new run, otherwise the next iteration re-reads the old run's base**/ + z = kmer[1] < kmer[3]? 0 : 1; /** strand**/ + mz_l++; mz_span += (k - l); + tq_push(tq, k - l); if (tq->count > mz_k) mz_span -= tq_shift(tq); + if(mz_l >= mz_k && mz_span < 256) { + hs = yak_hash64_64(kmer[z<<1|0]) + yak_hash64_64(kmer[z<<1|1]); + if(mz_h == hs) { + (*rpos) = k - 1; (*rspan) = mz_span; + return 1; + } + } + } else { + mz_l = mz_span = 0; tq->front = tq->count = 0; /** an ambiguous base restarts the k-mer, so the queued run lengths must be dropped too**/ + } + + l = k; + } + } + } else { + for (k = 0, mz_l = mz_span = 0; k < len; ++k) { + c = seq_nt4_table[(uint8_t)s[k]]; + if(c < 4) { + kmer[0] = (kmer[0] << 1 | (c&1)) & mask;/**forward k-mer**/ + kmer[1] = (kmer[1] << 1 | (c>>1)) & mask; + kmer[2] = kmer[2] >> 1 | (uint64_t)(1 - (c&1)) << shift1; /**reverse k-mer**/ + kmer[3] = kmer[3] >> 1 | (uint64_t)(1 - (c>>1)) << shift1; + if (kmer[1] == kmer[3]) continue; /** skip "symmetric k-mers" as we don't know it strand**/ + z = kmer[1] < kmer[3]? 
0 : 1; /** strand**/ + mz_l++; mz_span++; if(mz_span > mz_k) mz_span = mz_k; + if(mz_l >= mz_k) { + hs = yak_hash64_64(kmer[z<<1|0]) + yak_hash64_64(kmer[z<<1|1]); + if(mz_h == hs) { + (*rpos) = k; (*rspan) = mz_span; + return 1; + } + } + } else { + mz_l = mz_span = 0; + } + } + } + + return 0; +} + +uint64_t recalu_minimizer0_adv(char *s, uint64_t len, uint64_t is_hpc, int64_t mz_k, uint64_t mz_h, uint64_t rev, tiny_queue_t *tq, uint64_t *rpos, uint64_t *rspan) +{ + uint64_t k, l, shift1 = mz_k - 1, mask = (1ULL<front = tq->count = 0; + for (k = 1, l = 0, mz_l = mz_span = 0; k <= len; ++k) { + if (k == len || seq_nt4_table[(uint8_t)s[k]] != seq_nt4_table[(uint8_t)s[l]]) { + c = seq_nt4_table[(uint8_t)s[l]]; + if(c < 4) { + kmer[0] = (kmer[0] << 1 | (c&1)) & mask;/**forward k-mer**/ + kmer[1] = (kmer[1] << 1 | (c>>1)) & mask; + kmer[2] = kmer[2] >> 1 | (uint64_t)(1 - (c&1)) << shift1; /**reverse k-mer**/ + kmer[3] = kmer[3] >> 1 | (uint64_t)(1 - (c>>1)) << shift1; + if (kmer[1] == kmer[3]) {l = k; continue;} /** skip "symmetric k-mers" as we don't know it strand; l must still move to the new run, otherwise the next iteration re-reads the old run's base**/ + z = kmer[1] < kmer[3]? 0 : 1; /** strand**/ + mz_l++; mz_span += (k - l); + tq_push(tq, k - l); if (tq->count > mz_k) mz_span -= tq_shift(tq); + if(mz_l >= mz_k && mz_span < 256) { + hs = yak_hash64_64(kmer[z<<1|0]) + yak_hash64_64(kmer[z<<1|1]); + if(mz_h == hs) { + (*rpos) = k - 1; (*rspan) = mz_span; + return 1; + } + } + } else { + mz_l = mz_span = 0; tq->front = tq->count = 0; /** an ambiguous base restarts the k-mer, so the queued run lengths must be dropped too**/ + } + + l = k; + } + } + } else { + for (k = 0, mz_l = mz_span = 0; k < len; ++k) { + c = seq_nt4_table[(uint8_t)s[k]]; + if(c < 4) { + kmer[0] = (kmer[0] << 1 | (c&1)) & mask;/**forward k-mer**/ + kmer[1] = (kmer[1] << 1 | (c>>1)) & mask; + kmer[2] = kmer[2] >> 1 | (uint64_t)(1 - (c&1)) << shift1; /**reverse k-mer**/ + kmer[3] = kmer[3] >> 1 | (uint64_t)(1 - (c>>1)) << shift1; + if (kmer[1] == kmer[3]) continue; /** skip "symmetric k-mers" as we don't know it strand**/ + z = kmer[1] < kmer[3]? 
0 : 1; /** strand**/ + mz_l++; mz_span++; if(mz_span > mz_k) mz_span = mz_k; + if(mz_l >= mz_k) { + hs = yak_hash64_64(kmer[z<<1|0]) + yak_hash64_64(kmer[z<<1|1]); + if(mz_h == hs) { + (*rpos) = k; (*rspan) = mz_span; + return 1; + } + } + } else { + mz_l = mz_span = 0; + } + } + } + } else { + uint8_t ch[5] = {3, 2, 1, 0, 5}; + if(is_hpc) { + tq->front = tq->count = 0; + for (k = 1, l = 0, mz_l = mz_span = 0; k <= len; ++k) { + if (k == len || seq_nt4_table[(uint8_t)s[len-k-1]] != seq_nt4_table[(uint8_t)s[len-l-1]]) { + c = ch[seq_nt4_table[(uint8_t)s[len-l-1]]]; + if(c < 4) { + kmer[0] = (kmer[0] << 1 | (c&1)) & mask;/**forward k-mer**/ + kmer[1] = (kmer[1] << 1 | (c>>1)) & mask; + kmer[2] = kmer[2] >> 1 | (uint64_t)(1 - (c&1)) << shift1; /**reverse k-mer**/ + kmer[3] = kmer[3] >> 1 | (uint64_t)(1 - (c>>1)) << shift1; + if (kmer[1] == kmer[3]) {l = k; continue;} /** skip "symmetric k-mers" as we don't know it strand; l must still move to the new run, otherwise the next iteration re-reads the old run's base**/ + z = kmer[1] < kmer[3]? 0 : 1; /** strand**/ + mz_l++; mz_span += (k - l); + tq_push(tq, k - l); if (tq->count > mz_k) mz_span -= tq_shift(tq); + if(mz_l >= mz_k && mz_span < 256) { + hs = yak_hash64_64(kmer[z<<1|0]) + yak_hash64_64(kmer[z<<1|1]); + if(mz_h == hs) { + (*rpos) = k - 1; (*rspan) = mz_span; + return 1; + } + } + } else { + mz_l = mz_span = 0; tq->front = tq->count = 0; /** an ambiguous base restarts the k-mer, so the queued run lengths must be dropped too**/ + } + + l = k; + } + } + } else { + for (k = 0, mz_l = mz_span = 0; k < len; ++k) { + c = ch[seq_nt4_table[(uint8_t)s[len-k-1]]]; + if(c < 4) { + kmer[0] = (kmer[0] << 1 | (c&1)) & mask;/**forward k-mer**/ + kmer[1] = (kmer[1] << 1 | (c>>1)) & mask; + kmer[2] = kmer[2] >> 1 | (uint64_t)(1 - (c&1)) << shift1; /**reverse k-mer**/ + kmer[3] = kmer[3] >> 1 | (uint64_t)(1 - (c>>1)) << shift1; + if (kmer[1] == kmer[3]) continue; /** skip "symmetric k-mers" as we don't know it strand**/ + z = kmer[1] < kmer[3]? 
0 : 1; /** strand**/ + mz_l++; mz_span++; if(mz_span > mz_k) mz_span = mz_k; + if(mz_l >= mz_k) { + hs = yak_hash64_64(kmer[z<<1|0]) + yak_hash64_64(kmer[z<<1|1]); + if(mz_h == hs) { + (*rpos) = k; (*rspan) = mz_span; + return 1; + } + } + } else { + mz_l = mz_span = 0; + } + } + } + } + + return 0; +} + + +uint64_t recalu_minimizer(uint64_t rid, anchor1_t *z, asg16_v *sc, int64_t *iok, int64_t *ink, int64_t *ick, int64_t *str_s, int64_t *str_e, uint64_t mz_k, uint64_t mz_h, tiny_queue_t *tq, All_reads *rref, UC_Read *tu, char *qstr, uint64_t qs, uint64_t qe) +{ + int64_t id = z->srt>>33; char *str = NULL; uint64_t rpos, rspan; + int64_t ok = *iok, nk = *ink, ck = *ick, cn = sc->n, s0, e0, s1, e1, os, oe, ots, ote, ol, wo[2], wn[2], weo[2], wen[2], ovlp, len = Get_READ_LENGTH((*rref), id); uint16_t op, bq, bt; uint32_t cl; + e0 = ((uint32_t)z->srt) + 1; s0 = e0 - z->other_off; s1 = e1 = -1; weo[0] = weo[1] = wen[0] = wen[1] = -1; + if(e0 <= s0) return 0; + + ///debug + // ok = nk = ck = 0; + + if((ck < 0) || (ck > cn)) {//(*ck) == cn is allowed + ck = ok = nk = 0; + } + + // if(id == 1/**s0 == 10260 && e0 == 10334**/) { + // fprintf(stderr, "\n\n-0-qk::%ld,\ttk::%ld,\tck::%ld,\ts0::%ld,\te0::%ld\n", ok, nk, ck, s0, e0); + // } + + while (ck > 0 && ok >= s0) {///x -> t; y -> p; first insertion and then match/mismatch + --ck; + op = sc->a[ck]>>14; + // ol = (((op == 1) || (op == 2))?(sc->a[ck]&(0xfff)):(sc->a[ck]&(0x3fff))); + if((op == 2) || (op == 3)) { + ol = sc->a[ck]&(0xfff); + } else if(op == 1) { + ol = sc->a[ck]&(0x3ff); + } else { + ol = sc->a[ck]&(0x3fff); + } + if(op != 2) ok -= ol; + if(op != 3) nk -= ol; + } + + // char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + // if(id == 1/**s0 == 10260 && e0 == 10334**/) { + // fprintf(stderr, "-1-qk::%ld,\ttk::%ld,\tck::%ld\n", ok, nk, ck); + // } + + while (ck < cn && ok < e0) { ///[s0, e0) + wo[0] = ok; wn[0] = nk; + // ck = pop_trace_bp(sc, ck, &op, &b, &cl); + ck = pop_trace_bp_f(sc, ck, &op, 
&bq, &bt, &cl); + if(op != 2) ok += cl; + if(op != 3) nk += cl; + wo[1] = ok; wn[1] = nk; + + os = ((s0 >= wo[0])? s0 : wo[0]); + oe = ((e0 <= wo[1])? e0 : wo[1]); + // os = MAX(s0, wo[0]); oe = MIN(e0, wo[1]); + ovlp = ((oe>os)? (oe-os):0); + // if(id == 1/**s0 == 10260 && e0 == 10334**/) { + // fprintf(stderr, "%u%c(q::[%ld,%ld))(t::[%ld,%ld))(ck::%ld)\n", cl, cm[op], wo[0], wo[1], wn[0], wn[1], ck); + // } + + if(op != 2) { + if(!ovlp) continue; + } else {///wo[0] == wo[1] + if(wo[0] < s0 || wo[0] >= e0) continue; + } + + if(op < 2) { + ots = os - wo[0] + wn[0]; ote = oe - wo[0] + wn[0]; + } else {///op == 2: more y; p == 3: more x + ots = wn[0]; ote = wn[1]; + } + + if(s1 == -1) s1 = ots; + e1 = ote; + + if((op == 0) && (ovlp > 0) && (ovlp > weo[1] - weo[0])) { + weo[0] = os; weo[1] = oe; + wen[0] = ots; wen[1] = ote; + } + } + + while (ck < cn && ok <= e0) { ///[s0, e0) + wo[0] = ok; wn[0] = nk; + // pop_trace_bp(sc, ck, &op, &b, &cl); + pop_trace_bp_f(sc, ck, &op, &bq, &bt, &cl); + if(op != 2) break; + // ck = pop_trace_bp(sc, ck, &op, &b, &cl); + ck = pop_trace_bp_f(sc, ck, &op, &bq, &bt, &cl); + // if(op != 2) ok += cl; + // if(op != 3) nk += cl; + nk += cl; + wo[1] = ok; wn[1] = nk; + + // if(id == 1/**s0 == 10260 && e0 == 10334**/) { + // fprintf(stderr, "%u%c(q::[%ld,%ld))(t::[%ld,%ld))(ck::%ld)\n", cl, cm[op], wo[0], wo[1], wn[0], wn[1], ck); + // } + + if(wo[0] >= s0 && wo[0] <= e0) { + ots = wn[0]; ote = wn[1]; + if(s1 == -1) s1 = ots; + e1 = ote; + } + } + + assert(wen[1] <= len); + assert(e1 <= len); + + *iok = ok; *ink = nk; *ick = ck; + + if(weo[0] == s0 && weo[1] == e0) { + z->srt >>= 32; z->srt <<= 32; z->srt |= ((uint64_t)(wen[1]-1)); + ///debug + // char sstr[256]; recover_UC_Read_sub_region(sstr, wen[0], wen[1] - wen[0], 0, rref, id); + // if(recalu_minimizer0(sstr, wen[1] - wen[0], !(asm_opt.flag & HA_F_NO_HPC), mz_k, mz_h, tq, &rpos, &rspan) && (rpos + 1 == ((uint64_t)(wen[1] - wen[0]))) && (((uint64_t)(wen[1] - wen[0])) == rspan)) { + // 
// fprintf(stderr, "-0-[M::%s]\n", __func__); + // } else { + // // if(((z->srt>>32)&1) == 0) { + // fprintf(stderr, "-1-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], wen[0], wen[1], s0, e0, e0 - s0); + // fprintf(stderr, "tstr::%.*s\n", ((uint32_t)(wen[1] - wen[0])), sstr); + // fprintf(stderr, "qstr::%.*s\n", ((uint32_t)(qe - qs)), qstr + qs); + // exit(1); + // // } + // } + // if(rid == 3196 && id == 3199) fprintf(stderr, "-full-[M::%s]\n", __func__); + return 1; + } + + if(e1 <= s1) return 0; + + + + if(s1 >= (*str_s) && e1 <= (*str_e)) { + str = tu->seq + s1 - (*str_s); + } else { + if(s1 >= (*str_s) && (s1 < (*str_e)) && (e1 > (*str_e))) { + UC_Read_resize((*tu), (e1 - (*str_s))); + recover_UC_Read_sub_region(tu->seq + (*str_e) - (*str_s), (*str_e), e1 - (*str_e), 0, rref, id); + str = tu->seq + s1 - (*str_s); (*str_e) = e1; + } else { + UC_Read_resize((*tu), (e1 - s1)); + recover_UC_Read_sub_region(tu->seq, s1, e1 - s1, 0, rref, id); + str = tu->seq; (*str_s) = s1; (*str_e) = e1; + } + } + + if(recalu_minimizer0(str, e1 - s1, !(asm_opt.flag & HA_F_NO_HPC), mz_k, mz_h, tq, &rpos, &rspan)) { + rpos += s1; + z->srt >>= 32; z->srt <<= 32; z->srt |= rpos; z->other_off = rspan; + // fprintf(stderr, "-1-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], s1, e1, s0, e0, e0 - s0); + // if(rid == 3196 && id == 3199) fprintf(stderr, "-part-[M::%s]\n", __func__); + return 1; + } + // fprintf(stderr, "-0-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], s1, e1, s0, e0, e0 - s0); + + return 0; +} + +int64_t hpc_minimizer_test(int64_t id, All_reads *rref, UC_Read *bu, int64_t str_s, int64_t str_e, int64_t z, int64_t len, int64_t rev, int64_t step) +{ + if(z < 0 || z >= len) return 0; + if(z == 0 && rev == 1) return 0; + if(z == len - 1 && rev == 0) return 0; + + 
int64_t an, sc, ec, k, tn = 0; char c = 0, *a = NULL; + if(z >= str_s && z < str_e) c = bu->seq[z-str_s]; + + if(rev) { + if(c != 0) { + if(z > str_s) { + a = bu->seq; an = z - str_s; + for (k = an - 1; k >= 0 && a[k] == c; k--); + tn += an - k - 1; + if((k >= 0) || (z - tn == 0)) return tn; + ec = str_s; + } else { + ec = z; + } + } else { + ec = z + 1; + } + + sc = ec - step; if(sc < 0) sc = 0; + if(ec <= 0 || ec > len || ec <= sc) return tn; + + UC_Read_resize((*bu), (str_e - str_s) + (step)); + a = bu->seq + str_e - str_s; + + while (1) { + recover_UC_Read_sub_region(a, sc, ec - sc, 0, rref, id); an = ec - sc; + if(c == 0) c = a[--an]; + for (k = an - 1; k >= 0 && a[k] == c; k--); + tn += an - k - 1; + if((k >= 0) || (z - tn == 0)) return tn; + ec = sc; + sc = ec - step; if(sc < 0) sc = 0; + if(ec <= 0 || ec > len || ec <= sc) return tn; + } + } else { + if(c != 0) { + if(z + 1 < str_e) { + a = bu->seq + z + 1 - str_s; an = str_e - z - 1; + for (k = 0; k < an && a[k] == c; k++); + tn += k; + if((k < an) || (z + tn + 1 == len)) return tn; + sc = str_e; + } else { + sc = z + 1; + } + } else { + sc = z; + } + + ec = sc + step; if(ec > len) ec = len; + if(sc < 0 || sc >= len || sc >= ec) return tn; + + UC_Read_resize((*bu), (str_e - str_s) + (step)); + a = bu->seq + str_e - str_s; + + while (1) { + // if(bu->size < (a - bu->seq) + (ec - sc)) { + // fprintf(stderr, "-1-[M::%s]\tid::%ld\n", __func__, id); + // } + recover_UC_Read_sub_region(a, sc, ec - sc, 0, rref, id); an = ec - sc; + a = bu->seq + str_e - str_s; + if(c == 0) { + c = a[0]; a = a + 1; an--; + } + for (k = 0; k < an && a[k] == c; k++); + tn += k; + if((k < an) || (z + tn + 1 == len)) return tn; + sc = ec; + ec = sc + step; if(ec > len) ec = len; + if(sc < 0 || sc >= len || sc >= ec) return tn; + } + } +} + +uint64_t recalu_minimizer_bd(uint64_t rid, anchor1_t *z, asg16_v *sc, int64_t *iok, int64_t *ink, int64_t *ick, int64_t *str_s, int64_t *str_e, uint64_t mz_k, uint64_t mz_h, tiny_queue_t *tq, 
All_reads *rref, UC_Read *tu, char *qstr, uint64_t qs, uint64_t qe) +{ + int64_t id = z->srt>>33; char *str = NULL; uint64_t rpos, rspan; + int64_t ok = *iok, nk = *ink, ck = *ick, cn = sc->n, s0, e0, s1, e1, si, ei, os, oe, ots, ote, ol, wo[2], wn[2], weo[2], wen[2], ovlp, len = Get_READ_LENGTH((*rref), id); uint16_t op, bq, bt; uint32_t cl; + e0 = ((uint32_t)z->srt) + 1; s0 = e0 - z->other_off; s1 = e1 = si = ei = -1; weo[0] = weo[1] = wen[0] = wen[1] = -1; + if(e0 <= s0) return 0; + + ///debug + // ok = nk = ck = 0; + + if((ck < 0) || (ck > cn)) {//(*ck) == cn is allowed + ck = ok = nk = 0; + } + + // if(id == 1/**s0 == 10260 && e0 == 10334**/) { + // fprintf(stderr, "\n\n-0-qk::%ld,\ttk::%ld,\tck::%ld,\ts0::%ld,\te0::%ld\n", ok, nk, ck, s0, e0); + // } + + while (ck > 0 && ok >= s0) {///x -> t; y -> p; first insertion and then match/mismatch + --ck; + op = sc->a[ck]>>14; + // ol = (((op == 1) || (op == 2))?(sc->a[ck]&(0xfff)):(sc->a[ck]&(0x3fff))); + if((op == 2) || (op == 3)) { + ol = sc->a[ck]&(0xfff); + } else if(op == 1) { + ol = sc->a[ck]&(0x3ff); + } else { + ol = sc->a[ck]&(0x3fff); + } + if(op != 2) ok -= ol; + if(op != 3) nk -= ol; + } + + // char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + // if(id == 1/**s0 == 10260 && e0 == 10334**/) { + // fprintf(stderr, "-1-qk::%ld,\ttk::%ld,\tck::%ld\n", ok, nk, ck); + // } + while (ck < cn && ok < e0) { ///[s0, e0) + wo[0] = ok; wn[0] = nk; + // ck = pop_trace_bp(sc, ck, &op, &b, &cl); + ck = pop_trace_bp_f(sc, ck, &op, &bq, &bt, &cl); + if(op != 2) ok += cl; + if(op != 3) nk += cl; + wo[1] = ok; wn[1] = nk; + + os = ((s0 >= wo[0])? s0 : wo[0]); + oe = ((e0 <= wo[1])? e0 : wo[1]); + // os = MAX(s0, wo[0]); oe = MIN(e0, wo[1]); + ovlp = ((oe>os)? 
(oe-os):0); + // if(id == 1/**s0 == 10260 && e0 == 10334**/) { + // fprintf(stderr, "%u%c(q::[%ld,%ld))(t::[%ld,%ld))(ck::%ld)\n", cl, cm[op], wo[0], wo[1], wn[0], wn[1], ck); + // } + + if(op != 2) { + if(!ovlp) continue; + } else {///wo[0] == wo[1] + if(wo[0] < s0 || wo[0] >= e0) continue; + } + + if(op < 2) { + ots = os - wo[0] + wn[0]; ote = oe - wo[0] + wn[0]; + } else {///op == 2: more y; p == 3: more x + ots = wn[0]; ote = wn[1]; + } + + if(s1 == -1) { + s1 = ots; + // if((op == 0) && (wn[1] > ots) && ((wn[0] < ots) || (ots == 0))) si = 1; + if((op == 0) && (wo[1] > s0) && ((wo[0] < s0) || (s1 == 0))) si = 1; + } + + e1 = ote; + if((op == 0) && (wo[0] < e0) && ((wo[1] > e0) || (e1 == len))) ei = 1; + + if((op == 0) && (ovlp > 0) && (ovlp > weo[1] - weo[0])) { + weo[0] = os; weo[1] = oe; + wen[0] = ots; wen[1] = ote; + } + } + + while (ck < cn && ok <= e0) { ///[s0, e0) + wo[0] = ok; wn[0] = nk; + // pop_trace_bp(sc, ck, &op, &b, &cl); + pop_trace_bp_f(sc, ck, &op, &bq, &bt, &cl); + if(op != 2) break; + // ck = pop_trace_bp(sc, ck, &op, &b, &cl); + ck = pop_trace_bp_f(sc, ck, &op, &bq, &bt, &cl); + // if(op != 2) ok += cl; + // if(op != 3) nk += cl; + nk += cl; + wo[1] = ok; wn[1] = nk; + + // if(id == 1/**s0 == 10260 && e0 == 10334**/) { + // fprintf(stderr, "%u%c(q::[%ld,%ld))(t::[%ld,%ld))(ck::%ld)\n", cl, cm[op], wo[0], wo[1], wn[0], wn[1], ck); + // } + + if(wo[0] >= s0 && wo[0] <= e0) { + ots = wn[0]; ote = wn[1]; + if(s1 == -1) s1 = ots; + e1 = ote; + } + } + + assert(wen[1] <= len); + assert(e1 <= len); + + *iok = ok; *ink = nk; *ick = ck; + if(e1 <= s1) return 0; + rpos = rspan = ((uint64_t)-1); + + if(weo[0] == s0 && weo[1] == e0) { + rpos = wen[1]-1; rspan = e0 - s0; + /** + if(si == 1 && ei == 1) { + z->srt >>= 32; z->srt <<= 32; z->srt |= ((uint64_t)(wen[1]-1)); + ///debug + // char sstr[256]; recover_UC_Read_sub_region(sstr, wen[0], wen[1] - wen[0], 0, rref, id); + // if(recalu_minimizer0(sstr, wen[1] - wen[0], !(asm_opt.flag & HA_F_NO_HPC), 
mz_k, mz_h, tq, &rpos, &rspan) && (rpos + 1 == ((uint64_t)(wen[1] - wen[0]))) && (((uint64_t)(wen[1] - wen[0])) == rspan)) { + // // fprintf(stderr, "-0-[M::%s]\n", __func__); + // } else { + // // if(((z->srt>>32)&1) == 0) { + // fprintf(stderr, "-1-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], wen[0], wen[1], s0, e0, e0 - s0); + // fprintf(stderr, "tstr::%.*s\n", ((uint32_t)(wen[1] - wen[0])), sstr); + // fprintf(stderr, "qstr::%.*s\n", ((uint32_t)(qe - qs)), qstr + qs); + // exit(1); + // // } + // } + // if(rid == 3196 && id == 3199) fprintf(stderr, "-full-[M::%s]\n", __func__); + return 1; + } + **/ + } else { + if(s1 >= (*str_s) && e1 <= (*str_e)) { + str = tu->seq + s1 - (*str_s); + } else { + if(s1 >= (*str_s) && (s1 < (*str_e)) && (e1 > (*str_e))) { + UC_Read_resize((*tu), (e1 - (*str_s))); + recover_UC_Read_sub_region(tu->seq + (*str_e) - (*str_s), (*str_e), e1 - (*str_e), 0, rref, id); + str = tu->seq + s1 - (*str_s); (*str_e) = e1; + } else { + UC_Read_resize((*tu), (e1 - s1)); + recover_UC_Read_sub_region(tu->seq, s1, e1 - s1, 0, rref, id); + str = tu->seq; (*str_s) = s1; (*str_e) = e1; + } + } + + if(recalu_minimizer0(str, e1 - s1, !(asm_opt.flag & HA_F_NO_HPC), mz_k, mz_h, tq, &rpos, &rspan)) { + rpos += s1; + } + } + + if(rpos != ((uint64_t)-1) && rspan != ((uint64_t)-1)) { + e0 = rpos + 1; s0 = e0 - rspan; + if(!(asm_opt.flag & HA_F_NO_HPC)) { + if(si == -1) { + s0 -= hpc_minimizer_test(id, rref, tu, *str_s, *str_e, s0, len, 1, 8); + assert(s0 >= 0 && s0 < len && s0 < e0); + } + + if(ei == -1) { + e0 += hpc_minimizer_test(id, rref, tu, *str_s, *str_e, e0-1, len, 0, 8); + assert(e0 >= 0 && e0 < len && s0 < e0); + } + } + + rpos = e0 - 1; rspan = e0 - s0; + if(rspan < 256) { + z->srt >>= 32; z->srt <<= 32; z->srt |= rpos; z->other_off = rspan; + return 1; + } + } + /** + if(recalu_minimizer0(str, e1 - s1, !(asm_opt.flag & HA_F_NO_HPC), mz_k, mz_h, tq, &rpos, &rspan)) { + rpos 
+= s1; + z->srt >>= 32; z->srt <<= 32; z->srt |= rpos; z->other_off = rspan; + // fprintf(stderr, "-1-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], s1, e1, s0, e0, e0 - s0); + // if(rid == 3196 && id == 3199) fprintf(stderr, "-part-[M::%s]\n", __func__); + return 1; + } + // fprintf(stderr, "-0-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], s1, e1, s0, e0, e0 - s0); + **/ + + return 0; +} + + +uint64_t recalu_minimizer_non_retrieve(uint64_t rid, anchor1_t *z, asg16_v *sc, int64_t *iok, int64_t *ink, int64_t *ick, uint64_t mz_k, uint64_t mz_h, tiny_queue_t *tq, char *tstr, int64_t tl, uint64_t trev/** , char *qstr, uint64_t qs, uint64_t qe**/) +{ + char *str = NULL; uint64_t rpos, rspan; char c; + int64_t ok = *iok, nk = *ink, ck = *ick, cn = sc->n, s0, e0, s1, e1, os, oe, ots, ote, ol, wo[2], wn[2], weo[2], wen[2], ovlp, k; uint16_t op, bq, bt; uint32_t cl; + e0 = ((uint32_t)z->srt) + 1; s0 = e0 - z->other_off; s1 = e1 = -1; weo[0] = weo[1] = wen[0] = wen[1] = -1; + if(e0 <= s0) return 0; + + + if((ck < 0) || (ck > cn)) {//(*ck) == cn is allowed + ck = ok = nk = 0; + } + + while (ck > 0 && ok >= s0) {///x -> t; y -> p; first insertion and then match/mismatch + --ck; + op = sc->a[ck]>>14; + // ol = (((op == 1) || (op == 2))?(sc->a[ck]&(0xfff)):(sc->a[ck]&(0x3fff))); + if((op == 2) || (op == 3)) { + ol = sc->a[ck]&(0xfff); + } else if(op == 1) { + ol = sc->a[ck]&(0x3ff); + } else { + ol = sc->a[ck]&(0x3fff); + } + if(op != 2) ok -= ol; + if(op != 3) nk -= ol; + } + + while (ck < cn && ok < e0) { ///[s0, e0) + wo[0] = ok; wn[0] = nk; + // ck = pop_trace_bp(sc, ck, &op, &b, &cl); + ck = pop_trace_bp_f(sc, ck, &op, &bq, &bt, &cl); + if(op != 2) ok += cl; + if(op != 3) nk += cl; + wo[1] = ok; wn[1] = nk; + + os = ((s0 >= wo[0])? s0 : wo[0]); + oe = ((e0 <= wo[1])? 
e0 : wo[1]); + // os = MAX(s0, wo[0]); oe = MIN(e0, wo[1]); + ovlp = ((oe>os)? (oe-os):0); + + if(op != 2) { + if(!ovlp) continue; + } else {///wo[0] == wo[1] + if(wo[0] < s0 || wo[0] >= e0) continue; + } + + if(op < 2) { + ots = os - wo[0] + wn[0]; ote = oe - wo[0] + wn[0]; + } else {///op == 2: more y; p == 3: more x + ots = wn[0]; ote = wn[1]; + } + + if(s1 == -1) s1 = ots; + e1 = ote; + + if((op == 0) && (ovlp > 0) && (ovlp > weo[1] - weo[0])) { + weo[0] = os; weo[1] = oe; + wen[0] = ots; wen[1] = ote; + } + } + + while (ck < cn && ok <= e0) { ///[s0, e0) + wo[0] = ok; wn[0] = nk; + // pop_trace_bp(sc, ck, &op, &b, &cl); + pop_trace_bp_f(sc, ck, &op, &bq, &bt, &cl); + if(op != 2) break; + // ck = pop_trace_bp(sc, ck, &op, &b, &cl); + ck = pop_trace_bp_f(sc, ck, &op, &bq, &bt, &cl); + // if(op != 2) ok += cl; + // if(op != 3) nk += cl; + nk += cl; + wo[1] = ok; wn[1] = nk; + + + if(wo[0] >= s0 && wo[0] <= e0) { + ots = wn[0]; ote = wn[1]; + if(s1 == -1) s1 = ots; + e1 = ote; + } + } + + assert(wen[1] <= tl); + assert(e1 <= tl); + + *iok = ok; *ink = nk; *ick = ck; + + rpos = rspan = ((uint64_t)-1); + if(weo[0] == s0 && weo[1] == e0) { + rpos = wen[1]-1; rspan = e0 - s0; + // z->srt >>= 32; z->srt <<= 32; z->srt |= ((uint64_t)(wen[1]-1)); + ///debug + // char sstr[256]; recover_UC_Read_sub_region(sstr, wen[0], wen[1] - wen[0], 0, rref, id); + // if(recalu_minimizer0(sstr, wen[1] - wen[0], !(asm_opt.flag & HA_F_NO_HPC), mz_k, mz_h, tq, &rpos, &rspan) && (rpos + 1 == ((uint64_t)(wen[1] - wen[0]))) && (((uint64_t)(wen[1] - wen[0])) == rspan)) { + // // fprintf(stderr, "-0-[M::%s]\n", __func__); + // } else { + // // if(((z->srt>>32)&1) == 0) { + // fprintf(stderr, "-1-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], wen[0], wen[1], s0, e0, e0 - s0); + // fprintf(stderr, "tstr::%.*s\n", ((uint32_t)(wen[1] - wen[0])), sstr); + // fprintf(stderr, "qstr::%.*s\n", ((uint32_t)(qe - qs)), qstr + qs); + 
// exit(1); + // // } + // } + // if(rid == 3196 && id == 3199) fprintf(stderr, "-full-[M::%s]\n", __func__); + // return 1; + } else { + if(e1 <= s1) return 0; + // if(s1 >= tl || e1 >= tl) return 0; + + if(trev) str = tstr + tl - e1; + else str = tstr + s1; + + if(recalu_minimizer0_adv(str, e1 - s1, !(asm_opt.flag & HA_F_NO_HPC), mz_k, mz_h, trev, tq, &rpos, &rspan)) { + rpos += s1; + // z->srt >>= 32; z->srt <<= 32; z->srt |= rpos; z->other_off = rspan; + // fprintf(stderr, "-1-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], s1, e1, s0, e0, e0 - s0); + // if(rid == 3196 && id == 3199) fprintf(stderr, "-part-[M::%s]\n", __func__); + // return 1; + } + } + + if(rpos != ((uint64_t)-1) && rspan != ((uint64_t)-1)) { + e0 = rpos + 1; s0 = e0 - rspan; + if(!(asm_opt.flag & HA_F_NO_HPC)) { + if(!trev) { + s1 = s0; e1 = e0; + } else { + s1 = tl - e0; e1 = tl - s0; + } + + c = tstr[s1]; + for (k = s1 - 1; k >= 0 && tstr[k] == c; k--); s1 = k + 1; + + c = tstr[e1-1]; + for (k = e1; k < tl && tstr[k] == c; k++); e1 = k; + + if(!trev) { + s0 = s1; e0 = e1; + } else { + s0 = tl - e1; e0 = tl - s1; + } + } + + rpos = e0 - 1; rspan = e0 - s0; + if(rspan < 256) { + z->srt >>= 32; z->srt <<= 32; z->srt |= rpos; z->other_off = rspan; + return 1; + } + } + // fprintf(stderr, "-0-[M::%s]\trid::%lu\ttid::%ld\t%c\trg1::[%ld,%ld)\trg0::[%ld,%ld)\tspan::%ld\n", __func__, rid, id, "+-"[(z->srt>>32)&1], s1, e1, s0, e0, e0 - s0); + + return 0; +} + +void hpc_ext_check(All_reads *rref, uint32_t id, int64_t s0, int64_t e0, int64_t l, uint64_t rev, char *buf) +{ + if(s0 < 0) s0 = 0; if(e0 > l) e0 = l; + int64_t s = s0 - 256, e = e0 + 256, n, k, os0, oe0, os1, oe1; char c; if(s < 0) s = 0; if(e > l) e = l; + recover_UC_Read_sub_region(buf, s, e - s, rev, rref, id); + os0 = s0 - s; oe0 = e0 - s; n = e - s; os1 = os0; oe1 = oe0; + c = buf[os0]; + for (k = os0 - 1; k >= 0 && buf[k] == c; k--); os1 = k + 1; + + c = buf[oe0-1]; 
+ for (k = oe0; k < n && buf[k] == c; k++); oe1 = k; + + if(os1 != os0 || oe1 != oe0) { + fprintf(stderr, "[M::%s]\to0::[%ld,%ld)\to1::[%ld,%ld)\n", __func__, os0 + s0, oe0 + s0, os1 + s0, oe1 + s0); + } + +} + +void h_ec_lchain_re_gen(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, ha_pt_t *ha_idx, All_reads *rref, overlap_region_alloc *olst, Candidates_list *cl, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, + int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip, UC_Read *tu, asg64_v *oidx, asg16_v *scc) +{ + uint64_t i, k, l, m, max_cnt = UINT32_MAX, min_cnt = 0; int n, n0, j; ha_mz1_t *z; seed1_t *s; tiny_queue_t tq; memset(&tq, 0, sizeof(tiny_queue_t)); + if(high_occ) { + max_cnt = (*high_occ); + if(max_cnt < 2) max_cnt = 2; + } + if(low_occ) { + min_cnt = (*low_occ); + if(min_cnt < 2) min_cnt = 2; + } + clear_Candidates_list(cl); ab->n_a = 0; + + // minimizer of queried read + if (ab->mz.m > ab->old_mz_m) { + ab->old_mz_m = ab->mz.m; + REALLOC(ab->seed, ab->old_mz_m); + } + + for (i = 0, ab->n_a = 0; i < ab->mz.n; ++i) { + ab->seed[i].a = ha_pt_get(ha_idx, ab->mz.a[i].x, &n); + ab->seed[i].n = n; + ab->n_a += n; + } + + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + REALLOC(ab->a, ab->m_a); + } + + for (i = 0, k = 0; i < ab->mz.n; ++i) { + ///z is one of the minimizer + z = &ab->mz.a[i]; s = &ab->seed[i]; + + // uint64_t rpos, rspan; + // if(!recalu_minimizer0(rs + (z->pos+1-z->span), z->span, !(asm_opt.flag & HA_F_NO_HPC), mz_k, z->x, &tq, &rpos, &rspan)) { + // fprintf(stderr, "-1-[M::%s]\t%c\trg0::[%u,%u)\trid::%u\n", __func__, "+-"[z->rev], z->pos+1-z->span, z->pos+1, rid); + // } + // else { + // fprintf(stderr, "-0-[M::%s]\t%c\trg0::[%u,%u)\trid::%u\n", __func__, "+-"[z->rev], 
z->pos+1-z->span, z->pos+1, rid); + // } + + for (j = 0; j < s->n; ++j) { + const ha_idxpos_t *y = &s->a[j]; + anchor1_t *an = &ab->a[k++]; + uint8_t rev = z->rev == y->rev? 0 : 1; + + // an->other_off = rev?((uint32_t)-1)-1-(y->pos+1-y->span):y->pos; + an->other_off = y->span; + + // an->self_off = z->pos; + an->self_off = i; + + ///an->cnt: cnt<<8|span + an->cnt = s->n; if(an->cnt > ((uint32_t)(0xffffffu))) an->cnt = 0xffffffu; + an->cnt <<= 8; an->cnt |= ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); + + // an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->self_off; + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | y->pos; + } + } + + // copy over to _cl_ + if (ab->m_a >= (uint64_t)cl->size) { + cl->size = ab->m_a; + REALLOC(cl->list, cl->size); + } + clear_overlap_region_alloc(olst); + + // char dbg[256]; uint64_t rpos, rspan, thash; + // char dbg[768]; + k_mer_hit *p; uint64_t tid = (uint64_t)-1, tl = (uint64_t)-1, trev, tspan, ol, zn, olst_n; int64_t ok, nk, ck, str_s, str_e; + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); + for (k = 1, l = ol = n = zn = 0; k <= ab->n_a; ++k) { + if (k == ab->n_a || (ab->a[k].srt>>32) != (ab->a[l].srt>>32)) { + for (; (ol < oidx->n) && ((oidx->a[ol]>>32) < (ab->a[l].srt>>32)); ol++); + if((ol < oidx->n) && ((ab->a[l].srt>>32) == (oidx->a[ol]>>32))) { + tl = Get_READ_LENGTH((*rref), ab->a[l].srt>>33); tid = ab->a[l].srt>>33; trev = (ab->a[l].srt>>32) & 1; olst_n = olst->length; + for (i = l, m = 0, ok = nk = ck = str_s = str_e = 0; i < k; i++) { + if(!recalu_minimizer(rid, &(ab->a[i]), &(scc[tid]), &ok, &nk, &ck, &str_s, &str_e, mz_k, ab->mz.a[ab->a[i].self_off].x, &tq, rref, tu, rs, ab->mz.a[ab->a[i].self_off].pos+1-ab->mz.a[ab->a[i].self_off].span, ab->mz.a[ab->a[i].self_off].pos+1)) continue; + ///debug + // recover_UC_Read_sub_region(dbg, ((uint32_t)ab->a[i].srt) + 1 - ab->a[i].other_off, ab->a[i].other_off, 0, rref, tid); + // if(recalu_minimizer0(dbg, ab->a[i].other_off, !(asm_opt.flag & 
HA_F_NO_HPC), mz_k, ab->mz.a[ab->a[i].self_off].x, &tq, &rpos, &rspan) && (rpos + 1 == ab->a[i].other_off) && (ab->a[i].other_off == rspan)) { + // // fprintf(stderr, "-0-[M::%s]\n", __func__); + // } else { + // fprintf(stderr, "-1-[M::%s]\n", __func__); + // } + // thash = ab->mz.a[ab->a[i].self_off].x; + ab->a[m] = ab->a[i]; tspan = ab->a[m].other_off; + ab->a[m].other_off = (uint32_t)ab->a[m].srt; + if(trev) ab->a[m].other_off = tl - (ab->a[m].other_off+1-tspan) - 1;///looks like a bug + ab->a[m].self_off = ab->mz.a[ab->a[m].self_off].pos; + ab->a[m].srt = ab->a[m].self_off; ab->a[m].srt <<= 32; ab->a[m].srt |= ab->a[m].other_off; + ///debug + // recover_UC_Read_sub_region(dbg, ab->a[m].other_off + 1 - tspan, tspan, trev, rref, tid); + // if(recalu_minimizer0(dbg, tspan, !(asm_opt.flag & HA_F_NO_HPC), mz_k, thash, &tq, &rpos, &rspan) && (rpos + 1 == tspan) && (tspan == rspan)) { + // fprintf(stderr, "-0-[M::%s]\n", __func__); + // } else { + // fprintf(stderr, "-1-[M::%s]\ttrev::%lu\n", __func__, trev); + // } + ///debug + // if(rid == 3196 && tid == 3199) hpc_ext_check(rref, tid, ab->a[m].other_off + 1 - tspan, ab->a[m].other_off + 1, tl, trev, dbg); + + + m++; + } + if(m > 1) radix_sort_ha_an1(ab->a, ab->a + m); + for (i = 0, n0 = n; i < m; i++) { + p = &cl->list[n++]; + p->readID = tid; + p->strand = trev; + p->offset = ab->a[i].other_off; + p->self_offset = ab->a[i].self_off; + if(((ab->a[i].cnt>>8) < max_cnt) && ((ab->a[i].cnt>>8) > min_cnt)){ + p->cnt = 1; + } else if((ab->a[i].cnt>>8) <= min_cnt) { + p->cnt = 2; + } else{ + p->cnt = 1 + (((ab->a[i].cnt>>8) + (max_cnt<<1) - 1)/(max_cnt<<1)); + p->cnt = pow(p->cnt, 1.1); + } + if(p->cnt > ((uint32_t)(0xffffffu))) p->cnt = 0xffffffu; + p->cnt <<= 8; p->cnt |= (((uint32_t)(0xffu))&(ab->a[i].cnt)); + } + + // if(rid == 3196 && tid == 3199) fprintf(stderr, "[M::%s]\ttid::%lu\ttrev::%lu\told::%lu\tnew::%lu\n", __func__, tid, trev, k - l, m); + + if(m > 0 && tid != rid) { + zn += lchain_qdp_mcopy_fast(cl, n0, 
n-n0, zn, &(cl->chainDP), olst, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, + rid, rl, tl, quick_check, apend_be, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, 1); + } + + if(olst->length > olst_n) { + oidx->a[ol] >>= 32; oidx->a[ol] <<= 32; oidx->a[ol] |= ((uint64_t)((uint32_t)-1)); + } + } + + l = k; + } + } + cl->length = zn; + + for (k = m = 0; k < oidx->n; k++) { + if(((uint32_t)oidx->a[k]) == ((uint32_t)-1)) continue; + oidx->a[m++] = oidx->a[k]; + } + // fprintf(stderr, "[M::%s]\ttot::%lu\tremain::%lu\n", __func__, (uint64_t)oidx->n, m); + oidx->n = m; + + for (i = 0; i < olst->length; ++i) olst->list[i].align_length = 0; + + // minimizers_qgen0(ab, rs, rl, mz_w, mz_k, cl, k_flag, ha_flt_tab, ha_idx, rref, dbg_ct, sp, high_occ, low_occ); + + // lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); +} + + +void h_ec_lchain_re_gen3(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, ha_pt_t *ha_idx, All_reads *rref, overlap_region_alloc *olst, Candidates_list *cl, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, + int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip, UC_Read *tu, asg64_v *oidx, asg16_v *scc) +{ + uint64_t i, k, l, m, max_cnt = UINT32_MAX, min_cnt = 0; int n, n0, j; ha_mz1_t *z; seed1_t *s; tiny_queue_t tq; memset(&tq, 0, sizeof(tiny_queue_t)); + if(high_occ) { + max_cnt = (*high_occ); + if(max_cnt < 2) max_cnt = 2; + } + if(low_occ) { + min_cnt = (*low_occ); + if(min_cnt < 2) min_cnt = 2; + } + clear_Candidates_list(cl); ab->n_a = 0; + + // minimizer of queried read + if (ab->mz.m > 
ab->old_mz_m) { + ab->old_mz_m = ab->mz.m; + REALLOC(ab->seed, ab->old_mz_m); + } + + for (i = 0, ab->n_a = 0; i < ab->mz.n; ++i) { + ab->seed[i].a = ha_pt_get(ha_idx, ab->mz.a[i].x, &n); + ab->seed[i].n = n; + ab->n_a += n; + } + + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + REALLOC(ab->a, ab->m_a); + } + + for (i = 0, k = 0; i < ab->mz.n; ++i) { + ///z is one of the minimizer + z = &ab->mz.a[i]; s = &ab->seed[i]; + + // uint64_t rpos, rspan; + // if(!recalu_minimizer0(rs + (z->pos+1-z->span), z->span, !(asm_opt.flag & HA_F_NO_HPC), mz_k, z->x, &tq, &rpos, &rspan)) { + // fprintf(stderr, "-1-[M::%s]\t%c\trg0::[%u,%u)\trid::%u\n", __func__, "+-"[z->rev], z->pos+1-z->span, z->pos+1, rid); + // } + // else { + // fprintf(stderr, "-0-[M::%s]\t%c\trg0::[%u,%u)\trid::%u\n", __func__, "+-"[z->rev], z->pos+1-z->span, z->pos+1, rid); + // } + + for (j = 0; j < s->n; ++j) { + const ha_idxpos_t *y = &s->a[j]; + anchor1_t *an = &ab->a[k++]; + uint8_t rev = z->rev == y->rev? 0 : 1; + + // an->other_off = rev?((uint32_t)-1)-1-(y->pos+1-y->span):y->pos; + an->other_off = y->span; + + // an->self_off = z->pos; + an->self_off = i; + + ///an->cnt: cnt<<8|span + an->cnt = s->n; if(an->cnt > ((uint32_t)(0xffffffu))) an->cnt = 0xffffffu; + an->cnt <<= 8; an->cnt |= ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); + + // an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->self_off; + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | y->pos; + } + } + + // copy over to _cl_ + if (ab->m_a >= (uint64_t)cl->size) { + cl->size = ab->m_a; + REALLOC(cl->list, cl->size); + } + clear_overlap_region_alloc(olst); + + // char dbg[256]; uint64_t rpos, rspan, thash; + // char dbg[768]; uint64_t rpos, rspan, thash; + k_mer_hit *p; uint64_t tid = (uint64_t)-1, tl = (uint64_t)-1, trev, tspan, ol, zn, olst_n, sk, sv; int64_t ok, nk, ck, str_s, str_e; + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); + for (k = 1, l = ol = n = zn = 0; k <= ab->n_a; ++k) { + if (k == ab->n_a || 
(ab->a[k].srt>>32) != (ab->a[l].srt>>32)) { + + for (sk = sv = 0; (ol < oidx->n) && ((oidx->a[ol]>>33) < (ab->a[l].srt>>33)); ol++); + if((ol < oidx->n) && ((ab->a[l].srt>>33) == (oidx->a[ol]>>33))) { + for (; (ol < oidx->n) && ((oidx->a[ol]>>32) < (ab->a[l].srt>>32)); ol++); + if((ol < oidx->n) && ((ab->a[l].srt>>32) == (oidx->a[ol]>>32))) { + if(((uint32_t)oidx->a[ol]) == ((uint32_t)-1)) sk = 1;///exact match + else sv = 1; + } else {///match in rev + sk = 1; + } + } + + if(!sk) { + tl = Get_READ_LENGTH((*rref), ab->a[l].srt>>33); tid = ab->a[l].srt>>33; trev = (ab->a[l].srt>>32) & 1; olst_n = olst->length; + if(tid != rid) { + for (i = l, m = 0, ok = nk = ck = str_s = str_e = 0; i < k; i++) { + if(!recalu_minimizer_bd(rid, &(ab->a[i]), &(scc[tid]), &ok, &nk, &ck, &str_s, &str_e, mz_k, ab->mz.a[ab->a[i].self_off].x, &tq, rref, tu, rs, ab->mz.a[ab->a[i].self_off].pos+1-ab->mz.a[ab->a[i].self_off].span, ab->mz.a[ab->a[i].self_off].pos+1)) continue; + ///debug + // recover_UC_Read_sub_region(dbg, ((uint32_t)ab->a[i].srt) + 1 - ab->a[i].other_off, ab->a[i].other_off, 0, rref, tid); + // if(recalu_minimizer0(dbg, ab->a[i].other_off, !(asm_opt.flag & HA_F_NO_HPC), mz_k, ab->mz.a[ab->a[i].self_off].x, &tq, &rpos, &rspan) && (rpos + 1 == ab->a[i].other_off) && (ab->a[i].other_off == rspan)) { + // // fprintf(stderr, "-0-[M::%s]\n", __func__); + // } else { + // fprintf(stderr, "-1-[M::%s]\n", __func__); + // } + // thash = ab->mz.a[ab->a[i].self_off].x; + ab->a[m] = ab->a[i]; tspan = ab->a[m].other_off; + ab->a[m].other_off = (uint32_t)ab->a[m].srt; + if(trev) ab->a[m].other_off = tl - (ab->a[m].other_off+1-tspan) - 1;///looks like a bug + ab->a[m].self_off = ab->mz.a[ab->a[m].self_off].pos; + ab->a[m].srt = ab->a[m].self_off; ab->a[m].srt <<= 32; ab->a[m].srt |= ab->a[m].other_off; + ///debug + /** + recover_UC_Read_sub_region(dbg, ab->a[m].other_off + 1 - tspan, tspan, trev, rref, tid); + if(recalu_minimizer0(dbg, tspan, !(asm_opt.flag & HA_F_NO_HPC), mz_k, thash, &tq, 
&rpos, &rspan) && (rpos + 1 == tspan) && (tspan == rspan)) { + // fprintf(stderr, "-0-[M::%s]\n", __func__); + } else { + fprintf(stderr, "-1-[M::%s]\ttrev::%lu\n", __func__, trev); + } + ///debug + hpc_ext_check(rref, tid, ab->a[m].other_off + 1 - tspan, ab->a[m].other_off + 1, tl, trev, dbg); + **/ + + m++; + } + if(m > 1) radix_sort_ha_an1(ab->a, ab->a + m); + for (i = 0, n0 = n; i < m; i++) { + p = &cl->list[n++]; + p->readID = tid; + p->strand = trev; + p->offset = ab->a[i].other_off; + p->self_offset = ab->a[i].self_off; + if(((ab->a[i].cnt>>8) < max_cnt) && ((ab->a[i].cnt>>8) > min_cnt)){ + p->cnt = 1; + } else if((ab->a[i].cnt>>8) <= min_cnt) { + p->cnt = 2; + } else{ + p->cnt = 1 + (((ab->a[i].cnt>>8) + (max_cnt<<1) - 1)/(max_cnt<<1)); + p->cnt = pow(p->cnt, 1.1); + } + if(p->cnt > ((uint32_t)(0xffffffu))) p->cnt = 0xffffffu; + p->cnt <<= 8; p->cnt |= (((uint32_t)(0xffu))&(ab->a[i].cnt)); + } + + if(m > 0) { + zn += lchain_qdp_mcopy_fast(cl, n0, n-n0, zn, &(cl->chainDP), olst, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, + rid, rl, tl, quick_check, apend_be, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, 1); + } + + if(sv && olst->length > olst_n) { + oidx->a[ol] >>= 32; oidx->a[ol] <<= 32; oidx->a[ol] |= ((uint64_t)((uint32_t)-1)); + } + } + } + + l = k; + } + } + cl->length = zn; + + for (k = m = 0; k < oidx->n; k++) { + if(((uint32_t)oidx->a[k]) == ((uint32_t)-1)) continue; + oidx->a[m++] = oidx->a[k]; + } + // fprintf(stderr, "[M::%s]\ttot::%lu\tremain::%lu\n", __func__, (uint64_t)oidx->n, m); + oidx->n = m; + + for (i = 0; i < olst->length; ++i) olst->list[i].align_length = 0; + + // minimizers_qgen0(ab, rs, rl, mz_w, mz_k, cl, k_flag, ha_flt_tab, ha_idx, rref, dbg_ct, sp, high_occ, low_occ); + + // lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); +} 
+ + +void h_ec_lchain_re_gen_srt(ha_abuf_t *ab, ha_pt_t *ha_idx, overlap_region_alloc *olst, Candidates_list *cl) +{ + uint64_t i, k; int j, n; ha_mz1_t *z; seed1_t *s; + + clear_Candidates_list(cl); ab->n_a = 0; + + // minimizer of queried read + if (ab->mz.m > ab->old_mz_m) { + ab->old_mz_m = ab->mz.m; + REALLOC(ab->seed, ab->old_mz_m); + } + + for (i = 0, ab->n_a = 0; i < ab->mz.n; ++i) { + ab->seed[i].a = ha_pt_get(ha_idx, ab->mz.a[i].x, &n); + ab->seed[i].n = n; + ab->n_a += n; + } + + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + REALLOC(ab->a, ab->m_a); + } + + for (i = 0, k = 0; i < ab->mz.n; ++i) { + ///z is one of the minimizer + z = &ab->mz.a[i]; s = &ab->seed[i]; + + for (j = 0; j < s->n; ++j) { + const ha_idxpos_t *y = &s->a[j]; + anchor1_t *an = &ab->a[k++]; + uint8_t rev = z->rev == y->rev? 0 : 1; + + an->other_off = y->span; + an->self_off = i; + ///an->cnt: cnt<<8|span + an->cnt = s->n; if(an->cnt > ((uint32_t)(0xffffffu))) an->cnt = 0xffffffu; + an->cnt <<= 8; an->cnt |= ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | y->pos; + } + } + + // copy over to _cl_ + if (ab->m_a >= (uint64_t)cl->size) { + cl->size = ab->m_a; + REALLOC(cl->list, cl->size); + } + clear_overlap_region_alloc(olst); + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); +} + +uint64_t h_ec_lchain_re_gen_qry(ha_abuf_t *ab, uint64_t *k, uint64_t *l, uint64_t *i, uint64_t *idx_a, uint64_t idx_n, uint64_t *tid, uint64_t *trev) +{ + while ((*k) <= ab->n_a) { + if ((*k) == ab->n_a || (ab->a[*k].srt>>32) != (ab->a[*l].srt>>32)) { + for (; ((*i) < idx_n) && ((idx_a[*i]>>32) < (ab->a[*l].srt>>32)); (*i)++); + if(((*i) < idx_n) && ((ab->a[*l].srt>>32) == (idx_a[*i]>>32))) { + (*tid) = ab->a[*l].srt>>33; (*trev) = (ab->a[*l].srt>>32) & 1; + + return 1; + } + (*l) = (*k); + } + ++(*k); + } + return 0; +} + +uint64_t h_ec_lchain_re_chn(ha_abuf_t *ab, uint64_t si, uint64_t ei, uint32_t rid, char* rs, uint64_t rl, uint64_t tid, 
char* ts, uint64_t tl, uint64_t trev, uint64_t mz_w, uint64_t mz_k, overlap_region_alloc *olst, Candidates_list *cl, double bw_thres, + int apend_be, uint64_t max_cnt, uint64_t min_cnt, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip, tiny_queue_t *tq, asg16_v *scc, int64_t *n, int64_t *zn) +{ + // char dbg[768]; uint64_t rpos, rspan, thash; + + int64_t ok, nk, ck, n0; uint64_t i, m, tspan, on0; k_mer_hit *p; + for (i = si, m = ok = nk = ck = 0; i < ei; i++) { + if(!recalu_minimizer_non_retrieve(rid, &(ab->a[i]), &(scc[tid]), &ok, &nk, &ck, mz_k, ab->mz.a[ab->a[i].self_off].x, tq, ts, tl, trev)) continue; + ///debug + // thash = ab->mz.a[ab->a[i].self_off].x; + + ab->a[m] = ab->a[i]; tspan = ab->a[m].other_off; + ab->a[m].other_off = (uint32_t)ab->a[m].srt; + if(trev) ab->a[m].other_off = tl - (ab->a[m].other_off+1-tspan) - 1;///looks like a bug + ab->a[m].self_off = ab->mz.a[ab->a[m].self_off].pos; + ab->a[m].srt = ab->a[m].self_off; ab->a[m].srt <<= 32; ab->a[m].srt |= ab->a[m].other_off; + + + ///debug + // recover_UC_Read_sub_region(dbg, ab->a[m].other_off + 1 - tspan, tspan, trev, &R_INF, tid); + // if(recalu_minimizer0(dbg, tspan, !(asm_opt.flag & HA_F_NO_HPC), mz_k, thash, tq, &rpos, &rspan) && (rpos + 1 == tspan) && (tspan == rspan)) { + // // fprintf(stderr, "-0-[M::%s]\n", __func__); + // } else { + // fprintf(stderr, "-1-[M::%s]\ttrev::%lu\n", __func__, trev); + // } + ///debug + // hpc_ext_check(&R_INF, tid, ab->a[m].other_off + 1 - tspan, ab->a[m].other_off + 1, tl, trev, dbg); + + m++; + } + + if(m > 1) radix_sort_ha_an1(ab->a, ab->a + m); + + for (i = 0, n0 = (*n); i < m; i++) { + p = &cl->list[(*n)++]; + p->readID = tid; + p->strand = trev; + p->offset = ab->a[i].other_off; + p->self_offset = ab->a[i].self_off; + if(((ab->a[i].cnt>>8) < max_cnt) && ((ab->a[i].cnt>>8) > min_cnt)){ + p->cnt = 1; + } else 
if((ab->a[i].cnt>>8) <= min_cnt) { + p->cnt = 2; + } else{ + p->cnt = 1 + (((ab->a[i].cnt>>8) + (max_cnt<<1) - 1)/(max_cnt<<1)); + p->cnt = pow(p->cnt, 1.1); + } + if(p->cnt > ((uint32_t)(0xffffffu))) p->cnt = 0xffffffu; + p->cnt <<= 8; p->cnt |= (((uint32_t)(0xffu))&(ab->a[i].cnt)); + } + + // if(rid == 3196 && tid == 3199) fprintf(stderr, "[M::%s]\ttid::%lu\ttrev::%lu\told::%lu\tnew::%lu\n", __func__, tid, trev, k - l, m); + // fprintf(stderr, "[M::%s]\ttid::%lu\ttrev::%lu\told::%lu\tnew::%lu\n", __func__, tid, trev, ei - si, m); + + if(m > 0 && tid != rid) { + on0 = olst->length; + *zn += lchain_qdp_mcopy_fast(cl, n0, (*n)-n0, *zn, &(cl->chainDP), olst, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, + rid, rl, tl, quick_check, apend_be, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, 1); + cl->length = *zn; + if(olst->length > on0) return 1; + } + + cl->length = *zn; + return 0; +} + +uint64_t get_mz1(const char *str, int len, int w, int k, uint32_t rid, int is_hpc, ha_abuf_t *ab, const void *hf, ha_pt_t *ha_idx, int sample_dist, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, ha_pt_t *pt, int min_freq, int32_t dp_min_len, float dp_e, st_mt_t *mt, int32_t ws, int32_t is_unique, void *km, uint64_t beg_i) +{ + ab->mz.n = beg_i; + // get the list of anchors + mz1_ha_sketch(str, len, w, k, 0, !(asm_opt.flag & HA_F_NO_HPC), &ab->mz, hf, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, mt, asm_opt.mz_rewin, 0, NULL); + + radix_sort_ha_mz1_v_srt(ab->mz.a + beg_i, ab->mz.a + ab->mz.n); + + if(ha_idx) { + uint64_t i; + + if (ab->mz.m > ab->old_mz_m) { + ab->old_mz_m = ab->mz.m; + REALLOC(ab->seed, ab->old_mz_m); + } + + for (i = 0; i < ab->mz.n; ++i) { + ab->seed[i].a = NULL; + ab->seed[i].n = ha_pt_cnt(ha_idx, ab->mz.a[i].x); + } + } + + return ab->mz.n; +} + +void get_pi_ec_chain(ha_abuf_t *ab, uint64_t rid, uint64_t rl, uint32_t tid, char* ts, uint64_t tl, uint64_t mz_w, uint64_t mz_k, overlap_region_alloc *overlap_list, 
Candidates_list *cl, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, /**uint32_t is_accurate,**/ uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, + int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip) +{ + extern void *ha_flt_tab; + extern ha_pt_t *ha_idx; + uint64_t rn = ab->mz.n, tn, cn = 0; + + tn = get_mz1(ts, tl, mz_w, mz_k, 0, !(asm_opt.flag & HA_F_NO_HPC), ab, ha_flt_tab, NULL, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL, rn); + + cn = lchain_qgen_mcopy_fast_re0(ab, ha_idx, ab->mz.a, rn, ab->mz.a + rn, tn - rn, tid, cl, high_occ, low_occ); + + if(cn) { + lchain_qgen_mcopy_fast_re1(cl, cl->length - cn, overlap_list, rid, rl, tl, apend_be, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut); + } + + ab->mz.n = rn; +} + + int64_t ug_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, double bw_thres_sec, int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, double mcopy_rate, uint32_t mcopy_khit_cut, uint32_t is_hpc, ha_mzl_t *res, uint64_t res_n, ha_mzl_t *idx, uint64_t idx_n, uint64_t mzl_cutoff, uint64_t chain_cutoff, kv_u_trans_t *kov) diff --git a/ecovlp.cpp b/ecovlp.cpp index d3f0176..d9de983 100644 --- a/ecovlp.cpp +++ b/ecovlp.cpp @@ -8,36 +8,69 @@ #include "htab.h" #define HA_KMER_GOOD_RATIO 0.333 #define E_KHIT 31 +#define CNS_DEL_E (0x7fffffffu) +#define del_cns_arc(z, arc_i) ((z).arc.a[(arc_i)].v == CNS_DEL_E) +#define CNS_DEL_V (0x1fffffffu) +#define 
del_cns_nn(z, nn_i) ((z).a[(nn_i)].sc == CNS_DEL_V) +#define REFRESH_N 128 + #define generic_key(x) (x) KRADIX_SORT_INIT(ec16, uint16_t, generic_key, 2) KRADIX_SORT_INIT(ec32, uint32_t, generic_key, 4) KRADIX_SORT_INIT(ec64, uint64_t, generic_key, 8) -void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); +#define kdq_clear(q) ((q)->count = (q)->front = 0) + +typedef struct {size_t n, m; asg16_v *a; uint8_t *f; } cc_v; +cc_v scc = {0, 0, NULL, NULL}; +cc_v scb = {0, 0, NULL, NULL}; +cc_v sca = {0, 0, NULL, NULL}; +typedef struct {size_t n, m; char *a; UC_Read z;} sl_v; -ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n, uint32_t is_final, uint32_t save_ov) +void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); +void h_ec_lchain_amz(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); +void 
h_ec_lchain_re_gen(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, ha_pt_t *ha_idx, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, + int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip, UC_Read *tu, asg64_v *oidx, asg16_v *scc); +void h_ec_lchain_re_gen3(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, ha_pt_t *ha_idx, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, + int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip, UC_Read *tu, asg64_v *oidx, asg16_v *scc); +uint64_t get_mz1(const char *str, int len, int w, int k, uint32_t rid, int is_hpc, ha_abuf_t *ab, const void *hf, ha_pt_t *ha_idx, int sample_dist, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, ha_pt_t *pt, int min_freq, int32_t dp_min_len, float dp_e, st_mt_t *mt, int32_t ws, int32_t is_unique, void *km, uint64_t beg_i); +void get_pi_ec_chain(ha_abuf_t *ab, uint64_t rid, uint64_t rl, uint32_t tid, char* ts, uint64_t tl, uint64_t mz_w, uint64_t mz_k, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, /**uint32_t is_accurate,**/ uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, + int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double 
chn_pen_gap, double chn_pen_skip); +void set_lchain_dp_op(uint32_t is_accurate, uint32_t mz_k, int64_t *max_skip, int64_t *max_iter, int64_t *max_dis, double *chn_pen_gap, double *chn_pen_skip, int64_t *quick_check); +void h_ec_lchain_re_gen_srt(ha_abuf_t *ab, ha_pt_t *ha_idx, overlap_region_alloc *olst, Candidates_list *cl); +uint64_t h_ec_lchain_re_gen_qry(ha_abuf_t *ab, uint64_t *k, uint64_t *l, uint64_t *i, uint64_t *idx_a, uint64_t idx_n, uint64_t *tid, uint64_t *trev); +uint64_t h_ec_lchain_re_chn(ha_abuf_t *ab, uint64_t si, uint64_t ei, uint32_t rid, char* rs, uint64_t rl, uint64_t tid, char* ts, uint64_t tl, uint64_t trev, uint64_t mz_w, uint64_t mz_k, overlap_region_alloc *olst, Candidates_list *cl, double bw_thres, + int apend_be, uint64_t max_cnt, uint64_t min_cnt, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip, tiny_queue_t *tq, asg16_v *scc, int64_t *n, int64_t *zn); +overlap_region* h_ec_lchain_fast(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v *buf, asg64_v *srt_i, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1, double sh); +void h_ec_lchain_fast_new(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v *buf, asg64_v *srt_i, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1, double sh); + +ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n) { uint32_t k; ec_ovec_buf_t0 *z = NULL; ec_ovec_buf_t *p = NULL; CALLOC(p, 1); p->n = n; CALLOC(p->a, p->n); for (k = 0; k < p->n; k++) 
{ z = &(p->a[k]); - z->is_final = !!is_final; z->save_ov = !!save_ov; init_UC_Read(&z->self_read); init_UC_Read(&z->ovlp_read); init_Candidates_list(&z->clist); init_overlap_region_alloc(&z->olist); - init_fake_cigar(&(z->tmp.f_cigar)); - memset(&(z->tmp.w_list), 0, sizeof(z->tmp.w_list)); - CALLOC(z->tmp.w_list.a, 1); z->tmp.w_list.n = z->tmp.w_list.m = 1; + // init_fake_cigar(&(z->tmp.f_cigar)); + // memset(&(z->tmp.w_list), 0, sizeof(z->tmp.w_list)); + // CALLOC(z->tmp.w_list.a, 1); z->tmp.w_list.n = z->tmp.w_list.m = 1; // kv_init(z->b_buf.a); - kv_init(z->r_buf.a); + // kv_init(z->r_buf.a); kv_init(z->k_flag.a); kv_init(z->sp); kv_init(z->pidx); @@ -47,13 +80,9 @@ ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n, uint32_t is_final, uint32_t save_ov init_bit_extz_t(&(z->exz), 31); z->ab = ha_abuf_init(); - if (!z->is_final) { - init_Cigar_record(&z->cigar); - // init_Graph(&b->POA_Graph); - // init_Graph(&b->DAGCon); - init_Correct_dumy(&z->correct); - InitHaplotypeEvdience(&z->hap); - } + + InitHaplotypeEvdience(&z->hap); + z->cns.q = kdq_init(uint32_t); } return p; @@ -63,26 +92,25 @@ void destroy_cns_gfa(cns_gfa *p) { size_t k; for (k = 0; k < p->m; k++) { - kv_destroy(p->a[k].in); - kv_destroy(p->a[k].ou); + kv_destroy(p->a[k].arc); } - free(p->a); + free(p->a); kdq_destroy(uint32_t, p->q); } void destroy_ec_ovec_buf_t(ec_ovec_buf_t *p) { uint32_t k; ec_ovec_buf_t0 *z = NULL; for (k = 0; k < p->n; k++) { - z = &(p->a[k]); + z = &(p->a[k]); z->rr = 0; destory_UC_Read(&z->self_read); destory_UC_Read(&z->ovlp_read); destory_Candidates_list(&z->clist); destory_overlap_region_alloc(&z->olist); - destory_fake_cigar(&(z->tmp.f_cigar)); - free(z->tmp.w_list.a); free(z->tmp.w_list.c.a); + // destory_fake_cigar(&(z->tmp.f_cigar)); + // free(z->tmp.w_list.a); free(z->tmp.w_list.c.a); - kv_destroy(z->r_buf.a); + // kv_destroy(z->r_buf.a); kv_destroy(z->k_flag.a); kv_destroy(z->sp); kv_destroy(z->pidx); @@ -92,27 +120,48 @@ void destroy_ec_ovec_buf_t(ec_ovec_buf_t *p) 
destroy_bit_extz_t(&(z->exz)); ha_abuf_destroy(z->ab); - if (!z->is_final) { - destory_Cigar_record(&z->cigar); - // destory_Graph(&b->POA_Graph); - // destory_Graph(&b->DAGCon); - destory_Correct_dumy(&z->correct); - destoryHaplotypeEvdience(&z->hap); - } + + destoryHaplotypeEvdience(&z->hap); destroy_cns_gfa(&(z->cns)); - asm_opt.num_bases += z->num_read_base; - asm_opt.num_corrected_bases += z->num_correct_base; - asm_opt.num_recorrected_bases += z->num_recorrect_base; - // asm_opt.mem_buf += ha_ovec_mem(b[i], NULL); } free(p->a); free(p); - fprintf(stderr, "[M::%s-chains] #->%lld\n", __func__, asm_opt.num_bases); - fprintf(stderr, "[M::%s-passed-chains-0] #->%lld\n", __func__, asm_opt.num_corrected_bases); - fprintf(stderr, "[M::%s-cis-chains-1] #->%lld\n", __func__, asm_opt.num_recorrected_bases); + // fprintf(stderr, "[M::%s-chains] #->%lld\n", __func__, asm_opt.num_bases); + // fprintf(stderr, "[M::%s-passed-chains-0] #->%lld\n", __func__, asm_opt.num_corrected_bases); + // fprintf(stderr, "[M::%s-cis-chains-1] #->%lld\n", __func__, asm_opt.num_recorrected_bases); +} + +inline void refresh_ec_ovec_buf_t0(ec_ovec_buf_t0 *z, uint64_t n) +{ + z->rr++; + if((z->rr%n) == 0) { + free(z->self_read.seq); memset(&(z->self_read), 0, sizeof(z->self_read)); + free(z->ovlp_read.seq); memset(&(z->ovlp_read), 0, sizeof(z->ovlp_read)); + + destory_Candidates_list(&z->clist); memset(&(z->clist), 0, sizeof(z->clist)); + destory_overlap_region_alloc(&z->olist); memset(&(z->olist), 0, sizeof(z->olist)); init_overlap_region_alloc(&z->olist); + + kv_destroy(z->k_flag.a); kv_init(z->k_flag.a); + kv_destroy(z->sp); kv_init(z->sp); + kv_destroy(z->pidx); kv_init(z->pidx); + kv_destroy(z->v64); kv_init(z->v64); + kv_destroy(z->v32); kv_init(z->v32); + kv_destroy(z->v16); kv_init(z->v16); + + destroy_bit_extz_t(&(z->exz)); init_bit_extz_t(&(z->exz), 31); + + ha_abuf_destroy(z->ab); z->ab = ha_abuf_init(); + + destoryHaplotypeEvdience(&z->hap); memset(&(z->hap), 0, sizeof(z->hap)); 
InitHaplotypeEvdience(&z->hap); + + destroy_cns_gfa(&(z->cns)); memset(&(z->cns), 0, sizeof(z->cns)); z->cns.q = kdq_init(uint32_t); + + // z->rr = 1; + } } + void prt_chain(overlap_region_alloc *o) { uint64_t k; @@ -138,7 +187,7 @@ overlap_region *fetch_aux_ovlp(overlap_region_alloc* ol) /// exactly same to gen typedef struct { ul_ov_t *c_idx; asg64_v *idx; - int64_t i, i0, srt_n, rr; + int64_t i, i0, srt_n, rr, ru; uint64_t mms, mme; } cc_idx_t; @@ -226,7 +275,7 @@ int64_t extract_sub_cigar_mm(overlap_region *z, int64_t s, int64_t e, ul_ov_t *p #define simp_vote_len 6 ///[s, e) -uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int64_t e, UC_Read* tu, ul_ov_t *p) +uint32_t extract_sub_cigar_ii(overlap_region *z, int64_t ql, All_reads *rref, int64_t s, int64_t e, int64_t iws, int64_t iwe, UC_Read* tu, ul_ov_t *p) { int64_t wk = ovlp_cur_wid(*p), xk = ovlp_cur_xoff(*p), yk = ovlp_cur_yoff(*p), ck = ovlp_cur_coff(*p), os, oe, ol; bit_extz_t ez; int64_t bd = ovlp_bd(*p), s0, e0, ii[2], it[2]; uint32_t res = (uint32_t)-1; @@ -236,6 +285,10 @@ uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int if(s > e) return -1;///it is possible s == e os = MAX(s, s0); oe = MIN(e, e0); if(oe < os) return -1;///it is possible os == oe + // fprintf(stderr, "[M::%s] s0::%ld, e0::%ld, iws::%ld, iwe::%ld\n", __func__, s0, e0, iws, iwe); + ///make sure that this alignment block could cover the whole [iws, iwe) -> s0 < iws && e0 > iwe + // if((s0 >= iws) || (e0 <= iwe)) return -1;///!(s0 < iws && e0 > iwe) -> only consider the alignment that could cover the whole [s, e) + if(!(((s0 < iws) || (s0 == 0)) && ((e0 > iwe) || (e0 == ql)))) return -1;///!(s0 < iws && e0 > iwe) -> only consider the alignment that could cover the whole [s, e) set_bit_extz_t(ez, (*z), wk); if(!ez.cigar.n) return -1; @@ -251,7 +304,7 @@ uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int if(op!=3) yk -= (ez.cigar.a[ck]&(0x3fff)); } - 
char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + // char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; //some cigar will span s or e ii[0] = ii[1] = it[0] = it[1] = -1; res = cc = 0; while (ck < cn && xk < e) {//[s, e) @@ -268,15 +321,18 @@ uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int os = MAX(s, ws); oe = MIN(e, we); ovlp = ((oe>os)? (oe-os):0); - if(op != 2) { - if(!ovlp) continue; - } else {///ws == we - if(ws < s || ws >= e) continue; + + if(s == e) {///insertion in comparsion with the reference + if(op != 0 || ws >= s || we <= e || e != iwe || s != iws) continue;///must be a match + } else { + if(op != 2) { + if(!ovlp) continue; + } else {///ws == we + if(ws < s || ws >= e) continue; + } } - - - + if(ii[0] == -1) { ii[0] = os; if(op < 2) { @@ -298,7 +354,9 @@ uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int if(op != 2) ol = oe-os; cc += ol; - fprintf(stderr, "%ld%c", ol, cm[op]); + // if(s == 11851 && e == 11853) { + // if(!ol) fprintf(stderr, "%ld%c", ol, cm[op]); + // } if(cc <= simp_vote_len) { for (cci = 0; cci < ol; cci++) { res <<= 2; res |= op; @@ -315,12 +373,14 @@ uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int if(ws >= s && ws <= e) { if(ii[0] == -1) { - ii[0] = os; it[0] = wts; + ii[0] = ws; it[0] = wts; } - ii[1] = oe; it[1] = wte; + ii[1] = we; it[1] = wte; cc += ol; - fprintf(stderr, "%ld%c", ol, cm[op]); + // if(s == 11851 && e == 11853) { + // fprintf(stderr, "%ld%c", ol, cm[op]); + // } if(cc <= simp_vote_len) { for (cci = 0; cci < ol; cci++) { res <<= 2; res |= op; @@ -328,13 +388,14 @@ uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int } } } - - fprintf(stderr, "\tx::[%ld, %ld)\ty::[%ld, %ld)\tcc::%ld\n", ii[0], ii[1], it[0], it[1], cc); + // if(s == 11851 && e == 11853) { + // fprintf(stderr, "\tx::[%ld, %ld)\ty::[%ld, %ld)\tcc::%ld\n", ii[0], ii[1], it[0], it[1], cc); + // } if((cc 
<= simp_vote_len) && (ii[1] >= ii[0]) && (ii[1] - ii[0] <= simp_vote_len) && (it[1] >= it[0]) && (it[1] - it[0] <= simp_vote_len)) { // ii[0] = ii[0] - s; ii[1] = e - ii[1]; - if((ii[0] == s) && (ii[1] == e)) { + if((ii[0] == iws) && (ii[1] == iwe)) { op = cc; op <<= 12; res |= op; char *ystr = NULL; res <<= 16; cc = it[1] - it[0]; op = 0; @@ -362,499 +423,4919 @@ uint32_t extract_sub_cigar_ii(overlap_region *z, All_reads *rref, int64_t s, int return res; } +typedef struct { + All_reads *rref; + UC_Read *tu; + uint64_t s, e, n0, n1, id, rev; +} rr_seq_t; -uint64_t iter_cc_idx_t(overlap_region* ol, cc_idx_t *z, int64_t s, int64_t e, uint64_t is_reduce, uint64_t is_insert, uint64_t **ra) +inline void insert_cns_arc(cns_gfa *cns, uint32_t src, uint32_t des, uint32_t is_ou, uint32_t plus0, uint32_t rid) { - int64_t rm_n, q[2], os, oe; ul_ov_t *cp; uint64_t m; *ra = NULL; - - // if(s == 15816 && e == 15819) { - // fprintf(stderr, "[M::%s] is_reduce::%lu\n", __func__, is_reduce); - // } + if(src >= cns->n) { + fprintf(stderr, "[M::%s] rid::%u, src::%u, des::%u, (*cns).n::%u\n", __func__, rid, src, des, (uint32_t)(*cns).n); + exit(1); + } + cns_arc *p, t; kv_pushp(cns_arc, (*cns).a[src].arc, &p); + p->f = 0; p->sc = plus0; p->v = des; + if(is_ou) { + (*cns).a[src].arc.nou++; + if((*cns).a[src].arc.nou < (*cns).a[src].arc.n) { + t = (*cns).a[src].arc.a[(*cns).a[src].arc.nou-1]; + (*cns).a[src].arc.a[(*cns).a[src].arc.nou-1] = *p; + *p = t; + } + } +} - if(is_reduce) { - for (m = rm_n = z->srt_n; m < z->idx->n; m++) { - cp = &(z->c_idx[z->idx->a[m]]); - // if(s == 15816 && e == 15819) { - // fprintf(stderr, "-0-[M::%s] ii::%lu, ii0::%ld\n", __func__, z->idx->a[m], z->i0); - // } - - q[0] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); - q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); - os = MAX(q[0], s); oe = MIN(q[1], e); - if((oe > os) || ((is_insert) && (s == e) && (s >= q[0]) && (s <= q[1]))) { - z->idx->a[rm_n++] = 
z->idx->a[m]; - } +inline uint32_t insert_cns_node(cns_gfa *cns) +{ + cns_t *p; uint32_t m0; + if (((*cns)).n == ((*cns)).m) { + m0 = ((*cns)).m; + ((*cns)).m = ((*cns)).m? ((*cns)).m<<1 : 2; + ((*cns)).a = (cns_t*)realloc(((*cns)).a, sizeof(cns_t) * ((*cns)).m); + if(((*cns)).m > m0) { + memset(((*cns)).a + m0, 0, sizeof(cns_t)*(((*cns)).m-m0)); } - z->idx->n = rm_n; + } + *(&p) = &((*cns)).a[((*cns)).n++]; + p->arc.n = p->arc.nou = 0; + p->c = p->f = p->sc = 0; + return ((*cns)).n - 1; +} + +inline uint32_t add_cns_arc(cns_gfa *cns, uint32_t src, uint32_t des, uint32_t is_ou, uint32_t plus) +{ + uint32_t k, s, e; + if(is_ou) { + s = 0; e = (*cns).a[src].arc.nou; + } else { + s = (*cns).a[src].arc.nou; e = (*cns).a[src].arc.n; } - for (; z->i < z->srt_n; ++z->i) { - cp = &(z->c_idx[(uint32_t)z->idx->a[z->i]]); - // if(s == 15816 && e == 15819) { - // fprintf(stderr, "-1-[M::%s] ii::%u, ii0::%ld\n", __func__, (uint32_t)z->idx->a[z->i], z->i0); - // } - q[0] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); - q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); - if(q[0] > e) break; - if((!is_insert) && (q[0] >= e)) break; - os = MAX(q[0], s); oe = MIN(q[1], e); - if((oe > os) || ((is_insert) && (s == e) && (s >= q[0]) && (s <= q[1]))) { - kv_push(uint64_t, *(z->idx), ((uint32_t)z->idx->a[z->i])); + for (k = s; k < e; k++) { + if((*cns).a[src].arc.a[k].v == des) { + (*cns).a[src].arc.a[k].sc += plus; + break; } } - - (*ra) = z->idx->a + z->srt_n; - return z->idx->n - z->srt_n; + + return ((k < e)?(1):(0)); } -void debug_inter0(overlap_region* ol, ul_ov_t *c_idx, uint64_t *idx, int64_t idx_n, uint64_t *res, int64_t res_n, int64_t s, int64_t e, uint64_t is_insert, uint64_t is_hard_check, const char *cmd) +inline void prt_cns_arc(cns_gfa *cns, uint32_t src, const char* cmd) { - ul_ov_t *cp; int64_t q[2], a_n = 0, i, k = 0, os, oe; - for (i = 0; i < idx_n; i++) { - cp = &(c_idx[(uint32_t)idx[i]]); - q[0] = 
ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); - q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + uint32_t k; + fprintf(stderr, "\n%s\t[M::%s] src::%u, sc::%u, c::%u\n", cmd, __func__, src, (*cns).a[src].sc, (*cns).a[src].c); + for (k = 0; k < (*cns).a[src].arc.n; k++) { + fprintf(stderr, "%s\t[M::%s] des::%u, sc::%u, is_ou::%u\n", cmd, __func__, (*cns).a[src].arc.a[k].v, (*cns).a[src].arc.a[k].sc, k<(*cns).a[src].arc.nou?1:0); + } +} - // fprintf(stderr, "%s[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\terr::%d\toerr::%u\n", cmd, __func__, ol[ovlp_id(*cp)].y_id, (int)Get_NAME_LENGTH(R_INF, ol[ovlp_id(*cp)].y_id), Get_NAME(R_INF, ol[ovlp_id(*cp)].y_id), - // ovlp_cur_wid(*cp), ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start, ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1, ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].error, ol[ovlp_id(*cp)].non_homopolymer_errors); +inline uint32_t get_cns_arc_bp(cns_gfa *cns, uint32_t src, uint32_t bp, uint32_t is_ou, uint32_t av_bp) +{ + uint32_t k, s, e; + if(is_ou) { + s = 0; e = (*cns).a[src].arc.nou; + } else { + s = (*cns).a[src].arc.nou; e = (*cns).a[src].arc.n; + } - os = MAX(q[0], s); oe = MIN(q[1], e); - if((oe > os) || ((is_insert) && (s == e) && (s >= q[0]) && (s <= q[1]))) { - a_n++; - // if(!(((uint32_t)idx[i]) == res[k])) { - // fprintf(stderr, "[M::%s] a_n::%ld\tres_n::%ld\ts::%ld\te::%ld\ti::%ld\tk::%ld\n", __func__, a_n, res_n, s, e, i, k); - // } - if(is_hard_check) { - assert(((uint32_t)idx[i]) == res[k++]); - } else { - for (; (k < res_n) && (((uint32_t)idx[i]) != res[k]); k++); - assert(k < res_n); - } + for (k = s; k < e; k++) { + if((*cns).a[src].arc.a[k].v == 0 || (*cns).a[src].arc.a[k].v == 1) continue; + if(av_bp && (*cns).a[src].arc.a[k].v >= (*cns).bb0 && (*cns).a[src].arc.a[k].v < (*cns).bb1) continue;///no backbone + if((*cns).a[(*cns).a[src].arc.a[k].v].c == bp) { + return k; } } - // if(a_n != res_n) { - // fprintf(stderr, "[M::%s] 
a_n::%ld\tres_n::%ld\ts::%ld\te::%ld\tidx_n::%ld\n", __func__, a_n, res_n, s, e, idx_n); - // } - if(is_hard_check) { - assert(a_n == res_n); - } else { - assert(a_n <= res_n); + + return ((uint32_t)-1); +} + +inline uint32_t add_cns_arc_bp(cns_gfa *cns, uint32_t src, uint32_t bp, uint32_t plus0, uint32_t rid, uint32_t av_bp) +{ + uint32_t rr, des; + rr = get_cns_arc_bp(cns, src, bp, 1, av_bp); + if(rr != ((uint32_t)-1)) {///find an existing node + des = (*cns).a[src].arc.a[rr].v; + (*cns).a[des].sc++; + (*cns).a[src].arc.a[rr].sc += plus0; + + rr = add_cns_arc(cns, des, src, 0, plus0); + // if(rr == 0) { + // fprintf(stderr, "[M::%s] src::%u -> des::%u\n", __func__, src, des); + // prt_cns_arc(cns, src); + // prt_cns_arc(cns, des); + // } + assert(rr); + + return des; + } else {///create a new node + des = insert_cns_node(cns); + (*cns).a[des].sc++; (*cns).a[des].c = bp; + insert_cns_arc(cns, src, des, 1, plus0, rid); + insert_cns_arc(cns, des, src, 0, plus0, rid); } + + return des; } -void prt_cigar0(uint64_t in, int64_t len) +void init_cns_g(cns_gfa *cns, char *s, uint64_t sl, uint32_t rid) { - int64_t k; uint64_t mp; - char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; - for (k = 0; k < len; k++) { - mp = len - 1 - k; mp <<= 1; - fprintf(stderr, "%c", cm[(in >> mp)&3]); + uint32_t m0 = cns->m, m1 = sl + 2, k; cns_t *p; + if ((*cns).m < (m1)) { ///equal to kv_resize() + (*cns).m = (m1); + (--((*cns).m), ((*cns).m)|=((*cns).m)>>1, ((*cns).m)|=((*cns).m)>>2, ((*cns).m)|=((*cns).m)>>4, ((*cns).m)|=((*cns).m)>>8, ((*cns).m)|=((*cns).m)>>16, ++((*cns).m)); + (*cns).a = (cns_t*)realloc((*cns).a, sizeof(cns_t) * (*cns).m); + if((*cns).m > m0) { + memset((*cns).a + m0, 0, sizeof(cns_t)*((*cns).m-m0)); + } } - fprintf(stderr, "\n"); + (*cns).n = 0; (*cns).si = 0; (*cns).ei = 1; (*cns).off = 2; + + p = &((*cns).a[(*cns).n++]); p->arc.nou = p->arc.n = p->c = p->f = p->sc = 0; ///beg + p = &((*cns).a[(*cns).n++]); p->arc.nou = p->arc.n = p->c = p->f = p->sc = 
0; ///end + (*cns).bb0 = (*cns).n; + + for (k = 0; k < sl; k++) { + p = &((*cns).a[(*cns).n++]); p->arc.nou = p->arc.n = p->f = 0; + p->c = seq_nt6_table[(uint32_t)(s[k])]; p->sc = 1; + + if(k + 1 < sl) insert_cns_arc(cns, k + (*cns).off, k + 1 + (*cns).off, 1, 1, rid); + + if(k > 0) insert_cns_arc(cns, k + (*cns).off, k - 1 + (*cns).off, 0, 1, rid); + } + + if(sl) { + insert_cns_arc(cns, (*cns).si, 0 + (*cns).off, 1, 1, rid); insert_cns_arc(cns, 0 + (*cns).off, (*cns).si, 0, 1, rid); + insert_cns_arc(cns, sl - 1 + (*cns).off, (*cns).ei, 1, 1, rid); insert_cns_arc(cns, (*cns).ei, sl - 1 + (*cns).off, 0, 1, rid); + } else { + insert_cns_arc(cns, (*cns).si, (*cns).ei, 1, 1, rid); + insert_cns_arc(cns, (*cns).ei, (*cns).si, 0, 1, rid); + } + + // prt_cns_arc(cns, 0, __func__); + // prt_cns_arc(cns, 1, __func__); + + (*cns).bn = (*cns).n; (*cns).bb1 = (*cns).n; } -void prt_bp0(uint64_t in, int64_t len) +///[s, e) +uint32_t push_cns_c0(cns_gfa *cns, uint64_t s0, uint64_t s, uint64_t e, uint32_t plus0, uint32_t rid) { - int64_t k; uint64_t mp; - char cm[4]; cm[0] = 'A'; cm[1] = 'C'; cm[2] = 'G'; cm[3] = 'T'; - for (k = 0; k < len; k++) { - mp = len - 1 - k; mp <<= 1; - fprintf(stderr, "%c", cm[(in >> mp)&3]); + if(s > e) return s0;///it is possible that s == e + uint32_t rr, k, re; + + // rr = add_cns_arc(cns, s0, s, 1, plus0); assert(rr); + // rr = add_cns_arc(cns, s, s0, 0, plus0); assert(rr); + if(!add_cns_arc(cns, s0, s, 1, plus0)) { + insert_cns_arc(cns, s0, s, 1, plus0, rid); + insert_cns_arc(cns, s, s0, 0, plus0, rid); + } else { + rr = add_cns_arc(cns, s, s0, 0, plus0); assert(rr); } - fprintf(stderr, "\n"); + (*cns).a[s].sc++; re = s; + + for (k = s + 1; k < e; k++) { + rr = add_cns_arc(cns, k-1, k, 1, 1); + // if(!rr) { + // fprintf(stderr, "[M::%s] s0::%u, s::%u\n", __func__, k-1, k); + // prt_cns_arc(cns, k-1, __func__); prt_cns_arc(cns, k, __func__); + // } + assert(rr); + + + + rr = add_cns_arc(cns, k, k-1, 0, 1); assert(rr); + (*cns).a[k].sc++; re = k; + } 
+ + return re; } -uint64_t cns_gen0(overlap_region* ol, All_reads *rref, uint64_t s, uint64_t e, UC_Read* tu, cc_idx_t *idx, uint64_t occ_tot, double occ_max, asg32_v* b32, uint32_t *rc) +uint32_t trace_cns_bp(cns_gfa *cns, uint64_t s0, char *tstr, uint64_t tl, asg32_v* b32, uint32_t plus0, uint32_t *rn, uint64_t max_trace, uint32_t av_bp) { - if(e > s + simp_vote_len) return 0;///too long + (*rn) = s0; + if(tl <= 0) return 0; + // fprintf(stderr, "\n[M::%s] tl::%lu\n", __func__, tl); + uint32_t k, i, s, e, m, bp, nm, bi, bn0, src, des, ff = 0; b32->n = 0; - uint64_t *id_a = NULL, id_n, an = 0, oc[2]; b32->n = 0; uint32_t m, *a = NULL; - id_n = iter_cc_idx_t(ol, idx, s, e, idx->rr, ((s==e)?1:0), &id_a); - // debug_inter0(ol, idx->c_idx, idx->idx->a + idx->i0, idx->srt_n - idx->i0, id_a, id_n, s, e, ((s==e)?1:0), 0, "-1-"); - uint64_t k, l, q[2], os, oe; ul_ov_t *p; overlap_region *z; idx->rr = 0; - fprintf(stderr, "[M::%s] [%lu, %lu) id_n::%lu\n", __func__, s, e, id_n); - for (k = 0; k < id_n; k++) { - p = &(idx->c_idx[id_a[k]]); z = &(ol[ovlp_id(*p)]); - q[0] = z->w_list.a[ovlp_cur_wid(*p)].x_start+ovlp_bd(*p); - q[1] = z->w_list.a[ovlp_cur_wid(*p)].x_end+1-ovlp_bd(*p); + kv_push(uint32_t, (*b32), s0); kv_push(uint32_t, (*b32), ((uint32_t)-1)); nm = 2; + // if(s0 == 2863) { + // fprintf(stderr, "***0***[M::%s] s::%lu\tb32->n::%u\n", __func__, s0, (uint32_t)b32->n); + // } - // fprintf(stderr, "[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\terr::%d\toerr::%u\n", __func__, ol[ovlp_id(*p)].y_id, (int)Get_NAME_LENGTH(R_INF, ol[ovlp_id(*p)].y_id), Get_NAME(R_INF, ol[ovlp_id(*p)].y_id), - // ovlp_cur_wid(*p), ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_start, ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_end+1, ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].error, ol[ovlp_id(*p)].non_homopolymer_errors); + for (i = 0; (i < tl) && (!ff); i++) { + bp = seq_nt6_table[(uint32_t)(tstr[i])]; bn0 = b32->n; + for (bi = bn0 - nm; bi < bn0; bi += 2) { + m = b32->a[bi]; s = 0; e = 
(*cns).a[m].arc.nou; + for (k = s; k < e; k++) { + if((*cns).a[m].arc.a[k].v == 0 || (*cns).a[m].arc.a[k].v == 1) continue; + if(av_bp && (*cns).a[m].arc.a[k].v >= (*cns).bb0 && (*cns).a[m].arc.a[k].v < (*cns).bb1) continue;///no backbone + if((*cns).a[(*cns).a[m].arc.a[k].v].c == bp) { + kv_push(uint32_t, (*b32), (*cns).a[m].arc.a[k].v); + kv_push(uint32_t, (*b32), bi); - if(q[1] <= e) idx->rr = 1; - os = MAX(q[0], s); oe = MIN(q[1], e); - if((oe > os) || ((s == e) && (s >= q[0]) && (s <= q[1]))) { - // if(oe >= os) { - ///[-4-][-12-][-4-][-12-] - ///[cigar_len][cigar][base_len][base] - m = extract_sub_cigar_ii(z, rref, os, oe, tu, p); an++; - if(m != ((uint32_t)-1)) {///no gap in both sides - kv_push(uint32_t, *b32, m); + // if(s0 == 2863) { + // fprintf(stderr, "***1***[M::%s] s::%u\tb32->n::%u\n", __func__, (*cns).a[m].arc.a[k].v, (uint32_t)b32->n); + // } + // if((i + 1) == tl) break;///quick end + // if(b32->n > max_trace) break;///redue the size of b32 + if(((i + 1) == tl) || (b32->n > max_trace)) { + ff = 1; break; + } + } } + if(ff) break; } + + if(b32->n <= bn0) {///no node + break; + } else { + nm = b32->n - bn0; + } } + // fprintf(stderr, "[M::%s] b32->n::%u, nm::%u\n", __func__, (uint32_t)b32->n, nm); - oc[0] = b32->n; oc[1] = an + 1; //+1 for the reference read - fprintf(stderr, "-0-[M::%s] oc[0]::%lu, oc[1]::%lu\n", __func__, oc[0], oc[1]); - if(((oc[0] > (oc[1]*occ_max)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) { - radix_sort_ec32(b32->a, b32->a+b32->n); an = 0; - for (k = 1, l = 0; k <= b32->n; ++k) { - if (k == b32->n || b32->a[k] != b32->a[l]) { - if(k - l > an) { - an = k - l; a = b32->a + l; - } - l = k; - } - } - oc[0] = an; - fprintf(stderr, "-1-[M::%s] oc[0]::%lu, oc[1]::%lu\n", __func__, oc[0], oc[1]); - if(((oc[0] > (oc[1]*occ_max)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) { - (*rc) = a[0]; - // prt_cigar0((a[0]<<4)>>20, a[0]>>28); - // prt_bp0((a[0]<<20)>>20, (a[0]<<16)>>28); - return 1; + 
// if(s0 == 2863) { + // fprintf(stderr, "[M::%s] i::%u\tnm::%u\tb32->n::%u\n", __func__, i, nm, (uint32_t)b32->n); + // } + if(i > 0 && nm > 0) { + (*rn) = b32->a[b32->n - nm]; + for (bi = b32->n - nm; b32->a[bi + 1] != ((uint32_t)-1); bi = b32->a[bi + 1]) { + // fprintf(stderr, "[M::%s] bi::%u, p_bi::%u\n", __func__, bi, b32->a[bi + 1]); + des = b32->a[bi]; src = b32->a[b32->a[bi + 1]]; bp = ((src!=s0)?(1):(plus0)); + m = add_cns_arc(cns, src, des, 1, bp); assert(m); + m = add_cns_arc(cns, des, src, 0, bp); assert(m); + (*cns).a[des].sc++; + + // if(s0 == 2863) { + // fprintf(stderr, "***2***[M::%s] src::%u\tdes::%u\n", __func__, src, des); + // } } + } else { + i = 0; } - return 0; -} + return i; +} -void push_correct0(window_list *idx, window_list_alloc *res, uint32_t len0, uint32_t rc) +///[s, e) +uint32_t push_cns_c1(cns_gfa *cns, uint64_t s0, char *tstr, uint64_t tl, uint32_t plus0, asg32_v* b32, uint64_t max_trace, uint32_t rid) { - if(len0 != ((uint32_t)-1)) { - ; - } else if(rc != ((uint32_t)-1)) { - uint32_t cc = (rc<<4)>>20, cn = rc>>28, ck = 0, cs, cp; - uint32_t bc = (rc<<20)>>20, bn = (rc<<16)>>28, bk = 0, bs, bp; - for (ck = 0; ck < cn; ck++) { - cs = (cn-1-ck)<<1; cp = (cc>>cs)&3; + if(tl <= 0) return s0; + uint32_t rr = plus0, k, re = s0; + k = trace_cns_bp(cns, s0, tstr, tl, b32, plus0, &re, max_trace, 1); + if(k > 0) rr = 1; - bp = (uint32_t)-1; - if(cp != 3) { - bs = (bn-1-bk)<<1; bp = (bc>>bs)&3; - bk++; - } - push_trace_bp(((asg16_v *)(&(res->c))), cp, bp, 1, ((idx->clen>0)?1:0)); + // if(s0 == 2863) { + // fprintf(stderr, "[M::%s] (%.*s)\ts0::%lu\tk::%u\ttl::%lu\tre::%u\n", __func__, tstr?((int)(tl)):0, tstr, s0, k, tl, re); + // } + // fprintf(stderr, "[M::%s] s0::%u, s::%u\n", __func__, k-1, k); + // prt_cns_arc(cns, k-1, __func__); prt_cns_arc(cns, k, __func__); - fprintf(stderr, "%c", cm[(in >> mp)&3]); - } + for (; k < tl; k++) {///the weight of (s0 -> tstr[0]) might be 0 + re = add_cns_arc_bp(cns, re, 
seq_nt6_table[(uint32_t)(tstr[k])], rr, rid, 1); rr = 1; } + + return re; } -void push_cns_anchor(overlap_region* ol, All_reads *rref, uint64_t s, uint64_t e, UC_Read* tu, cc_idx_t *idx, overlap_region *aux_o, uint64_t is_tail, uint64_t occ_tot, double occ_max, asg32_v* b32) +uint64_t append_cns_g(cns_gfa *cns, char *tstr, uint64_t tl, uint64_t qs, uint64_t qe, uint64_t cp, uint64_t cl, uint64_t pe, asg32_v* b32, uint64_t max_trace, uint32_t rid, int64_t insert_pos) { - if((!is_tail) && (s >= e)) return;//if s >= e && is_tail = 1, gen the cns of the last a few bases -> s = e = ql - fprintf(stderr, "\n[M::%s] [%lu, %lu)\n", __func__, s, e); - window_list *p = NULL; uint64_t e0 = 0; uint32_t rc; - if(aux_o->w_list.n > 0) { - p = &(aux_o->w_list.a[aux_o->w_list.n-1]); - e0 = p->x_end+1; - ///make sure e > s + // fprintf(stderr, ">q::[%lu, %lu)\n", qs, qe); + uint64_t s0 = pe, plus0 = 1, ns = qs + cns->off, ne = qe + cns->off; + if(pe == ((uint64_t)-1)) { + if(qs > 0) { + s0 = qs - 1 + cns->off;///just before node in backbone + } else { + s0 = 0;//beg + } + // plus0 = 0; } - assert(s >= e0); - if((((!is_tail) && (s > 0)) || ((is_tail) && (s > e0))) - && (cns_gen0(ol, rref, e0, s, tu, idx, occ_tot, occ_max, b32, &rc))) {///CNS in between - push_correct0(p, &(aux_o->w_list), (uint32_t)-1, rc); - } else { - } + // fprintf(stderr, "+n_nodes::%u, tl::%lu, qs::%lu, qe::%lu\n", (uint32_t)cns->n, tl, qs, qe); - kv_pushp(window_list, aux_o->w_list, &p); - p->x_start = s; p->x_end = e-1; - // p->y_start = exz->ps; p->y_end = exz->pe; + if(cp == 0) { + if((cl == 0) && (cp == 0) && (qs == qe) && (((int64_t)qe) == insert_pos)) { + s0 = 0;//beg + ns = ne = 1;//end + plus0 = 1; + } + // if(cp == 0) { + // fprintf(stderr, "cp::%lu, cl::%lu, qs::%lu, qe::%lu, s0::%lu, ns::%lu, ne::%lu, plus0::%lu\n", cp, cl, qs, qe, s0, ns, ne, plus0); + // } + return push_cns_c0(cns, s0, ns, ne, plus0, rid); + } else if(cp == 1 || cp == 2) { ///cp == 2: more y -> insertion + return push_cns_c1(cns, 
s0, tstr, tl, plus0, b32, max_trace, rid); + } else { ///more x -> do nothing + return s0; + } } -uint64_t wcns_vote(overlap_region* ol, All_reads *rref, char* qstr, UC_Read* tu, uint64_t *id_a, uint64_t id_n, uint64_t s, uint64_t e, ul_ov_t *c_idx, cc_idx_t *occ, uint64_t occ_tot, double occ_exact, overlap_region *aux_o, asg32_v* b32) +char *get_sub_seq(rr_seq_t *ssq, uint64_t s, uint64_t e) { - uint64_t k, q[2], rr = 0, os, oe, wl, oc[2], fI; ul_ov_t *p, *gp; overlap_region *z; - // uint64_t *ct = occ->idx->a;///occ->idx->a[0, wl<<1) - for (k = 0; k < id_n; k++) { - p = &(c_idx[id_a[k]]); z = &(ol[ovlp_id(*p)]); - q[0] = z->w_list.a[ovlp_cur_wid(*p)].x_start+ovlp_bd(*p); - q[1] = z->w_list.a[ovlp_cur_wid(*p)].x_end+1-ovlp_bd(*p); - if(q[1] <= e) rr = 1; - os = MAX(q[0], s); oe = MIN(q[1], e); - if(oe > os) { - ///prepare for CNS - gp = &(occ->c_idx[id_a[k]]); - ovlp_cur_xoff(*gp) = ovlp_cur_xoff(*p); ovlp_cur_yoff(*gp) = ovlp_cur_yoff(*p); ovlp_cur_coff(*gp) = ovlp_cur_coff(*p); ovlp_cur_ylen(*gp) = ovlp_cur_ylen(*p); - // assert(ovlp_cur_wid(*p) == ovlp_cur_wid(*gp)); - // assert(ovlp_id(*p) == ovlp_id(*gp)); - // fprintf(stderr, "[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\tos::%lu\toe::%lu\n", __func__, ol[ovlp_id(*p)].y_id, (int)Get_NAME_LENGTH(R_INF, ol[ovlp_id(*p)].y_id), Get_NAME(R_INF, ol[ovlp_id(*p)].y_id), - // ovlp_cur_wid(*p), ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_start, ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_end+1, os, oe); - extract_sub_cigar_mm(z, os, oe, p, occ->idx->a + os - s); - } + if(s >= e) return NULL; + + if(s >= ssq->s && e <= ssq->e) { + return ssq->tu->seq + s - ssq->s; } - wl = e - s; - os = occ->mms; oe = occ->mme; + uint64_t l = e - s; + if(ssq->s >= ssq->e) { + if(l < ssq->n0) l = ssq->n0; + } else { + if(l < ssq->n1) l = ssq->n1; + } - // fprintf(stderr, "[M::%s] s::%lu\te::%lu\n", __func__, s, e); + ssq->s = s; ssq->e = s + l; + if(ssq->e > Get_READ_LENGTH((*(ssq->rref)), ssq->id)) { + ssq->e = 
/// Feed the part of one overlap window's CIGAR that falls in the reference
/// interval [s, e) into the consensus graph `cns`.
///
/// z        overlap whose window `ovlp_cur_wid(*p)` is walked
/// s, e     requested interval; clamped below to the window trimmed by ovlp_bd(*p)
/// iws, iwe bounds of the whole consensus window; graph offsets are relative to iws
/// s_end    see the normalisation below; 0 makes the walk skip an insertion sitting exactly at s
/// ssq      scratch descriptor handed to get_sub_seq() to fetch target bases
/// p        per-overlap cursor (cigar index / x offset / y offset); updated on success
///
/// CIGAR entries are 16 bit: op = entry>>14, length = entry&0x3fff. Following the
/// comments in this file, op 2 advances only y ("more y") and op 3 only x ("more x").
/// Returns 1 when something was added, (uint32_t)-1 otherwise.
///[s, e)
uint32_t extract_sub_cigar_cns(overlap_region *z, int64_t s, int64_t e, int64_t iws, int64_t iwe, int64_t s_end, rr_seq_t *ssq, ul_ov_t *p, cns_gfa *cns, asg32_v* b32, uint64_t max_trace, uint32_t rid)
{
    int64_t wk = ovlp_cur_wid(*p), xk = ovlp_cur_xoff(*p), yk = ovlp_cur_yoff(*p), ck = ovlp_cur_coff(*p), os, oe, ots, ote, ol, insert_pos = ((iws == iwe)? (0): (-1));
    bit_extz_t ez; int64_t bd = ovlp_bd(*p), s0, e0, ii[2], it[2]; uint64_t pe = (uint64_t)-1;
    /// usable part of the window: [x_start + bd, x_end + 1 - bd)
    s0 = ((int64_t)(z->w_list.a[wk].x_start)) + bd;
    e0 = ((int64_t)(z->w_list.a[wk].x_end)) + 1 - bd;
    if(s < s0) s = s0; if(e > e0) e = e0;///exclude boundary
    if(s > e) return -1;///it is possible s == e
    os = MAX(s, s0); oe = MIN(e, e0);
    if(oe < os) return -1;///it is possible os == oe

    set_bit_extz_t(ez, (*z), wk);
    if(!ez.cigar.n) return -1;
    int64_t cn = ez.cigar.n; uint16_t op; int64_t ws, we, wts, wte, ovlp; char *tstr;
    /// an out-of-range saved cursor restarts the walk from the beginning of the window's CIGAR
    if((ck < 0) || (ck > cn)) {//(*ck) == cn is allowed
        ck = 0; xk = ez.ts; yk = ez.ps;
    }

    /// rewind the cursor until it sits before s
    while (ck > 0 && xk >= s) {///x -> t; y -> p; first insertion and then match/mismatch
        --ck;
        op = ez.cigar.a[ck]>>14;
        if(op!=2) xk -= (ez.cigar.a[ck]&(0x3fff));
        if(op!=3) yk -= (ez.cigar.a[ck]&(0x3fff));
    }

    /// s_end stays 0 only if the caller passed 0 and s was not clamped away from iws
    if(s_end == 0 && s == iws) s_end = 0;
    else s_end = 1;

    //some cigar will span s or e
    /// ii[]: covered x range, it[]: matching y range; -1 means nothing recorded yet
    ii[0] = ii[1] = it[0] = it[1] = -1;
    ssq->s = ssq->e = 0; ssq->n0 = e - s;
    ssq->id = z->y_id; ssq->rev = z->y_pos_strand;
    if(ssq->n0 == 0) {ssq->n0 = ssq->n1;}

    /// main walk: one iteration per run of identical ops
    while (ck < cn && xk < e) {//[s, e)
        ws = xk; wts = yk;
        op = ez.cigar.a[ck]>>14; ol = (ez.cigar.a[ck]&(0x3fff));

        /// coalesce adjacent entries with the same op
        for (ck++; (ck < cn) && (op == (ez.cigar.a[ck]>>14)); ck++) {
            ol += (ez.cigar.a[ck]&(0x3fff));
        }
        ///op == 3: -> x; op == 2: -> y;
        if(op!=2) xk += ol;
        if(op!=3) yk += ol;
        we = xk; wte = yk;

        os = MAX(s, ws); oe = MIN(e, we);
        ovlp = ((oe>os)? (oe-os):0);

        if(s == e) {///insertion in comparison with the reference
            if(op != 0 || ws >= s || we <= e || e != iwe || s != iws) continue;///must be a match
        } else {
            if(op != 2) {
                if(!ovlp) continue;
            } else {///ws == we
                if(ws < s || ws >= e) continue;
            }
        }

        if((s_end == 0) && (op == 2) && (ws == s)) continue;///skip the insertion just before s

        /// y range of the clipped run; ops >= 2 are taken whole
        if(op < 2) {
            ots = os - ws + wts; ote = oe - ws + wts;
        } else {///op == 2: more y; op == 3: more x
            ots = wts; ote = wte;
        }

        if(ii[0] == -1) {
            ii[0] = os; it[0] = ots;
        }

        ii[1] = oe; it[1] = ote;

        if(op != 2) ol = oe-os;

        /// target bases are fetched for every op except 0
        tstr = NULL;
        if(op != 0) tstr = get_sub_seq(ssq, ots, ote);

        pe = append_cns_g(cns, tstr, ote - ots, os - iws, oe - iws, op, ol, pe, b32, max_trace, rid, insert_pos);
    }

    /// trailing op-2 runs located at or before e (note xk <= e) are consumed here
    while (ck < cn && xk <= e) {//[s, e)
        ws = xk; wts = yk;
        op = ez.cigar.a[ck]>>14; ol = (ez.cigar.a[ck]&(0x3fff));
        if(op != 2) break;

        for (ck++; (ck < cn) && (op == (ez.cigar.a[ck]>>14)); ck++) {
            ol += (ez.cigar.a[ck]&(0x3fff));
        }
        yk += ol;
        we = xk; wte = yk;

        if(ws >= s && ws <= e) {
            ots = wts; ote = wte;
            if(ii[0] == -1) {
                ii[0] = ws; it[0] = ots;
            }
            ii[1] = we; it[1] = ote;

            tstr = NULL;
            if(op != 0) tstr = get_sub_seq(ssq, ots, ote);

            pe = append_cns_g(cns, tstr, ote - ots, ws - iws, we - iws, op, ol, pe, b32, max_trace, rid, insert_pos);
        }
    }

    if(ii[1] == -1) return -1;///it is possible when s == e and the cigar here is not a match

    /// ae: graph node the path has to be closed on
    uint64_t ae = 1;
    if((ii[1] == iwe)) {
        ae = 1;///end node
    } else {
        ae = ii[1] + cns->off - iws;
    }

    if(pe == ((uint64_t)-1)) pe = 0;///start node

    ///if iws == iwe and the cigar is a match, pe will be equal to ae
    if(pe != ae) {
        /// try to reinforce an existing arc first; otherwise create both directions
        if(!add_cns_arc(cns, pe, ae, 1, /**ae==1?1:0**/1)) {
            insert_cns_arc(cns, pe, ae, 1, /**ae==1?1:0**/1, rid);
            insert_cns_arc(cns, ae, pe, 0, /**ae==1?1:0**/1, rid);
        } else {
            add_cns_arc(cns, ae, pe, 0, /**ae==1?1:0**/1);
        }
    }

    /// save the cursor so the next call on this overlap resumes from here
    ovlp_cur_xoff(*p) = xk; ovlp_cur_yoff(*p) = yk; ovlp_cur_coff(*p) = ck; ovlp_cur_ylen(*p) = 0;

    return 1;
}
= z->srt_n; m < z->idx->n; m++) { + cp = &(z->c_idx[z->idx->a[m]]); + // if(s == 15816 && e == 15819) { + // fprintf(stderr, "-0-[M::%s] ii::%lu, ii0::%ld\n", __func__, z->idx->a[m], z->i0); + // } + + q[0] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + os = MAX(q[0], s); oe = MIN(q[1], e); + if((oe > os) || ((is_insert) && (s == e) && (s >= q[0]) && (s <= q[1]))) { + z->idx->a[rm_n++] = z->idx->a[m]; + } + } + z->idx->n = rm_n; + } + + for (; z->i < z->srt_n; ++z->i) { + cp = &(z->c_idx[(uint32_t)z->idx->a[z->i]]); + // if(s == 15816 && e == 15819) { + // fprintf(stderr, "-1-[M::%s] ii::%u, ii0::%ld\n", __func__, (uint32_t)z->idx->a[z->i], z->i0); + // } + q[0] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + if(q[0] > e) break; + if((!is_insert) && (q[0] >= e)) break; + os = MAX(q[0], s); oe = MIN(q[1], e); + if((oe > os) || ((is_insert) && (s == e) && (s >= q[0]) && (s <= q[1]))) { + kv_push(uint64_t, *(z->idx), ((uint32_t)z->idx->a[z->i])); + } + } + } else { + z->ru = 0; + } + + (*ra) = z->idx->a + z->srt_n; + return z->idx->n - z->srt_n; +} + +void debug_inter0(overlap_region* ol, ul_ov_t *c_idx, uint64_t *idx, int64_t idx_n, uint64_t *res, int64_t res_n, int64_t s, int64_t e, uint64_t is_insert, uint64_t is_hard_check, const char *cmd) +{ + ul_ov_t *cp; int64_t q[2], a_n = 0, i, k = 0, os, oe; + for (i = 0; i < idx_n; i++) { + cp = &(c_idx[(uint32_t)idx[i]]); + q[0] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + q[1] = ol[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + + // fprintf(stderr, "%s[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\terr::%d\toerr::%u\n", cmd, __func__, ol[ovlp_id(*cp)].y_id, (int)Get_NAME_LENGTH(R_INF, ol[ovlp_id(*cp)].y_id), Get_NAME(R_INF, ol[ovlp_id(*cp)].y_id), + // ovlp_cur_wid(*cp), 
/// Print the lowest `len` 2-bit symbols packed in `in` to stderr, most
/// significant symbol first, followed by a newline. `sym` maps a 2-bit code to
/// its character. A 64-bit word holds at most 32 symbols; positions above that
/// are printed as code 0 instead of shifting by >= 64 bits, which is undefined.
static void prt_2bit0(uint64_t in, int64_t len, const char *sym)
{
    int64_t k; uint64_t mp;
    for (k = 0; k < len; k++) {
        mp = ((uint64_t)(len - 1 - k)) << 1;
        fputc(sym[(mp < 64)? ((in >> mp)&3): 0], stderr);
    }
    fputc('\n', stderr);
}

/// Debug helper: print a packed op string using the letters M, S, I, D for codes 0..3.
void prt_cigar0(uint64_t in, int64_t len)
{
    prt_2bit0(in, len, "MSID");
}

/// Debug helper: print a packed base string using the letters A, C, G, T for codes 0..3.
void prt_bp0(uint64_t in, int64_t len)
{
    prt_2bit0(in, len, "ACGT");
}
/// Simple majority vote for a short window [s, e) of the read being corrected.
///
/// Every overlap window intersecting [s, e) is asked (extract_sub_cigar_ii) for a
/// packed 32-bit description of how it spells the window. The most frequent
/// description is written to *rc when it passes the thresholds below.
/// Returns 1 when *rc was set, 0 otherwise. On the 0 path after the scan,
/// idx->ru is set so the next iter_cc_idx_t() call reuses the same result set.
/// Windows longer than simp_vote_len are rejected immediately.
uint64_t cns_gen0(overlap_region* ol, All_reads *rref, uint64_t s, uint64_t e, uint64_t ql, UC_Read* tu, cc_idx_t *idx, uint64_t occ_tot, double occ_max, asg32_v* b32, uint32_t *rc)
{
    if(e > s + simp_vote_len) return 0;///too long

    uint64_t *id_a = NULL, id_n, an = 0, oc[2]; b32->n = 0; uint32_t m, *a = NULL;
    id_n = iter_cc_idx_t(ol, idx, s, e, idx->rr, ((s==e)?1:0), &id_a);
    uint64_t k, l, q[2], os, oe; ul_ov_t *p; overlap_region *z; idx->rr = 0;
    for (k = 0; k < id_n; k++) {
        p = &(idx->c_idx[id_a[k]]); z = &(ol[ovlp_id(*p)]);
        q[0] = z->w_list.a[ovlp_cur_wid(*p)].x_start+ovlp_bd(*p);
        q[1] = z->w_list.a[ovlp_cur_wid(*p)].x_end+1-ovlp_bd(*p);

        /// a window ending inside [.., e] will drop out later: ask for a reduce pass next time
        if(q[1] <= e) idx->rr = 1;
        os = MAX(q[0], s); oe = MIN(q[1], e);
        /// for an empty interval (s == e) the window must strictly contain s
        if((oe > os) || ((s == e) && (s > q[0]) && (s < q[1]))) {
            ///[-4-][-12-][-4-][-12-]
            ///[cigar_len][cigar][base_len][base]
            m = extract_sub_cigar_ii(z, ql, rref, os, oe, s, e, tu, p); an++;
            if(m != ((uint32_t)-1)) {///no gap in both sides
                kv_push(uint32_t, *b32, m);
            }
        }
    }

    /// oc[0]: usable votes; oc[1]: all covering windows
    oc[0] = b32->n; oc[1] = an + 1; //+1 for the reference read
    /// thresholds: above occ_max of the total, a strict majority, enough coverage, more than one vote
    if(((oc[0] > (oc[1]*occ_max)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) {
        /// sort, then find the longest run of identical votes
        radix_sort_ec32(b32->a, b32->a+b32->n); an = 0;
        for (k = 1, l = 0; k <= b32->n; ++k) {
            if (k == b32->n || b32->a[k] != b32->a[l]) {
                if(k - l > an) {
                    an = k - l; a = b32->a + l;
                }
                l = k;
            }
        }
        oc[0] = an;
        /// same thresholds, now applied to the winning vote alone
        if(((oc[0] > (oc[1]*occ_max)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) {
            (*rc) = a[0];
            return 1;
        }
    }

    idx->ru = 1;
    return 0;
}
a[0]>>28); + // prt_bp0((a[0]<<20)>>20, (a[0]<<16)>>28); + return 1; + } + } + + idx->ru = 1; + return 0; +} + +inline void gen_mm_cns_arc(cns_gfa *cns, uint32_t src, uint32_t des, uint32_t sc, uint32_t f) +{ + cns_t *av = &((*cns).a[src]), *aw; + uint32_t vk, wk; + for (vk = 0; vk < av->arc.nou; vk++) {///out-edge of src + if((av->arc.a[vk].v != des) || (del_cns_arc((*av), vk))) continue; + // av->arc.a[vk].f = 1; ///not sure if we should set these edges as visited + av->arc.a[vk].f = f; + av->arc.a[vk].sc += sc; + + aw = &((*cns).a[des]); + for (wk = aw->arc.nou; wk < aw->arc.n; wk++) {///in-edge of des + if((aw->arc.a[wk].v != src) || (del_cns_arc((*aw), wk))) continue; + // aw->arc.a[wk].f = 1; ///not sure if we should set these edges as visited + aw->arc.a[wk].f = f; + aw->arc.a[wk].sc += sc; + break; + } + + assert(wk < aw->arc.n); + return; + } + + cns_arc *p, t; + ///src -> des + kv_pushp(cns_arc, (*cns).a[src].arc, &p); + p->sc = sc; p->v = des; + // p->f = 1; ///not sure if we should set these edges as visited + p->f = f; + ///ou-edge + (*cns).a[src].arc.nou++; + if((*cns).a[src].arc.nou < (*cns).a[src].arc.n) { + t = (*cns).a[src].arc.a[(*cns).a[src].arc.nou-1]; + (*cns).a[src].arc.a[(*cns).a[src].arc.nou-1] = *p; + *p = t; + } + + ///src <- des; in-edge + kv_pushp(cns_arc, (*cns).a[des].arc, &p); + p->sc = sc; p->v = src; + // p->f = 1; ///not sure if we should set these edges as visited + p->f = f; +} + +void del_cns_g_nn(cns_gfa *cns, uint32_t v) +{ + uint32_t w, vk, wk; cns_t *av = &((*cns).a[v]), *aw = NULL; + + for (vk = 0; vk < av->arc.nou; vk++) {///out-edge of src + if(del_cns_arc((*av), vk)) continue; + w = av->arc.a[vk].v; av->arc.a[vk].v = CNS_DEL_E; + + aw = &((*cns).a[w]); + for (wk = aw->arc.nou; wk < aw->arc.n; wk++) {///in-edge of des + if((aw->arc.a[wk].v != v) || (del_cns_arc((*aw), wk))) continue; + aw->arc.a[wk].v = CNS_DEL_E; break; + } + assert(wk < aw->arc.n); + } + + for (vk = av->arc.nou; vk < av->arc.n; vk++) {///in-edge of src 
/// Merge equivalent predecessors, starting at v0 and walking backwards.
///
/// For the current node v and each base code bp in 0..3, the predecessors of v
/// that carry base bp, are neither the start nor the end node, and have exactly
/// one live out-edge (which must lead to v) are collapsed into the first such
/// node `mn`: the in-edges of every later one are re-attached to mn with
/// gen_mm_cns_arc(), the node is deleted, and the weight of the arc mn -> v
/// becomes the sum. mn is then pushed on b32, which serves as the work stack
/// (its previous content is discarded).
void merge_cns_g_in(cns_gfa *cns, uint32_t v0, asg32_v* b32)
{
    cns_t *av, *aw; uint32_t v, bp, vk, wk, wka, w, wn, nn, mn, wh, mn_k[2];

    b32->n = 0;
    kv_push(uint32_t, *b32, v0);
    while (b32->n) {
        v = b32->a[--b32->n];
        if(del_cns_nn((*cns), v)) continue;

        av = &((*cns).a[v]);
        for (bp = 0; bp < 4; bp++) {
            //nn: number of nodes merged so far; wh: accumulated weight
            nn = wh = 0; mn = mn_k[0] = mn_k[1] = wka = (uint32_t)-1;
            for (vk = av->arc.nou; vk < av->arc.n; vk++) {///in-edge of v
                if(del_cns_arc((*av), vk)) continue;
                w = av->arc.a[vk].v; aw = &((*cns).a[w]);
                if(aw->c != bp) continue;
                if(w == cns->si || w == cns->ei) continue;

                /// count the live out-edges of w, stopping as soon as there are two
                for (wk = wn = 0; wk < aw->arc.nou; wk++) {///out-edge of w
                    if(del_cns_arc((*aw), wk)) continue;
                    wn++; wka = wk; if(wn > 1) break;
                }

                if(wn != 1) continue;

                assert(aw->arc.a[wka].v == v);

                ///deal with out-edge of w
                if(nn == 0) {
                    /// first candidate becomes the merge target; remember both ends of mn -> v
                    mn = w; mn_k[0] = vk; mn_k[1] = wka;
                    wh = av->arc.a[vk].sc;
                } else {
                    wh += aw->arc.a[wka].sc;
                }

                ///deal with in-edge of w
                ///all edges to w should be moved to mn
                if(nn > 0) {
                    for (wk = aw->arc.nou; wk < aw->arc.n; wk++) {
                        if(del_cns_arc((*aw), wk)) continue;
                        ///previously, aw->arc.a[wk].v -> w
                        ///currently, aw->arc.a[wk].v -> mn
                        gen_mm_cns_arc(cns, aw->arc.a[wk].v, mn, aw->arc.a[wk].sc, aw->arc.a[wk].f);
                    }
                }

                ///mn != w
                if(nn > 0) del_cns_g_nn(cns, w);

                nn++;
            }

            if(nn) {
                aw = &((*cns).a[mn]);
                /// write the summed weight to both copies of the arc mn -> v
                av->arc.a[mn_k[0]].sc = aw->arc.a[mn_k[1]].sc = wh;
                kv_push(uint32_t, *b32, mn);
            }
        }
    }
}
if we should set these edges as visited + } + } + + ///mn != w + if(nn > 0) del_cns_g_nn(cns, w); + + nn++; + } + + if(nn) { + aw = &((*cns).a[mn]); + av->arc.a[mn_k[0]].sc = aw->arc.a[mn_k[1]].sc = wh; + // merge_cns_g_in(cns_gfa *cns, uint32_t v, asg32_v* b32) + kv_push(uint32_t, *b32, mn); + } + } + } +} + +void merge_cns_g_ou(cns_gfa *cns, uint32_t v0, asg32_v* b32) +{ + cns_t *av, *aw; uint32_t v, bp, vk, wk, wka, w, wn, nn, mn, wh, mn_k[2]; + + b32->n = 0; + kv_push(uint32_t, *b32, v0); + while (b32->n) { + v = b32->a[--b32->n]; + if(del_cns_nn((*cns), v)) continue; + + av = &((*cns).a[v]); + for (bp = 0; bp < 4; bp++) { + //nn: number of node; wh: weight + nn = wh = 0; mn = mn_k[0] = mn_k[1] = wka = (uint32_t)-1; + for (vk = 0; vk < av->arc.nou; vk++) {///ou-edge of v + if(del_cns_arc((*av), vk)) continue; + w = av->arc.a[vk].v; aw = &((*cns).a[w]); + if(aw->c != bp) continue; + if(w == cns->si || w == cns->ei) continue; + + for (wk = aw->arc.nou, wn = 0; wk < aw->arc.n; wk++) {///in-edge of w + if(del_cns_arc((*aw), wk)) continue; + wn++; wka = wk; if(wn > 1) break; + } + + if(wn != 1) continue; + + assert(aw->arc.a[wka].v == v); + + + ///deal with in-edge of w + if(nn == 0) { + mn = w; mn_k[0] = vk; mn_k[1] = wka; + wh = av->arc.a[vk].sc; + ///not sure if we should set these edges as visited + // av->arc.a[vk].f = 1; aw->arc.a[wka].f = 1; + } else { + wh += aw->arc.a[wka].sc; + } + + + ///deal with ou-edge of w + ///all edges from w, should be move to mn + if(nn > 0) {///not sure if we should set these edges as visited; affect when nn == 0 + for (wk = 0; wk < aw->arc.nou; wk++) { + if(del_cns_arc((*aw), wk)) continue; + ///previously, w -> aw->arc.a[wk].v + ///currently, mn -> aw->arc.a[wk].v + /// if(nn == 0), then mn = w + gen_mm_cns_arc(cns, mn, aw->arc.a[wk].v, aw->arc.a[wk].sc/**(nn?(aw->arc.a[wk].sc):(0))**/, aw->arc.a[wk].f);///not sure if we should set these edges as visited + } + } + + ///mn != w + if(nn > 0) del_cns_g_nn(cns, w); + + nn++; + } + 
+ if(nn) { + aw = &((*cns).a[mn]); + // fprintf(stderr, "\n[M::%s] nn::%u, mn::%u\n", __func__, nn, mn); + // fprintf(stderr, "[M::%s] vi::%u, vn::%u\n", __func__, mn_k[0], (uint32_t)av->arc.n); + // fprintf(stderr, "[M::%s] wi::%u, wn::%u\n", __func__, mn_k[1], (uint32_t)aw->arc.n); + + av->arc.a[mn_k[0]].sc = aw->arc.a[mn_k[1]].sc = wh; + // merge_cns_g_in(cns_gfa *cns, uint32_t v, asg32_v* b32) + kv_push(uint32_t, *b32, mn); + } + } + } +} + +void refine_cns_g(cns_gfa *cns, asg32_v *b32) +{ + uint32_t v, w, vk, wk, *p = NULL; cns_t *av = NULL, *aw = NULL; + kdq_clear(cns->q); + kdq_push(uint32_t, cns->q, cns->si); ///in-degree == 0 + + while (1) { + p = kdq_shift(uint32_t, cns->q); + if(!p) break; v = *p; + + if(del_cns_nn((*cns), v)) continue; + ///merge in + merge_cns_g_in(cns, v, b32); + ///merge out + merge_cns_g_ou(cns, v, b32); + + av = &((*cns).a[v]); + ///set arcs + for (vk = 0; vk < av->arc.nou; vk++) {///out-edge of v + if(del_cns_arc((*av), vk)) continue; + if(av->arc.a[vk].f == 0) continue; + + av->arc.a[vk].f = 1; w = av->arc.a[vk].v; aw = &((*cns).a[w]); + + for (wk = aw->arc.nou; wk < aw->arc.n; wk++) {///in-edge of w + if((aw->arc.a[wk].v != v) || (del_cns_arc((*aw), wk))) continue; + aw->arc.a[wk].f = 1; break; + } + } + ///set node + av->f = 1; + + + for (vk = 0; vk < av->arc.nou; vk++) {///out-edge of v + if(del_cns_arc((*av), vk)) continue; + w = av->arc.a[vk].v; + + aw = &((*cns).a[w]); + for (wk = aw->arc.nou; wk < aw->arc.n; wk++) {///in-edge of w + if((del_cns_arc((*aw), wk))) continue; + if(aw->arc.a[wk].f) continue;///test arcs + if(cns->a[aw->arc.a[wk].v].f) continue;///test node + break; + } + if(wk >= aw->arc.n) { + kdq_push(uint32_t, cns->q, w); ///in-degree == 0 + } + } + } +} + +void gseq_cns_g(cns_gfa *cns, asg32_v *b32, uint32_t bl) +{ + b32->n = 0; kv_resize(uint32_t, *b32, cns->n); + uint32_t v, vk, w, mme, mmn, mmk, mmw, *ii = b32->a, *p; cns_t *av; + uint32_t bs = cns->off, be = bl + cns->off, sw; + for (v = 0; v < cns->n; 
/// Extract the best path from the start node to the end node of the graph.
///
/// Nodes are visited in topological order (a node is queued once its remaining
/// in-degree reaches 0). For each node the best in-edge is chosen by, in order:
/// larger arc weight, larger predecessor score, and on a tie a predecessor on
/// the backbone [cns->off, cns->off + bl). The path is then traced back from
/// the end node and left in b32 in start-to-end order, without the start and
/// end nodes themselves.
///
/// Scratch use: b32->a doubles as the per-node score array `ii` during the
/// sweep and is overwritten by the path afterwards; node field `sc` first holds
/// the remaining in-degree and then the chosen predecessor.
void gseq_cns_g(cns_gfa *cns, asg32_v *b32, uint32_t bl)
{
    b32->n = 0; kv_resize(uint32_t, *b32, cns->n);
    uint32_t v, vk, w, mme, mmn, mmk, mmw, *ii = b32->a, *p; cns_t *av;
    uint32_t bs = cns->off, be = bl + cns->off, sw;
    for (v = 0; v < cns->n; v++) {
        ii[v] = 0;///score
        if(del_cns_nn((*cns), v)) continue;
        cns->a[v].sc = 0; ///in-degree or prefix
        cns->a[v].f = 0;

        av = &((*cns).a[v]);
        for (vk = av->arc.nou; vk < av->arc.n; vk++) {///in-edge of v
            if(del_cns_arc((*av), vk)) continue;
            cns->a[v].sc++;///in-degree
        }
    }

    kdq_clear(cns->q);
    kdq_push(uint32_t, cns->q, cns->si); ///in-degree == 0
    assert((cns->a[cns->si].sc == 0) && (!del_cns_nn((*cns), cns->si)));

    while (1) {
        p = kdq_shift(uint32_t, cns->q);
        if(!p) break; v = *p;

        if(del_cns_nn((*cns), v)) continue;
        assert(cns->a[v].sc == 0); ///in-degree == 0

        av = &((*cns).a[v]);
        /// mme/mmn/mmw: weight, predecessor score and backbone flag of the best in-edge mmk
        for (vk = av->arc.nou, mme = mmn = mmw = 0, mmk = (uint32_t)-1; vk < av->arc.n; vk++) {///in-edge of v
            if(del_cns_arc((*av), vk)) continue;
            w = av->arc.a[vk].v;
            assert((*cns).a[w].f);
            sw = ((w >= bs && w < be)?1:0); ///backbone

            if((mmk == ((uint32_t)-1)) || (av->arc.a[vk].sc > mme) ||
                (((av->arc.a[vk].sc == mme) && (ii[w] > mmn))) ||
                (((av->arc.a[vk].sc == mme) && (ii[w] == mmn) && (sw == 1) && (mmw == 0)))) {
                mmk = vk; mme = av->arc.a[vk].sc; mmn = ii[w]; mmw = sw;
            }
        }

        /// from here on `sc` stores the chosen predecessor (v itself if there is none)
        ii[v] = mme + mmn; cns->a[v].f = 1;
        if(mmk != ((uint32_t)-1)) cns->a[v].sc = av->arc.a[mmk].v;
        else cns->a[v].sc = v;

        for (vk = 0; vk < av->arc.nou; vk++) {///out-edge of v
            if(del_cns_arc((*av), vk)) continue;
            w = av->arc.a[vk].v;
            assert(cns->a[w].f == 0);
            assert(cns->a[w].sc);
            cns->a[w].sc--;
            if(cns->a[w].sc == 0) {
                kdq_push(uint32_t, cns->q, w); ///in-degree == 0
            }
        }
    }

    /// trace back end -> start through the stored predecessors
    for (v = cns->a[cns->ei].sc, b32->n = 0; v != cns->si; v = cns->a[v].sc) {
        kv_push(uint32_t, *b32, v);
    }
    assert(v == cns->si);

    /// reverse in place so the path reads start -> end
    mmn = b32->n; mmn >>= 1;
    for (vk = 0; vk < mmn; vk++) {
        v = b32->a[vk]; b32->a[vk] = b32->a[b32->n-vk-1]; b32->a[b32->n-vk-1] = v;
    }
}
/// Convert the consensus path `rc` of one window into trace operations appended
/// to res->c, and return the number of edited bases.
///
/// Path nodes inside [cns->off, cns->off + bl) are backbone nodes. The path is
/// cut into maximal runs: a run of consecutive backbone nodes becomes a match
/// (op 0), preceded by a deletion (op 3) for any skipped backbone positions; a
/// run of non-backbone nodes becomes one insertion (op 2) per node. Backbone
/// positions left after the last run are emitted as a final deletion.
uint64_t push_correct1(window_list *idx, window_list_alloc *res, cns_gfa *cns, asg32_v *rc, uint32_t bl)
{
    uint64_t nec = 0; uint32_t k, l, i, ff, sl, sk, bs = cns->off, be = bl + cns->off, bend = cns->off;///[bs, be)
    if(rc->n) {///it is possible that rc->n == 0, which means there is a deletion
        /// [l, k) is the current run; ff marks that the run ends at k
        for (k = 1, l = 0; k <= rc->n; ++k) {
            ff = 0; sl = sk = 0;
            if(k == rc->n) {
                if(l < rc->n) sl = ((rc->a[l] >= bs && rc->a[l] < be)?1:0);
                ff = 1;
            } else {
                sl = ((rc->a[l] >= bs && rc->a[l] < be)?1:0);
                sk = ((rc->a[k] >= bs && rc->a[k] < be)?1:0);
                if(sl != sk) {
                    ff = 1;
                } else if((sl == 1) && ((rc->a[k] - rc->a[l]) != (k - l))) {
                    /// backbone run is no longer contiguous
                    ff = 1;
                }
            }

            if(!ff) continue;

            if(sl) { ///match
                if(rc->a[l] > bend) {///deletion [bend, rc->a[l])
                    push_trace_bp(((asg16_v *)(&(res->c))), 3, (uint16_t)-1, rc->a[l] - bend, ((idx->clen>0)?1:0));
                    idx->clen = res->c.n - idx->cidx; nec += rc->a[l] - bend;
                }

                ///push match
                push_trace_bp(((asg16_v *)(&(res->c))), 0, (uint16_t)-1, rc->a[k-1] + 1 - rc->a[l], ((idx->clen>0)?1:0));
                idx->clen = res->c.n - idx->cidx;

                bend = rc->a[k-1] + 1;
            } else { ///unmatch
                for (i = l; i < k; i++) {
                    push_trace_bp(((asg16_v *)(&(res->c))), 2, cns->a[rc->a[i]].c, 1, ((idx->clen>0)?1:0));
                    idx->clen = res->c.n - idx->cidx; nec++;
                }
            }
            l = k;
        }
    }

    ///push remaining deletion
    if(be > bend) {
        push_trace_bp(((asg16_v *)(&(res->c))), 3, (uint16_t)-1, be - bend, ((idx->clen>0)?1:0));
        idx->clen = res->c.n - idx->cidx; nec += be - bend;
    }
    return nec;
}
/// Emit a deletion of cl0 bases that directly follows other edit operations,
/// first trying to re-express the whole edit run more cheaply.
///
/// sc      trace being built; entries from sc0 onwards belong to the current window
/// ostr    original bases of the window; tu supplies scratch space for the edited bases
/// exz     alignment scratch used by cal_exz_global()
/// gbeg    graph offset of the window's first backbone node
/// c0      operation to emit; must be 3
/// cl0     deletion length; ok0: window-relative offset of the deletion
///
/// The trailing run of non-match entries in sc is measured (ok: original bases
/// consumed, nk: new bases produced, e0: edits including this deletion). If the
/// run holds nothing but op 3 or one side is empty, the deletion is simply
/// appended. Otherwise both strings are rebuilt and realigned globally; the run
/// is replaced when the alignment has fewer errors than e0, or by a plain
/// substitution-plus-indel layout when max(ol, nl) < e0. If neither is cheaper
/// the deletion is appended unchanged. Returns the number of edits pushed here.
uint64_t push_correct1_fhc_indel_exz(asg16_v *sc, int64_t sc0, window_list *idx, cns_gfa *cns, char *ostr, UC_Read* tu, bit_extz_t *exz, uint64_t gbeg, uint64_t c0, int64_t cl0, int64_t ok0)
{
    int64_t ck = sc->n, k, ok = 0, nk = 0, cn, cn0, nl, ol, diff, diff0, ml, ml0, e0 = 0; uint32_t on, f = 0, nec = 0; uint16_t bq, bt, op;
    assert(c0 == 3);

    /// the pending deletion consumes cl0 original bases and counts as cl0 edits
    ok += cl0; e0 += cl0;

    /// scan backwards over the trailing non-match entries of this window
    for (ck--; ck >= sc0; ck--) {
        op = sc->a[ck]>>14;
        if(!op) break;

        /// the length field is narrower for ops that also store bases in the entry
        if((op == 2) || (op == 3)) {
            on = sc->a[ck]&(0xfff);
        } else if(op == 1) {
            on = sc->a[ck]&(0x3ff);
        } else {
            on = sc->a[ck]&(0x3fff);
        }
        if(op != 2) ok += on;
        if(op != 3) nk += on;
        if(op != 0) e0 += on;
        if(c0 != op) f = 1;
    }
    /// the edit run is sc[cn0, cn)
    cn0 = ck + 1; cn = sc->n;

    if((!f) || (!ok) || (!nk)) {
        /// nothing to realign: append the deletion base by base
        for (k = 0, ck = ok0 + gbeg; k < cl0; k++, ck++) {
            push_trace_bp_f(sc, c0, cns->a[ck].c, (uint16_t)-1, 1, ((idx->clen>0)?1:0));
            idx->clen = sc->n - idx->cidx; nec++;
        }
    } else {
        char *oseq = NULL, *nseq = NULL; int64_t wo[2], wn[2];
        UC_Read_resize((*tu), nk); nseq = tu->seq; wo[0] = wo[1] = wn[0] = wn[1] = 0;
        /// move ok0 to the first original base covered by the run
        ok0 += cl0;

        ok0 -= ok;
        assert(ok0 >= 0);

        oseq = ostr + ok0;
        ol = ok; nl = nk;
        ck = cn0; ok = nk = 0;
        /// replay the run to build the edited sequence in nseq
        while (ck < cn) {
            wo[0] = ok; wn[0] = nk;
            ck = pop_trace_bp_f(sc, ck, &op, &bq, &bt, &on);
            if(op != 2) ok += on;
            if(op != 3) nk += on;
            wo[1] = ok; wn[1] = nk;

            if(op == 0) {
                memcpy(nseq + wn[0], oseq + wo[0], (wo[1]-wo[0])*sizeof((*nseq)));
            } else if(op == 1 || op == 2) {
                for (k = wn[0]; k < wn[1]; k++) nseq[k] = s_H[bt];
            }
        }

        assert(wo[1] + cl0 == ol);
        assert(wn[1] == nl); ///since c0 must be 3
        if(nl == ol && nl == 1) {
            /// one base against one base: a match or a single mismatch
            sc->n = cn0;
            if(oseq[0] == nseq[0]) {
                push_trace_bp_f(sc, 0, (uint16_t)-1, (uint16_t)-1, 1, ((idx->clen>0)?1:0));
                idx->clen = sc->n - idx->cidx;
            } else {
                push_trace_bp_f(sc, 1, seq_nt6_table[(uint32_t)(oseq[0])], seq_nt6_table[(uint32_t)(nseq[0])], 1, ((idx->clen>0)?1:0));
                idx->clen = sc->n - idx->cidx; nec++;
            }
        } else {
            ml = MAX(ol, nl); f = 0;

            /// first attempt with an error bound of at most 31
            diff = 31;
            if(diff > ml) diff = ml;
            diff0 = diff; clear_align(*exz);
            cal_exz_global(nseq, nl, oseq, ol, diff, exz);
            if(is_align(*exz)) f = 1;

            if(!f) {
                /// retry with a bound of at most 63, only if that is actually larger
                diff = 63;
                if(diff > ml) diff = ml;
                if(diff > diff0) {
                    diff0 = diff; clear_align(*exz);
                    cal_exz_global(nseq, nl, oseq, ol, diff, exz);
                    if(is_align(*exz)) f = 1;
                }
            }

            if(f && exz->err < e0) {
                /// the realignment is cheaper: replace the run by its CIGAR
                sc->n = cn0;
                cn = exz->cigar.n; ok = nk = 0;
                for (ck = 0; ck < cn;) {
                    wo[0] = ok; wn[0] = nk;
                    ck = pop_trace(&(exz->cigar), ck, &op, &on);
                    if(op!=2) ok += on;
                    if(op!=3) nk += on;
                    wo[1] = ok; wn[1] = nk;

                    if(op == 0) {
                        push_trace_bp_f(sc, op, (uint16_t)-1, (uint16_t)-1, on, ((idx->clen>0)?1:0));
                        idx->clen = sc->n - idx->cidx;
                    } else if(op == 1) {
                        for (k = 0; k < on; k++) {
                            push_trace_bp_f(sc, op, seq_nt6_table[(uint32_t)(oseq[wo[0]+k])], seq_nt6_table[(uint32_t)(nseq[wn[0]+k])], 1, ((idx->clen>0)?1:0));
                            idx->clen = sc->n - idx->cidx; nec++;
                        }
                    } else if(op == 2) {
                        for (k = 0; k < on; k++) {
                            push_trace_bp_f(sc, op, (uint16_t)-1, seq_nt6_table[(uint32_t)(nseq[wn[0]+k])], 1, ((idx->clen>0)?1:0));
                            idx->clen = sc->n - idx->cidx; nec++;
                        }
                    } else if(op == 3) {
                        for (k = 0; k < on; k++) {
                            push_trace_bp_f(sc, op, seq_nt6_table[(uint32_t)(oseq[wo[0]+k])], (uint16_t)-1, 1, ((idx->clen>0)?1:0));
                            idx->clen = sc->n - idx->cidx; nec++;
                        }
                    }
                }
            } else {
                if(ml < e0) {
                    /// no usable alignment, but substitutions plus one indel are still cheaper
                    sc->n = cn0;
                    ml0 = MIN(ol, nl); op = 1;
                    for (k = 0; k < ml0; k++) {
                        push_trace_bp_f(sc, op, seq_nt6_table[(uint32_t)(oseq[k])], seq_nt6_table[(uint32_t)(nseq[k])], 1, ((idx->clen>0)?1:0));
                        idx->clen = sc->n - idx->cidx; nec++;
                    }

                    if(ol > ml0) {///op = 3
                        for (k = ml0, op = 3; k < ol; k++) {
                            push_trace_bp_f(sc, op, seq_nt6_table[(uint32_t)(oseq[k])], (uint16_t)-1, 1, ((idx->clen>0)?1:0));
                            idx->clen = sc->n - idx->cidx; nec++;
                        }
                    } else if(nl > ml0) {///op = 2
                        for (k = ml0, op = 2; k < nl; k++) {
                            push_trace_bp_f(sc, op, (uint16_t)-1, seq_nt6_table[(uint32_t)(nseq[k])], 1, ((idx->clen>0)?1:0));
                            idx->clen = sc->n - idx->cidx; nec++;
                        }
                    }
                } else {
                    /// keep the run as it is and append the deletion
                    /// NOTE(review): ok0 was re-based to the start of the run above, so this
                    /// reads the deleted bases from the run start; identical to the original
                    /// offset only when the run consumed no original bases — confirm intended
                    for (k = 0, ck = ok0 + gbeg; k < cl0; k++, ck++) {
                        push_trace_bp_f(sc, c0, cns->a[ck].c, (uint16_t)-1, 1, ((idx->clen>0)?1:0));
                        idx->clen = sc->n - idx->cidx; nec++;
                    }
                }
            }
        }
    }

    return nec;
}
/// Convert the consensus path `rc` of one window into base-annotated trace
/// operations appended to res->c, and return the number of edited bases.
///
/// Same run decomposition as push_correct1(): backbone runs become matches,
/// skipped backbone positions become deletions, non-backbone nodes become
/// insertions. Indels are pushed one base at a time together with the base
/// involved. A deletion that directly follows an insertion run (is_i set) is
/// handed to push_correct1_fhc_indel_exz() when `exz` is available, so the
/// insertion/deletion pair can be rewritten more cheaply.
/// `rid` is not used in this function.
uint64_t push_correct1_fhc(window_list *idx, window_list_alloc *res, cns_gfa *cns, char* qstr, UC_Read* tu, bit_extz_t *exz, asg32_v *rc, uint32_t bl, uint32_t rid)
{
    /// sc0: trace length on entry, i.e. where this window's operations start
    uint64_t nec = 0; uint32_t k, l, i, ff, sl, sk, bs = cns->off, be = bl + cns->off, bend = cns->off, is_i = 0, sc0 = res->c.n;///[bs, be)
    if(rc->n) {///it is possible that rc->n == 0, which means there is a deletion
        /// [l, k) is the current run; ff marks that the run ends at k
        for (k = 1, l = 0; k <= rc->n; ++k) {
            ff = 0; sl = sk = 0;
            if(k == rc->n) {
                if(l < rc->n) sl = ((rc->a[l] >= bs && rc->a[l] < be)?1:0);
                ff = 1;
            } else {
                sl = ((rc->a[l] >= bs && rc->a[l] < be)?1:0);
                sk = ((rc->a[k] >= bs && rc->a[k] < be)?1:0);
                if(sl != sk) {
                    ff = 1;
                } else if((sl == 1) && ((rc->a[k] - rc->a[l]) != (k - l))) {
                    /// backbone run is no longer contiguous
                    ff = 1;
                }
            }

            if(!ff) continue;

            if(sl) { ///match
                if(rc->a[l] > bend) {///deletion [bend, rc->a[l])
                    if(is_i && exz) {
                        nec += push_correct1_fhc_indel_exz(((asg16_v *)(&(res->c))), sc0, idx, cns, qstr, tu, exz, cns->off, 3, rc->a[l] - bend, bend-cns->off);
                    } else {
                        for (i = bend; i < rc->a[l]; i++) {
                            push_trace_bp_f(((asg16_v *)(&(res->c))), 3, cns->a[i].c, (uint16_t)-1, 1, ((idx->clen>0)?1:0));
                            idx->clen = res->c.n - idx->cidx; nec++;
                        }
                    }
                }

                ///push match
                push_trace_bp_f(((asg16_v *)(&(res->c))), 0, (uint16_t)-1, (uint16_t)-1, rc->a[k-1] + 1 - rc->a[l], ((idx->clen>0)?1:0));
                idx->clen = res->c.n - idx->cidx;

                bend = rc->a[k-1] + 1; is_i = 0;
            } else { ///unmatch
                for (i = l; i < k; i++) {
                    push_trace_bp_f(((asg16_v *)(&(res->c))), 2, (uint16_t)-1, cns->a[rc->a[i]].c, 1, ((idx->clen>0)?1:0));
                    idx->clen = res->c.n - idx->cidx; nec++;
                }
                is_i = 1;
            }
            l = k;
        }
    }

    ///push remaining deletion
    if(be > bend) {
        if(is_i && exz) {
            nec += push_correct1_fhc_indel_exz(((asg16_v *)(&(res->c))), sc0, idx, cns, qstr, tu, exz, cns->off, 3, be - bend, bend-cns->off);
        } else {
            for (i = bend; i < be; i++) {
                push_trace_bp_f(((asg16_v *)(&(res->c))), 3, cns->a[i].c, (uint16_t)-1, 1, ((idx->clen>0)?1:0));
                idx->clen = res->c.n - idx->cidx; nec++;
            }
        }
    }
    return nec;
}
uint64_t occ_tot, double occ_max, asg32_v* b32, cns_gfa *cns, uint64_t max_trace, window_list *ridx, window_list_alloc *res, uint32_t rid) +{ + init_cns_g(cns, qstr + s, e - s, rid); + // if(e -s > 100) { + // fprintf(stderr, "[M::%s]::[%lu, %lu), cns->n::%u, qstr::%.*s\n", __func__, s, e, (uint32_t)cns->n, (int)(e-s), qstr+s); + // } + // fprintf(stderr, "[M::%s]::[%lu, %lu)\n", __func__, s, e); + + uint64_t *id_a = NULL, id_n, nec = 0; b32->n = 0; + rr_seq_t ssq; ssq.rref = rref; ssq.tu = tu; ssq.s = ssq.e = 0; ssq.n0 = ssq.n1 = 32; ssq.id = ssq.rev = 0; + id_n = iter_cc_idx_t(ol, idx, s, e, idx->rr, ((s==e)?1:0), &id_a); + // debug_inter0(ol, idx->c_idx, idx->idx->a + idx->i0, idx->srt_n - idx->i0, id_a, id_n, s, e, ((s==e)?1:0), 0, "-1-"); + uint64_t k, q[2], os, oe; ul_ov_t *p; overlap_region *z; idx->rr = 0; + // fprintf(stderr, "[M::%s] [%lu, %lu) id_n::%lu\n", __func__, s, e, id_n); + for (k = 0; k < id_n; k++) { + p = &(idx->c_idx[id_a[k]]); z = &(ol[ovlp_id(*p)]); + q[0] = z->w_list.a[ovlp_cur_wid(*p)].x_start+ovlp_bd(*p); + q[1] = z->w_list.a[ovlp_cur_wid(*p)].x_end+1-ovlp_bd(*p); + + // fprintf(stderr, "[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\terr::%d\toerr::%u\n", __func__, ol[ovlp_id(*p)].y_id, (int)Get_NAME_LENGTH(R_INF, ol[ovlp_id(*p)].y_id), Get_NAME(R_INF, ol[ovlp_id(*p)].y_id), + // ovlp_cur_wid(*p), ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_start, ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].x_end+1, ol[ovlp_id(*p)].w_list.a[ovlp_cur_wid(*p)].error, ol[ovlp_id(*p)].non_homopolymer_errors); + + if(q[1] <= e) idx->rr = 1; + os = MAX(q[0], s); oe = MIN(q[1], e); + // if((oe > os) || ((s == e) && (s >= q[0]) && (s <= q[1]))) { + if((oe > os) || ((s == e) && (s > q[0]) && (s < q[1]))) { + // if(oe >= os) { + ///[-4-][-12-][-4-][-12-] + ///[cigar_len][cigar][base_len][base] + extract_sub_cigar_cns(z, os, oe, s, e, s_end, &ssq, p, cns, b32, max_trace, rid); + // if(m != ((uint32_t)-1)) {///no gap in both sides + // kv_push(uint32_t, *b32, m); + // 
} + } + } + + // fprintf(stderr, "-2-[M::%s] cns->n::%u\n", __func__, (uint32_t)cns->n); + + // return; + + refine_cns_g(cns, b32); + + // fprintf(stderr, "-3-[M::%s] cns->n::%u\n", __func__, (uint32_t)cns->n); + + gseq_cns_g(cns, b32, e - s); + + // fprintf(stderr, "-4-[M::%s] cns->n::%u\n", __func__, (uint32_t)cns->n); + + // nec += push_correct1(ridx, res, cns, b32, e - s); + nec += push_correct1_fhc(ridx, res, cns, qstr + s, tu, exz, b32, e - s, rid); + + // fprintf(stderr, "-5-[M::%s] cns->n::%u\n", __func__, (uint32_t)cns->n); + return nec; +} + +uint64_t cns_gen_full(overlap_region* ol, All_reads *rref, uint64_t s0, uint64_t e0, uint64_t wl, char* qstr, UC_Read* tu, bit_extz_t *exz, cc_idx_t *idx, uint64_t occ_tot, double occ_max, asg32_v* b32, cns_gfa *cns, uint64_t max_trace, window_list *ridx, window_list_alloc *res, uint32_t rid) +{ + uint64_t nec = 0; + if(e0 - s0 <= wl) { + nec += cns_gen_full0(ol, rref, s0, e0, 1, qstr, tu, exz, idx, occ_tot, occ_max, b32, cns, max_trace, ridx, res, rid); + } else { + uint64_t s, e; + s = s0; e = s0 + wl; e = ((e<=e0)?e:e0); + for (; s < e0; ) { + // rn = iter_cc_idx_t(ol->list, &ii_a, s, e, rr, 0, &ra); + // debug_inter0(ol->list, ii_a.c_idx, ii_a.idx->a + ii_a.i0, ii_a.srt_n - ii_a.i0, ra, rn, s, e, 0, 1, "-0-"); + // rr = wcns_vote(ol->list, rref, qu->seq, ql, tu, ra, rn, s, e, ii_a.c_idx, &ii_b, occ_tot, occ_exact, aux_o, b32, cns, rid); + nec += cns_gen_full0(ol, rref, s, e, (s==s0)?1:0, qstr, tu, exz, idx, occ_tot, occ_max, b32, cns, max_trace, ridx, res, rid); + s += wl; e += wl; e = ((e<=e0)?e:e0); + } + } + return nec; +} + +uint64_t push_correct0(window_list *idx, window_list_alloc *res, uint32_t len0, uint32_t rc) +{ + uint64_t nec = 0; + // fprintf(stderr, "[M::%s] NULL(idx)::%u\n", __func__, idx?0:1); + if(len0 != ((uint32_t)-1)) { + push_trace_bp(((asg16_v *)(&(res->c))), 0, (uint16_t)-1, len0, ((idx->clen>0)?1:0)); + idx->clen = res->c.n - idx->cidx; + } else if(rc != ((uint32_t)-1)) { + // 
fprintf(stderr, "[M::%s]\t", __func__); + uint32_t cc = (rc<<4)>>20, cn = rc>>28, ck = 0, cs, cp; + uint32_t bc = (rc<<20)>>20, bn = (rc<<16)>>28, bk = 0, bs, bp; + ///debug + // prt_cigar0(cc, cn); + // prt_bp0(bc, bn); + + for (ck = 0; ck < cn; ck++) { + cs = (cn-1-ck)<<1; cp = (cc>>cs)&3; + + bp = (uint32_t)-1; + if(cp != 3) {///bp == 3: more x + bs = (bn-1-bk)<<1; bp = (bc>>bs)&3; + bk++; + } + // fprintf(stderr, "[M::%s] cp::%u, bp::%u\n", __func__, cp, bp); + + push_trace_bp(((asg16_v *)(&(res->c))), cp, bp, 1, ((idx->clen>0)?1:0)); + idx->clen = res->c.n - idx->cidx; + // fprintf(stderr, "%c", cm[(in >> mp)&3]); + + // fprintf(stderr, "%c", "MSID"[cp]); + if(cp != 0) nec++; + } + + // fprintf(stderr, "\n"); + } + + return nec; +} + +uint64_t push_correct0_fhc(window_list *idx, window_list_alloc *res, uint32_t len0, uint32_t rc, char *qstr) +{ + uint64_t nec = 0; + // fprintf(stderr, "[M::%s] NULL(idx)::%u\n", __func__, idx?0:1); + if(len0 != ((uint32_t)-1)) { + push_trace_bp_f(((asg16_v *)(&(res->c))), 0, (uint16_t)-1, (uint16_t)-1, len0, ((idx->clen>0)?1:0)); + idx->clen = res->c.n - idx->cidx; + } else if(rc != ((uint32_t)-1)) { + // fprintf(stderr, "[M::%s]\t", __func__); + uint32_t cc = (rc<<4)>>20, cn = rc>>28, ck = 0, cs, cp; + uint32_t bc = (rc<<20)>>20, bn = (rc<<16)>>28, btk = 0, bqk = 0, bs, bqp, btp; + ///debug + // prt_cigar0(cc, cn); + // prt_bp0(bc, bn); + + for (ck = 0; ck < cn; ck++) { + cs = (cn-1-ck)<<1; cp = (cc>>cs)&3; + + bqp = btp = (uint32_t)-1; + if(cp != 3) {///bp == 3: more x + bs = (bn-1-btk)<<1; btp = (bc>>bs)&3; btk++; + } + if(cp != 2) {///bp == 2: more y + bqp = seq_nt6_table[(uint32_t)qstr[bqk]]; bqk++; + } + // fprintf(stderr, "[M::%s] cp::%u, btp::%u, bqp::%u\n", __func__, cp, btp, bqp); + + push_trace_bp_f(((asg16_v *)(&(res->c))), cp, bqp, btp, 1, ((idx->clen>0)?1:0)); + idx->clen = res->c.n - idx->cidx; + // fprintf(stderr, "%c", cm[(in >> mp)&3]); + + // fprintf(stderr, "%c", "MSID"[cp]); + if(cp != 0) nec++; + } + + // 
fprintf(stderr, "\n"); + } + + return nec; +} + +void output_cns_g(cns_gfa *cns, uint64_t s, uint64_t e) +{ + // char *gfa_id = NULL, p; MALLOC(gfa_id, Get_NAME_LENGTH(R_INF, qid) + 128); + // sprintf(gfa_id, "%.*s.%lu_%lu.cns.gfa", (int)Get_NAME_LENGTH(R_INF, qid), Get_NAME(R_INF, qid), s, e); + char *gfa_id = NULL, p, f; MALLOC(gfa_id, 128); + sprintf(gfa_id, "ec.%lu_%lu.cns.gfa", s, e); + + FILE *fp = fopen(gfa_id, "w"); uint64_t k, z; char cm[4]; cm[0] = 'A'; cm[1] = 'C'; cm[2] = 'G'; cm[3] = 'T'; + + sprintf(gfa_id, "s_0_%c", cm[cns->a[0].c]); + fprintf(fp, "S\t%s\t*\tLN:i:%u\trd:i:%c\n", gfa_id, 0/**cns->a[0].sc**/, cm[cns->a[0].c]); + + sprintf(gfa_id, "e_1_%c", cm[cns->a[1].c]); + fprintf(fp, "S\t%s\t*\tLN:i:%u\trd:i:%c\n", gfa_id, 0/**cns->a[1].sc**/, cm[cns->a[1].c]); + + for (k = 2; k < cns->n; k++) { + if(del_cns_nn((*cns), k)) continue; + sprintf(gfa_id, "%c_%lu_%c", ((kbn)?'b':'n'), k, cm[cns->a[k].c]); + fprintf(fp, "S\t%s\t*\tLN:i:%d\trd:i:%c\n", gfa_id, 0/**cns->a[k].sc**/, cm[cns->a[k].c]); + } + + for (k = 0; k < cns->n; k++) { + if(del_cns_nn((*cns), k)) continue; + + if(k == 0) { + p = 's'; + } else if(k == 1) { + p = 'e'; + } else if (kbn) { + p = 'b'; + } else { + p = 'n'; + } + + sprintf(gfa_id, "%c_%lu_%c", p, k, cm[cns->a[k].c]); + for (z = 0; z < cns->a[k].arc.n; z++) { + if(del_cns_arc((cns->a[k]), z)) continue; + + if(cns->a[k].arc.a[z].v == 0) { + p = 's'; + } else if(cns->a[k].arc.a[z].v == 1) { + p = 'e'; + } else if (cns->a[k].arc.a[z].v < cns->bn) { + p = 'b'; + } else { + p = 'n'; + } + + f = ((z < cns->a[k].arc.nou)?('+'):('-')); + fprintf(fp, "L\t%s\t%c\t%c_%u_%c\t%c\t0M\tL1:i:%u\n", + gfa_id, f, + p, cns->a[k].arc.a[z].v, cm[cns->a[cns->a[k].arc.a[z].v].c], f, + cns->a[k].arc.a[z].sc); + } + } + + fclose(fp); free(gfa_id); +} + +uint32_t cal_cigar_xlen(overlap_region *in) +{ + assert(in->w_list.n == 1); + uint32_t cn = in->w_list.a[0].clen; uint16_t *a = in->w_list.c.a + in->w_list.a[0].cidx; + uint32_t ci, len, xk, yk; 
uint16_t c, b; asg16_v scc; scc.n = scc.m = cn; scc.a = a; + + ci = 0; xk = yk = 0; + while (ci < cn) { + ci = pop_trace_bp(&scc, ci, &c, &b, &len); + if(c != 2) xk += len; + if(c != 3) yk += len; + } + + return xk; +} + +uint32_t cal_cigar_xlen_fhc(overlap_region *in) +{ + assert(in->w_list.n == 1); + uint32_t cn = in->w_list.a[0].clen; uint16_t *a = in->w_list.c.a + in->w_list.a[0].cidx; + uint32_t ci, len, xk, yk; uint16_t c, bq, bt; asg16_v scc; scc.n = scc.m = cn; scc.a = a; + + ci = 0; xk = yk = 0; + while (ci < cn) { + ci = pop_trace_bp_f(&scc, ci, &c, &bq, &bt, &len); + if(c != 2) xk += len; + if(c != 3) yk += len; + } + + return xk; +} + +uint64_t push_cns_anchor(overlap_region* ol, All_reads *rref, uint64_t s, uint64_t e, char* qstr, uint64_t ql, UC_Read* tu, bit_extz_t *exz, cc_idx_t *idx, overlap_region *aux_o, uint64_t is_tail, uint64_t occ_tot, double occ_max, asg32_v* b32, cns_gfa *cns, uint32_t rid) +{ + if((!is_tail) && (s >= e)) return 0;//if s >= e && is_tail = 1, gen the cns of the last a few bases -> s == e == ql + // fprintf(stderr, "\n****************[M::%s::M] [%lu, %lu)****************\n", __func__, s, e); + window_list *p = NULL; uint64_t e0 = 0, nec = 0; uint32_t rc; + if(aux_o->w_list.n > 0) { + p = &(aux_o->w_list.a[aux_o->w_list.n-1]); + e0 = p->x_end+1; + ///make sure e > s + } + assert(s >= e0); + if((s == e) && (is_tail == 1) && (s == e0)) return 0;///in this case, s == e == ql + + if(aux_o->w_list.n == 0) { + kv_pushp(window_list, aux_o->w_list, &p); + p->x_start = -1; p->x_end = -1; + p->clen = p->cidx = 0; + } + + if(((!is_tail) && (s > 0)) || ((is_tail) && (s > e0))) { + // fprintf(stderr, ">>>>>>[M::%s::M] [%lu, %lu), ec::%u\n", __func__, e0, s, cal_cigar_xlen_fhc(aux_o)); + // fprintf(stderr, ">>>>>>[M::%s::M] [%lu, %lu)\n", __func__, e0, s); + if (cns_gen0(ol, rref, e0, s, ql, tu, idx, occ_tot, occ_max, b32, &rc)) { + if(p->x_start == -1 || p->x_end == -1) {///hasn't neem set + p->x_start = e0; p->x_end = s-1; + } + // nec += 
/**
 * Per-position vote over query window [s, e); runs of accepted positions are
 * handed to push_cns_anchor().
 *
 * Phase 1 - for each candidate id_a[0 .. id_n): the alignment window it
 * points to is shrunk by ovlp_bd() on both sides and intersected with [s, e).
 * When the intersection is non-empty the cursor fields (xoff, yoff, coff,
 * ylen) are copied from c_idx[] into occ->c_idx[] and extract_sub_cigar_mm()
 * is called with a pointer into the counter array occ->idx->a.
 *
 * Phase 2 - for each offset k in [0, e - s): counters are read from
 * occ->idx->a[2k] and occ->idx->a[2k + 1]. Each 64-bit word is split into a
 * high and a low 32-bit count and 1 is added to both ("+1 for the reference
 * read"). NOTE(review): presumably high = reads agreeing with the query and
 * low = reads covering the position, and the odd slot describes the gap in
 * front of position k - confirm against extract_sub_cigar_mm().
 * A slot is accepted when it passes the ratio/majority test or when its low
 * count is below occ_tot (not enough coverage to judge).
 *
 * The current run [os, oe) of accepted positions is carried between calls in
 * occ->mms / occ->mme ((uint64_t)-1 means "no open run"); the run still open
 * at the end of the window is stored back there and is NOT pushed here.
 * Both counter slots of every visited position are reset to 0.
 *
 * @param nec  accumulator, increased by each push_cns_anchor() return value
 * @return 1 if at least one candidate window (after the ovlp_bd() shrink)
 *         ends at or before e, otherwise 0
 */
uint64_t wcns_vote(overlap_region* ol, All_reads *rref, char* qstr, uint64_t ql, UC_Read* tu, bit_extz_t *exz, uint64_t *id_a, uint64_t id_n, uint64_t s, uint64_t e, ul_ov_t *c_idx, cc_idx_t *occ, uint64_t occ_tot, double occ_exact, overlap_region *aux_o, asg32_v* b32, cns_gfa *cns, uint32_t rid, uint64_t *nec)
{
    uint64_t k, q[2], rr = 0, os, oe, wl, oc[2], fI; ul_ov_t *p, *gp; overlap_region *z;
    for (k = 0; k < id_n; k++) {
        p = &(c_idx[id_a[k]]); z = &(ol[ovlp_id(*p)]);
        /* usable part of the alignment window: [x_start + bd, x_end + 1 - bd) */
        q[0] = z->w_list.a[ovlp_cur_wid(*p)].x_start+ovlp_bd(*p);
        q[1] = z->w_list.a[ovlp_cur_wid(*p)].x_end+1-ovlp_bd(*p);
        if(q[1] <= e) rr = 1;
        os = MAX(q[0], s); oe = MIN(q[1], e);
        if(oe > os) {
            /* prepare for CNS: mirror the cursor state into the second index */
            gp = &(occ->c_idx[id_a[k]]);
            ovlp_cur_xoff(*gp) = ovlp_cur_xoff(*p); ovlp_cur_yoff(*gp) = ovlp_cur_yoff(*p); ovlp_cur_coff(*gp) = ovlp_cur_coff(*p); ovlp_cur_ylen(*gp) = ovlp_cur_ylen(*p);
            extract_sub_cigar_mm(z, os, oe, p, occ->idx->a + os - s);
        }
    }

    wl = e - s;
    /* resume the run left open by the previous window (os/oe are reused as run bounds from here on) */
    os = occ->mms; oe = occ->mme;

    for (k = 0; k < wl; k++) {
        /* +1 for the reference read */
        oc[0] = (occ->idx->a[(k<<1)]>>32) + 1;
        oc[1] = ((uint32_t)occ->idx->a[(k<<1)]) + 1;

        /* accepted if a) the coverage check passes, or b) there is not enough coverage */
        if(((oc[0] > (oc[1]*occ_exact)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1)) ||
                    (oc[1] < occ_tot)) {
            /* note: there might be insertions at q[k-1, k] instead of q[k, k+1] */
            fI = 1;
            /* make sure there is no insertion: same test on the odd slot */
            /* +1 for the reference read */
            oc[0] = (occ->idx->a[(k<<1)+1]>>32) + 1;
            oc[1] = ((uint32_t)occ->idx->a[(k<<1)+1]) + 1;
            /* accepted if a) the coverage check passes, or b) there is not enough coverage */
            if((((oc[0] > (oc[1]*occ_exact)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) ||
                    (oc[1] < occ_tot)) {
                fI = 0;
            }

            if(fI) {
                /* odd slot rejected: close the run before position k so that k starts a new one */
                if(oe > os && os != ((uint64_t)-1)) {/* push previous interval */
                    (*nec) += push_cns_anchor(ol, rref, os, oe, qstr, ql, tu, exz, occ, aux_o, 0, occ_tot, occ_exact, b32, cns, rid);
                }
                os = oe = (uint64_t)-1;
            }

            /* +1 for the reference read (reloaded here; oc[] is not read again in this iteration) */
            oc[0] = (occ->idx->a[(k<<1)]>>32) + 1;
            oc[1] = ((uint32_t)occ->idx->a[(k<<1)]) + 1;
            if((s+k) == oe) {
                /* position k directly continues the open run */
                oe++;
            } else {
                if(oe > os && os != ((uint64_t)-1)) {/* push previous interval */
                    (*nec) += push_cns_anchor(ol, rref, os, oe, qstr, ql, tu, exz, occ, aux_o, 0, occ_tot, occ_exact, b32, cns, rid);
                }
                os = s+k; oe = s+k+1;
            }
        } else {
            /* position rejected: flush the open run, if any */
            if(oe > os && os != ((uint64_t)-1)) {/* push previous interval */
                (*nec) += push_cns_anchor(ol, rref, os, oe, qstr, ql, tu, exz, occ, aux_o, 0, occ_tot, occ_exact, b32, cns, rid);
            }
            os = oe = (uint64_t)-1;
        }
        /* reset both slots so the counter array can be reused for the next window */
        occ->idx->a[(k<<1)] = occ->idx->a[(k<<1)+1] = 0;
    }

    /* hand the still-open run (if any) over to the next call */
    occ->mms = occ->mme = (uint64_t)-1;
    if(oe > os && os != ((uint64_t)-1)) {
        occ->mms = os; occ->mme = oe;
    }
    return rr;
}
+ 1 - z->w_list.a[i].x_start; + if(n_id >= 6) l_nid = 1; + continue; + } + q[0] = z->w_list.a[i].x_start; q[1] = z->w_list.a[i].x_end; + q[0] += bd; q[1] -= bd; + if(q[1] >= q[0]) { + m = ((uint64_t)q[0]); m <<= 32; + m += c_idx->n; kv_push(uint64_t, *idx, m); + + kv_pushp(ul_ov_t, *c_idx, &cp); + ovlp_id(*cp) = k; ///ovlp id + // ovlp_min_wid(*cp) = i; ///beg id of windows + // ovlp_max_wid(*cp) = i; ///end id of windows + ovlp_cur_wid(*cp) = i; ///cur id of windows + ovlp_cur_xoff(*cp) = z->w_list.a[i].x_start; ///cur xpos + ovlp_cur_yoff(*cp) = z->w_list.a[i].y_start; ///cur xpos + ovlp_cur_ylen(*cp) = 0; + ovlp_cur_coff(*cp) = 0; ///cur cigar off in cur window + ovlp_bd(*cp) = bd; + } + + if(l_nid == 0) { + if(i == 0) { + p[0] = z->w_list.a[i].x_start; p[1] = z->x_pos_s; + n_id = ((p[0] >= p[1])? (p[0] - p[1]): (p[1] - p[0])); + if(n_id >= 6) l_nid = 1; + } + + if(li != (uint64_t)-1) { + p[0] = z->w_list.a[i].x_start; p[1] = z->w_list.a[li].x_end + 1; + n_id = ((p[0] >= p[1])? (p[0] - p[1]): (p[1] - p[0])); + if(n_id >= 6) l_nid = 1; + + p[0] = z->w_list.a[i].y_start; p[1] = z->w_list.a[li].y_end + 1; + n_id = ((p[0] >= p[1])? (p[0] - p[1]): (p[1] - p[0])); + if(n_id >= 6) l_nid = 1; + } + + if(i + 1 == zwn) { + p[0] = z->w_list.a[i].x_end; p[1] = z->x_pos_e; + n_id = ((p[0] >= p[1])? 
(p[0] - p[1]): (p[1] - p[0])); + if(n_id >= 6) l_nid = 1; + } + + if(l_nid == 0) { + set_bit_extz_t(ez, (*z), i); ci = 0; + while (ci < ez.cigar.n && l_nid == 0) { + ci = pop_trace(&(ez.cigar), ci, &c, &cl); + if(c >= 2 && cl >= 6) l_nid = 1; + } + } + } + + li = i; + } + z->without_large_indel = (l_nid?0:1); + } + + int64_t srt_n = idx->n, s, e, t, rr; i = 0; + radix_sort_ec64(idx->a, idx->a+idx->n); + for (k = 1, i = 0; k < srt_n; k++) { + if (k == srt_n || (idx->a[k]>>32) != (idx->a[i]>>32)) { + if(k - i > 1) { + for (t = i; t < k; t++) { + cp = &(c_idx->a[(uint32_t)idx->a[t]]); + // s = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); + // assert(s == (int64_t)(idx->a[i]>>32)); + m = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); + m <<= 32; m += ((uint32_t)idx->a[t]); idx->a[t] = m; + // fprintf(stderr, "[M::%s] s::%ld\tsi::%lu\n", __func__, s, (idx->a[i]>>32)); + } + radix_sort_ec64(idx->a + i, idx->a + k); + } + i = k; + } + } + + // print_debug_ovlp_cigar(ol, idx, c_idx); + + ///second index + kv_resize(ul_ov_t, *c_idx, (c_idx->n<<1)); + ul_ov_t *idx_a = NULL, *idx_b = NULL; + idx_a = c_idx->a; idx_b = c_idx->a; + memcpy(idx_b, idx_a, c_idx->n * (sizeof((*(idx_a))))); + + kv_resize(uint64_t, *buf, ((wl<<1) + idx->n)); buf->n = ((wl<<1) + idx->n); + memcpy(buf->a + (wl<<1), idx->a, idx->n * (sizeof((*(idx->a))))); + memset(buf->a, 0, (wl<<1)*(sizeof((*(idx->a))))); + + cc_idx_t ii_a, ii_b; memset(&ii_a, 0, sizeof(ii_a)); memset(&ii_b, 0, sizeof(ii_b)); + + ii_a.c_idx = idx_a; ii_a.idx = idx; ii_a.i = ii_a.i0 = 0; ii_a.srt_n = ii_a.idx->n; ii_a.mms = ii_a.mme = (uint64_t)-1; + ii_b.c_idx = idx_b; ii_b.idx = buf; ii_b.i = ii_b.i0 = (wl<<1); ii_b.srt_n = ii_b.idx->n; ii_b.mms = ii_b.mme = (uint64_t)-1; + + s = 0; e = wl; e = ((e<=ql)?e:ql); rr = 0; + aux_o->w_list.n = aux_o->w_list.c.n = 0; ///for cigar + for (; s < ql; ) { + rn = iter_cc_idx_t(ol->list, &ii_a, s, e, rr, 0, &ra); + // debug_inter0(ol->list, 
ii_a.c_idx, ii_a.idx->a + ii_a.i0, ii_a.srt_n - ii_a.i0, ra, rn, s, e, 0, 1, "-0-"); + rr = wcns_vote(ol->list, rref, qu->seq, ql, tu, exz, ra, rn, s, e, ii_a.c_idx, &ii_b, occ_tot, occ_exact, aux_o, b32, cns, rid, &nec); + s += wl; e += wl; e = ((e<=ql)?e:ql); + } + + if(ii_b.mme > ii_b.mms && ii_b.mms != (uint64_t)-1) { + nec += push_cns_anchor(ol->list, rref, ii_b.mms, ii_b.mme, qu->seq, ql, tu, exz, &ii_b, aux_o, 0, occ_tot, occ_exact, b32, cns, rid); + } + + nec += push_cns_anchor(ol->list, rref, ql, ql, qu->seq, ql, tu, exz, &ii_b, aux_o, 1, occ_tot, occ_exact, b32, cns, rid); + + + + ///for debug + // zwn = aux_o->w_list.n; + // fprintf(stderr, "\n******[M::%s]****** wn::%ld, ql::%ld\n", __func__, zwn, ql); + // prt_correct0_dbg(aux_o); + // for (i = 0; i < zwn; i++) { + // fprintf(stderr, "[M::%s] (%ld)[%d, %d)\n", __func__, i, aux_o->w_list.a[i].x_start, aux_o->w_list.a[i].x_end+1); + // } + return nec; +} + +void push_nec_re(overlap_region *in, asg16_v *ou) +{ + assert(in->w_list.n == 1); + uint32_t n1 = in->w_list.a[0].clen; + if(n1 > ou->m) { + REALLOC(ou->a, n1); ou->m = n1; + } + ou->n = n1; + memcpy(ou->a, in->w_list.c.a + in->w_list.a[0].cidx, n1*sizeof((*(in->w_list.c.a)))); +} + +uint32_t extract_max_exact_sub(asg16_v *in, int64_t xs0, int64_t xe0, int64_t ys0, int64_t ye0, int64_t *exk, int64_t *eyk, int64_t *eck, uint64_t *rxs, uint64_t *rxe, uint64_t *rys, uint64_t *rye) +{ + int64_t xk = *exk, yk = *eyk, ck = *eck, cn = in->n, ol, wx[2], wy[2], os, oe; uint16_t op, bq, bt; uint32_t cl, ovlp; + *rxs = *rxe = *rys = *rye = 0; + + if((ck < 0) || (ck > cn)) {//(*ck) == cn is allowed + ck = 0; xk = 0; yk = 0; + } + + while (ck > 0 && xk >= xs0) {///x -> t; y -> p; first insertion and then match/mismatch + --ck; + op = in->a[ck]>>14; + // ol = (((op == 1) || (op == 2))?(in->a[ck]&(0xfff)):(in->a[ck]&(0x3fff))); + if((op == 2) || (op == 3)) { + ol = in->a[ck]&(0xfff); + } else if(op == 1) { + ol = in->a[ck]&(0x3ff); + } else { + ol = 
in->a[ck]&(0x3fff); + } + if(op != 2) xk -= ol; + if(op != 3) yk -= ol; + } + + while (ck < cn && xk < xe0) { + wx[0] = xk; wy[0] = yk; + // ck = pop_trace_bp(in, ck, &op, &b, &cl); + ck = pop_trace_bp_f(in, ck, &op, &bq, &bt, &cl); + if(op != 2) xk += cl; + if(op != 3) yk += cl; + wx[1] = xk; wy[1] = yk; + if(op == 0) { + os = MAX(xs0, wx[0]); oe = MIN(xe0, wx[1]); + ovlp = ((oe>os)? (oe-os):0); + if((ovlp > 0) && (ovlp > (*rxe) - (*rxs))) { + (*rxs) = wy[0] + os - wx[0]; + (*rxe) = wy[0] + oe - wx[0]; + + (*rys) = ys0 + os - xs0; + (*rye) = ys0 + oe - xs0; + } + } + } + + *exk = xk; *eyk = yk; *eck = ck; + if((*rxe) > (*rxs)) return 1; + return 0; +} + +void debug_extract_max_exact_sub(uint32_t qid, UC_Read* qu, UC_Read* tu) +{ + uint32_t ci = 0, len, xk, yk, wx[2], wy[2], k; uint16_t c, b; + + recover_UC_Read(tu, &R_INF, qid); + + ci = 0; yk = 0; + while (ci < scc.a[qid].n) { + ci = pop_trace_bp(&scc.a[qid], ci, &c, &b, &len); + if(c != 3) yk += len; + } + resize_UC_Read(qu, yk); + + ci = 0; xk = yk = 0; + while (ci < scc.a[qid].n) { + wx[0] = xk; wy[0] = yk; + ci = pop_trace_bp(&scc.a[qid], ci, &c, &b, &len); + if(c != 2) xk += len; + if(c != 3) yk += len; + wx[1] = xk; wy[1] = yk; + if(c == 0) { + // memcpy(p->a + wy[0], p->z.seq + wx[0], (wx[1]-wx[0])*sizeof((*(p->a)))); + for (; wx[0] < wx[1]; wx[0]++, wy[0]++) { + qu->seq[wy[0]] = tu->seq[wx[0]]; + } + } else if(c == 1 || c == 2) { + for (k = wy[0]; k < wy[1]; k++) { + qu->seq[k] = s_H[b]; + } + } + // if(i == 700) fprintf(stderr, "|%u%c(%c)(x::%u)(y::%u)", len, cm[c], ((c==1)||(c==2))?(cc[b]):('*'), wx[1], wy[1]); // s_H + } +} + +uint64_t extract_max_exact(overlap_region *z, asg16_v *ec, /**UC_Read *qu, UC_Read *tu,**/ uint32_t *rxs, uint32_t *rxe, uint32_t *rys, uint32_t *rye) +{ + // if(z->x_id == 75 && z->y_id == 59) { + // fprintf(stderr, "[M::%s]\tz->y_id::%u\n", __func__, z->y_id); + // if(z->y_pos_strand) { + // recover_UC_Read_RC(tu, &R_INF, z->y_id); + // } else { + // recover_UC_Read(tu, &R_INF, 
z->y_id); ///b->z.length + // } + // } + + *rxs = *rxe = *rys = *rye = (uint32_t)-1; + uint64_t k, rx[2], ry[2], xk, yk, wx[2], wy[2], mx[2], my[2]; + uint32_t cl, ck; uint16_t c; asg16_v ct; int64_t exk, eyk, eck; + exk = eyk = eck = 0; mx[0] = mx[1] = my[0] = my[1] = 0; + for (k = 0; k < z->w_list.n; k++) { + ct.a = z->w_list.c.a + z->w_list.a[k].cidx; + ct.n = ct.m = z->w_list.a[k].clen; ck = 0; + xk = z->w_list.a[k].x_start; yk = z->w_list.a[k].y_start; + while (ck < ct.n) { + wx[0] = xk; wy[0] = yk; + ck = pop_trace(&ct, ck, &c, &cl); + if(c != 2) xk += cl; + if(c != 3) yk += cl; + wx[1] = xk; wy[1] = yk; + // if(c == 0) { + // if(memcmp(qref + wx[0], tu->seq + wy[0], wx[1] - wx[0])) { + // fprintf(stderr, "-0-[M::%s]\teq::[%lu,\t%lu)\tet::[%lu,\t%lu)\t%c\n", __func__, wx[0], wx[1], wy[0], wy[1], "+-"[z->y_pos_strand]); + // // exit(1); + // } + // } + if(wx[1] <= wx[0]) continue; + if((wx[1] - wx[0]) <= (mx[1] - mx[0])) continue; + if((c == 0) && (extract_max_exact_sub(ec, wx[0], wx[1], wy[0], wy[1], &exk, &eyk, &eck, &(rx[0]), &(rx[1]), &(ry[0]), &(ry[1])))) { + // if(z->x_id == 75 && z->y_id == 59) { + // if(memcmp(qu->seq + rx[0], tu->seq + ry[0], rx[1] - rx[0])) { + // fprintf(stderr, "-1-[M::%s]\tzq::[%lu,\t%lu)\tzt::[%lu,\t%lu)\teq::[%lu,\t%lu)\tet::[%lu,\t%lu)\n", + // __func__, wx[0], wx[1], wy[0], wy[1], rx[0], rx[1], ry[0], ry[1]); + // exit(1); + // } + // else { + // fprintf(stderr, "-2-[M::%s]\tzq::[%lu,\t%lu)\tzt::[%lu,\t%lu)\teq::[%lu,\t%lu)\tet::[%lu,\t%lu)\n", + // __func__, wx[0], wx[1], wy[0], wy[1], rx[0], rx[1], ry[0], ry[1]); + + // fprintf(stderr, "[M::%s] qstr::%.*s\n", __func__, ((int)(rx[1] - rx[0])), qu->seq + rx[0]); + // fprintf(stderr, "[M::%s] tstr::%.*s\n", __func__, ((int)(ry[1] - ry[0])), tu->seq + ry[0]); + // } + // } + if((rx[1] - rx[0]) > (mx[1] - mx[0])) { + mx[0] = rx[0]; mx[1] = rx[1]; + my[0] = ry[0]; my[1] = ry[1]; + } + } + } + } + + if(mx[1] > mx[0]) { + *rxs = mx[0]; *rxe = mx[1]; + *rys = my[0]; *rye = my[1]; + 
return 1; + } + + return 0; +} + +void push_ne_ovlp(ma_hit_t_alloc* paf, overlap_region_alloc* ov, uint32_t flag, All_reads* R_INF, asg16_v *ec/**, uint64_t qid, UC_Read *qu, UC_Read *tu**/) +{ + // if(qu && tu) { + // debug_extract_max_exact_sub(qid, qu, tu); + // } + uint64_t k, n; ma_hit_t *z; uint32_t rxs, rxe, rys, rye; + for (k = n = 0; k < ov->length; k++) { + if(ov->list[k].is_match == flag) n++; + } + + if(n > paf->size) { + paf->size = n; + REALLOC(paf->buffer, paf->size); + } + + for (k = paf->length = 0; k < ov->length; k++) { + if(ov->list[k].is_match == flag) { + // fprintf(stderr, "@%s\tSN:%.*s(id::%u)\terr::%u\n", flag==1?"SQ":"RQ", (int32_t)Get_NAME_LENGTH((*R_INF), ov->list[k].y_id), Get_NAME((*R_INF), ov->list[k].y_id), ov->list[k].y_id, ov->list[k].non_homopolymer_errors); + + z = &(paf->buffer[paf->length++]); + + z->qns = ov->list[k].x_id; + z->qns = z->qns << 32; + z->tn = ov->list[k].y_id; + + z->qns = z->qns | (uint64_t)(ov->list[k].x_pos_s); + z->qe = ov->list[k].x_pos_e + 1; + z->ts = ov->list[k].y_pos_s; + z->te = ov->list[k].y_pos_e + 1; + + ///for overlap_list, the x_strand of all overlaps are 0, so the tmp.rev is the same as the y_strand + z->rev = ov->list[k].y_pos_strand; + + z->bl = Get_READ_LENGTH((*R_INF), ov->list[k].y_id); + z->ml = ov->list[k].strong; + z->no_l_indel = ov->list[k].without_large_indel; + + if(ec) { + extract_max_exact(&ov->list[k], ec, /**qu, tu,**/ &rxs, &rxe, &rys, &rye); + z->el = 0; + // fprintf(stderr, "[M::%s]\tq::[%u,\t%u)\tt::[%u,\t%u)\teq::[%u,\t%u)\tet::[%u,\t%u)\n", __func__, ov->list[k].x_pos_s, ov->list[k].x_pos_e + 1, ov->list[k].y_pos_s, ov->list[k].y_pos_e + 1, rxs, rxe, rys, rye); + if(rxe > rxs) { + z->qns = ov->list[k].x_id; + z->qns = z->qns << 32; + z->qns = z->qns | (uint64_t)(rxs); + z->qe = rxe; + z->ts = rys; + z->te = rye; + + z->el = 1; + } + } + } + } +} + +void push_ff_ovlp(ma_hit_t_alloc* paf, overlap_region_alloc* ov, uint32_t flag, All_reads* R_INF, uint64_t *cnt) +{ + // if(qu 
&& tu) { + // debug_extract_max_exact_sub(qid, qu, tu); + // } + uint64_t k, n; ma_hit_t *z; + for (k = n = 0; k < ov->length; k++) { + if(ov->list[k].is_match == flag) n++; + } + + if(n > paf->size) { + paf->size = n; + REALLOC(paf->buffer, paf->size); + } + + for (k = paf->length = 0; k < ov->length; k++) { + if(ov->list[k].is_match == flag) { + z = &(paf->buffer[paf->length++]); + + z->qns = ov->list[k].x_id; + z->qns = z->qns << 32; + z->tn = ov->list[k].y_id; + + z->qns = z->qns | (uint64_t)(ov->list[k].x_pos_s); + z->qe = ov->list[k].x_pos_e + 1; + z->ts = ov->list[k].y_pos_s; + z->te = ov->list[k].y_pos_e + 1; + + ///for overlap_list, the x_strand of all overlaps are 0, so the tmp.rev is the same as the y_strand + z->rev = ov->list[k].y_pos_strand; + + z->bl = Get_READ_LENGTH((*R_INF), ov->list[k].y_id); + z->ml = ov->list[k].strong; + z->no_l_indel = ov->list[k].without_large_indel; + z->el = ov->list[k].shared_seed; + + if(z->rev) { + z->ts = z->bl - ov->list[k].y_pos_e - 1; + z->te = z->bl - ov->list[k].y_pos_s; + } + + if(flag == 1) { + if(z->ml == 1) cnt[2]++; + if(z->ml == 0) cnt[3]++; + if(z->el == 1) cnt[4]++; + if(z->no_l_indel) cnt[5]++; + } + } + } + + if(flag == 1) cnt[0] += paf->length; + if(flag == 2) cnt[1] += paf->length; +} + +void debug_mm_exact_cigar(overlap_region_alloc* ol, uint32_t qid, UC_Read *qu, UC_Read *tu) +{ + int64_t on = ol->length, k, i, zwn, xk, yk; uint32_t cl, ck; ///bit_extz_t ez; + overlap_region *z; asg16_v ct; uint64_t wx[2], wy[2]; uint16_t c; + recover_UC_Read(qu, &R_INF, qid); + + for (i = 0; i < on; i++) { + z = &(ol->list[i]); zwn = z->w_list.n; + if(z->y_pos_strand) { + recover_UC_Read_RC(tu, &R_INF, z->y_id); + } else { + recover_UC_Read(tu, &R_INF, z->y_id); ///b->z.length + } + + // fprintf(stderr, "[M::%s] x_id::%u\ty_id::%u\tx::[%u, %u)\ty::[%u, %u)\tzwn::%ld\n", + // __func__, z->x_id, z->y_id, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1, zwn); + // fprintf(stderr, "qstr(%lld)::%.*s\n", 
qu->length, (int32_t)(qu->length), qu->seq); + // fprintf(stderr, "tstr(%lld)::%.*s\n", tu->length, (int32_t)(tu->length), tu->seq); + + for (k = 0; k < zwn; k++) { + // set_bit_extz_t(ez, (*z), k); + // if(!cigar_check(tu->seq, qu->seq, &ez)) { + // fprintf(stderr, "\n[M::%s] x_id::%u, y_id::%u, x::[%u, %u), y::[%u, %u)\n", __func__, z->x_id, z->y_id, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); + // exit(1); + // } + // continue; + ct.a = z->w_list.c.a + z->w_list.a[k].cidx; + ct.n = ct.m = z->w_list.a[k].clen; ck = 0; + xk = z->w_list.a[k].x_start; yk = z->w_list.a[k].y_start; + while (ck < ct.n) { + wx[0] = xk; wy[0] = yk; + ck = pop_trace(&ct, ck, &c, &cl); + if(c != 2) xk += cl; + if(c != 3) yk += cl; + wx[1] = xk; wy[1] = yk; + if(c == 0) { + if(memcmp(qu->seq + wx[0], tu->seq + wy[0], wx[1] - wx[0])) { + fprintf(stderr, "\n-0-[M::%s]\teq::[%lu,\t%lu)\tet::[%lu,\t%lu)\t%c\n", __func__, wx[0], wx[1], wy[0], wy[1], "+-"[z->y_pos_strand]); + fprintf(stderr, "qstr(%u)::%.*s\n", z->x_id, (int32_t)(wx[1] - wx[0]), qu->seq + wx[0]); + fprintf(stderr, "tstr(%u)::%.*s\n", z->y_id, (int32_t)(wy[1] - wy[0]), tu->seq + wy[0]); + // exit(1); + } + // else { + // fprintf(stderr, "\n-1-[M::%s]\teq::[%lu,\t%lu)\tet::[%lu,\t%lu)\t%c\n", __func__, wx[0], wx[1], wy[0], wy[1], "+-"[z->y_pos_strand]); + // fprintf(stderr, "qstr(%u)::%.*s\n", z->x_id, (int32_t)(wx[1] - wx[0]), qu->seq + wx[0]); + // fprintf(stderr, "tstr(%u)::%.*s\n", z->y_id, (int32_t)(wy[1] - wy[0]), tu->seq + wy[0]); + // } + } + } + } + + } +} + +void check_well_cal(asg16_v *sc, asg64_v *idx, uint8_t *f_ec, uint8_t *abnormal, int64_t len, int64_t min_dp, ma_hit_t_alloc *in) +{ + uint64_t k, s, e; int64_t dp, old_dp, st = 0, ed; ma_hit_t *z; + + (*f_ec) = 1; (*abnormal) = 0; idx->n = 0; + for (k = 0; k < in->length; k++) { + z = &(in->buffer[k]); + s = ((uint32_t)(z->qns)); e = z->qe; + kv_push(uint64_t, (*idx), (s<<1)); + kv_push(uint64_t, (*idx), (e<<1)|1); + } + + radix_sort_ec64(idx->a, idx->a 
/**
 * Test whether qstr[qs, qe) and tstr[ts, te) are byte-for-byte identical.
 *
 * Both intervals are half-open. The call answers 0 instead of touching
 * memory when an interval is reversed, starts below 0, or reaches past the
 * length given for its string (ql for qstr, tl for tstr). Two empty
 * intervals that lie inside their strings compare equal.
 *
 * Declared static: the helper is defined in a source file and is meant for
 * callers in the same translation unit.
 *
 * @param qstr,ql  query bytes and their count
 * @param tstr,tl  target bytes and their count
 * @param qs,qe    interval on the query
 * @param ts,te    interval on the target
 * @return 1 when both intervals are valid, equally long and equal; else 0
 */
static inline uint64_t exact_ec_check(char *qstr, uint64_t ql, char *tstr, uint64_t tl, int64_t qs, int64_t qe, int64_t ts, int64_t te)
{
    /* reject malformed intervals before any length arithmetic reaches memcmp() */
    if (qs < 0 || ts < 0 || qe < qs || te < ts) return 0;
    /* reject intervals that run off the end of either buffer */
    if ((uint64_t)qe > ql || (uint64_t)te > tl) return 0;
    if (qe - qs != te - ts) return 0;
    return (memcmp(qstr + qs, tstr + ts, (size_t)(qe - qs)) == 0) ? 1 : 0;
}
on); + for (k = i = nec = 0; k < on; k++) { + z = &(ol->list[(uint32_t)oi[k]]); tid = z->y_id; trev = z->y_pos_strand; + for (; (i < en) && ((ei[i]>>32) < ((tid<<1)|trev)); i++); + if((i < en) && ((ei[i]>>32) == ((tid<<1)|trev))) { + p = &(in->buffer[(uint32_t)ei[i]]); + if((z->x_pos_s == ((uint32_t)p->qns)) && (z->x_pos_e + 1 == p->qe) && + (z->y_pos_s == p->ts) && (z->y_pos_e + 1 == p->te)) { + resize_UC_Read(tu, p->te - p->ts); recover_UC_Read_sub_region(tu->seq, p->ts, p->te - p->ts, trev, rref, tid); + if(exact_ec_check(qu->seq, qu->length, tu->seq, p->te - p->ts, ((uint32_t)p->qns), p->qe, 0, p->te - p->ts)) { + z->is_match = 1; z->shared_seed = z->non_homopolymer_errors;///for index + z->non_homopolymer_errors = 0; z->strong = z->without_large_indel = 0; + set_exact_exz(exz, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); push_alnw(z, exz); + nec++; + } + } + } + } + + if(on > nec) gen_hc_r_alin_nec(ol, cl, rref, qu, tu, exz, aux_o, e_rate, wl, rid, khit, move_gap, buf); + } +} + +void prt_ovlp_sam_0(char *cm, FILE *fp, char *ref_id, int32_t ref_id_n, char *qry_id, int32_t qry_id_n, char *qry_seq, uint64_t qry_seq_n, uint64_t rs, uint64_t re, uint64_t qs, uint64_t qe, uint64_t flag, uint64_t err, bit_extz_t *ez) +{ + uint64_t ci = 0; uint16_t c; uint32_t cl; + fprintf(fp, "%.*s\t%lu\t%.*s\t%lu\t60\t", qry_id_n, qry_id, flag, ref_id_n, ref_id, rs + 1); + + if(qs) fprintf(fp, "%luS", qs); + while (ci < ez->cigar.n) { + ci = pop_trace(&(ez->cigar), ci, &c, &cl); + fprintf(fp, "%u%c", cl, cm[c]); + } + if(qry_seq_n > qe) fprintf(fp, "%luS", qry_seq_n - qe); + fprintf(fp, "\t*\t0\t0\t%.*s\t", (int32_t)qry_seq_n, qry_seq); + for (ci = 0; ci < qry_seq_n; ci++) fprintf(fp, "~"); + fprintf(fp, "\tNM:i:%lu\n", err); +} + + +void prt_ovlp_sam(overlap_region_alloc* ol, UC_Read* tu, char *ref_seq, int32_t ref_seq_n) +{ + int64_t on = ol->length, k, i, zwn; overlap_region *z; bit_extz_t ez; + char *qry = NULL, *ref = Get_NAME(R_INF, ol->list[0].x_id); + uint64_t 
qry_n = 0, ref_n = Get_NAME_LENGTH(R_INF, ol->list[0].x_id), qid, rev; + char cm[4]; cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + FILE *fp = fopen("aln.sam", "w"); + fprintf(fp, "@HD\tVN:1.6\tSO:unknown\n"); + fprintf(fp, "@SQ\tSN:%.*s\tLN:%lu\n", (int32_t)ref_n, ref, Get_READ_LENGTH(R_INF, ol->list[0].x_id)); + for (k = 0; k < on; k++) { + z = &(ol->list[k]); zwn = z->w_list.n; + if(!zwn) continue; + qid = ol->list[k].y_id; + if(z->y_pos_strand) { + recover_UC_Read_RC(tu, &R_INF, qid); rev = 16; + } else { + recover_UC_Read(tu, &R_INF, qid); rev = 0; + } + for (i = 0; i < zwn; i++) { + if(is_ualn_win(z->w_list.a[i])) continue; + qry_n = Get_NAME_LENGTH(R_INF, qid); + qry = Get_NAME(R_INF, qid); + set_bit_extz_t(ez, (*z), i); + + prt_ovlp_sam_0(cm, fp, ref, ref_n, qry, qry_n, tu->seq, tu->length, z->w_list.a[i].x_start, z->w_list.a[i].x_end + 1, z->w_list.a[i].y_start, z->w_list.a[i].y_end + 1, rev, z->w_list.a[i].error, &ez); + } + } + fclose(fp); + + fp = fopen("ref.fa", "w"); + fprintf(fp, ">%.*s\n", (int32_t)ref_n, ref); + fprintf(fp, "%.*s\n", ref_seq_n, ref_seq); + fclose(fp); +} + +static void worker_hap_ec(void *data, long i, int tid) +{ + ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); + uint32_t high_occ = asm_opt.hom_cov * (2.0 - HA_KMER_GOOD_RATIO); + uint32_t low_occ = asm_opt.hom_cov * HA_KMER_GOOD_RATIO; + overlap_region *aux_o = NULL; asg64_v buf0; uint32_t qlen = 0; + + // if (memcmp(/**"m64062_190807_194840/180552420/ccs"**/"m64062_190803_042216/161743554/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // fprintf(stderr, "-a-[M::%s-beg] rid->%ld\n", __func__, i); + // } else { + // return; + // } + + // if(i != 3028559) return; + // if(i != 306) return; + // if(i != 1124) return; + // if(i != 700) return; + // if(i != 2243244) return; + // if(i != 19350) return; + + recover_UC_Read(&b->self_read, &R_INF, i); qlen = b->self_read.length; + + h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, 
asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, 0.02, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, /**0**/2, UINT32_MAX); + + // b->num_read_base += b->olist.length; + b->cnt[0] += b->self_read.length; + + aux_o = fetch_aux_ovlp(&b->olist);///must be here + + // gen_hc_r_alin(&b->olist, &b->clist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, aux_o, asm_opt.max_ov_diff_ec, WINDOW_HC, i, E_KHIT/**asm_opt.k_mer_length**/, 1, &b->v16); + gen_hc_r_alin_ea(&b->olist, &b->clist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, aux_o, asm_opt.max_ov_diff_ec, WINDOW_HC, i, E_KHIT/**asm_opt.k_mer_length**/, 1, &b->v16, &b->v64, &(R_INF.paf[i])); + + // prt_ovlp_sam(&b->olist, &b->ovlp_read, b->self_read.seq, b->self_read.length); + + + // fprintf(stderr, "\n[M::%s] rid::%ld\t%.*s\tlen::%lld\tocc::%lu\n", __func__, i, (int)Get_NAME_LENGTH(R_INF, i), + // Get_NAME(R_INF, i), b->self_read.length, b->olist.length); + + // fprintf(stderr, "[M::%s] rid::%ld\n", __func__, i); + // debug_mm_exact_cigar(&b->olist, i, &b->self_read, &b->ovlp_read); + + // b->num_correct_base += b->olist.length; + + copy_asg_arr(buf0, b->sp); + rphase_hc(&b->olist, &R_INF, &b->hap, &b->self_read, &b->ovlp_read, &b->pidx, &b->v64, &buf0, 0, WINDOW_MAX_SIZE, b->self_read.length, 1/**, 0**/, i); + copy_asg_arr(b->sp, buf0); + + copy_asg_arr(buf0, b->sp); + b->cnt[1] += wcns_gen(&b->olist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, &b->pidx, &b->v64, &buf0, 0, 512, b->self_read.length, 3, 0.500001, aux_o, &b->v32, &b->cns, 256, i); + copy_asg_arr(b->sp, buf0); + + push_nec_re(aux_o, &(scc.a[i])); + push_nec_re(aux_o, &(scb.a[i])); + + + + push_ne_ovlp(&(R_INF.paf[i]), &b->olist, 1, &R_INF, &(scc.a[i])/**, i, &b->self_read, &b->ovlp_read**/); + push_ne_ovlp(&(R_INF.reverse_paf[i]), &b->olist, 2, &R_INF, NULL/**, i, NULL, NULL**/); + + + check_well_cal(&(scc.a[i]), &b->v64, &(R_INF.paf[i].is_fully_corrected), &(R_INF.paf[i].is_abnormal), qlen, 
(MIN_COVERAGE_THRESHOLD*2), &(R_INF.paf[i])); + R_INF.trio_flag[i] = AMBIGU; + + // uint32_t k; + // for (k = 0; k < b->olist.length; k++) { + // if(b->olist.list[k].is_match == 1) b->num_recorrect_base++; + // } + + // exit(1); + + + + // prt_chain(&b->olist); + + // ul_map_lchain(b->abl, (uint32_t)-1, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->clist, s->opt->bw_thres, + // s->opt->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2/**0.75**/, 2, 3); + + /** + int fully_cov, abnormal; + // if(i != 12578) return; + // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); + // if (memcmp("7897e875-76e5-42c8-bc37-94b370c4cc8d", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); + // } else { + // return; + // } + + ha_get_candidates_interface(b->ab, i, &b->self_read, &b->olist, &b->olist_hp, &b->clist, + 0.02, asm_opt.max_n_chain, 1, NULL, &b->r_buf, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), &(b->tmp_region), NULL, &(b->sp)); + + clear_Cigar_record(&b->cigar1); + clear_Round2_alignment(&b->round2); + + correct_overlap(&b->olist, &R_INF, &b->self_read, &b->correct, &b->ovlp_read, &b->POA_Graph, &b->DAGCon, + &b->cigar1, &b->hap, &b->round2, &b->r_buf, &(b->tmp_region.w_list), 0, 1, &fully_cov, &abnormal); + + b->num_read_base += b->self_read.length; + b->num_correct_base += b->correct.corrected_base; + b->num_recorrect_base += b->round2.dumy.corrected_base; + + push_cigar(R_INF.cigars, i, &b->cigar1); + push_cigar(R_INF.second_round_cigar, i, &b->round2.cigar); + + R_INF.paf[i].is_fully_corrected = 0; + if (fully_cov) { + if (get_cigar_errors(&b->cigar1) == 0 && get_cigar_errors(&b->round2.cigar) == 0) + R_INF.paf[i].is_fully_corrected = 1; + } + R_INF.paf[i].is_abnormal = abnormal; + + R_INF.trio_flag[i] = AMBIGU; + + ///need to be fixed in r305 + // if(ha_idx_hp == NULL) + // { + // R_INF.trio_flag[i] += collect_hp_regions(&b->olist, &R_INF, 
&(b->k_flag), RESEED_HP_RATE, Get_READ_LENGTH(R_INF, i), NULL); + // } + + if (R_INF.trio_flag[i] != AMBIGU || b->save_ov) { + int is_rev = (asm_opt.number_of_round % 2 == 0); + push_overlaps(&(R_INF.paf[i]), &b->olist, 1, &R_INF, is_rev); + push_overlaps(&(R_INF.reverse_paf[i]), &b->olist, 2, &R_INF, is_rev); + } + + if(het_cnt) het_cnt[i] = get_het_cnt(&b->hap); + // fprintf(stderr, "[M::%s-end] rid->%ld\n", __func__, i); + **/ + // exit(1); + refresh_ec_ovec_buf_t0(b, REFRESH_N); +} + +uint32_t adjust_exact_match(asg16_v *in, int64_t xs0, int64_t xe0, int64_t ys0, int64_t ye0, uint64_t *rxs, uint64_t *rxe, uint64_t *rys, uint64_t *rye, uint32_t rev) +{ + *rxs = *rxe = *rys = *rye = 0; + if((xe0 <= xs0) || (ye0 <= ys0)) return 0; + int64_t xk, yk, ck, cn = in->n, wx[2], wy[2], os, oe; uint16_t op, bq, bt; uint32_t cl; uint64_t ovlp; + xk = yk = 0; + + // fprintf(stderr, "[M::%s]\tx0::[%ld,%ld)\ty0::[%ld,%ld)\trev::%u\n", __func__, xs0, xe0, ys0, ye0, rev); + + if(!rev) { + ck = 0; + while (ck < cn && xk < xe0) { + wx[0] = xk; wy[0] = yk; + // ck = pop_trace_bp(in, ck, &op, &b, &cl); + ck = pop_trace_bp_f(in, ck, &op, &bq, &bt, &cl); + if(op != 2) xk += cl; + if(op != 3) yk += cl; + wx[1] = xk; wy[1] = yk; + // fprintf(stderr, "[M::%s]\told::[%ld,%ld]\tnew::[%ld,%ld]\t%u%c\n", __func__, wx[0], wx[1], wy[0], wy[1], cl, "MSID"[op]); + if(op == 0) { + os = MAX(xs0, wx[0]); oe = MIN(xe0, wx[1]); + ovlp = ((oe>os)? 
(oe-os):0); + if((ovlp > 0) && (ovlp > (*rxe) - (*rxs))) { + // fprintf(stderr, "[M::%s]\to::[%ld,%ld)\n", __func__, os, oe); + (*rxs) = wy[0] + os - wx[0]; + (*rxe) = wy[0] + oe - wx[0]; + + (*rys) = ys0 + os - xs0; + (*rye) = ys0 + oe - xs0; + } + } + // if((op == 0) && (wx[0] <= s) && (wx[1] >= e)) { + // (*rs) = wy[0] + s - wx[0]; + // (*re) = wy[0] + e - wx[0]; + // } + } + } else { + ck = cn - 1; + while (ck >= 0 && xk < xe0) { + wx[0] = xk; wy[0] = yk; + // ck = pop_trace_bp_rev(in, ck, &op, &b, &cl); + ck = pop_trace_bp_rev_f(in, ck, &op, &bq, &bt, &cl); + if(op != 2) xk += cl; + if(op != 3) yk += cl; + wx[1] = xk; wy[1] = yk; + if(op == 0) { + os = MAX(xs0, wx[0]); oe = MIN(xe0, wx[1]); + ovlp = ((oe>os)? (oe-os):0); + if((ovlp > 0) && (ovlp > (*rxe) - (*rxs))) { + (*rxs) = wy[0] + os - wx[0]; + (*rxe) = wy[0] + oe - wx[0]; + + (*rys) = ys0 + os - xs0; + (*rye) = ys0 + oe - xs0; + } + } + // if((op == 0) && (wx[0] <= s) && (wx[1] >= e)) { + // (*rs) = wy[0] + s - wx[0]; + // (*re) = wy[0] + e - wx[0]; + // } + } + } + + + return (*rxe) - (*rxs); +} + +uint32_t quick_exact_match(ma_hit_t *z, All_reads *rref, UC_Read* qu, UC_Read* tu, cc_v *sc) +{ + uint64_t rts, rte, rqs, rqe, f = 0; int64_t ql, tl, qr, tr, qs, qe, ts, te; + + // fprintf(stderr, "-0-[M::%s]\tf::%lu\n", __func__, f); + if(adjust_exact_match(&(sc->a[z->tn]), z->ts, z->te, ((uint32_t)(z->qns)), z->qe, &rts, &rte, &rqs, &rqe, z->rev)) { + z->ts = rts; z->te = rte; f = 1; + z->qns >>= 32; z->qns <<= 32; z->qns |= ((uint64_t)(rqs)); z->qe = rqe; + + ///debug + // qs = rqs; qe = rqe; ts = rts; te = rte; + // resize_UC_Read(tu, te - ts); + // recover_UC_Read_sub_region(tu->seq, ts, te - ts, z->rev, rref, z->tn); + + // fprintf(stderr, "[M::%s]\trq::[%lu,%lu)\trt::[%lu,%lu)\n", __func__, rqs, rqe, rts, rte); + // fprintf(stderr, "-0-[M::%s] qstr::%.*s\n", __func__, ((int)(qe - qs)), qu->seq + qs); + // fprintf(stderr, "-0-[M::%s] tstr::%.*s\n", __func__, ((int)(te - ts)), tu->seq); + + // 
if(memcmp(qu->seq + qs, tu->seq, qe - qs) == 0) { + // fprintf(stderr, "-0-[M::%s]\tsb\n", __func__); + // } else { + // fprintf(stderr, "-1-[M::%s]\tsa\n", __func__); + // } + } + // fprintf(stderr, "-1-[M::%s]\tf::%lu\n", __func__, f); + ql = qu->length; tl = Get_READ_LENGTH((*rref), z->tn); + qs = ((uint32_t)(z->qns)); qe = z->qe; ts = z->ts; te = z->te; + if(qs >= ql) qs = ql; if(qe > ql) qe = ql; if(qe <= qs) f = 0; + // fprintf(stderr, "-2-[M::%s]\tf::%lu\n", __func__, f); + if(ts >= tl) ts = tl; if(te > tl) te = tl; if(te <= ts) f = 0; + // fprintf(stderr, "-3-[M::%s]\tf::%lu\n", __func__, f); + if((qe - qs) != (te - ts)) f = 0; + // fprintf(stderr, "-4-[M::%s]\tf::%lu\n", __func__, f); + + if(qs <= ts) { + ts -= qs; qs = 0; + } else { + qs -= ts; ts = 0; + } + + + qr = ql - qe; tr = tl - te; + if(qr <= tr) { + qe = ql; te += qr; + } else { + te = tl; qe += tr; + } + + // fprintf(stderr, "-5-[M::%s]\tzq::[%ld,\t%ld)\tzt::[%ld,\t%ld)\teq::[%u,\t%u)\tet::[%u,\t%u)\tql::%ld\ttl::%ld\tf::%lu\n", + // __func__, qs, qe, ts, te, ((uint32_t)(z->qns)), z->qe, z->ts, z->te, ql, tl, f); + + z->qns >>= 32; z->qns <<= 32; z->qns |= ((uint64_t)(qs)); z->qe = qe; + z->ts = ts; z->te = te; + + if((f) && ((te - ts) == (qe - qs)) && (qe > qs)) { + resize_UC_Read(tu, te - ts); + recover_UC_Read_sub_region(tu->seq, ts, te - ts, z->rev, rref, z->tn); + + // fprintf(stderr, "[M::%s] qstr::%.*s\n", __func__, ((int)(qe - qs)), qu->seq + qs); + // fprintf(stderr, "[M::%s] tstr::%.*s\n", __func__, ((int)(te - ts)), tu->seq); + + if(memcmp(qu->seq + qs, tu->seq, qe - qs) == 0) return 1; + } + + return 0; +} + +uint64_t cal_cov_re(asg64_v *idx, int64_t *k, uint64_t s, uint64_t e) +{ + uint64_t *a = idx->a, ws, we, cn, os, oe, ovlp, tot = 0; int64_t n = idx->n; + if(n <= 0) return 0; + if((*k) >= n) (*k) = 0; + while (((*k) > 0) && (((uint32_t)a[*k]) > s)) (*k) -= 2; + if((*k) < 0) (*k) = 0; + + while (((*k) < n) && ((a[*k]>>32) < e)) { + ws = (a[*k]>>32); we = ((uint32_t)a[*k]); cn = 
a[(*k) + 1]; + os = MAX(s, ws); oe = MIN(e, we); + ovlp = ((oe>os)? (oe-os):0); + tot += (ovlp*cn); + (*k) += 2; + } + + return tot; +} + +uint64_t gen_hap_dc_cov(asg64_v *be, asg64_v *ba, ma_hit_t_alloc *paf, All_reads *rref, uint64_t wl, int64_t occ_exact, double occ_exact_rate, UC_Read* qu, UC_Read* tu, cc_v *sc, uint64_t rid) +{ + ma_hit_t *z; be->n = ba->n = 0; uint64_t k, s, e, vn, m; asg64_v *v = NULL; + int64_t dp, old_dp, st = 0, ed, ql = qu->length, qs, qe, ff; + for (k = 0; k < paf->length; k++) { + z = &(paf->buffer[k]); + + // if(rid == 16) { + // fprintf(stderr, "[M::%s]\tqn::%u::%.*s\ttn::%u::%.*s\t%c\tq::[%u,%u)\tq::[%u,%u)\tel::%u\n", __func__, + // (uint32_t)(z->qns>>32), (int)Get_NAME_LENGTH(R_INF, (uint32_t)(z->qns>>32)), Get_NAME(R_INF, (uint32_t)(z->qns>>32)), z->tn, (int)Get_NAME_LENGTH(R_INF, z->tn), Get_NAME(R_INF, z->tn), "+-"[z->rev], + // (uint32_t)z->qns, z->qe, z->ts, z->te, z->el); + // } + + if((z->el) && (quick_exact_match(z, rref, qu, tu, sc))) { + s = ((uint32_t)(z->qns)); e = z->qe; + kv_push(uint64_t, (*be), (s<<1)); + kv_push(uint64_t, (*be), (e<<1)|1); + z->el = 1; + // fprintf(stderr, "-gmm-[M::%s]\tqn::%u::%.*s\ttn::%u::%.*s\t%c\tq::[%u,%u)\tq::[%u,%u)\n", __func__, + // (uint32_t)(z->qns>>32), (int)Get_NAME_LENGTH(R_INF, (uint32_t)(z->qns>>32)), Get_NAME(R_INF, (uint32_t)(z->qns>>32)), z->tn, (int)Get_NAME_LENGTH(R_INF, z->tn), Get_NAME(R_INF, z->tn), "+-"[z->rev], + // (uint32_t)z->qns, z->qe, z->ts, z->te); + } else { + s = ((uint32_t)(z->qns)); e = z->qe; + kv_push(uint64_t, (*ba), (s<<1)); + kv_push(uint64_t, (*ba), (e<<1)|1); + z->el = 0; + // fprintf(stderr, "-gum-[M::%s]\tqn::%u::%.*s\ttn::%u::%.*s\t%c\tq::[%u,%u)\tq::[%u,%u)\n", __func__, + // (uint32_t)(z->qns>>32), (int)Get_NAME_LENGTH(R_INF, (uint32_t)(z->qns>>32)), Get_NAME(R_INF, (uint32_t)(z->qns>>32)), z->tn, (int)Get_NAME_LENGTH(R_INF, z->tn), Get_NAME(R_INF, z->tn), "+-"[z->rev], + // (uint32_t)z->qns, z->qe, z->ts, z->te); + } + /** + s = 
((uint32_t)(z->qns)); e = z->qe; + // sc->a[z->tn] + if(z->el) { + // if((z->qns>>32) == 75 && z->tn == 59) { + if(quick_exact_match(z, rref, qu, tu, sc)) { + s = ((uint32_t)(z->qns)); e = z->qe; + // fprintf(stderr, "-mm-[M::%s]\tqn::%u::%.*s\ttn::%u::%.*s\t%c\n", __func__, + // (uint32_t)(z->qns>>32), (int)Get_NAME_LENGTH(R_INF, (uint32_t)(z->qns>>32)), Get_NAME(R_INF, (uint32_t)(z->qns>>32)), z->tn, (int)Get_NAME_LENGTH(R_INF, z->tn), Get_NAME(R_INF, z->tn), "+-"[z->rev]); + } else { + s = ((uint32_t)(z->qns)); e = z->qe; + // fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\ttn::%u::%.*s\t%c\n", __func__, + // (uint32_t)(z->qns>>32), (int)Get_NAME_LENGTH(R_INF, (uint32_t)(z->qns>>32)), Get_NAME(R_INF, (uint32_t)(z->qns>>32)), z->tn, (int)Get_NAME_LENGTH(R_INF, z->tn), Get_NAME(R_INF, z->tn), "+-"[z->rev]); + } + // } + } + **/ + } + + + v = be; vn = v->n; + radix_sort_ec64(v->a, v->a+v->n); + for (k = 0, dp = 0, st = ed = 0; k < vn; ++k) { + old_dp = dp; + ///if a[j] is qe + if (v->a[k]&1) --dp; + else ++dp; + + ed = v->a[k]>>1; + if(ed > st) { + m = st; m <<= 32; m |= ((uint64_t)ed); + kv_push(uint64_t, (*v), m); + kv_push(uint64_t, (*v), old_dp); + if((old_dp + 1) < occ_exact) return 0;///+1 for self + } + st = ed; + } + ed = ql; old_dp = dp; + if(ed > st) { + m = st; m <<= 32; m |= ((uint64_t)ed); + kv_push(uint64_t, (*v), m); + kv_push(uint64_t, (*v), old_dp); + if((old_dp + 1) < occ_exact) return 0;///+1 for self + } + + for (k = vn, m = 0; k < v->n; k++) { + v->a[m++] = v->a[k]; + } + v->n = m; + + + v = ba; vn = v->n; + radix_sort_ec64(v->a, v->a+v->n); + for (k = 0, dp = 0, st = ed = 0; k < vn; ++k) { + old_dp = dp; + ///if a[j] is qe + if (v->a[k]&1) --dp; + else ++dp; + + ed = v->a[k]>>1; + if(ed > st) { + m = st; m <<= 32; m |= ((uint64_t)ed); + kv_push(uint64_t, (*v), m); + kv_push(uint64_t, (*v), old_dp); + } + st = ed; + } + ed = ql; old_dp = dp; + if(ed > st) { + m = st; m <<= 32; m |= ((uint64_t)ed); + kv_push(uint64_t, (*v), m); + kv_push(uint64_t, 
(*v), old_dp); + } + + for (k = vn, m = 0; k < v->n; k++) { + v->a[m++] = v->a[k]; + } + v->n = m; + + + + + + + + ///debug + // v = be; + // for (k = 0; k < v->n; k += 2) { + // fprintf(stderr, "-be-[M::%s]\tq::[%lu,%u)\tocc::%lu\n", __func__, v->a[k]>>32, (uint32_t)v->a[k], v->a[k+1]); + // } + + // v = ba; + // for (k = 0; k < v->n; k += 2) { + // fprintf(stderr, "-ba-[M::%s]\tq::[%lu,%u)\tocc::%lu\n", __func__, v->a[k]>>32, (uint32_t)v->a[k], v->a[k+1]); + // } + + + + + + + + + + int64_t ke = 0, ka = 0, cc[2]; + qs = 0; qe = wl; qe = ((qe<=ql)?qe:ql); + for (; qs < ql; ) { + cc[0] = cal_cov_re(be, &ke, qs, qe); + cc[1] = cal_cov_re(ba, &ka, qs, qe); + + // fprintf(stderr, "[M::%s]\tq::[%ld,%ld)\tcc[0]::%ld\tcc[1]::%ld\n", __func__, qs, qe, cc[0], cc[1]); + + ff = 0; + if((cc[0]) && (cc[0] > cc[1])) { + cc[0] += qe - qs;///+(qe - qs) for self + cc[1] += cc[0]; + if(cc[0] > (cc[1]*occ_exact_rate)) { + ff = 1; + } + } + + if(ff == 0) return 0; + // fprintf(stderr, "[M::%s-beg]\tq::[%ld,%ld)\tcc[0]::%ld\tcc[1]::%ld\n", __func__, qs, qe, cc[0], cc[1]); + qs += wl; qe += wl; qe = ((qe<=ql)?qe:ql); + } + + + return 1; +} + +static void worker_hap_dc_ec(void *data, long i, int tid) +{ + ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); + // fprintf(stderr, "-0-[M::%s-beg] rid->%ld\n", __func__, i); + // if (memcmp("m64012_190921_234837/139067658/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // fprintf(stderr, "-0-[M::%s-beg] rid->%ld\n", __func__, i); + // } else if (memcmp("m64012_190921_234837/28968323/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // fprintf(stderr, "-1-[M::%s-beg] rid->%ld\n", __func__, i); + // } else { + // return; + // } + // if(i != 2851) return; + + // if(scb.a[i].m < scc.a[i].n) { + // scb.a[i].m = scc.a[i].n; + // REALLOC(scb.a[i].a, scb.a[i].m); + // } + // scb.a[i].n = scc.a[i].n; + // memcpy(scb.a[i].a, scc.a[i].a, scc.a[i].n*sizeof((*(scb.a[i].a)))); + + scc.f[i] = 0; + + 
if(!(R_INF.paf[i].length)) return; + // if(scc.f[i]) return; + asg64_v buf0; + + recover_UC_Read(&b->self_read, &R_INF, i); + + copy_asg_arr(buf0, b->sp); + if(gen_hap_dc_cov(&(b->v64), &buf0, &(R_INF.paf[i]), &R_INF, WINDOW_HC_FAST, 4, 0.7, &b->self_read, &b->ovlp_read, &scc, i)) { + scc.f[i] = 1; b->cnt[0]++; + // fprintf(stderr, "-mm-[M::%s]\tqn::%u::%.*s\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i)); + } else { + scc.f[i] = 0; b->cnt[1]++; + // fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i)); + } + copy_asg_arr(b->sp, buf0); + + refresh_ec_ovec_buf_t0(b, REFRESH_N); +} + + +void flip_paf_rc(uint64_t rid, ma_hit_t_alloc *paf, All_reads *rref) +{ + ma_hit_t *z; uint64_t k, m; int64_t ql = Get_READ_LENGTH((*rref), rid), tl, qs, qe, ts, te; + for (k = m = 0; k < paf->length; k++) { + z = &(paf->buffer[k]); tl = Get_READ_LENGTH((*rref), z->tn); + qs = (uint32_t)z->qns; if(qs < 0) qs = 0; if(qs > ql) qs = ql; + qe = z->qe; if(qe < 0) qe = 0; if(qe > ql) qe = ql; + ts = z->ts; if(ts < 0) ts = 0; if(ts > tl) ts = tl; + te = z->te; if(te < 0) te = 0; if(te > tl) te = tl; + if(qe > qs && te > ts) { + z->qns >>= 32; z->qns <<= 32; + z->qns |= ((uint64_t)(ql - qe)); + z->qe = ql - qs; + z->ts = tl - te; + z->te = tl - ts; + paf->buffer[m++] = *z; + } + } + paf->length = m; +} + +static void worker_hap_post_rev(void *data, long i, int tid) +{ + ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); + uint64_t k, l, kl, nn; char *a, c; + // fprintf(stderr, "-0-[M::%s-beg] rid->%ld\n", __func__, i); + // if (memcmp("m64012_190921_234837/139067658/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // fprintf(stderr, "-0-[M::%s-beg] rid->%ld\n", __func__, i); + // } else if (memcmp("m64012_190921_234837/28968323/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + // fprintf(stderr, "-1-[M::%s-beg] rid->%ld\n", __func__, i); + // } else 
{ + // return; + // } + // if(i != 2851) return; + + // if(scb.a[i].m < scc.a[i].n) { + // scb.a[i].m = scc.a[i].n; + // REALLOC(scb.a[i].a, scb.a[i].m); + // } + // scb.a[i].n = scc.a[i].n; + // memcpy(scb.a[i].a, scc.a[i].a, scc.a[i].n*sizeof((*(scb.a[i].a)))); + + flip_paf_rc(i, &(R_INF.paf[i]), &R_INF); + flip_paf_rc(i, &(R_INF.reverse_paf[i]), &R_INF); + + recover_UC_Read(&b->self_read, &R_INF, i); + l = b->self_read.length; kl = l>>1; a = b->self_read.seq; + for (k = nn = 0; k < kl; k++) { + c = a[l-k-1]; a[l-k-1] = RC_CHAR(a[k]); a[k] = RC_CHAR(c); + if(a[k] == 'N') nn++; + if(a[l-k-1] == 'N') nn++; + } + if(l&1) { + a[k] = RC_CHAR(a[k]); + if(a[k] == 'N') nn++; + } + + ha_compress_base(Get_READ(R_INF, i), a, l, &R_INF.N_site[i], nn); +} + +static void worker_hap_dc_ec_gen(void *data, long i, int tid) +{ + + ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); + uint32_t high_occ = asm_opt.hom_cov * (2.0 - HA_KMER_GOOD_RATIO); + uint32_t low_occ = asm_opt.hom_cov * HA_KMER_GOOD_RATIO; + + recover_UC_Read(&b->self_read, &R_INF, i); + + // overlap_region_sort_y_id(b->olist.list, b->olist.length); + // ma_hit_sort_tn(R_INF.paf[i].buffer, R_INF.paf[i].length); + // ma_hit_sort_tn(R_INF.reverse_paf[i].buffer, R_INF.reverse_paf[i].length); + + // R_INF.paf[i].is_fully_corrected = is_well_cal(&b->v64, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), b->self_read.length, 4); + + // R_INF.paf[i].is_abnormal = abnormal; + // R_INF.trio_flag[i] = AMBIGU; + + h_ec_lchain_fast(b->ab, i, &b->self_read, &b->ovlp_read, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, &b->exz, &b->v16, &b->v64, 0.02, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, UINT32_MAX, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), 0.866666); + + push_ff_ovlp(&(R_INF.paf[i]), &b->olist, 1, &R_INF, b->cnt); + push_ff_ovlp(&(R_INF.reverse_paf[i]), &b->olist, 2, &R_INF, b->cnt); + + /** + copy_asg_arr(buf0, b->sp); + if(gen_hap_dc_cov(&(b->v64), &buf0, &(R_INF.paf[i]), &R_INF, 
WINDOW_HC_FAST, 4, 0.7, &b->self_read, &b->ovlp_read, &scc, i)) { + scc.f[i] = 1; b->num_read_base++; + // fprintf(stderr, "-mm-[M::%s]\tqn::%u::%.*s\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i)); + } else { + scc.f[i] = 0; b->num_correct_base++; + // fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i)); + } + copy_asg_arr(b->sp, buf0); + **/ + refresh_ec_ovec_buf_t0(b, REFRESH_N); +} + +static void worker_hap_dc_ec_gen_new_idx(void *data, long i, int tid) +{ + + ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); + uint32_t high_occ = asm_opt.hom_cov * (2.0 - HA_KMER_GOOD_RATIO); + uint32_t low_occ = asm_opt.hom_cov * HA_KMER_GOOD_RATIO; + + recover_UC_Read(&b->self_read, &R_INF, i); + + h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, /**0.02**/0.001, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, /**0**/2, UINT32_MAX); + + overlap_region_sort_y_id(b->olist.list, b->olist.length); + + // overlap_region_sort_y_id(b->olist.list, b->olist.length); + // ma_hit_sort_tn(R_INF.paf[i].buffer, R_INF.paf[i].length); + // ma_hit_sort_tn(R_INF.reverse_paf[i].buffer, R_INF.reverse_paf[i].length); + + // R_INF.paf[i].is_fully_corrected = is_well_cal(&b->v64, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), b->self_read.length, 4); + + // R_INF.paf[i].is_abnormal = abnormal; + // R_INF.trio_flag[i] = AMBIGU; + + h_ec_lchain_fast_new(b->ab, i, &b->self_read, &b->ovlp_read, &R_INF, &b->olist, &b->clist, &b->exz, &b->v16, &b->v64, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), 0.866666); + + push_ff_ovlp(&(R_INF.paf[i]), &b->olist, 1, &R_INF, b->cnt); + push_ff_ovlp(&(R_INF.reverse_paf[i]), &b->olist, 2, &R_INF, b->cnt); + + /** + copy_asg_arr(buf0, b->sp); + if(gen_hap_dc_cov(&(b->v64), &buf0, &(R_INF.paf[i]), &R_INF, WINDOW_HC_FAST, 4, 0.7, &b->self_read, &b->ovlp_read, 
&scc, i)) { + scc.f[i] = 1; b->num_read_base++; + // fprintf(stderr, "-mm-[M::%s]\tqn::%u::%.*s\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i)); + } else { + scc.f[i] = 0; b->num_correct_base++; + // fprintf(stderr, "-um-[M::%s]\tqn::%u::%.*s\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i)); + } + copy_asg_arr(b->sp, buf0); + **/ + refresh_ec_ovec_buf_t0(b, REFRESH_N); +} + +void gen_ovlst_paf(ma_hit_t_alloc *in_e, ma_hit_t_alloc *in_r, asg64_v *ou) +{ + uint32_t n = 0, k; + + for (k = 0; k < in_e->length; k++) { + if(!(in_e->buffer[k].el)) n++; + } + n += in_r->length; + + kv_resize(uint64_t, *ou, n); ou->n = 0; + + for (k = 0; k < in_e->length; k++) { + if(!(in_e->buffer[k].el)) { + ou->a[ou->n] = in_e->buffer[k].tn; + ou->a[ou->n] <<= 1; ou->a[ou->n] |= in_e->buffer[k].rev; + ou->n++; + } + } + + for (k = 0; k < in_r->length; k++) { + ou->a[ou->n] = in_r->buffer[k].tn; + ou->a[ou->n] <<= 1; ou->a[ou->n] |= in_r->buffer[k].rev; + ou->n++; + } + + radix_sort_ec64(ou->a, ou->a+ou->n); +} + +void dbg_overlap_region_cigar(overlap_region *a, uint64_t a_n, char *qstr, All_reads *rref, UC_Read *tu) +{ + bit_extz_t ez; uint64_t i, k; + for (i = 0; i < a_n; i++) { + if(a[i].y_pos_strand) { + recover_UC_Read_RC(tu, rref, a[i].y_id); + } else { + recover_UC_Read(tu, rref, a[i].y_id); + } + for (k = 0; k < a[i].w_list.n; k++) { + if(is_ualn_win((a[i].w_list.a[k]))) continue; + set_bit_extz_t(ez, a[i], k); + if(!cigar_check(tu->seq, qstr, &ez)) { + fprintf(stderr, "\n-0-[M::%s] x_id::%u, y_id::%u, x::[%u, %u), y::[%u, %u)\n", __func__, a[i].x_id, a[i].y_id, a[i].x_pos_s, a[i].x_pos_e + 1, a[i].y_pos_s, a[i].y_pos_e + 1); + exit(1); + } else { + // fprintf(stderr, "\n-1-[M::%s] x_id::%u, y_id::%u, x::[%u, %u), y::[%u, %u)\n", __func__, a[i].x_id, a[i].y_id, a[i].x_pos_s, a[i].x_pos_e + 1, a[i].y_pos_s, a[i].y_pos_e + 1); + } + } + } +} + +overlap_region* h_ec_lchain_re(ha_abuf_t *ab, uint32_t rid, char* rs, 
uint64_t rl, UC_Read *tu, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v* buf, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1) +{ + // fprintf(stderr, "-mm-[M::%s]\tchain_cutoff::%u\n", __func__, chain_cutoff); + uint64_t on = 0, k, ol0, wl = WINDOW_HC, m; ma_hit_t *oa = NULL; overlap_region *z = NULL, *aux_o = NULL, t; Window_Pool w; double err = asm_opt.max_ov_diff_ec; + + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + + init_Window_Pool(&w, rl, wl, (int)(1.0/err)); + + on = in0->length + in1->length + 1; + clear_overlap_region_alloc(ol); + clear_Candidates_list(cl); + if(on > ol->size) { + REALLOC(ol->list, on); + memset(ol->list+ol->size, 0, sizeof(overlap_region)*(on-ol->size)); + ol->size = on; + } + on = in0->length + in1->length; aux_o = &(ol->list[on]); + ol->length = 0; ol->mapped_overlaps_length = 0; m = 0; + + + // get the list of anchors + get_mz1(rs, rl, mz_w, mz_k, 0, !(asm_opt.flag & HA_F_NO_HPC), ab, ha_flt_tab, ha_idx, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL, 0); + + oa = in0->buffer; on = in0->length; + for (k = 0; k < on; k++) { + ol0 = ol->length; + if(oa[k].el) { + z = &(ol->list[ol->length++]); + z->x_id = rid; z->y_id = oa[k].tn; + z->x_pos_strand = 0; z->y_pos_strand = oa[k].rev; + z->x_pos_s = (uint32_t)oa[k].qns; + z->x_pos_e = oa[k].qe - 1; + z->y_pos_s = oa[k].ts; + z->y_pos_e = oa[k].te - 1; + + z->is_match = 1; + z->align_length = z->overlapLen = z->shared_seed = z->x_pos_e + 1 - z->x_pos_s; + z->non_homopolymer_errors = 
z->strong = 0; + + set_exact_exz(exz, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); push_alnw(z, exz); + + // if(oa[k].tn == 15382) fprintf(stderr, "-em-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } else { + if(oa[k].rev) recover_UC_Read_RC(tu, rref, oa[k].tn); + else recover_UC_Read(tu, rref, oa[k].tn); + + get_pi_ec_chain(ab, rid, rl, oa[k].tn, tu->seq, tu->length, mz_w, mz_k, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip); + assert(ol->length - ol0 <= 1); + if((ol->length > ol0) && (gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, tu->length, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf))) { + ol->list[ol0].y_pos_strand = oa[k].rev; + // if(oa[k].tn == 15382) fprintf(stderr, "-mm-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + } + + if((ol->length <= ol0) || (ol->list[ol0].is_match != 1)) continue; + + if(m != ol0) { + t = ol->list[m]; + ol->list[m] = ol->list[ol0]; + ol->list[ol0] = t; + } + + m++; + } + + + oa = in1->buffer; on = in1->length; + for (k = 0; k < on; k++) { + ol0 = ol->length; + + if(oa[k].rev) recover_UC_Read_RC(tu, rref, oa[k].tn); + else recover_UC_Read(tu, rref, oa[k].tn); + + get_pi_ec_chain(ab, rid, rl, oa[k].tn, tu->seq, tu->length, mz_w, mz_k, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip); + assert(ol->length - ol0 <= 1); + if((ol->length > ol0) && (gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, tu->length, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf))) { + ol->list[ol0].y_pos_strand = oa[k].rev; + // if(oa[k].tn == 15382) fprintf(stderr, 
"-mm-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + + if((ol->length <= ol0) || (ol->list[ol0].is_match != 1)) continue; + + if(m != ol0) { + t = ol->list[m]; + ol->list[m] = ol->list[ol0]; + ol->list[ol0] = t; + } + + m++; + } + + ol->length = m; + + // fprintf(stderr, "[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); + assert(ol->length <= (in0->length + in1->length)); + + + // dbg_overlap_region_cigar(ol->list, ol->length, rs, rref, tu); + + + return aux_o; +} + + +overlap_region* h_ec_lchain_re1(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v *buf, asg64_v *srt_i, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1) +{ + // fprintf(stderr, "-mm-[M::%s]\tchain_cutoff::%u\n", __func__, chain_cutoff); + uint64_t on = 0, k, one = 0, ol0, wl = WINDOW_HC, m, m0, tid, trev; ma_hit_t *oa = NULL, *p = NULL; overlap_region *aux_o = NULL, *z = NULL, t; Window_Pool w; double err = asm_opt.max_ov_diff_ec; + char* rs = qu->seq; uint64_t rl = qu->length; + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + init_Window_Pool(&w, rl, wl, (int)(1.0/err)); + + srt_i->n = 0; + oa = in0->buffer; on = in0->length; m0 = 0; + for (k = 0; k < on; k++) { + if(oa[k].el) { + one++; continue; + } + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + oa = in1->buffer; on = in1->length; m0 = 1; + for (k = 0; k < on; k++) { + // 
if(oa[k].el) continue; + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + radix_sort_ec64(srt_i->a, srt_i->a + srt_i->n); + + // get the list of anchors + get_mz1(rs, rl, mz_w, mz_k, 0, !(asm_opt.flag & HA_F_NO_HPC), ab, ha_flt_tab, NULL/**ha_idx**/, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL, 0); + + h_ec_lchain_re_gen(ab, rid, rs, rl, mz_w, mz_k, ha_idx, rref, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, + max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip, tu, srt_i, scb.a); + + + ///max size + on = ol->length + one + srt_i->n + 1; m0 = on - 1; + if(on > ol->size) { + REALLOC(ol->list, on); + memset(ol->list+ol->size, 0, sizeof(overlap_region)*(on-ol->size)); + ol->size = on; + } + aux_o = &(ol->list[on-1]); ol->mapped_overlaps_length = 0; on = ol->length; + + // fprintf(stderr, "-0-[M::%s]\n", __func__); + + gen_hc_r_alin(ol, cl, rref, qu, tu, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf); rs = qu->seq; + + // fprintf(stderr, "-1-[M::%s]\n", __func__); + + ///handle unmatched chain + for (k = m = ol->length; k < on; k++) { + clear_fake_cigar(&(ol->list[k].f_cigar)); + clear_window_list_alloc(&(ol->list[k].w_list)); + clear_window_list_alloc(&(ol->list[k].boundary_cigars)); + + ol0 = ol->length; + + tid = ol->list[k].y_id; trev = ol->list[k].y_pos_strand; + if(trev) recover_UC_Read_RC(tu, rref, tid); + else recover_UC_Read(tu, rref, tid); + + get_pi_ec_chain(ab, rid, rl, tid, tu->seq, tu->length, mz_w, mz_k, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip); + assert(ol->length - ol0 <= 1); + if((ol->length > ol0) && (gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, 
tu->length, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf))) { + ol->list[ol0].y_pos_strand = trev; + // if(oa[k].tn == 15382) fprintf(stderr, "-mm-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + + if((ol->length <= ol0) || (ol->list[ol0].is_match != 1)) continue; + + if(m != ol0) { + t = ol->list[m]; + ol->list[m] = ol->list[ol0]; + ol->list[ol0] = t; + } + + m++; + } + ol->length = m; + + for (k = ol->length; k < on; k++) { + clear_fake_cigar(&(ol->list[k].f_cigar)); + clear_window_list_alloc(&(ol->list[k].w_list)); + clear_window_list_alloc(&(ol->list[k].boundary_cigars)); + } + + // fprintf(stderr, "[M::%s]\tnew::%lu\told::%lu\n", __func__, ol->length, ol0); + + oa = in0->buffer; on = in0->length; + for (k = 0; k < on; k++) { + if(oa[k].el) { + z = &(ol->list[ol->length++]); + z->x_id = rid; z->y_id = oa[k].tn; + z->x_pos_strand = 0; z->y_pos_strand = oa[k].rev; + z->x_pos_s = (uint32_t)oa[k].qns; + z->x_pos_e = oa[k].qe - 1; + z->y_pos_s = oa[k].ts; + z->y_pos_e = oa[k].te - 1; + + z->is_match = 1; + z->align_length = z->overlapLen = z->shared_seed = z->x_pos_e + 1 - z->x_pos_s; + z->non_homopolymer_errors = z->strong = 0; + + set_exact_exz(exz, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); push_alnw(z, exz); + + // if(oa[k].tn == 1945) fprintf(stderr, "-em-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + } + m = ol->length; + + for (k = 0; k < srt_i->n; k++) { + ol0 = ol->length; + if(srt_i->a[k]&1) { + p = &(in1->buffer[((uint32_t)srt_i->a[k])>>1]); + } else { + p = &(in0->buffer[((uint32_t)srt_i->a[k])>>1]); + } + + if(p->rev) recover_UC_Read_RC(tu, rref, p->tn); + else recover_UC_Read(tu, rref, p->tn); + + get_pi_ec_chain(ab, rid, rl, p->tn, tu->seq, tu->length, mz_w, mz_k, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, 
mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip); + assert(ol->length - ol0 <= 1); + if((ol->length > ol0) && (gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, tu->length, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf))) { + ol->list[ol0].y_pos_strand = p->rev; + // if(oa[k].tn == 15382) fprintf(stderr, "-mm-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + + if((ol->length <= ol0) || (ol->list[ol0].is_match != 1)) continue; + + if(m != ol0) { + t = ol->list[m]; + ol->list[m] = ol->list[ol0]; + ol->list[ol0] = t; + } + + m++; + } + + ol->length = m; + + + // fprintf(stderr, "[M::%s]\tnew_n::%lu\told_n::%lu\n", __func__, ol->length, m0); + // fprintf(stderr, "[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); + // assert(ol->length <= (in0->length + in1->length)); + + + // dbg_overlap_region_cigar(ol->list, ol->length, rs, rref, tu); + + return aux_o; +} + +uint64_t direct_chain_cal(ha_abuf_t *ab, uint64_t qid, char *qs, uint64_t ql, uint64_t tid, char *ts, uint64_t tl, uint64_t trev, uint64_t mz_w, uint64_t mz_k, overlap_region_alloc *olst, Candidates_list *cl, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, double bw_thres, + int apend_be, uint64_t max_cnt, uint64_t min_cnt, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, int64_t max_skip, int64_t max_iter, int64_t max_dis, int64_t quick_check, double chn_pen_gap, double chn_pen_skip, + bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t khit, int64_t move_gap, asg16_v* buf) +{ + uint64_t ol0 = olst->length; + get_pi_ec_chain(ab, qid, ql, tid, ts, tl, mz_w, mz_k, olst, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, 
chn_pen_skip); + assert(olst->length - ol0 <= 1); + if(olst->length > ol0) { + if(gen_hc_r_alin_re(&(olst->list[ol0]), cl, qs, ql, ts, tl, exz, aux_o, e_rate, wl, qid, E_KHIT, 1, buf)) { + olst->list[ol0].y_pos_strand = trev; + return 1; + } else { + clear_fake_cigar(&(olst->list[ol0].f_cigar)); + clear_window_list_alloc(&(olst->list[ol0].w_list)); + clear_window_list_alloc(&(olst->list[ol0].boundary_cigars)); + olst->length--; + } + } + return 0; +} + + +overlap_region* h_ec_lchain_re2(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v *buf, asg64_v *srt_i, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1) +{ + // fprintf(stderr, "-0-[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); + // fprintf(stderr, "-mm-[M::%s]\tchain_cutoff::%u\n", __func__, chain_cutoff); + uint64_t on = 0, k, l, i, one = 0, ol0, wl = WINDOW_HC, m, m0, tid, trev, max_cnt = UINT32_MAX, min_cnt = 0; ma_hit_t *oa = NULL, *p = NULL; overlap_region *aux_o = NULL, *z = NULL, t; Window_Pool w; double err = asm_opt.max_ov_diff_ec; tiny_queue_t tq; memset(&tq, 0, sizeof(tiny_queue_t)); + char* rs = qu->seq; uint64_t rl = qu->length; int64_t n, zn, om; + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + init_Window_Pool(&w, rl, wl, (int)(1.0/err)); + + ///cutoff + if(high_occ) { + max_cnt = (*high_occ); + if(max_cnt < 2) max_cnt = 2; + } + if(low_occ) { + min_cnt = (*low_occ); + if(min_cnt < 2) min_cnt = 2; + } + + ///memory + ol->length = 0; on = in0->length + 
in1->length + 1; + if(on > ol->size) { + REALLOC(ol->list, on); + memset(ol->list+ol->size, 0, sizeof(overlap_region)*(on-ol->size)); + ol->size = on; + } + aux_o = &(ol->list[on-1]); ol->mapped_overlaps_length = 0; on = in0->length + in1->length; + + ///overlap idx + srt_i->n = 0; + oa = in0->buffer; on = in0->length; m0 = 0; + for (k = 0; k < on; k++) { + if(oa[k].el) { + one++; continue; + } + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + oa = in1->buffer; on = in1->length; m0 = 1; + for (k = 0; k < on; k++) { + // if(oa[k].el) continue; + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + radix_sort_ec64(srt_i->a, srt_i->a + srt_i->n); + + // get the list of anchors + get_mz1(rs, rl, mz_w, mz_k, 0, !(asm_opt.flag & HA_F_NO_HPC), ab, ha_flt_tab, NULL/**ha_idx**/, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL, 0); + + h_ec_lchain_re_gen_srt(ab, ha_idx, ol, cl); + + k = 1; l = 0; i = 0; n = zn = 0; + while(h_ec_lchain_re_gen_qry(ab, &k, &l, &i, srt_i->a, srt_i->n, &tid, &trev)) { + + if(trev) recover_UC_Read_RC(tu, rref, tid); + else recover_UC_Read(tu, rref, tid); + + ol0 = ol->length; om = 0; + if(h_ec_lchain_re_chn(ab, l, k, rid, rs, rl, tid, tu->seq, tu->length, trev, mz_w, mz_k, ol, cl, bw_thres, apend_be, max_cnt, min_cnt, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip, &tq, scb.a, &n, &zn)) { + assert(ol->length - ol0 == 1); + ol->list[ol0].y_pos_strand = 0; + if(gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, tu->length, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf)) { + ol->list[ol0].y_pos_strand = trev; om = 1; + } else {///unmatch + clear_fake_cigar(&(ol->list[ol0].f_cigar)); + clear_window_list_alloc(&(ol->list[ol0].w_list)); + 
clear_window_list_alloc(&(ol->list[ol0].boundary_cigars)); + ol->length--; + } + } + + if(om) { + srt_i->a[i] >>= 32; srt_i->a[i] <<= 32; srt_i->a[i] |= ((uint64_t)((uint32_t)-1)); + } + + l = k; k++; + } + + for (k = m = 0; k < srt_i->n; k++) { + if(((uint32_t)srt_i->a[k]) == ((uint32_t)-1)) continue; + srt_i->a[m++] = srt_i->a[k]; + } + // fprintf(stderr, "[M::%s]\ttot::%lu\tremain::%lu\n", __func__, (uint64_t)srt_i->n, m); + srt_i->n = m; + + + + oa = in0->buffer; on = in0->length; + for (k = 0; k < on; k++) { + if(oa[k].el) { + z = &(ol->list[ol->length++]); + z->x_id = rid; z->y_id = oa[k].tn; + z->x_pos_strand = 0; z->y_pos_strand = oa[k].rev; + z->x_pos_s = (uint32_t)oa[k].qns; + z->x_pos_e = oa[k].qe - 1; + z->y_pos_s = oa[k].ts; + z->y_pos_e = oa[k].te - 1; + + z->is_match = 1; + z->align_length = z->overlapLen = z->shared_seed = z->x_pos_e + 1 - z->x_pos_s; + z->non_homopolymer_errors = z->strong = 0; + + set_exact_exz(exz, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); push_alnw(z, exz); + + // if(oa[k].tn == 1945) fprintf(stderr, "-em-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + } + m = ol->length; + + + clear_Candidates_list(cl); + for (k = 0; k < srt_i->n; k++) { + ol0 = ol->length; + if(srt_i->a[k]&1) { + p = &(in1->buffer[((uint32_t)srt_i->a[k])>>1]); + } else { + p = &(in0->buffer[((uint32_t)srt_i->a[k])>>1]); + } + + if(p->rev) recover_UC_Read_RC(tu, rref, p->tn); + else recover_UC_Read(tu, rref, p->tn); + + get_pi_ec_chain(ab, rid, rl, p->tn, tu->seq, tu->length, mz_w, mz_k, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip); + assert(ol->length - ol0 <= 1); + if((ol->length > ol0) && (gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, tu->length, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf))) { + 
ol->list[ol0].y_pos_strand = p->rev; + // if(oa[k].tn == 15382) fprintf(stderr, "-mm-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + + if((ol->length <= ol0) || (ol->list[ol0].is_match != 1)) continue; + + if(m != ol0) { + t = ol->list[m]; + ol->list[m] = ol->list[ol0]; + ol->list[ol0] = t; + } + + m++; + } + + ol->length = m; + + // fprintf(stderr, "[M::%s]\tnew_n::%lu\told_n::%lu\n", __func__, ol->length, m0); + // fprintf(stderr, "-1-[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); + assert(ol->length <= (in0->length + in1->length)); + + + // dbg_overlap_region_cigar(ol->list, ol->length, rs, rref, tu); + + return aux_o; +} + + +overlap_region* h_ec_lchain_fast(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v *buf, asg64_v *srt_i, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1, double sh) +{ + // fprintf(stderr, "-0-[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); + // fprintf(stderr, "-mm-[M::%s]\tchain_cutoff::%u\n", __func__, chain_cutoff); + uint64_t on = 0, k, l, i, one = 0, ol0, wl = WINDOW_HC, m, m0, tid, trev, max_cnt = UINT32_MAX, min_cnt = 0, is_match; ma_hit_t *oa = NULL, *p = NULL; overlap_region *aux_o = NULL, *z = NULL; Window_Pool w; double err = asm_opt.max_ov_diff_ec; tiny_queue_t tq; memset(&tq, 0, sizeof(tiny_queue_t)); + char* rs = qu->seq; uint64_t rl = qu->length; int64_t n, zn, om; uint64_t aq[2], at[2], bq[2], bt[2], ovlp, os, oe; + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, 
mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + init_Window_Pool(&w, rl, wl, (int)(1.0/err)); + + ///cutoff + if(high_occ) { + max_cnt = (*high_occ); + if(max_cnt < 2) max_cnt = 2; + } + if(low_occ) { + min_cnt = (*low_occ); + if(min_cnt < 2) min_cnt = 2; + } + + ///memory + ol->length = 0; on = in0->length + in1->length + 1; + if(on > ol->size) { + REALLOC(ol->list, on); + memset(ol->list+ol->size, 0, sizeof(overlap_region)*(on-ol->size)); + ol->size = on; + } + aux_o = &(ol->list[on-1]); ol->mapped_overlaps_length = 0; on = in0->length + in1->length; + + ///overlap idx + srt_i->n = 0; + oa = in0->buffer; on = in0->length; m0 = 0; + for (k = 0; k < on; k++) { + if(oa[k].el) { + one++; continue; + } + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + oa = in1->buffer; on = in1->length; m0 = 1; + for (k = 0; k < on; k++) { + // if(oa[k].el) continue; + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + radix_sort_ec64(srt_i->a, srt_i->a + srt_i->n); + + // get the list of anchors + get_mz1(rs, rl, mz_w, mz_k, 0, !(asm_opt.flag & HA_F_NO_HPC), ab, ha_flt_tab, NULL/**ha_idx**/, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL, 0); + + h_ec_lchain_re_gen_srt(ab, ha_idx, ol, cl); + + k = 1; l = 0; i = 0; n = zn = 0; + while(h_ec_lchain_re_gen_qry(ab, &k, &l, &i, srt_i->a, srt_i->n, &tid, &trev)) { + + if(trev) recover_UC_Read_RC(tu, rref, tid); + else recover_UC_Read(tu, rref, tid); + + ol0 = ol->length; om = 0; + if(h_ec_lchain_re_chn(ab, l, k, rid, rs, rl, tid, tu->seq, tu->length, trev, mz_w, mz_k, ol, cl, bw_thres, apend_be, max_cnt, min_cnt, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip, &tq, scb.a, &n, &zn)) { + assert(ol->length - ol0 == 1); + 
ol->list[ol0].y_pos_strand = trev; om = 1; ol->list[ol0].shared_seed = 0; ol->list[ol0].is_match = 0; + + assert(((srt_i->a[i]>>33) == ol->list[ol0].y_id) && (((srt_i->a[i]>>32)&1) == ol->list[ol0].y_pos_strand)); + if(srt_i->a[i]&1) { + p = &(in1->buffer[((uint32_t)srt_i->a[i])>>1]); is_match = 2; + } else { + p = &(in0->buffer[((uint32_t)srt_i->a[i])>>1]); is_match = 1; + } + ol->list[ol0].strong = p->ml; ol->list[ol0].without_large_indel = p->no_l_indel; + + aq[0] = (uint32_t)p->qns; aq[1] = p->qe; + at[0] = p->ts; at[1] = p->te; + + bq[0] = ol->list[ol0].x_pos_s; bq[1] = ol->list[ol0].x_pos_e + 1; + bt[0] = ol->list[ol0].y_pos_s; bt[1] = ol->list[ol0].y_pos_e + 1; + + os = MAX(aq[0], bq[0]); oe = MIN(aq[1], bq[1]); + ovlp = ((oe>os)? (oe-os):0); + if(!((ovlp) && (ovlp >= ((aq[1] - aq[0])*sh)) && ((ovlp >= ((bq[1] - bq[0])*sh))))) om = 0; + + os = MAX(at[0], bt[0]); oe = MIN(at[1], bt[1]); + ovlp = ((oe>os)? (oe-os):0); + if(!((ovlp) && (ovlp >= ((at[1] - at[0])*sh)) && ((ovlp >= ((bt[1] - bt[0])*sh))))) om = 0; + + if(om) { + if(exact_ec_check(rs, rl, tu->seq, tu->length, bq[0], bq[1], bt[0], bt[1])) { + if(is_match == 2) { + ol->list[ol0].strong = 0; ol->list[ol0].without_large_indel = 1; + } + is_match = 1; ol->list[ol0].shared_seed = 1; + } + ol->list[ol0].is_match = is_match; + } else {///unmatch + clear_fake_cigar(&(ol->list[ol0].f_cigar)); + clear_window_list_alloc(&(ol->list[ol0].w_list)); + clear_window_list_alloc(&(ol->list[ol0].boundary_cigars)); + ol->length--; + } + } + + if(om) { + srt_i->a[i] >>= 32; srt_i->a[i] <<= 32; srt_i->a[i] |= ((uint64_t)((uint32_t)-1)); + } + + l = k; k++; + } + + for (k = m = 0; k < srt_i->n; k++) { + if(((uint32_t)srt_i->a[k]) == ((uint32_t)-1)) continue; + srt_i->a[m++] = srt_i->a[k]; + } + // fprintf(stderr, "[M::%s]\ttot::%lu\tremain::%lu\n", __func__, (uint64_t)srt_i->n, m); + srt_i->n = m; + + + + oa = in0->buffer; on = in0->length; + for (k = 0; k < on; k++) { + if(oa[k].el) { + z = &(ol->list[ol->length++]); + 
z->x_id = rid; z->y_id = oa[k].tn; + z->x_pos_strand = 0; z->y_pos_strand = oa[k].rev; + z->x_pos_s = (uint32_t)oa[k].qns; + z->x_pos_e = oa[k].qe - 1; + z->y_pos_s = oa[k].ts; + z->y_pos_e = oa[k].te - 1; + + z->align_length = z->overlapLen = z->x_pos_e + 1 - z->x_pos_s; + z->non_homopolymer_errors = 0; + + z->is_match = 1; z->shared_seed = 1; + z->strong = oa[k].ml; z->without_large_indel = oa[k].no_l_indel; + + set_exact_exz(exz, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); push_alnw(z, exz); + + // if(oa[k].tn == 1945) fprintf(stderr, "-em-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + } + /** + m = ol->length; + + + clear_Candidates_list(cl); + for (k = 0; k < srt_i->n; k++) { + ol0 = ol->length; + if(srt_i->a[k]&1) { + p = &(in1->buffer[((uint32_t)srt_i->a[k])>>1]); + } else { + p = &(in0->buffer[((uint32_t)srt_i->a[k])>>1]); + } + + if(p->rev) recover_UC_Read_RC(tu, rref, p->tn); + else recover_UC_Read(tu, rref, p->tn); + + get_pi_ec_chain(ab, rid, rl, p->tn, tu->seq, tu->length, mz_w, mz_k, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip); + assert(ol->length - ol0 <= 1); + if((ol->length > ol0) && (gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, tu->length, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf))) { + ol->list[ol0].y_pos_strand = p->rev; + // if(oa[k].tn == 15382) fprintf(stderr, "-mm-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + + if((ol->length <= ol0) || (ol->list[ol0].is_match != 1)) continue; + + if(m != ol0) { + t = ol->list[m]; + ol->list[m] = ol->list[ol0]; + ol->list[ol0] = t; + } + + m++; + } + + ol->length = m; + **/ + + // fprintf(stderr, "[M::%s]\tnew_n::%lu\told_n::%lu\n", __func__, ol->length, m0); + // 
fprintf(stderr, "-1-[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); + assert(ol->length <= (in0->length + in1->length)); + + + // dbg_overlap_region_cigar(ol->list, ol->length, rs, rref, tu); + + return aux_o; +} + +void h_ec_lchain_fast_new(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v *buf, asg64_v *srt_i, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1, double sh) +{ + // fprintf(stderr, "-0-[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); + // fprintf(stderr, "-mm-[M::%s]\tchain_cutoff::%u\n", __func__, chain_cutoff); + uint64_t on = 0, k, i, m, m0, tid, trev, is_match; ma_hit_t *oa = NULL, *p = NULL; overlap_region *z = NULL, t; + char* rs = qu->seq; uint64_t rl = qu->length; int64_t om; uint64_t aq[2], at[2], bq[2], bt[2], ovlp, os, oe; + + + + ///overlap idx + srt_i->n = 0; + oa = in0->buffer; on = in0->length; m0 = 0; + for (k = 0; k < on; k++) { + // if(oa[k].el) continue; + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + oa = in1->buffer; on = in1->length; m0 = 1; + for (k = 0; k < on; k++) { + // if(oa[k].el) continue; + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + radix_sort_ec64(srt_i->a, srt_i->a + srt_i->n); + + k = 0; i = 0; + for (k = m = 0; k < ol->length; k++) { + z = &(ol->list[k]); tid = z->y_id; trev = z->y_pos_strand; + for (; (i < srt_i->n) && ((srt_i->a[i]>>32) < ((tid<<1)|trev)); i++); + if((i < srt_i->n) && ((srt_i->a[i]>>32) == ((tid<<1)|trev))) { + om = 1; z->shared_seed = 0; z->is_match = 0; + if(srt_i->a[i]&1) { + p = &(in1->buffer[((uint32_t)srt_i->a[i])>>1]); is_match = 2; + } else { + p = &(in0->buffer[((uint32_t)srt_i->a[i])>>1]); is_match = 1; + } + z->strong = p->ml; z->without_large_indel = p->no_l_indel; + + aq[0] = 
(uint32_t)p->qns; aq[1] = p->qe; + at[0] = p->ts; at[1] = p->te; + + bq[0] = z->x_pos_s; bq[1] = z->x_pos_e + 1; + bt[0] = z->y_pos_s; bt[1] = z->y_pos_e + 1; + + os = MAX(aq[0], bq[0]); oe = MIN(aq[1], bq[1]); + ovlp = ((oe>os)? (oe-os):0); + if(!((ovlp) && (ovlp >= ((aq[1] - aq[0])*sh)) && ((ovlp >= ((bq[1] - bq[0])*sh))))) om = 0; + + os = MAX(at[0], bt[0]); oe = MIN(at[1], bt[1]); + ovlp = ((oe>os)? (oe-os):0); + if(!((ovlp) && (ovlp >= ((at[1] - at[0])*sh)) && ((ovlp >= ((bt[1] - bt[0])*sh))))) om = 0; + + if(om) { + if(is_match == 1 && p->el == 1) p->el = 0; + resize_UC_Read(tu, bt[1] - bt[0]); + recover_UC_Read_sub_region(tu->seq, bt[0], bt[1] - bt[0], trev, rref, tid); + if(exact_ec_check(rs, rl, tu->seq, bt[1] - bt[0], bq[0], bq[1], 0, bt[1] - bt[0])) { + if(is_match == 2) { + z->strong = 0; z->without_large_indel = 1; + } + is_match = 1; z->shared_seed = 1; + } + z->is_match = is_match; + } + } else { + om = 0; + bq[0] = z->x_pos_s; bq[1] = z->x_pos_e + 1; + bt[0] = z->y_pos_s; bt[1] = z->y_pos_e + 1; + resize_UC_Read(tu, bt[1] - bt[0]); + recover_UC_Read_sub_region(tu->seq, bt[0], bt[1] - bt[0], trev, rref, tid); + if(exact_ec_check(rs, rl, tu->seq, bt[1] - bt[0], bq[0], bq[1], 0, bt[1] - bt[0])) { + z->strong = 0; z->without_large_indel = 1; + z->shared_seed = 1; z->is_match = 1; om = 1; + } + } + + if(om) { + if(m != k) { + t = ol->list[m]; + ol->list[m] = ol->list[k]; + ol->list[k] = t; + } + m++; + } + } + ol->length = m; + + + oa = in0->buffer; on = in0->length; + for (k = 0; k < on; k++) { + if(oa[k].el) { + z = &(ol->list[ol->length++]); + z->x_id = rid; z->y_id = oa[k].tn; + z->x_pos_strand = 0; z->y_pos_strand = oa[k].rev; + z->x_pos_s = (uint32_t)oa[k].qns; + z->x_pos_e = oa[k].qe - 1; + z->y_pos_s = oa[k].ts; + z->y_pos_e = oa[k].te - 1; + + z->align_length = z->overlapLen = z->x_pos_e + 1 - z->x_pos_s; + z->non_homopolymer_errors = 0; + + z->is_match = 1; z->shared_seed = 1; + z->strong = oa[k].ml; z->without_large_indel = oa[k].no_l_indel; + 
+ set_exact_exz(exz, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); push_alnw(z, exz); + + // if(oa[k].tn == 1945) fprintf(stderr, "-em-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + } +} + +overlap_region* h_ec_lchain_re3(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v *buf, asg64_v *srt_i, double bw_thres, + int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t mcopy_khit_cut, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1) +{ + // fprintf(stderr, "-mm-[M::%s]\tchain_cutoff::%u\n", __func__, chain_cutoff); + uint64_t on = 0, k, one = 0, ol0, wl = WINDOW_HC, m, m0, tid, trev; ma_hit_t *oa = NULL, *p = NULL; overlap_region *aux_o = NULL, *z = NULL, t; Window_Pool w; double err = asm_opt.max_ov_diff_ec; + char* rs = qu->seq; uint64_t rl = qu->length; + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + init_Window_Pool(&w, rl, wl, (int)(1.0/err)); + + srt_i->n = 0; + oa = in0->buffer; on = in0->length; m0 = 0; + for (k = 0; k < on; k++) { + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; + if(oa[k].el) { + one++; m |= ((uint32_t)-1); + } else { + m |= (k<<1); m |= m0; + } + kv_push(uint64_t, *srt_i, m); + } + oa = in1->buffer; on = in1->length; m0 = 1; + for (k = 0; k < on; k++) { + // if(oa[k].el) continue; + m = oa[k].tn; m <<= 1; m |= ((uint64_t)oa[k].rev); m <<= 32; m |= (k<<1); m |= m0; + kv_push(uint64_t, *srt_i, m); + } + radix_sort_ec64(srt_i->a, srt_i->a + srt_i->n); + + // get the list of anchors + get_mz1(rs, rl, mz_w, mz_k, 0, !(asm_opt.flag & 
HA_F_NO_HPC), ab, ha_flt_tab, NULL/**ha_idx**/, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL, 0); + + h_ec_lchain_re_gen3(ab, rid, rs, rl, mz_w, mz_k, ha_idx, rref, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, + max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip, tu, srt_i, scb.a); + + + ///max size + on = ol->length + one + srt_i->n + 1; m0 = on - 1; + if(on > ol->size) { + REALLOC(ol->list, on); + memset(ol->list+ol->size, 0, sizeof(overlap_region)*(on-ol->size)); + ol->size = on; + } + aux_o = &(ol->list[on-1]); ol->mapped_overlaps_length = 0; on = ol->length; + + // fprintf(stderr, "-0-[M::%s]\n", __func__); + + gen_hc_r_alin(ol, cl, rref, qu, tu, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf); rs = qu->seq; + + // fprintf(stderr, "-1-[M::%s]\n", __func__); + + ///handle unmatched chain + for (k = m = ol->length; k < on; k++) { + clear_fake_cigar(&(ol->list[k].f_cigar)); + clear_window_list_alloc(&(ol->list[k].w_list)); + clear_window_list_alloc(&(ol->list[k].boundary_cigars)); + + ol0 = ol->length; + + tid = ol->list[k].y_id; trev = ol->list[k].y_pos_strand; + if(trev) recover_UC_Read_RC(tu, rref, tid); + else recover_UC_Read(tu, rref, tid); + + get_pi_ec_chain(ab, rid, rl, tid, tu->seq, tu->length, mz_w, mz_k, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip); + assert(ol->length - ol0 <= 1); + if((ol->length > ol0) && (gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, tu->length, exz, aux_o, asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf))) { + ol->list[ol0].y_pos_strand = trev; + // if(oa[k].tn == 15382) fprintf(stderr, "-mm-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, 
ol->list[ol->length-1].non_homopolymer_errors); + } + + if((ol->length <= ol0) || (ol->list[ol0].is_match != 1)) continue; + + if(m != ol0) { + t = ol->list[m]; + ol->list[m] = ol->list[ol0]; + ol->list[ol0] = t; + } + + m++; + } + ol->length = m; + + for (k = ol->length; k < on; k++) { + clear_fake_cigar(&(ol->list[k].f_cigar)); + clear_window_list_alloc(&(ol->list[k].w_list)); + clear_window_list_alloc(&(ol->list[k].boundary_cigars)); + } + + // fprintf(stderr, "[M::%s]\tnew::%lu\told::%lu\n", __func__, ol->length, ol0); + + oa = in0->buffer; on = in0->length; + for (k = 0; k < on; k++) { + if(oa[k].el) { + z = &(ol->list[ol->length++]); + z->x_id = rid; z->y_id = oa[k].tn; + z->x_pos_strand = 0; z->y_pos_strand = oa[k].rev; + z->x_pos_s = (uint32_t)oa[k].qns; + z->x_pos_e = oa[k].qe - 1; + z->y_pos_s = oa[k].ts; + z->y_pos_e = oa[k].te - 1; + + z->is_match = 1; + z->align_length = z->overlapLen = z->shared_seed = z->x_pos_e + 1 - z->x_pos_s; + z->non_homopolymer_errors = z->strong = 0; + + set_exact_exz(exz, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); push_alnw(z, exz); + + // if(oa[k].tn == 1945) fprintf(stderr, "-em-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + } + m = ol->length; + + for (k = 0; k < srt_i->n; k++) { + ol0 = ol->length; + if(srt_i->a[k]&1) { + p = &(in1->buffer[((uint32_t)srt_i->a[k])>>1]); + } else { + p = &(in0->buffer[((uint32_t)srt_i->a[k])>>1]); + } + + if(p->rev) recover_UC_Read_RC(tu, rref, p->tn); + else recover_UC_Read(tu, rref, p->tn); + + get_pi_ec_chain(ab, rid, rl, p->tn, tu->seq, tu->length, mz_w, mz_k, ol, cl, bw_thres, apend_be, k_flag, dbg_ct, sp, high_occ, low_occ, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, max_skip, max_iter, max_dis, quick_check, chn_pen_gap, chn_pen_skip); + assert(ol->length - ol0 <= 1); + if((ol->length > ol0) && (gen_hc_r_alin_re(&(ol->list[ol0]), cl, rs, rl, tu->seq, tu->length, exz, aux_o, 
asm_opt.max_ov_diff_ec, w.window_length, rid, E_KHIT, 1, buf))) { + ol->list[ol0].y_pos_strand = p->rev; + // if(oa[k].tn == 15382) fprintf(stderr, "-mm-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); + } + + if((ol->length <= ol0) || (ol->list[ol0].is_match != 1)) continue; + + if(m != ol0) { + t = ol->list[m]; + ol->list[m] = ol->list[ol0]; + ol->list[ol0] = t; + } + + m++; + } + + ol->length = m; + + + // fprintf(stderr, "[M::%s]\tnew_n::%lu\told_n::%lu\n", __func__, ol->length, m0); + // fprintf(stderr, "[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); + // assert(ol->length <= (in0->length + in1->length)); + + + // dbg_overlap_region_cigar(ol->list, ol->length, rs, rref, tu); + + return aux_o; +} + + +void gen_ori_seq0(char *tstr, uint64_t tl, UC_Read *qu, asg16_v *sc, uint64_t rid) +{ + uint64_t ck, qk, tk, k, wq[2], wt[2]; uint32_t len; uint16_t c, bq, bt; char *qstr = NULL; + + ck = qk = tk = 0; + while (ck < sc->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(sc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + wq[1] = qk; wt[1] = tk; + } + if(!(tk == tl)) { + fprintf(stderr, "[M::%s] rid::%lu, tk::%lu, tl::%lu\n", __func__, rid, tk, tl); + } + assert(tk == tl); + + resize_UC_Read(qu, qk); qstr = qu->seq; qu->length = qk; + ck = qk = tk = 0; + while (ck < sc->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(sc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + wq[1] = qk; wt[1] = tk; + + if(c == 0) { + memcpy(qstr + wq[0], tstr + wt[0], (wq[1]-wq[0])*sizeof((*qstr))); + } else if(c == 1 || c == 3) { + for (k = wq[0]; k < wq[1]; k++) qstr[k] = s_H[bq]; + } + // fprintf(stderr, "%u%c(%c)(x::[%lu,%ld))(y::[%lu,%ld))\n", len, cm[c], ((c==1)||(c==2))?(cc[bt]):('*'), wx[0], wx[1], wy[0], wy[1]); // s_H + } +} + +void gen_cc_fly(asg16_v *sc, char *qstr, uint64_t ql, char *tstr, uint64_t tl, bit_extz_t *exz, 
double e_rate, uint64_t maxn, uint64_t maxe) +{ + // fprintf(stderr, "[M::%s] ql::%lu, tl::%lu\n", __func__, ql, tl); + if(ql == 0 && tl == 0) return; + uint64_t k, ck, qk, tk, wq[2], wt[2], maxl, minl, diff, f, diff0 = 0; + if(ql > 0 && tl == 0) { + for (k = 0; k < ql; k++) { + push_trace_bp_f(sc, 3, seq_nt6_table[(uint32_t)(qstr[k])], (uint16_t)-1, 1, 1); + } + return; + } + if(ql == 0 && tl > 0) { + for (k = 0; k < tl; k++) { + push_trace_bp_f(sc, 2, (uint16_t)-1, seq_nt6_table[(uint32_t)(tstr[k])], 1, 1); + } + return; + } + if(ql == tl && ql == 1) { + if(qstr[0] == tstr[0]) push_trace_bp_f(sc, 0, (uint16_t)-1, (uint16_t)-1, 1, 1); + else push_trace_bp_f(sc, 1, seq_nt6_table[(uint32_t)(qstr[0])], seq_nt6_table[(uint32_t)(tstr[0])], 1, 1); + return; + } + + + + if(ql >= tl) { + maxl = ql; minl = tl; + } else { + maxl = tl; minl = ql; + } + f = 0; + + diff = 31; + if(diff > (maxl - minl)) { + if(diff > maxl) diff = maxl; + diff0 = diff; clear_align(*exz); + cal_exz_global(tstr, tl, qstr, ql, diff, exz); + if(is_align(*exz)) f = 1; + } + + if(!f) { + diff = 63; + if(diff > (maxl - minl)) { + if(diff > maxl) diff = maxl; + if(diff > diff0) { + diff0 = diff; clear_align(*exz); + cal_exz_global(tstr, tl, qstr, ql, diff, exz); + if(is_align(*exz)) f = 1; + } + } + } + + if(!f) { + if((maxn > maxl) && (maxe > (maxl - minl))) { + diff = maxl * e_rate; + if(diff < 1) diff = 1; + if(diff > maxe) diff = maxe; + if(diff > diff0) { + diff0 = diff; clear_align(*exz); + cal_exz_global(tstr, tl, qstr, ql, diff, exz); + if(is_align(*exz)) f = 1; + } + } + } + + // fprintf(stderr, "[M::%s] f::%lu, err::%d\n", __func__, f, exz->err); + + uint32_t on; uint16_t op; + if(f) { + + for (ck = qk = tk = 0; ck < exz->cigar.n;) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace(&(exz->cigar), ck, &op, &on); + if(op!=2) qk += on; + if(op!=3) tk += on; + wq[1] = qk; wt[1] = tk; + + if(op == 0) { + push_trace_bp_f(sc, op, (uint16_t)-1, (uint16_t)-1, on, 1); + } else if(op == 1) { + for (k = 0; k < 
on; k++) { + push_trace_bp_f(sc, op, seq_nt6_table[(uint32_t)(qstr[wq[0]+k])], seq_nt6_table[(uint32_t)(tstr[wt[0]+k])], 1, 1); + } + } else if(op == 2) { + for (k = 0; k < on; k++) { + push_trace_bp_f(sc, op, (uint16_t)-1, seq_nt6_table[(uint32_t)(tstr[wt[0]+k])], 1, 1); + } + } else if(op == 3) { + for (k = 0; k < on; k++) { + push_trace_bp_f(sc, op, seq_nt6_table[(uint32_t)(qstr[wq[0]+k])], (uint16_t)-1, 1, 1); + } + } + } + } else { + if(ql > 0) { + op = 3; on = ql; + for (k = 0; k < on; k++) { + push_trace_bp_f(sc, op, seq_nt6_table[(uint32_t)(qstr[k])], (uint16_t)-1, 1, 1); + } + } + + if(tl > 0) { + op = 2; on = tl; + for (k = 0; k < on; k++) { + push_trace_bp_f(sc, op, (uint16_t)-1, seq_nt6_table[(uint32_t)(tstr[k])], 1, 1); + } + } + + } +} + +void cal_updated_trace_len(asg16_v *sc, uint64_t *ql, uint64_t *tl) +{ + uint64_t ck = 0, qk = 0, tk = 0; uint32_t len; uint16_t c, bq, bt; + while (ck < sc->n) { + ck = pop_trace_bp_f(sc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + } + *ql = qk; *tl = tk; +} + +void gen_updated_trace(asg16_v *qcc, asg16_v *tcc, asg16_v *tcc_res, char *qstr, uint64_t ql, char *tstr, uint64_t tl, asg64_v *srt, bit_extz_t *exz, uint64_t rid) +{ + uint64_t k, ck, qk, tk, wq[2], wt[2], old_dp, dp, s, e, srt_n, si, ei, so, os, oe, *qd, *td, qs, qe, ts, te, q0, t0; + asg16_v *cc; uint32_t len; uint16_t c, bq, bt; + + srt->n = 0; + + cc = qcc; + ck = qk = tk = wq[0] = wq[1] = wt[0] = wt[1] = 0; + while (ck < cc->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(cc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + wq[1] = qk; wt[1] = tk; + if(c != 0) continue; + + kv_push(uint64_t, (*srt), (wq[0]<<1)); + kv_push(uint64_t, (*srt), ((wq[1]<<1)|1)); + } + + cc = tcc; + ck = qk = tk = wq[0] = wq[1] = wt[0] = wt[1] = 0; + while (ck < cc->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(cc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + wq[1] = qk; wt[1] = 
tk; + if(c != 0) continue; + + kv_push(uint64_t, (*srt), (wt[0]<<1)); + kv_push(uint64_t, (*srt), ((wt[1]<<1)|1)); + } + + // fprintf(stderr, "[M::%s] rid::%lu, ql::%lu, tl::%lu\n", __func__, rid, ql, tl); + + radix_sort_ec64(srt->a, srt->a + srt->n); + for (k = 0, dp = e = srt_n = 0, s = (uint64_t)-1; k < srt->n; k++) { + old_dp = dp; + //if a[k] is qe + if (srt->a[k]&1) --dp; + else ++dp; + + if(old_dp >= 2 && s != (uint64_t)-1) { + e = srt->a[k]>>1; + if(e > s) srt->a[srt_n++] = ((s<<32)|(e)); + } + + s = (uint64_t)-1; + if(dp >= 2) s = srt->a[k]>>1; + // if (old_dp < 2 && dp >= 2) {///old_dp < dp, a[k] is qs + // s = srt->a[k]>>1; + // } else if (old_dp >= 2 && dp < 2) {///old_dp > dp, a[k] is qe + // e = srt->a[k]>>1; + // if(e > s) { + // srt->a[srt_n++] = ((s<<32)|(e)); + // fprintf(stderr, "[M::%s] x::[%lu,\t%lu)\n", __func__, s, e); + // } + // } + } + + + // for (k = 0; k < srt_n; k++) { + // fprintf(stderr, "[M::%s] x[%lu]::[%lu,\t%u)\n", __func__, k, srt->a[k]>>32, (uint32_t)srt->a[k]); + // } + + + if(srt_n > 0) { + srt->n = srt_n; + + cc = qcc; + k = ck = qk = tk = wq[0] = wq[1] = wt[0] = wt[1] = 0; + while (ck < cc->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(cc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + wq[1] = qk; wt[1] = tk; + if(c != 0) continue; + si = wq[0]; ei = wq[1]; + so = wt[0]; ///eo = wt[1]; + + for (; (k > 0) && ((k >= srt_n) || (((uint32_t)(srt->a[k])) > si)); k--); + for (; k < srt_n; k++) { + s = srt->a[k]>>32; e = (uint32_t)(srt->a[k]); + if(s >= ei) break; + if(s >= si && e <= ei) { + os = so + s - si; + oe = so + e - si; + kv_push(uint64_t, (*srt), ((os<<32)|(oe))); + } + } + } + assert(srt->n == (srt_n<<1)); + + cc = tcc; + k = ck = qk = tk = wq[0] = wq[1] = wt[0] = wt[1] = 0; + while (ck < cc->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(cc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + wq[1] = qk; wt[1] = tk; + if(c != 0) continue; + si = wt[0]; ei = 
wt[1]; + so = wq[0]; ///eo = wq[1]; + + for (; (k > 0) && ((k >= srt_n) || (((uint32_t)(srt->a[k])) > si)); k--); + for (; k < srt_n; k++) { + s = srt->a[k]>>32; e = (uint32_t)(srt->a[k]); + // fprintf(stderr, "######[M::%s] t[%lu]::[%lu,\t%lu) i::[%lu,\t%lu)\n", __func__, k, s, e, si, ei); + if(s >= ei) break; + if(s >= si && e <= ei) { + os = so + s - si; + oe = so + e - si; + kv_push(uint64_t, (*srt), ((os<<32)|(oe))); + // fprintf(stderr, "[M::%s] ******\n", __func__); + } + } + } + // if(!(srt->n == (srt_n*3))) { + // fprintf(stderr, "[M::%s] rid::%lu, srt_n::%lu, srt->n::%lu\n", __func__, rid, srt_n, (uint64_t)srt->n); + // } + assert(srt->n == (srt_n*3)); + + + tcc_res->n = 0; ///reset tcc + qd = srt->a + srt_n; td = srt->a + srt_n + srt_n; uint64_t nl = 0;///, dbg_ql, dbg_tl; + for (k = q0 = t0 = 0; k < srt_n; k++) { + qs = qd[k]>>32; qe = (uint32_t)qd[k]; + ts = td[k]>>32; te = (uint32_t)td[k]; + nl += qs - qe; + // assert((qe - qs) == (te - ts)); + // assert(!memcmp(qstr + qs, tstr + ts, sizeof((*qstr))*(qe - qs))); + + // if(t0 > ts || q0 > qs) { + // fprintf(stderr, "[M::%s] rid::%lu, ql::%lu, tl::%lu\n", __func__, rid, ql, tl); + // } + // fprintf(stderr, "[M::%s] qseq::[%lu,%lu), ql::%lu, tseq::[%lu,%lu), tl::%lu\n", __func__, t0, ts, tl, q0, qs, ql); + + gen_cc_fly(tcc_res, tstr + t0, ts - t0, qstr + q0, qs - q0, exz, 0.25, MAX_SIN_L, MAX_SIN_E); + + // cal_updated_trace_len(tcc_res, &dbg_ql, &dbg_tl); + // assert(dbg_ql == ts && dbg_tl == qs); + + // fprintf(stderr, "******\n"); + + push_trace_bp_f(tcc_res, 0, (uint16_t)-1, (uint16_t)-1, qe - qs, 1); + + // cal_updated_trace_len(tcc_res, &dbg_ql, &dbg_tl); + // assert(dbg_ql == te && dbg_tl == qe); + + q0 = qe; t0 = te; + } + + qs = ql; ts = tl; + // fprintf(stderr, "[M::%s] qseq::[%lu,%lu), ql::%lu, tseq::[%lu,%lu), tl::%lu\n", __func__, t0, ts, tl, q0, qs, ql); + gen_cc_fly(tcc_res, tstr + t0, ts - t0, qstr + q0, qs - q0, exz, 0.25, MAX_SIN_L, MAX_SIN_E); + // if(!(dbg_ql == ts && dbg_tl == qs)) { 
+ // fprintf(stderr, "[M::%s] rid::%lu, qseq::[%lu,%lu), ql::%lu, tseq::[%lu,%lu), tl::%lu\n", __func__, rid, t0, ts, tl, q0, qs, ql); + // } + // cal_updated_trace_len(tcc_res, &dbg_ql, &dbg_tl); + // assert(dbg_ql == ts && dbg_tl == qs); + + // fprintf(stderr, "[M::%s] srt_n::%lu, nl::%lu, ql::%lu, tl::%lu\n", __func__, srt_n, nl, ql, tl); + } else { + gen_cc_fly(tcc_res, tstr, tl, qstr, ql, exz, 0.25, MAX_SIN_L, MAX_SIN_E); + } +} + +void update_scb(All_reads *R_INF, asg16_v *scc, asg16_v *scb, asg16_v *scb_res, UC_Read *qu, UC_Read *tu, asg64_v *srt, bit_extz_t *exz, uint64_t rid) +{ + char *qstr = NULL, *tstr = NULL; uint64_t ql = 0, tl = 0; + uint64_t ck, qk, tk, k, wq[2], wt[2]; uint32_t len; uint16_t c, bq, bt; + gen_ori_seq0(qu->seq, qu->length, tu, scb, rid); ///tstr = tu->seq; tl = tu->length; + + ck = qk = tk = 0; ql = qu->length; + while (ck < scc->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(scc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + wq[1] = qk; wt[1] = tk; + } + assert(qk == ql); + tl = tk; resize_UC_Read(qu, ql + tl); + qstr = qu->seq; tstr = qu->seq + ql; + + ck = 0; qk = tk = 0; + while (ck < scc->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(scc, ck, &c, &bq, &bt, &len); + if(c != 2) qk += len; + if(c != 3) tk += len; + wq[1] = qk; wt[1] = tk; + // if(xk > (uint32_t)p->z.length) fprintf(stderr, "[M::%s] xk::%u, len::%u, c::%u, rid::%ld\n", __func__, xk, (uint32_t)p->z.length, c, i); + if(c == 0) { + memcpy(tstr + wt[0], qstr + wq[0], (wq[1]-wq[0])*sizeof((*qstr))); + } else if(c == 1 || c == 2) { + for (k = wt[0]; k < wt[1]; k++) tstr[k] = s_H[bt]; + } + // if(i == 700) fprintf(stderr, "|%u%c(%c)(x::%u)(y::%u)", len, cm[c], ((c==1)||(c==2))?(cc[b]):('*'), wx[1], wy[1]); // s_H + } + + qstr = tstr; ql = tl; + tstr = tu->seq; tl = tu->length; + + // fprintf(stderr, "\n[M::%s] ql::%lu, tl::%lu, rid::%lu\n", __func__, ql, tl, rid); + + gen_updated_trace(scc, scb, scb_res, qstr, ql, tstr, tl, srt, 
exz, rid); + + + + ///debug + // resize_UC_Read(tu, ql + tl); + // memcpy(tu->seq + tl, qstr, ql); tstr = tu->seq; qstr = tu->seq + tl; + + // resize_UC_Read(qu, ql + tl); + // memcpy(qu->seq, tu->seq, ql + tl); tstr = qu->seq; qstr = qu->seq + tl; + + // gen_ori_seq0(qstr, ql, tu, scb_res, rid); + // assert(memcmp(tstr, tu->seq, tl) == 0); + +} + +uint32_t is_well_cal(asg64_v *idx, ma_hit_t_alloc *in0, ma_hit_t_alloc *in1, int64_t ql, int64_t occ_exact) +{ + ma_hit_t_alloc *paf = NULL; uint64_t k, s, e, vn; ma_hit_t *z; idx->n = 0; + int64_t dp, old_dp, st = 0, ed; + + paf = in0; + for (k = 0; k < paf->length; k++) { + z = &(paf->buffer[k]); + s = ((uint32_t)(z->qns)); e = z->qe; + kv_push(uint64_t, (*idx), (s<<1)); + kv_push(uint64_t, (*idx), (e<<1)|1); + } + + paf = in1; + for (k = 0; k < paf->length; k++) { + z = &(paf->buffer[k]); + s = ((uint32_t)(z->qns)); e = z->qe; + kv_push(uint64_t, (*idx), (s<<1)); + kv_push(uint64_t, (*idx), (e<<1)|1); + } + + radix_sort_ec64(idx->a, idx->a + idx->n); vn = idx->n; + for (k = 0, dp = 0, st = ed = 0; k < vn; ++k) { + old_dp = dp; + ///if a[j] is qe + if (idx->a[k]&1) --dp; + else ++dp; + + ed = idx->a[k]>>1; + if((ed > st) && ((old_dp + 1) < occ_exact)) return 0;///+1 for self + + st = ed; + } + + + ed = ql; old_dp = dp; + if((ed > st) && ((old_dp + 1) < occ_exact)) return 0;///+1 for self + + return 1; +} + +static void worker_hap_dc_ec0(void *data, long i, int tid) +{ + // if(i == 6) fprintf(stderr, "-mm-[M::%s]\tqn::%u::%.*s\tf[i]::%u\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i), scc.f[i]); + if(scc.f[i]) { + scc.a[i].n = 0; sca.a[i].n = 0; + // push_trace_bp(&(scc.a[i]), 0, (uint16_t)-1, Get_READ_LENGTH(R_INF, i), 0); + push_trace_bp_f(&(scc.a[i]), 0, (uint16_t)-1, (uint16_t)-1, Get_READ_LENGTH(R_INF, i), 0); + return; + } + ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); + uint32_t high_occ = asm_opt.hom_cov * (2.0 - HA_KMER_GOOD_RATIO); + uint32_t low_occ = 
asm_opt.hom_cov * HA_KMER_GOOD_RATIO; + asg64_v buf0; overlap_region *aux_o = NULL; uint32_t qlen = 0; + // overlap_region *aux_o = NULL; asg64_v buf0; + + // gen_ovlst_paf(&(R_INF.paf[i]), &(R_INF.reverse_paf[i]), &(b->v64)); + // if(i != 181) return; + + // fprintf(stderr, "-mm-[M::%s]\tqn::%u::%.*s\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i)); + + recover_UC_Read(&b->self_read, &R_INF, i); qlen = b->self_read.length; + + + /** + if(is_well_cal(&b->v64, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), b->self_read.length, 4)) { + // aux_o = h_ec_lchain_re1(b->ab, i, &b->self_read, &b->ovlp_read, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, &b->exz, &b->v16, &b->v64, 0.02, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, UINT32_MAX, &(R_INF.paf[i]), &(R_INF.reverse_paf[i])); + aux_o = h_ec_lchain_re2(b->ab, i, &b->self_read, &b->ovlp_read, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, &b->exz, &b->v16, &b->v64, 0.02, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, UINT32_MAX, &(R_INF.paf[i]), &(R_INF.reverse_paf[i])); + } else { + aux_o = h_ec_lchain_re3(b->ab, i, &b->self_read, &b->ovlp_read, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, &b->exz, &b->v16, &b->v64, 0.02, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, UINT32_MAX, &(R_INF.paf[i]), &(R_INF.reverse_paf[i])); + // fprintf(stderr, "[M::%s]\tqn::%u::%.*s\n", __func__, (uint32_t)(i), (int)Get_NAME_LENGTH(R_INF, i), Get_NAME((R_INF), i)); + } + **/ + aux_o = h_ec_lchain_re2(b->ab, i, &b->self_read, &b->ovlp_read, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, &b->exz, &b->v16, &b->v64, 0.02, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, UINT32_MAX, &(R_INF.paf[i]), &(R_INF.reverse_paf[i])); + + ////for debug + // scc.a[i].n = 0; + // push_trace_bp(&(scc.a[i]), 0, (uint16_t)-1, Get_READ_LENGTH(R_INF, i), 0); + + // return; + + // aux_o = 
h_ec_lchain_re(b->ab, i, b->self_read.seq, b->self_read.length, &b->ovlp_read, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, &b->exz, &b->v16, 0.02, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, UINT32_MAX, &(R_INF.paf[i]), &(R_INF.reverse_paf[i])); + + b->cnt[0] += b->self_read.length; + + copy_asg_arr(buf0, b->sp); + rphase_hc(&b->olist, &R_INF, &b->hap, &b->self_read, &b->ovlp_read, &b->pidx, &b->v64, &buf0, 0, WINDOW_MAX_SIZE, b->self_read.length, 1/**, 1**/, i); + copy_asg_arr(b->sp, buf0); + + copy_asg_arr(buf0, b->sp); + b->cnt[1] += wcns_gen(&b->olist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, &b->pidx, &b->v64, &buf0, 0, 512, b->self_read.length, 3, 0.500001, aux_o, &b->v32, &b->cns, 256, i); + copy_asg_arr(b->sp, buf0); + + push_nec_re(aux_o, &(scc.a[i])); + update_scb(&R_INF, &(scc.a[i]), &(scb.a[i]), &(sca.a[i]), &b->self_read, &b->ovlp_read, &b->v64, &b->exz, i); + + push_ne_ovlp(&(R_INF.paf[i]), &b->olist, 1, &R_INF, &(scc.a[i])/**, i, &b->self_read, &b->ovlp_read**/); + push_ne_ovlp(&(R_INF.reverse_paf[i]), &b->olist, 2, &R_INF, NULL/**, i, NULL, NULL**/); + + check_well_cal(&(scc.a[i]), &b->v64, &(R_INF.paf[i].is_fully_corrected), &(R_INF.paf[i].is_abnormal), qlen, (MIN_COVERAGE_THRESHOLD*2), &(R_INF.paf[i])); + R_INF.trio_flag[i] = AMBIGU; + + refresh_ec_ovec_buf_t0(b, REFRESH_N); +} + +void get_origin_ec_coor(asg16_v *ec, uint64_t *ts, uint64_t *te) +{ + uint64_t ts0 = *ts, te0 = *te, qk = 0, tk = 0, ck = 0, wq[2], wt[2]; uint16_t op, bq, bt, f = 0; uint32_t cl; + while (ck < ec->n) { + wq[0] = qk; wt[0] = tk; + ck = pop_trace_bp_f(ec, ck, &op, &bq, &bt, &cl); + if(op != 2) qk += cl; + if(op != 3) tk += cl; + wq[1] = qk; wt[1] = tk; + if((op == 0) && (wt[0] <= ts0) && (wt[1] >= te0)) { + (*ts) = wq[0] + ts0 - wt[0]; + (*te) = wq[0] + te0 - wt[0]; + f = 1; + break; + } + } + assert(f); +} + +static void update_scb0(void *data, long i, int tid) +{ + if(sca.a[i].n) { + kv_resize(uint16_t, scb.a[i], sca.a[i].n); 
scb.a[i].n = sca.a[i].n; + memcpy(scb.a[i].a, sca.a[i].a, scb.a[i].n*sizeof((*(sca.a[i].a)))); + } + + // return; - if((oc[0] > (oc[1]*occ_exact)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1)) { - ///note: there might be insertions at q[k-1, k], insead if q[k, k+1] - fI = 1; - ///make sure there is no insertion - //+1 for the reference read - oc[0] = (occ->idx->a[(k<<1)+1]>>32) + 1; - oc[1] = ((uint32_t)occ->idx->a[(k<<1)+1]) + 1; - if(((oc[0] > (oc[1]*occ_exact)) && (oc[0] > (oc[1]-oc[0])) && (oc[1] >= occ_tot) && (oc[0] > 1))) fI = 0; - if(fI) { - // fprintf(stderr, "-1-p::%lu\toc[0]::%lu\toc[1]::%u\tgoc[0]::%lu\tgoc[1]::%u\n", s + k, (occ->idx->a[(k<<1)]>>32) + 1, ((uint32_t)occ->idx->a[(k<<1)]) + 1, (occ->idx->a[(k<<1)+1]>>32) + 1, ((uint32_t)occ->idx->a[(k<<1)+1]) + 1); - if(oe > os && os != ((uint64_t)-1)) {///push previous intervals - push_cns_anchor(ol, rref, os, oe, tu, occ, aux_o, 0, occ_tot, occ_exact, b32); - } - os = oe = (uint64_t)-1; + if(!scc.f[i]) return; + + ma_hit_t_alloc *ov, *os; uint64_t k, kr, qn, tn, ql, tl, qs, qe, ts, te; ma_hit_t *z, *r; + uint64_t ck; uint16_t op, bq, bt; uint32_t cl; + + ov = &(R_INF.paf[i]); + for (k = 0; k < ov->length; k++) { + z = &(ov->buffer[k]); + qn = z->qns>>32; tn = z->tn; + if(scc.f[tn]) continue; + + os = &(R_INF.paf[tn]); + for (kr = 0; kr < os->length; kr++) { + if(os->buffer[kr].tn == qn) { + r = &(os->buffer[kr]); + break; } + } + if(kr >= os->length) continue; + // if(!(r->el)) continue; - //+1 for the reference read - oc[0] = (occ->idx->a[(k<<1)]>>32) + 1; - oc[1] = ((uint32_t)occ->idx->a[(k<<1)]) + 1; - if((s+k) == oe) { - oe++; - } else { - if(oe > os && os != ((uint64_t)-1)) {///push previous intervals - push_cns_anchor(ol, rref, os, oe, tu, occ, aux_o, 0, occ_tot, occ_exact, b32); - } - os = s+k; oe = s+k+1; + qs = r->ts; qe = r->te; + ts = (uint32_t)r->qns; te = r->qe; + z->el = r->el; z->rev = r->rev; z->ml = r->ml; z->no_l_indel = r->no_l_indel; + if(z->el) { + 
get_origin_ec_coor(&(scc.a[tn]), &ts, &te); + } + + z->qns = qn; + z->qns = z->qns << 32; + if(z->rev) { + ql = Get_READ_LENGTH(R_INF, qn); + tl = ck = 0; + while (ck < scc.a[tn].n) { + ck = pop_trace_bp_f(&(scc.a[tn]), ck, &op, &bq, &bt, &cl); + if(op != 2) tl += cl; } + z->qns = z->qns | (ql - qe); + z->qe = ql - qs; + z->ts = tl - te; + z->te = tl - ts; } else { - // fprintf(stderr, "-2-p::%lu\toc[0]::%lu\toc[1]::%u\tgoc[0]::%lu\tgoc[1]::%u\n", s + k, (occ->idx->a[(k<<1)]>>32) + 1, ((uint32_t)occ->idx->a[(k<<1)]) + 1, (occ->idx->a[(k<<1)+1]>>32) + 1, ((uint32_t)occ->idx->a[(k<<1)+1]) + 1); - if(oe > os && os != ((uint64_t)-1)) {///push previous intervals - push_cns_anchor(ol, rref, os, oe, tu, occ, aux_o, 0, occ_tot, occ_exact, b32); - } - os = oe = (uint64_t)-1; + z->qns = z->qns | qs; + z->qe = qe; + z->ts = ts; + z->te = te; } - occ->idx->a[(k<<1)] = occ->idx->a[(k<<1)+1] = 0; } +} - occ->mms = occ->mme = (uint64_t)-1; - if(oe > os && os != ((uint64_t)-1)) { - occ->mms = os; occ->mme = oe; +void dbg_rsc(char *str0, uint64_t l0, char *str1, uint64_t l1, asg16_v *sc, char *real, uint32_t id) +{ + uint64_t ck, xk, yk, k, wx[2], wy[2]; uint32_t len; uint16_t c, bq, bt; + + ck = xk = yk = 0; + while (ck < sc->n) { + wx[0] = xk; wy[0] = yk; + ck = pop_trace_bp_f(sc, ck, &c, &bq, &bt, &len); + if(c != 2) xk += len; + if(c != 3) yk += len; + wx[1] = xk; wy[1] = yk; } - return rr; -} + assert(xk == l0); + // char cm[4], cc[4]; + // cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + // cc[0] = 'A'; cc[1] = 'C'; cc[2] = 'G'; cc[3] = 'T'; + ck = xk = yk = 0; + while (ck < sc->n) { + wx[0] = xk; wy[0] = yk; + ck = pop_trace_bp_f(sc, ck, &c, &bq, &bt, &len); + if(c != 2) xk += len; + if(c != 3) yk += len; + wx[1] = xk; wy[1] = yk; -void print_debug_ovlp_cigar(overlap_region_alloc* ol, asg64_v* idx, kv_ul_ov_t *c_idx) -{ - uint64_t k, ci; uint32_t cl; ul_ov_t *cp; bit_extz_t ez; uint16_t c; char cm[4]; - cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; - for (k = 0; k 
< idx->n; k++) { - cp = &(c_idx->a[(uint32_t)idx->a[k]]); - fprintf(stderr, "**********[M::%s] tid::%u\t%.*s\twid::%u\tq::[%u, %u)\terr::%d\toerr::%u**********\n", __func__, ol->list[ovlp_id(*cp)].y_id, (int)Get_NAME_LENGTH(R_INF, ol->list[ovlp_id(*cp)].y_id), Get_NAME(R_INF, ol->list[ovlp_id(*cp)].y_id), - ovlp_cur_wid(*cp), ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start, ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1, ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].error, ol->list[ovlp_id(*cp)].non_homopolymer_errors); - set_bit_extz_t(ez, ol->list[ovlp_id(*cp)], ovlp_cur_wid(*cp)); ci = 0; - while (ci < ez.cigar.n) { - ci = pop_trace(&(ez.cigar), ci, &c, &cl); - fprintf(stderr, "%u%c", cl, cm[c]); + if(c == 0) { + memcpy(str0 + wx[0], str1 + wy[0], (wx[1]-wx[0])*sizeof((*str1))); + } else if(c == 1 || c == 3) { + for (k = wx[0]; k < wx[1]; k++) str0[k] = s_H[bq]; } - fprintf(stderr, "\n"); - } + // fprintf(stderr, "%u%c(%c)(x::[%lu,%ld))(y::[%lu,%ld))\n", len, cm[c], ((c==1)||(c==2))?(cc[bt]):('*'), wx[0], wx[1], wy[0], wy[1]); // s_H + } + + if(memcmp(str0, real, l0*sizeof((*str0)))) { + fprintf(stderr, "-0-[M::%s]\tid::%u\n", __func__, id); + // for (k = 0; k < l0 && str0[k] == real[k]; k++); + + // fprintf(stderr, "-0-[M::%s]\tid::%u\tk::%lu\tl0::%lu\tnc::%c\toc::%c\n", __func__, id, k, l0, str0[k], real[k]); + // exit(1); + } else { + // fprintf(stderr, "-1-[M::%s]\tid::%u\n", __func__, id); + } + } -void wcns_gen(overlap_region_alloc* ol, All_reads *rref, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, uint64_t wl, int64_t ql, uint64_t occ_tot, double occ_exact, overlap_region *aux_o, asg32_v* b32) +static void worker_sl_ec(void *data, long i, int tid) { - int64_t on = ol->length, k, i, zwn, q[2]; - uint64_t m, *ra, rn; overlap_region *z; ul_ov_t *cp; + // if(i != 0) return; - for (k = idx->n = c_idx->n = 0; k < on; k++) { - z = &(ol->list[k]); zwn = z->w_list.n; - if((!zwn) || (z->is_match != 
1)) continue; - for (i = 0; i < zwn; i++) { - if(is_ualn_win(z->w_list.a[i])) continue; - q[0] = z->w_list.a[i].x_start; q[1] = z->w_list.a[i].x_end; - q[0] += bd; q[1] -= bd; - if(q[1] >= q[0]) { - m = ((uint64_t)q[0]); m <<= 32; - m += c_idx->n; kv_push(uint64_t, *idx, m); + sl_v *p = &(((sl_v*)data)[tid]); + uint32_t ci = 0, len, xk, yk, wx[2], wy[2], k, Nn, yn = 0, tot_e; uint16_t c, bq, bt; - kv_pushp(ul_ov_t, *c_idx, &cp); - ovlp_id(*cp) = k; ///ovlp id - // ovlp_min_wid(*cp) = i; ///beg id of windows - // ovlp_max_wid(*cp) = i; ///end id of windows - ovlp_cur_wid(*cp) = i; ///cur id of windows - ovlp_cur_xoff(*cp) = z->w_list.a[i].x_start; ///cur xpos - ovlp_cur_yoff(*cp) = z->w_list.a[i].y_start; ///cur xpos - ovlp_cur_ylen(*cp) = 0; - ovlp_cur_coff(*cp) = 0; ///cur cigar off in cur window - ovlp_bd(*cp) = bd; - } - } + + ci = 0; xk = yk = 0; tot_e = 0; + while (ci < scc.a[i].n) { + // ci = pop_trace_bp(&scc.a[i], ci, &c, &b, &len); + ci = pop_trace_bp_f(&scc.a[i], ci, &c, &bq, &bt, &len); + if(c != 3) yk += len; + if(c != 0) tot_e += len; + // fprintf(stderr, "|%u%c(%c)", len, cm[c], ((c==1)||(c==2))?(cc[b]):('*')); // s_H } + if(tot_e == 0) return;///no change - int64_t srt_n = idx->n, s, e, t, rr; i = 0; - radix_sort_ec64(idx->a, idx->a+idx->n); - for (k = 1, i = 0; k < srt_n; k++) { - if (k == srt_n || (idx->a[k]>>32) != (idx->a[i]>>32)) { - if(k - i > 1) { - for (t = i; t < k; t++) { - cp = &(c_idx->a[(uint32_t)idx->a[t]]); - // s = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_start+ovlp_bd(*cp); - // assert(s == (int64_t)(idx->a[i]>>32)); - m = ol->list[ovlp_id(*cp)].w_list.a[ovlp_cur_wid(*cp)].x_end+1-ovlp_bd(*cp); - m <<= 32; m += ((uint32_t)idx->a[t]); idx->a[t] = m; - // fprintf(stderr, "[M::%s] s::%ld\tsi::%lu\n", __func__, s, (idx->a[i]>>32)); - } - radix_sort_ec64(idx->a + i, idx->a + k); + yn = yk; yk++; kv_resize(char, (*p), yk); p->a[yn] = '\0'; + recover_UC_Read(&p->z, &R_INF, i); ///b->z.length + + + // char cm[4], cc[4]; + // 
cm[0] = 'M'; cm[1] = 'S'; cm[2] = 'I'; cm[3] = 'D'; + // cc[0] = 'A'; cc[1] = 'C'; cc[2] = 'G'; cc[3] = 'T'; + + ci = 0; xk = yk = 0; Nn = 0; + while (ci < scc.a[i].n) { + wx[0] = xk; wy[0] = yk; + // ci = pop_trace_bp(&scc.a[i], ci, &c, &b, &len); + ci = pop_trace_bp_f(&scc.a[i], ci, &c, &bq, &bt, &len); + if(c != 2) xk += len; + if(c != 3) yk += len; + wx[1] = xk; wy[1] = yk; + // if(xk > (uint32_t)p->z.length) fprintf(stderr, "[M::%s] xk::%u, len::%u, c::%u, rid::%ld\n", __func__, xk, (uint32_t)p->z.length, c, i); + if(c == 0) { + // memcpy(p->a + wy[0], p->z.seq + wx[0], (wx[1]-wx[0])*sizeof((*(p->a)))); + for (; wx[0] < wx[1]; wx[0]++, wy[0]++) { + p->a[wy[0]] = p->z.seq[wx[0]]; + if(p->a[wy[0]] == 'N') Nn++; + } + } else if(c == 1 || c == 2) { + for (k = wy[0]; k < wy[1]; k++) { + p->a[k] = s_H[bt]; + if(p->a[k] == 'N') Nn++; } - i = k; } + + // if(i == 700) fprintf(stderr, "|%u%c(%c)(x::%u)(y::%u)", len, cm[c], ((c==1)||(c==2))?(cc[b]):('*'), wx[1], wy[1]); // s_H } - print_debug_ovlp_cigar(ol, idx, c_idx); + // if(i == 700) fprintf(stderr, "|\n"); - ///second index - kv_resize(ul_ov_t, *c_idx, (c_idx->n<<1)); - ul_ov_t *idx_a = NULL, *idx_b = NULL; - idx_a = c_idx->a; idx_b = c_idx->a; - memcpy(idx_b, idx_a, c_idx->n * (sizeof((*(idx_a))))); - kv_resize(uint64_t, *buf, ((wl<<1) + idx->n)); buf->n = ((wl<<1) + idx->n); - memcpy(buf->a + (wl<<1), idx->a, idx->n * (sizeof((*(idx->a))))); - memset(buf->a, 0, (wl<<1)*(sizeof((*(idx->a))))); + if (R_INF.read_size[i] < yn) { + R_INF.read_size[i] = yn; + REALLOC(R_INF.read_sperate[i], R_INF.read_size[i]/4+1); + } + R_INF.read_length[i] = yn; + // if(Nn > 0) fprintf(stderr, "[M::%s] Nn->%u\n", __func__, Nn); - cc_idx_t ii_a, ii_b; memset(&ii_a, 0, sizeof(ii_a)); memset(&ii_b, 0, sizeof(ii_b)); + // for (k = 0; k < yn; k++) { + // c = seq_nt6_table[(uint8_t)p->a[k]]; + // if (c >= 4) { + // fprintf(stderr, "[M::%s] Nn->%u, yn::%u, xn::%lld, k::%u, str::%c, c::%u, rid::%ld\n", __func__, Nn, yn, p->z.length, k, 
p->a[k], c, i); + // } + // } - ii_a.c_idx = idx_a; ii_a.idx = idx; ii_a.i = ii_a.i0 = 0; ii_a.srt_n = ii_a.idx->n; ii_a.mms = ii_a.mme = (uint64_t)-1; - ii_b.c_idx = idx_b; ii_b.idx = buf; ii_b.i = ii_b.i0 = (wl<<1); ii_b.srt_n = ii_b.idx->n; ii_b.mms = ii_b.mme = (uint64_t)-1; - s = 0; e = wl; e = ((e<=ql)?e:ql); rr = 0; - aux_o->w_list.n = aux_o->w_list.c.n = 0; ///for cigar - for (; s < ql; ) { - rn = iter_cc_idx_t(ol->list, &ii_a, s, e, rr, 0, &ra); - // debug_inter0(ol->list, ii_a.c_idx, ii_a.idx->a + ii_a.i0, ii_a.srt_n - ii_a.i0, ra, rn, s, e, 0, 1, "-0-"); - rr = wcns_vote(ol->list, rref, qu->seq, tu, ra, rn, s, e, ii_a.c_idx, &ii_b, occ_tot, occ_exact, aux_o, b32); - s += wl; e += wl; e = ((e<=ql)?e:ql); + ///debug + // resize_UC_Read(&p->z, p->z.length * 2); + // dbg_rsc(p->z.seq + p->z.length, p->z.length, p->a, yn, &(scc.a[i]), p->z.seq, i); + + + ha_compress_base(Get_READ(R_INF, i), p->a, yn, &R_INF.N_site[i], Nn); + +} + +uint64_t cal_ec_multiple(ec_ovec_buf_t *b, uint64_t n_thre, uint64_t n_a, uint64_t *r_base) +{ + double tt0 = yak_realtime_0(); + uint64_t k, num_base = 0, num_correct = 0; (*r_base) = 0; + + if(!(scc.a)) { + scc.n = scc.m = n_a; CALLOC(scc.a, n_a); CALLOC(scc.f, n_a); } - if(ii_b.mme > ii_b.mms && ii_b.mms != (uint64_t)-1) { - push_cns_anchor(ol->list, rref, ii_b.mms, ii_b.mme, tu, &ii_b, aux_o, 0, occ_tot, occ_exact, b32); + if(!(scb.a)) { + scb.n = scb.m = n_a; CALLOC(scb.a, n_a); + } + + for (k = 0; k < n_thre; ++k) b->a[k].cnt[0] = b->a[k].cnt[1] = 0; + + kt_for(n_thre, worker_hap_ec, b, n_a);///debug_for_fix + + for (k = 0; k < n_thre; ++k) { + num_base += b->a[k].cnt[0]; + num_correct += b->a[k].cnt[1]; } - push_cns_anchor(ol->list, rref, ql, ql, tu, &ii_b, aux_o, 1, occ_tot, occ_exact, b32); + // fprintf(stderr, "\n[M::%s] # reads->%lu\n", __func__, n_a); + // fprintf(stderr, "[M::%s] # input bases->%lu\n", __func__, num_base); + // fprintf(stderr, "[M::%s] # corrected bases->%lu\n", __func__, num_correct); + // 
fprintf(stderr, "[M::%s::%.3f] running time\n", __func__, yak_realtime_0()-tt0); + fprintf(stderr, "[M::pec::%.3f] # bases: %lu; # corrected bases: %lu\n", yak_realtime_0()-tt0, num_base, num_correct); + + (*r_base) = num_base; + return num_correct; } -static void worker_hap_ec(void *data, long i, int tid) -{ - ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); - uint32_t high_occ = asm_opt.hom_cov * (2.0 - HA_KMER_GOOD_RATIO); - uint32_t low_occ = asm_opt.hom_cov * HA_KMER_GOOD_RATIO; - overlap_region *aux_o = NULL; asg64_v buf0; - // if (memcmp("m64012_190920_173625/7210046/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { - // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); - // } else { - // return; - // } +void ha_print_ovlp_stat_1(ec_ovec_buf_t *b, uint64_t n_thre, uint64_t n_a) +{ + double tt0 = yak_realtime_0(); + uint64_t k, forward, reverse, strong, weak, exact, no_l_indel; - if(i != 596/**1024**/) return; + forward = reverse = strong = weak = exact = no_l_indel = 0; - recover_UC_Read(&b->self_read, &R_INF, i); + ///calculate overlaps + for (k = 0; k < n_thre; ++k) { + b->a[k].cnt[0] = b->a[k].cnt[1] = b->a[k].cnt[2] = b->a[k].cnt[3] = b->a[k].cnt[4] = b->a[k].cnt[5] = 0; + } - h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, 0.02, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, 2, UINT32_MAX); + kt_for(n_thre, worker_hap_dc_ec_gen, b, n_a); - b->num_read_base += b->olist.length; - aux_o = fetch_aux_ovlp(&b->olist);///must be here + for (k = 0; k < n_thre; ++k) { + forward += b->a[k].cnt[0]; + reverse += b->a[k].cnt[1]; + strong += b->a[k].cnt[2]; + weak += b->a[k].cnt[3]; + exact += b->a[k].cnt[4]; + no_l_indel += b->a[k].cnt[5]; + } - gen_hc_r_alin(&b->olist, &b->clist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, aux_o, asm_opt.max_ov_diff_ec, WINDOW_HC, i, E_KHIT/**asm_opt.k_mer_length**/, 1, &b->v16); + 
fprintf(stderr, "[M::%s] # overlaps: %lu\n", __func__, forward); + fprintf(stderr, "[M::%s] # strong overlaps: %lu\n", __func__, strong); + fprintf(stderr, "[M::%s] # weak overlaps: %lu\n", __func__, weak); + fprintf(stderr, "[M::%s] # exact overlaps: %lu\n", __func__, exact); // this seems not right + fprintf(stderr, "[M::%s] # inexact overlaps: %lu\n", __func__, forward - exact); + fprintf(stderr, "[M::%s] # overlaps without large indels: %lu\n", __func__, no_l_indel); + fprintf(stderr, "[M::%s] # reverse overlaps: %lu\n", __func__, reverse); + fprintf(stderr, "[M::%s] # running time: %.3f\n", __func__, yak_realtime_0()-tt0); - // fprintf(stderr, "\n[M::%s] rid::%ld\t%.*s\tlen::%lld\tocc::%lu\n", __func__, i, (int)Get_NAME_LENGTH(R_INF, i), - // Get_NAME(R_INF, i), b->self_read.length, b->olist.length); + // fprintf(stderr, "\n[M::%s] # reads->%lu\n", __func__, n_a); + // fprintf(stderr, "[M::%s] # corrected reads->%lu\n", __func__, rb); + // fprintf(stderr, "[M::%s] # uncorrected reads->%lu\n", __func__, urb); + // fprintf(stderr, "[M::%s::%.3f]\n", __func__, yak_realtime_0()-tt0); +} - b->num_correct_base += b->olist.length; +void ha_print_ovlp_stat_0(ec_ovec_buf_t *b, uint64_t n_thre, uint64_t n_a) +{ + double tt0 = yak_realtime_0(); + uint64_t k, forward, reverse, strong, weak, exact, no_l_indel; - copy_asg_arr(buf0, b->sp); - rphase_hc(&b->olist, &R_INF, &b->hap, &b->self_read, &b->ovlp_read, &b->pidx, &b->v64, &buf0, 0, WINDOW_MAX_SIZE, b->self_read.length, 1); - copy_asg_arr(b->sp, buf0); + forward = reverse = strong = weak = exact = no_l_indel = 0; - copy_asg_arr(buf0, b->sp); - wcns_gen(&b->olist, &R_INF, &b->self_read, &b->ovlp_read, &b->pidx, &b->v64, &buf0, 0, 512, b->self_read.length, 3, 0.500001, aux_o, &b->v32); - copy_asg_arr(b->sp, buf0); + ///calculate overlaps + for (k = 0; k < n_thre; ++k) { + b->a[k].cnt[0] = b->a[k].cnt[1] = b->a[k].cnt[2] = b->a[k].cnt[3] = b->a[k].cnt[4] = b->a[k].cnt[5] = 0; + } + kt_for(n_thre, 
worker_hap_dc_ec_gen_new_idx, b, n_a); - uint32_t k; - for (k = 0; k < b->olist.length; k++) { - if(b->olist.list[k].is_match == 1) b->num_recorrect_base++; + for (k = 0; k < n_thre; ++k) { + forward += b->a[k].cnt[0]; + reverse += b->a[k].cnt[1]; + strong += b->a[k].cnt[2]; + weak += b->a[k].cnt[3]; + exact += b->a[k].cnt[4]; + no_l_indel += b->a[k].cnt[5]; } - // exit(1); + fprintf(stderr, "[M::%s] # overlaps: %lu\n", __func__, forward); + fprintf(stderr, "[M::%s] # strong overlaps: %lu\n", __func__, strong); + fprintf(stderr, "[M::%s] # weak overlaps: %lu\n", __func__, weak); + fprintf(stderr, "[M::%s] # exact overlaps: %lu\n", __func__, exact); // this seems not right + fprintf(stderr, "[M::%s] # inexact overlaps: %lu\n", __func__, forward - exact); + fprintf(stderr, "[M::%s] # overlaps without large indels: %lu\n", __func__, no_l_indel); + fprintf(stderr, "[M::%s] # reverse overlaps: %lu\n", __func__, reverse); + fprintf(stderr, "[M::%s] # running time: %.3f\n", __func__, yak_realtime_0()-tt0); + + // fprintf(stderr, "\n[M::%s] # reads->%lu\n", __func__, n_a); + // fprintf(stderr, "[M::%s] # corrected reads->%lu\n", __func__, rb); + // fprintf(stderr, "[M::%s] # uncorrected reads->%lu\n", __func__, urb); + // fprintf(stderr, "[M::%s::%.3f]\n", __func__, yak_realtime_0()-tt0); +} + +uint64_t cal_sec_ec_multiple(ec_ovec_buf_t *b, uint64_t n_thre, uint64_t n_a, int64_t round) +{ + double tt0 = yak_realtime_0(); + uint64_t k, num_base, num_correct, rb, urb; + num_base = num_correct = 0; + ////counting + rb = urb = 0; + for (k = 0; k < n_thre; ++k) b->a[k].cnt[0] = b->a[k].cnt[1] = 0; + + kt_for(n_thre, worker_hap_dc_ec, b, n_a);///debug_for_fix + for (k = 0; k < n_thre; ++k) { + rb += b->a[k].cnt[0]; urb += b->a[k].cnt[1]; + } - // prt_chain(&b->olist); + if(round >= 0) { + if(!(sca.a)) { + sca.n = sca.m = n_a; CALLOC(sca.a, n_a); + } + ////correct + + for (k = 0; k < n_thre; ++k) b->a[k].cnt[0] = b->a[k].cnt[1] = 0; + + kt_for(n_thre, worker_hap_dc_ec0, b, 
n_a);///debug_for_fix - // ul_map_lchain(b->abl, (uint32_t)-1, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->clist, s->opt->bw_thres, - // s->opt->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2/**0.75**/, 2, 3); + for (k = 0; k < n_thre; ++k) { + num_base += b->a[k].cnt[0]; + num_correct += b->a[k].cnt[1]; + } - /** - int fully_cov, abnormal; - // if(i != 12578) return; - // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); - // if (memcmp("7897e875-76e5-42c8-bc37-94b370c4cc8d", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { - // fprintf(stderr, "[M::%s-beg] rid->%ld\n", __func__, i); - // } else { - // return; + kt_for(n_thre, update_scb0, b, n_a); + } + + if(round >= 0) { + fprintf(stderr, "[M::sec::%.3f] # bases: %lu; # corrected bases: %lu; # reads: %lu; # corrected reads: %lu\n", yak_realtime_0()-tt0, num_base, num_correct, rb, urb); + } else { + fprintf(stderr, "[M::sec::%.3f] # reads: %lu; # corrected reads: %lu\n", yak_realtime_0()-tt0, rb, urb); + } + + // fprintf(stderr, "\n[M::%s] # reads->%lu\n", __func__, n_a); + // fprintf(stderr, "[M::%s] # corrected reads->%lu\n", __func__, rb); + // fprintf(stderr, "[M::%s] # uncorrected reads->%lu\n", __func__, urb); + // if(round >= 0) { + // fprintf(stderr, "[M::%s] # input bases->%lu\n", __func__, num_base); + // fprintf(stderr, "[M::%s] # corrected bases->%lu\n", __func__, num_correct); + // fprintf(stderr, "[M::%s::%.3f] ==> round %ld\n", __func__, yak_realtime_0()-tt0, round); // } + return num_correct; +} - ha_get_candidates_interface(b->ab, i, &b->self_read, &b->olist, &b->olist_hp, &b->clist, - 0.02, asm_opt.max_n_chain, 1, NULL, &b->r_buf, &(R_INF.paf[i]), &(R_INF.reverse_paf[i]), &(b->tmp_region), NULL, &(b->sp)); - clear_Cigar_record(&b->cigar1); - clear_Round2_alignment(&b->round2); +void write_ec_reads(const char *suffix_ou) +{ + uint64_t k, strl; UC_Read qstr, tstr; char *nn = NULL, *str = NULL; + init_UC_Read(&qstr); 
init_UC_Read(&tstr); + MALLOC(nn, strlen(suffix_ou) + strlen(asm_opt.output_file_name) + 36); + sprintf(nn, "%s.%s", asm_opt.output_file_name, suffix_ou); + FILE *ou = fopen(nn, "w"); + free(nn); - correct_overlap(&b->olist, &R_INF, &b->self_read, &b->correct, &b->ovlp_read, &b->POA_Graph, &b->DAGCon, - &b->cigar1, &b->hap, &b->round2, &b->r_buf, &(b->tmp_region.w_list), 0, 1, &fully_cov, &abnormal); + for (k = 0; k < R_INF.total_reads; k++) { + recover_UC_Read(&qstr, &R_INF, k); + if(scb.a) { + gen_ori_seq0(qstr.seq, qstr.length, &tstr, &(scb.a[k]), k); str = tstr.seq; strl = tstr.length; + } else { + str = qstr.seq; strl = qstr.length; + } - b->num_read_base += b->self_read.length; - b->num_correct_base += b->correct.corrected_base; - b->num_recorrect_base += b->round2.dumy.corrected_base; + fwrite(">", 1, 1, ou); + fwrite(Get_NAME(R_INF, k), 1, Get_NAME_LENGTH(R_INF, k), ou); + fwrite("\n", 1, 1, ou); + fwrite(str, 1, strl, ou); + fwrite("\n", 1, 1, ou); + } - push_cigar(R_INF.cigars, i, &b->cigar1); - push_cigar(R_INF.second_round_cigar, i, &b->round2.cigar); + fclose(ou); destory_UC_Read(&qstr); destory_UC_Read(&tstr); +} - R_INF.paf[i].is_fully_corrected = 0; - if (fully_cov) { - if (get_cigar_errors(&b->cigar1) == 0 && get_cigar_errors(&b->round2.cigar) == 0) - R_INF.paf[i].is_fully_corrected = 1; - } - R_INF.paf[i].is_abnormal = abnormal; - R_INF.trio_flag[i] = AMBIGU; - - ///need to be fixed in r305 - // if(ha_idx_hp == NULL) - // { - // R_INF.trio_flag[i] += collect_hp_regions(&b->olist, &R_INF, &(b->k_flag), RESEED_HP_RATE, Get_READ_LENGTH(R_INF, i), NULL); +void cal_ec_r(uint64_t n_thre, uint64_t round, uint64_t n_round, uint64_t n_a, uint64_t is_sv, uint64_t *tot_b, uint64_t *tot_e) +{ + // write_ec_reads("ec0.fa"); + + ec_ovec_buf_t *b = NULL; uint64_t k, is_cr = (round&1); + (*tot_b) = (*tot_e) = 0; + + + b = gen_ec_ovec_buf_t(n_thre); + (*tot_e) += cal_ec_multiple(b, n_thre, n_a, tot_b); ////exit(1); + sl_ec_r(n_thre, n_a); + + for (k = 0; k < 
n_round; k++) { + (*tot_e) += cal_sec_ec_multiple(b, n_thre, n_a, k); + sl_ec_r(n_thre, n_a); + } + + if(is_sv) kt_for(n_thre, worker_hap_dc_ec, b, n_a);///update overlaps + + if((!is_sv) || (is_sv && is_cr)) { + kt_for(n_thre, worker_hap_post_rev, b, n_a); + } + + // cal_sec_ec_multiple(b, n_thre, n_a, -1); + + // gen_sec_ec_multiple(b, n_thre, n_a); + + destroy_ec_ovec_buf_t(b); + + // write_ec_reads("ec16.fa"); + + // uint64_t z; + // for (z = 0; z < scc.n; z++) { + // if(scc.f[z]) continue; + // fprintf(stderr, "[M::%s]\tid::%lu::%.*s\n", __func__, z, (int)Get_NAME_LENGTH(R_INF, z), Get_NAME((R_INF), z)); // } +} - if (R_INF.trio_flag[i] != AMBIGU || b->save_ov) { - int is_rev = (asm_opt.number_of_round % 2 == 0); - push_overlaps(&(R_INF.paf[i]), &b->olist, 1, &R_INF, is_rev); - push_overlaps(&(R_INF.reverse_paf[i]), &b->olist, 2, &R_INF, is_rev); - } +void destroy_cc_v(cc_v *z) +{ + uint64_t k; + for (k = 0; k < z->m; k++) free(z->a[k].a); + free(z->f); free(z->a); + z->n = z->m = 0; z->f = NULL; z->a = NULL; +} - if(het_cnt) het_cnt[i] = get_het_cnt(&b->hap); - // fprintf(stderr, "[M::%s-end] rid->%ld\n", __func__, i); - **/ +void cal_ov_r(uint64_t n_thre, uint64_t n_a, uint64_t new_idx) +{ + ec_ovec_buf_t *b = NULL; + b = gen_ec_ovec_buf_t(n_thre); + if(new_idx) { + // kt_for(n_thre, worker_hap_dc_ec, b, n_a);///update overlaps + destroy_cc_v(&scc); destroy_cc_v(&scb); destroy_cc_v(&sca); + + ha_print_ovlp_stat_0(b, n_thre, n_a); + } else { + ha_print_ovlp_stat_1(b, n_thre, n_a); + destroy_cc_v(&scc); destroy_cc_v(&scb); destroy_cc_v(&sca); + } + + destroy_ec_ovec_buf_t(b); } -void cal_ec_multiple(ec_ovec_buf_t *b, uint64_t n_thre, uint64_t n_a) +void sl_ec_r(uint64_t n_thre, uint64_t n_a) { - double tt0 = yak_realtime_0(); - kt_for(n_thre, worker_hap_ec, b, n_a);///debug_for_fix - fprintf(stderr, "[M::%s-reads] #->%lu\n", __func__, n_a); - fprintf(stderr, "[M::%s::%.3f] ==> chaining\n", __func__, yak_realtime_0()-tt0); + sl_v *b = NULL; uint64_t k; 
MALLOC(b, n_thre); + for (k = 0; k < n_thre; k++) { + b[k].a = NULL; b[k].n = b[k].m = 0; + init_UC_Read(&b[k].z); + } + + kt_for(n_thre, worker_sl_ec, b, n_a);///debug_for_fix + + for (k = 0; k < n_thre; k++) { + free(b[k].a); destory_UC_Read(&b[k].z); + } + free(b); } \ No newline at end of file diff --git a/ecovlp.h b/ecovlp.h index 2b30733..fe5cc89 100644 --- a/ecovlp.h +++ b/ecovlp.h @@ -5,60 +5,59 @@ #include #include "Hash_Table.h" #include "Process_Read.h" +#include "kdq.h" + +KDQ_INIT(uint32_t) typedef struct { uint32_t v:31, f:1; uint32_t sc; } cns_arc; -typedef struct {size_t n, m; cns_arc *a;} cns_arc_v; +typedef struct {size_t n, m, nou; cns_arc *a; } cns_arc_v; typedef struct { // uint16_t c:2, t:2, f:1, sc:3; uint32_t c:2, f:1, sc:29; - cns_arc_v in, ou; + cns_arc_v arc; }cns_t; typedef struct { size_t n, m; cns_t *a; - uint32_t si, ei; + uint32_t si, ei, off, bn, bb0, bb1, cns_g_wl; + kdq_t(uint32_t) *q; }cns_gfa; typedef struct { - int is_final, save_ov; // chaining and overlapping related buffers UC_Read self_read, ovlp_read; Candidates_list clist; overlap_region_alloc olist; - overlap_region tmp; ha_abuf_t *ab; - // error correction related buffers - int64_t num_read_base, num_correct_base, num_recorrect_base; - Cigar_record cigar; - Correct_dumy correct; + // int64_t num_read_base, num_correct_base, num_recorrect_base; + uint64_t cnt[6], rr; haplotype_evdience_alloc hap; bit_extz_t exz; - // asg32_v v32; kv_ul_ov_t pidx; asg64_v v64; asg32_v v32; asg16_v v16; - kvec_t_u64_warp r_buf; kvec_t_u8_warp k_flag; st_mt_t sp; - cns_gfa cns; - + cns_gfa cns; } ec_ovec_buf_t0; typedef struct { ec_ovec_buf_t0 *a; - uint32_t n; + uint32_t n, rev; } ec_ovec_buf_t; -ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n, uint32_t is_final, uint32_t save_ov); +ec_ovec_buf_t* gen_ec_ovec_buf_t(uint32_t n); void destroy_ec_ovec_buf_t(ec_ovec_buf_t *p); -void cal_ec_multiple(ec_ovec_buf_t *b, uint64_t n_thre, uint64_t n_a); void prt_chain(overlap_region_alloc *o); +void 
cal_ec_r(uint64_t n_thre, uint64_t round, uint64_t n_round, uint64_t n_a, uint64_t is_sv, uint64_t *tot_b, uint64_t *tot_e); +void sl_ec_r(uint64_t n_thre, uint64_t n_a); +void cal_ov_r(uint64_t n_thre, uint64_t n_a, uint64_t new_idx); #endif \ No newline at end of file diff --git a/htab.h b/htab.h index 9f7c83d..d17f812 100644 --- a/htab.h +++ b/htab.h @@ -125,6 +125,7 @@ int adj_m_peak_hom(int m_peak_hom, int max_i, int max2_i, int max3_i, int *peak_ void print_hist_lines(int n_cnt, int start_cnt, const int64_t *cnt); void debug_adapter(const hifiasm_opt_t *asm_opt, All_reads *rs); + inline int mz_low_b(int peak_hom, int peak_het) { int low_freq = 2;