From 49dd83df2a7196dce395bb342c1c78cc553d71e1 Mon Sep 17 00:00:00 2001 From: chhylp123 Date: Sun, 13 Oct 2024 18:47:05 -0400 Subject: [PATCH] fix memory issue --- CommandLines.h | 2 +- Correct.cpp | 21 ++++++- Hash_Table.cpp | 22 ++++--- anchor.cpp | 8 +-- ecovlp.cpp | 165 ++++++++++++++++++++++++++++++++++++++++++++++--- htab.cpp | 50 ++++++++++++--- 6 files changed, 235 insertions(+), 33 deletions(-) diff --git a/CommandLines.h b/CommandLines.h index 894beae..2c121cd 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -5,7 +5,7 @@ #include #include -#define HA_VERSION "0.20.0-r631" +#define HA_VERSION "0.20.0-r639" #define VERBOSE 0 diff --git a/Correct.cpp b/Correct.cpp index 3fb7bfa..9ed26c9 100644 --- a/Correct.cpp +++ b/Correct.cpp @@ -16753,12 +16753,14 @@ char* qstr, char* tstr, bit_extz_t *exz, double e_rate, int64_t ql, int64_t tl, uint64_t gen_hc_fast_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, All_reads *rref, char* qstr, UC_Read *tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t ql, uint64_t rid, int64_t h_khit, int64_t *re) { int64_t ch_idx = z->shared_seed, ch_n; - int64_t i, tl, id = z->y_id, m, tot_e, aln; + int64_t i, tl, id = z->y_id, m, tot_e, aln, xe, ye; k_mer_hit *ch_a = cl->list + ch_idx; tl = Get_READ_LENGTH((*rref), id); for (i = ch_idx; i < cl->length && cl->list[i].readID == cl->list[ch_idx].readID; i++); ch_n = i-ch_idx; if(ch_n <= 0) return 0; - + + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); // fprintf(stderr, "[M::%s::rid->%ld] utg%.6dl(%c), z::[%u, %u)\n", // __func__, rid, (int32_t)z->y_id+1, "+-"[z->y_pos_strand], z->x_pos_s, z->x_pos_e+1); @@ -16799,7 +16801,9 @@ uint64_t gen_hc_fast_cigar0(overlap_region *z, Candidates_list *cl, uint64_t wl, aux_n = z->w_list.n; for (i = tot_e = aln = 0; i < aux_n; i++) { if(is_ualn_win(z->w_list.a[i])) { - tot_e += z->w_list.a[i].x_end + 1 - z->w_list.a[i].x_start; + xe = z->w_list.a[i].x_end + 1 - z->w_list.a[i].x_start; + ye = z->w_list.a[i].y_end + 1 - z->w_list.a[i].y_start; + tot_e += ((xe >= ye)?(xe):(ye)); } else { tot_e += z->w_list.a[i].error; aln += z->w_list.a[i].x_end + 1 - z->w_list.a[i].x_start; } @@ -23495,6 +23499,9 @@ void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads resize_UC_Read(tu, bs<<1); // fprintf(stderr, "[M::%s] window_length::%lld\n", __func__, w.window_length); + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); + for (i = k = 0; i < ol->length; i++) { z = &(ol->list[i]); if(z->is_match != 1) { @@ -23514,11 +23521,19 @@ void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads if (rr > err) continue; z->non_homopolymer_errors = re; + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); + if(!gen_hc_fast_cigar(z, cl, rref, w.window_length, qu->seq, tu, exz, aux_o, e_rate, ql, rid, khit, &re)) continue; // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re); + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); reassign_gaps(z, aux_o, qu->seq, ql, NULL, -1, rref, tu, buf); + + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); } // if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-2-[M::%s] tid::%u\t%.*s\trr::%f\tre::%u\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, z->non_homopolymer_errors); diff --git a/Hash_Table.cpp b/Hash_Table.cpp index 885027a..c39c542 100644 --- a/Hash_Table.cpp +++ b/Hash_Table.cpp @@ -2108,7 +2108,7 @@ uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, Chain_Data* dp, overlap_region_alloc* res, int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, uint32_t xid, int64_t xl, int64_t yl, int64_t quick_check, uint32_t apend_be, - int64_t gen_cigar, int64_t enable_mcopy, double mcopy_rate, int64_t mcopy_khit_cutoff, + int64_t gen_cigar, int64_t mcopy_num, double mcopy_rate, int64_t mcopy_khit_cutoff, int64_t khit_n) { if(a_n <= 0) return 0; @@ -2186,19 +2186,21 @@ uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, } for (i = msc_i, cL = 0; i >= 0; i = p[i]) { ii[i] = 1; t[cL++] = i;}///label the best chain - if((movl < xl) && enable_mcopy/**(movl < yl)**/) { + + + if(mcopy_num > 1) { if(cL >= mcopy_khit_cutoff) {///if there are too few k-mers, disable mcopy msc -= plus; min_sc = msc*mcopy_rate/**0.2**/; ii[msc_i] = 0; for (i = ch_n = 0; i < a_n; ++i) {///make all f[] positive f[i] -= plus; if(i >= ch_n) t[i] = 0; - if((!(ii[i])) && (f[i] >= min_sc)) { + if((!(ii[i])) && (f[i] >= min_sc)) {///!(ii[i]): skip the best chain t[ch_n] = ((uint64_t)f[i])<<32; t[ch_n] += (i<<1); ch_n++; } } if(ch_n > 1) { int64_t n_v, n_v0, ni, n_u, n_u0 = res->length; radix_sort_hc64i(t, t + ch_n); - for (k = ch_n-1, n_v = n_u = 0; k >= 0; --k) { + for (k = ch_n-1, n_v = n_u = 0; k >= 0 && n_u < mcopy_num; --k) { n_v0 = n_v; for (i = ((uint32_t)t[k])>>1; i >= 0 && (t[i]&1) == 0; ) { ii[n_v++] = i; t[i] |= 1; i = p[i]; @@ -2209,10 +2211,11 @@ uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, kv_pushp_ol(overlap_region, (*res), &z); push_ovlp_chain_qgen(z, xid, xl, yl, sc+plus, &(a[ii[n_v-1]]), &(a[ii[n_v0]])); ///mcopy_khit_cutoff <= 1: disable the mcopy_khit_cutoff filtering, for the realignment - if((mcopy_khit_cutoff <= 1) || ((z->x_pos_e+1-z->x_pos_s) <= (movl<<2))) { + // if((mcopy_khit_cutoff <= 1) || ((z->x_pos_e+1-z->x_pos_s) <= (movl<<2))) { + if((!n_u) || (n_v - n_v0 > 1)) { z->align_length = n_v-n_v0; z->x_id = n_v0; n_u++; - } else {///non-best is too long + } else {///non-best is tiny res->length--; n_v = n_v0; } } else { @@ -2220,8 +2223,8 @@ uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, } } - if(n_u > 1) ks_introsort_or_sss(n_u, res->list + n_u0); - res->length = n_u0 + filter_non_ovlp_xchains(res->list + n_u0, n_u, &n_v); + // if(n_u > 1) ks_introsort_or_sss(n_u, res->list + n_u0); + // res->length = n_u0 + filter_non_ovlp_xchains(res->list + n_u0, n_u, &n_v); n_u = res->length; if(n_u > n_u0 + 1) { kv_resize_cl(k_mer_hit, (*cl), (n_v+cl->length)); @@ -2264,6 +2267,9 @@ uint64_t lchain_qdp_mcopy_fast(Candidates_list *cl, int64_t a_idx, int64_t a_n, } } } + + + ///a[] has been sorted by self_offset // i = msc_i; cL = 0; // while (i >= 0) {t[cL++] = i; i = p[i];} diff --git a/anchor.cpp b/anchor.cpp index 0cc0e28..450ebcf 100644 --- a/anchor.cpp +++ b/anchor.cpp @@ -1920,7 +1920,7 @@ void lchain_qgen_mcopy(Candidates_list* cl, overlap_region_alloc* ol, uint32_t r void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint32_t rid, uint64_t rl, All_reads* rdb, uint32_t apend_be, uint64_t max_n_chain, int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, int64_t quick_check, - uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, st_mt_t *sp) + uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut, st_mt_t *sp) { // fprintf(stderr, "+[M::%s] chain_cutoff::%u\n", __func__, chain_cutoff); uint64_t i, k, l, m, cn = cl->length, yid, ol0, lch; overlap_region *r, t; ///srt = 0 @@ -1931,7 +1931,7 @@ void lchain_qgen_mcopy_fast(Candidates_list* cl, overlap_region_alloc* ol, uint3 if(cl->list[l].readID != rid) { yid = cl->list[l].readID; ol0 = ol->length; m += lchain_qdp_mcopy_fast(cl, l, k-l, m, &(cl->chainDP), ol, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_rate, - rid, rl, Get_READ_LENGTH((*rdb), yid), quick_check, apend_be, gen_off, enable_mcopy, mcopy_rate, mcopy_khit_cut, 1); + rid, rl, Get_READ_LENGTH((*rdb), yid), quick_check, apend_be, gen_off, mcopy_num, mcopy_rate, mcopy_khit_cut, 1); if((chain_cutoff >= 2) && (!lch)) { for (i = ol0; (ilength) && (!lch); i++) { if(ol->list[i].align_length < chain_cutoff) lch = 1; @@ -2224,7 +2224,7 @@ void ul_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t } void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut) + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut) { extern void *ha_flt_tab; extern ha_pt_t *ha_idx; @@ -2235,7 +2235,7 @@ void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz // lchain_gen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); // lchain_qgen(cl, overlap_list, rid, rl, NULL, uref, apend_be, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off); ///no need to sort here, overlap_list has been sorted at lchain_gen - lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, enable_mcopy, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); + lchain_qgen_mcopy_fast(cl, overlap_list, rid, rl, rref, apend_be, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check, gen_off, mcopy_num, mcopy_rate, chain_cutoff, mcopy_khit_cut, sp); } void h_ec_lchain_amz(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, diff --git a/ecovlp.cpp b/ecovlp.cpp index d9de983..e20a91e 100644 --- a/ecovlp.cpp +++ b/ecovlp.cpp @@ -31,7 +31,7 @@ typedef struct {size_t n, m; char *a; UC_Read z;} sl_v; void h_ec_lchain(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, - int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); + int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t mcopy_num, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); void h_ec_lchain_amz(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, int max_n_chain, int apend_be, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t *high_occ, uint32_t *low_occ, uint32_t is_accurate, uint32_t gen_off, int64_t enable_mcopy, double mcopy_rate, uint32_t chain_cutoff, uint32_t mcopy_khit_cut); void h_ec_lchain_re_gen(ha_abuf_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, ha_pt_t *ha_idx, All_reads *rref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, @@ -173,7 +173,7 @@ void prt_chain(overlap_region_alloc *o) overlap_region *fetch_aux_ovlp(overlap_region_alloc* ol) /// exactly same to gen_aux_ovlp { - if (ol->length + 1 > ol->size) { + if (ol->length + 1 >= ol->size) { uint64_t sl = ol->size; ol->size = ol->length + 1; kroundup64(ol->size); @@ -181,6 +181,10 @@ overlap_region *fetch_aux_ovlp(overlap_region_alloc* ol) /// exactly same to gen /// need to set new space to be 0 memset(ol->list + sl, 0, sizeof(overlap_region)*(ol->size - sl)); } + ///debug for memory + // if(ol->length + 1 >= ol->size) { + // fprintf(stderr, "[M::%s] length::%lu, size::%lu\n", __func__, ol->length, ol->size); + // } return &(ol->list[ol->length+1]); } @@ -2723,6 +2727,7 @@ void gen_hc_r_alin_ea(overlap_region_alloc* ol, Candidates_list *cl, All_reads * { if(ol->length <= 0) return; + // uint64_t k, l, i, s, m, mm_k, *ei, en, *oi, on, tid, trev, nec; int64_t sc, mm_sc, plus, minus; overlap_region *z, t; ma_hit_t *p; uint64_t k, i, m, *ei, en, *oi, on, tid, trev, nec; overlap_region *z; ma_hit_t *p; srt->n = 0; for (k = 0; k < in->length; k++) { @@ -2735,6 +2740,9 @@ void gen_hc_r_alin_ea(overlap_region_alloc* ol, Candidates_list *cl, All_reads * if(!(srt->n)) { gen_hc_r_alin(ol, cl, rref, qu, tu, exz, aux_o, e_rate, wl, rid, khit, move_gap, buf); } else { + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); + kv_resize(uint64_t, *srt, (srt->n + ol->length)); ei = srt->a; en = srt->n; oi = srt->a + srt->n; on = ol->length; for (k = 0; k < on; k++) { @@ -2761,9 +2769,48 @@ void gen_hc_r_alin_ea(overlap_region_alloc* ol, Candidates_list *cl, All_reads * } } } + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); if(on > nec) gen_hc_r_alin_nec(ol, cl, rref, qu, tu, exz, aux_o, e_rate, wl, rid, khit, move_gap, buf); + + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); } + + /** + if(ol->length > 1) {///for duplicated chains + overlap_region_sort_y_id(ol->list, ol->length); + for (k = 1, l = m = 0; k <= ol->length; k++) { + if(k == ol->length || ol->list[k].y_id != ol->list[l].y_id) { + // fprintf(stderr, "\n[M::%s::tid->%u] n->%lu\n", __func__, ol->list[l].y_id, k - l); + mm_k = l; + if(k - l > 1) { + for (s = l, mm_sc = INT32_MIN, mm_k = ((uint64_t)-1); s < k; s++) { + z = &(ol->list[s]); + plus = z->x_pos_e + 1 - z->x_pos_s; minus = (z->non_homopolymer_errors) * 12; + sc = plus - minus; + if((sc > mm_sc) || ((sc == mm_sc) && ((ol->list[mm_k].x_pos_e+1-ol->list[mm_k].x_pos_s) < (z->x_pos_e+1-z->x_pos_s)))) { + mm_sc = sc; mm_k = s; + } + // fprintf(stderr, "[M::%s::%c] q::[%u, %u), t::[%u, %u), sc::%ld, err::%u, s::%lu\n", __func__, "+-"[z->y_pos_strand], z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1, sc, z->non_homopolymer_errors, s); + } + } + // fprintf(stderr, "[M::%s::tid->%u] mm_k::%lu\n", __func__, ol->list[l].y_id, mm_k); + if(mm_k != ((uint64_t)-1)) { + if(mm_k != m) { + t = ol->list[mm_k]; + ol->list[mm_k] = ol->list[m]; + ol->list[m] = t; + } + m++; + } + l = k; + } + } + ol->length = m; + } + **/ } void prt_ovlp_sam_0(char *cm, FILE *fp, char *ref_id, int32_t ref_id_n, char *qry_id, int32_t qry_id_n, char *qry_seq, uint64_t qry_seq_n, uint64_t rs, uint64_t re, uint64_t qs, uint64_t qe, uint64_t flag, uint64_t err, bit_extz_t *ez) @@ -2818,6 +2865,54 @@ void prt_ovlp_sam(overlap_region_alloc* ol, UC_Read* tu, char *ref_seq, int32_t fclose(fp); } +void dedup_chains(overlap_region_alloc* ol) +{ + uint64_t k, l, s, m, mm_k, mm_m, sf; int64_t sc, mm_sc, plus, minus; overlap_region *z, t; + if(ol->length > 1) {///for duplicated chains + overlap_region_sort_y_id(ol->list, ol->length); + for (k = 1, l = m = 0; k <= ol->length; k++) { + if(k == ol->length || ol->list[k].y_id != ol->list[l].y_id) { + // fprintf(stderr, "\n[M::%s::tid->%u] n->%lu\n", __func__, ol->list[l].y_id, k - l); + mm_k = l; + if(k - l > 1) { + for (s = l, mm_sc = INT32_MIN, mm_k = ((uint64_t)-1), mm_m = 3; s < k; s++) { + z = &(ol->list[s]); + plus = z->x_pos_e + 1 - z->x_pos_s; minus = (z->non_homopolymer_errors) * 12; + sc = plus - minus; + + sf = 0; + if(z->is_match < mm_m) { + sf = 1; + } else if(z->is_match == mm_m) { + if(sc > mm_sc) { + sf = 1; + } else if((sc == mm_sc) && ((z->x_pos_e+1-z->x_pos_s) > (ol->list[mm_k].x_pos_e+1-ol->list[mm_k].x_pos_s))) { + sf = 1; + } + } + if(sf) { + mm_sc = sc; mm_k = s; mm_m = z->is_match; + } + + // fprintf(stderr, "[M::%s::%c] q::[%u, %u), t::[%u, %u), sc::%ld, err::%u, mm::%u, s::%lu\n", __func__, "+-"[z->y_pos_strand], z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1, sc, z->non_homopolymer_errors, z->is_match, s); + } + } + // fprintf(stderr, "[M::%s::tid->%u] mm_k::%lu\n", __func__, ol->list[l].y_id, mm_k); + if(mm_k != ((uint64_t)-1)) { + if(mm_k != m) { + t = ol->list[mm_k]; + ol->list[mm_k] = ol->list[m]; + ol->list[m] = t; + } + m++; + } + l = k; + } + } + ol->length = m; + } +} + static void worker_hap_ec(void *data, long i, int tid) { ec_ovec_buf_t0 *b = &(((ec_ovec_buf_t*)data)->a[tid]); @@ -2825,7 +2920,18 @@ static void worker_hap_ec(void *data, long i, int tid) uint32_t low_occ = asm_opt.hom_cov * HA_KMER_GOOD_RATIO; overlap_region *aux_o = NULL; asg64_v buf0; uint32_t qlen = 0; - // if (memcmp(/**"m64062_190807_194840/180552420/ccs"**/"m64062_190803_042216/161743554/ccs", Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { + /** + if((i != 1129685) && (i != 1137865) && (i != 1137917) && (i != 1140647) && (i != 1144740) && (i != 1148936) && (i != 1149134) && (i != 1151224) && (i != 1151386) && (i != 1152960) && (i != 1154846) && (i != 1154881) && (i != 1155112) && + (i != 1156823) && (i != 1157099) && (i != 1157393) && (i != 1158300) && (i != 1158368) && (i != 1160411) && (i != 1160659) && (i != 1161458) && (i != 1163595) && (i != 1164084) && (i != 1164230) && (i != 1165050) && (i != 1168249) && + (i != 1168304) && (i != 1168514) && (i != 1170447) && (i != 1171377) && (i != 1171387) && (i != 1172376) && (i != 1173566) && (i != 1174275) && (i != 1174434) && (i != 1174511) && (i != 1174860) && (i != 1175306) && (i != 1175314) && + (i != 1177101) && (i != 1178196) && (i != 1178470) && (i != 1179327) && (i != 1179626) && (i != 1180357) && (i != 1181347) && (i != 1181422) && (i != 1181725) && (i != 1183135) && (i != 1183734) && (i != 1185569) && (i != 1185604) && + (i != 1186192) && (i != 1188441) && (i != 1188487) && (i != 1189865) && (i != 1189943) && (i != 1192819) && (i != 1193133) && (i != 1196908) && (i != 1197590) && (i != 1200549) && (i != 1200757) && (i != 1205500)) return; + + fprintf(stderr, "%ld\t+++\n", i); + **/ + // if(i < 1100000 || i > 1400000) return; + // if(i % 100000 == 0) fprintf(stderr, "-a-[M::%s-beg] rid->%ld\n", __func__, i); + // if (memcmp("m64062_190807_194840/180552420/ccs"/**"m64062_190803_042216/161743554/ccs"**//**"m64062_190806_063919/15403289/ccs"**/, Get_NAME((R_INF), i), Get_NAME_LENGTH((R_INF),i)) == 0) { // fprintf(stderr, "-a-[M::%s-beg] rid->%ld\n", __func__, i); // } else { // return; @@ -2840,13 +2946,16 @@ static void worker_hap_ec(void *data, long i, int tid) recover_UC_Read(&b->self_read, &R_INF, i); qlen = b->self_read.length; - h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, 0.02, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, /**0**/2, UINT32_MAX); + h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, 0.02, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 3, 0.7, 2, 32); // b->num_read_base += b->olist.length; b->cnt[0] += b->self_read.length; aux_o = fetch_aux_ovlp(&b->olist);///must be here + ///debug for memory + // snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n); + // gen_hc_r_alin(&b->olist, &b->clist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, aux_o, asm_opt.max_ov_diff_ec, WINDOW_HC, i, E_KHIT/**asm_opt.k_mer_length**/, 1, &b->v16); gen_hc_r_alin_ea(&b->olist, &b->clist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, aux_o, asm_opt.max_ov_diff_ec, WINDOW_HC, i, E_KHIT/**asm_opt.k_mer_length**/, 1, &b->v16, &b->v64, &(R_INF.paf[i])); @@ -2865,6 +2974,8 @@ static void worker_hap_ec(void *data, long i, int tid) rphase_hc(&b->olist, &R_INF, &b->hap, &b->self_read, &b->ovlp_read, &b->pidx, &b->v64, &buf0, 0, WINDOW_MAX_SIZE, b->self_read.length, 1/**, 0**/, i); copy_asg_arr(b->sp, buf0); + dedup_chains(&b->olist); + copy_asg_arr(buf0, b->sp); b->cnt[1] += wcns_gen(&b->olist, &R_INF, &b->self_read, &b->ovlp_read, &b->exz, &b->pidx, &b->v64, &buf0, 0, 512, b->self_read.length, 3, 0.500001, aux_o, &b->v32, &b->cns, 256, i); copy_asg_arr(b->sp, buf0); @@ -2947,6 +3058,10 @@ static void worker_hap_ec(void *data, long i, int tid) **/ // exit(1); refresh_ec_ovec_buf_t0(b, REFRESH_N); + + /** + fprintf(stderr, "%ld\t---\n", i); + **/ } uint32_t adjust_exact_match(asg16_v *in, int64_t xs0, int64_t xe0, int64_t ys0, int64_t ye0, uint64_t *rxs, uint64_t *rxe, uint64_t *rys, uint64_t *rye, uint32_t rev) @@ -3410,7 +3525,7 @@ static void worker_hap_dc_ec_gen_new_idx(void *data, long i, int tid) recover_UC_Read(&b->self_read, &R_INF, i); - h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, /**0.02**/0.001, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 0, 2, /**0**/2, UINT32_MAX); + h_ec_lchain(b->ab, i, b->self_read.seq, b->self_read.length, asm_opt.mz_win, asm_opt.k_mer_length, &R_INF, &b->olist, &b->clist, /**0.02**/0.001, asm_opt.max_n_chain, 1, NULL, NULL, &(b->sp), &high_occ, &low_occ, 1, 1, 3, 0.7, 2, 32); overlap_region_sort_y_id(b->olist.list, b->olist.length); @@ -4119,7 +4234,7 @@ void h_ec_lchain_fast_new(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, { // fprintf(stderr, "-0-[M::%s]\tnew_n::%lu\told_n::%u\n", __func__, ol->length, in0->length + in1->length); // fprintf(stderr, "-mm-[M::%s]\tchain_cutoff::%u\n", __func__, chain_cutoff); - uint64_t on = 0, k, i, m, m0, tid, trev, is_match; ma_hit_t *oa = NULL, *p = NULL; overlap_region *z = NULL, t; + uint64_t on = 0, k, i, l, m, m0, tid, trev, is_match, is_usrt = 0; ma_hit_t *oa = NULL, *p = NULL; overlap_region *z = NULL, t; char* rs = qu->seq; uint64_t rl = qu->length; int64_t om; uint64_t aq[2], at[2], bq[2], bt[2], ovlp, os, oe; @@ -4143,6 +4258,7 @@ void h_ec_lchain_fast_new(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, k = 0; i = 0; for (k = m = 0; k < ol->length; k++) { z = &(ol->list[k]); tid = z->y_id; trev = z->y_pos_strand; + z->non_homopolymer_errors = 0; for (; (i < srt_i->n) && ((srt_i->a[i]>>32) < ((tid<<1)|trev)); i++); if((i < srt_i->n) && ((srt_i->a[i]>>32) == ((tid<<1)|trev))) { om = 1; z->shared_seed = 0; z->is_match = 0; @@ -4152,7 +4268,7 @@ void h_ec_lchain_fast_new(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, p = &(in0->buffer[((uint32_t)srt_i->a[i])>>1]); is_match = 1; } z->strong = p->ml; z->without_large_indel = p->no_l_indel; - + aq[0] = (uint32_t)p->qns; aq[1] = p->qe; at[0] = p->ts; at[1] = p->te; @@ -4162,10 +4278,12 @@ void h_ec_lchain_fast_new(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, os = MAX(aq[0], bq[0]); oe = MIN(aq[1], bq[1]); ovlp = ((oe>os)? (oe-os):0); if(!((ovlp) && (ovlp >= ((aq[1] - aq[0])*sh)) && ((ovlp >= ((bq[1] - bq[0])*sh))))) om = 0; + z->non_homopolymer_errors += ((aq[1] - aq[0]) - ovlp); os = MAX(at[0], bt[0]); oe = MIN(at[1], bt[1]); ovlp = ((oe>os)? (oe-os):0); if(!((ovlp) && (ovlp >= ((at[1] - at[0])*sh)) && ((ovlp >= ((bt[1] - bt[0])*sh))))) om = 0; + z->non_homopolymer_errors += ((at[1] - at[0]) - ovlp); if(om) { if(is_match == 1 && p->el == 1) p->el = 0; @@ -4222,9 +4340,40 @@ void h_ec_lchain_fast_new(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, set_exact_exz(exz, z->x_pos_s, z->x_pos_e + 1, z->y_pos_s, z->y_pos_e + 1); push_alnw(z, exz); + is_usrt = 1; // if(oa[k].tn == 1945) fprintf(stderr, "-em-[M::%s]\tqn::%u\ttn::%u\terr::%u\n", __func__, rid, oa[k].tn, ol->list[ol->length-1].non_homopolymer_errors); } } + + if(is_usrt) overlap_region_sort_y_id(ol->list, ol->length); + + if(ol->length > 1) {///for duplicated chains + uint64_t mm_k, s; int64_t mm_sc, sc; + for (k = 1, l = m = 0; k <= ol->length; k++) { + if(k == ol->length || ol->list[k].y_id != ol->list[l].y_id) { + mm_k = l; + if(k - l > 1) { + for (s = l, mm_sc = INT32_MIN, mm_k = ((uint64_t)-1); s < k; s++) { + z = &(ol->list[s]); + sc = z->non_homopolymer_errors; sc = - sc; + if((sc > mm_sc) || ((sc == mm_sc) && ((ol->list[mm_k].x_pos_e+1-ol->list[mm_k].x_pos_s) < (z->x_pos_e+1-z->x_pos_s)))) { + mm_sc = sc; mm_k = s; + } + } + } + if(mm_k != ((uint64_t)-1)) { + if(mm_k != m) { + t = ol->list[mm_k]; + ol->list[mm_k] = ol->list[m]; + ol->list[m] = t; + } + m++; + } + l = k; + } + } + ol->length = m; + } } overlap_region* h_ec_lchain_re3(ha_abuf_t *ab, uint32_t rid, UC_Read *qu, UC_Read *tu, uint64_t mz_w, uint64_t mz_k, All_reads *rref, overlap_region_alloc *ol, Candidates_list *cl, bit_extz_t *exz, asg16_v *buf, asg64_v *srt_i, double bw_thres, @@ -5270,7 +5419,7 @@ void cal_ec_r(uint64_t n_thre, uint64_t round, uint64_t n_round, uint64_t n_a, u b = gen_ec_ovec_buf_t(n_thre); - (*tot_e) += cal_ec_multiple(b, n_thre, n_a, tot_b); ////exit(1); + (*tot_e) += cal_ec_multiple(b, n_thre, n_a, tot_b); ///exit(1); sl_ec_r(n_thre, n_a); for (k = 0; k < n_round; k++) { diff --git a/htab.cpp b/htab.cpp index 306619a..c061bf0 100644 --- a/htab.cpp +++ b/htab.cpp @@ -1329,7 +1329,7 @@ int load_ct_index(void **i_ct_idx, char* file_name) int write_pt_index(void *flt_tab, ha_pt_t *ha_idx, All_reads* r, hifiasm_opt_t* opt, char* file_name) { - char* gfa_name = (char*)malloc(strlen(file_name)+25); + char* gfa_name = (char*)malloc(strlen(file_name)+64); sprintf(gfa_name, "%s.pt_flt", file_name); FILE* fp = fopen(gfa_name, "w"); if (!fp) { @@ -1372,15 +1372,29 @@ int write_pt_index(void *flt_tab, ha_pt_t *ha_idx, All_reads* r, hifiasm_opt_t* write_All_reads(r, gfa_name); + sprintf(gfa_name, "%s.pt_flt.paf.bin", file_name); + fclose(fp); fp = fopen(gfa_name, "w"); uint64_t k; + if (!fp) { + free(gfa_name); + return 0; + } + fwrite(&(r->total_reads), sizeof(r->total_reads), 1, fp); + for (k = 0; k < r->total_reads; k++) { + fwrite(&(r->paf[k].is_fully_corrected), sizeof(r->paf[k].is_fully_corrected), 1, fp); + fwrite(&(r->paf[k].is_abnormal), sizeof(r->paf[k].is_abnormal), 1, fp); + fwrite(&(r->paf[k].length), sizeof(r->paf[k].length), 1, fp); + fwrite(r->paf[k].buffer, sizeof((*(r->paf[k].buffer))), r->paf[k].length, fp); + } + fprintf(stderr, "[M::%s] Index has been written.\n", __func__); free(gfa_name); fclose(fp); return 1; } -int load_pt_index(void **r_flt_tab, ha_pt_t **r_ha_idx, All_reads* r, hifiasm_opt_t* opt, char* file_name) +int load_pt_index(void **r_flt_tab, ha_pt_t **r_ha_idx, All_reads *r, hifiasm_opt_t* opt, char* file_name) { - char* gfa_name = (char*)malloc(strlen(file_name)+25); + char* gfa_name = (char*)malloc(strlen(file_name)+64); sprintf(gfa_name, "%s.pt_flt", file_name); FILE* fp = fopen(gfa_name, "r"); if (!fp) { @@ -1460,7 +1474,7 @@ int load_pt_index(void **r_flt_tab, ha_pt_t **r_ha_idx, All_reads* r, hifiasm_op f_flag += fread(&opt->max_n_chain, sizeof(opt->max_n_chain), 1, fp); - fclose(fp); + // fclose(fp); if(!load_All_reads(r, gfa_name)) { @@ -1468,15 +1482,33 @@ int load_pt_index(void **r_flt_tab, ha_pt_t **r_ha_idx, All_reads* r, hifiasm_op return 0; } - memset(r->trio_flag, AMBIGU, r->total_reads*sizeof(uint8_t)); + + sprintf(gfa_name, "%s.pt_flt.paf.bin", file_name); + fclose(fp); fp = fopen(gfa_name, "r"); uint64_t k; + if (!fp) { + free(gfa_name); + return 0; + } + f_flag += fread(&(r->total_reads), sizeof(r->total_reads), 1, fp); r->paf = (ma_hit_t_alloc*)malloc(sizeof(ma_hit_t_alloc)*r->total_reads); r->reverse_paf = (ma_hit_t_alloc*)malloc(sizeof(ma_hit_t_alloc)*r->total_reads); - for (i = 0; i < (long long)r->total_reads; i++) - { - init_ma_hit_t_alloc(&(r->paf[i])); - init_ma_hit_t_alloc(&(r->reverse_paf[i])); + for (k = 0; k < r->total_reads; k++) { + // init_ma_hit_t_alloc(&(r->paf[k])); + init_ma_hit_t_alloc(&(r->reverse_paf[k])); + + f_flag += fread(&(r->paf[k].is_fully_corrected), sizeof(r->paf[k].is_fully_corrected), 1, fp); + f_flag += fread(&(r->paf[k].is_abnormal), sizeof(r->paf[k].is_abnormal), 1, fp); + f_flag += fread(&(r->paf[k].length), sizeof(r->paf[k].length), 1, fp); + r->paf[k].size = r->paf[k].length; + + r->paf[k].buffer = NULL; + if(r->paf[k].length == 0) continue; + + r->paf[k].buffer = (ma_hit_t*)malloc(sizeof(ma_hit_t)*r->paf[k].length); + fread(r->paf[k].buffer, sizeof((*(r->paf[k].buffer))), r->paf[k].length, fp); } + fclose(fp); fprintf(stderr, "[M::%s] Index has been loaded.\n", __func__);