From 8a6f18e378e9a243fc648aa4a1ea2f6e35dbf5e7 Mon Sep 17 00:00:00 2001 From: Nick Moore Date: Thu, 18 Apr 2024 10:59:51 +1000 Subject: [PATCH] add cs_long flag to mappy.Aligner.map, and a "-C" option to minimap2.py --- python/mappy.pyx | 15 +++++++-------- python/minimap2.py | 7 +++++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/python/mappy.pyx b/python/mappy.pyx index 0a6b93cd..7aa38af1 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -163,7 +163,7 @@ cdef class Aligner: def __bool__(self): return (self._idx != NULL) - def map(self, seq, seq2=None, buf=None, cs=False, MD=False, max_frag_len=None, extra_flags=None): + def map(self, seq, seq2=None, buf=None, cs=False, cs_long=False, MD=False, max_frag_len=None, extra_flags=None): cdef cmappy.mm_reg1_t *regs cdef cmappy.mm_hitpy_t h cdef ThreadBuffer b @@ -199,13 +199,12 @@ cdef class Aligner: for k in range(h.n_cigar32): # convert the 32-bit CIGAR encoding to Python array c = h.cigar32[k] cigar.append([c>>4, c&0xf]) - if cs or MD: # generate the cs and/or the MD tag, if requested - if cs: - l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq, 1) - _cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode() - if MD: - l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq) - _MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode() + if cs or cs_long: + l_cs_str = cmappy.mm_gen_cs(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq, 0 if cs_long else 1) + _cs = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode() + if MD: + l_cs_str = cmappy.mm_gen_MD(km, &cs_str, &m_cs_str, self._idx, ®s[i], _seq) + _MD = cs_str[:l_cs_str] if isinstance(cs_str, str) else cs_str[:l_cs_str].decode() yield Alignment(h.ctg, h.ctg_len, h.ctg_start, h.ctg_end, h.strand, h.qry_start, h.qry_end, h.mapq, cigar, h.is_primary, h.mlen, h.blen, h.NM, h.trans_strand, h.seg_id, _cs, _MD) cmappy.mm_free_reg1(®s[i]) i += 1 diff --git a/python/minimap2.py b/python/minimap2.py index db6125da..ff5fd40e 100755 --- a/python/minimap2.py +++ b/python/minimap2.py @@ -5,7 +5,7 @@ import mappy as mp def main(argv): - opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:cM") + opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:cCM") if len(args) < 2: print("Usage: minimap2.py [options] | ") print("Options:") @@ -16,11 +16,13 @@ def main(argv): print(" -w INT minimizer window length") print(" -r INT band width") print(" -c output the cs tag") + print(" -C output the cs tag (long version)") print(" -M output the MD tag") sys.exit(1) preset = min_cnt = min_sc = k = w = bw = None out_cs = out_MD = False + out_cs_long = False for opt, arg in opts: if opt == '-x': preset = arg elif opt == '-n': min_cnt = int(arg) @@ -29,12 +31,13 @@ def main(argv): elif opt == '-k': k = int(arg) elif opt == '-w': w = int(arg) elif opt == '-c': out_cs = True + elif opt == '-C': out_cs_long = True elif opt == '-M': out_MD = True a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt, min_chain_score=min_sc, k=k, w=w, bw=bw) if not a: raise Exception("ERROR: failed to load/build index file '{}'".format(args[0])) for name, seq, qual in mp.fastx_read(args[1]): # read one sequence - for h in a.map(seq, cs=out_cs, MD=out_MD): # traverse hits + for h in a.map(seq, cs=out_cs, cs_long=out_cs_long, MD=out_MD): # traverse hits print('{}\t{}\t{}'.format(name, len(seq), h)) if __name__ == "__main__":