Skip to content

Commit

Permalink
Merge pull request #1 from mskcc/feature/metafusion_parser
Browse files Browse the repository at this point in the history
Add CFF parser with tests
  • Loading branch information
anoronh4 authored Oct 1, 2024
2 parents fb68065 + 74b4e82 commit 8aca5da
Show file tree
Hide file tree
Showing 5 changed files with 188 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ You can provide as input output files from fusion-finding algorithms. Currently
* MapSplice (only if --gene-gtf specified)
* [STAR-Fusion](https://github.com/STAR-Fusion/STAR-Fusion)
* TopHat-Fusion
* [Common Fusion Format](https://github.com/ccmbioinfo/MetaFusion/wiki/metafusion-file-formats#cff-fields)



Expand Down
4 changes: 4 additions & 0 deletions agfusion/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,8 @@ def save_tables(self, out_dir="."):
"3'_gene",
"5'_transcript",
"3'_transcript",
"5'_breakpoint",
"3'_breakpoint",
"5'_strand",
"3'_strand",
"5'_transcript_biotype",
Expand Down Expand Up @@ -720,6 +722,8 @@ def save_tables(self, out_dir="."):
transcript.gene3prime.gene.gene_name,
transcript.transcript1.id,
transcript.transcript2.id,
transcript.gene5prime.junction,
transcript.gene3prime.junction,
transcript.transcript1.strand,
transcript.transcript2.strand,
transcript.transcript1.biotype,
Expand Down
91 changes: 91 additions & 0 deletions agfusion/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,94 @@ def __init__(self, infile, logger):

self._check_data()

class _CommonFusionFormatBasic(_Parser):
"""
Parent class of CommonFusionFormat* parsers.
Defined here: https://github.com/ccmbioinfo/MetaFusion/wiki/metafusion-file-formats#cff-fields
"""

def _load_data_indices(self,infile,data_indices):
fin = open(infile, "r")
n = 0
for line in fin.readlines():
n += 1
line = line.strip().split("\t")

try:
self.fusions.append(
{
"gene5prime": line[data_indices["gene5prime"]],
"gene3prime": line[data_indices["gene3prime"]],
"gene5prime_junction": int(line[data_indices["gene5prime_junction"]]),
"gene3prime_junction": int(line[data_indices["gene3prime_junction"]]),
}
)
except ValueError as e:
print(e)
self.logger.warn(
f"Skipping fusion on line {n} because one or more "
+ "of the provided breakpoints is not a valid"
+ " integer value."
)

fin.close()

self._check_data()

class CommonFusionFormat(_CommonFusionFormatBasic):
"""
CommonFusionFormat parser.
Defined here: https://github.com/ccmbioinfo/MetaFusion/wiki/metafusion-file-formats#cff-fields
"""

def __init__(self, infile, logger):
super().__init__(logger)

data_indices = {
"gene5prime": 13,
"gene3prime": 15,
"gene5prime_junction": 1,
"gene3prime_junction": 4,
}

self._load_data_indices(infile, data_indices)

class CommonFusionFormatReann(_CommonFusionFormatBasic):
"""
CommonFusionFormat parser.
Defined here: https://github.com/ccmbioinfo/MetaFusion/wiki/metafusion-file-formats#cff-fields
"""

def __init__(self, infile, logger):
super().__init__(logger)

data_indices = {
"gene5prime": 18,
"gene3prime": 20,
"gene5prime_junction": 1,
"gene3prime_junction": 4,
}

self._load_data_indices(infile, data_indices)

class CommonFusionFormatTranscript(_CommonFusionFormatBasic):
"""
CommonFusionFormatTranscript parser.
Defined here: https://github.com/ccmbioinfo/MetaFusion/wiki/metafusion-file-formats#cff-fields
"""

def __init__(self, infile, logger):
super().__init__(logger)

data_indices = {
"gene5prime": 37,
"gene3prime": 38,
"gene5prime_junction": 1,
"gene3prime_junction": 4,
}

self._load_data_indices(infile,data_indices)


parsers = {
"arriba": Arriba,
Expand All @@ -791,6 +879,9 @@ def __init__(self, infile, logger):
"starfusion": STARFusion,
"tophatfusion": TopHatFusion,
"mapsplice": MapSplice,
"cff": CommonFusionFormat,
"cff_reann": CommonFusionFormatReann,
"cff_transcript": CommonFusionFormatTranscript,
# 'fusionseq':FusionSeq,
# 'prada':Prada,
# 'gfusion':GFusion,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
chr17 35696764 - chr17 37922746 - RNA Sample_A Tumor NA arriba 113 108 ACACA CDS/splice-site IKZF3 CDS/splice-site GeneFusion ACACA utr3 IKZF3 cds True True True True 5.5 24 2154289 -9 F00002817 GTTTGTTTCTTAAAAAAAAAAACCTTAAAATCTTTCTCTCTTTTTCTCTCTAGGTCTTTCTGGAAGTGGATATCTACTCAGACAGTAAGAATTATAAGAG CCTGGAACAAGTGACAGAAAGGGTTACAAAGGGAACACTGCCAAGGCAGAGAGTTTGTAAATATTTTCTAGAGACCTCAAAAAGGGAGGAGAGTTAAGTT False chr17:35696764-35696810 chr17:37922043-37922746 -1 ENST00000353139 ENST00000346872
chr9 129143450 + chr9 114476903 - RNA Sample_A Tumor NA arriba 1 0 MVB12B CDS/splice-site C9orf84 CDS/splice-site GeneFusion MVB12B cds C9orf84 cds True True True True 6 291 14531840 -9 F00002851 GACAGCAGATGGTGTGGATGCTGACCTCTGGAAAGACGGCTTATTTAAATCCAAGGTTACCAGATACCTGTGTTTCACAAGATCATTTTCCAAAGAAAAT CCTATTGAAGCAAAATTAGAAAAAAATTGTCTAAGTATTTGTTTTTTAAGAAACTCCGTGGTAAAAAAGGCTTAAAGAAAATAAGTCAAAGCAGAGACAG False chr9:129143343-129143450 chr9:114476725-114476903 -1 ENST00000361171 ENST00000394777
chr9 114476725 - chr9 129143343 + RNA Sample_A Tumor NA starfusion 85 11 C9orf84 INFRAME MVB12B INFRAME GeneFusion C9orf84 cds MVB12B cds True True True True 6 291 14531840 -9 F00003651 TGTACAGTTTATTAGGGGGAAAAAGCCTGAAACCAACTACAAGATACAAGAATTGCAATGTCAGATACTAAGTTGGATGCAAAGTCAACAGCAAATTAAG CCTACACCGGAGAGACAGAGAAAAGGATTGCAGGTAAGGCAGGAAGGAACTGAAGGTGCGCGTGGGCCACTTCTTCCACGCCTGTTCCAATTAACACCAG False chr9:114476725-114476903 chr9:129143343-129143450 -1 ENST00000374287 ENST00000361171
89 changes: 89 additions & 0 deletions test/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,95 @@ def test_parse_human(self):
)
assert fusion.name in all_fusions, f"{fusion.name} not in list!"

class TestCommonFusionFormat(unittest.TestCase):
"""Test parse Common Fusion Format"""

def test_basic(self):
"""Test basic parsing."""

all_fusions = ["ACACA_IKZF3", "MVB12B_C9orf84", "C9orf84_MVB12B"]
for fusion in parsers.parsers["cff"](
f"{BASEDIR}/CommonFusionFormat/" + "fids_out.tsv",
db_human.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human,
pyensembl_data=data_human,
protein_databases=["pfam"],
noncanonical=False,
)
assert fusion.name in all_fusions, f"{fusion.name} not in list!"

# def test_with_coding_effect(self):
# """Test parse output with coding effect."""

# all_fusions = ["ACACA_IKZF3", "MVB12B_C9orf84", "C9orf84_MVB12B"]
# for fusion in parsers.parsers["cff"](
# f"{BASEDIR}/CommonFusionFormat/" + "fids_out.tsv",
# db_human95.logger,
# ):
# fusion = model.Fusion(
# gene5prime=fusion["gene5prime"],
# gene5primejunction=fusion["gene5prime_junction"],
# gene3prime=fusion["gene3prime"],
# gene3primejunction=fusion["gene3prime_junction"],
# db=db_human95,
# pyensembl_data=data_human95,
# protein_databases=["pfam"],
# noncanonical=False,
# )
# assert fusion.name in all_fusions, f"{fusion.name} not in list!"

class TestCommonFusionFormatReann(unittest.TestCase):
"""Test parse Common Fusion Format"""

def test_basic(self):
"""Test basic parsing."""

all_fusions = ["ACACA_IKZF3", "MVB12B_C9orf84", "C9orf84_MVB12B"]
for fusion in parsers.parsers["cff_reann"](
f"{BASEDIR}/CommonFusionFormat/" + "fids_out.tsv",
db_human.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human,
pyensembl_data=data_human,
protein_databases=["pfam"],
noncanonical=False,
)
assert fusion.name in all_fusions, f"{fusion.name} not in list!"

class TestCommonFusionFormatTranscript(unittest.TestCase):
"""Test parse Common Fusion Format using Transcript ID columns"""

def test_basic(self):
"""Test basic parsing."""

all_fusions = ["ACACA_IKZF3", "MVB12B_C9orf84", "C9orf84_MVB12B"]
for fusion in parsers.parsers["cff_transcript"](
f"{BASEDIR}/CommonFusionFormat/" + "fids_out.tsv",
db_human.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human,
pyensembl_data=data_human,
protein_databases=["pfam"],
noncanonical=False,
)
assert fusion.name in all_fusions, f"{fusion.name} not in list!"


if __name__ == "__main__":
unittest.main()

0 comments on commit 8aca5da

Please sign in to comment.