Skip to content

Commit

Permalink
Merge pull request murphycj#50 from murphycj/bug/fix-fusioninspector-…
Browse files Browse the repository at this point in the history
…parser

Fix FusionInspector parser
  • Loading branch information
murphycj authored Feb 7, 2023
2 parents 749c814 + cbe6215 commit 189f1b7
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
python-version: ["3.7", "3.8", "3.9", "3.10"]
steps:
- name: Checkout source code
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
35 changes: 13 additions & 22 deletions agfusion/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,30 +701,22 @@ class FusionInspector(_Parser):
def __init__(self, infile, logger):
super().__init__(logger)

fin = open(infile, "r")
for line in fin.readlines():
if re.findall(r"^#", line):
line = line.rstrip().split("\t")
if line[0] != "#FusionName" and line[0] != "#fusion_name":
raise AssertionError(
"Unrecognized FusionInspector input for first column"
+ " in header. Should be #FusionName or #fusion_name."
)
data = pd.read_csv(infile, delimiter="\t")
data.columns = [i.replace("#", "") for i in data.columns]

assert line[3] == "LeftGene", "Unrecognized " + "FusionInspector input"
assert line[5] == "LeftBreakpoint", "Unrecognized " + "FusionInspector input"
assert line[6] == "RightGene", "Unrecognized " + "FusionInspector input"
assert line[8] == "RightBreakpoint", "Unrecognized " + "FusionInspector input"
continue
cols = ["LeftGene", "LeftBreakpoint", "RightGene", "RightBreakpoint"]
assert all(
i in data.columns for i in cols
), "Unrecognized FusionInspector input. Could not find all columns: " + ",".join(cols)

line = line.strip().split("\t")
for i in data.index:

gene_5prime = line[3].split("^")[1].split(".")[0]
gene_5prime_name = line[3].split("^")[0]
gene_5prime_junction = int(line[5].split(":")[1])
gene_3prime = line[6].split("^")[1].split(".")[0]
gene_3prime_name = line[6].split("^")[0]
gene_3prime_junction = int(line[8].split(":")[1])
gene_5prime = data.at[i, "LeftGene"].split("^")[1].split(".")[0]
gene_5prime_name = data.at[i, "LeftGene"].split("^")[0]
gene_5prime_junction = int(data.at[i, "LeftBreakpoint"].split(":")[1])
gene_3prime = data.at[i, "RightGene"].split("^")[1].split(".")[0]
gene_3prime_name = data.at[i, "RightGene"].split("^")[0]
gene_3prime_junction = int(data.at[i, "RightBreakpoint"].split(":")[1])
self.fusions.append(
{
"gene5prime": gene_5prime,
Expand All @@ -735,7 +727,6 @@ def __init__(self, infile, logger):
"gene3prime_junction": gene_3prime_junction,
}
)
fin.close()

self._check_data()

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#FusionName JunctionReadCount SpanningFragCount est_J est_S LeftGene LeftLocalBreakpoint LeftBreakpoint RightGene RightLocalBreakpoint RightBreakpoint SpliceType LargeAnchorSupport NumCounterFusionLeft NumCounterFusionRight FAR_left FAR_right LeftBreakDinuc LeftBreakEntropy RightBreakDinuc RightBreakEntropy FFPM microh_brkpt_dist num_microh_near_brkpt
AL627171.2--TPM3 1551 3 1538.54 3.00 AL627171.2^ENSG00000282885.2 2641 chr14:49862686:- TPM3^ENSG00000143549.21 23194 chr1:154166382:- INCL_NON_REF_SPLICE NO 152 89 10.16 17.28 GT 1.5058 AG 1.8892 26.6109 1 30
STAT3--AL627171.2 955 1 934.46 1.00 STAT3^ENSG00000168610.16 21538 chr17:42321234:- AL627171.2^ENSG00000282885.2 32742 chr14:49862799:- INCL_NON_REF_SPLICE YES 0 152 957.00 6.25 GT 1.8295 AG 1.7968 16.1485 1 25

Large diffs are not rendered by default.

58 changes: 50 additions & 8 deletions test/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
db_human95.build = "homo_sapiens_95"


BASEDIR = "./data/FusionsFindingAlgorithms"


class TestFusionCatcher(unittest.TestCase):
"""Test parse FusionCatcher parse."""

Expand All @@ -37,7 +40,7 @@ def test_parse(self):
"C920009B18Rik_H60b",
]
for fusion in parsers.parsers["fusioncatcher"](
"./data/FusionsFindingAlgorithms/FusionCatcher/final-list_candidate-fusion-genes.txt",
f"{BASEDIR}/FusionCatcher/final-list_candidate-fusion-genes.txt",
db_mouse.logger,
):
fusion = model.Fusion(
Expand All @@ -63,7 +66,7 @@ def test_parse(self):
"BCR_ABL1",
]
for fusion in parsers.parsers["arriba"](
"./data/FusionsFindingAlgorithms/Arriba/fusions.tsv",
f"{BASEDIR}/Arriba/fusions.tsv",
db_human.logger,
):
fusion = model.Fusion(
Expand All @@ -87,8 +90,7 @@ def test_basic(self):

all_fusions = ["ACACA_STAC2", "RPS6KB1_SNF8"]
for fusion in parsers.parsers["starfusion"](
"./data/FusionsFindingAlgorithms/STARFusion/"
+ "star-fusion.fusion_candidates.final.abridged",
f"{BASEDIR}/STARFusion/" + "star-fusion.fusion_candidates.final.abridged",
db_human.logger,
):
fusion = model.Fusion(
Expand All @@ -108,8 +110,7 @@ def test_with_coding_effect(self):

all_fusions = ["ARID3B_MYCNUT", "ARID3B_MYCN", "TVP23C_CDRT4"]
for fusion in parsers.parsers["starfusion"](
"./data/FusionsFindingAlgorithms/STARFusion/"
+ "star-fusion.fusion_predictions.abridged.coding_effect.tsv",
f"{BASEDIR}/STARFusion/" + "star-fusion.fusion_predictions.abridged.coding_effect.tsv",
db_human95.logger,
):
fusion = model.Fusion(
Expand All @@ -133,7 +134,7 @@ def test_parse_mouse(self):

all_fusions = ["Mocos_Rprd1a", "Ubc_Ubb", "Ubc_Gm11808", "Gm21887_Gm47283"]
for fusion in parsers.parsers["longgf"](
"./data/FusionsFindingAlgorithms/LongGF/fusions_mouse.log",
f"{BASEDIR}/LongGF/fusions_mouse.log",
db_mouse.logger,
):
fusion = model.Fusion(
Expand All @@ -153,7 +154,48 @@ def test_parse_human(self):

all_fusions = ["BCAS4_BCAS3", "HNRNPC_ACIN1"]
for fusion in parsers.parsers["longgf"](
"./data/FusionsFindingAlgorithms/LongGF/fusions_hg38.log",
f"{BASEDIR}/LongGF/fusions_hg38.log",
db_human95.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human95,
pyensembl_data=data_human95,
protein_databases=["pfam"],
noncanonical=False,
)
assert fusion.name in all_fusions, f"{fusion.name} not in list!"


class TestFusionInspector(unittest.TestCase):
"""Test parse FusionInspector"""

def test_parse_human(self):
"""Test basic parsing."""

all_fusions = ["AL627171.2_TPM3", "STAT3_AL627171.2"]

for fusion in parsers.parsers["fusioninspector"](
f"{BASEDIR}/FusionInspector/test.FusionInspector.fusions.abridged.txt",
db_human95.logger,
):
fusion = model.Fusion(
gene5prime=fusion["gene5prime"],
gene5primejunction=fusion["gene5prime_junction"],
gene3prime=fusion["gene3prime"],
gene3primejunction=fusion["gene3prime_junction"],
db=db_human95,
pyensembl_data=data_human95,
protein_databases=["pfam"],
noncanonical=False,
)
assert fusion.name in all_fusions, f"{fusion.name} not in list!"

for fusion in parsers.parsers["fusioninspector"](
f"{BASEDIR}/FusionInspector/test.FusionInspector.fusions.txt",
db_human95.logger,
):
fusion = model.Fusion(
Expand Down

0 comments on commit 189f1b7

Please sign in to comment.