Skip to content

Commit

Permalink
Merge pull request #548 from julie-sullivan/release-4.9.x
Browse files Browse the repository at this point in the history
HGVSp: update to skip first codon
  • Loading branch information
julie-sullivan authored Mar 23, 2021
2 parents 593edca + 013d207 commit 2ef9d3b
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 67 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -803,9 +803,9 @@ private SequenceLocation parseSequenceLocation(String[] parts) {
if (emptySequence(reference) && !emptySequence(alternate) && end == (start + 1)) {
// NOTE! swapped start and end
return new SequenceLocation(chromosome, end, start, reference, alternate);
// } else if (alternate.length() == 1 && reference.length() > 1 && reference.startsWith(alternate)) {
// // variant summary file has the wrong location for deletions. Adjust!
// return new SequenceLocation(chromosome, start - 1, end, reference, alternate);
} else if (alternate.length() == 1 && reference.length() > 1 && reference.startsWith(alternate)) {
// variant summary file has the wrong location for deletions. Adjust!
return new SequenceLocation(chromosome, start - 1, end, reference, alternate);
} else {
return new SequenceLocation(chromosome, start, end, reference, alternate);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public void testReallyLongVariant() throws Exception {
assertEquals(1, variantList.size());
Variant variant = variantList.get(0);
assertEquals("10", variant.getChromosome());
assertEquals(Integer.valueOf(102617429), variant.getStart());
assertEquals(Integer.valueOf(102617428), variant.getStart());
assertEquals("AGTGAGAAGGCCCTTTTTCTTCTCCCTCCTTCCTTTCATAGACTTCCTTGCCCACCCCTCCTCTTCTCCCTTGGCAGCTCTTGATGGCACCCCTTCCTGGGGGGCTGGTCATGAATGCCTCATGGATTCAGGGCCTGGGGCCTGTGTGTAGGTATGGAGTGTGGATGCTGCTACCCACTCCAGCAGCTTAGGAGCACTTCCTGACCTTCTCCCCCTGTCACCTGAGACACAAGTGTTAACTCTCCAGGCCCTGGCTCTTGGTAATTCTGGTTCCCCGTGGAAATCCAGGTTGGAGGGATATAAGACTTTCTGCACCTTGGGTAAACCAAGGTACAAGAACTCAAGGATGAAGCAAGATGGGAGGATGTGTGGAGGCCACTCTCCAATGGCTACATGGAAATCCCACCAGAATTCAGACAGTGGCATGTGTGCCTGGACCAGGGCTGGGCAGGCCTC", variant.getReference());
assertEquals("", variant.getAlternate());

Expand Down Expand Up @@ -132,15 +132,15 @@ public void testDecompose() throws Exception {
assertEquals(1, variantList.size());
variant = variantList.get(0);
assertEquals("11", variant.getChromosome());
assertEquals(Integer.valueOf(5225488), variant.getStart());
assertEquals(Integer.valueOf(5225487), variant.getStart());
assertEquals("TA", variant.getReference());
assertEquals("", variant.getAlternate());

variantList = getVariantByAccession(parsedVariantList, "RCV000003943");
assertEquals(1, variantList.size());
variant = variantList.get(0);
assertEquals("19", variant.getChromosome());
assertEquals(Integer.valueOf(11089412), variant.getStart());
assertEquals(Integer.valueOf(11089411), variant.getStart());
assertEquals("T", variant.getReference());
assertEquals("", variant.getAlternate());

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.opencb.cellbase.core.variant.annotation.hgvs;

import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.core.Exon;
import org.opencb.biodata.models.core.Transcript;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.cellbase.core.variant.annotation.VariantAnnotationUtils;
Expand Down Expand Up @@ -262,6 +261,7 @@ private HgvsProtein calculateInsertionHgvs() {

// Get the reference codon and the new sequence inserted
String refCodon = transcriptUtils.getCodon(codonPosition);
String afterRefCodon = transcriptUtils.getCodon(codonPosition);
String refAa = VariantAnnotationUtils.getAminoacid(MT.equals(variant.getChromosome()), refCodon);

// Build the new inserted sequence = split codon + alternate allele
Expand All @@ -270,6 +270,12 @@ private HgvsProtein calculateInsertionHgvs() {
}
String insSequence = refCodon.substring(0, positionAtCodon - 1) + alternate + refCodon.substring(positionAtCodon - 1);

// need inserted sequence to be divisible by 3 to predict AAs later
if (insSequence.length() % 3 > 0) {
// append NTs until sequence has enough codons
insSequence = insSequence + afterRefCodon.substring(0, (3 - insSequence.length() % 3));
}

// Insertion or Duplication need the reference codon to be the same.
// Check if the reference codon is at any end of the new inserted sequence.
String insertLeftAa = VariantAnnotationUtils.getAminoacid(MT.equals(variant.getChromosome()), insSequence.substring(0, 3));
Expand Down Expand Up @@ -311,7 +317,7 @@ private HgvsProtein calculateInsertionHgvs() {
} else {
// HGVS DELETION-INSERTION
// The insertion happens between codonPosition - 1 and codonPosition
return this.hgvsDeletionInsertionFormatter(codonPosition - 1, codonPosition, alternate);
return this.hgvsDeletionInsertionFormatter(codonPosition, codonPosition, insSequence);
}
}
} else {
Expand Down Expand Up @@ -863,28 +869,47 @@ private HgvsProtein calculateFrameshiftHgvs() {
int phaseOffset = 0;
int currentAaIndex = 0;
StringBuilder alternateProteinSeq = new StringBuilder();
String alternateCdnaSeq = transcriptUtils.getAlternateCdnaSequence(variant);

if (alternateCdnaSeq == null) {
return null;
}
int codonIndex = transcript.getCdnaCodingStart() - 1;
if (transcriptUtils.hasUnconfirmedStart()) {
phaseOffset = transcriptUtils.getFirstCodonPhase();

codonIndex += phaseOffset;
// if reference protein sequence start with X, prepend X to our new alternate sequence also
if (transcript.getProteinSequence().startsWith(VariantAnnotationUtils.UNKNOWN_AMINOACID)) {
alternateProteinSeq.append("X");
currentAaIndex++;
}
} else if (transcript.getProteinSequence().startsWith("M") && !"ATG".equals(alternateCdnaSeq.substring(0, 3))) {
/*
First codon is NOT ATG but protein sequence starts with M. This is due to Ensembl curation. From Ensembl:
"We have some information about non-ATG start codons in our blog post from release 102:
https://www.ensembl.info/2020/11/30/ensembl-102-has-been-released/
Quite simply, there is not a rule. This is a situation of exceptional biology which we are only able to annotate correctly
because of our expert manual gene annotators analysing the data in detail."
Only relevant for frameshifts and where transcripts with confirmed starts. Should not be needed if Ensembl ever makes the
non-ATG start tag available in the GTF file.
*/
// fast forward past first codon
alternateProteinSeq.append("M");
currentAaIndex++;
codonIndex += 3;
}

int codonIndex = transcript.getCdnaCodingStart() + phaseOffset - 1;

int firstDiffIndex = -1;
String firstReferencedAa = "";
String firstAlternateAa = "";
int stopIndex = -1;
String stopAlternateAa = "";
int originalStopIndex = -1;
String alternateCdnaSeq = transcriptUtils.getAlternateCdnaSequence(variant);

if (alternateCdnaSeq == null) {
return null;
}

// We ned to include the STOP codon in the loop to check if there is a variant braking the STOP codon
while (codonIndex + 3 <= alternateCdnaSeq.length()) {
Expand All @@ -897,7 +922,8 @@ private HgvsProtein calculateFrameshiftHgvs() {
// Alternate protein can miss a STOP codon and be longer than the reference protein
if (currentAaIndex < transcript.getProteinSequence().length()) {
String referenceCodedAa = String.valueOf(transcript.getProteinSequence().charAt(currentAaIndex));
// System.out.println((currentAaIndex + 1) + ": " + referenceCodedAa + " - " + alternateCodedAa + " - " + alternateCodon);
//System.out.println((currentAaIndex + 1) + ": " + referenceCodedAa + " - " + alternateCodedAa + " - " + alternateCodon +
// "(" + codonIndex + ")");

// STOP codons cannot exist inside the protein, and if a new STOP codon is generated the while loop os broken below.
// The only explanation for this STOP codpn is to be the SEC amino acid.
Expand All @@ -920,7 +946,6 @@ private HgvsProtein calculateFrameshiftHgvs() {
firstReferencedAa = StringUtils.capitalize(longAA.toLowerCase());
firstAlternateAa = StringUtils.capitalize(alternateAa.toLowerCase());
firstDiffIndex = currentAaIndex;

}

if (alternateAa.equalsIgnoreCase(STOP_STRING)) {
Expand All @@ -931,7 +956,7 @@ private HgvsProtein calculateFrameshiftHgvs() {
} else {
// We have passed the protein sequence, the first time we get here is the STOP codon.
// Incomplete 3' proteins do not reach this point, while finish before because last codon is not complete.'
// System.out.println((currentAaIndex + 1) + ": null - " + alternateCodedAa + " - " + alternateCodon);
//System.out.println((currentAaIndex + 1) + ": null - " + alternateCodedAa + " - " + alternateCodon);
if (currentAaIndex == transcript.getProteinSequence().length()) {
if (alternateAa.equalsIgnoreCase(STOP_STRING)) {
// STOP codon remains the same
Expand Down Expand Up @@ -1010,41 +1035,6 @@ private HgvsProtein calculateFrameshiftHgvs() {
return new HgvsProtein(getProteinIds(), hgvsString, alternateProteinSeq.toString());
}

protected char aaAt(int position) {
return alternateProteinSequence.subSequence(position, position).charAt(0);
}

/**
*
* @return the AA sequence for the alternate
*/
protected String getSequence() {
return alternateProteinSequence.toString();
}

protected String getSequence(int start, int end) {
return alternateProteinSequence.subSequence(start, end).toString();
}

// FIXME need to do this check early
protected boolean onlySpansCodingSequence(Variant variant, Transcript transcript) {
if (buildingComponents.getCdnaStart().getOffset() == 0 // Start falls within coding exon
&& buildingComponents.getCdnaEnd().getOffset() == 0) { // End falls within coding exon

List<Exon> exonList = transcript.getExons();
// Get the closest exon to the variant start, measured as the exon that presents the closest start OR end
// coordinate to the position
Exon nearestExon = exonList.stream().min(Comparator.comparing(exon ->
Math.min(Math.abs(variant.getStart() - exon.getStart()),
Math.abs(variant.getStart() - exon.getEnd())))).get();

// Check if the same exon contains the variant end
return variant.getEnd() >= nearestExon.getStart() && variant.getEnd() <= nearestExon.getEnd();

}
return false;
}

private List<String> getProteinIds() {
List<String> proteinIds = new ArrayList<>();
proteinIds.add(transcript.getProteinID());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ public void testCellBaseEmpty() throws Exception {
}

// investigating why these are failing
//@Test
@Test
public void testMoreMismatches() throws Exception {
// from Antonio
Gene gene = null;
Expand All @@ -148,11 +148,19 @@ public void testMoreMismatches() throws Exception {
HgvsProteinCalculator predictor = null;
HgvsProtein hgvsProtein = null;

// 19:48955714:-:G ENSP00000375744 p.Glu24GlyfsTer33 p.Asn21LysfsTer36
// gene = getGene("ENSG00000087088");
// transcript = getTranscript(gene, "ENST00000391871");
// variant = new Variant("19",
// 48955714,
// "-",
// "G");
//
// predictor = new HgvsProteinCalculator(variant, transcript);
// hgvsProtein = predictor.calculate();
// Assert.assertEquals("p.Glu24GlyfsTer33", hgvsProtein.getHgvs());


// var_id entity_id vep_hgvs cb_hgvs mismatch_category
// MT:8247:-:CCC ENSP00000354876 p.Met221delinsThrLeu p.Glu220_Met221delinsPro other

// MT:8247:-:CCC ENSP00000354876 p.Met221delinsThrLeu p.Glu220_Met221delinsPro
gene = getGene("ENSG00000198712");
transcript = getTranscript(gene, "ENST00000361739");
variant = new Variant("MT",
Expand All @@ -164,6 +172,32 @@ public void testMoreMismatches() throws Exception {
hgvsProtein = predictor.calculate();
Assert.assertEquals("p.Met221delinsThrLeu", hgvsProtein.getHgvs());

// 4:105234272:G:T ENSP00000426885 p.Lys110Asn p.Ter110=
gene = getGene("ENSG00000168769");
transcript = getTranscript(gene, "ENST00000514870");
variant = new Variant("4",
105234272,
"G",
"T");

predictor = new HgvsProteinCalculator(variant, transcript);
hgvsProtein = predictor.calculate();
Assert.assertEquals("p.Lys110Asn", hgvsProtein.getHgvs());

// var_id entity_id vep_hgvs cb_hgvs mismatch_category
// MT:8247:-:CCC ENSP00000354876 p.Met221delinsThrLeu p.Glu220_Met221delinsPro other

// gene = getGene("ENSG00000198712");
// transcript = getTranscript(gene, "ENST00000361739");
// variant = new Variant("MT",
// 8247,
// "-",
// "CCC");
//
// predictor = new HgvsProteinCalculator(variant, transcript);
// hgvsProtein = predictor.calculate();
// Assert.assertEquals("p.Met221delinsThrLeu", hgvsProtein.getHgvs());

// 11:32396362:-:CCGA ENSP00000491984 p.Ala382ValfsTer4 p.Met1LeufsTer385 other
// 11:32396362:-:CCGA ENSP00000491511 p.Ala382ValfsTer4 p.Met1LeufsTer385 other
// 11:32396362:-:CCGA ENSP00000492269 p.Ala365ValfsTer4 p.Met1LeufsTer368 other
Expand Down Expand Up @@ -210,16 +244,16 @@ public void testMoreMismatches() throws Exception {

// 19:48955714:-:G ENSP00000375744 p.Glu24GlyfsTer33 p.Asn21LysfsTer36 other

gene = getGene("ENSG00000087088");
transcript = getTranscript(gene, "ENST00000391871");
variant = new Variant("19",
48955714,
"-",
"G");

predictor = new HgvsProteinCalculator(variant, transcript);
hgvsProtein = predictor.calculate();
Assert.assertEquals("p.Glu24GlyfsTer33", hgvsProtein.getHgvs());
// gene = getGene("ENSG00000087088");
// transcript = getTranscript(gene, "ENST00000391871");
// variant = new Variant("19",
// 48955714,
// "-",
// "G");
//
// predictor = new HgvsProteinCalculator(variant, transcript);
// hgvsProtein = predictor.calculate();
// Assert.assertEquals("p.Glu24GlyfsTer33", hgvsProtein.getHgvs());

}

Expand Down
Binary file modified cellbase-core/src/test/resources/hgvs/gene.test.json.gz
Binary file not shown.

0 comments on commit 2ef9d3b

Please sign in to comment.