diff --git a/biodata-models/src/main/avro/variant.avdl b/biodata-models/src/main/avro/variant.avdl index 9443cd8a6..645de3cbc 100644 --- a/biodata-models/src/main/avro/variant.avdl +++ b/biodata-models/src/main/avro/variant.avdl @@ -147,6 +147,18 @@ protocol Variants { union { null, float } pValue = null; } + record OriginalCall { + /** + * Original variant ID before normalization including all secondary alternates. + */ + string variantId; + + /** + * Alternate allele index within the original multi-allelic variant call from which this variant was decomposed. + */ + union {null, int} alleleIndex; + } + record FileEntry { /** * Unique identifier of the source file. @@ -158,7 +170,7 @@ protocol Variants { * * {position}:{reference}:{alternate}(,{other_alternate})*:{allele_index} */ - union { null, string } call; + union { null, OriginalCall } call; /** * Optional data that probably depend on the format of the file the diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/variant/StudyEntry.java b/biodata-models/src/main/java/org/opencb/biodata/models/variant/StudyEntry.java index 0b4ace29b..5e5845423 100644 --- a/biodata-models/src/main/java/org/opencb/biodata/models/variant/StudyEntry.java +++ b/biodata-models/src/main/java/org/opencb/biodata/models/variant/StudyEntry.java @@ -146,16 +146,6 @@ public String getSampleDataKeysAsString() { return impl.getSampleDataKeys() == null ? 
null : String.join(":", impl.getSampleDataKeys()); } - @Deprecated - public List getFormat() { - return getSampleDataKeys(); - } - - @Deprecated - public StudyEntry setFormat(List value) { - return setSampleDataKeys(value); - } - /** * Do not modify this list * @return @@ -266,53 +256,6 @@ public List getSampleData(int samplePosition) { } } - @Deprecated - public List> getSamplesData() { - List samples = impl.getSamples(); - if (samples == null) { - return null; - } else { - return samples.stream().map(SampleEntry::getData).collect(Collectors.toList()); - } - } - - @Deprecated - public void setSamplesData(List> value) { - if (value == null) { - impl.setSamples(null); - } else { - impl.setSamples(value.stream().map(s -> new SampleEntry(null, null, s)).collect(Collectors.toList())); - } - } - - @Deprecated - public Map> getSamplesDataAsMap() { - requireSamplesPosition(); - - Map> samplesDataMap = new HashMap<>(); - for (Map.Entry entry : samplesPosition.entrySet()) { - samplesDataMap.put(entry.getKey(), getSampleDataAsMap(entry.getKey())); - } - - return Collections.unmodifiableMap(samplesDataMap); - } - - @Deprecated - public Map getSampleDataAsMap(String sampleName) { - requireSamplesPosition(); - if (samplesPosition.containsKey(sampleName)) { - HashMap sampleDataMap = new HashMap<>(); - Iterator iterator = getSampleDataKeys().iterator(); - List sampleDataList = getSampleData(sampleName); - for (String data : sampleDataList) { - sampleDataMap.put(iterator.next(), data); - } - - return Collections.unmodifiableMap(sampleDataMap); - } - return null; - } - public StudyEntry addSampleData(String sampleName, Map sampleData) { if (getSampleDataKeys() == null) { setSampleDataKeys(new ArrayList<>(sampleData.keySet())); @@ -524,7 +467,7 @@ public String getFileId() { public void setFileId(String fileId) { if (impl.getFiles().isEmpty()) { - impl.getFiles().add(new FileEntry(fileId, "", new HashMap<>())); + impl.getFiles().add(new FileEntry(fileId, null, new HashMap<>())); } 
else { impl.getFiles().get(0).setFileId(fileId); } diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/variant/VariantBuilder.java b/biodata-models/src/main/java/org/opencb/biodata/models/variant/VariantBuilder.java index 798641944..743eb1210 100644 --- a/biodata-models/src/main/java/org/opencb/biodata/models/variant/VariantBuilder.java +++ b/biodata-models/src/main/java/org/opencb/biodata/models/variant/VariantBuilder.java @@ -91,7 +91,7 @@ public class VariantBuilder { private List sampleDataKeys; private List samples; private Map fileData; - private String call; + private OriginalCall call; private String variantString; @@ -414,7 +414,7 @@ public VariantBuilder addFileData(String key, String value) { return this; } - public VariantBuilder setCall(String call) { + public VariantBuilder setCall(OriginalCall call) { checkFile("set call"); this.call = call; return this; @@ -612,9 +612,16 @@ public VariantProto.Variant buildProtoVariant(VariantProto.VariantOrBuilder reus .setStudyId(studyId); if (fileId != null) { - studyBuilder.addFiles(VariantProto.FileEntry.newBuilder() + VariantProto.FileEntry.Builder fileBuilder = VariantProto.FileEntry.newBuilder() .setFileId(fileId) - .putAllData(fileData)); + .putAllData(fileData); + if (call != null) { + fileBuilder.setCall(VariantProto.OriginalCall + .newBuilder() + .setVariantId(call.getVariantId()) + .setAlleleIndex(call.getAlleleIndex())); + } + studyBuilder.addFiles(fileBuilder); } for (int i = 1; i < alternates.size(); i++) { @@ -1153,7 +1160,7 @@ private void parseStructuralVariationFileData(String key, String value) { if (alternates.size() > 1) { throw new IllegalArgumentException("Found SVINSSEQ in a multi allelic variant!"); } else { - setCall(start + ":" + reference + ":" + alternates.get(0) + ":" + 0); + setCall(new OriginalCall(toString(), 0)); setAlternate(reference + value); } } @@ -1344,7 +1351,7 @@ public String toString() { + start + "-" + end + ":" + reference + ":" - + (alternates == null 
? "null" : String.join(",", alternates)); + + (alternates == null ? "-" : String.join(",", alternates)); } private static void ifNotNull(T value, Consumer setter) { diff --git a/biodata-models/src/main/proto/protobuf/opencb/variant.proto b/biodata-models/src/main/proto/protobuf/opencb/variant.proto index 75d3d1e5b..292312fd1 100644 --- a/biodata-models/src/main/proto/protobuf/opencb/variant.proto +++ b/biodata-models/src/main/proto/protobuf/opencb/variant.proto @@ -127,9 +127,22 @@ message VariantStats { string mgfGenotype = 13; } +message OriginalCall { + /** + * Original variant ID before normalization including all secondary alternates. + */ + string variantId = 1; + + /** + * Alternate allele index of the original multi-allellic variant call in which was decomposed. + */ + int32 alleleIndex = 2; +} + + message FileEntry { string fileId = 1; - string call = 2; + OriginalCall call = 2; map data = 3; } diff --git a/biodata-models/src/test/java/org/opencb/biodata/models/variant/VariantBuilderTest.java b/biodata-models/src/test/java/org/opencb/biodata/models/variant/VariantBuilderTest.java index 391174270..16d50886f 100644 --- a/biodata-models/src/test/java/org/opencb/biodata/models/variant/VariantBuilderTest.java +++ b/biodata-models/src/test/java/org/opencb/biodata/models/variant/VariantBuilderTest.java @@ -99,7 +99,7 @@ public void buildSVInsertion() { assertEquals(length, v.getLength().intValue()); assertEquals(length, v.getLengthAlternate().intValue()); assertEquals(1, v.getLengthReference().intValue()); - assertEquals("1000:A::0", v.getStudies().get(0).getFiles().get(0).getCall()); + assertEquals(new OriginalCall("1:1000:A:", 0), v.getStudies().get(0).getFiles().get(0).getCall()); } diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index 73606a0cd..08d64ddff 100644 --- 
a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ -313,7 +313,7 @@ public List normalize(List batch, boolean reuse) throws NonSta } // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order! for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) { - String call = start + ":" + reference + ":" + alternate + ":" + keyFields.getNumAllele(); + OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele()); Variant normalizedVariant = newVariant(variant, keyFields, sv); if (keyFields.getPhaseSet() != null) { StudyEntry studyEntry = new StudyEntry(); @@ -361,15 +361,18 @@ public List normalize(List batch, boolean reuse) throws NonSta && keyFieldsList.get(0).getReference().equals(reference) && keyFieldsList.get(0).getAlternate().equals(alternate); - String callPrefix; + String originalCall; if (entry.getFiles() != null && !entry.getFiles().isEmpty() - && StringUtils.isNotEmpty(entry.getFiles().get(0).getCall())) { - String call = entry.getFiles().get(0).getCall(); - // Remove allele index - callPrefix = call.substring(0, call.lastIndexOf(':') + 1); + && entry.getFiles().get(0).getCall() != null + && StringUtils.isNotEmpty(entry.getFiles().get(0).getCall().getVariantId())) { + originalCall = entry.getFiles().get(0).getCall().getVariantId(); } else { - callPrefix = start + ":" + reference + ":" + String.join(",", originalAlternates) + ":"; + StringBuilder sb = new StringBuilder(variant.toString()); + for (int i = 1; i < originalAlternates.size(); i++) { + sb.append(",").append(originalAlternates.get(i)); + } + originalCall = sb.toString(); } // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order! 
@@ -378,13 +381,12 @@ public List normalize(List batch, boolean reuse) throws NonSta if (keyFields.alternate.equals(VariantBuilder.REF_ONLY_ALT)) { continue; } - String call = callPrefix + keyFields.getNumAllele(); final Variant normalizedVariant; final StudyEntry normalizedEntry; final List samples; if (reuse && keyFieldsList.size() == 1) { //Only reuse for non multiallelic variants //Reuse variant. Set new fields. normalizedVariant = variant; variant.setStart(keyFields.getStart()); variant.setEnd(keyFields.getEnd()); @@ -397,7 +399,12 @@ public List normalize(List batch, boolean reuse) throws NonSta // variant.setSv(sv); // } normalizedEntry = entry; - entry.getFiles().forEach(fileEntry -> fileEntry.setCall(sameVariant ? null : call)); + if (!sameVariant) { + OriginalCall call = new OriginalCall(originalCall, keyFields.numAllele); + entry.getFiles().forEach(fileEntry -> { + fileEntry.setCall(call); + }); + } samples = entry.getSamples(); } else { normalizedVariant = newVariant(variant, keyFields, sv); @@ -408,9 +415,15 @@ public List normalize(List batch, boolean reuse) throws NonSta normalizedEntry.setSampleDataKeys(entry.getSampleDataKeys()); List files = new ArrayList<>(entry.getFiles().size()); + OriginalCall call; + if (sameVariant) { + call = null; + } else { + call = new OriginalCall(originalCall, keyFields.numAllele); + } for (FileEntry file : entry.getFiles()) { HashMap fileData = new HashMap<>(file.getData()); - files.add(new FileEntry(file.getFileId(), sameVariant ? null : call, fileData)); + files.add(new FileEntry(file.getFileId(), call, fileData)); } normalizedEntry.setFiles(files); normalizedVariant.addStudyEntry(normalizedEntry); @@ -461,6 +474,7 @@ public List normalize(List batch, boolean reuse) throws NonSta // for all mnv-phased variants if (normalizedEntry.getFiles().size() == 0) { // Use mnv string as file Id so that it can be later identified. 
+ OriginalCall call = new OriginalCall(originalCall, keyFields.numAllele); normalizedEntry.setFiles(Collections.singletonList(new FileEntry(keyFields.getPhaseSet(), call, null))); } } @@ -470,7 +484,7 @@ public List normalize(List batch, boolean reuse) throws NonSta normalizedVariants.add(normalizedVariant); } catch (Exception e) { - logger.warn("Error parsing variant " + call + ", numAllele " + keyFields.getNumAllele(), e); + logger.warn("Error parsing variant " + originalCall + ", numAllele " + keyFields.getNumAllele(), e); throw e; } } diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java index 816c82d9e..22629fa08 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java @@ -25,6 +25,7 @@ import org.apache.commons.lang3.tuple.MutablePair; import org.apache.commons.lang3.tuple.Pair; import org.opencb.biodata.models.variant.StudyEntry; +import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.protobuf.VariantProto; import org.opencb.biodata.tools.Converter; import org.opencb.commons.datastore.core.ObjectMap; @@ -202,10 +203,10 @@ protected Set getNoCallAlleleIdx(List alleleList) { protected static Map buildReferenceAllelesMap(Iterator callsIterator) { Map referenceAlleles = new HashMap<>(); callsIterator.forEachRemaining(call -> { - String[] split = splitCall(call); - if (split != null) { - String originalReference = VariantContextConverter.getOriginalReference(split); - Integer originalPosition = VariantContextConverter.getOriginalPosition(split); + if (call != null) { + Variant originalVariant = new Variant(call.split(",")[0]); + String originalReference = originalVariant.getReference(); + Integer 
originalPosition = originalVariant.getStart(); for (int i = 0; i < originalReference.length(); i++) { referenceAlleles.put(originalPosition + i, originalReference.charAt(i)); } @@ -375,74 +376,6 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en return variantContextBuilder.make(); } - protected static String[] splitCall(String call) { - if (StringUtils.isNotEmpty(call)) { - int idx1 = call.indexOf(':'); - int idx2 = call.indexOf(':', idx1 + 1); - int idx3 = call.lastIndexOf(':'); // Get lastIndexOf, as it may be other intermediate ':' from symbolic or breakend alleles - return new String[]{ - call.substring(0, idx1), - call.substring(idx1 + 1, idx2), - call.substring(idx2 + 1, idx3), - call.substring(idx3 + 1) - }; - } else { - return null; - } - } - - /** - * Assumes that ori is in the form "POS:REF:ALT_0(,ALT_N)*:ALT_IDX". - * ALT_N is the n-th allele if this is the n-th variant resultant of a multiallelic vcf row - * - * @param ori - * @return - */ - protected static List getOriginalAlleles(String[] ori) { - if (ori != null && ori.length == 4) { - String[] multiAllele = ori[2].split(","); - if (multiAllele.length != 1) { - ArrayList alleles = new ArrayList<>(multiAllele.length + 1); - alleles.add(ori[1]); - alleles.addAll(Arrays.asList(multiAllele)); - return alleles; - } else { - return Arrays.asList(ori[1], ori[2]); - } - } - - return null; - } - - protected static String getOriginalReference(String[] ori) { - if (ori != null && ori.length == 4) { - return ori[1]; - } - return null; - } - - protected static String getOriginalAlleleIndex(String[] ori) { - if (ori != null && ori.length == 4) { - return ori[3]; - } - return null; - } - - /** - * Assumes that ori is in the form "POS:REF:ALT_0(,ALT_N)*:ALT_IDX". 
- * - * @param ori - * @return - */ - protected static Integer getOriginalPosition(String[] ori) { - - if (ori != null && ori.length == 4) { - return Integer.parseInt(ori[0]); - } - - return null; - } - protected abstract Object getStudy(T variant); protected abstract Iterator getStudiesId(T variant); diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java index 05e85851b..fac5c51d9 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java @@ -55,7 +55,10 @@ public VariantContext convert(Variant variant) { // CHROM START END REFERENCE ALTERNATE String chromosome = variant.getChromosome(); VariantType type = variant.getType(); - Map referenceAlleles = buildReferenceAllelesMap(studyEntry.getFiles().stream().map(FileEntry::getCall).iterator()); + Map referenceAlleles = buildReferenceAllelesMap(studyEntry.getFiles() + .stream() + .map(entry -> entry.getCall() == null ? 
null : entry.getCall().getVariantId()) + .iterator()); Pair adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles); int start = adjustedStartEndPositions.getLeft(); int end = adjustedStartEndPositions.getRight(); @@ -477,18 +480,6 @@ private Map addAnnotations(Variant variant, List annotat return attributes; } - protected static String[] getOri(StudyEntry studyEntry) { - - List files = studyEntry.getFiles(); - if (!files.isEmpty()) { - String call = files.get(0).getCall(); - if (call != null && !call.isEmpty()) { - return call.split(":"); - } - } - return null; - } - @Override protected StudyEntry getStudy(Variant variant) { return variant.getStudy(this.studyNameMap.get(this.studyIdString)); diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/ga4gh/Ga4ghVariantConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/ga4gh/Ga4ghVariantConverter.java index 8aa98c23e..242698b20 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/ga4gh/Ga4ghVariantConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/ga4gh/Ga4ghVariantConverter.java @@ -154,7 +154,7 @@ protected Map> parseInfo(List files) { int fileIdx = 0; for (FileEntry file : files) { fileIds.add(file.getFileId() == null ? "" : file.getFileId()); - ori.add(file.getCall() == null ? "" : file.getCall()); + ori.add(file.getCall() == null ? 
"" : file.getCall().getVariantId()); Map fileData = file.getData(); for (Map.Entry field : fileData.entrySet()) { List value; diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantAvroToVariantProtoConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantAvroToVariantProtoConverter.java index 3419a1f4f..3d3aeffc2 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantAvroToVariantProtoConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantAvroToVariantProtoConverter.java @@ -99,7 +99,11 @@ private VariantProto.FileEntry.Builder toProto(FileEntry fileEntry) { VariantProto.FileEntry.Builder fileBuilder = VariantProto.FileEntry.newBuilder(); set(fileEntry::getFileId, fileBuilder::setFileId); set(fileEntry::getData, fileBuilder::putAllData); - set(fileEntry::getCall, fileBuilder::setCall); + set(fileEntry::getCall, originalCall -> { + fileBuilder.setCall(VariantProto.OriginalCall.newBuilder() + .setVariantId(originalCall.getVariantId()) + .setAlleleIndex(originalCall.getAlleleIndex())); + }); return fileBuilder; } diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantContextToVariantProtoConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantContextToVariantProtoConverter.java index c41fb6848..e8a99fea5 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantContextToVariantProtoConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantContextToVariantProtoConverter.java @@ -112,9 +112,6 @@ public VariantProto.Variant convert(VariantContext variantContext, VariantProto. 
VariantProto.FileEntry.Builder fileEntry = VariantProto.FileEntry.newBuilder(); fileEntry.setFileId(fileId); - fileEntry.setCall(variantContext.getStart() - + ":" + variantContext.getReference() - + ":" + StringUtils.join(variantContext.getAlternateAlleles(), ",")); Map fileData = new HashMap<>(); for (String key : variantContext.getAttributes().keySet()) { fileData.put(key, variantContext.getAttributeAsString(key, "")); diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java index a5f149a05..85b062e23 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java @@ -55,7 +55,7 @@ public VariantContext convert(VariantProto.Variant variant) { String chromosome = variant.getChromosome(); VariantProto.VariantType type = variant.getType(); Map referenceAlleles = buildReferenceAllelesMap(studyEntry.getFilesList().stream() - .map(VariantProto.FileEntry::getCall).iterator()); + .map(fileEntry -> fileEntry.getCall()==null?null:fileEntry.getCall().getVariantId()).iterator()); Pair adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry.getSecondaryAlternatesList(), referenceAlleles); int start = adjustedStartEndPositions.getLeft(); int end = adjustedStartEndPositions.getRight(); diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantToProtoVcfRecord.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantToProtoVcfRecord.java index 96a66d363..62eed734a 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantToProtoVcfRecord.java +++ 
b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantToProtoVcfRecord.java @@ -185,9 +185,9 @@ public VcfRecord convertUsingSlicePosition(Variant variant, int slicePosition) { Map fileData = Collections.unmodifiableMap(file.getData()); //DO NOT MODIFY if ( !variant.getType().equals(VariantType.NO_VARIATION) - && file.getCall() != null && !file.getCall().isEmpty() - && !file.getCall().equals(variant.toString() + ":0" ) ) { - recordBuilder.setCall(file.getCall()); + && file.getCall() != null + && !file.getCall().getVariantId().equals(variant.toString())) { + recordBuilder.setCall(file.getCall().getVariantId()+":"+file.getCall().getAlleleIndex()); } /* Filter */ diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VcfRecordProtoToVariantConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VcfRecordProtoToVariantConverter.java index 7623fceeb..2a1614c76 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VcfRecordProtoToVariantConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VcfRecordProtoToVariantConverter.java @@ -21,10 +21,7 @@ import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; -import org.opencb.biodata.models.variant.avro.AlternateCoordinate; -import org.opencb.biodata.models.variant.avro.FileEntry; -import org.opencb.biodata.models.variant.avro.SampleEntry; -import org.opencb.biodata.models.variant.avro.VariantType; +import org.opencb.biodata.models.variant.avro.*; import org.opencb.biodata.models.variant.protobuf.VariantProto; import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos; import org.opencb.biodata.tools.Converter; @@ -87,7 +84,13 @@ public Variant convert(VcfSliceProtos.VcfRecord vcfRecord, String chromosome, in fileEntry.setFileId(fileId); Map fileData = getFileData(vcfRecord); 
fileEntry.setData(fileData); - fileEntry.setCall(vcfRecord.getCall().isEmpty() ? null : vcfRecord.getCall()); + if (!vcfRecord.getCall().isEmpty()) { + int idx = vcfRecord.getCall().lastIndexOf(":"); + OriginalCall call = new OriginalCall( + vcfRecord.getCall().substring(0, idx), + Integer.valueOf(vcfRecord.getCall().substring(idx + 1))); + fileEntry.setCall(call); + } if (vcfRecord.getType().equals(VariantProto.VariantType.NO_VARIATION)) { fileData.put("END", Integer.toString(end)); } diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/merge/VariantMerger.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/merge/VariantMerger.java index 223365f14..fb9777456 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/merge/VariantMerger.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/merge/VariantMerger.java @@ -32,10 +32,7 @@ import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantBuilder; -import org.opencb.biodata.models.variant.avro.AlternateCoordinate; -import org.opencb.biodata.models.variant.avro.FileEntry; -import org.opencb.biodata.models.variant.avro.SampleEntry; -import org.opencb.biodata.models.variant.avro.VariantType; +import org.opencb.biodata.models.variant.avro.*; import org.opencb.biodata.models.variant.metadata.VariantFileHeader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -267,7 +264,7 @@ public Variant createFromTemplate(Variant target) { var.setType(target.getType()); for(StudyEntry tse : target.getStudies()){ StudyEntry se = new StudyEntry(tse.getStudyId()); - se.setFiles(Collections.singletonList(new FileEntry("", "", new HashMap<>()))); + se.setFiles(Collections.singletonList(new FileEntry("", null, new HashMap<>()))); se.setSampleDataKeys(Arrays.asList(getGtKey(), getFilterKey())); se.setSamplesPosition(new HashMap<>()); se.setSamples(new ArrayList<>()); 
@@ -1018,15 +1015,15 @@ public static AlternateCoordinate getMainAlternate(Variant variant) { private void mergeFile(Variant current, Variant other, VariantAlternateRearranger rearranger, StudyEntry currentStudy, StudyEntry otherStudy) { - String call = other.getStart() + ":" + other.getReference() + ":" + other.getAlternate() + ":0"; List files = otherStudy.getFiles().stream() .map(fileEntry -> FileEntry.newBuilder(fileEntry).build()) .collect(Collectors.toList()); if (!current.toString().equals(other.toString())) { for (FileEntry file : files) { - if (StringUtils.isEmpty(file.getCall())) { - file.setCall(call); + // Keep any call already recorded on the file entry; only fill in missing ones. + if (file.getCall() == null) { + file.setCall(new OriginalCall(other.toString(), 0)); } } } diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedEVSStatsCalculator.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedEVSStatsCalculator.java index d7d0eb261..09c333823 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedEVSStatsCalculator.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedEVSStatsCalculator.java @@ -64,11 +64,12 @@ public VariantAggregatedEVSStatsCalculator(Properties tagMap) { protected void parseStats(Variant variant, StudyEntry study, int numAllele, String reference, String[] alternateAlleles, Map info) { FileEntry fileEntry = study.getFiles().get(0); // EVS params are not rearranged when normalizing. 
Use original call - if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) { - String[] ori = fileEntry.getCall().split(":"); - numAllele = Integer.parseInt(ori[3]); - alternateAlleles = ori[2].split(","); - reference = ori[1]; + if (fileEntry.getCall() != null) { + numAllele = fileEntry.getCall().getAlleleIndex(); + alternateAlleles = fileEntry.getCall().getVariantId().split(","); + Variant ori = new Variant(alternateAlleles[0]); + alternateAlleles[0] = ori.getAlternate(); + reference = ori.getReference(); } VariantStats stats = new VariantStats(StudyEntry.DEFAULT_COHORT); if (info.containsKey("MAF")) { @@ -92,11 +93,12 @@ protected void parseStats(Variant variant, StudyEntry study, int numAllele, Stri protected void parseMappedStats(Variant variant, StudyEntry studyEntry, int numAllele, String reference, String[] alternateAlleles, Map info) { FileEntry fileEntry = studyEntry.getFiles().get(0); - if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) { - String[] ori = fileEntry.getCall().split(":"); - numAllele = Integer.parseInt(ori[3]); - alternateAlleles = ori[2].split(","); - reference = ori[1]; + if (fileEntry.getCall() != null) { + numAllele = fileEntry.getCall().getAlleleIndex(); + alternateAlleles = fileEntry.getCall().getVariantId().split(","); + Variant ori = new Variant(alternateAlleles[0]); + alternateAlleles[0] = ori.getAlternate(); + reference = ori.getReference(); } if (tagMap != null) { diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedExacStatsCalculator.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedExacStatsCalculator.java index 6aa94e3af..48b204cc7 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedExacStatsCalculator.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedExacStatsCalculator.java @@ -75,10 +75,10 @@ protected void 
parseStats(Variant variant, StudyEntry studyEntry, int numAllele, FileEntry fileEntry = studyEntry.getFiles().get(0); int numAlleleOri; String[] alternateAllelesOri; - if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) { - String[] ori = fileEntry.getCall().split(":"); - numAlleleOri = Integer.parseInt(ori[3]); - alternateAllelesOri = ori[2].split(","); + if (fileEntry.getCall() != null) { + numAlleleOri = fileEntry.getCall().getAlleleIndex(); + alternateAllelesOri = fileEntry.getCall().getVariantId().split(","); + alternateAllelesOri[0] = new Variant(alternateAllelesOri[0]).getAlternate(); } else { numAlleleOri = numAllele; alternateAllelesOri = alternateAlleles; @@ -152,10 +152,10 @@ protected void parseMappedStats(Variant variant, StudyEntry studyEntry, int numA FileEntry fileEntry = studyEntry.getFiles().get(0); int numAlleleOri; String[] alternateAllelesOri; - if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) { - String[] ori = fileEntry.getCall().split(":"); - numAlleleOri = Integer.parseInt(ori[3]); - alternateAllelesOri = ori[2].split(","); + if (fileEntry.getCall() != null) { + numAlleleOri = fileEntry.getCall().getAlleleIndex(); + alternateAllelesOri = fileEntry.getCall().getVariantId().split(","); + alternateAllelesOri[0] = new Variant(alternateAllelesOri[0]).getAlternate(); } else { numAlleleOri = numAllele; alternateAllelesOri = alternateAlleles; diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedStatsCalculator.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedStatsCalculator.java index 29b4353b8..93da31853 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedStatsCalculator.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/stats/VariantAggregatedStatsCalculator.java @@ -20,11 +20,11 @@ package org.opencb.biodata.tools.variant.stats; import 
org.apache.commons.lang.StringUtils; -import org.opencb.biodata.models.feature.Genotype; -import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.formats.variant.vcf4.VariantAggregatedVcfFactory; -import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.formats.variant.vcf4.VariantVcfFactory; +import org.opencb.biodata.models.feature.Genotype; +import org.opencb.biodata.models.variant.StudyEntry; +import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.AlternateCoordinate; import org.opencb.biodata.models.variant.avro.FileEntry; import org.opencb.biodata.models.variant.stats.VariantStats; @@ -291,14 +291,10 @@ protected void calculate(Variant variant, StudyEntry studyEntry, int numAllele, // Get the original variant call to parse this field FileEntry fileEntry = studyEntry.getFiles().get(0); int numAlleleOri; - String[] alternateAllelesOri; - if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) { - String[] ori = fileEntry.getCall().split(":"); - numAlleleOri = Integer.parseInt(ori[3]); - alternateAllelesOri = ori[2].split(","); + if (fileEntry.getCall() != null) { + numAlleleOri = fileEntry.getCall().getAlleleIndex(); } else { numAlleleOri = numAllele; - alternateAllelesOri = alternateAlleles; } for (int i = 0; i < gtcs.length; i++) { diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java index 9d1ac7f0f..f097d1e1a 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java @@ -143,7 +143,7 @@ protected void testSampleNormalization(int position, String ref, String altsCsv, assertEquals(expected.getStart(), v.getStart().intValue()); assertEquals(expected.getAlternate(), 
v.getAlternate()); assertEquals(expected.getReference(), v.getReference()); - int actual = Integer.parseInt(v.getStudies().get(0).getFiles().get(0).getCall().split(":")[3]); + int actual = v.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex(); assertEquals(expected.getNumAllele(), actual); for (AlternateCoordinate alternate : v.getStudy(studyId).getSecondaryAlternates()) { assertNotNull(alternate); diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java index 63fa213e9..859d5387d 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java @@ -301,9 +301,9 @@ public void testNormalizeMultiallelicNoVariationSymbolic() throws NonStandardCom List variants = normalizer.normalize(Collections.singletonList(variant), false); assertEquals(1, variants.size()); Variant normalizedVariant = variants.get(0); - String call = normalizedVariant.getStudies().get(0).getFiles().get(0).getCall(); - assertEquals("10:A:C,<*>:0", call); - variant.getStudies().get(0).getFiles().get(0).setCall("10:A:C,<*>:0"); + String call = normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId(); + assertEquals("2:10:A:C,<*>", call); + variant.getStudies().get(0).getFiles().get(0).setCall(new OriginalCall("2:10:A:C,<*>", 0)); assertEquals(variant.toJson(), normalizedVariant.toJson()); } @@ -316,9 +316,9 @@ public void testNormalizeMultiallelicNoVariationSymbolicNonRef() throws NonStand List variants = normalizer.normalize(Collections.singletonList(variant), false); assertEquals(1, variants.size()); Variant normalizedVariant = variants.get(0); - String call = normalizedVariant.getStudies().get(0).getFiles().get(0).getCall(); - assertEquals("10:A:C,:0", call); - 
variant.getStudies().get(0).getFiles().get(0).setCall("10:A:C,:0"); + String call = normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId(); + assertEquals("2:10:A:C,", call); + variant.getStudies().get(0).getFiles().get(0).setCall(new OriginalCall("2:10:A:C,", 0)); variant.getStudies().get(0).getSecondaryAlternates().get(0).setAlternate("<*>"); assertEquals(variant.toJson(), normalizedVariant.toJson()); } @@ -496,7 +496,8 @@ public void testCNVsNormalization() throws Exception { StructuralVariantType.COPY_NUMBER_LOSS, null), normalizedVariantList.get(0).getSv()); // Normalize CNV alternate assertEquals("", normalizedVariantList.get(0).getAlternate()); - assertEquals("100:C::0", normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall()); + assertEquals("1:86<100<150-150<200<211:C:", normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); } @Test @@ -524,10 +525,14 @@ public void testVNCNormalizationMultiallelic() throws NonStandardCompliantSample assertEquals(new StructuralVariation(90, 110, null, null, 4, null, null, StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariantList.get(3).getSv()); - assertEquals("100:C:,,,:0", normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall()); - assertEquals("100:C:,,,:1", normalizedVariantList.get(1).getStudies().get(0).getFiles().get(0).getCall()); - assertEquals("100:C:,,,:2", normalizedVariantList.get(2).getStudies().get(0).getFiles().get(0).getCall()); - assertEquals("100:C:,,,:3", normalizedVariantList.get(3).getStudies().get(0).getFiles().get(0).getCall()); + assertEquals("1:90<100<110-200:C:,,,", normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, 
normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + assertEquals("1:90<100<110-200:C:,,,", normalizedVariantList.get(1).getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(1, normalizedVariantList.get(1).getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + assertEquals("1:90<100<110-200:C:,,,", normalizedVariantList.get(2).getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(2, normalizedVariantList.get(2).getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + assertEquals("1:90<100<110-200:C:,,,", normalizedVariantList.get(3).getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(3, normalizedVariantList.get(3).getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); for (Variant v : normalizedVariantList) { assertEquals(101, v.getStart().intValue()); @@ -552,7 +557,8 @@ public void testCNVsNormalizationCopyNumber() throws NonStandardCompliantSampleF assertEquals("", normalizedVariant.getAlternate()); assertEquals(101, normalizedVariant.getStart().intValue()); assertEquals("", normalizedVariant.getReference()); - assertEquals("100:C::0", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall()); + assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); } @@ -622,7 +628,8 @@ public void testNormalizeINS() throws NonStandardCompliantSampleField { assertEquals("", normalized.getReference()); assertEquals(seq, normalized.getAlternate()); assertEquals(new StructuralVariation(), normalized.getSv()); - assertEquals("100:N::0", normalized.getStudies().get(0).getFiles().get(0).getCall()); + assertEquals("1:100-100:N:", 
normalized.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalized.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); } @Test diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java index e27bc36c9..821fe58fb 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java @@ -66,13 +66,18 @@ private void testBuildAllele(String varStr) throws NonStandardCompliantSampleFie List normalized = new VariantNormalizer().normalize(Collections.singletonList(origVariant), false); Variant v = normalized .stream() - .filter(var -> StringUtils.endsWith(var.getStudies().get(0).getFiles().get(0).getCall(), "0")) + .filter(var -> var.getStudies().get(0).getFiles().get(0).getCall() != null) + .filter(var -> var.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex() == 0) .findAny() .orElse(origVariant); assertNotNull(v); - Map referenceMap = VariantContextConverter.buildReferenceAllelesMap(v.getStudies().get(0).getFiles().stream().map(FileEntry::getCall).iterator()); + Map referenceMap = VariantContextConverter.buildReferenceAllelesMap( + v.getStudies().get(0).getFiles() + .stream() + .map(entry -> entry.getCall() == null ? 
null : entry.getCall().getVariantId()) + .iterator()); Pair adjustedRange = VariantAvroToVariantContextConverter.adjustedVariantStart(v, v.getStudy("S"), referenceMap); System.out.println(""); diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/proto/VcfRecordProtoToVariantConverterTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/proto/VcfRecordProtoToVariantConverterTest.java index 056498d47..9b73ae6b9 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/proto/VcfRecordProtoToVariantConverterTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/proto/VcfRecordProtoToVariantConverterTest.java @@ -4,6 +4,7 @@ import org.junit.Test; import org.opencb.biodata.models.variant.*; import org.opencb.biodata.models.variant.avro.FileEntry; +import org.opencb.biodata.models.variant.avro.OriginalCall; import org.opencb.biodata.models.variant.avro.SampleEntry; import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos; @@ -73,7 +74,7 @@ public void testConvert() throws Exception { fileData.put("Key1", "V1"); fileData.put("Key2", "V2"); - studyEntry.setFiles(Collections.singletonList(new FileEntry(fileId, "5:A:C:0", fileData))); + studyEntry.setFiles(Collections.singletonList(new FileEntry(fileId, new OriginalCall("1:5:A:C", 0), fileData))); variant.setStudies(Collections.singletonList(studyEntry)); VcfSliceProtos.VcfRecord vcfRecord = toProto.convert(variant); @@ -93,7 +94,7 @@ public void testConvertNoSamples() throws Exception { .setSampleDataKeys(Collections.emptyList()) .setSamples(Collections.emptyList()) .setFileId(fileId) - .setCall("5:A:C:0") + .setCall(new OriginalCall("1:5:A:C", 0)) .addFileData("Key1", "V1") .addFileData("Key2", "V2") .build(); @@ -103,7 +104,7 @@ public void testConvertNoSamples() throws Exception { .setSampleDataKeys("GT", "DP") .setSamples(Collections.emptyList()) .setFileId(fileId) - .setCall("5:A:C:0") + 
.setCall(new OriginalCall("1:5:A:C", 0)) .addFileData("Key1", "V1") .addFileData("Key2", "V2") .build(); @@ -148,7 +149,7 @@ public void testConvertDefaultValues() throws Exception { fileData.put(StudyEntry.QUAL, "57"); fileData.put("Key", "Value"); - studyEntry.setFiles(Collections.singletonList(new FileEntry(fileId, "5:A:C:0", fileData))); + studyEntry.setFiles(Collections.singletonList(new FileEntry(fileId, null, fileData))); variant.setStudies(Collections.singletonList(studyEntry)); VcfSliceProtos.VcfRecord vcfRecord = toProto.convert(variant, 100);