Skip to content

Commit

Permalink
models: Replace FileEntry.call with dedicated model #179
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Mar 24, 2020
1 parent 93591c2 commit 6875e4a
Show file tree
Hide file tree
Showing 22 changed files with 157 additions and 233 deletions.
14 changes: 13 additions & 1 deletion biodata-models/src/main/avro/variant.avdl
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,18 @@ protocol Variants {
union { null, float } pValue = null;
}

/**
* Reference to the original variant call that a normalized variant was derived from.
*/
record OriginalCall {
/**
* Original variant ID before normalization, including all secondary alternates.
*/
string variantId;

/**
* Index of the alternate allele, within the original multi-allelic variant call,
* that this variant was decomposed from. May be null.
*/
union {null, int} alleleIndex;
}

record FileEntry {
/**
* Unique identifier of the source file.
Expand All @@ -158,7 +170,7 @@ protocol Variants {
*
* {position}:{reference}:{alternate}(,{other_alternate})*:{allele_index}
*/
union { null, string } call;
union { null, OriginalCall } call;

/**
* Optional data that probably depend on the format of the file the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,16 +146,6 @@ public String getSampleDataKeysAsString() {
return impl.getSampleDataKeys() == null ? null : String.join(":", impl.getSampleDataKeys());
}

/**
 * Legacy alias that simply forwards to {@code getSampleDataKeys()}.
 *
 * @return whatever {@code getSampleDataKeys()} returns
 * @deprecated call {@code getSampleDataKeys()} directly
 */
@Deprecated
public List<String> getFormat() {
    final List<String> sampleDataKeys = getSampleDataKeys();
    return sampleDataKeys;
}

/**
 * Legacy alias that simply forwards to {@code setSampleDataKeys(List)}.
 *
 * @param value list handed unchanged to {@code setSampleDataKeys}
 * @return whatever {@code setSampleDataKeys(value)} returns
 * @deprecated call {@code setSampleDataKeys(List)} directly
 */
@Deprecated
public StudyEntry setFormat(List<String> value) {
    final StudyEntry result = setSampleDataKeys(value);
    return result;
}

/**
* Do not modify this list
* @return
Expand Down Expand Up @@ -266,53 +256,6 @@ public List<String> getSampleData(int samplePosition) {
}
}

/**
 * Collects the data list of every sample entry held by the underlying model.
 *
 * @return one data list per sample, in the order of {@code impl.getSamples()},
 *         or {@code null} when the model has no samples list
 */
@Deprecated
public List<List<String>> getSamplesData() {
    final List<SampleEntry> sampleEntries = impl.getSamples();
    if (sampleEntries == null) {
        return null;
    }
    final List<List<String>> dataPerSample = new ArrayList<>(sampleEntries.size());
    for (SampleEntry sampleEntry : sampleEntries) {
        dataPerSample.add(sampleEntry.getData());
    }
    return dataPerSample;
}

/**
 * Replaces the samples of the underlying model with one new {@code SampleEntry}
 * per given data list. Each entry is built with {@code null} for its first two
 * constructor arguments and the data list as the third.
 *
 * @param value data lists, one per sample; {@code null} clears the samples list
 */
@Deprecated
public void setSamplesData(List<List<String>> value) {
    if (value == null) {
        impl.setSamples(null);
        return;
    }
    final List<SampleEntry> sampleEntries = new ArrayList<>(value.size());
    for (List<String> sampleData : value) {
        sampleEntries.add(new SampleEntry(null, null, sampleData));
    }
    impl.setSamples(sampleEntries);
}

/**
 * Builds a read-only view of the sample data, keyed by sample name.
 * Every sample name present in {@code samplesPosition} is mapped to the result
 * of {@code getSampleDataAsMap(sampleName)}.
 *
 * @return unmodifiable map from sample name to that sample's key/value data
 */
@Deprecated
public Map<String, Map<String, String>> getSamplesDataAsMap() {
    requireSamplesPosition();

    final Map<String, Map<String, String>> dataBySample = new HashMap<>();
    // Only the sample names are needed; the stored positions are not used here.
    for (String sampleName : samplesPosition.keySet()) {
        dataBySample.put(sampleName, getSampleDataAsMap(sampleName));
    }

    return Collections.unmodifiableMap(dataBySample);
}

/**
 * Pairs the sample data keys with the data values of a single sample.
 *
 * @param sampleName name of the sample to look up in {@code samplesPosition}
 * @return unmodifiable map from data key to the sample's value for that key,
 *         or {@code null} when the sample name is not known
 */
@Deprecated
public Map<String, String> getSampleDataAsMap(String sampleName) {
    requireSamplesPosition();
    if (!samplesPosition.containsKey(sampleName)) {
        return null;
    }

    final Map<String, String> valuesByKey = new HashMap<>();
    final Iterator<String> keys = getSampleDataKeys().iterator();
    // Keys and values are matched by position; keys.next() throws if the sample
    // holds more values than there are keys.
    for (String value : getSampleData(sampleName)) {
        valuesByKey.put(keys.next(), value);
    }

    return Collections.unmodifiableMap(valuesByKey);
}

public StudyEntry addSampleData(String sampleName, Map<String, String> sampleData) {
if (getSampleDataKeys() == null) {
setSampleDataKeys(new ArrayList<>(sampleData.keySet()));
Expand Down Expand Up @@ -524,7 +467,7 @@ public String getFileId() {

public void setFileId(String fileId) {
if (impl.getFiles().isEmpty()) {
impl.getFiles().add(new FileEntry(fileId, "", new HashMap<>()));
impl.getFiles().add(new FileEntry(fileId, null, new HashMap<>()));
} else {
impl.getFiles().get(0).setFileId(fileId);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public class VariantBuilder {
private List<String> sampleDataKeys;
private List<SampleEntry> samples;
private Map<String, String> fileData;
private String call;
private OriginalCall call;

private String variantString;

Expand Down Expand Up @@ -414,7 +414,7 @@ public VariantBuilder addFileData(String key, String value) {
return this;
}

public VariantBuilder setCall(String call) {
public VariantBuilder setCall(OriginalCall call) {
checkFile("set call");
this.call = call;
return this;
Expand Down Expand Up @@ -612,9 +612,16 @@ public VariantProto.Variant buildProtoVariant(VariantProto.VariantOrBuilder reus
.setStudyId(studyId);

if (fileId != null) {
studyBuilder.addFiles(VariantProto.FileEntry.newBuilder()
VariantProto.FileEntry.Builder fileBuilder = VariantProto.FileEntry.newBuilder()
.setFileId(fileId)
.putAllData(fileData));
.putAllData(fileData);
if (call != null) {
fileBuilder.setCall(VariantProto.OriginalCall
.newBuilder()
.setVariantId(call.getVariantId())
.setAlleleIndex(call.getAlleleIndex()));
}
studyBuilder.addFiles(fileBuilder);
}

for (int i = 1; i < alternates.size(); i++) {
Expand Down Expand Up @@ -1153,7 +1160,7 @@ private void parseStructuralVariationFileData(String key, String value) {
if (alternates.size() > 1) {
throw new IllegalArgumentException("Found SVINSSEQ in a multi allelic variant!");
} else {
setCall(start + ":" + reference + ":" + alternates.get(0) + ":" + 0);
setCall(new OriginalCall(toString(), 0));
setAlternate(reference + value);
}
}
Expand Down Expand Up @@ -1344,7 +1351,7 @@ public String toString() {
+ start + "-"
+ end + ":"
+ reference + ":"
+ (alternates == null ? "null" : String.join(",", alternates));
+ (alternates == null ? "-" : String.join(",", alternates));
}

private static <T> void ifNotNull(T value, Consumer<T> setter) {
Expand Down
15 changes: 14 additions & 1 deletion biodata-models/src/main/proto/protobuf/opencb/variant.proto
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,22 @@ message VariantStats {
string mgfGenotype = 13;
}

/**
* Reference to the original variant call that a normalized variant was derived from.
*/
message OriginalCall {
/**
* Original variant ID before normalization, including all secondary alternates.
*/
string variantId = 1;

/**
* Index of the alternate allele, within the original multi-allelic variant call,
* that this variant was decomposed from.
*
* NOTE(review): assuming this file is proto3 (confirm), a plain int32 has no
* presence tracking, so an allele index of 0 cannot be told apart from "not set".
*/
int32 alleleIndex = 2;
}


/**
* Per-file information attached to a variant.
*/
message FileEntry {
  // Field number 2 used to carry `string call`. A string and an embedded message
  // share the same wire type, so reusing the number would make readers try to
  // parse previously written call strings as an OriginalCall message.
  // The number is retired and the message-typed field gets a fresh one.
  reserved 2;

  // Identifier of the source file.
  string fileId = 1;

  // Original call this variant was derived from during normalization.
  // Left unset when there is no original call to record.
  OriginalCall call = 4;

  // Free-form key/value data for this file.
  map<string, string> data = 3;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ public void buildSVInsertion() {
assertEquals(length, v.getLength().intValue());
assertEquals(length, v.getLengthAlternate().intValue());
assertEquals(1, v.getLengthReference().intValue());
assertEquals("1000:A:<INS>:0", v.getStudies().get(0).getFiles().get(0).getCall());
assertEquals(new OriginalCall("1:1000:A:<INS>", 0), v.getStudies().get(0).getFiles().get(0).getCall());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
}
// Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order!
for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) {
String call = start + ":" + reference + ":" + alternate + ":" + keyFields.getNumAllele();
OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele());
Variant normalizedVariant = newVariant(variant, keyFields, sv);
if (keyFields.getPhaseSet() != null) {
StudyEntry studyEntry = new StudyEntry();
Expand Down Expand Up @@ -361,15 +361,18 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
&& keyFieldsList.get(0).getReference().equals(reference)
&& keyFieldsList.get(0).getAlternate().equals(alternate);

String callPrefix;
String originalCall;
if (entry.getFiles() != null
&& !entry.getFiles().isEmpty()
&& StringUtils.isNotEmpty(entry.getFiles().get(0).getCall())) {
String call = entry.getFiles().get(0).getCall();
// Remove allele index
callPrefix = call.substring(0, call.lastIndexOf(':') + 1);
&& entry.getFiles().get(0).getCall() != null
&& StringUtils.isNotEmpty(entry.getFiles().get(0).getCall().getVariantId())) {
originalCall = entry.getFiles().get(0).getCall().getVariantId();
} else {
callPrefix = start + ":" + reference + ":" + String.join(",", originalAlternates) + ":";
StringBuilder sb = new StringBuilder(variant.toString());
for (int i = 1; i < originalAlternates.size(); i++) {
sb.append(",").append(originalAlternates.get(i));
}
originalCall = sb.toString();
}

// Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order!
Expand All @@ -378,13 +381,12 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
if (keyFields.alternate.equals(VariantBuilder.REF_ONLY_ALT)) {
continue;
}
String call = callPrefix + keyFields.getNumAllele();

final Variant normalizedVariant;
final StudyEntry normalizedEntry;
final List<SampleEntry> samples;
if (reuse && keyFieldsList.size() == 1) { //Only reuse for non multiallelic variants
//Reuse variant. Set new fields.
//Reuse variant. Set new fields.
normalizedVariant = variant;
variant.setStart(keyFields.getStart());
variant.setEnd(keyFields.getEnd());
Expand All @@ -397,7 +399,12 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
// variant.setSv(sv);
// }
normalizedEntry = entry;
entry.getFiles().forEach(fileEntry -> fileEntry.setCall(sameVariant ? null : call));
if (!sameVariant) {
OriginalCall call = new OriginalCall(originalCall.toString(), keyFields.numAllele);
entry.getFiles().forEach(fileEntry -> {
fileEntry.setCall(call);
});
}
samples = entry.getSamples();
} else {
normalizedVariant = newVariant(variant, keyFields, sv);
Expand All @@ -408,9 +415,15 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
normalizedEntry.setSampleDataKeys(entry.getSampleDataKeys());

List<FileEntry> files = new ArrayList<>(entry.getFiles().size());
OriginalCall call;
if (sameVariant) {
call = null;
} else {
call = new OriginalCall(originalCall, keyFields.numAllele);
}
for (FileEntry file : entry.getFiles()) {
HashMap<String, String> fileData = new HashMap<>(file.getData());
files.add(new FileEntry(file.getFileId(), sameVariant ? null : call, fileData));
files.add(new FileEntry(file.getFileId(), call, fileData));
}
normalizedEntry.setFiles(files);
normalizedVariant.addStudyEntry(normalizedEntry);
Expand Down Expand Up @@ -461,6 +474,7 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
// for all mnv-phased variants
if (normalizedEntry.getFiles().size() == 0) {
// Use mnv string as file Id so that it can be later identified.
OriginalCall call = new OriginalCall(originalCall, keyFields.numAllele);
normalizedEntry.setFiles(Collections.singletonList(new FileEntry(keyFields.getPhaseSet(), call, null)));
}
}
Expand All @@ -470,7 +484,7 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
normalizedVariants.add(normalizedVariant);

} catch (Exception e) {
logger.warn("Error parsing variant " + call + ", numAllele " + keyFields.getNumAllele(), e);
logger.warn("Error parsing variant " + originalCall + ", numAllele " + keyFields.getNumAllele(), e);
throw e;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.protobuf.VariantProto;
import org.opencb.biodata.tools.Converter;
import org.opencb.commons.datastore.core.ObjectMap;
Expand Down Expand Up @@ -202,10 +203,10 @@ protected Set<Integer> getNoCallAlleleIdx(List<String> alleleList) {
protected static Map<Integer, Character> buildReferenceAllelesMap(Iterator<String> callsIterator) {
Map<Integer, Character> referenceAlleles = new HashMap<>();
callsIterator.forEachRemaining(call -> {
String[] split = splitCall(call);
if (split != null) {
String originalReference = VariantContextConverter.getOriginalReference(split);
Integer originalPosition = VariantContextConverter.getOriginalPosition(split);
if (call != null) {
Variant originalVariant = new Variant(call.split(",")[0]);
String originalReference = originalVariant.getReference();
Integer originalPosition = originalVariant.getStart();
for (int i = 0; i < originalReference.length(); i++) {
referenceAlleles.put(originalPosition + i, originalReference.charAt(i));
}
Expand Down Expand Up @@ -375,74 +376,6 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en
return variantContextBuilder.make();
}

/**
 * Splits a call string into four parts using its first, second and last ':'.
 * Everything between the second and the last ':' is kept as one piece, because
 * symbolic or breakend alleles may contain ':' themselves.
 *
 * @param call call string; may be null or empty
 * @return array of four substrings, or {@code null} when {@code call} is null or empty
 */
protected static String[] splitCall(String call) {
    if (!StringUtils.isNotEmpty(call)) {
        return null;
    }

    final int firstColon = call.indexOf(':');
    final int secondColon = call.indexOf(':', firstColon + 1);
    // Last separator, not the third: the third part may contain ':' itself.
    final int lastColon = call.lastIndexOf(':');

    final String first = call.substring(0, firstColon);
    final String second = call.substring(firstColon + 1, secondColon);
    final String third = call.substring(secondColon + 1, lastColon);
    final String fourth = call.substring(lastColon + 1);
    return new String[]{first, second, third, fourth};
}

/**
 * Lists the reference allele followed by the alternate allele(s) of a split call.
 * Expects {@code ori} to hold the four parts of "POS:REF:ALT_0(,ALT_N)*:ALT_IDX".
 * ALT_N is the n-th allele if this is the n-th variant resulting from a
 * multiallelic vcf row.
 *
 * @param ori split call; may be null
 * @return reference followed by the alternates, or {@code null} when {@code ori}
 *         is null or does not have exactly four elements
 */
protected static List<String> getOriginalAlleles(String[] ori) {
    if (ori == null || ori.length != 4) {
        return null;
    }

    final String reference = ori[1];
    final String[] alternates = ori[2].split(",");
    if (alternates.length == 1) {
        // Single alternate: return the unsplit field as it was given.
        return Arrays.asList(reference, ori[2]);
    }

    final ArrayList<String> alleles = new ArrayList<>(alternates.length + 1);
    alleles.add(reference);
    for (String alternate : alternates) {
        alleles.add(alternate);
    }
    return alleles;
}

/**
 * Returns the second element of a four-part split call.
 *
 * @param ori split call; may be null
 * @return {@code ori[1]}, or {@code null} when {@code ori} is null or does not
 *         have exactly four elements
 */
protected static String getOriginalReference(String[] ori) {
    return (ori != null && ori.length == 4) ? ori[1] : null;
}

/**
 * Returns the fourth element of a four-part split call.
 *
 * @param ori split call; may be null
 * @return {@code ori[3]}, or {@code null} when {@code ori} is null or does not
 *         have exactly four elements
 */
protected static String getOriginalAlleleIndex(String[] ori) {
    if (ori == null || ori.length != 4) {
        return null;
    }
    return ori[3];
}

/**
 * Parses the first element of a four-part split call as an integer.
 * Expects {@code ori} to hold the four parts of "POS:REF:ALT_0(,ALT_N)*:ALT_IDX".
 *
 * @param ori split call; may be null
 * @return the parsed position, or {@code null} when {@code ori} is null or does
 *         not have exactly four elements
 * @throws NumberFormatException if the first element is not a valid integer
 */
protected static Integer getOriginalPosition(String[] ori) {
    final boolean wellFormed = ori != null && ori.length == 4;
    return wellFormed ? Integer.valueOf(ori[0]) : null;
}

protected abstract Object getStudy(T variant);

protected abstract Iterator<String> getStudiesId(T variant);
Expand Down
Loading

0 comments on commit 6875e4a

Please sign in to comment.