Skip to content

Commit

Permalink
Handling of UMI sequencing error
Browse files Browse the repository at this point in the history
  • Loading branch information
Gardeux Vincent Roland Julien committed Apr 6, 2018
1 parent 878c83a commit e2f0813
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 42 deletions.
Binary file added releases/BRBseqTools.1.2.jar
Binary file not shown.
2 changes: 1 addition & 1 deletion src/AnnotateBAMManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ public static void annotate() throws Exception
}
if(Parameters.UMILength != -1)
{
r_bam.samRecord.setAttribute("BX", r_fq.UMI);
r_bam.samRecord.setAttribute("BX", r_fq.UMI); // BX or Maybe should be RX since UMI is not corrected
r_bam.samRecord.setAttribute("QX", r_fq.qualityUMI);
}
samWriter.addAlignment(r_bam.samRecord);
Expand Down
4 changes: 2 additions & 2 deletions src/BRBseqTools.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* @see vincent.gardeux@epfl.ch
*
*/
public class BRBseqTools
public class BRBseqTools
{
public static void main(String[] args) throws Exception
{
Expand Down Expand Up @@ -61,7 +61,7 @@ public static void main(String[] args) throws Exception
break;
case "Demultiplex":
System.out.println("BRBSeqTools 1.1 [Demultiplex]\n");
Parameters.loadDemultiplex(argsParsed);
Parameters.loadDemultiplex(argsParsed);
Parameters.barcodes = Utils.readConfig();
Parameters.BC1 = new ArrayList<String>();
Parameters.barcodeIndex = new HashMap<String, Integer>();
Expand Down
37 changes: 27 additions & 10 deletions src/DGEMatrixManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,42 +43,45 @@ public static void readR2BAM() throws Exception
SAMRecord samRecord = it.next();
Parameters.nbReads++;

String toAdd = samRecord.getReadName();

if(samRecord.getSupplementaryAlignmentFlag())
{
Parameters.notUnique++;
lines.add(samRecord.getReadName() + "\t__alignment_not_unique");
toAdd += "\t__alignment_not_unique\t";
}
else if(samRecord.getReadUnmappedFlag())
{
Parameters.unmapped++;
lines.add(samRecord.getReadName() + "\t__not_aligned");
toAdd += "\t__not_aligned\t";
}
else if(samRecord.getMappingQuality() < 10)
{
Parameters.toolowAqual++;
lines.add(samRecord.getReadName() + "\t__too_low_aQual");
toAdd += "\t__too_low_aQual\t";
}
else
{
HashSet<String> overlappingGenes = Utils.getOverlappingGenes(samRecord.getReferenceName(), samRecord.getAlignmentStart(), samRecord.getAlignmentEnd(), samRecord.getCigar(), samRecord.getReadNegativeStrandFlag());
if(overlappingGenes.size() == 0)
{
Parameters.noFeature++;
lines.add(samRecord.getReadName() + "\t__no_feature");
toAdd += "\t__no_feature\t";
}
else if(overlappingGenes.size() == 1)
{
Parameters.mapped++;
String gene = overlappingGenes.iterator().next();
uniqueGenes.add(gene);
lines.add(samRecord.getReadName() + "\t" + gene);
toAdd += "\t" + gene+ "\t";
}
else
{
Parameters.ambiguous++;
lines.add(samRecord.getReadName() + "\t__ambiguous");
toAdd += "\t__ambiguous\t";
}
}
}
lines.add(toAdd + samRecord.getAlignmentStart() + "\t" + samRecord.getAlignmentEnd());

if(Parameters.nbReads %Parameters.chunkSize == 0)
{
Expand Down Expand Up @@ -118,23 +121,33 @@ public static void createOutputDGE() throws Exception
UMI[][] umis = new UMI[Parameters.barcodeIndex.size()][Parameters.geneIndex.size()]; // Sample / gene
for(int i = 0; i < umis.length; i++) for(int j = 0; j < umis[i].length; j++) umis[i][j] = new UMI();

int nbReadsWritten = 0;
int lastWritten = nbReadsWritten; // For not writing twice the same line [Mehhhh]

Read r_fq = fm_fastq.getFirstRead();
while(r_fq != null)
{
Read r_bam = fm_bam.getReadB(r_fq.name);
if(r_bam != null)
{
nbReadsWritten++;
int x = Parameters.barcodeIndex.get(r_fq.barcode);
int y = Parameters.geneIndex.get(r_bam.gene);
counts[x][y]++;
umis[x][y].addUMI(r_fq.UMI); //TODO take into account UMI mismatches
umis[x][y].addUMI(r_fq.UMI + ":" + r_bam.start + ":" + r_bam.end);
}
r_fq = fm_fastq.getFirstRead();
if(nbReadsWritten %Parameters.chunkSize == 0 && lastWritten != nbReadsWritten)
{
lastWritten = nbReadsWritten;
System.out.println(nbReadsWritten + " reads were demultiplexed [" + Utils.toReadableTime(System.currentTimeMillis() - start) + "]");
}
}

// Clean tmp files
TmpFileManager.delete("fastq", Parameters.nbTmpFastq);
TmpFileManager.delete("bam", Parameters.nbTmpBAM);
System.out.println(nbReadsWritten + " reads were demultiplexed [" + Utils.toReadableTime(System.currentTimeMillis() - start) + "]\n");

// Create the read count and transcript count matrices
BufferedWriter bw_reads = new BufferedWriter(new FileWriter(Parameters.outputFolder + "output.dge.reads.txt"));
Expand Down Expand Up @@ -163,7 +176,9 @@ public static void createOutputDGE() throws Exception
bw_umis_detailed.write("\n");
}

System.out.println("\nStarting writing output file (" + ((Parameters.hammingDistanceUMI == 0)?"without sequencing error correction":"with sequencing error correction: hamming distance <= " + Parameters.hammingDistanceUMI) + ")...");
String[] sortedKeys = Utils.sortKeys(Parameters.geneIndex);
int nbUMIs = 0;
for(String gene:sortedKeys)
{
String mappedGene = Parameters.mappingGeneIdGeneName.get(gene);
Expand All @@ -174,17 +189,19 @@ public static void createOutputDGE() throws Exception
for(String barcode:Parameters.barcodeIndex.keySet())
{
String mappedBarcode = Parameters.mappingBarcodeName.get(barcode);
nbUMIs += umis[Parameters.barcodeIndex.get(barcode)][Parameters.geneIndex.get(gene)].getCorrectedSize();
if(!mappedGene.equals("") && mappedBarcode != null)
{
bw_reads.write("\t" + counts[Parameters.barcodeIndex.get(barcode)][Parameters.geneIndex.get(gene)]);
if(Parameters.UMILength != -1) bw_umis.write("\t" + umis[Parameters.barcodeIndex.get(barcode)][Parameters.geneIndex.get(gene)].umis.size());
if(Parameters.UMILength != -1) bw_umis.write("\t" + umis[Parameters.barcodeIndex.get(barcode)][Parameters.geneIndex.get(gene)].getCorrectedSize());
}
bw_reads_detailed.write("\t" + counts[Parameters.barcodeIndex.get(barcode)][Parameters.geneIndex.get(gene)]);
if(Parameters.UMILength != -1) bw_umis_detailed.write("\t" + umis[Parameters.barcodeIndex.get(barcode)][Parameters.geneIndex.get(gene)].umis.size());
if(Parameters.UMILength != -1) bw_umis_detailed.write("\t" + umis[Parameters.barcodeIndex.get(barcode)][Parameters.geneIndex.get(gene)].getCorrectedSize());
}
if(!mappedGene.equals("")) { bw_reads.write("\n"); if(Parameters.UMILength != -1) bw_umis.write("\n"); }
bw_reads_detailed.write("\n"); if(Parameters.UMILength != -1) bw_umis_detailed.write("\n");
}
System.out.println(nbUMIs + " UMI counts were written (" + (nbReadsWritten - nbUMIs) + " duplicates = "+ Parameters.pcFormatter.format(((nbReadsWritten - nbUMIs) / (float)nbReadsWritten)*100) + "%)");
bw_reads.close(); bw_reads_detailed.close();
if(Parameters.UMILength != -1) { bw_umis.close(); bw_umis_detailed.close();}
System.out.println("DGE count & UMI matrices were created [" + Utils.toReadableTime(System.currentTimeMillis() - start) + "]");
Expand Down
30 changes: 24 additions & 6 deletions src/model/GTF.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package model;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
Expand All @@ -24,6 +25,7 @@ public static void readGTF() throws Exception
int nbExons = 0;
int nbGenes = 0;
HashSet<String> uniqueGeneId = new HashSet<String>();
ArrayList<String> uniqueGeneName = new ArrayList<String>(); // It's actually not unique and should not be since two geneID can have the same gene Name => ArrayList
while(line != null)
{
if(!line.startsWith("#"))
Expand All @@ -40,9 +42,10 @@ public static void readGTF() throws Exception
boolean strand = tokens[6].equals("+");
for(String param:params)
{
String value = param.substring(param.indexOf("\"")+1, param.lastIndexOf("\""));
if(param.contains("gene_name")) gene_name = value;
if(param.contains("gene_id")) gene_id = value;
String[] values = param.trim().split("\\s+");
values[1] = values[1].replaceAll("\"", "");
if(values[0].equals("gene_name")) gene_name = values[1];
else if(values[0].equals("gene_id")) gene_id = values[1];
}
if(gene_name == null) gene_name = gene_id;
// Which type is it?
Expand All @@ -51,7 +54,7 @@ public static void readGTF() throws Exception
nbExons++;
IntervalTree tree = forest.get(chr);
if(tree == null) tree = new IntervalTree();
uniqueGeneId.add(gene_id);
if(uniqueGeneId.add(gene_id)) uniqueGeneName.add(gene_name);
tree.insert(new IntervalLabelled((int)start, (int)end, gene_id, strand));
forest.put(chr, tree);
}
Expand All @@ -66,13 +69,28 @@ else if(type.equals("gene"))
}
br.close();

if(nbGenes == 0) {
System.out.println("No Genes were detected in the GTF file. Probably the \"gene\" annotation is missing from the GTF file 3rd column?");
System.out.println("Trying to \"save the day\" by collapsing exons to their annotated gene_id");
for(String gene_id:uniqueGeneId) {
Parameters.geneIndex.put(gene_id, nbGenes);
Parameters.mappingGeneIdGeneName.put(gene_id, uniqueGeneName.get(nbGenes));
nbGenes++;
}
}

System.out.println(nbExons + " 'exons' are annotating " + uniqueGeneId.size() + " unique genes in the provided GTF file. In total " + nbGenes + " 'gene' annotations are found in the GTF file.");

if(nbGenes == 0) {
System.err.println("We couldn't parse the GTF file. Please report this problem if the GTF is in standard format. Or use another GTF from another source.");
System.exit(-1);
}

Parameters.geneIndex.put("__alignment_not_unique", Parameters.geneIndex.size());
Parameters.geneIndex.put("__no_feature", Parameters.geneIndex.size());
Parameters.geneIndex.put("__ambiguous", Parameters.geneIndex.size());
Parameters.geneIndex.put("__too_low_aQual", Parameters.geneIndex.size());
Parameters.geneIndex.put("__not_aligned", Parameters.geneIndex.size());

System.out.println(nbExons + " 'exons' are annotating " + uniqueGeneId.size() + " unique genes in the provided GTF file. In total " + nbGenes + " 'gene' annotations are found in the GTF file.");
}

public static HashSet<String> findOverlappingGenes(String chr, int start, int end, boolean readNegativeStrandFlag)
Expand Down
53 changes: 34 additions & 19 deletions src/model/Parameters.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ public class Parameters
public static ArrayList<File> fastqToTrim = null;
public static String barcodePattern = "BU";
public static int nbAllowedDiff = 1;
public static int hammingDistanceUMI = 0;
public static int UMILength = -1;
public static int polyALength = 6;
public static int minReadLength = 10;
Expand Down Expand Up @@ -269,15 +270,27 @@ public static void loadDGE(String[] args) throws Exception
System.exit(-1);
}
break;
case "-n":
case "-nb":
i++;
try
{
nbAllowedDiff = Integer.parseInt(args[i]);
}
catch(NumberFormatException nfe)
{
System.err.println("The '-n' option should be followed by an Integer. You entered " + args[i]);
System.err.println("The '-nb' option should be followed by an Integer. You entered " + args[i]);
System.exit(-1);
}
break;
case "-nu":
i++;
try
{
hammingDistanceUMI = Integer.parseInt(args[i]);
}
catch(NumberFormatException nfe)
{
System.err.println("The '-nu' option should be followed by an Integer. You entered " + args[i]);
System.exit(-1);
}
break;
Expand Down Expand Up @@ -416,7 +429,8 @@ public static void loadDGE(String[] args) throws Exception
System.exit(-1);
}
System.out.println("Barcode Pattern = " + barcodePattern);
System.out.println("Nb Allowed Diff = " + nbAllowedDiff);
System.out.println("Maximum allowed Hamming distance for sequencing error correction [Barcode] = " + nbAllowedDiff);
if(UMILength != -1) System.out.println("Maximum allowed Hamming distance for sequencing error correction [UMI] = " + hammingDistanceUMI);
System.out.println("ChunkSize = " + chunkSize + " i.e. no more than " + chunkSize + " reads will be stored in RAM.");
System.out.println("Stranded = " + Parameters.stranded);
if(outputFolder == null)
Expand Down Expand Up @@ -819,24 +833,25 @@ public static void loadTrim(String[] args) throws Exception
private static void printHelpDGE()
{
System.out.println("BRBseq-tools 1.1 [CreateDGEMatrix]\n\nOptions:");
System.out.println("-f %s \t\tPath of R1 FastQ file.");
System.out.println("-b %s \t\tPath of R2 aligned BAM file [do not need to be sorted or indexed].");
System.out.println("-c %s \t\tPath of Barcode/Samplename mapping file.");
System.out.println("-gtf %s \tPath of GTF file.");
System.out.println("-s %s \t\tDo you want to count only reads falling on same strand than gene? [no, yes, reverse] (default = yes since BRB-seq is stranded protocol).");
System.out.println("-n %i \t\tNumber of allowed difference with the barcode [ambiguous reads will be automatically discarded].");
System.out.println("-o %s \t\tOutput folder");
System.out.println("-t %s \t\tPath of existing folder for storing temporary files");
System.out.println("-chunkSize %i\tMaximum number of reads to be stored in RAM (default = 1000000)");
System.out.println("-p %s \t\tBarcode pattern/order found in the reads of the R1 FastQ file. Barcode names should match the barcode file (default = 'BU' i.e. barcode followed by the UMI).\n\t\t\t'B' [required] is used for specifying the barcode position.\n\t\t\t'U' [optional] can be used for specifying a UMI value position.\n\t\t\t'?' [optional] can be used to ignore specific nucleotides.");
System.out.println("-f %s \t\t[Required] Path of R1 FastQ file.");
System.out.println("-b %s \t\t[Required] Path of R2 aligned BAM file [do not need to be sorted or indexed].");
System.out.println("-c %s \t\t[Required] Path of Barcode/Samplename mapping file.");
System.out.println("-gtf %s \t[Required] Path of GTF file.");
System.out.println("-s %s \t\tDo you want to count only reads falling on same strand than gene? [no, yes, reverse] [default = yes, since BRB-seq is stranded protocol].");
System.out.println("-nb %i \t\tNumber of allowed difference (hamming distance) with the given barcodes in config file [default = 1].");
System.out.println("-nu %i \t\tNumber of allowed difference (hamming distance) for two UMIs to be counted only once [default = 1].");
System.out.println("-o %s \t\tOutput folder [default = folder of BAM file]");
System.out.println("-t %s \t\tPath of existing folder for storing temporary files [default = folder of BAM file]");
System.out.println("-chunkSize %i\tMaximum number of reads to be stored in RAM [default = 1000000]");
System.out.println("-p %s \t\tBarcode pattern/order found in the reads of the R1 FastQ file. Barcode names should match the barcode file [default = 'BU', i.e. barcode followed by the UMI].\n\t\t\t'B' [Required] is used for specifying the barcode position.\n\t\t\t'U' can be used for specifying a UMI value position.\n\t\t\t'?' can be used to ignore specific nucleotides.");
System.out.println("-UMI %i \tIf your barcode pattern contains UMI ('U'), you should specify this parameter as the length of the UMI.");
}

private static void printHelpAnnoBAM()
{
System.out.println("BRBseq-tools 1.1 [AnnotateBAM]\n\nOptions:");
System.out.println("-f %s \t\tPath of R1 FastQ file.");
System.out.println("-b %s \t\tPath of R2 aligned BAM file [do not need to be sorted or indexed].");
System.out.println("-f %s \t\t[Required] Path of R1 FastQ file.");
System.out.println("-b %s \t\t[Required] Path of R2 aligned BAM file [do not need to be sorted or indexed].");
System.out.println("-c %s \t\tPath of Barcode/Samplename mapping file.");
System.out.println("-gtf %s \tPath of GTF file (for UMI counting).");
System.out.println("-s %s \t\tDo you want to annotate genes only in case where reads are falling on same strand? [no, yes, reverse] (default = yes since BRB-seq is stranded protocol).");
Expand All @@ -851,9 +866,9 @@ private static void printHelpAnnoBAM()
private static void printHelpDemultiplex()
{
System.out.println("BRBseq-tools 1.1 [Demultiplex]\n\nOptions:");
System.out.println("-r1 %s \t\tPath of R1 FastQ files (containing barcode and optionally UMIs).");
System.out.println("-r2 %s \t\tPath of R2 FastQ files (containing read sequence).");
System.out.println("-c %s \t\tPath of Barcode/Samplename mapping file.");
System.out.println("-r1 %s \t\t[Required] Path of R1 FastQ files (containing barcode and optionally UMIs).");
System.out.println("-r2 %s \t\t[Required] Path of R2 FastQ files (containing read sequence).");
System.out.println("-c %s \t\t[Required] Path of Barcode/Samplename mapping file.");
System.out.println("-n %i \t\tNumber of allowed difference with the barcode [ambiguous reads will be automatically discarded].");
System.out.println("-o %s \t\tOutput folder");
System.out.println("-p %s \t\tBarcode pattern/order found in the reads of the R1 FastQ file. Barcode names should match the barcode file (default = 'BU' i.e. barcode followed by the UMI).\n\t\t\t'B' [required] is used for specifying the barcode position.\n\t\t\t'U' [optional] can be used for specifying a UMI value position.\n\t\t\t'?' [optional] can be used to ignore specific nucleotides.");
Expand All @@ -863,7 +878,7 @@ private static void printHelpDemultiplex()
private static void printHelpTrim()
{
System.out.println("BRBseq-tools 1.1 [Trim]\n\nOptions:");
System.out.println("-f %s \t\tPath of FastQ file to trim (or containing folder for processing all fastq files recursively).");
System.out.println("-f %s \t\t[Required] Path of FastQ file to trim (or containing folder for processing all fastq files recursively).");
System.out.println("-o %s \t\tOutput folder");
System.out.println("-uniqueBarcode\tIf the fastq file(s) contain(s) only one barcode (for e.g. after demultiplexing), this option can be used for searching the specific barcode (most occuring) in the construct and trimming it when present.");
System.out.println("-polyA %i \tTrim polyA strings that have more than this length (without mismatch), and all 3' string that follows [default=6]");
Expand Down
2 changes: 2 additions & 0 deletions src/model/Read.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ public class Read implements Comparable<Read>
public String qualityUMI;
public String gene;
public SAMRecord samRecord;
public String start;
public String end;
public String[] rawData; // 4 lines
public boolean barcodeMatch = false;

Expand Down
2 changes: 2 additions & 0 deletions src/model/TmpFileManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ private Read parse(String type, BufferedReader br) throws IOException
{
case "bam":
r.gene = tokens[1];
r.start = tokens[2];
r.end = tokens[3];
break;
case "fastq":
r.barcode = tokens[1];
Expand Down
Loading

0 comments on commit e2f0813

Please sign in to comment.