Skip to content

Commit

Permalink
v0.1.7: support donor VCF with different formats for variants
Browse files Browse the repository at this point in the history
  • Loading branch information
huangyh09 committed Oct 5, 2019
1 parent e72fe53 commit f1474e4
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 15 deletions.
9 changes: 7 additions & 2 deletions doc/manual.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ to demultiplex scRNA-seq data.

vireo -c $CELL_DATA -N $n_donor -o $OUT_DIR

2) with genotype for all samples (GT, GP, or PL)
2) with genotype for all samples (genoTag: GT, GP, or PL; default is PL; please
choose a tag that actually exists in the donor VCF)

::

Expand All @@ -41,6 +42,10 @@ to demultiplex scRNA-seq data.
Optionally, `-N` can be provided if it is smaller than that in DONOR_GT_FILE
for finding the relevant subset of donors.

**Note** For efficient loading of the donor VCF file, we recommend subsetting it first:
``bcftools view donor.vcf.gz -R cellSNP.cells.vcf.gz -Oz -o sub.vcf.gz``
Also, add ``-s`` or ``-S`` to subset samples.

3) with genotype for part of the samples (n_donor is larger than that in
DONOR_GT_FILE)

Expand All @@ -63,7 +68,7 @@ Vireo supports the cell data in three formats:


Vireo full arguments
====================
--------------------

Type ``vireo -h`` for details of all arguments:

Expand Down
4 changes: 4 additions & 0 deletions doc/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
History
=======

Release v0.1.7 (05/10/2019)
===========================
* Support donor genotype vcf file with different FORMAT for different variants

Release v0.1.6 (05/10/2019)
===========================
* Fix a bug when variants in donor genotype are not in cell vcf file
Expand Down
22 changes: 12 additions & 10 deletions vireoSNP/utils/vcf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import subprocess
import numpy as np

def parse_sample_info(sample_dat, sparse=True):
def parse_sample_info(sample_dat, sparse=True, format_list=None):
"""
Parse genotype information for each sample
Note, it requires the format for each variants to
Expand All @@ -19,18 +19,19 @@ def parse_sample_info(sample_dat, sparse=True):

# require the same format for all variants
format_all = [x[0].split(":") for x in sample_dat]
format_list = format_all[0]
if format_list is None:
format_list = format_all[0]

## sparse matrix requires all keys
format_set_all = [set(x) for x in format_all]
if format_set_all.count(set(format_all[0])) != len(format_all):
print("Error: require the same format for all variants.")
exit()

RV = {}
for _key in format_list:
RV[_key] = []
if sparse:
## sparse matrix requires all keys
format_set_all = [set(x) for x in format_all]
if format_set_all.count(set(format_list)) != len(format_all):
print("Error: require the same format for all variants.")
exit()

RV['indices'] = []
RV['indptr'] = [0]
RV['shape'] = (len(sample_dat[0][1:]), len(sample_dat))
Expand Down Expand Up @@ -64,7 +65,8 @@ def parse_sample_info(sample_dat, sparse=True):
return RV


def load_VCF(vcf_file, biallelic_only=False, load_sample=True, sparse=True):
def load_VCF(vcf_file, biallelic_only=False, load_sample=True, sparse=True,
format_list=None):
"""
Load whole VCF file
-------------------
Expand Down Expand Up @@ -121,7 +123,7 @@ def load_VCF(vcf_file, biallelic_only=False, load_sample=True, sparse=True):
RV["comments"] = comment_lines
if load_sample:
RV["samples"] = obs_ids
RV["GenoINFO"] = parse_sample_info(obs_dat, sparse=sparse)
RV["GenoINFO"] = parse_sample_info(obs_dat, sparse, format_list)
return RV


Expand Down
2 changes: 1 addition & 1 deletion vireoSNP/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.6"
__version__ = "0.1.7"
4 changes: 2 additions & 2 deletions vireoSNP/vireo.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,8 @@ def main():
n_donor = options.n_donor
if options.donor_file is not None:
print("[vireo] Loading donor VCF file ...")
donor_vcf = load_VCF(options.donor_file, sparse=False,
biallelic_only=True)
donor_vcf = load_VCF(options.donor_file, biallelic_only=True,
sparse=False, format_list=[options.geno_tag])
if (options.geno_tag not in donor_vcf['GenoINFO']):
print("[vireo] No " + options.geno_tag + " tag in donor genotype; "
"please try another tag for genotype, e.g., GT")
Expand Down

0 comments on commit f1474e4

Please sign in to comment.