splitbed.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Splits a .bed file into a single file for each chromosome
"""
import os

# ~~~~~ FUNCTIONS ~~~~~ #
def get_bed_chroms(bed_file):
    """
    Returns a list of all chromosomes in a .bed file

    Parameters
    ----------
    bed_file: str
        the path to a .bed formatted file

    Returns
    -------
    list
        a list of the unique chromosomes in the file (i.e. the unique entries in the first whitespace-delimited column), in order of first appearance

    Notes
    -----
    Blank and whitespace-only lines are skipped. Every other line contributes its first field, so any non-data header lines present in the file are not filtered out.
    """
    chroms = []
    # a companion set gives O(1) duplicate checks while the list keeps first-seen order
    seen = set()
    with open(bed_file) as f:
        for line in f:
            parts = line.split()
            # blank / whitespace-only lines split to an empty list; nothing to record
            if not parts:
                continue
            chrom = parts[0]
            if chrom not in seen:
                seen.add(chrom)
                chroms.append(chrom)
    return chroms


def make_bed_splitchrom_filenames(bed_file, chroms = None, output_dir = None):
    """
    Builds the per-chromosome output file paths for splitting a .bed file

    Each output file is named ``<basename>_<chrom><ext>``, where ``<basename>`` and ``<ext>`` come from ``bed_file``.

    Parameters
    ----------
    bed_file: str
        the path to a .bed formatted file
    chroms: list
        the unique chromosomes to build filenames for; if ``None`` (or empty), they are read from ``bed_file`` with ``get_bed_chroms``
    output_dir: str
        directory in which the output files should be placed; if ``None`` (or empty), the directory containing ``bed_file`` is used

    Returns
    -------
    dict
        mapping of each chromosome to the absolute path of its output file

    Examples
    --------
    Example usage::

        >>> make_bed_splitchrom_filenames("targets.bed", chroms = ["chr1", "chrX"], output_dir = "output")
        {'chr1': 'output/targets_chr1.bed', 'chrX': 'output/targets_chrX.bed'}

    Notes
    -----
    The returned paths are resolved with ``os.path.realpath``, so they are full absolute paths; the example above is abbreviated for brevity. No files or directories are created by this function.

    """
    # only scan the file when the caller did not supply the chromosomes
    chroms = chroms or get_bed_chroms(bed_file)

    # resolve the destination directory to an absolute, symlink-free path
    target_dir = os.path.realpath(output_dir if output_dir else os.path.dirname(bed_file))

    # 'targets.bed' -> ('targets', '.bed')
    stem, suffix = os.path.splitext(os.path.basename(bed_file))

    return {
        chrom: os.path.join(target_dir, '{0}_{1}{2}'.format(stem, chrom, suffix))
        for chrom in chroms
    }


def append_line(output_file, line):
    """
    Writes a string to the end of a file

    The file is opened in append mode, so it is created if it does not exist and any existing contents are kept. The text is written exactly as given; no newline is added.

    Parameters
    ----------
    output_file: str
        path to the file that should receive the text
    line: str
        the text to write at the end of the file
    """
    handle = open(output_file, 'a')
    try:
        handle.write(line)
    finally:
        # always release the handle, even if the write fails
        handle.close()

def split_bed_by_chrom(bed_file, chroms = None, chrom_filenames = None, **kwargs):
    """
    Splits a .bed file into sub-files for each chromosome present

    Parameters
    ----------
    bed_file: str
        the path to a .bed formatted file
    chroms: list
        a list of unique chromosomes in the .bed file; if ``None``, the list will be retrieved from the ``bed_file``
    chrom_filenames: dict
        a dictionary of the filename for each chrom in the .bed file, generated by ``make_bed_splitchrom_filenames``; if ``None``, the filenames will be automatically generated from the ``bed_file``
    kwargs: dict
        dictionary of keyword arguments to pass on to ``make_bed_splitchrom_filenames``

    Examples
    --------
    Example usage::

        >>> bed_file = 'targets.bed'
        >>> output_dir = 'test'
        >>> split_bed_by_chrom(bed_file = bed_file, output_dir = output_dir)

    Notes
    -----
    Lines are copied to the output files unmodified. The chromosome of a line is its first whitespace-delimited field (tabs or spaces), which matches how ``get_bed_chroms`` reads the file. Blank lines, and lines whose chromosome has no entry in ``chrom_filenames``, are skipped.

    Output files are opened in append mode: a file that already exists is extended, not overwritten. The output directory is not created here.
    """
    # get the full path to the file
    bed_file = os.path.realpath(bed_file)
    # get the unique chroms in the file, if none were passed
    if not chroms:
        chroms = get_bed_chroms(bed_file)
    # get the chrom output filenames, if none were passed
    if not chrom_filenames:
        chrom_filenames = make_bed_splitchrom_filenames(bed_file = bed_file, chroms = chroms, **kwargs)

    # Keep the output handle open while consecutive lines share a chromosome,
    # instead of reopening the file for every single line; only one output
    # handle is ever open at a time.
    current_chrom = None
    current_out = None
    try:
        with open(bed_file) as f:
            for line in f:
                # split on any whitespace so space-delimited files work too
                parts = line.split()
                if not parts:
                    continue
                chrom = parts[0]
                # skip lines whose chrom has no (non-empty) output filename
                output_file = chrom_filenames.get(chrom)
                if not output_file:
                    continue
                if chrom != current_chrom:
                    if current_out is not None:
                        current_out.close()
                    current_out = open(output_file, 'a')
                    current_chrom = chrom
                current_out.write(line)
    finally:
        if current_out is not None:
            current_out.close()