-
Notifications
You must be signed in to change notification settings - Fork 2
/
splitbed.py
executable file
·139 lines (117 loc) · 5.05 KB
/
splitbed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Splits a .bed file into a single file for each chromosome
"""
import os
# ~~~~~ FUNCTIONS ~~~~~ #
def get_bed_chroms(bed_file):
    """
    Returns a list of all chromosomes in a .bed file

    Blank lines, comment lines (first field starting with ``#``) and
    ``track`` / ``browser`` header lines are ignored, since they do not
    describe a genomic region.

    Parameters
    ----------
    bed_file: str
        the path to a .bed formatted file

    Returns
    -------
    list
        a list of the unique chromosomes in the file (e.g. the unique list of entries in the first column), in order of first appearance
    """
    # first fields that mark a header line rather than a region
    header_keywords = ('track', 'browser')
    chroms = []
    seen = set()
    with open(bed_file) as f:
        for line in f:
            # only the first whitespace-delimited field is needed
            parts = line.split(None, 1)
            # blank / whitespace-only lines yield no fields at all;
            # indexing them would raise IndexError
            if not parts:
                continue
            chrom = parts[0]
            if chrom.startswith('#') or chrom in header_keywords:
                continue
            # keep unique entries, preserving the order they were first seen
            if chrom not in seen:
                seen.add(chrom)
                chroms.append(chrom)
    return(chroms)
def make_bed_splitchrom_filenames(bed_file, chroms = None, output_dir = None):
    """
    Builds the per-chromosome output file path for each chromosome of a .bed file

    Each output file is named ``<bed basename>_<chrom><bed extension>`` and is
    placed in the chosen output directory.

    Parameters
    ----------
    bed_file: str
        the path to a .bed formatted file
    chroms: list
        the unique chromosomes to build filenames for; if ``None`` (or empty), they are read from ``bed_file``
    output_dir: str
        directory that will hold the files; if ``None`` (or empty), the directory containing ``bed_file`` is used

    Returns
    -------
    dict
        a mapping of each chromosome name to the output file path for that chromosome

    Examples
    --------
    Example usage::

        >>> make_bed_splitchrom_filenames("targets.bed", chroms = ["chr1", "chrX"], output_dir = "output")
        {'chr1': 'output/targets_chr1.bed', 'chrX': 'output/targets_chrX.bed'}

    Notes
    -----
    The returned paths are passed through ``os.path.realpath``, so they are full absolute paths; the example shows them shortened for brevity
    """
    if not chroms:
        chroms = get_bed_chroms(bed_file)

    # resolve the destination directory to a full path
    target_dir = os.path.realpath(output_dir if output_dir else os.path.dirname(bed_file))

    # e.g. 'targets.bed' -> ('targets', '.bed')
    stem, suffix = os.path.splitext(os.path.basename(bed_file))

    return(dict(
        (chrom, os.path.join(target_dir, '{0}_{1}{2}'.format(stem, chrom, suffix)))
        for chrom in chroms
    ))
def append_line(output_file, line):
    """
    Writes a piece of text onto the end of a file

    The file is opened in append mode, so it is created if it does not exist
    and any existing content is kept. The text is written exactly as given;
    no newline is added.

    Parameters
    ----------
    output_file: str
        path to the file that receives the text
    line: str
        the text to add to the end of the file
    """
    with open(output_file, mode = 'a') as handle:
        handle.write(line)
def split_bed_by_chrom(bed_file, chroms = None, chrom_filenames = None, **kwargs):
    """
    Splits a .bed file into sub-files for each chromosome present

    The chromosome of a line is its first whitespace-delimited field, so both
    tab- and space-delimited .bed files are handled. Blank lines, comment
    lines (first field starting with ``#``) and ``track`` / ``browser`` header
    lines are not copied to any output file. Lines whose chromosome has no
    entry in ``chrom_filenames`` are skipped.

    Parameters
    ----------
    bed_file: str
        the path to a .bed formatted file
    chroms: list
        a list of unique chromosomes in the .bed file; if ``None``, the list will be retrieved from the ``bed_file``
    chrom_filenames: dict
        a dictionary of the filename for each chrom in the .bed file, generated by ``make_bed_splitchrom_filenames``; if ``None``, the filenames will be automatically generated from the ``bed_file``
    kwargs: dict
        dictionary of keyword arguments to pass on to ``make_bed_splitchrom_filenames``

    Examples
    --------
    Example usage::

        >>> bed_file = 'targets.bed'
        >>> output_dir = 'test'
        >>> split_bed_by_chrom(bed_file = bed_file, output_dir = output_dir)

    Notes
    -----
    Output files are opened in append mode; content already present in an output file is kept and new lines are added after it
    """
    from itertools import groupby

    def line_chrom(line):
        # chromosome name of a line, or None when the line holds no region
        fields = line.split(None, 1)
        if not fields:
            return(None)
        chrom = fields[0]
        if chrom.startswith('#') or chrom in ('track', 'browser'):
            return(None)
        return(chrom)

    # get the full path to the file
    bed_file = os.path.realpath(bed_file)

    # get the unique chroms in the file, if none were passed
    if not chroms:
        chroms = get_bed_chroms(bed_file)

    # get the chrom output filenames, if none were passed
    if not chrom_filenames:
        chrom_filenames = make_bed_splitchrom_filenames(bed_file = bed_file, chroms = chroms, **kwargs)

    # write the split output files
    with open(bed_file) as f:
        # consecutive lines sharing a chromosome are written with a single
        # open of the output file instead of one open per line
        for chrom, lines in groupby(f, key = line_chrom):
            output_file = chrom_filenames.get(chrom, None)
            # no filename for this chrom (or not a region line): skip the run
            if output_file:
                with open(output_file, 'a') as out:
                    out.writelines(lines)