-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_reference.R
117 lines (98 loc) · 5.31 KB
/
get_reference.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# SETUP -------------------------------------------------------------------------------------------
options(warn = -1)
# Command line parser accept short and long flag/options
suppressPackageStartupMessages(library("optparse", character.only = TRUE))
# Parse the options
opt_list <- list(
make_option(opt_str = c("-s", "--seq"),
type = "character",
default = NULL,
help = "Sequence to download"),
make_option(opt_str = c("-o", "--out"),
type = "character",
default = NULL,
help = "Output file in FASTA fomat (.fasta)"),
make_option(opt_str = c("-a", "--ann"),
type = "logical",
default = FALSE,
help = "Download the annotation of the sequence"),
make_option(opt_str = c("-p", "--cds"),
type = "logical",
default = FALSE,
help = "Extract protein coding sequences"))
opt_parser <- OptionParser(option_list = opt_list)
opt <- parse_args(opt_parser)
chr_names <- c(paste("chr", 1:22, sep = ""), "chrX", "chrY")
if (is.null(opt$seq) || !opt$seq %in% c("genome", chr_names)) {
print_help(opt_parser)
stop(paste0("[", Sys.time(), "] [FAIL]: Sequence name to downlaod is missed or not allowed"))
}
if (!opt$ann && opt$cds) {
stop(paste0("[", Sys.time(), "] [FAIL]: Sequence annotation required to extract protein-coding sequences"))
}
if (is.null(opt$out)) {
opt$out <- file.path(getwd(), paste0(opt$seq, ".fasta"))
} else {
opt$out <- normalizePath(opt$out)
}
# Define the chromosomes accession
hsa_accessions <- c(
"NC_000001", "NC_000002", "NC_000003", "NC_000004", "NC_000005", "NC_000006",
"NC_000007", "NC_000008", "NC_000009", "NC_000010", "NC_000011", "NC_000012",
"NC_000013", "NC_000014", "NC_000015", "NC_000016", "NC_000017", "NC_000018",
"NC_000019", "NC_000020", "NC_000021", "NC_000022", "NC_000023", "NC_000024")
names(hsa_accessions) <- chr_names
# DOWNLOAD THE SEQUENCE ---------------------------------------------------------------------------
# Manipulation of large biological sequences
suppressPackageStartupMessages(library("Biostrings", character.only = TRUE))
suppressPackageStartupMessages(library("stringr", character.only = TRUE))
suppressPackageStartupMessages(library("readr", character.only = TRUE))
suppressPackageStartupMessages(library("dplyr", character.only = TRUE))
# Download and unzip the genome sequence
sequence_url <- "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz"
download.file(url = sequence_url, destfile = file.path(tempdir(), "hsa38.fasta.gz"), quiet = TRUE)
system(command = paste("gunzip", file.path(tempdir(), "hsa38.fasta.gz")))
all_sequence <- readDNAStringSet(filepath = file.path(tempdir(), "hsa38.fasta"))
# Select the sequences to export
if (opt$seq %in% chr_names) {
message(paste0("[", Sys.time(), "] [INFO]: Downloading ", opt$seq, " sequence"))
chr_sequence <- all_sequence[which(str_detect(string = names(all_sequence), pattern = hsa_accessions[opt$seq]))]
} else {
message(paste0("[", Sys.time(), "] [INFO]: Downloading human genome GRCh38 sequence"))
chr_sequence <- list()
for (I in 1:length(hsa_accessions)) {
chr_sequence[[I]] <- all_sequence[which(str_detect(string = names(all_sequence), pattern = hsa_accessions[I]))]
}
chr_sequence <- unlist(DNAStringSetList(chr_sequence))
}
# Export the sequence
writeXStringSet(x = chr_sequence, filepath = opt$out, append = FALSE)
# DOWNLOAD THE ANNOTATION -------------------------------------------------------------------------
if (opt$ann) {
message(paste0("[", Sys.time(), "] [INFO]: Downloading sequence annotation"))
# Download and unzip the genome annotation
annotation_url <- "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz"
download.file(url = annotation_url, destfile = file.path(tempdir(), "hsa38.gff.gz"), quiet = TRUE)
suppressMessages(annotation_tbl <- read_delim(file = file.path(tempdir(), "hsa38.gff.gz"), delim = "\t", col_names = FALSE, col_types = cols(), comment = "#"))
# Filter the annotation to extract only the chromosome of interest (if needed)
if (opt$seq %in% chr_names) {
annotation_tbl <- annotation_tbl %>%
dplyr::filter(str_detect(X1, hsa_accessions[opt$seq]))
}
# Export the annotation
annotation_gff <- str_replace(string = opt$out, pattern = ".fasta$", replacement = ".gff")
annotation_gtf <- str_replace(string = opt$out, pattern = ".fasta$", replacement = ".gtf")
write_delim(x = annotation_tbl, file = annotation_gff, delim = "\t", append = FALSE, col_names = FALSE)
# Convert the annotation format from gff to gtf
cmd <- paste("gffread -T", annotation_gff, ">", annotation_gtf)
system(cmd)
}
# EXTRACT THE PROTEIN-CODING SEQUENCES ------------------------------------------------------------
message(paste0("[", Sys.time(), "] [INFO]: Extracting protein-coding sequences"))
# Extract protein-coding sequences (CDS)
if (opt$cds) {
coding_sequences <- str_replace(string = opt$out, pattern = ".fasta$", replacement = ".CDS.fasta")
cmd <- paste("gffread -g", opt$out, "-x", coding_sequences, annotation_gff)
system(cmd, ignore.stdout = TRUE, ignore.stderr = TRUE)
out <- file.remove(paste0(opt$out, ".fai"))
}