-
Notifications
You must be signed in to change notification settings - Fork 1
/
subset_gene_features.R
30 lines (23 loc) · 1.05 KB
/
subset_gene_features.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#Load required package
library(stringr)
#Define function for subsetting gff file to only gene features, then replacing feature names with gene names
subset_gff = function(input, output){
gff = read.delim(input, sep = "\t", header = F)
#Subset only entries for desired feature
gffsub = gff[which(gff$V3 == "gene"),]
#Mark the transcription start sight of each gene feature
#If strand is +, reassign end to start
gffsub[which(gffsub$V7 == "+"),"V5"] = gffsub[which(gffsub$V7 == "+"),"V4"]
#If strand is -, assign start same value as end
gffsub[which(gffsub$V7 == "-"),"V4"] = gffsub[which(gffsub$V7 == "-"),"V5"]
#Extract gene names, remove extraneous dictionary keys and values as needed
names = str_extract(gffsub$V9, "Name=.*")
names = gsub("Name=", "", names)
names = gsub(";.*$", "", names)
#Replace feature column with gene names
gffsub$V3 = names
#write output
write.table(gffsub, output, quote = F, sep = "\t", row.names = F, col.names = F)
}
#Execute function
subset_gff("notags_gff.tmp", "notags_wnames_gff.tmp")