-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_shared_regions_and_genes.Rmd
154 lines (126 loc) · 8.37 KB
/
find_shared_regions_and_genes.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
---
title: "find_shared_regions_and_genes"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
dyn.load("/home/rstudio/libs/libxml2.so.2")
#library(DESeq2)
library(dplyr)
#library(GenomicRanges)
library(tibble)
library(rtracklayer)
library(ggplot2)
#library(stringr)
```
Define cutoff sets:
```{r, include=T}
region.cutoffs = list("set1" = list("fdr" = 0.01, "min.l2fc" = 2, "min.baseMean" = 100),
"set2" = list("fdr" = 0.01, "min.l2fc" = 2, "min.baseMean" = 80),
"set3" = list("fdr" = 0.01, "min.l2fc" = 2, "min.baseMean" = 60),
"set4" = list("fdr" = 0.01, "min.l2fc" = 2, "min.baseMean" = 40),
"set5" = list("fdr" = 0.01, "min.l2fc" = 2, "min.baseMean" = 20),
"set6" = list("fdr" = 0.01, "min.l2fc" = 1.5, "min.baseMean" = 20),
"set7" = list("fdr" = 0.01, "min.l2fc" = 1, "min.baseMean" = 20),
"set8" = list("fdr" = 0.01, "min.l2fc" = 0.5, "min.baseMean" = 20),
"set9" = list("fdr" = 0.02, "min.l2fc" = 0.5, "min.baseMean" = 20),
"set10" = list("fdr" = 0.03, "min.l2fc" = 0.5, "min.baseMean" = 20),
"set11" = list("fdr" = 0.04, "min.l2fc" = 0.5, "min.baseMean" = 20),
"set12" = list("fdr" = 0.05, "min.l2fc" = 0.5, "min.baseMean" = 20))
```
Define a general function to find the proportions of shared and unique features in three sets:
```{r, include=T}
find_shared_features = function(p1, p2, pM, plot.filename) {
union.all = unique(c(p1, p2, pM))
union.12 = unique(c(p1, p2))
union.1M = unique(c(p1, pM))
union.2M = unique(c(p2, pM))
inter.all = intersect(intersect(p1, p2), pM)
inter.12 = intersect(p1, p2)
inter.1M = intersect(p1, pM)
inter.2M = intersect(p2, pM)
proportions.df = data.frame(p1_num = length(p1),
p2_num = length(p2),
pM_num = length(pM),
p1 = length(setdiff(union.all, union.2M)) / length(p1),
p2 = length(setdiff(union.all, union.1M)) / length(p2),
pM = length(setdiff(union.all, union.12)) / length(pM),
p12_1 = length(setdiff(inter.12, inter.all)) / length(p1),
p12_2 = length(setdiff(inter.12, inter.all)) / length(p2),
p1M_1 = length(setdiff(inter.1M, inter.all)) / length(p1),
p1M_M = length(setdiff(inter.1M, inter.all)) / length(pM),
p2M_2 = length(setdiff(inter.2M, inter.all)) / length(p2),
p2M_M = length(setdiff(inter.2M, inter.all)) / length(pM),
p12M_1 = length(inter.all) / length(p1),
p12M_2 = length(inter.all) / length(p2),
p12M_M = length(inter.all) / length(pM))
p = data.frame(category = c("1", "2", "M",
rep("12", 2), rep("1M", 2), rep("2M", 2),
rep("12M", 3)),
domain = c("p1", "p2", "pM",
"p1", "p2", "p1", "pM", "p2", "pM",
"p1", "p2", "pM"),
proportion = c(proportions.df$p1,
proportions.df$p2,
proportions.df$pM,
proportions.df$p12_1,
proportions.df$p12_2,
proportions.df$p1M_1,
proportions.df$p1M_M,
proportions.df$p2M_2,
proportions.df$p2M_M,
proportions.df$p12M_1,
proportions.df$p12M_2,
proportions.df$p12M_M)) %>%
mutate(category = factor(category, levels = c("1", "2", "M",
"12", "1M", "2M",
"12M"))) %>%
ggplot(aes(x = category, y = proportion, fill = domain)) +
geom_col(position = position_dodge2(preserve = "single")) +
theme_classic()
ggsave(filename = plot.filename,
plot = p)
return(proportions.df)
}
```
Find NFIA-dependent genes that are shared between two or three domains or are unique to particular domains:
```{r, include=T}
p1.dep.genes = readRDS(file = "../r_results/diff_expression/tables/p1_ref_genes.rds")
p2.dep.genes = readRDS(file = "../r_results/diff_expression/tables/p2_ref_genes.rds")
pM.dep.genes = readRDS(file = "../r_results/diff_expression/tables/pM_ref_genes.rds")
gene.proportions = find_shared_features(p1.dep.genes,
p2.dep.genes,
pM.dep.genes,
paste0("../r_results/find_shared_regions_and_genes/plots/",
"proportions_of_shared_and_unique_genes.pdf"))
```
From 40% (p2) to ~70% (pM) of NFIA-dependent genes are unique to their respective domains, hence we do not expect many NFIA-dependent region-gene assignments in common between domains, independently of an assignment method.
Find NFIA-dependent regions that are shared between two or three domains or are unique to particular domains:
```{r, include=T}
region.proportions = bind_rows(lapply(paste0("set", seq(1:(length(region.cutoffs)))),
function(s) {
fdr = region.cutoffs[[s]]$fdr
min.l2fc = region.cutoffs[[s]]$min.l2fc
min.baseMean = region.cutoffs[[s]]$min.baseMean
p1.dep.regions = import(paste0("../r_results/select_diff_regions/p1_dep_ranges",
"_fdr", fdr, "_min-l2fc", min.l2fc, "_min-baseMean", min.baseMean,
".bed"))
p2.dep.regions = import(paste0("../r_results/select_diff_regions/p2_dep_ranges",
"_fdr", fdr, "_min-l2fc", min.l2fc, "_min-baseMean", min.baseMean,
".bed"))
pM.dep.regions = import(paste0("../r_results/select_diff_regions/pM_dep_ranges",
"_fdr", fdr, "_min-l2fc", min.l2fc, "_min-baseMean", min.baseMean,
".bed"))
return(find_shared_features(p1.dep.regions$name,
p2.dep.regions$name,
pM.dep.regions$name,
paste0("../r_results/find_shared_regions_and_genes/plots/",
"proportions_of_shared_and_unique_regions",
"_fdr", fdr, "_min-l2fc", min.l2fc, "_min-baseMean", min.baseMean,
".pdf")))
}))
```
The proportions of shared and unique NFIA-dependent regions demonstrate two trends:
1) In contrast to NFIA-dependent genes, NFIA-dependent regions are predominantly shared, especially, those from p1.
2) The proportion of NFIA-dependent regions unique to a particular domain ranges from ~10% to ~20%, depending on particular cutoffs, but does not increase considerably with any set of cutoffs.
Therefore, independently of a region-gene assignment method, we expect that the majority of assigned NFIA-dependent regions will demonstrate domain-specific assignments because their partner gene is likely lacking in the other two domains. This differential assignment could be realised in two ways: (1) a region becomes unassigned in other domains; (b) a region becomes assigned to another partner gene in other domains.