diff --git a/R/utilities.R b/R/utilities.R index 95183de..ab2c36d 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -3713,6 +3713,69 @@ collate_pga <- function( } +#' @title Collate PGA results per chromosome for samples with CN data. +#' +#' @description Expand a metadata table horizontally with PGA_chr metrics. +#' +#' @details Helper function called by `collate_results`, not meant for out-of-package usage. +#' +#' @param these_samples_metadata The metadata to be expanded with sample_id column. +#' @param this_seq_type Seq type for returned CN segments. One of "genome" (default) or "capture". +#' +#' @noRd +#' +#' @return data frame +#' @import dplyr +#' +#' @examples +#' # For genomes +#' meta <- get_gambl_metadata() +#' pga_metrics <- collate_pga_chr(these_samples_metadata = meta) +#' # For exomes +#' meta_capture <- get_gambl_metadata(seq_type_filter = "capture") +#' pga_metrics_capture <- collate_pga_chr(these_samples_metadata = meta_capture) +#' +collate_pga_chr <- function( + these_samples_metadata, + this_seq_type = "genome" +) { + + message( + "Collating the PGA results per chromosome..." + ) + # Currently only works for genomes + if(! this_seq_type %in% c("genome", "capture")) { + stop("Please provide a valid seq_type (\"genome\" or \"capture\").") + } + + # Default to all samples if sample table is missing + if (missing(these_samples_metadata)) { + message("No sample table was provided. Defaulting to all metadata ...") + these_samples_metadata <- get_gambl_metadata( + seq_type_filter = this_seq_type + ) + } + + # Get the CN segments + multi_sample_seg <- get_sample_cn_segments( + sample_list = these_samples_metadata$sample_id, + multiple_samples = TRUE, + this_seq_type = this_seq_type + ) %>% + dplyr::rename("sample" = "ID") + + these_samples_pga <- calculate_pga_chr( + this_seg = multi_sample_seg + ) + + these_samples_metadata <- left_join( + these_samples_metadata, + these_samples_pga + ) + + return(these_samples_metadata) + +} #' @title Standardize Chromosome Prefix. #' @@ -3902,6 +3965,173 @@ calculate_pga = function(this_seg, } + +#' @title Calculate proportion of each chromosome altered by CNV. +#' +#' @description `calculate_pga_chr` returns a data.frame with estimated proportion of each chromosome altered for each sample. +#' +#' @details This function calculates the percent of genome altered (PGA) by CNV. It takes into account the total length of +#' sample's CNV and relates it to the total chromosome length to return the proportion affected by CNV. The input is expected to be a seg file. +#' The path to a local SEG file can be provided instead. If The custom seg file is provided, the minimum required columns are +#' sample, chrom, start, end, and log.ratio. The function can work with either individual or multi-sample seg files. The telomeres are always +#' excluded from calculation, and centromeres/sex chromosomes can be optionally included or excluded. +#' +#' @param this_seg Input data frame of seg file. +#' @param seg_path Optionally, specify the path to a local seg file. +#' @param projection Argument specifying the projection of seg file, which will determine chr prefix, chromosome coordinates, and genome size. Default is grch37, but hg38 is also accepted. +#' @param cutoff The minimum log.ratio for the segment to be considered as CNV. Default is 0.56, which is 1 copy. This value is expected to be a positive float of log.ratio for both deletions and amplifications. +#' @param exclude_sex Boolean argument specifying whether to exclude sex chromosomes from calculation. Default is TRUE. +#' @param exclude_centromeres Boolean argument specifying whether to exclude centromeres from calculation. Default is TRUE. +#' +#' @return data frame +#' +#' @import dplyr readr tidyr +#' @export +#' +#' @examples +#' sample_seg = get_sample_cn_segments(this_sample_id = "14-36022T") +#' sample_seg = dplyr::rename(sample_seg, "sample" = "ID") +#' +#' calculate_pga_chr(this_seg = sample_seg) +#' +#' calculate_pga_chr(this_seg = sample_seg, +#' exclude_sex = FALSE) +#' +#' one_sample = get_sample_cn_segments(this_sample_id = "14-36022T") +#' one_sample = dplyr::rename(one_sample, "sample" = "ID") +#' +#' another_sample = get_sample_cn_segments(this_sample_id = "BLGSP-71-21-00243-01A-11E") +#' another_sample = dplyr::rename(another_sample, "sample" = "ID") +#' +#' multi_sample_seg = rbind(one_sample, another_sample) +#' +#' calculate_pga_chr(this_seg = multi_sample_seg) +#' +calculate_pga_chr = function(this_seg, + seg_path, + projection = "grch37", + cutoff = 0.56, + exclude_sex = TRUE, + exclude_centromeres = TRUE) { + # check for required argument + if (missing(this_seg) & missing (seg_path)) { + stop("Please provide the data frame of seg file or path to the local seg.") + } + + # ensure the specified projection is correct and define chromosome coordinates + if (projection == "grch37") { + chr_coordinates = chromosome_arms_grch37 + } else if (projection == "hg38") { + chr_coordinates = chromosome_arms_hg38 + } else { + stop( + "You specified projection that is currently not supported. Please provide seg files in either hg38 or grch37." + ) + } + + all_chr <- c(1:22,"X","Y") %>% matrix() %>% + as.data.frame(row.names = NULL) %>% + as_tibble() %>% + rename("chrom" = "V1") + + # exclude sex chromosomes + if (exclude_sex) { + chr_coordinates = chr_coordinates %>% + dplyr::filter(!grepl("X|Y", chromosome)) + all_chr = all_chr %>% + dplyr::filter(!grepl("X|Y", chrom)) + } + + # does the user's seg file contain centromeres? + if (exclude_centromeres) { + chr_coordinates = chr_coordinates %>% + group_by(chromosome) %>% + mutate(start = min(start), + end = max(end)) %>% + ungroup %>% + distinct(chromosome, start, end) + } + + # prepare for the overlaps + chr_coordinates = chr_coordinates %>% + rename("arm_start" = "start", + "arm_end" = "end", + "chrom" = "chromosome") %>% + mutate( + chr_len = arm_start + arm_end + ) + + # work out the seg file + if (!missing(seg_path)) { + message(paste0("Reading the seg file from ", seg_path)) + this_seg = suppressMessages(read_tsv(seg_path)) + } + + # preserve the sample ids to account later for those with 0 PGA + sample_set = this_seg %>% distinct(sample) + + this_seg = this_seg %>% + dplyr::filter(abs(log.ratio) >= cutoff) %>% + dplyr::relocate(sample, .after = last_col()) + + # ensure consistent chromosome prefixing + if (projection == "grch37") { + this_seg$chrom = gsub("chr", "", this_seg$chrom) + } else { + this_seg$chrom = gsub("chr", "", this_seg$chrom) # if there is a mish-mash of prefixes, strip them all + this_seg$chrom = paste0("chr", this_seg$chrom) + } + + # exclude sex chromosomes + if (exclude_sex) { + this_seg = this_seg %>% + dplyr::filter(!grepl("X|Y", chrom)) + } + + # prepare for the overlaps + this_seg = inner_join( + this_seg, + chr_coordinates, + by = "chrom", + relationship = "many-to-many" + ) + + # what are the segments that overlap? + this_seg = this_seg %>% + dplyr::filter(start <= arm_end & arm_start <= end) %>% + arrange(sample, chrom, start) + + # calculate total length of CNV per chr + affected_regions = this_seg %>% + dplyr::mutate(size = end - start) %>% + group_by(sample, chrom) %>% + summarise(total = sum(size), + chrom_len = unique(chr.len)) + + affected_regions$PGA.chr = affected_regions$total / affected_regions$chrom_len + + # add chr with 0 + affected_regions = affected_regions %>% select(!all_of(c("total", "chrom_len"))) %>% + dplyr::mutate(PGA.chr = round(PGA.chr, 3)) %>% + pivot_wider(names_from = chrom, values_from = PGA.chr) + + + # now add any samples that can have 0 PGA + affected_regions = base::merge(sample_set, + affected_regions, + all.x = TRUE) + + affected_regions[is.na(affected_regions)] <- 0 + + affected_regions = affected_regions %>% + rename("sample_id" = "sample") + colnames(affected_regions)[2:ncol(affected_regions)] <- paste0("chr",colnames(affected_regions)[2:ncol(affected_regions)],"_pga") + + return(affected_regions) + +} + + #' @title Adjust ploidy for samples with CNV data. #' #' @description `adjust_ploidy` returns a seg file with log.ratios adjusted to the overall sample ploidy. diff --git a/R/viz.R b/R/viz.R index 7775fe6..deda9e1 100644 --- a/R/viz.R +++ b/R/viz.R @@ -22,7 +22,8 @@ colour_aliases = list("COO_consensus" = "coo", "COO" = "coo", "DHITsig_consensus #' @param this_maf Specify custom MAF data frame of mutations. #' @param maf_path Specify path to MAF file if it is not already loaded into data frame. #' @param zoom_in_region Provide a specific region in the format "chromosome:start-end" to zoom in to a specific region. -#' @param label_sv Boolean argument to specify whether label SVs or not. Only supported if a specific chromosome or zoom in region are specified. +#' @param label_sv Boolean argument to specify whether label SVs or not with green line on rainfall plot. +#' @param annotate_sv Boolean argument to specify whether to restrict SVs to those annotated with the annotate_sv function (i.e. relevant oncogenes). #' @param seq_type Specify one of "genome" or "capture" when relying on the function to obtain mutations from a region (i.e. if you haven't provided a MAF or single sample_id) #' #' @return a ggplot2 plot. Print it using print() or save it using ggsave() @@ -42,26 +43,27 @@ colour_aliases = list("COO_consensus" = "coo", "COO" = "coo", "DHITsig_consensus #' seq_type = "genome") #' prettyRainfallPlot = function(this_sample_id, - label_ashm_genes = TRUE, - projection = "grch37", - chromosome, - this_maf, - maf_path, - zoom_in_region, - seq_type, - label_sv = FALSE) { + label_ashm_genes = TRUE, + projection = "grch37", + chromosome, + this_maf, + maf_path, + zoom_in_region, + seq_type, + label_sv = FALSE, + annotate_sv = TRUE) { if (missing(this_sample_id)) { warning("No sample_id was provided. Using all mutations in the MAF within your region!") if(missing(zoom_in_region)){ stop("Must provide a zoom_in_region to plot when showing data from more than one patient") } } - + # allow user to specify chromosome prefix inconsistent with chromosome names if (!missing(chromosome)) { chromosome = standardize_chr_prefix(incoming_vector = chromosome, projection = projection) } - + # allow to zoom in to a specific region if (!missing(zoom_in_region)) { region = zoom_in_region @@ -71,7 +73,7 @@ prettyRainfallPlot = function(this_sample_id, zoom_in_region$start = as.numeric(zoom_in_region$start) zoom_in_region$end = as.numeric(zoom_in_region$end) } - + if (label_ashm_genes) { if (projection == "grch37") { ashm_regions = GAMBLR.data::grch37_ashm_regions %>% @@ -104,7 +106,7 @@ prettyRainfallPlot = function(this_sample_id, group_by(gene) %>% slice_head() %>% ungroup() - + # this will be needed for consistent labeling with rainfall plots ashm_regions = ashm_regions %>% arrange(match( @@ -113,8 +115,8 @@ prettyRainfallPlot = function(this_sample_id, )) ashm_regions = ashm_regions %>% mutate(Chromosome_f = factor(Chromosome, levels = unique(ashm_regions$Chromosome))) - } - + + # if user is subsetting by chromosome or zooming in to a specific region, it is possible there are no aSHM features to show # handle this case separately if (nrow(ashm_regions) == 0) { @@ -123,6 +125,7 @@ prettyRainfallPlot = function(this_sample_id, ) label_ashm_genes = FALSE } + } # get ssm for the requested sample if (!missing(this_maf)) { @@ -136,7 +139,7 @@ prettyRainfallPlot = function(this_sample_id, } } else if (!missing (maf_path)) { message ("Path to custom MAF file was provided, reading SSM using the custom path ...") - + this_maf = suppressMessages(read_tsv(maf_path)) if(!missing(this_sample_id)){ this_maf = this_maf %>% dplyr::filter(Tumor_Sample_Barcode %in% this_sample_id) @@ -154,7 +157,7 @@ prettyRainfallPlot = function(this_sample_id, message(paste("Will use all mutations for",seq_type, "in this region:",zoom_in_region)) these_ssm = get_ssm_by_region(region = region,seq_type = seq_type,projection=projection) } - + # do rainfall calculation using lag rainfall_points = dplyr::select( these_ssm, @@ -179,13 +182,13 @@ prettyRainfallPlot = function(this_sample_id, "InDel" ) ) %>% - dplyr::mutate(IMD = log(IMD)) %>% + dplyr::mutate(IMD = log10(IMD)) %>% ungroup() %>% drop_na(IMD) # for the first point of each chromosome, NAs are produced generating a warning message - + # collapse substitutions into classes rainfall_points$Substitution = rainfall_conv[as.character(rainfall_points$Substitution)] - + # ensure order of grids in the plot is sorted rainfall_points = rainfall_points %>% arrange(match( @@ -207,23 +210,21 @@ prettyRainfallPlot = function(this_sample_id, ) ) } - + # if user is subsetting by chromosome or zooming in to a specific region, are there any SSM left to plot? if (nrow(rainfall_points) == 0) { stop("After subsetting to a regions you requested to plot, there are no SSM to display.") } - + # label SVs if user wants to overlap this data if (!missing(chromosome) & label_sv) { sv_chromosome = chromosome } else if (!missing(zoom_in_region) & label_sv) { sv_chromosome = zoom_in_region$chromosome } else if (label_sv) { - stop( - "Labeling SV is only supported when a particular chromosome or zoomed region is plotted." - ) + sv_chromosome = 1:22 } - + if (label_sv) { message("Getting combined manta + GRIDSS SVs using GAMBLR ...") these_sv = get_combined_sv(these_sample_ids = this_sample_id) @@ -232,11 +233,23 @@ prettyRainfallPlot = function(this_sample_id, rename("SOMATIC_SCORE" = "SCORE") } # annotate SV - these_sv = annotate_sv(these_sv) - + if (annotate_sv){ + these_sv = annotate_sv(these_sv, genome_build = projection) + } else { + these_sv = these_sv %>% + dplyr::rename(chrom1 = "CHROM_A", + start1 = "START_A", + end1 = "END_A", + chrom2 = "CHROM_B", + start2 = "START_B", + end2 = "END_B") %>% + mutate(gene = 1:nrow(these_sv), + partner = letters[gene], + fusion = paste(gene, partner, sep="-")) + } # make SVs a long df with 1 record per SV corresponding to the strand sv_to_label = - melt( + reshape2::melt( these_sv %>% select( chrom1, start1, @@ -263,12 +276,12 @@ prettyRainfallPlot = function(this_sample_id, value.name = "Chromosome" ) %>% dplyr::filter(Chromosome %in% sv_chromosome) - + # are there any SVs on this chromosome/region? if (nrow(sv_to_label) > 0) { sv_to_label = sv_to_label %>% - melt( + reshape2::melt( ., id.vars = c( "tumour_sample_id", @@ -296,7 +309,7 @@ prettyRainfallPlot = function(this_sample_id, ) label_sv = FALSE } - + # when we are plotting region and not whole chromosome, ensure SV is within that region if (!missing(zoom_in_region) & label_sv) { sv_to_label = dplyr::filter( @@ -314,20 +327,25 @@ prettyRainfallPlot = function(this_sample_id, label_sv = FALSE } } - + sv_to_label = sv_to_label %>% mutate(Chromosome_f = factor(Chromosome)) } - - p = ggplot(rainfall_points) + - geom_point(aes(x = Start_Position, y = IMD, color = Substitution)) + + + p = ggplot(rainfall_points, aes(x = Start_Position, y = IMD)) + scale_color_manual(values = get_gambl_colours("rainfall")) + - ylab("log(IMD)") + + ylab(expression(log[10](IMD))) + theme_Morons() + - facet_wrap( ~ Chromosome_f, scales = "free_x") + + facet_grid(. ~ Chromosome_f, scales = "free_x", space = "free_x", switch="x") + ggtitle(this_sample_id) + - theme(plot.title = element_text(hjust = 0)) # left-align title plot - + theme(plot.title = element_text(hjust = 0), # left-align title plot + axis.title.x = element_blank(), axis.text.x = element_blank(), axis.text.y = element_text(size = 16, colour = "black"), + axis.ticks.x = element_blank(), axis.ticks.y = element_line(colour = "black"), + panel.spacing.x = unit(0.1, "lines"), panel.border = element_blank(), text = element_text(size = 16, colour = "black", family="sans"), + strip.background = element_blank(), + strip.placement = "outside", + panel.grid = element_blank()) + if (label_ashm_genes) { p = p + ggrepel::geom_text_repel( @@ -344,7 +362,7 @@ prettyRainfallPlot = function(this_sample_id, segment.angle = 25 ) } - + if (label_sv) { p = p + geom_vline( @@ -352,20 +370,22 @@ prettyRainfallPlot = function(this_sample_id, aes(xintercept = Start_Position), color = "lightgreen", alpha = .7 - ) + - geom_text(data = sv_to_label, - aes(End_Position, 15, label = fusion, color = "lightgreen")) - } - - # show x-axis coordinates if zooming in to a specific region, but not if looking chromosome/genome-wide - if (missing(zoom_in_region)) { - p = p + guides(x = "none") + ) } + + if(annotate_sv) { + max_val = max(rainfall_points$IMD) + p = p + + geom_text(data = sv_to_label, + aes(End_Position, max_val+1, label = fusion, color = "lightgreen"), + show.legend = FALSE) + } + p = p + geom_point(inherit.aes=TRUE, aes(color = Substitution)) + return(p) } - gene_mutation_tally = function(maf_df,these_samples_metadata,these_genes,grouping_variable="cohort"){ meta = dplyr::select(these_samples_metadata,sample_id,{{grouping_variable}}) maf_filt = dplyr::filter(maf_df,Hugo_Symbol %in% these_genes, Variant_Classification %in% coding_class) %>%