From b4277caecdd984bd2b6a7edc56be3ead3f007ed8 Mon Sep 17 00:00:00 2001 From: ch99l Date: Tue, 14 Apr 2026 15:19:19 +0800 Subject: [PATCH 1/6] Add summariseByExon function --- R/summariseByExon.R | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 R/summariseByExon.R diff --git a/R/summariseByExon.R b/R/summariseByExon.R new file mode 100644 index 00000000..e04c4435 --- /dev/null +++ b/R/summariseByExon.R @@ -0,0 +1,76 @@ +#' Summarise transcript expression to exon-level expression +#' @title summarise by exon +#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} +#' @return A data.table with columns: exon_id, seqnames, start, end, strand, +#' GENEID, and one count column per sample +#' @details Counts are summed across all transcripts that share the same exon +#' (defined by identical seqnames, start, end, and strand). The returned +#' counts therefore represent the total evidence attributed to each unique +#' exonic locus across all overlapping transcripts. +#' @import data.table +#' @importFrom Matrix sparseMatrix +#' @importFrom SummarizedExperiment assays rowRanges rowData +#' @importFrom GenomicRanges seqnames start end strand +#' @export +summariseByExon <- function(se) { + # Unlist GRangesList: one row per exon-transcript combination + exonRanges <- unlist(rowRanges(se), use.names = TRUE) + + # Build a data.table of exon-transcript pairs + txNames <- rownames(se) + exonDt <- data.table( + TXNAME = names(exonRanges), + seqnames = as.character(seqnames(exonRanges)), + start = start(exonRanges), + end = end(exonRanges), + strand = as.character(strand(exonRanges)) + ) + + # Unique exon key: seqnames:start:end:strand + exonDt[, exon_id := paste(seqnames, start, end, strand, sep = ":")] + + # Attach GENEID from rowData + geneDt <- data.table( + TXNAME = rownames(se), + GENEID = rowData(se)$GENEID + ) + exonDt <- geneDt[exonDt, on = "TXNAME"] + + # Collapse metadata per unique exon + exonMeta <- exonDt[, .( + seqnames = seqnames[1], + start = start[1], + end = end[1], + strand = strand[1], + GENEID = paste(sort(unique(GENEID)), collapse = ",") + ), by = exon_id] + + # Build sparse binary matrix: unique_exons x transcripts + # entry [i, j] = 1 if transcript j contains unique exon i + uniqueExons <- exonMeta$exon_id + exonIdx <- match(exonDt$exon_id, uniqueExons) + txIdx <- match(exonDt$TXNAME, txNames) + + exonTxMat <- sparseMatrix( + i = exonIdx, + j = txIdx, + x = 1L, + dims = c(length(uniqueExons), length(txNames)), + dimnames = list(uniqueExons, txNames) + ) + + # Aggregate counts: unique_exons x samples + txCounts <- assays(se)$counts + exonCounts <- exonTxMat %*% txCounts + + # Combine metadata with aggregated counts + result <- cbind( + exonMeta, + as.data.table(as.matrix(exonCounts)) + ) + + # Sort by genomic position + result <- result[order(seqnames, start, end)] + + return(result) +} From dbe2af974b6e97dd3d79df8ce8a01f07af51f28c Mon Sep 17 00:00:00 2001 From: ch99l Date: Tue, 14 Apr 2026 15:48:50 +0800 Subject: [PATCH 2/6] Return exon-level SummarisedExperiment instead of data.table --- R/summariseByExon.R | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/R/summariseByExon.R b/R/summariseByExon.R index e04c4435..cfefd833 100644 --- a/R/summariseByExon.R +++ b/R/summariseByExon.R @@ -1,16 +1,16 @@ #' Summarise transcript expression to exon-level expression #' @title summarise by exon #' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} -#' @return A data.table with columns: exon_id, seqnames, start, end, strand, -#' GENEID, and one count column per sample +#' @return A \code{RangedSummarizedExperiment} with exon-level counts #' @details Counts are summed across all transcripts that share the same exon #' (defined by identical seqnames, start, end, and strand). The returned #' counts therefore represent the total evidence attributed to each unique #' exonic locus across all overlapping transcripts. #' @import data.table #' @importFrom Matrix sparseMatrix -#' @importFrom SummarizedExperiment assays rowRanges rowData -#' @importFrom GenomicRanges seqnames start end strand +#' @importFrom SummarizedExperiment assays rowRanges rowData colData SummarizedExperiment +#' @importFrom GenomicRanges GRanges seqnames start end strand +#' @importFrom IRanges IRanges #' @export summariseByExon <- function(se) { # Unlist GRangesList: one row per exon-transcript combination @@ -63,14 +63,19 @@ summariseByExon <- function(se) { txCounts <- assays(se)$counts exonCounts <- exonTxMat %*% txCounts - # Combine metadata with aggregated counts - result <- cbind( - exonMeta, - as.data.table(as.matrix(exonCounts)) + # Build GRanges for unique exons + exonGRanges <- GRanges( + seqnames = exonMeta$seqnames, + ranges = IRanges(start = exonMeta$start, end = exonMeta$end), + strand = exonMeta$strand ) + names(exonGRanges) <- exonMeta$exon_id + mcols(exonGRanges)$GENEID <- exonMeta$GENEID - # Sort by genomic position - result <- result[order(seqnames, start, end)] - - return(result) + # Return as SummarizedExperiment + return(SummarizedExperiment( + assays = list(counts = exonCounts), + rowRanges = exonGRanges, + colData = colData(se) + )) } From 16648135076aba5a9bd33ceee657cb5a2a1e92fe Mon Sep 17 00:00:00 2001 From: ch99l Date: Fri, 17 Apr 2026 15:35:15 +0800 Subject: [PATCH 3/6] refactor summarisebyExon --- R/summariseByExon.R | 81 ----------------------------- R/summariseExpression.R | 91 +++++++++++++++++++++++++++++++++ R/summariseExpression_utility.R | 31 +++++++++++ 3 files changed, 122 insertions(+), 81 deletions(-) delete mode 100644 R/summariseByExon.R create mode 100644 R/summariseExpression.R create mode 100644 R/summariseExpression_utility.R diff --git a/R/summariseByExon.R b/R/summariseByExon.R deleted file mode 100644 index cfefd833..00000000 --- a/R/summariseByExon.R +++ /dev/null @@ -1,81 +0,0 @@ -#' Summarise transcript expression to exon-level expression -#' @title summarise by exon -#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} -#' @return A \code{RangedSummarizedExperiment} with exon-level counts -#' @details Counts are summed across all transcripts that share the same exon -#' (defined by identical seqnames, start, end, and strand). The returned -#' counts therefore represent the total evidence attributed to each unique -#' exonic locus across all overlapping transcripts. -#' @import data.table -#' @importFrom Matrix sparseMatrix -#' @importFrom SummarizedExperiment assays rowRanges rowData colData SummarizedExperiment -#' @importFrom GenomicRanges GRanges seqnames start end strand -#' @importFrom IRanges IRanges -#' @export -summariseByExon <- function(se) { - # Unlist GRangesList: one row per exon-transcript combination - exonRanges <- unlist(rowRanges(se), use.names = TRUE) - - # Build a data.table of exon-transcript pairs - txNames <- rownames(se) - exonDt <- data.table( - TXNAME = names(exonRanges), - seqnames = as.character(seqnames(exonRanges)), - start = start(exonRanges), - end = end(exonRanges), - strand = as.character(strand(exonRanges)) - ) - - # Unique exon key: seqnames:start:end:strand - exonDt[, exon_id := paste(seqnames, start, end, strand, sep = ":")] - - # Attach GENEID from rowData - geneDt <- data.table( - TXNAME = rownames(se), - GENEID = rowData(se)$GENEID - ) - exonDt <- geneDt[exonDt, on = "TXNAME"] - - # Collapse metadata per unique exon - exonMeta <- exonDt[, .( - seqnames = seqnames[1], - start = start[1], - end = end[1], - strand = strand[1], - GENEID = paste(sort(unique(GENEID)), collapse = ",") - ), by = exon_id] - - # Build sparse binary matrix: unique_exons x transcripts - # entry [i, j] = 1 if transcript j contains unique exon i - uniqueExons <- exonMeta$exon_id - exonIdx <- match(exonDt$exon_id, uniqueExons) - txIdx <- match(exonDt$TXNAME, txNames) - - exonTxMat <- sparseMatrix( - i = exonIdx, - j = txIdx, - x = 1L, - dims = c(length(uniqueExons), length(txNames)), - dimnames = list(uniqueExons, txNames) - ) - - # Aggregate counts: unique_exons x samples - txCounts <- assays(se)$counts - exonCounts <- exonTxMat %*% txCounts - - # Build GRanges for unique exons - exonGRanges <- GRanges( - seqnames = exonMeta$seqnames, - ranges = IRanges(start = exonMeta$start, end = exonMeta$end), - strand = exonMeta$strand - ) - names(exonGRanges) <- exonMeta$exon_id - mcols(exonGRanges)$GENEID <- exonMeta$GENEID - - # Return as SummarizedExperiment - return(SummarizedExperiment( - assays = list(counts = exonCounts), - rowRanges = exonGRanges, - colData = colData(se) - )) -} diff --git a/R/summariseExpression.R b/R/summariseExpression.R new file mode 100644 index 00000000..3d5cf687 --- /dev/null +++ b/R/summariseExpression.R @@ -0,0 +1,91 @@ +#' Summarise transcript expression to exon-level expression +#' @title summarise by exon +#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} +#' @return A \code{RangedSummarizedExperiment} with exon-level expression. +#' Rows are named \code{EX1, EX2, ...} in order of first appearance. +#' \code{rowRanges} is a \code{GRanges} with one entry per unique exon locus. +#' \code{mcols} columns: +#' \describe{ +#' \item{GENEID}{Comma-separated gene IDs for transcripts sharing the exon} +#' \item{txNames}{Comma-separated transcript IDs that contain this exon} +#' \item{exonClass}{\code{"annotated"} if the exon coordinates exist in the +#' reference annotation, \code{"novel"} otherwise} +#' } +#' @details Counts (and other assays) are summed across all transcripts sharing +#' the same exon (identical seqnames, start, end, strand). CPM is recomputed +#' from the aggregated counts. \code{fullLengthCounts} and +#' \code{uniqueCounts} are summed where present. +#' @importFrom Matrix sparseMatrix +#' @importFrom SummarizedExperiment rowRanges rowData +#' @importFrom GenomicRanges GRanges seqnames start end strand +#' @importFrom IRanges IRanges +#' @importFrom dplyr left_join mutate group_by summarise filter distinct +#' @export +summariseByExon <- function(se) { + exonRanges <- unlist(rowRanges(se), use.names = TRUE) + txNames <- rownames(se) + + # Build data.frame of exon-transcript pairs + exonDf <- data.frame( + TXNAME = names(exonRanges), + seqnames = as.character(seqnames(exonRanges)), + start = start(exonRanges), + end = end(exonRanges), + strand = as.character(strand(exonRanges)), + stringsAsFactors = FALSE + ) |> + mutate(exon_id = paste(seqnames, start, end, strand, sep = ":")) + + # Attach GENEID and txClassDescription from rowData + geneDf <- as.data.frame(rowData(se))[, c("GENEID", "txClassDescription"), drop = FALSE] + geneDf$TXNAME <- rownames(se) + exonDf <- left_join(exonDf, geneDf, by = "TXNAME") + + # Collapse coordinates and GENEID/txNames per unique exon + exonMeta <- exonDf |> + group_by(exon_id) |> + summarise( + seqnames = seqnames[1], + start = start[1], + end = end[1], + strand = strand[1], + GENEID = paste(sort(unique(GENEID)), collapse = ","), + txNames = paste(sort(unique(TXNAME)), collapse = ","), + .groups = "drop" + ) + + # exonClass: "annotated" if the exon coordinates (seqnames, start, end, strand) + # exist in the reference annotation, "novel" otherwise. + annotatedCoords <- exonDf |> + filter(txClassDescription == "annotation") |> + distinct(seqnames, start, end, strand) |> + mutate(exonClass = "annotated") + + exonMeta <- exonMeta |> + left_join(annotatedCoords, by = c("seqnames", "start", "end", "strand")) |> + mutate(exonClass = ifelse(is.na(exonClass), "novel", "annotated")) + + # Sparse binary matrix: unique_exons x transcripts + uniqueExons <- exonMeta$exon_id + exonNames <- paste0("EX", seq_along(uniqueExons)) + aggregationMat <- sparseMatrix( + i = match(exonDf$exon_id, uniqueExons), + j = match(exonDf$TXNAME, txNames), + x = 1L, + dims = c(length(uniqueExons), length(txNames)), + dimnames = list(exonNames, txNames) + ) + + # Build output GRanges + outRanges <- GRanges( + seqnames = exonMeta$seqnames, + ranges = IRanges(start = exonMeta$start, end = exonMeta$end), + strand = exonMeta$strand + ) + names(outRanges) <- exonNames + mcols(outRanges)$GENEID <- exonMeta$GENEID + mcols(outRanges)$txNames <- exonMeta$txNames + mcols(outRanges)$exonClass <- exonMeta$exonClass + + .aggregateAssays(se, aggregationMat, outRanges) +} diff --git a/R/summariseExpression_utility.R b/R/summariseExpression_utility.R new file mode 100644 index 00000000..893cea39 --- /dev/null +++ b/R/summariseExpression_utility.R @@ -0,0 +1,31 @@ +# Core aggregation engine shared by summariseByExon (and any future summariseBy* functions). +# aggregationMat : sparse matrix (output_features x input_transcripts) +# outRanges : GRanges / GRangesList for the output rows +#' @importFrom SummarizedExperiment assays colData SummarizedExperiment +#' @noRd +.aggregateAssays <- function(se, aggregationMat, outRanges) { + counts <- aggregationMat %*% assays(se)$counts + + counts.total <- colSums(counts) + counts.total[counts.total == 0] <- 1 + cpm <- counts / counts.total * 10^6 + + outAssays <- SimpleList(counts = counts, CPM = cpm) + + for (nm in c("fullLengthCounts", "uniqueCounts")) { + if (nm %in% names(assays(se))) { + outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]] + } + } + + ColNames <- colnames(counts) + ColData <- colData(se) + ColData@rownames <- ColNames + ColData@listData$name <- ColNames + + SummarizedExperiment( + assays = outAssays, + rowRanges = outRanges, + colData = ColData + ) +} From eefbe1628c7d1967188154de1d0d73e8017a3230 Mon Sep 17 00:00:00 2001 From: ch99l Date: Mon, 20 Apr 2026 15:54:36 +0800 Subject: [PATCH 4/6] refactor:generalise summarisation by type --- R/summariseExpression.R | 183 ++++++++++++++++++++------------ R/summariseExpression_utility.R | 31 ------ 2 files changed, 113 insertions(+), 101 deletions(-) delete mode 100644 R/summariseExpression_utility.R diff --git a/R/summariseExpression.R b/R/summariseExpression.R index 3d5cf687..f9840edc 100644 --- a/R/summariseExpression.R +++ b/R/summariseExpression.R @@ -1,91 +1,134 @@ -#' Summarise transcript expression to exon-level expression -#' @title summarise by exon +#' Summarise transcript expression to a specified unit +#' @title summarise by expression #' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} -#' @return A \code{RangedSummarizedExperiment} with exon-level expression. -#' Rows are named \code{EX1, EX2, ...} in order of first appearance. -#' \code{rowRanges} is a \code{GRanges} with one entry per unique exon locus. -#' \code{mcols} columns: -#' \describe{ -#' \item{GENEID}{Comma-separated gene IDs for transcripts sharing the exon} -#' \item{txNames}{Comma-separated transcript IDs that contain this exon} -#' \item{exonClass}{\code{"annotated"} if the exon coordinates exist in the -#' reference annotation, \code{"novel"} otherwise} -#' } -#' @details Counts (and other assays) are summed across all transcripts sharing -#' the same exon (identical seqnames, start, end, strand). CPM is recomputed -#' from the aggregated counts. \code{fullLengthCounts} and -#' \code{uniqueCounts} are summed where present. +#' @param type character, the unit to summarise by. Currently supports +#' \code{"exon"} (default). +#' @return A \code{RangedSummarizedExperiment} with expression summarised to +#' the requested unit. Assays \code{counts} and \code{CPM} are always +#' present; \code{fullLengthCounts} and \code{uniqueCounts} are included +#' when present in the input. For \code{type = "exon"}, rows are named +#' \code{EX1, EX2, ...} and \code{rowRanges} mcols include \code{GENEID}, +#' \code{txNames}, and \code{exonClass} (\code{"annotated"} or +#' \code{"novel"}). +#' @details Counts are summed across all transcripts belonging to the same +#' group (e.g. identical exon locus). CPM is recomputed from the aggregated +#' counts. An exon is \code{"novel"} only if all contributing transcripts +#' have \code{novelTranscript = TRUE}. +#' @examples +#' se <- readRDS(system.file("extdata", +#' "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds", +#' package = "bambu" +#' )) +#' summariseByExpression(se, type = "exon") #' @importFrom Matrix sparseMatrix -#' @importFrom SummarizedExperiment rowRanges rowData +#' @importFrom S4Vectors SimpleList +#' @importFrom SummarizedExperiment rowRanges rowData assays colData SummarizedExperiment #' @importFrom GenomicRanges GRanges seqnames start end strand #' @importFrom IRanges IRanges -#' @importFrom dplyr left_join mutate group_by summarise filter distinct +#' @importFrom dplyr left_join mutate group_by summarise cur_group_id ungroup #' @export -summariseByExon <- function(se) { +summariseByExpression <- function(se, type = "exon") { + index <- switch(type, + exon = .buildExonIndex(se), + stop("Unsupported type: '", type, "'") + ) + + txNames <- rownames(se) + nGroups <- length(index$outRanges) + groupNames <- names(index$outRanges) + + aggregationMat <- sparseMatrix( + i = index$indexDf$group_idx, + j = match(index$indexDf$TXNAME, txNames), + x = 1L, + dims = c(nGroups, length(txNames)), + dimnames = list(groupNames, txNames) + ) + + counts <- aggregationMat %*% assays(se)$counts + + counts.total <- colSums(counts) + counts.total[counts.total == 0] <- 1 + cpm <- counts / counts.total * 10^6 + + outAssays <- SimpleList(counts = counts, CPM = cpm) + + for (nm in c("fullLengthCounts", "uniqueCounts")) { + if (nm %in% names(assays(se))) { + outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]] + } + } + + ColNames <- colnames(counts) + ColData <- colData(se) + ColData@rownames <- ColNames + ColData@listData$name <- ColNames + + SummarizedExperiment( + assays = outAssays, + rowRanges = index$outRanges, + colData = ColData + ) +} + +#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} +#' @return a data.frame with columns TXNAME, seqnames, start, end, strand, +#' GENEID, and novelTranscript — one row per exon-transcript pair. +#' @noRd +.buildFlatDf <- function(se) { exonRanges <- unlist(rowRanges(se), use.names = TRUE) - txNames <- rownames(se) + geneDf <- as.data.frame(rowData(se))[, c("GENEID", "novelTranscript"), drop = FALSE] + geneDf$TXNAME <- rownames(se) - # Build data.frame of exon-transcript pairs - exonDf <- data.frame( + data.frame( TXNAME = names(exonRanges), seqnames = as.character(seqnames(exonRanges)), start = start(exonRanges), end = end(exonRanges), strand = as.character(strand(exonRanges)), stringsAsFactors = FALSE - ) |> - mutate(exon_id = paste(seqnames, start, end, strand, sep = ":")) + ) %>% + left_join(geneDf, by = "TXNAME") +} - # Attach GENEID and txClassDescription from rowData - geneDf <- as.data.frame(rowData(se))[, c("GENEID", "txClassDescription"), drop = FALSE] - geneDf$TXNAME <- rownames(se) - exonDf <- left_join(exonDf, geneDf, by = "TXNAME") +#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} +#' @return a list with: +#' \code{indexDf} — data.frame with columns TXNAME and group_idx +#' (one row per exon-transcript pair); +#' \code{outRanges} — GRanges with one entry per unique exon locus, +#' named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass. +#' @noRd +.buildExonIndex <- function(se) { + flatDf <- .buildFlatDf(se) %>% + group_by(seqnames, start, end, strand) %>% + mutate(group_idx = cur_group_id()) %>% + ungroup() - # Collapse coordinates and GENEID/txNames per unique exon - exonMeta <- exonDf |> - group_by(exon_id) |> + groupMeta <- flatDf %>% + group_by(group_idx) %>% summarise( - seqnames = seqnames[1], - start = start[1], - end = end[1], - strand = strand[1], - GENEID = paste(sort(unique(GENEID)), collapse = ","), - txNames = paste(sort(unique(TXNAME)), collapse = ","), - .groups = "drop" + seqnames = seqnames[1], + start = start[1], + end = end[1], + strand = strand[1], + GENEID = paste(sort(unique(GENEID)), collapse = ","), + txNames = paste(sort(unique(TXNAME)), collapse = ","), + exonClass = ifelse(all(novelTranscript), "novel", "annotated"), + .groups = "drop" ) - # exonClass: "annotated" if the exon coordinates (seqnames, start, end, strand) - # exist in the reference annotation, "novel" otherwise. - annotatedCoords <- exonDf |> - filter(txClassDescription == "annotation") |> - distinct(seqnames, start, end, strand) |> - mutate(exonClass = "annotated") - - exonMeta <- exonMeta |> - left_join(annotatedCoords, by = c("seqnames", "start", "end", "strand")) |> - mutate(exonClass = ifelse(is.na(exonClass), "novel", "annotated")) - - # Sparse binary matrix: unique_exons x transcripts - uniqueExons <- exonMeta$exon_id - exonNames <- paste0("EX", seq_along(uniqueExons)) - aggregationMat <- sparseMatrix( - i = match(exonDf$exon_id, uniqueExons), - j = match(exonDf$TXNAME, txNames), - x = 1L, - dims = c(length(uniqueExons), length(txNames)), - dimnames = list(exonNames, txNames) - ) - - # Build output GRanges outRanges <- GRanges( - seqnames = exonMeta$seqnames, - ranges = IRanges(start = exonMeta$start, end = exonMeta$end), - strand = exonMeta$strand + seqnames = groupMeta$seqnames, + ranges = IRanges(start = groupMeta$start, end = groupMeta$end), + strand = groupMeta$strand ) - names(outRanges) <- exonNames - mcols(outRanges)$GENEID <- exonMeta$GENEID - mcols(outRanges)$txNames <- exonMeta$txNames - mcols(outRanges)$exonClass <- exonMeta$exonClass + names(outRanges) <- paste0("EX", seq_len(nrow(groupMeta))) + mcols(outRanges)$GENEID <- groupMeta$GENEID + mcols(outRanges)$txNames <- groupMeta$txNames + mcols(outRanges)$exonClass <- groupMeta$exonClass - .aggregateAssays(se, aggregationMat, outRanges) -} + list( + indexDf = flatDf[, c("TXNAME", "group_idx")], + outRanges = outRanges + ) +} \ No newline at end of file diff --git a/R/summariseExpression_utility.R b/R/summariseExpression_utility.R deleted file mode 100644 index 893cea39..00000000 --- a/R/summariseExpression_utility.R +++ /dev/null @@ -1,31 +0,0 @@ -# Core aggregation engine shared by summariseByExon (and any future summariseBy* functions). -# aggregationMat : sparse matrix (output_features x input_transcripts) -# outRanges : GRanges / GRangesList for the output rows -#' @importFrom SummarizedExperiment assays colData SummarizedExperiment -#' @noRd -.aggregateAssays <- function(se, aggregationMat, outRanges) { - counts <- aggregationMat %*% assays(se)$counts - - counts.total <- colSums(counts) - counts.total[counts.total == 0] <- 1 - cpm <- counts / counts.total * 10^6 - - outAssays <- SimpleList(counts = counts, CPM = cpm) - - for (nm in c("fullLengthCounts", "uniqueCounts")) { - if (nm %in% names(assays(se))) { - outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]] - } - } - - ColNames <- colnames(counts) - ColData <- colData(se) - ColData@rownames <- ColNames - ColData@listData$name <- ColNames - - SummarizedExperiment( - assays = outAssays, - rowRanges = outRanges, - colData = ColData - ) -} From 1039570d16b9ec51ee748617ce53c4f781d7efd7 Mon Sep 17 00:00:00 2001 From: ch99l Date: Wed, 22 Apr 2026 17:47:18 +0800 Subject: [PATCH 5/6] add gene summarisation and unit testing to summariseExpression --- R/summariseExpression.R | 110 ++++++++++++++-------- tests/testthat/test_summariseExpression.R | 57 +++++++++++ 2 files changed, 129 insertions(+), 38 deletions(-) create mode 100644 tests/testthat/test_summariseExpression.R diff --git a/R/summariseExpression.R b/R/summariseExpression.R index f9840edc..6174a9c1 100644 --- a/R/summariseExpression.R +++ b/R/summariseExpression.R @@ -2,34 +2,43 @@ #' @title summarise by expression #' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} #' @param type character, the unit to summarise by. Currently supports -#' \code{"exon"} (default). +#' \code{"exon"} (default) and \code{"gene"}. #' @return A \code{RangedSummarizedExperiment} with expression summarised to #' the requested unit. Assays \code{counts} and \code{CPM} are always #' present; \code{fullLengthCounts} and \code{uniqueCounts} are included #' when present in the input. For \code{type = "exon"}, rows are named #' \code{EX1, EX2, ...} and \code{rowRanges} mcols include \code{GENEID}, #' \code{txNames}, and \code{exonClass} (\code{"annotated"} or -#' \code{"novel"}). +#' \code{"novel"}). For \code{type = "gene"}, rows are named by GENEID and +#' \code{rowRanges} is a \code{GRangesList} of reduced exon ranges per gene; +#' mcols include \code{GENEID} and \code{newGeneClass}. #' @details Counts are summed across all transcripts belonging to the same -#' group (e.g. identical exon locus). CPM is recomputed from the aggregated -#' counts. An exon is \code{"novel"} only if all contributing transcripts -#' have \code{novelTranscript = TRUE}. +#' group (e.g. identical exon locus or gene). CPM is recomputed from the +#' aggregated counts. For \code{type = "exon"}, an exon is \code{"novel"} +#' only if all contributing transcripts have \code{novelTranscript = TRUE}. +#' For \code{type = "gene"}, \code{newGeneClass} is derived from +#' \code{txClassDescription}: Ensembl genes (GENEID matching \code{"ENSG"}) +#' are labelled \code{"annotation"}, others inherit the transcript class. +#' Incompatible and non-unique counts stored in \code{metadata(se)} are +#' added to the aggregated counts before CPM computation. #' @examples #' se <- readRDS(system.file("extdata", #' "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds", #' package = "bambu" #' )) #' summariseByExpression(se, type = "exon") -#' @importFrom Matrix sparseMatrix -#' @importFrom S4Vectors SimpleList +#' summariseByExpression(se, type = "gene") +#' @importFrom Matrix sparseMatrix Matrix +#' @importFrom S4Vectors SimpleList metadata #' @importFrom SummarizedExperiment rowRanges rowData assays colData SummarizedExperiment -#' @importFrom GenomicRanges GRanges seqnames start end strand +#' @importFrom GenomicRanges GRanges seqnames start end strand mcols #' @importFrom IRanges IRanges #' @importFrom dplyr left_join mutate group_by summarise cur_group_id ungroup #' @export summariseByExpression <- function(se, type = "exon") { index <- switch(type, - exon = .buildExonIndex(se), + exon = buildExonIndex(se), + gene = buildGeneIndex(se), stop("Unsupported type: '", type, "'") ) @@ -47,40 +56,47 @@ summariseByExpression <- function(se, type = "exon") { counts <- aggregationMat %*% assays(se)$counts + if (type == "gene" && !is.null(metadata(se)$incompatibleCounts)) { + incompat <- metadata(se)$incompatibleCounts + if ("nonuniqueCounts" %in% names(metadata(se))) + incompat <- incompat + metadata(se)$nonuniqueCounts + incompat <- Matrix( + incompat[match(rownames(counts), rownames(incompat)), ], + sparse = TRUE + ) + counts <- counts + incompat + } + counts.total <- colSums(counts) counts.total[counts.total == 0] <- 1 cpm <- counts / counts.total * 10^6 outAssays <- SimpleList(counts = counts, CPM = cpm) - for (nm in c("fullLengthCounts", "uniqueCounts")) { - if (nm %in% names(assays(se))) { - outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]] + for (i in c("fullLengthCounts", "uniqueCounts")) { + if (i %in% names(assays(se))) { + outAssays[[i]] <- aggregationMat %*% assays(se)[[i]] } } - ColNames <- colnames(counts) - ColData <- colData(se) - ColData@rownames <- ColNames - ColData@listData$name <- ColNames - SummarizedExperiment( assays = outAssays, rowRanges = index$outRanges, - colData = ColData + colData = colData(se) ) } #' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} -#' @return a data.frame with columns TXNAME, seqnames, start, end, strand, -#' GENEID, and novelTranscript — one row per exon-transcript pair. +#' @return a list with: +#' \code{indexDf} — data.frame with columns TXNAME and group_idx +#' (one row per exon-transcript pair); +#' \code{outRanges} — GRanges with one entry per unique exon locus, +#' named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass. #' @noRd -.buildFlatDf <- function(se) { +buildExonIndex <- function(se) { + txDf <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "novelTranscript"), drop = FALSE] exonRanges <- unlist(rowRanges(se), use.names = TRUE) - geneDf <- as.data.frame(rowData(se))[, c("GENEID", "novelTranscript"), drop = FALSE] - geneDf$TXNAME <- rownames(se) - - data.frame( + flatDf <- data.frame( TXNAME = names(exonRanges), seqnames = as.character(seqnames(exonRanges)), start = start(exonRanges), @@ -88,18 +104,7 @@ summariseByExpression <- function(se, type = "exon") { strand = as.character(strand(exonRanges)), stringsAsFactors = FALSE ) %>% - left_join(geneDf, by = "TXNAME") -} - -#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} -#' @return a list with: -#' \code{indexDf} — data.frame with columns TXNAME and group_idx -#' (one row per exon-transcript pair); -#' \code{outRanges} — GRanges with one entry per unique exon locus, -#' named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass. -#' @noRd -.buildExonIndex <- function(se) { - flatDf <- .buildFlatDf(se) %>% + left_join(txDf, by = "TXNAME") %>% group_by(seqnames, start, end, strand) %>% mutate(group_idx = cur_group_id()) %>% ungroup() @@ -131,4 +136,33 @@ summariseByExpression <- function(se, type = "exon") { indexDf = flatDf[, c("TXNAME", "group_idx")], outRanges = outRanges ) -} \ No newline at end of file +} + +#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}} +#' @return a list with: +#' \code{indexDf} — data.frame with columns TXNAME and group_idx +#' (one row per transcript); +#' \code{outRanges} — GRangesList with one element per gene, named by +#' GENEID, with mcols GENEID and newGeneClass. +#' @noRd +buildGeneIndex <- function(se) { + rd <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "txClassDescription"), drop = FALSE] %>% + mutate(group_idx = match(GENEID, unique(GENEID))) + + groupMeta <- rd %>% + group_by(group_idx) %>% + summarise( + GENEID = GENEID[1], + newGeneClass = ifelse(grepl("ENSG", GENEID[1]), "annotation", unique(txClassDescription)[1]), + .groups = "drop" + ) + + outRanges <- reducedRangesByGenes(rowRanges(se))[groupMeta$GENEID] + mcols(outRanges)$GENEID <- groupMeta$GENEID + mcols(outRanges)$newGeneClass <- groupMeta$newGeneClass + + list( + indexDf = rd[, c("TXNAME", "group_idx")], + outRanges = outRanges + ) +} diff --git a/tests/testthat/test_summariseExpression.R b/tests/testthat/test_summariseExpression.R new file mode 100644 index 00000000..e77b22c2 --- /dev/null +++ b/tests/testthat/test_summariseExpression.R @@ -0,0 +1,57 @@ +context("Summarise expression") + +test_that("summariseByExpression correctly reduces transcript counts to exon and gene counts", { + # TX1: exons at 1-100, 200-300 -> GENE1, count = 10 + # TX2: exons at 1-100, 400-500 -> GENE1, shares first exon with TX1, count = 5 + # TX3: exon at 600-700 -> GENE2, count = 8 + # + # Expected exon counts: + # chr1:1-100 (TX1 + TX2) -> 15 + # chr1:200-300 (TX1 only) -> 10 + # chr1:400-500 (TX2 only) -> 5 + # chr1:600-700 (TX3 only) -> 8 + # + # Expected gene counts: + # GENE1 (TX1 + TX2) -> 15 + # GENE2 (TX3 only) -> 8 + + txRanges <- GRangesList( + TX1 = GRanges("chr1", IRanges(c(1, 200), c(100, 300)), strand = "+"), + TX2 = GRanges("chr1", IRanges(c(1, 400), c(100, 500)), strand = "+"), + TX3 = GRanges("chr1", IRanges(600, 700), strand = "+") + ) + mcols(txRanges)$TXNAME <- c("TX1", "TX2", "TX3") + mcols(txRanges)$GENEID <- c("GENE1", "GENE1", "GENE2") + mcols(txRanges)$novelTranscript <- c(FALSE, FALSE, FALSE) + mcols(txRanges)$txClassDescription <- c("annotation", "annotation", "annotation") + + counts <- matrix(c(10, 5, 8), nrow = 3, ncol = 1, + dimnames = list(c("TX1", "TX2", "TX3"), "sample1")) + + se <- SummarizedExperiment( + assays = list(counts = counts), + rowRanges = txRanges + ) + + # exon level + seExon <- summariseByExpression(se, type = "exon") + exonCounts <- as.matrix(assays(seExon)$counts) + exonRanges <- rowRanges(seExon) + + shared <- which(start(exonRanges) == 1 & end(exonRanges) == 100) + tx1only <- which(start(exonRanges) == 200 & end(exonRanges) == 300) + tx2only <- which(start(exonRanges) == 400 & end(exonRanges) == 500) + tx3only <- which(start(exonRanges) == 600 & end(exonRanges) == 700) + + expect_equal(exonCounts[shared, 1], 15) + expect_equal(exonCounts[tx1only, 1], 10) + expect_equal(exonCounts[tx2only, 1], 5) + expect_equal(exonCounts[tx3only, 1], 8) + + # gene level + seGene <- summariseByExpression(se, type = "gene") + geneCounts <- as.matrix(assays(seGene)$counts) + + expect_equal(geneCounts["GENE1", 1], 15) + expect_equal(geneCounts["GENE2", 1], 8) +}) From 1d99a73ebbb76007a038036fa437e1f60323d582 Mon Sep 17 00:00:00 2001 From: ch99l Date: Wed, 22 Apr 2026 17:59:22 +0800 Subject: [PATCH 6/6] add comments --- R/summariseExpression.R | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/R/summariseExpression.R b/R/summariseExpression.R index 6174a9c1..05597890 100644 --- a/R/summariseExpression.R +++ b/R/summariseExpression.R @@ -36,12 +36,15 @@ #' @importFrom dplyr left_join mutate group_by summarise cur_group_id ungroup #' @export summariseByExpression <- function(se, type = "exon") { + # build the grouping index and output ranges for the requested type index <- switch(type, exon = buildExonIndex(se), gene = buildGeneIndex(se), stop("Unsupported type: '", type, "'") ) + # sparse aggregation matrix: rows = groups, cols = transcripts + # multiplying by transcript counts gives group-level counts txNames <- rownames(se) nGroups <- length(index$outRanges) groupNames <- names(index$outRanges) @@ -54,8 +57,11 @@ summariseByExpression <- function(se, type = "exon") { dimnames = list(groupNames, txNames) ) + # aggregate transcript counts to group level counts <- aggregationMat %*% assays(se)$counts + # for gene level, add reads that could not be assigned to any transcript + # but were localised to a gene, stored in metadata by bambu if (type == "gene" && !is.null(metadata(se)$incompatibleCounts)) { incompat <- metadata(se)$incompatibleCounts if ("nonuniqueCounts" %in% names(metadata(se))) @@ -67,12 +73,13 @@ summariseByExpression <- function(se, type = "exon") { counts <- counts + incompat } + # compute CPM from aggregated counts counts.total <- colSums(counts) counts.total[counts.total == 0] <- 1 cpm <- counts / counts.total * 10^6 + # aggregate optional assays if present outAssays <- SimpleList(counts = counts, CPM = cpm) - for (i in c("fullLengthCounts", "uniqueCounts")) { if (i %in% names(assays(se))) { outAssays[[i]] <- aggregationMat %*% assays(se)[[i]] @@ -94,6 +101,7 @@ summariseByExpression <- function(se, type = "exon") { #' named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass. #' @noRd buildExonIndex <- function(se) { + # one row per exon-transcript pair, joined with transcript metadata txDf <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "novelTranscript"), drop = FALSE] exonRanges <- unlist(rowRanges(se), use.names = TRUE) flatDf <- data.frame( @@ -105,10 +113,12 @@ buildExonIndex <- function(se) { stringsAsFactors = FALSE ) %>% left_join(txDf, by = "TXNAME") %>% + # assign each unique exon locus a group index group_by(seqnames, start, end, strand) %>% mutate(group_idx = cur_group_id()) %>% ungroup() + # one row per unique exon locus with summarised metadata groupMeta <- flatDf %>% group_by(group_idx) %>% summarise( @@ -122,6 +132,7 @@ buildExonIndex <- function(se) { .groups = "drop" ) + # build output GRanges with exon metadata outRanges <- GRanges( seqnames = groupMeta$seqnames, ranges = IRanges(start = groupMeta$start, end = groupMeta$end), @@ -146,9 +157,11 @@ buildExonIndex <- function(se) { #' GENEID, with mcols GENEID and newGeneClass. #' @noRd buildGeneIndex <- function(se) { + # one row per transcript; group_idx preserves first-appearance order of GENEIDs rd <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "txClassDescription"), drop = FALSE] %>% mutate(group_idx = match(GENEID, unique(GENEID))) + # one row per gene with class label derived from txClassDescription groupMeta <- rd %>% group_by(group_idx) %>% summarise( @@ -157,6 +170,7 @@ buildGeneIndex <- function(se) { .groups = "drop" ) + # build output GRangesList of reduced exon ranges per gene outRanges <- reducedRangesByGenes(rowRanges(se))[groupMeta$GENEID] mcols(outRanges)$GENEID <- groupMeta$GENEID mcols(outRanges)$newGeneClass <- groupMeta$newGeneClass