From b4277caecdd984bd2b6a7edc56be3ead3f007ed8 Mon Sep 17 00:00:00 2001
From: ch99l <chinhaolee10@gmail.com>
Date: Tue, 14 Apr 2026 15:19:19 +0800
Subject: [PATCH 1/6] Add summariseByExon function

---
 R/summariseByExon.R | 76 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 R/summariseByExon.R

diff --git a/R/summariseByExon.R b/R/summariseByExon.R
new file mode 100644
index 00000000..e04c4435
--- /dev/null
+++ b/R/summariseByExon.R
@@ -0,0 +1,76 @@
+#' Summarise transcript expression to exon-level expression
+#' @title summarise by exon
+#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
+#' @return A data.table with columns: exon_id, seqnames, start, end, strand,
+#'   GENEID, and one count column per sample
+#' @details Counts are summed across all transcripts that share the same exon
+#'   (defined by identical seqnames, start, end, and strand). The returned
+#'   counts therefore represent the total evidence attributed to each unique
+#'   exonic locus across all overlapping transcripts.
+#' @import data.table
+#' @importFrom Matrix sparseMatrix
+#' @importFrom SummarizedExperiment assays rowRanges rowData
+#' @importFrom GenomicRanges seqnames start end strand
+#' @export
+summariseByExon <- function(se) {
+    # Unlist GRangesList: one row per exon-transcript combination
+    exonRanges <- unlist(rowRanges(se), use.names = TRUE)
+
+    # Build a data.table of exon-transcript pairs
+    txNames <- rownames(se)
+    exonDt <- data.table(
+        TXNAME   = names(exonRanges),
+        seqnames = as.character(seqnames(exonRanges)),
+        start    = start(exonRanges),
+        end      = end(exonRanges),
+        strand   = as.character(strand(exonRanges))
+    )
+
+    # Unique exon key: seqnames:start:end:strand
+    exonDt[, exon_id := paste(seqnames, start, end, strand, sep = ":")]
+
+    # Attach GENEID from rowData
+    geneDt <- data.table(
+        TXNAME = rownames(se),
+        GENEID = rowData(se)$GENEID
+    )
+    exonDt <- geneDt[exonDt, on = "TXNAME"]
+
+    # Collapse metadata per unique exon
+    exonMeta <- exonDt[, .(
+        seqnames = seqnames[1],
+        start    = start[1],
+        end      = end[1],
+        strand   = strand[1],
+        GENEID   = paste(sort(unique(GENEID)), collapse = ",")
+    ), by = exon_id]
+
+    # Build sparse binary matrix: unique_exons x transcripts
+    # entry [i, j] = 1 if transcript j contains unique exon i
+    uniqueExons <- exonMeta$exon_id
+    exonIdx <- match(exonDt$exon_id, uniqueExons)
+    txIdx   <- match(exonDt$TXNAME,  txNames)
+
+    exonTxMat <- sparseMatrix(
+        i    = exonIdx,
+        j    = txIdx,
+        x    = 1L,
+        dims = c(length(uniqueExons), length(txNames)),
+        dimnames = list(uniqueExons, txNames)
+    )
+
+    # Aggregate counts: unique_exons x samples
+    txCounts   <- assays(se)$counts
+    exonCounts <- exonTxMat %*% txCounts
+
+    # Combine metadata with aggregated counts
+    result <- cbind(
+        exonMeta,
+        as.data.table(as.matrix(exonCounts))
+    )
+
+    # Sort by genomic position
+    result <- result[order(seqnames, start, end)]
+
+    return(result)
+}

From dbe2af974b6e97dd3d79df8ce8a01f07af51f28c Mon Sep 17 00:00:00 2001
From: ch99l <chinhaolee10@gmail.com>
Date: Tue, 14 Apr 2026 15:48:50 +0800
Subject: [PATCH 2/6] Return exon-level SummarisedExperiment instead of
 data.table

---
 R/summariseByExon.R | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/R/summariseByExon.R b/R/summariseByExon.R
index e04c4435..cfefd833 100644
--- a/R/summariseByExon.R
+++ b/R/summariseByExon.R
@@ -1,16 +1,16 @@
 #' Summarise transcript expression to exon-level expression
 #' @title summarise by exon
 #' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
-#' @return A data.table with columns: exon_id, seqnames, start, end, strand,
-#'   GENEID, and one count column per sample
+#' @return A \code{RangedSummarizedExperiment} with exon-level counts
 #' @details Counts are summed across all transcripts that share the same exon
 #'   (defined by identical seqnames, start, end, and strand). The returned
 #'   counts therefore represent the total evidence attributed to each unique
 #'   exonic locus across all overlapping transcripts.
 #' @import data.table
 #' @importFrom Matrix sparseMatrix
-#' @importFrom SummarizedExperiment assays rowRanges rowData
-#' @importFrom GenomicRanges seqnames start end strand
+#' @importFrom SummarizedExperiment assays rowRanges rowData colData SummarizedExperiment
+#' @importFrom GenomicRanges GRanges seqnames start end strand
+#' @importFrom IRanges IRanges
 #' @export
 summariseByExon <- function(se) {
     # Unlist GRangesList: one row per exon-transcript combination
@@ -63,14 +63,19 @@ summariseByExon <- function(se) {
     txCounts   <- assays(se)$counts
     exonCounts <- exonTxMat %*% txCounts
 
-    # Combine metadata with aggregated counts
-    result <- cbind(
-        exonMeta,
-        as.data.table(as.matrix(exonCounts))
+    # Build GRanges for unique exons
+    exonGRanges <- GRanges(
+        seqnames = exonMeta$seqnames,
+        ranges   = IRanges(start = exonMeta$start, end = exonMeta$end),
+        strand   = exonMeta$strand
     )
+    names(exonGRanges) <- exonMeta$exon_id
+    mcols(exonGRanges)$GENEID <- exonMeta$GENEID
 
-    # Sort by genomic position
-    result <- result[order(seqnames, start, end)]
-
-    return(result)
+    # Return as SummarizedExperiment
+    return(SummarizedExperiment(
+        assays   = list(counts = exonCounts),
+        rowRanges = exonGRanges,
+        colData  = colData(se)
+    ))
 }

From 16648135076aba5a9bd33ceee657cb5a2a1e92fe Mon Sep 17 00:00:00 2001
From: ch99l <chinhaolee10@gmail.com>
Date: Fri, 17 Apr 2026 15:35:15 +0800
Subject: [PATCH 3/6] refactor summarisebyExon

---
 R/summariseByExon.R             | 81 -----------------------------
 R/summariseExpression.R         | 91 +++++++++++++++++++++++++++++++++
 R/summariseExpression_utility.R | 31 +++++++++++
 3 files changed, 122 insertions(+), 81 deletions(-)
 delete mode 100644 R/summariseByExon.R
 create mode 100644 R/summariseExpression.R
 create mode 100644 R/summariseExpression_utility.R

diff --git a/R/summariseByExon.R b/R/summariseByExon.R
deleted file mode 100644
index cfefd833..00000000
--- a/R/summariseByExon.R
+++ /dev/null
@@ -1,81 +0,0 @@
-#' Summarise transcript expression to exon-level expression
-#' @title summarise by exon
-#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
-#' @return A \code{RangedSummarizedExperiment} with exon-level counts
-#' @details Counts are summed across all transcripts that share the same exon
-#'   (defined by identical seqnames, start, end, and strand). The returned
-#'   counts therefore represent the total evidence attributed to each unique
-#'   exonic locus across all overlapping transcripts.
-#' @import data.table
-#' @importFrom Matrix sparseMatrix
-#' @importFrom SummarizedExperiment assays rowRanges rowData colData SummarizedExperiment
-#' @importFrom GenomicRanges GRanges seqnames start end strand
-#' @importFrom IRanges IRanges
-#' @export
-summariseByExon <- function(se) {
-    # Unlist GRangesList: one row per exon-transcript combination
-    exonRanges <- unlist(rowRanges(se), use.names = TRUE)
-
-    # Build a data.table of exon-transcript pairs
-    txNames <- rownames(se)
-    exonDt <- data.table(
-        TXNAME   = names(exonRanges),
-        seqnames = as.character(seqnames(exonRanges)),
-        start    = start(exonRanges),
-        end      = end(exonRanges),
-        strand   = as.character(strand(exonRanges))
-    )
-
-    # Unique exon key: seqnames:start:end:strand
-    exonDt[, exon_id := paste(seqnames, start, end, strand, sep = ":")]
-
-    # Attach GENEID from rowData
-    geneDt <- data.table(
-        TXNAME = rownames(se),
-        GENEID = rowData(se)$GENEID
-    )
-    exonDt <- geneDt[exonDt, on = "TXNAME"]
-
-    # Collapse metadata per unique exon
-    exonMeta <- exonDt[, .(
-        seqnames = seqnames[1],
-        start    = start[1],
-        end      = end[1],
-        strand   = strand[1],
-        GENEID   = paste(sort(unique(GENEID)), collapse = ",")
-    ), by = exon_id]
-
-    # Build sparse binary matrix: unique_exons x transcripts
-    # entry [i, j] = 1 if transcript j contains unique exon i
-    uniqueExons <- exonMeta$exon_id
-    exonIdx <- match(exonDt$exon_id, uniqueExons)
-    txIdx   <- match(exonDt$TXNAME,  txNames)
-
-    exonTxMat <- sparseMatrix(
-        i    = exonIdx,
-        j    = txIdx,
-        x    = 1L,
-        dims = c(length(uniqueExons), length(txNames)),
-        dimnames = list(uniqueExons, txNames)
-    )
-
-    # Aggregate counts: unique_exons x samples
-    txCounts   <- assays(se)$counts
-    exonCounts <- exonTxMat %*% txCounts
-
-    # Build GRanges for unique exons
-    exonGRanges <- GRanges(
-        seqnames = exonMeta$seqnames,
-        ranges   = IRanges(start = exonMeta$start, end = exonMeta$end),
-        strand   = exonMeta$strand
-    )
-    names(exonGRanges) <- exonMeta$exon_id
-    mcols(exonGRanges)$GENEID <- exonMeta$GENEID
-
-    # Return as SummarizedExperiment
-    return(SummarizedExperiment(
-        assays   = list(counts = exonCounts),
-        rowRanges = exonGRanges,
-        colData  = colData(se)
-    ))
-}
diff --git a/R/summariseExpression.R b/R/summariseExpression.R
new file mode 100644
index 00000000..3d5cf687
--- /dev/null
+++ b/R/summariseExpression.R
@@ -0,0 +1,91 @@
+#' Summarise transcript expression to exon-level expression
+#' @title summarise by exon
+#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
+#' @return A \code{RangedSummarizedExperiment} with exon-level expression.
+#'   Rows are named \code{EX1, EX2, ...} in order of first appearance.
+#'   \code{rowRanges} is a \code{GRanges} with one entry per unique exon locus.
+#'   \code{mcols} columns:
+#'   \describe{
+#'     \item{GENEID}{Comma-separated gene IDs for transcripts sharing the exon}
+#'     \item{txNames}{Comma-separated transcript IDs that contain this exon}
+#'     \item{exonClass}{\code{"annotated"} if the exon coordinates exist in the
+#'       reference annotation, \code{"novel"} otherwise}
+#'   }
+#' @details Counts (and other assays) are summed across all transcripts sharing
+#'   the same exon (identical seqnames, start, end, strand). CPM is recomputed
+#'   from the aggregated counts. \code{fullLengthCounts} and
+#'   \code{uniqueCounts} are summed where present.
+#' @importFrom Matrix sparseMatrix
+#' @importFrom SummarizedExperiment rowRanges rowData
+#' @importFrom GenomicRanges GRanges seqnames start end strand
+#' @importFrom IRanges IRanges
+#' @importFrom dplyr left_join mutate group_by summarise filter distinct
+#' @export
+summariseByExon <- function(se) {
+    exonRanges <- unlist(rowRanges(se), use.names = TRUE)
+    txNames    <- rownames(se)
+
+    # Build data.frame of exon-transcript pairs
+    exonDf <- data.frame(
+        TXNAME   = names(exonRanges),
+        seqnames = as.character(seqnames(exonRanges)),
+        start    = start(exonRanges),
+        end      = end(exonRanges),
+        strand   = as.character(strand(exonRanges)),
+        stringsAsFactors = FALSE
+    ) |>
+        mutate(exon_id = paste(seqnames, start, end, strand, sep = ":"))
+
+    # Attach GENEID and txClassDescription from rowData
+    geneDf <- as.data.frame(rowData(se))[, c("GENEID", "txClassDescription"), drop = FALSE]
+    geneDf$TXNAME <- rownames(se)
+    exonDf <- left_join(exonDf, geneDf, by = "TXNAME")
+
+    # Collapse coordinates and GENEID/txNames per unique exon
+    exonMeta <- exonDf |>
+        group_by(exon_id) |>
+        summarise(
+            seqnames = seqnames[1],
+            start    = start[1],
+            end      = end[1],
+            strand   = strand[1],
+            GENEID   = paste(sort(unique(GENEID)),  collapse = ","),
+            txNames  = paste(sort(unique(TXNAME)),  collapse = ","),
+            .groups  = "drop"
+        )
+
+    # exonClass: "annotated" if the exon coordinates (seqnames, start, end, strand)
+    # exist in the reference annotation, "novel" otherwise.
+    annotatedCoords <- exonDf |>
+        filter(txClassDescription == "annotation") |>
+        distinct(seqnames, start, end, strand) |>
+        mutate(exonClass = "annotated")
+
+    exonMeta <- exonMeta |>
+        left_join(annotatedCoords, by = c("seqnames", "start", "end", "strand")) |>
+        mutate(exonClass = ifelse(is.na(exonClass), "novel", "annotated"))
+
+    # Sparse binary matrix: unique_exons x transcripts
+    uniqueExons <- exonMeta$exon_id
+    exonNames   <- paste0("EX", seq_along(uniqueExons))
+    aggregationMat <- sparseMatrix(
+        i    = match(exonDf$exon_id, uniqueExons),
+        j    = match(exonDf$TXNAME,  txNames),
+        x    = 1L,
+        dims = c(length(uniqueExons), length(txNames)),
+        dimnames = list(exonNames, txNames)
+    )
+
+    # Build output GRanges
+    outRanges <- GRanges(
+        seqnames = exonMeta$seqnames,
+        ranges   = IRanges(start = exonMeta$start, end = exonMeta$end),
+        strand   = exonMeta$strand
+    )
+    names(outRanges) <- exonNames
+    mcols(outRanges)$GENEID  <- exonMeta$GENEID
+    mcols(outRanges)$txNames <- exonMeta$txNames
+    mcols(outRanges)$exonClass <- exonMeta$exonClass
+
+    .aggregateAssays(se, aggregationMat, outRanges)
+}
diff --git a/R/summariseExpression_utility.R b/R/summariseExpression_utility.R
new file mode 100644
index 00000000..893cea39
--- /dev/null
+++ b/R/summariseExpression_utility.R
@@ -0,0 +1,31 @@
+# Core aggregation engine shared by summariseByExon (and any future summariseBy* functions).
+# aggregationMat : sparse matrix (output_features x input_transcripts)
+# outRanges      : GRanges / GRangesList for the output rows
+#' @importFrom SummarizedExperiment assays colData SummarizedExperiment
+#' @noRd
+.aggregateAssays <- function(se, aggregationMat, outRanges) {
+    counts <- aggregationMat %*% assays(se)$counts
+
+    counts.total <- colSums(counts)
+    counts.total[counts.total == 0] <- 1
+    cpm <- counts / counts.total * 10^6
+
+    outAssays <- SimpleList(counts = counts, CPM = cpm)
+
+    for (nm in c("fullLengthCounts", "uniqueCounts")) {
+        if (nm %in% names(assays(se))) {
+            outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]]
+        }
+    }
+
+    ColNames <- colnames(counts)
+    ColData  <- colData(se)
+    ColData@rownames       <- ColNames
+    ColData@listData$name  <- ColNames
+
+    SummarizedExperiment(
+        assays    = outAssays,
+        rowRanges = outRanges,
+        colData   = ColData
+    )
+}

From eefbe1628c7d1967188154de1d0d73e8017a3230 Mon Sep 17 00:00:00 2001
From: ch99l <chinhaolee10@gmail.com>
Date: Mon, 20 Apr 2026 15:54:36 +0800
Subject: [PATCH 4/6] refactor:generalise summarisation by type

---
 R/summariseExpression.R         | 183 ++++++++++++++++++++------------
 R/summariseExpression_utility.R |  31 ------
 2 files changed, 113 insertions(+), 101 deletions(-)
 delete mode 100644 R/summariseExpression_utility.R

diff --git a/R/summariseExpression.R b/R/summariseExpression.R
index 3d5cf687..f9840edc 100644
--- a/R/summariseExpression.R
+++ b/R/summariseExpression.R
@@ -1,91 +1,134 @@
-#' Summarise transcript expression to exon-level expression
-#' @title summarise by exon
+#' Summarise transcript expression to a specified unit
+#' @title summarise by expression
 #' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
-#' @return A \code{RangedSummarizedExperiment} with exon-level expression.
-#'   Rows are named \code{EX1, EX2, ...} in order of first appearance.
-#'   \code{rowRanges} is a \code{GRanges} with one entry per unique exon locus.
-#'   \code{mcols} columns:
-#'   \describe{
-#'     \item{GENEID}{Comma-separated gene IDs for transcripts sharing the exon}
-#'     \item{txNames}{Comma-separated transcript IDs that contain this exon}
-#'     \item{exonClass}{\code{"annotated"} if the exon coordinates exist in the
-#'       reference annotation, \code{"novel"} otherwise}
-#'   }
-#' @details Counts (and other assays) are summed across all transcripts sharing
-#'   the same exon (identical seqnames, start, end, strand). CPM is recomputed
-#'   from the aggregated counts. \code{fullLengthCounts} and
-#'   \code{uniqueCounts} are summed where present.
+#' @param type character, the unit to summarise by. Currently supports
+#'   \code{"exon"} (default).
+#' @return A \code{RangedSummarizedExperiment} with expression summarised to
+#'   the requested unit. Assays \code{counts} and \code{CPM} are always
+#'   present; \code{fullLengthCounts} and \code{uniqueCounts} are included
+#'   when present in the input. For \code{type = "exon"}, rows are named
+#'   \code{EX1, EX2, ...} and \code{rowRanges} mcols include \code{GENEID},
+#'   \code{txNames}, and \code{exonClass} (\code{"annotated"} or
+#'   \code{"novel"}).
+#' @details Counts are summed across all transcripts belonging to the same
+#'   group (e.g. identical exon locus). CPM is recomputed from the aggregated
+#'   counts. An exon is \code{"novel"} only if all contributing transcripts
+#'   have \code{novelTranscript = TRUE}.
+#' @examples
+#' se <- readRDS(system.file("extdata",
+#'     "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds",
+#'     package = "bambu"
+#' ))
+#' summariseByExpression(se, type = "exon")
 #' @importFrom Matrix sparseMatrix
-#' @importFrom SummarizedExperiment rowRanges rowData
+#' @importFrom S4Vectors SimpleList
+#' @importFrom SummarizedExperiment rowRanges rowData assays colData SummarizedExperiment
 #' @importFrom GenomicRanges GRanges seqnames start end strand
 #' @importFrom IRanges IRanges
-#' @importFrom dplyr left_join mutate group_by summarise filter distinct
+#' @importFrom dplyr left_join mutate group_by summarise cur_group_id ungroup
 #' @export
-summariseByExon <- function(se) {
+summariseByExpression <- function(se, type = "exon") {
+    index <- switch(type,
+        exon = .buildExonIndex(se),
+        stop("Unsupported type: '", type, "'")
+    )
+
+    txNames <- rownames(se)
+    nGroups <- length(index$outRanges)
+    groupNames <- names(index$outRanges)
+
+    aggregationMat <- sparseMatrix(
+        i    = index$indexDf$group_idx,
+        j    = match(index$indexDf$TXNAME, txNames),
+        x    = 1L,
+        dims = c(nGroups, length(txNames)),
+        dimnames = list(groupNames, txNames)
+    )
+
+    counts <- aggregationMat %*% assays(se)$counts
+
+    counts.total <- colSums(counts)
+    counts.total[counts.total == 0] <- 1
+    cpm <- counts / counts.total * 10^6
+
+    outAssays <- SimpleList(counts = counts, CPM = cpm)
+
+    for (nm in c("fullLengthCounts", "uniqueCounts")) {
+        if (nm %in% names(assays(se))) {
+            outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]]
+        }
+    }
+
+    ColNames <- colnames(counts)
+    ColData  <- colData(se)
+    ColData@rownames      <- ColNames
+    ColData@listData$name <- ColNames
+
+    SummarizedExperiment(
+        assays    = outAssays,
+        rowRanges = index$outRanges,
+        colData   = ColData
+    )
+}
+
+#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
+#' @return a data.frame with columns TXNAME, seqnames, start, end, strand,
+#'   GENEID, and novelTranscript — one row per exon-transcript pair.
+#' @noRd
+.buildFlatDf <- function(se) {
     exonRanges <- unlist(rowRanges(se), use.names = TRUE)
-    txNames    <- rownames(se)
+    geneDf <- as.data.frame(rowData(se))[, c("GENEID", "novelTranscript"), drop = FALSE]
+    geneDf$TXNAME <- rownames(se)
 
-    # Build data.frame of exon-transcript pairs
-    exonDf <- data.frame(
+    data.frame(
         TXNAME   = names(exonRanges),
         seqnames = as.character(seqnames(exonRanges)),
         start    = start(exonRanges),
         end      = end(exonRanges),
         strand   = as.character(strand(exonRanges)),
         stringsAsFactors = FALSE
-    ) |>
-        mutate(exon_id = paste(seqnames, start, end, strand, sep = ":"))
+    ) %>%
+        left_join(geneDf, by = "TXNAME")
+}
 
-    # Attach GENEID and txClassDescription from rowData
-    geneDf <- as.data.frame(rowData(se))[, c("GENEID", "txClassDescription"), drop = FALSE]
-    geneDf$TXNAME <- rownames(se)
-    exonDf <- left_join(exonDf, geneDf, by = "TXNAME")
+#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
+#' @return a list with:
+#'   \code{indexDf} — data.frame with columns TXNAME and group_idx
+#'     (one row per exon-transcript pair);
+#'   \code{outRanges} — GRanges with one entry per unique exon locus,
+#'     named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass.
+#' @noRd
+.buildExonIndex <- function(se) {
+    flatDf <- .buildFlatDf(se) %>%
+        group_by(seqnames, start, end, strand) %>%
+        mutate(group_idx = cur_group_id()) %>%
+        ungroup()
 
-    # Collapse coordinates and GENEID/txNames per unique exon
-    exonMeta <- exonDf |>
-        group_by(exon_id) |>
+    groupMeta <- flatDf %>%
+        group_by(group_idx) %>%
         summarise(
-            seqnames = seqnames[1],
-            start    = start[1],
-            end      = end[1],
-            strand   = strand[1],
-            GENEID   = paste(sort(unique(GENEID)),  collapse = ","),
-            txNames  = paste(sort(unique(TXNAME)),  collapse = ","),
-            .groups  = "drop"
+            seqnames  = seqnames[1],
+            start     = start[1],
+            end       = end[1],
+            strand    = strand[1],
+            GENEID    = paste(sort(unique(GENEID)), collapse = ","),
+            txNames   = paste(sort(unique(TXNAME)), collapse = ","),
+            exonClass = ifelse(all(novelTranscript), "novel", "annotated"),
+            .groups   = "drop"
         )
 
-    # exonClass: "annotated" if the exon coordinates (seqnames, start, end, strand)
-    # exist in the reference annotation, "novel" otherwise.
-    annotatedCoords <- exonDf |>
-        filter(txClassDescription == "annotation") |>
-        distinct(seqnames, start, end, strand) |>
-        mutate(exonClass = "annotated")
-
-    exonMeta <- exonMeta |>
-        left_join(annotatedCoords, by = c("seqnames", "start", "end", "strand")) |>
-        mutate(exonClass = ifelse(is.na(exonClass), "novel", "annotated"))
-
-    # Sparse binary matrix: unique_exons x transcripts
-    uniqueExons <- exonMeta$exon_id
-    exonNames   <- paste0("EX", seq_along(uniqueExons))
-    aggregationMat <- sparseMatrix(
-        i    = match(exonDf$exon_id, uniqueExons),
-        j    = match(exonDf$TXNAME,  txNames),
-        x    = 1L,
-        dims = c(length(uniqueExons), length(txNames)),
-        dimnames = list(exonNames, txNames)
-    )
-
-    # Build output GRanges
     outRanges <- GRanges(
-        seqnames = exonMeta$seqnames,
-        ranges   = IRanges(start = exonMeta$start, end = exonMeta$end),
-        strand   = exonMeta$strand
+        seqnames = groupMeta$seqnames,
+        ranges   = IRanges(start = groupMeta$start, end = groupMeta$end),
+        strand   = groupMeta$strand
     )
-    names(outRanges) <- exonNames
-    mcols(outRanges)$GENEID  <- exonMeta$GENEID
-    mcols(outRanges)$txNames <- exonMeta$txNames
-    mcols(outRanges)$exonClass <- exonMeta$exonClass
+    names(outRanges) <- paste0("EX", seq_len(nrow(groupMeta)))
+    mcols(outRanges)$GENEID    <- groupMeta$GENEID
+    mcols(outRanges)$txNames   <- groupMeta$txNames
+    mcols(outRanges)$exonClass <- groupMeta$exonClass
 
-    .aggregateAssays(se, aggregationMat, outRanges)
-}
+    list(
+        indexDf   = flatDf[, c("TXNAME", "group_idx")],
+        outRanges = outRanges
+    )
+}
\ No newline at end of file
diff --git a/R/summariseExpression_utility.R b/R/summariseExpression_utility.R
deleted file mode 100644
index 893cea39..00000000
--- a/R/summariseExpression_utility.R
+++ /dev/null
@@ -1,31 +0,0 @@
-# Core aggregation engine shared by summariseByExon (and any future summariseBy* functions).
-# aggregationMat : sparse matrix (output_features x input_transcripts)
-# outRanges      : GRanges / GRangesList for the output rows
-#' @importFrom SummarizedExperiment assays colData SummarizedExperiment
-#' @noRd
-.aggregateAssays <- function(se, aggregationMat, outRanges) {
-    counts <- aggregationMat %*% assays(se)$counts
-
-    counts.total <- colSums(counts)
-    counts.total[counts.total == 0] <- 1
-    cpm <- counts / counts.total * 10^6
-
-    outAssays <- SimpleList(counts = counts, CPM = cpm)
-
-    for (nm in c("fullLengthCounts", "uniqueCounts")) {
-        if (nm %in% names(assays(se))) {
-            outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]]
-        }
-    }
-
-    ColNames <- colnames(counts)
-    ColData  <- colData(se)
-    ColData@rownames       <- ColNames
-    ColData@listData$name  <- ColNames
-
-    SummarizedExperiment(
-        assays    = outAssays,
-        rowRanges = outRanges,
-        colData   = ColData
-    )
-}

From 1039570d16b9ec51ee748617ce53c4f781d7efd7 Mon Sep 17 00:00:00 2001
From: ch99l <chinhaolee10@gmail.com>
Date: Wed, 22 Apr 2026 17:47:18 +0800
Subject: [PATCH 5/6] add gene summarisation and unit testing to
 summariseExpression

---
 R/summariseExpression.R                   | 110 ++++++++++++++--------
 tests/testthat/test_summariseExpression.R |  57 +++++++++++
 2 files changed, 129 insertions(+), 38 deletions(-)
 create mode 100644 tests/testthat/test_summariseExpression.R

diff --git a/R/summariseExpression.R b/R/summariseExpression.R
index f9840edc..6174a9c1 100644
--- a/R/summariseExpression.R
+++ b/R/summariseExpression.R
@@ -2,34 +2,43 @@
 #' @title summarise by expression
 #' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
 #' @param type character, the unit to summarise by. Currently supports
-#'   \code{"exon"} (default).
+#'   \code{"exon"} (default) and \code{"gene"}.
 #' @return A \code{RangedSummarizedExperiment} with expression summarised to
 #'   the requested unit. Assays \code{counts} and \code{CPM} are always
 #'   present; \code{fullLengthCounts} and \code{uniqueCounts} are included
 #'   when present in the input. For \code{type = "exon"}, rows are named
 #'   \code{EX1, EX2, ...} and \code{rowRanges} mcols include \code{GENEID},
 #'   \code{txNames}, and \code{exonClass} (\code{"annotated"} or
-#'   \code{"novel"}).
+#'   \code{"novel"}). For \code{type = "gene"}, rows are named by GENEID and
+#'   \code{rowRanges} is a \code{GRangesList} of reduced exon ranges per gene;
+#'   mcols include \code{GENEID} and \code{newGeneClass}.
 #' @details Counts are summed across all transcripts belonging to the same
-#'   group (e.g. identical exon locus). CPM is recomputed from the aggregated
-#'   counts. An exon is \code{"novel"} only if all contributing transcripts
-#'   have \code{novelTranscript = TRUE}.
+#'   group (e.g. identical exon locus or gene). CPM is recomputed from the
+#'   aggregated counts. For \code{type = "exon"}, an exon is \code{"novel"}
+#'   only if all contributing transcripts have \code{novelTranscript = TRUE}.
+#'   For \code{type = "gene"}, \code{newGeneClass} is derived from
+#'   \code{txClassDescription}: Ensembl genes (GENEID matching \code{"ENSG"})
+#'   are labelled \code{"annotation"}, others inherit the transcript class.
+#'   Incompatible and non-unique counts stored in \code{metadata(se)} are
+#'   added to the aggregated counts before CPM computation.
 #' @examples
 #' se <- readRDS(system.file("extdata",
 #'     "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds",
 #'     package = "bambu"
 #' ))
 #' summariseByExpression(se, type = "exon")
-#' @importFrom Matrix sparseMatrix
-#' @importFrom S4Vectors SimpleList
+#' summariseByExpression(se, type = "gene")
+#' @importFrom Matrix sparseMatrix Matrix
+#' @importFrom S4Vectors SimpleList metadata
 #' @importFrom SummarizedExperiment rowRanges rowData assays colData SummarizedExperiment
-#' @importFrom GenomicRanges GRanges seqnames start end strand
+#' @importFrom GenomicRanges GRanges seqnames start end strand mcols
 #' @importFrom IRanges IRanges
 #' @importFrom dplyr left_join mutate group_by summarise cur_group_id ungroup
 #' @export
 summariseByExpression <- function(se, type = "exon") {
     index <- switch(type,
-        exon = .buildExonIndex(se),
+        exon = buildExonIndex(se),
+        gene = buildGeneIndex(se),
         stop("Unsupported type: '", type, "'")
     )
 
@@ -47,40 +56,47 @@ summariseByExpression <- function(se, type = "exon") {
 
     counts <- aggregationMat %*% assays(se)$counts
 
+    if (type == "gene" && !is.null(metadata(se)$incompatibleCounts)) {
+        incompat <- metadata(se)$incompatibleCounts
+        if ("nonuniqueCounts" %in% names(metadata(se)))
+            incompat <- incompat + metadata(se)$nonuniqueCounts
+        incompat <- Matrix(
+            incompat[match(rownames(counts), rownames(incompat)), ],
+            sparse = TRUE
+        )
+        counts <- counts + incompat
+    }
+
     counts.total <- colSums(counts)
     counts.total[counts.total == 0] <- 1
     cpm <- counts / counts.total * 10^6
 
     outAssays <- SimpleList(counts = counts, CPM = cpm)
 
-    for (nm in c("fullLengthCounts", "uniqueCounts")) {
-        if (nm %in% names(assays(se))) {
-            outAssays[[nm]] <- aggregationMat %*% assays(se)[[nm]]
+    for (i in c("fullLengthCounts", "uniqueCounts")) {
+        if (i %in% names(assays(se))) {
+            outAssays[[i]] <- aggregationMat %*% assays(se)[[i]]
         }
     }
 
-    ColNames <- colnames(counts)
-    ColData  <- colData(se)
-    ColData@rownames      <- ColNames
-    ColData@listData$name <- ColNames
-
     SummarizedExperiment(
         assays    = outAssays,
         rowRanges = index$outRanges,
-        colData   = ColData
+        colData   = colData(se)
     )
 }
 
 #' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
-#' @return a data.frame with columns TXNAME, seqnames, start, end, strand,
-#'   GENEID, and novelTranscript — one row per exon-transcript pair.
+#' @return a list with:
+#'   \code{indexDf} — data.frame with columns TXNAME and group_idx
+#'     (one row per exon-transcript pair);
+#'   \code{outRanges} — GRanges with one entry per unique exon locus,
+#'     named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass.
 #' @noRd
-.buildFlatDf <- function(se) {
+buildExonIndex <- function(se) {
+    txDf <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "novelTranscript"), drop = FALSE]
     exonRanges <- unlist(rowRanges(se), use.names = TRUE)
-    geneDf <- as.data.frame(rowData(se))[, c("GENEID", "novelTranscript"), drop = FALSE]
-    geneDf$TXNAME <- rownames(se)
-
-    data.frame(
+    flatDf <- data.frame(
         TXNAME   = names(exonRanges),
         seqnames = as.character(seqnames(exonRanges)),
         start    = start(exonRanges),
@@ -88,18 +104,7 @@ summariseByExpression <- function(se, type = "exon") {
         strand   = as.character(strand(exonRanges)),
         stringsAsFactors = FALSE
     ) %>%
-        left_join(geneDf, by = "TXNAME")
-}
-
-#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
-#' @return a list with:
-#'   \code{indexDf} — data.frame with columns TXNAME and group_idx
-#'     (one row per exon-transcript pair);
-#'   \code{outRanges} — GRanges with one entry per unique exon locus,
-#'     named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass.
-#' @noRd
-.buildExonIndex <- function(se) {
-    flatDf <- .buildFlatDf(se) %>%
+        left_join(txDf, by = "TXNAME") %>%
         group_by(seqnames, start, end, strand) %>%
         mutate(group_idx = cur_group_id()) %>%
         ungroup()
@@ -131,4 +136,33 @@ summariseByExpression <- function(se, type = "exon") {
         indexDf   = flatDf[, c("TXNAME", "group_idx")],
         outRanges = outRanges
     )
-}
\ No newline at end of file
+}
+
+#' @param se a \code{SummarizedExperiment} object from \code{\link{bambu}}
+#' @return a list with:
+#'   \code{indexDf} — data.frame with columns TXNAME and group_idx
+#'     (one row per transcript);
+#'   \code{outRanges} — GRangesList with one element per gene, named by
+#'     GENEID, with mcols GENEID and newGeneClass.
+#' @noRd
+buildGeneIndex <- function(se) {
+    rd <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "txClassDescription"), drop = FALSE] %>%
+        mutate(group_idx = match(GENEID, unique(GENEID)))
+
+    groupMeta <- rd %>%
+        group_by(group_idx) %>%
+        summarise(
+            GENEID       = GENEID[1],
+            newGeneClass = ifelse(grepl("ENSG", GENEID[1]), "annotation", unique(txClassDescription)[1]),
+            .groups      = "drop"
+        )
+
+    outRanges <- reducedRangesByGenes(rowRanges(se))[groupMeta$GENEID]
+    mcols(outRanges)$GENEID       <- groupMeta$GENEID
+    mcols(outRanges)$newGeneClass <- groupMeta$newGeneClass
+
+    list(
+        indexDf   = rd[, c("TXNAME", "group_idx")],
+        outRanges = outRanges
+    )
+}
diff --git a/tests/testthat/test_summariseExpression.R b/tests/testthat/test_summariseExpression.R
new file mode 100644
index 00000000..e77b22c2
--- /dev/null
+++ b/tests/testthat/test_summariseExpression.R
@@ -0,0 +1,57 @@
+context("Summarise expression")
+
+test_that("summariseByExpression correctly reduces transcript counts to exon and gene counts", {
+    # TX1: exons at 1-100, 200-300 -> GENE1, count = 10
+    # TX2: exons at 1-100, 400-500 -> GENE1, shares first exon with TX1, count = 5
+    # TX3: exon at 600-700         -> GENE2, count = 8
+    #
+    # Expected exon counts:
+    #   chr1:1-100   (TX1 + TX2) -> 15
+    #   chr1:200-300 (TX1 only)  -> 10
+    #   chr1:400-500 (TX2 only)  ->  5
+    #   chr1:600-700 (TX3 only)  ->  8
+    #
+    # Expected gene counts:
+    #   GENE1 (TX1 + TX2) -> 15
+    #   GENE2 (TX3 only)  ->  8
+
+    txRanges <- GRangesList(
+        TX1 = GRanges("chr1", IRanges(c(1, 200), c(100, 300)), strand = "+"),
+        TX2 = GRanges("chr1", IRanges(c(1, 400), c(100, 500)), strand = "+"),
+        TX3 = GRanges("chr1", IRanges(600, 700), strand = "+")
+    )
+    mcols(txRanges)$TXNAME             <- c("TX1", "TX2", "TX3")
+    mcols(txRanges)$GENEID             <- c("GENE1", "GENE1", "GENE2")
+    mcols(txRanges)$novelTranscript    <- c(FALSE, FALSE, FALSE)
+    mcols(txRanges)$txClassDescription <- c("annotation", "annotation", "annotation")
+
+    counts <- matrix(c(10, 5, 8), nrow = 3, ncol = 1,
+        dimnames = list(c("TX1", "TX2", "TX3"), "sample1"))
+
+    se <- SummarizedExperiment(
+        assays    = list(counts = counts),
+        rowRanges = txRanges
+    )
+
+    # exon level
+    seExon <- summariseByExpression(se, type = "exon")
+    exonCounts <- as.matrix(assays(seExon)$counts)
+    exonRanges <- rowRanges(seExon)
+
+    shared  <- which(start(exonRanges) == 1   & end(exonRanges) == 100)
+    tx1only <- which(start(exonRanges) == 200 & end(exonRanges) == 300)
+    tx2only <- which(start(exonRanges) == 400 & end(exonRanges) == 500)
+    tx3only <- which(start(exonRanges) == 600 & end(exonRanges) == 700)
+
+    expect_equal(exonCounts[shared,  1], 15)
+    expect_equal(exonCounts[tx1only, 1], 10)
+    expect_equal(exonCounts[tx2only, 1],  5)
+    expect_equal(exonCounts[tx3only, 1],  8)
+
+    # gene level
+    seGene <- summariseByExpression(se, type = "gene")
+    geneCounts <- as.matrix(assays(seGene)$counts)
+
+    expect_equal(geneCounts["GENE1", 1], 15)
+    expect_equal(geneCounts["GENE2", 1],  8)
+})

From 1d99a73ebbb76007a038036fa437e1f60323d582 Mon Sep 17 00:00:00 2001
From: ch99l <chinhaolee10@gmail.com>
Date: Wed, 22 Apr 2026 17:59:22 +0800
Subject: [PATCH 6/6] add comments

---
 R/summariseExpression.R | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/R/summariseExpression.R b/R/summariseExpression.R
index 6174a9c1..05597890 100644
--- a/R/summariseExpression.R
+++ b/R/summariseExpression.R
@@ -36,12 +36,15 @@
 #' @importFrom dplyr left_join mutate group_by summarise cur_group_id ungroup
 #' @export
 summariseByExpression <- function(se, type = "exon") {
+    # build the grouping index and output ranges for the requested type
     index <- switch(type,
         exon = buildExonIndex(se),
         gene = buildGeneIndex(se),
         stop("Unsupported type: '", type, "'")
     )
 
+    # sparse aggregation matrix: rows = groups, cols = transcripts
+    # multiplying by transcript counts gives group-level counts
     txNames <- rownames(se)
     nGroups <- length(index$outRanges)
     groupNames <- names(index$outRanges)
@@ -54,8 +57,11 @@ summariseByExpression <- function(se, type = "exon") {
         dimnames = list(groupNames, txNames)
     )
 
+    # aggregate transcript counts to group level
     counts <- aggregationMat %*% assays(se)$counts
 
+    # for gene level, add reads that could not be assigned to any transcript
+    # but were localised to a gene, stored in metadata by bambu
     if (type == "gene" && !is.null(metadata(se)$incompatibleCounts)) {
         incompat <- metadata(se)$incompatibleCounts
         if ("nonuniqueCounts" %in% names(metadata(se)))
@@ -67,12 +73,13 @@ summariseByExpression <- function(se, type = "exon") {
         counts <- counts + incompat
     }
 
+    # compute CPM from aggregated counts
     counts.total <- colSums(counts)
     counts.total[counts.total == 0] <- 1
     cpm <- counts / counts.total * 10^6
 
+    # aggregate optional assays if present
     outAssays <- SimpleList(counts = counts, CPM = cpm)
-
     for (i in c("fullLengthCounts", "uniqueCounts")) {
         if (i %in% names(assays(se))) {
             outAssays[[i]] <- aggregationMat %*% assays(se)[[i]]
@@ -94,6 +101,7 @@ summariseByExpression <- function(se, type = "exon") {
 #'     named EX1, EX2, ..., with mcols GENEID, txNames, and exonClass.
 #' @noRd
 buildExonIndex <- function(se) {
+    # one row per exon-transcript pair, joined with transcript metadata
     txDf <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "novelTranscript"), drop = FALSE]
     exonRanges <- unlist(rowRanges(se), use.names = TRUE)
     flatDf <- data.frame(
@@ -105,10 +113,12 @@ buildExonIndex <- function(se) {
         stringsAsFactors = FALSE
     ) %>%
         left_join(txDf, by = "TXNAME") %>%
+        # assign each unique exon locus a group index
         group_by(seqnames, start, end, strand) %>%
         mutate(group_idx = cur_group_id()) %>%
         ungroup()
 
+    # one row per unique exon locus with summarised metadata
     groupMeta <- flatDf %>%
         group_by(group_idx) %>%
         summarise(
@@ -122,6 +132,7 @@ buildExonIndex <- function(se) {
             .groups   = "drop"
         )
 
+    # build output GRanges with exon metadata
     outRanges <- GRanges(
         seqnames = groupMeta$seqnames,
         ranges   = IRanges(start = groupMeta$start, end = groupMeta$end),
@@ -146,9 +157,11 @@ buildExonIndex <- function(se) {
 #'     GENEID, with mcols GENEID and newGeneClass.
 #' @noRd
 buildGeneIndex <- function(se) {
+    # one row per transcript; group_idx preserves first-appearance order of GENEIDs
     rd <- as.data.frame(rowData(se))[, c("TXNAME", "GENEID", "txClassDescription"), drop = FALSE] %>%
         mutate(group_idx = match(GENEID, unique(GENEID)))
 
+    # one row per gene with class label derived from txClassDescription
     groupMeta <- rd %>%
         group_by(group_idx) %>%
         summarise(
@@ -157,6 +170,7 @@ buildGeneIndex <- function(se) {
             .groups      = "drop"
         )
 
+    # build output GRangesList of reduced exon ranges per gene
     outRanges <- reducedRangesByGenes(rowRanges(se))[groupMeta$GENEID]
     mcols(outRanges)$GENEID       <- groupMeta$GENEID
     mcols(outRanges)$newGeneClass <- groupMeta$newGeneClass