Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 165 additions & 21 deletions R/readWrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
#' @param path the destination of the output files
#' (gtf, transcript counts, and gene counts)
#' @param prefix the prefix of the output files
#' @details The function will write the output from Bambu to files. The
#' annotations will be written to a .gtf file, transcript counts (total counts,
#' @param outputAnnFormat the file format in which to output annotations, must
#' be one of \code{"gtf"} or \code{"gff3"}. \code{"gtf"} is specified by default.
#' @details The function will write the output from Bambu to files. The
#' annotations will be written to a .gtf or .gff3 file, transcript counts (total counts,
#' CPM, full-length counts, partial-length counts, and unique counts) and gene counts
#' will be written to .txt files.
#' will be written to .txt files.
Comment on lines 4 to +12
#' @export
#' @examples
#' se <- readRDS(system.file("extdata",
Expand All @@ -17,23 +19,36 @@
#' path <- tempdir()
#' writeBambuOutput(se, path)
writeBambuOutput <- function(se, path, prefix = "", outputExtendedAnno = TRUE,
outputAll = TRUE, outputBambuModels = TRUE, outputNovelOnly = TRUE, seperateSamples = FALSE) {
if (missing(se) | missing(path)) {
stop("Both summarizedExperiment object from bambu and
outputAll = TRUE, outputBambuModels = TRUE, outputNovelOnly = TRUE, seperateSamples = FALSE,
outputAnnFormat = "gtf") {
Comment on lines 21 to +23

if (missing(se) | missing(path)) {
stop("Both summarizedExperiment object from bambu and
the path for the output files are required.")
} else {
outdir <- paste0(path, "/")
if (!dir.exists(outdir))
dir.create(outdir, recursive = TRUE)

transcript_grList <- rowRanges(se)
prefix <- ifelse(prefix != "", paste0(prefix, "_"), "")
transcript_gtffn <- paste(outdir, prefix, sep = "")
gtf <- writeAnnotationsToGTF(annotation = transcript_grList,
file = transcript_gtffn, outputExtendedAnno = outputExtendedAnno,
outputAll = outputAll, outputBambuModels = outputBambuModels, outputNovelOnly = outputNovelOnly)
} else if (!outputAnnFormat %in% c("gtf", "gff3")) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe should also allow for gff? and assume gff meant for gff3?

stop("Please specify a valid annotation outputAnnFormat: 'gtf' or 'gff3', or leave blank to output to GTF by default.")
} else {
outdir <- paste0(path, "/")
if (!dir.exists(outdir))
dir.create(outdir, recursive = TRUE)

transcript_grList <- rowRanges(se)
prefix <- ifelse(prefix != "", paste0(prefix, "_"), "")
transcript_annfn <- paste(outdir, prefix, sep = "")

if (outputAnnFormat == "gtf") {
gtf <- writeAnnotationsToGTF(annotation = transcript_grList,
file = transcript_annfn, outputExtendedAnno = outputExtendedAnno,
outputAll = outputAll, outputBambuModels = outputBambuModels, outputNovelOnly = outputNovelOnly)
}

if (outputAnnFormat == "gff3") {
gff3 <- writeAnnotationsToGFF3(annotation = transcript_grList,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

writeAnnotationsToGFF3 and writeAnnotationsToGTF already write to files, so not need to assign to gff3, same for above, no need to assign to gtf, and both are not used in following lines as well

Suggested change
gff3 <- writeAnnotationsToGFF3(annotation = transcript_grList,
writeAnnotationsToGFF3(annotation = transcript_grList,

file = transcript_annfn, outputExtendedAnno = outputExtendedAnno,
outputAll = outputAll, outputBambuModels = outputBambuModels, outputNovelOnly = outputNovelOnly)
}
Comment on lines +39 to +49

utils::write.table(colData(se), file = paste0(transcript_gtffn, "sampleData.tsv"),
utils::write.table(colData(se), file = paste0(transcript_annfn, "sampleData.tsv"),
sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
for(d in names(assays(se))){
writeCountsOutput(se, varname=d,
Expand All @@ -43,17 +58,17 @@ writeBambuOutput <- function(se, path, prefix = "", outputExtendedAnno = TRUE,
#write incompatible counts
if(!is.null(metadata(se)$incompatibleCounts)){
estimates <- metadata(se)$incompatibleCounts
estimatesfn <- paste(transcript_gtffn, "incompatibleCounts.mtx", sep = "")
estimatesfn <- paste(transcript_annfn, "incompatibleCounts.mtx", sep = "")
Matrix::writeMM(estimates, estimatesfn)
}
seGene <- transcriptToGeneExpression(se)
writeCountsOutput(seGene, varname='counts', feature='gene',outdir, prefix)
#utils::write.table(paste0(colnames(se), "-1"), file = paste0(outdir, "barcodes.tsv"), quote = FALSE, row.names = FALSE, col.names = FALSE)
#R.utils::gzip(paste0(outdir, "barcodes.tsv"))
txANDGenes <- data.table(as.data.frame(rowData(se))[,c("TXNAME","GENEID")])
utils::write.table(txANDGenes, file = paste0(transcript_gtffn, "txANDgenes.tsv"),
utils::write.table(txANDGenes, file = paste0(transcript_annfn, "txANDgenes.tsv"),
sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
utils::write.table(names(seGene), file = paste0(transcript_gtffn, "genes.tsv"),
utils::write.table(names(seGene), file = paste0(transcript_annfn, "genes.tsv"),
sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)

#R.utils::gzip(paste0(outdir, "txANDgenes.tsv"))
Expand Down Expand Up @@ -247,6 +262,135 @@ writeAnnotationsToGTF <- function(annotation, file, geneIDs = NULL, outputExtend
}
}

# Write to GFF3 file
writeToGFF3 <- function(annotation, file, geneIDs = NULL) {

if (missing(annotation) | missing(file)) {
stop("Both GRangesList and the name of the output file are required.")
} else if (!is(annotation, "CompressedGRangesList")) { # Check filename
stop("The inputted GRangesList is of the wrong class.")
}

NDR = NULL
txScore = NULL
txScore.noFit = NULL
novelGene = NULL
novelTranscript = NULL
txClassDescription = NULL

df <- as_tibble(annotation)

# Build exon number in GTF format
df$exon_rank <- paste("rank=", df$exon_rank, ";", sep = "")

# if there is an NDR column
if(!is.null(mcols(annotation)$NDR)){

NDR = rep(mcols(annotation)$NDR, unname(elementNROWS(annotation)))
txScore = rep(mcols(annotation)$maxTxScore, unname(elementNROWS(annotation)))
txScore.noFit = rep(mcols(annotation)$maxTxScore.noFit, unname(elementNROWS(annotation)))
novelGene = rep(mcols(annotation)$novelGene, unname(elementNROWS(annotation)))
novelTranscript = rep(mcols(annotation)$novelTranscript, unname(elementNROWS(annotation)))
txClassDescription = rep(mcols(annotation)$txClassDescription, unname(elementNROWS(annotation)))

df$NDR <- paste("NDR=", as.character(NDR), ";", sep = "")
df$txScore <- paste("maxTxScore=", as.character(txScore), ";", sep = "")
df$txScore.noFit <- paste("maxTxScore.noFit=", as.character(txScore.noFit), ";", sep = "")
df$novelGene <- paste("novelGene=", as.character(novelGene), ";", sep = "")
df$novelTranscript <- paste("novelTranscript=", as.character(novelTranscript), ";", sep = "")
df$txClassDescription <- paste("txClassDescription=", as.character(txClassDescription), ";", sep = "")
}

# Join gene IDs
if (is.null(geneIDs)) {
if (!is.null(mcols(annotation, use.names = FALSE)$GENEID)) {
geneIDs <- as_tibble(mcols(annotation, use.names = FALSE)[,
c("TXNAME", "GENEID")])
geneIDs$seqnames <- unlist(unique(seqnames(annotation)))
geneIDs$strand <- unlist(unique(strand(annotation)))
}
}

df <- left_join(df, geneIDs[, c("TXNAME", "GENEID")],
by = c("group_name" = "TXNAME"))

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

processing until this step is duplicated from writeToGTF and writeToGFF3, maybe can consider to combine 2 functions?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to do in the future as part of deduplicating code

# Convert group_name and gene ID into GFF3 format
df$group_name <- paste("ID=", df$group_name, ";", sep = "")
df$GENEID <- paste("Parent=", df$GENEID, ";", sep = "")

# Create GFF3 formatted dataframes for exons and transcripts separately

# Exons: no need gene IDs but set Parent = transcript ID
dfExon <- mutate(df, source = "Bambu", feature = "exon", score = ".",
frame = ".", attributes = paste0(group_name, exon_rank, NDR, txScore, txScore.noFit, novelGene, novelTranscript, txClassDescription )) %>%
select(seqnames, source, feature, start, end, score,
strand, frame, attributes, group_name) %>%
mutate(attributes = sub("ID=", "Parent=", attributes))

# Dataframe for transcripts
dfTx <- as_tibble(as.data.frame(range(ranges(annotation))))
dfTx <- left_join(dfTx, geneIDs, by = c("group_name" = "TXNAME"))
dfTx$group_name <- paste("ID=", dfTx$group_name, ";", sep = "")
dfTx$GENEID <- paste("Parent=", dfTx$GENEID, ";", sep = "")


if(!is.null(mcols(annotation)$NDR)) {
Copy link
Copy Markdown
Collaborator

@cying111 cying111 Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this if section is duplicated (appears 4 times)? maybe can modularise it as well?

Copy link
Copy Markdown
Collaborator Author

@hafiz-ismail hafiz-ismail Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the column assignment is slightly different from the one at line 287

i.e. lines 296-302

    df$NDR <- paste("NDR=", as.character(NDR), ";", sep = "")
    df$txScore <- paste("maxTxScore=", as.character(txScore), ";", sep = "")
    df$txScore.noFit <- paste("maxTxScore.noFit=", as.character(txScore.noFit), ";", sep = "")
    df$novelGene <- paste("novelGene=", as.character(novelGene), ";", sep = "")
    df$novelTranscript <- paste("novelTranscript=", as.character(novelTranscript), ";", sep = "")
    df$txClassDescription <- paste("txClassDescription=", as.character(txClassDescription), ";", sep = "")
  }

vs lines 338-344

    dfTx$NDR <- paste("NDR=", as.character(mcols(annotation)$NDR), ";", sep = "")
    dfTx$txScore <- paste("txScore=", as.character(mcols(annotation)$txScore), ";", sep = "")
    dfTx$txScore.noFit <- paste("txScore.noFit=", as.character(mcols(annotation)$txScore.noFit), ";", sep = "")
    dfTx$novelGene <- paste("novelGene=", as.character(mcols(annotation)$novelGene), ";", sep = "")
    dfTx$novelTranscript <- paste("novelTranscript=", as.character(mcols(annotation)$novelTranscript), ";", sep = "")
    dfTx$txClassDescription <- paste("txClassDescription=", as.character(mcols(annotation)$txClassDescription), ";", sep = "")
  }

likewise the same block of code is also 'duplicated' in writeToGTF.
nevertheless, they are definitely very similar, can be rewritten and also modularized in the future

dfTx$NDR <- paste("NDR=", as.character(mcols(annotation)$NDR), ";", sep = "")
dfTx$txScore <- paste("txScore=", as.character(mcols(annotation)$txScore), ";", sep = "")
dfTx$txScore.noFit <- paste("txScore.noFit=", as.character(mcols(annotation)$txScore.noFit), ";", sep = "")
dfTx$novelGene <- paste("novelGene=", as.character(mcols(annotation)$novelGene), ";", sep = "")
dfTx$novelTranscript <- paste("novelTranscript=", as.character(mcols(annotation)$novelTranscript), ";", sep = "")
dfTx$txClassDescription <- paste("txClassDescription=", as.character(mcols(annotation)$txClassDescription), ";", sep = "")
}

# Parse to gff3 format
dfTx <- mutate(dfTx, source = "Bambu", feature = "mRNA", score = ".",
frame = ".", attributes = paste0(GENEID, group_name, NDR, txScore, txScore.noFit, novelGene, novelTranscript, txClassDescription )) %>%
select(seqnames, source, feature, start, end, score,
strand, frame, attributes, group_name)

# Concat tx and exon, then drop the group_name
gff3 <- rbind(dfTx, dfExon) %>% group_by(group_name) %>%
arrange(as.character(seqnames), start) %>% ungroup() %>%
select(seqnames, source, feature, start, end, score,
strand, frame, attributes)

# replace * with . and remove trailing ;
gff3 <- mutate(gff3, strand = recode_factor(strand, `*` = "."),
attributes = sub(";$", "", attributes))

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure where sorting should happen, I think maybe the step before mutate would be fine

# Export
writeLines("##gff-version 3", file)
utils::write.table(gff3, file = file, append = TRUE, quote = FALSE, row.names = FALSE,
col.names = FALSE, sep = "\t")
}


writeAnnotationsToGFF3 <- function(annotation, file, geneIDs = NULL, outputExtendedAnno = TRUE,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function maybe can be combined with writeAnnotationsToGTF to one writeAnnotations function, with one more parameter to call writeToGTF or writeToGFF3 later?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to do in the future as part of deduplicating code

outputAll = TRUE, outputBambuModels = TRUE, outputNovelOnly = TRUE){
if(outputExtendedAnno){
writeToGFF3(annotation, paste0(file, "extendedAnnotations.gff3"), geneIDs)
}
if(outputAll){
annotationAll = setNDR(annotation, 1)
if(length(annotationAll) == length(annotation))
message("The current NDR threshold already outputs all transcript models. This may result in reduced precision for th extendedAnnotations and supportedTranscriptModels gtfs")
writeToGFF3(annotationAll, paste0(file, "allTranscriptModels.gff3"), geneIDs)
}

#todo - have this write bambu start and ends for annotated transcripts
if(outputBambuModels){
annotationBambu = annotation[!is.na(mcols(annotation)$readCount)]
writeToGFF3(annotationBambu, paste0(file, "supportedTranscriptModels.gff3"), geneIDs)
}

if(outputNovelOnly){
annotationNovel = annotation[mcols(annotation)$novelTranscript]
writeToGFF3(annotationNovel, paste0(file, "novelTranscripts.gff3"), geneIDs)
}
}



#' Outputs GRangesList object from reading a GTF file
#' @title convert a GTF file into a GRangesList
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,9 @@ Access transcript expression estimates by extracting a variable (such as counts

For a full description of the other outputs see [Output Description](#Output-Description)

The full output can be written to a file using writeBambuOutput(). Using this function will generate six files, including 4four .gtf files(detailed below), and two .txt files for the expression counts at transcript and gene levels.
The full output can be written to a file using writeBambuOutput(). Using this function will generate six files, including four .gtf/.gff3 files(detailed below), and two .txt files for the expression counts at transcript and gene levels.

By default bambu will write four .gtf files
By default bambu will write four .gtf files. Functionality to write .gff3 files can be specified with the parameter `outputAnnFormat = "gff3"`
- **extendedAnnotations.gtf** - Contains all transcript models from the reference annotations and any novel high confidence transcript models (below NDR threshold) from Bambu
- **allTranscriptModels** - Contains all transcript models from the reference annotations and all novel transcript models, irrespective of their NDR score. This is useful for reloading into Bambu with prepareAnnotations() to redo the analysis or reoutput the annotations at different NDR thresholds.
- **supportedTranscriptModels** - Contains only transcript models that are fully supported by at least one read across the samples provided. Note that if multiple reference annotations share the same intron junctions, an abitrary one will selected to be be included in this output.
Expand Down
Loading