Remove unknown file warning from read_annotations() (#169)

scheidec · web-flow · commit fbc5bf473aae · 2026-02-12T16:23:19.000-05:00
* Increment version number to 6.5.0.9000

* Remove restrictive file validation from `read_annotations()`

- removed md5sum checksum validation and version dictionary checks
  that produced warnings about unknown annotations files
- the warning was often misleading because menu annotations file updates
  are not always aligned with the timing of CRAN releases
- removed `getAnnoVer()` function and `ver_dict` object
- updated documentation to reflect more flexible file requirements
- removed `tools::md5sum` import dependency
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: SomaDataIO
 Title: Input/Output 'SomaScan' Data
-Version: 6.5.0
+Version: 6.5.0.9000
 Authors@R: c(
     person(given = "Stu",
            family = "Field",
diff --git a/NAMESPACE b/NAMESPACE
@@ -214,7 +214,6 @@ importFrom(tidyr,gather)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,separate)
 importFrom(tidyr,unite)
-importFrom(tools,md5sum)
 importFrom(utils,capture.output)
 importFrom(utils,head)
 importFrom(utils,read.csv)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,15 @@
+# SomaDataIO (6.5.0.9000)
+
+### Function and Object Improvements
+
+* Removed restrictive file validation from `read_annotations()`
+  - removed `md5sum` checksum validation and version dictionary checks
+    that produced misleading warnings about unknown annotations files
+  - the warning was often misleading because menu annotations file updates
+    are not always aligned with the timing of CRAN releases
+  - removed `getAnnoVer()` function and `ver_dict` object
+  - removed `tools::md5sum` import dependency
+
 # SomaDataIO (6.5.0)
 
 ### Function and Object Improvements
diff --git a/R/read-annotations.R b/R/read-annotations.R
@@ -1,61 +1,27 @@
 #' Import a SomaLogic Annotations File
 #'
 #' @param file A path to an annotations file location.
-#'   This is a sanctioned, versioned file provided by
-#'   SomaLogic Operating Co., Inc. and should be an _unmodified_
-#'   `*.xlsx` file.
+#'   This should be a SomaLogic annotations file in
+#'   `*.xlsx` format.
 #' @return A `tibble` containing analyte-specific annotations and
 #'   related (e.g. lift/bridging) information, keyed on SomaLogic
 #'   [SeqId], the unique SomaScan analyte identifier.
 #' @examples
 #' \dontrun{
 #'   # for example
-#'   file <- "~/Downloads/SomaScan_11K_Annotated_Content.xlsx"
+#'   file <- "~/Downloads/SomaScan_11K_v5.0_Plasma_Serum_Annotated_Menu.xlsx"
 #'   anno_tbl <- read_annotations(file)
 #' }
 #' @importFrom readxl read_xlsx
-#' @importFrom tools md5sum
 #' @export
 read_annotations <- function(file) {
 
-  if ( !(endsWith(file, "xlsx") || endsWith(file, "json")) ) {
-    stop("Annotations file must be either ", .value("*.xlsx"),
-         " or ", .value("*.json"), ".", call. = FALSE)
+  if ( !grepl("\\.xlsx$", file, ignore.case = TRUE) ) {
+    stop("Annotations file must be in `*.xlsx` format.", call. = FALSE)
   }
 
-  ver <- getAnnoVer(file)
-
-  # cannot determine version
-  if ( !grepl("^SL-[0-9]+-rev[0-9]+", ver) ) {
-    stop(
-      "Unable to determine annotations file version: ", .value(ver),
-      ".\nA valid annotations file version is required to proceed.",
-      call. = FALSE
-    )
-  }
-
-  # check if recognized version
-  if ( ver %in% names(ver_dict) ) {
-    md5_file <- strtrim(md5sum(file), 7L) |> unname()
-    md5_true <- strtrim(ver_dict[[ver]]$sha, 7L)
-
-    # file modified
-    if ( !identical(md5_file, md5_true) ) {
-      warning(
-        "Checksum mismatch. ", basename(file), " may have been modified.",
-        call. = FALSE
-      )
-    }
-    skip <- ver_dict[[ver]]$skip
-  } else {
-    warning(
-      "Unknown version of the annotations file: ", ver, ".",
-      call. = FALSE
-    )
-    skip <- 8L
-  }
-
-  tbl  <- readxl::read_xlsx(file, sheet = "Annotations", skip = skip)
+  # Read the annotations file with standard skip value = 8L
+  tbl  <- readxl::read_xlsx(file, sheet = "Annotations", skip = 8L)
 
   # map these fields to match those in ADATs
   map <- c(Target           = "Target Name",
@@ -64,84 +30,17 @@ read_annotations <- function(file) {
            EntrezGeneID     = "Entrez Gene ID",
            EntrezGeneSymbol = "Entrez Gene Name")
   tbl <- dplyr::rename(tbl, !!!map)
-  stopifnot(
-    all(c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
-          "Organism", "UniProt", "Dilution", "EntrezGeneID",
-          "EntrezGeneSymbol") %in% names(tbl)
-       )
-  )
-  structure(tbl, version = ver)
-}
 
-# assumes line7 contains the version info
-getAnnoVer <- function(file) {
-  rev <- readxl::read_xlsx(file, sheet = "Annotations", skip = 6L, n_max = 1L,
-                           col_names = c("text", "doc", "version", "date"),
-                           col_types = "text")
-  ver <- paste(toupper(rev$text), rev$doc, tolower(rev$version), rev$date, sep = "-")
-  gsub(" +", "", ver)
-}
+  # check for expected fields in annotations file
+  required_cols <- c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
+                     "Organism", "UniProt", "EntrezGeneID", "EntrezGeneSymbol")
+  missing_cols <- setdiff(required_cols, names(tbl))
 
-# version dictionary of key-value pairs
-# for file characteristics
-# SHA hashes are calculated with `tools::md5sum()`
-ver_dict <- list(
-  # The first 2 are for testing
-  # dummy version; 5k -> 7k
-  "SL-99999999-rev99-1999-01" = list(col_serum  = "Serum Scalar v4.0 to v4.1",
-                                     col_plasma = "Plasma Scalar v4.0 to v4.1"),
-  # test-anno.xlsx file; 7k -> 5k
-  "SL-12345678-rev0-2021-01" = list(sha = "8a345fa621377d0bac40fc8c47f5579d",
-                                    col_serum  = "Serum Scalar v4.1 to v4.0",
-                                    col_plasma = "Plasma Scalar v4.1 to v4.0",
-                                    which_serum  = 40,
-                                    which_plasma = 42,
-                                    skip = 8L,
-                                    rows = 1,
-                                    cols = 43),
-  # 7k -> 5k
-  "SL-00000571-rev2-2021-06" = list(sha = "5fa46834ed826eb1e8dba88698cf7a76",
-                                    col_serum  = "Serum Scalar v4.1 to v4.0",
-                                    col_plasma = "Plasma Scalar v4.1 to v4.0",
-                                    which_serum  = 40,
-                                    which_plasma = 42,
-                                    skip = 8L,
-                                    rows = 7605,
-                                    cols = 43),
-  # 5k -> 7k
-  "SL-00000246-rev5-2021-06" = list(sha = "7d92666369d4e33364b11804f2d1f8ce",
-                                    col_serum  = "Serum Scalar v4.0 to v4.1",
-                                    col_plasma = "Plasma Scalar v4.0 to v4.1",
-                                    which_serum  = 40,
-                                    which_plasma = 42,
-                                    skip = 8L,
-                                    rows = 5293,
-                                    cols = 43),
+  if ( length(missing_cols) > 0 ) {
+    stop("Missing required columns in annotations file: ",
+         paste(missing_cols, collapse = ", "), call. = FALSE)
+  }
+
+  tbl
+}
 
-  # source 7k ----
-  #   https://menu.somalogic.com/file-downloads/menu-annotations
-  #   SL00000571_SomaScan_7K_v4.1_Plasma_Serum_Annotated_Menu.xlsx
-  "SL-00000571-rev11-2025-09" = list(sha = "f13dbe8d5f97bdf56eb107d2cff15408",
-                                    col_serum  = c("Serum Scalar v4.1 7K to v4.0 5K",
-                                                   "Serum Scalar v4.1 7K to v5.0 11K"),
-                                    col_plasma = c("Plasma Scalar v4.1 7K to v4.0 5K",
-                                                   "Plasma Scalar v4.1 7K to v5.0 11K"),
-                                    which_serum  = c(43, 47),
-                                    which_plasma = c(45, 49),
-                                    skip = 8L,
-                                    rows = 7605,
-                                    cols = 50),
-  # source 11k ----
-  #   https://menu.somalogic.com/file-downloads/menu-annotations
-  #   SL00000906_SomaScan_11K_v5.0_Plasma_Serum_Annotated_Menu.xlsx
-  "SL-00000906-rev8-2025-09" = list(sha = "48f7aafc713acdd7896f010f62506b51",
-                                    col_serum  = c("Serum Scalar v5.0 11K to v4.1 7K",
-                                                   "Serum Scalar v5.0 11K to v4.0 5K"),
-                                    col_plasma = c("Plasma Scalar v5.0 11K to v4.1 7K",
-                                                   "Plasma Scalar v5.0 11K to v4.0 5K"),
-                                    which_serum  = c(43, 47),
-                                    which_plasma = c(45, 49),
-                                    skip = 8L,
-                                    rows = 11092,
-                                    cols = 51)
-)
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 <!-- badges: start -->
 
 ![GitHub
-version](https://img.shields.io/badge/Version-6.4.0.9000-success.svg?style=flat&logo=github)
+version](https://img.shields.io/badge/Version-6.5.0.9000-success.svg?style=flat&logo=github)
 [![CRAN
 status](http://www.r-pkg.org/badges/version/SomaDataIO)](https://cran.r-project.org/package=SomaDataIO)
 [![Downloads](https://cranlogs.r-pkg.org/badges/SomaDataIO)](https://cran.r-project.org/package=SomaDataIO)
diff --git a/man/read_annotations.Rd b/man/read_annotations.Rd
diff --git a/tests/testthat/test-read-annotations.R b/tests/testthat/test-read-annotations.R
@@ -1,48 +1,23 @@
 
+# Setup ----
 file <- test_path("testdata", "test-anno.xlsx")
 
-test_that("`ver_dict` is updated and correct", {
-  expect_length(ver_dict, 6L)
-  expect_named(ver_dict,
-               c("SL-99999999-rev99-1999-01",
-                 "SL-12345678-rev0-2021-01",
-                 "SL-00000571-rev2-2021-06",
-                 "SL-00000246-rev5-2021-06",
-                 "SL-00000571-rev11-2025-09",
-                 "SL-00000906-rev8-2025-09"))
-})
-
-test_that("`getAnnoVer()` parses the version correctly", {
-  expect_equal(getAnnoVer(file), "SL-12345678-rev0-2021-01")
-})
-
+# Testing ----
 test_that("`read_annotations()` parses the annotations file correctly", {
   tbl <- read_annotations(file)
   expect_s3_class(tbl, "tbl_df")
   expect_equal(dim(tbl), c(1L, 43L))
-  ver <- attr(tbl, "version")
-  expect_equal(ver, "SL-12345678-rev0-2021-01")
-  expect_true(ver_dict[[ver]]$col_serum == names(tbl)[ver_dict[[ver]]$which_serum])
-  expect_true(ver_dict[[ver]]$col_plasma == names(tbl)[ver_dict[[ver]]$which_plasma])
-})
 
-test_that("error conditions trigger stop and warnings when appropriate", {
+  # Check that required columns are present after field mapping
+  expected_cols <- c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
+                     "Organism", "UniProt", "EntrezGeneID",
+                     "EntrezGeneSymbol")
+  expect_true(all(expected_cols %in% names(tbl)))
+})
 
+test_that("error conditions trigger appropriate errors", {
   expect_error(
     read_annotations("foo.txt"),
-    "Annotations file must be either"
-  )
-
-  expect_warning(
-    with_pkg_object(SomaDataIO:::ver_dict[-2L], read_annotations(file)),
-    "Unknown version of the annotations file:"
-  )
-
-  # temp modify md5sha
-  tmp <- SomaDataIO:::ver_dict
-  tmp$`SL-12345678-rev0-2021-01`$sha <- "x0x0x0x0x"
-  expect_warning(
-    with_pkg_object(tmp, read_annotations(file)),
-    "Checksum mismatch. test-anno.xlsx may have been modified"
+    "Annotations file must be"
   )
 })