Skip to content

Commit fbc5bf4

Browse files
authored
Remove unknown file warning from read_annotations() (#169)
* Increment version number to 6.5.0.9000
* Remove restrictive file validation from `read_annotations()`
  - removed md5sum checksum validation and version dictionary checks that resulted in warnings about unknown annotations files
  - warning was often misleading, as menu annotations file updates are not always aligned with the timing of CRAN releases
  - removed `getAnnoVer()` function and `ver_dict` object
  - updated documentation to reflect more flexible file requirements
  - removed `tools::md5sum` import dependency
1 parent 59444ef commit fbc5bf4

File tree

7 files changed

+45
-161
lines changed

7 files changed

+45
-161
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: SomaDataIO
33
Title: Input/Output 'SomaScan' Data
4-
Version: 6.5.0
4+
Version: 6.5.0.9000
55
Authors@R: c(
66
person(given = "Stu",
77
family = "Field",

NAMESPACE

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,6 @@ importFrom(tidyr,gather)
214214
importFrom(tidyr,pivot_longer)
215215
importFrom(tidyr,separate)
216216
importFrom(tidyr,unite)
217-
importFrom(tools,md5sum)
218217
importFrom(utils,capture.output)
219218
importFrom(utils,head)
220219
importFrom(utils,read.csv)

NEWS.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
# SomaDataIO (6.5.0.9000)
2+
3+
### Function and Object Improvements
4+
5+
* Removed restrictive file validation from `read_annotations()`
6+
- removed `md5sum` checksum validation and version dictionary checks
7+
that resulted in misleading warnings about unknown annotations files
8+
- warning was often misleading, as menu annotations file updates are
9+
not always aligned with the timing of CRAN releases
10+
- removed `getAnnoVer()` function and `ver_dict` object
11+
- removed `tools::md5sum` import dependency
12+
113
# SomaDataIO (6.5.0)
214

315
### Function and Object Improvements

R/read-annotations.R

Lines changed: 18 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,27 @@
11
#' Import a SomaLogic Annotations File
22
#'
33
#' @param file A path to an annotations file location.
4-
#' This is a sanctioned, versioned file provided by
5-
#' SomaLogic Operating Co., Inc. and should be an _unmodified_
6-
#' `*.xlsx` file.
4+
#' This should be a SomaLogic annotations file in
5+
#' `*.xlsx` format.
76
#' @return A `tibble` containing analyte-specific annotations and
87
#' related (e.g. lift/bridging) information, keyed on SomaLogic
98
#' [SeqId], the unique SomaScan analyte identifier.
109
#' @examples
1110
#' \dontrun{
1211
#' # for example
13-
#' file <- "~/Downloads/SomaScan_11K_Annotated_Content.xlsx"
12+
#' file <- "~/Downloads/SomaScan_11K_v5.0_Plasma_Serum_Annotated_Menu.xlsx"
1413
#' anno_tbl <- read_annotations(file)
1514
#' }
1615
#' @importFrom readxl read_xlsx
17-
#' @importFrom tools md5sum
1816
#' @export
1917
read_annotations <- function(file) {
2018

21-
if ( !(endsWith(file, "xlsx") || endsWith(file, "json")) ) {
22-
stop("Annotations file must be either ", .value("*.xlsx"),
23-
" or ", .value("*.json"), ".", call. = FALSE)
19+
if ( !grepl("\\.xlsx$", file, ignore.case = TRUE) ) {
20+
stop("Annotations file must be in `*.xlsx` format.", call. = FALSE)
2421
}
2522

26-
ver <- getAnnoVer(file)
27-
28-
# cannot determine version
29-
if ( !grepl("^SL-[0-9]+-rev[0-9]+", ver) ) {
30-
stop(
31-
"Unable to determine annotations file version: ", .value(ver),
32-
".\nA valid annotations file version is required to proceed.",
33-
call. = FALSE
34-
)
35-
}
36-
37-
# check if recognized version
38-
if ( ver %in% names(ver_dict) ) {
39-
md5_file <- strtrim(md5sum(file), 7L) |> unname()
40-
md5_true <- strtrim(ver_dict[[ver]]$sha, 7L)
41-
42-
# file modified
43-
if ( !identical(md5_file, md5_true) ) {
44-
warning(
45-
"Checksum mismatch. ", basename(file), " may have been modified.",
46-
call. = FALSE
47-
)
48-
}
49-
skip <- ver_dict[[ver]]$skip
50-
} else {
51-
warning(
52-
"Unknown version of the annotations file: ", ver, ".",
53-
call. = FALSE
54-
)
55-
skip <- 8L
56-
}
57-
58-
tbl <- readxl::read_xlsx(file, sheet = "Annotations", skip = skip)
23+
# Read the annotations file with standard skip value = 8L
24+
tbl <- readxl::read_xlsx(file, sheet = "Annotations", skip = 8L)
5925

6026
# map these fields to match those in ADATs
6127
map <- c(Target = "Target Name",
@@ -64,84 +30,17 @@ read_annotations <- function(file) {
6430
EntrezGeneID = "Entrez Gene ID",
6531
EntrezGeneSymbol = "Entrez Gene Name")
6632
tbl <- dplyr::rename(tbl, !!!map)
67-
stopifnot(
68-
all(c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
69-
"Organism", "UniProt", "Dilution", "EntrezGeneID",
70-
"EntrezGeneSymbol") %in% names(tbl)
71-
)
72-
)
73-
structure(tbl, version = ver)
74-
}
7533

76-
# assumes line7 contains the version info
77-
getAnnoVer <- function(file) {
78-
rev <- readxl::read_xlsx(file, sheet = "Annotations", skip = 6L, n_max = 1L,
79-
col_names = c("text", "doc", "version", "date"),
80-
col_types = "text")
81-
ver <- paste(toupper(rev$text), rev$doc, tolower(rev$version), rev$date, sep = "-")
82-
gsub(" +", "", ver)
83-
}
34+
# check for expected fields in annotations file
35+
required_cols <- c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
36+
"Organism", "UniProt", "EntrezGeneID", "EntrezGeneSymbol")
37+
missing_cols <- setdiff(required_cols, names(tbl))
8438

85-
# version dictionary of key-value pairs
86-
# for file characteristics
87-
# SHA hashes are calculated with `tools::md5sum()`
88-
ver_dict <- list(
89-
# The first 2 are for testing
90-
# dummy version; 5k -> 7k
91-
"SL-99999999-rev99-1999-01" = list(col_serum = "Serum Scalar v4.0 to v4.1",
92-
col_plasma = "Plasma Scalar v4.0 to v4.1"),
93-
# test-anno.xlsx file; 7k -> 5k
94-
"SL-12345678-rev0-2021-01" = list(sha = "8a345fa621377d0bac40fc8c47f5579d",
95-
col_serum = "Serum Scalar v4.1 to v4.0",
96-
col_plasma = "Plasma Scalar v4.1 to v4.0",
97-
which_serum = 40,
98-
which_plasma = 42,
99-
skip = 8L,
100-
rows = 1,
101-
cols = 43),
102-
# 7k -> 5k
103-
"SL-00000571-rev2-2021-06" = list(sha = "5fa46834ed826eb1e8dba88698cf7a76",
104-
col_serum = "Serum Scalar v4.1 to v4.0",
105-
col_plasma = "Plasma Scalar v4.1 to v4.0",
106-
which_serum = 40,
107-
which_plasma = 42,
108-
skip = 8L,
109-
rows = 7605,
110-
cols = 43),
111-
# 5k -> 7k
112-
"SL-00000246-rev5-2021-06" = list(sha = "7d92666369d4e33364b11804f2d1f8ce",
113-
col_serum = "Serum Scalar v4.0 to v4.1",
114-
col_plasma = "Plasma Scalar v4.0 to v4.1",
115-
which_serum = 40,
116-
which_plasma = 42,
117-
skip = 8L,
118-
rows = 5293,
119-
cols = 43),
39+
if ( length(missing_cols) > 0 ) {
40+
stop("Missing required columns in annotations file: ",
41+
paste(missing_cols, collapse = ", "), call. = FALSE)
42+
}
43+
44+
tbl
45+
}
12046

121-
# source 7k ----
122-
# https://menu.somalogic.com/file-downloads/menu-annotations
123-
# SL00000571_SomaScan_7K_v4.1_Plasma_Serum_Annotated_Menu.xlsx
124-
"SL-00000571-rev11-2025-09" = list(sha = "f13dbe8d5f97bdf56eb107d2cff15408",
125-
col_serum = c("Serum Scalar v4.1 7K to v4.0 5K",
126-
"Serum Scalar v4.1 7K to v5.0 11K"),
127-
col_plasma = c("Plasma Scalar v4.1 7K to v4.0 5K",
128-
"Plasma Scalar v4.1 7K to v5.0 11K"),
129-
which_serum = c(43, 47),
130-
which_plasma = c(45, 49),
131-
skip = 8L,
132-
rows = 7605,
133-
cols = 50),
134-
# source 11k ----
135-
# https://menu.somalogic.com/file-downloads/menu-annotations
136-
# SL00000906_SomaScan_11K_v5.0_Plasma_Serum_Annotated_Menu.xlsx
137-
"SL-00000906-rev8-2025-09" = list(sha = "48f7aafc713acdd7896f010f62506b51",
138-
col_serum = c("Serum Scalar v5.0 11K to v4.1 7K",
139-
"Serum Scalar v5.0 11K to v4.0 5K"),
140-
col_plasma = c("Plasma Scalar v5.0 11K to v4.1 7K",
141-
"Plasma Scalar v5.0 11K to v4.0 5K"),
142-
which_serum = c(43, 47),
143-
which_plasma = c(45, 49),
144-
skip = 8L,
145-
rows = 11092,
146-
cols = 51)
147-
)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
<!-- badges: start -->
77

88
![GitHub
9-
version](https://img.shields.io/badge/Version-6.4.0.9000-success.svg?style=flat&logo=github)
9+
version](https://img.shields.io/badge/Version-6.5.0.9000-success.svg?style=flat&logo=github)
1010
[![CRAN
1111
status](http://www.r-pkg.org/badges/version/SomaDataIO)](https://cran.r-project.org/package=SomaDataIO)
1212
[![Downloads](https://cranlogs.r-pkg.org/badges/SomaDataIO)](https://cran.r-project.org/package=SomaDataIO)

man/read_annotations.Rd

Lines changed: 3 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 10 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,23 @@
11

2+
# Setup ----
23
file <- test_path("testdata", "test-anno.xlsx")
34

4-
test_that("`ver_dict` is updated and correct", {
5-
expect_length(ver_dict, 6L)
6-
expect_named(ver_dict,
7-
c("SL-99999999-rev99-1999-01",
8-
"SL-12345678-rev0-2021-01",
9-
"SL-00000571-rev2-2021-06",
10-
"SL-00000246-rev5-2021-06",
11-
"SL-00000571-rev11-2025-09",
12-
"SL-00000906-rev8-2025-09"))
13-
})
14-
15-
test_that("`getAnnoVer()` parses the version correctly", {
16-
expect_equal(getAnnoVer(file), "SL-12345678-rev0-2021-01")
17-
})
18-
5+
# Testing ----
196
test_that("`read_annotations()` parses the annotations file correctly", {
207
tbl <- read_annotations(file)
218
expect_s3_class(tbl, "tbl_df")
229
expect_equal(dim(tbl), c(1L, 43L))
23-
ver <- attr(tbl, "version")
24-
expect_equal(ver, "SL-12345678-rev0-2021-01")
25-
expect_true(ver_dict[[ver]]$col_serum == names(tbl)[ver_dict[[ver]]$which_serum])
26-
expect_true(ver_dict[[ver]]$col_plasma == names(tbl)[ver_dict[[ver]]$which_plasma])
27-
})
2810

29-
test_that("error conditions trigger stop and warnings when appropriate", {
11+
# Check that required columns are present after field mapping
12+
expected_cols <- c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
13+
"Organism", "UniProt", "EntrezGeneID",
14+
"EntrezGeneSymbol")
15+
expect_true(all(expected_cols %in% names(tbl)))
16+
})
3017

18+
test_that("error conditions trigger appropriate errors", {
3119
expect_error(
3220
read_annotations("foo.txt"),
33-
"Annotations file must be either"
34-
)
35-
36-
expect_warning(
37-
with_pkg_object(SomaDataIO:::ver_dict[-2L], read_annotations(file)),
38-
"Unknown version of the annotations file:"
39-
)
40-
41-
# temp modify md5sha
42-
tmp <- SomaDataIO:::ver_dict
43-
tmp$`SL-12345678-rev0-2021-01`$sha <- "x0x0x0x0x"
44-
expect_warning(
45-
with_pkg_object(tmp, read_annotations(file)),
46-
"Checksum mismatch. test-anno.xlsx may have been modified"
21+
"Annotations file must be"
4722
)
4823
})

0 commit comments

Comments
 (0)