11# ' Import a SomaLogic Annotations File
22# '
33# ' @param file A path to an annotations file location.
4- # ' This is a sanctioned, versioned file provided by
5- # ' SomaLogic Operating Co., Inc. and should be an _unmodified_
6- # ' `*.xlsx` file.
4+ # ' This should be a SomaLogic annotations file in
5+ # ' `*.xlsx` format.
76# ' @return A `tibble` containing analyte-specific annotations and
87# ' related (e.g. lift/bridging) information, keyed on SomaLogic
98# ' [SeqId], the unique SomaScan analyte identifier.
109# ' @examples
1110# ' \dontrun{
1211# ' # for example
13- # ' file <- "~/Downloads/SomaScan_11K_Annotated_Content .xlsx"
12+ # ' file <- "~/Downloads/SomaScan_11K_v5.0_Plasma_Serum_Annotated_Menu .xlsx"
1413# ' anno_tbl <- read_annotations(file)
1514# ' }
1615# ' @importFrom readxl read_xlsx
17- # ' @importFrom tools md5sum
1816# ' @export
1917read_annotations <- function (file ) {
2018
21- if ( ! (endsWith(file , " xlsx" ) || endsWith(file , " json" )) ) {
22- stop(" Annotations file must be either " , .value(" *.xlsx" ),
23- " or " , .value(" *.json" ), " ." , call. = FALSE )
19+ if ( ! grepl(" \\ .xlsx$" , file , ignore.case = TRUE ) ) {
20+ stop(" Annotations file must be in `*.xlsx` format." , call. = FALSE )
2421 }
2522
26- ver <- getAnnoVer(file )
27-
28- # cannot determine version
29- if ( ! grepl(" ^SL-[0-9]+-rev[0-9]+" , ver ) ) {
30- stop(
31- " Unable to determine annotations file version: " , .value(ver ),
32- " .\n A valid annotations file version is required to proceed." ,
33- call. = FALSE
34- )
35- }
36-
37- # check if recognized version
38- if ( ver %in% names(ver_dict ) ) {
39- md5_file <- strtrim(md5sum(file ), 7L ) | > unname()
40- md5_true <- strtrim(ver_dict [[ver ]]$ sha , 7L )
41-
42- # file modified
43- if ( ! identical(md5_file , md5_true ) ) {
44- warning(
45- " Checksum mismatch. " , basename(file ), " may have been modified." ,
46- call. = FALSE
47- )
48- }
49- skip <- ver_dict [[ver ]]$ skip
50- } else {
51- warning(
52- " Unknown version of the annotations file: " , ver , " ." ,
53- call. = FALSE
54- )
55- skip <- 8L
56- }
57-
58- tbl <- readxl :: read_xlsx(file , sheet = " Annotations" , skip = skip )
23+ # Read the annotations file with standard skip value = 8L
24+ tbl <- readxl :: read_xlsx(file , sheet = " Annotations" , skip = 8L )
5925
6026 # map these fields to match those in ADATs
6127 map <- c(Target = " Target Name" ,
@@ -64,84 +30,17 @@ read_annotations <- function(file) {
6430 EntrezGeneID = " Entrez Gene ID" ,
6531 EntrezGeneSymbol = " Entrez Gene Name" )
6632 tbl <- dplyr :: rename(tbl , !!! map )
67- stopifnot(
68- all(c(" SeqId" , " SomaId" , " Target" , " Type" , " TargetFullName" ,
69- " Organism" , " UniProt" , " Dilution" , " EntrezGeneID" ,
70- " EntrezGeneSymbol" ) %in% names(tbl )
71- )
72- )
73- structure(tbl , version = ver )
74- }
7533
76- # assumes line7 contains the version info
77- getAnnoVer <- function (file ) {
78- rev <- readxl :: read_xlsx(file , sheet = " Annotations" , skip = 6L , n_max = 1L ,
79- col_names = c(" text" , " doc" , " version" , " date" ),
80- col_types = " text" )
81- ver <- paste(toupper(rev $ text ), rev $ doc , tolower(rev $ version ), rev $ date , sep = " -" )
82- gsub(" +" , " " , ver )
83- }
34+ # check for expected fields in annotations file
35+ required_cols <- c(" SeqId" , " SomaId" , " Target" , " Type" , " TargetFullName" ,
36+ " Organism" , " UniProt" , " EntrezGeneID" , " EntrezGeneSymbol" )
37+ missing_cols <- setdiff(required_cols , names(tbl ))
8438
85- # version dictionary of key-value pairs
86- # for file characteristics
87- # SHA hashes are calculated with `tools::md5sum()`
88- ver_dict <- list (
89- # The first 2 are for testing
90- # dummy version; 5k -> 7k
91- " SL-99999999-rev99-1999-01" = list (col_serum = " Serum Scalar v4.0 to v4.1" ,
92- col_plasma = " Plasma Scalar v4.0 to v4.1" ),
93- # test-anno.xlsx file; 7k -> 5k
94- " SL-12345678-rev0-2021-01" = list (sha = " 8a345fa621377d0bac40fc8c47f5579d" ,
95- col_serum = " Serum Scalar v4.1 to v4.0" ,
96- col_plasma = " Plasma Scalar v4.1 to v4.0" ,
97- which_serum = 40 ,
98- which_plasma = 42 ,
99- skip = 8L ,
100- rows = 1 ,
101- cols = 43 ),
102- # 7k -> 5k
103- " SL-00000571-rev2-2021-06" = list (sha = " 5fa46834ed826eb1e8dba88698cf7a76" ,
104- col_serum = " Serum Scalar v4.1 to v4.0" ,
105- col_plasma = " Plasma Scalar v4.1 to v4.0" ,
106- which_serum = 40 ,
107- which_plasma = 42 ,
108- skip = 8L ,
109- rows = 7605 ,
110- cols = 43 ),
111- # 5k -> 7k
112- " SL-00000246-rev5-2021-06" = list (sha = " 7d92666369d4e33364b11804f2d1f8ce" ,
113- col_serum = " Serum Scalar v4.0 to v4.1" ,
114- col_plasma = " Plasma Scalar v4.0 to v4.1" ,
115- which_serum = 40 ,
116- which_plasma = 42 ,
117- skip = 8L ,
118- rows = 5293 ,
119- cols = 43 ),
39+ if ( length(missing_cols ) > 0 ) {
40+ stop(" Missing required columns in annotations file: " ,
41+ paste(missing_cols , collapse = " , " ), call. = FALSE )
42+ }
43+
44+ tbl
45+ }
12046
121- # source 7k ----
122- # https://menu.somalogic.com/file-downloads/menu-annotations
123- # SL00000571_SomaScan_7K_v4.1_Plasma_Serum_Annotated_Menu.xlsx
124- " SL-00000571-rev11-2025-09" = list (sha = " f13dbe8d5f97bdf56eb107d2cff15408" ,
125- col_serum = c(" Serum Scalar v4.1 7K to v4.0 5K" ,
126- " Serum Scalar v4.1 7K to v5.0 11K" ),
127- col_plasma = c(" Plasma Scalar v4.1 7K to v4.0 5K" ,
128- " Plasma Scalar v4.1 7K to v5.0 11K" ),
129- which_serum = c(43 , 47 ),
130- which_plasma = c(45 , 49 ),
131- skip = 8L ,
132- rows = 7605 ,
133- cols = 50 ),
134- # source 11k ----
135- # https://menu.somalogic.com/file-downloads/menu-annotations
136- # SL00000906_SomaScan_11K_v5.0_Plasma_Serum_Annotated_Menu.xlsx
137- " SL-00000906-rev8-2025-09" = list (sha = " 48f7aafc713acdd7896f010f62506b51" ,
138- col_serum = c(" Serum Scalar v5.0 11K to v4.1 7K" ,
139- " Serum Scalar v5.0 11K to v4.0 5K" ),
140- col_plasma = c(" Plasma Scalar v5.0 11K to v4.1 7K" ,
141- " Plasma Scalar v5.0 11K to v4.0 5K" ),
142- which_serum = c(43 , 47 ),
143- which_plasma = c(45 , 49 ),
144- skip = 8L ,
145- rows = 11092 ,
146- cols = 51 )
147- )
0 commit comments