nf-core · timosachsenberg · Apr 1, 2025
@@ -36,26 +36,26 @@ On release, automated continuous integration tests run the pipeline on a full-si
 > to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline)
 > with `-profile test` before running the workflow on actual data.
 
-First, prepare a samplesheet with your input data that looks as follows:
+First, prepare an SDRF (Sample and Data Relationship Format) file with your input data. SDRF is a standardized format used in proteomics that contains rich metadata about samples. Here's a simplified example:
 
-`samplesheet.tsv`
+`sdrf_file.tsv`
 
-```tsv title="samplesheet.tsv
-ID	Sample	Condition	ReplicateFileName
-1	tumor	treated	/path/to/msrun1.raw|mzML|d
-2	tumor	treated	/path/to/msrun2.raw|mzML|d
-3	tumor	untreated	/path/to/msrun3.raw|mzML|d
-4	tumor	untreated	/path/to/msrun4.raw|mzML|d
+```tsv title="sdrf_file.tsv"
+source name	characteristics[organism]	characteristics[organism part]	comment[data file]	comment[technical replicate]	factor value[organism part]
+sample1_classI	Homo sapiens	Brain	/path/to/sample1_classI_techRep1.mzML	1	Brain
+sample1_classI	Homo sapiens	Brain	/path/to/sample1_classI_techRep2.mzML	2	Brain
+sample2_classI	Homo sapiens	Liver	/path/to/sample2_classI_techRep1.mzML	1	Liver
+sample2_classI	Homo sapiens	Liver	/path/to/sample2_classI_techRep2.mzML	2	Liver
 ```
 
 Each row represents a mass spectrometry run in one of the formats: raw, RAW, mzML, mzML.gz, d, d.tar.gz, d.zip
 
-Now, you can run the pipeline using:
+Now, you can run the pipeline using your SDRF file:
 
 ```bash
 nextflow run nf-core/mhcquant \
     -profile <docker/singularity/.../institute> \
-    --input 'samplesheet.tsv' \
+    --input 'sdrf_file.tsv' \
     --fasta 'SWISSPROT_2020.fasta' \
     --outdir ./results
 ```
@@ -71,6 +71,7 @@ For more details and further functionality, please refer to the [usage documenta
 
 By default the pipeline currently performs identification of MHC class I peptides with HCD settings:
 
+- Converting SDRF file to internal format (`SDRF_CONVERT`)
 - Preparing spectra dependent on the input format (`PrepareSpectra`)
 - Creation of reversed decoy database (`DecoyDatabase`)
 - Identification of peptides in the MS/MS spectra (`CometAdapter`)

@@ -0,0 +1,10 @@
+source name	characteristics[organism]	characteristics[organism part]	characteristics[cell type]	characteristics[cell line]	characteristics[developmental stage]	characteristics[disease]	characteristics[ancestry category]	characteristics[sex]	characteristics[age]	characteristics[biological replicate]	characteristics[mhc type]	material type	assay name	technology type	comment[data file]	comment[file uri]	comment[technical replicate]	comment[fraction identifier]	comment[label]	comment[cleavage agent details]	comment[instrument]	comment[modification parameters]	comment[dissociation method]	comment[precursor mass tolerance]	comment[fragment mass tolerance]	factor value[organism part]
+sample1_classII	Homo sapiens	Brain	not applicable	not applicable	adult	none	not available	not available	not available	1	A*11:01;A*68:01;B*15:01;B*35:03;C*03:03;C*04:01;DRB1*04:01;DRB4*01:01;DQB1*03:01;DQA1*03:01;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample1_classII_techRep1.mzML	TODO	1	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Brain
+sample1_classII	Homo sapiens	Brain	not applicable	not applicable	adult	none	not available	not available	not available	1	A*11:01;A*68:01;B*15:01;B*35:03;C*03:03;C*04:01;DRB1*04:01;DRB4*01:01;DQB1*03:01;DQA1*03:01;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample1_classII_techRep2.mzML	TODO	2	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Brain
+sample1_classII	Homo sapiens	Brain	not applicable	not applicable	adult	none	not available	not available	not available	1	A*11:01;A*68:01;B*15:01;B*35:03;C*03:03;C*04:01;DRB1*04:01;DRB4*01:01;DQB1*03:01;DQA1*03:01;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample1_classII_techRep3.mzML	TODO	3	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Brain
+sample1_classI	Homo sapiens	Brain	not applicable	not applicable	adult	none	not available	not available	not available	1	A*11:01;A*68:01;B*15:01;B*35:03;C*03:03;C*04:01;DRB1*04:01;DRB4*01:01;DQB1*03:01;DQA1*03:01;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample1_classI_techRep1.mzML	TODO	1	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Brain
+sample1_classI	Homo sapiens	Brain	not applicable	not applicable	adult	none	not available	not available	not available	1	A*11:01;A*68:01;B*15:01;B*35:03;C*03:03;C*04:01;DRB1*04:01;DRB4*01:01;DQB1*03:01;DQA1*03:01;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample1_classI_techRep2.mzML	TODO	2	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Brain
+sample2_classII	Homo sapiens	Liver	not applicable	not applicable	adult	none	not available	not available	not available	2	A*02:01;A*24:02;B*07:02;B*40:01;C*03:04;C*07:02;DRB1*15:01;DRB5*01:01;DQB1*06:02;DQA1*01:02;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample2_classII_techRep1.mzML	TODO	1	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Liver
+sample2_classII	Homo sapiens	Liver	not applicable	not applicable	adult	none	not available	not available	not available	2	A*02:01;A*24:02;B*07:02;B*40:01;C*03:04;C*07:02;DRB1*15:01;DRB5*01:01;DQB1*06:02;DQA1*01:02;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample2_classII_techRep2.mzML	TODO	2	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Liver
+sample2_classI	Homo sapiens	Liver	not applicable	not applicable	adult	none	not available	not available	not available	2	A*02:01;A*24:02;B*07:02;B*40:01;C*03:04;C*07:02;DRB1*15:01;DRB5*01:01;DQB1*06:02;DQA1*01:02;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample2_classI_techRep1.mzML	TODO	1	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Liver
+sample2_classI	Homo sapiens	Liver	not applicable	not applicable	adult	none	not available	not available	not available	2	A*02:01;A*24:02;B*07:02;B*40:01;C*03:04;C*07:02;DRB1*15:01;DRB5*01:01;DQB1*06:02;DQA1*01:02;DPB1*04:01;DPB1*04:02	tissue	none	proteomic profiling by mass spectrometry	/path/to/sample2_classI_techRep2.mzML	TODO	2	1	AC=MS:1002038;NT=label free sample	NT=unspecific cleavage; AC=MS:1001956	NT=timsTOF Pro 2;AC=MS:1003230	NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35;PP=Anywhere	AC=MS:1000133;NT=CID	20 ppm	0.02 Da	Liver
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+
+"""
+Script to convert SDRF files to MHCquant-compatible samplesheet format.
+"""
+
+import argparse
+import pandas as pd
+import sys
+import os
+from sdrf.sdrf import SdrfDataFrame
+
+def parse_args():
+    """Parse command line arguments."""
+    parser = argparse.ArgumentParser(description='Convert SDRF files to MHCquant-compatible samplesheet format.')
+    parser.add_argument('-s', '--sdrf', required=True, help='Path to SDRF file')
+    parser.add_argument('-o', '--output', required=True, help='Path to output samplesheet file')
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
+    return parser.parse_args()
+
+def convert_sdrf_to_mhcquant(sdrf_file, output_file):
+    """
+    Convert SDRF file to MHCquant-compatible samplesheet format.
+
+    Parameters:
+    -----------
+    sdrf_file : str
+        Path to SDRF file
+    output_file : str
+        Path to output samplesheet file
+    """
+    try:
+        # Parse SDRF file
+        sdrf_df = SdrfDataFrame.parse_sdrf(sdrf_file)
+
+        # Create output dataframe with required columns
+        output_df = pd.DataFrame(columns=['ID', 'Sample', 'Condition', 'ReplicateFileName'])
+
+        # Extract necessary information from SDRF
+        for idx, row in sdrf_df.iterrows():
+            # Get source name as sample
+            sample = row.get('source name', f'sample_{idx}')
+
+            # Get condition from factor value or characteristics
+            condition = None
+            for col in row.index:
+                if col.startswith('factor value['):
+                    condition = row[col]
+                    break
+
+            if condition is None:
+                # If no factor value is found, use organism part or other characteristic
+                for col in row.index:
+                    if col.startswith('characteristics[organism part]'):
+                        condition = row[col]
+                        break
+
+                if condition is None:
+                    # Default condition if nothing else is found
+                    condition = 'unknown'
+
+            # Get file path
+            file_path = None
+            for col in row.index:
+                if col == 'comment[data file]':
+                    file_path = row[col]
+                    break
+
+            if file_path is None:
+                continue
+
+            # Add to output dataframe
+            output_df = output_df._append({
+                'ID': idx + 1,
+                'Sample': sample,
+                'Condition': condition,
+                'ReplicateFileName': file_path
+            }, ignore_index=True)
+
+        # Write to output file
+        output_df.to_csv(output_file, sep='\t', index=False)
+        print(f"Successfully converted {sdrf_file} to {output_file}")
+
+    except Exception as e:
+        print(f"Error converting SDRF file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+def main():
+    """Main function."""
+    args = parse_args()
+    convert_sdrf_to_mhcquant(args.sdrf, args.output)
+
+if __name__ == '__main__':
+    main()
@@ -4,41 +4,44 @@
 
 > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._
 
-## Samplesheet input
+## SDRF input
 
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a tab-separated file with 4 columns, and a header row as shown in the examples below.
+You will need to create an SDRF (Sample and Data Relationship Format) file with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. SDRF is a standardized format used in proteomics that contains rich metadata about samples.
 
 ```bash
---input '[path to samplesheet file]'
+--input '[path to SDRF file]'
 ```
 
-### Samplesheet columns
+### SDRF format
 
-| Column              | Description                                                                                           |
-| ------------------- | ----------------------------------------------------------------------------------------------------- |
-| `ID`                | An incrementing value which acts as a unique number for the given sample                              |
-| `Sample`            | Custom sample name. This entry will be identical for multiple MS runs from the same sample.           |
-| `Condition`         | Additional information of the sample can be defined here.                                             |
-| `ReplicateFileName` | Full path to the MS file. These files have the extentions .raw, .mzML, mzML.gz, .d, .d.tar.gz, .d.zip |
+SDRF files contain detailed sample metadata in a tabular format with specific columns for sample characteristics, data file locations, and experimental factors. The pipeline uses sdrf-pipelines to process this file and extract the necessary information.
 
-The pipeline will auto-detect whether a sample is either in mzML, raw or tdf file format using the information provided in the samplesheet.
+Key columns in an SDRF file include:
 
-An [example samplesheet](../assets/samplesheet.tsv) has been provided with the pipeline.
+| Column                      | Description                                                                                           |
+| --------------------------- | ----------------------------------------------------------------------------------------------------- |
+| `source name`               | Custom sample name. This entry will be identical for multiple MS runs from the same sample.           |
+| `characteristics[organism]` | The organism from which the sample was derived                                                        |
+| `comment[data file]`        | Full path to the MS file. These files have the extensions .raw, .mzML, mzML.gz, .d, .d.tar.gz, .d.zip |
+| `factor value[...]`         | Experimental factors that can be used for grouping samples                                            |
+| `characteristics[mhc type]` | MHC alleles present in the sample (important for MHC peptide analysis)                                |
+
+The pipeline will auto-detect whether a sample is either in mzML, raw or tdf file format using the information provided in the SDRF file.
+
+An [example SDRF file](../assets/example_sdrf.tsv) has been provided with the pipeline.
 
 ### Multiple runs of the same sample
 
-MS runs are merged on the `Sample` and `Condition` identifier combination before they are rescored with `Percolator`. Typically technical replicates of a sample are merged together to report one peptide list per sample. Below is an example of two runs from a treated and untreated tumor sample.
-
-```tsv title="samplesheet.tsv
-ID	Sample	Condition	ReplicateFileName
-1	tumor	treated	/path/to/msrun1.raw|mzML|d
-2	tumor	treated	/path/to/msrun2.raw|mzML|d
-3	tumor	untreated	/path/to/msrun3.raw|mzML|d
-4	tumor	untreated	/path/to/msrun4.raw|mzML|d
-5	control	treated	/path/to/msrun5.raw|mzML|d
-6	control	treated	/path/to/msrun6.raw|mzML|d
-7	control	untreated	/path/to/msrun7.raw|mzML|d
-8	control	untreated	/path/to/msrun8.raw|mzML|d
+MS runs are merged based on the `source name` and factor values before they are rescored with `Percolator`. Typically technical replicates of a sample are merged together to report one peptide list per sample.
+
+For example, in the provided example SDRF file, there are multiple technical replicates for each sample (e.g., sample1_classI, sample1_classII), and these will be merged together during processing.
+
+### Creating SDRF files
+
+You can create SDRF files manually or use tools like the [SDRF Composer](https://github.com/bigbio/sdrf-composer) to generate them. The sdrf-pipelines tool also provides validation functionality to ensure your SDRF file is correctly formatted:
+
+```bash
+parse_sdrf validate-sdrf --sdrf_file your_sdrf_file.tsv
 ```
 
 ## Recommended search settings
@@ -74,7 +77,7 @@ The typical command for running the pipeline is as follows:
 
 ```console
 nextflow run nf-core/mhcquant \
-  --input 'samplesheet.tsv' \
+  --input 'sdrf_file.tsv' \
   --outdir <OUTDIR> \
   --fasta 'SWISSPROT_2020.fasta' \
   <SEARCH PARAMS> \
@@ -111,7 +114,7 @@ nextflow run nf-core/mhcquant -profile docker -params-file params.yaml
 with:
 
 ```yaml title="params.yaml"
-input: './samplesheet.csv'
+input: './sdrf_file.tsv'
 outdir: './results/'
 <...>
 ```
@@ -232,4 +235,3 @@ We recommend adding the following line to your environment to limit this (typica
 
 ```bash
 NXF_OPTS='-Xms1g -Xmx4g'
-```
@@ -0,0 +1,28 @@
+// Converts an SDRF file into the samplesheet.tsv consumed by the rest of the
+// pipeline by invoking the sdrf-pipelines command line tool.
+process SDRF_CONVERT {
+    // Task label shown in logs: the staged SDRF file
+    tag "$sdrf"
+    label 'process_low'
+
+    conda "bioconda::sdrf-pipelines=0.0.20"
+    // Singularity uses the Galaxy depot image unless
+    // task.ext.singularity_pull_docker_container is set; every other case
+    // uses the biocontainers Docker image of the same version.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.20--pyhdfd78af_0' :
+        'biocontainers/sdrf-pipelines:0.0.20--pyhdfd78af_0' }"
+
+    input:
+    // SDRF file to convert
+    path sdrf
+
+    output:
+    // Converted samplesheet written by the script below
+    path "samplesheet.tsv", emit: samplesheet
+    // Tool version record written by the heredoc below
+    path "versions.yml", emit: versions
+
+    // NOTE(review): confirm that sdrf-pipelines 0.0.20 ships an executable
+    // named `parse_sdrf.py` and a `convert-mhcquant` subcommand; neither can
+    // be verified from this file.
+    script:
+    """
+    parse_sdrf.py convert-mhcquant \\
+        -s $sdrf \\
+        -o samplesheet.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sdrf-pipelines: \$(parse_sdrf.py --version | sed 's/parse_sdrf.py version //g')
+    END_VERSIONS
+    """
+}
@@ -0,0 +1,9 @@
+name: sdrf_convert
+# Channels are listed in priority order: conda-forge first, then bioconda.
+# The Anaconda `defaults` channel is deliberately not listed.
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  # Exact versions are pinned so the environment resolves reproducibly.
+  - bioconda::sdrf-pipelines=0.0.20
+  - conda-forge::pandas=1.3.5
+  - python=3.8
@@ -0,0 +1,29 @@
+// Converts an SDRF file into samplesheet.tsv by running the pipeline's own
+// bin/sdrf_to_mhcquant.py script.
+process SDRF_CONVERT {
+    // Task label shown in logs: the staged SDRF file
+    tag "$sdrf"
+    label 'process_low'
+
+    conda "bioconda::sdrf-pipelines=0.0.20 conda-forge::pandas=1.3.5 python=3.8"
+    // Singularity uses the Galaxy depot image unless
+    // task.ext.singularity_pull_docker_container is set; every other case
+    // uses the biocontainers Docker image of the same version.
+    // NOTE(review): the conda spec also pins pandas and python, while the
+    // container is the plain sdrf-pipelines image; confirm that image
+    // provides a compatible pandas.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.20--pyhdfd78af_0' :
+        'biocontainers/sdrf-pipelines:0.0.20--pyhdfd78af_0' }"
+
+    input:
+    // SDRF file to convert
+    path sdrf
+
+    output:
+    // Converted samplesheet written by the script below
+    path "samplesheet.tsv", emit: samplesheet
+    // Tool version record written by the heredoc below
+    path "versions.yml", emit: versions
+
+    // NOTE(review): the script is addressed through ${projectDir}; this
+    // presumably requires the project directory to be reachable from inside
+    // the task environment. Verify on remote/cloud executors.
+    script:
+    """
+    python ${projectDir}/bin/sdrf_to_mhcquant.py \\
+        -s $sdrf \\
+        -o samplesheet.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sdrf-pipelines: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('sdrf-pipelines').version)")
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}
@@ -0,0 +1,33 @@
+# Module metadata for the SDRF-to-samplesheet conversion step.
+name: sdrf_convert
+description: Converts SDRF files to MHCquant-compatible samplesheet format
+keywords:
+  - sdrf
+  - convert
+  - samplesheet
+# Underlying software used by the module.
+tools:
+  - sdrf-pipelines:
+      description: A set of pipelines for extracting, processing and validating SDRF files
+      homepage: https://github.com/bigbio/sdrf-pipelines
+      documentation: https://github.com/bigbio/sdrf-pipelines
+      tool_dev_url: https://github.com/bigbio/sdrf-pipelines
+      doi: ''
+      licence:
+        - Apache-2.0
+
+# Files consumed by the module.
+input:
+  - sdrf:
+      type: file
+      description: SDRF file
+      pattern: '*.{tsv}'
+
+# Files produced by the module.
+output:
+  - samplesheet:
+      type: file
+      description: Converted samplesheet in MHCquant-compatible format
+      pattern: '*.{tsv}'
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: 'versions.yml'
+
+authors:
+  - '@nf-core/mhcquant'