Updates to basecalling

kopardev · kopardev · commit 1d73386631bd · 2020-11-23T16:31:39.000-05:00
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -6,22 +6,24 @@ import os, sys
 
 # Global workflow variables
 configfile: 'config/config.yaml'              # config dictionary
-#common.smk already defines
-#samples, samplesdf
-#tools
-#project
+
+WORKDIR = config["workdir"]
+# FLOWCELL = config["flowcell"]
+# KIT = config["kit"]
+
+# common.smk already defines
+# SAMPLES
+# SAMPLESDF
+# TOOLS
 
 include: "rules/common.smk"
 include: "rules/other.smk"
 
 # convert = {'Y': False, 'N': True}
+# Set `basecalling` to True only when config['basecalling'] starts with an
+# uppercase "Y" (the check is case-sensitive); any other value gives False.
 if config['basecalling'].startswith("Y"):
-    basecalling = True
+	basecalling = True
 else:
-    basecalling = False
-
-# The main entry point of your workflow.
-# After configuring, running snakemake -n in a clone of this repository should successfully execute a dry-run of the workflow.
+	basecalling = False
 
 
 report: "report/workflow.rst"
@@ -30,12 +32,11 @@ report: "report/workflow.rst"
 # singularity: "docker://continuumio/miniconda3"
 
 
-rule all:
-    input:
-        expand(join(workdir,fastqs,"{sample}.fastq.gz"), sample=samples)
-
-
 if basecalling:
-    include: "rules/basecalling.smk"
-
-
+	include: "rules/basecalling.smk"
+	# Aggregate target: for every sample in SAMPLES, request the gzipped FASTQ,
+	# the gzipped sequencing summary, the FastQC zip and the pycoQC HTML report.
+	# This rule is only defined when basecalling is enabled.
+	rule all:
+		input:
+			expand(join(WORKDIR,"fastqs","{sample}.fastq.gz"), sample=SAMPLES),
+			expand(join(WORKDIR,"fastqs","{sample}.sequencing_summary.txt.gz"),sample=SAMPLES),
+			expand(join(WORKDIR,"qc","fastqc","{sample}_fastqc.zip"),sample=SAMPLES),
+			expand(join(WORKDIR,"qc","pycoQC","{sample}.pycoQC.html"),sample=SAMPLES)
diff --git a/workflow/rules/basecalling.smk b/workflow/rules/basecalling.smk
@@ -1,22 +1,67 @@
+def get_fast5_path(wildcards):
+    """Return the FAST5 parent folder recorded for ``wildcards.sample``.
+
+    Input function for rule ``guppy``: the value is looked up in the
+    ``path_to_fast5_parent_folder`` column of ``SAMPLESDF`` using the
+    ``sample`` wildcard as the row label.
+    """
+    return SAMPLESDF["path_to_fast5_parent_folder"][wildcards.sample]
+
+
 rule guppy:
+    # Basecall one sample's FAST5 folder with guppy_basecaller. Guppy writes its
+    # raw output to /lscratch/$SLURM_JOBID/<sample>; the *.fastq files found
+    # there are concatenated and gzipped into the sample FASTQ, and
+    # sequencing_summary.txt is compressed with pigz and copied next to it.
+    # Command output is appended to the per-sample log via `tee -a`.
     input:
-        fast5path=samplesdf['path_to_fast5_parent_folder'][{sample}]
+        fast5path=get_fast5_path
     output:
-        outfastq=join(workdir,fastqs,"{sample}.fastq.gz")
-	params:
-		flowcell = config['flowcell'],
-		kit = config['kit'],
-		sample = {sample}
-    envmodules: tools['guppy']['version']
-	shell:"""
+        outfastq=join(WORKDIR,"fastqs","{sample}.fastq.gz"),
+        sequencing_summary=join(WORKDIR,"fastqs","{sample}.sequencing_summary.txt.gz")
+    params:
+        flowcell = config["flowcell"],
+        kit = config["kit"],
+        sample = "{sample}"
+    envmodules: TOOLS["guppy"]["version"], TOOLS["pigz"]["version"]
+    log: join(WORKDIR,"logs","{sample}.guppy.log")
+    # In the command below `{{}}` is doubled so that Snakemake's string
+    # formatting passes a literal `{}` through to `find -exec`.
+    shell:"""
+guppy_basecaller --print_workflows 2>&1 | tee -a {log}
 guppy_basecaller \
-	--input_path {input.fast5path} \
-    --recursive \
-    --flowcell {params.flowcell} \
-	--kit {params.kit} \
-    -x cuda:all \
-	--records_per_fastq 0 \
-	--save_path /lscratch/$SLURM_JOBID/{params.sample}
-find /lscratch/$SLURM_JOBID/{params.sample} -name "*.fastq" -exec cat {} \; \
-	| gzip -n - > {output.outfastq}
+ --input_path {input.fast5path} \
+ --recursive \
+ --flowcell {params.flowcell} \
+ --kit {params.kit} \
+ -x cuda:all \
+ --records_per_fastq 0 \
+ --save_path /lscratch/$SLURM_JOBID/{params.sample} 2>&1 |tee -a {log}
+find /lscratch/$SLURM_JOBID/{params.sample} -name "*.fastq" -exec cat {{}} \; \
+ | gzip -n -> {output.outfastq} 2>&1 |tee -a {log}
+pigz -p $(nproc) /lscratch/$SLURM_JOBID/{params.sample}/sequencing_summary.txt && cp /lscratch/$SLURM_JOBID/{params.sample}/sequencing_summary.txt.gz {output.sequencing_summary} 2>&1 |tee -a {log}
+"""
+## Files created by guppy look like this:
+# -rw-r--r-- 1 kopardevn CCBR 3.0K Nov 19 14:20 sequencing_summary.txt
+# -rw-r--r-- 1 kopardevn CCBR 3.7K Nov 19 14:20 fastq_runid_d531634aaf7cba4fd8f7a1fba1d8ed9f0f81be2a_0_0.fastq
+# -rw-r--r-- 1 kopardevn CCBR 3.3K Nov 19 14:20 fastq_runid_84d34f66eed213a95bd9b6aff2d24aac498555ff_0_0.fastq
+# -rw-r--r-- 1 kopardevn CCBR  11K Nov 19 14:20 fastq_runid_013ea2ec6aebadbd80826ad673b152e04460f452_0_0.fastq
+# -rw-r--r-- 1 kopardevn CCBR  64K Nov 19 14:20 sequencing_telemetry.js
+# -rw-r--r-- 1 kopardevn CCBR 7.4K Nov 19 14:20 guppy_basecaller_log-2020-11-19_14-20-41.log
+
+rule fastqc:
+    # Run FastQC in a single job over the gzipped FASTQ files of all samples,
+    # producing one <sample>_fastqc.zip per sample under WORKDIR/qc/fastqc.
+    input:
+        expand(join(WORKDIR,"fastqs","{sample}.fastq.gz"),sample=SAMPLES)
+    output:
+        expand(join(WORKDIR,"qc","fastqc","{sample}_fastqc.zip"),sample=SAMPLES)
+    params:
+        outdir=join(WORKDIR,"qc","fastqc")
+    threads: 16
+    envmodules: TOOLS["fastqc"]["version"]
+    log: join(WORKDIR,"logs","fastqc.log")
+    # stdout/stderr are appended to the declared log, matching the other rules
+    # in this file.
+    shell:"""
+fastqc {input} -t {threads} -o {params.outdir} 2>&1 |tee -a {log}
+"""
+
+rule pycoqc:
+    # Build the pycoQC HTML and JSON reports for one sample from the gzipped
+    # sequencing summary produced by rule guppy.
+    input:
+        # sequencing_summary=join(WORKDIR,"fastqs","{sample}.sequencing_summary.txt")
+        sequencing_summary=rules.guppy.output.sequencing_summary
+    output:
+        pycoQChtml=join(WORKDIR,"qc","pycoQC","{sample}.pycoQC.html"),
+        pycoQCjson=join(WORKDIR,"qc","pycoQC","{sample}.pycoQC.json"),
+    params:
+        # Directory that holds this rule's reports; not referenced by the
+        # shell command below.
+        outdir=join(WORKDIR,"qc","pycoQC")
+    # NOTE(review): Snakemake resolves this path relative to the file that
+    # defines the rule — confirm the environment file is reachable from
+    # workflow/rules/ (e.g. it may need to be "../envs/pycoqc.yaml").
+    conda: "envs/pycoqc.yaml"
+    container: "docker://nciccbr/ccbr_pycoqc_v2.5.0.23:latest"
+    log: join(WORKDIR,"logs","{sample}.pycoQC.log")
+    shell:"""
+pycoQC -f {input.sequencing_summary} -o {output.pycoQChtml} -j {output.pycoQCjson} 2>&1 |tee -a {log}
 """
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -1,6 +1,7 @@
 from snakemake.utils import validate
 import pandas as pd
 import yaml
+import pprint
 
 # this container defines the underlying OS for each job when using the workflow
 # with --use-conda --use-singularity
@@ -11,10 +12,10 @@ import yaml
 configfile: "config/config.yaml"
 validate(config, schema="../schemas/config.schema.yaml")
 
-samplesdf = pd.read_csv(config["samples"],sep="\t",header=0,index_col="sampleName")
-samples = list(samples.index)
+SAMPLESDF = pd.read_csv(config["samples"],sep="\t",header=0,index_col="sampleName")
+SAMPLES = list(SAMPLESDF.index)
 #now path to fast5 folder for sampleA will be sampledf["path_to_fast5_parent_folder"]["sampleA"]
-validate(samplesdf, schema="../schemas/samples.schema.yaml")
+# validate(SAMPLESDF, schema="../schemas/samples.schema.yaml")
 
 ## Load tools from TSV file
 # tools = pd.read_csv(config["tools"], sep="\t",header=0,index_col="tool")
@@ -23,7 +24,9 @@ validate(samplesdf, schema="../schemas/samples.schema.yaml")
 
 ## Load tools from YAML file
 with open(config["tools"]) as f:
-	tools = yaml.safe_load(f)
+	TOOLS = yaml.safe_load(f)
+# pprint.pprint(TOOLS)
+
 
 ## Load project metadata from TSV file
 # project = pd.read_csv(config["project"], sep="\t",header=0,index_col="key")