Skip to content

Commit 1d73386

Browse files
committed
Updates to basecalling
1 parent 84c9db4 commit 1d73386

File tree

3 files changed

+87
-38
lines changed

3 files changed

+87
-38
lines changed

workflow/Snakefile

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,24 @@ import os, sys
66

77
# Global workflow variables
88
configfile: 'config/config.yaml' # config dictionary
9-
#common.smk already defines
10-
#samples, samplesdf
11-
#tools
12-
#project
9+
10+
WORKDIR = config["workdir"]
11+
# FLOWCELL = config["flowcell"]
12+
# KIT = config["kit"]
13+
14+
# common.smk already defines
15+
# SAMPLES
16+
# SAMPLESDF
17+
# TOOLS
1318

1419
include: "rules/common.smk"
1520
include: "rules/other.smk"
1621

1722
# convert = {'Y': False, 'N': True}
1823
if config['basecalling'].startswith("Y"):
19-
basecalling = True
24+
basecalling = True
2025
else:
21-
basecalling = False
22-
23-
# The main entry point of your workflow.
24-
# After configuring, running snakemake -n in a clone of this repository should successfully execute a dry-run of the workflow.
26+
basecalling = False
2527

2628

2729
report: "report/workflow.rst"
@@ -30,12 +32,11 @@ report: "report/workflow.rst"
3032
# singularity: "docker://continuumio/miniconda3"
3133

3234

33-
rule all:
34-
input:
35-
expand(join(workdir,fastqs,"{sample}.fastq.gz"), sample=samples)
36-
37-
3835
if basecalling:
39-
include: "rules/basecalling.smk"
40-
41-
36+
include: "rules/basecalling.smk"
37+
rule all:
38+
input:
39+
expand(join(WORKDIR,"fastqs","{sample}.fastq.gz"), sample=SAMPLES),
40+
expand(join(WORKDIR,"fastqs","{sample}.sequencing_summary.txt.gz"),sample=SAMPLES),
41+
expand(join(WORKDIR,"qc","fastqc","{sample}_fastqc.zip"),sample=SAMPLES),
42+
expand(join(WORKDIR,"qc","pycoQC","{sample}.pycoQC.html"),sample=SAMPLES)

workflow/rules/basecalling.smk

Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,67 @@
1+
def get_fast5_path(wildcards):
    """Input function for rule guppy: fast5 parent folder of one sample.

    Selects the ``path_to_fast5_parent_folder`` column of the SAMPLESDF
    table and returns the entry stored under the ``sample`` wildcard.
    """
    fast5_folders = SAMPLESDF["path_to_fast5_parent_folder"]
    return fast5_folders[wildcards.sample]
3+
4+
15
rule guppy:
    # Basecall one sample's fast5 folder with guppy, then collect the results:
    #   * every *.fastq written under the scratch save path is concatenated
    #     into a single gzipped FASTQ, and
    #   * guppy's sequencing_summary.txt is compressed and copied next to it.
    # Messages from every step are appended to the per-sample log.
    input:
        fast5path=get_fast5_path
    output:
        outfastq=join(WORKDIR,"fastqs","{sample}.fastq.gz"),
        sequencing_summary=join(WORKDIR,"fastqs","{sample}.sequencing_summary.txt.gz")
    params:
        flowcell = config["flowcell"],
        kit = config["kit"],
        sample = "{sample}"
    envmodules: TOOLS["guppy"]["version"], TOOLS["pigz"]["version"]
    log: join(WORKDIR,"logs","{sample}.guppy.log")
    # Logging notes for the shell block below:
    #   * find/gzip append only their stderr to the log (2>>). Their stdout is
    #     the FASTQ payload, so "2>&1" after the ">" redirection would write
    #     error text into the .fastq.gz instead of the log.
    #   * pigz and cp run as separate commands so that both are logged; with
    #     "pigz ... && cp ... 2>&1 | tee" only cp was piped into tee.
    shell:"""
guppy_basecaller --print_workflows 2>&1 | tee -a {log}
guppy_basecaller \
    --input_path {input.fast5path} \
    --recursive \
    --flowcell {params.flowcell} \
    --kit {params.kit} \
    -x cuda:all \
    --records_per_fastq 0 \
    --save_path /lscratch/$SLURM_JOBID/{params.sample} 2>&1 | tee -a {log}
find /lscratch/$SLURM_JOBID/{params.sample} -name "*.fastq" -exec cat {{}} \; 2>> {log} \
    | gzip -n - 2>> {log} > {output.outfastq}
pigz -p $(nproc) /lscratch/$SLURM_JOBID/{params.sample}/sequencing_summary.txt 2>&1 | tee -a {log}
cp /lscratch/$SLURM_JOBID/{params.sample}/sequencing_summary.txt.gz {output.sequencing_summary} 2>&1 | tee -a {log}
"""
31+
## Files created by guppy look like this:
32+
# -rw-r--r-- 1 kopardevn CCBR 3.0K Nov 19 14:20 sequencing_summary.txt
33+
# -rw-r--r-- 1 kopardevn CCBR 3.7K Nov 19 14:20 fastq_runid_d531634aaf7cba4fd8f7a1fba1d8ed9f0f81be2a_0_0.fastq
34+
# -rw-r--r-- 1 kopardevn CCBR 3.3K Nov 19 14:20 fastq_runid_84d34f66eed213a95bd9b6aff2d24aac498555ff_0_0.fastq
35+
# -rw-r--r-- 1 kopardevn CCBR 11K Nov 19 14:20 fastq_runid_013ea2ec6aebadbd80826ad673b152e04460f452_0_0.fastq
36+
# -rw-r--r-- 1 kopardevn CCBR 64K Nov 19 14:20 sequencing_telemetry.js
37+
# -rw-r--r-- 1 kopardevn CCBR 7.4K Nov 19 14:20 guppy_basecaller_log-2020-11-19_14-20-41.log
38+
39+
rule fastqc:
    # Run FastQC in a single job over the gzipped FASTQ files of all samples.
    # Input and output are both expanded over SAMPLES, so this rule has no
    # wildcards and writes one shared log file.
    input:
        expand(join(WORKDIR,"fastqs","{sample}.fastq.gz"),sample=SAMPLES)
    output:
        expand(join(WORKDIR,"qc","fastqc","{sample}_fastqc.zip"),sample=SAMPLES)
    params:
        outdir=join(WORKDIR,"qc","fastqc")
    threads: 16
    envmodules: TOOLS["fastqc"]["version"]
    log: join(WORKDIR,"logs","fastqc.log")
    # The declared log was never written to; capture FastQC's messages the same
    # way the guppy and pycoqc rules do.
    shell:"""
fastqc {input} -t {threads} -o {params.outdir} 2>&1 | tee -a {log}
"""
52+
53+
rule pycoqc:
    # Build the per-sample pycoQC report (HTML plus JSON) from the gzipped
    # sequencing summary produced by rule guppy.
    input:
        # Reuse guppy's declared output so the two rules cannot drift apart.
        sequencing_summary=rules.guppy.output.sequencing_summary
    output:
        pycoQChtml=join(WORKDIR,"qc","pycoQC","{sample}.pycoQC.html"),
        pycoQCjson=join(WORKDIR,"qc","pycoQC","{sample}.pycoQC.json"),
    params:
        # Was join(WORKDIR,"qc","fastqc"), copied from rule fastqc; the reports
        # of this rule are written under qc/pycoQC. Not used by the shell
        # command below.
        outdir=join(WORKDIR,"qc","pycoQC")
    # NOTE(review): Snakemake resolves this path relative to the file that
    # defines the rule -- confirm the env file lives at rules/envs/pycoqc.yaml.
    conda: "envs/pycoqc.yaml"
    container: "docker://nciccbr/ccbr_pycoqc_v2.5.0.23:latest"
    log: join(WORKDIR,"logs","{sample}.pycoQC.log")
    shell:"""
pycoQC -f {input.sequencing_summary} -o {output.pycoQChtml} -j {output.pycoQCjson} 2>&1 | tee -a {log}
"""

workflow/rules/common.smk

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from snakemake.utils import validate
22
import pandas as pd
33
import yaml
4+
import pprint
45

56
# this container defines the underlying OS for each job when using the workflow
67
# with --use-conda --use-singularity
@@ -11,10 +12,10 @@ import yaml
1112
configfile: "config/config.yaml"
1213
validate(config, schema="../schemas/config.schema.yaml")
1314

14-
samplesdf = pd.read_csv(config["samples"],sep="\t",header=0,index_col="sampleName")
15-
samples = list(samples.index)
15+
SAMPLESDF = pd.read_csv(config["samples"],sep="\t",header=0,index_col="sampleName")
16+
SAMPLES = list(SAMPLESDF.index)
1617
# The path to the fast5 folder for sampleA is now SAMPLESDF["path_to_fast5_parent_folder"]["sampleA"]
17-
validate(samplesdf, schema="../schemas/samples.schema.yaml")
18+
# validate(SAMPLESDF, schema="../schemas/samples.schema.yaml")
1819

1920
## Load tools from TSV file
2021
# tools = pd.read_csv(config["tools"], sep="\t",header=0,index_col="tool")
@@ -23,7 +24,9 @@ validate(samplesdf, schema="../schemas/samples.schema.yaml")
2324

2425
## Load tools from YAML file
2526
with open(config["tools"]) as f:
26-
tools = yaml.safe_load(f)
27+
TOOLS = yaml.safe_load(f)
28+
# pprint.pprint(TOOLS)
29+
2730

2831
## Load project metadata from TSV file
2932
# project = pd.read_csv(config["project"], sep="\t",header=0,index_col="key")

0 commit comments

Comments
 (0)