Skip to content

Commit

Permalink
Restructure all output files for more user convenience
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed Jul 13, 2024
1 parent ae832b7 commit c079a75
Show file tree
Hide file tree
Showing 33 changed files with 537 additions and 497 deletions.
9 changes: 5 additions & 4 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 19,11 @@ include: "rules/initialize.smk"
rule all:
input:
# Basic steps
"genotyped/all.vcf.gz",
"filtered/all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else [],
"annotated/snpeff.vcf.gz" if config["settings"]["snpeff"] else [],
"annotated/vep.vcf.gz" if config["settings"]["vep"] else [],
"mapping/final.done",
"calling/genotyped-all.vcf.gz",
"calling/filtered-all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else [],
"annotation/snpeff.vcf.gz" if config["settings"]["snpeff"] else [],
"annotation/vep.vcf.gz" if config["settings"]["vep"] else [],
# Quality control
"qc/multiqc.html",
# Reference genome statistics
Expand Down
22 changes: 11 additions & 11 deletions workflow/rules/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -64,21 64,21 @@ rule snpeff:
# (vcf, bcf, or vcf.gz)
# we use the filtered file if a filtering is done, or the unfiltered if not.
calls=(
"filtered/all.vcf.gz"
"calling/filtered-all.vcf.gz"
if not config["settings"]["filter-variants"] == "none"
else "genotyped/all.vcf.gz"
else "calling/genotyped-all.vcf.gz"
),
# path to reference db downloaded with the snpeff download wrapper above
db=get_snpeff_db_path(),
output:
# annotated calls (vcf, bcf, or vcf.gz)
calls=report("annotated/snpeff.vcf.gz", caption="../report/vcf.rst", category="Calls"),
calls=report("annotation/snpeff.vcf.gz", caption="../report/vcf.rst", category="Calls"),
# summary statistics (in HTML), optional
stats=report("annotated/snpeff.html", category="Calls"),
stats=report("annotation/snpeff.html", category="Calls"),
# summary statistics in CSV, optional
csvstats="annotated/snpeff.csv",
csvstats="annotation/snpeff.csv",
log:
"logs/snpeff.log",
"logs/annotation/snpeff.log",
group:
"snpeff"
params:
Expand Down Expand Up @@ -186,15 186,15 @@ rule vep:
input:
# we use the filtered file if a filtering is done, or the unfiltered if not.
calls=(
"filtered/all.vcf.gz"
"calling/filtered-all.vcf.gz"
if not config["settings"]["filter-variants"] == "none"
else "genotyped/all.vcf.gz"
else "calling/genotyped-all.vcf.gz"
),
cache=get_vep_cache_dir(),
plugins=get_vep_plugins_dir(),
output:
calls=report(
"annotated/vep.vcf.gz",
"annotation/vep.vcf.gz",
caption="../report/vcf.rst",
category="Calls",
),
Expand All @@ -204,7 204,7 @@ rule vep:
# MultiQC recently, see https://github.com/ewels/MultiQC/issues/1438
# Will need to update to MultiQC v1.11 at some point.
stats=report(
"annotated/vep_summary.html",
"annotation/vep_summary.html",
caption="../report/stats.rst",
category="Calls",
),
Expand All @@ -215,7 215,7 @@ rule vep:
plugins=config["params"]["vep"]["plugins"],
extra=config["params"]["vep"]["extra"],
log:
"logs/vep-annotate.log",
"logs/annotation/vep-annotate.log",
threads: 4
conda:
# Use our own env definition here, to ensure that we are working with the same vep
Expand Down
50 changes: 25 additions & 25 deletions workflow/rules/calling-bcftools-combined.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 17,33 @@ rule call_variants:
indices=get_all_bais(),
# If we use restricted regions, set them here. If not, empty, which will propagate to the
# get_mpileup_params function as well. Same for small contig groups.
# regions="called/{contig}.regions.bed" if config["settings"].get("restrict-regions") else []
# regions="calling/regions/{contig}.bed" if config["settings"].get("restrict-regions") else []
regions=(
"called/{contig}.regions.bed"
"calling/regions/{contig}.bed"
if (config["settings"].get("restrict-regions"))
else (
"contig-groups/{contig}.bed"
"calling/contig-groups/{contig}.bed"
if (config["settings"].get("contig-group-size"))
else []
)
),
output:
vcf=(
"called/{contig}.vcf.gz"
"calling/called/{contig}.vcf.gz"
if config["settings"]["keep-intermediate"]["calling"]
else temp("called/{contig}.vcf.gz")
else temp("calling/called/{contig}.vcf.gz")
),
# vcf=protected("called/{contig}.vcf.gz")
done=touch("called/{contig}.done"),
# vcf=protected("calling/called/{contig}.vcf.gz")
done=touch("calling/called/{contig}.done"),
params:
# Optional parameters for bcftools mpileup (except -g, -f).
mpileup=get_mpileup_params,
# Optional parameters for bcftools call (except -v, -o, -m).
call=config["params"]["bcftools"]["call"],
log:
"logs/bcftools/call-{contig}.log",
"logs/calling/bcftools/call-{contig}.log",
benchmark:
"benchmarks/bcftools/call-{contig}.bench.log"
"benchmarks/calling/called/bcftools/call-{contig}.log"
conda:
"../envs/bcftools.yaml"
threads: config["params"]["bcftools"]["threads"]
Expand Down Expand Up @@ -80,29 80,29 @@ rule call_variants:
# Need an input function to work with the fai checkpoint
def merge_variants_vcfs_input(wildcards):
fai = checkpoints.samtools_faidx.get().output[0]
return expand("called/{contig}.vcf.gz", contig=get_contigs(fai))
return expand("calling/called/{contig}.vcf.gz", contig=get_contigs(fai))


# Need index files for some of the downstream tools.
# Rational for the fai: see merge_variants_vcfs_input()
def merge_variants_tbis_input(wildcards):
fai = checkpoints.samtools_faidx.get().output[0]
return expand("called/{contig}.vcf.gz.tbi", contig=get_contigs(fai))
return expand("calling/called/{contig}.vcf.gz.tbi", contig=get_contigs(fai))


# bcftools does not automatically create vcf index files, so we need a rule for that...
# ... but picard does! So, if we continue using the below picard mergevcfs tool, we don't need tabix
# ...... buuuuut vcflib does not! So, we reactivate it again!
rule called_vcf_index:
input:
"called/{contig}.vcf.gz",
"calling/called/{contig}.vcf.gz",
output:
"called/{contig}.vcf.gz.tbi",
"calling/called/{contig}.vcf.gz.tbi",
params:
# pass arguments to tabix (e.g. index a vcf)
"-p vcf",
log:
"logs/tabix/called/{contig}.log",
"logs/calling/tabix/{contig}.log",
conda:
"../envs/tabix.yaml"
wrapper:
Expand All @@ -112,9 112,9 @@ rule called_vcf_index:
# Cannot use bcftools concat, as it does not except vcf/bcf files without any calls in them.
# rule merge_variants:
# input:
# calls=lambda w: expand("called/{contig}.bcf", contig=get_contigs())
# calls=lambda w: expand("calling/called/{contig}.bcf", contig=get_contigs())
# output:
# "genotyped/all.vcf.gz"
# "calling/genotyped-all.vcf.gz"
# log:
# "logs/bcftools/concat.log"
# params:
Expand All @@ -134,14 134,14 @@ rule called_vcf_index:
# ref=get_fai,
#
# # The wrapper expects input to be called `vcfs`, but we can use `vcf.gz` as well.
# # vcfs=lambda w: expand("called/{contig}.vcf.gz", contig=get_contigs())
# # vcfs=lambda w: expand("calling/called/{contig}.vcf.gz", contig=get_contigs())
# vcfs=merge_variants_vcfs_input
# output:
# vcf="genotyped/all.vcf.gz"
# vcf="calling/genotyped-all.vcf.gz"
# log:
# "logs/picard/merge-genotyped.log"
# benchmark:
# "benchmarks/picard/merge-genotyped.bench.log"
# "benchmarks/picard/merge-genotyped.log"
# conda:
# "../envs/picard.yaml"
# wrapper:
Expand All @@ -164,19 164,19 @@ rule merge_variants:
# Unfortunately, we cannot pipe here, as Picard fails with that, so temp file it is...
# If we do not use small contigs, we directly output the final file.
vcf=(
temp("genotyped/merged-all.vcf.gz")
temp("calling/called/merged-all.vcf.gz")
if (config["settings"].get("contig-group-size"))
else "genotyped/all.vcf.gz"
else "calling/genotyped-all.vcf.gz"
),
done=(
touch("genotyped/merged-all.done")
touch("calling/called/merged-all.done")
if (config["settings"].get("contig-group-size"))
else touch("genotyped/all.done")
else touch("calling/genotyped-all.done")
),
log:
"logs/vcflib/merge-genotyped.log",
"logs/calling/vcflib/merge-genotyped.log",
benchmark:
"benchmarks/vcflib/merge-genotyped.bench.log"
"benchmarks/calling/genotyped/vcflib/merge-genotyped.log"
conda:
"../envs/vcflib.yaml"
shell:
Expand Down
67 changes: 34 additions & 33 deletions workflow/rules/calling-bcftools-individual.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 16,33 @@ rule call_variants:
indices=get_sample_bais_wildcards,
# If we use restricted regions, set them here. If not, empty, which will propagate to the
# get_mpileup_params function as well. Same for small contig groups.
# regions="called/{contig}.regions.bed" if config["settings"].get("restrict-regions") else []
# regions="calling/regions/{contig}.bed" if config["settings"].get("restrict-regions") else []
regions=(
"called/{contig}.regions.bed"
"calling/regions/{contig}.bed"
if (config["settings"].get("restrict-regions"))
else (
"contig-groups/{contig}.bed"
"calling/contig-groups/{contig}.bed"
if (config["settings"].get("contig-group-size"))
else []
)
),
output:
gvcf=(
"called/{sample}.{contig}.g.vcf.gz"
"calling/called/{sample}.{contig}.g.vcf.gz"
if config["settings"]["keep-intermediate"]["calling"]
else temp("called/{sample}.{contig}.g.vcf.gz")
else temp("calling/called/{sample}.{contig}.g.vcf.gz")
),
gtbi="called/{sample}.{contig}.g.vcf.gz.tbi",
done=touch("called/{sample}.{contig}.g.done"),
gtbi="calling/called/{sample}.{contig}.g.vcf.gz.tbi",
done=touch("calling/called/{sample}.{contig}.g.done"),
params:
# Optional parameters for bcftools mpileup (except -g, -f).
mpileup=get_mpileup_params,
# Optional parameters for bcftools call (except -v, -o, -m).
call=config["params"]["bcftools"]["call"],
log:
"logs/bcftools/call-{sample}.{contig}.log",
"logs/calling/bcftools/call-{sample}.{contig}.log",
benchmark:
"benchmarks/bcftools/call-{sample}.{contig}.bench.log"
"benchmarks/calling/called/bcftools/call-{sample}.{contig}.log"
conda:
"../envs/bcftools.yaml"
threads: config["params"]["bcftools"]["threads"]
Expand Down Expand Up @@ -83,21 83,22 @@ rule combine_contig:
ext=["amb", "ann", "bwt", "pac", "sa", "fai"],
),
# Get the sample data, including indices, which are produced above.
gvcfs=(
expand("called/{sample}.{{contig}}.g.vcf.gz", sample=config["global"]["sample-names"])
gvcfs=expand(
"calling/called/{sample}.{{contig}}.g.vcf.gz", sample=config["global"]["sample-names"]
),
indices=expand(
"called/{sample}.{{contig}}.g.vcf.gz.tbi", sample=config["global"]["sample-names"]
"calling/called/{sample}.{{contig}}.g.vcf.gz.tbi",
sample=config["global"]["sample-names"],
),
output:
gvcf="called/all.{contig}.g.vcf.gz",
gtbi="called/all.{contig}.g.vcf.gz.tbi",
gvcflist="called/all.{contig}.g.txt",
done=touch("called/all.{contig}.g.done"),
gvcf="calling/called/all.{contig}.g.vcf.gz",
gtbi="calling/called/all.{contig}.g.vcf.gz.tbi",
gvcflist="calling/called/all.{contig}.g.txt",
done=touch("calling/called/all.{contig}.g.done"),
log:
"logs/bcftools/combine-contig-{contig}.log",
"logs/calling/bcftools/combine-contig-{contig}.log",
benchmark:
"benchmarks/bcftools/combine-contig-{contig}.bench.log"
"benchmarks/calling/called/bcftools/combine-contig-{contig}.log"
conda:
"../envs/bcftools.yaml"
shell:
Expand Down Expand Up @@ -127,7 128,7 @@ rule combine_contig:
# config file, this is the list of the group names.
def combined_contig_gvcfs(wildcards):
fai = checkpoints.samtools_faidx.get().output[0]
return expand("called/all.{contig}.g.vcf.gz", contig=get_contigs(fai))
return expand("calling/called/all.{contig}.g.vcf.gz", contig=get_contigs(fai))


# We also need a comma-separated list of the contigs, so that bcftools can output
Expand All @@ -154,37 155,37 @@ rule combine_all:
contig_groups=contigs_groups_input,
gvcfs=combined_contig_gvcfs,
output:
# vcf="genotyped/all.vcf.gz",
# tbi="genotyped/all.vcf.gz.tbi",
# lst="genotyped/all.txt",
# done="genotyped/all.done"
# vcf="calling/genotyped-all.vcf.gz",
# tbi="calling/genotyped/all.vcf.gz.tbi",
# lst="calling/genotyped/all.txt",
# done="calling/genotyped-all.done"
vcf=(
temp("genotyped/merged-all.vcf.gz")
temp("calling/called/merged-all.vcf.gz")
if (config["settings"].get("contig-group-size"))
else "genotyped/all.vcf.gz"
else "calling/genotyped-all.vcf.gz"
),
tbi=(
temp("genotyped/merged-all.vcf.gz.tbi")
temp("calling/called/merged-all.vcf.gz.tbi")
if (config["settings"].get("contig-group-size"))
else "genotyped/all.vcf.gz.tbi"
else "calling/genotyped-all.vcf.gz.tbi"
),
lst=(
temp("genotyped/merged-all.txt")
temp("calling/called/merged-all.txt")
if (config["settings"].get("contig-group-size"))
else "genotyped/all.txt"
else "calling/genotyped-all.txt"
),
done=(
touch("genotyped/merged-all.done")
touch("calling/called/merged-all.done")
if (config["settings"].get("contig-group-size"))
else touch("genotyped/all.done")
else touch("calling/genotyped-all.done")
),
params:
# Use a list of the chromosomes in the same order as the fai, for bcftools to sort the output.
regionorder=combined_contig_order,
log:
"logs/bcftools/combine-all.log",
"logs/calling/bcftools/combine-all.log",
benchmark:
"benchmarks/bcftools/combine-all.bench.log"
"benchmarks/calling/genotyped/bcftools/combine-all.log"
conda:
"../envs/bcftools.yaml"
shell:
Expand Down
10 changes: 5 additions & 5 deletions workflow/rules/calling-bcftools.smk
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 63,11 @@ if config["settings"].get("contig-group-size"):

rule sort_variants:
input:
vcf="genotyped/merged-all.vcf.gz",
vcf="calling/called/merged-all.vcf.gz",
refdict=genome_dict(),
output:
vcf="genotyped/all.vcf.gz",
done=touch("genotyped/all.done"),
vcf="calling/genotyped-all.vcf.gz",
done=touch("calling/genotyped-all.done"),
params:
# See duplicates-picard.smk for the reason whe need this on MacOS.
extra=(
Expand All @@ -77,9 77,9 @@ if config["settings"].get("contig-group-size"):
),
java_opts=config["params"]["picard"]["SortVcf-java-opts"],
log:
"logs/picard/sort-genotyped.log",
"logs/calling/picard/sort-genotyped.log",
benchmark:
"benchmarks/picard/sort-genotyped.bench.log"
"benchmarks/calling/genotyped/picard/sort-genotyped.log"
conda:
"../envs/picard.yaml"
shell:
Expand Down
Loading

0 comments on commit c079a75

Please sign in to comment.