Example Snakefile¶
#Data downloaded from: https://osf.io/vzfc6/
# Default rule to run entire workflow, only works once inputs/outputs correctly filled in all rules
rule all:
input: "variants.vcf"
# Download raw sequence data
rule download_data:
output: "SRR2584857_1.fastq.gz"
shell:
"wget https://osf.io/4rdza/download -O {output}"
# Download reference genome
rule download_genome:
output: "ecoli-rel606.fa.gz"
shell:
"wget https://osf.io/8sm92/download -O {output}"
# The .gz file must be uncompressed before it can be accessed
rule uncompress_genome:
input: "ecoli-rel606.fa.gz"
output: "ecoli-rel606.fa"
shell:
"gunzip -c {input} > {output}"
# Create an index for the reference genome for bwa
rule index_genome_bwa:
input: "ecoli-rel606.fa"
output:
expand("ecoli-rel606.fa.{ext}", ext=['sa', 'amb', 'ann', 'pac', 'bwt'])
shell:
"bwa index {input}"
# Map the raw reads to the reference genome
rule map_reads:
input:
genome = "ecoli-rel606.fa",
reads = "SRR2584857_1.fastq.gz",
idxfile = expand("ecoli-rel606.fa.{ext}", ext=['sa', 'amb', 'ann', 'pac', 'bwt'])
output: "SRR2584857.sam"
shell:
"bwa mem -t 4 {input.genome} {input.reads} > {output}"
# Create an index for the reference genome fasta file for samtools
# Note: this indexing step is different and unrelated to the one above for bwa
rule index_genome_samtools:
input: "ecoli-rel606.fa"
output: "ecoli-rel606.fa.fai"
shell:
"samtools faidx {input}"
# Convert .sam to .bam file
rule samtools_import:
input:
index="ecoli-rel606.fa.fai",
samfile="SRR2584857.sam"
output:"SRR2584857.bam"
shell:
"""
samtools view -bt {input.index} -o {output} {input.samfile}
"""
# original command with samtools v1.9
## samtools import {input.index} {input.samfile} {output}
# but it gave segmentation fault error with samtools v1.10, so here we use samtools view instead
# Sort the bam alignment file
rule samtools_sort:
input: "SRR2584857.bam"
output: "SRR2584857.sorted.bam"
shell:
"samtools sort {input} -o {output}"
# Create an index for the sorted bam file
# again, this is different from the above indexing steps
rule samtools_index_sorted:
input: "SRR2584857.sorted.bam"
output: "SRR2584857.sorted.bam.bai"
shell: "samtools index {input}"
# Generate pileup file with samtools, then call variants with bcftools
# From samtools doc: 'Pileup format consists of TAB-separated lines, with each line representing the pileup of reads at a single genomic position.'
rule samtools_mpileup:
input:
index="ecoli-rel606.fa",
sorted="SRR2584857.sorted.bam",
sorted_bai="SRR2584857.sorted.bam.bai"
output:
"variants.raw.bcf"
shell:
"""
samtools mpileup -u -t DP -f {input.index} {input.sorted} | \
bcftools call -mv -Ob -o - > {output}
"""
# Convert bcf to vcf file so we can look at the resulting mappings
rule make_vcf:
input: "variants.raw.bcf"
output: "variants.vcf"
shell: "bcftools view {input} > {output}"
# at end, run this command in the terminal to view the mapping:
## samtools tview -p ecoli:4202391 SRR2584857.sorted.bam ecoli-rel606.fa
Last update: April 2, 2021