this is a follow-up question to this one: snakemake: Missing input files for rule all
I have changed my pipeline for variant calling; it now handles only the files for one single patient. It is supposed to find all samples in that patient's folder, produce the relevant files and then run a variant caller. The samples_dict looks like this (the outer keys are the sample indices according to the date):
{'0': {'date': '2019-09-23', 'id': 'DXcf196139_01'},
'1': {'date': '2019-10-28', 'id': 'DXcf197130_01'},
'2': {'date': '2020-01-28', 'id': 'DXcf200691_01'},
...}
The tumor_dict looks like this (keys: sample_ids, values: tumor file names):
{'DXcf204265_01': '204194 ',
'DXcf203641_01': '204194 ',
'DXcf205934_01': '204194 ',
...}
I am getting a
WildcardError in line 36 of /mnt/storage1/users/ahwacko1/patient_data_chris/pati
ent_data/patient_12/scripts/Snakefile:
No values given for wildcard 'samples_dict'.
File "/mnt/storage1/users/ahwacko1/patient_data_chris/patient_data/patient_12/
scripts/Snakefile", line 36, in <module>
I don't know how to indicate I want to access the variable samples_dict in the expand without iterating, as the iteration has to go over its keys. I tried to put {{samples_dict}} in rule all into double curly brackets to not expand it, but that just resulted in a
ValueError in line 34 of /mnt/storage1/users/ahwacko1/patient_data_chris/patient_data/patient_12/scripts/Snakefile:
Single '}' encountered in format string
even though I checked multiple times that no single brackets were in the code... The only bigger pipeline I wrote before created all files newly and produced various combinations of the different wildcards; in this pipeline, however, I have to process already present data, and I am struggling to write a pipeline which handles the different wildcards that these data provide (i.e. there are no recombinations, just different dates and IDs). I was starting to consider just writing everything in a python script considering that the expand cannot create recombinations but has to just iterate over the already existing samples.
Thanks!
import os
from sort_files import read_table
import subprocess
configfile: 'config_analyze.yaml'
folder = os.path.dirname(os.getcwd())
#get info of samples_dict in this folder into a nested dict: {"sample_index": {"sample_date": "2019-09-23", "sample_id": "DXcf196139_01"}, ...}
samples_dict = sorted([sample for sample in os.listdir(folder) if "sample_" in sample], key=lambda name: name.split("_")[1])
samples_dict = {sample.split("_")[1]: {'date': sample.split("_")[3], 'id': sample.split("_")[4]+"_"+sample.split("_")[5]} for sample in samples_dict}
#dict with names of tumor files for monitoring
tumor_dict = read_table(os.path.join(os.path.join(folder, "../../"), "table.xlsx"))[2]
#combine multi lane FASTQs into one pair
def combine_fastqs(sample, strand):
path = os.path.join(folder, sample)
cat_file = os.path.join(path, strand+".fastq.gz")
if strand+".fastq.gz" not in os.listdir(path):
fastqs = []
for file in os.listdir(path):
if file.endswith("fastq.gz"):
if (strand == "forward" and "_R1_" in file) or (strand == "reverse" and "_R2_" in file):
fastqs.append(os.path.join(path, file))
fastqs = sorted(fastqs)
command = "zcat "
for forward in fastqs:
command += (forward+" ")
command += "> " + cat_file
ret = subprocess.run(command, capture_output=True, shell=True)
return cat_file
#run all the rest
rule all:
input:
expand("../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/Var/{samples_dict[index]['id']}_monitoring.vcf", index=samples_dict.keys())
#map, sort and index reads
rule map_sort_index:
input:
f = lambda wildcards: combine_fastqs("sample_"+{wildcards.sample}+"_date_"+{wildcards.date}+"_"+{wildcards.id}, "forward"),
r = lambda wildcards: combine_fastqs("sample_"+{wildcards.sample}+"_date_"+{wildcards.date}+"_"+{wildcards.id}, "reverse")
output:
bam = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{samples_dict[index]['id']}_before_dedup.bam",
bai = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{samples_dict[index]['id']}_before_dedup.bam.bai"
threads:
config['threads']
params:
ref = config['reference']
shell:
"""
bwa mem -t {threads} {params.ref} {input.f} {input.r} | samtools sort -@{threads} -o {output.bam} -
samtools index {output.bam}
"""
#dedup reads
rule dedup:
input:
bam = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{samples_dict[index]['id']}_before_dedup.bam"
output:
bam = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{samples_dict[index]['id']}.bam",
bai = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{samples_dict[index]['id']}.bam.bai"
params:
dedup = config['dedup']
shell:
"""
{params.dedup} --infile {input.bam} --outfile {output.deduped}
samtools index {output.bam}
"""
#generate chromsizes for varcalling
rule make_chromsizes:
input:
reference = config['reference']
output:
reference_fai = config['reference_faidx'],
chromsizes = config['chromsizes']
shell:
"""
samtools faidx {input.reference}
cut -f 1,2 {output.reference_fai} > {output.chromsizes}
"""
#run varcalling
rule var_call:
input:
chromsizes = config['chromsizes'],
moni = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{tumor_dict[{samples_dict[index]['id']}]}.bed",
bam = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{samples_dict[index]['id']}.bam",
bai = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{samples_dict[index]['id']}.bam.bai"
output:
folder = directory("../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/var"),
bed = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/{tumor_dict[{sample[index]['id']}]}_extended_60.bed",
vcf = "../sample_{samples_dict[index]}_date_{samples_dict[index]['date']}_{samples_dict[index]['id']}/Var/{samples_dict[index]['id']}_monitoring.vcf"
params:
var = config['var'],
reference = config['reference'],
shell:
"""
bedtools slop -i {input.moni} -g {input.chromsizes} -b 60 > {output.bed}
{params.var} -tbam {input.bam} -r {params.reference} -o {output.folder} -m {input.moni} -b {output.bed}
"""