import os
# Sample identifiers processed by the workflow; each one becomes a value of
# the {sample} wildcard in the rules below.
SAMPLES = """
    HG00438 HG00621 HG00673 HG00735 HG00741
    HG01071 HG01106 HG01123 HG01175 HG01258
    HG01358 HG01361 HG01891 HG01928 HG01952
    HG01978 HG02148 HG02257 HG02486 HG02559
    HG02572 HG02622 HG02630 HG02717 HG02886
    HG03453 HG03516 HG03540 HG03579
""".split()
# Working directories, all siblings under the same pan_genome data folder:
# raw inputs, intermediate jellyfish/k-mer files, stats reports, final outputs.
input_dir, kmer_dir, stats_dir, output_dir = (
    f"/gpfs/home/hce24xau/scratch/gen_kmers/data/pan_genome/{subdir}"
    for subdir in ("input_files", "kmers_files", "stats_files", "output_files")
)
# Default target: ask for the final gzipped k-mer list and the combined stats
# report of every sample in SAMPLES.
rule all:
    input:
        expand(output_dir + "/{sample}.merged.kmers.txt.gz", sample=SAMPLES),
        expand(stats_dir + "/{sample}.stats.txt", sample=SAMPLES),
# Count canonical 31-mers separately for the maternal and paternal assemblies
# of one sample, and write the `jellyfish stats` of both databases to a single
# per-sample file (maternal first, then paternal).
rule count_kmers:
    input:
        maternal=f"{input_dir}/{{sample}}.maternal.fa.gz",
        paternal=f"{input_dir}/{{sample}}.paternal.fa.gz",
    output:
        maternal_jf=f"{kmer_dir}/{{sample}}.maternal.jf",
        paternal_jf=f"{kmer_dir}/{{sample}}.paternal.jf",
        # Declared as an output so that merge_jf can take it as an input.
        dump_stats=f"{stats_dir}/{{sample}}.dump_stats.txt",
    resources:
        time=config["count_kmers"]["time"],      # wall-clock limit from config.yaml
        mem_mb=config["count_kmers"]["mem_mb"],  # memory from config.yaml
    threads: config["count_kmers"]["threads"]
    shell:
        # -t {threads} keeps jellyfish within the thread count requested above
        # instead of a hard-coded 48. The <(zcat ...) process substitution
        # streams the gzipped FASTA without writing a decompressed copy.
        """
        jellyfish count -m 31 -s 100M -t {threads} -C -o {output.maternal_jf} <(zcat {input.maternal})
        jellyfish count -m 31 -s 100M -t {threads} -C -o {output.paternal_jf} <(zcat {input.paternal})
        jellyfish stats {output.maternal_jf} > {output.dump_stats}   # maternal stats start the file
        jellyfish stats {output.paternal_jf} >> {output.dump_stats}  # paternal stats are appended
        """
# Merge the maternal and paternal jellyfish databases of one sample into a
# single database and build the final stats report: stats of the merged
# database first, followed by the per-haplotype stats from count_kmers.
rule merge_jf:
    input:
        maternal_jf=f"{kmer_dir}/{{sample}}.maternal.jf",
        paternal_jf=f"{kmer_dir}/{{sample}}.paternal.jf",
        dump_stats=f"{stats_dir}/{{sample}}.dump_stats.txt",
    output:
        merged_jf=f"{kmer_dir}/{{sample}}.merged.jf",
        combined_stats=f"{stats_dir}/{{sample}}.stats.txt",
    resources:
        time=config["merge_jf"]["time"],      # wall-clock limit from config.yaml
        mem_mb=config["merge_jf"]["mem_mb"],  # memory from config.yaml
    # `threads` is a rule directive of its own (as in count_kmers), not an
    # entry of `resources`.
    threads: config["merge_jf"]["threads"]
    shell:
        # The last command deletes this rule's own inputs to free scratch
        # space once the merged database and the report have been written.
        """
        jellyfish merge {input.maternal_jf} {input.paternal_jf} -o {output.merged_jf}
        jellyfish stats {output.merged_jf} > {output.combined_stats}
        cat {input.dump_stats} >> {output.combined_stats}
        rm {input.maternal_jf} {input.paternal_jf} {input.dump_stats}
        """
# Dump the merged jellyfish database of one sample to text, sorted and
# de-duplicated on the first column, then delete the database.
rule kmer_dump:
    input:
        merged_jf=f"{kmer_dir}/{{sample}}.merged.jf",
    output:
        merged_dump=f"{kmer_dir}/{{sample}}.merged.txt",
    resources:
        time=config["kmer_dump"]["time"],      # wall-clock limit from config.yaml
        mem_mb=config["kmer_dump"]["mem_mb"],  # memory from config.yaml
    # `threads` is a rule directive of its own (as in count_kmers), not an
    # entry of `resources`.
    threads: config["kmer_dump"]["threads"]
    shell:
        # `sort -k1,1 -u` keeps one line per distinct first field. The input
        # database is removed afterwards to free scratch space.
        """
        jellyfish dump -c {input.merged_jf} | sort -k1,1 -u > {output.merged_dump}
        rm {input.merged_jf}
        """
# Keep only the first space-separated column of the text dump, gzip it into
# the final output directory, then delete the text dump.
rule kmer_list_only:
    input:
        merged_dump=f"{kmer_dir}/{{sample}}.merged.txt",
    output:
        merged_kmers=f"{output_dir}/{{sample}}.merged.kmers.txt.gz",
    resources:
        time=config["kmer_list_only"]["time"],      # wall-clock limit from config.yaml
        mem_mb=config["kmer_list_only"]["mem_mb"],  # memory from config.yaml
    # `threads` is a rule directive of its own (as in count_kmers), not an
    # entry of `resources`.
    threads: config["kmer_list_only"]["threads"]
    shell:
        # The input dump is removed afterwards to free scratch space.
        """
        cut -d ' ' -f 1 {input.merged_dump} | gzip > {output.merged_kmers}
        rm {input.merged_dump}
        """
And this is the script I am running, what am I doing wrong?
config.yaml:
# Per-rule settings; the Snakefile reads them as config["<rule>"]["<key>"].
# Load this file with `--configfile <path>`: the `--config` option only accepts
# KEY=VALUE pairs typed on the command line, not a file path.
# YAML mappings use `key: value`, never `key=value`.
# Times are quoted so they stay strings; an unquoted 12:00:00 is read as a
# base-60 integer (43200) by YAML 1.1 loaders.
__default__:
  time: "08:00:00"  # Default time limit for all jobs
  mem_mb: 200000    # Default memory (in MB) for all jobs
  threads: 24       # Default number of threads for all jobs
count_kmers:
  time: "08:00:00"  # Time limit for the count_kmers rule
  mem_mb: 200000    # Memory for the count_kmers rule
  threads: 24       # Threads for the count_kmers rule
merge_jf:
  time: "08:00:00"  # Time limit for the merge_jf rule
  mem_mb: 200000    # Memory for the merge_jf rule
  threads: 24       # Threads for the merge_jf rule
kmer_dump:
  time: "12:00:00"  # Time limit for the kmer_dump rule
  mem_mb: 400000    # Memory for the kmer_dump rule (400GB)
  threads: 40       # Threads for the kmer_dump rule
kmer_list_only:
  time: "12:00:00"  # Time limit for the kmer_list_only rule
  mem_mb: 200000    # Memory for the kmer_list_only rule
  threads: 24       # Threads for the kmer_list_only rule
When I run snakemake -s HPRC_kmer_final.smk --config /gpfs/home/hce24xau/.config/snakemake/slurm/config.yaml -np
I get the error: Invalid config definition: Config entries have to be defined as name=value pairs.
I've been learning Snakemake as the first part of my PhD, and the tutorial I have been following led me to this, but I can't seem to work it out. I've been working on this for days; please, someone help.
error: i get the error: Invalid config definition: Config entries have to be defined as name=value pairs.
…what exactly is the error?
When I run `snakemake -s HPRC_kmer_final.smk --config /gpfs/home/hce24xau/.config/snakemake/slurm/config.yaml -np` (HPRC_kmer_final.smk is the Snakefile),
i get the error: Invalid config definition: Config entries have to be defined as name=value pairs.
sorry i forgot to put it in.
Correct the `=` in your config, i.e. change `time="08:00:00"` to `time: 08:00:00`.
still the same errors :(