import os
# Sample identifiers handled by the workflow; `rule all` expands one set of
# target files per entry.
SAMPLES = """
    HG00438 HG00621 HG00673 HG00735 HG00741
    HG01071 HG01106 HG01123 HG01175 HG01258
    HG01358 HG01361 HG01891 HG01928 HG01952
    HG01978 HG02148 HG02257 HG02486 HG02559
    HG02572 HG02622 HG02630 HG02717 HG02886
    HG03453 HG03516 HG03540 HG03579
""".split()

# Define directories
# Every workflow directory lives under the same pan-genome data root.
_pan_genome_root = "/gpfs/home/hce24xau/scratch/gen_kmers/data/pan_genome"
input_dir = f"{_pan_genome_root}/input_files"    # per-sample maternal/paternal FASTA inputs
kmer_dir = f"{_pan_genome_root}/kmers_files"     # intermediate Jellyfish databases and dumps
stats_dir = f"{_pan_genome_root}/stats_files"    # per-sample `jellyfish stats` reports
output_dir = f"{_pan_genome_root}/output_files"  # final gzipped k-mer lists
# Default target: request, for every sample in SAMPLES, the final gzipped
# k-mer list and the combined stats report. Entries under `input:` must be
# indented deeper than the directive itself for Snakemake to parse them.
rule all:
    input:
        expand(f"{output_dir}/{{sample}}.merged.kmers.txt.gz", sample=SAMPLES),
        expand(f"{stats_dir}/{{sample}}.stats.txt", sample=SAMPLES),
# Count canonical (-C) 31-mers (-m 31) separately for the maternal and paternal
# FASTA of one sample, then write the `jellyfish stats` report of both
# databases into a single per-sample dump_stats file.
rule count_kmers:
    input:
        maternal=f"{input_dir}/{{sample}}.maternal.fa.gz",
        paternal=f"{input_dir}/{{sample}}.paternal.fa.gz",
    output:
        maternal_jf=f"{kmer_dir}/{{sample}}.maternal.jf",
        paternal_jf=f"{kmer_dir}/{{sample}}.paternal.jf",
        # Consumed (and later removed) by merge_jf.
        dump_stats=f"{stats_dir}/{{sample}}.dump_stats.txt",
    resources:
        time=config["count_kmers"]["time"],      # Time limit from config.yaml
        mem_mb=config["count_kmers"]["mem_mb"],  # Memory (MB) from config.yaml
    threads: config["count_kmers"]["threads"]
    # jellyfish is given the rule's own thread count instead of a hard-coded
    # 48, so it never runs more threads than the job actually requested.
    # The gzipped FASTA is decompressed on the fly via process substitution.
    shell:
        """
        jellyfish count -m 31 -s 100M -t {threads} -C -o {output.maternal_jf} <(zcat {input.maternal})
        jellyfish count -m 31 -s 100M -t {threads} -C -o {output.paternal_jf} <(zcat {input.paternal})
        # Write maternal stats to the dump_stats file
        jellyfish stats {output.maternal_jf} > {output.dump_stats}
        # Append paternal stats to the same file
        jellyfish stats {output.paternal_jf} >> {output.dump_stats}
        """
# Merge the maternal and paternal Jellyfish databases of one sample into a
# single database and build the combined stats report: stats of the merged
# database first, followed by the per-haplotype stats from count_kmers.
rule merge_jf:
    input:
        maternal_jf=f"{kmer_dir}/{{sample}}.maternal.jf",
        paternal_jf=f"{kmer_dir}/{{sample}}.paternal.jf",
        dump_stats=f"{stats_dir}/{{sample}}.dump_stats.txt",
    output:
        merged_jf=f"{kmer_dir}/{{sample}}.merged.jf",
        combined_stats=f"{stats_dir}/{{sample}}.stats.txt",
    resources:
        time=config["merge_jf"]["time"],      # Time limit from config.yaml
        mem_mb=config["merge_jf"]["mem_mb"],  # Memory (MB) from config.yaml
    # `threads` is a rule directive, not a resource; declared here so it
    # matches count_kmers and actually sets the job's thread count.
    threads: config["merge_jf"]["threads"]
    # The last command deletes this rule's own inputs to free disk space.
    # NOTE(review): marking those files temp() in count_kmers would be the
    # idiomatic Snakemake way to get the same clean-up - confirm before changing.
    shell:
        """
        jellyfish merge {input.maternal_jf} {input.paternal_jf} -o {output.merged_jf}
        jellyfish stats {output.merged_jf} > {output.combined_stats}
        cat {input.dump_stats} >> {output.combined_stats}
        rm {input.maternal_jf} {input.paternal_jf} {input.dump_stats}
        """
# Dump the merged Jellyfish database of one sample as text in column format
# (-c), sorted and de-duplicated on the first column.
rule kmer_dump:
    input:
        merged_jf=f"{kmer_dir}/{{sample}}.merged.jf",
    output:
        merged_dump=f"{kmer_dir}/{{sample}}.merged.txt",
    resources:
        time=config["kmer_dump"]["time"],      # Time limit from config.yaml
        mem_mb=config["kmer_dump"]["mem_mb"],  # Memory (MB) from config.yaml
    # `threads` is a rule directive, not a resource; declared here so it
    # matches count_kmers and actually sets the job's thread count.
    threads: config["kmer_dump"]["threads"]
    # The merged database is deleted once the text dump has been written.
    shell:
        """
        jellyfish dump -c {input.merged_jf} | sort -k1,1 -u > {output.merged_dump}
        rm {input.merged_jf}
        """
# Keep only the first space-separated column of the sorted dump and write it
# gzip-compressed as the final per-sample output.
rule kmer_list_only:
    input:
        merged_dump=f"{kmer_dir}/{{sample}}.merged.txt",
    output:
        merged_kmers=f"{output_dir}/{{sample}}.merged.kmers.txt.gz",
    resources:
        time=config["kmer_list_only"]["time"],      # Time limit from config.yaml
        mem_mb=config["kmer_list_only"]["mem_mb"],  # Memory (MB) from config.yaml
    # `threads` is a rule directive, not a resource; declared here so it
    # matches count_kmers and actually sets the job's thread count.
    threads: config["kmer_list_only"]["threads"]
    # The uncompressed dump is deleted once the gzipped list has been written.
    shell:
        """
        cut -d ' ' -f 1 {input.merged_dump} | gzip > {output.merged_kmers}
        rm {input.merged_dump}
        """
And this is the script I am running, what am I doing wrong?
config.yaml:
# Per-rule scheduler settings read by the Snakefile as config["<rule>"]["<key>"].
# Load this file with `snakemake --configfile <path>`; the `--config` option
# only accepts inline name=value pairs and rejects a file path.
#
# Entries must be written `key: value` (YAML mapping syntax), never `key=value`.
# Time limits are quoted because YAML 1.1 loaders read an unquoted value such
# as 12:00:00 as a base-60 integer (43200) instead of a string.
__default__:
  time: "08:00:00"    # Default time limit for all jobs
  mem_mb: 200000      # Default memory (in MB) for all jobs
  threads: 24         # Default number of threads for all jobs
count_kmers:
  time: "08:00:00"    # Time limit for the count_kmers rule
  mem_mb: 200000      # Memory for the count_kmers rule
  threads: 24         # Threads for the count_kmers rule
merge_jf:
  time: "08:00:00"    # Time limit for the merge_jf rule
  mem_mb: 200000      # Memory for the merge_jf rule
  threads: 24         # Threads for the merge_jf rule
kmer_dump:
  time: "12:00:00"    # Time limit for the kmer_dump rule
  mem_mb: 400000      # Memory for the kmer_dump rule (400GB)
  threads: 40         # Threads for the kmer_dump rule
kmer_list_only:
  time: "12:00:00"    # Time limit for the kmer_list_only rule
  mem_mb: 200000      # Memory for the kmer_list_only rule
  threads: 24         # Threads for the kmer_list_only rule
When I run snakemake -s HPRC_kmer_final.smk --config /gpfs/home/hce24xau/.config/snakemake/slurm/config.yaml -np
I get the error: Invalid config definition: Config entries have to be defined as name=value pairs.
I've been learning Snakemake as the first part of my PhD, and the tutorial I have been following led me to this, but I can't seem to work it out. I've been working on this for days; please, someone help.
error: i get the error: Invalid config definition: Config entries have to be defined as name=value pairs.
…what exactly is the error?
when i run snakemake -s HPRC_kmer_final.smk --config /gpfs/home/hce24xau/.config/snakemake/slurm/config.yaml -np HPRC_kmer_final.smk -> snakefile
i get the error: Invalid config definition: Config entries have to be defined as name=value pairs.
sorry i forgot to put it in.
Correct the `=` in your config, i.e. `time="08:00:00"` to `time: 08:00:00`.
Still the same errors :(