Entering edit mode
3 months ago
eesha28112001
•
0
Hello all! I have been facing a peristent issue with GATK in my nextflow pipeline.
I have attached my script below. The error I am facing is in the applyBQSR step: Nextflow keeps telling me "chr7.fa" (the reference fasta file) does not exist. How do I troubleshoot this? All the previous steps are working properly. I also made sure the chr7.fa file exists in the specified directory. Please help if you can, thanks!
params.refgenome = "/home/mahe/eesha/transcriptome.fa"
params.outdir = "/home/mahe/eesha/temp_results/"
params.inputdir = "/home/mahe/eesha/temp_data/"
params.sampleid = "gut"
params.vcf_file = "/home/mahe/eesha/known_sites.vcf"
log.info"""\
refgenome: ${params.refgenome}
outdir: ${params.outdir}
inputdir: ${params.inputdir}
sampleid: ${params.sampleid}
vcf_file: ${params.vcf_file}
"""
process indexing {
container 'biocontainers/bwa:v0.7.17_cv1'
publishDir params.outdir, mode: "copy"
input:
path refgenome
output:
path 'bwa_indexes'
script:
"""
mkdir -p bwa_indexes
bwa index ${refgenome} -p ${refgenome}
mv ${refgenome.baseName}* ./bwa_indexes
"""
}
process samtools_index {
container 'mgibio/samtools:1.15.1-buster'
publishDir params.outdir, mode: "copy"
input:
path bwa_indexes
path refgenome
output:
path 'bwa_indexes'
script:
"""
samtools faidx ${params.refgenome}
samtools dict ${params.refgenome} --output ${refgenome.baseName}.fa.dict
mv ${params.refgenome}.fai /home/mahe/eesha/temp_results/bwa_indexes
"""
}
process bwa_alignment {
container 'biocontainers/bwa:v0.7.17_cv1'
publishDir params.outdir, mode: "copy"
input:
path bwa_indexes
path inputdir
val sampleid
output:
path 'alignment_results'
script:
"""
mkdir alignment_results
bwa mem -M -R "@RG\\tID:${sampleid}\\tSM:${sampleid}" $bwa_indexes/*.fa ${inputdir}/${sampleid}_1.fq ${inputdir}/${sampleid}_2.fq > alignment_${sampleid}.sam
mv alignment_${sampleid}.sam ./alignment_results
"""
}
process samtools_sort_index {
container 'mgibio/samtools:1.15.1-buster'
input:
path alignment_results
val sampleid
output:
path 'alignment_results'
script:
"""
samtools view -b -S $alignment_results/alignment_${sampleid}.sam | samtools sort -o sorted_${sampleid}.bam
samtools index sorted_${sampleid}.bam
mv sorted_${sampleid}.bam ./alignment_results
"""
}
process markdups_gatk {
container 'broadinstitute/gatk:4.5.0.0'
publishDir params.outdir, mode: "copy"
input:
path alignment_results
output:
path 'dedup_result'
script:
"""
mkdir dedup_result
gatk MarkDuplicates -I $alignment_results/sorted_${params.sampleid}.bam -O sorted_dedup_${params.sampleid}.bam -M sorted_dedup_metrics_${params.sampleid}.txt
mv sorted_dedup_* ./dedup_result
"""
}
process markdups_samtools_index {
container 'mgibio/samtools:1.15.1-buster'
publishDir params.outdir, mode: "copy"
input:
path dedup_result
output:
path 'dedup_result'
script:
"""
samtools index $dedup_result/sorted_dedup_${params.sampleid}.bam
"""
}
process readgroups {
container 'broadinstitute/gatk:4.5.0.0'
publishDir params.outdir, mode: "copy"
input:
path dedup_result
path vcf_file
output:
path 'dedup_result'
script:
"""
gatk AddOrReplaceReadGroups -I $dedup_result/sorted_dedup_${params.sampleid}.bam -O rg_sorted_dedup_${params.sampleid}.bam -LB readgroup -PL illumina -PU ESWJFHU537GDFJK -SM Sample-SRA
mv rg_sorted_dedup* ./dedup_result
gatk IndexFeatureFile --input ${params.vcf_file}
mv known_sites* ./dedup_result
"""
}
process baserecal {
container 'broadinstitute/gatk:4.5.0.0'
publishDir params.outdir, mode: "copy"
input:
path bwa_indexes
path dedup_result
path vcf_file
output:
path 'recal_report'
script:
"""
mkdir recal_report
gatk BaseRecalibrator -R $bwa_indexes/transcriptome.fa -I $dedup_result/rg_sorted_dedup_${params.sampleid}.bam --known-sites ${params.vcf_file} -O recal_report_${params.sampleid}.table
mv recal_report_* ./recal_report
"""
}
process apply_bqsr {
container 'broadinstitute/gatk:4.5.0.0'
publishDir params.outdir, mode:"copy"
input:
path bwa_indexes
path dedup_result
path recal_report
output:
path 'recal_report'
script:
"""
gatk ApplyBQSR -R $bwa_indexes/transcriptome.fa -I $dedup_result/rg_sorted_dedup_${params.sampleid}.bam --bqsr-recal-file $recal_report/recal_report_${params.sampleid}.table -O recalibrated_${params.sampleid}.bam
mv recalibrated_* ./recal_report
"""
}
process haplo {
container 'broadinstitute/gatk:4.5.0.0'
publishDir params.outdir, mode:"copy"
input:
path bwa_indexes
path recal_report
output:
path 'variant_file'
script:
"""
mkdir variant_file
gatk HaplotypeCaller -R $bwa_indexes/transcriptome.fa -I $recal_report/recalibrated_${params.sampleid}.bam -O variants_${params.sampleid}.vcf
mv variants_* ./variant_file
"""
}
workflow {
bwa_index_files = indexing(params.refgenome)
samtools_indexes = samtools_index(bwa_index_files, params.refgenome)
sorted_bam_file = bwa_alignment(bwa_index_files, params.inputdir, params.sampleid)
sorted_bam_with_index = samtools_sort_index(sorted_bam_file, params.sampleid)
dedup_file = markdups_gatk(sorted_bam_with_index)
dedup_file_with_index = markdups_samtools_index(dedup_file)
rg_sorted_file = readgroups(dedup_file, params.vcf_file)
recalibration_report = baserecal(bwa_index_files, rg_sorted_file, params.vcf_file)
recalibrated_bam_file = apply_bqsr(bwa_index_files, rg_sorted_file, recalibration_report)
haplo(bwa_index_files, recalibrated_bam_file)
}
not an answer but all those ' publishDir params.outdir, mode: "copy"' are most probably useless. You just want the final output
try to debug what's happening in bqsr and show us the error(s).
not an answer but using:
is probably wrong. You most probably want your workflow to scale with more than one sample. Use a params.samplesheet and process it with
splitCsv