Question

How to use `bai_file`/`sample_id` (cardinality=10) and `bin_id` (cardinality=2875) much efficiently here?

0

Entering edit mode

7 months ago

Đặng Hải Đăng • 0

Hello everybody.
Here is my new code:

params.OUTDIR = "/data/data_dang/cfDNA_lowdepth"
params.deduplicated = "${params.OUTDIR}/*.deduplicated.bam"
params.sortedDir = "${params.OUTDIR}/sorted"
params.binDir = "${params.OUTDIR}/6bin"
params.txtDir = "${params.OUTDIR}/6txt"
params.pythonPath = "/conda/env_dangdang/anaconda3/envs/python/bin"
params.samtoolsPath = "/conda/env_dangdang/anaconda3/envs/samtools/bin"

deduplicated_ch = Channel.fromPath(params.deduplicated)

process sorted_bam {
    executor 'local'  
    cache "deep"
    tag "sorted_bam"
    storeDir params.sortedDir
    errorStrategy 'retry'
    maxRetries 1
    cpus 4
    input:
        path deduplicated_file from deduplicated_ch
    output:
        set file("${deduplicated_file.baseName.replaceAll('.deduplicated', '')}.sorted.bam"), 
            file("${deduplicated_file.baseName.replaceAll('.deduplicated', '')}.sorted.bam.bai"),
            val deduplicated_file.baseName.replaceAll('.deduplicated', '') into sorted_ch #this line is total wrong!
    script:
    """
    export PATH=/conda/env_dangdang/anaconda3/envs/samtools/bin:\$PATH
    samtools sort -@ 4 -o ${deduplicated_file.baseName.replaceAll('.deduplicated', '')}.sorted.bam ${deduplicated_file}
    samtools index ${deduplicated_file.baseName.replaceAll('.deduplicated', '')}.sorted.bam
    """
}

def generateBins() {
    def num_bins = [249, 243, 198, 191, 180, 171, 159, 146, 141, 135, 135, 133, 115, 107, 102, 90, 81, 78, 59, 63, 48, 51]
    def bins = [:]

    (1..22).each { chr ->
        def num_chr_bins = num_bins[chr - 1]
        (0..<num_chr_bins).each { bin_index ->
            def start = 1 + bin_index * 1000000
            def end = start + 999999
            def bin_id = "chr${chr}_${start}_${end}"
            bins.put(bin_id, [chr: chr, start: start, end: end])
        }
    }
    return bins
}

def binChannel = generateBins()
// binChannel.each { binId, binInfo ->
//     println("Bin ID: $binId, Chromosome: ${binInfo.chr}, Start: ${binInfo.start}, End: ${binInfo.end}")
// }
process processBam {
    executor 'local'
    cache "deep"
    storeDir params.binDir
    errorStrategy 'retry'
    maxRetries 1
    cpus 4
    input:
        set bin_id, chr, start, end from binChannel.collect { binId, binInfo ->
            tuple(binId, binInfo.chr, binInfo.start, binInfo.end)
        }
        set sorted_bam, bai_file, sample_id from sorted_ch
    output:
        set file("${sorted_bam.baseName.replaceAll('.sorted', '')}_${chr}_${start}_${end}.txt") into bin_ch
    script:
    """
    export PATH=${params.samtoolsPath}:$PATH
    for (bin_id, sample_id) in bin_id.collectMany { id -> sample_id.collect { bam -> [id, bam] } } {
    samtools view -h ${sorted_bam} ${chr}:${start}-${end} | awk -v OFS='\t' '
          BEGIN {
              prev = ""
              prevline = ""
          }
          {
              if (\$0 ~ "^@") {
                  print \$0
                  next
              }
              if (\$1 == prev) {
                  print prevline
                  print \$0
              } else {
                  prev = \$1
                  prevline = \$0
              }
          }' > "${sorted_bam.baseName.replaceAll('.sorted', '')}_${chr}_${start}_${end}.final.sam"
    samtools view "${sorted_bam.baseName.replaceAll('.sorted', '')}_${chr}_${start}_${end}.final.sam" | awk -v OFS='\t' '{
          for (i = 12; i <= NF; i++) {
              if (\$i ~ /^XM:Z:/) {
                  tag = substr(\$i, 6);
                  if (index(tag, "z") > 0) {
                      two_hot = 2;
                  } else if (index(tag, "Z") > 0) {
                      two_hot = 0;
                  } else {
                      two_hot = 1;
                  }
                  print \$1, tag, two_hot;
              }
          }
      }' > "${sorted_bam.baseName.replaceAll('.sorted', '')}_${chr}_${start}_${end}.txt"}
    """
}

process processTXT {
    executor 'local'
    cache "deep"
    tag "processTXT"
    storeDir params.txtDir
    errorStrategy 'retry'
    maxRetries 1
    cpus 4
    input:
        set txt_path from bin_ch
    output:
        set file("${txt_path.baseName}.txt") into txt_ch
    script:
    """
    export PATH=${params.pythonPath}:$PATH
    python /data/data_dang/cfDNA_lowdepth/src/0000.py ${txt_path} ${txt_path.baseName}.txt
    """
}

How to use bai_file/sample_id(cardinality 10) and bin_id(cardinality 2875) wisely? I need your help. Thanks a real lot.
(Updated).

params.OUTDIR = "/data/data_dang/cfDNA_lowdepth"
params.deduplicated = "${params.OUTDIR}/*.deduplicated.bam"
params.sortedDir = "${params.OUTDIR}/sorted"
params.binDir = "${params.OUTDIR}/6bin"
params.txtDir = "${params.OUTDIR}/6txt"
params.pythonPath = "/conda/env_dangdang/anaconda3/envs/python/bin"
params.samtoolsPath = "/conda/env_dangdang/anaconda3/envs/samtools/bin"

deduplicated_ch = Channel.fromPath(params.deduplicated)

process sorted_bam {
    executor 'local'  
    cache "deep"
    tag "sorted_bam"
    storeDir params.sortedDir
    errorStrategy 'retry'
    maxRetries 1
    cpus 4
    input:
        path deduplicated_file from deduplicated_ch
    output:
        set file("${deduplicated_file.baseName.replaceAll('.deduplicated', '')}.sorted.bam"), 
            file("${deduplicated_file.baseName.replaceAll('.deduplicated', '')}.sorted.bam.bai") into sorted_ch
    script:
    """
    export PATH=/conda/env_dangdang/anaconda3/envs/samtools/bin:\$PATH
    samtools sort -@ 4 -o ${deduplicated_file.baseName.replaceAll('.deduplicated', '')}.sorted.bam ${deduplicated_file}
    samtools index ${deduplicated_file.baseName.replaceAll('.deduplicated', '')}.sorted.bam
    """
}

def generateBins() {
    def num_bins = [249, 243, 198, 191, 180, 171, 159, 146, 141, 135, 135, 133, 115, 107, 102, 90, 81, 78, 59, 63, 48, 51]
    def bins = [:]

    (1..22).each { chr ->
        def num_chr_bins = num_bins[chr - 1]
        (0..<num_chr_bins).each { bin_index ->
            def start = 1 + bin_index * 1000000
            def end = start + 999999
            def bin_id = "chr${chr}_${start}_${end}"
            bins.put(bin_id, [chr: chr, start: start, end: end])
        }
    }
    return bins
}

def binChannel = generateBins()
// binChannel.each { binId, binInfo ->
//     println("Bin ID: $binId, Chromosome: ${binInfo.chr}, Start: ${binInfo.start}, End: ${binInfo.end}")
// }
process processBam {
    executor 'local'
    cache "deep"
    storeDir params.binDir
    errorStrategy 'retry'
    maxRetries 1
    cpus 4
    input:
        set bin_id, chr, start, end from binChannel.collect { binId, binInfo ->
            tuple(binId, binInfo.chr, binInfo.start, binInfo.end)
        }
        set sorted_bam, bai_file from sorted_ch
    output:
        set file("${sorted_bam.baseName.replaceAll('.sorted', '')}_${chr}_${start}_${end}.txt") into bin_ch
    script:
    """
    export PATH=${params.samtoolsPath}:$PATH
    for i in {1..2875}; do
    samtools view -h ${sorted_bam} ${chr}:${start}-${end} | awk -v OFS='\t' '
          BEGIN {
              prev = ""
              prevline = ""
          }
          {
              if (\$0 ~ "^@") {
                  print \$0
                  next
              }
              if (\$1 == prev) {
                  print prevline
                  print \$0
              } else {
                  prev = \$1
                  prevline = \$0
              }
          }' > "${sorted_bam.baseName.replaceAll('.sorted', '')}_${chr}_${start}_${end}.final.sam"
    done
    samtools view "${sorted_bam.baseName.replaceAll('.sorted', '')}_${chr}_${start}_${end}.final.sam" | awk -v OFS='\t' '{
          for (i = 12; i <= NF; i++) {
              if (\$i ~ /^XM:Z:/) {
                  tag = substr(\$i, 6);
                  if (index(tag, "z") > 0) {
                      two_hot = 2;
                  } else if (index(tag, "Z") > 0) {
                      two_hot = 0;
                  } else {
                      two_hot = 1;
                  }
                  print \$1, tag, two_hot;
              }
          }
      }' > "${sorted_bam.baseName.replaceAll('.sorted', '')}_${chr}_${start}_${end}.txt"
    """
}

process processTXT {
    executor 'local'
    cache "deep"
    tag "processTXT"
    storeDir params.txtDir
    errorStrategy 'retry'
    maxRetries 1
    cpus 4
    input:
        set txt_path from bin_ch
    output:
        set file("${txt_path.baseName}.txt") into txt_ch
    script:
    """
    export PATH=${params.pythonPath}:$PATH
    python /data/data_dang/cfDNA_lowdepth/src/0000.py ${txt_path} ${txt_path.baseName}.txt
    """
}

Running 2875 times is failed.

nextflow • 433 views

ADD COMMENT • link 7 months ago by Đặng Hải Đăng • 0

score 3 · Accepted Answer · 2024-03-04

3

Entering edit mode

7 months ago

ATpoint 84k

Remember that the script: section of processBam is a bash script, not Groovy/Java/Nextflow, so the // used to comment your Process BAM file using samtools and awk is messing things up, as it is being interpreted as a path address (.command.sh: line 3: //: Is a directory). Replace that by a hash #.

ADD COMMENT • link 7 months ago by ATpoint 84k