How To Identify/Flag The Presence Of Indels On The Read Level Nearby A Given Position In A Bam File?
1
2
Entering edit mode
12.3 years ago

Let's say I have list of genomic positions, e.g. SNP calls in a VCF file, and a BAM file with read alignments. What would be the best way to, for each of those positions, determine if in the BAM file there are reads with indels either at/overlapping the position, or in the vicinity of the position, say 5 bp upstream or downstream. This is a read level query, it's not indel calling. Basically I want to for a given position consider the ±5 bp window around the position and ask if there are any indels within this window, at the level of reads, and if so count the number of indels in the window.

I think the CIGAR strings of the BAM file probably holds all the information needed, but the question is how to do this in a simple, practical way.

indel bam cigar • 3.8k views
ADD COMMENT
2
Entering edit mode
12.3 years ago

You could use the program I previously wrote for biostar. This program produces a XML description of the Reads overlapping a given genomic region. See the CIGAR elements under the 'align' tag.

Position Of Mismatches Per Read From A Sam/Bam File

/*
Author: Pierre Lindenbaum PhD.
*/
import java.io.File;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileReader.ValidationStringency;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordIterator;
public class Biostar59647
{
public static void main(String[] args)
throws Exception
{
if(args.length<3)
{
System.err.println("Usage: ref bam chr:start-end chr:start-end ...");
return;
}
File refFile=new File(args[0]);
File bamFile=new File(args[1]);
SAMFileReader.setDefaultValidationStringency(ValidationStringency.SILENT);
ReferenceSequenceFile reference=ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
SAMFileReader samFileReader=new SAMFileReader(bamFile);
XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
XMLStreamWriter w= xmlfactory.createXMLStreamWriter(System.out,"UTF-8");
w.writeStartDocument("UTF-8","1.0");
w.writeStartElement("biostar59647");
w.writeAttribute("ref", args[0]);
w.writeAttribute("bam", args[1]);
for(int optind=2;optind<args.length;++optind)
{
String s=args[optind];
int colon=s.indexOf(':'); if(colon==-1) continue;
String chrom=s.substring(0,colon);
int t=s.indexOf('-',colon+1);if(t==-1) continue;
int start=Integer.parseInt(s.substring(colon+1,t));
int end=Integer.parseInt(s.substring(t+1));
ReferenceSequence refseq=reference.getSubsequenceAt(chrom, start, end);
if(refseq==null) continue;
byte dna[]=refseq.getBases();
w.writeStartElement("interval");
w.writeAttribute("chrom", chrom);
w.writeAttribute("start", String.valueOf(start));
w.writeAttribute("end", String.valueOf(end));
SAMRecordIterator iter=samFileReader.queryOverlapping(chrom, start, end);
while(iter.hasNext())
{
SAMRecord rec=iter.next();
final byte readbases[]=rec.getReadBases();
w.writeStartElement("read");
w.writeStartElement("name");
w.writeCharacters(rec.getReadName());
w.writeEndElement();
w.writeStartElement("sequence");
w.writeCharacters(new String(readbases));
w.writeEndElement();
w.writeStartElement("flags");
w.writeCharacters(String.valueOf(rec.getFlags()));
w.writeEndElement();
w.writeStartElement("chrom");
w.writeCharacters(rec.getReferenceName());
w.writeEndElement();
w.writeStartElement("pos");
w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
w.writeEndElement();
w.writeStartElement("cigar");
w.writeCharacters(rec.getCigarString());
w.writeEndElement();
w.writeStartElement("align");
int readIndex = 0;
int refIndex = rec.getAlignmentStart();
for (final CigarElement e : rec.getCigar().getCigarElements())
{
switch (e.getOperator())
{
case H : break; // ignore hard clips
case P : break; // ignore pads
case I : //cont.
case S :
{
final int length = e.getLength();
for(int i=0;i<length;++i)
{
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("read-index",String.valueOf(readIndex+1));
if(readIndex>=0 && readIndex< readbases.length)
{
w.writeAttribute("read-base",String.valueOf((char)(readbases[readIndex])));
}
readIndex++;
}
break;
}
case N : //cont. -- reference skip
case D :
{
final int length = e.getLength();
for(int i=0;i<length;++i)
{
w.writeEmptyElement(e.getOperator().name());
w.writeAttribute("ref-index",String.valueOf(refIndex));
int idx=refIndex-start;
if(idx>=0 && idx< dna.length)
{
w.writeAttribute("ref-base",String.valueOf((char)(dna[idx])));
}
refIndex++;
}
break;
}
case M :
case EQ :
case X :
{
final int length = e.getLength();
for(int i=0;i<length;++i)
{
w.writeEmptyElement(e.getOperator().name());
char baseRead='\0';
if(readIndex>=0 && readIndex< readbases.length)
{
baseRead=(char)(rec.getReadBases()[readIndex]);
w.writeAttribute("read-index",String.valueOf(readIndex+1));
w.writeAttribute("read-base",String.valueOf(baseRead));
}
w.writeAttribute("ref-index",String.valueOf(refIndex));
int idx2=refIndex-start;
if(idx2>=0 && idx2< dna.length)
{
char baseRef=(char)(dna[idx2]);
w.writeAttribute("ref-base",String.valueOf(baseRef));
if(Character.toUpperCase(baseRef)!=Character.toUpperCase(baseRead))
{
w.writeAttribute("mismatch","true");
}
}
refIndex++;
readIndex++;
}
break;
}
default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
}
}
w.writeEndElement();
w.writeEndElement();
}
iter.close();
w.writeEndElement();
}
samFileReader.close();
w.writeEndElement();
w.writeEndDocument();
w.flush();
w.close();
}
}
<?xml version="1.0" encoding="UTF-8"?>
<biostar59647 ref="/home/lindenb/samtools-0.1.18/examples/toy.fa" bam="/home/lindenb/samtools-0.1.18/examples/toy.bam">
<interval chrom="ref" start="9" end="20">
<read>
<name>r001</name>
<sequence>TTAGATAAAGAGGATACTG</sequence>
<flags>163</flags>
<chrom>ref</chrom>
<pos>7</pos>
<cigar>8M4I4M1D3M</cigar>
<align>
<M read-index="1" read-base="T" ref-index="7"/>
<M read-index="2" read-base="T" ref-index="8"/>
<M read-index="3" read-base="A" ref-index="9" ref-base="A"/>
<M read-index="4" read-base="G" ref-index="10" ref-base="G"/>
<M read-index="5" read-base="A" ref-index="11" ref-base="A"/>
<M read-index="6" read-base="T" ref-index="12" ref-base="T"/>
<M read-index="7" read-base="A" ref-index="13" ref-base="A"/>
<M read-index="8" read-base="A" ref-index="14" ref-base="A"/>
<I read-index="9" read-base="A"/>
<I read-index="10" read-base="G"/>
<I read-index="11" read-base="A"/>
<I read-index="12" read-base="G"/>
<M read-index="13" read-base="G" ref-index="15" ref-base="G"/>
<M read-index="14" read-base="A" ref-index="16" ref-base="A"/>
<M read-index="15" read-base="T" ref-index="17" ref-base="T"/>
<M read-index="16" read-base="A" ref-index="18" ref-base="A"/>
<D ref-index="19" ref-base="G"/>
<M read-index="17" read-base="C" ref-index="20" ref-base="C"/>
<M read-index="18" read-base="T" ref-index="21"/>
<M read-index="19" read-base="G" ref-index="22"/>
</align>
</read>
<read>
<name>r002</name>
<sequence>AAAAGATAAGGGATAAA</sequence>
<flags>0</flags>
<chrom>ref</chrom>
<pos>9</pos>
<cigar>1S2I6M1P1I1P1I4M2I</cigar>
<align>
<S read-index="1" read-base="A"/>
<I read-index="2" read-base="A"/>
<I read-index="3" read-base="A"/>
<M read-index="4" read-base="A" ref-index="9" ref-base="A"/>
<M read-index="5" read-base="G" ref-index="10" ref-base="G"/>
<M read-index="6" read-base="A" ref-index="11" ref-base="A"/>
<M read-index="7" read-base="T" ref-index="12" ref-base="T"/>
<M read-index="8" read-base="A" ref-index="13" ref-base="A"/>
<M read-index="9" read-base="A" ref-index="14" ref-base="A"/>
<I read-index="10" read-base="G"/>
<I read-index="11" read-base="G"/>
<M read-index="12" read-base="G" ref-index="15" ref-base="G"/>
<M read-index="13" read-base="A" ref-index="16" ref-base="A"/>
<M read-index="14" read-base="T" ref-index="17" ref-base="T"/>
<M read-index="15" read-base="A" ref-index="18" ref-base="A"/>
<I read-index="16" read-base="A"/>
<I read-index="17" read-base="A"/>
</align>
</read>
<read>
<name>r003</name>
<sequence>AGCTAA</sequence>
<flags>0</flags>
<chrom>ref</chrom>
<pos>9</pos>
<cigar>5H6M</cigar>
<align>
<M read-index="1" read-base="A" ref-index="9" ref-base="A"/>
<M read-index="2" read-base="G" ref-index="10" ref-base="G"/>
<M read-index="3" read-base="C" ref-index="11" ref-base="A" mismatch="true"/>
<M read-index="4" read-base="T" ref-index="12" ref-base="T"/>
<M read-index="5" read-base="A" ref-index="13" ref-base="A"/>
<M read-index="6" read-base="A" ref-index="14" ref-base="A"/>
</align>
</read>
<read>
<name>r004</name>
<sequence>ATAGCTCTCAGC</sequence>
<fl
view raw output.xml hosted with ❤ by GitHub

ADD COMMENT

Login before adding your answer.

Traffic: 3877 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6