22 months ago by
France/Nantes/Institut du Thorax - INSERM UMR1087
The sort order used by samtools sort
is not the same as the one used by the Linux sort / comm utilities.
https://github.com/samtools/samtools/blob/develop/bam_sort.c#L1840
samtools sort
calls strnum_cmp
which is not a simple byte-by-byte comparison:
/*
 * Compare two NUL-terminated strings, treating runs of decimal digits as
 * numbers rather than as raw bytes.
 *
 * Non-digit positions are compared byte by byte (as unsigned char). When both
 * strings are at a digit, leading zeros are skipped and the digit runs are
 * compared by magnitude: the longer remaining run wins, otherwise the first
 * differing digit decides. Runs that are numerically equal but consumed a
 * different number of bytes (different zero padding) are ordered so that the
 * side that advanced further sorts first.
 *
 * Returns a negative value if _a sorts before _b, a positive value if it
 * sorts after, and 0 if the two compare equal.
 */
static int strnum_cmp(const char *_a, const char *_b)
{
    const unsigned char *const start_a = (const unsigned char *)_a;
    const unsigned char *const start_b = (const unsigned char *)_b;
    const unsigned char *pa = start_a;
    const unsigned char *pb = start_b;

    while (*pa != '\0' && *pb != '\0') {
        if (!isdigit(*pa) || !isdigit(*pb)) {
            /* At least one side is not a digit: plain byte comparison. */
            if (*pa != *pb)
                return (int)*pa - (int)*pb;
            ++pa;
            ++pb;
            continue;
        }

        /* Both sides are at a digit: drop zero padding first. */
        while (*pa == '0')
            ++pa;
        while (*pb == '0')
            ++pb;

        /* Walk over the digits the two numbers have in common. */
        while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) {
            ++pa;
            ++pb;
        }

        if (isdigit(*pa) && isdigit(*pb)) {
            /*
             * The digits differ here. Whichever number still has digits left
             * once the other runs out is the larger one; if both end together
             * the differing digit decides.
             */
            int i = 0;
            while (isdigit(pa[i]) && isdigit(pb[i]))
                ++i;
            if (isdigit(pa[i]))
                return 1;
            if (isdigit(pb[i]))
                return -1;
            return (int)*pa - (int)*pb;
        }

        /* Only one side still has digits: that number is the larger one. */
        if (isdigit(*pa))
            return 1;
        if (isdigit(*pb))
            return -1;

        /*
         * Same numeric value. If the offsets differ, the side that has
         * advanced fewer bytes compares greater.
         */
        if (pa - start_a != pb - start_b)
            return (pa - start_a < pb - start_b) ? 1 : -1;
    }

    /* One or both strings are exhausted: the one with bytes left is greater. */
    if (*pa != '\0')
        return 1;
    if (*pb != '\0')
        return -1;
    return 0;
}
you want:
# For each BAM: print the records as text, keep only the first column,
# sort it in plain byte order (LC_ALL=C) and drop duplicate lines.
# NOTE(review): the outputs are text lists produced by sort/uniq, not BAM
# files, despite the .bam extension.
samtools view unmapped_reads1.bam | cut -f1 | LC_ALL=C sort | uniq > unmapped_reads1_sorted.bam
samtools view unmapped_reads2.bam | cut -f1 | LC_ALL=C sort | uniq > unmapped_reads2_sorted.bam
# comm -12 suppresses the lines unique to either input, leaving only the lines
# present in both; it runs under the same LC_ALL=C ordering used for sorting.
LC_ALL=C comm -12 unmapped_reads1_sorted.bam unmapped_reads2_sorted.bam > common_seqs.bam