Here I will use 3 tools from my "variation toolkit" :
- mysqlquery: queries a MySQL database using the columns of the input table. Here I will query the position of your SNP in the UCSC snp132 table.
- mysqlucsc: finds the genomic features overlapping a position in a table of the UCSC database (here: knownGene).
- verticalize: verticalize the result
and a simple awk script.
The script:
# For each SNP identifier read on stdin, look up its position in UCSC
# hg19/snp132, join it against the overlapping knownGene transcripts, and
# append two columns: the exon containing the SNP ("Exon N", or "." when the
# SNP is not inside any exon of that transcript) and the transcript exon count.
# The result is printed one field per line by verticalize.
#
# Usage: cat snp.txt | sh biostar15405.sh

# Directory holding the variation toolkit binaries.
VARKIT=/path/to/variationtoolkit/bin

# Connection options shared by both toolkit commands. This is deliberately a
# plain string that is expanded UNQUOTED below, so that it word-splits into
# separate arguments (plain sh has no arrays).
SQLCFG="--host genome-mysql.cse.ucsc.edu --user genome --database hg19"

# NOTE(review): the "$1" in the SQL text is single-quoted, so the shell does
# not expand it; presumably mysqlquery substitutes the first input column for
# it - confirm against the toolkit documentation.
#
# Column layout seen by awk (tab-delimited): $3 = SNP chromStart, $6 = strand,
# $11 = exonCount, $12 = exonStarts, $13 = exonEnds (comma-separated lists).
# The field separator must be a tab: the appended EXON column contains a space
# ("Exon 3"), and whitespace splitting would shift columns on any empty field.
"${VARKIT}/mysqlquery" ${SQLCFG} -e 'select chrom,chromStart from snp132 where name="$1"' \
  | "${VARKIT}/mysqlucsc" -1 ${SQLCFG} -T knownGene -C 2 -S 3 -E 3 \
  | awk -F '\t' '
      # Header line: pass it through and name the two appended columns.
      /^#/ {
        printf("%s\tEXON\tcount(EXONS)\n", $0)
        next
      }

      {
        printf("%s\t", $0)
        posSnp = int($3)
        nExons = int($11)
        split($12, exonStarts, ",")
        split($13, exonEnds, ",")

        for (i = 1; i <= nExons; i++) {
          # Half-open interval test: start <= pos < end.
          if (posSnp >= int(exonStarts[i]) && posSnp < int(exonEnds[i])) {
            # Exons are listed in ascending genomic order, so on the minus
            # strand the numbering is reversed.
            rank = ($6 == "+") ? i : (nExons + 1) - i
            printf("Exon %d\t%d\n", rank, nExons)
            next
          }
        }

        # The SNP is not inside any exon of this transcript.
        printf(".\t.\n")
      }' \
  | "${VARKIT}/verticalize"
Usage:
cat snp.txt | sh biostar15405.sh
Result (the last two columns give you the exon number and the total number of exons):
>>> 2
$1 #SNP_ID rs1354034
$2 chrom chr3
$3 chromStart 56849748
$4 name uc011bew.1
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 56950499
$9 cdsStart 56763297
$10 cdsEnd 56950369
$11 exonCount 11
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56950336,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56950499,
$14 proteinID Q9NR81-2
$15 alignID uc011bew.1
$16 EXON .
$17 count(EXONS) .
<<< 2
>>> 3
$1 #SNP_ID rs1354034
$2 chrom chr3
$3 chromStart 56849748
$4 name uc003dih.2
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 57113336
$9 cdsStart 56763297
$10 cdsEnd 57069177
$11 exonCount 13
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56992850,57069115,57113253,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56992917,57069204,57113336,
$14 proteinID Q9NR81-2
$15 alignID uc003dih.2
$16 EXON .
$17 count(EXONS) .
<<< 3
>>> 4
$1 #SNP_ID rs1344142
$2 chrom chr3
$3 chromStart 56857432
$4 name uc011bew.1
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 56950499
$9 cdsStart 56763297
$10 cdsEnd 56950369
$11 exonCount 11
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56950336,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56950499,
$14 proteinID Q9NR81-2
$15 alignID uc011bew.1
$16 EXON .
$17 count(EXONS) .
<<< 4
>>> 5
$1 #SNP_ID rs1344142
$2 chrom chr3
$3 chromStart 56857432
$4 name uc003dih.2
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 57113336
$9 cdsStart 56763297
$10 cdsEnd 57069177
$11 exonCount 13
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56992850,57069115,57113253,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56992917,57069204,57113336,
$14 proteinID Q9NR81-2
$15 alignID uc003dih.2
$16 EXON .
$17 count(EXONS) .
<<< 5
>>> 6
$1 #SNP_ID rs10866003
$2 chrom chr3
$3 chromStart 56859423
$4 name uc011bew.1
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 56950499
$9 cdsStart 56763297
$10 cdsEnd 56950369
$11 exonCount 11
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56950336,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56950499,
$14 proteinID Q9NR81-2
$15 alignID uc011bew.1
$16 EXON .
$17 count(EXONS) .
<<< 6
>>> 7
$1 #SNP_ID rs10866003
$2 chrom chr3
$3 chromStart 56859423
$4 name uc003dih.2
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 57113336
$9 cdsStart 56763297
$10 cdsEnd 57069177
$11 exonCount 13
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56992850,57069115,57113253,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56992917,57069204,57113336,
$14 proteinID Q9NR81-2
$15 alignID uc003dih.2
$16 EXON .
$17 count(EXONS) .
<<< 7
>>> 8
$1 #SNP_ID rs11130549
$2 chrom chr3
$3 chromStart 56880443
$4 name uc011bew.1
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 56950499
$9 cdsStart 56763297
$10 cdsEnd 56950369
$11 exonCount 11
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56950336,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56950499,
$14 proteinID Q9NR81-2
$15 alignID uc011bew.1
$16 EXON .
$17 count(EXONS) .
<<< 8
>>> 9
$1 #SNP_ID rs11130549
$2 chrom chr3
$3 chromStart 56880443
$4 name uc003dih.2
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 57113336
$9 cdsStart 56763297
$10 cdsEnd 57069177
$11 exonCount 13
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56992850,57069115,57113253,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56992917,57069204,57113336,
$14 proteinID Q9NR81-2
$15 alignID uc003dih.2
$16 EXON .
$17 count(EXONS) .
<<< 9
>>> 10
$1 #SNP_ID rs2133886
$2 chrom chr3
$3 chromStart 56942069
$4 name uc011bew.1
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 56950499
$9 cdsStart 56763297
$10 cdsEnd 56950369
$11 exonCount 11
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56950336,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56950499,
$14 proteinID Q9NR81-2
$15 alignID uc011bew.1
$16 EXON .
$17 count(EXONS) .
<<< 10
>>> 11
$1 #SNP_ID rs2133886
$2 chrom chr3
$3 chromStart 56942069
$4 name uc003dih.2
$5 chrom chr3
$6 strand -
$7 txStart 56761445
$8 txEnd 57113336
$9 cdsStart 56763297
$10 cdsEnd 57069177
$11 exonCount 13
$12 exonStarts 56761445,56766265,56771212,56779232,56785083,56785327,56787531,56789008,56807736,56916319,56992850,57069115,57113253,
$13 exonEnds 56763650,56766452,56771383,56779490,56785160,56785424,56787594,56789179,56807844,56916382,56992917,57069204,57113336,
$14 proteinID Q9NR81-2
$15 alignID uc003dih.2
$16 EXON .
$17 count(EXONS) .
<<< 11
>>> 12
$1 #SNP_ID rs3184504
$2 chrom chr12
$3 chromStart 111884607
$4 name uc001tse.2
$5 chrom chr12
$6 strand +
$7 txStart 111843751
$8 txEnd 111889426
$9 cdsStart 111855949
$10 cdsEnd 111886106
$11 exonCount 8
$12 exonStarts 111843751,111855922,111884556,111884745,111884928,111885133,111885459,111885786,
$13 exonEnds 111844081,111856681,111884658,111884837,111885023,111885348,111885631,111889426,
$14 proteinID B9EGG5
$15 alignID uc001tse.2
$16 EXON Exon 3
$17 count(EXONS) 8
<<< 12
>>> 13
$1 #SNP_ID rs3184504
$2 chrom chr12
$3 chromStart 111884607
$4 name uc010syf.1
$5 chrom chr12
$6 strand +
$7 txStart 111855912
$8 txEnd 111889426
$9 cdsStart 111855949
$10 cdsEnd 111886106
$11 exonCount 7
$12 exonStarts 111855912,111884556,111884745,111884928,111885133,111885459,111885786,
$13 exonEnds 111856681,111884658,111884837,111885023,111885348,111885631,111889426,
$14 proteinID B9EGG5
$15 alignID uc010syf.1
$16 EXON Exon 2
$17 count(EXONS) 7
<<< 13
>>> 14
$1 #SNP_ID rs3184504
$2 chrom chr12
$3 chromStart 111884607
$4 name uc001tsf.2
$5 chrom chr12
$6 strand +
$7 txStart 111855922
$8 txEnd 111889426
$9 cdsStart 111855949
$10 cdsEnd 111886106
$11 exonCount 7
$12 exonStarts 111855922,111884556,111884745,111884925,111885133,111885459,111885786,
$13 exonEnds 111856681,111884658,111884837,111885023,111885348,111885631,111889426,
$14 proteinID B9EGG5
$15 alignID uc001tsf.2
$16 EXON Exon 2
$17 count(EXONS) 7
<<< 14
>>> 15
$1 #SNP_ID rs3184504
$2 chrom chr12
$3 chromStart 111884607
$4 name uc010syg.1
$5 chrom chr12
$6 strand +
$7 txStart 111872665
$8 txEnd 111889426
$9 cdsStart 111872705
$10 cdsEnd 111886106
$11 exonCount 7
$12 exonStarts 111872665,111884556,111884745,111884928,111885133,111885459,111885786,
$13 exonEnds 111872831,111884658,111884837,111885023,111885348,111885631,111889426,
$14 proteinID B7Z7K6
$15 alignID uc010syg.1
$16 EXON Exon 2
$17 count(EXONS) 7
<<< 15
Thanks a lot Pierre.
I installed the "variation toolkit" and tried this, but I am getting "unknown option '--database'". I tried other options, including the standard -D, but was not able to solve the issue. Any pointers to solve this?
I installed the "variation toolkit" and tried this, but I am getting "unknown option '--database'". I tried other options, including the standard -D and other variations, but was not able to solve this. Any idea how I can solve this?
What does 'svn info' say? The revision number should be >= 239.
Thanks, I updated to the latest version and it is working. I am able to find the number of the exon in which a given SNP is located, and the total number of exons. This is perfect for SNPs in exons, but how about SNPs in introns? Maybe I am missing something?
change the awk script: see http://biostar.stackexchange.com/questions/15819