有时候,我们需要大批量的查询ncbi数据库,这时,我们很可能需要用到ncbi为我门提供的程序化接口:Eutils。
Eutils有五大部分组成,具体的doc参见ncbi的online book。这里提供一个检索snp信息的例子:
use LWP::Simple;
use List::MoreUtils qw/ uniq /;
sub xmlparse{
#parameter: 1,xmlstring 2,out file descriptor
$snpfetchrst = @_[0];
$outpfile = @_[1];
#get rsID, validation info, allele info, gene info, maf allele info
my $rsid;
my $taxid;
my $content;
my @geneid;
my $alleinfo;
my $filtr = false;
#print table head
#print $outpfile "refSNP","\t","alleles","\t","minoralle","\t","entrezGeneID","\n";
while ($snpfetchrst =~ /<Rs rsId=\"(\d+)\".*?taxId=\"(\d+)\">(.*?)<\/Rs>/sg) {
@geneid = ();
$filtr = false;
$rsid = $1;
$taxid = $2;
$content = $3;
if($taxid != 9606){
next;
}
else{
if (($content =~ /<Validation (.*?)\/>/sg)|($content =~ /<Validation .*?>(.*?)<\/Validation>/sg)) {
$valid_state = $1;
#print $valid_state,"\n";
if($valid_state){ #validation state
$filtr = true;
if ($content =~ /<Sequence.*?>(.*?)<\/Sequence>/sg) {
$seqinfo = $1;
$alleinfo = $1 if ($seqinfo =~ /<Observed>(.*?)<\/Observed>/);
}
if ($content =~ /<Assembly.*?reference="true">(.*?)<\/Assembly>/sg){
$assemContent = $1;
if($assemContent =~ /<Component.*?>(.*?)<\/Component>/sg){
$compcontent = $1;
if($compcontent =~ /<MapLoc.*?>(.*?)<\/MapLoc>/sg){
$funcinfo = $1;
#print $funcinfo,"\n";
while($funcinfo =~ /<FxnSet geneId=\"(\d+)\".*?\/>/sg){
#print $1,"\n";
push(@geneid,$1);
}
@geneid = uniq(@geneid); #require moreUtils module
}
}
}
if ($content =~ /<Frequency.*?allele=\"(.*?)\".*?\/>/sg){
$malle = $1;
}
}
else{
next;
}
}
else{
$filtr = false;
next;
}
}
if($filtr) {print $outpfile ($rsid,"\t",$alleinfo,"\t",$malle,"\t",join(',',@geneid),"\n");}
}
} #end of xml_parse
my $utils = "http://www.ncbi.nlm.nih.gov/entrez/eutils";
open($refsnp,"<","snps.txt");
while (<$refsnp>) {
chomp;
$_ =~ /(\d+)/;
$rsid = $1;
push(@rss, $rsid);
}
close($refsnp);
my $id_list = join(',',@rss);
#print $id_list,"\n";
#assemble the epost URL as an HTTP POST call
$url = $utils . "/epost.fcgi";
$url_params = "db=snp&id=$id_list";
#create HTTP user agent
$ua = new LWP::UserAgent;
$ua->agent("elink/1.0 " . $ua->agent);
#create HTTP request object
$req = new HTTP::Request POST => "$url";
$req->content_type('application/x-www-form-urlencoded');
$req->content("$url_params");
#post the HTTP request
$response = $ua->request($req);
$epostrst = $response->content;
my $querykey;
my $webEnv;
if($epostrst =~ /<QueryKey>(.*?)<\/QueryKey>/sg){
$querykey = $1;
#print $querykey,"\n";
}
if($epostrst =~ /<WebEnv>(.*?)<\/WebEnv>/sg){
$webEnv = $1;
#print $webEnv,"\n";
}
my $count = @rss;
my $retmax = 600;
my $efetch = $utils."/efetch.fcgi?";
open(my $outpf,">>","snp_queryinfo.txt");
if(defined($querykey)&&defined($webEnv)){
for($retstart = 0;$retstart < $count;$retstart += $retmax){
my $progress = $retstart/$count;
my $prog = sprintf( "%.2f ",$progress);
print "progression: $prog\n";
$efetch_url = $efetch."db=snp&WebEnv=$webEnv&query_key=$querykey";
$efetch_url .= "&retstart=$retstart&retmax=$retmax&retmode=xml";
#print $efetch_url,"\n";
$efetch_out = get($efetch_url);
#print $efetch_out,"\n";
xmlparse($efetch_out,$outpf);
}
}
close($outpf);
上面的过程将读入名为snps.txt所存储的refsnp号,然后利用efetch检索数据库信息,最后从xml结果中抽提有用的信息。