1Boulder::Genbank(3)   User Contributed Perl Documentation  Boulder::Genbank(3)
2
3
4

NAME

6       Boulder::Genbank - Fetch Genbank data records as parsed Boulder Stones
7

SYNOPSIS

9         use Boulder::Genbank;
10
11         # network access via Entrez
12          $gb = Boulder::Genbank->newFh( qw(M57939 M28274 L36028) );
13
14          while ($data = <$gb>) {
15              print $data->Accession;
16
17              @introns = $data->features->Intron;
18              print "There are ",scalar(@introns)," introns.\n";
19              $dna = $data->Sequence;
20              print "The dna is ",length($dna)," bp long.\n";
21
22              my @features = $data->features(-type=>[ qw(Exon Source Satellite) ],
23                                             -pos=>[90,310] );
24              foreach (@features) {
25                 print $_->Type,"\n";
26                 print $_->Position,"\n";
27                 print $_->Gene,"\n";
28             }
29           }
30
31         # another syntax
32         $gb = new Boulder::Genbank(-accessor=>'Entrez',
33                                    -fetch => [qw/M57939 M28274 L36028/]);
34
35         # local access via Yank
36         $gb = new Boulder::Genbank(-accessor=>'Yank',
37                                    -fetch=>[qw/M57939 M28274 L36028/]);
38         while (my $s = $gb->get) {
39            # etc.
40         }
41
42         # parse a file of Genbank records
43         $gb = new Boulder::Genbank(-accessor=>'File',
44                                    -fetch => '/usr/local/db/gbpri3.seq');
45         while (my $s = $gb->get) {
46            # etc.
47         }
48
49         # parse flatfile records yourself
50         open (GB,"/usr/local/db/gbpri3.seq");
51         local $/ = "//\n";
52         while (<GB>) {
53            my $s = Boulder::Genbank->parse($_);
54            # etc.
55         }
56

DESCRIPTION

58       Boulder::Genbank provides retrieval and parsing services for NCBI Gen‐
59       bank-format records.  It returns Genbank entries in Stone format,
60       allowing easy access to the various fields and values.  Boulder::Gen‐
61       bank is a descendent of Boulder::Stream, and provides a stream-like
62       interface to a series of Stone objects.
63
64       >> IMPORTANT NOTE <<
65
66       As of January 2002, NCBI has changed their Batch Entrez interface.  I
67       have modified Boulder::Genbank so as to use a "demo" interface, which
68       fixes things, but this isn't guaranteed in the long run.
69
70       I have written to NCBI, and they may fix this -- or they may not.
71
72       >> IMPORTANT NOTE <<
73
74       Access to Genbank is provided by three different accessors, which
75       together give access to remote and local Genbank databases.  When you
76       create a new Boulder::Genbank stream, you provide one of the three
77       accessors, along with accessor-specific parameters that control what
78       entries to fetch.  The three accessors are:
79
80       Entrez
81           This provides access to NetEntrez, accessing the most recent Gen‐
82           bank information directly from NCBI's Web site.  The parameters
83           passed to this accessor are either a series of Genbank accession
84           numbers, or an Entrez query (see
85           http://www.ncbi.nlm.nih.gov/Entrez/linking.html).  If you provide a
86           list of accession numbers, the stream will return a series of
87           stones corresponding to the numbers.  Otherwise, if you provided an
88           Entrez query, the entries returned will be in the order returned by
89           Entrez.
90
91       File
92           This provides access to local Genbank entries by reading from a
93           flat file (typically one of the .seq files downloadable from NCBI's
94           Web site).  The stream will return a Stone corresponding to each of
95           the entries in the file, starting from the top of the file and
96           working downward.  The parameter in this case is the path to the
97           local file.
98
99       Yank
100           This provides access to local Genbank entries using Will Fitzhugh's
101           Yank program.  Yank provides fast indexed access to a Genbank flat
102           file using the accession number as the key.  The parameter passed
103           to the Yank accessor is a list of accession numbers.  Stones will
104           be returned in the requested order.  By default the yank binary
105           lives in /usr/local/bin/yank.  To support other locations, you may
106           define the environment variable YANK to contain the full path.
107
108       It is also possible to parse a single Genbank entry from a text string
109       stored in a scalar variable, returning a Stone object.
110
111       Boulder::Genbank methods
112
113       This section lists the public methods that the Boulder::Genbank class
114       makes available.
115
116       new()
117              # Network fetch via Entrez, with accession numbers
118              $gb=new Boulder::Genbank(-accessor  =>  'Entrez',
119                                       -fetch     =>  [qw/M57939 M28274 L36028/]);
120
121              # Same, but shorter and uses -> operator
122              $gb = Boulder::Genbank->new(qw(M57939 M28274 L36028));
123
124              # Network fetch via Entrez, with a query
125
126              # Network fetch via Entrez, with a query
127              $query = 'Homo sapiens[Organism] AND EST[Keyword]';
128              $gb=new Boulder::Genbank(-accessor  =>  'Entrez',
129                                       -fetch     =>  $query);
130
131              # Local fetch via Yank, with accession numbers
132              $gb=new Boulder::Genbank(-accessor  =>  'Yank',
133                                       -fetch     =>  [qw/M57939 M28274 L36028/]);
134
135              # Local fetch via File
136              $gb=new Boulder::Genbank(-accessor  =>  'File',
137                                       -fetch     =>  '/usr/local/genbank/gbpri3.seq');
138
139           The new() method creates a new Boulder::Genbank stream on the
140           accessor provided.  The three possible accessors are Entrez, Yank
141           and File.  If successful, the method returns the stream object.
142           Otherwise it returns undef.
143
144           new() takes the following arguments:
145
146                   -accessor       Name of the accessor to use
147                   -fetch          Parameters to pass to the accessor
148                   -proxy          Path to an HTTP proxy, used when using
149                                    the Entrez accessor over a firewall.
150
151           Specify the accessor to use with the -accessor argument.  If not
152           specified, it defaults to Entrez.
153
154           -fetch is an accessor-specific argument.  The possibilities are:
155
156           For Entrez, the -fetch argument may point to a scalar, in which
157           case it is interpreted as an Entrez query string.  See
158           http://www.ncbi.nlm.nih.gov/Entrez/linking.html for a description
159           of the query syntax.  Alternatively, -fetch may point to an array
160           reference, in which case it is interpreted as a list of accession
161           numbers to retrieve.  If -fetch points to a hash, it is interpreted
162           as extended information.  See "Extended Entrez Parameters" below.
163
164           For Yank, the -fetch argument must point to an array reference con‐
165           taining the accession numbers to retrieve.
166
167           For File, the -fetch argument must point to a string-valued scalar,
168           which will be interpreted as the path to the file to read Genbank
169           entries from.
170
171           For Entrez (and Entrez only) Boulder::Genbank allows you to use a
172           shortcut syntax in which you provide new() with a list of accession
173           numbers:
174
175             $gb = new Boulder::Genbank('M57939','M28274','L36028');
176
177       newFh()
178           This works like new(), but returns a filehandle.  To recover each
179           GenBank record, read from the filehandle with the <> operator:
180
181             $fh = Boulder::Genbank->newFh('M57939','M28274','L36028');
182             while ($record = <$fh>) {
183                print $record->asString;
184             }
185
186       get()
187           The get() method is inherited from Boulder::Stream, and simply
188           returns the next parsed Genbank Stone, or undef if there is nothing
189           more to fetch.  It has the same semantics as the parent class,
190           including the ability to restrict access to certain top-level tags.
191
192           The object returned is a Stone::GB_Sequence object, which is a
193           descendent of Stone.
194
195       put()
196           The put() method is inherited from the parent Boulder::Stream
197           class, and will write the passed Stone to standard output in Boul‐
198           der format.  This means that it is currently not possible to write
199           a Boulder::Genbank object back into Genbank flatfile form.
200
201       Extended Entrez Parameters
202
203       The Entrez accessor recognizes extended parameters that allow you to
204       customize the search.  Instead of passing a query string
205       scalar or a list of accession numbers as the -fetch argument, pass a
206       hash reference.  The hashref should contain one or more of the follow‐
207       ing keys:
208
209       -query
210           The Entrez query to process.
211
212       -accession
213           The list of accession numbers to fetch, as an array ref.
214
215       -db The database to search.  This is a single-letter database code
216           selected from the following list:
217
218             m  MEDLINE
219             p  Protein
220             n  Nucleotide
221             s  Popset
222
223       -proxy
224           An HTTP proxy to use.  For example:
225
226              -proxy => 'http://www.firewall.com:9000'
227
228           If you think you need this, get the correct URL from your system
229           administrator.
230
231       As an example, here's how to search for ESTs from Oryza sativa that
232       have been entered or modified since 1999.
233
234         my $gb = new Boulder::Genbank( -accessor=>'Entrez',
235                                        -fetch=>{ -query=>'Oryza sativa[Organism] AND EST[Keyword] AND 1999[MDAT]',
236                                                  -db   => 'n'
237                                                });
238

METHODS DEFINED BY THE GENBANK STONE OBJECT

240       Each record returned from the Boulder::Genbank stream defines a set of
241       methods that correspond to features and other fields in the Genbank
242       flat file record.  Stone::GB_Sequence gives the full details, but they
243       are listed for reference here:
244
245       $length = $entry->length
246
247       Get the length of the sequence.
248
249       $start = $entry->start
250
251       Get the start position of the sequence, currently always "1".
252
253       $end = $entry->end
254
255       Get the end position of the sequence, currently always the same as the
256       length.
257
258       @feature_list = $entry->features(-pos=>[50,450],-type=>['CDS','Exon'])
259
260       features() will search the entry feature list for those features that
261       meet certain criteria.  The criteria are specified using the -pos
262       and/or -type argument names, as shown below.
263
264       -pos
265           Provide a position or range of positions which the feature must
266           overlap.  A single position is specified in this way:
267
268              -pos => 1500;         # feature must overlap position 1500
269
270           or a range of positions in this way:
271
272              -pos => [1000,1500];  # 1000 to 1500 inclusive
273
274           If no criteria are provided, then features() returns all the fea‐
275           tures, and is equivalent to calling the Features() accessor.
276
277       -type, -types
278           Filter the list of features by type or a set of types.  Matches are
279           case-insensitive, so "exon", "Exon" and "EXON" are all equivalent.
280           You may call with a single type as in:
281
282              -type => 'Exon'
283
284           or with a list of types, as in
285
286              -types => ['Exon','CDS']
287
288           The names "-type" and "-types" can be used interchangeably.
289
290       $seqObj = $entry->bioSeq;
291
292       Returns a Bio::Seq object from the Bioperl project.  Dies with an error
293       message unless the Bio::Seq module is installed.
294

OUTPUT TAGS

296       The tags returned by the parsing operation are taken from the NCBI
297       ASN.1 schema.  For consistency, they are normalized so that the initial
298       letter is capitalized, and all subsequent letters are lowercase.  This
299       section contains an abbreviated list of the most useful/common tags.
300       See "The NCBI Data Model", by James Ostell and Jonathan Kans in "Bioin‐
301       formatics: A Practical Guide to the Analysis of Genes and Proteins"
302       (Eds. A. Baxevanis and F. Ouellette), pp 121-144 for the full listing.
303
304       Top-Level Tags
305
306       These are tags that appear at the top level of the parsed Genbank
307       entry.
308
309       Accession
310           The accession number of this entry.  Because of the vagaries of the
311           Genbank data model, an entry may have multiple accession numbers
312           (e.g. after a merging operation).  Accession may therefore be a
313           multi-valued tag.
314
315           Example:
316
317                 my $accessionNo = $s->Accession;
318
319       Authors
320           The list of authors, as they appear on the AUTHORS line of the Gen‐
321           bank record.  No attempt is made to parse them into individual
322           authors.
323
324       Basecount
325           The nucleotide basecount for the entry.  It is presented as a Boul‐
326           der Stone with keys "a", "c", "t" and "g".  Example:
327
328                my $A = $s->Basecount->A;
329                my $C = $s->Basecount->C;
330                my $G = $s->Basecount->G;
331                my $T = $s->Basecount->T;
332                print "GC content is ",($G+$C)/($A+$C+$G+$T),"\n";
333
334       Blob
335           The entire flatfile record as an unparsed chunk of text (a "blob").
336           This is a handy way of reassembling the record for human inspec‐
337           tion.
338
339       Comment
340           The COMMENT line from the Genbank record.
341
342       Definition
343           The DEFINITION line from the Genbank record, unmodified.
344
345       Features
346           The FEATURES table.  This is a complex stone object with multiple
347           subtags.  See the "The Features Tag" for details.
348
349       Journal
350           The JOURNAL line from the Genbank record, unmodified.
351
352       Keywords
353           The KEYWORDS line from the Genbank record, unmodified.  No attempt
354           is made to parse the keywords into separate values.
355
356           Example:
357
358               my $keywords = $s->Keywords
359
360       Locus
361           The LOCUS line from the Genbank record.  It is not further parsed.
362
363       Medline, Nid
364           References to other database accession numbers.
365
366       Organism
367           The taxonomic name of the organism from which this entry was
368           derived. This line is taken from the Genbank entry unmodified.  See
369           the NCBI data model documentation for an explanation of their taxo‐
370           nomic syntax.
371
372       Reference
373           The REFERENCE line from the Genbank entry.  There are often multi‐
374           ple Reference lines.  Example:
375
376             my @references = $s->Reference;
377
378       Sequence
379           The DNA or RNA sequence of the entry.  This is presented as a sin‐
380           gle lower-case string, with all base numbers and formatting charac‐
381           ters removed.
382
383       Source
384           The entry's SOURCE field; often giving clues on how the sequencing
385           was performed.
386
387       Title
388           The TITLE field from the paper describing this entry, if any.
389
390       The Features Tag
391
392       The Features tag points to a Stone record that contains multiple sub‐
393       tags.  Each subtag is the name of a feature which points, in turn, to a
394       Stone that describes the feature's location and other attributes.  The
395       full list of features is beyond the scope of this document, but the following are the
396       features that are most often seen:
397
398               Cds             a CDS
399               Intron          an intron
400               Exon            an exon
401               Gene            a gene
402               Mrna            an mRNA
403               Polya_site      a putative polyadenylation signal
404               Repeat_unit     a repetitive region
405               Source          More information about the organism and cell
406                               type the sequence was derived from
407               Satellite       a microsatellite (dinucleotide repeat)
408
409       Each feature will contain one or more of the following subtags:
410
411       Db_xref
412           A cross-reference to another database in the form DB_NAME:acces‐
413           sion_number.  See the NCBI Web site for a description of these
414           cross references.
415
416       Evidence
417           The evidence for this feature, either "experimental" or "pre‐
418           dicted".
419
420       Gene
421           If the feature involves a gene, this will be the gene's name (or
422           one of its names).  This subtag is often seen in "Gene" and Cds
423           features.
424
425           Example:
426
427                   foreach ($s->Features->Cds) {
428                      my $gene = $_->Gene;
429                      my $position = $_->Position;
430                      print "Gene $gene ($position)\n";
431                   }
432
433       Map If the feature is mapped, this provides a map position, usually as
434           a cytogenetic band.
435
436       Note
437           A grab-bag for various text notes.
438
439       Number
440           When multiple features of this type occur, this field is used to
441           number them.  Ordinarily this field is not needed because Boul‐
442           der::Genbank preserves the order of features.
443
444       Organism
445           If the feature is Source, this provides the source organism.
446
447       Position
448           The position of this feature, usually expressed as a range
449           (1970..1975).
450
451       Product
452           The protein product of the feature, if applicable, as a text
453           string.
454
455       Translation
456           The protein translation of the feature, if applicable.
457

SEE ALSO

459       Boulder, Boulder::Blast
460

AUTHOR

462       Lincoln Stein <lstein@cshl.org>.
463
464       Copyright (c) 1997-2000 Lincoln D. Stein
465
466       This library is free software; you can redistribute it and/or modify it
467       under the same terms as Perl itself.  See DISCLAIMER.txt for dis‐
468       claimers of warranty.
469

EXAMPLE GENBANK OBJECT

471       The following is an excerpt from a moderately complex Genbank Stone.
472       The Sequence line and several other long lines have been truncated for
473       readability.
474
475        Authors=Spritz,R.A., Strunk,K., Surowy,C.S.O., Hoch,S., Barton,D.E. and Francke,U.
476        Authors=Spritz,R.A., Strunk,K., Surowy,C.S. and Mohrenweiser,H.W.
477        Locus=HUMRNP7011   2155 bp    DNA             PRI       03-JUL-1991
478        Accession=M57939
479        Accession=J04772
480        Accession=M57733
481        Keywords=ribonucleoprotein antigen.
482        Sequence=aagcttttccaggcagtgcgagatagaggagcgcttgagaaggcaggttttgcagcagacggcagtgacagcccag...
483        Definition=Human small nuclear ribonucleoprotein (U1-70K) gene, exon 10 and 11.
484        Journal=Nucleic Acids Res. 15, 10373-10391 (1987)
485        Journal=Genomics 8, 371-379 (1990)
486        Nid=g337441
487        Medline=88096573
488        Medline=91065657
489        Features={
490          Polya_site={
491            Evidence=experimental
492            Position=1989
493            Gene=U1-70K
494          }
495          Polya_site={
496            Position=1990
497            Gene=U1-70K
498          }
499          Polya_site={
500            Evidence=experimental
501            Position=1992
502            Gene=U1-70K
503          }
504          Polya_site={
505            Evidence=experimental
506            Position=1998
507            Gene=U1-70K
508          }
509          Source={
510            Organism=Homo sapiens
511            Db_xref=taxon:9606
512            Position=1..2155
513            Map=19q13.3
514          }
515          Cds={
516            Codon_start=1
517            Product=ribonucleoprotein antigen
518            Db_xref=PID:g337445
519            Position=join(M57929:329..475,M57930:183..245,M57930:358..412, ...
520            Gene=U1-70K
521            Translation=MTQFLPPNLLALFAPRDPIPYLPPLEKLPHEKHHNQPYCGIAPYIREFEDPRDAPPPTR...
522          }
523          Cds={
524            Codon_start=1
525            Product=ribonucleoprotein antigen
526            Db_xref=PID:g337444
527            Evidence=experimental
528            Position=join(M57929:329..475,M57930:183..245,M57930:358..412, ...
529            Gene=U1-70K
530            Translation=MTQFLPPNLLALFAPRDPIPYLPPLEKLPHEKHHNQPYCGIAPYIREFEDPR...
531          }
532          Polya_signal={
533            Position=1970..1975
534            Note=putative
535            Gene=U1-70K
536          }
537          Intron={
538            Evidence=experimental
539            Position=1100..1208
540            Gene=U1-70K
541          }
542          Intron={
543            Number=10
544            Evidence=experimental
545            Position=1100..1181
546            Gene=U1-70K
547          }
548          Intron={
549            Number=9
550            Evidence=experimental
551            Position=order(M57937:702..921,1..1011)
552            Note=2.1 kb gap
553            Gene=U1-70K
554          }
555          Intron={
556            Position=order(M57935:272..406,M57936:1..284,M57937:1..599, <1..>1208)
557            Gene=U1-70K
558          }
559          Intron={
560            Evidence=experimental
561            Position=order(M57935:284..406,M57936:1..284,M57937:1..599, <1..>1208)
562            Note=first gap-0.14 kb, second gap-0.62 kb
563            Gene=U1-70K
564          }
565          Intron={
566            Number=8
567            Evidence=experimental
568            Position=order(M57935:272..406,M57936:1..284,M57937:1..599, <1..>1181)
569            Note=first gap-0.14 kb, second gap-0.62 kb
570            Gene=U1-70K
571          }
572          Exon={
573            Number=10
574            Evidence=experimental
575            Position=1012..1099
576            Gene=U1-70K
577          }
578          Exon={
579            Number=11
580            Evidence=experimental
581            Position=1182..(1989.1998)
582            Gene=U1-70K
583          }
584          Exon={
585            Evidence=experimental
586            Position=1209..(1989.1998)
587            Gene=U1-70K
588          }
589          Mrna={
590            Product=ribonucleoprotein antigen
591            Position=join(M57928:358..668,M57929:319..475,M57930:183..245, ...
592            Gene=U1-70K
593          }
594          Mrna={
595            Product=ribonucleoprotein antigen
596            Citation=[2]
597            Evidence=experimental
598            Position=join(M57928:358..668,M57929:319..475,M57930:183..245, ...
599            Gene=U1-70K
600          }
601          Gene={
602            Position=join(M57928:207..719,M57929:1..562,M57930:1..577, ...
603            Gene=U1-70K
604          }
605        }
606        Reference=1  (sites)
607        Reference=2  (bases 1 to 2155)
608        =
609
610
611
612perl v5.8.8                       2000-06-08               Boulder::Genbank(3)
Impressum