File Coverage

Bio/SeqIO/locuslink.pm
Criterion Covered Total %
statement 152 173 87.8
branch 37 56 66.0
condition 2 6 33.3
subroutine 18 19 94.7
pod 1 9 11.1
total 210 263 79.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SeqIO::locuslink
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Keith Ching
7             #
8             # Copyright Keith Ching
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             #
13             # (c) Keith Ching, kching at gnf.org, 2002.
14             # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
15             #
16             # You may distribute this module under the same terms as perl itself.
17             # Refer to the Perl Artistic License (see the license accompanying this
18             # software package, or see http://www.perl.com/language/misc/Artistic.html)
19             # for the terms under which you may use, modify, and redistribute this module.
20             #
21             # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
22             # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
23             # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
24             #
25              
26             # POD documentation - main docs before the code
27              
28             =head1 NAME
29              
30             Bio::SeqIO::locuslink - LocusLink input/output stream
31              
32             =head1 SYNOPSIS
33              
34             # don't instantiate directly - instead do
35             my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN);
36              
37             =head1 DESCRIPTION
38              
39             This module parses LocusLink into Bio::SeqI objects with rich
40             annotation, but no sequence.
41              
42             The input file has to be in the LL_tmpl format - the tabular format
43             will not work.
44              
45             The way the current implementation populates the object is rather a
46             draft work than a finished work of art. Note that at this stage the
47             LocusLink entries cannot be round-tripped, because the parser loses
48             certain information. For instance, most of the alternative transcript
49             descriptions are not retained. The parser also misses any element
50             that deals with visual representation (e.g., 'button') except for the
51             URLs. Almost all of the pieces of the annotation are kept in a
52             Bio::Annotation::Collection object, see L
53             for more information.
54              
55             =head1 FEEDBACK
56              
57             =head2 Mailing Lists
58              
59             User feedback is an integral part of the evolution of this and other
60             Bioperl modules. Send your comments and suggestions preferably to
61             the Bioperl mailing list. Your participation is much appreciated.
62              
63             bioperl-l@bioperl.org - General discussion
64             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
65              
66             =head2 Support
67              
68             Please direct usage questions or support issues to the mailing list:
69              
70             I
71              
72             rather than to the module maintainer directly. Many experienced and
73             reponsive experts will be able look at the problem and quickly
74             address it. Please include a thorough description of the problem
75             with code and data examples if at all possible.
76              
77             =head2 Reporting Bugs
78              
79             Report bugs to the Bioperl bug tracking system to help us keep track
80             of the bugs and their resolution. Bug reports can be submitted via
81             the web:
82              
83             https://github.com/bioperl/bioperl-live/issues
84              
85             =head1 AUTHOR - Keith Ching
86              
87             Email kching at gnf.org
88              
89             =head1 CONTRIBUTORS
90              
91             Hilmar Lapp, hlapp at gmx.net
92              
93             =head1 APPENDIX
94              
95             The rest of the documentation details each of the object methods.
96             Internal methods are usually preceded with a _
97              
98             =cut
99              
100             package Bio::SeqIO::locuslink;
101              
102 1     1   525 use strict;
  1         1  
  1         26  
103              
104 1     1   215 use Bio::Seq::SeqFactory;
  1         2  
  1         29  
105 1     1   303 use Bio::Species;
  1         3  
  1         30  
106 1     1   271 use Bio::Annotation::DBLink;
  1         3  
  1         24  
107             #use Bio::Annotation::Reference;
108 1     1   216 use Bio::Annotation::Comment;
  1         3  
  1         23  
109 1     1   220 use Bio::Annotation::SimpleValue;
  1         2  
  1         25  
110 1     1   247 use Bio::Annotation::OntologyTerm;
  1         3  
  1         34  
111 1     1   294 use Bio::Annotation::Collection;
  1         4  
  1         27  
112              
113 1     1   7 use base qw(Bio::SeqIO);
  1         2  
  1         388  
114              
115             # list of all the field names in locuslink
116             my @locuslink_keys = qw(
117             ACCNUM
118             ALIAS_PROT
119             ALIAS_SYMBOL
120             ASSEMBLY
121             BUTTON
122             CDD
123             CHR
124             COMP
125             CONTIG
126             CURRENT_LOCUSID
127             DB_DESCR
128             DB_LINK
129             ECNUM
130             EVID
131             EXTANNOT
132             GO
133             GRIF
134             LINK
135             LOCUSID
136             LOCUS_CONFIRMED
137             LOCUS_TYPE
138             MAP
139             MAPLINK
140             NC
141             NG
142             NM
143             NP
144             NR
145             OFFICIAL_GENE_NAME
146             OFFICIAL_SYMBOL
147             OMIM
148             ORGANISM
149             PHENOTYPE
150             PHENOTYPE_ID
151             PMID
152             PREFERRED_GENE_NAME
153             PREFERRED_PRODUCT
154             PREFERRED_SYMBOL
155             PRODUCT
156             PROT
157             RELL
158             STATUS
159             STS
160             SUMFUNC
161             SUMMARY
162             TRANSVAR
163             TYPE
164             UNIGENE
165             XG
166             XM
167             XP
168             XR
169             );
170              
171             # list of fields to make simple annotations from
172             # fields not listed here or as a key in feature hash are ignored (lost).
173             my %anntype_map = (
174             SimpleValue => [qw(
175             ALIAS_PROT
176             ALIAS_SYMBOL
177             CDD
178             CHR
179             CURRENT_LOCUSID
180             ECNUM
181             EXTANNOT
182             MAP
183             NC
184             NR
185             OFFICIAL_GENE_NAME
186             OFFICIAL_SYMBOL
187             PHENOTYPE
188             PREFERRED_GENE_NAME
189             PREFERRED_PRODUCT
190             PREFERRED_SYMBOL
191             PRODUCT
192             RELL
193             SUMFUNC
194             )
195             ],
196             Comment => [qw(
197             SUMMARY
198             )
199             ],
200             );
201              
202              
203             # certain fields are not named the same as the symgene database list
204             my %dbname_map = (
205             pfam => 'Pfam',
206             smart => 'SMART',
207             NM => 'RefSeq',
208             NP => 'RefSeq',
209             XP => 'RefSeq',
210             XM => 'RefSeq',
211             NG => 'RefSeq',
212             XG => 'RefSeq',
213             XR => 'RefSeq',
214             PROT => 'GenBank',
215             ACCNUM => 'GenBank',
216             CONTIG => 'GenBank',
217             # certain fields are not named the same as the symgene
218             # database list: rename the fields the symgene database name
219             # key = field name in locuslink
220             # value = database name in sym
221             #GO => 'GO',
222             OMIM => 'MIM',
223             GRIF => 'GRIF',
224             STS => 'STS',
225             UNIGENE => 'UniGene',
226             );
227              
228             # certain CDD entries use the wrong prefix for the accession number
229             # cddprefix will replace the key w/ the value for these entries
230             my %cddprefix = (
231             pfam => 'PF',
232             smart => 'SM',
233             );
234              
235             # alternate mappings if one field does not exist
236             my %alternate_map = (
237             OFFICIAL_GENE_NAME => 'PREFERRED_GENE_NAME',
238             OFFICIAL_SYMBOL => 'PREFERRED_SYMBOL',
239             );
240              
241             # for these field names, we only care about the first value X in value X|Y|Z
242             my @ll_firstelements = qw(
243             NM
244             NP
245             NG
246             XG
247             XM
248             XP
249             XR
250             PROT
251             STS
252             ACCNUM
253             CONTIG
254             GRIF
255             );
256              
257             # these fields need to be flattened into a single string, using the given
258             # join string
259             my %flatten_tags = (
260             ASSEMBLY => ',',
261             ORGANISM => '', # this should occur only once
262             OFFICIAL_SYMBOL => '', # this should occur only once
263             OFFICIAL_GENE_NAME => '', # this should occur only once
264             LOCUSID => '', # this should occur only once
265             PMID => ',',
266             PREFERRED_SYMBOL => ', ',
267             PREFERRED_GENE_NAME => ', '
268             );
269              
270             # set the default search pattern for all the field names
271             my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys;
272              
273             sub _initialize {
274 1     1   5 my($self,@args) = @_;
275              
276 1         12 $self->SUPER::_initialize(@args);
277              
278             # overwrite the search pattern w/ the first value pattern
279 1         3 foreach my $key(@ll_firstelements){
280 12         29 $feature_pat_map{$key}="^$key: ([^|]+)";
281             }
282              
283             # special search pattern for cdd entries
284 1         5 foreach my $key(keys %cddprefix) {
285 2         6 $feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)';
286             }
287              
288             # special patterns for specific fields
289 1         3 $feature_pat_map{MAP} = '^MAP: (.+?)\|';
290 1         2 $feature_pat_map{MAPHTML} = '^MAP: .+\|(<.+>)\|';
291 1         3 $feature_pat_map{GO} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|';
292 1         3 $feature_pat_map{GO_DESC} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|';
293 1         2 $feature_pat_map{GO_CAT} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|';
294 1         2 $feature_pat_map{EXTANNOT} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+';
295              
296             # set the sequence factory of none has been set already
297 1 50       7 if(! $self->sequence_factory()) {
298 1         8 $self->sequence_factory(Bio::Seq::SeqFactory->new(
299             -type => 'Bio::Seq::RichSeq'));
300             }
301             }
302              
303              
304             #########################
305             #
306             sub search_pattern{
307             #
308             #########################
309 114     114 0 132 my ($self,
310             $entry, #text to search
311             $searchconfirm, #to make sure you got the right thing
312             $searchpattern,
313             $searchtype) = @_;
314 114         1769 my @query = $entry=~/$searchpattern/gm;
315 114 50       236 if ($searchconfirm ne "FALSE"){
316 0 0       0 $self->warn("No $searchtype found\n$entry\n") unless @query;
317 0         0 foreach (@query){
318 0 0       0 if (!($_=~/$searchconfirm/)){
319 0         0 $self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n");
320             }
321             }#endforeach
322             }#endsearchconfirm
323 114         204 return(@query);
324             }#endsub
325             ############
326             #
327             sub read_species{
328             #
329             ############
330 2     2 0 7 my ($spline)=@_;
331 2         6 my $species;
332             my $genus;
333 2         42 ($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/;
334 2         36 my $make = Bio::Species->new();
335 2         10 $make->classification( ($species,$genus) );
336 2         29 return $make;
337             }
338             ################
339             #
340             sub read_dblink{
341             #
342             ################
343 36     36 0 61 my ($ann,$db,$ref)=@_;
344 36 100       76 my @results=$ref ? @$ref : ();
345 36         49 foreach my $id(@results){
346 50 50       73 if($id){
347 50         139 $ann->add_Annotation('dblink',
348             Bio::Annotation::DBLink->new(
349             -database =>$db ,
350             -primary_id =>$id));
351             }
352             }
353 36         64 return($ann);
354             }
355              
356             ################
357             #
358             sub read_reference{
359             #
360             ################
361 2     2 0 10 my ($ann,$db,$results)=@_;
362              
363 2 50       9 if($results){
364 2         9 chomp($results);
365 2         19 my @ids=split(/,/,$results);
366 2 50       15 $ann = read_dblink($ann,$db,\@ids) if @ids;
367             }
368 2         5 return $ann;
369             }#endsub
370              
371             ################
372             #
373             sub add_annotation{
374             #
375             ################
376 43     43 0 69 my ($ac,$type,$text,$anntype)=@_;
377 43         43 my @args;
378              
379 43 50       62 $anntype = 'SimpleValue' unless $anntype;
380             SWITCH : {
381 43 100       45 $anntype eq 'SimpleValue' && do {
  43         71  
382 41         74 push(@args, -value => $text, -tagname => $type);
383 41         54 last SWITCH;
384             };
385 2 50       5 $anntype eq 'Comment' && do {
386 2         6 push(@args, -text => $text, -tagname => 'comment');
387 2         9 last SWITCH;
388             };
389             }
390 43         144 $ac->add_Annotation("Bio::Annotation::$anntype"->new(@args));
391 43         122 return($ac);
392             }#endsub
393              
394             ################
395             #
396             sub add_annotation_ref{
397             #
398             ################
399 4     4 0 8 my ($ann,$type,$textref)=@_;
400 4 50       13 my @text=$textref ? @$textref : ();
401            
402 4         8 foreach my $text(@text){
403 22         68 $ann->add_Annotation($type,Bio::Annotation::SimpleValue->new(-value => $text));
404             }
405 4         12 return($ann);
406             }#endsub
407              
408             ################
409             #
410             sub make_unique{
411             #
412             ##############
413 2     2 0 7 my ($ann,$key) = @_;
414            
415 2         6 my %seen = ();
416 2         9 foreach my $dbl ($ann->remove_Annotations($key)) {
417 50 100       84 if(!exists($seen{$dbl->as_text()})) {
418 43         79 $seen{$dbl->as_text()} = 1;
419 43         88 $ann->add_Annotation($dbl);
420             }
421             }
422 2         22 return $ann;
423             }
424              
425             ################
426             #
427             sub next_seq{
428             #
429             ##############
430 3     3 1 19 my ($self, @args)=@_;
431 3         5 my (@results,$search,$ref,$cddref);
432              
433             # LOCUSLINK entries begin w/ >>
434 3         25 local $/="\n>>";
435              
436             # slurp in a whole entry and return if no more entries
437 3 100       22 return unless my $entry = $self->_readline;
438              
439             # strip the leading '>>' if it's the first entry
440 2 100       16 if (index($entry,'>>') == 0) { #first entry
441 1         6 $entry = substr($entry,2);
442             }
443              
444             # we aren't interested in obsoleted entries, so we need to loop
445             # and skip those until we've found the next not obsoleted
446 2         6 my %record = ();
447 2   33     24 while($entry && ($entry =~ /\w/)) {
448 2 50       11 if (!($entry=~/LOCUSID/)){
449 0         0 $self->throw("No LOCUSID in first line of record. ".
450             "Not LocusLink in my book.");
451             }
452             # see whether it's an obsoleted entry, and if so jump to the next
453             # one entry right away
454 2 50       15 if($entry =~ /^CURRENT_LOCUSID:/m) {
455             # read next entry and continue
456 0         0 $entry = $self->_readline;
457 0         0 %record = ();
458 0         0 next;
459             }
460             # loop through list of features and get field values
461             # place into record hash as array refs
462 2         39 foreach my $key (keys %feature_pat_map){
463 114         143 $search=$feature_pat_map{$key};
464 114         148 @results=$self->search_pattern($entry,'FALSE',$search,$search);
465 114 100       281 $record{$key} = @results ? [@results] : undef;
466             }#endfor
467             # terminate loop as this one hasn't been obsoleted
468 2         9 last;
469             }
470              
471             # we have reached the end-of-file ...
472 2 50       8 return unless %record;
473              
474             # special processing for CDD entries like pfam and smart
475 2         3 my ($PRESENT,@keep);
476 2         8 foreach my $key(keys %cddprefix){
477             #print "check CDD $key\n";
478 4 100       10 if($record{$key}) {
479 2         3 @keep=();
480 2         4 foreach my $list (@{$record{$key}}) {
  2         7  
481             # replace AC with correct AC number
482 4         9 push(@keep,$cddprefix{$key}.$list);
483             }
484             # replace CDD ref with correctly prefixed AC number
485 2         7 $record{$key} = [@keep];
486             }
487             }
488             # modify CDD references @=();
489 2 50       5 if($record{CDD}) {
490 2         4 @keep=();
491 2         4 foreach my $cdd (@{$record{CDD}}) {
  2         6  
492 8         10 $PRESENT = undef;
493 8         11 foreach my $key (keys %cddprefix) {
494 16 100       104 if ($cdd=~/$key/){
495 4         8 $PRESENT = 1;
496 4         5 last;
497             }
498             }
499 8 100       18 push(@keep,$cdd) if(! $PRESENT);
500             }
501 2         8 $record{CDD} = [@keep];
502             }
503              
504             # create annotation collection - we'll need it now
505 2         16 my $ann = Bio::Annotation::Collection->new();
506              
507 2         12 foreach my $field(keys %dbname_map){
508 32         56 $ann=read_dblink($ann,$dbname_map{$field},$record{$field});
509             }
510            
511             # add GO link as an OntologyTerm annotation
512 2 50       9 if($record{GO}) {
513 2         8 for(my $j = 0; $j < @{$record{GO}}; $j++) {
  10         22  
514             my $goann = Bio::Annotation::OntologyTerm->new(
515             -identifier => $record{GO}->[$j],
516             -name => $record{GO_DESC}->[$j],
517 8         43 -ontology => $record{GO_CAT}->[$j]);
518 8         24 $ann->add_Annotation($goann);
519             }
520             }
521              
522 2         10 $ann=add_annotation_ref($ann,'URL',$record{LINK});
523 2         7 $ann=add_annotation_ref($ann,'URL',$record{DB_LINK});
524              
525             # everything else gets a simple tag or comment value annotation
526 2         10 foreach my $anntype (keys %anntype_map) {
527 4         7 foreach my $key (@{$anntype_map{$anntype}}){
  4         12  
528 40 100       92 if($record{$key}){
529 27         31 foreach (@{$record{$key}}){
  27         51  
530             #print "$key\t\t$_\n";
531 43         76 $ann=add_annotation($ann,$key,$_,$anntype);
532             }
533             }
534             }
535             }
536              
537             # flatten designated attributes into a scalar value
538 2         21 foreach my $field (keys %flatten_tags) {
539 16 100       42 if($record{$field}) {
540             $record{$field} = join($flatten_tags{$field},
541 12         21 @{$record{$field}});
  12         59  
542             }
543             }
544              
545             # annotation that expects the array flattened out
546 2         16 $ann=read_reference($ann,'PUBMED',$record{PMID});
547 2 50       8 if($record{ASSEMBLY}) {
548 2         8 my @assembly=split(/,/,$record{ASSEMBLY});
549 2         9 $ann=read_dblink($ann,'GenBank',\@assembly);
550             }
551              
552             # replace fields w/ alternate if original does not exist
553 2         9 foreach my $fieldval (keys %alternate_map){
554 4 50 33     13 if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){
555 0         0 $record{$fieldval}=$record{$alternate_map{$fieldval}};
556             }
557             }
558              
559             # presently we can't store types or context of dblinks - therefore
560             # we need to remove duplicates that only differ in context
561 2         15 make_unique($ann,'dblink');
562              
563             # create sequence object (i.e., let seq.factory create one)
564             my $seq = $self->sequence_factory->create(
565             -verbose => $self->verbose(),
566             -accession_number => $record{LOCUSID},
567             -desc => $record{OFFICIAL_GENE_NAME},
568             -display_id => $record{OFFICIAL_SYMBOL},
569 2         22 -species => read_species($record{ORGANISM}),
570             -annotation => $ann);
571              
572             # dump out object contents
573             # show_obj([$seq]);
574              
575 2         126 return($seq);
576             }
577              
578             ################
579             #
580             sub show_obj{
581             #
582             ################
583 0     0 0   my ($seqlistref)=@_;
584 0           my @list=@$seqlistref;
585 0           my $out = Bio::SeqIO->new('-fh' => \*STDOUT, -format => 'genbank' );
586 0           my ($ann,@values,$val);
587              
588 0           foreach my $seq(@list){
589 0           $out->write_seq($seq);
590 0           $ann=$seq->annotation;
591 0           foreach my $key ( $ann->get_all_annotation_keys() ) {
592 0           @values = $ann->get_Annotations($key);
593 0           foreach my $value ( @values ) {
594             # value is an Bio::AnnotationI, and defines a "as_text" method
595 0           $val=$value->as_text;
596 0           print "Annotation ",$key,"\t\t",$val,"\n";
597             }
598             }
599             }
600             }#endsub
601              
602             1;