File Coverage

Bio/Factory/FTLocationFactory.pm
Criterion Covered Total %
statement 119 120 99.1
branch 61 64 95.3
condition 40 44 90.9
subroutine 9 9 100.0
pod 1 1 100.0
total 230 238 96.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Factory::FTLocationFactory
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Hilmar Lapp
7             #
8             # Copyright Hilmar Lapp
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # (c) Hilmar Lapp, hlapp at gnf.org, 2002.
13             # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
14             #
15             # You may distribute this module under the same terms as perl itself.
16             # Refer to the Perl Artistic License (see the license accompanying this
17             # software package, or see http://www.perl.com/language/misc/Artistic.html)
18             # for the terms under which you may use, modify, and redistribute this module.
19             #
20             # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
21             # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
22             # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
23             #
24              
25             # POD documentation - main docs before the code
26              
27             =head1 NAME
28              
29             Bio::Factory::FTLocationFactory - A FeatureTable Location Parser
30              
31             =head1 SYNOPSIS
32              
33             # parse a string into a location object
34             $loc = Bio::Factory::FTLocationFactory->from_string("join(100..200,
35             400..500");
36              
37             =head1 DESCRIPTION
38              
39             Implementation of string-encoded location parsing for the Genbank feature
40             table encoding of locations.
41              
42             =head1 FEEDBACK
43              
44             =head2 Mailing Lists
45              
46             User feedback is an integral part of the evolution of this and other
47             Bioperl modules. Send your comments and suggestions preferably to
48             the Bioperl mailing list. Your participation is much appreciated.
49              
50             bioperl-l@bioperl.org - General discussion
51             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52              
53             =head2 Support
54              
55             Please direct usage questions or support issues to the mailing list:
56              
57             I
58              
59             rather than to the module maintainer directly. Many experienced and
60             reponsive experts will be able look at the problem and quickly
61             address it. Please include a thorough description of the problem
62             with code and data examples if at all possible.
63              
64             =head2 Reporting Bugs
65              
66             Report bugs to the Bioperl bug tracking system to help us keep track
67             of the bugs and their resolution. Bug reports can be submitted via the
68             web:
69              
70             https://github.com/bioperl/bioperl-live/issues
71              
72             =head1 AUTHOR - Hilmar Lapp
73              
74             Email hlapp at gmx.net
75              
76             =head1 CONTRIBUTORS
77              
78             Jason Stajich, jason-at-bioperl-dot-org
79             Chris Fields, cjfields-at-uiuc-dot-edu
80              
81             =head1 APPENDIX
82              
83             The rest of the documentation details each of the object methods.
84             Internal methods are usually preceded with a _
85              
86             =cut
87              
88              
89             # Let the code begin...
90              
91             package Bio::Factory::FTLocationFactory;
92 83     83   787 use vars qw($LOCREG);
  83         88  
  83         3006  
93 83     83   266 use strict;
  83         110  
  83         1343  
94              
95             # Object preamble - inherits from Bio::Root::Root
96              
97 83     83   15436 use Bio::Location::Simple;
  83         130  
  83         1941  
98 83     83   22077 use Bio::Location::Split;
  83         131  
  83         2076  
99 83     83   18565 use Bio::Location::Fuzzy;
  83         120  
  83         2204  
100              
101              
102 83     83   345 use base qw(Bio::Root::Root Bio::Factory::LocationFactoryI);
  83         205  
  83         26096  
103              
104             BEGIN {
105             # the below is an optimized regex obj. from J. Freidl's Mastering Reg Exp.
106             $LOCREG = qr{
107             (?>
108             [^()]+
109             |
110             \(
111             (??{$LOCREG})
112             \)
113             )*
114 83     83   74658 }x;
115             }
116              
117             =head2 new
118              
119             Title : new
120             Usage : my $obj = Bio::Factory::FTLocationFactory->new();
121             Function: Builds a new Bio::Factory::FTLocationFactory object
122             Returns : an instance of Bio::Factory::FTLocationFactory
123             Args :
124              
125             =cut
126              
127             =head2 from_string
128              
129             Title : from_string
130             Usage : $loc = $locfactory->from_string("100..200");
131             Function: Parses the given string and returns a Bio::LocationI implementing
132             object representing the location encoded by the string.
133              
134             This implementation parses the Genbank feature table
135             encoding of locations.
136             Example :
137             Returns : A Bio::LocationI implementing object.
138             Args : A string.
139              
140             =cut
141              
142             sub from_string {
143 13741     13741 1 16891 my ($self,$locstr,$op) = @_;
144 13741         9719 my $loc;
145            
146             #$self->debug("$locstr\n");
147            
148             # $op for operator (error handling)
149            
150             # run on first pass only
151             # Note : These location types are now deprecated in GenBank (Oct. 2006)
152 13741 100       18980 if (!defined($op)) {
153             # convert all (X.Y) to [X.Y]
154 9940         21771 $locstr =~ s{\((\d+\.\d+)\)}{\[$1\]}g;
155             # convert ABC123:(X..Y) to ABC123:[X..Y]
156             # we should never see the above
157 9940         13513 $locstr =~ s{:\((\d+\.{2}\d+)\)}{:\[$1\]}g;
158             }
159            
160 13741 100       50765 if ($locstr =~ m{(.*?)\(($LOCREG)\)(.*)}o) { # any matching parentheses?
161 7418         16442 my ($beg, $mid, $end) = ($1, $2, $3);
162 7418         18700 my (@sublocs) = (split(q(,),$beg), $mid, split(q(,),$end));
163            
164 7418         7485 my @loc_objs;
165             my $loc_obj;
166            
167             SUBLOCS:
168 7418         11332 while (@sublocs) {
169 7429         8437 my $subloc = shift @sublocs;
170 7429 100       10296 next if !$subloc;
171 7428 100 100     32546 my $oparg = ($subloc eq 'join' || $subloc eq 'bond' ||
172             $subloc eq 'order' || $subloc eq 'complement') ? $subloc : undef;
173             # has operator, requires further work (recurse)
174 7428 100       8150 if ($oparg) {
175 7417         6223 my $sub = shift @sublocs;
176             # simple split operators (no recursive calls needed)
177 7417 100 66     38199 if (($oparg eq 'join' || $oparg eq 'order' || $oparg eq 'bond' )
      100        
178             && $sub !~ m{(?:join|order|bond)}) {
179 3627         10601 my @splitlocs = split(q(,), $sub);
180 3627         12201 $loc_obj = Bio::Location::Split->new(-verbose => 1,
181             -splittype => $oparg);
182             # Store strand values for later consistency check
183 3627         3630 my @subloc_strands;
184             my @s_objs;
185 3627         4503 foreach my $splitloc (@splitlocs) {
186 35515 50       48523 next unless $splitloc;
187 35515         23282 my $sobj;
188 35515 100       703753 if ($splitloc =~ m{\(($LOCREG)\)}) {
189 43         152 my $comploc = $1;
190 43         103 $sobj = $self->_parse_location($comploc);
191 43         97 $sobj->strand(-1);
192 43         65 push @subloc_strands, -1;
193             } else {
194 35472         62820 $sobj = $self->_parse_location($splitloc);
195 35472         37141 push @subloc_strands, 1;
196             }
197 35515         62555 push @s_objs, $sobj;
198             }
199              
200             # Sublocations strand values consistency check to set
201             # Guide Strand and sublocations adding order
202 3627 50       6713 if (scalar @s_objs > 0) {
203 3627         3311 my $identical = 0;
204              
205 3627         3421 my $first_value = $subloc_strands[0];
206 3627         4412 foreach my $strand (@subloc_strands) {
207 35515 100       41773 $identical++ if ($strand == $first_value);
208             }
209              
210 3627 100       5322 if ($identical == scalar @subloc_strands) {
211             # Set guide_strand if all sublocations have the same strand
212 3625         7333 $loc_obj->guide_strand($first_value);
213              
214             # Reverse sublocation order for negative strand locations, e.g.:
215             # Common (CAA24672.1):
216             # join(complement(4918..5163),complement(2691..4571))
217             # Trans-splicing (NP_958375.1):
218             # join(32737..32825,complement(174205..174384),complement(69520..71506))
219 3625 100       5534 if ($first_value == -1) {
220 15         27 @s_objs = reverse @s_objs;
221             }
222             }
223             else {
224             # Mixed strand values
225 2         10 $loc_obj->guide_strand(undef);
226             }
227              
228             # Add sublocations
229 3627         4147 foreach my $s_obj (@s_objs) {
230 35515         41413 $loc_obj->add_sub_Location($s_obj);
231             }
232             }
233             } else {
234 3790         7624 $loc_obj = $self->from_string($sub, $oparg);
235             # reinsure the operator is set correctly for this level
236             # unless it is complement
237 3790 100       7380 $loc_obj->splittype($oparg) unless $oparg eq 'complement';
238             }
239             }
240             # no operator, simple or fuzzy
241             else {
242 11         23 $loc_obj = $self->from_string($subloc,1);
243             }
244 7428 100 100     15273 if ($op && $op eq 'complement') {
245 1660         3078 $loc_obj->strand(-1);
246             }
247              
248             # For Split-type $loc_obj, if guide strand is set (meaning consistent strand for
249             # all sublocs) and guide strand is the same than the last location from @loc_objs,
250             # then recover the sublocations and add them to @loc_objs. This way,
251             # "join(10..20,join(30..40,50..60))" becomes "join(10..20,30..40,50..60)"
252 7428 100 100     22694 my $guide_strand = ($loc_obj->isa('Bio::Location::SplitLocationI')) ? ($loc_obj->guide_strand || 0) : 0;
253 7428 100       10952 my $last_strand = (scalar @loc_objs > 0) ? $loc_objs[-1]->strand : 0;
254 7428 100 100     23565 if ( $guide_strand != 0
      100        
255             and $guide_strand == $last_strand
256             and $oparg eq $op # join(,join()) OK, order(join()) NOT OK
257             ) {
258 2         5 my @subloc_objs = $loc_obj->sub_Location(0);
259 2         3 foreach my $subloc_obj (@subloc_objs) {
260 5         8 push @loc_objs, $subloc_obj;
261             }
262             }
263             else {
264 7426         14422 push @loc_objs, $loc_obj;
265             }
266             }
267 7418         5902 my $ct = @loc_objs;
268 7418 50 66     22318 if ($op && !($op eq 'join' || $op eq 'order' || $op eq 'bond')
      100        
      66        
269             && $ct > 1 ) {
270 0         0 $self->throw("Bad operator $op: had multiple locations ".
271             scalar(@loc_objs).", should be SplitLocationI");
272             }
273 7418 100       8587 if ($ct > 1) {
274 7         12 $loc = Bio::Location::Split->new();
275 7         17 $loc->add_sub_Location(shift @loc_objs) while (@loc_objs);
276 7         16 return $loc;
277             } else {
278 7411         5984 $loc = shift @loc_objs;
279 7411         20323 return $loc;
280             }
281             } else { # simple location(s)
282 6323         8680 $loc = $self->_parse_location($locstr);
283 6323 100 100     16775 $loc->strand(-1) if ($op && $op eq 'complement');
284             }
285 6323         10185 return $loc;
286             }
287              
288             =head2 _parse_location
289              
290             Title : _parse_location
291             Usage : $loc = $locfactory->_parse_location( $loc_string)
292              
293             Function: Parses the given location string and returns a location object
294             with start() and end() and strand() set appropriately.
295             Note that this method is private.
296             Returns : A Bio::LocationI implementing object or undef on failure
297             Args : location string
298              
299             =cut
300              
301             sub _parse_location {
302 41838     41838   43449 my ($self, $locstr) = @_;
303 41838         29506 my ($loc, $seqid);
304             #$self->debug( "Location parse, processing $locstr\n");
305             # 'remote' location?
306 41838 100       66966 if($locstr =~ m{^(\S+):(.*)$}o) {
307             # yes; memorize remote ID and strip from location string
308 139         226 $seqid = $1;
309 139         170 $locstr = $2;
310             }
311            
312             # split into start and end
313 41838         72958 my ($start, $end) = split(/\.\./, $locstr);
314             # remove enclosing parentheses if any; note that because of parentheses
315             # possibly surrounding the entire location the parentheses around start
316             # and/or may be asymmetrical
317             # Note: these are from X.Y fuzzy locations, which are deprecated!
318 41838 100       142138 $start =~ s/(?:^\[+|\]+$)//g if $start;
319 41838 100       98530 $end =~ s/(?:^\[+|\]+$)//g if $end;
320              
321             # Is this a simple (exact) or a fuzzy location? Simples have exact start
322             # and end, or is between two adjacent bases. Everything else is fuzzy.
323 41838         33566 my $loctype = ".."; # exact with start and end as default
324              
325 41838 100 100     75653 $loctype = '?' if ( ($locstr =~ /\?/) && ($locstr !~ /\?\d+/) );
326              
327 41838         30735 my $locclass = "Bio::Location::Simple";
328 41838 100       57146 if(! defined($end)) {
329 794 100       1929 if($locstr =~ /(\d+)([\.\^])(\d+)/) {
330 35         65 $start = $1;
331 35         58 $end = $3;
332 35         47 $loctype = $2;
333 35 100 66     239 $locclass = "Bio::Location::Fuzzy"
334             unless (abs($end-$start) <= 1) && ($loctype eq "^");
335             } else {
336 759         744 $end = $start;
337             }
338             }
339             # start_num and end_num are for the numeric only versions of
340             # start and end so they can be compared
341             # in a few lines
342 41838         40130 my ($start_num, $end_num) = ($start,$end);
343 41838 100 100     146148 if ( ($start =~ /[\>\<\?\.\^]/) || ($end =~ /[\>\<\?\.\^]/) ) {
344 456         527 $locclass = 'Bio::Location::Fuzzy';
345 456 100       1183 if($start =~ /(\d+)/) {
346 435         849 ($start_num) = $1;
347             } else {
348 21         35 $start_num = 0
349             }
350 456 100       1026 if ($end =~ /(\d+)/) {
351 435         620 ($end_num) = $1;
352 21         34 } else { $end_num = 0 }
353             }
354 41838         35852 my $strand = 1;
355              
356 41838 100 100     91347 if( $start_num > $end_num && $loctype ne '?') {
357 13         21 ($start,$end,$strand) = ($end,$start,-1);
358             }
359             # instantiate location and initialize
360 41838         98019 $loc = $locclass->new(-verbose => $self->verbose,
361             -start => $start,
362             -end => $end,
363             -strand => $strand,
364             -location_type => $loctype);
365             # set remote ID if remote location
366 41838 100       64593 if($seqid) {
367 139         251 $loc->is_remote(1);
368 139         226 $loc->seq_id($seqid);
369             }
370              
371             # done (hopefully)
372 41838         54101 return $loc;
373             }
374              
375             1;