File Coverage

Bio/Factory/FTLocationFactory.pm
Criterion Covered Total %
statement 119 120 99.1
branch 61 64 95.3
condition 40 44 90.9
subroutine 9 9 100.0
pod 1 1 100.0
total 230 238 96.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Factory::FTLocationFactory
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Hilmar Lapp
7             #
8             # Copyright Hilmar Lapp
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # (c) Hilmar Lapp, hlapp at gnf.org, 2002.
13             # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
14             #
15             # You may distribute this module under the same terms as perl itself.
16             # Refer to the Perl Artistic License (see the license accompanying this
17             # software package, or see http://www.perl.com/language/misc/Artistic.html)
18             # for the terms under which you may use, modify, and redistribute this module.
19             #
20             # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
21             # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
22             # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
23             #
24              
25             # POD documentation - main docs before the code
26              
27             =head1 NAME
28              
29             Bio::Factory::FTLocationFactory - A FeatureTable Location Parser
30              
31             =head1 SYNOPSIS
32              
33             # parse a string into a location object
34             $loc = Bio::Factory::FTLocationFactory->from_string("join(100..200,
35             400..500");
36              
37             =head1 DESCRIPTION
38              
39             Implementation of string-encoded location parsing for the Genbank feature
40             table encoding of locations.
41              
42             =head1 FEEDBACK
43              
44             =head2 Mailing Lists
45              
46             User feedback is an integral part of the evolution of this and other
47             Bioperl modules. Send your comments and suggestions preferably to
48             the Bioperl mailing list. Your participation is much appreciated.
49              
50             bioperl-l@bioperl.org - General discussion
51             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52              
53             =head2 Support
54              
55             Please direct usage questions or support issues to the mailing list:
56              
57             I
58              
59             rather than to the module maintainer directly. Many experienced and
60             reponsive experts will be able look at the problem and quickly
61             address it. Please include a thorough description of the problem
62             with code and data examples if at all possible.
63              
64             =head2 Reporting Bugs
65              
66             Report bugs to the Bioperl bug tracking system to help us keep track
67             of the bugs and their resolution. Bug reports can be submitted via the
68             web:
69              
70             https://github.com/bioperl/bioperl-live/issues
71              
72             =head1 AUTHOR - Hilmar Lapp
73              
74             Email hlapp at gmx.net
75              
76             =head1 CONTRIBUTORS
77              
78             Jason Stajich, jason-at-bioperl-dot-org
79             Chris Fields, cjfields-at-uiuc-dot-edu
80              
81             =head1 APPENDIX
82              
83             The rest of the documentation details each of the object methods.
84             Internal methods are usually preceded with a _
85              
86             =cut
87              
88              
89             # Let the code begin...
90              
91             package Bio::Factory::FTLocationFactory;
92 83     83   673 use vars qw($LOCREG);
  83         101  
  83         3048  
93 83     83   277 use strict;
  83         103  
  83         1417  
94              
95             # Object preamble - inherits from Bio::Root::Root
96              
97 83     83   15275 use Bio::Location::Simple;
  83         125  
  83         1915  
98 83     83   21747 use Bio::Location::Split;
  83         133  
  83         2070  
99 83     83   18025 use Bio::Location::Fuzzy;
  83         131  
  83         2095  
100              
101              
102 83     83   343 use base qw(Bio::Root::Root Bio::Factory::LocationFactoryI);
  83         199  
  83         25162  
103              
104             BEGIN {
105             # the below is an optimized regex obj. from J. Freidl's Mastering Reg Exp.
106             $LOCREG = qr{
107             (?>
108             [^()]+
109             |
110             \(
111             (??{$LOCREG})
112             \)
113             )*
114 83     83   75827 }x;
115             }
116              
117             =head2 new
118              
119             Title : new
120             Usage : my $obj = Bio::Factory::FTLocationFactory->new();
121             Function: Builds a new Bio::Factory::FTLocationFactory object
122             Returns : an instance of Bio::Factory::FTLocationFactory
123             Args :
124              
125             =cut
126              
127             =head2 from_string
128              
129             Title : from_string
130             Usage : $loc = $locfactory->from_string("100..200");
131             Function: Parses the given string and returns a Bio::LocationI implementing
132             object representing the location encoded by the string.
133              
134             This implementation parses the Genbank feature table
135             encoding of locations.
136             Example :
137             Returns : A Bio::LocationI implementing object.
138             Args : A string.
139              
140             =cut
141              
142             sub from_string {
143 13741     13741 1 17511 my ($self,$locstr,$op) = @_;
144 13741         9112 my $loc;
145            
146             #$self->debug("$locstr\n");
147            
148             # $op for operator (error handling)
149            
150             # run on first pass only
151             # Note : These location types are now deprecated in GenBank (Oct. 2006)
152 13741 100       18155 if (!defined($op)) {
153             # convert all (X.Y) to [X.Y]
154 9940         22191 $locstr =~ s{\((\d+\.\d+)\)}{\[$1\]}g;
155             # convert ABC123:(X..Y) to ABC123:[X..Y]
156             # we should never see the above
157 9940         13031 $locstr =~ s{:\((\d+\.{2}\d+)\)}{:\[$1\]}g;
158             }
159            
160 13741 100       53026 if ($locstr =~ m{(.*?)\(($LOCREG)\)(.*)}o) { # any matching parentheses?
161 7418         17143 my ($beg, $mid, $end) = ($1, $2, $3);
162 7418         19469 my (@sublocs) = (split(q(,),$beg), $mid, split(q(,),$end));
163            
164 7418         6741 my @loc_objs;
165             my $loc_obj;
166            
167             SUBLOCS:
168 7418         12092 while (@sublocs) {
169 7429         7939 my $subloc = shift @sublocs;
170 7429 100       10820 next if !$subloc;
171 7428 100 100     33920 my $oparg = ($subloc eq 'join' || $subloc eq 'bond' ||
172             $subloc eq 'order' || $subloc eq 'complement') ? $subloc : undef;
173             # has operator, requires further work (recurse)
174 7428 100       7996 if ($oparg) {
175 7417         6927 my $sub = shift @sublocs;
176             # simple split operators (no recursive calls needed)
177 7417 100 66     40146 if (($oparg eq 'join' || $oparg eq 'order' || $oparg eq 'bond' )
      100        
178             && $sub !~ m{(?:join|order|bond)}) {
179 3627         11213 my @splitlocs = split(q(,), $sub);
180 3627         13848 $loc_obj = Bio::Location::Split->new(-verbose => 1,
181             -splittype => $oparg);
182             # Store strand values for later consistency check
183 3627         3517 my @subloc_strands;
184             my @s_objs;
185 3627         4586 foreach my $splitloc (@splitlocs) {
186 35515 50       47253 next unless $splitloc;
187 35515         22508 my $sobj;
188 35515 100       726709 if ($splitloc =~ m{\(($LOCREG)\)}) {
189 43         120 my $comploc = $1;
190 43         89 $sobj = $self->_parse_location($comploc);
191 43         96 $sobj->strand(-1);
192 43         61 push @subloc_strands, -1;
193             } else {
194 35472         60137 $sobj = $self->_parse_location($splitloc);
195 35472         37913 push @subloc_strands, 1;
196             }
197 35515         62776 push @s_objs, $sobj;
198             }
199              
200             # Sublocations strand values consistency check to set
201             # Guide Strand and sublocations adding order
202 3627 50       6893 if (scalar @s_objs > 0) {
203 3627         3151 my $identical = 0;
204              
205 3627         3623 my $first_value = $subloc_strands[0];
206 3627         4221 foreach my $strand (@subloc_strands) {
207 35515 100       41596 $identical++ if ($strand == $first_value);
208             }
209              
210 3627 100       5578 if ($identical == scalar @subloc_strands) {
211             # Set guide_strand if all sublocations have the same strand
212 3625         7686 $loc_obj->guide_strand($first_value);
213              
214             # Reverse sublocation order for negative strand locations, e.g.:
215             # Common (CAA24672.1):
216             # join(complement(4918..5163),complement(2691..4571))
217             # Trans-splicing (NP_958375.1):
218             # join(32737..32825,complement(174205..174384),complement(69520..71506))
219 3625 100       5137 if ($first_value == -1) {
220 15         27 @s_objs = reverse @s_objs;
221             }
222             }
223             else {
224             # Mixed strand values
225 2         11 $loc_obj->guide_strand(undef);
226             }
227              
228             # Add sublocations
229 3627         4755 foreach my $s_obj (@s_objs) {
230 35515         40617 $loc_obj->add_sub_Location($s_obj);
231             }
232             }
233             } else {
234 3790         6614 $loc_obj = $self->from_string($sub, $oparg);
235             # reinsure the operator is set correctly for this level
236             # unless it is complement
237 3790 100       6640 $loc_obj->splittype($oparg) unless $oparg eq 'complement';
238             }
239             }
240             # no operator, simple or fuzzy
241             else {
242 11         27 $loc_obj = $self->from_string($subloc,1);
243             }
244 7428 100 100     16061 if ($op && $op eq 'complement') {
245 1660         3548 $loc_obj->strand(-1);
246             }
247              
248             # For Split-type $loc_obj, if guide strand is set (meaning consistent strand for
249             # all sublocs) and guide strand is the same than the last location from @loc_objs,
250             # then recover the sublocations and add them to @loc_objs. This way,
251             # "join(10..20,join(30..40,50..60))" becomes "join(10..20,30..40,50..60)"
252 7428 100 100     24443 my $guide_strand = ($loc_obj->isa('Bio::Location::SplitLocationI')) ? ($loc_obj->guide_strand || 0) : 0;
253 7428 100       10679 my $last_strand = (scalar @loc_objs > 0) ? $loc_objs[-1]->strand : 0;
254 7428 100 100     23356 if ( $guide_strand != 0
      100        
255             and $guide_strand == $last_strand
256             and $oparg eq $op # join(,join()) OK, order(join()) NOT OK
257             ) {
258 2         5 my @subloc_objs = $loc_obj->sub_Location(0);
259 2         3 foreach my $subloc_obj (@subloc_objs) {
260 5         7 push @loc_objs, $subloc_obj;
261             }
262             }
263             else {
264 7426         15589 push @loc_objs, $loc_obj;
265             }
266             }
267 7418         6686 my $ct = @loc_objs;
268 7418 50 66     23424 if ($op && !($op eq 'join' || $op eq 'order' || $op eq 'bond')
      100        
      66        
269             && $ct > 1 ) {
270 0         0 $self->throw("Bad operator $op: had multiple locations ".
271             scalar(@loc_objs).", should be SplitLocationI");
272             }
273 7418 100       8748 if ($ct > 1) {
274 7         15 $loc = Bio::Location::Split->new();
275 7         17 $loc->add_sub_Location(shift @loc_objs) while (@loc_objs);
276 7         12 return $loc;
277             } else {
278 7411         6478 $loc = shift @loc_objs;
279 7411         21242 return $loc;
280             }
281             } else { # simple location(s)
282 6323         9501 $loc = $self->_parse_location($locstr);
283 6323 100 100     17005 $loc->strand(-1) if ($op && $op eq 'complement');
284             }
285 6323         10213 return $loc;
286             }
287              
288             =head2 _parse_location
289              
290             Title : _parse_location
291             Usage : $loc = $locfactory->_parse_location( $loc_string)
292              
293             Function: Parses the given location string and returns a location object
294             with start() and end() and strand() set appropriately.
295             Note that this method is private.
296             Returns : A Bio::LocationI implementing object or undef on failure
297             Args : location string
298              
299             =cut
300              
301             sub _parse_location {
302 41838     41838   41729 my ($self, $locstr) = @_;
303 41838         29330 my ($loc, $seqid);
304             #$self->debug( "Location parse, processing $locstr\n");
305             # 'remote' location?
306 41838 100       64055 if($locstr =~ m{^(\S+):(.*)$}o) {
307             # yes; memorize remote ID and strip from location string
308 139         272 $seqid = $1;
309 139         202 $locstr = $2;
310             }
311            
312             # split into start and end
313 41838         73399 my ($start, $end) = split(/\.\./, $locstr);
314             # remove enclosing parentheses if any; note that because of parentheses
315             # possibly surrounding the entire location the parentheses around start
316             # and/or may be asymmetrical
317             # Note: these are from X.Y fuzzy locations, which are deprecated!
318 41838 100       150651 $start =~ s/(?:^\[+|\]+$)//g if $start;
319 41838 100       91389 $end =~ s/(?:^\[+|\]+$)//g if $end;
320              
321             # Is this a simple (exact) or a fuzzy location? Simples have exact start
322             # and end, or is between two adjacent bases. Everything else is fuzzy.
323 41838         31760 my $loctype = ".."; # exact with start and end as default
324              
325 41838 100 100     71577 $loctype = '?' if ( ($locstr =~ /\?/) && ($locstr !~ /\?\d+/) );
326              
327 41838         27901 my $locclass = "Bio::Location::Simple";
328 41838 100       59123 if(! defined($end)) {
329 794 100       1975 if($locstr =~ /(\d+)([\.\^])(\d+)/) {
330 35         68 $start = $1;
331 35         52 $end = $3;
332 35         47 $loctype = $2;
333 35 100 66     197 $locclass = "Bio::Location::Fuzzy"
334             unless (abs($end-$start) <= 1) && ($loctype eq "^");
335             } else {
336 759         744 $end = $start;
337             }
338             }
339             # start_num and end_num are for the numeric only versions of
340             # start and end so they can be compared
341             # in a few lines
342 41838         40934 my ($start_num, $end_num) = ($start,$end);
343 41838 100 100     151869 if ( ($start =~ /[\>\<\?\.\^]/) || ($end =~ /[\>\<\?\.\^]/) ) {
344 456         532 $locclass = 'Bio::Location::Fuzzy';
345 456 100       1269 if($start =~ /(\d+)/) {
346 435         857 ($start_num) = $1;
347             } else {
348 21         31 $start_num = 0
349             }
350 456 100       998 if ($end =~ /(\d+)/) {
351 435         664 ($end_num) = $1;
352 21         30 } else { $end_num = 0 }
353             }
354 41838         35165 my $strand = 1;
355              
356 41838 100 100     86134 if( $start_num > $end_num && $loctype ne '?') {
357 13         25 ($start,$end,$strand) = ($end,$start,-1);
358             }
359             # instantiate location and initialize
360 41838         94635 $loc = $locclass->new(-verbose => $self->verbose,
361             -start => $start,
362             -end => $end,
363             -strand => $strand,
364             -location_type => $loctype);
365             # set remote ID if remote location
366 41838 100       65761 if($seqid) {
367 139         301 $loc->is_remote(1);
368 139         236 $loc->seq_id($seqid);
369             }
370              
371             # done (hopefully)
372 41838         55038 return $loc;
373             }
374              
375             1;