File Coverage

Bio/Factory/FTLocationFactory.pm
Criterion Covered Total %
statement 119 120 99.1
branch 61 64 95.3
condition 40 44 90.9
subroutine 9 9 100.0
pod 1 1 100.0
total 230 238 96.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Factory::FTLocationFactory
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Hilmar Lapp
7             #
8             # Copyright Hilmar Lapp
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # (c) Hilmar Lapp, hlapp at gnf.org, 2002.
13             # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
14             #
15             # You may distribute this module under the same terms as perl itself.
16             # Refer to the Perl Artistic License (see the license accompanying this
17             # software package, or see http://www.perl.com/language/misc/Artistic.html)
18             # for the terms under which you may use, modify, and redistribute this module.
19             #
20             # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
21             # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
22             # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
23             #
24              
25             # POD documentation - main docs before the code
26              
27             =head1 NAME
28              
29             Bio::Factory::FTLocationFactory - A FeatureTable Location Parser
30              
31             =head1 SYNOPSIS
32              
33             # parse a string into a location object
34             $loc = Bio::Factory::FTLocationFactory->from_string("join(100..200,
35             400..500");
36              
37             =head1 DESCRIPTION
38              
39             Implementation of string-encoded location parsing for the Genbank feature
40             table encoding of locations.
41              
42             =head1 FEEDBACK
43              
44             =head2 Mailing Lists
45              
46             User feedback is an integral part of the evolution of this and other
47             Bioperl modules. Send your comments and suggestions preferably to
48             the Bioperl mailing list. Your participation is much appreciated.
49              
50             bioperl-l@bioperl.org - General discussion
51             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
52              
53             =head2 Support
54              
55             Please direct usage questions or support issues to the mailing list:
56              
57             I
58              
59             rather than to the module maintainer directly. Many experienced and
60             reponsive experts will be able look at the problem and quickly
61             address it. Please include a thorough description of the problem
62             with code and data examples if at all possible.
63              
64             =head2 Reporting Bugs
65              
66             Report bugs to the Bioperl bug tracking system to help us keep track
67             of the bugs and their resolution. Bug reports can be submitted via the
68             web:
69              
70             https://github.com/bioperl/bioperl-live/issues
71              
72             =head1 AUTHOR - Hilmar Lapp
73              
74             Email hlapp at gmx.net
75              
76             =head1 CONTRIBUTORS
77              
78             Jason Stajich, jason-at-bioperl-dot-org
79             Chris Fields, cjfields-at-uiuc-dot-edu
80              
81             =head1 APPENDIX
82              
83             The rest of the documentation details each of the object methods.
84             Internal methods are usually preceded with a _
85              
86             =cut
87              
88              
89             # Let the code begin...
90              
91             package Bio::Factory::FTLocationFactory;
92 83     83   974 use vars qw($LOCREG);
  83         151  
  83         3505  
93 83     83   402 use strict;
  83         147  
  83         1657  
94              
95             # Object preamble - inherits from Bio::Root::Root
96              
97 83     83   14280 use Bio::Location::Simple;
  83         177  
  83         2322  
98 83     83   21200 use Bio::Location::Split;
  83         202  
  83         2502  
99 83     83   16794 use Bio::Location::Fuzzy;
  83         186  
  83         2572  
100              
101              
102 83     83   466 use base qw(Bio::Root::Root Bio::Factory::LocationFactoryI);
  83         205  
  83         23127  
103              
104             BEGIN {
105             # the below is an optimized regex obj. from J. Freidl's Mastering Reg Exp.
106             $LOCREG = qr{
107             (?>
108             [^()]+
109             |
110             \(
111             (??{$LOCREG})
112             \)
113             )*
114 83     83   80521 }x;
115             }
116              
117             =head2 new
118              
119             Title : new
120             Usage : my $obj = Bio::Factory::FTLocationFactory->new();
121             Function: Builds a new Bio::Factory::FTLocationFactory object
122             Returns : an instance of Bio::Factory::FTLocationFactory
123             Args :
124              
125             =cut
126              
127             =head2 from_string
128              
129             Title : from_string
130             Usage : $loc = $locfactory->from_string("100..200");
131             Function: Parses the given string and returns a Bio::LocationI implementing
132             object representing the location encoded by the string.
133              
134             This implementation parses the Genbank feature table
135             encoding of locations.
136             Example :
137             Returns : A Bio::LocationI implementing object.
138             Args : A string.
139              
140             =cut
141              
142             sub from_string {
143 13741     13741 1 26851 my ($self,$locstr,$op) = @_;
144 13741         14155 my $loc;
145            
146             #$self->debug("$locstr\n");
147            
148             # $op for operator (error handling)
149            
150             # run on first pass only
151             # Note : These location types are now deprecated in GenBank (Oct. 2006)
152 13741 100       20602 if (!defined($op)) {
153             # convert all (X.Y) to [X.Y]
154 9940         25988 $locstr =~ s{\((\d+\.\d+)\)}{\[$1\]}g;
155             # convert ABC123:(X..Y) to ABC123:[X..Y]
156             # we should never see the above
157 9940         18130 $locstr =~ s{:\((\d+\.{2}\d+)\)}{:\[$1\]}g;
158             }
159            
160 13741 100       60187 if ($locstr =~ m{(.*?)\(($LOCREG)\)(.*)}o) { # any matching parentheses?
161 7418         24737 my ($beg, $mid, $end) = ($1, $2, $3);
162 7418         23056 my (@sublocs) = (split(q(,),$beg), $mid, split(q(,),$end));
163            
164 7418         10820 my @loc_objs;
165             my $loc_obj;
166            
167             SUBLOCS:
168 7418         12888 while (@sublocs) {
169 7429         11042 my $subloc = shift @sublocs;
170 7429 100       11375 next if !$subloc;
171 7428 100 100     29401 my $oparg = ($subloc eq 'join' || $subloc eq 'bond' ||
172             $subloc eq 'order' || $subloc eq 'complement') ? $subloc : undef;
173             # has operator, requires further work (recurse)
174 7428 100       10480 if ($oparg) {
175 7417         9038 my $sub = shift @sublocs;
176             # simple split operators (no recursive calls needed)
177 7417 100 66     35317 if (($oparg eq 'join' || $oparg eq 'order' || $oparg eq 'bond' )
      100        
178             && $sub !~ m{(?:join|order|bond)}) {
179 3627         14063 my @splitlocs = split(q(,), $sub);
180 3627         14343 $loc_obj = Bio::Location::Split->new(-verbose => 1,
181             -splittype => $oparg);
182             # Store strand values for later consistency check
183 3627         6443 my @subloc_strands;
184             my @s_objs;
185 3627         5735 foreach my $splitloc (@splitlocs) {
186 35515 50       50177 next unless $splitloc;
187 35515         32993 my $sobj;
188 35515 100       777155 if ($splitloc =~ m{\(($LOCREG)\)}) {
189 43         125 my $comploc = $1;
190 43         95 $sobj = $self->_parse_location($comploc);
191 43         106 $sobj->strand(-1);
192 43         73 push @subloc_strands, -1;
193             } else {
194 35472         85478 $sobj = $self->_parse_location($splitloc);
195 35472         46853 push @subloc_strands, 1;
196             }
197 35515         71785 push @s_objs, $sobj;
198             }
199              
200             # Sublocations strand values consistency check to set
201             # Guide Strand and sublocations adding order
202 3627 50       6536 if (scalar @s_objs > 0) {
203 3627         4282 my $identical = 0;
204              
205 3627         4001 my $first_value = $subloc_strands[0];
206 3627         5546 foreach my $strand (@subloc_strands) {
207 35515 100       46971 $identical++ if ($strand == $first_value);
208             }
209              
210 3627 100       6312 if ($identical == scalar @subloc_strands) {
211             # Set guide_strand if all sublocations have the same strand
212 3625         9500 $loc_obj->guide_strand($first_value);
213              
214             # Reverse sublocation order for negative strand locations, e.g.:
215             # Common (CAA24672.1):
216             # join(complement(4918..5163),complement(2691..4571))
217             # Trans-splicing (NP_958375.1):
218             # join(32737..32825,complement(174205..174384),complement(69520..71506))
219 3625 100       6045 if ($first_value == -1) {
220 15         28 @s_objs = reverse @s_objs;
221             }
222             }
223             else {
224             # Mixed strand values
225 2         7 $loc_obj->guide_strand(undef);
226             }
227              
228             # Add sublocations
229 3627         5103 foreach my $s_obj (@s_objs) {
230 35515         47669 $loc_obj->add_sub_Location($s_obj);
231             }
232             }
233             } else {
234 3790         7235 $loc_obj = $self->from_string($sub, $oparg);
235             # reinsure the operator is set correctly for this level
236             # unless it is complement
237 3790 100       7366 $loc_obj->splittype($oparg) unless $oparg eq 'complement';
238             }
239             }
240             # no operator, simple or fuzzy
241             else {
242 11         32 $loc_obj = $self->from_string($subloc,1);
243             }
244 7428 100 100     16579 if ($op && $op eq 'complement') {
245 1660         3847 $loc_obj->strand(-1);
246             }
247              
248             # For Split-type $loc_obj, if guide strand is set (meaning consistent strand for
249             # all sublocs) and guide strand is the same than the last location from @loc_objs,
250             # then recover the sublocations and add them to @loc_objs. This way,
251             # "join(10..20,join(30..40,50..60))" becomes "join(10..20,30..40,50..60)"
252 7428 100 100     24707 my $guide_strand = ($loc_obj->isa('Bio::Location::SplitLocationI')) ? ($loc_obj->guide_strand || 0) : 0;
253 7428 100       13527 my $last_strand = (scalar @loc_objs > 0) ? $loc_objs[-1]->strand : 0;
254 7428 100 100     20707 if ( $guide_strand != 0
      100        
255             and $guide_strand == $last_strand
256             and $oparg eq $op # join(,join()) OK, order(join()) NOT OK
257             ) {
258 2         3 my @subloc_objs = $loc_obj->sub_Location(0);
259 2         3 foreach my $subloc_obj (@subloc_objs) {
260 5         10 push @loc_objs, $subloc_obj;
261             }
262             }
263             else {
264 7426         16721 push @loc_objs, $loc_obj;
265             }
266             }
267 7418         9710 my $ct = @loc_objs;
268 7418 50 66     20443 if ($op && !($op eq 'join' || $op eq 'order' || $op eq 'bond')
      100        
      66        
269             && $ct > 1 ) {
270 0         0 $self->throw("Bad operator $op: had multiple locations ".
271             scalar(@loc_objs).", should be SplitLocationI");
272             }
273 7418 100       10365 if ($ct > 1) {
274 7         12 $loc = Bio::Location::Split->new();
275 7         19 $loc->add_sub_Location(shift @loc_objs) while (@loc_objs);
276 7         20 return $loc;
277             } else {
278 7411         9323 $loc = shift @loc_objs;
279 7411         23090 return $loc;
280             }
281             } else { # simple location(s)
282 6323         12153 $loc = $self->_parse_location($locstr);
283 6323 100 100     16686 $loc->strand(-1) if ($op && $op eq 'complement');
284             }
285 6323         12338 return $loc;
286             }
287              
288             =head2 _parse_location
289              
290             Title : _parse_location
291             Usage : $loc = $locfactory->_parse_location( $loc_string)
292              
293             Function: Parses the given location string and returns a location object
294             with start() and end() and strand() set appropriately.
295             Note that this method is private.
296             Returns : A Bio::LocationI implementing object or undef on failure
297             Args : location string
298              
299             =cut
300              
301             sub _parse_location {
302 41838     41838   65796 my ($self, $locstr) = @_;
303 41838         44592 my ($loc, $seqid);
304             #$self->debug( "Location parse, processing $locstr\n");
305             # 'remote' location?
306 41838 100       70436 if($locstr =~ m{^(\S+):(.*)$}o) {
307             # yes; memorize remote ID and strip from location string
308 139         315 $seqid = $1;
309 139         236 $locstr = $2;
310             }
311            
312             # split into start and end
313 41838         100270 my ($start, $end) = split(/\.\./, $locstr);
314             # remove enclosing parentheses if any; note that because of parentheses
315             # possibly surrounding the entire location the parentheses around start
316             # and/or may be asymmetrical
317             # Note: these are from X.Y fuzzy locations, which are deprecated!
318 41838 100       160925 $start =~ s/(?:^\[+|\]+$)//g if $start;
319 41838 100       112644 $end =~ s/(?:^\[+|\]+$)//g if $end;
320              
321             # Is this a simple (exact) or a fuzzy location? Simples have exact start
322             # and end, or is between two adjacent bases. Everything else is fuzzy.
323 41838         48118 my $loctype = ".."; # exact with start and end as default
324              
325 41838 100 100     71199 $loctype = '?' if ( ($locstr =~ /\?/) && ($locstr !~ /\?\d+/) );
326              
327 41838         47356 my $locclass = "Bio::Location::Simple";
328 41838 100       58492 if(! defined($end)) {
329 794 100       2275 if($locstr =~ /(\d+)([\.\^])(\d+)/) {
330 35         96 $start = $1;
331 35         85 $end = $3;
332 35         61 $loctype = $2;
333 35 100 66     232 $locclass = "Bio::Location::Fuzzy"
334             unless (abs($end-$start) <= 1) && ($loctype eq "^");
335             } else {
336 759         1088 $end = $start;
337             }
338             }
339             # start_num and end_num are for the numeric only versions of
340             # start and end so they can be compared
341             # in a few lines
342 41838         58298 my ($start_num, $end_num) = ($start,$end);
343 41838 100 100     136562 if ( ($start =~ /[\>\<\?\.\^]/) || ($end =~ /[\>\<\?\.\^]/) ) {
344 456         758 $locclass = 'Bio::Location::Fuzzy';
345 456 100       1580 if($start =~ /(\d+)/) {
346 435         1017 ($start_num) = $1;
347             } else {
348 21         36 $start_num = 0
349             }
350 456 100       1241 if ($end =~ /(\d+)/) {
351 435         750 ($end_num) = $1;
352 21         44 } else { $end_num = 0 }
353             }
354 41838         44528 my $strand = 1;
355              
356 41838 100 100     91670 if( $start_num > $end_num && $loctype ne '?') {
357 13         32 ($start,$end,$strand) = ($end,$start,-1);
358             }
359             # instantiate location and initialize
360 41838         89200 $loc = $locclass->new(-verbose => $self->verbose,
361             -start => $start,
362             -end => $end,
363             -strand => $strand,
364             -location_type => $loctype);
365             # set remote ID if remote location
366 41838 100       75898 if($seqid) {
367 139         351 $loc->is_remote(1);
368 139         290 $loc->seq_id($seqid);
369             }
370              
371             # done (hopefully)
372 41838         64311 return $loc;
373             }
374              
375             1;