File Coverage

Bio/Restriction/Enzyme.pm
Criterion Covered Total %
statement 297 363 81.8
branch 152 224 67.8
condition 37 62 59.6
subroutine 47 53 88.6
pod 36 42 85.7
total 569 744 76.4


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------
2             #
3             # BioPerl module Bio::Restriction::Enzyme
4             #
5             # Please direct questions and support issues to
6             #
7             # Cared for by Rob Edwards
8             #
9             # You may distribute this module under the same terms as perl itself
10             #------------------------------------------------------------------
11              
12             ## POD Documentation:
13              
14             =head1 NAME
15              
16             Bio::Restriction::Enzyme - A single restriction endonuclease
17             (cuts DNA at specific locations)
18              
19             =head1 SYNOPSIS
20              
21             # set up a single restriction enzyme. This contains lots of
22             # information about the enzyme that is generally parsed from a
23             # rebase file and can then be read back
24              
25             use Bio::Restriction::Enzyme;
26              
27             # define a new enzyme with the cut sequence
28             my $re=Bio::Restriction::Enzyme->new
29             (-enzyme=>'EcoRI', -seq=>'G^AATTC');
30              
31             # once the sequence has been defined a bunch of stuff is calculated
32             # for you:
33              
34             #### PRECALCULATED
35              
36             # find where the enzyme cuts after ...
37             my $ca=$re->cut;
38              
39             # ... and where it cuts on the opposite strand
40             my $oca = $re->complementary_cut;
41              
42             # get the cut sequence string back.
43             # Note that site will return the sequence with a caret
44             my $with_caret=$re->site; #returns 'G^AATTC';
45              
46             # but it is also a Bio::PrimarySeq object ....
47             my $without_caret=$re->seq; # returns 'GAATTC';
48             # ... and so does string
49             $without_caret=$re->string; #returns 'GAATTC';
50              
51             # what is the reverse complement of the cut site
52             my $rc=$re->revcom; # returns 'GAATTC';
53              
54             # now the recognition length. There are two types:
55             # recognition_length() is the length of the sequence
56             # cutter() estimate of cut frequency
57              
58             my $recog_length = $re->recognition_length; # returns 6
59             # also returns 6 in this case but would return
60             # 4 for GANNTC and 5 for RGATCY (BstX2I)!
61             $recog_length=$re->cutter;
62              
63             # is the sequence a palindrome - the same forwards and backwards
64             my $pal= $re->palindromic; # this is a boolean
65              
66             # is the sequence blunt (i.e. no overhang - the forward and reverse
67             # cuts are the same)
68             print "blunt\n" if $re->overhang eq 'blunt';
69              
70             # Overhang can have three values: "5'", "3'", "blunt", and undef
71             # Direction is very important if you use Klenow!
72             my $oh=$re->overhang;
73              
74             # what is the overhang sequence
75             my $ohseq=$re->overhang_seq; # will return 'AATT';
76              
77             # is the sequence ambiguous - does it contain non-GATC bases?
78             my $ambig=$re->is_ambiguous; # this is boolean
79              
80             print "Stuff about the enzyme\nCuts after: $ca\n",
81             "Complementary cut: $oca\nSite:\n\t$with_caret or\n",
82             "\t$without_caret\n";
83             print "Reverse of the sequence: $rc\nRecognition length: $recog_length\n",
84             "Is it palindromic? $pal\n";
85             print "The overhang is $oh with sequence $ohseq\n",
86             "And is it ambiguous? $ambig\n\n";
87              
88              
89             ### THINGS YOU CAN SET, and get from rich REBASE file
90              
91             # get or set the isoschizomers (enzymes that recognize the same
92             # site)
93             $re->isoschizomers('PvuII', 'SmaI'); # not really true :)
94             print "Isoschizomers are ", join " ", $re->isoschizomers, "\n";
95              
96             # get or set the methylation sites
97             $re->methylation_sites(2); # not really true :)
98             print "Methylated at ", join " ", keys %{$re->methylation_sites},"\n";
99              
100             #Get or set the source microbe
101             $re->microbe('E. coli');
102             print "It came from ", $re->microbe, "\n";
103              
104             # get or set the person who isolated it
105             $re->source("Rob"); # not really true :)
106             print $re->source, " sent it to us\n";
107              
108             # get or set whether it is commercially available and the company
109             # that it can be bought at
110             $re->vendors('NEB'); # my favorite
111             print "Is it commercially available :";
112             print $re->vendors ? "Yes" : "No";
113             print " and it can be got from ", join " ",
114             $re->vendors, "\n";
115              
116             # get or set a reference for this
117             $re->reference('Edwards et al. J. Bacteriology');
118             print "It was not published in ", $re->reference, "\n";
119              
120             # get or set the enzyme name
121             $re->name('BamHI');
122             print "The name of EcoRI is not really ", $re->name, "\n";
123              
124              
125             =head1 DESCRIPTION
126              
127             This module defines a single restriction endonuclease. You can use it
128             to make custom restriction enzymes, and it is used by
129             Bio::Restriction::IO to define enzymes in the New England Biolabs
130             REBASE collection.
131              
132             Use Bio::Restriction::Analysis to figure out which enzymes are available
133             and where they cut your sequence.
134              
135              
136             =head1 RESTRICTION MODIFICATION SYSTEMS
137              
138             At least three geneticaly and biochamically distinct restriction
139             modification systems exist. The cutting components of them are known
140             as restriction endonuleases. The three systems are known by roman
141             numerals: Type I, II, and III restriction enzymes.
142              
143             REBASE format 'cutzymes'(#15) lists enzyme type in its last field. The
144             categories there do not always match the the following short
145             descriptions of the enzymes types. See
146             http://it.stlawu.edu/~tbudd/rmsyst.html for a better overview.
147              
148              
149             =head2 TypeI
150              
151             Type I systems recognize a bipartite asymetrical sequence of 5-7 bp:
152              
153             ---TGA*NnTGCT--- * = methylation sites
154             ---ACTNnA*CGA--- n = 6 for EcoK, n = 8 for EcoB
155              
156             The cleavage site is roughly 1000 (400-7000) base pairs from the
157             recognition site.
158              
159             =head2 TypeII
160              
161             The simplest and most common (at least commercially).
162              
163             Site recognition is via short palindromic base sequences that are 4-6
164             base pairs long. Cleavage is at the recognition site (but may
165             occasionally be just adjacent to the palindromic sequence, usually
166             within) and may produce blunt end termini or staggered, "sticky
167             end" termini.
168              
169             =head2 TypeIII
170              
171             The recognition site is a 5-7 bp asymmetrical sequence. Cleavage is
172             ATP dependent 24-26 base pairs downstream from the recognition site
173             and usually yields staggered cuts 2-4 bases apart.
174              
175              
176             =head1 COMMENTS
177              
178             I am trying to make this backwards compatible with
179             Bio::Tools::RestrictionEnzyme. Undoubtedly some things will break,
180             but we can fix things as we progress.....!
181              
182             I have added another comments section at the end of this POD that
183             discusses a couple of areas I know are broken (at the moment)
184              
185              
186             =head1 TO DO
187              
188             =over 2
189              
190             =item *
191              
192             Convert vendors touse full names of companies instead of code
193              
194             =item *
195              
196             Add regular expression based matching to vendors
197              
198             =item *
199              
200             Move away from the archaic ^ notation for cut sites. Ideally
201             I'd totally like to remove this altogether, or add a method
202             that adds it in if someone really wants it. We should be
203             fixed on a sequence, number notation.
204              
205             =back
206              
207             =head1 FEEDBACK
208              
209             =head2 Mailing Lists
210              
211             User feedback is an integral part of the evolution of this and other
212             Bioperl modules. Send your comments and suggestions preferably to one
213             of the Bioperl mailing lists. Your participation is much appreciated.
214              
215             bioperl-l@bioperl.org - General discussion
216             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
217              
218             =head2 Support
219              
220             Please direct usage questions or support issues to the mailing list:
221              
222             I
223              
224             rather than to the module maintainer directly. Many experienced and
225             reponsive experts will be able look at the problem and quickly
226             address it. Please include a thorough description of the problem
227             with code and data examples if at all possible.
228              
229             =head2 Reporting Bugs
230              
231             Report bugs to the Bioperl bug tracking system to help us keep track
232             the bugs and their resolution. Bug reports can be submitted via the
233             web:
234              
235             https://github.com/bioperl/bioperl-live/issues
236              
237             =head1 AUTHOR
238              
239             Rob Edwards, redwards@utmem.edu
240              
241             =head1 CONTRIBUTORS
242              
243             Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
244             Peter Blaiklock, pblaiklo@restrictionmapper.org
245             Mark A. Jensen, maj-at-fortinbras-dot-us
246              
247             =head1 COPYRIGHT
248              
249             Copyright (c) 2003 Rob Edwards.
250              
251             Some of this work is Copyright (c) 1997-2002 Steve A. Chervitz. All
252             Rights Reserved. This module is free software; you can redistribute
253             it and/or modify it under the same terms as Perl itself.
254              
255             =head1 SEE ALSO
256              
257             L,
258             L, L
259              
260             =head1 APPENDIX
261              
262             Methods beginning with a leading underscore are considered private and
263             are intended for internal use by this module. They are not considered
264             part of the public interface and are described here for documentation
265             purposes only.
266              
267             =cut
268              
269             package Bio::Restriction::Enzyme;
270 4     4   608 use strict;
  4         7  
  4         96  
271              
272 4     4   574 use Bio::PrimarySeq;
  4         12  
  4         116  
273              
274 4     4   25 use Data::Dumper;
  4         8  
  4         233  
275 4     4   1240 use Tie::RefHash;
  4         10831  
  4         126  
276 4     4   26 use vars qw (%TYPE);
  4         8  
  4         149  
277 4     4   23 use base qw(Bio::Root::Root Bio::Restriction::EnzymeI);
  4         6  
  4         1544  
278              
279             BEGIN {
280 4     4   8171 my %TYPE = (I => 1, II => 1, III => 1);
281             }
282              
283             =head2 new
284              
285             Title : new
286             Function
287             Function : Initializes the Enzyme object
288             Returns : The Restriction::Enzyme object
289             Argument : A standard definition can have several formats. For example:
290             $re->new(-enzyme='EcoRI', -seq->'GAATTC' -cut->'1')
291             Or, you can define the cut site in the sequence, for example
292             $re->new(-enzyme='EcoRI', -seq->'G^AATTC'), but you must use a caret
293             Or, a sequence can cut outside the recognition site, for example
294             $re->new(-enzyme='AbeI', -seq->'CCTCAGC' -cut->'-5/-2')
295              
296             Other arguments:
297             -isoschizomers=>\@list a reference to an array of
298             known isoschizomers
299             -references=>$ref a reference to the enzyme
300             -source=>$source the source (person) of the enzyme
301             -commercial_availability=>@companies a list of companies
302             that supply the enzyme
303             -methylation_site=>\%sites a reference to hash that has
304             the position as the key and the type of methylation
305             as the value
306             -xln_sub => sub { ($self,$cut) = @_; ...; return $xln_cut },
307             a coderef to a routine that translates the input cut value
308             into Bio::Restriction::Enzyme coordinates
309             ( e.g., for withrefm format, this might be
310             -xln_sub => sub { length( shift()->string ) + shift } )
311              
312             A Restriction::Enzyme object manages its recognition sequence as a
313             Bio::PrimarySeq object.
314              
315             The minimum requirement is for a name and a sequence.
316              
317             This will create the restriction enzyme object, and define several
318             things about the sequence, such as palindromic, size, etc.
319              
320             =cut
321              
322             # do all cut/comp cut setting within the constructor
323             # new args
324              
325             sub new {
326 10163     10163 1 31797 my($class, @args) = @_;
327 10163         23526 my $self = $class->SUPER::new(@args);
328              
329 10163         43720 my ($name,$enzyme,$site,$seq,$precut, $postcut,$cut,$complementary_cut, $is_prototype, $prototype,
330             $isoschizomers, $meth, $microbe, $source, $vendors, $references, $neo, $recog, $xln_sub) =
331             $self->_rearrange([qw(
332             NAME
333             ENZYME
334             SITE
335             SEQ
336             PRECUT
337             POSTCUT
338             CUT
339             COMPLEMENTARY_CUT
340             IS_PROTOTYPE
341             PROTOTYPE
342             ISOSCHIZOMERS
343             METHYLATION_SITES
344             MICROBE
345             SOURCE
346             VENDORS
347             REFERENCES
348             IS_NEOSCHIZOMER
349             RECOG
350             XLN_SUB
351             )], @args);
352              
353 10163 0 66     44162 $self->throw('At the minimum, you must define a name and '.
      33        
      33        
354             'recognition site for the restriction enzyme')
355             unless (($name || $enzyme) && ($site || $recog || $seq));
356              
357 10163         15038 $self->{_isoschizomers} = [];
358 10163         12967 $self->{_methylation_sites} = {};
359 10163         13973 $self->{_vendors} = [];
360 10163         13361 $self->{_references} = [];
361              
362             # squelch warnings
363 10163   100     26789 $postcut ||='';
364              
365             # enzyme name
366 10163 100       14048 $enzyme && $self->name($enzyme);
367 10163 100       20883 $name && $self->name($name);
368              
369             # site
370             #
371             # note that the site() setter will automatically set
372             # cut(), complementary_cut(), if the cut site is indicated
373             # in $site with '^' /maj
374              
375             # create the cut site if appropriate/this is a kludge due to
376             # the base.pm format in the new B:R order...
377 10163 100 100     19296 if ( $cut and $cut <= length $site) {
378 2531         5337 $site = substr($site, 0, $cut).'^'.substr($site, $cut);
379             }
380            
381 10163 50       13305 if ($site) {
382 10163         14429 $self->site($site);
383             }
384             else {
385 0 0       0 $seq && $self->site($seq);
386             }
387              
388 10162 100       13022 if ($recog) {
389 7495         10824 $self->recog($recog);
390             }
391             else {
392 2667 50       3544 $seq && $self->recog($seq);
393 2667 50       5264 $site && $self->recog($site);
394             }
395             # call revcom_site to initialize it and revcom_recog:
396 10162         17517 $self->revcom_site();
397              
398 10162         13862 $recog = $self->string; # for length calculations below
399            
400 10162 100       15297 if ($xln_sub) {
401 7494 50       13698 $self->warn("Translation subroutine is not a coderef; ignoring") unless
402             ref($xln_sub) eq 'CODE';
403             }
404              
405             # cut coordinates
406 10162         14988 my ($pc_cut, $pc_comp_cut) = ( $postcut =~ /(-?\d+)\/(-?\d+)/ );
407              
408             # cut definitions in constructor override any autoset in
409             # site()
410             # definitions in site conform to withrefm coords, translation
411             # happens here
412              
413 10162 100       18198 if (defined $cut) {
    100          
414 2664 50       4723 $self->cut( $xln_sub ? $xln_sub->($self, $cut) : $cut );
415             }
416             elsif ( defined $pc_cut ) {
417 457 100       1364 $self->cut( $xln_sub ? $xln_sub->($self, $pc_cut) : $pc_cut );
418             }
419              
420 10162 100       16194 if (defined $complementary_cut) {
    100          
421 4 50       12 $self->complementary_cut($xln_sub ? $xln_sub->($self,$complementary_cut) : $complementary_cut);
422             }
423             elsif (defined $pc_comp_cut) {
424 457 100       1007 $self->complementary_cut($xln_sub ? $xln_sub->($self,$pc_comp_cut) : $pc_comp_cut);
425             }
426              
427 10162 100       19224 $is_prototype && $self->is_prototype($is_prototype);
428 10162 100       13115 $prototype && $self->prototype($prototype);
429 10162 100       17803 $isoschizomers && $self->isoschizomers($isoschizomers);
430 10162 50       13788 $meth && $self->methylation_sites($meth);
431 10162 50       13205 $microbe && $self->microbe($microbe);
432 10162 100       18288 $source && $self->source($source);
433 10162 100       18089 $vendors && $self->vendors($vendors);
434 10162 100       18778 $references && $self->references($references);
435 10162 50       13559 $neo && $self->is_neoschizomer($neo);
436              
437             # create multicut enzymes here if $precut defined
438 10162 100       12808 if (defined $precut) {
439 49         133 bless $self, 'Bio::Restriction::Enzyme::MultiCut';
440 49         221 my ($pc_cut, $pc_comp_cut) = $precut =~ /(-?\d+)\/(-?\d+)/;
441 49         196 my $re2 = $self->clone;
442 49 50       204 $re2->cut($xln_sub ? $xln_sub->($self, -$pc_cut) : -$pc_cut);
443 49 50       174 $re2->complementary_cut($xln_sub ? $xln_sub->($self, -$pc_comp_cut) : -$pc_comp_cut);
444 49         151 $self->others($re2);
445             }
446              
447 10162         30983 return $self;
448             }
449              
450             =head1 Essential methods
451              
452             =cut
453              
454             =head2 name
455              
456             Title : name
457             Usage : $re->name($newval)
458             Function : Gets/Sets the restriction enzyme name
459             Example : $re->name('EcoRI')
460             Returns : value of name
461             Args : newvalue (optional)
462              
463             This will also clean up the name. I have added this because some
464             people get confused about restriction enzyme names. The name should
465             be One upper case letter, and two lower case letters (because it is
466             derived from the organism name, eg. EcoRI is from E. coli). After
467             that it is all confused, but the numbers should be roman numbers not
468             numbers, therefore we'll correct those. At least this will provide
469             some standard, I hope.
470              
471             =cut
472              
473             sub name{
474 70047     70047 1 83648 my ($self, $name)=@_;
475              
476 70047 100       83626 if ($name) { # correct and set the name
477 10164         11084 my $old_name = $name;
478              
479             # remove spaces. Some people write HindIII as Hind III
480 10164         17638 $name =~ s/\s+//g;
481             # change TAILING ones to I's
482 10164 50       19746 if ($name =~ m/(1+)$/) {
483 0         0 my $i = 'I' x length($1);
484 0         0 $name =~ s/1+$/$i/;
485             }
486              
487             # make the first letter upper case
488 10164         30374 $name =~ s/^(\w)/uc($1)/e;
  10164         27516  
489              
490 10164 50       18081 unless ($name eq $old_name) {
491             # we have changed the name, so send a warning
492 0         0 $self->warn("The enzyme name $old_name was changed to $name");
493             }
494 10164         16218 $self->{'_name'} = $name;
495             }
496 70047         154195 return $self->{'_name'};
497             }
498              
499              
500             =head2 site
501              
502             Title : site
503             Usage : $re->site();
504             Function : Gets/sets the recognition sequence for the enzyme.
505             Example : $seq_string = $re->site();
506             Returns : String containing recognition sequence indicating
507             : cleavage site as in 'G^AATTC'.
508             Argument : n/a
509             Throws : n/a
510              
511              
512             Side effect: the sequence is always converted to upper case.
513              
514             The cut site can also be set by using methods L and
515             L.
516              
517             This will pad out missing sequence with N's. For example the enzyme
518             Acc36I cuts at ACCTGC(4/8). This will be returned as ACCTGCNNNN^
519              
520             Note that the common notation ACCTGC(4/8) means that the forward
521             strand cut is four nucleotides after the END of the recognition
522             site. The forward cut() in the coordinates used here in Acc36I
523             ACCTGC(4/8) is at 6+4 i.e. 10.
524              
525             ** This is the main setable method for the recognition site.
526              
527             =cut
528              
529             sub site {
530 10169     10169 1 13339 my ( $self, $site ) = @_;
531            
532 10169 100       13468 if ($site) {
533              
534 10163 100       18373 $self->throw("Unrecognized characters in site: [$site]")
535             if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
536              
537             # we may have to redefine this if there is a ^ in the sequence
538              
539             # first, check and see if we have a cut site in the sequence
540             # if so, find the position, and set the target sequence and cut site
541              
542 10162         12440 $self->{'_site'} = $site;
543              
544 10162         23626 my ( $first, $second ) = $site =~ /(.*)\^(.*)/;
545 10162 100       21444 $site = "$1$2" if defined $first;
546 10162         12509 $self->{'_site'} = $site;
547              
548             # now set the recognition site as a new Bio::PrimarySeq object
549             # we need it before calling cut() and complementary_cut()
550 10162         13202 $self->{_seq} = Bio::PrimarySeq->new(
551             -id => $self->name,
552             -seq => $site,
553             -verbose => $self->verbose,
554             -alphabet => 'dna'
555             );
556              
557 10162 100       21432 if ( defined $first ) {
558 4945         10968 $self->cut( length $first );
559 4945         9226 $self->complementary_cut( length $second );
560 4945         8076 $self->revcom_site();
561             }
562             }
563 10168         12593 return $self->{'_site'};
564             }
565              
566             =head2 revcom_site
567              
568             Title : revcom_site
569             Usage : $re->revcom_site();
570             Function : Gets/sets the complementary recognition sequence for the enzyme.
571             Example : $seq_string = $re->revcom_site();
572             Returns : String containing recognition sequence indicating
573             : cleavage site as in 'G^AATTC'.
574             Argument : none (sets on first call)
575             Throws : n/a
576              
577             This is the same as site, except it returns the revcom site. For
578             palindromic enzymes these two are identical. For non-palindromic
579             enzymes they are not!
580              
581             On set, this also handles setting the revcom_recog attribute.
582              
583             See also L above.
584              
585             =cut
586              
587             sub revcom_site {
588 15112     15112 1 14970 my $self = shift;
589             # getter
590 15112 100       22317 return $self->{'_revcom_site'} unless !$self->{'_revcom_site'};
591              
592             # setter
593 10162         11325 my $site = $self->{'_site'};
594 10162 100       14372 if ($self->is_palindromic) {
595 9266         14497 $self->{'_revcom_site'}=$self->{'_site'};
596 9266         13051 $self->revcom_recog( $self->string );
597 9266         13916 return $self->{'_revcom_site'};
598             }
599              
600 896 50       2124 $self->throw("Unrecognized characters in revcom site: [$site]")
601             if $site =~ /[^ATGCMRWSYKVHDBN\^]/i;
602            
603 896 100       1670 if ($site =~ /\^/) {
604             # first, check and see if we have a cut site indicated in the sequence
605             # if so, find the position, and set the target sequence and cut site
606 16         46 $site = $self->revcom;
607 16         49 $self->revcom_recog( $site );
608 16         45 my $c = length($site)-$self->cut;
609 16         70 $site = substr($site, 0, $c).'^'.substr($site,$c);
610 16         36 $self->{'_revcom_site'} = $site;
611             }
612             else {
613 880         1283 my $revcom=$self->revcom;
614 880         2014 $self->revcom_recog( $revcom );
615             # my $cc=$self->complementary_cut;
616             # my $hat=length($revcom)-$cc+1; # we need it on the other strand!
617             # if ($cc > length($revcom)) {
618             # my $pad= "N" x ($cc-length($revcom));
619             # $revcom = $pad. $revcom;
620             # $hat=length($revcom)-$cc+1;
621             # }
622             # elsif ($cc < 0) {
623             # my $pad = "N" x -$cc;
624             # $revcom .= $pad;
625             # $hat=length($revcom);
626             # }
627             # $revcom =~ s/(.{$hat})/$1\^/;
628 880         1258 $self->{'_revcom_site'}=$revcom;
629             }
630 896         1056 return $self->{'_revcom_site'};
631             }
632              
633             =head2 cut
634              
635             Title : cut
636             Usage : $num = $re->cut(1);
637             Function : Sets/gets an integer indicating the position of cleavage
638             relative to the 5' end of the recognition sequence in the
639             forward strand.
640              
641             For type II enzymes, sets the symmetrically positioned
642             reverse strand cut site by calling complementary_cut().
643              
644             Returns : Integer, 0 if not set
645             Argument : an integer for the forward strand cut site (optional)
646              
647             Note that the common notation ACCTGC(4/8) means that the forward
648             strand cut is four nucleotides after the END of the recognition
649             site. The forward cut in the coordinates used here in Acc36I
650             ACCTGC(4/8) is at 6+4 i.e. 10.
651              
652             Note that REBASE uses notation where cuts within symmetic sites are
653             marked by '^' within the forward sequence but if the site is
654             asymmetric the parenthesis syntax is used where numbering ALWAYS
655             starts from last nucleotide in the forward strand. That's why AciI has
656             a site usually written as CCGC(-3/-1) actualy cuts in
657              
658             C^C G C
659             G G C^G
660              
661             In our notation, these locations are 1 and 3.
662              
663              
664             The cuts locations in the notation used are relative to the first
665             (non-N) nucleotide of the reported forward strand of the recognition
666             sequence. The following diagram numbers the phosphodiester bonds
667             (marked by + ) which can be cut by the restriction enzymes:
668              
669             1 2 3 4 5 6 7 8 ...
670             N + N + N + N + N + G + A + C + T + G + G + N + N + N
671             ... -5 -4 -3 -2 -1
672              
673              
674             =cut
675              
676             sub cut {
677 24294     24294 1 30164 my ($self, $value) = @_;
678 24294 100       30387 if (defined $value) {
679 8117 50       21994 $self->throw("The cut position needs to be an integer [$value]")
680             unless $value =~ /[-+]?\d+/;
681 8117         12214 $self->{'_cut'} = $value;
682              
683             # add the caret to the site attribute only if internal /maj
684 8117 100 100     24314 if ( ($self->{_site} !~ /\^/) && ($value <= length ($self->{_site}))) {
685             $self->{_site} =
686 5174         12602 substr($self->{_site}, 0, $value). '^'. substr($self->{_site}, $value);
687             }
688              
689             # auto-set comp cut only if cut site is inside the recog site./maj
690             $self->complementary_cut(length ($self->seq->seq) - $value )
691 8117 100 100     20786 if (($self->{_site} =~ /\^/) && ($self->type eq 'II'));
692              
693             }
694             # return undef if not defined yet, not 0 /maj
695 24294         35816 return $self->{'_cut'};
696             }
697              
698             =head2 cuts_after
699              
700             Title : cuts_after
701             Usage : Alias for cut()
702              
703             =cut
704              
705             sub cuts_after {
706 0     0 1 0 shift->cut(@_);
707             }
708            
709              
710             =head2 complementary_cut
711              
712             Title : complementary_cut
713             Usage : $num = $re->complementary_cut('1');
714             Function : Sets/Gets an integer indicating the position of cleavage
715             : on the reverse strand of the restriction site.
716             Returns : Integer
717             Argument : An integer (optional)
718             Throws : Exception if argument is non-numeric.
719              
720             This method determines the cut on the reverse strand of the sequence.
721             For most enzymes this will be within the sequence, and will be set
722             automatically based on the forward strand cut, but it need not be.
723              
724             B that the returned location indicates the location AFTER the
725             first non-N site nucleotide in the FORWARD strand.
726              
727             =cut
728              
729             sub complementary_cut {
730 16431     16431 1 20143 my ($self, $num)=@_;
731              
732 16431 100       21529 if (defined $num) {
733 13152 50       32465 $self->throw("The cut position needs to be an integer [$num]")
734             unless $num =~ /[-+]?\d+/;
735 13152         17461 $self->{'_rc_cut'} = $num;
736             }
737             # return undef, not 0, if not yet defined /maj
738 16431         19047 return $self->{'_rc_cut'};
739             }
740              
741              
742             =head1 Read only (usually) recognition site descriptive methods
743              
744             =cut
745              
746             =head2 type
747              
748             Title : type
749             Usage : $re->type();
750             Function : Get/set the restriction system type
751             Returns :
752             Argument : optional type: ('I'|II|III)
753              
754             Restriction enzymes have been catezorized into three types. Some
755             REBASE formats give the type, but the following rules can be used to
756             classify the known enzymes:
757              
758             =over 4
759              
760             =item 1
761              
762             Bipartite site (with 6-8 Ns in the middle and the cut site
763             is E 50 nt away) =E type I
764              
765             =item 2
766              
767             Site length E 3 =E type I
768              
769             =item 3
770              
771             5-6 asymmetric site and cuts E20 nt away =E type III
772              
773             =item 4
774              
775             All other =E type II
776              
777             =back
778              
779             There are some enzymes in REBASE which have bipartite recognition site
780             and cat far from the site but are still classified as type I. I've no
781             idea if this is really so.
782              
783             =cut
784              
785             sub type {
786 7707     7707 1 10410 my ($self, $value) = @_;
787              
788 7707 50       10678 if ($value) {
789             $self->throw("Not a valid value [$value], needs to one of : ".
790             join (', ', sort keys %TYPE) )
791 0 0       0 unless $TYPE{$value};
792 0         0 return $self->{'_type'} = $value;
793             }
794              
795             # pre set
796             #return $self->{'_type'} if $self->{'_type'};
797             # bipartite
798             return $self->{'_type'} = 'I'
799 7707 50 66     12808 if $self->{'_seq'}->seq =~ /N*[^N]+N{6,8}[^N]/ and abs($self->cut) > 50 ;
800             # 3 nt site
801             return $self->{'_type'} = 'I'
802 7707 100       13889 if $self->{'_seq'}->length == 3;
803             # asymmetric and cuts > 20 nt
804 7697 50 100     10938 return $self->{'_type'} = 'III'
      100        
      66        
805             if (length $self->string == 5 or length $self->string == 6 ) and
806             not $self->palindromic and abs($self->cut) > 20;
807 7697         28448 return $self->{'_type'} = 'II';
808             }
809              
810             =head2 seq
811              
812             Title : seq
813             Usage : $re->seq();
814             Function : Get the Bio::PrimarySeq.pm object representing
815             : the recognition sequence
816             Returns : A Bio::PrimarySeq object representing the
817             enzyme recognition site
818             Argument : n/a
819             Throws : n/a
820              
821              
822             =cut
823              
824             sub seq {
825 7854     7854 1 14054 shift->{'_seq'};
826             }
827              
828             =head2 string
829              
830             Title : string
831             Usage : $re->string();
832             Function : Get a string representing the recognition sequence.
833             Returns : String. Does NOT contain a '^' representing the cut location
834             as returned by the site() method.
835             Argument : n/a
836             Throws : n/a
837              
838             =cut
839              
840             sub string {
841 52617     52617 1 81505 shift->{'_seq'}->seq;
842             }
843              
844             =head2 recog
845              
846             Title : recog
847             Usage : $enz->recog($recognition_sequence)
848             Function: Gets/sets the pure recognition site. Sets as
849             regexp if appropriate.
850             As for string(), the cut indicating carets (^)
851             are expunged.
852             Example :
853             Returns : value of recog (a scalar)
854             Args : on set, new value (a scalar or undef, optional)
855              
856             =cut
857              
858             sub recog{
859 25643     25643 1 26639 my $self = shift;
860 25643         23116 my $recog = shift;
861 25643 100       48079 return $self->{'recog'} unless $recog;
862 10162         18322 $recog =~ s/\^//g;
863 10162 100       22553 $recog = _expand($recog) if $recog =~ /[^ATGC]/;
864 10162         15960 return $self->{'recog'} = $recog;
865             }
866              
867             =head2 revcom_recog
868              
869             Title : revcom_recog
870             Usage : $enz->revcom_recog($recognition_sequence)
871             Function: Gets/sets the pure reverse-complemented recognition site.
872             Sets as regexp if appropriate.
873             As for string(), the cut indicating carets (^) are expunged.
874             Example :
875             Returns : value of recog (a scalar)
876             Args : on set, new value (a scalar or undef, optional)
877              
878             =cut
879              
880             sub revcom_recog{
881 12969     12969 1 13325 my $self = shift;
882 12969         13083 my $recog = shift;
883 12969 100       17041 unless ($recog) {
884 2807 50       3839 $self->throw( "revcom recognition site not set; call \$enz->revcom_site to initialize" ) unless $self->{'revcom_recog'};
885 2807         4503 return $self->{'revcom_recog'};
886             }
887 10162         14467 $recog =~ s/\^//g;
888 10162 100       23762 $recog = _expand($recog) if $recog =~ /[^ATGC]/;
889 10162         15320 return $self->{'revcom_recog'} = $recog;
890             }
891              
892             =head2 revcom
893              
894             Title : revcom
895             Usage : $re->revcom();
896             Function : Get a string representing the reverse complement of
897             : the recognition sequence.
898             Returns : String
899             Argument : n/a
900             Throws : n/a
901              
902             =cut
903              
904             sub revcom {
905 11059     11059 1 22985 shift->{'_seq'}->revcom->seq();
906             }
907              
908             =head2 recognition_length
909              
910             Title : recognition_length
911             Usage : $re->recognition_length();
912             Function : Get the length of the RECOGNITION sequence.
913             This is the total recognition sequence,
914             inluding the ambiguous codes.
915             Returns : An integer
916             Argument : Nothing
917              
918             See also: L
919              
920             =cut
921              
922             sub recognition_length {
923 1     1 1 2 my $self = shift;
924 1         2 return length($self->string);
925             }
926              
927             =head2 cutter
928              
929             Title : cutter
930             Usage : $re->cutter
931             Function : Returns the "cutter" value of the recognition site.
932              
933             This is a value relative to site length and lack of
934             ambiguity codes. Hence: 'RCATGY' is a five (5) cutter site
935             and 'CCTNAGG' a six cutter
936              
937             This measure correlates to the frequency of the enzyme
938             cuts much better than plain recognition site length.
939              
940             Example : $re->cutter
941             Returns : integer or float number
942             Args : none
943              
944             Why is this better than just stripping the ambiguos codes? Think about
945             it like this: You have a random sequence; all nucleotides are equally
946             probable. You have a four nucleotide re site. The probability of that
947             site finding a match is one out of 4^4 or 256, meaning that on average
948             a four cutter finds a match every 256 nucleotides. For a six cutter,
949             the average fragment length is 4^6 or 4096. In the case of ambiguity
950             codes the chances are finding the match are better: an R (A|T) has 1/2
951             chance of finding a match in a random sequence. Therefore, for RGCGCY
952             the probability is one out of (2*4*4*4*4*2) which exactly the same as
953             for a five cutter! Cutter, although it can have non-integer values
954             turns out to be a useful and simple measure.
955              
956             From bug 2178: VHDB are ambiguity symbols that match three different
957             nucleotides, so they contribute less to the effective recognition sequence
958             length than e.g. Y which matches only two nucleotides. A symbol which matches n
959             of the 4 nucleotides has an effective length of 1 - log(n) / log(4).
960              
961             =cut
962              
963             sub cutter {
964 3532     3532 1 3761 my ($self)=@_;
965 3532         3781 $_ = uc $self->string;
966              
967 3532         4084 my $cutter = tr/[ATGC]//d;
968 3532         3398 my $count = tr/[MRWSYK]//d;
969 3532         3998 $cutter += $count/2;
970 3532         3185 $count = tr/[VHDB]//d;
971 3532         3580 $cutter += $count * (1 - log(3) / log(4));
972 3532         7160 return $cutter;
973             }
974              
975              
976             =head2 is_palindromic
977              
978             Title : is_palindromic
979             Alias : palindromic
980             Usage : $re->is_palindromic();
981             Function : Determines if the recognition sequence is palindromic
982             : for the current restriction enzyme.
983             Returns : Boolean
984             Argument : n/a
985             Throws : n/a
986              
987             A palindromic site (EcoRI):
988              
989             5-GAATTC-3
990             3-CTTAAG-5
991              
992             =cut
993              
994             sub is_palindromic {
995 36823     36823 1 36956 my $self = shift;
996 36823 100       75928 return $self->{_palindromic} if defined $self->{_palindromic};
997 10162 100       12865 if ($self->string eq $self->revcom) {
998 9266         28340 return $self->{_palindromic}=1;
999             }
1000 896         2283 return $self->{_palindromic} = 0;
1001             }
1002              
1003 5824     5824 0 10508 sub palindromic { shift->is_palindromic(@_) }
1004              
1005             =head2 is_symmetric
1006              
1007             Title : is_symmetric
1008             Alias : symmetric
1009             Usage : $re->is_symmetric();
1010             Function : Determines if the enzyme is a symmetric cutter
1011             Returns : Boolean
1012             Argument : none
1013              
1014             A symmetric but non-palindromic site (HindI):
1015             v
1016             5-C A C-3
1017             3-G T G-5
1018             ^
1019              
1020             =cut
1021              
1022             sub is_symmetric {
1023 4     4   35 no warnings qw( uninitialized );
  4         11  
  4         6928  
1024 9641     9641 1 9425 my $self = shift;
1025              
1026 9641 100       17783 return $self->{_symmetric} if defined $self->{_symmetric};
1027 5355 100       6011 if ($self->is_palindromic) {
1028 4892         13276 return $self->{_symmetric} = 1;
1029             }
1030 463 100       629 if ($self->cut == length($self->string) - $self->complementary_cut) {
1031 27         116 return $self->{_symmetric}=1;
1032             }
1033 436         959 return $self->{_symmetric} = 0;
1034             }
1035              
1036              
1037 0     0 0 0 sub symmetric { shift->is_symmetric(@_) }
1038              
1039             =head2 overhang
1040              
1041             Title : overhang
1042             Usage : $re->overhang();
1043             Function : Determines the overhang of the restriction enzyme
1044             Returns : "5'", "3'", "blunt" of undef
1045             Argument : n/a
1046             Throws : n/a
1047              
1048             A blunt site in SmaI returns C
1049              
1050             5' C C C^G G G 3'
1051             3' G G G^C C C 5'
1052              
1053             A 5' overhang in EcoRI returns C<5'>
1054              
1055             5' G^A A T T C 3'
1056             3' C T T A A^G 5'
1057              
1058             A 3' overhang in KpnI returns C<3'>
1059              
1060             5' G G T A C^C 3'
1061             3' C^C A T G G 5'
1062              
1063             =cut
1064              
1065             sub overhang {
1066 545     545 1 502 my $self = shift;
1067 545 100 100     1389 unless ($self->{'_cut'} && $self->{'_rc_cut'}) {
1068 32         60 return "unknown";
1069             }
1070 513 100       768 if ($self->{_cut} < $self->{_rc_cut}) {
    100          
    50          
1071 287         567 $self->{_overhang}="5'";
1072             } elsif ($self->{_cut} == $self->{_rc_cut}) {
1073 116         207 $self->{_overhang}="blunt";
1074             } elsif ($self->{_cut} > $self->{_rc_cut}) {
1075 110         212 $self->{_overhang}="3'";
1076             } else {
1077 0         0 $self->{_overhang}="unknown";
1078             }
1079             return $self->{_overhang}
1080 513         964 }
1081              
1082             =head2 overhang_seq
1083              
1084             Title : overhang_seq
1085             Usage : $re->overhang_seq();
1086             Function : Determines the overhang sequence of the restriction enzyme
1087             Returns : a Bio::LocatableSeq
1088             Argument : n/a
1089             Throws : n/a
1090              
1091             I do not think it is necessary to create a seq object of these. (Heikki)
1092              
1093             Note: returns empty string for blunt sequences and undef for ones that
1094             we don't know. Compare these:
1095              
1096             A blunt site in SmaI returns empty string
1097              
1098             5' C C C^G G G 3'
1099             3' G G G^C C C 5'
1100              
1101             A 5' overhang in EcoRI returns C
1102              
1103             5' G^A A T T C 3'
1104             3' C T T A A^G 5'
1105              
1106             A 3' overhang in KpnI returns C
1107              
1108             5' G G T A C^C 3'
1109             3' C^C A T G G 5'
1110              
1111             Note that you need to use method L to decide
1112             whether it is a 5' or 3' overhang!!!
1113              
1114             Note: The overhang stuff does not work if the site is asymmetric! Rethink!
1115              
1116             =cut
1117              
1118             sub overhang_seq {
1119 5     5 1 7 my $self = shift;
1120              
1121             # my $overhang->Bio::PrimarySeq(-id=>$self->name . '-overhang',
1122             # -verbose=>$self->verbose,
1123             # -alphabet=>'dna');
1124              
1125 5 50       8 return '' if $self->overhang eq 'blunt' ;
1126              
1127 5 50 33     16 unless ($self->{_cut} && $self->{_rc_cut}) {
1128             # lets just check that we really can't figure it out
1129 0         0 $self->cut;
1130 0         0 $self->complementary_cut;
1131 0 0 0     0 unless ($self->{_cut} && $self->{_rc_cut}) {
1132 0         0 return;
1133             }
1134             }
1135              
1136             # this is throwing an error for sequences outside the restriction
1137             # site (eg ^NNNNGATCNNNN^)
1138             # So if this is the case we need to fake these guys
1139 5 50 33     21 if (($self->{_cut}<0) ||
      33        
      33        
1140             ($self->{_rc_cut}<0) ||
1141             ($self->{_cut}>$self->seq->length) ||
1142             ($self->{_rc_cut}>$self->seq->length)) {
1143 0         0 my $tempseq=$self->site;
1144 0         0 my ($five, $three)=split /\^/, $tempseq;
1145 0 0       0 if ($self->{_cut} > $self->{_rc_cut}) {
    0          
1146             return substr($five, $self->{_rc_cut})
1147 0         0 } elsif ($self->{_cut} < $self->{_rc_cut}) {
1148             return substr($three, 0, $self->{_rc_cut})
1149 0         0 } else {
1150 0         0 return '';
1151             }
1152             }
1153              
1154 5 50       13 if ($self->{_cut} > $self->{_rc_cut}) {
    50          
1155 0         0 return $self->seq->subseq($self->{_rc_cut}+1,$self->{_cut});
1156             } elsif ($self->{_cut} < $self->{_rc_cut}) {
1157 5         8 return $self->seq->subseq($self->{_cut}+1, $self->{_rc_cut});
1158             } else {
1159 0         0 return '';
1160             }
1161             }
1162              
1163              
1164              
1165             =head2 compatible_ends
1166              
1167             Title : compatible_ends
1168             Usage : $re->compatible_ends($re2);
1169             Function : Determines if the two restriction enzyme cut sites
1170             have compatible ends.
1171             Returns : 0 if not, 1 if only one pair ends match, 2 if both ends.
1172             Argument : a Bio::Restriction::Enzyme
1173             Throws : unless the argument is a Bio::Resriction::Enzyme and
1174             if there are Ns in the ovarhangs
1175              
1176             In case of type II enzymes which which cut symmetrically, this
1177             function can be considered to return a boolean value.
1178              
1179              
1180             =cut
1181              
1182             sub compatible_ends {
1183 1     1 1 3 my ($self, $re) = @_;
1184              
1185 1 50       6 $self->throw("Need a Bio::Restriction::Enzyme as an argument, [$re]")
1186             unless $re->isa('Bio::Restriction::Enzyme');
1187              
1188             # $self->throw("Only type II enzymes work now")
1189             # unless $self->type eq 'II';
1190              
1191 1 50 33     3 $self->debug("N(s) in overhangs. Can not compare")
1192             if $self->overhang_seq =~ /N/ or $re->overhang_seq =~ /N/;
1193              
1194 1 50 33     2 return 2 if $self->overhang_seq eq $re->overhang_seq and
1195             $self->overhang eq $re->overhang;
1196              
1197 0         0 return 0;
1198             }
1199              
1200             =head2 is_ambiguous
1201              
1202             Title : is_ambiguous
1203             Usage : $re->is_ambiguous();
1204             Function : Determines if the restriction enzyme contains ambiguous sequences
1205             Returns : Boolean
1206             Argument : n/a
1207             Throws : n/a
1208              
1209             =cut
1210              
1211             sub is_ambiguous {
1212 1     1 1 3 my $self = shift;
1213 1 50       2 return $self->string =~ m/[^AGCT]/ ? 1 : 0 ;
1214             }
1215              
1216             =head2 Additional methods from Rebase
1217              
1218             =cut
1219              
1220             =head2 is_prototype
1221              
1222             Title : is_prototype
1223             Usage : $re->is_prototype
1224             Function : Get/Set method for finding out if this enzyme is a prototype
1225             Example : $re->is_prototype(1)
1226             Returns : Boolean
1227             Args : none
1228              
1229             Prototype enzymes are the most commonly available and usually first
1230             enzymes discoverd that have the same recognition site. Using only
1231             prototype enzymes in restriction analysis avoids redundancy and
1232             speeds things up.
1233              
1234             =cut
1235              
1236             sub is_prototype {
1237 7213     7213 1 8603 my ($self, $value) = @_;
1238 7213 100       9158 if (defined $value) {
1239 7210         14174 return $self->{'_is_prototype'} = $value ;
1240             }
1241 3 100       7 if (defined $self->{'_is_prototype'}) {
1242 2         8 return $self->{'_is_prototype'}
1243             } else {
1244 1         10 $self->warn("Can't unequivocally assign prototype based on input format alone");
1245             return
1246 0         0 }
1247             }
1248              
1249             =head2 is_neoschizomer
1250              
1251             Title : is_neoschizomer
1252             Usage : $re->is_neoschizomer
1253             Function : Get/Set method for finding out if this enzyme is a neoschizomer
1254             Example : $re->is_neoschizomer(1)
1255             Returns : Boolean
1256             Args : none
1257              
1258             Neoschizomers are distinguishable from the prototype enzyme by having a
1259             different cleavage pattern. Note that not all formats report this
1260              
1261             =cut
1262              
1263             sub is_neoschizomer {
1264 0     0 1 0 my ($self, $value) = @_;
1265 0 0       0 if (defined $value) {
1266 0         0 return $self->{'_is_neoschizomer'} = $value ;
1267             }
1268 0 0       0 if (defined $self->{'_is_neoschizomer'}) {
1269 0         0 return $self->{'_is_neoschizomer'}
1270             } else {
1271 0         0 $self->warn("Can't unequivocally assign neoschizomer based on input format alone");
1272             return
1273 0         0 }
1274             }
1275              
1276             =head2 prototype_name
1277              
1278             Title : prototype_name
1279             Alias : prototype
1280             Usage : $re->prototype_name
1281             Function : Get/Set method for the name of prototype for
1282             this enzyme's recognition site
1283             Example : $re->prototype_name(1)
1284             Returns : prototype enzyme name string or an empty string
1285             Args : optional prototype enzyme name string
1286              
1287             If the enzyme itself is the prototype, its own name is returned. Not to
1288             confuse the negative result with an unset value, use method
1289             L.
1290              
1291             This method is called I rather than I,
1292             because it returns a string rather than on object.
1293              
1294             =cut
1295              
1296             sub prototype_name {
1297 12     12 1 15 my $self = shift;
1298              
1299 12 100       30 $self->{'_prototype'} = shift if @_;
1300 12 100       30 return $self->name if $self->{'_is_prototype'};
1301 2   50     10 return $self->{'_prototype'} || '';
1302             }
1303              
1304 9     9 0 15 sub prototype { shift->prototype_name(@_) }
1305              
1306             =head2 isoschizomers
1307              
1308             Title : isoschizomers
1309             Alias : isos
1310             Usage : $re->isoschizomers(@list);
1311             Function : Gets/Sets a list of known isoschizomers (enzymes that
1312             recognize the same site, but don't necessarily cut at
1313             the same position).
1314             Arguments : A reference to an array that contains the isoschizomers
1315             Returns : A reference to an array of the known isoschizomers or 0
1316             if not defined.
1317              
1318             This has to be the hardest name to spell, so now you can use the alias
1319             'isos'. Added for compatibility to REBASE
1320              
1321             =cut
1322              
1323             sub isoschizomers {
1324 7456     7456 1 8025 my ($self) = shift;
1325 7456 100       11821 push @{$self->{_isoschizomers}}, @_ if @_;
  7454         13336  
1326             # make sure that you don't dereference if null
1327             # chad believes quite strongly that you should return
1328             # a reference to an array anyway. don't bother dereferencing.
1329             # i'll post that to the list.
1330 7456 50       11437 if ($self->{'_isoschizomers'}) {
1331 7456         7165 return @{$self->{_isoschizomers}};
  7456         8736  
1332             }
1333            
1334             }
1335              
1336 0     0 0 0 sub isos { shift->isoschizomers(@_) }
1337              
1338             =head2 purge_isoschizomers
1339              
1340             Title : purge_isoschizomers
1341             Alias : purge_isos
1342             Usage : $re->purge_isoschizomers();
1343             Function : Purges the set of isoschizomers for this enzyme
1344             Arguments :
1345             Returns : 1
1346              
1347             =cut
1348              
1349             sub purge_isoschizomers {
1350 1     1 1 3 my ($self) = shift;
1351 1         6 $self->{_isoschizomers} = [];
1352              
1353             }
1354              
1355 0     0 0 0 sub purge_isos { shift->purge_isoschizomers(@_) }
1356              
1357             =head2 methylation_sites
1358              
1359             Title : methylation_sites
1360             Usage : $re->methylation_sites(\%sites);
1361             Function : Gets/Sets known methylation sites (positions on the sequence
1362             that get modified to promote or prevent cleavage).
1363             Arguments : A reference to a hash that contains the methylation sites
1364             Returns : A reference to a hash of the methylation sites or
1365             an empty string if not defined.
1366              
1367             There are three types of methylation sites:
1368              
1369             =over 3
1370              
1371             =item * (6) = N6-methyladenosine
1372              
1373             =item * (5) = 5-methylcytosine
1374              
1375             =item * (4) = N4-methylcytosine
1376              
1377             =back
1378              
1379             These are stored as 6, 5, and 4 respectively. The hash has the
1380             sequence position as the key and the type of methylation as the value.
1381             A negative number in the sequence position indicates that the DNA is
1382             methylated on the complementary strand.
1383              
1384             Note that in REBASE, the methylation positions are given
1385             Added for compatibility to REBASE.
1386              
1387             =cut
1388              
1389             sub methylation_sites {
1390 782     782 1 899 my $self = shift;
1391              
1392 782         1341 while (@_) {
1393 829         926 my $key = shift;
1394 829         2162 $self->{'_methylation_sites'}->{$key} = shift;
1395             }
1396 782         849 return %{$self->{_methylation_sites}};
  782         1279  
1397             }
1398              
1399              
1400             =head2 purge_methylation_sites
1401              
1402             Title : purge_methylation_sites
1403             Usage : $re->purge_methylation_sites();
1404             Function : Purges the set of methylation_sites for this enzyme
1405             Arguments :
1406             Returns :
1407              
1408             =cut
1409              
1410             sub purge_methylation_sites {
1411 23     23 1 37 my ($self) = shift;
1412 23         48 $self->{_methylation_sites} = {};
1413             }
1414              
1415             =head2 microbe
1416              
1417             Title : microbe
1418             Usage : $re->microbe($microbe);
1419             Function : Gets/Sets microorganism where the restriction enzyme was found
1420             Arguments : A scalar containing the microbes name
1421             Returns : A scalar containing the microbes name or 0 if not defined
1422              
1423             Added for compatibility to REBASE
1424              
1425             =cut
1426              
1427             sub microbe {
1428 2     2 1 4 my ($self, $microbe) = @_;
1429 2 100       5 if ($microbe) {
1430 1         2 $self->{_microbe}=$microbe;
1431             }
1432 2   50     9 return $self->{_microbe} || '';
1433              
1434             }
1435              
1436              
1437             =head2 source
1438              
1439             Title : source
1440             Usage : $re->source('Rob Edwards');
1441             Function : Gets/Sets the person who provided the enzyme
1442             Arguments : A scalar containing the persons name
1443             Returns : A scalar containing the persons name or 0 if not defined
1444              
1445             Added for compatibility to REBASE
1446              
1447             =cut
1448              
1449             sub source {
1450 7455     7455 1 8993 my ($self, $source) = @_;
1451 7455 100       9729 if ($source) {
1452 7454         10072 $self->{_source}=$source;
1453             }
1454 7455   50     11083 return $self->{_source} || '';
1455             }
1456              
1457              
1458             =head2 vendors
1459              
1460             Title : vendors
1461             Usage : $re->vendor(@list_of_companies);
1462             Function : Gets/Sets the a list of companies that you can get the enzyme from.
1463             Also sets the commercially_available boolean
1464             Arguments : A reference to an array containing the names of companies
1465             that you can get the enzyme from
1466             Returns : A reference to an array containing the names of companies
1467             that you can get the enzyme from
1468              
1469             Added for compatibility to REBASE
1470              
1471             =cut
1472              
1473             sub vendors {
1474 7472     7472 1 7918 my $self = shift;
1475 7472 100       9756 push @{$self->{_vendors}}, @_ if @_;
  7470         10253  
1476 7472 50       11070 if ($self->{'_vendors'}) {
1477 7472         6251 return @{$self->{'_vendors'}};
  7472         9015  
1478             }
1479             }
1480              
1481              
1482             =head2 purge_vendors
1483              
1484             Title : purge_vendors
1485             Usage : $re->purge_references();
1486             Function : Purges the set of references for this enzyme
1487             Arguments :
1488             Returns :
1489              
1490             =cut
1491              
1492             sub purge_vendors {
1493 1     1 1 3 my ($self) = shift;
1494 1         3 $self->{_vendors} = [];
1495              
1496             }
1497              
1498             =head2 vendor
1499              
1500             Title : vendor
1501             Usage : $re->vendor(@list_of_companies);
1502             Function : Gets/Sets the a list of companies that you can get the enzyme from.
1503             Also sets the commercially_available boolean
1504             Arguments : A reference to an array containing the names of companies
1505             that you can get the enzyme from
1506             Returns : A reference to an array containing the names of companies
1507             that you can get the enzyme from
1508              
1509             Added for compatibility to REBASE
1510              
1511             =cut
1512              
1513              
1514             sub vendor {
1515 1     1 1 2 my $self = shift;
1516 1         2 return push @{$self->{_vendors}}, @_;
  1         4  
1517 0         0 return $self->{_vendors};
1518             }
1519              
1520              
1521             =head2 references
1522              
1523             Title : references
1524             Usage : $re->references(string);
1525             Function : Gets/Sets the references for this enzyme
1526             Arguments : an array of string reference(s) (optional)
1527             Returns : an array of references
1528              
1529             Use L to reset the list of references
1530              
1531             This should be a L object, but its not (yet)
1532              
1533             =cut
1534              
1535             sub references {
1536 7472     7472 1 7554 my ($self) = shift;
1537 7472 100       10460 push @{$self->{_references}}, @_ if @_;
  7470         9613  
1538 7472         7415 return @{$self->{_references}};
  7472         9547  
1539             }
1540              
1541              
1542             =head2 purge_references
1543              
1544             Title : purge_references
1545             Usage : $re->purge_references();
1546             Function : Purges the set of references for this enzyme
1547             Arguments :
1548             Returns : 1
1549              
1550             =cut
1551              
1552             sub purge_references {
1553 1     1 1 3 my ($self) = shift;
1554 1         3 $self->{_references} = [];
1555              
1556             }
1557              
1558             =head2 clone
1559              
1560             Title : clone
1561             Usage : $re->clone
1562             Function : Deep copy of the object
1563             Arguments : -
1564             Returns : new Bio::Restriction::EnzymeI object
1565              
1566             This works as long as the object is a clean in-memory object using
1567             scalars, arrays and hashes. You have been warned.
1568              
1569             If you have module Storable, it is used, otherwise local code is used.
1570             Todo: local code cuts circular references.
1571              
1572             =cut
1573              
1574             # there's some issue here; deprecating and rolling another below/maj
1575              
1576             sub clone_depr {
1577 0     0 0 0 my ($self, $this) = @_;
1578              
1579 0         0 eval { require Storable; };
  0         0  
1580 0 0       0 return Storable::dclone($self) unless $@;
1581             # modified from deep_copy() @ http://www.stonehenge.com/merlyn/UnixReview/col30.html
1582 0 0       0 unless ($this) {
1583 0         0 my $new;
1584 0         0 foreach my $k (keys %$self) {
1585 0 0       0 if (not ref $self->{$k}) {
1586 0         0 $new->{$k} = $self->{$k};
1587             } else {
1588 0         0 $new->{$k} = $self->clone($self->{$k});
1589             }
1590             #print Dumper $new;
1591             }
1592 0         0 bless $new, ref($self);
1593 0         0 return $new;
1594             }
1595 0 0       0 if (not ref $this) {
    0          
    0          
1596 0         0 $this;
1597             }
1598             elsif (ref $this eq "ARRAY") {
1599 0         0 [map $self->clone($_), @$this];
1600             }
1601             elsif (ref $this eq "HASH") {
1602 0         0 +{map { $_ => $self->clone($this->{$_}) } keys %$this};
  0         0  
1603             } else { # objects
1604 0 0       0 return if $this->isa('Bio::Restriction::EnzymeI');
1605 0 0       0 return $this->clone if $this->can('clone');
1606 0         0 my $obj;
1607 0         0 foreach my $k (keys %$this) {
1608 0 0       0 if (not ref $this->{$k}) {
1609 0         0 $obj->{$k} = $this->{$k};
1610             } else {
1611 0         0 $obj->{$k} = $this->clone($this->{$k});
1612             }
1613             }
1614 0         0 bless $obj, ref($this);
1615 0         0 return $obj;
1616             }
1617             }
1618              
1619             sub clone {
1620 1501     1501 1 1370 my $self = shift;
1621 1501         1764 my ($this, $visited) = @_;
1622 1501 100       1796 unless (defined $this) {
1623 52         70 my %h;
1624 52         352 tie %h, 'Tie::RefHash';
1625 52         612 my $visited = \%h;
1626 52         124 return $self->clone($self, $visited);
1627             }
1628 1449         1197 my $thing;
1629 1449         1546 for ($this) {
1630 1449 100       1792 if (ref) {
1631 475 100       1326 return $visited->{$this} if $visited->{$this};
1632             }
1633             # scalar
1634 1447 100       5309 (!ref) && do {
1635 974         888 $thing = $this;
1636 974         883 last;
1637             };
1638             # object
1639 473 100       892 (ref =~ /^Bio::/) && do {
1640 108         155 $thing = {};
1641 108         165 bless($thing, ref);
1642 108         332 $visited->{$this} = $thing;
1643 108         1030 foreach my $attr (keys %{$_}) {
  108         385  
1644 1196 100       2001 $thing->{$attr} = (defined $_->{$attr} ? $self->clone($_->{$attr},$visited) : undef );
1645             }
1646 108         169 last;
1647             };
1648 365 100       540 (ref eq 'ARRAY') && do {
1649 311         373 $thing = [];
1650 311         722 $visited->{$this} = $thing;
1651 311         2219 foreach my $elt (@{$_}) {
  311         477  
1652 309 50       526 push @$thing, (defined $elt ? $self->clone($elt,$visited) : undef);
1653             }
1654 311         360 last;
1655             };
1656 54 50       133 (ref eq 'HASH') && do {
1657 54         89 $thing = {};
1658 54         151 $visited->{$this} = $thing;
1659 4     4   34 no warnings qw( uninitialized ); # avoid 'uninitialized value' warning against $key
  4         11  
  4         270  
1660 54         440 foreach my $key (%{$_}) {
  54         165  
1661 0 0       0 $thing->{$key} = (defined $_->{key} ? $self->clone( $_->{$key},$visited) : undef );
1662             }
1663 4     4   24 use warnings;
  4         8  
  4         1078  
1664 54         79 last;
1665             };
1666 0 0       0 (ref eq 'SCALAR') && do {
1667 0         0 $thing = ${$_};
  0         0  
1668 0         0 $visited->{$this} = $thing;
1669 0         0 $thing = \$thing;
1670 0         0 last;
1671             };
1672             }
1673              
1674 1447         2944 return $thing;
1675             }
1676              
1677              
1678              
1679             =head2 _expand
1680              
1681             Title : _expand
1682             Function : Expand nucleotide ambiguity codes to their representative letters
1683             Returns : The full length string
1684             Arguments : The string to be expanded.
1685              
1686             Stolen from the original RestrictionEnzyme.pm
1687              
1688             =cut
1689              
1690              
1691             sub _expand {
1692 7774     7774   9469 my $str = shift;
1693              
1694 7774         24464 $str =~ s/N|X/\./g;
1695 7774         12072 $str =~ s/R/\[AG\]/g;
1696 7774         9512 $str =~ s/Y/\[CT\]/g;
1697 7774         8448 $str =~ s/S/\[GC\]/g;
1698 7774         9412 $str =~ s/W/\[AT\]/g;
1699 7774         8169 $str =~ s/M/\[AC\]/g;
1700 7774         7469 $str =~ s/K/\[TG\]/g;
1701 7774         7132 $str =~ s/B/\[CGT\]/g;
1702 7774         7753 $str =~ s/D/\[AGT\]/g;
1703 7774         7529 $str =~ s/H/\[ACT\]/g;
1704 7774         7309 $str =~ s/V/\[ACG\]/g;
1705              
1706 7774         10984 return $str;
1707             }
1708              
1709             1;
1710