| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Ace::Sequence; |
|
2
|
1
|
|
|
1
|
|
3411
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
33
|
|
|
3
|
|
|
|
|
|
|
|
|
4
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
96
|
|
|
5
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
34
|
|
|
6
|
1
|
|
|
1
|
|
857
|
use Ace 1.50 qw(:DEFAULT rearrange); |
|
|
1
|
|
|
|
|
37
|
|
|
|
1
|
|
|
|
|
195
|
|
|
7
|
1
|
|
|
1
|
|
836
|
use Ace::Sequence::FeatureList; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
29
|
|
|
8
|
1
|
|
|
1
|
|
653
|
use Ace::Sequence::Feature; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
35
|
|
|
9
|
1
|
|
|
1
|
|
7
|
use AutoLoader 'AUTOLOAD'; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
6
|
|
|
10
|
1
|
|
|
1
|
|
28
|
use vars '$VERSION'; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
45
|
|
|
11
|
|
|
|
|
|
|
my %CACHE; |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
$VERSION = '1.51'; |
|
14
|
|
|
|
|
|
|
|
|
15
|
1
|
|
|
1
|
|
5
|
use constant CACHE => 1; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
72
|
|
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
use overload |
|
18
|
1
|
|
|
|
|
6
|
'""' => 'asString', |
|
19
|
|
|
|
|
|
|
cmp => 'cmp', |
|
20
|
1
|
|
|
1
|
|
6
|
; |
|
|
1
|
|
|
|
|
2
|
|
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# synonym: stop = end |
|
23
|
|
|
|
|
|
|
*stop = \&end; |
|
24
|
|
|
|
|
|
|
*abs = \&absolute; |
|
25
|
|
|
|
|
|
|
*source_seq = \&source; |
|
26
|
|
|
|
|
|
|
*source_tag = \&subtype; |
|
27
|
|
|
|
|
|
|
*primary_tag = \&type; |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
my %plusminus = ( '+' => '-', |
|
30
|
|
|
|
|
|
|
'-' => '+', |
|
31
|
|
|
|
|
|
|
'.' => '.'); |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# internal keys |
|
34
|
|
|
|
|
|
|
# parent => reference Sequence in "+" strand |
|
35
|
|
|
|
|
|
|
# p_offset => our start in the parent |
|
36
|
|
|
|
|
|
|
# length => our length |
|
37
|
|
|
|
|
|
|
# strand => our strand (+ or -) |
|
38
|
|
|
|
|
|
|
# refseq => reference Sequence for coordinate system |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# object constructor |
|
41
|
|
|
|
|
|
|
# usually called like this: |
|
42
|
|
|
|
|
|
|
# $seq = Ace::Sequence->new($object); |
|
43
|
|
|
|
|
|
|
# but can be called like this: |
|
44
|
|
|
|
|
|
|
# $seq = Ace::Sequence->new(-db=>$db,-name=>$name); |
|
45
|
|
|
|
|
|
|
# or |
|
46
|
|
|
|
|
|
|
# $seq = Ace::Sequence->new(-seq => $object, |
|
47
|
|
|
|
|
|
|
# -offset => $offset, |
|
48
|
|
|
|
|
|
|
# -length => $length, |
|
49
|
|
|
|
|
|
|
# -ref => $refseq |
|
50
|
|
|
|
|
|
|
# ); |
|
51
|
|
|
|
|
|
|
# $refseq, if provided, will be used to establish the coordinate |
|
52
|
|
|
|
|
|
|
# system. Otherwise the first base pair will be set to 1. |
|
53
|
|
|
|
|
|
|
sub new { |
|
54
|
0
|
|
|
0
|
0
|
|
my $pack = shift; |
|
55
|
0
|
|
|
|
|
|
my ($seq,$start,$end,$offset,$length,$refseq,$db) = |
|
56
|
|
|
|
|
|
|
rearrange([ |
|
57
|
|
|
|
|
|
|
['SEQ','SEQUENCE','SOURCE'], |
|
58
|
|
|
|
|
|
|
'START', |
|
59
|
|
|
|
|
|
|
['END','STOP'], |
|
60
|
|
|
|
|
|
|
['OFFSET','OFF'], |
|
61
|
|
|
|
|
|
|
['LENGTH','LEN'], |
|
62
|
|
|
|
|
|
|
'REFSEQ', |
|
63
|
|
|
|
|
|
|
['DATABASE','DB'], |
|
64
|
|
|
|
|
|
|
],@_); |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Object must have a parent sequence and/or a reference |
|
67
|
|
|
|
|
|
|
# sequence. In some cases, the parent sequence will be the |
|
68
|
|
|
|
|
|
|
# object itself. The reference sequence is used to set up |
|
69
|
|
|
|
|
|
|
# the frame of reference for the coordinate system. |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# fetch the sequence object if we don't have it already |
|
72
|
0
|
0
|
0
|
|
|
|
croak "Please provide either a Sequence object or a database and name" |
|
|
|
|
0
|
|
|
|
|
|
73
|
|
|
|
|
|
|
unless ref($seq) || ($seq && $db); |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# convert start into offset |
|
76
|
0
|
0
|
0
|
|
|
|
$offset = $start - 1 if defined($start) and !defined($offset); |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# convert stop/end into length |
|
79
|
0
|
0
|
0
|
|
|
|
$length = ($end > $start) ? $end - $offset : $end - $offset - 2 |
|
|
|
0
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
if defined($end) && !defined($length); |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# if just a string is passed, try to fetch a Sequence object |
|
83
|
0
|
0
|
|
|
|
|
my $obj = ref($seq) ? $seq : $db->fetch('Sequence'=>$seq); |
|
84
|
0
|
0
|
|
|
|
|
unless ($obj) { |
|
85
|
0
|
|
|
|
|
|
Ace->error("No Sequence named $obj found in database"); |
|
86
|
0
|
|
|
|
|
|
return; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# get parent coordinates and length of this sequence |
|
90
|
|
|
|
|
|
|
# the parent is an Ace Sequence object in the "+" strand |
|
91
|
0
|
|
|
|
|
|
my ($parent,$p_offset,$p_length,$strand) = find_parent($obj); |
|
92
|
0
|
0
|
|
|
|
|
return unless $parent; |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# handle negative strands |
|
95
|
0
|
|
|
|
|
|
my $r_strand = $strand; |
|
96
|
0
|
|
|
|
|
|
my $r_offset = $p_offset; |
|
97
|
0
|
|
0
|
|
|
|
$offset ||= 0; |
|
98
|
0
|
0
|
|
|
|
|
$offset *= -1 if $strand < 0; |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# handle feature objects |
|
101
|
0
|
0
|
|
|
|
|
$offset += $obj->offset if $obj->can('smapped'); |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# get source |
|
104
|
0
|
0
|
|
|
|
|
my $source = $obj->can('smapped') ? $obj->source : $obj; |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# store the object into our instance variables |
|
107
|
0
|
|
0
|
|
|
|
my $self = bless { |
|
108
|
|
|
|
|
|
|
obj => $source, |
|
109
|
|
|
|
|
|
|
offset => $offset, |
|
110
|
|
|
|
|
|
|
length => $length || $p_length, |
|
111
|
|
|
|
|
|
|
parent => $parent, |
|
112
|
|
|
|
|
|
|
p_offset => $p_offset, |
|
113
|
|
|
|
|
|
|
refseq => [$source,$r_offset,$r_strand], |
|
114
|
|
|
|
|
|
|
strand => $strand, |
|
115
|
|
|
|
|
|
|
absolute => 0, |
|
116
|
|
|
|
|
|
|
automerge => 1, |
|
117
|
|
|
|
|
|
|
},$pack; |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# set the reference sequence |
|
120
|
0
|
0
|
0
|
|
|
|
eval { $self->refseq($refseq) } or return if defined $refseq; |
|
|
0
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# wheww! |
|
123
|
0
|
|
|
|
|
|
return $self; |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# return the "source" object that the user offset from |
|
127
|
|
|
|
|
|
|
sub source { |
|
128
|
0
|
|
|
0
|
0
|
|
$_[0]->{obj}; |
|
129
|
|
|
|
|
|
|
} |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# return the parent object |
|
132
|
0
|
|
|
0
|
0
|
|
sub parent { $_[0]->{parent} } |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# return the length |
|
135
|
|
|
|
|
|
|
#sub length { $_[0]->{length} } |
|
136
|
|
|
|
|
|
|
sub length { |
|
137
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
138
|
0
|
|
|
|
|
|
my ($start,$end) = ($self->start,$self->end); |
|
139
|
0
|
0
|
|
|
|
|
return $end - $start + ($end > $start ? 1 : -1); # for stupid 1-based adjustments |
|
140
|
|
|
|
|
|
|
} |
|
141
|
|
|
|
|
|
|
|
|
142
|
0
|
|
|
0
|
1
|
|
sub reversed { return shift->strand < 0; } |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub automerge { |
|
145
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
146
|
0
|
|
|
|
|
|
my $d = $self->{automerge}; |
|
147
|
0
|
0
|
|
|
|
|
$self->{automerge} = shift if @_; |
|
148
|
0
|
|
|
|
|
|
$d; |
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# return reference sequence |
|
152
|
|
|
|
|
|
|
sub refseq { |
|
153
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
154
|
0
|
|
|
|
|
|
my $prev = $self->{refseq}; |
|
155
|
0
|
0
|
|
|
|
|
if (@_) { |
|
156
|
0
|
|
|
|
|
|
my $refseq = shift; |
|
157
|
0
|
|
|
|
|
|
my $arrayref; |
|
158
|
|
|
|
|
|
|
|
|
159
|
0
|
0
|
|
|
|
|
BLOCK: { |
|
160
|
0
|
|
|
|
|
|
last BLOCK unless defined ($refseq); |
|
161
|
|
|
|
|
|
|
|
|
162
|
0
|
0
|
0
|
|
|
|
if (ref($refseq) && ref($refseq) eq 'ARRAY') { |
|
163
|
0
|
|
|
|
|
|
$arrayref = $refseq; |
|
164
|
0
|
|
|
|
|
|
last BLOCK; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
0
|
0
|
0
|
|
|
|
if (ref($refseq) && ($refseq->can('smapped'))) { |
|
168
|
0
|
0
|
|
|
|
|
croak "Reference sequence has no common ancestor with sequence" |
|
169
|
|
|
|
|
|
|
unless $self->parent eq $refseq->parent; |
|
170
|
0
|
|
|
|
|
|
my ($a,$b,$c) = @{$refseq->{refseq}}; |
|
|
0
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# $b += $refseq->offset; |
|
172
|
0
|
|
|
|
|
|
$b += $refseq->offset; |
|
173
|
0
|
|
|
|
|
|
$arrayref = [$refseq,$b,$refseq->strand]; |
|
174
|
0
|
|
|
|
|
|
last BLOCK; |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# look up reference sequence in database if we aren't given |
|
179
|
|
|
|
|
|
|
# database object already |
|
180
|
0
|
0
|
|
|
|
|
$refseq = $self->db->fetch('Sequence' => $refseq) |
|
181
|
|
|
|
|
|
|
unless $refseq->isa('Ace::Object'); |
|
182
|
0
|
0
|
|
|
|
|
croak "Invalid reference sequence" unless $refseq; |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# find position of ref sequence in parent strand |
|
185
|
0
|
|
|
|
|
|
my ($r_parent,$r_offset,$r_length,$r_strand) = find_parent($refseq); |
|
186
|
0
|
0
|
|
|
|
|
croak "Reference sequence has no common ancestor with sequence" |
|
187
|
|
|
|
|
|
|
unless $r_parent eq $self->{parent}; |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# set to array reference containing this information |
|
190
|
0
|
|
|
|
|
|
$arrayref = [$refseq,$r_offset,$r_strand]; |
|
191
|
|
|
|
|
|
|
} |
|
192
|
0
|
|
|
|
|
|
$self->{refseq} = $arrayref; |
|
193
|
|
|
|
|
|
|
} |
|
194
|
0
|
0
|
|
|
|
|
return unless $prev; |
|
195
|
0
|
0
|
|
|
|
|
return $self->parent if $self->absolute; |
|
196
|
0
|
0
|
|
|
|
|
return wantarray ? @{$prev} : $prev->[0]; |
|
|
0
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
} |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# return strand |
|
200
|
0
|
|
|
0
|
1
|
|
sub strand { return $_[0]->{strand} } |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# return reference strand |
|
203
|
|
|
|
|
|
|
sub r_strand { |
|
204
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
205
|
0
|
0
|
|
|
|
|
return "+1" if $self->absolute; |
|
206
|
0
|
0
|
|
|
|
|
if (my ($ref,$r_offset,$r_strand) = $self->refseq) { |
|
207
|
0
|
|
|
|
|
|
return $r_strand; |
|
208
|
|
|
|
|
|
|
} else { |
|
209
|
0
|
|
|
|
|
|
return $self->{strand} |
|
210
|
|
|
|
|
|
|
} |
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
|
|
213
|
0
|
|
|
0
|
1
|
|
sub offset { $_[0]->{offset} } |
|
214
|
0
|
|
|
0
|
0
|
|
sub p_offset { $_[0]->{p_offset} } |
|
215
|
|
|
|
|
|
|
|
|
216
|
0
|
|
|
0
|
0
|
|
sub smapped { 1; } |
|
217
|
0
|
|
|
0
|
0
|
|
sub type { 'Sequence' } |
|
218
|
0
|
|
|
0
|
0
|
|
sub subtype { } |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
sub debug { |
|
221
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
222
|
0
|
|
|
|
|
|
my $d = $self->{_debug}; |
|
223
|
0
|
0
|
|
|
|
|
$self->{_debug} = shift if @_; |
|
224
|
0
|
|
|
|
|
|
$d; |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# return the database this sequence is associated with |
|
228
|
|
|
|
|
|
|
sub db { |
|
229
|
0
|
|
0
|
0
|
1
|
|
return Ace->name2db($_[0]->{db} ||= $_[0]->source->db); |
|
230
|
|
|
|
|
|
|
} |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub start { |
|
233
|
0
|
|
|
0
|
1
|
|
my ($self,$abs) = @_; |
|
234
|
0
|
0
|
|
|
|
|
$abs = $self->absolute unless defined $abs; |
|
235
|
0
|
0
|
|
|
|
|
return $self->{p_offset} + $self->{offset} + 1 if $abs; |
|
236
|
|
|
|
|
|
|
|
|
237
|
0
|
0
|
|
|
|
|
if ($self->refseq) { |
|
238
|
0
|
|
|
|
|
|
my ($ref,$r_offset,$r_strand) = $self->refseq; |
|
239
|
0
|
0
|
|
|
|
|
return $r_strand < 0 ? 1 + $r_offset - ($self->{p_offset} + $self->{offset}) |
|
240
|
|
|
|
|
|
|
: 1 + $self->{p_offset} + $self->{offset} - $r_offset; |
|
241
|
|
|
|
|
|
|
} |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
else { |
|
244
|
0
|
|
|
|
|
|
return $self->{offset} +1; |
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
} |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub end { |
|
250
|
0
|
|
|
0
|
1
|
|
my ($self,$abs) = @_; |
|
251
|
0
|
|
|
|
|
|
my $start = $self->start($abs); |
|
252
|
0
|
0
|
|
|
|
|
my $f = $self->{length} > 0 ? 1 : -1; # for stupid 1-based adjustments |
|
253
|
0
|
0
|
0
|
|
|
|
if ($abs && $self->refseq ne $self->parent) { |
|
254
|
0
|
|
|
|
|
|
my $r_strand = $self->r_strand; |
|
255
|
0
|
0
|
0
|
|
|
|
return $start - $self->{length} + $f |
|
|
|
|
0
|
|
|
|
|
|
256
|
|
|
|
|
|
|
if $r_strand < 0 or $self->{strand} < 0 or $self->{length} < 0; |
|
257
|
0
|
|
|
|
|
|
return $start + $self->{length} - $f |
|
258
|
|
|
|
|
|
|
} |
|
259
|
0
|
0
|
|
|
|
|
return $start + $self->{length} - $f if $self->r_strand eq $self->{strand}; |
|
260
|
0
|
|
|
|
|
|
return $start - $self->{length} + $f; |
|
261
|
|
|
|
|
|
|
} |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# turn on absolute coordinates (relative to reference sequence) |
|
264
|
|
|
|
|
|
|
sub absolute { |
|
265
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
266
|
0
|
|
|
|
|
|
my $prev = $self->{absolute}; |
|
267
|
0
|
0
|
|
|
|
|
$self->{absolute} = $_[0] if defined $_[0]; |
|
268
|
0
|
|
|
|
|
|
return $prev; |
|
269
|
|
|
|
|
|
|
} |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# human readable string (for debugging) |
|
272
|
|
|
|
|
|
|
sub asString { |
|
273
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
274
|
0
|
0
|
|
|
|
|
if ($self->absolute) { |
|
|
|
0
|
|
|
|
|
|
|
275
|
0
|
|
|
|
|
|
return join '',$self->parent,'/',$self->start,',',$self->end; |
|
276
|
|
|
|
|
|
|
} elsif (my $ref = $self->refseq){ |
|
277
|
0
|
0
|
|
|
|
|
my $label = $ref->isa('Ace::Sequence::Feature') ? $ref->info : "$ref"; |
|
278
|
0
|
|
|
|
|
|
return join '',$label,'/',$self->start,',',$self->end; |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
} else { |
|
281
|
0
|
|
|
|
|
|
join '',$self->source,'/',$self->start,',',$self->end; |
|
282
|
|
|
|
|
|
|
} |
|
283
|
|
|
|
|
|
|
} |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
sub cmp { |
|
286
|
0
|
|
|
0
|
0
|
|
my ($self,$arg,$reversed) = @_; |
|
287
|
0
|
0
|
0
|
|
|
|
if (ref($arg) and $arg->isa('Ace::Sequence')) { |
|
288
|
0
|
|
0
|
|
|
|
my $cmp = $self->parent cmp $arg->parent |
|
289
|
|
|
|
|
|
|
|| $self->start <=> $arg->start; |
|
290
|
0
|
0
|
|
|
|
|
return $reversed ? -$cmp : $cmp; |
|
291
|
|
|
|
|
|
|
} |
|
292
|
0
|
|
|
|
|
|
my $name = $self->asString; |
|
293
|
0
|
0
|
|
|
|
|
return $reversed ? $arg cmp $name : $name cmp $arg; |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# Return the DNA |
|
297
|
|
|
|
|
|
|
sub dna { |
|
298
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
299
|
0
|
0
|
|
|
|
|
return $self->{dna} if $self->{dna}; |
|
300
|
0
|
|
|
|
|
|
my $raw = $self->_query('seqdna'); |
|
301
|
0
|
|
|
|
|
|
$raw=~s/^>.*\n//m; |
|
302
|
0
|
|
|
|
|
|
$raw=~s/^\/\/.*//mg; |
|
303
|
0
|
|
|
|
|
|
$raw=~s/\n//g; |
|
304
|
0
|
|
|
|
|
|
$raw =~ s/\0+\Z//; # blasted nulls! |
|
305
|
0
|
0
|
|
|
|
|
my $effective_strand = $self->end >= $self->start ? '+1' : '-1'; |
|
306
|
0
|
0
|
|
|
|
|
_complement(\$raw) if $self->r_strand ne $effective_strand; |
|
307
|
0
|
|
|
|
|
|
return $self->{dna} = $raw; |
|
308
|
|
|
|
|
|
|
} |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# return a gff file |
|
311
|
|
|
|
|
|
|
sub gff { |
|
312
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
313
|
0
|
|
|
|
|
|
my ($abs,$features) = rearrange([['ABS','ABSOLUTE'],'FEATURES'],@_); |
|
314
|
0
|
0
|
|
|
|
|
$abs = $self->absolute unless defined $abs; |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# can provide list of feature names, such as 'similarity', or 'all' to get 'em all |
|
317
|
|
|
|
|
|
|
# !THIS IS BROKEN; IT SHOULD LOOK LIKE FEATURE()! |
|
318
|
0
|
|
|
|
|
|
my $opt = $self->_feature_filter($features); |
|
319
|
|
|
|
|
|
|
|
|
320
|
0
|
|
|
|
|
|
my $gff = $self->_gff($opt); |
|
321
|
0
|
0
|
|
|
|
|
warn $gff if $self->debug; |
|
322
|
|
|
|
|
|
|
|
|
323
|
0
|
0
|
|
|
|
|
$self->transformGFF(\$gff) unless $abs; |
|
324
|
0
|
|
|
|
|
|
return $gff; |
|
325
|
|
|
|
|
|
|
} |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
# return a GFF object using the optional GFF.pm module |
|
328
|
|
|
|
|
|
|
sub GFF { |
|
329
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
330
|
0
|
|
|
|
|
|
my ($filter,$converter) = @_; # anonymous subs |
|
331
|
0
|
0
|
|
|
|
|
croak "GFF module not installed" unless require GFF; |
|
332
|
0
|
|
|
|
|
|
require GFF::Filehandle; |
|
333
|
|
|
|
|
|
|
|
|
334
|
0
|
|
|
|
|
|
my @lines = grep !/^\/\//,split "\n",$self->gff(@_); |
|
335
|
0
|
|
|
|
|
|
local *IN; |
|
336
|
0
|
|
|
|
|
|
local ($^W) = 0; # prevent complaint by GFF module |
|
337
|
0
|
|
|
|
|
|
tie *IN,'GFF::Filehandle',\@lines; |
|
338
|
0
|
|
|
|
|
|
my $gff = GFF::GeneFeatureSet->new; |
|
339
|
0
|
0
|
|
|
|
|
$gff->read(\*IN,$filter,$converter) if $gff; |
|
340
|
0
|
|
|
|
|
|
return $gff; |
|
341
|
|
|
|
|
|
|
} |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# Get the features table. Can filter by type/subtype this way: |
|
344
|
|
|
|
|
|
|
# features('similarity:EST','annotation:assembly_tag') |
|
345
|
|
|
|
|
|
|
sub features { |
|
346
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
347
|
0
|
|
|
|
|
|
my ($filter,$opt) = $self->_make_filter(@_); |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# get raw gff file |
|
350
|
0
|
|
|
|
|
|
my $gff = $self->gff(-features=>$opt); |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# turn it into a list of features |
|
353
|
0
|
|
|
|
|
|
my @features = $self->_make_features($gff,$filter); |
|
354
|
|
|
|
|
|
|
|
|
355
|
0
|
0
|
|
|
|
|
if ($self->automerge) { # automatic merging |
|
356
|
|
|
|
|
|
|
# fetch out constructed transcripts and clones |
|
357
|
0
|
|
|
|
|
|
my %types = map {lc($_)=>1} (@$opt,@_); |
|
|
0
|
|
|
|
|
|
|
|
358
|
0
|
0
|
|
|
|
|
if ($types{'transcript'}) { |
|
359
|
0
|
|
|
|
|
|
push @features,$self->_make_transcripts(\@features); |
|
360
|
0
|
|
|
|
|
|
@features = grep {$_->type !~ /^(intron|exon)$/ } @features; |
|
|
0
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
} |
|
362
|
0
|
0
|
|
|
|
|
push @features,$self->_make_clones(\@features) if $types{'clone'}; |
|
363
|
0
|
0
|
|
|
|
|
if ($types{'similarity'}) { |
|
364
|
0
|
|
|
|
|
|
my @f = $self->_make_alignments(\@features); |
|
365
|
0
|
|
|
|
|
|
@features = grep {$_->type ne 'similarity'} @features; |
|
|
0
|
|
|
|
|
|
|
|
366
|
0
|
|
|
|
|
|
push @features,@f; |
|
367
|
|
|
|
|
|
|
} |
|
368
|
|
|
|
|
|
|
} |
|
369
|
|
|
|
|
|
|
|
|
370
|
0
|
0
|
|
|
|
|
return wantarray ? @features : \@features; |
|
371
|
|
|
|
|
|
|
} |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# A little bit more complex - assemble a list of "transcripts" |
|
374
|
|
|
|
|
|
|
# consisting of Ace::Sequence::Transcript objects. These objects |
|
375
|
|
|
|
|
|
|
# contain a list of exons and introns. |
|
376
|
|
|
|
|
|
|
sub transcripts { |
|
377
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
378
|
0
|
|
|
|
|
|
my $curated = shift; |
|
379
|
0
|
0
|
|
|
|
|
my $ef = $curated ? "exon:curated" : "exon"; |
|
380
|
0
|
0
|
|
|
|
|
my $if = $curated ? "intron:curated" : "intron"; |
|
381
|
0
|
0
|
|
|
|
|
my $sf = $curated ? "Sequence:curated" : "Sequence"; |
|
382
|
0
|
|
|
|
|
|
my @features = $self->features($ef,$if,$sf); |
|
383
|
0
|
0
|
|
|
|
|
return unless @features; |
|
384
|
0
|
|
|
|
|
|
return $self->_make_transcripts(\@features); |
|
385
|
|
|
|
|
|
|
} |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
sub _make_transcripts { |
|
388
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
389
|
0
|
|
|
|
|
|
my $features = shift; |
|
390
|
|
|
|
|
|
|
|
|
391
|
0
|
|
|
|
|
|
require Ace::Sequence::Transcript; |
|
392
|
0
|
|
|
|
|
|
my %transcripts; |
|
393
|
|
|
|
|
|
|
|
|
394
|
0
|
|
|
|
|
|
for my $feature (@$features) { |
|
395
|
0
|
|
|
|
|
|
my $transcript = $feature->info; |
|
396
|
0
|
0
|
|
|
|
|
next unless $transcript; |
|
397
|
0
|
0
|
|
|
|
|
if ($feature->type =~ /^(exon|intron|cds)$/) { |
|
|
|
0
|
|
|
|
|
|
|
398
|
0
|
|
|
|
|
|
my $type = $1; |
|
399
|
0
|
|
|
|
|
|
push @{$transcripts{$transcript}{$type}},$feature; |
|
|
0
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
} elsif ($feature->type eq 'Sequence') { |
|
401
|
0
|
|
0
|
|
|
|
$transcripts{$transcript}{base} ||= $feature; |
|
402
|
|
|
|
|
|
|
} |
|
403
|
|
|
|
|
|
|
} |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
# get rid of transcripts without exons |
|
406
|
0
|
|
|
|
|
|
foreach (keys %transcripts) { |
|
407
|
0
|
0
|
|
|
|
|
delete $transcripts{$_} unless exists $transcripts{$_}{exon} |
|
408
|
|
|
|
|
|
|
} |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# map the rest onto Ace::Sequence::Transcript objects |
|
411
|
0
|
|
|
|
|
|
return map {Ace::Sequence::Transcript->new($transcripts{$_})} keys %transcripts; |
|
|
0
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
} |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# Reassemble clones from clone left and right ends |
|
415
|
|
|
|
|
|
|
sub clones { |
|
416
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
417
|
0
|
|
|
|
|
|
my @clones = $self->features('Clone_left_end','Clone_right_end','Sequence'); |
|
418
|
0
|
|
|
|
|
|
my %clones; |
|
419
|
0
|
0
|
|
|
|
|
return unless @clones; |
|
420
|
0
|
|
|
|
|
|
return $self->_make_clones(\@clones); |
|
421
|
|
|
|
|
|
|
} |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub _make_clones { |
|
424
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
425
|
0
|
|
|
|
|
|
my $features = shift; |
|
426
|
|
|
|
|
|
|
|
|
427
|
0
|
|
|
|
|
|
my (%clones,@canonical_clones); |
|
428
|
0
|
0
|
|
|
|
|
my $start_label = $self->strand < 0 ? 'end' : 'start'; |
|
429
|
0
|
0
|
|
|
|
|
my $end_label = $self->strand < 0 ? 'start' : 'end'; |
|
430
|
0
|
|
|
|
|
|
for my $feature (@$features) { |
|
431
|
0
|
0
|
|
|
|
|
$clones{$feature->info}{$start_label} = $feature->start if $feature->type eq 'Clone_left_end'; |
|
432
|
0
|
0
|
|
|
|
|
$clones{$feature->info}{$end_label} = $feature->start if $feature->type eq 'Clone_right_end'; |
|
433
|
|
|
|
|
|
|
|
|
434
|
0
|
0
|
|
|
|
|
if ($feature->type eq 'Sequence') { |
|
435
|
0
|
|
|
|
|
|
my $info = $feature->info; |
|
436
|
0
|
0
|
|
|
|
|
next if $info =~ /LINK|CHROMOSOME|\.\w+$/; |
|
437
|
0
|
0
|
|
|
|
|
if ($info->Genomic_canonical(0)) { |
|
438
|
0
|
0
|
|
|
|
|
push (@canonical_clones,$info->Clone) if $info->Clone; |
|
439
|
|
|
|
|
|
|
} |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
} |
|
442
|
|
|
|
|
|
|
|
|
443
|
0
|
|
|
|
|
|
foreach (@canonical_clones) { |
|
444
|
0
|
|
0
|
|
|
|
$clones{$_} ||= {}; |
|
445
|
|
|
|
|
|
|
} |
|
446
|
|
|
|
|
|
|
|
|
447
|
0
|
|
|
|
|
|
my @features; |
|
448
|
0
|
|
|
|
|
|
my ($r,$r_offset,$r_strand) = $self->refseq; |
|
449
|
0
|
|
|
|
|
|
my $parent = $self->parent; |
|
450
|
0
|
|
|
|
|
|
my $abs = $self->absolute; |
|
451
|
0
|
0
|
|
|
|
|
if ($abs) { |
|
452
|
0
|
|
|
|
|
|
$r_offset = 0; |
|
453
|
0
|
|
|
|
|
|
$r = $parent; |
|
454
|
0
|
|
|
|
|
|
$r_strand = '+1'; |
|
455
|
|
|
|
|
|
|
} |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# BAD HACK ALERT. WE DON'T KNOW WHERE THE LEFT END OF THE CLONE IS SO WE USE |
|
458
|
|
|
|
|
|
|
# THE MAGIC NUMBER -99_999_999 to mean "off left end" and |
|
459
|
|
|
|
|
|
|
# +99_999_999 to mean "off right end" |
|
460
|
0
|
|
|
|
|
|
for my $clone (keys %clones) { |
|
461
|
0
|
|
0
|
|
|
|
my $start = $clones{$clone}{start} || -99_999_999; |
|
462
|
0
|
|
0
|
|
|
|
my $end = $clones{$clone}{end} || +99_999_999; |
|
463
|
0
|
|
|
|
|
|
my $phony_gff = join "\t",($parent,'Clone','structural',$start,$end,'.','.','.',qq(Clone "$clone")); |
|
464
|
0
|
|
|
|
|
|
push @features,Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$phony_gff); |
|
465
|
|
|
|
|
|
|
} |
|
466
|
0
|
|
|
|
|
|
return @features; |
|
467
|
|
|
|
|
|
|
} |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
# Assemble a list of "GappedAlignment" objects. These objects |
|
470
|
|
|
|
|
|
|
# contain a list of aligned segments. |
|
471
|
|
|
|
|
|
|
sub alignments { |
|
472
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
473
|
0
|
|
|
|
|
|
my @subtypes = @_; |
|
474
|
0
|
|
|
|
|
|
my @types = map { "similarity:\^$_\$" } @subtypes; |
|
|
0
|
|
|
|
|
|
|
|
475
|
0
|
0
|
|
|
|
|
push @types,'similarity' unless @types; |
|
476
|
0
|
|
|
|
|
|
return $self->features(@types); |
|
477
|
|
|
|
|
|
|
} |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
sub segments { |
|
480
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
481
|
0
|
|
|
|
|
|
return; |
|
482
|
|
|
|
|
|
|
} |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
sub _make_alignments { |
|
485
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
486
|
0
|
|
|
|
|
|
my $features = shift; |
|
487
|
0
|
|
|
|
|
|
require Ace::Sequence::GappedAlignment; |
|
488
|
|
|
|
|
|
|
|
|
489
|
0
|
|
|
|
|
|
my %homol; |
|
490
|
|
|
|
|
|
|
|
|
491
|
0
|
|
|
|
|
|
for my $feature (@$features) { |
|
492
|
0
|
0
|
|
|
|
|
next unless $feature->type eq 'similarity'; |
|
493
|
0
|
|
|
|
|
|
my $target = $feature->info; |
|
494
|
0
|
|
|
|
|
|
my $subtype = $feature->subtype; |
|
495
|
0
|
|
|
|
|
|
push @{$homol{$target,$subtype}},$feature; |
|
|
0
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
} |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# map onto Ace::Sequence::GappedAlignment objects |
|
499
|
0
|
|
|
|
|
|
return map {Ace::Sequence::GappedAlignment->new($homol{$_})} keys %homol; |
|
|
0
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
} |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
# return list of features quickly |
|
503
|
|
|
|
|
|
|
sub feature_list { |
|
504
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
|
505
|
0
|
0
|
|
|
|
|
return $self->{'feature_list'} if $self->{'feature_list'}; |
|
506
|
0
|
0
|
|
|
|
|
return unless my $raw = $self->_query('seqfeatures -version 2 -list'); |
|
507
|
0
|
|
|
|
|
|
return $self->{'feature_list'} = Ace::Sequence::FeatureList->new($raw); |
|
508
|
|
|
|
|
|
|
} |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
# transform a GFF file into the coordinate system of the sequence |
|
511
|
|
|
|
|
|
|
sub transformGFF { |
|
512
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
513
|
0
|
|
|
|
|
|
my $gff = shift; |
|
514
|
0
|
|
|
|
|
|
my $parent = $self->parent; |
|
515
|
0
|
|
|
|
|
|
my $strand = $self->{strand}; |
|
516
|
0
|
|
|
|
|
|
my $source = $self->source; |
|
517
|
0
|
|
|
|
|
|
my ($ref_source,$ref_offset,$ref_strand) = $self->refseq; |
|
518
|
0
|
|
0
|
|
|
|
$ref_source ||= $source; |
|
519
|
0
|
|
0
|
|
|
|
$ref_strand ||= $strand; |
|
520
|
|
|
|
|
|
|
|
|
521
|
0
|
0
|
|
|
|
|
if ($ref_strand > 0) { |
|
522
|
0
|
0
|
|
|
|
|
my $o = defined($ref_offset) ? $ref_offset : ($self->p_offset + $self->offset); |
|
523
|
|
|
|
|
|
|
# find anything that looks like a numeric field and subtract offset from it |
|
524
|
0
|
|
|
|
|
|
$$gff =~ s/(?
|
|
|
0
|
|
|
|
|
|
|
|
525
|
0
|
|
|
|
|
|
$$gff =~ s/^$parent/$source/mg; |
|
526
|
0
|
|
|
|
|
|
$$gff =~ s/\#\#sequence-region\s+\S+/##sequence-region $ref_source/m; |
|
527
|
0
|
|
|
|
|
|
$$gff =~ s/FMAP_FEATURES\s+"\S+"/FMAP_FEATURES "$ref_source"/m; |
|
528
|
0
|
|
|
|
|
|
return; |
|
529
|
|
|
|
|
|
|
} else { # strand eq '-' |
|
530
|
0
|
0
|
|
|
|
|
my $o = defined($ref_offset) ? (2 + $ref_offset) : (2 + $self->p_offset - $self->offset); |
|
531
|
0
|
|
|
|
|
|
$$gff =~ s/(?
|
|
|
0
|
|
|
|
|
|
|
|
532
|
0
|
|
|
|
|
|
$$gff =~ s/(Target \"[^\"]+\" )(-?\d+) (-?\d+)/$1 $3 $2/g; |
|
533
|
0
|
|
|
|
|
|
$$gff =~ s/^$parent/$source/mg; |
|
534
|
0
|
|
|
|
|
|
$$gff =~ s/\#\#sequence-region\s+\S+\s+(-?\d+)\s+(-?\d+)/"##sequence-region $ref_source " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em; |
|
|
0
|
|
|
|
|
|
|
|
535
|
0
|
|
|
|
|
|
$$gff =~ s/FMAP_FEATURES\s+"\S+"\s+(-?\d+)\s+(-?\d+)/"FMAP_FEATURES \"$ref_source\" " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em; |
|
|
0
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
} |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
} |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# return a name for the object |
|
541
|
|
|
|
|
|
|
sub name { |
|
542
|
0
|
|
|
0
|
1
|
|
return shift->source_seq->name; |
|
543
|
|
|
|
|
|
|
} |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# for compatibility with Ace::Sequence::Feature |
|
546
|
|
|
|
|
|
|
sub info { |
|
547
|
0
|
|
|
0
|
0
|
|
return shift->source_seq; |
|
548
|
|
|
|
|
|
|
} |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
###################### internal functions ################# |
|
551
|
|
|
|
|
|
|
# not necessarily object-oriented!! |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# return parent, parent offset and strand |
|
554
|
|
|
|
|
|
|
sub find_parent { |
|
555
|
0
|
|
|
0
|
0
|
|
my $obj = shift; |
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# first, if we are passed an Ace::Sequence, then we can inherit |
|
558
|
|
|
|
|
|
|
# these settings directly |
|
559
|
0
|
0
|
|
|
|
|
return (@{$obj}{qw(parent p_offset length)},$obj->r_strand) |
|
|
0
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
if $obj->isa('Ace::Sequence'); |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# otherwise, if we are passed an Ace::Object, then we must |
|
563
|
|
|
|
|
|
|
# traverse upwards until we find a suitable parent |
|
564
|
0
|
0
|
|
|
|
|
return _traverse($obj) if $obj->isa('Ace::Object'); |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# otherwise, we don't know what to do... |
|
567
|
0
|
|
|
|
|
|
croak "Source sequence not an Ace::Object or an Ace::Sequence"; |
|
568
|
|
|
|
|
|
|
} |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
sub _get_parent { |
|
571
|
0
|
|
|
0
|
|
|
my $obj = shift; |
|
572
|
|
|
|
|
|
|
# ** DANGER DANGER WILL ROBINSON! ** |
|
573
|
|
|
|
|
|
|
# This is an experiment in caching parents to speed lookups. Probably eats memory voraciously. |
|
574
|
0
|
0
|
|
|
|
|
return $CACHE{$obj} if CACHE && exists $CACHE{$obj}; |
|
575
|
0
|
|
0
|
|
|
|
my $p = $obj->get(S_Parent=>2)|| $obj->get(Source=>1); |
|
576
|
0
|
0
|
|
|
|
|
return unless $p; |
|
577
|
0
|
|
|
|
|
|
return CACHE ? $CACHE{$obj} = $p->fetch |
|
578
|
|
|
|
|
|
|
: $p->fetch; |
|
579
|
|
|
|
|
|
|
} |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
sub _get_children { |
|
582
|
0
|
|
|
0
|
|
|
my $obj = shift; |
|
583
|
0
|
|
|
|
|
|
my @pieces = $obj->get(S_Child=>2); |
|
584
|
0
|
0
|
|
|
|
|
return @pieces if @pieces; |
|
585
|
0
|
|
|
|
|
|
return @pieces = $obj->get('Subsequence'); |
|
586
|
|
|
|
|
|
|
} |
|
587
|
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
# get sequence, offset and strand of topmost container |
|
589
|
|
|
|
|
|
|
sub _traverse { |
|
590
|
0
|
|
|
0
|
|
|
my $obj = shift; |
|
591
|
0
|
|
|
|
|
|
my ($offset,$length); |
|
592
|
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
# invoke seqget to find the top-level container for this sequence |
|
594
|
0
|
|
|
|
|
|
my ($tl,$tl_start,$tl_end) = _get_toplevel($obj); |
|
595
|
0
|
|
0
|
|
|
|
$tl_start ||= 0; |
|
596
|
0
|
|
0
|
|
|
|
$tl_end ||= 0; |
|
597
|
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
# make it an object |
|
599
|
0
|
|
|
|
|
|
$tl = ref($obj)->new(-name=>$tl,-class=>'Sequence',-db=>$obj->db); |
|
600
|
|
|
|
|
|
|
|
|
601
|
0
|
|
|
|
|
|
$offset += $tl_start - 1; # offset to beginning of toplevel |
|
602
|
0
|
|
0
|
|
|
|
$length ||= abs($tl_end - $tl_start) + 1; |
|
603
|
0
|
0
|
|
|
|
|
my $strand = $tl_start < $tl_end ? +1 : -1; |
|
604
|
|
|
|
|
|
|
|
|
605
|
0
|
0
|
|
|
|
|
return ($tl,$offset,$strand < 0 ? ($length,'-1') : ($length,'+1') ) if $length; |
|
|
|
0
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
} |
|
607
|
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
sub _get_toplevel { |
|
609
|
0
|
|
|
0
|
|
|
my $obj = shift; |
|
610
|
0
|
|
|
|
|
|
my $class = $obj->class; |
|
611
|
0
|
|
|
|
|
|
my $name = $obj->name; |
|
612
|
|
|
|
|
|
|
|
|
613
|
0
|
|
|
|
|
|
my $smap = $obj->db->raw_query("gif smap -from $class:$name"); |
|
614
|
0
|
|
|
|
|
|
my ($parent,$pstart,$pstop,$tstart,$tstop,$map_type) = |
|
615
|
|
|
|
|
|
|
$smap =~ /^SMAP\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)/; |
|
616
|
|
|
|
|
|
|
|
|
617
|
0
|
|
0
|
|
|
|
$parent ||= ''; |
|
618
|
0
|
|
|
|
|
|
$parent =~ s/^Sequence://; # remove this in next version of Acedb |
|
619
|
0
|
|
|
|
|
|
return ($parent,$pstart,$pstop); |
|
620
|
|
|
|
|
|
|
} |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# create subroutine that filters GFF files for certain feature types |
|
623
|
|
|
|
|
|
|
sub _make_filter { |
|
624
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
625
|
0
|
|
|
|
|
|
my $automerge = $self->automerge; |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
# parse out the filter |
|
628
|
0
|
|
|
|
|
|
my %filter; |
|
629
|
0
|
|
|
|
|
|
foreach (@_) { |
|
630
|
0
|
|
|
|
|
|
my ($type,$filter) = split(':',$_,2); |
|
631
|
0
|
0
|
0
|
|
|
|
if ($automerge && lc($type) eq 'transcript') { |
|
|
|
0
|
0
|
|
|
|
|
|
632
|
0
|
|
|
|
|
|
@filter{'exon','intron','Sequence','cds'} = ([undef],[undef],[undef],[undef]); |
|
633
|
|
|
|
|
|
|
} elsif ($automerge && lc($type) eq 'clone') { |
|
634
|
0
|
|
|
|
|
|
@filter{'Clone_left_end','Clone_right_end','Sequence'} = ([undef],[undef],[undef]); |
|
635
|
|
|
|
|
|
|
} else { |
|
636
|
0
|
|
|
|
|
|
push @{$filter{$type}},$filter; |
|
|
0
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
} |
|
638
|
|
|
|
|
|
|
} |
|
639
|
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
# create pattern-match sub |
|
641
|
0
|
|
|
|
|
|
my $sub; |
|
642
|
|
|
|
|
|
|
my $promiscuous; # indicates that there is a subtype without a type |
|
643
|
|
|
|
|
|
|
|
|
644
|
0
|
0
|
|
|
|
|
if (%filter) { |
|
645
|
0
|
|
|
|
|
|
my $s = "sub { my \@d = split(\"\\t\",\$_[0]);\n"; |
|
646
|
0
|
|
|
|
|
|
for my $type (keys %filter) { |
|
647
|
0
|
|
|
|
|
|
my $expr; |
|
648
|
0
|
|
|
|
|
|
my $subtypes = $filter{$type}; |
|
649
|
0
|
0
|
|
|
|
|
if ($type ne '') { |
|
650
|
0
|
|
|
|
|
|
for my $st (@$subtypes) { |
|
651
|
0
|
0
|
|
|
|
|
$expr .= defined $st ? "return 1 if \$d[2]=~/$type/i && \$d[1]=~/$st/i;\n" |
|
652
|
|
|
|
|
|
|
: "return 1 if \$d[2]=~/$type/i;\n" |
|
653
|
|
|
|
|
|
|
} |
|
654
|
|
|
|
|
|
|
} else { # no type, only subtypes |
|
655
|
0
|
|
|
|
|
|
$promiscuous++; |
|
656
|
0
|
|
|
|
|
|
for my $st (@$subtypes) { |
|
657
|
0
|
0
|
|
|
|
|
next unless defined $st; |
|
658
|
0
|
|
|
|
|
|
$expr .= "return 1 if \$d[1]=~/$st/i;\n"; |
|
659
|
|
|
|
|
|
|
} |
|
660
|
|
|
|
|
|
|
} |
|
661
|
0
|
|
|
|
|
|
$s .= $expr; |
|
662
|
|
|
|
|
|
|
} |
|
663
|
0
|
|
|
|
|
|
$s .= "return;\n }"; |
|
664
|
|
|
|
|
|
|
|
|
665
|
0
|
|
|
|
|
|
$sub = eval $s; |
|
666
|
0
|
0
|
|
|
|
|
croak $@ if $@; |
|
667
|
|
|
|
|
|
|
} else { |
|
668
|
0
|
|
|
0
|
|
|
$sub = sub { 1; } |
|
669
|
0
|
|
|
|
|
|
} |
|
670
|
0
|
0
|
|
|
|
|
return ($sub,$promiscuous ? [] : [keys %filter]); |
|
671
|
|
|
|
|
|
|
} |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# turn a GFF file and a filter into a list of Ace::Sequence::Feature objects |
|
674
|
|
|
|
|
|
|
sub _make_features { |
|
675
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
676
|
0
|
|
|
|
|
|
my ($gff,$filter) = @_; |
|
677
|
|
|
|
|
|
|
|
|
678
|
0
|
|
|
|
|
|
my ($r,$r_offset,$r_strand) = $self->refseq; |
|
679
|
0
|
|
|
|
|
|
my $parent = $self->parent; |
|
680
|
0
|
|
|
|
|
|
my $abs = $self->absolute; |
|
681
|
0
|
0
|
|
|
|
|
if ($abs) { |
|
682
|
0
|
|
|
|
|
|
$r_offset = 0; |
|
683
|
0
|
|
|
|
|
|
$r = $parent; |
|
684
|
0
|
|
|
|
|
|
$r_strand = '+1'; |
|
685
|
|
|
|
|
|
|
} |
|
686
|
0
|
|
0
|
|
|
|
my @features = map {Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$_)} |
|
|
0
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
grep !m@^(?:\#|//)@ && $filter->($_),split("\n",$gff); |
|
688
|
|
|
|
|
|
|
} |
|
689
|
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# low level GFF call, no changing absolute to relative coordinates |
|
692
|
|
|
|
|
|
|
sub _gff { |
|
693
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
694
|
0
|
|
|
|
|
|
my ($opt,$db) = @_; |
|
695
|
0
|
|
|
|
|
|
my $data = $self->_query("seqfeatures -version 2 $opt",$db); |
|
696
|
0
|
|
|
|
|
|
$data =~ s/\0+\Z//; |
|
697
|
0
|
|
|
|
|
|
return $data; #blasted nulls! |
|
698
|
|
|
|
|
|
|
} |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
# shortcut for running a gif query |
|
701
|
|
|
|
|
|
|
sub _query { |
|
702
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
703
|
0
|
|
|
|
|
|
my $command = shift; |
|
704
|
0
|
|
0
|
|
|
|
my $db = shift || $self->db; |
|
705
|
|
|
|
|
|
|
|
|
706
|
0
|
|
|
|
|
|
my $parent = $self->parent; |
|
707
|
0
|
|
|
|
|
|
my $start = $self->start(1); |
|
708
|
0
|
|
|
|
|
|
my $end = $self->end(1); |
|
709
|
0
|
0
|
|
|
|
|
($start,$end) = ($end,$start) if $start > $end; #flippity floppity |
|
710
|
|
|
|
|
|
|
|
|
711
|
0
|
|
|
|
|
|
my $coord = "-coords $start $end"; |
|
712
|
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
# BAD BAD HACK ALERT - CHECKS THE QUERY THAT IS PASSED DOWN |
|
714
|
|
|
|
|
|
|
# ALSO MAKES THINGS INCOMPATIBLE WITH PRIOR 4.9 servers. |
|
715
|
|
|
|
|
|
|
# my $opt = $command =~ /seqfeatures/ ? '-nodna' : ''; |
|
716
|
0
|
|
|
|
|
|
my $opt = '-noclip'; |
|
717
|
|
|
|
|
|
|
|
|
718
|
0
|
|
|
|
|
|
my $query = "gif seqget $parent $opt $coord ; $command"; |
|
719
|
0
|
0
|
|
|
|
|
warn $query if $self->debug; |
|
720
|
|
|
|
|
|
|
|
|
721
|
0
|
|
|
|
|
|
return $db->raw_query("gif seqget $parent $opt $coord ; $command"); |
|
722
|
|
|
|
|
|
|
} |
|
723
|
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
# utility function -- reverse complement |
|
725
|
|
|
|
|
|
|
sub _complement { |
|
726
|
0
|
|
|
0
|
|
|
my $dna = shift; |
|
727
|
0
|
|
|
|
|
|
$$dna =~ tr/GATCgatc/CTAGctag/; |
|
728
|
0
|
|
|
|
|
|
$$dna = scalar reverse $$dna; |
|
729
|
|
|
|
|
|
|
} |
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
sub _feature_filter { |
|
732
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
733
|
0
|
|
|
|
|
|
my $features = shift; |
|
734
|
0
|
0
|
|
|
|
|
return '' unless $features; |
|
735
|
0
|
|
|
|
|
|
my $opt = ''; |
|
736
|
0
|
0
|
0
|
|
|
|
$opt = '+feature ' . join('|',@$features) if ref($features) eq 'ARRAY' && @$features; |
|
737
|
0
|
0
|
|
|
|
|
$opt = "+feature $features" unless ref $features; |
|
738
|
0
|
|
|
|
|
|
$opt; |
|
739
|
|
|
|
|
|
|
} |
|
740
|
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
1; |
|
742
|
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
=head1 NAME |
|
744
|
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
Ace::Sequence - Examine ACeDB Sequence Objects |
|
746
|
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# open database connection and get an Ace::Object sequence |
|
750
|
|
|
|
|
|
|
use Ace::Sequence; |
|
751
|
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
$db = Ace->connect(-host => 'stein.cshl.org',-port => 200005); |
|
753
|
|
|
|
|
|
|
$obj = $db->fetch(Predicted_gene => 'ZK154.3'); |
|
754
|
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
# Wrap it in an Ace::Sequence object |
|
756
|
|
|
|
|
|
|
$seq = Ace::Sequence->new($obj); |
|
757
|
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
# Find all the exons |
|
759
|
|
|
|
|
|
|
@exons = $seq->features('exon'); |
|
760
|
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
# Find all the exons predicted by various versions of "genefinder" |
|
762
|
|
|
|
|
|
|
@exons = $seq->features('exon:genefinder.*'); |
|
763
|
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
# Iterate through the exons, printing their start, end and DNA |
|
765
|
|
|
|
|
|
|
for my $exon (@exons) { |
|
766
|
|
|
|
|
|
|
print join "\t",$exon->start,$exon->end,$exon->dna,"\n"; |
|
767
|
|
|
|
|
|
|
} |
|
768
|
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
# Find the region 1000 kb upstream of the first exon |
|
770
|
|
|
|
|
|
|
$sub = Ace::Sequence->new(-seq=>$exons[0], |
|
771
|
|
|
|
|
|
|
-offset=>-1000,-length=>1000); |
|
772
|
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
# Find all features in that area |
|
774
|
|
|
|
|
|
|
@features = $sub->features; |
|
775
|
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
# Print its DNA |
|
777
|
|
|
|
|
|
|
print $sub->dna; |
|
778
|
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
# Create a new Sequence object from the first 500 kb of chromosome 1 |
|
780
|
|
|
|
|
|
|
$seq = Ace::Sequence->new(-name=>'CHROMOSOME_I',-db=>$db, |
|
781
|
|
|
|
|
|
|
-offset=>0,-length=>500_000); |
|
782
|
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
# Get the GFF dump as a text string |
|
784
|
|
|
|
|
|
|
$gff = $seq->gff; |
|
785
|
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
# Limit dump to Predicted_genes |
|
787
|
|
|
|
|
|
|
$gff_genes = $seq->gff(-features=>'Predicted_gene'); |
|
788
|
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
# Return a GFF object (using optional GFF.pm module from Sanger) |
|
790
|
|
|
|
|
|
|
$gff_obj = $seq->GFF; |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
793
|
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
I, and its allied classes L and |
|
795
|
|
|
|
|
|
|
L, provide a convenient interface to the |
|
796
|
|
|
|
|
|
|
ACeDB Sequence classes and the GFF sequence feature file format. |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
Using this class, you can define a region of the genome by using a |
|
799
|
|
|
|
|
|
|
landmark (sequenced clone, link, superlink, predicted gene), an offset |
|
800
|
|
|
|
|
|
|
from that landmark, and a distance. Offsets and distances can be |
|
801
|
|
|
|
|
|
|
positive or negative. This will return an I object. |
|
802
|
|
|
|
|
|
|
Once a region is defined, you may retrieve its DNA sequence, or query |
|
803
|
|
|
|
|
|
|
the database for any features that may be contained within this |
|
804
|
|
|
|
|
|
|
region. Features can be returned as objects (using the |
|
805
|
|
|
|
|
|
|
I class), as GFF text-only dumps, or in the |
|
806
|
|
|
|
|
|
|
form of the GFF class defined by the Sanger Centre's GFF.pm module. |
|
807
|
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
This class builds on top of L and L. Please see |
|
809
|
|
|
|
|
|
|
their manual pages before consulting this one. |
|
810
|
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
=head1 Creating New Ace::Sequence Objects, the new() Method |
|
812
|
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
$seq = Ace::Sequence->new($object); |
|
814
|
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
$seq = Ace::Sequence->new(-source => $object, |
|
816
|
|
|
|
|
|
|
-offset => $offset, |
|
817
|
|
|
|
|
|
|
-length => $length, |
|
818
|
|
|
|
|
|
|
-refseq => $reference_sequence); |
|
819
|
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
$seq = Ace::Sequence->new(-name => $name, |
|
821
|
|
|
|
|
|
|
-db => $db, |
|
822
|
|
|
|
|
|
|
-offset => $offset, |
|
823
|
|
|
|
|
|
|
-length => $length, |
|
824
|
|
|
|
|
|
|
-refseq => $reference_sequence); |
|
825
|
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
In order to create an I you will need an active I |
|
827
|
|
|
|
|
|
|
database accessor. Sequence regions are defined using a "source" |
|
828
|
|
|
|
|
|
|
sequence, an offset, and a length. Optionally, you may also provide a |
|
829
|
|
|
|
|
|
|
"reference sequence" to establish the coordinate system for all |
|
830
|
|
|
|
|
|
|
inquiries. Sequences may be generated from existing I |
|
831
|
|
|
|
|
|
|
sequence objects, from other I and |
|
832
|
|
|
|
|
|
|
I objects, or from a sequence name and a |
|
833
|
|
|
|
|
|
|
database handle. |
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
The class method named new() is the interface to these facilities. In |
|
836
|
|
|
|
|
|
|
its simplest, one-argument form, you provide new() with a |
|
837
|
|
|
|
|
|
|
previously-created I that points to Sequence or |
|
838
|
|
|
|
|
|
|
sequence-like object (the meaning of "sequence-like" is explained in |
|
839
|
|
|
|
|
|
|
more detail below.) The new() method will return an I |
|
840
|
|
|
|
|
|
|
object extending from the beginning of the object through to its |
|
841
|
|
|
|
|
|
|
natural end. |
|
842
|
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
In the named-parameter form of new(), the following arguments are |
|
844
|
|
|
|
|
|
|
recognized: |
|
845
|
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=over 4 |
|
847
|
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
=item -source |
|
849
|
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
The sequence source. This must be an I of the "Sequence" |
|
851
|
|
|
|
|
|
|
class, or be a sequence-like object containing the SMap tag (see |
|
852
|
|
|
|
|
|
|
below). |
|
853
|
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
=item -offset |
|
855
|
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
An offset from the beginning of the source sequence. The retrieved |
|
857
|
|
|
|
|
|
|
I will begin at this position. The offset can be any |
|
858
|
|
|
|
|
|
|
positive or negative integer. Offets are B<0-based>. |
|
859
|
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
=item -length |
|
861
|
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
The length of the sequence to return. Either a positive or negative |
|
863
|
|
|
|
|
|
|
integer can be specified. If a negative length is given, the returned |
|
864
|
|
|
|
|
|
|
sequence will be complemented relative to the source sequence. |
|
865
|
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
=item -refseq |
|
867
|
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
The sequence to use to establish the coordinate system for the |
|
869
|
|
|
|
|
|
|
returned sequence. Normally the source sequence is used to establish |
|
870
|
|
|
|
|
|
|
the coordinate system, but this can be used to override that choice. |
|
871
|
|
|
|
|
|
|
You can provide either an I or just a sequence name for |
|
872
|
|
|
|
|
|
|
this argument. The source and reference sequences must share a common |
|
873
|
|
|
|
|
|
|
ancestor, but do not have to be directly related. An attempt to use a |
|
874
|
|
|
|
|
|
|
disjunct reference sequence, such as one on a different chromosome, |
|
875
|
|
|
|
|
|
|
will fail. |
|
876
|
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
=item -name |
|
878
|
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
As an alternative to using an I with the B<-source> |
|
880
|
|
|
|
|
|
|
argument, you may specify a source sequence using B<-name> and B<-db>. |
|
881
|
|
|
|
|
|
|
The I module will use the provided database accessor to |
|
882
|
|
|
|
|
|
|
fetch a Sequence object with the specified name. new() will return |
|
883
|
|
|
|
|
|
|
undef is no Sequence by this name is known. |
|
884
|
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
=item -db |
|
886
|
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
This argument is required if the source sequence is specified by name |
|
888
|
|
|
|
|
|
|
rather than by object reference. |
|
889
|
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
=back |
|
891
|
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
If new() is successful, it will create an I object and |
|
893
|
|
|
|
|
|
|
return it. Otherwise it will return undef and return a descriptive |
|
894
|
|
|
|
|
|
|
message in Ace->error(). Certain programming errors, such as a |
|
895
|
|
|
|
|
|
|
failure to provide required arguments, cause a fatal error. |
|
896
|
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
=head2 Reference Sequences and the Coordinate System |
|
898
|
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
When retrieving information from an I, the coordinate |
|
900
|
|
|
|
|
|
|
system is based on the sequence segment selected at object creation |
|
901
|
|
|
|
|
|
|
time. That is, the "+1" strand is the natural direction of the |
|
902
|
|
|
|
|
|
|
I object, and base pair 1 is its first base pair. This |
|
903
|
|
|
|
|
|
|
behavior can be overridden by providing a reference sequence to the |
|
904
|
|
|
|
|
|
|
new() method, in which case the orientation and position of the |
|
905
|
|
|
|
|
|
|
reference sequence establishes the coordinate system for the object. |
|
906
|
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
In addition to the reference sequence, there are two other sequences |
|
908
|
|
|
|
|
|
|
used by I for internal bookeeping. The "source" |
|
909
|
|
|
|
|
|
|
sequence corresponds to the smallest ACeDB sequence object that |
|
910
|
|
|
|
|
|
|
completely encloses the selected sequence segment. The "parent" |
|
911
|
|
|
|
|
|
|
sequence is the smallest ACeDB sequence object that contains the |
|
912
|
|
|
|
|
|
|
"source". The parent is used to derive the length and orientation of |
|
913
|
|
|
|
|
|
|
source sequences that are not directly associated with DNA objects. |
|
914
|
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
In many cases, the source sequence will be identical to the sequence |
|
916
|
|
|
|
|
|
|
initially passed to the new() method. However, there are exceptions |
|
917
|
|
|
|
|
|
|
to this rule. One common exception occurs when the offset and/or |
|
918
|
|
|
|
|
|
|
length cross the boundaries of the passed-in sequence. In this case, |
|
919
|
|
|
|
|
|
|
the ACeDB database is searched for the smallest sequence that contains |
|
920
|
|
|
|
|
|
|
both endpoints of the I object. |
|
921
|
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
The other common exception occurs in Ace 4.8, where there is support |
|
923
|
|
|
|
|
|
|
for "sequence-like" objects that contain the C ("Sequence Map") |
|
924
|
|
|
|
|
|
|
tag. The C tag provides genomic location information for |
|
925
|
|
|
|
|
|
|
arbitrary object -- not just those descended from the Sequence class. |
|
926
|
|
|
|
|
|
|
This allows ACeDB to perform genome map operations on objects that are |
|
927
|
|
|
|
|
|
|
not directly related to sequences, such as genetic loci that have been |
|
928
|
|
|
|
|
|
|
interpolated onto the physical map. When an C-containing object |
|
929
|
|
|
|
|
|
|
is passed to the I new() method, the module will again |
|
930
|
|
|
|
|
|
|
choose the smallest ACeDB Sequence object that contains both |
|
931
|
|
|
|
|
|
|
end-points of the desired region. |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
If an I object is used to create a new I |
|
934
|
|
|
|
|
|
|
object, then the original object's source is inherited. |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
=head1 Object Methods |
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
Once an I object is created, you can query it using the |
|
939
|
|
|
|
|
|
|
following methods: |
|
940
|
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
=head2 asString() |
|
942
|
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
$name = $seq->asString; |
|
944
|
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
Returns a human-readable identifier for the sequence in the form |
|
946
|
|
|
|
|
|
|
I, where "Source" is the name of the source |
|
947
|
|
|
|
|
|
|
sequence, and "start" and "end" are the endpoints of the sequence |
|
948
|
|
|
|
|
|
|
relative to the source (using 1-based indexing). This method is |
|
949
|
|
|
|
|
|
|
called automatically when the I is used in a string |
|
950
|
|
|
|
|
|
|
context. |
|
951
|
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
=head2 source_seq() |
|
953
|
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
$source = $seq->source_seq; |
|
955
|
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
Return the source of the I. |
|
957
|
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
=head2 parent_seq() |
|
959
|
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
$parent = $seq->parent_seq; |
|
961
|
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
Return the immediate ancestor of the sequence. The parent of the |
|
963
|
|
|
|
|
|
|
top-most sequence (such as the CHROMOSOME link) is itself. This |
|
964
|
|
|
|
|
|
|
method is used internally to ascertain the length of source sequences |
|
965
|
|
|
|
|
|
|
which are not associated with a DNA object. |
|
966
|
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
NOTE: this procedure is a trifle funky and cannot reliably be used to |
|
968
|
|
|
|
|
|
|
traverse upwards to the top-most sequence. The reason for this is |
|
969
|
|
|
|
|
|
|
that it will return an I in some cases, and an |
|
970
|
|
|
|
|
|
|
I in others. Use get_parent() to traverse upwards |
|
971
|
|
|
|
|
|
|
through a uniform series of I objects upwards. |
|
972
|
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
=head2 refseq([$seq]) |
|
974
|
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
$refseq = $seq->refseq; |
|
976
|
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
Returns the reference sequence, if one is defined. |
|
978
|
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
$seq->refseq($new_ref); |
|
980
|
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
Set the reference sequence. The reference sequence must share the same |
|
982
|
|
|
|
|
|
|
ancestor with $seq. |
|
983
|
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
=head2 start() |
|
985
|
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
$start = $seq->start; |
|
987
|
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
Start of this sequence, relative to the source sequence, using 1-based |
|
989
|
|
|
|
|
|
|
indexing. |
|
990
|
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
=head2 end() |
|
992
|
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
$end = $seq->end; |
|
994
|
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
End of this sequence, relative to the source sequence, using 1-based |
|
996
|
|
|
|
|
|
|
indexing. |
|
997
|
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
=head2 offset() |
|
999
|
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
$offset = $seq->offset; |
|
1001
|
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
Offset of the beginning of this sequence relative to the source |
|
1003
|
|
|
|
|
|
|
sequence, using 0-based indexing. The offset may be negative if the |
|
1004
|
|
|
|
|
|
|
beginning of the sequence is to the left of the beginning of the |
|
1005
|
|
|
|
|
|
|
source sequence. |
|
1006
|
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=head2 length() |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
$length = $seq->length; |
|
1010
|
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
The length of this sequence, in base pairs. The length may be |
|
1012
|
|
|
|
|
|
|
negative if the sequence's orientation is reversed relative to the |
|
1013
|
|
|
|
|
|
|
source sequence. Use abslength() to obtain the absolute value of |
|
1014
|
|
|
|
|
|
|
the sequence length. |
|
1015
|
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
=head2 abslength() |
|
1017
|
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
$length = $seq->abslength; |
|
1019
|
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
Return the absolute value of the length of the sequence. |
|
1021
|
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
=head2 strand() |
|
1023
|
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
$strand = $seq->strand; |
|
1025
|
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
Returns +1 for a sequence oriented in the natural direction of the |
|
1027
|
|
|
|
|
|
|
genomic reference sequence, or -1 otherwise. |
|
1028
|
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
=head2 reversed() |
|
1030
|
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
Returns true if the segment is reversed relative to the canonical |
|
1032
|
|
|
|
|
|
|
genomic direction. This is the same as $seq->strand < 0. |
|
1033
|
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
=head2 dna() |
|
1035
|
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
$dna = $seq->dna; |
|
1037
|
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
Return the DNA corresponding to this sequence. If the sequence length |
|
1039
|
|
|
|
|
|
|
is negative, the reverse complement of the appropriate segment will be |
|
1040
|
|
|
|
|
|
|
returned. |
|
1041
|
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
ACeDB allows Sequences to exist without an associated DNA object |
|
1043
|
|
|
|
|
|
|
(which typically happens during intermediate stages of a sequencing |
|
1044
|
|
|
|
|
|
|
project. In such a case, the returned sequence will contain the |
|
1045
|
|
|
|
|
|
|
correct number of "-" characters. |
|
1046
|
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
=head2 name() |
|
1048
|
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
$name = $seq->name; |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
Return the name of the source sequence as a string. |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
=head2 get_parent() |
|
1054
|
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
$parent = $seq->parent; |
|
1056
|
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
Return the immediate ancestor of this I (i.e., the |
|
1058
|
|
|
|
|
|
|
sequence that contains this one). The return value is a new |
|
1059
|
|
|
|
|
|
|
I or undef, if no parent sequence exists. |
|
1060
|
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
=head2 get_children() |
|
1062
|
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
@children = $seq->get_children(); |
|
1064
|
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
Returns all subsequences that exist as independent objects in the |
|
1066
|
|
|
|
|
|
|
ACeDB database. What exactly is returned is dependent on the data |
|
1067
|
|
|
|
|
|
|
model. In older ACeDB databases, the only subsequences are those |
|
1068
|
|
|
|
|
|
|
under the catchall Subsequence tag. In newer ACeDB databases, the |
|
1069
|
|
|
|
|
|
|
objects returned correspond to objects to the right of the S_Child |
|
1070
|
|
|
|
|
|
|
subtag using a tag[2] syntax, and may include Predicted_genes, |
|
1071
|
|
|
|
|
|
|
Sequences, Links, or other objects. The return value is a list of |
|
1072
|
|
|
|
|
|
|
I objects. |
|
1073
|
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
=head2 features() |
|
1075
|
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
@features = $seq->features; |
|
1077
|
|
|
|
|
|
|
@features = $seq->features('exon','intron','Predicted_gene'); |
|
1078
|
|
|
|
|
|
|
@features = $seq->features('exon:GeneFinder','Predicted_gene:hand.*'); |
|
1079
|
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
features() returns an array of I objects. If |
|
1081
|
|
|
|
|
|
|
called without arguments, features() returns all features that cross |
|
1082
|
|
|
|
|
|
|
the sequence region. You may also provide a filter list to select a |
|
1083
|
|
|
|
|
|
|
set of features by type and subtype. The format of the filter list |
|
1084
|
|
|
|
|
|
|
is: |
|
1085
|
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
type:subtype |
|
1087
|
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
Where I is the class of the feature (the "feature" field of the |
|
1089
|
|
|
|
|
|
|
GFF format), and I is a description of how the feature was |
|
1090
|
|
|
|
|
|
|
derived (the "source" field of the GFF format). Either of these |
|
1091
|
|
|
|
|
|
|
fields can be absent, and either can be a regular expression. More |
|
1092
|
|
|
|
|
|
|
advanced filtering is not supported, but is provided by the Sanger |
|
1093
|
|
|
|
|
|
|
Centre's GFF module. |
|
1094
|
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
The order of the features in the returned list is not specified. To |
|
1096
|
|
|
|
|
|
|
obtain features sorted by position, use this idiom: |
|
1097
|
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
@features = sort { $a->start <=> $b->start } $seq->features; |
|
1099
|
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
=head2 feature_list() |
|
1101
|
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
my $list = $seq->feature_list(); |
|
1103
|
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
This method returns a summary list of the features that cross the |
|
1105
|
|
|
|
|
|
|
sequence in the form of a L object. From the |
|
1106
|
|
|
|
|
|
|
L object you can obtain the list of feature names |
|
1107
|
|
|
|
|
|
|
and the number of each type. The feature list is obtained from the |
|
1108
|
|
|
|
|
|
|
ACeDB server with a single short transaction, and therefore has much |
|
1109
|
|
|
|
|
|
|
less overhead than features(). |
|
1110
|
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
See L for more details. |
|
1112
|
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
=head2 transcripts() |
|
1114
|
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
This returns a list of Ace::Sequence::Transcript objects, which are |
|
1116
|
|
|
|
|
|
|
specializations of Ace::Sequence::Feature. See L |
|
1117
|
|
|
|
|
|
|
for details. |
|
1118
|
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
=head2 clones() |
|
1120
|
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
This returns a list of Ace::Sequence::Feature objects containing |
|
1122
|
|
|
|
|
|
|
reconstructed clones. This is a nasty hack, because ACEDB currently |
|
1123
|
|
|
|
|
|
|
records clone ends, but not the clones themselves, meaning that we |
|
1124
|
|
|
|
|
|
|
will not always know both ends of the clone. In this case the missing |
|
1125
|
|
|
|
|
|
|
end has a synthetic position of -99,999,999 or +99,999,999. Sorry. |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
=head2 gff() |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
$gff = $seq->gff(); |
|
1130
|
|
|
|
|
|
|
$gff = $seq->gff(-abs => 1, |
|
1131
|
|
|
|
|
|
|
-features => ['exon','intron:GeneFinder']); |
|
1132
|
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
This method returns a GFF file as a scalar. The following arguments |
|
1134
|
|
|
|
|
|
|
are optional: |
|
1135
|
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
=over 4 |
|
1137
|
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
=item -abs |
|
1139
|
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
Ordinarily the feature entries in the GFF file will be returned in |
|
1141
|
|
|
|
|
|
|
coordinates relative to the start of the I object. |
|
1142
|
|
|
|
|
|
|
Position 1 will be the start of the sequence object, and the "+" |
|
1143
|
|
|
|
|
|
|
strand will be the sequence object's natural orientation. However if |
|
1144
|
|
|
|
|
|
|
a true value is provided to B<-abs>, the coordinate system used will |
|
1145
|
|
|
|
|
|
|
be relative to the start of the source sequence, i.e. the native ACeDB |
|
1146
|
|
|
|
|
|
|
Sequence object (usually a cosmid sequence or a link). |
|
1147
|
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
If a reference sequence was provided when the I was |
|
1149
|
|
|
|
|
|
|
created, it will be used by default to set the coordinate system. |
|
1150
|
|
|
|
|
|
|
Relative coordinates can be reenabled by providing a false value to |
|
1151
|
|
|
|
|
|
|
B<-abs>. |
|
1152
|
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
Ordinarily the coordinate system manipulations automatically "do what |
|
1154
|
|
|
|
|
|
|
you want" and you will not need to adjust them. See also the abs() |
|
1155
|
|
|
|
|
|
|
method described below. |
|
1156
|
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
=item -features |
|
1158
|
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
The B<-features> argument filters the features according to a list of |
|
1160
|
|
|
|
|
|
|
types and subtypes. The format is identical to the one described for |
|
1161
|
|
|
|
|
|
|
the features() method. A single filter may be provided as a scalar |
|
1162
|
|
|
|
|
|
|
string. Multiple filters may be passed as an array reference. |
|
1163
|
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
=back |
|
1165
|
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
See also the GFF() method described next. |
|
1167
|
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
=head2 GFF() |
|
1169
|
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
$gff_object = $seq->gff; |
|
1171
|
|
|
|
|
|
|
$gff_object = $seq->gff(-abs => 1, |
|
1172
|
|
|
|
|
|
|
-features => ['exon','intron:GeneFinder']); |
|
1173
|
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
The GFF() method takes the same arguments as gff() described above, |
|
1175
|
|
|
|
|
|
|
but it returns a I object from the GFF.pm |
|
1176
|
|
|
|
|
|
|
module. If the GFF module is not installed, this method will generate |
|
1177
|
|
|
|
|
|
|
a fatal error. |
|
1178
|
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
=head2 absolute() |
|
1180
|
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
$abs = $seq->absolute; |
|
1182
|
|
|
|
|
|
|
$abs = $seq->absolute(1); |
|
1183
|
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
This method controls whether the coordinates of features are returned |
|
1185
|
|
|
|
|
|
|
in absolute or relative coordinates. "Absolute" coordinates are |
|
1186
|
|
|
|
|
|
|
relative to the underlying source or reference sequence. "Relative" |
|
1187
|
|
|
|
|
|
|
coordinates are relative to the I object. By default, |
|
1188
|
|
|
|
|
|
|
coordinates are relative unless new() was provided with a reference |
|
1189
|
|
|
|
|
|
|
sequence. This default can be examined and changed using absolute(). |
|
1190
|
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
=head2 automerge() |
|
1192
|
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
$merge = $seq->automerge; |
|
1194
|
|
|
|
|
|
|
$seq->automerge(0); |
|
1195
|
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
This method controls whether groups of features will automatically be |
|
1197
|
|
|
|
|
|
|
merged together by the features() call. If true (the default), then |
|
1198
|
|
|
|
|
|
|
the left and right end of clones will be merged into "clone" features, |
|
1199
|
|
|
|
|
|
|
introns, exons and CDS entries will be merged into |
|
1200
|
|
|
|
|
|
|
Ace::Sequence::Transcript objects, and similarity entries will be |
|
1201
|
|
|
|
|
|
|
merged into Ace::Sequence::GappedAlignment objects. |
|
1202
|
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
=head2 db() |
|
1204
|
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
$db = $seq->db; |
|
1206
|
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
Returns the L database accessor associated with this sequence. |
|
1208
|
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
1210
|
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
L, L, L, |
|
1212
|
|
|
|
|
|
|
L, L |
|
1213
|
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
=head1 AUTHOR |
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
Lincoln Stein with extensive help from Jean |
|
1217
|
|
|
|
|
|
|
Thierry-Mieg |
|
1218
|
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
Many thanks to David Block for finding and |
|
1220
|
|
|
|
|
|
|
fixing the nasty off-by-one errors. |
|
1221
|
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
Copyright (c) 1999, Lincoln D. Stein |
|
1223
|
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
1225
|
|
|
|
|
|
|
it under the same terms as Perl itself. See DISCLAIMER.txt for |
|
1226
|
|
|
|
|
|
|
disclaimers of warranty. |
|
1227
|
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
=cut |
|
1229
|
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
__END__ |