File Coverage

blib/lib/Bio/ToolBox/Data/core.pm
Criterion Covered Total %
statement 414 757 54.6
branch 216 502 43.0
condition 120 239 50.2
subroutine 50 59 84.7
pod 48 49 97.9
total 848 1606 52.8


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::core;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::core - Common functions to Bio:ToolBox::Data family
7              
8             =head1 DESCRIPTION
9              
10             Common methods for metadata and manipulation in a L
11             data table and L file stream. This module
12             should not be used directly. See the respective modules for more information.
13              
14             =cut
15              
16 3     3   29 use strict;
  3         5  
  3         79  
17 3     3   10 use Carp qw(carp cluck croak confess);
  3         4  
  3         116  
18 3     3   12 use base 'Bio::ToolBox::Data::file';
  3         3  
  3         1171  
19 3         261 use Bio::ToolBox::db_helper qw(
20             open_db_connection
21             verify_or_request_feature_types
22             use_bam_adapter
23             use_big_adapter
24 3     3   2259 );
  3         17  
25 3     3   14 use Module::Load;
  3         60  
  3         20  
26              
27             1;
28              
29             #### Initialization and verification ###############################################
30              
31             sub new {
32 73     73 1 129 my $class = shift;
33            
34             # in case someone calls this from an established object
35 73 100       116 if (ref($class)) {
36 1         2 $class = ref($class);
37             }
38            
39             # Initialize the hash structure
40 73         717 my %data = (
41             'program' => undef,
42             'feature' => undef,
43             'feature_type' => undef,
44             'db' => undef,
45             'format' => '',
46             'gff' => 0,
47             'bed' => 0,
48             'ucsc' => 0,
49             'vcf' => 0,
50             'number_columns' => 0,
51             'last_row' => 0,
52             'headers' => 1,
53             'column_names' => [],
54             'filename' => undef,
55             'basename' => undef,
56             'extension' => undef,
57             'path' => undef,
58             'comments' => [],
59             'data_table' => [],
60             'header_line_count' => 0,
61             'zerostart' => 0,
62             );
63            
64             # Finished
65 73         217 return bless \%data, $class;
66             }
67              
68              
69             sub verify {
70             # this function does not rely on any self functions for two reasons
71             # this is a low level integrity checker
72             # this is very old code from before the days of an OO API of Bio::ToolBox
73             # although a lot of things have been added and changed since then....
74 52     52 1 1208 my $self = shift;
75 52   100     125 my $silence = shift || 0; # default is to yell
76            
77             # check for data table
78 52 50 33     211 unless (
79             defined $self->{'data_table'} and
80             ref $self->{'data_table'} eq 'ARRAY'
81             ) {
82 0 0       0 carp sprintf "\n DATA INTEGRITY ERROR: No data table in %s object!", ref $self
83             unless $silence;
84 0         0 return;
85             }
86            
87             # check for last row index
88 52 50       121 if (defined $self->{'last_row'}) {
89 52         49 my $number = scalar( @{ $self->{'data_table'} } ) - 1;
  52         108  
90 52 50       100 if ($self->{'last_row'} != $number) {
91             # carp sprintf "TABLE INTEGRITY ERROR: data table last_row index [%d] doesn't match " .
92             # "metadata value [%d]!\n", $number, $self->{'last_row'};
93             # fix it for them
94 0         0 $self->{'last_row'} = $number;
95             }
96             }
97             else {
98             # define it for them
99             $self->{'last_row'} =
100 0         0 scalar( @{ $self->{'data_table'} } ) - 1;
  0         0  
101             }
102            
103             # check for consistent number of columns
104 52 50       84 if (defined $self->{'number_columns'}) {
105 52         69 my $number = $self->{'number_columns'};
106 52         72 my @problems;
107 52         51 my $too_low = 0;
108 52         57 my $too_high = 0;
109 52         100 for (my $row = 0; $row <= $self->{'last_row'}; $row++) {
110 945         792 my $count = scalar @{ $self->{'data_table'}->[$row] };
  945         975  
111 945 50       1421 if ($count != $number) {
112 0         0 push @problems, $row;
113 0 0       0 $too_low++ if $count < $number;
114 0 0       0 $too_high++ if $count > $number;
115 0         0 while ($count < $number) {
116             # we can sort-of-fix this problem
117 0         0 $self->{'data_table'}->[$row][$count] = '.';
118 0         0 $count++;
119             }
120             }
121             }
122            
123             # we found errors
124 52 50       103 if (@problems) {
125             # collapse problem list into compact string
126             # from http://www.perlmonks.org/?node_id=87538
127 0         0 my $problem = join(',', @problems);
128 0         0 $problem =~ s/(?
  0         0  
129              
130 0 0       0 if ($too_low) {
131 0 0       0 print "\n COLUMN INCONSISTENCY ERRORS: $too_low rows had fewer than expected " .
132             "columns!\n padded rows $problem with null values\n"
133             unless $silence;
134             }
135 0 0       0 if ($too_high) {
136 0 0       0 print "\n COLUMN INCONSISTENCY ERRORS: $too_high rows had more columns than " .
137             "expected!\n Problem rows: $problem\n"
138             unless $silence;
139 0         0 return;
140             }
141             }
142             }
143             else {
144             # this wasn't set???? then set it
145             $self->{'number_columns'} =
146 0         0 scalar @{ $self->{'data_table'}->[0] };
  0         0  
147             }
148            
149             # check metadata and table names
150 52         60 my $mdcheck = 0;
151 52         119 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
152 415 50       632 unless (
153             $self->{$i}{'name'} eq
154             $self->{'data_table'}->[0][$i]
155             ) {
156             printf( "\n TABLE/METADATA MISMATCH ERROR: Column header names don't" .
157             " match metadata name values for index $i!" .
158             "\n compare '%s' with '%s'\n", $self->{'data_table'}->[0][$i],
159 0 0       0 $self->{$i}{'name'} ) unless $silence;
160 0         0 $mdcheck++;
161             }
162 415 50       752 unless ($self->{$i}{'index'} == $i) {
163             printf( "\n METADATA INDEX ERROR: index $i metadata doesn't match its " .
164 0 0       0 "index value %s\n", $self->{$i}{'index'} ) unless $silence;
165 0         0 $mdcheck++;
166             }
167             }
168 52 50       83 return if $mdcheck;
169            
170             ### Defined file format structure integrity
171 52         61 my $error;
172            
173             # check for proper gff structure
174 52 100       97 if ($self->{'gff'}) {
175             # if any of these checks fail, we will reset the gff version to
176             # the default of 0, or no gff
177 6         9 my $gff_check = 1; # start with assumption it is true
178            
179             # check number of columns
180 6 100       15 if ($self->{'number_columns'} != 9) {
181 1         3 $gff_check = 0;
182 1         4 $error .= " Number of columns not 9.";
183             }
184            
185             # check column indices
186 6 50 33     51 if (
187             # column 0 should look like chromosome
188             exists $self->{0} and
189             $self->{0}{'name'} !~
190             m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i
191             ) {
192 0         0 $gff_check = 0;
193 0         0 $error .= " Column 0 name not chromosome-like.";
194             }
195 6 50 33     46 if (
196             # column 3 should look like start
197             exists $self->{3} and
198             $self->{3}{'name'} !~ m/start|pos|position/i
199             ) {
200 0         0 $gff_check = 0;
201 0         0 $error .= " Column 3 name not start-like.";
202             }
203 6 50 33     33 if (
204             # column 4 should look like end
205             exists $self->{4} and
206             $self->{4}{'name'} !~ m/stop|end|pos|position/i
207             ) {
208 0         0 $gff_check = 0;
209 0         0 $error .= " Column 4 name not stop-like.";
210             }
211 6 50 33     24 if (
212             # column 6 should look like strand
213             exists $self->{6} and
214             $self->{6}{'name'} !~ m/strand/i
215             ) {
216 0         0 $gff_check = 0;
217 0         0 $error .= " Column 6 name not strand-like.";
218             }
219            
220             # check column data
221 6 50       23 unless ($self->_column_is_integers(3,4)) {
222 0         0 $gff_check = 0;
223 0         0 $error .= " Columns 3,4 not integers.";
224             }
225 6 50       20 unless ($self->_column_is_numeric(5)) {
226 0         0 $gff_check = 0;
227 0         0 $error .= " Column 5 not numeric.";
228             }
229 6 50       21 unless ($self->_column_is_stranded(6)) {
230 0         0 $gff_check = 0;
231 0         0 $error .= " Column 6 not strand values.";
232             }
233            
234             # update gff value as necessary
235 6 100       20 if ($gff_check == 0) {
236             # reset metadata
237 1         2 $self->{'gff'} = 0;
238 1         3 $self->{'headers'} = 1;
239            
240             # remove the AUTO key from the metadata
241 1         3 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
242 8 50       10 if (exists $self->{$i}{'AUTO'}) {
243 8         15 delete $self->{$i}{'AUTO'};
244             }
245             }
246 1 50       3 print "\n GFF FILE FORMAT ERROR: $error\n" unless $silence;
247             }
248             }
249            
250             # check for proper BED structure
251 52 100       88 if ($self->{'bed'}) {
252             # if any of these checks fail, we will reset the bed flag to 0
253             # to make it not a bed file format
254 21         29 my $bed_check = 1; # start with assumption it is correct
255            
256             # check number of columns
257 21 50       63 if ($self->{'number_columns'} < 3) {
258 0         0 $bed_check = 0;
259 0         0 $error .= " Number of columns not at least 3.";
260             }
261            
262             # check column index names
263 21 50 33     160 if (
264             exists $self->{0} and
265             $self->{0}{'name'} !~
266             m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i
267             ) {
268 0         0 $bed_check = 0;
269 0         0 $error .= " Column 0 name not chromosome-like.";
270             }
271 21 50 33     141 if (
272             exists $self->{1} and
273             $self->{1}{'name'} !~ m/start|pos|position/i
274             ) {
275 0         0 $bed_check = 0;
276 0         0 $error .= " Column 1 name not start-like.";
277             }
278 21 50 33     99 if (
279             exists $self->{2} and
280             $self->{2}{'name'} !~ m/stop|end|pos|position/i
281             ) {
282 0         0 $bed_check = 0;
283 0         0 $error .= " Column 2 name not stop-like.";
284             }
285 21 50 66     78 if (
286             exists $self->{5} and
287             $self->{5}{'name'} !~ m/strand/i
288             ) {
289 0         0 $bed_check = 0;
290 0         0 $error .= " Column 5 name not strand-like.";
291             }
292 21 50 100     129 if (
      66        
293             exists $self->{6} and
294             $self->{'format'} !~ /narrow|broad/i and
295             $self->{6}{'name'} !~ m/start|thick|cds/i
296             ) {
297 0         0 $bed_check = 0;
298 0         0 $error .= " Column 6 name not thickStart-like.";
299             }
300 21 50 100     108 if (
      66        
301             exists $self->{7} and
302             $self->{'format'} !~ /narrow|broad/i and
303             $self->{7}{'name'} !~ m/end|stop|thick|cds/i
304             ) {
305 0         0 $bed_check = 0;
306 0         0 $error .= " Column 7 name not thickEnd-like.";
307             }
308 21 50 100     85 if (
      66        
309             exists $self->{8} and
310             $self->{'format'} !~ /narrow|broad/i and
311             $self->{8}{'name'} !~ m/item|rgb|color/i
312             ) {
313 0         0 $bed_check = 0;
314 0         0 $error .= " Column 8 name not itemRGB-like.";
315             }
316 21 50 100     97 if (
      66        
317             exists $self->{9} and
318             $self->{'format'} !~ /narrow|broad/i and
319             $self->{9}{'name'} !~ m/count|number|block|exon/i
320             ) {
321 0         0 $bed_check = 0;
322 0         0 $error .= " Column 9 name not blockCount-like.";
323             }
324 21 50 66     65 if (
325             exists $self->{10} and
326             $self->{10}{'name'} !~ m/size|length|block|exon/i
327             ) {
328 0         0 $bed_check = 0;
329 0         0 $error .= " Column 10 name not blockSizes-like.";
330             }
331 21 50 66     64 if (
332             exists $self->{11} and
333             $self->{11}{'name'} !~ m/start|block|exon/i
334             ) {
335 0         0 $bed_check = 0;
336 0         0 $error .= " Column 11 name not blockStarts-like.";
337             }
338            
339             # check column data
340 21 50       64 unless ($self->_column_is_integers(1,2)) {
341 0         0 $bed_check = 0;
342 0         0 $error .= " Columns 1,2 not integers.";
343             }
344 21 100       75 if ($self->{'number_columns'} >= 5) {
345             # only check if it is actually present, since could be optional
346 15 50       36 unless ($self->_column_is_numeric(4) ) {
347 0         0 $bed_check = 0;
348 0         0 $error .= " Column 4 not numeric.";
349             }
350             }
351 21 100       41 if ($self->{'number_columns'} >= 6) {
352             # only check if it is actually present, since could be optional
353 15 50       33 unless ($self->_column_is_stranded(5) ) {
354 0         0 $bed_check = 0;
355 0         0 $error .= " Column 5 not strand values.";
356             }
357             }
358 21 100 100     77 if ($self->{'format'} and
359             $self->{'format'} =~ /narrow|broad/i) {
360 4 50       11 unless ($self->_column_is_numeric(6,7,8) ) {
361 0         0 $bed_check = 0;
362 0         0 $error .= " Columns 6,7,8 not numeric.";
363             }
364             }
365 21 100       55 if ($self->{'number_columns'} == 12) {
366             # bed12 has extra special limitations
367 3 50       7 unless ($self->_column_is_integers(6,7,9) ) {
368 0         0 $bed_check = 0;
369 0         0 $error .= " Column 6,7,9 not integers.";
370             }
371 3 50       5 unless ($self->_column_is_comma_integers(10,11) ) {
372 0         0 $bed_check = 0;
373 0         0 $error .= " Column 10,11 not comma-delimited integers.";
374             }
375             }
376 21 100 66     844 if (
377             $self->{'number_columns'} == 15 and
378             $self->{'format'} =~ /gapped/i
379             ) {
380             # gappedPeak has extra special limitations
381 4 50       9 unless ($self->_column_is_integers(6,7,9) ) {
382 0         0 $bed_check = 0;
383 0         0 $error .= " Column 6,7,9 not integers.";
384             }
385 4 50       20 unless ($self->_column_is_comma_integers(10,11) ) {
386 0         0 $bed_check = 0;
387 0         0 $error .= " Column 10,11 not comma-delimited integers.";
388             }
389 4 50       8 unless ($self->_column_is_numeric(12,13,14) ) {
390 0         0 $bed_check = 0;
391 0         0 $error .= " Columns 12,13,14 not numeric.";
392             }
393             }
394            
395             # peak file format
396 21 50 100     91 if ($self->{'format'} and
      66        
397             $self->{'format'} =~ /narrowpeak/i and
398             $self->{'number_columns'} != 10
399             ) {
400 0         0 $bed_check = 0;
401 0         0 $error .= " NarrowPeak has 10 columns only.";
402             }
403 21 50 66     81 if ($self->{'format'} and
      33        
404             $self->{'format'} =~ /broadpeak/i and
405             $self->{'number_columns'} != 9
406             ) {
407 0         0 $bed_check = 0;
408 0         0 $error .= " BroadPeak has 9 columns only.";
409             }
410 21 50 100     68 if ($self->{'format'} and
      66        
411             $self->{'format'} =~ /gappedpeak/i and
412             $self->{'number_columns'} != 15
413             ) {
414 0         0 $bed_check = 0;
415 0         0 $error .= " GappeddPeak has 15 columns only.";
416             }
417            
418             # reset the BED tag value as appropriate
419 21 50       38 if ($bed_check) {
420 21         42 $self->{'bed'} = $self->{'number_columns'}; # in case we had a fake true
421             }
422             else {
423             # reset metadata
424 0         0 $self->{'bed'} = 0;
425 0         0 $self->{'headers'} = 1;
426 0         0 my $ext = $self->{'extension'};
427 0         0 $self->{'filename'} =~ s/$ext/.txt/;
428 0         0 $self->{'extension'} = '.txt';
429 0         0 $self->{'format'} = '';
430            
431             # remove the AUTO key from the metadata
432 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
433 0 0       0 if (exists $self->{$i}{'AUTO'}) {
434 0         0 delete $self->{$i}{'AUTO'};
435             }
436             }
437 0 0       0 print "\n BED FILE FORMAT ERROR: $error\n" unless $silence;
438             }
439             }
440            
441             # check refFlat or genePred gene structure
442 52 100       94 if ($self->{'ucsc'}) {
443             # if any of these checks fail, we will reset the extension
444 6         7 my $ucsc_check = 1; # start with assumption it is correct
445 6         10 my $ucsc_type;
446            
447             # check number of columns
448 6         7 my $colnumber = $self->{number_columns};
449 6 50 0     12 if ($colnumber == 16) {
    0          
    0          
    0          
450 6         7 $ucsc_type = 'genePredExtBin';
451             # genePredExt with bin
452             # bin name chrom strand txStart txEnd cdsStart cdsEnd
453             # exonCount exonStarts exonEnds score name2 cdsStartSt
454             # cdsEndStat exonFrames
455 6 50       25 unless($self->{2}{name} =~
456             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
457 0         0 $ucsc_check = 0;
458 0         0 $error .= " Column 2 name not chromosome-like.";
459             }
460 6 100       20 unless($self->{4}{name} =~ /start|position/i) {
461 3         4 $ucsc_check = 0;
462 3         6 $error .= " Column 4 name not start-like.";
463             }
464 6 100       17 unless($self->{5}{name} =~ /stop|end|position/i) {
465 3         4 $ucsc_check = 0;
466 3         4 $error .= " Column 5 name not stop-like.";
467             }
468 6 100       16 unless($self->{6}{name} =~ /start|position/i) {
469 3         3 $ucsc_check = 0;
470 3         4 $error .= " Column 6 name not start-like.";
471             }
472 6 100       16 unless($self->{7}{name} =~ /stop|end|position/i) {
473 3         3 $ucsc_check = 0;
474 3         4 $error .= " Column 7 name not stop-like.";
475             }
476 6 50       12 unless($self->_column_is_integers(4,5,6,7,8)) {
477 0         0 $ucsc_check = 0;
478 0         0 $error .= " Columns 4,5,6,7,8 not integers.";
479             }
480 6 50       20 unless ($self->_column_is_comma_integers(9,10)) {
481 0         0 $ucsc_check = 0;
482 0         0 $error .= " Columns 9,10 not comma-delimited integers.";
483             }
484 6 50       11 unless($self->_column_is_stranded(3)) {
485 0         0 $ucsc_check = 0;
486 0         0 $error .= " Column 3 not strand values.";
487             }
488             }
489             elsif ($colnumber == 15 or $colnumber == 12) {
490 0 0       0 $ucsc_type = $colnumber == 15 ? 'genePredExt' : 'knownGene';
491             # GenePredExt
492             # name chrom strand txStart txEnd cdsStart cdsEnd
493             # exonCount exonStarts exonEnds score name2 cdsStartSt
494             # cdsEndStat exonFrames
495             # or knownGene
496             # name chrom strand txStart txEnd cdsStart cdsEnd
497             # exonCount exonStarts exonEnds proteinID alignID
498 0 0       0 unless($self->{1}{name} =~
499             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
500 0         0 $ucsc_check = 0;
501 0         0 $error .= " Column 1 name not chromosome-like.";
502             }
503 0 0       0 unless($self->{3}{name} =~ /start|position/i) {
504 0         0 $ucsc_check = 0;
505 0         0 $error .= " Column 3 name not start-like.";
506             }
507 0 0       0 unless($self->{4}{name} =~ /stop|end|position/i) {
508 0         0 $ucsc_check = 0;
509 0         0 $error .= " Column 4 name not stop-like.";
510             }
511 0 0       0 unless($self->{5}{name} =~ /start|position/i) {
512 0         0 $ucsc_check = 0;
513 0         0 $error .= " Column 5 name not start-like.";
514             }
515 0 0       0 unless($self->{6}{name} =~ /stop|end|position/i) {
516 0         0 $ucsc_check = 0;
517 0         0 $error .= " Column 6 name not stop-like.";
518             }
519 0 0       0 unless($self->_column_is_integers(3,4,5,6,7)) {
520 0         0 $ucsc_check = 0;
521 0         0 $error .= " Columns 3,4,5,6,7 not integers.";
522             }
523 0 0       0 unless ($self->_column_is_comma_integers(8,9)) {
524 0         0 $ucsc_check = 0;
525 0         0 $error .= " Columns 8,9 not comma-delimited integers.";
526             }
527 0 0       0 unless($self->_column_is_stranded(2)) {
528 0         0 $ucsc_check = 0;
529 0         0 $error .= " Column 2 not strand values.";
530             }
531             }
532             elsif ($colnumber == 11) {
533 0         0 $ucsc_type = 'refFlat';
534             # geneName transcriptName chrom strand txStart txEnd
535             # cdsStart cdsEnd exonCount exonStarts exonEnds
536 0 0       0 unless($self->{2}{name} =~
537             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
538 0         0 $ucsc_check = 0;
539 0         0 $error .= " Column 2 name not chromosome-like.";
540             }
541 0 0       0 unless($self->{4}{name} =~ /start|position/i) {
542 0         0 $ucsc_check = 0;
543 0         0 $error .= " Column 4 name not start-like.";
544             }
545 0 0       0 unless($self->{5}{name} =~ /stop|end|position/i) {
546 0         0 $ucsc_check = 0;
547 0         0 $error .= " Column 5 name not stop-like.";
548             }
549 0 0       0 unless($self->{6}{name} =~ /start|position/i) {
550 0         0 $ucsc_check = 0;
551 0         0 $error .= " Column 6 name not start-like.";
552             }
553 0 0       0 unless($self->{7}{name} =~ /stop|end|position/i) {
554 0         0 $ucsc_check = 0;
555 0         0 $error .= " Column 7 name not stop-like.";
556             }
557 0 0       0 unless($self->_column_is_integers(4,5,6,7,8)) {
558 0         0 $ucsc_check = 0;
559 0         0 $error .= " Columns 4,5,6,7,8 not integers.";
560             }
561 0 0       0 unless ($self->_column_is_comma_integers(9,10)) {
562 0         0 $ucsc_check = 0;
563 0         0 $error .= " Columns 9,10 not comma-delimited integers.";
564             }
565 0 0       0 unless($self->_column_is_stranded(3)) {
566 0         0 $ucsc_check = 0;
567 0         0 $error .= " Column 3 not strand values.";
568             }
569             }
570             elsif ($colnumber == 10) {
571 0         0 $ucsc_type = 'genePred';
572             # name chrom strand txStart txEnd cdsStart cdsEnd
573             # exonCount exonStarts exonEnds
574 0 0       0 unless($self->{1}{name} =~
575             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
576 0         0 $ucsc_check = 0;
577 0         0 $error .= " Column 1 name not chromosome-like.";
578             }
579 0 0       0 unless($self->{3}{name} =~ /start|position/i) {
580 0         0 $ucsc_check = 0;
581 0         0 $error .= " Column 3 name not start-like.";
582             }
583 0 0       0 unless($self->{4}{name} =~ /stop|end|position/i) {
584 0         0 $ucsc_check = 0;
585 0         0 $error .= " Column 4 name not stop-like.";
586             }
587 0 0       0 unless($self->{5}{name} =~ /start|position/i) {
588 0         0 $ucsc_check = 0;
589 0         0 $error .= " Column 5 name not start-like.";
590             }
591 0 0       0 unless($self->{6}{name} =~ /stop|end|position/i) {
592 0         0 $ucsc_check = 0;
593 0         0 $error .= " Column 6 name not stop-like.";
594             }
595 0 0       0 unless($self->_column_is_integers(3,4,5,6,7)) {
596 0         0 $ucsc_check = 0;
597 0         0 $error .= " Columns 3,4,5,6,7 not integers.";
598             }
599 0 0       0 unless ($self->_column_is_comma_integers(8,9)) {
600 0         0 $ucsc_check = 0;
601 0         0 $error .= " Columns 8,9 not comma-delimited integers.";
602             }
603 0 0       0 unless($self->_column_is_stranded(2)) {
604 0         0 $ucsc_check = 0;
605 0         0 $error .= " Column 2 not strand values.";
606             }
607             }
608             else {
609 0         0 $ucsc_type = 'UCSC';
610 0         0 $ucsc_check = 0;
611 0         0 $error .= " Wrong # of columns.";
612             }
613              
614 6 100       11 if ($ucsc_check == 0) {
615             # failed the check
616 3         6 my $ext = $self->{'extension'};
617 3         38 $self->{'filename'} =~ s/$ext/.txt/;
618 3         7 $self->{'extension'} = '.txt';
619 3         4 $self->{'ucsc'} = 0;
620 3         5 $self->{'format'} = '';
621            
622             # remove the AUTO key
623 3         9 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
624 48 50       70 if (exists $self->{$i}{'AUTO'}) {
625 48         82 delete $self->{$i}{'AUTO'};
626             }
627             }
628 3 50       7 print "\n $ucsc_type FILE FORMAT ERROR: $error\n" unless $silence;
629             }
630             }
631            
632             # check VCF format
633 52 50       95 if ($self->{vcf}) {
634             # if any of these checks fail, we will reset the vcf flag to 0
635             # to make it not a vcf file format
636 0         0 my $vcf_check = 1; # start with assumption it is correct
637            
638             # check number of columns
639 0 0       0 if ($self->{'number_columns'} < 8) {
640 0         0 $vcf_check = 0;
641 0         0 $error .= " Number of Columns is too few.";
642             }
643            
644             # check column index names
645 0 0       0 if ($self->{0}{'name'} !~ m/chrom/i) {
646 0         0 $vcf_check = 0;
647 0         0 $error .= " Column 0 name not chromosome.";
648             }
649 0 0 0     0 if (
650             exists $self->{1} and
651             $self->{1}{'name'} !~ m/^pos|start/i
652             ) {
653 0         0 $vcf_check = 0;
654 0         0 $error .= " Column 1 name not position.";
655             }
656            
657             # check column data
658 0 0       0 unless ($self->_column_is_integers(1)) {
659 0         0 $vcf_check = 0;
660 0         0 $error .= " Columns 1 not integers.";
661             }
662            
663             # reset the vcf tag value as appropriate
664 0 0       0 if ($vcf_check) {
665             # in case we had a fake true set it to a more reasonable value?
666 0 0       0 $self->{'vcf'} = 4 if $self->{'vcf'} == 1;
667             }
668             else {
669             # reset metadata
670 0         0 $self->{'vcf'} = 0;
671 0         0 $self->{'headers'} = 1;
672 0         0 $self->{'format'} = '';
673            
674             # remove the AUTO key from the metadata
675 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
676 0 0       0 if (exists $self->{$i}{'AUTO'}) {
677 0         0 delete $self->{$i}{'AUTO'};
678             }
679             }
680 0 0       0 print "\n VCF FILE FORMAT ERROR: $error\n" unless $silence;
681             }
682             }
683            
684             # check proper SGR file structure
685 52 50 66     266 if (exists $self->{'extension'} and
      66        
686             defined $self->{'extension'} and
687             $self->{'extension'} =~ /sgr/i
688             ) {
689             # there is no sgr field in the data structure
690             # so we're just checking for the extension
691             # we will change the extension as necessary if it doesn't conform
692 0         0 my $sgr_check = 1;
693 0 0       0 if ($self->{'number_columns'} != 3) {
694 0         0 $sgr_check = 0;
695 0         0 $error .= " Column number is not 3.";
696             }
697 0 0       0 if ($self->{0}{'name'} !~ m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
698 0         0 $sgr_check = 0;
699 0         0 $error .= " Column 0 name not chromosome-like.";
700             }
701 0 0       0 if ($self->{1}{'name'} !~ /start|position/i) {
702 0         0 $sgr_check = 0;
703 0         0 $error .= " Column 1 name not start-like.";
704             }
705 0 0       0 unless ($self->_column_is_integers(1)) {
706 0         0 $sgr_check = 0;
707 0         0 $error .= " Columns 1 not integers.";
708             }
709 0 0       0 if ($sgr_check == 0) {
710             # doesn't smell like a SGR file
711             # change the extension so the write subroutine won't think it is
712             # make it a text file
713 0         0 $self->{'extension'} =~ s/sgr/txt/i;
714 0         0 $self->{'filename'} =~ s/sgr/txt/i;
715 0         0 $self->{'headers'} = 1;
716 0         0 $self->{'format'} = '';
717            
718             # remove the AUTO key from the metadata
719 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
720 0 0       0 if (exists $self->{$i}{'AUTO'}) {
721 0         0 delete $self->{$i}{'AUTO'};
722             }
723             }
724 0 0       0 print "\n SGR FILE FORMAT ERROR: $error\n" unless $silence;
725             }
726             }
727            
728             # check file headers value because this may have changed
729             # this can happen when we reset bed/gff/vcf flags when we add columns
730 52 100 100     378 if (
    100 100        
      66        
      66        
      33        
      33        
      66        
      33        
731             $self->{'bed'} or
732             $self->{'gff'} or
733             $self->{'ucsc'} or
734             ($self->{'extension'} and $self->{'extension'} =~ /sgr/i)
735             ) {
736 29         46 $self->{'headers'} = 0;
737             }
738             elsif (
739             $self->{'bed'} == 0 and
740             $self->{'gff'} == 0 and
741             $self->{'ucsc'} == 0 and
742             ($self->{'extension'} and $self->{'extension'} !~ /sgr/i)
743             ) {
744 18 50       45 $self->{'headers'} = 1 unless $self->{'headers'} == -1;
745             }
746            
747             # if we have made it here, then there were no major structural problems
748             # file is verified, any minor issues should have been fixed
749 52         128 return 1;
750             }
751              
752             # internal method to check if a column is nothing but integers, i.e. start, stop
753             sub _column_is_integers {
754 40     40   46 my $self = shift;
755 40         78 my @index = @_;
756 40 100       94 return 1 if ($self->{last_row} == 0); # can't check if table is empty
757 38         82 foreach (@index) {
758 101 50       181 return 0 unless exists $self->{$_};
759             }
760 38         77 for my $row (1 .. $self->{last_row}) {
761 709         686 for my $i (@index) {
762 1648 50       2728 return 0 unless ($self->{data_table}->[$row][$i] =~ /^\d+$/);
763             }
764             }
765 38         99 return 1;
766             }
767              
768             # internal method to check if a column appears numeric, i.e. scores
769             sub _column_is_numeric {
770 29     29   36 my $self = shift;
771 29         42 my @index = @_;
772 29 50       55 return 1 if ($self->{last_row} == 0); # can't check if table is empty
773 29         35 foreach (@index) {
774 45 50       79 return 0 unless exists $self->{$_};
775             }
776 29         45 for my $row (1 .. $self->{last_row}) {
777 327         344 for my $i (@index) {
778             # we have a very loose definition of numeric: exponents, signs, commas
779 407 50       698 return 0 unless ($self->{data_table}->[$row][$i] =~ /^[\d\-\+\.,eE]+$/);
780             }
781             }
782 29         67 return 1;
783             }
784              
785              
786              
787             # internal method to check if a column is nothing but comma delimited integers
788             sub _column_is_comma_integers {
789 13     13   17 my $self = shift;
790 13         19 my @index = @_;
791 13 50       26 return 1 if ($self->{last_row} == 0); # can't check if table is empty
792 13         18 foreach (@index) {
793 26 50       46 return 0 unless exists $self->{$_};
794             }
795 13         33 for my $row (1 .. $self->{last_row}) {
796 110         125 for my $i (@index) {
797 220 50       400 return 0 unless ($self->{data_table}->[$row][$i] =~ /^[\d,]+$/);
798             }
799             }
800 13         24 return 1;
801             }
802              
803             # internal method to check if a column looks like a strand column
804             sub _column_is_stranded {
805 27     27   40 my ($self, $index) = @_;
806 27 50       56 return unless exists $self->{$index};
807 27         49 for my $row (1 .. $self->{last_row}) {
808 347 50       623 return 0 if ($self->{data_table}->[$row][$index] !~ /^(?:\-1|0|1|\+|\-|\.)$/);
809             }
810 27         50 return 1;
811             }
812              
813              
814              
815              
816              
817             #### Database methods ##############################################################
818              
819             sub open_database {
820 1     1 1 3 my $self = shift;
821 1 50 0     3 if (not defined $_[0]) {
    0          
    0          
822 1         4 return $self->open_meta_database;
823             }
824             elsif ($_[0] eq '0' or $_[0] eq '1') {
825             # likely a boolean value to indicate force
826 0         0 return $self->open_meta_database($_[0]);
827             }
828             elsif ($_[0] =~ /[a-zA-Z]+/) {
829             # likely the name of a database
830 0         0 return $self->open_new_database(@_);
831             }
832             else {
833             # original default
834 0         0 return $self->open_meta_database(@_);
835             }
836             }
837              
838             sub open_meta_database {
839 15     15 1 27 my $self = shift;
840 15   50     42 my $force = shift || 0;
841 15 50       30 return unless $self->{db};
842 15 50       41 return if $self->{db} =~ /^Parsed:/; # we don't open parsed annotation files
843 15 100       27 if (exists $self->{db_connection}) {
844 14 50       61 return $self->{db_connection} unless $force;
845             }
846 1         4 my $db = open_db_connection($self->{db}, $force);
847 1 50       3 return unless $db;
848 1         2 $self->{db_connection} = $db;
849 1         3 return $db;
850             }
851              
852             sub open_new_database {
853 0     0 1 0 my $self = shift;
854 0         0 my $database = shift;
855 0   0     0 my $force = shift || 0;
856 0         0 return open_db_connection($database, $force);
857             }
858              
859             sub verify_dataset {
860 12     12 1 27 my ($self, $dataset, $database) = @_;
861 12 50       23 return unless $dataset;
862 12 100       36 if (exists $self->{verfied_dataset}{$dataset}) {
863 11         32 return $self->{verfied_dataset}{$dataset};
864             }
865             else {
866 1 50       5 if ($dataset =~ /^(?:file|http|ftp)/) {
867             # local or remote file already verified?
868 0         0 $self->{verfied_dataset}{$dataset} = $dataset;
869 0         0 return $dataset;
870             }
871 1   33     2 $database ||= $self->open_meta_database;
872 1         5 my ($verified) = verify_or_request_feature_types(
873             # normally returns an array of verified features, we're only checking one
874             db => $database,
875             feature => $dataset,
876             );
877 1 50       3 if ($verified) {
878 1         2 $self->{verfied_dataset}{$dataset} = $verified;
879 1         3 return $verified;
880             }
881             }
882 0         0 return;
883             }
884              
885              
886              
887             #### Column Manipulation ####
888              
889             sub delete_column {
890 2     2 1 5 my $self = shift;
891            
892             # check for Stream
893 2 50       7 if (ref $self eq 'Bio::ToolBox::Data::Stream') {
894 0 0       0 unless ($self->mode) {
895 0         0 cluck "We have a read-only Stream object, cannot add columns";
896 0         0 return;
897             }
898 0 0       0 if (defined $self->{fh}) {
899             # Stream file handle is opened
900 0         0 cluck "Cannot modify columns when a Stream file handle is opened!";
901 0         0 return;
902             }
903             }
904 2 50       8 unless (@_) {
905 0         0 cluck "must provide a list";
906 0         0 return;
907             }
908            
909 2         6 my @deletion_list = sort {$a <=> $b} @_;
  0         0  
910 2         3 my @retain_list;
911 2         7 for (my $i = 0; $i < $self->number_columns; $i++) {
912             # compare each current index with the first one in the list of
913             # deleted indices. if it matches, delete. if not, keep
914 18 100       22 if ( $i == $deletion_list[0] ) {
915             # this particular index should be deleted
916 2         3 shift @deletion_list;
917             }
918             else {
919             # this particular index should be kept
920 16         23 push @retain_list, $i;
921             }
922             }
923 2         8 return $self->reorder_column(@retain_list);
924             }
925              
926             sub reorder_column {
927 3     3 1 3 my $self = shift;
928            
929             # check for Stream
930 3 50       11 if (ref $self eq 'Bio::ToolBox::Data::Stream') {
931 0 0       0 unless ($self->mode) {
932 0         0 cluck "We have a read-only Stream object, cannot add columns";
933 0         0 return;
934             }
935 0 0       0 if (defined $self->{fh}) {
936             # Stream file handle is opened
937 0         0 cluck "Cannot modify columns when a Stream file handle is opened!";
938 0         0 return;
939             }
940             }
941            
942             # reorder data table
943 3 50       6 unless (@_) {
944 0         0 carp "must provide a list";
945 0         0 return;
946             }
947 3         8 my @order = @_;
948 3         9 for (my $row = 0; $row <= $self->last_row; $row++) {
949 237         324 my @old = $self->row_values($row);
950 237         264 my @new = map { $old[$_] } @order;
  1580         1619  
951 237         212 splice( @{ $self->{data_table} }, $row, 1, \@new);
  237         521  
952             }
953            
954             # reorder metadata
955 3         4 my %old_metadata;
956 3         11 for (my $i = 0; $i < $self->number_columns; $i++) {
957             # copy the metadata info hash into a temporary hash
958 28         52 $old_metadata{$i} = $self->{$i};
959 28         40 delete $self->{$i}; # delete original
960             }
961 3         9 for (my $i = 0; $i < scalar(@order); $i++) {
962             # now copy back from the old_metadata into the main data hash
963             # using the new index number in the @order array
964             # must regenerate the hash, not just link to the old anonymous hash, in
965             # case we're duplicating columns
966 20         27 $self->{$i} = {};
967 20         18 foreach my $k (keys %{ $old_metadata{$order[$i]} }) {
  20         45  
968 50         75 $self->{$i}{$k} = $old_metadata{$order[$i]}{$k};
969             }
970             # assign new index number
971 20         33 $self->{$i}{'index'} = $i;
972             }
973 3         4 $self->{'number_columns'} = scalar @order;
974 3 100       14 delete $self->{column_indices} if exists $self->{column_indices};
975 3 50 66     10 if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
      66        
      33        
976             # check if we maintain integrity, at least insofar what we test
977 1         4 $self->verify(1); # silence so user doesn't get these messages
978             }
979 3         22 return 1;
980             }
981              
982              
983              
984             #### General Metadata ####
985              
986             sub feature {
987 54     54 1 537 my $self = shift;
988 54 100       99 if (@_) {
989 20         29 $self->{feature} = shift;
990             }
991 54         140 return $self->{feature};
992             }
993              
994             sub feature_type {
995 37     37 1 51 my $self = shift;
996 37 50       75 carp "feature_type is a read only method" if @_;
997 37 100       88 if (defined $self->{feature_type}) {
998 24         87 return $self->{feature_type};
999             }
1000 13         77 my $feature_type;
1001 13 100 66     48 if (defined $self->chromo_column and defined $self->start_column) {
    50 0        
      33        
      0        
      0        
1002 8         16 $feature_type = 'coordinate';
1003             }
1004             elsif (defined $self->id_column or
1005             ( defined $self->type_column and defined $self->name_column ) or
1006             ( defined $self->feature and defined $self->name_column )
1007             ) {
1008 5         9 $feature_type = 'named';
1009             }
1010             else {
1011 0         0 $feature_type = 'unknown';
1012             }
1013 13         26 $self->{feature_type} = $feature_type;
1014 13         52 return $feature_type;
1015             }
1016              
1017             sub program {
1018 68     68 1 80 my $self = shift;
1019 68 100       119 if (@_) {
1020 60         97 $self->{program} = shift;
1021             }
1022 68         163 return $self->{program};
1023             }
1024              
1025             sub database {
1026 46     46 1 512 my $self = shift;
1027 46 100       86 if (@_) {
1028 20         27 $self->{db} = shift;
1029 20 0 33     41 if (exists $self->{db_connection} and $self->{db_connection}) {
1030 0         0 $self->open_meta_database(1);
1031             }
1032             }
1033 46         160 return $self->{db};
1034             }
1035              
1036             sub bam_adapter {
1037 0     0 1 0 my $self = shift;
1038 0         0 return use_bam_adapter(@_);
1039             }
1040              
1041             sub big_adapter {
1042 0     0 1 0 my $self = shift;
1043 0         0 return use_big_adapter(@_);
1044             }
1045              
1046             sub format {
1047 223     223 1 277 my $self = shift;
1048 223 100       323 if (defined $_[0]) {
1049 72         108 $self->{format} = $_[0];
1050             }
1051 223         875 return $self->{format};
1052             }
1053              
1054             sub gff {
1055 102     102 1 2251 my $self = shift;
1056 102 100 66     234 if (defined $_[0] and $_[0] =~ /^(?:0|1|2|2\.[2|5]|3)$/) {
1057 5         11 $self->{gff} = $_[0];
1058             }
1059 102         338 return $self->{gff};
1060             }
1061              
1062             sub bed {
1063 137     137 1 513 my $self = shift;
1064 137 100 66     406 if (defined $_[0] and $_[0] =~ /^\d+$/) {
1065 43         71 $self->{bed} = $_[0];
1066             }
1067 137         393 return $self->{bed};
1068             }
1069              
1070             sub ucsc {
1071 62     62 1 130 my $self = shift;
1072 62 100 66     153 if (defined $_[0] and $_[0] =~ /^\d+$/) {
1073 3         8 $self->{ucsc} = $_[0];
1074             }
1075 62         204 return $self->{ucsc};
1076             }
1077              
1078             sub vcf {
1079 44     44 1 55 my $self = shift;
1080 44 50 33     76 if (defined $_[0] and $_[0] =~ /^[\d\.]+$/) {
1081 0         0 $self->{vcf} = $_[0];
1082             }
1083 44         122 return $self->{vcf};
1084             }
1085              
1086             sub number_columns {
1087 553     553 1 5980 my $self = shift;
1088 553 50       742 carp "number_columns is a read only method" if @_;
1089 553         1094 return $self->{number_columns};
1090             }
1091              
1092             sub number_rows {
1093 3     3 1 7 my $self = shift;
1094 3 50       10 carp "number_rows is a read only method" if @_;
1095 3         13 return $self->{last_row};
1096             }
1097              
1098             sub last_column {
1099 0     0 1 0 my $self = shift;
1100 0 0       0 carp "last_column is a read only method" if @_;
1101 0         0 return $self->{number_columns} - 1;
1102             }
1103              
1104             sub last_row {
1105 892     892 1 2665 my $self = shift;
1106 892 50       1214 carp "last_row is a read only method" if @_;
1107 892         1586 return $self->{last_row};
1108             }
1109              
1110             sub filename {
1111 138     138 1 205 my $self = shift;
1112 138 50       225 carp "filename is a read only method. Use add_file_metadata()." if @_;
1113 138         400 return $self->{filename};
1114             }
1115              
1116             sub basename {
1117 29     29 1 2087 my $self = shift;
1118 29 50       59 carp "basename is a read only method. Use add_file_metadata()." if @_;
1119 29         110 return $self->{basename};
1120             }
1121              
1122             sub path {
1123 0     0 1 0 my $self = shift;
1124 0 0       0 carp "path is a read only method. Use add_file_metadata()." if @_;
1125 0         0 return $self->{path};
1126             }
1127              
1128             sub extension {
1129 129     129 1 170 my $self = shift;
1130 129 50       209 carp "extension() is a read only method. Use add_file_metadata()." if @_;
1131 129         433 return $self->{extension};
1132             }
1133              
1134              
1135              
1136             #### General Comments ####
1137              
1138             sub comments {
1139 26     26 1 39 my $self = shift;
1140 26         33 my @comments = @{ $self->{comments} };
  26         52  
1141 26         45 foreach (@comments) {s/[\r\n]+//g}
  25         118  
1142             # comments are not chomped when loading
1143             # side effect of dealing with rare commented header lines with null values at end
1144 26         62 return @comments;
1145             }
1146              
1147             sub add_comment {
1148 53     53 1 521 my $self = shift;
1149 53 50       94 my $comment = shift or return;
1150             # comment is not required to be prefixed with "# ", it will be added when saving
1151 53         57 push @{ $self->{comments} }, $comment;
  53         100  
1152 53         75 return 1;
1153             }
1154              
1155             sub delete_comment {
1156 1     1 1 806 my $self = shift;
1157 1         2 my $index = shift;
1158 1 50       4 if (defined $index) {
1159 1         1 eval {splice @{$self->{comments}}, $index, 1};
  1         2  
  1         3  
1160             }
1161             else {
1162 0         0 $self->{comments} = [];
1163             }
1164             }
1165              
1166             sub vcf_headers {
1167 0     0 1 0 my $self = shift;
1168 0 0       0 return unless $self->vcf;
1169 0 0       0 return $self->{vcf_headers} if exists $self->{vcf_headers};
1170 0         0 my $headers = {};
1171 0         0 foreach my $comment ($self->comments) {
1172 0         0 my ($key, $value);
1173 0 0       0 if ($comment =~ /^##([\w\-\.]+)=(.+)$/) {
1174 0         0 $key = $1;
1175 0         0 $value = $2;
1176             }
1177             else {
1178             # invalid vcf header format!?
1179 0         0 next;
1180             }
1181 0 0       0 if ($value !~ /^<.+>$/) {
1182             # simple value
1183 0         0 $headers->{$key} = $value;
1184             }
1185             else {
1186             # process complex values
1187             # extract ID with regex which should have
1188 0         0 my $id = ($value =~ /ID=([\w\-\.:]+)/)[0];
1189 0         0 $headers->{$key}{$id} = $value;
1190             }
1191             }
1192            
1193             # store and return
1194 0         0 $self->{vcf_headers} = $headers;
1195 0         0 return $headers;
1196             }
1197              
1198             sub rewrite_vcf_headers {
1199 0     0 1 0 my $self = shift;
1200 0 0       0 return unless $self->vcf;
1201 0 0       0 return unless exists $self->{vcf_headers};
1202 0         0 my @newComments;
1203            
1204             # file format always comes first
1205             push @newComments, sprintf("##fileformat=%s\n",
1206 0         0 $self->{vcf_headers}{fileformat});
1207            
1208             # common attributes
1209 0         0 foreach my $key (sort {$a cmp $b} keys %{ $self->{vcf_headers} } ) {
  0         0  
  0         0  
1210 0 0       0 next if $key eq 'fileformat';
1211 0 0       0 if (ref $self->{vcf_headers}{$key} eq 'HASH') {
1212             # we have a complex VCF header with multiple keys
1213             # we will rewrite for each ID
1214 0         0 foreach my $id (sort {$a cmp $b}
  0         0  
1215 0         0 keys %{ $self->{vcf_headers}{$key} }
1216             ) {
1217             # to avoid complexity of writing correct formatting
1218             push @newComments, sprintf("##%s=%s\n", $key,
1219 0         0 $self->{vcf_headers}{$key}{$id} );
1220             }
1221             }
1222             else {
1223             # a simple value
1224             push @newComments, sprintf("##%s=%s\n", $key,
1225 0         0 $self->{vcf_headers}{$key} );
1226             }
1227             }
1228            
1229             # replace the headers
1230 0         0 $self->{comments} = \@newComments;
1231             }
1232              
1233              
1234              
1235             #### Column Metadata ####
1236              
1237             sub list_columns {
1238 4     4 1 7 my $self = shift;
1239 4 50       11 carp "list_columns is a read only method" if @_;
1240 4         7 my @list;
1241 4         15 for (my $i = 0; $i < $self->number_columns; $i++) {
1242 26         50 push @list, $self->{$i}{'name'};
1243             }
1244 4 100       19 return wantarray ? @list : \@list;
1245             }
1246              
1247             sub name {
1248 24     24 1 1900 my $self = shift;
1249 24         48 my ($index, $new_name) = @_;
1250 24 50       53 return unless defined $index;
1251 24 50       69 return unless exists $self->{$index}{name};
1252 24 100       46 if (defined $new_name) {
1253 1         2 $self->{$index}{name} = $new_name;
1254 1 50       3 if (exists $self->{data_table}) {
    0          
1255 1         3 $self->{data_table}->[0][$index] = $new_name;
1256             }
1257             elsif (exists $self->{column_names}) {
1258 0         0 $self->{column_names}->[$index] = $new_name;
1259             }
1260             }
1261 24         123 return $self->{$index}{name};
1262             }
1263              
1264             sub metadata {
1265 26     26 1 29 my $self = shift;
1266 26         36 my ($index, $key, $value) = @_;
1267 26 50       44 return unless defined $index;
1268 26 50       75 return unless exists $self->{$index};
1269 26 100 100     54 if ($key and $key eq 'name') {
1270 2         6 return $self->name($index, $value);
1271             }
1272 24 100 66     58 if ($key and defined $value) {
    50 33        
1273             # we are setting a new value
1274 3         7 $self->{$index}{$key} = $value;
1275 3         5 return $value;
1276             }
1277             elsif ($key and not defined $value) {
1278 0 0       0 if (exists $self->{$index}{$key}) {
1279             # retrieve a value
1280 0         0 return $self->{$index}{$key};
1281             }
1282             else {
1283             # key does not exist
1284 0         0 return;
1285             }
1286             }
1287             else {
1288 21         20 my %hash = %{ $self->{$index} };
  21         62  
1289 21 100       70 return wantarray ? %hash : \%hash;
1290             }
1291             }
1292              
1293             sub delete_metadata {
1294 0     0 1 0 my $self = shift;
1295 0         0 my ($index, $key) = @_;
1296 0 0       0 return unless defined $index;
1297 0 0       0 if (defined $key) {
1298 0 0       0 if (exists $self->{$index}{$key}) {
1299 0         0 return delete $self->{$index}{$key};
1300             }
1301             }
1302             else {
1303             # user wants to delete the metadata
1304             # but we need to keep the basics name and index
1305 0         0 foreach my $key (keys %{ $self->{$index} }) {
  0         0  
1306 0 0       0 next if $key eq 'name';
1307 0 0       0 next if $key eq 'index';
1308 0         0 delete $self->{$index}{$key};
1309             }
1310             }
1311             }
1312              
1313             sub copy_metadata {
1314 2     2 1 5 my ($self, $source, $target) = @_;
1315 2 50 33     9 return unless (exists $self->{$source}{name} and exists $self->{$target}{name});
1316 2         7 my $md = $self->metadata($source);
1317 2         3 delete $md->{name};
1318 2         3 delete $md->{'index'};
1319 2 50       31 delete $md->{'AUTO'} if exists $md->{'AUTO'}; # presume this is no longer auto index
1320 2         7 foreach (keys %$md) {
1321 1         2 $self->{$target}{$_} = $md->{$_};
1322             }
1323 2         5 return 1;
1324             }
1325              
1326              
1327              
1328             #### Column Indices ####
1329              
1330             sub find_column {
1331 220     220 1 335 my ($self, $name) = @_;
1332 220 50       306 return unless $name;
1333            
1334             # the $name variable will be used as a regex in identifying the name
1335             # fix it so that it will possible accept a # character at the beginning
1336             # without a following space, in case the first column has a # prefix
1337             # also place the remainder of the text in a non-capturing parentheses for
1338             # grouping purposes while maintaining the anchors
1339 220         1114 $name =~ s/ \A (\^?) (.+) (\$?)\Z /$1#?(?:$2)$3/x;
1340            
1341             # walk through each column index
1342 220         279 my $index;
1343 220         387 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
1344             # check the names of each column
1345 736 100       5874 if ($self->{$i}{'name'} =~ /$name/i) {
1346 88         137 $index = $i;
1347 88         99 last;
1348             }
1349             }
1350 220         452 return $index;
1351             }
1352              
1353             sub _find_column_indices {
1354 24     24   34 my $self = shift;
1355             # these are hard coded index name regex to accomodate different possibilities
1356             # these do not include parentheses for grouping
1357             # non-capturing parentheses will be added later in the sub for proper
1358             # anchoring and grouping - long story why, don't ask
1359 24         67 my $name = $self->find_column('^name|gene.?name|transcript.?name|geneid|id|gene|alias');
1360 24         51 my $type = $self->find_column('^type|class|primary_tag|biotype');
1361 24         54 my $id = $self->find_column('^primary_id');
1362 24         51 my $chromo = $self->find_column('^chr|seq|ref|ref.?seq');
1363 24         45 my $start = $self->find_column('^start|position|pos|txStart');
1364 24         63 my $stop = $self->find_column('^stop|end|txEnd');
1365 24         70 my $strand = $self->find_column('^strand');
1366 24         186 my $score = $self->find_column('^score$');
1367            
1368             # check for coordinate string
1369 24         61 my $coord = $self->find_column('^coordinate$');
1370 24 50 66     103 if (not defined $coord and not defined $chromo and not defined $start) {
      66        
1371             # check if ID or name id looks like a coordinate string
1372 12 50 33     134 if (defined $id and defined $self->{data_table}->[1]) {
1373 12 100       46 if ($self->{data_table}->[1][$id] =~ /^[\w\-\.]+:\d+(?:[\-\.]{1,2}\d+)?$/) {
1374             # first value looks like a coordinate string
1375 8         14 $coord = $id;
1376             }
1377             }
1378 12 50 66     35 if (
      66        
1379             not defined $coord and defined $name and
1380             defined defined $self->{data_table}->[1]
1381             ) {
1382 4 50       15 if ($self->{data_table}->[1][$name] =~ /^[\w\-\.]+:\d+(?:[\-\.]{1,2}\d+)?$/) {
1383             # first value looks like a coordinate string
1384 0         0 $coord = $name;
1385             }
1386             }
1387             }
1388            
1389             # determine zero-based start coordinates
1390 24 100 100     104 if (
      100        
1391             $self->{zerostart} == 0 and
1392             defined $start and
1393             substr($self->name($start), -1) eq '0'
1394             ) {
1395 1         2 $self->{zerostart} = 1;
1396             }
1397            
1398             # cache the indexes
1399             $self->{column_indices} = {
1400 24         194 'name' => $name,
1401             'type' => $type,
1402             'id' => $id,
1403             'seq_id' => $chromo,
1404             'chromo' => $chromo,
1405             'start' => $start,
1406             'stop' => $stop,
1407             'end' => $stop,
1408             'strand' => $strand,
1409             'score' => $score,
1410             'coord' => $coord,
1411             };
1412 24         41 return 1;
1413             }
1414              
1415             sub chromo_column {
1416 26     26 1 38 my $self = shift;
1417 26 100       118 $self->_find_column_indices unless exists $self->{column_indices};
1418 26 100       57 if (defined $_[0]) {
1419 2 50 33     17 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1420 2         5 $self->{column_indices}{chromo} = $_[0];
1421             }
1422             else {
1423 0         0 carp sprintf("chromo_column %s is not a valid index!", $_[0]);
1424             }
1425             }
1426 26         113 return $self->{column_indices}{chromo};
1427             }
1428              
1429             sub start_column {
1430 117     117 1 133 my $self = shift;
1431 117 100       226 $self->_find_column_indices unless exists $self->{column_indices};
1432 117 100       178 if (defined $_[0]) {
1433 2 50 33     13 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1434 2         4 $self->{column_indices}{start} = $_[0];
1435             }
1436             else {
1437 0         0 carp sprintf("start_column %s is not a valid index!", $_[0]);
1438             }
1439             }
1440 117         225 return $self->{column_indices}{start};
1441             }
1442              
1443             sub stop_column {
1444 21     21 1 40 my $self = shift;
1445 21 50       57 $self->_find_column_indices unless exists $self->{column_indices};
1446 21 100       48 if (defined $_[0]) {
1447 2 50 33     12 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1448 2         5 $self->{column_indices}{stop} = $_[0];
1449             }
1450             else {
1451 0         0 carp sprintf("stop_column %s is not a valid index!", $_[0]);
1452             }
1453             }
1454 21         64 return $self->{column_indices}{stop};
1455             }
1456             *end_column = \&stop_column;
1457              
1458             sub coord_column {
1459 109     109 0 104 my $self = shift;
1460 109 50       158 $self->_find_column_indices unless exists $self->{column_indices};
1461 109 50       152 if (defined $_[0]) {
1462 0 0 0     0 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1463 0         0 $self->{column_indices}{coord} = $_[0];
1464             }
1465             else {
1466 0         0 carp sprintf("coord_column %s is not a valid index!", $_[0]);
1467             }
1468             }
1469 109         171 return $self->{column_indices}{coord};
1470             }
1471              
1472             sub strand_column {
1473 21     21 1 32 my $self = shift;
1474 21 100       74 $self->_find_column_indices unless exists $self->{column_indices};
1475 21 100       45 if (defined $_[0]) {
1476 2 50 33     12 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1477 2         4 $self->{column_indices}{strand} = $_[0];
1478             }
1479             else {
1480 0         0 carp sprintf("strand_column %s is not a valid index!", $_[0]);
1481             }
1482             }
1483 21         72 return $self->{column_indices}{strand};
1484             }
1485              
1486             sub name_column {
1487 72     72 1 81 my $self = shift;
1488 72 100       112 $self->_find_column_indices unless exists $self->{column_indices};
1489 72 50       96 if (defined $_[0]) {
1490 0 0 0     0 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1491 0         0 $self->{column_indices}{name} = $_[0];
1492             }
1493             else {
1494 0         0 carp sprintf("name_column %s is not a valid index!", $_[0]);
1495             }
1496             }
1497 72         138 return $self->{column_indices}{name};
1498             }
1499              
1500             sub type_column {
1501 14     14 1 20 my $self = shift;
1502 14 100       42 $self->_find_column_indices unless exists $self->{column_indices};
1503 14 100       29 if (defined $_[0]) {
1504 2 50 33     12 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1505 2         4 $self->{column_indices}{type} = $_[0];
1506             }
1507             else {
1508 0         0 carp sprintf("type_column %s is not a valid index!", $_[0]);
1509             }
1510             }
1511 14         35 return $self->{column_indices}{type};
1512             }
1513              
1514             sub id_column {
1515 71     71 1 76 my $self = shift;
1516 71 100       97 $self->_find_column_indices unless exists $self->{column_indices};
1517 71 50       93 if (defined $_[0]) {
1518 0 0 0     0 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1519 0         0 $self->{column_indices}{id} = $_[0];
1520             }
1521             else {
1522 0         0 carp sprintf("id_column %s is not a valid index!", $_[0]);
1523             }
1524             }
1525 71         128 return $self->{column_indices}{id};
1526             }
1527              
1528             sub score_column {
1529 0     0 1 0 my $self = shift;
1530 0 0       0 $self->_find_column_indices unless exists $self->{column_indices};
1531 0 0       0 if (defined $_[0]) {
1532 0 0 0     0 if ($_[0] =~ /^\d+$/ and $_[0] < $self->{number_columns}) {
1533 0         0 $self->{column_indices}{score} = $_[0];
1534             }
1535             else {
1536 0         0 carp sprintf("score_column %s is not a valid index!", $_[0]);
1537             }
1538             }
1539 0         0 return $self->{column_indices}{score};
1540             }
1541              
1542             *zero_start = \&interbase;
1543             sub interbase {
1544 90     90 1 93 my $self = shift;
1545 90 50       139 if (@_) {
1546 0         0 my $i = $self->start_column;
1547 0         0 my $n = $self->name($i);
1548 0 0 0     0 if ($_[0] eq '1' and $n =~ /^start$/i) {
    0 0        
1549 0         0 $self->{zerostart} = 1;
1550 0         0 $self->name($i, 'Start0');
1551             }
1552             elsif ($_[0] eq '0' and $n =~ /^start0$/i) {
1553 0         0 $self->{zerostart} = 0;
1554 0         0 $self->name($i, 'Start');
1555             }
1556             else {
1557 0         0 carp "use 1 (true) or 0 (false) to set interbase mode";
1558             }
1559             }
1560 90         163 return $self->{zerostart};
1561             }
1562              
1563             #### Special Row methods ####
1564              
1565             # Why is this in core and not in Data? I keep asking myself.
1566             # Because this can get called from a Data::Feature object, which might be
1567             # associated with a Data::Stream object. No, Stream objects don't have stored
1568             # SeqFeatures, but I don't want the entire program to crash because of an
1569             # undefined method because some doofus forgot. Since both Data and Stream
1570             # objects inherit from Data::core, this is in here.
1571             sub get_seqfeature {
1572 13     13 1 32 my ($self, $row) = @_;
1573 13 50       37 return unless (ref($self) eq 'Bio::ToolBox::Data');
1574 13 50 33     57 return unless ($row and $row <= $self->{last_row});
1575 13 50       23 return unless exists $self->{SeqFeatureObjects};
1576 13   50     35 return $self->{SeqFeatureObjects}->[$row] || undef;
1577             }
1578              
1579              
1580              
1581             __END__