File Coverage

blib/lib/Bio/ToolBox/Data/core.pm
Criterion Covered Total %
statement 386 706 54.6
branch 194 462 41.9
condition 101 188 53.7
subroutine 47 57 82.4
pod 46 47 97.8
total 774 1460 53.0


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data::core;
2             our $VERSION = '1.66';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data::core - Common functions to Bio:ToolBox::Data family
7              
8             =head1 DESCRIPTION
9              
10             Common methods for metadata and manipulation in a L
11             data table and L file stream. This module
12             should not be used directly. See the respective modules for more information.
13              
14             =cut
15              
16 3     3   25 use strict;
  3         6  
  3         99  
17 3     3   15 use Carp qw(carp cluck croak confess);
  3         6  
  3         165  
18 3     3   31 use base 'Bio::ToolBox::Data::file';
  3         6  
  3         2010  
19 3         279 use Bio::ToolBox::db_helper qw(
20             open_db_connection
21             verify_or_request_feature_types
22             use_bam_adapter
23             use_big_adapter
24 3     3   2966 );
  3         13  
25 3     3   23 use Module::Load;
  3         8  
  3         27  
26              
27             1;
28              
29             #### Initialization and verification ###############################################
30              
31             sub new {
32 72     72 1 179 my $class = shift;
33            
34             # in case someone calls this from an established object
35 72 100       196 if (ref($class)) {
36 26         70 $class = ref($class);
37             }
38            
39             # Initialize the hash structure
40 72         974 my %data = (
41             'program' => undef,
42             'feature' => undef,
43             'feature_type' => undef,
44             'db' => undef,
45             'format' => '',
46             'gff' => 0,
47             'bed' => 0,
48             'ucsc' => 0,
49             'vcf' => 0,
50             'number_columns' => 0,
51             'last_row' => 0,
52             'headers' => 1,
53             'column_names' => [],
54             'filename' => undef,
55             'basename' => undef,
56             'extension' => undef,
57             'path' => undef,
58             'comments' => [],
59             'data_table' => [],
60             'header_line_count' => 0,
61             );
62            
63             # Finished
64 72         291 return bless \%data, $class;
65             }
66              
67              
68             sub verify {
69             # this function does not rely on any self functions for two reasons
70             # this is a low level integrity checker
71             # this is very old code from before the days of an OO API of Bio::ToolBox
72             # although a lot of things have been added and changed since then....
73 52     52 1 1124 my $self = shift;
74 52   100     160 my $silence = shift || 0; # default is to yell
75            
76             # check for data table
77 52 50 33     274 unless (
78             defined $self->{'data_table'} and
79             ref $self->{'data_table'} eq 'ARRAY'
80             ) {
81 0 0       0 carp sprintf "\n DATA INTEGRITY ERROR: No data table in %s object!", ref $self
82             unless $silence;
83 0         0 return;
84             }
85            
86             # check for last row index
87 52 50       129 if (defined $self->{'last_row'}) {
88 52         78 my $number = scalar( @{ $self->{'data_table'} } ) - 1;
  52         112  
89 52 50       134 if ($self->{'last_row'} != $number) {
90             # carp sprintf "TABLE INTEGRITY ERROR: data table last_row index [%d] doesn't match " .
91             # "metadata value [%d]!\n", $number, $self->{'last_row'};
92             # fix it for them
93 0         0 $self->{'last_row'} = $number;
94             }
95             }
96             else {
97             # define it for them
98             $self->{'last_row'} =
99 0         0 scalar( @{ $self->{'data_table'} } ) - 1;
  0         0  
100             }
101            
102             # check for consistent number of columns
103 52 50       118 if (defined $self->{'number_columns'}) {
104 52         105 my $number = $self->{'number_columns'};
105 52         87 my @problems;
106 52         84 my $too_low = 0;
107 52         75 my $too_high = 0;
108 52         142 for (my $row = 0; $row <= $self->{'last_row'}; $row++) {
109 945         1181 my $count = scalar @{ $self->{'data_table'}->[$row] };
  945         1306  
110 945 50       2029 if ($count != $number) {
111 0         0 push @problems, $row;
112 0 0       0 $too_low++ if $count < $number;
113 0 0       0 $too_high++ if $count > $number;
114 0         0 while ($count < $number) {
115             # we can sort-of-fix this problem
116 0         0 $self->{'data_table'}->[$row][$count] = '.';
117 0         0 $count++;
118             }
119             }
120             }
121            
122             # we found errors
123 52 50       152 if (@problems) {
124             # collapse problem list into compact string
125             # from http://www.perlmonks.org/?node_id=87538
126 0         0 my $problem = join(',', @problems);
127 0         0 $problem =~ s/(?
  0         0  
128              
129 0 0       0 if ($too_low) {
130 0 0       0 print "\n COLUMN INCONSISTENCY ERRORS: $too_low rows had fewer than expected " .
131             "columns!\n padded rows $problem with null values\n"
132             unless $silence;
133             }
134 0 0       0 if ($too_high) {
135 0 0       0 print "\n COLUMN INCONSISTENCY ERRORS: $too_high rows had more columns than " .
136             "expected!\n Problem rows: $problem\n"
137             unless $silence;
138 0         0 return;
139             }
140             }
141             }
142             else {
143             # this wasn't set???? then set it
144             $self->{'number_columns'} =
145 0         0 scalar @{ $self->{'data_table'}->[0] };
  0         0  
146             }
147            
148             # check metadata and table names
149 52         109 my $mdcheck = 0;
150 52         149 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
151 415 50       881 unless (
152             $self->{$i}{'name'} eq
153             $self->{'data_table'}->[0][$i]
154             ) {
155             printf( "\n TABLE/METADATA MISMATCH ERROR: Column header names don't" .
156             " match metadata name values for index $i!" .
157             "\n compare '%s' with '%s'\n", $self->{'data_table'}->[0][$i],
158 0 0       0 $self->{$i}{'name'} ) unless $silence;
159 0         0 $mdcheck++;
160             }
161 415 50       1003 unless ($self->{$i}{'index'} == $i) {
162             printf( "\n METADATA INDEX ERROR: index $i metadata doesn't match its " .
163 0 0       0 "index value %s\n", $self->{$i}{'index'} ) unless $silence;
164 0         0 $mdcheck++;
165             }
166             }
167 52 50       158 return if $mdcheck;
168            
169             ### Defined file format structure integrity
170 52         91 my $error;
171            
172             # check for proper gff structure
173 52 100       118 if ($self->{'gff'}) {
174             # if any of these checks fail, we will reset the gff version to
175             # the default of 0, or no gff
176 6         23 my $gff_check = 1; # start with assumption it is true
177            
178             # check number of columns
179 6 100       19 if ($self->{'number_columns'} != 9) {
180 1         3 $gff_check = 0;
181 1         4 $error .= " Number of columns not 9.";
182             }
183            
184             # check column indices
185 6 50 33     65 if (
186             # column 0 should look like chromosome
187             exists $self->{0} and
188             $self->{0}{'name'} !~
189             m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i
190             ) {
191 0         0 $gff_check = 0;
192 0         0 $error .= " Column 0 name not chromosome-like.";
193             }
194 6 50 33     75 if (
195             # column 3 should look like start
196             exists $self->{3} and
197             $self->{3}{'name'} !~ m/start|pos|position/i
198             ) {
199 0         0 $gff_check = 0;
200 0         0 $error .= " Column 3 name not start-like.";
201             }
202 6 50 33     52 if (
203             # column 4 should look like end
204             exists $self->{4} and
205             $self->{4}{'name'} !~ m/stop|end|pos|position/i
206             ) {
207 0         0 $gff_check = 0;
208 0         0 $error .= " Column 4 name not stop-like.";
209             }
210 6 50 33     46 if (
211             # column 6 should look like strand
212             exists $self->{6} and
213             $self->{6}{'name'} !~ m/strand/i
214             ) {
215 0         0 $gff_check = 0;
216 0         0 $error .= " Column 6 name not strand-like.";
217             }
218            
219             # check column data
220 6 50       36 unless ($self->_column_is_integers(3,4)) {
221 0         0 $gff_check = 0;
222 0         0 $error .= " Columns 3,4 not integers.";
223             }
224 6 50       50 unless ($self->_column_is_numeric(5)) {
225 0         0 $gff_check = 0;
226 0         0 $error .= " Column 5 not numeric.";
227             }
228 6 50       36 unless ($self->_column_is_stranded(6)) {
229 0         0 $gff_check = 0;
230 0         0 $error .= " Column 6 not strand values.";
231             }
232            
233             # update gff value as necessary
234 6 100       21 if ($gff_check == 0) {
235             # reset metadata
236 1         3 $self->{'gff'} = 0;
237 1         3 $self->{'headers'} = 1;
238            
239             # remove the AUTO key from the metadata
240 1         6 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
241 8 50       16 if (exists $self->{$i}{'AUTO'}) {
242 8         19 delete $self->{$i}{'AUTO'};
243             }
244             }
245 1 50       4 print "\n GFF FILE FORMAT ERROR: $error\n" unless $silence;
246             }
247             }
248            
249             # check for proper BED structure
250 52 100       128 if ($self->{'bed'}) {
251             # if any of these checks fail, we will reset the bed flag to 0
252             # to make it not a bed file format
253 21         36 my $bed_check = 1; # start with assumption it is correct
254            
255             # check number of columns
256 21 50       59 if ($self->{'number_columns'} < 3) {
257 0         0 $bed_check = 0;
258 0         0 $error .= " Number of columns not at least 3.";
259             }
260            
261             # check column index names
262 21 50 33     175 if (
263             exists $self->{0} and
264             $self->{0}{'name'} !~
265             m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i
266             ) {
267 0         0 $bed_check = 0;
268 0         0 $error .= " Column 0 name not chromosome-like.";
269             }
270 21 50 33     197 if (
271             exists $self->{1} and
272             $self->{1}{'name'} !~ m/start|pos|position/i
273             ) {
274 0         0 $bed_check = 0;
275 0         0 $error .= " Column 1 name not start-like.";
276             }
277 21 50 33     134 if (
278             exists $self->{2} and
279             $self->{2}{'name'} !~ m/stop|end|pos|position/i
280             ) {
281 0         0 $bed_check = 0;
282 0         0 $error .= " Column 2 name not stop-like.";
283             }
284 21 50 66     107 if (
285             exists $self->{5} and
286             $self->{5}{'name'} !~ m/strand/i
287             ) {
288 0         0 $bed_check = 0;
289 0         0 $error .= " Column 5 name not strand-like.";
290             }
291 21 50 100     190 if (
      66        
292             exists $self->{6} and
293             $self->{'format'} !~ /narrow|broad/i and
294             $self->{6}{'name'} !~ m/start|thick|cds/i
295             ) {
296 0         0 $bed_check = 0;
297 0         0 $error .= " Column 6 name not thickStart-like.";
298             }
299 21 50 100     123 if (
      66        
300             exists $self->{7} and
301             $self->{'format'} !~ /narrow|broad/i and
302             $self->{7}{'name'} !~ m/end|stop|thick|cds/i
303             ) {
304 0         0 $bed_check = 0;
305 0         0 $error .= " Column 7 name not thickEnd-like.";
306             }
307 21 50 100     134 if (
      66        
308             exists $self->{8} and
309             $self->{'format'} !~ /narrow|broad/i and
310             $self->{8}{'name'} !~ m/item|rgb|color/i
311             ) {
312 0         0 $bed_check = 0;
313 0         0 $error .= " Column 8 name not itemRGB-like.";
314             }
315 21 50 100     140 if (
      66        
316             exists $self->{9} and
317             $self->{'format'} !~ /narrow|broad/i and
318             $self->{9}{'name'} !~ m/count|number|block|exon/i
319             ) {
320 0         0 $bed_check = 0;
321 0         0 $error .= " Column 9 name not blockCount-like.";
322             }
323 21 50 66     93 if (
324             exists $self->{10} and
325             $self->{10}{'name'} !~ m/size|length|block|exon/i
326             ) {
327 0         0 $bed_check = 0;
328 0         0 $error .= " Column 10 name not blockSizes-like.";
329             }
330 21 50 66     91 if (
331             exists $self->{11} and
332             $self->{11}{'name'} !~ m/start|block|exon/i
333             ) {
334 0         0 $bed_check = 0;
335 0         0 $error .= " Column 11 name not blockStarts-like.";
336             }
337            
338             # check column data
339 21 50       63 unless ($self->_column_is_integers(1,2)) {
340 0         0 $bed_check = 0;
341 0         0 $error .= " Columns 1,2 not integers.";
342             }
343 21 100       92 if ($self->{'number_columns'} >= 5) {
344             # only check if it is actually present, since could be optional
345 15 50       40 unless ($self->_column_is_numeric(4) ) {
346 0         0 $bed_check = 0;
347 0         0 $error .= " Column 4 not numeric.";
348             }
349             }
350 21 100       64 if ($self->{'number_columns'} >= 6) {
351             # only check if it is actually present, since could be optional
352 15 50       52 unless ($self->_column_is_stranded(5) ) {
353 0         0 $bed_check = 0;
354 0         0 $error .= " Column 5 not strand values.";
355             }
356             }
357 21 100 100     110 if ($self->{'format'} and
358             $self->{'format'} =~ /narrow|broad/i) {
359 4 50       14 unless ($self->_column_is_numeric(6,7,8) ) {
360 0         0 $bed_check = 0;
361 0         0 $error .= " Columns 6,7,8 not numeric.";
362             }
363             }
364 21 100       66 if ($self->{'number_columns'} == 12) {
365             # bed12 has extra special limitations
366 3 50       8 unless ($self->_column_is_integers(6,7,9) ) {
367 0         0 $bed_check = 0;
368 0         0 $error .= " Column 6,7,9 not integers.";
369             }
370 3 50       10 unless ($self->_column_is_comma_integers(10,11) ) {
371 0         0 $bed_check = 0;
372 0         0 $error .= " Column 10,11 not comma-delimited integers.";
373             }
374             }
375 21 100 66     85 if (
376             $self->{'number_columns'} == 15 and
377             $self->{'format'} =~ /gapped/i
378             ) {
379             # gappedPeak has extra special limitations
380 4 50       14 unless ($self->_column_is_integers(6,7,9) ) {
381 0         0 $bed_check = 0;
382 0         0 $error .= " Column 6,7,9 not integers.";
383             }
384 4 50       25 unless ($self->_column_is_comma_integers(10,11) ) {
385 0         0 $bed_check = 0;
386 0         0 $error .= " Column 10,11 not comma-delimited integers.";
387             }
388 4 50       14 unless ($self->_column_is_numeric(12,13,14) ) {
389 0         0 $bed_check = 0;
390 0         0 $error .= " Columns 12,13,14 not numeric.";
391             }
392             }
393            
394             # peak file format
395 21 50 100     129 if ($self->{'format'} and
      66        
396             $self->{'format'} =~ /narrowpeak/i and
397             $self->{'number_columns'} != 10
398             ) {
399 0         0 $bed_check = 0;
400 0         0 $error .= " NarrowPeak has 10 columns only.";
401             }
402 21 50 66     119 if ($self->{'format'} and
      33        
403             $self->{'format'} =~ /broadpeak/i and
404             $self->{'number_columns'} != 9
405             ) {
406 0         0 $bed_check = 0;
407 0         0 $error .= " BroadPeak has 9 columns only.";
408             }
409 21 50 100     107 if ($self->{'format'} and
      66        
410             $self->{'format'} =~ /gappedpeak/i and
411             $self->{'number_columns'} != 15
412             ) {
413 0         0 $bed_check = 0;
414 0         0 $error .= " GappeddPeak has 15 columns only.";
415             }
416            
417             # reset the BED tag value as appropriate
418 21 50       49 if ($bed_check) {
419 21         55 $self->{'bed'} = $self->{'number_columns'}; # in case we had a fake true
420             }
421             else {
422             # reset metadata
423 0         0 $self->{'bed'} = 0;
424 0         0 $self->{'headers'} = 1;
425 0         0 my $ext = $self->{'extension'};
426 0         0 $self->{'filename'} =~ s/$ext/.txt/;
427 0         0 $self->{'extension'} = '.txt';
428 0         0 $self->{'format'} = '';
429            
430             # remove the AUTO key from the metadata
431 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
432 0 0       0 if (exists $self->{$i}{'AUTO'}) {
433 0         0 delete $self->{$i}{'AUTO'};
434             }
435             }
436 0 0       0 print "\n BED FILE FORMAT ERROR: $error\n" unless $silence;
437             }
438             }
439            
440             # check refFlat or genePred gene structure
441 52 100       141 if ($self->{'ucsc'}) {
442             # if any of these checks fail, we will reset the extension
443 6         12 my $ucsc_check = 1; # start with assumption it is correct
444 6         8 my $ucsc_type;
445            
446             # check number of columns
447 6         14 my $colnumber = $self->{number_columns};
448 6 50 0     12 if ($colnumber == 16) {
    0          
    0          
    0          
449 6         12 $ucsc_type = 'genePredExtBin';
450             # genePredExt with bin
451             # bin name chrom strand txStart txEnd cdsStart cdsEnd
452             # exonCount exonStarts exonEnds score name2 cdsStartSt
453             # cdsEndStat exonFrames
454 6 50       30 unless($self->{2}{name} =~
455             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
456 0         0 $ucsc_check = 0;
457 0         0 $error .= " Column 2 name not chromosome-like.";
458             }
459 6 100       45 unless($self->{4}{name} =~ /start|position/i) {
460 3         10 $ucsc_check = 0;
461 3         8 $error .= " Column 4 name not start-like.";
462             }
463 6 100       27 unless($self->{5}{name} =~ /stop|end|position/i) {
464 3         5 $ucsc_check = 0;
465 3         7 $error .= " Column 5 name not stop-like.";
466             }
467 6 100       22 unless($self->{6}{name} =~ /start|position/i) {
468 3         7 $ucsc_check = 0;
469 3         5 $error .= " Column 6 name not start-like.";
470             }
471 6 100       23 unless($self->{7}{name} =~ /stop|end|position/i) {
472 3         5 $ucsc_check = 0;
473 3         6 $error .= " Column 7 name not stop-like.";
474             }
475 6 50       15 unless($self->_column_is_integers(4,5,6,7,8)) {
476 0         0 $ucsc_check = 0;
477 0         0 $error .= " Columns 4,5,6,7,8 not integers.";
478             }
479 6 50       24 unless ($self->_column_is_comma_integers(9,10)) {
480 0         0 $ucsc_check = 0;
481 0         0 $error .= " Columns 9,10 not comma-delimited integers.";
482             }
483 6 50       18 unless($self->_column_is_stranded(3)) {
484 0         0 $ucsc_check = 0;
485 0         0 $error .= " Column 3 not strand values.";
486             }
487             }
488             elsif ($colnumber == 15 or $colnumber == 12) {
489 0 0       0 $ucsc_type = $colnumber == 15 ? 'genePredExt' : 'knownGene';
490             # GenePredExt
491             # name chrom strand txStart txEnd cdsStart cdsEnd
492             # exonCount exonStarts exonEnds score name2 cdsStartSt
493             # cdsEndStat exonFrames
494             # or knownGene
495             # name chrom strand txStart txEnd cdsStart cdsEnd
496             # exonCount exonStarts exonEnds proteinID alignID
497 0 0       0 unless($self->{1}{name} =~
498             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
499 0         0 $ucsc_check = 0;
500 0         0 $error .= " Column 1 name not chromosome-like.";
501             }
502 0 0       0 unless($self->{3}{name} =~ /start|position/i) {
503 0         0 $ucsc_check = 0;
504 0         0 $error .= " Column 3 name not start-like.";
505             }
506 0 0       0 unless($self->{4}{name} =~ /stop|end|position/i) {
507 0         0 $ucsc_check = 0;
508 0         0 $error .= " Column 4 name not stop-like.";
509             }
510 0 0       0 unless($self->{5}{name} =~ /start|position/i) {
511 0         0 $ucsc_check = 0;
512 0         0 $error .= " Column 5 name not start-like.";
513             }
514 0 0       0 unless($self->{6}{name} =~ /stop|end|position/i) {
515 0         0 $ucsc_check = 0;
516 0         0 $error .= " Column 6 name not stop-like.";
517             }
518 0 0       0 unless($self->_column_is_integers(3,4,5,6,7)) {
519 0         0 $ucsc_check = 0;
520 0         0 $error .= " Columns 3,4,5,6,7 not integers.";
521             }
522 0 0       0 unless ($self->_column_is_comma_integers(8,9)) {
523 0         0 $ucsc_check = 0;
524 0         0 $error .= " Columns 8,9 not comma-delimited integers.";
525             }
526 0 0       0 unless($self->_column_is_stranded(2)) {
527 0         0 $ucsc_check = 0;
528 0         0 $error .= " Column 2 not strand values.";
529             }
530             }
531             elsif ($colnumber == 11) {
532 0         0 $ucsc_type = 'refFlat';
533             # geneName transcriptName chrom strand txStart txEnd
534             # cdsStart cdsEnd exonCount exonStarts exonEnds
535 0 0       0 unless($self->{2}{name} =~
536             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
537 0         0 $ucsc_check = 0;
538 0         0 $error .= " Column 2 name not chromosome-like.";
539             }
540 0 0       0 unless($self->{4}{name} =~ /start|position/i) {
541 0         0 $ucsc_check = 0;
542 0         0 $error .= " Column 4 name not start-like.";
543             }
544 0 0       0 unless($self->{5}{name} =~ /stop|end|position/i) {
545 0         0 $ucsc_check = 0;
546 0         0 $error .= " Column 5 name not stop-like.";
547             }
548 0 0       0 unless($self->{6}{name} =~ /start|position/i) {
549 0         0 $ucsc_check = 0;
550 0         0 $error .= " Column 6 name not start-like.";
551             }
552 0 0       0 unless($self->{7}{name} =~ /stop|end|position/i) {
553 0         0 $ucsc_check = 0;
554 0         0 $error .= " Column 7 name not stop-like.";
555             }
556 0 0       0 unless($self->_column_is_integers(4,5,6,7,8)) {
557 0         0 $ucsc_check = 0;
558 0         0 $error .= " Columns 4,5,6,7,8 not integers.";
559             }
560 0 0       0 unless ($self->_column_is_comma_integers(9,10)) {
561 0         0 $ucsc_check = 0;
562 0         0 $error .= " Columns 9,10 not comma-delimited integers.";
563             }
564 0 0       0 unless($self->_column_is_stranded(3)) {
565 0         0 $ucsc_check = 0;
566 0         0 $error .= " Column 3 not strand values.";
567             }
568             }
569             elsif ($colnumber == 10) {
570 0         0 $ucsc_type = 'genePred';
571             # name chrom strand txStart txEnd cdsStart cdsEnd
572             # exonCount exonStarts exonEnds
573 0 0       0 unless($self->{1}{name} =~
574             /^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
575 0         0 $ucsc_check = 0;
576 0         0 $error .= " Column 1 name not chromosome-like.";
577             }
578 0 0       0 unless($self->{3}{name} =~ /start|position/i) {
579 0         0 $ucsc_check = 0;
580 0         0 $error .= " Column 3 name not start-like.";
581             }
582 0 0       0 unless($self->{4}{name} =~ /stop|end|position/i) {
583 0         0 $ucsc_check = 0;
584 0         0 $error .= " Column 4 name not stop-like.";
585             }
586 0 0       0 unless($self->{5}{name} =~ /start|position/i) {
587 0         0 $ucsc_check = 0;
588 0         0 $error .= " Column 5 name not start-like.";
589             }
590 0 0       0 unless($self->{6}{name} =~ /stop|end|position/i) {
591 0         0 $ucsc_check = 0;
592 0         0 $error .= " Column 6 name not stop-like.";
593             }
594 0 0       0 unless($self->_column_is_integers(3,4,5,6,7)) {
595 0         0 $ucsc_check = 0;
596 0         0 $error .= " Columns 3,4,5,6,7 not integers.";
597             }
598 0 0       0 unless ($self->_column_is_comma_integers(8,9)) {
599 0         0 $ucsc_check = 0;
600 0         0 $error .= " Columns 8,9 not comma-delimited integers.";
601             }
602 0 0       0 unless($self->_column_is_stranded(2)) {
603 0         0 $ucsc_check = 0;
604 0         0 $error .= " Column 2 not strand values.";
605             }
606             }
607             else {
608 0         0 $ucsc_type = 'UCSC';
609 0         0 $ucsc_check = 0;
610 0         0 $error .= " Wrong # of columns.";
611             }
612              
613 6 100       18 if ($ucsc_check == 0) {
614             # failed the check
615 3         8 my $ext = $self->{'extension'};
616 3         26 $self->{'filename'} =~ s/$ext/.txt/;
617 3         10 $self->{'extension'} = '.txt';
618 3         6 $self->{'ucsc'} = 0;
619 3         5 $self->{'format'} = '';
620            
621             # remove the AUTO key
622 3         12 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
623 48 50       81 if (exists $self->{$i}{'AUTO'}) {
624 48         96 delete $self->{$i}{'AUTO'};
625             }
626             }
627 3 50       13 print "\n $ucsc_type FILE FORMAT ERROR: $error\n" unless $silence;
628             }
629             }
630            
631             # check VCF format
632 52 50       120 if ($self->{vcf}) {
633             # if any of these checks fail, we will reset the vcf flag to 0
634             # to make it not a vcf file format
635 0         0 my $vcf_check = 1; # start with assumption it is correct
636            
637             # check number of columns
638 0 0       0 if ($self->{'number_columns'} < 8) {
639 0         0 $vcf_check = 0;
640 0         0 $error .= " Number of Columns is too few.";
641             }
642            
643             # check column index names
644 0 0       0 if ($self->{0}{'name'} !~ m/chrom/i) {
645 0         0 $vcf_check = 0;
646 0         0 $error .= " Column 0 name not chromosome.";
647             }
648 0 0 0     0 if (
649             exists $self->{1} and
650             $self->{1}{'name'} !~ m/^pos|start/i
651             ) {
652 0         0 $vcf_check = 0;
653 0         0 $error .= " Column 1 name not position.";
654             }
655            
656             # check column data
657 0 0       0 unless ($self->_column_is_integers(1)) {
658 0         0 $vcf_check = 0;
659 0         0 $error .= " Columns 1 not integers.";
660             }
661            
662             # reset the vcf tag value as appropriate
663 0 0       0 if ($vcf_check) {
664             # in case we had a fake true set it to a more reasonable value?
665 0 0       0 $self->{'vcf'} = 4 if $self->{'vcf'} == 1;
666             }
667             else {
668             # reset metadata
669 0         0 $self->{'vcf'} = 0;
670 0         0 $self->{'headers'} = 1;
671 0         0 $self->{'format'} = '';
672            
673             # remove the AUTO key from the metadata
674 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
675 0 0       0 if (exists $self->{$i}{'AUTO'}) {
676 0         0 delete $self->{$i}{'AUTO'};
677             }
678             }
679 0 0       0 print "\n VCF FILE FORMAT ERROR: $error\n" unless $silence;
680             }
681             }
682            
683             # check proper SGR file structure
684 52 50 66     444 if (exists $self->{'extension'} and
      66        
685             defined $self->{'extension'} and
686             $self->{'extension'} =~ /sgr/i
687             ) {
688             # there is no sgr field in the data structure
689             # so we're just checking for the extension
690             # we will change the extension as necessary if it doesn't conform
691 0         0 my $sgr_check = 1;
692 0 0       0 if ($self->{'number_columns'} != 3) {
693 0         0 $sgr_check = 0;
694 0         0 $error .= " Column number is not 3.";
695             }
696 0 0       0 if ($self->{0}{'name'} !~ m/^#?(?:chr|chromo|seq|refseq|ref_seq|seq|seq_id)/i) {
697 0         0 $sgr_check = 0;
698 0         0 $error .= " Column 0 name not chromosome-like.";
699             }
700 0 0       0 if ($self->{1}{'name'} !~ /start|position/i) {
701 0         0 $sgr_check = 0;
702 0         0 $error .= " Column 1 name not start-like.";
703             }
704 0 0       0 unless ($self->_column_is_integers(1)) {
705 0         0 $sgr_check = 0;
706 0         0 $error .= " Columns 1 not integers.";
707             }
708 0 0       0 if ($sgr_check == 0) {
709             # doesn't smell like a SGR file
710             # change the extension so the write subroutine won't think it is
711             # make it a text file
712 0         0 $self->{'extension'} =~ s/sgr/txt/i;
713 0         0 $self->{'filename'} =~ s/sgr/txt/i;
714 0         0 $self->{'headers'} = 1;
715 0         0 $self->{'format'} = '';
716            
717             # remove the AUTO key from the metadata
718 0         0 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
719 0 0       0 if (exists $self->{$i}{'AUTO'}) {
720 0         0 delete $self->{$i}{'AUTO'};
721             }
722             }
723 0 0       0 print "\n SGR FILE FORMAT ERROR: $error\n" unless $silence;
724             }
725             }
726            
727             # check file headers value because this may have changed
728             # this can happen when we reset bed/gff/vcf flags when we add columns
729 52 100 100     492 if (
    100 100        
      66        
      66        
      33        
      33        
      66        
      33        
730             $self->{'bed'} or
731             $self->{'gff'} or
732             $self->{'ucsc'} or
733             ($self->{'extension'} and $self->{'extension'} =~ /sgr/i)
734             ) {
735 29         64 $self->{'headers'} = 0;
736             }
737             elsif (
738             $self->{'bed'} == 0 and
739             $self->{'gff'} == 0 and
740             $self->{'ucsc'} == 0 and
741             ($self->{'extension'} and $self->{'extension'} !~ /sgr/i)
742             ) {
743 18 50       54 $self->{'headers'} = 1 unless $self->{'headers'} == -1;
744             }
745            
746             # if we have made it here, then there were no major structural problems
747             # file is verified, any minor issues should have been fixed
748 52         165 return 1;
749             }
750              
751             # internal method to check if a column is nothing but integers, i.e. start, stop
752             sub _column_is_integers {
753 40     40   71 my $self = shift;
754 40         99 my @index = @_;
755 40 100       107 return 1 if ($self->{last_row} == 0); # can't check if table is empty
756 38         87 foreach (@index) {
757 101 50       254 return 0 unless exists $self->{$_};
758             }
759 38         100 for my $row (1 .. $self->{last_row}) {
760 709         1044 for my $i (@index) {
761 1648 50       3945 return 0 unless ($self->{data_table}->[$row][$i] =~ /^\d+$/);
762             }
763             }
764 38         135 return 1;
765             }
766              
767             # internal method to check if a column appears numeric, i.e. scores
768             sub _column_is_numeric {
769 29     29   49 my $self = shift;
770 29         76 my @index = @_;
771 29 50       88 return 1 if ($self->{last_row} == 0); # can't check if table is empty
772 29         55 foreach (@index) {
773 45 50       116 return 0 unless exists $self->{$_};
774             }
775 29         63 for my $row (1 .. $self->{last_row}) {
776 327         473 for my $i (@index) {
777             # we have a very loose definition of numeric: exponents, signs, commas
778 407 50       1384 return 0 unless ($self->{data_table}->[$row][$i] =~ /^[\d\-\+\.,eE]+$/);
779             }
780             }
781 29         128 return 1;
782             }
783              
784              
785              
786             # internal method to check if a column is nothing but comma delimited integers
787             sub _column_is_comma_integers {
788 13     13   26 my $self = shift;
789 13         25 my @index = @_;
790 13 50       34 return 1 if ($self->{last_row} == 0); # can't check if table is empty
791 13         27 foreach (@index) {
792 26 50       64 return 0 unless exists $self->{$_};
793             }
794 13         31 for my $row (1 .. $self->{last_row}) {
795 110         159 for my $i (@index) {
796 220 50       564 return 0 unless ($self->{data_table}->[$row][$i] =~ /^[\d,]+$/);
797             }
798             }
799 13         48 return 1;
800             }
801              
802             # internal method to check if a column looks like a strand column
803             sub _column_is_stranded {
804 27     27   61 my ($self, $index) = @_;
805 27 50       74 return unless exists $self->{$index};
806 27         60 for my $row (1 .. $self->{last_row}) {
807 347 50       866 return 0 if ($self->{data_table}->[$row][$index] !~ /^(?:\-1|0|1|\+|\-|\.)$/);
808             }
809 27         90 return 1;
810             }
811              
812              
813              
814              
815              
816             #### Database methods ##############################################################
817              
818             sub open_database {
819 1     1 1 3 my $self = shift;
820 1 50 0     5 if (not defined $_[0]) {
    0          
    0          
821 1         5 return $self->open_meta_database;
822             }
823             elsif ($_[0] eq '0' or $_[0] eq '1') {
824             # likely a boolean value to indicate force
825 0         0 return $self->open_meta_database($_[0]);
826             }
827             elsif ($_[0] =~ /[a-zA-Z]+/) {
828             # likely the name of a database
829 0         0 return $self->open_new_database(@_);
830             }
831             else {
832             # original default
833 0         0 return $self->open_meta_database(@_);
834             }
835             }
836              
837             sub open_meta_database {
838 15     15 1 28 my $self = shift;
839 15   50     53 my $force = shift || 0;
840 15 50       35 return unless $self->{db};
841 15 50       44 return if $self->{db} =~ /^Parsed:/; # we don't open parsed annotation files
842 15 100       32 if (exists $self->{db_connection}) {
843 14 50       60 return $self->{db_connection} unless $force;
844             }
845 1         4 my $db = open_db_connection($self->{db}, $force);
846 1 50       3 return unless $db;
847 1         2 $self->{db_connection} = $db;
848 1         3 return $db;
849             }
850              
851             sub open_new_database {
852 0     0 1 0 my $self = shift;
853 0         0 my $database = shift;
854 0   0     0 my $force = shift || 0;
855 0         0 return open_db_connection($database, $force);
856             }
857              
858             sub verify_dataset {
859 12     12 1 27 my ($self, $dataset, $database) = @_;
860 12 50       25 return unless $dataset;
861 12 100       39 if (exists $self->{verfied_dataset}{$dataset}) {
862 11         34 return $self->{verfied_dataset}{$dataset};
863             }
864             else {
865 1 50       5 if ($dataset =~ /^(?:file|http|ftp)/) {
866             # local or remote file already verified?
867 0         0 $self->{verfied_dataset}{$dataset} = $dataset;
868 0         0 return $dataset;
869             }
870 1   33     3 $database ||= $self->open_meta_database;
871 1         6 my ($verified) = verify_or_request_feature_types(
872             # normally returns an array of verified features, we're only checking one
873             db => $database,
874             feature => $dataset,
875             );
876 1 50       4 if ($verified) {
877 1         4 $self->{verfied_dataset}{$dataset} = $verified;
878 1         3 return $verified;
879             }
880             }
881 0         0 return;
882             }
883              
884              
885              
886             #### Column Manipulation ####
887              
888             sub delete_column {
889 2     2 1 6 my $self = shift;
890            
891             # check for Stream
892 2 50       9 if (ref $self eq 'Bio::ToolBox::Data::Stream') {
893 0 0       0 unless ($self->mode) {
894 0         0 cluck "We have a read-only Stream object, cannot add columns";
895 0         0 return;
896             }
897 0 0       0 if (defined $self->{fh}) {
898             # Stream file handle is opened
899 0         0 cluck "Cannot modify columns when a Stream file handle is opened!";
900 0         0 return;
901             }
902             }
903 2 50       5 unless (@_) {
904 0         0 cluck "must provide a list";
905 0         0 return;
906             }
907            
908 2         7 my @deletion_list = sort {$a <=> $b} @_;
  0         0  
909 2         5 my @retain_list;
910 2         7 for (my $i = 0; $i < $self->number_columns; $i++) {
911             # compare each current index with the first one in the list of
912             # deleted indices. if it matches, delete. if not, keep
913 18 100       28 if ( $i == $deletion_list[0] ) {
914             # this particular index should be deleted
915 2         5 shift @deletion_list;
916             }
917             else {
918             # this particular index should be kept
919 16         32 push @retain_list, $i;
920             }
921             }
922 2         8 return $self->reorder_column(@retain_list);
923             }
924              
925             sub reorder_column {
926 3     3 1 6 my $self = shift;
927            
928             # check for Stream
929 3 50       10 if (ref $self eq 'Bio::ToolBox::Data::Stream') {
930 0 0       0 unless ($self->mode) {
931 0         0 cluck "We have a read-only Stream object, cannot add columns";
932 0         0 return;
933             }
934 0 0       0 if (defined $self->{fh}) {
935             # Stream file handle is opened
936 0         0 cluck "Cannot modify columns when a Stream file handle is opened!";
937 0         0 return;
938             }
939             }
940            
941             # reorder data table
942 3 50       8 unless (@_) {
943 0         0 carp "must provide a list";
944 0         0 return;
945             }
946 3         7 my @order = @_;
947 3         14 for (my $row = 0; $row <= $self->last_row; $row++) {
948 237         449 my @old = $self->row_values($row);
949 237         364 my @new = map { $old[$_] } @order;
  1580         2322  
950 237         308 splice( @{ $self->{data_table} }, $row, 1, \@new);
  237         785  
951             }
952            
953             # reorder metadata
954 3         7 my %old_metadata;
955 3         8 for (my $i = 0; $i < $self->number_columns; $i++) {
956             # copy the metadata info hash into a temporary hash
957 28         56 $old_metadata{$i} = $self->{$i};
958 28         51 delete $self->{$i}; # delete original
959             }
960 3         12 for (my $i = 0; $i < scalar(@order); $i++) {
961             # now copy back from the old_metadata into the main data hash
962             # using the new index number in the @order array
963             # must regenerate the hash, not just link to the old anonymous hash, in
964             # case we're duplicating columns
965 20         36 $self->{$i} = {};
966 20         23 foreach my $k (keys %{ $old_metadata{$order[$i]} }) {
  20         67  
967 50         102 $self->{$i}{$k} = $old_metadata{$order[$i]}{$k};
968             }
969             # assign new index number
970 20         49 $self->{$i}{'index'} = $i;
971             }
972 3         12 $self->{'number_columns'} = scalar @order;
973 3 100       14 delete $self->{column_indices} if exists $self->{column_indices};
974 3 50 66     9 if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
      66        
      33        
975             # check if we maintain integrity, at least insofar what we test
976 1         5 $self->verify(1); # silence so user doesn't get these messages
977             }
978 3         26 return 1;
979             }
980              
981              
982              
983             #### General Metadata ####
984              
985             sub feature {
986 52     52 1 87 my $self = shift;
987 52 100       115 if (@_) {
988 19         39 $self->{feature} = shift;
989             }
990 52         171 return $self->{feature};
991             }
992              
993             sub feature_type {
994 40     40 1 75 my $self = shift;
995 40 50       91 carp "feature_type is a read only method" if @_;
996 40 100       100 if (defined $self->{feature_type}) {
997 28         127 return $self->{feature_type};
998             }
999 12         22 my $feature_type;
1000 12 100 66     45 if (defined $self->chromo_column and defined $self->start_column) {
    50 0        
      33        
      0        
      0        
1001 7         17 $feature_type = 'coordinate';
1002             }
1003             elsif (defined $self->id_column or
1004             ( defined $self->type_column and defined $self->name_column ) or
1005             ( defined $self->feature and defined $self->name_column )
1006             ) {
1007 5         10 $feature_type = 'named';
1008             }
1009             else {
1010 0         0 $feature_type = 'unknown';
1011             }
1012 12         34 $self->{feature_type} = $feature_type;
1013 12         59 return $feature_type;
1014             }
1015              
1016             sub program {
1017 68     68 1 127 my $self = shift;
1018 68 100       257 if (@_) {
1019 60         178 $self->{program} = shift;
1020             }
1021 68         148 return $self->{program};
1022             }
1023              
1024             sub database {
1025 45     45 1 708 my $self = shift;
1026 45 100       107 if (@_) {
1027 19         80 $self->{db} = shift;
1028 19 0 33     66 if (exists $self->{db_connection} and $self->{db_connection}) {
1029 0         0 $self->open_meta_database(1);
1030             }
1031             }
1032 45         247 return $self->{db};
1033             }
1034              
1035             sub bam_adapter {
1036 0     0 1 0 my $self = shift;
1037 0         0 return use_bam_adapter(@_);
1038             }
1039              
1040             sub big_adapter {
1041 0     0 1 0 my $self = shift;
1042 0         0 return use_big_adapter(@_);
1043             }
1044              
1045             sub format {
1046 182     182 0 310 my $self = shift;
1047 182 100       387 if (defined $_[0]) {
1048 59         124 $self->{format} = $_[0];
1049             }
1050 182         706 return $self->{format};
1051             }
1052              
1053             sub gff {
1054 91     91 1 2425 my $self = shift;
1055 91 100 66     289 if (defined $_[0] and $_[0] =~ /^(?:0|1|2|2\.[2|5]|3)$/) {
1056 5         15 $self->{gff} = $_[0];
1057 5 50 33     40 if ($_[0] eq '2.2' or $_[0] eq '2.5') {
    50          
1058 0         0 $self->format('gtf');
1059             }
1060             elsif ($_[0] eq '3') {
1061 5         24 $self->format('gff3');
1062             }
1063             else {
1064 0         0 $self->format('gff');
1065             }
1066            
1067             }
1068 91         441 return $self->{gff};
1069             }
1070              
1071             sub bed {
1072 134     134 1 730 my $self = shift;
1073 134 100 66     510 if (defined $_[0] and $_[0] =~ /^\d+$/) {
1074 43         112 $self->{bed} = $_[0];
1075             }
1076 134         564 return $self->{bed};
1077             }
1078              
1079             sub ucsc {
1080 56     56 1 108 my $self = shift;
1081 56 100 66     143 if (defined $_[0] and $_[0] =~ /^\d+$/) {
1082 3         9 $self->{ucsc} = $_[0];
1083             }
1084 56         359 return $self->{ucsc};
1085             }
1086              
1087             sub vcf {
1088 39     39 1 65 my $self = shift;
1089 39 50 33     93 if (defined $_[0] and $_[0] =~ /^[\d\.]+$/) {
1090 0         0 $self->{vcf} = $_[0];
1091             }
1092 39         174 return $self->{vcf};
1093             }
1094              
1095             sub number_columns {
1096 552     552 1 8542 my $self = shift;
1097 552 50       1045 carp "number_columns is a read only method" if @_;
1098 552         1648 return $self->{number_columns};
1099             }
1100              
1101             sub last_column {
1102 0     0 1 0 my $self = shift;
1103 0 0       0 carp "last_column is a read only method" if @_;
1104 0         0 return $self->{number_columns} - 1;
1105             }
1106              
1107              
1108             sub last_row {
1109 892     892 1 3801 my $self = shift;
1110 892 50       1631 carp "last_row is a read only method" if @_;
1111 892         2246 return $self->{last_row};
1112             }
1113              
1114             sub filename {
1115 138     138 1 245 my $self = shift;
1116 138 50       331 carp "filename is a read only method. Use add_file_metadata()." if @_;
1117 138         491 return $self->{filename};
1118             }
1119              
1120             sub basename {
1121 29     29 1 2807 my $self = shift;
1122 29 50       85 carp "basename is a read only method. Use add_file_metadata()." if @_;
1123 29         138 return $self->{basename};
1124             }
1125              
1126             sub path {
1127 0     0 1 0 my $self = shift;
1128 0 0       0 carp "path is a read only method. Use add_file_metadata()." if @_;
1129 0         0 return $self->{path};
1130             }
1131              
1132             sub extension {
1133 124     124 1 202 my $self = shift;
1134 124 50       256 carp "extension() is a read only method. Use add_file_metadata()." if @_;
1135 124         552 return $self->{extension};
1136             }
1137              
1138              
1139              
1140             #### General Comments ####
1141              
1142             sub comments {
1143 26     26 1 53 my $self = shift;
1144 26         44 my @comments = @{ $self->{comments} };
  26         79  
1145 26         62 foreach (@comments) {s/[\r\n]+//g}
  25         151  
1146             # comments are not chomped when loading
1147             # side effect of dealing with rare commented header lines with null values at end
1148 26         98 return @comments;
1149             }
1150              
1151             sub add_comment {
1152 53     53 1 690 my $self = shift;
1153 53 50       133 my $comment = shift or return;
1154             # comment is not required to be prefixed with "# ", it will be added when saving
1155 53         73 push @{ $self->{comments} }, $comment;
  53         138  
1156 53         114 return 1;
1157             }
1158              
1159             sub delete_comment {
1160 1     1 1 1329 my $self = shift;
1161 1         3 my $index = shift;
1162 1 50       4 if (defined $index) {
1163 1         3 eval {splice @{$self->{comments}}, $index, 1};
  1         3  
  1         5  
1164             }
1165             else {
1166 0         0 $self->{comments} = [];
1167             }
1168             }
1169              
1170             sub vcf_headers {
1171 0     0 1 0 my $self = shift;
1172 0 0       0 return unless $self->vcf;
1173 0 0       0 return $self->{vcf_headers} if exists $self->{vcf_headers};
1174 0         0 my $headers = {};
1175 0         0 foreach my $comment ($self->comments) {
1176 0         0 my ($key, $value);
1177 0 0       0 if ($comment =~ /^##([\w\-\.]+)=(.+)$/) {
1178 0         0 $key = $1;
1179 0         0 $value = $2;
1180             }
1181             else {
1182             # invalid vcf header format!?
1183 0         0 next;
1184             }
1185 0 0       0 if ($value !~ /^<.+>$/) {
1186             # simple value
1187 0         0 $headers->{$key} = $value;
1188             }
1189             else {
1190             # process complex values
1191             # extract ID with regex which should have
1192 0         0 my $id = ($value =~ /ID=([\w\-\.:]+)/)[0];
1193 0         0 $headers->{$key}{$id} = $value;
1194             }
1195             }
1196            
1197             # store and return
1198 0         0 $self->{vcf_headers} = $headers;
1199 0         0 return $headers;
1200             }
1201              
1202             sub rewrite_vcf_headers {
1203 0     0 1 0 my $self = shift;
1204 0 0       0 return unless $self->vcf;
1205 0 0       0 return unless exists $self->{vcf_headers};
1206 0         0 my @newComments;
1207            
1208             # file format always comes first
1209             push @newComments, sprintf("##fileformat=%s\n",
1210 0         0 $self->{vcf_headers}{fileformat});
1211            
1212             # common attributes
1213 0         0 foreach my $key (sort {$a cmp $b} keys %{ $self->{vcf_headers} } ) {
  0         0  
  0         0  
1214 0 0       0 next if $key eq 'fileformat';
1215 0 0       0 if (ref $self->{vcf_headers}{$key} eq 'HASH') {
1216             # we have a complex VCF header with multiple keys
1217             # we will rewrite for each ID
1218 0         0 foreach my $id (sort {$a cmp $b}
  0         0  
1219 0         0 keys %{ $self->{vcf_headers}{$key} }
1220             ) {
1221             # to avoid complexity of writing correct formatting
1222             push @newComments, sprintf("##%s=%s\n", $key,
1223 0         0 $self->{vcf_headers}{$key}{$id} );
1224             }
1225             }
1226             else {
1227             # a simple value
1228             push @newComments, sprintf("##%s=%s\n", $key,
1229 0         0 $self->{vcf_headers}{$key} );
1230             }
1231             }
1232            
1233             # replace the headers
1234 0         0 $self->{comments} = \@newComments;
1235             }
1236              
1237              
1238              
1239             #### Column Metadata ####
1240              
1241             sub list_columns {
1242 4     4 1 11 my $self = shift;
1243 4 50       13 carp "list_columns is a read only method" if @_;
1244 4         9 my @list;
1245 4         14 for (my $i = 0; $i < $self->number_columns; $i++) {
1246 26         67 push @list, $self->{$i}{'name'};
1247             }
1248 4 100       27 return wantarray ? @list : \@list;
1249             }
1250              
1251             sub name {
1252 115     115 1 2781 my $self = shift;
1253 115         210 my ($index, $new_name) = @_;
1254 115 50       251 return unless defined $index;
1255 115 50       246 return unless exists $self->{$index}{name};
1256 115 100       208 if (defined $new_name) {
1257 1         4 $self->{$index}{name} = $new_name;
1258 1 50       6 if (exists $self->{data_table}) {
    0          
1259 1         4 $self->{data_table}->[0][$index] = $new_name;
1260             }
1261             elsif (exists $self->{column_names}) {
1262 0         0 $self->{column_names}->[$index] = $new_name;
1263             }
1264             }
1265 115         421 return $self->{$index}{name};
1266             }
1267              
1268             sub metadata {
1269 24     24 1 44 my $self = shift;
1270 24         47 my ($index, $key, $value) = @_;
1271 24 50       51 return unless defined $index;
1272 24 50       57 return unless exists $self->{$index};
1273 24 100 100     62 if ($key and $key eq 'name') {
1274 2         8 return $self->name($index, $value);
1275             }
1276 22 100 66     71 if ($key and defined $value) {
    50 33        
1277             # we are setting a new value
1278 1         3 $self->{$index}{$key} = $value;
1279 1         3 return $value;
1280             }
1281             elsif ($key and not defined $value) {
1282 0 0       0 if (exists $self->{$index}{$key}) {
1283             # retrieve a value
1284 0         0 return $self->{$index}{$key};
1285             }
1286             else {
1287             # key does not exist
1288 0         0 return;
1289             }
1290             }
1291             else {
1292 21         31 my %hash = %{ $self->{$index} };
  21         78  
1293 21 100       105 return wantarray ? %hash : \%hash;
1294             }
1295             }
1296              
1297             sub delete_metadata {
1298 0     0 1 0 my $self = shift;
1299 0         0 my ($index, $key) = @_;
1300 0 0       0 return unless defined $index;
1301 0 0       0 if (defined $key) {
1302 0 0       0 if (exists $self->{$index}{$key}) {
1303 0         0 return delete $self->{$index}{$key};
1304             }
1305             }
1306             else {
1307             # user wants to delete the metadata
1308             # but we need to keep the basics name and index
1309 0         0 foreach my $key (keys %{ $self->{$index} }) {
  0         0  
1310 0 0       0 next if $key eq 'name';
1311 0 0       0 next if $key eq 'index';
1312 0         0 delete $self->{$index}{$key};
1313             }
1314             }
1315             }
1316              
1317             sub copy_metadata {
1318 2     2 1 5 my ($self, $source, $target) = @_;
1319 2 50 33     14 return unless (exists $self->{$source}{name} and exists $self->{$target}{name});
1320 2         29 my $md = $self->metadata($source);
1321 2         4 delete $md->{name};
1322 2         4 delete $md->{'index'};
1323 2 50       8 delete $md->{'AUTO'} if exists $md->{'AUTO'}; # presume this is no longer auto index
1324 2         6 foreach (keys %$md) {
1325 1         3 $self->{$target}{$_} = $md->{$_};
1326             }
1327 2         7 return 1;
1328             }
1329              
1330              
1331              
1332             #### Column Indices ####
1333              
1334             sub find_column {
1335 187     187 1 375 my ($self, $name) = @_;
1336 187 50       367 return unless $name;
1337            
1338             # the $name variable will be used as a regex in identifying the name
1339             # fix it so that it will possible accept a # character at the beginning
1340             # without a following space, in case the first column has a # prefix
1341             # also place the remainder of the text in a non-capturing parentheses for
1342             # grouping purposes while maintaining the anchors
1343 187         1207 $name =~ s/ \A (\^?) (.+) (\$?)\Z /$1#?(?:$2)$3/x;
1344            
1345             # walk through each column index
1346 187         336 my $index;
1347 187         437 for (my $i = 0; $i < $self->{'number_columns'}; $i++) {
1348             # check the names of each column
1349 599 100       6882 if ($self->{$i}{'name'} =~ /$name/i) {
1350 84         237 $index = $i;
1351 84         129 last;
1352             }
1353             }
1354 187         498 return $index;
1355             }
1356              
1357             sub _find_column_indices {
1358 23     23   42 my $self = shift;
1359             # these are hard coded index name regex to accomodate different possibilities
1360             # these do not include parentheses for grouping
1361             # non-capturing parentheses will be added later in the sub for proper
1362             # anchoring and grouping - long story why, don't ask
1363 23         72 my $name = $self->find_column('^name|gene.?name|transcript.?name|geneid|id|gene|alias');
1364 23         134 my $type = $self->find_column('^type|class|primary_tag|biotype');
1365 23         72 my $id = $self->find_column('^primary_id');
1366 23         71 my $chromo = $self->find_column('^chr|seq|ref|ref.?seq');
1367 23         61 my $start = $self->find_column('^start|position|pos|txStart');
1368 23         61 my $stop = $self->find_column('^stop|end|txEnd');
1369 23         57 my $strand = $self->find_column('^strand');
1370 23         66 my $score = $self->find_column('^score$');
1371             $self->{column_indices} = {
1372 23         216 'name' => $name,
1373             'type' => $type,
1374             'id' => $id,
1375             'seq_id' => $chromo,
1376             'chromo' => $chromo,
1377             'start' => $start,
1378             'stop' => $stop,
1379             'end' => $stop,
1380             'strand' => $strand,
1381             'score' => $score,
1382             };
1383 23         51 return 1;
1384             }
1385              
1386             sub chromo_column {
1387 37     37 1 73 my $self = shift;
1388 37 50       86 carp "chromo_column is a read only method" if @_;
1389 37 100       114 $self->_find_column_indices unless exists $self->{column_indices};
1390 37         170 return $self->{column_indices}{chromo};
1391             }
1392              
1393             sub start_column {
1394 118     118 1 193 my $self = shift;
1395 118 50       217 carp "start_column is a read only method" if @_;
1396 118 100       229 $self->_find_column_indices unless exists $self->{column_indices};
1397 118         307 return $self->{column_indices}{start};
1398             }
1399              
1400             sub stop_column {
1401 33     33 1 61 my $self = shift;
1402 33 50       79 carp "stop_column is a read only method" if @_;
1403 33 50       78 $self->_find_column_indices unless exists $self->{column_indices};
1404 33         99 return $self->{column_indices}{stop};
1405             }
1406              
1407             sub end_column {
1408 0     0 1 0 return shift->stop_column;
1409             }
1410              
1411             sub strand_column {
1412 32     32 1 58 my $self = shift;
1413 32 50       87 carp "strand_column is a read only method" if @_;
1414 32 100       101 $self->_find_column_indices unless exists $self->{column_indices};
1415 32         96 return $self->{column_indices}{strand};
1416             }
1417              
1418             sub name_column {
1419 72     72 1 110 my $self = shift;
1420 72 50       136 carp "name_column is a read only method" if @_;
1421 72 100       154 $self->_find_column_indices unless exists $self->{column_indices};
1422 72         201 return $self->{column_indices}{name};
1423             }
1424              
1425             sub type_column {
1426 12     12 1 25 my $self = shift;
1427 12 50       33 carp "type_column is a read only method" if @_;
1428 12 100       44 $self->_find_column_indices unless exists $self->{column_indices};
1429 12         40 return $self->{column_indices}{type};
1430             }
1431              
1432             sub id_column {
1433 70     70 1 107 my $self = shift;
1434 70 50       126 carp "id_column is a read only method" if @_;
1435 70 50       139 $self->_find_column_indices unless exists $self->{column_indices};
1436 70         160 return $self->{column_indices}{id};
1437             }
1438              
1439             sub score_column {
1440 0     0 1 0 my $self = shift;
1441 0 0       0 carp "score_column is a read only method" if @_;
1442 0 0       0 $self->_find_column_indices unless exists $self->{column_indices};
1443 0         0 return $self->{column_indices}{score};
1444             }
1445              
1446              
1447             #### Special Row methods ####
1448              
1449             # Why is this in core and not in Data? I keep asking myself.
1450             # Because this can get called from a Data::Feature object, which might be
1451             # associated with a Data::Stream object. No, Stream objects don't have stored
1452             # SeqFeatures, but I don't want the entire program to crash because of an
1453             # undefined method because some doofus forgot. Since both Data and Stream
1454             # objects inherit from Data::core, this is in here.
1455             sub get_seqfeature {
1456 13     13 1 58 my ($self, $row) = @_;
1457 13 50 33     79 return unless ($row and $row <= $self->{last_row});
1458 13 50       44 return unless exists $self->{SeqFeatureObjects};
1459 13   50     47 return $self->{SeqFeatureObjects}->[$row] || undef;
1460             }
1461              
1462              
1463              
1464             __END__