File Coverage

blib/lib/Bio/ToolBox/Data.pm
Criterion Covered Total %
statement 334 622 53.7
branch 115 300 38.3
condition 65 181 35.9
subroutine 30 35 85.7
pod 22 22 100.0
total 566 1160 48.7


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data;
2             our $VERSION = '1.69';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data - Reading, writing, and manipulating data structure
7              
8             =head1 SYNOPSIS
9              
10             use Bio::ToolBox::Data;
11            
12             ### Open a pre-existing file
13             # a data table with same columns as input file
14             my $Data = Bio::ToolBox::Data->new(
15             file => 'coordinates.bed',
16             );
17            
18             ### Parse a GTF, GFF3, refFlat, or genePred gene table
19             # data table with names and references to fully parsed
20             # SeqFeature transcript objects with exon SeqFeature subfeatures
21             my $Data = Bio::ToolBox::Data->new(
22             file => 'annotation.gtf.gz',
23             parse => 1,
24             feature => 'transcript',
25             subfeature => 'exon'
26             );
27            
28             ### New gene list from a Bio::DB::SeqFeature::Store database
29             # data table with name and reference ID to database SeqFeature objects
30             my $Data = Bio::ToolBox::Data->new(
31             db => 'hg19.sqlite',
32             feature => 'gene:ensGene',
33             );
34            
35            
36             ### Get a specific value
37             my $value = $Data->value($row, $column);
38            
39             ### Replace or add a value
40             $Data->value($row, $column, $new_value);
41            
42             ### Add a column
43             my $new_index = $Data->add_column('Data2');
44            
45             ### Find a column
46             my $old_index = $Data->find_column('Data1');
47            
48             ### Return a specific row as an object
49             my $row = $Data->get_row($i); # Bio::ToolBox::Data::Feature
50            
51             ### Iterate through a Data structure one row at a time
52             my $stream = $Data->row_stream;
53             while (my $row = $stream->next_row) {
54             # each row is returned as Bio::ToolBox::Data::Feature object
55             # get the positional information from the file data
56             # assuming that the input file had these identifiable columns
57             my $seq_id = $row->seq_id;
58             my $start = $row->start;
59             my $stop = $row->end;
60            
61             # work with the referenced SeqFeature object
62             my $seqf = $row->seqfeature;
63            
64             # generate a new Bio::Seq object from the database using
65             # these coordinates
66             my $region = $db->segment($seq_id, $start, $stop);
67            
68             # modify a row value
69             my $value = $row->value($old_index);
70             my $new_value = $value + 1;
71             $row->value($new_index, $new_value);
72             }
73            
74             ### Iterate through a Data table with a code reference
75             $Data->iterate(\&my_code);
76            
77            
78             ### write the data to file
79             my $success = $Data->write_file(
80             filename => 'new_data.txt',
81             gz => 1,
82             );
83             print "wrote new file $success\n"; # file is new_data.txt.gz
84              
85             =head1 DESCRIPTION
86              
87             This module works with the primary Bio::ToolBox Data structure. Simply, it
88             is a complex data structure representing a tabbed-delimited table (array
89             of arrays), with plenty of options for metadata. Many common bioinformatic
90             file formats are simply tabbed-delimited text files (think BED, GFF, VCF).
91             Each row is a feature or genomic interval, and each column is a piece of
92             information about that feature, such as name, type, and/or coordinates.
93             We can append to that file additional columns of information, perhaps
94             scores from genomic data sets. We can record metadata regarding how
95             and where we obtained that data. Finally, we can write the updated
96             table to a new file.
97              
98             =head1 METHODS
99              
100             =head2 Initializing the structure
101              
102             =over 4
103              
104             =item new
105              
106             Initialize a new Data structure. This generally requires options,
107             provided as an array of key =E<gt> values. A new list of features
108             may be obtained from an annotation database or an existing file
109             may be loaded. If you do not pass any options, a new empty
110             structure will be generated for you to populate.
111              
112             These are the options available.
113              
114             =over 4
115              
116             =item file
117              
118             =item in
119              
120             my $Data = Bio::ToolBox::Data->new(file => $file);
121              
122             Provide the path and name to an existing tabbed-delimited text
123             file from which to load the contents. This is a shortcut to the
124             load_file() method. See that method for more details.
125              
126             =item stream
127              
128             my $Data = Bio::ToolBox::Data->new(file => $file, stream => 1);
129              
130             Boolean option indicating that the file should be opened as a file
131             stream. A L<Bio::ToolBox::Data::Stream> object will be returned. This
132             is a convenience method.
133              
134             =item noheader
135              
136             my $Data = Bio::ToolBox::Data->new(file => $file, noheader => 1);
137              
138             Boolean option indicating that the file does not have file headers,
139             in which case dummy headers are provided. This is not necessary for
140             defined file types that don't normally have file headers, such as
141             BED, GFF, or UCSC files.
142              
143             =item parse
144              
145             my $Data = Bio::ToolBox::Data->new(file => $file, parse => 1);
146              
147             Boolean option indicating that a gene annotation table or file should
148             be parsed into SeqFeature objects and a general table of names and IDs
149             representing those objects be generated. The annotation file may
150             be specified in one of two ways: Through the file option above,
151             or in the database metadata of an existing table file representing
152             previously parsed objects.
153              
154             =item db
155              
156             my $Data = Bio::ToolBox::Data->new(db => 'hg19', feature => 'gene');
157              
158             Provide the name of the database from which to collect the
159             features. It may be a short name, whereupon it is checked in
160             the L configuration file F<.biotoolbox.cfg> for
161             connection information. Alternatively, a path to a database
162             file or directory may be given.
163              
164             If you already have an opened L database
165             object, you can simply pass that. See L for
166             more information. However, this in general should be discouraged,
167             since the name of the database will not be properly recorded when
168             saving to file. It is better to simply pass the name of database
169             again; multiple connections to the same database are smartly handled
170             in the background.
171              
172             =item feature
173              
174             my $Data = Bio::ToolBox::Data->new(file => $filename, feature => 'gene');
175              
176             For de novo lists from an annotation database, provide the GFF
177             type or type:source (columns 3 and 2) for collection. A comma
178             delimited string may be accepted for databases. For parsed files,
179             only a simple string is accepted.
180              
181             For a list of genomic intervals across the genome, specify a
182             feature of 'genome' with a database object.
183              
184             =item subfeature
185              
186             When parsing annotation files such as GTF, one or more subfeature
187             types, e.g. C<exon> or C<cds>, may be specified as a comma-delimited
188             string. This ensures that the subfeatures will be parsed into
189             SeqFeature objects. Otherwise, only the top level features will be
190             parsed. This expedites parsing by skipping unwanted features.
191              
192             =item win
193              
194             =item step
195              
196             my $Data = Bio::ToolBox::Data->new(db => $dbname, win => $integer);
197              
198             If generating a list of genomic intervals, optionally provide
199             the window and step values. The default is 500 bp.
200              
201             =item chrskip
202              
203             Provide a regular expression compatible or C<not> string for skipping or
204             excluding specific or classes of chromosomes, for example the mitochondrial
205             chromosome or unmapped contigs. This works with both feature collection
206             and genomic intervals. The default is to take all chromosomes.
207              
208             =back
209              
210             When no file is given or database given to search, then a new,
211             empty Data object is returned. In this case, you may optionally
212             specify the names of the columns or indicate a specific file
213             format structure to assign column names. The following options can
214             then be provided.
215              
216             =over 4
217              
218             =item columns
219              
220             =item datasets
221              
222             my $Data = Bio::ToolBox::Data->new(columns => [qw(Column1 Column2 ...)] );
223              
224             Provide the column names in advance as an anonymous array.
225              
226             =item gff
227              
228             Pass the GFF version of the file: 1, 2, 2.5 (GTF), or 3.
229              
230             =item bed
231              
232             Pass the number of BED columns (3-12).
233              
234             =item ucsc
235              
236             Pass the number of columns to indicate the type of UCSC format. These
237             include 10 (refFlat without gene names), 11 (refFlat with gene names),
238             12 (knownGene gene prediction table), and 15
239             (an extended gene prediction or genePredExt table).
240              
241             =back
242              
243             If successful, the method will return a Bio::ToolBox::Data object.
244              
245             =item duplicate
246              
247             This will create a new Data object containing the same column
248             headers and metadata, but lacking the table content, i.e. no
249             rows of data. File name metadata, if present in the original, is
250             not preserved. The purpose here, for example, is to allow one
251             to selectively copy rows from one Data object to another.
252              
253             =item parse_table
254              
255             $Data->parse_table($file)
256             $Data->parse_table( {
257             file => $file,
258             feature => 'gene',
259             subfeature => 'exon',
260             chrskip => 'chrM|contig',
261             } );
262              
263             This will parse a gene annotation table into SeqFeature objects.
264             If this is called from an empty Data object, then the table will
265             be filled with the SeqFeature object names and IDs. If this is
266             called from a non-empty Data object, then the table's contents
267             will be associated with the SeqFeature objects using their name and
268             ID. The stored SeqFeature objects can be retrieved using the
269             L method.
270              
271             Pass the method a single argument. This may be either a simple
272             scalar to a filename to be parsed, or a reference to hash of
273             one or more argument options. Possible options include:
274              
275             =over 4
276              
277             =item file
278              
279             The file name of the gene table to be parsed. This may
280             be a GFF, GFF3, GTF, or any of the UCSC gene table formats.
281             These will be parsed using Bio::ToolBox::parser::* adapters.
282              
283             =item feature
284              
285             A regular expression compatible string or C<not> string to match
286             the top features C<primary_tag> to keep. The C<source_tag> is not
287             checked. The default is 'gene', although a transcript type could
288             alternatively be specified (in which case transcripts are not
289             parsed in gene features).
290              
291             =item subfeature
292              
293             A regular expression compatible string or C<not> string to match
294             any sub features C<primary_tag> to parse. The C<source_tag> is not checked.
295             Typically these include exon, CDS, or UTR. The default is nothing.
296              
297             =item chrskip
298              
299             A regular expression compatible string or C<not> string to match
300             chromosomes to be skipped or excluded. Any feature with a matching
301             C<seq_id> chromosome name will be skipped.
302              
303             =back
304              
305             =back
306              
307             =head2 General Metadata
308              
309             There is a variety of general metadata regarding the Data
310             structure. The following methods may be used to access or set these
311             metadata properties. Some of these are stored as comment lines at
312             the beginning of the file, and will be read or set when the file is
313             loaded.
314              
315             =over
316              
317             =item feature
318              
319             $Data->feature('something');
320             my $feature = $Data->feature;
321              
322             Returns or sets the name of the features used to collect
323             the list of features. The actual feature types are listed
324             in the table, so this metadata is merely descriptive.
325              
326             =item feature_type
327              
328             Returns one of three specific values describing the contents
329             of the data table inferred by the presence of specific column
330             names. This provides a clue as to whether the table features
331             represent genomic regions (defined by coordinate positions) or
332             named database features. The return values include:
333              
334             =over 4
335              
336             =item * coordinate
337              
338             Table includes at least chromosome and start columns.
339              
340             =item * named
341              
342             Table includes name, type, and/or Primary_ID, possibly
343             referring to named database features.
344              
345             =item * unknown
346              
347             Table is unrecognized format.
348              
349             =back
350              
351             =item program
352              
353             Returns or sets the name of the program generating the list.
354              
355             =item database
356              
357             Returns or sets the name or path of the database from which the
358             features were derived.
359              
360             =item gff
361              
362             Returns or sets the version of loaded GFF files. Supported versions
363             included 1, 2, 2.5 (GTF), and 3.
364              
365             =item bed
366              
367             Returns or sets the BED file version. Here, the BED version is simply
368             the number of columns.
369              
370             =item ucsc
371              
372             Returns or sets the UCSC file format version. Here, the version is
373             simply the number of columns. Supported versions include 10 (gene
374             prediction), 11 (refFlat, or gene prediction with gene name), 12
375             (knownGene table), 15 (extended gene prediction), or 16 (extended
376             gene prediction with bin).
377              
378             =item vcf
379              
380             Returns or sets the VCF file version number. VCF support is limited.
381              
382             =back
383              
384             =head2 File information
385              
386             These methods provide information about the file from which the
387             data table was loaded. This does not include parsed annotation tables.
388              
389             =over 4
390              
391             =item filename
392              
393             =item path
394              
395             =item basename
396              
397             =item extension
398              
399             Returns the filename, full path, basename, and extension of
400             the filename. Concatenating the last three values will reconstitute
401             the first original filename.
402              
403             =item add_file_metadata
404              
405             $Data->add_file_metadata('/path/to/file.txt');
406              
407             Add filename metadata. This will automatically parse the path,
408             basename, and recognized extension from the passed filename and
409             set the appropriate metadata attributes.
410              
411             =back
412              
413             =head2 Metadata comments
414              
415             Comments are any other commented lines from a text file (lines
416             beginning with a #) that were not parsed as general metadata.
417              
418             =over 4
419              
420             =item comments
421              
422             Returns a copy of the array containing commented lines. Each
423             comment line becomes an element in the array.
424              
425             =item add_comment
426              
427             Appends the text string to the comment array.
428              
429             =item delete_comment
430              
431             Deletes a comment. Provide the array index of the comment to
432             delete. If an index is not provided, ALL comments will be deleted!
433              
434             =item vcf_headers
435              
436             For VCF files, this will partially parse the VCF headers into a
437             hash structure that can be queried or manipulated. Each header
438             line is parsed for the primary key, being the first word after the
439             ## prefix, e.g. INFO, FORMAT, FILTER, contig, etc. For the simple
440             values, they are stored as the value. For complex entries, such as
441             with INFO and FORMAT, a second level hash is created with the ID
442             extracted and used as the second level key. The value is always the
443             remainder of the string.
444              
445             For example, the following would be a simple parsed vcf header in
446             code representation.
447              
448             $vcf_header = {
449             FORMAT => {
450             GT => q(ID=GT,Number=1,Type=String,Description="Genotype"),
451             AD => q(ID=AD,Number=.,Type=Integer,Description="ref,alt Allelic depths"),
452             },
453             fileDate => 20150715,
454             }
455              
456             =item rewrite_vcf_headers
457              
458             If you have altered the vcf headers exported by the vcf_headers()
459             method, then this method will rewrite the hash structure as new
460             comment lines. Do this prior to writing or saving the Data structure
461             or else you will lose your changed VCF header metadata.
462              
463             =back
464              
465             =head2 The Data table
466              
467             The Data table is the array of arrays containing all of the
468             actual information. Rows and columns are indexed using 0-based
469             indexing as with all Perl arrays. Row 0 is always the column
470             header row containing the column names, regardless whether an
471             actual header name existed in the original file format (e.g.
472             BED or GFF formats). Any individual table "cell" can be
473             specified as C<[$row][$column]>.
474              
475             =over 4
476              
477             =item list_columns
478              
479             Returns an array or array reference of the column names
480             in ascending (left to right) order.
481              
482             =item number_columns
483              
484             Returns the number of columns in the Data table.
485              
486             =item number_rows
487              
488             Returns the number of rows in the Data table.
489              
490             =item last_column
491              
492             Returns the array index number for the last (right most)
493             column. This number is always 1 less than the value
494             returned by number_columns() due to 0-based indexing.
495              
496             =item last_row
497              
498             Returns the array index number of the last row.
499             Since the header row is index 0, this is also the
500             number of actual content rows.
501              
502             =item column_values
503              
504             my $values = $Data->column_values($i);
505              
506             Returns an array or array reference representing the values
507             in the specified column. This includes the column header as
508             the first element. Pass the method the column index.
509              
510             =item add_column
511              
512             # add empty column with name
513             my $i = $Data->add_column($name);
514            
515             # add entire column
516             my $i = $Data->add_column(\@array);
517              
518             Appends a new column to the Data table at the
519             rightmost position (highest index). It adds the column
520             header name and creates a new column metadata hash.
521             Pass the method one of two possibilities. Pass a text
522             string representing the new column name, in which case
523             no data will be added to the column. Alternatively, pass
524             an array reference, and the contents of the array will
525             become the column data. If the Data table already has
526             rows, then the passed array reference must have the same
527             number of elements.
528              
529             It returns the new column index if successful.
530              
531             =item copy_column
532              
533             my $j = $Data->copy_column($i);
534              
535             This will copy a column, appending the duplicate column at
536             the rightmost position (highest index). It will duplicate
537             column metadata as well. It will return the new index
538             position.
539              
540             =item delete_column
541              
542             $Data->delete_column($i, $j);
543              
544             Deletes one or more specified columns. Any remaining
545             columns rightwards will have their indices shifted
546             down appropriately. If you had identified one of the
547             shifted columns, you may need to re-find or calculate
548             its new index.
549              
550             =item reorder_column
551              
552             $Data->reorder_column($c,$b,$a,$a);
553              
554             Reorders columns into the specified order. Provide the
555             new desired order of indices. Columns could be duplicated
556             or deleted using this method. The columns will adopt their
557             new index numbers.
558              
559             =item add_row
560              
561             $Data->add_row(\@values);
562             $Data->add_row($Row); # Bio::ToolBox::Data::Feature object
563              
564             Add a new row of data values to the end of the Data table.
565             Optionally provide either a reference to an array of values
566             to put in the row, or pass a L<Bio::ToolBox::Data::Feature>
567             Row object, such as one obtained from another Data object.
568             If the number of columns do not match, the array is filled
569             up with null values for missing columns, or excess values
570             are dropped.
571              
572             =item get_row
573              
574             $Data->get_row($i); # Bio::ToolBox::Data::Feature object
575              
576             Returns the specified row as a L<Bio::ToolBox::Data::Feature>
577             object.
578              
579             =item delete_row
580              
581             $Data->delete_row($i, $j);
582              
583             Deletes one or more specified rows. Rows are spliced out
584             highest to lowest index to avoid issues. Be very careful
585             deleting rows while simultaneously iterating through the
586             table!
587              
588             =item row_values
589              
590             my $row_values = $Data->row_values($i);
591              
592             Returns a copy of an array for the specified row index.
593             Modifying this returned array does not migrate back to the
594             Data table; use the L</value> method below instead.
595              
596             =item value
597              
598             my $value = $Data->value($row, $column);
599             $Data->value($row, $column, $new_value);
600              
601             Returns or sets the value at a specific row or column index.
602             Index positions are 0-based (header row is index 0).
603              
604             =back
605              
606             =head2 Column Metadata
607              
608             Each column has metadata. Each metadata is a series of key =E<gt>
609             value pairs. The minimum keys are 'index' (the 0-based index
610             of the column) and 'name' (the column header name). Additional
611             keys and values may be queried or set as appropriate. When the
612             file is written, these are stored as commented metadata lines at
613             the beginning of the file.
614              
615             =over 4
616              
617             =item name
618              
619             $Data->name($index, $new_name);
620             my $name = $Data->name($i);
621              
622             Convenient method to return the name of the column given the
623             index number. A column may also be renamed by passing a new name.
624              
625             =item metadata
626              
627             $Data->metadata($index, $key, $new_value);
628             my $value = $Data->metadata($index, $key)
629              
630             Returns or sets the metadata value for a specific $key for a
631             specific column $index.
632              
633             This may also be used to add a new metadata key. Simply provide
634             the name of a new $key that is not present.
635              
636             If no key is provided, then a hash or hash reference is returned
637             representing the entire metadata for that column.
638              
639             =item copy_metadata
640              
641             $Data->copy_metadata($source, $target);
642              
643             This method will copy the metadata (everything except name and
644             index) between the source column and target column. Returns 1 if
645             successful.
646              
647             =item delete_metadata
648              
649             $Data->delete_metadata($index, $key);
650              
651             Deletes a column-specific metadata $key and value for a specific
652             column $index. If a $key is not provided, then all metadata keys
653             for that index will be deleted.
654              
655             =item find_column
656              
657             my $i = $Data->find_column('Gene');
658             my $i = $Data->find_column('^Gene$')
659              
660             Searches the column names for the specified column name. This
661             employs a case-insensitive grep search, so simple substitutions
662             may be made.
663              
664             =item chromo_column
665              
666             =item start_column
667              
668             =item stop_column
669              
670             =item strand_column
671              
672             =item name_column
673              
674             =item type_column
675              
676             =item id_column
677              
678             These methods will return the identified column best matching
679             the description. Returns C<undef> if that column is not present.
680             These use the L</find_column> method with a predefined list of
681             aliases.
682              
683             =back
684              
685             =head2 Working with databases
686              
687             These are methods for working primarily with annotation databases.
688              
689             =over 4
690              
691             =item open_database
692              
693             This is a wrapper method that tries to do the right thing and passes
694             on to either L</open_meta_database> or L</open_new_database> methods.
695             Basically a legacy method for L</open_meta_database>.
696              
697             =item open_meta_database
698              
699             Open the database that is listed in the metadata. Returns the
700             database connection. Pass a true value to force a new database
701             connection to be opened, rather than returning a cached connection
702             object (useful when forking).
703              
704             =item open_new_database
705              
706             Convenience method for opening a second or new database that is
707             not specified in the metadata, useful for data collection. This
708             is a shortcut to L.
709             Pass the database name.
710              
711             =back
712              
713             =head2 SeqFeature Objects
714              
715             SeqFeature objects corresponding to data rows can be stored in the Data
716             object. This can be useful if the SeqFeature object is not readily
717             available from a database or is processor intensive in generating or
718             parsing. Note that storing large numbers of objects will increase memory
719             usage.
720              
721             SeqFeature objects are usually either L,
722             L, or L objects, depending
723             upon their source. More information can obtained from the L
724             abstract API.
725              
726             =over 4
727              
728             =item store_seqfeature
729              
730             $Data->store_seqfeature($row_index, $seqfeature);
731              
732             Stores the SeqFeature object for the given row index. Only one SeqFeature
733             object can be stored per row.
734              
735             =item get_seqfeature
736              
737             my $feature = $Data->get_seqfeature($row_index);
738              
739             Retrieves the SeqFeature object for the given row index.
740              
741             =item delete_seqfeature
742              
743             $Data->delete_seqfeature($row_index);
744              
745             Removes the SeqFeature object for the given row index.
746              
747             =item collapse_gene_transcripts
748              
749             my $success = $Data->collapse_gene_transcripts;
750              
751             This method will iterate through a Data table and collapse multiple alternative
752             transcript subfeatures of stored gene SeqFeature objects in the table. Exons
753             of multiple transcripts will be merged, maximizing exon size and minimizing
754             intron size. Genes with only one transcript will not be affected. Stored
755             SeqFeature objects that do not have a C<primary_tag> of "gene" are silently
756             skipped. Refer to the L method
757             for more details. The number of rows successfully collapsed is returned.
758              
759             =item add_transcript_length
760              
761             my $length_index = $Data->add_transcript_length;
762             my $length_index = $Data->add_transcript_length('cds');
763              
764             This method will generate a new column in the Data table representing the
765             length of a transcript, i.e. the sum of the length of subfeatures for
766             the stored SeqFeature object in the Data table. The default subfeature is
767             C; however, alternative subfeature types may be passed to the method
768             and used. These include C, C<5p_utr>, and C<3p_utr> for CDS, the 5C<'> UTR,
769             and the 3C<'> UTR, respectively. See the corresponding transcript length
770             methods in L for more information. If a length
771             is not calculated, for example the feature C is a "gene",
772             then the simple length of the feature is recorded.
773              
774             The name of the new column is one of "Merged_Transcript_Length" for exon,
775             "Transcript_CDS_Length", "Transcript_5p_UTR_Length", or "Transcript_3p_UTR_Length".
776             The index of the new length column is returned.
777              
778             =back
779              
780             =head2 Data Table Functions
781              
782             These methods alter the Data table en masse.
783              
784             =over 4
785              
786             =item verify
787              
788             This method will verify the Data structure, including the metadata and the
789             Data table. It ensures that the table has the correct number of rows and
790             columns as described in the metadata, and that each column has the basic
791             metadata.
792              
793             If the Data structure is marked as a GFF or BED structure, then the table
794             is checked that the structure matches the proper format. If not, for
795             example when additional columns have been added, then the GFF or BED value
796             is set to null.
797              
798             This method is automatically called prior to writing the Data table to file.
799              
800             =item sort_data
801              
802             $Data->sort_data($index, 'i'); # increasing sort
803              
804             This method will sort the Data table by the values in the indicated column.
805             It will automatically determine whether the contents of the column are
806             numbers or alphanumeric, and will sort accordingly, either numerically or
807             asciibetically. The first non-null value in the column is used to determine.
808             The sort may fail if the values are not consistent. Pass a second optional
809             value to indicate the direction of the sort. The value should be either
810             'i' for 'increasing' or 'd' for 'decreasing'. The default order is
811             increasing.
812              
813             =item gsort_data
814              
815             This method will sort the Data table by increasing genomic coordinates. It
816             requires the presence of chromosome and start (or position) columns,
817             identified by their column names. These are automatically identified.
818             Failure to find these columns mean a failure to sort the table. Chromosome
819             names are sorted first by their digits (e.g. chr2 before chr10), and then
820             alphanumerically. Base coordinates are sorted by increasing value.
821             Identical positions are kept in their original order.
822              
823             =item splice_data
824              
825             my $Data = Bio::ToolBox::Data->new(file => $file);
826             my $pm = Parallel::ForkManager->new(4);
827             for my $i (1..4) {
828             $pm->start and next;
829             ### in child ###
830             $Data->splice_data($i, 4);
831             # do something with this portion
832             # then save to a temporary unique file
833             $Data->save("${file}_$i");
834             $pm->finish;
835             }
836             $pm->wait_all_children;
837             # reload children files
838             $Data->reload_children(glob "${file}_*");
839              
840             This method will splice the Data table into C<$total_parts> number of pieces,
841             retaining the C<$current_part> piece. The other parts are discarded. This
842             method is intended to be used when a program is forked into separate
843             processes, allowing each child process to work on a subset of the original
844             Data table.
845              
846             Two values are passed to the method. The first is the current part number,
847             1-based. The second value is the total number of parts that the table
848             should be divided, corresponding to the number of concurrent processes.
849             One easy approach to forking is to use L. The
850             above example shows how to fork into four concurrent processes.
851              
852             Since each forked child process is separate from their parent process,
853             their contents must be reloaded into the current Data object. The
854             L documentation recommends going through a disk
855             file intermediate. Therefore, write each child Data object to file using
856             a unique name. Once all children have been reaped, they can be reloaded
857             into the current Data object using the L method.
858              
859             Remember that if you fork your script into child processes, any database
860             connections must be re-opened; they are typically not clone safe. If you
861             have an existing database connection by using the L method,
862             it should be automatically re-opened for you when you use the L
863             method, but you will need to call L again in the child
864             process to obtain the new database object.
865              
866             =item reload_children
867              
868             Discards the current data table in memory and reloads two or more files
869             written from forked children processes. Provide the name of the child
870             files in the order you want them loaded. The files will be automatically
871             deleted if successfully loaded. Returns the number of lines reloaded on
872             success.
873              
874             =back
875              
876             =head2 File Functions
877              
878             The Data table may be read in from a file or written out as a file. In
879             all cases, it is a tab-delimited text file, whether as an ad hoc table
880             or a specific bioinformatic format, e.g. BED, GFF, etc. Multiple common
881             file formats are supported. Column headers are assumed, except in those
882             cases where it is not, e.g. BED, GFF, etc. Metadata may be included as
883             commented lines at the beginning of the file, prefixed with a C<#> symbol.
884             Reading and writing gzip compressed files is fully supported.
885              
886             =over 4
887              
888             =item load_file
889              
890             $Data->load_file($filename);
891              
892             This will load a file into a new, empty Data table. This function is
893             called automatically when a filename is provided to the L function.
894             The existence of the file is first checked (appending common missing
895             extensions as necessary), metadata and column headers processed and/or
896             generated from default settings, the content loaded into the table, and
897             the structure verified. Error messages may be printed if the structure or
898             format is inconsistent or doesn't match the expected format, e.g a file
899             with a F<.bed> extension doesn't match the UCSC specification.
900             Pass the name of the filename.
901              
902             =item taste_file
903              
904             my ($flavor, $format) = $Data->taste_file($filename);
905             # $flavor is generic term: gff, bed, ucsc, or undef
906             # $format is specific term: gtf, gff3, narrowPeak, genePred, etc.
907              
908             Tastes, or checks, a file for a certain flavor, or known gene file formats.
909             Useful for determining if the file represents a known gene table format
910             that lacks a defined file extension, e.g. UCSC formats. This can be based
911             on the file extension, metadata headers, and/or file contents from the
912             first 10 lines. Returns two strings: the first is a generic flavor, and the
913             second is a more specific format, if applicable. Generic flavor values will
914             be one of `gff`, `bed`, `ucsc`, or `undefined`. These correlate to specific
915             Parser adapters. Specific formats could be any number of possibilities, for
916             example `undefined`, `gtf`, `gff3`, `narrowPeak`, `genePred`, etc.
917              
918              
919             =item save
920              
921             =item write_file
922              
923             my $success = $Data->save;
924             my $success = $Data->save('my_file.txt');
925             my $success = $Data->save(filename => $file, gz => 1);
926             print "file $success was saved!\n";
927              
928             Pass the file name to be written. If no file name is passed, then
929             the filename and path stored in the metadata are used, if present.
930              
931             These methods will write the Data structure out to file. It will
932             be first verified as to proper structure. Opened BED and GFF files
933             are checked to see if their structure is maintained. If so, they
934             are written in the same format; if not, they are written as regular
935             tab-delimited text files.
936              
937             You may pass additional options.
938              
939             =over 4
940              
941             =item filename
942              
943             Optionally pass a new filename. Required for new objects; previous
944             opened files may be overwritten if a new name is not provided. If
945             necessary, the file extension may be changed; for example, BED files
946             that no longer match the defined format lose the .bed and gain a .txt
947             extension. Compression may add or strip .gz as appropriate. If
948             a path is not provided, the current working directory is used.
949              
950             =item gz
951              
952             Boolean value to change the compression status of the output file. If
953             overwriting an input file, the default is to maintain the compression status,
954             otherwise no compression. Pass a 0 for no compression, 1 for standard
955             gzip compression, or 2 for block gzip (bgzip) compression for tabix
956             compatibility.
957              
958             =back
959              
960             If the file save is successful, it will return the full path and
961             name of the saved file, complete with any changes to the file extension.
962              
963             =item summary_file
964              
965             Write a separate file summarizing columns of data (mean values).
966             The mean value of each column becomes a row value, and each column
967             header becomes a row identifier (i.e. the table is transposed). The
968             best use of this is to summarize the mean profile of windowed data
969             collected across a feature. See the Bio::ToolBox scripts
970             L and L as examples.
971             For data from L where the columns are expressed
972             as percentile bins, the reported midpoint column is automatically
973             converted based on a length of 1000 bp.
974              
975             You may pass these options. They are optional.
976              
977             =over 4
978              
979             =item filename
980              
981             Pass an optional new filename. The default is to take the basename
982             and append "_summed" to it.
983              
984             =item startcolumn
985              
986             =item stopcolumn
987              
988             Provide the starting and ending columns to summarize. The default
989             start is the leftmost column without a recognized standard name.
990             The default ending column is the last rightmost column. Indexes are
991             0-based.
992              
993             =item dataset
994              
995             Pass a string that is the name of the dataset. This could be collected
996             from the metadata, if present. This will become the name of the score
997             column if defined.
998              
999             =back
1000              
1001             The name of the summarized column is either the provided dataset name,
1002             the defined basename in the metadata of the Data structure, or a generic
1003             name. If successful, it will return the name of the file saved.
1004              
1005             =back
1006              
1007             =head2 Verifying Datasets
1008              
1009             When working with row Features and collecting scores, the dataset
1010             from which you are collecting must be verified prior to collection.
1011             This ensures that the proper database adaptor is available and loaded,
1012             and that the dataset is correctly specified (otherwise nothing would be
1013             collected). This verification is normally performed transparently when
1014             you call L or
1015             L.
1016             However, datasets may be explicitly verified prior to calling the score
1017             methods.
1018              
1019             =over 4
1020              
1021             =item verify_dataset
1022              
1023             my $dataset = $Data->verify_dataset($dataset, $database);
1024              
1025             Pass the name of the dataset (GFF type or type:source) for a GFF3-based
1026             database, e.g. , or path and file name for a
1027             data file, e.g. Bam, BigWig, BigBed, or USeq file. If a separate database
1028             is being used, pass the name or opened database object as a second
1029             parameter. For more advance options, see
1030             L.
1031              
1032             The name of the verified dataset, with a prefix if necessary, is returned.
1033              
1034             =back
1035              
1036             =head2 Efficient Data Access
1037              
1038             Most of the time we need to iterate over the Data table, one row
1039             at a time, collecting data or processing information. These methods
1040             simplify the process.
1041              
1042             =over 4
1043              
1044             =item iterate
1045              
1046             $Data->iterate( sub {
1047             my $row = shift;
1048             my $number = $row->value($index);
1049             my $log_number = log($number);
1050             $row->value($index, $log_number);
1051             } );
1052              
1053             This method will process a code reference on every row in the data
1054             table. Pass a subroutine or code reference. The subroutine will
1055             receive the row as a L object. With this
1056             object, you can retrieve values, set values, and add new values.
1057              
1058             =item row_stream
1059              
1060             This returns an C object, which has one
1061             method, C. Call this method repeatedly until it returns
1062             C to work through each row of data.
1063              
1064             Users of the C family of database adaptors may recognize the
1065             analogy to the C method.
1066              
1067             =item next_row
1068              
1069             my $stream = $Data->row_stream;
1070             while (my $row = $stream->next_row) {
1071             # each $row is a Bio::ToolBox::Data::Feature object
1072             # representing the row in the data table
1073             my $value = $row->value($index);
1074             # do something with $value
1075             }
1076              
1077             Called from a C object, it returns a
1078             L row object. If SeqFeature objects are
1079             associated with the row, perhaps from a parsed input annotation file,
1080             then they are automatically associated with the row object. (Previous
1081             versions required separately calling the seqfeature() row method to
1082             perform this.)
1083              
1084             =back
1085              
1086             =head1 SEE ALSO
1087              
1088             L, L, L,
1089             L, L, L, L,
1090             L, L
1091              
1092             =cut
1093              
1094 3     3   2718 use strict;
  3         4  
  3         79  
1095 3     3   10 use Carp qw(carp cluck croak confess);
  3         6  
  3         143  
1096 3     3   14 use List::Util qw(sum0);
  3         4  
  3         116  
1097 3     3   12 use base 'Bio::ToolBox::Data::core';
  3         4  
  3         1178  
1098 3         143 use Bio::ToolBox::db_helper qw(
1099             get_new_feature_list
1100             get_new_genome_list
1101             get_db_feature
1102 3     3   15 );
  3         5  
1103 3     3   16 use Bio::ToolBox::utility qw(simplify_dataset_name sane_chromo_sort);
  3         4  
  3         125  
1104 3     3   13 use Module::Load;
  3         3  
  3         10  
1105              
1106             1;
1107              
1108              
1109             ### Initialize
1110              
sub new {
	# Constructor. Builds a Data object from one of four sources, in priority
	# order: a parsed annotation file, a loaded text file, a database feature
	# or genome list, or an empty table (optionally pre-formatted as GFF, BED,
	# or UCSC). Returns the new object, or undef on failure.
	# May be called as a class or instance method.
	my $class = shift;
	my %args  = @_;
	if ( ref($class) ) {
		$class = ref($class);
	}

	# normalize argument aliases and defaults
	$args{features} ||= $args{feature} || 'gene';
	$args{stream}   ||= $args{Stream}  || 0;
	$args{file}     ||= $args{in}      || undef;
	$args{parse}    ||= 0;
	$args{noheader} ||= 0;

	# a stream request is delegated entirely to the Stream subclass
	if ( $args{stream} ) {
		$class = "Bio::ToolBox::Data::Stream";
		load($class);
		return $class->new(@_);
	}

	# initialize the empty core structure
	my $self = $class->SUPER::new();

	# prepare the table based on the provided arguments
	if ( $args{file} and $args{parse} ) {
		# parse an annotation file into SeqFeature objects
		$args{subfeature} ||= '';
		unless ( $self->parse_table( \%args ) ) {
			# parsing failed; fall back to loading as a plain table
			my $l = $self->load_file( $args{file} );
			return unless $l;
			if (    $self->database =~ /^Parsed:(.+)$/
				and $self->feature_type eq 'named' )
			{
				# the loaded file was itself written from a previously parsed
				# table, so retry parsing with the original annotation source
				# this may die if it doesn't work
				$args{file}    = $1;
				$args{feature} = $self->feature;
				$self->parse_table( \%args );
			}
		}
	}
	elsif ( $args{file} ) {
		# load an existing tab-delimited file
		unless ( $self->load_file( $args{file}, $args{noheader} ) ) {
			return;
		}
	}
	elsif ( exists $args{db} and $args{features} ) {
		# generate a new list from a database
		$self->feature( $args{features} );
		$self->database( $args{db} );
		$args{data} = $self;
		my $result;
		if ( $args{features} eq 'genome' ) {
			# create a genomic interval list; an exclusion (blacklist) file,
			# if given, is first loaded as its own Data object
			$args{blacklist} ||= $args{exclude} || undef;
			if ( defined $args{blacklist} ) {
				my $exclusion_Data = $self->new(
					file  => $args{blacklist},
					parse => 0    # we don't need to parse
				);
				if ( $exclusion_Data
					and $exclusion_Data->feature_type eq 'coordinate' )
				{
					printf " Loaded %d exclusion list items\n",
						$exclusion_Data->number_rows;
					delete $args{blacklist};
					delete $args{exclude};
					$args{exclude} = $exclusion_Data;
				}
				else {
					carp " Cannot load exclusion coordinate list file $args{blacklist}!";
					return;
				}
			}
			$result = get_new_genome_list(%args);
		}
		else {
			$result = get_new_feature_list(%args);
		}
		unless ($result) {
			carp " Cannot generate new $args{features} list!";
			return;
		}
	}
	else {
		# a new empty structure

		# check to see if user provided column names
		$args{columns} ||= $args{datasets} || undef;
		if ( defined $args{columns} ) {
			foreach my $c ( @{ $args{columns} } ) {
				$self->add_column($c);
			}
			$self->{feature} = $args{feature} if exists $args{feature};
		}

		# or possibly a specified format structure
		elsif ( exists $args{gff} and $args{gff} ) {
			# use standard names for the number of columns indicated
			# we trust that the user knows the subtle difference between gff versions
			$self->add_gff_metadata( $args{gff} );
			unless ( $self->extension =~ /g[tf]f/ ) {
				$self->{extension} =
					  $args{gff} == 2.5 ? '.gtf'
					: $args{gff} == 3   ? '.gff3'
					:                     '.gff';
			}
		}
		elsif ( exists $args{bed} and $args{bed} ) {
			# use standard names for the number of columns indicated
			unless ( $args{bed} =~ /^\d{1,2}$/ and $args{bed} >= 3 ) {
				carp "bed parameter must be an integer 3-12!";
				return;
			}
			$self->add_bed_metadata( $args{bed} );
			unless ( $self->extension =~ /bed|peak/ ) {
				$self->{extension} = '.bed';
			}
		}
		elsif ( exists $args{ucsc} and $args{ucsc} ) {
			# a ucsc format such as refFlat, genePred, or genePredExt
			my $u = $self->add_ucsc_metadata( $args{ucsc} );
			unless ($u) {
				carp "unrecognized number of columns for ucsc format!";
				return;
			}
			unless ( $self->extension =~ /ucsc|ref+lat|genepred/ ) {
				$self->{extension} = '.ucsc';
			}
		}
	}

	return $self;
}
1243              
sub duplicate {
	# Create a new Data object with the same columns, metadata, and comments
	# as this one, but with an empty data table (rows are not copied).
	my $self = shift;

	# build an empty structure with identical column names
	my $clone = $self->new( 'columns' => $self->list_columns )
		or return;

	# replicate the per-column metadata hashes
	foreach my $idx ( 0 .. $self->number_columns - 1 ) {
		my %meta = $self->metadata($idx);
		$clone->{$idx} = \%meta;
	}

	# replicate the general structure keys
	foreach my $key (qw(feature program db bed gff ucsc vcf headers)) {
		$clone->{$key} = $self->{$key};
	}

	# replicate any comment lines
	push @{ $clone->{comments} }, $self->comments;

	return $clone;
}
1268              
sub parse_table {
	# Parse an annotation file (GFF, BED, UCSC, etc.) into SeqFeature objects.
	# Accepts either a hash reference of options (file, feature, subfeature,
	# simplify, chrskip) or a bare file name. If the table already has rows,
	# parsed features are matched to existing rows by name/ID; otherwise a
	# new two-column (Primary_ID, Name) table is built.
	# Returns 1 on success, undef on failure; dies if existing rows cannot
	# all be matched to parsed features.
	my $self = shift;
	my $args = shift;
	my ( $file, $feature, $subfeature, $simplify );
	if ( ref $args ) {
		$file       = $args->{file}       || '';
		$feature    = $args->{feature}    || '';
		$subfeature = $args->{subfeature} || '';
		$simplify =
			( exists $args->{simplify} and defined $args->{simplify} )
			? $args->{simplify}
			: 1;    # default is to simplify
	}
	else {
		# no hash reference, assume just a file name
		$file = $args;
		undef $args;
		$feature    = undef;
		$subfeature = '';
		$simplify   = 1;
	}
	unless ($file) {
		carp "no annotation file provided to parse!";
		return;
	}

	# the file format determines the parser class
	my ( $flavor, $format ) = $self->taste_file($file) or return;
	my $class = 'Bio::ToolBox::parser::' . $flavor;
	eval { load $class };
	if ($@) {
		carp "unable to load $class! cannot parse $flavor!";
		return;
	}
	$self->format($format);    # specify the specific format

	# open parser
	my $parser = $class->new() or return;
	$parser->open_file($file) or return;
	my $typelist = $parser->typelist;

	# set feature based on the type list from the parser
	unless ($feature) {
		if ( $typelist =~ /gene/i ) {
			$feature = 'gene';
		}
		elsif ( $typelist eq 'region' ) {
			$feature = 'region';
		}
		else {
			$feature = 'rna';    # generic RNA
		}
	}

	# set parser parameters for which subfeatures to retain
	$parser->simplify($simplify);
	if ($subfeature) {
		$parser->do_exon(1)  if $subfeature =~ /exon/i;
		$parser->do_cds(1)   if $subfeature =~ /cds/i;
		$parser->do_utr(1)   if $subfeature =~ /utr|untranslated/i;
		$parser->do_codon(1) if $subfeature =~ /codon/i;
	}
	if ( $feature =~ /gene$/i ) {
		$parser->do_gene(1);
	}
	else {
		$parser->do_gene(0);
	}
	my $mrna_check = 0;
	if (    lc($feature) eq 'mrna'
		and $parser->typelist !~ /mrna/i
		and not $self->last_row )
	{
		# user requested mRNA for a new data file but it's not present in the
		# type list; look for it the hard way by parsing CDS too - sigh
		load( 'Bio::ToolBox::GeneTools', 'is_coding' );
		$parser->do_cds(1);
		$mrna_check = 1;
	}

	# parse the table
	$parser->parse_file or return;

	# store the SeqFeature objects
	if ( $self->last_row > 0 ) {
		# we already have a table, presumably representing the features;
		# associate each row with its parsed SeqFeature by name and ID
		my $count = 0;
		$self->iterate(
			sub {
				my $row = shift;
				my $f   = $parser->find_gene(
					name => $row->name,
					id   => $row->id,
				);
				if ($f) {
					$self->store_seqfeature( $row->row_index, $f );
					$count++;
				}
			}
		);
		unless ( $count == $self->last_row ) {
			die <<PARSEFAIL;
Not all features in the input file could be matched to a corresponding SeqFeature
object in the annotation file $file.
Double check your input and annotation files. You can create a new table by just
providing your annotation file.
PARSEFAIL
		}
	}
	else {
		# create a new table
		$self->add_column('Primary_ID');
		$self->add_column('Name');

		# check for chromosome exclude
		my $chr_exclude;
		if ($args) {
			$chr_exclude = $args->{chrskip} || undef;
		}

		# fill table with features matching the requested feature type
		while ( my $f = $parser->next_top_feature ) {
			if ($chr_exclude) {
				next if $f->seq_id =~ /$chr_exclude/i;
			}
			my $type = $f->type;
			if ( $f->type =~ /$feature/i or ( $mrna_check and is_coding($f) ) ) {
				my $index = $self->add_row( [ $f->primary_id, $f->display_name ] );
				$self->store_seqfeature( $index, $f );
			}
		}
		unless ( $self->last_row ) {
			printf " Zero '%s' features found!\n Check your feature or try generic features like gene, mRNA, or transcript\n",
				$feature;
		}
		$self->database("Parsed:$file");
		$self->feature($feature);
		$self->add_comment("Chromosomes excluded: $chr_exclude") if $chr_exclude;

		# add input parsed file metadata
		$self->add_file_metadata($file);
		# but delete some stuff, just want basename
		# NOTE(review): reconstructed from report residue "undef $self->(unknown);"
		# — the {filename} key matches the sibling deletions; confirm against
		# the original source.
		undef $self->{extension};
		undef $self->{filename};
		undef $self->{path};
	}
	return 1;
}
1410              
1411              
1412              
1413              
1414             ### Column manipulation
1415              
sub column_values {
	# Return the contents of one column, including the header value at row 0.
	# Returns a list in list context or an array reference in scalar context;
	# returns nothing if the index is undefined or has no column metadata.
	my ($self, $col_index) = @_;
	return unless defined $col_index;
	return unless exists $self->{$col_index}{name};
	my @collected;
	for my $row (0 .. $self->last_row) {
		push @collected, $self->value($row, $col_index);
	}
	return wantarray ? @collected : \@collected;
}
1423              
sub add_column {
	# Append a new column to the table and return its index.
	# Accepts a scalar column name, an array reference of values (the first
	# element is treated as the column header), or a Bio::DB::GFF::Typename
	# object. Returns nothing on failure.
	my ($self, $name) = @_;
	return unless $name;
	my $column = $self->{number_columns};
	
	# check for array of column data
	my $name_ref = ref $name;
	if ($name_ref eq 'ARRAY') {
		if ($self->last_row > 1) {
			# table has existing data beyond column headers
			# NOTE(review): a table with exactly one data row (last_row == 1)
			# falls through to the "no existing data" branch below and has its
			# last_row overwritten — confirm this is intended
			if ($self->last_row == (scalar @$name - 1)) {
				# same number of elements, add it the table
				$self->{$column} = {
					'name'  => $name->[0],
					'index' => $column,
				};
				for (my $r = 0; $r <= $self->last_row; $r++) {
					$self->{data_table}->[$r][$column] = $name->[$r];
				}
			}
			else {
				# different number of elements
				cluck "array has different number of elements than Data table!\n";
				return;
			}
		}
		else {
			# table has no existing data; the array becomes the entire column
			$self->{$column} = {
				'name'  => $name->[0],
				'index' => $column,
			};
			for (my $i = 0; $i < scalar @$name; $i++) {
				$self->value($i, $column, $name->[$i]);
			}
			$self->{last_row} = scalar @$name - 1;
			$self->{headers} = 1; # boolean to indicate the table now has headers
		}
	}
	elsif ($name_ref eq 'Bio::DB::GFF::Typename') {
		# a Typename object that was selected from a SeqFeature::Store database
		$self->{$column} = {
			'name'  => $name->asString,
			'index' => $column,
		};
		$self->{data_table}->[0][$column] = $name->asString;
	}
	elsif ($name_ref eq '') {
		# just a name
		$self->{$column} = {
			'name'  => $name,
			'index' => $column,
		};
		$self->{data_table}->[0][$column] = $name;
	}
	else {
		cluck "unrecognized reference '$name_ref'! pass a scalar value or array reference";
		return;
	}
	
	$self->{number_columns}++;
	# any cached column-index lookups are now stale
	delete $self->{column_indices} if exists $self->{column_indices};
	if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
		# check if we maintain integrity, at least insofar what we test
		$self->verify(1); # silence so user doesn't get these messages
	}
	return $column;
}
1492              
sub copy_column {
	# Duplicate an existing column, values and metadata alike, and return
	# the index of the new copy. Returns nothing if no index was provided.
	my ($self, $source_index) = @_;
	return unless defined $source_index;
	my $values     = $self->column_values($source_index);
	my $copy_index = $self->add_column($values);
	$self->copy_metadata($source_index, $copy_index);
	return $copy_index;
}
1502              
1503              
1504              
1505             ### Rows and Data access
1506              
sub add_row {
	# Append one row to the end of the data table and return its row index.
	# Accepts an array reference of values, a Bio::ToolBox::Data::Feature
	# object, or a tab-delimited string; with no argument a row of null
	# ('.') placeholders is added. Rows are truncated or padded with '.'
	# to match the table's column count.
	my $self = shift;
	my @row_data;
	if ($_[0] and ref $_[0] eq 'ARRAY') {
		@row_data = @{ $_[0] };
	}
	elsif ($_[0] and ref $_[0] eq 'Bio::ToolBox::Data::Feature') {
		@row_data = $_[0]->row_values;
	}
	elsif ($_[0] and $_[0] =~ /\t/) {
		@row_data = split /\t/, $_[0];
	}
	else {
		@row_data = map {'.'} (1 .. $self->{number_columns});
	}
	if (scalar @row_data > $self->{number_columns}) {
		cluck "row added has more elements than table columns! truncating row elements\n";
		# BUG FIX: the original spliced off the FIRST number_columns elements,
		# keeping the excess; truncate the tail instead so the leading values
		# are retained
		splice(@row_data, $self->{number_columns});
	}
	until (scalar @row_data == $self->{number_columns}) {
		push @row_data, '.'; # pad short rows with the null placeholder
	}
	my $row_index = $self->{last_row} + 1;
	$self->{data_table}->[$row_index] = \@row_data;
	$self->{last_row}++;
	return $row_index;
}
1534              
sub get_row {
	# Build and return a Bio::ToolBox::Data::Feature object for the given
	# table row, attaching any SeqFeature object previously stored for that
	# row. Returns nothing if the row does not exist in the table.
	my ($self, $row_number) = @_;
	return unless defined $self->{data_table}->[$row_number];
	my @options = (
		'data'  => $self,
		'index' => $row_number,
	);
	my $stored = exists $self->{SeqFeatureObjects}
		? $self->{SeqFeatureObjects}->[$row_number]
		: undef;
	push @options, 'feature', $stored if defined $stored;
	return Bio::ToolBox::Data::Feature->new(@options);
}
1550              
sub delete_row {
	# Remove one or more rows given their indices. Rows are deleted from the
	# highest index down so earlier deletions never shift later targets. Any
	# stored SeqFeature objects are spliced out in parallel. Returns true.
	my $self = shift;
	foreach my $idx (sort {$b <=> $a} @_) {
		splice( @{ $self->{data_table} }, $idx, 1);
		$self->{last_row}--;
		if (exists $self->{SeqFeatureObjects}) {
			splice( @{ $self->{SeqFeatureObjects} }, $idx, 1);
		}
	}
	return 1;
}
1564              
sub row_values {
	# Return a copy of an entire row's values: a list in list context, or a
	# reference to a new array in scalar context. The copy is independent of
	# the underlying table row.
	my ($self, $row) = @_;
	my @copy = @{ $self->{data_table}->[$row] };
	return wantarray ? @copy : \@copy;
}
1570              
sub value {
	# Get or set the table cell at the given row and column.
	# When a fourth argument is supplied the cell is updated first; the
	# current (possibly new) value is then returned. Returns nothing unless
	# both row and column are defined. Note that an undef value cannot be
	# assigned through this setter.
	my ($self, $row, $column, $new_value) = @_;
	return unless (defined $row and defined $column);
	$self->{data_table}->[$row][$column] = $new_value if defined $new_value;
	return $self->{data_table}->[$row][$column];
}
1579              
sub row_stream {
	# Return a Bio::ToolBox::Data::Iterator object for walking the rows of
	# this table one at a time.
	my ($self) = @_;
	return Bio::ToolBox::Data::Iterator->new($self);
}
1584              
sub iterate {
	# Execute a code reference once for every row of the table, passing each
	# row as a Bio::ToolBox::Data::Feature object from a row stream.
	# Confesses if anything other than a code reference is supplied.
	# Returns true on completion.
	my ($self, $callback) = @_;
	confess "iterate_function() method requires a code reference!"
		unless ref $callback eq 'CODE';
	my $iterator = $self->row_stream;
	while (my $row = $iterator->next_row) {
		$callback->($row);
	}
	return 1;
}
1597              
1598              
1599             #### Stored SeqFeature manipulation
1600              
sub store_seqfeature {
	# Associate a SeqFeature object with a table row for later retrieval.
	# Confesses when the row index is missing, the feature is not a
	# reference, or the index exceeds the table's last row. Returns true.
	my ($self, $row_i, $seqfeature) = @_;
	confess "must provide a row index and SeqFeature object!"
		unless (defined $row_i and ref($seqfeature));
	confess "invalid row index" unless ($row_i <= $self->last_row);
	$self->{SeqFeatureObjects} ||= []; # lazily create the storage array
	$self->{SeqFeatureObjects}->[$row_i] = $seqfeature;
	return 1;
}
1611              
sub delete_seqfeature {
	# Remove (undefine) the stored SeqFeature object, if any, for the given
	# row index. Confesses when the index exceeds the table's last row;
	# returns nothing when no SeqFeature storage exists at all.
	my ($self, $row_i) = @_;
	confess "invalid row index" unless ($row_i <= $self->last_row);
	return unless $self->{SeqFeatureObjects};
	undef $self->{SeqFeatureObjects}->[$row_i];
}
1618              
sub collapse_gene_transcripts {
	# Replace each row's SeqFeature with the collapsed-transcript feature
	# returned by Bio::ToolBox::GeneTools::collapse_transcripts(). Works on
	# tables of named features only. Features come either from the stored
	# SeqFeatureObjects array (parsed tables) or are fetched from the
	# metadata database. Returns the number of features collapsed.
	my $self = shift;
	unless ($self->feature_type eq 'named') {
		carp "Table does not contain named features!";
		return;
	}
	
	# load module at run time so GeneTools is only a soft dependency
	my $class = "Bio::ToolBox::GeneTools";
	eval {load $class, qw(collapse_transcripts)};
	if ($@) {
		carp "unable to load $class! cannot collapse transcripts!";
		return;
	}
	
	# collapse the transcripts
	my $success = 0;
	if (exists $self->{SeqFeatureObjects}) {
		# we should have stored SeqFeature objects, probably from parsed table
		for (my $i = 1; $i <= $self->last_row; $i++) {
			my $feature = $self->{SeqFeatureObjects}->[$i] or next;
			my $collSeqFeat = collapse_transcripts($feature) or next;
			$success += $self->store_seqfeature($i, $collSeqFeat);
		}
	}
	else {
		# no stored SeqFeature objects, probably names pointing to a database
		# we will have to fetch the feature from a database
		my $db = $self->open_meta_database(1) or # force open a new db connection
			confess "No SeqFeature objects stored and no database connection!";
		my $name_i = $self->name_column;
		my $id_i = $self->id_column;
		my $type_i = $self->type_column;
		for (my $i = 1; $i <= $self->last_row; $i++) {
			# rows that cannot be resolved in the database are silently skipped
			my $feature = get_db_feature(
				db => $db,
				name => $self->value($i, $name_i) || undef,
				type => $self->value($i, $type_i) || undef,
				id => $self->value($i, $id_i) || undef,
			) or next;
			my $collSeqFeat = collapse_transcripts($feature) or next;
			$success += $self->store_seqfeature($i, $collSeqFeat);
		}
	}
	
	return $success;
}
1666              
sub add_transcript_length {
	# Add a new column holding the length of each row's transcript,
	# calculated over a chosen subfeature type: 'exon' (default), 'cds',
	# '5p_utr', or '3p_utr'. Features come from the stored SeqFeatureObjects
	# array or, failing that, are fetched from the metadata database.
	# Returns the index of the new length column, or nothing on failure.
	my $self = shift;
	my $subfeature = shift || 'exon';
	# NOTE(review): the original returned early whenever no SeqFeature
	# objects were stored, which made the database-fetch branch below
	# unreachable dead code; the guard is removed so this method mirrors
	# collapse_gene_transcripts() and can resolve rows from a database.
	
	# load module at run time so GeneTools is only a soft dependency
	eval {load "Bio::ToolBox::GeneTools", qw(get_transcript_length get_transcript_cds_length
		get_transcript_5p_utr_length get_transcript_3p_utr_length)};
	if ($@) {
		carp "unable to load Bio::ToolBox::GeneTools! cannot collapse transcripts!";
		return;
	}
	
	# determine name and calculation method
	my $length_calculator;
	my $length_name;
	if ($subfeature eq 'exon') {
		$length_calculator = \&get_transcript_length;
		$length_name = 'Merged_Transcript_Length';
	}
	elsif ($subfeature eq 'cds') {
		$length_calculator = \&get_transcript_cds_length;
		$length_name = 'Transcript_CDS_Length';
	}
	elsif ($subfeature eq '5p_utr') {
		$length_calculator = \&get_transcript_5p_utr_length;
		$length_name = 'Transcript_5p_UTR_Length';
	}
	elsif ($subfeature eq '3p_utr') {
		$length_calculator = \&get_transcript_3p_utr_length;
		$length_name = 'Transcript_3p_UTR_Length';
	}
	else {
		carp "unrecognized subfeature type '$subfeature'! No length calculated";
		return;
	}
	
	# add new column and calculate
	my $length_i = $self->add_column($length_name);
	if (exists $self->{SeqFeatureObjects}) {
		# we should have stored SeqFeature objects, probably from parsed table
		for (my $i = 1; $i <= $self->last_row; $i++) {
			my $feature = $self->{SeqFeatureObjects}->[$i] or next;
			my $length = &$length_calculator($feature);
			# fall back to the feature's full length when the calculator
			# returns nothing (e.g. no subfeatures of the requested type)
			$self->{data_table}->[$i][$length_i] = $length || $feature->length;
		}
	}
	else {
		# no stored SeqFeature objects, probably names pointing to a database
		# we will have to fetch the feature from a database
		my $db = $self->open_meta_database(1) or # force open a new db connection
			confess "No SeqFeature objects stored and no database connection!";
		my $name_i = $self->name_column;
		my $id_i = $self->id_column;
		my $type_i = $self->type_column;
		for (my $i = 1; $i <= $self->last_row; $i++) {
			my $feature = get_db_feature(
				db => $db,
				name => $self->value($i, $name_i) || undef,
				type => $self->value($i, $type_i) || undef,
				id => $self->value($i, $id_i) || undef,
			) or next;
			my $length = &$length_calculator($feature);
			$self->{data_table}->[$i][$length_i] = $length || $feature->length;
			$self->store_seqfeature($i, $feature);
			# to store or not to store? May explode memory, but could save from
			# expensive db calls later
		}
	}
	
	return $length_i;
}
1742              
1743              
1744              
1745             ### Data structure manipulation
1746              
sub sort_data {
	# Sort the data table rows by the contents of one column.
	# $direction begins with 'i' for increasing or 'd' for decreasing
	# (case-insensitive). The sort method (numeric vs asciibetical) is
	# guessed from the first non-null value in the column. Returns true on
	# success, nothing on bad arguments.
	my $self = shift;
	my $index = shift;
	my $direction = shift || 'i';
	
	# confirm options
	return unless exists $self->{$index}{name};
	unless ($direction =~ /^[id]/i) {
		carp "unrecognized sort order '$direction'! Must be i or d";
		return;
	}
	
	# Sample the dataset values
	# this will be used to guess the sort method, below
	my $example; # an example of the dataset
	foreach (my $i = 1; $i <= $self->last_row; $i++) {
		# we want to avoid null values, so keep trying
		# null being . or any variation of N/A, NaN, inf
		my $v = $self->value($i, $index);
		if (defined $v and $v !~ /^(?:\.|n\/?a|nan|\-?inf)$/i) {
			# a non-null value, take it
			$example = $v;
			last;
		}
	}
	
	# Determine sort method, either numerical or alphabetical
	my $sortmethod;
	if ($example =~ /[a-z]/i) {
		# there are detectable letters
		$sortmethod = 'ascii';
	}
	elsif ($example =~ /^\-?\d+\.?\d*$/) {
		# there are only digits, allowing for minus sign and a decimal point
		# note: exponent notation is not recognized here
		$sortmethod = 'numeric';
	}
	else {
		# unable to determine (probably alphanumeric), sort asciibetical
		$sortmethod = 'ascii';
	}
	
	# Re-order the rows through a temporary hash keyed on the (unique-ified)
	# sort value; the hash value is a reference to the original row array.
	my %datahash;
	
	# reorder numerically
	if ($sortmethod eq 'numeric') {
		for my $row (1..$self->last_row) {
			my $value = $self->value($row, $index);
			
			# duplicate sort values are made unique by nudging the key with a
			# tiny increment; only the sort key is modified, not the stored data
			while (exists $datahash{$value}) {
				$value += 0.00000001;
			}
			# store the row data reference
			$datahash{$value} = $self->{data_table}->[$row];
		}
		
		# re-fill the array based on the sort direction
		if ($direction =~ /^i/i) {
			# increasing sort
			my $i = 1; # keep track of the row, skip the header
			foreach (sort {$a <=> $b} keys %datahash) {
				# put back the reference to the anonymous array of row data
				$self->{data_table}->[$i] = $datahash{$_};
				$i++; # increment for next row
			}
		}
		else {
			# decreasing sort
			my $i = 1; # keep track of the row, skip the header
			foreach (sort {$b <=> $a} keys %datahash) {
				# put back the reference to the anonymous array of row data
				$self->{data_table}->[$i] = $datahash{$_};
				$i++; # increment for next row
			}
		}
		
		# summary prompt
		printf " Data table sorted numerically by the contents of %s\n",
			$self->name($index);
	}
	
	# reorder asciibetically
	elsif ($sortmethod eq 'ascii') {
		for my $row (1..$self->last_row) {
			# get the value to sort by
			my $value = $self->value($row, $index);
			if (exists $datahash{$value}) {
				# not unique: append a zero-padded counter to build a unique
				# lookup key
				# BUG FIX: the format string was "03%d" (a literal "03"
				# followed by the number) instead of the intended
				# zero-padded "%03d"
				my $n = 1;
				my $lookup = $value . sprintf("%03d", $n);
				while (exists $datahash{$lookup}) {
					# keep bumping up the number till it's unique
					$n++;
					$lookup = $value . sprintf("%03d", $n);
				}
				$datahash{$lookup} = $self->{data_table}->[$row];
			}
			else {
				# unique
				$datahash{$value} = $self->{data_table}->[$row];
			}
		}
		
		# re-fill the array based on the sort direction
		# CONSISTENCY FIX: match the validated /^[id]/i pattern (and the
		# numeric branch) instead of exact 'i'/'I' equality, which silently
		# skipped the re-fill for inputs like 'increase'
		if ($direction =~ /^i/i) {
			# increasing
			my $i = 1; # keep track of the row
			foreach (sort {$a cmp $b} keys %datahash) {
				# put back the reference to the anonymous array of row data
				$self->{data_table}->[$i] = $datahash{$_};
				$i++; # increment for next row
			}
		}
		else {
			# decreasing
			my $i = 1; # keep track of the row
			foreach (sort {$b cmp $a} keys %datahash) {
				# put back the reference to the anonymous array of row data
				$self->{data_table}->[$i] = $datahash{$_};
				$i++; # increment for next row
			}
		}
		
		# summary prompt
		printf " Data table sorted asciibetically by the contents of '%s'\n",
			$self->name($index);
	}
	
	return 1;
}
1894              
sub gsort_data {
	# Sort the table rows genomically: by chromosome (in the "sane" order
	# produced by sane_chromo_sort) and then by coordinate. Requires a table
	# with coordinate columns. Returns true on success.
	my $self = shift;
	
	# identify indices
	unless ($self->feature_type eq 'coordinate') {
		carp "no chromosome and start/position columns to sort!\n";
		return;
	}
	my $chromo_i = $self->chromo_column;
	my $start_i = $self->start_column;
	my $stop_i = $self->stop_column || $start_i; # single-position data has no stop
	
	# collect the data by chromosome: start and end
	# each entry is [start, stop, row_arrayref] for sorting below
	my %chrom2row;
	for my $row (1 .. $self->last_row) {
		my $c = $self->{data_table}->[$row]->[$chromo_i];
		$chrom2row{$c} ||= [];
		push @{ $chrom2row{$c} }, [
			$self->{data_table}->[$row]->[$start_i],
			$self->{data_table}->[$row]->[$stop_i],
			$self->{data_table}->[$row]
		];
	}
	
	# get sane chromosome order
	my @chroms = sane_chromo_sort(keys %chrom2row);
	
	# sort the table, keeping the header row first
	my @sorted_data;
	push @sorted_data, $self->{data_table}->[0];
	if ($self->gff) {
		# sort by increasing start and decreasing end, i.e. decreasing length
		# this should mostly handle putting genes/transcripts before exons
		foreach my $chr (@chroms) {
			push @sorted_data,
				map { $_->[2] }
				sort { $a->[0] <=> $b->[0] or $b->[1] <=> $a->[1] }
				@{ $chrom2row{$chr} };
		}
	}
	else {
		# sort by increasing start followed by increasing end, i.e. increasing length
		foreach my $chr (@chroms) {
			push @sorted_data,
				map { $_->[2] }
				sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] }
				@{ $chrom2row{$chr} };
		}
	}
	$self->{data_table} = \@sorted_data;
	return 1;
}
1947              
sub splice_data {
	# Reduce the table to the $part-th of $total_parts roughly equal
	# portions, discarding all other rows. Intended for dividing work among
	# child processes. Any stored SeqFeature objects are spliced in
	# parallel, and a cached database connection is dropped so each child
	# re-opens its own. Returns true.
	# NOTE(review): part sizes use int(last_row / total_parts), so remainder
	# rows accumulate in the last part.
	my ($self, $part, $total_parts) = @_;
	
	unless ($part and $total_parts) {
		confess "ordinal part and total number of parts not passed\n";
	}
	my $part_length = int($self->last_row / $total_parts);
	
	# check for SeqFeatureObjects array
	if (exists $self->{SeqFeatureObjects}) {
		# it needs to be the same length as the data table, it should be
		while (scalar @{$self->{SeqFeatureObjects}} < scalar @{$self->{data_table}}) {
			push @{$self->{SeqFeatureObjects}}, undef;
		}
	}
	
	# splicing based on which part we do
	if ($part == 1) {
		# remove all but the first part (keep header row 0 plus part_length rows)
		splice(
			@{$self->{'data_table'}},
			$part_length + 1
		);
		if (exists $self->{SeqFeatureObjects}) {
			splice(
				@{$self->{SeqFeatureObjects}},
				$part_length + 1
			);
		}
	}
	elsif ($part == $total_parts) {
		# remove all but the last part (drop rows belonging to earlier parts)
		splice(
			@{$self->{'data_table'}},
			1,
			$part_length * ($total_parts - 1)
		);
		if (exists $self->{SeqFeatureObjects}) {
			splice(
				@{$self->{SeqFeatureObjects}},
				1,
				$part_length * ($total_parts - 1)
			);
		}
	}
	else {
		# splicing in the middle requires two rounds
		
		# remove the last parts
		splice(
			@{$self->{'data_table'}},
			($part * $part_length) + 1
		);
		
		# remove the first parts
		splice(
			@{$self->{'data_table'}},
			1,
			$part_length * ($part - 1)
		);
		
		if (exists $self->{SeqFeatureObjects}) {
			splice(
				@{$self->{SeqFeatureObjects}},
				($part * $part_length) + 1
			);
			splice(
				@{$self->{SeqFeatureObjects}},
				1,
				$part_length * ($part - 1)
			);
		}
	}
	
	# update last row metadata
	$self->{'last_row'} = scalar(@{$self->{'data_table'}}) - 1;
	
	# re-open a new un-cached database connection
	if (exists $self->{db_connection}) {
		delete $self->{db_connection};
	}
	return 1;
}
2031              
2032              
2033              
2034             ### File functions
2035              
sub reload_children {
	# Rebuild this Data object's table and metadata from a list of child
	# files (typically written by forked workers after splice_data). The
	# current table is destroyed, metadata is copied from the first child's
	# Stream, all child rows are appended in order, and the child files are
	# deleted afterwards. Returns the new last row index.
	my $self = shift;
	my @files = @_;
	return unless @files;
	
	# prepare Stream class; loaded at run time as a soft dependency
	my $class = "Bio::ToolBox::Data::Stream";
	eval {load $class};
	if ($@) {
		carp "unable to load $class! can't reload children!";
		return;
	}
	
	# open first stream
	my $first = shift @files;
	my $Stream = $class->new(in => $first);
	unless ($Stream) {
		carp "unable to load first child file $first";
		return;
	}
	
	# destroy current data table and metadata
	$self->{data_table} = [];
	for (my $i = 0; $i < $self->number_columns; $i++) {
		delete $self->{$i};
	}
	
	# copy the metadata from the first child
	foreach (qw(program feature db bed gff ucsc headers number_columns last_row)) {
		# various keys
		$self->{$_} = $Stream->{$_};
	}
	for (my $i = 0; $i < $Stream->number_columns; $i++) {
		# column metadata
		my %md = $Stream->metadata($i);
		$self->{$i} = \%md;
	}
	my @comments = $Stream->comments;
	push @{$self->{other}}, @comments;
	
	# first row column headers
	$self->{data_table}->[0] = [ @{ $Stream->{data_table}->[0] } ];
	
	# load the data from the first child
	while (my $row = $Stream->next_row) {
		$self->add_row($row);
	}
	$Stream->close_fh;
	
	# load remaining files; each must match the established column count
	foreach my $file (@files) {
		my $Stream = $class->new(in => $file);
		if ($Stream->number_columns != $self->number_columns) {
			confess "fatal error: child file $file has a different number of columns!";
		}
		while (my $row = $Stream->next_row) {
			my $a = $row->row_values;
			$self->add_row($a);
		}
		$Stream->close_fh;
	}
	$self->verify;
	
	# clean up: remove the now-merged child files from disk
	unlink($first, @files);
	return $self->last_row;
}
2103              
2104              
2105              
2106              
2107             ### Export summary data file
### Export summary data file
# Normalize a user-supplied list-ish argument into a flat list.
# Accepts an ARRAY ref (expanded), a SCALAR ref (dereferenced), or a
# plain true scalar (taken as-is). Returns an empty list otherwise.
# Fixes the original behavior which pushed a SCALAR *reference* onto
# the list unchanged and silently ignored plain string arguments.
sub _summary_arg_list {
	my $value = shift;
	return unless defined $value and $value;
	my $type = ref $value;
	return @{$value} if $type eq 'ARRAY';
	return ${$value} if $type eq 'SCALAR';    # dereference, not the ref itself
	return $value if not $type;               # plain string or number
	return;                                   # unsupported ref type
}

# Write a "summary" file: one averaged score column per dataset, with
# one row per original data column (window). Returns the written file
# name on success, or undef/empty on failure.
# Recognized arguments:
#   filename    => output base name (defaults to the opened file's name)
#   dataset     => dataset name(s): array ref, scalar ref, or string
#   startcolumn => first data column index per dataset (same forms)
#   endcolumn   => last data column index per dataset (stopcolumn synonym)
sub summary_file {
	my $self = shift;

	# Collect passed arguments
	my %args = @_;

	# parameters
	# either one or more datasets can be summarized
	my $outfile      = $args{'filename'} || undef;
	my @datasets     = _summary_arg_list( $args{dataset} );
	my @startcolumns = _summary_arg_list( $args{startcolumn} );
	$args{endcolumn} ||= $args{stopcolumn};    # stopcolumn is a synonym
	my @endcolumns   = _summary_arg_list( $args{endcolumn} );

	# Check required values
	unless ( $self->verify ) {
		cluck "bad data structure!";
		return;
	}
	unless ( defined $outfile ) {
		if ( $self->basename ) {
			# use the opened file's filename if it exists
			# prepend the path if it exists
			# the extension will be added later
			$outfile = $self->path . $self->basename;
		}
		else {
			# message previously referenced the old sub name write_summary_data
			cluck "no filename passed to summary_file!\n";
			return;
		}
	}

	# Identify possible dataset columns
	# skip columns that are clearly coordinate or identifier columns
	my %possibles;
	my %skip = map { $_ => 1 } qw(systematicname name id alias aliases type class
		geneclass chromosome chromo seq_id seqid start stop end gene strand
		length primary_id merged_transcript_length transcript_cds_length
		transcript_5p_utr_length transcript_3p_utr_length);

	# walk through the dataset names, grouping columns by their dataset metadata
	$possibles{unknown} = [];
	for ( my $i = 0; $i < $self->number_columns; $i++ ) {
		next if exists $skip{ lc $self->{$i}{'name'} };
		my $d = $self->metadata( $i, 'dataset' ) || undef;
		if ( defined $d ) {
			# we have what appears to be a dataset column
			$possibles{$d} ||= [];
			push @{ $possibles{$d} }, $i;
		}
		else {
			# still an unknown possibility
			push @{ $possibles{unknown} }, $i;
		}
	}

	# check datasets
	unless (@datasets) {
		if ( scalar keys %possibles > 1 ) {
			# we will always have the unknown category, so anything more than one
			# means we found legitimate dataset columns
			delete $possibles{unknown};
		}
		@datasets = sort { $a cmp $b } keys %possibles;
	}

	# check starts
	if ( scalar @startcolumns != scalar @datasets ) {
		@startcolumns = ();    # ignore what we were given?
		foreach my $d (@datasets) {
			# take the first column with this dataset
			push @startcolumns, $possibles{$d}->[0];
		}
	}

	# check stops
	if ( scalar @endcolumns != scalar @datasets ) {
		@endcolumns = ();      # ignore what we were given?
		foreach my $d (@datasets) {
			# take the last column with this dataset
			push @endcolumns, $possibles{$d}->[-1];
		}
	}

	# Prepare Data object to store the summed data
	my $summed_data = $self->new(
		feature => 'averaged_windows',
		columns => [ 'Window', 'Midpoint' ],
	);

	# Go through each dataset
	foreach my $d ( 0 .. $#datasets ) {

		# Prepare score column name
		my $data_name = simplify_dataset_name( $datasets[$d] );

		# add column
		my $i = $summed_data->add_column($data_name);
		$summed_data->metadata( $i, 'dataset', $datasets[$d] );

		# tag for remembering we're working with percentile bins
		my $do_percentile = 0;

		# remember the output row for this dataset's columns
		my $out_row = 1;

		# Collect summarized data, one output row per input column
		for (
			my $column = $startcolumns[$d];
			$column <= $endcolumns[$d];
			$column++
			)
		{
			# determine the midpoint position of the window
			# this assumes the column metadata has start and stop
			my $midpoint = int( sum0( $self->metadata( $column, 'start' ),
				$self->metadata( $column, 'stop' ) ) / 2 );

			# convert midpoint to fraction of 1000 for plotting if necessary
			if ( substr( $self->name($column), -1 ) eq '%' ) {
				$midpoint *= 10;    # midpoint * 0.01 * 1000 bp
				$do_percentile++;
			}
			if ( $do_percentile and substr( $self->name($column), -2 ) eq 'bp' ) {
				# working on the extension after the percentile bins
				$midpoint += 1000;
			}

			# collect the values in the column
			my @values;
			for my $r ( 1 .. $self->last_row ) {
				my $v = $self->value( $r, $column );
				push @values, $v eq '.' ? 0 : $v;    # treat nulls as zero
			}

			# adjust if log value
			my $log = $self->metadata( $column, 'log2' ) || 0;
			if ($log) {
				@values = map { 2**$_ } @values;
			}

			# determine mean value, guarding against an empty data table
			my $window_mean;
			if (@values) {
				$window_mean = sum0(@values) / scalar(@values);
				$window_mean = log($window_mean) / log(2) if $log;
			}
			else {
				$window_mean = 0;    # no data rows; avoid division by zero
			}

			# push to summed output
			if ( $d == 0 ) {
				# this is the first dataset, so we need to add a row
				$summed_data->add_row(
					[ $self->{$column}{'name'}, $midpoint, $window_mean ] );
			}
			else {
				# we're summarizing multiple datasets, we already have name midpoint
				# first do sanity check
				if ( $summed_data->value( $out_row, 1 ) != $midpoint ) {
					carp("unable to summarize multiple datasets with nonequal columns of data!");
					return;
				}
				$summed_data->value( $out_row, $i, $window_mean );
			}
			$out_row++;
		}
	}

	# Write summed data
	$outfile =~ s/\.txt(\.gz)?$//i;    # strip any .txt or .gz extensions if present
	my $written_file = $summed_data->write_file(
		'filename' => $outfile . '_summary.txt',
		'gz'       => 0,
	);
	return $written_file;
}
2302              
2303              
2304              
2305              
2306              
2307             ####################################################
2308              
package Bio::ToolBox::Data::Iterator;
use Bio::ToolBox::Data::Feature;
use Carp;

# Lightweight forward-only iterator over the rows of a
# Bio::ToolBox::Data table. Row indexing starts at 1 (row 0 holds the
# column headers in the parent data table).
sub new {
	my ( $class, $data ) = @_;
	# start at the first data row and keep a handle on the parent object
	return bless {
		'index' => 1,
		'data'  => $data,
	}, $class;
}

# Return the next row as a Bio::ToolBox::Data::Feature object, or
# undef once the table is exhausted. If the parent has a cached
# SeqFeature object for this row, pass it along to the Feature.
sub next_row {
	my $self    = shift;
	my $current = $self->{'index'};
	return if $current > $self->{data}->{last_row};    # no more rows
	$self->{'index'} = $current + 1;
	my @options = (
		'data'  => $self->{data},
		'index' => $current,
	);
	if (    exists $self->{data}->{SeqFeatureObjects}
		and defined $self->{data}->{SeqFeatureObjects}->[$current] )
	{
		push @options, 'feature', $self->{data}->{SeqFeatureObjects}->[$current];
	}
	return Bio::ToolBox::Data::Feature->new(@options);
}

# Current row index (the row that next_row will return next).
# Read-only accessor.
sub row_index {
	my $self = shift;
	carp "row_index is a read only method" if @_;
	return $self->{'index'};
}
2345              
2346              
2347              
2348             ####################################################
2349              
2350             __END__