File Coverage

blib/lib/Bio/ToolBox/Data.pm
Criterion Covered Total %
statement 321 614 52.2
branch 108 290 37.2
condition 59 168 35.1
subroutine 30 35 85.7
pod 22 22 100.0
total 540 1129 47.8


line stmt bran cond sub pod time code
1             package Bio::ToolBox::Data;
2             our $VERSION = '1.68';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::Data - Reading, writing, and manipulating data structure
7              
8             =head1 SYNOPSIS
9              
10             use Bio::ToolBox::Data;
11            
12             ### Open a pre-existing file
13             # a data table with same columns as input file
14             my $Data = Bio::ToolBox::Data->new(
15             file => 'coordinates.bed',
16             );
17            
18             ### Parse a GTF, GFF3, refFlat, or genePred gene table
19             # data table with names and references to fully parsed
20             # SeqFeature transcript objects with exon SeqFeature subfeatures
21             my $Data = Bio::ToolBox::Data->new(
22             file => 'annotation.gtf.gz',
23             parse => 1,
24             feature => 'transcript',
25             subfeature => 'exon'
26             );
27            
28             ### New gene list from a Bio::DB::SeqFeature::Store database
29             # data table with name and reference ID to database SeqFeature objects
30             my $Data = Bio::ToolBox::Data->new(
31             db => 'hg19.sqlite',
32             feature => 'gene:ensGene',
33             );
34            
35            
36             ### Get a specific value
37             my $value = $Data->value($row, $column);
38            
39             ### Replace or add a value
40             $Data->value($row, $column, $new_value);
41            
42             ### Add a column
43             my $new_index = $Data->add_column('Data2');
44            
45             ### Find a column
46             my $old_index = $Data->find_column('Data1');
47            
48             ### Return a specific row as an object
49             my $row = $Data->get_row($i); # Bio::ToolBox::Data::Feature
50            
51             ### Iterate through a Data structure one row at a time
52             my $stream = $Data->row_stream;
53             while (my $row = $stream->next_row) {
54             # each row is returned as Bio::ToolBox::Data::Feature object
55             # get the positional information from the file data
56             # assuming that the input file had these identifiable columns
57             my $seq_id = $row->seq_id;
58             my $start = $row->start;
59             my $stop = $row->end;
60            
61             # work with the referenced SeqFeature object
62             my $seqf = $row->seqfeature;
63            
64             # generate a new Bio::Seq object from the database using
65             # these coordinates
66             my $region = $db->segment($seq_id, $start, $stop);
67            
68             # modify a row value
69             my $value = $row->value($old_index);
70             my $new_value = $value + 1;
71             $row->value($new_index, $new_value);
72             }
73            
74             ### Iterate through a Data table with a code reference
75             $Data->iterate(\&my_code);
76            
77            
78             ### write the data to file
79             my $success = $Data->write_file(
80             filename => 'new_data.txt',
81             gz => 1,
82             );
83             print "wrote new file $success\n"; # file is new_data.txt.gz
84              
85             =head1 DESCRIPTION
86              
87             This module works with the primary Bio::ToolBox Data structure. Simply, it
88             is a complex data structure representing a tabbed-delimited table (array
89             of arrays), with plenty of options for metadata. Many common bioinformatic
90             file formats are simply tabbed-delimited text files (think BED, GFF, VCF).
91             Each row is a feature or genomic interval, and each column is a piece of
92             information about that feature, such as name, type, and/or coordinates.
93             We can append to that file additional columns of information, perhaps
94             scores from genomic data sets. We can record metadata regarding how
95             and where we obtained that data. Finally, we can write the updated
96             table to a new file.
97              
98             =head1 METHODS
99              
100             =head2 Initializing the structure
101              
102             =over 4
103              
104             =item new
105              
106             Initialize a new Data structure. This generally requires options,
107             provided as an array of key =E values. A new list of features
108             may be obtained from an annotation database or an existing file
109             may be loaded. If you do not pass any options, a new empty
110             structure will be generated for you to populate.
111              
112             These are the options available.
113              
114             =over 4
115              
116             =item file
117              
118             =item in
119              
120             my $Data = Bio::ToolBox::Data->new(file => $file);
121              
122             Provide the path and name to an existing tabbed-delimited text
123             file from which to load the contents. This is a shortcut to the
124             load_file() method. See that method for more details.
125              
126             =item stream
127              
128             my $Data = Bio::ToolBox::Data->new(file => $file, stream => 1);
129              
130             Boolean option indicating that the file should be opened as a file
L131             stream. A L<Bio::ToolBox::Data::Stream> object will be returned. This
132             is a convenience method.
133              
134             =item noheader
135              
136             my $Data = Bio::ToolBox::Data->new(file => $file, noheader => 1);
137              
138             Boolean option indicating that the file does not have file headers,
139             in which case dummy headers are provided. This is not necessary for
140             defined file types that don't normally have file headers, such as
141             BED, GFF, or UCSC files.
142              
143             =item parse
144              
145             my $Data = Bio::ToolBox::Data->new(file => $file, parse => 1);
146              
147             Boolean option indicating that a gene annotation table or file should
148             be parsed into SeqFeature objects and a general table of names and IDs
149             representing those objects be generated. The annotation file may
150             be specified in one of two ways: Through the file option above,
151             or in the database metadata of an existing table file representing
152             previously parsed objects.
153              
154             =item db
155              
156             my $Data = Bio::ToolBox::Data->new(db => 'hg19', feature => 'gene');
157              
158             Provide the name of the database from which to collect the
159             features. It may be a short name, whereupon it is checked in
160             the L configuration file F<.biotoolbox.cfg> for
161             connection information. Alternatively, a path to a database
162             file or directory may be given.
163              
164             If you already have an opened L database
165             object, you can simply pass that. See L for
166             more information. However, this in general should be discouraged,
167             since the name of the database will not be properly recorded when
168             saving to file. It is better to simply pass the name of database
169             again; multiple connections to the same database are smartly handled
170             in the background.
171              
172             =item feature
173              
174             my $Data = Bio::ToolBox::Data->new(file => $filename, feature => 'gene');
175              
176             For de novo lists from an annotation database, provide the GFF
177             type or type:source (columns 3 and 2) for collection. A comma
178             delimited string may be accepted for databases. For parsed files,
179             only a simple string is accepted.
180              
181             For a list of genomic intervals across the genome, specify a
182             feature of 'genome' with a database object.
183              
184             =item subfeature
185              
186             When parsing annotation files such as GTF, one or more subfeature
L187             types, e.g. C<exon> or C<CDS>, may be specified as a comma-delimited
188             string. This ensures that the subfeatures will be parsed into
189             SeqFeature objects. Otherwise, only the top level features will be
190             parsed. This expedites parsing by skipping unwanted features.
191              
192             =item win
193              
194             =item step
195              
196             my $Data = Bio::ToolBox::Data->new(db => $dbname, win => $integer);
197              
198             If generating a list of genomic intervals, optionally provide
199             the window and step values. The default is 500 bp.
200              
201             =item chrskip
202              
203             Provide a regular expression compatible or C string for skipping or
204             excluding specific or classes of chromosomes, for example the mitochondrial
205             chromosome or unmapped contigs. This works with both feature collection
206             and genomic intervals. The default is to take all chromosomes.
207              
208             =back
209              
210             When no file is given or database given to search, then a new,
211             empty Data object is returned. In this case, you may optionally
212             specify the names of the columns or indicate a specific file
213             format structure to assign column names. The following options can
214             then be provided.
215              
216             =over 4
217              
218             =item columns
219              
220             =item datasets
221              
222             my $Data = Bio::ToolBox::Data->new(columns => [qw(Column1 Column2 ...)] );
223              
224             Provide the column names in advance as an anonymous array.
225              
226             =item gff
227              
228             Pass the GFF version of the file: 1, 2, 2.5 (GTF), or 3.
229              
230             =item bed
231              
232             Pass the number of BED columns (3-12).
233              
234             =item ucsc
235              
236             Pass the number of columns to indicate the type of UCSC format. These
237             include 10 (refFlat without gene names), 11 (refFlat with gene names),
238             12 (knownGene gene prediction table), and 15
239             (an extended gene prediction or genePredExt table).
240              
241             =back
242              
243             If successful, the method will return a Bio::ToolBox::Data object.
244              
245             =item duplicate
246              
247             This will create a new Data object containing the same column
248             headers and metadata, but lacking the table content, i.e. no
249             rows of data. File name metadata, if present in the original, is
250             not preserved. The purpose here, for example, is to allow one
251             to selectively copy rows from one Data object to another.
252              
253             =item parse_table
254              
255             $Data->parse_table($file)
256             $Data->parse_table( {
257             file => $file,
258             feature => 'gene',
259             subfeature => 'exon',
260             chrskip => 'chrM|contig',
261             } );
262              
263             This will parse a gene annotation table into SeqFeature objects.
264             If this is called from an empty Data object, then the table will
265             be filled with the SeqFeature object names and IDs. If this is
266             called from a non-empty Data object, then the table's contents
267             will be associated with the SeqFeature objects using their name and
268             ID. The stored SeqFeature objects can be retrieved using the
L269             L</get_seqfeature> method.
270              
271             Pass the method a single argument. This may be either a simple
272             scalar to a filename to be parsed, or a reference to hash of
273             one or more argument options. Possible options include:
274              
275             =over 4
276              
277             =item file
278              
279             The file name of the gene table to be parsed. This may
280             be a GFF, GFF3, GTF, or any of the UCSC gene table formats.
281             These will be parsed using Bio::ToolBox::parser::* adapters.
282              
283             =item feature
284              
285             A regular expression compatible string or C string to match
286             the top features C to keep. The C is not
287             checked. The default is 'gene', although a transcript type could
288             alternatively be specified (in which case transcripts are not
289             parsed in gene features).
290              
291             =item subfeature
292              
293             A regular expression compatible string or C string to match
294             any sub features C to parse. The C is not checked.
295             Typically these include exon, CDS, or UTR. The default is nothing.
296              
297             =item chrskip
298              
299             A regular expression compatible string or C string to match
300             chromosomes to be skipped or excluded. Any feature with a matching
301             C chromosome name will be skipped.
302              
303             =back
304              
305             =back
306              
307             =head2 General Metadata
308              
309             There is a variety of general metadata regarding the Data
310             structure. The following methods may be used to access or set these
311             metadata properties. Some of these are stored as comment lines at
312             the beginning of the file, and will be read or set when the file is
313             loaded.
314              
315             =over
316              
317             =item feature
318              
319             $Data->feature('something');
320             my $feature = $Data->feature;
321              
322             Returns or sets the name of the features used to collect
323             the list of features. The actual feature types are listed
324             in the table, so this metadata is merely descriptive.
325              
326             =item feature_type
327              
328             Returns one of three specific values describing the contents
329             of the data table inferred by the presence of specific column
330             names. This provides a clue as to whether the table features
331             represent genomic regions (defined by coordinate positions) or
332             named database features. The return values include:
333              
334             =over 4
335              
336             =item * coordinate
337              
338             Table includes at least chromosome and start columns.
339              
340             =item * named
341              
342             Table includes name, type, and/or Primary_ID, possibly
343             referring to named database features.
344              
345             =item * unknown
346              
347             Table is unrecognized format.
348              
349             =back
350              
351             =item program
352              
353             Returns or sets the name of the program generating the list.
354              
355             =item database
356              
357             Returns or sets the name or path of the database from which the
358             features were derived.
359              
360             =item gff
361              
362             Returns or sets the version of loaded GFF files. Supported versions
363             included 1, 2, 2.5 (GTF), and 3.
364              
365             =item bed
366              
367             Returns or sets the BED file version. Here, the BED version is simply
368             the number of columns.
369              
370             =item ucsc
371              
372             Returns or sets the UCSC file format version. Here, the version is
373             simply the number of columns. Supported versions include 10 (gene
374             prediction), 11 (refFlat, or gene prediction with gene name), 12
375             (knownGene table), 15 (extended gene prediction), or 16 (extended
376             gene prediction with bin).
377              
378             =item vcf
379              
380             Returns or sets the VCF file version number. VCF support is limited.
381              
382             =back
383              
384             =head2 File information
385              
386             These methods provide information about the file from which the
387             data table was loaded. This does not include parsed annotation tables.
388              
389             =over 4
390              
391             =item filename
392              
393             =item path
394              
395             =item basename
396              
397             =item extension
398              
399             Returns the filename, full path, basename, and extension of
400             the filename. Concatenating the last three values will reconstitute
401             the first original filename.
402              
403             =item add_file_metadata
404              
405             $Data->add_file_metadata('/path/to/file.txt');
406              
407             Add filename metadata. This will automatically parse the path,
408             basename, and recognized extension from the passed filename and
409             set the appropriate metadata attributes.
410              
411             =back
412              
413             =head2 Metadata comments
414              
415             Comments are any other commented lines from a text file (lines
416             beginning with a #) that were not parsed as general metadata.
417              
418             =over 4
419              
420             =item comments
421              
422             Returns a copy of the array containing commented lines. Each
423             comment line becomes an element in the array.
424              
425             =item add_comment
426              
427             Appends the text string to the comment array.
428              
429             =item delete_comment
430              
431             Deletes a comment. Provide the array index of the comment to
432             delete. If an index is not provided, ALL comments will be deleted!
433              
434             =item vcf_headers
435              
436             For VCF files, this will partially parse the VCF headers into a
437             hash structure that can be queried or manipulated. Each header
438             line is parsed for the primary key, being the first word after the
439             ## prefix, e.g. INFO, FORMAT, FILTER, contig, etc. For the simple
440             values, they are stored as the value. For complex entries, such as
441             with INFO and FORMAT, a second level hash is created with the ID
L442             extracted and used as the second level key. The value is always
L443             the remainder of the string.
444              
445             For example, the following would be a simple parsed vcf header in
446             code representation.
447              
448             $vcf_header = {
449             FORMAT => {
L450             GT => q(ID=GT,Number=1,Type=String,Description="Genotype"),
L451             AD => q(ID=AD,Number=.,Type=Integer,Description="ref,alt Allelic depths"),
452             },
453             fileDate => 20150715,
454             }
455              
456             =item rewrite_vcf_headers
457              
458             If you have altered the vcf headers exported by the vcf_headers()
459             method, then this method will rewrite the hash structure as new
L460             comment lines. Do this prior to writing or saving the Data structure
461             or else you will lose your changed VCF header metadata.
462              
463             =back
464              
465             =head2 The Data table
466              
467             The Data table is the array of arrays containing all of the
468             actual information. Rows and columns are indexed using 0-based
469             indexing as with all Perl arrays. Row 0 is always the column
470             header row containing the column names, regardless whether an
471             actual header name existed in the original file format (e.g.
472             BED or GFF formats). Any individual table "cell" can be
473             specified as C<[$row][$column]>.
474              
475             =over 4
476              
477             =item list_columns
478              
479             Returns an array or array reference of the column names
480             in ascending (left to right) order.
481              
482             =item number_columns
483              
484             Returns the number of columns in the Data table.
485              
486             =item last_column
487              
488             Returns the array index number for the last (right most)
489             column. This number is always 1 less than the value
490             returned by number_columns() due to 0-based indexing.
491              
492             =item last_row
493              
494             Returns the array index number of the last row.
495             Since the header row is index 0, this is also the
496             number of actual content rows.
497              
498             =item column_values
499              
500             my $values = $Data->column_values($i);
501              
502             Returns an array or array reference representing the values
503             in the specified column. This includes the column header as
504             the first element. Pass the method the column index.
505              
506             =item add_column
507              
508             # add empty column with name
509             my $i = $Data->add_column($name);
510            
511             # add entire column
512             my $i = $Data->add_column(\@array);
513              
514             Appends a new column to the Data table at the
515             rightmost position (highest index). It adds the column
516             header name and creates a new column metadata hash.
517             Pass the method one of two possibilities. Pass a text
518             string representing the new column name, in which case
519             no data will be added to the column. Alternatively, pass
520             an array reference, and the contents of the array will
521             become the column data. If the Data table already has
522             rows, then the passed array reference must have the same
523             number of elements.
524              
525             It returns the new column index if successful.
526              
527             =item copy_column
528              
529             my $j = $Data->copy_column($i);
530              
531             This will copy a column, appending the duplicate column at
532             the rightmost position (highest index). It will duplicate
533             column metadata as well. It will return the new index
534             position.
535              
536             =item delete_column
537              
538             $Data->delete_column($i, $j);
539              
540             Deletes one or more specified columns. Any remaining
541             columns rightwards will have their indices shifted
542             down appropriately. If you had identified one of the
543             shifted columns, you may need to re-find or calculate
544             its new index.
545              
546             =item reorder_column
547              
548             $Data->reorder_column($c,$b,$a,$a);
549              
550             Reorders columns into the specified order. Provide the
551             new desired order of indices. Columns could be duplicated
552             or deleted using this method. The columns will adopt their
553             new index numbers.
554              
555             =item add_row
556              
557             $Data->add_row(\@values);
558             $Data->add_row($Row); # Bio::ToolBox::Data::Feature object
559              
560             Add a new row of data values to the end of the Data table.
561             Optionally provide either a reference to an array of values
L562             to put in the row, or pass a L<Bio::ToolBox::Data::Feature>
563             Row object, such as one obtained from another Data object.
564             If the number of columns do not match, the array is filled
565             up with null values for missing columns, or excess values
566             are dropped.
567              
568             =item get_row
569              
570             $Data->get_row($i); # Bio::ToolBox::Data::Feature object
571              
L572             Returns the specified row as a L<Bio::ToolBox::Data::Feature>
573             object.
574              
575             =item delete_row
576              
577             $Data->delete_row($i, $j);
578              
579             Deletes one or more specified rows. Rows are spliced out
580             highest to lowest index to avoid issues. Be very careful
581             deleting rows while simultaneously iterating through the
582             table!
583              
584             =item row_values
585              
586             my $row_values = $Data->row_values($i);
587              
588             Returns a copy of an array for the specified row index.
589             Modifying this returned array does not migrate back to the
L590             Data table; use the L</value> method below instead.
591              
592             =item value
593              
594             my $value = $Data->value($row, $column);
595             $Data->value($row, $column, $new_value);
596              
597             Returns or sets the value at a specific row or column index.
598             Index positions are 0-based (header row is index 0).
599              
600             =back
601              
602             =head2 Column Metadata
603              
L604             Each column has metadata. Each metadata is a series of key =E<gt>
605             value pairs. The minimum keys are 'index' (the 0-based index
606             of the column) and 'name' (the column header name). Additional
607             keys and values may be queried or set as appropriate. When the
608             file is written, these are stored as commented metadata lines at
609             the beginning of the file.
610              
611             =over 4
612              
613             =item name
614              
615             $Data->name($index, $new_name);
616             my $name = $Data->name($i);
617              
618             Convenient method to return the name of the column given the
619             index number. A column may also be renamed by passing a new name.
620              
621             =item metadata
622              
623             $Data->metadata($index, $key, $new_value);
624             my $value = $Data->metadata($index, $key)
625              
626             Returns or sets the metadata value for a specific $key for a
627             specific column $index.
628              
629             This may also be used to add a new metadata key. Simply provide
630             the name of a new $key that is not present
631              
632             If no key is provided, then a hash or hash reference is returned
633             representing the entire metadata for that column.
634              
635             =item copy_metadata
636              
637             $Data->copy_metadata($source, $target);
638              
639             This method will copy the metadata (everything except name and
640             index) between the source column and target column. Returns 1 if
641             successful.
642              
643             =item delete_metadata
644              
645             $Data->delete_metadata($index, $key);
646              
647             Deletes a column-specific metadata $key and value for a specific
648             column $index. If a $key is not provided, then all metadata keys
649             for that index will be deleted.
650              
651             =item find_column
652              
653             my $i = $Data->find_column('Gene');
654             my $i = $Data->find_column('^Gene$')
655              
656             Searches the column names for the specified column name. This
657             employs a case-insensitive grep search, so simple substitutions
658             may be made.
659              
660             =item chromo_column
661              
662             =item start_column
663              
664             =item stop_column
665              
666             =item strand_column
667              
668             =item name_column
669              
670             =item type_column
671              
672             =item id_column
673              
674             These methods will return the identified column best matching
L675             the description. Returns C<undef> if that column is not present.
L676             These use the L</find_column> method with a predefined list of
677             aliases.
678              
679             =back
680              
681             =head2 Working with databases
682              
683             These are methods for working primarily with annotation databases.
684              
685             =over 4
686              
687             =item open_database
688              
689             This is wrapper method that tries to do the right thing and passes
L690             on to either L</open_meta_database> or L</open_new_database> methods.
691             Basically a legacy method for L.
692              
693             =item open_meta_database
694              
695             Open the database that is listed in the metadata. Returns the
696             database connection. Pass a true value to force a new database
697             connection to be opened, rather than returning a cached connection
698             object (useful when forking).
699              
700             =item open_new_database
701              
702             Convenience method for opening a second or new database that is
703             not specified in the metadata, useful for data collection. This
704             is a shortcut to L.
705             Pass the database name.
706              
707             =back
708              
709             =head2 SeqFeature Objects
710              
711             SeqFeature objects corresponding to data rows can be stored in the Data
712             object. This can be useful if the SeqFeature object is not readily
713             available from a database or is processor intensive in generating or
714             parsing. Note that storing large numbers of objects will increase memory
715             usage.
716              
717             SeqFeature objects are usually either L,
718             L, or L objects, depending
719             upon their source. More information can obtained from the L
720             abstract API.
721              
722             =over 4
723              
724             =item store_seqfeature
725              
726             $Data->store_seqfeature($row_index, $seqfeature);
727              
728             Stores the SeqFeature object for the given row index. Only one SeqFeature
729             object can be stored per row.
730              
731             =item get_seqfeature
732              
733             my $feature = $Data->get_seqfeature($row_index);
734              
735             Retrieves the SeqFeature object for the given row index.
736              
737             =item delete_seqfeature
738              
739             $Data->delete_seqfeature($row_index);
740              
741             Removes the SeqFeature object for the given row index.
742              
743             =item collapse_gene_transcripts
744              
745             my $success = $Data->collapse_gene_transcripts;
746              
747             This method will iterate through a Data table and collapse multiple alternative
748             transcript subfeatures of stored gene SeqFeature objects in the table. Exons
749             of multiple transcripts will be merged, maximizing exon size and minimizing
750             intron size. Genes with only one transcript will not be affected. Stored
751             SeqFeature objects that do not have a C<primary_tag> of "gene" are silently
752             skipped. Refer to the L method
753             for more details. The number of rows successfully collapsed is returned.
754              
755             =item add_transcript_length
756              
757             my $length_index = $Data->add_transcript_length;
758             my $length_index = $Data->add_transcript_length('cds');
759              
760             This method will generate a new column in the Data table representing the
761             length of a transcript, i.e. the sum of the length of subfeatures for
762             the stored SeqFeature object in the Data table. The default subfeature is
763             C<exon>; however, alternative subfeature types may be passed to the method
764             and used. These include C<cds>, C<5p_utr>, and C<3p_utr> for CDS, the 5C<'> UTR,
765             and the 3C<'> UTR, respectively. See the corresponding transcript length
766             methods in L for more information. If a length
767             is not calculated, for example the feature C<primary_tag> is a "gene",
768             then the simple length of the feature is recorded.
769              
770             The name of the new column is one of "Merged_Transcript_Length" for exon,
771             "Transcript_CDS_Length", "Transcript_5p_UTR_Length", or "Transcript_3p_UTR_Length".
772             The index of the new length column is returned.
773              
774             =back
775              
776             =head2 Data Table Functions
777              
778             These methods alter the Data table en masse.
779              
780             =over 4
781              
782             =item verify
783              
784             This method will verify the Data structure, including the metadata and the
785             Data table. It ensures that the table has the correct number of rows and
786             columns as described in the metadata, and that each column has the basic
787             metadata.
788              
789             If the Data structure is marked as a GFF or BED structure, then the table
790             is checked that the structure matches the proper format. If not, for
791             example when additional columns have been added, then the GFF or BED value
792             is set to null.
793              
794             This method is automatically called prior to writing the Data table to file.
795              
796             =item sort_data
797              
798             $Data->sort_data($index, 'i'); # increasing sort
799              
800             This method will sort the Data table by the values in the indicated column.
801             It will automatically determine whether the contents of the column are
802             numbers or alphanumeric, and will sort accordingly, either numerically or
803             asciibetically. The first non-null value in the column is used to determine.
804             The sort may fail if the values are not consistent. Pass a second optional
805             value to indicate the direction of the sort. The value should be either
806             'i' for 'increasing' or 'd' for 'decreasing'. The default order is
807             increasing.
808              
809             =item gsort_data
810              
811             This method will sort the Data table by increasing genomic coordinates. It
812             requires the presence of chromosome and start (or position) columns,
813             identified by their column names. These are automatically identified.
814             Failure to find these columns mean a failure to sort the table. Chromosome
815             names are sorted first by their digits (e.g. chr2 before chr10), and then
816             alphanumerically. Base coordinates are sorted by increasing value.
817             Identical positions are kept in their original order.
818              
819             =item splice_data
820              
821             my $Data = Bio::ToolBox::Data->new(file => $file);
822             my $pm = Parallel::ForkManager->new(4);
823             for my $i (1..4) {
824             $pm->start and next;
825             ### in child ###
826             $Data->splice_data($i, 4);
827             # do something with this portion
828             # then save to a temporary unique file
829             $Data->save("$file_$i");
830             $pm->finish;
831             }
832             $pm->wait_all_children;
833             # reload children files
834             $Data->reload_children(glob "$file_*");
835              
836             This method will splice the Data table into C<$total_parts> number of pieces,
837             retaining the C<$current_part> piece. The other parts are discarded. This
838             method is intended to be used when a program is forked into separate
839             processes, allowing each child process to work on a subset of the original
840             Data table.
841              
842             Two values are passed to the method. The first is the current part number,
843             1-based. The second value is the total number of parts that the table
844             should be divided, corresponding to the number of concurrent processes.
845             One easy approach to forking is to use L. The
846             above example shows how to fork into four concurrent processes.
847              
848             Since each forked child process is separate from their parent process,
849             their contents must be reloaded into the current Data object. The
850             L documentation recommends going through a disk
851             file intermediate. Therefore, write each child Data object to file using
852             a unique name. Once all children have been reaped, they can be reloaded
853             into the current Data object using the L method.
854              
855             Remember that if you fork your script into child processes, any database
856             connections must be re-opened; they are typically not clone safe. If you
857             have an existing database connection by using the L method,
858             it should be automatically re-opened for you when you use the L
859             method, but you will need to call L again in the child
860             process to obtain the new database object.
861              
862             =item reload_children
863              
864             Discards the current data table in memory and reloads two or more files
865             written from forked children processes. Provide the name of the child
866             files in the order you want them loaded. The files will be automatically
867             deleted if successfully loaded. Returns the number of lines reloaded on
868             success.
869              
870             =back
871              
872             =head2 File Functions
873              
874             The Data table may be read in from a file or written out as a file. In
875             all cases, it is a tab-delimited text file, whether as an ad hoc table
876             or a specific bioinformatic format, e.g. BED, GFF, etc. Multiple common
877             file formats are supported. Column headers are assumed, except in those
878             cases where it is not, e.g. BED, GFF, etc. Metadata may be included as
879             commented lines at the beginning of the file, prefixed with a C<#> symbol.
880             Reading and writing gzip compressed files is fully supported.
881              
882             =over 4
883              
884             =item load_file
885              
886             $Data->load_file($filename);
887              
888             This will load a file into a new, empty Data table. This function is
889             called automatically when a filename is provided to the L function.
890             The existence of the file is first checked (appending common missing
891             extensions as necessary), metadata and column headers processed and/or
892             generated from default settings, the content loaded into the table, and
893             the structure verified. Error messages may be printed if the structure or
894             format is inconsistent or doesn't match the expected format, e.g a file
895             with a F<.bed> extension doesn't match the UCSC specification.
896             Pass the name of the filename.
897              
898             =item taste_file
899              
900             my $flavor = $Data->taste_file($filename);
901             # returns gff, bed, ucsc, or undef
902              
903             Tastes, or checks, a file for a certain flavor, or known gene file formats.
904             This is based on file extension, metadata headers, and/or file content
905             in the first 10 lines or so. Returns a string based on the file format.
906             Values include gff, bed, ucsc, or undefined. Useful for determining if
907             the file represents a known gene table format that lacks a defined file
908             extension, e.g. UCSC formats.
909              
910             =item save
911              
912             =item write_file
913              
914             my $success = $Data->save;
915             my $success = $Data->save('my_file.txt');
916             my $success = $Data->save(filename => $file, gz => 1);
917             print "file $success was saved!\n";
918              
919             Pass the file name to be written. If no file name is passed, then
920             the filename and path stored in the metadata are used, if present.
921              
922             These methods will write the Data structure out to file. It will
923             be first verified as to proper structure. Opened BED and GFF files
924             are checked to see if their structure is maintained. If so, they
925             are written in the same format; if not, they are written as regular
926             tab-delimited text files.
927              
928             You may pass additional options.
929              
930             =over 4
931              
932             =item filename
933              
934             Optionally pass a new filename. Required for new objects; previous
935             opened files may be overwritten if a new name is not provided. If
936             necessary, the file extension may be changed; for example, BED files
937             that no longer match the defined format lose the .bed and gain a .txt
938             extension. Compression may add or strip .gz as appropriate. If
939             a path is not provided, the current working directory is used.
940              
941             =item gz
942              
943             Boolean value to change the compression status of the output file. If
944             overwriting an input file, the default is maintain the compression status,
945             otherwise no compression. Pass a 0 for no compression, 1 for standard
946             gzip compression, or 2 for block gzip (bgzip) compression for tabix
947             compatibility.
948              
949             =back
950              
951             If the file save is successful, it will return the full path and
952             name of the saved file, complete with any changes to the file extension.
953              
954             =item summary_file
955              
956             Write a separate file summarizing columns of data (mean values).
957             The mean value of each column becomes a row value, and each column
958             header becomes a row identifier (i.e. the table is transposed). The
959             best use of this is to summarize the mean profile of windowed data
960             collected across a feature. See the Bio::ToolBox scripts
961             L and L as examples.
962             For data from L where the columns are expressed
963             as percentile bins, the reported midpoint column is automatically
964             converted based on a length of 1000 bp.
965              
966             You may pass these options. They are optional.
967              
968             =over 4
969              
970             =item filename
971              
972             Pass an optional new filename. The default is to take the basename
973             and append "_summed" to it.
974              
975             =item startcolumn
976              
977             =item stopcolumn
978              
979             Provide the starting and ending columns to summarize. The default
980             start is the leftmost column without a recognized standard name.
981             The default ending column is the last rightmost column. Indexes are
982             0-based.
983              
984             =item dataset
985              
986             Pass a string that is the name of the dataset. This could be collected
987             from the metadata, if present. This will become the name of the score
988             column if defined.
989              
990             =back
991              
992             The name of the summarized column is either the provided dataset name,
993             the defined basename in the metadata of the Data structure, or a generic
994             name. If successful, it will return the name of the file saved.
995              
996             =back
997              
998             =head2 Verifying Datasets
999              
1000             When working with row Features and collecting scores, the dataset
1001             from which you are collecting must be verified prior to collection.
1002             This ensures that the proper database adaptor is available and loaded,
1003             and that the dataset is correctly specified (otherwise nothing would be
1004             collected). This verification is normally performed transparently when
1005             you call L or
1006             L.
1007             However, datasets may be explicitly verified prior to calling the score
1008             methods.
1009              
1010             =over 4
1011              
1012             =item verify_dataset
1013              
1014             my $dataset = $Data->verify_dataset($dataset, $database);
1015              
1016             Pass the name of the dataset (GFF type or type:source) for a GFF3-based
1017             database, e.g. , or path and file name for a
1018             data file, e.g. Bam, BigWig, BigBed, or USeq file. If a separate database
1019             is being used, pass the name or opened database object as a second
1020             parameter. For more advance options, see
1021             L.
1022              
1023             The name of the verified dataset, with a prefix if necessary, is returned.
1024              
1025             =back
1026              
1027             =head2 Efficient Data Access
1028              
1029             Most of the time we need to iterate over the Data table, one row
1030             at a time, collecting data or processing information. These methods
1031             simplify the process.
1032              
1033             =over 4
1034              
1035             =item iterate
1036              
1037             $Data->iterate( sub {
1038             my $row = shift;
1039             my $number = $row->value($index);
1040             my $log_number = log($number);
1041             $row->value($index, $log_number);
1042             } );
1043              
1044             This method will process a code reference on every row in the data
1045             table. Pass a subroutine or code reference. The subroutine will
1046             receive the row as a L object. With this
1047             object, you can retrieve values, set values, and add new values.
1048              
1049             =item row_stream
1050              
1051             This returns an C<Bio::ToolBox::Data::Iterator> object, which has one
1052             method, C<next_row>. Call this method repeatedly until it returns
1053             C<undef> to work through each row of data.
1054              
1055             Users of the C family of database adaptors may recognize the
1056             analogy to the C method.
1057              
1058             =item next_row
1059              
1060             my $stream = $Data->row_stream;
1061             while (my $row = $stream->next_row) {
1062             # each $row is a Bio::ToolBox::Data::Feature object
1063             # representing the row in the data table
1064             my $value = $row->value($index);
1065             # do something with $value
1066             }
1067              
1068             Called from a C object, it returns a
1069             L row object. If SeqFeature objects are
1070             associated with the row, perhaps from a parsed input annotation file,
1071             then they are automatically associated with the row object. (Previous
1072             versions required separately calling the seqfeature() row method to
1073             perform this.)
1074              
1075             =back
1076              
1077             =head1 SEE ALSO
1078              
1079             L, L, L,
1080             L, L, L, L,
1081             L, L
1082              
1083             =cut
1084              
1085 3     3   4197 use strict;
  3         7  
  3         118  
1086 3     3   21 use Carp qw(carp cluck croak confess);
  3         7  
  3         233  
1087 3     3   20 use List::Util qw(sum0);
  3         5  
  3         208  
1088 3     3   19 use base 'Bio::ToolBox::Data::core';
  3         6  
  3         1680  
1089 3         204 use Bio::ToolBox::db_helper qw(
1090             get_new_feature_list
1091             get_new_genome_list
1092             get_db_feature
1093 3     3   26 );
  3         6  
1094 3     3   22 use Bio::ToolBox::utility qw(simplify_dataset_name);
  3         7  
  3         155  
1095 3     3   22 use Module::Load;
  3         7  
  3         17  
1096              
1097             1;
1098              
1099              
1100             ### Initialize
1101              
sub new {
	# Construct a new Data object. Depending on the arguments, the table is
	# built by parsing an annotation file into SeqFeature objects, loading a
	# plain text file, querying a database for a feature list, or starting
	# from an empty structure (optionally pre-formatted as GFF/BED/UCSC).
	# Returns the blessed object, or nothing on failure.
	my $class = shift;
	my %args  = @_;

	# normalize the critical arguments, accepting common aliases
	$args{features} ||= $args{feature} || 'gene';
	$args{stream}   ||= $args{Stream}  || 0;
	$args{file}     ||= $args{in}      || undef;
	$args{parse}    ||= 0;
	$args{noheader} ||= 0;

	# delegate to the Stream class when a stream object was requested
	if ($args{stream}) {
		$class = "Bio::ToolBox::Data::Stream";
		load($class);
		return $class->new(@_);
	}

	# initialize the empty object
	my $self = $class->SUPER::new();

	# prepare a new table based on the provided arguments
	if ($args{file} and $args{parse}) {
		# parse an annotation file into SeqFeature objects
		$args{subfeature} ||= '';
		unless ( $self->parse_table(\%args) ) {
			my $loaded = $self->load_file($args{file});
			return unless $loaded;
			if ($self->database =~ /^Parsed:(.+)$/ and $self->feature_type eq 'named') {
				# looks like the loaded file was from a previously parsed table
				# let's try this again with the original annotation file
				# this may die if it doesn't work
				$args{file}    = $1;
				$args{feature} = $self->feature;
				$self->parse_table(\%args);
			}
		}
	}
	elsif ($args{file}) {
		# load an ordinary file into the table
		$self->load_file($args{file}, $args{noheader}) or return;
	}
	elsif (exists $args{db} and $args{features}) {
		# generate a new feature list from the indicated database
		$self->feature($args{features});
		$self->database($args{db});
		$args{data} = $self;
		my $success;
		if ($args{features} eq 'genome') {
			$success = get_new_genome_list(%args);
		}
		else {
			$success = get_new_feature_list(%args);
		}
		unless ($success) {
			carp " Cannot generate new $args{features} list!";
			return;
		}
	}
	else {
		# a new empty structure

		# check to see if the user provided column names
		$args{columns} ||= $args{datasets} || undef;
		if (defined $args{columns}) {
			foreach my $name ( @{ $args{columns} } ) {
				$self->add_column($name);
			}
			$self->{feature} = $args{feature} if exists $args{feature};
		}

		# or possibly a specified format structure
		elsif (exists $args{gff} and $args{gff}) {
			# use standard names for the number of columns indicated
			# we trust that the user knows the subtle difference between gff versions
			$self->add_gff_metadata($args{gff});
			unless ($self->extension =~ /g[tf]f/) {
				$self->{extension} = $args{gff} == 2.5 ? '.gtf' :
					$args{gff} == 3 ? '.gff3' : '.gff';
			}
		}
		elsif (exists $args{bed} and $args{bed}) {
			# use standard names for the number of columns indicated
			unless ($args{bed} =~ /^\d{1,2}$/ and $args{bed} >= 3) {
				carp "bed parameter must be an integer 3-12!";
				return;
			}
			$self->add_bed_metadata($args{bed});
			unless ($self->extension =~ /bed|peak/) {
				$self->{extension} = '.bed';
			}
		}
		elsif (exists $args{ucsc} and $args{ucsc}) {
			# a ucsc format such as refFlat, genePred, or genePredExt
			unless ( $self->add_ucsc_metadata($args{ucsc}) ) {
				carp "unrecognized number of columns for ucsc format!";
				return;
			}
			unless ($self->extension =~ /ucsc|ref+lat|genepred/) {
				$self->{extension} = '.ucsc';
			}
		}
	}

	return $self;
}
1211              
sub duplicate {
	# Return a new Data object sharing this object's column structure and
	# metadata (column metadata, general keys, and comments) but with an
	# empty data table. Returns nothing if the new object cannot be made.
	my $self = shift;

	# a fresh object with identical column headers
	my $Dupe = $self->new( 'columns' => $self->list_columns ) or return;

	# copy the per-column metadata hashes
	foreach my $i (0 .. $self->number_columns - 1) {
		my %md = $self->metadata($i);
		$Dupe->{$i} = \%md;
	}

	# copy the general metadata keys
	foreach my $key (qw(feature program db bed gff ucsc vcf headers)) {
		$Dupe->{$key} = $self->{$key};
	}

	# copy the comment lines
	push @{ $Dupe->{comments} }, $self->comments;

	return $Dupe;
}
1236              
sub parse_table {
	# Parse a gene annotation file (GFF, BED, or UCSC-style gene table) into
	# SeqFeature objects. If the table already has rows, each row is matched
	# by name/ID to a parsed feature; otherwise a new two-column table of
	# Primary_ID and Name is built from the parsed top features.
	# Accepts either a hash reference of options (file, feature, subfeature,
	# simplify, chrskip) or a plain file name string.
	# Returns true on success, nothing on failure; dies if an existing table
	# cannot be fully matched to the annotation.
	my $self = shift;
	my $args = shift;
	my ($file, $feature, $subfeature, $simplify);
	if (ref $args) {
		$file       = $args->{file}       || '';
		$feature    = $args->{feature}    || '';
		$subfeature = $args->{subfeature} || '';
		$simplify   = (exists $args->{simplify} and defined $args->{simplify}) ?
			$args->{simplify} : 1; # default is to simplify
	}
	else {
		# no hash reference, assume just a file name
		$file       = $args;
		undef $args;
		$feature    = undef;
		$subfeature = '';
		$simplify   = 1;
	}
	unless ($file) {
		carp "no annotation file provided to parse!";
		return;
	}

	# the file format (flavor) determines the parser class
	my $flavor = $self->taste_file($file) or return;
	my $class  = 'Bio::ToolBox::parser::' . $flavor;
	eval { load $class };
	if ($@) {
		carp "unable to load $class! cannot parse $flavor!";
		return;
	}

	# open parser
	my $parser = $class->new() or return;
	$parser->open_file($file) or return;
	my $typelist = $parser->typelist;

	# set feature based on the type list from the parser
	unless ($feature) {
		if ($typelist =~ /gene/i) {
			$feature = 'gene';
		}
		elsif ($typelist eq 'region') {
			$feature = 'region';
		}
		else {
			$feature = 'rna'; # generic RNA
		}
	}

	# set parser parameters for which subfeature types to retain
	$parser->simplify($simplify);
	if ($subfeature) {
		$parser->do_exon(1)  if $subfeature =~ /exon/i;
		$parser->do_cds(1)   if $subfeature =~ /cds/i;
		$parser->do_utr(1)   if $subfeature =~ /utr|untranslated/i;
		$parser->do_codon(1) if $subfeature =~ /codon/i;
	}
	if ($feature =~ /gene$/i) {
		$parser->do_gene(1);
	}
	else {
		$parser->do_gene(0);
	}
	my $mrna_check = 0;
	# reuse the cached $typelist rather than re-calling $parser->typelist
	if (lc($feature) eq 'mrna' and $typelist !~ /mrna/i and not $self->last_row) {
		# user requested mRNA for a new data file but it's not present in the
		# type list; look for it the hard way by parsing CDS too - sigh
		load('Bio::ToolBox::GeneTools', 'is_coding');
		$parser->do_cds(1);
		$mrna_check = 1;
	}

	# parse the table
	$parser->parse_file or return;

	# store the SeqFeature objects
	if ($self->last_row > 0) {
		# we already have a table, presumably representing the features;
		# match each row to its parsed SeqFeature by name and ID
		my $count = 0;
		$self->iterate( sub {
			my $row = shift;
			my $f = $parser->find_gene(
				name => $row->name,
				id   => $row->id,
			);
			if ($f) {
				$self->store_seqfeature($row->row_index, $f);
				$count++;
			}
		} );
		unless ($count == $self->last_row) {
			die <<PARSEFAIL;
Not all features in the input file could be matched to a corresponding SeqFeature
object in the annotation file $file.
Double check your input and annotation files. You can create a new table by just
providing your annotation file.
PARSEFAIL
		}
	}
	else {
		# create a new table
		$self->add_column('Primary_ID');
		$self->add_column('Name');

		# check for chromosome exclude
		my $chr_exclude;
		if ($args) {
			$chr_exclude = $args->{chrskip} || undef;
		}

		# fill table with features of the requested type
		while (my $f = $parser->next_top_feature) {
			if ($chr_exclude) {
				next if $f->seq_id =~ /$chr_exclude/i;
			}
			my $type = $f->type;
			# use the cached $type instead of calling $f->type a second time
			if ($type =~ /$feature/i or ($mrna_check and is_coding($f)) ) {
				my $index = $self->add_row([ $f->primary_id, $f->display_name ]);
				$self->store_seqfeature($index, $f);
			}
		}
		unless ($self->last_row) {
			printf " Zero '%s' features found!\n Check your feature or try generic features like gene, mRNA, or transcript\n",
				$feature;
		}
		$self->database("Parsed:$file");
		$self->feature($feature);
		$self->add_comment("Chromosomes excluded: $chr_exclude") if $chr_exclude;

		# add input parsed file metadata
		$self->add_file_metadata($file);
		# but delete some stuff, just want basename
		# NOTE: the original text here read "undef $self->(unknown);", an
		# invalid code-dereference; clearing the filename key matches the
		# adjacent extension/path deletions and the "just want basename" intent
		undef $self->{extension};
		undef $self->{filename};
		undef $self->{path};
	}
	return 1;
}
1377              
1378              
1379              
1380              
1381             ### Column manipulation
1382              
sub column_values {
	# Return every value in the indicated column, including the header row,
	# as a list (list context) or array reference (scalar context).
	# Returns nothing if the column index is undefined or unknown.
	my ($self, $index) = @_;
	return unless defined $index;
	return unless exists $self->{$index}{name};
	my @values;
	foreach my $row (0 .. $self->last_row) {
		push @values, $self->value($row, $index);
	}
	return wantarray ? @values : \@values;
}
1390              
1391             sub add_column {
1392 28     28 1 71 my ($self, $name) = @_;
1393 28 50       59 return unless $name;
1394 28         48 my $column = $self->{number_columns};
1395            
1396             # check for array of column data
1397 28         47 my $name_ref = ref $name;
1398 28 100       94 if ($name_ref eq 'ARRAY') {
    50          
    50          
1399 2 50       16 if ($self->last_row > 1) {
1400             # table has existing data beyond column headers
1401 2 50       6 if ($self->last_row == (scalar @$name - 1)) {
1402             # same number of elements, add it the table
1403 2         10 $self->{$column} = {
1404             'name' => $name->[0],
1405             'index' => $column,
1406             };
1407 2         8 for (my $r = 0; $r <= $self->last_row; $r++) {
1408 158         368 $self->{data_table}->[$r][$column] = $name->[$r];
1409             }
1410             }
1411             else {
1412             # different number of elements
1413 0         0 cluck "array has different number of elements than Data table!\n";
1414 0         0 return;
1415             }
1416             }
1417             else {
1418             # table has no existing data
1419 0         0 $self->{$column} = {
1420             'name' => $name->[0],
1421             'index' => $column,
1422             };
1423 0         0 for (my $i = 0; $i < scalar @$name; $i++) {
1424 0         0 $self->value($i, $column, $name->[$i]);
1425             }
1426 0         0 $self->{last_row} = scalar @$name - 1;
1427 0         0 $self->{headers} = 1; # boolean to indicate the table now has headers
1428             }
1429             }
1430             elsif ($name_ref eq 'Bio::DB::GFF::Typename') {
1431             # a Typename object that was selected from a SeqFeature::Store database
1432 0         0 $self->{$column} = {
1433             'name' => $name->asString,
1434             'index' => $column,
1435             };
1436 0         0 $self->{data_table}->[0][$column] = $name->asString;
1437             }
1438             elsif ($name_ref eq '') {
1439             # just a name
1440 26         98 $self->{$column} = {
1441             'name' => $name,
1442             'index' => $column,
1443             };
1444 26         75 $self->{data_table}->[0][$column] = $name;
1445             }
1446             else {
1447 0         0 cluck "unrecognized reference '$name_ref'! pass a scalar value or array reference";
1448 0         0 return;
1449             }
1450            
1451 28         46 $self->{number_columns}++;
1452 28 50       59 delete $self->{column_indices} if exists $self->{column_indices};
1453 28 50 33     70 if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
      33        
      33        
1454             # check if we maintain integrity, at least insofar what we test
1455 0         0 $self->verify(1); # silence so user doesn't get these messages
1456             }
1457 28         59 return $column;
1458             }
1459              
1460             sub copy_column {
1461 1     1 1 4 my $self = shift;
1462 1         3 my $index = shift;
1463 1 50       5 return unless defined $index;
1464 1         5 my $data = $self->column_values($index);
1465 1         5 my $new_index = $self->add_column($data);
1466 1         12 $self->copy_metadata($index, $new_index);
1467 1         6 return $new_index;
1468             }
1469              
1470              
1471              
1472             ### Rows and Data access
1473              
1474             sub add_row {
1475 231     231 1 331 my $self = shift;
1476 231         310 my @row_data;
1477 231 100 66     918 if ($_[0] and ref $_[0] eq 'ARRAY') {
    50 33        
    0 0        
1478 191         270 @row_data = @{ $_[0] };
  191         565  
1479             }
1480             elsif ($_[0] and ref $_[0] eq 'Bio::ToolBox::Data::Feature') {
1481 40         441 @row_data = $_[0]->row_values;
1482             }
1483             elsif ($_[0] and $_[0] =~ /\t/) {
1484 0         0 @row_data = split /\t/, $_[0];
1485             }
1486             else {
1487 0         0 @row_data = map {'.'} (1 .. $self->{number_columns});
  0         0  
1488             }
1489 231 50       546 if (scalar @row_data > $self->{number_columns}) {
1490 0         0 cluck "row added has more elements than table columns! truncating row elements\n";
1491 0         0 splice @row_data, 0, $self->{number_columns};
1492             }
1493 231         520 until (scalar @row_data == $self->{number_columns}) {
1494 0         0 push @row_data, '.';
1495             }
1496 231         451 my $row_index = $self->{last_row} + 1;
1497 231         443 $self->{data_table}->[$row_index] = \@row_data;
1498 231         332 $self->{last_row}++;
1499 231         631 return $row_index;
1500             }
1501              
1502             sub get_row {
1503 5     5 1 692 my ($self, $row_number) = @_;
1504 5 50       23 return unless defined $self->{data_table}->[$row_number];
1505 5         19 my @options = (
1506             'data' => $self,
1507             'index' => $row_number,
1508             );
1509 5 100 66     30 if (
1510             exists $self->{SeqFeatureObjects} and
1511             defined $self->{SeqFeatureObjects}->[$row_number]
1512             ) {
1513 4         15 push @options, 'feature', $self->{SeqFeatureObjects}->[$row_number];
1514             }
1515 5         47 return Bio::ToolBox::Data::Feature->new(@options);
1516             }
1517              
1518             sub delete_row {
1519 1     1 1 4 my $self = shift;
1520 1         6 my @deleted = sort {$b <=> $a} @_;
  0         0  
1521 1         5 while (@deleted) {
1522 1         3 my $d = shift @deleted;
1523 1         3 splice( @{ $self->{data_table} }, $d, 1);
  1         4  
1524 1         6 $self->{last_row}--;
1525 1 50       5 if (exists $self->{SeqFeatureObjects}) {
1526 0         0 splice( @{ $self->{SeqFeatureObjects} }, $d, 1);
  0         0  
1527             }
1528             }
1529 1         3 return 1;
1530             }
1531              
1532             sub row_values {
1533 237     237 1 371 my ($self, $row) = @_;
1534 237         327 my @data = @{ $self->{data_table}->[$row] };
  237         602  
1535 237 50       811 return wantarray ? @data : \@data;
1536             }
1537              
1538             sub value {
1539 508     508 1 1978 my ($self, $row, $column, $value) = @_;
1540 508 50 33     1518 return unless (defined $row and defined $column);
1541 508 50       808 if (defined $value) {
1542 0         0 $self->{data_table}->[$row][$column] = $value;
1543             }
1544 508         1162 return $self->{data_table}->[$row][$column];
1545             }
1546              
1547             sub row_stream {
1548 9     9 1 2430 my $self = shift;
1549 9         71 return Bio::ToolBox::Data::Iterator->new($self);
1550             }
1551              
1552             sub iterate {
1553 6     6 1 654 my $self = shift;
1554 6         14 my $code = shift;
1555 6 50       25 unless (ref $code eq 'CODE') {
1556 0         0 confess "iterate_function() method requires a code reference!";
1557             }
1558 6         20 my $stream = $self->row_stream;
1559 6         22 while (my $row = $stream->next_row) {
1560 143         293 &$code($row);
1561             }
1562 6         28 return 1;
1563             }
1564              
1565              
1566             #### Stored SeqFeature manipulation
1567              
1568             sub store_seqfeature {
1569 178     178 1 300 my ($self, $row_i, $seqfeature) = @_;
1570 178 50 33     614 unless (defined $row_i and ref($seqfeature)) {
1571 0         0 confess "must provide a row index and SeqFeature object!";
1572             }
1573 178 50       386 confess "invalid row index" unless ($row_i <= $self->last_row);
1574 178   100     364 $self->{SeqFeatureObjects} ||= [];
1575 178         348 $self->{SeqFeatureObjects}->[$row_i] = $seqfeature;
1576 178         383 return 1;
1577             }
1578              
1579             sub delete_seqfeature {
1580 0     0 1 0 my ($self, $row_i) = @_;
1581 0 0       0 confess "invalid row index" unless ($row_i <= $self->last_row);
1582 0 0       0 return unless $self->{SeqFeatureObjects};
1583 0         0 undef $self->{SeqFeatureObjects}->[$row_i];
1584             }
1585              
1586             sub collapse_gene_transcripts {
1587 0     0 1 0 my $self = shift;
1588 0 0       0 unless ($self->feature_type eq 'named') {
1589 0         0 carp "Table does not contain named features!";
1590 0         0 return;
1591             }
1592            
1593             # load module
1594 0         0 my $class = "Bio::ToolBox::GeneTools";
1595 0         0 eval {load $class, qw(collapse_transcripts)};
  0         0  
1596 0 0       0 if ($@) {
1597 0         0 carp "unable to load $class! cannot collapse transcripts!";
1598 0         0 return;
1599             }
1600            
1601             # collapse the transcripts
1602 0         0 my $success = 0;
1603 0 0       0 if (exists $self->{SeqFeatureObjects}) {
1604             # we should have stored SeqFeature objects, probably from parsed table
1605 0         0 for (my $i = 1; $i <= $self->last_row; $i++) {
1606 0 0       0 my $feature = $self->{SeqFeatureObjects}->[$i] or next;
1607 0 0       0 my $collSeqFeat = collapse_transcripts($feature) or next;
1608 0         0 $success += $self->store_seqfeature($i, $collSeqFeat);
1609             }
1610             }
1611             else {
1612             # no stored SeqFeature objects, probably names pointing to a database
1613             # we will have to fetch the feature from a database
1614 0 0       0 my $db = $self->open_meta_database(1) or # force open a new db connection
1615             confess "No SeqFeature objects stored and no database connection!";
1616 0         0 my $name_i = $self->name_column;
1617 0         0 my $id_i = $self->id_column;
1618 0         0 my $type_i = $self->type_column;
1619 0         0 for (my $i = 1; $i <= $self->last_row; $i++) {
1620 0 0 0     0 my $feature = get_db_feature(
      0        
      0        
1621             db => $db,
1622             name => $self->value($i, $name_i) || undef,
1623             type => $self->value($i, $type_i) || undef,
1624             id => $self->value($i, $id_i) || undef,
1625             ) or next;
1626 0 0       0 my $collSeqFeat = collapse_transcripts($feature) or next;
1627 0         0 $success += $self->store_seqfeature($i, $collSeqFeat);
1628             }
1629             }
1630            
1631 0         0 return $success;
1632             }
1633              
1634             sub add_transcript_length {
1635 0     0 1 0 my $self = shift;
1636 0   0     0 my $subfeature = shift || 'exon';
1637 0 0       0 unless (exists $self->{SeqFeatureObjects}) {
1638 0         0 carp "no SeqFeature objects stored for collapsing!";
1639 0         0 return;
1640             }
1641            
1642             # load module
1643 0         0 eval {load "Bio::ToolBox::GeneTools", qw(get_transcript_length get_transcript_cds_length
  0         0  
1644             get_transcript_5p_utr_length get_transcript_3p_utr_length)};
1645 0 0       0 if ($@) {
1646 0         0 carp "unable to load Bio::ToolBox::GeneTools! cannot collapse transcripts!";
1647 0         0 return;
1648             }
1649            
1650             # determine name and calculation method
1651 0         0 my $length_calculator;
1652             my $length_name;
1653 0 0       0 if ($subfeature eq 'exon') {
    0          
    0          
    0          
1654 0         0 $length_calculator = \&get_transcript_length;
1655 0         0 $length_name= 'Merged_Transcript_Length';
1656             }
1657             elsif ($subfeature eq 'cds') {
1658 0         0 $length_calculator = \&get_transcript_cds_length;
1659 0         0 $length_name= 'Transcript_CDS_Length';
1660             }
1661             elsif ($subfeature eq '5p_utr') {
1662 0         0 $length_calculator = \&get_transcript_5p_utr_length;
1663 0         0 $length_name= 'Transcript_5p_UTR_Length';
1664             }
1665             elsif ($subfeature eq '3p_utr') {
1666 0         0 $length_calculator = \&get_transcript_3p_utr_length;
1667 0         0 $length_name= 'Transcript_3p_UTR_Length';
1668             }
1669             else {
1670 0         0 carp "unrecognized subfeature type '$subfeature'! No length calculated";
1671 0         0 return;
1672             }
1673            
1674             # add new column and calculate
1675 0         0 my $length_i = $self->add_column($length_name);
1676 0 0       0 if (exists $self->{SeqFeatureObjects}) {
1677             # we should have stored SeqFeature objects, probably from parsed table
1678 0         0 for (my $i = 1; $i <= $self->last_row; $i++) {
1679 0 0       0 my $feature = $self->{SeqFeatureObjects}->[$i] or next;
1680 0         0 my $length = &$length_calculator($feature);
1681 0   0     0 $self->{data_table}->[$i][$length_i] = $length || $feature->length;
1682             }
1683             }
1684             else {
1685             # no stored SeqFeature objects, probably names pointing to a database
1686             # we will have to fetch the feature from a database
1687 0 0       0 my $db = $self->open_meta_database(1) or # force open a new db connection
1688             confess "No SeqFeature objects stored and no database connection!";
1689 0         0 my $name_i = $self->name_column;
1690 0         0 my $id_i = $self->id_column;
1691 0         0 my $type_i = $self->type_column;
1692 0         0 for (my $i = 1; $i <= $self->last_row; $i++) {
1693 0 0 0     0 my $feature = get_db_feature(
      0        
      0        
1694             db => $db,
1695             name => $self->value($i, $name_i) || undef,
1696             type => $self->value($i, $type_i) || undef,
1697             id => $self->value($i, $id_i) || undef,
1698             ) or next;
1699 0         0 my $length = &$length_calculator($feature);
1700 0   0     0 $self->{data_table}->[$i][$length_i] = $length || $feature->length;
1701 0         0 $self->store_seqfeature($i, $feature);
1702             # to store or not to store? May explode memory, but could save from
1703             # expensive db calls later
1704             }
1705             }
1706            
1707 0         0 return $length_i;
1708             }
1709              
1710              
1711              
1712             ### Data structure manipulation
1713              
1714             sub sort_data {
1715 1     1 1 1860 my $self = shift;
1716 1         3 my $index = shift;
1717 1   50     5 my $direction = shift || 'i';
1718            
1719             # confirm options
1720 1 50       5 return unless exists $self->{$index}{name};
1721 1 50       7 unless ($direction =~ /^[id]/i) {
1722 0         0 carp "unrecognized sort order '$direction'! Must be i or d";
1723 0         0 return;
1724             }
1725            
1726             # Sample the dataset values
1727             # this will be used to guess the sort method, below
1728 1         3 my $example; # an example of the dataset
1729 1         6 foreach (my $i = 1; $i <= $self->last_row; $i++) {
1730             # we want to avoid null values, so keep trying
1731             # null being . or any variation of N/A, NaN, inf
1732 1         7 my $v = $self->value($i, $index);
1733 1 50 33     10 if (defined $v and $v !~ /^(?:\.|n\/?a|nan|\-?inf)$/i) {
1734             # a non-null value, take it
1735 1         3 $example = $v;
1736 1         2 last;
1737             }
1738             }
1739            
1740             # Determine sort method, either numerical or alphabetical
1741 1         2 my $sortmethod;
1742 1 50       8 if ($example =~ /[a-z]/i) {
    0          
1743             # there are detectable letters
1744 1         2 $sortmethod = 'ascii';
1745             }
1746             elsif ($example =~ /^\-?\d+\.?\d*$/) {
1747             # there are only digits, allowing for minus sign and a decimal point
1748             # I don't think this allows for exponents, though
1749 0         0 $sortmethod = 'numeric';
1750             }
1751             else {
1752             # unable to determine (probably alphanumeric), sort asciibetical
1753 0         0 $sortmethod = 'ascii';
1754             }
1755            
1756             # Re-order the datasets
1757             # Directly sorting the @data array is proving difficult. It keeps giving me
1758             # a segmentation fault. So I'm using a different approach by copying the
1759             # @data_table into a temporary hash.
1760             # put data_table array into a temporary hash
1761             # the hash key will the be dataset value,
1762             # the hash value will be the reference the row data
1763 1         2 my %datahash;
1764            
1765             # reorder numerically
1766 1 50       8 if ($sortmethod eq 'numeric') {
    50          
1767 0         0 for my $row (1..$self->last_row) {
1768 0         0 my $value = $self->value($row, $index);
1769            
1770             # check to see whether this value exists or not
1771 0         0 while (exists $datahash{$value}) {
1772             # add a really small number to bump it up and make it unique
1773             # this, of course, presumes that none of the dataset values
1774             # are really this small - this may be an entirely bad
1775             # assumption!!!!! I suppose we could somehow calculate an
1776             # appropriate value.... nah.
1777             # don't worry, we're only modifying the value used for sorting,
1778             # not the actual value
1779 0         0 $value += 0.00000001;
1780             }
1781             # store the row data reference
1782 0         0 $datahash{$value} = $self->{data_table}->[$row];
1783             }
1784            
1785             # re-fill the array based on the sort direction
1786 0 0       0 if ($direction =~ /^i/i) {
1787             # increasing sort
1788 0         0 my $i = 1; # keep track of the row, skip the header
1789 0         0 foreach (sort {$a <=> $b} keys %datahash) {
  0         0  
1790             # put back the reference to the anonymous array of row data
1791 0         0 $self->{data_table}->[$i] = $datahash{$_};
1792 0         0 $i++; # increment for next row
1793             }
1794             }
1795             else {
1796             # decreasing sort
1797 0         0 my $i = 1; # keep track of the row, skip the header
1798 0         0 foreach (sort {$b <=> $a} keys %datahash) {
  0         0  
1799             # put back the reference to the anonymous array of row data
1800 0         0 $self->{data_table}->[$i] = $datahash{$_};
1801 0         0 $i++; # increment for next row
1802             }
1803             }
1804            
1805             # summary prompt
1806 0         0 printf " Data table sorted numerically by the contents of %s\n",
1807             $self->name($index);
1808             }
1809            
1810             # reorder asciibetically
1811             elsif ($sortmethod eq 'ascii') {
1812 1         4 for my $row (1..$self->last_row) {
1813             # get the value to sort by
1814 78         126 my $value = $self->value($row, $index);
1815 78 50       127 if (exists $datahash{$value}) {
1816             # not unique
1817 0         0 my $n = 1;
1818 0         0 my $lookup = $value . sprintf("03%d", $n);
1819             # we'll try to make a unique value by appending
1820             # a number to the original value
1821 0         0 while (exists $datahash{$lookup}) {
1822             # keep bumping up the number till it's unique
1823 0         0 $n++;
1824 0         0 $lookup = $value . sprintf("03%d", $n);
1825             }
1826 0         0 $datahash{$lookup} = $self->{data_table}->[$row];
1827             }
1828             else {
1829             # unique
1830 78         218 $datahash{$value} = $self->{data_table}->[$row];
1831             }
1832             }
1833            
1834             # re-fill the array based on the sort direction
1835 1 50 33     12 if ($direction eq 'i' or $direction eq 'I') {
    50 33        
1836             # increasing
1837 0         0 my $i = 1; # keep track of the row
1838 0         0 foreach (sort {$a cmp $b} keys %datahash) {
  0         0  
1839             # put back the reference to the anonymous array of row data
1840 0         0 $self->{data_table}->[$i] = $datahash{$_};
1841 0         0 $i++; # increment for next row
1842             }
1843             }
1844             elsif ($direction eq 'd' or $direction eq 'D') {
1845             # decreasing
1846 1         4 my $i = 1; # keep track of the row
1847 1         25 foreach (sort {$b cmp $a} keys %datahash) {
  392         499  
1848             # put back the reference to the anonymous array of row data
1849 78         109 $self->{data_table}->[$i] = $datahash{$_};
1850 78         93 $i++; # increment for next row
1851             }
1852             }
1853            
1854             # summary prompt
1855 1         9 printf " Data table sorted asciibetically by the contents of '%s'\n",
1856             $self->name($index);
1857             }
1858            
1859 1         17 return 1;
1860             }
1861              
1862             sub gsort_data {
1863 1     1 1 4 my $self = shift;
1864            
1865             # identify indices
1866 1 50       4 unless ($self->feature_type eq 'coordinate') {
1867 0         0 carp "no chromosome and start/position columns to sort!\n";
1868 0         0 return;
1869             }
1870 1         4 my $chromo_i = $self->chromo_column;
1871 1         4 my $start_i = $self->start_column;
1872            
1873             # Load the data into a temporary hash
1874             # The datalines will be put into a hash of hashes: The first key will be
1875             # the chromosome name, the second hash will be the start value.
1876             #
1877             # To deal with some chromosomes that don't have numbers (e.g. chrM), we'll
1878             # use two separate hashes: one is for numbers, the other for strings
1879             # when it comes time to sort, we'll put the numbers first, then strings
1880            
1881 1         3 my %num_datahash;
1882             my %str_datahash;
1883 1         5 for my $row (1 .. $self->last_row) {
1884            
1885 78         137 my $startvalue = $self->value($row, $start_i);
1886            
1887             # put the dataline into the appropriate temporary hash
1888 78 50       134 if ($self->value($row, $chromo_i) =~ /^(?:chr)?(\d+)$/) {
1889             # dealing with a numeric chromosome name
1890             # restricting to either chr2 or just 2 but not 2-micron
1891 0         0 my $chromovalue = $1;
1892 0         0 while (exists $num_datahash{$chromovalue}{$startvalue}) {
1893             # if another item already exists at this location
1894             # add a really small number to bump it up and make it unique
1895 0         0 $startvalue += 0.001;
1896             }
1897 0         0 $num_datahash{$chromovalue}{$startvalue} = $self->{data_table}->[$row];
1898             }
1899             else {
1900             # dealing with a non-numeric chromosome name
1901 78         125 my $chromovalue = $self->value($row, $chromo_i);
1902             # use the entire chromosome name as key
1903 78         164 while (exists $str_datahash{$chromovalue}{$startvalue}) {
1904             # if another item already exists at this location
1905             # add a really small number to bump it up and make it unique
1906 36         174 $startvalue += 0.001;
1907             }
1908 78         281 $str_datahash{$chromovalue}{$startvalue} = $self->{data_table}->[$row];
1909             }
1910             }
1911            
1912            
1913             # Now re-load the data array with sorted data
1914             # put the numeric chromosome data back first
1915 1         3 my $i = 1; # keep track of the row
1916 1         5 foreach my $chromovalue (sort {$a <=> $b} keys %num_datahash) {
  0         0  
1917             # first, numeric sort on increasing chromosome number
1918 0         0 foreach my $startvalue (
1919 0         0 sort {$a <=> $b} keys %{ $num_datahash{$chromovalue} }
  0         0  
1920             ) {
1921             # second, numeric sort on increasing position value
1922 0         0 $self->{data_table}->[$i] = $num_datahash{$chromovalue}{$startvalue};
1923 0         0 $i++; # increment for next row
1924             }
1925             }
1926             # next put the string chromosome data back
1927 1         6 foreach my $chromovalue (sort {$a cmp $b} keys %str_datahash) {
  0         0  
1928             # first, ascii sort on increasing chromosome name
1929 1         2 foreach my $startvalue (
1930 396         504 sort {$a <=> $b} keys %{ $str_datahash{$chromovalue} }
  1         13  
1931             ) {
1932             # second, numeric sort on increasing position value
1933 78         128 $self->{data_table}->[$i] = $str_datahash{$chromovalue}{$startvalue};
1934 78         101 $i++; # increment for next row
1935             }
1936             }
1937            
1938 1         13 return 1;
1939             }
1940              
1941             sub splice_data {
1942 1     1 1 4 my ($self, $part, $total_parts) = @_;
1943            
1944 1 50 33     10 unless ($part and $total_parts) {
1945 0         0 confess "ordinal part and total number of parts not passed\n";
1946             }
1947 1         6 my $part_length = int($self->last_row / $total_parts);
1948            
1949             # check for SeqFeatureObjects array
1950 1 50       4 if (exists $self->{SeqFeatureObjects}) {
1951             # it needs to be the same length as the data table, it should be
1952 0         0 while (scalar @{$self->{SeqFeatureObjects}} < scalar @{$self->{data_table}}) {
  0         0  
  0         0  
1953 0         0 push @{$self->{SeqFeatureObjects}}, undef;
  0         0  
1954             }
1955             }
1956            
1957             # splicing based on which part we do
1958 1 50       6 if ($part == 1) {
    50          
1959             # remove all but the first part
1960             splice(
1961 0         0 @{$self->{'data_table'}},
  0         0  
1962             $part_length + 1
1963             );
1964 0 0       0 if (exists $self->{SeqFeatureObjects}) {
1965             splice(
1966 0         0 @{$self->{SeqFeatureObjects}},
  0         0  
1967             $part_length + 1
1968             );
1969             }
1970             }
1971             elsif ($part == $total_parts) {
1972             # remove all but the last part
1973             splice(
1974 1         2 @{$self->{'data_table'}},
  1         15  
1975             1,
1976             $part_length * ($total_parts - 1)
1977             );
1978 1 50       4 if (exists $self->{SeqFeatureObjects}) {
1979             splice(
1980 0         0 @{$self->{SeqFeatureObjects}},
  0         0  
1981             1,
1982             $part_length * ($total_parts - 1)
1983             );
1984             }
1985             }
1986             else {
1987             # splicing in the middle requires two rounds
1988            
1989             # remove the last parts
1990             splice(
1991 0         0 @{$self->{'data_table'}},
  0         0  
1992             ($part * $part_length) + 1
1993             );
1994            
1995             # remove the first parts
1996             splice(
1997 0         0 @{$self->{'data_table'}},
  0         0  
1998             1,
1999             $part_length * ($part - 1)
2000             );
2001            
2002 0 0       0 if (exists $self->{SeqFeatureObjects}) {
2003             splice(
2004 0         0 @{$self->{SeqFeatureObjects}},
  0         0  
2005             ($part * $part_length) + 1
2006             );
2007             splice(
2008 0         0 @{$self->{SeqFeatureObjects}},
  0         0  
2009             1,
2010             $part_length * ($part - 1)
2011             );
2012             }
2013             }
2014            
2015             # update last row metadata
2016 1         2 $self->{'last_row'} = scalar(@{$self->{'data_table'}}) - 1;
  1         4  
2017            
2018             # re-open a new un-cached database connection
2019 1 50       4 if (exists $self->{db_connection}) {
2020 0         0 delete $self->{db_connection};
2021             }
2022 1         4 return 1;
2023             }
2024              
2025              
2026              
2027             ### File functions
2028              
2029             sub reload_children {
2030 1     1 1 3 my $self = shift;
2031 1         4 my @files = @_;
2032 1 50       5 return unless @files;
2033            
2034             # prepare Stream
2035 1         3 my $class = "Bio::ToolBox::Data::Stream";
2036 1         3 eval {load $class};
  1         6  
2037 1 50       98 if ($@) {
2038 0         0 carp "unable to load $class! can't reload children!";
2039 0         0 return;
2040             }
2041            
2042             # open first stream
2043 1         4 my $first = shift @files;
2044 1         9 my $Stream = $class->new(in => $first);
2045 1 50       5 unless ($Stream) {
2046 0         0 carp "unable to load first child file $first";
2047 0         0 return;
2048             }
2049            
2050             # destroy current data table and metadata
2051 1         4 $self->{data_table} = [];
2052 1         5 for (my $i = 0; $i < $self->number_columns; $i++) {
2053 0         0 delete $self->{$i};
2054             }
2055            
2056             # copy the metadata
2057 1         5 foreach (qw(program feature db bed gff ucsc headers number_columns last_row)) {
2058             # various keys
2059 9         19 $self->{$_} = $Stream->{$_};
2060             }
2061 1         5 for (my $i = 0; $i < $Stream->number_columns; $i++) {
2062             # column metadata
2063 4         12 my %md = $Stream->metadata($i);
2064 4         15 $self->{$i} = \%md;
2065             }
2066 1         4 my @comments = $Stream->comments;
2067 1         3 push @{$self->{other}}, @comments;
  1         5  
2068            
2069             # first row column headers
2070 1         3 $self->{data_table}->[0] = [ @{ $Stream->{data_table}->[0] } ];
  1         15  
2071            
2072             # load the data
2073 1         7 while (my $row = $Stream->next_row) {
2074 39         81 $self->add_row($row);
2075             }
2076 1         45 $Stream->close_fh;
2077            
2078             # load remaining files
2079 1         24 foreach my $file (@files) {
2080 2         28 my $Stream = $class->new(in => $file);
2081 2 50       8 if ($Stream->number_columns != $self->number_columns) {
2082 0         0 confess "fatal error: child file $file has a different number of columns!";
2083             }
2084 2         7 while (my $row = $Stream->next_row) {
2085 78         158 my $a = $row->row_values;
2086 78         164 $self->add_row($a);
2087             }
2088 2         75 $Stream->close_fh;
2089             }
2090 1         25 $self->verify;
2091            
2092             # clean up
2093 1         224 unlink($first, @files);
2094 1         10 return $self->last_row;
2095             }
2096              
2097              
2098              
2099              
2100             ### Export summary data file
### Export summary data file
sub summary_file {
	# Write a "summary" file: one mean score per data window (column) for
	# each dataset, suitable for plotting an average profile over windows.
	# Named arguments:
	#   filename    => output file basename; defaults to this object's
	#                  path and basename when a file was opened
	#   dataset     => dataset name(s) to summarize, as a plain string,
	#                  SCALAR reference, or ARRAY reference of names;
	#                  auto-detected from column metadata when omitted
	#   startcolumn => first column index for each dataset (same forms)
	#   endcolumn   => last column index for each dataset (same forms;
	#                  stopcolumn is accepted as a synonym)
	# Returns the written file name on success, or nothing on failure.
	my $self = shift;
	my %args = @_;

	# output file name; the extension is appended at the end
	my $outfile = $args{'filename'} || undef;

	# Normalize the dataset, startcolumn, and endcolumn arguments.
	# NOTE(fix): the original code tested ref($x) eq 'SCALAR' intending to
	# catch single values, but ref() returns '' for an ordinary scalar, so
	# explicitly passed plain strings were silently ignored and the code
	# always fell back to auto-detection. _summary_arg_list() accepts an
	# ARRAY ref, a SCALAR ref (dereferenced), or a plain scalar.
	my @datasets     = _summary_arg_list( $args{dataset} );
	my @startcolumns = _summary_arg_list( $args{startcolumn} );
	$args{endcolumn} ||= $args{stopcolumn};    # stopcolumn is a synonym
	my @endcolumns = _summary_arg_list( $args{endcolumn} );

	# Check required values
	unless ( $self->verify ) {
		cluck "bad data structure!";
		return;
	}
	unless ( defined $outfile ) {
		if ( $self->basename ) {
			# reuse the opened file's path and basename;
			# the extension will be added later
			$outfile = $self->path . $self->basename;
		}
		else {
			cluck "no filename passed to write_summary_data!\n";
			return;
		}
	}

	# Identify possible dataset columns: group non-coordinate, non-identifier
	# columns by their 'dataset' metadata value
	my %skip = map { $_ => 1 } qw (systematicname name id alias aliases type class
		geneclass chromosome chromo seq_id seqid start stop end gene strand
		length primary_id merged_transcript_length transcript_cds_length
		transcript_5p_utr_length transcript_3p_utr_length);
	my %possibles;
	$possibles{unknown} = [];
	for ( my $i = 0; $i < $self->number_columns; $i++ ) {
		next if exists $skip{ lc $self->{$i}{'name'} };
		my $d = $self->metadata( $i, 'dataset' ) || undef;
		if ( defined $d ) {
			# we have what appears to be a dataset column
			$possibles{$d} ||= [];
			push @{ $possibles{$d} }, $i;
		}
		else {
			# still an unknown possibility
			push @{ $possibles{unknown} }, $i;
		}
	}

	# auto-detect datasets when none were requested
	unless (@datasets) {
		if ( scalar keys %possibles > 1 ) {
			# the unknown category always exists, so anything more than one
			# key means legitimate dataset columns were found; drop unknown
			delete $possibles{unknown};
		}
		@datasets = sort { $a cmp $b } keys %possibles;
	}

	# default start columns: the first column of each dataset
	if ( scalar @startcolumns != scalar @datasets ) {
		@startcolumns = map { $possibles{$_}->[0] } @datasets;
	}

	# default end columns: the last column of each dataset
	if ( scalar @endcolumns != scalar @datasets ) {
		@endcolumns = map { $possibles{$_}->[-1] } @datasets;
	}

	# Prepare Data object to store the summed data
	my $summed_data = $self->new(
		feature => 'averaged_windows',
		columns => [ 'Window', 'Midpoint' ],
	);

	# Go through each dataset, summarizing it into its own score column
	foreach my $d ( 0 .. $#datasets ) {

		# add a score column named after the (simplified) dataset
		my $data_name = simplify_dataset_name( $datasets[$d] );
		my $i         = $summed_data->add_column($data_name);
		$summed_data->metadata( $i, 'dataset', $datasets[$d] );

		# true once percentile-bin columns (names ending in %) are seen
		my $do_percentile = 0;

		# current output row in the summary table
		my $out_row = 1;

		# Collect summarized data, one output row per window column
		foreach my $column ( $startcolumns[$d] .. $endcolumns[$d] ) {

			# midpoint position of the window; this assumes the column
			# metadata has start and stop values
			my $midpoint = int(
				sum0(
					$self->metadata( $column, 'start' ),
					$self->metadata( $column, 'stop' )
				) / 2
			);

			# convert midpoint to fraction of 1000 for plotting if necessary
			if ( substr( $self->name($column), -1 ) eq '%' ) {
				$midpoint *= 10;    # midpoint * 0.01 * 1000 bp
				$do_percentile++;
			}
			if ( $do_percentile and substr( $self->name($column), -2 ) eq 'bp' ) {
				# working on the extension after the percentile bins
				$midpoint += 1000;
			}

			# collect the values in the column, treating nulls (.) as zero
			my @values;
			foreach my $r ( 1 .. $self->last_row ) {
				my $v = $self->value( $r, $column );
				push @values, $v eq '.' ? 0 : $v;
			}

			# de-log values before averaging if flagged as log2 data
			my $log = $self->metadata( $column, 'log2' ) || 0;
			if ($log) {
				@values = map { 2**$_ } @values;
			}

			# determine mean value, converting back to log2 space if needed;
			# guard against an empty data table (original divided by zero)
			my $window_mean;
			if (@values) {
				$window_mean = sum0(@values) / scalar(@values);
				$window_mean = log($window_mean) / log(2) if $log;
			}
			else {
				$window_mean = 0;
			}

			# push to summed output
			if ( $d == 0 ) {
				# first dataset defines the rows: window name, midpoint, mean
				$summed_data->add_row(
					[ $self->{$column}{'name'}, $midpoint, $window_mean ] );
			}
			else {
				# summarizing multiple datasets: rows already exist, so
				# sanity check that the windows line up before writing
				if ( $summed_data->value( $out_row, 1 ) != $midpoint ) {
					carp("unable to summarize multiple datasets with nonequal columns of data!");
					return;
				}
				$summed_data->value( $out_row, $i, $window_mean );
			}
			$out_row++;
		}
	}

	# Write summed data
	$outfile =~ s/\.txt(\.gz)?$//i;    # strip any .txt or .gz extensions if present
	my $written_file = $summed_data->write_file(
		'filename' => $outfile . '_summary.txt',
		'gz'       => 0,
	);
	return $written_file;
}

# _summary_arg_list: flatten a summary_file argument that may be given as
# an ARRAY reference, a SCALAR reference, or a plain scalar into a list;
# returns an empty list for undef, false, or unsupported reference types
sub _summary_arg_list {
	my $value = shift;
	return unless $value;
	return @{$value} if ref $value eq 'ARRAY';
	return ${$value} if ref $value eq 'SCALAR';
	return $value    if not ref $value;
	return;
}
2295              
2296              
2297              
2298              
2299              
2300             ####################################################
2301              
2302             package Bio::ToolBox::Data::Iterator;
2303 3     3   22464 use Bio::ToolBox::Data::Feature;
  3         12  
  3         142  
2304 3     3   34 use Carp;
  3         8  
  3         926  
2305              
sub new {
	# Construct a row iterator over the given Data object, starting at
	# the first data row (index 1; index 0 holds the column headers).
	my ( $class, $data ) = @_;
	my $self = {
		'index' => 1,
		'data'  => $data,
	};
	return bless $self, $class;
}
2314              
sub next_row {
	# Return a Bio::ToolBox::Data::Feature object for the next data row,
	# or nothing once the table is exhausted. Advances the internal index.
	my $self = shift;
	my $idx  = $self->{'index'};

	# nothing left beyond the last data row
	return if $idx > $self->{data}->{last_row};
	$self->{'index'} = $idx + 1;

	my @options = (
		'data'  => $self->{data},
		'index' => $idx,
	);

	# pass along a cached SeqFeature object for this row when one exists
	if (    exists $self->{data}->{SeqFeatureObjects}
		and defined $self->{data}->{SeqFeatureObjects}->[$idx] )
	{
		push @options, 'feature', $self->{data}->{SeqFeatureObjects}->[$idx];
	}
	return Bio::ToolBox::Data::Feature->new(@options);
}
2332              
sub row_index {
	# Accessor for the iterator's current row index. Read only; warns if
	# a caller attempts to pass a new value.
	my ( $self, @extra ) = @_;
	if (@extra) {
		carp "row_index is a read only method";
	}
	return $self->{'index'};
}
2338              
2339              
2340              
2341             ####################################################
2342              
2343             __END__