File Coverage

blib/lib/Bio/ToolBox/Data.pm

Criterion	Covered	Total	%
statement	334	622	53.7
branch	115	300	38.3
condition	65	181	35.9
subroutine	30	35	85.7
pod	22	22	100.0
total	566	1160	48.7

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::Data;
2							our $VERSION = '1.69';
3
4							=head1 NAME
5
6							Bio::ToolBox::Data - Reading, writing, and manipulating data structure
7
8							=head1 SYNOPSIS
9
10							use Bio::ToolBox::Data;
11
12							### Open a pre-existing file
13							# a data table with same columns as input file
14							my $Data = Bio::ToolBox::Data->new(
15							file => 'coordinates.bed',
16							);
17
18							### Parse a GTF, GFF3, refFlat, or genePred gene table
19							# data table with names and references to fully parsed
20							# SeqFeature transcript objects with exon SeqFeature subfeatures
21							my $Data = Bio::ToolBox::Data->new(
22							file => 'annotation.gtf.gz',
23							parse => 1,
24							feature => 'transcript',
25							subfeature => 'exon'
26							);
27
28							### New gene list from a Bio::DB::SeqFeature::Store database
29							# data table with name and reference ID to database SeqFeature objects
30							my $Data = Bio::ToolBox::Data->new(
31							db => 'hg19.sqlite',
32							feature => 'gene:ensGene',
33							);
34
35
36							### Get a specific value
37							my $value = $Data->value($row, $column);
38
39							### Replace or add a value
40							$Data->value($row, $column, $new_value);
41
42							### Add a column
43							my $new_index = $Data->add_column('Data2');
44
45							### Find a column
46							my $old_index = $Data->find_column('Data1');
47
48							### Return a specific row as an object
49							my $row = $Data->get_row($i); # Bio::ToolBox::Data::Feature
50
51							### Iterate through a Data structure one row at a time
52							my $stream = $Data->row_stream;
53							while (my $row = $stream->next_row) {
54							# each row is returned as Bio::ToolBox::Data::Feature object
55							# get the positional information from the file data
56							# assuming that the input file had these identifiable columns
57							my $seq_id = $row->seq_id;
58							my $start = $row->start;
59							my $stop = $row->end;
60
61							# work with the referenced SeqFeature object
62							my $seqf = $row->seqfeature;
63
64							# generate a new Bio::Seq object from the database using
65							# these coordinates
66							my $region = $db->segment($seq_id, $start, $stop);
67
68							# modify a row value
69							my $value = $row->value($old_index);
70							my $new_value = $value + 1;
71							$row->value($new_index, $new_value);
72							}
73
74							### Iterate through a Data table with a code reference
75							$Data->iterate(\&my_code);
76
77
78							### write the data to file
79							my $success = $Data->write_file(
80							filename => 'new_data.txt',
81							gz => 1,
82							);
83							print "wrote new file $success\n"; # file is new_data.txt.gz
84
85							=head1 DESCRIPTION
86
87							This module works with the primary Bio::ToolBox Data structure. Simply, it
88							is a complex data structure representing a tabbed-delimited table (array
89							of arrays), with plenty of options for metadata. Many common bioinformatic
90							file formats are simply tabbed-delimited text files (think BED, GFF, VCF).
91							Each row is a feature or genomic interval, and each column is a piece of
92							information about that feature, such as name, type, and/or coordinates.
93							We can append to that file additional columns of information, perhaps
94							scores from genomic data sets. We can record metadata regarding how
95							and where we obtained that data. Finally, we can write the updated
96							table to a new file.
97
98							=head1 METHODS
99
100							=head2 Initializing the structure
101
102							=over 4
103
104							=item new
105
106							Initialize a new Data structure. This generally requires options,
107							provided as an array of key =E values. A new list of features
108							may be obtained from an annotation database or an existing file
109							may be loaded. If you do not pass any options, a new empty
110							structure will be generated for you to populate.
111
112							These are the options available.
113
114							=over 4
115
116							=item file
117
118							=item in
119
120							my $Data = Bio::ToolBox::Data->new(file => $file);
121
122							Provide the path and name to an existing tabbed-delimited text
123							file from which to load the contents. This is a shortcut to the
124							load_file() method. See that method for more details.
125
126							=item stream
127
128							my $Data = Bio::ToolBox::Data->new(file => $file, stream => 1);
129
130							Boolean option indicating that the file should be opened as a file
131							stream. A L object will be returned. This
132							is a convenience method.
133
134							=item noheader
135
136							my $Data = Bio::ToolBox::Data->new(file => $file, noheader => 1);
137
138							Boolean option indicating that the file does not have file headers,
139							in which case dummy headers are provided. This is not necessary for
140							defined file types that don't normally have file headers, such as
141							BED, GFF, or UCSC files.
142
143							=item parse
144
145							my $Data = Bio::ToolBox::Data->new(file => $file, parse => 1);
146
147							Boolean option indicating that a gene annotation table or file should
148							be parsed into SeqFeature objects and a general table of names and IDs
149							representing those objects be generated. The annotation file may
150							be specified in one of two ways: Through the file option above,
151							or in the database metadata of an existing table file representing
152							previously parsed objects.
153
154							=item db
155
156							my $Data = Bio::ToolBox::Data->new(db => 'hg19', feature => 'gene');
157
158							Provide the name of the database from which to collect the
159							features. It may be a short name, whereupon it is checked in
160							the L configuration file F<.biotoolbox.cfg> for
161							connection information. Alternatively, a path to a database
162							file or directory may be given.
163
164							If you already have an opened L database
165							object, you can simply pass that. See L for
166							more information. However, this in general should be discouraged,
167							since the name of the database will not be properly recorded when
168							saving to file. It is better to simply pass the name of database
169							again; multiple connections to the same database are smartly handled
170							in the background.
171
172							=item feature
173
174							my $Data = Bio::ToolBox::Data->new(file => $filename, feature => 'gene');
175
176							For de novo lists from an annotation database, provide the GFF
177							type or type:source (columns 3 and 2) for collection. A comma
178							delimited string may be accepted for databases. For parsed files,
179							only a simple string is accepted.
180
181							For a list of genomic intervals across the genome, specify a
182							feature of 'genome' with a database object.
183
184							=item subfeature
185
186							When parsing annotation files such as GTF, one or more subfeature
187							types, e.g. C or C, may be specified as a comma-delimited
188							string. This ensures that the subfeatures will be parsed into
189							SeqFeature objects. Otherwise, only the top level features will be
190							parsed. This expedites parsing by skipping unwanted features.
191
192							=item win
193
194							=item step
195
196							my $Data = Bio::ToolBox::Data->new(db => $dbname, win => $integer);
197
198							If generating a list of genomic intervals, optionally provide
199							the window and step values. The default is 500 bp.
200
201							=item chrskip
202
203							Provide a regular expression compatible or C string for skipping or
204							excluding specific or classes of chromosomes, for example the mitochondrial
205							chromosome or unmapped contigs. This works with both feature collection
206							and genomic intervals. The default is to take all chromosomes.
207
208							=back
209
210							When no file is given or database given to search, then a new,
211							empty Data object is returned. In this case, you may optionally
212							specify the names of the columns or indicate a specific file
213							format structure to assign column names. The following options can
214							then be provided.
215
216							=over 4
217
218							=item columns
219
220							=item datasets
221
222							my $Data = Bio::ToolBox::Data->new(columns => [qw(Column1 Column2 ...)] );
223
224							Provide the column names in advance as an anonymous array.
225
226							=item gff
227
228							Pass the GFF version of the file: 1, 2, 2.5 (GTF), or 3.
229
230							=item bed
231
232							Pass the number of BED columns (3-12).
233
234							=item ucsc
235
236							Pass the number of columns to indicate the type of UCSC format. These
237							include 10 (refFlat without gene names), 11 (refFlat with gene names),
238							12 (knownGene gene prediction table), and 15
239							(an extended gene prediction or genePredExt table).
240
241							=back
242
243							If successful, the method will return a Bio::ToolBox::Data object.
244
245							=item duplicate
246
247							This will create a new Data object containing the same column
248							headers and metadata, but lacking the table content, i.e. no
249							rows of data. File name metadata, if present in the original, is
250							not preserved. The purpose here, for example, is to allow one
251							to selectively copy rows from one Data object to another.
252
253							=item parse_table
254
255							$Data->parse_table($file)
256							$Data->parse_table( {
257							file => $file,
258							feature => 'gene',
259							subfeature => 'exon',
260							chrskip => 'chrM\|contig',
261							} );
262
263							This will parse a gene annotation table into SeqFeature objects.
264							If this is called from an empty Data object, then the table will
265							be filled with the SeqFeature object names and IDs. If this is
266							called from a non-empty Data object, then the table's contents
267							will be associated with the SeqFeature objects using their name and
268							ID. The stored SeqFeature objects can be retrieved using the
269							L method.
270
271							Pass the method a single argument. This may be either a simple
272							scalar to a filename to be parsed, or a reference to hash of
273							one or more argument options. Possible options include:
274
275							=over 4
276
277							=item file
278
279							The file name of the gene table to be parsed. This may
280							be a GFF, GFF3, GTF, or any of the UCSC gene table formats.
281							These will be parsed using Bio::ToolBox::parser::* adapters.
282
283							=item feature
284
285							A regular expression compatible string or C string to match
286							the top features C to keep. The C is not
287							checked. The default is 'gene', although a transcript type could
288							alternatively be specified (in which case transcripts are not
289							parsed in gene features).
290
291							=item subfeature
292
293							A regular expression compatible string or C string to match
294							any sub features C to parse. The C is not checked.
295							Typically these include exon, CDS, or UTR. The default is nothing.
296
297							=item chrskip
298
299							A regular expression compatible string or C string to match
300							chromosomes to be skipped or excluded. Any feature with a matching
301							C chromosome name will be skipped.
302
303							=back
304
305							=back
306
307							=head2 General Metadata
308
309							There is a variety of general metadata regarding the Data
310							structure. The following methods may be used to access or set these
311							metadata properties. Some of these are stored as comment lines at
312							the beginning of the file, and will be read or set when the file is
313							loaded.
314
315							=over
316
317							=item feature
318
319							$Data->feature('something');
320							my $feature = $Data->feature;
321
322							Returns or sets the name of the features used to collect
323							the list of features. The actual feature types are listed
324							in the table, so this metadata is merely descriptive.
325
326							=item feature_type
327
328							Returns one of three specific values describing the contents
329							of the data table inferred by the presence of specific column
330							names. This provides a clue as to whether the table features
331							represent genomic regions (defined by coordinate positions) or
332							named database features. The return values include:
333
334							=over 4
335
336							=item * coordinate
337
338							Table includes at least chromosome and start columns.
339
340							=item * named
341
342							Table includes name, type, and/or Primary_ID, possibly
343							referring to named database features.
344
345							=item * unknown
346
347							Table is unrecognized format.
348
349							=back
350
351							=item program
352
353							Returns or sets the name of the program generating the list.
354
355							=item database
356
357							Returns or sets the name or path of the database from which the
358							features were derived.
359
360							=item gff
361
362							Returns or sets the version of loaded GFF files. Supported versions
363							included 1, 2, 2.5 (GTF), and 3.
364
365							=item bed
366
367							Returns or sets the BED file version. Here, the BED version is simply
368							the number of columns.
369
370							=item ucsc
371
372							Returns or sets the UCSC file format version. Here, the version is
373							simply the number of columns. Supported versions include 10 (gene
374							prediction), 11 (refFlat, or gene prediction with gene name), 12
375							(knownGene table), 15 (extended gene prediction), or 16 (extended
376							gene prediction with bin).
377
378							=item vcf
379
380							Returns or sets the VCF file version number. VCF support is limited.
381
382							=back
383
384							=head2 File information
385
386							These methods provide information about the file from which the
387							data table was loaded. This does not include parsed annotation tables.
388
389							=over 4
390
391							=item filename
392
393							=item path
394
395							=item basename
396
397							=item extension
398
399							Returns the filename, full path, basename, and extension of
400							the filename. Concatenating the last three values will reconstitute
401							the first original filename.
402
403							=item add_file_metadata
404
405							$Data->add_file_metadata('/path/to/file.txt');
406
407							Add filename metadata. This will automatically parse the path,
408							basename, and recognized extension from the passed filename and
409							set the appropriate metadata attributes.
410
411							=back
412
413							=head2 Metadata comments
414
415							Comments are any other commented lines from a text file (lines
416							beginning with a #) that were not parsed as general metadata.
417
418							=over 4
419
420							=item comments
421
422							Returns a copy of the array containing commented lines. Each
423							comment line becomes an element in the array.
424
425							=item add_comment
426
427							Appends the text string to the comment array.
428
429							=item delete_comment
430
431							Deletes a comment. Provide the array index of the comment to
432							delete. If an index is not provided, ALL comments will be deleted!
433
434							=item vcf_headers
435
436							For VCF files, this will partially parse the VCF headers into a
437							hash structure that can be queried or manipulated. Each header
438							line is parsed for the primary key, being the first word after the
439							## prefix, e.g. INFO, FORMAT, FILTER, contig, etc. For the simple
440							values, they are stored as the value. For complex entries, such as
441							with INFO and FORMAT, a second level hash is created with the ID
442							extracted and used as the second level key. The value is always the
443							always the remainder of the string.
444
445							For example, the following would be a simple parsed vcf header in
446							code representation.
447
448							$vcf_header = {
449							FORMAT => {
450							GT = q(ID=GT,Number=1,Type=String,Description="Genotype"),
451							AD = q(ID=AD,Number=.,Type=Integer,Description="ref,alt Allelic depths"),
452							},
453							fileDate => 20150715,
454							}
455
456							=item rewrite_vcf_headers
457
458							If you have altered the vcf headers exported by the vcf_headers()
459							method, then this method will rewrite the hash structure as new
460							comment lines. Do this prior to writing or saving the Data sturcture
461							or else you will lose your changed VCF header metadata.
462
463							=back
464
465							=head2 The Data table
466
467							The Data table is the array of arrays containing all of the
468							actual information. Rows and columns are indexed using 0-based
469							indexing as with all Perl arrays. Row 0 is always the column
470							header row containing the column names, regardless whether an
471							actual header name existed in the original file format (e.g.
472							BED or GFF formats). Any individual table "cell" can be
473							specified as C<[$row][$column]>.
474
475							=over 4
476
477							=item list_columns
478
479							Returns an array or array reference of the column names
480							in ascending (left to right) order.
481
482							=item number_columns
483
484							Returns the number of columns in the Data table.
485
486							=item number_rows
487
488							Returns the number of rows in the Data table.
489
490							=item last_column
491
492							Returns the array index number for the last (right most)
493							column. This number is always 1 less than the value
494							returned by number_columns() due to 0-based indexing.
495
496							=item last_row
497
498							Returns the array index number of the last row.
499							Since the header row is index 0, this is also the
500							number of actual content rows.
501
502							=item column_values
503
504							my $values = $Data->column_values($i);
505
506							Returns an array or array reference representing the values
507							in the specified column. This includes the column header as
508							the first element. Pass the method the column index.
509
510							=item add_column
511
512							# add empty column with name
513							my $i = $Data->add_column($name);
514
515							# add entire column
516							my $i = $Data->add_column(\@array);
517
518							Appends a new column to the Data table at the
519							rightmost position (highest index). It adds the column
520							header name and creates a new column metadata hash.
521							Pass the method one of two possibilities. Pass a text
522							string representing the new column name, in which case
523							no data will be added to the column. Alternatively, pass
524							an array reference, and the contents of the array will
525							become the column data. If the Data table already has
526							rows, then the passed array reference must have the same
527							number of elements.
528
529							It returns the new column index if successful.
530
531							=item copy_column
532
533							my $j = $Data->copy_column($i);
534
535							This will copy a column, appending the duplicate column at
536							the rightmost position (highest index). It will duplicate
537							column metadata as well. It will return the new index
538							position.
539
540							=item delete_column
541
542							$Data->delete_column($i, $j);
543
544							Deletes one or more specified columns. Any remaining
545							columns rightwards will have their indices shifted
546							down appropriately. If you had identified one of the
547							shifted columns, you may need to re-find or calculate
548							its new index.
549
550							=item reorder_column
551
552							$Data->reorder_column($c,$b,$a,$a);
553
554							Reorders columns into the specified order. Provide the
555							new desired order of indices. Columns could be duplicated
556							or deleted using this method. The columns will adopt their
557							new index numbers.
558
559							=item add_row
560
561							$Data->add_row(\@values);
562							$Data->add_row($Row); # Bio::ToolBox::Data::Feature object
563
564							Add a new row of data values to the end of the Data table.
565							Optionally provide either a reference to an array of values
566							to put in the row, or pass a L
567							Row object, such as one obtained from another Data object.
568							If the number of columns do not match, the array is filled
569							up with null values for missing columns, or excess values
570							are dropped.
571
572							=item get_row
573
574							$Data->get_row($i); # Bio::ToolBox::Data::Feature object
575
576							Returns the specified row as a L
577							object.
578
579							=item delete_row
580
581							$Data->delete_row($i, $j);
582
583							Deletes one or more specified rows. Rows are spliced out
584							highest to lowest index to avoid issues. Be very careful
585							deleting rows while simultaneously iterating through the
586							table!
587
588							=item row_values
589
590							my $row_values = $Data->row_values($i);
591
592							Returns a copy of an array for the specified row index.
593							Modifying this returned array does not migrate back to the
594							Data table; Use the L method below instead.
595
596							=item value
597
598							my $value = $Data->value($row, $column);
599							$Data->value($row, $column, $new_value);
600
601							Returns or sets the value at a specific row or column index.
602							Index positions are 0-based (header row is index 0).
603
604							=back
605
606							=head2 Column Metadata
607
608							Each column has metadata. Each metadata is a series of key =E
609							value pairs. The minimum keys are 'index' (the 0-based index
610							of the column) and 'name' (the column header name). Additional
611							keys and values may be queried or set as appropriate. When the
612							file is written, these are stored as commented metadata lines at
613							the beginning of the file.
614
615							=over 4
616
617							=item name
618
619							$Data->name($index, $new_name);
620							my $name = $Data->name($i);
621
622							Convenient method to return the name of the column given the
623							index number. A column may also be renamed by passing a new name.
624
625							=item metadata
626
627							$Data->metadata($index, $key, $new_value);
628							my $value = $Data->metadata($index, $key)
629
630							Returns or sets the metadata value for a specific $key for a
631							specific column $index.
632
633							This may also be used to add a new metadata key. Simply provide
634							the name of a new $key that is not present
635
636							If no key is provided, then a hash or hash reference is returned
637							representing the entire metadata for that column.
638
639							=item copy_metadata
640
641							$Data->copy_metadata($source, $target);
642
643							This method will copy the metadata (everything except name and
644							index) between the source column and target column. Returns 1 if
645							successful.
646
647							=item delete_metadata
648
649							$Data->delete_metadata($index, $key);
650
651							Deletes a column-specific metadata $key and value for a specific
652							column $index. If a $key is not provided, then all metadata keys
653							for that index will be deleted.
654
655							=item find_column
656
657							my $i = $Data->find_column('Gene');
658							my $i = $Data->find_column('^Gene$')
659
660							Searches the column names for the specified column name. This
661							employs a case-insensitive grep search, so simple substitutions
662							may be made.
663
664							=item chromo_column
665
666							=item start_column
667
668							=item stop_column
669
670							=item strand_column
671
672							=item name_column
673
674							=item type_column
675
676							=item id_column
677
678							These methods will return the identified column best matching
679							the description. Returns C if that column is not present.
680							These use the L method with a predefined list of
681							aliases.
682
683							=back
684
685							=head2 Working with databases
686
687							These are methods for working primarily with annotation databases.
688
689							=over 4
690
691							=item open_database
692
693							This is wrapper method that tries to do the right thing and passes
694							on to either L or L methods.
695							Basically a legacy method for L.
696
697							=item open_meta_database
698
699							Open the database that is listed in the metadata. Returns the
700							database connection. Pass a true value to force a new database
701							connection to be opened, rather than returning a cached connection
702							object (useful when forking).
703
704							=item open_new_database
705
706							Convenience method for opening a second or new database that is
707							not specified in the metadata, useful for data collection. This
708							is a shortcut to L.
709							Pass the database name.
710
711							=back
712
713							=head2 SeqFeature Objects
714
715							SeqFeature objects corresponding to data rows can be stored in the Data
716							object. This can be useful if the SeqFeature object is not readily
717							available from a database or is processor intensive in generating or
718							parsing. Note that storing large numbers of objects will increase memory
719							usage.
720
721							SeqFeature objects are usually either L,
722							L, or L objects, depending
723							upon their source. More information can obtained from the L
724							abstract API.
725
726							=over 4
727
728							=item store_seqfeature
729
730							$Data->store_seqfeature($row_index, $seqfeature);
731
732							Stores the SeqFeature object for the given row index. Only one SeqFeature
733							object can be stored per row.
734
735							=item get_seqfeature
736
737							my $feature = $Data->get_seqfeature($row_index);
738
739							Retrieves the SeqFeature object for the given row index.
740
741							=item delete_seqfeature
742
743							$Data->store_seqfeature($row_index);
744
745							Removes the SeqFeature object for the given row index.
746
747							=item collapse_gene_transcripts
748
749							my $success = $Data->collapse_gene_transcripts;
750
751							This method will iterate through a Data table and collapse multiple alternative
752							transcript subfeatures of stored gene SeqFeature objects in the table. Exons
753							of multiple transcripts will be merged, maximizing exon size and minimizing
754							intron size. Genes with only one transcript will not be affected. Stored
755							SeqFeature objects that are do not have a C of "gene" are silently
756							skipped. Refer to the L method
757							for more details. The number of rows successfully collapsed is returned.
758
759							=item add_transcript_length
760
761							my $length_index = $Data->add_transcript_length;
762							my $length_index = $Data->add_transcript_length('cds');
763
764							This method will generate a new column in the Data table representing the
765							length of a transcript, i.e. the sum of the length of subfeatures for
766							the stored SeqFeature object in the Data table. The default subfeature is
767							C; however, alternative subfeature types may be passed to the method
768							and used. These include C, C<5p_utr>, and C<3p_utr> for CDS, the 5C<'> UTR,
769							and the 3C<'> UTR, respectively. See the corresponding transcript length
770							methods in L for more information. If a length
771							is not calculated, for example the feature C is a "gene",
772							then the simple length of the feature is recorded.
773
774							The name of the new column is one of "Merged_Transcript_Length" for exon,
775							"Transcript_CDS_Length", "Transcript_5p_UTR_Length", or "Transcript_3p_UTR_Length".
776							The index of the new length column is returned.
777
778							=back
779
780							=head2 Data Table Functions
781
782							These methods alter the Data table en masse.
783
784							=over 4
785
786							=item verify
787
788							This method will verify the Data structure, including the metadata and the
789							Data table. It ensures that the table has the correct number of rows and
790							columns as described in the metadata, and that each column has the basic
791							metadata.
792
793							If the Data structure is marked as a GFF or BED structure, then the table
794							is checked that the structure matches the proper format. If not, for
795							example when additional columns have been added, then the GFF or BED value
796							is set to null.
797
798							This method is automatically called prior to writing the Data table to file.
799
800							=item sort_data
801
802							$Data->sort_data($index, 'i'); # increasing sort
803
804							This method will sort the Data table by the values in the indicated column.
805							It will automatically determine whether the contents of the column are
806							numbers or alphanumeric, and will sort accordingly, either numerically or
807							asciibetically. The first non-null value in the column is used to determine.
808							The sort may fail if the values are not consistent. Pass a second optional
809							value to indicate the direction of the sort. The value should be either
810							'i' for 'increasing' or 'd' for 'decreasing'. The default order is
811							increasing.
812
813							=item gsort_data
814
815							This method will sort the Data table by increasing genomic coordinates. It
816							requires the presence of chromosome and start (or position) columns,
817							identified by their column names. These are automatically identified.
818							Failure to find these columns mean a failure to sort the table. Chromosome
819							names are sorted first by their digits (e.g. chr2 before chr10), and then
820							alphanumerically. Base coordinates are sorted by increasing value.
821							Identical positions are kept in their original order.
822
823							=item splice_data
824
825							my $Data = Bio::ToolBox::Data->new(file => $file);
826							my $pm = Parallel::ForkManager->new(4);
827							for my $i (1..4) {
828							$pm->start and next;
829							### in child ###
830							$Data->splice_data($i, 4);
831							# do something with this portion
832							# then save to a temporary unique file
833							$Data->save("$file_$i");
834							$pm->finish;
835							}
836							$pm->wait_all_children;
837							# reload children files
838							$Data->reload_children(glob "$file_*");
839
840							This method will splice the Data table into C<$total_parts> number of pieces,
841							retaining the C<$current_part> piece. The other parts are discarded. This
842							method is intended to be used when a program is forked into separate
843							processes, allowing each child process to work on a subset of the original
844							Data table.
845
846							Two values are passed to the method. The first is the current part number,
847							1-based. The second value is the total number of parts that the table
848							should be divided, corresponding to the number of concurrent processes.
849							One easy approach to forking is to use L. The
850							above example shows how to fork into four concurrent processes.
851
852							Since each forked child process is separate from their parent process,
853							their contents must be reloaded into the current Data object. The
854							L documentation recommends going through a disk
855							file intermediate. Therefore, write each child Data object to file using
856							a unique name. Once all children have been reaped, they can be reloaded
857							into the current Data object using the L method.
858
859							Remember that if you fork your script into child processes, any database
860							connections must be re-opened; they are typically not clone safe. If you
861							have an existing database connection by using the L method,
862							it should be automatically re-opened for you when you use the L
863							method, but you will need to call L again in the child
864							process to obtain the new database object.
865
866							=item reload_children
867
868							Discards the current data table in memory and reloads two or more files
869							written from forked children processes. Provide the name of the child
870							files in the order you want them loaded. The files will be automatically
871							deleted if successfully loaded. Returns the number of lines reloaded on
872							success.
873
874							=back
875
876							=head2 File Functions
877
878							The Data table may be read in from a file or written out as a file. In
879							all cases, it is a tab-delimited text file, whether as an ad hoc table
880							or a specific bioinformatic format, e.g. BED, GFF, etc. Multiple common
881							file formats are supported. Column headers are assumed, except in those
882							cases where it is not, e.g. BED, GFF, etc. Metadata may be included as
883							commented lines at the beginning of the file, prefixed with a C<#> symbol.
884							Reading and writing gzip compressed files is fully supported.
885
886							=over 4
887
888							=item load_file
889
890							$Data->load_file($filename);
891
892							This will load a file into a new, empty Data table. This function is
893							called automatically when a filename is provided to the L function.
894							The existence of the file is first checked (appending common missing
895							extensions as necessary), metadata and column headers processed and/or
896							generated from default settings, the content loaded into the table, and
897							the structure verified. Error messages may be printed if the structure or
898							format is inconsistent or doesn't match the expected format, e.g a file
899							with a F<.bed> extension doesn't match the UCSC specification.
900							Pass the name of the filename.
901
902							=item taste_file
903
904							my ($flavor, $format) = $Data->taste_file($filename);
905							# $flavor is generic term: gff, bed, ucsc, or undef
906							# $format is specific term: gtf, gff3, narrowPeak, genePred, etc.
907
908							Tastes, or checks, a file for a certain flavor, or known gene file formats.
909							Useful for determining if the file represents a known gene table format
910							that lacks a defined file extension, e.g. UCSC formats. This can be based
911							on the file extension, metadata headers, and/or file contents from the
912							first 10 lines. Returns two strings: the first is a generic flavor, and the
913							second is a more specific format, if applicable. Generic flavor values will
914							be one of `gff`, `bed`, `ucsc`, or `undefined`. These correlate to specific
915							Parser adapters. Specific formats could be any number of possibilities, for
916							example `undefined`, `gtf`, `gff3`, `narrowPeak`, `genePred`, etc.
917
918
919							=item save
920
921							=item write_file
922
923							my $success = $Data->save;
924							my $success = $Data->save('my_file.txt');
925							my $success = $Data->save(filename => $file, gz => 1);
926							print "file $success was saved!\n";
927
928							Pass the file name to be written. If no file name is passed, then
929							the filename and path stored in the metadata are used, if present.
930
931							These methods will write the Data structure out to file. It will
932							be first verified as to proper structure. Opened BED and GFF files
933							are checked to see if their structure is maintained. If so, they
934							are written in the same format; if not, they are written as regular
935							tab-delimited text files.
936
937							You may pass additional options.
938
939							=over 4
940
941							=item filename
942
943							Optionally pass a new filename. Required for new objects; previous
944							opened files may be overwritten if a new name is not provided. If
945							necessary, the file extension may be changed; for example, BED files
946							that no longer match the defined format lose the .bed and gain a .txt
947							extension. Compression may or add or strip .gz as appropriate. If
948							a path is not provided, the current working directory is used.
949
950							=item gz
951
952							Boolean value to change the compression status of the output file. If
953							overwriting an input file, the default is maintain the compression status,
954							otherwise no compression. Pass a 0 for no compression, 1 for standard
955							gzip compression, or 2 for block gzip (bgzip) compression for tabix
956							compatibility.
957
958							=back
959
960							If the file save is successful, it will return the full path and
961							name of the saved file, complete with any changes to the file extension.
962
963							=item summary_file
964
965							Write a separate file summarizing columns of data (mean values).
966							The mean value of each column becomes a row value, and each column
967							header becomes a row identifier (i.e. the table is transposed). The
968							best use of this is to summarize the mean profile of windowed data
969							collected across a feature. See the Bio::ToolBox scripts
970							L and L as examples.
971							For data from L where the columns are expressed
972							as percentile bins, the reported midpoint column is automatically
973							converted based on a length of 1000 bp.
974
975							You may pass these options. They are optional.
976
977							=over 4
978
979							=item filename
980
981							Pass an optional new filename. The default is to take the basename
982							and append "_summed" to it.
983
984							=item startcolumn
985
986							=item stopcolumn
987
988							Provide the starting and ending columns to summarize. The default
989							start is the leftmost column without a recognized standard name.
990							The default ending column is the last rightmost column. Indexes are
991							0-based.
992
993							=item dataset
994
995							Pass a string that is the name of the dataset. This could be collected
996							from the metadata, if present. This will become the name of the score
997							column if defined.
998
999							=back
1000
1001							The name of the summarized column is either the provided dataset name,
1002							the defined basename in the metadata of the Data structure, or a generic
1003							name. If successful, it will return the name of the file saved.
1004
1005							=back
1006
1007							=head2 Verifying Datasets
1008
1009							When working with row Features and collecting scores, the dataset
1010							from which you are collecting must be verified prior to collection.
1011							This ensures that the proper database adaptor is available and loaded,
1012							and that the dataset is correctly specified (otherwise nothing would be
1013							collected). This verification is normally performed transparently when
1014							you call L or
1015							L.
1016							However, datasets may be explicitly verified prior to calling the score
1017							methods.
1018
1019							=over 4
1020
1021							=item verify_dataset
1022
1023							my $dataset = $Data->verify_dataset($dataset, $database);
1024
1025							Pass the name of the dataset (GFF type or type:source) for a GFF3-based
1026							database, e.g. , or path and file name for a
1027							data file, e.g. Bam, BigWig, BigBed, or USeq file. If a separate database
1028							is being used, pass the name or opened database object as a second
1029							parameter. For more advance options, see
1030							L.
1031
1032							The name of the verified dataset, with a prefix if necessary, is returned.
1033
1034							=back
1035
1036							=head2 Efficient Data Access
1037
1038							Most of the time we need to iterate over the Data table, one row
1039							at a time, collecting data or processing information. These methods
1040							simplify the process.
1041
1042							=over 4
1043
1044							=item iterate
1045
1046							$Data->iterate( sub {
1047							my $row = shift;
1048							my $number = $row->value($index);
1049							my $log_number = log($number);
1050							$row->value($index, $log_number);
1051							} );
1052
1053							This method will process a code reference on every row in the data
1054							table. Pass a subroutine or code reference. The subroutine will
1055							receive the row as a L object. With this
1056							object, you can retrieve values, set values, and add new values.
1057
1058							=item row_stream
1059
1060							This returns an C object, which has one
1061							method, C. Call this method repeatedly until it returns
1062							C to work through each row of data.
1063
1064							Users of the C family of database adaptors may recognize the
1065							analogy to the C method.
1066
1067							=item next_row
1068
1069							my $stream = $Data->row_stream;
1070							while (my $row = $stream->next_row) {
1071							# each $row is a Bio::ToolBox::Data::Feature object
1072							# representing the row in the data table
1073							my $value = $row->value($index);
1074							# do something with $value
1075							}
1076
1077							Called from a C object, it returns a
1078							L row object. If SeqFeature objects are
1079							associated with the row, perhaps from a parsed input annotation file,
1080							then they are automatically associated with the row object. (Previous
1081							versions required separately calling the seqfeature() row method to
1082							perform this.)
1083
1084							=back
1085
1086							=head1 SEE ALSO
1087
1088							L, L, L,
1089							L, L, L, L,
1090							L, L
1091
1092							=cut
1093
1094	3			3		2718	use strict;
	3					4
	3					79
1095	3			3		10	use Carp qw(carp cluck croak confess);
	3					6
	3					143
1096	3			3		14	use List::Util qw(sum0);
	3					4
	3					116
1097	3			3		12	use base 'Bio::ToolBox::Data::core';
	3					4
	3					1178
1098	3					143	use Bio::ToolBox::db_helper qw(
1099							get_new_feature_list
1100							get_new_genome_list
1101							get_db_feature
1102	3			3		15	);
	3					5
1103	3			3		16	use Bio::ToolBox::utility qw(simplify_dataset_name sane_chromo_sort);
	3					4
	3					125
1104	3			3		13	use Module::Load;
	3					3
	3					10
1105
1106							1;
1107
1108
1109							### Initialize
1110
1111							sub new {
1112	46			46	1	6880	my $class = shift;
1113	46					115	my %args = @_;
1114	46	100				157	if (ref($class)) {
1115	25					47	$class = ref($class);
1116							}
1117
1118							# check for important arguments
1119	46		100			352	$args{features} \|\|= $args{feature} \|\| 'gene';
			33
1120	46		50			546	$args{stream} \|\|= $args{Stream} \|\| 0;
			66
1121	46		100			241	$args{file} \|\|= $args{in} \|\| undef;
			100
1122	46		100			168	$args{parse} \|\|= 0;
1123	46		50			181	$args{noheader} \|\|= 0;
1124
1125							# check for stream
1126	46	100				114	if ($args{stream}) {
1127	1					2	$class = "Bio::ToolBox::Data::Stream";
1128	1					4	load($class);
1129	1					81	return $class->new(@_);
1130							}
1131
1132							# initialize
1133	45					221	my $self = $class->SUPER::new();
1134
1135							# prepare a new table based on the provided arguments
1136	45	100	100			302	if ($args{file} and $args{parse}) {
		100	66
		100
1137							# parse from file
1138	7		100			31	$args{subfeature} \|\|= '';
1139	7	100				24	unless ( $self->parse_table(\%args) ) {
1140	5					27	my $l = $self->load_file($args{file});
1141	5	50				12	return unless $l;
1142	5	50	33			15	if ($self->database =~ /^Parsed:(.+)$/ and $self->feature_type eq 'named') {
1143							# looks like the loaded file was from a previously parsed table
1144							# let's try this again
1145							# this may die if it doesn't work
1146	5					15	$args{file} = $1;
1147	5					12	$args{feature} = $self->feature;
1148	5					16	$self->parse_table(\%args);
1149							}
1150							}
1151							}
1152							elsif ($args{file}) {
1153							# load from file
1154	5	50				25	unless ( $self->load_file($args{file}, $args{noheader}) ) {
1155	0					0	return;
1156							}
1157							}
1158							elsif (exists $args{db} and $args{features}) {
1159							# generate new list
1160	1					10	$self->feature($args{features});
1161	1					5	$self->database($args{db});
1162	1					2	$args{data} = $self;
1163	1					2	my $result;
1164	1	50				3	if ($args{features} eq 'genome') {
1165							# create a genomic interval list
1166							# first must parse any exclusion list provided as a new Data object
1167	1		50			9	$args{blacklist} \|\|= $args{exclude} \|\| undef;
			33
1168	1	50				3	if (defined $args{blacklist}) {
1169							my $exclusion_Data = $self->new(
1170							file => $args{blacklist},
1171	0					0	parse => 0 # we don't need to parse
1172							);
1173	0	0	0			0	if ($exclusion_Data and $exclusion_Data->feature_type eq 'coordinate') {
1174	0					0	printf " Loaded %d exclusion list items\n",
1175							$exclusion_Data->number_rows;
1176	0					0	delete $args{blacklist};
1177	0					0	delete $args{exclude};
1178	0					0	$args{exclude} = $exclusion_Data;
1179							}
1180							else {
1181	0					0	carp " Cannot not load exclusion coordinate list file $args{blacklist}!";
1182	0					0	return;
1183							}
1184							}
1185	1					6	$result = get_new_genome_list(%args);
1186							}
1187							else {
1188	0					0	$result = get_new_feature_list(%args);
1189							}
1190	1	50				4	unless ($result) {
1191	0					0	carp " Cannot generate new $args{features} list!";
1192	0					0	return;
1193							}
1194							}
1195							else {
1196							# a new empty structure
1197
1198							# check to see if user provided column names
1199	32		50			203	$args{columns} \|\|= $args{datasets} \|\| undef;
			66
1200	32	100	33			256	if (defined $args{columns}) {
		50	33
		50	33
		50
1201	1					1	foreach my $c ( @{ $args{columns} } ) {
	1					3
1202	9					12	$self->add_column($c);
1203							}
1204	1	50				4	$self->{feature} = $args{feature} if exists $args{feature};
1205							}
1206
1207							# or possibly a specified format structure
1208							elsif (exists $args{gff} and $args{gff}) {
1209							# use standard names for the number of columns indicated
1210							# we trust that the user knows the subtle difference between gff versions
1211	0					0	$self->add_gff_metadata($args{gff});
1212	0	0				0	unless ($self->extension =~ /g[tf]f/) {
1213							$self->{extension} = $args{gff} == 2.5 ? '.gtf' :
1214	0	0				0	$args{gff} == 3 ? '.gff3' : '.gff';
		0
1215							}
1216							}
1217							elsif (exists $args{bed} and $args{bed}) {
1218							# use standard names for the number of columns indicated
1219	0	0	0			0	unless ($args{bed} =~ /^\d{1,2}$/ and $args{bed} >= 3) {
1220	0					0	carp "bed parameter must be an integer 3-12!";
1221	0					0	return;
1222							}
1223	0					0	$self->add_bed_metadata($args{bed});
1224	0	0				0	unless ($self->extension =~ /bed\|peak/) {
1225	0					0	$self->{extension} = '.bed';
1226							}
1227							}
1228							elsif (exists $args{ucsc} and $args{ucsc}) {
1229							# a ucsc format such as refFlat, genePred, or genePredExt
1230	0					0	my $u = $self->add_ucsc_metadata($args{ucsc});
1231	0	0				0	unless ($u) {
1232	0					0	carp "unrecognized number of columns for ucsc format!";
1233	0					0	return;
1234							};
1235	0	0				0	unless ($self->extension =~ /ucsc\|ref+lat\|genepred/) {
1236	0					0	$self->{extension} = '.ucsc';
1237							}
1238							}
1239							}
1240
1241	45					256	return $self;
1242							}
1243
1244							sub duplicate {
1245	1			1	1	1175	my $self = shift;
1246
1247							# duplicate the data structure
1248	1					3	my $columns = $self->list_columns;
1249	1	50				4	my $Dupe = $self->new(
1250							'columns' => $columns,
1251							) or return;
1252
1253							# copy the metadata
1254	1					3	for (my $i = 0; $i < $self->number_columns; $i++) {
1255							# column metadata
1256	9					14	my %md = $self->metadata($i);
1257	9					22	$Dupe->{$i} = \%md;
1258							}
1259	1					3	foreach (qw(feature program db bed gff ucsc vcf headers)) {
1260							# various keys
1261	8					11	$Dupe->{$_} = $self->{$_};
1262							}
1263	1					2	my @comments = $self->comments;
1264	1					1	push @{$Dupe->{comments}}, @comments;
	1					10
1265
1266	1					5	return $Dupe;
1267							}
1268
1269							sub parse_table {
1270	18			18	1	5869	my $self = shift;
1271	18					38	my $args = shift;
1272	18					35	my ($file, $feature, $subfeature, $simplify);
1273	18	100				48	if (ref $args) {
1274	12		50			34	$file = $args->{file} \|\| '';
1275	12		100			32	$feature = $args->{feature} \|\| '';
1276	12		100			42	$subfeature = $args->{subfeature} \|\| '';
1277							$simplify = (exists $args->{simplify} and defined $args->{simplify}) ?
1278	12	50	33			39	$args->{simplify} : 1; # default is to simplify
1279							}
1280							else {
1281							# no hash reference, assume just a file name
1282	6					10	$file = $args;
1283	6					10	undef $args;
1284	6					11	$feature = undef;
1285	6					12	$subfeature = '';
1286	6					9	$simplify = 1;
1287							}
1288	18	50				48	unless ($file) {
1289	0					0	carp "no annotation file provided to parse!";
1290	0					0	return;
1291							}
1292
1293							# the file format determines the parser class
1294	18	100				84	my ($flavor, $format) = $self->taste_file($file) or return;
1295	13					40	my $class = 'Bio::ToolBox::parser::' . $flavor;
1296	13					23	eval {load $class};
	13					54
1297	13	50				981	if ($@) {
1298	0					0	carp "unable to load $class! cannot parse $flavor!";
1299	0					0	return;
1300							}
1301	13					41	$self->format($format); # specify the specific format
1302
1303							# open parser
1304	13	50				108	my $parser = $class->new() or return;
1305	13	50				54	$parser->open_file($file) or return;
1306	13					48	my $typelist = $parser->typelist;
1307
1308							# set feature based on the type list from the parser
1309	13	100				42	unless ($feature) {
1310	6	100				26	if ($typelist =~ /gene/i) {
		100
1311	2					4	$feature = 'gene';
1312							}
1313							elsif ($typelist eq 'region') {
1314	3					6	$feature = 'region';
1315							}
1316							else {
1317	1					2	$feature = 'rna'; # generic RNA
1318							}
1319							}
1320
1321							# set parser parameters
1322	13					45	$parser->simplify($simplify);
1323	13	100				32	if ($subfeature) {
1324	2	100				11	$parser->do_exon(1) if $subfeature =~ /exon/i;
1325	2	100				11	$parser->do_cds(1) if $subfeature =~ /cds/i;
1326	2	50				9	$parser->do_utr(1) if $subfeature =~ /utr\|untranslated/i;
1327	2	50				7	$parser->do_codon(1) if $subfeature =~/codon/i;
1328							}
1329	13	100				40	if ($feature =~ /gene$/i) {
1330	4					13	$parser->do_gene(1);
1331							}
1332							else {
1333	9					29	$parser->do_gene(0);
1334							}
1335	13					25	my $mrna_check = 0;
1336	13	50	66			52	if (lc($feature) eq 'mrna' and $parser->typelist !~ /mrna/i and not $self->last_row) {
			33
1337							# user requested mRNA for a new data file but it's not present in the type list
1338							# look for it the hard way by parsing CDS too - sigh
1339	0					0	load('Bio::ToolBox::GeneTools', 'is_coding');
1340	0					0	$parser->do_cds(1);
1341	0					0	$mrna_check = 1;
1342							}
1343
1344							# parse the table
1345	13	50				47	$parser->parse_file or return;
1346
1347							# store the SeqFeature objects
1348	13	100				78	if ($self->last_row > 0) {
1349							# we already have a table, presumably representing the features
1350	5					11	my $count = 0;
1351							$self->iterate( sub {
1352	65			65		62	my $row = shift;
1353	65					114	my $f = $parser->find_gene(
1354							name => $row->name,
1355							id => $row->id,
1356							);
1357	65	50				105	if ($f) {
1358	65					102	$self->store_seqfeature($row->row_index, $f);
1359	65					154	$count++;
1360							}
1361	5					43	} );
1362	5	50				31	unless ($count == $self->last_row) {
1363	0					0	die <
1364							Not all features in the input file could be matched to a corresponding SeqFeature
1365							object in the annotation file $file.
1366							Double check your input and annotation files. You can create a new table by just
1367							providing your annotation file.
1368							PARSEFAIL
1369							}
1370							}
1371							else {
1372							# create a new table
1373	8					36	$self->add_column('Primary_ID');
1374	8					20	$self->add_column('Name');
1375
1376							# check for chromosome exclude
1377	8					9	my $chr_exclude;
1378	8	100				18	if ($args) {
1379	2		50			11	$chr_exclude = $args->{chrskip} \|\| undef;
1380							}
1381
1382							# fill table with features
1383	8					27	while (my $f = $parser->next_top_feature) {
1384	120	50				160	if ($chr_exclude) {
1385	0	0				0	next if $f->seq_id =~ /$chr_exclude/i;
1386							}
1387	120					194	my $type = $f->type;
1388	120	100	33			189	if ($f->type =~ /$feature/i or ($mrna_check and is_coding($f)) ) {
			66
1389	113					203	my $index = $self->add_row([ $f->primary_id, $f->display_name ]);
1390	113					188	$self->store_seqfeature($index, $f);
1391							}
1392							}
1393	8	50				15	unless ($self->last_row) {
1394	0					0	printf " Zero '%s' features found!\n Check your feature or try generic features like gene, mRNA, or transcript\n",
1395							$feature;
1396							}
1397	8					45	$self->database("Parsed:$file");
1398	8					30	$self->feature($feature);
1399	8	50				15	$self->add_comment("Chromosomes excluded: $chr_exclude") if $chr_exclude;
1400
1401							# add input parsed file metadata
1402	8					40	$self->add_file_metadata($file);
1403							# but delete some stuff, just want basename
1404	8					14	undef $self->{extension};
1405	8					12	undef $self->{filename};
1406	8					11	undef $self->{path};
1407							}
1408	13					357	return 1;
1409							}
1410
1411
1412
1413
1414							### Column manipulation
1415
1416							sub column_values {
1417	2			2	1	940	my ($self, $column) = @_;
1418	2	50				6	return unless defined $column;
1419	2	50				5	return unless exists $self->{$column}{name};
1420	2					6	my @values = map {$self->value($_, $column)} (0 .. $self->last_row);
	159					171
1421	2	50				8	return wantarray ? @values : \@values;
1422							}
1423
1424							sub add_column {
1425	31			31	1	54	my ($self, $name) = @_;
1426	31	50				118	return unless $name;
1427	31					43	my $column = $self->{number_columns};
1428
1429							# check for array of column data
1430	31					43	my $name_ref = ref $name;
1431	31	100				79	if ($name_ref eq 'ARRAY') {
		50
		50
1432	2	50				4	if ($self->last_row > 1) {
1433							# table has existing data beyond column headers
1434	2	50				3	if ($self->last_row == (scalar @$name - 1)) {
1435							# same number of elements, add it the table
1436	2					6	$self->{$column} = {
1437							'name' => $name->[0],
1438							'index' => $column,
1439							};
1440	2					5	for (my $r = 0; $r <= $self->last_row; $r++) {
1441	158					245	$self->{data_table}->[$r][$column] = $name->[$r];
1442							}
1443							}
1444							else {
1445							# different number of elements
1446	0					0	cluck "array has different number of elements than Data table!\n";
1447	0					0	return;
1448							}
1449							}
1450							else {
1451							# table has no existing data
1452	0					0	$self->{$column} = {
1453							'name' => $name->[0],
1454							'index' => $column,
1455							};
1456	0					0	for (my $i = 0; $i < scalar @$name; $i++) {
1457	0					0	$self->value($i, $column, $name->[$i]);
1458							}
1459	0					0	$self->{last_row} = scalar @$name - 1;
1460	0					0	$self->{headers} = 1; # boolean to indicate the table now has headers
1461							}
1462							}
1463							elsif ($name_ref eq 'Bio::DB::GFF::Typename') {
1464							# a Typename object that was selected from a SeqFeature::Store database
1465	0					0	$self->{$column} = {
1466							'name' => $name->asString,
1467							'index' => $column,
1468							};
1469	0					0	$self->{data_table}->[0][$column] = $name->asString;
1470							}
1471							elsif ($name_ref eq '') {
1472							# just a name
1473	29					82	$self->{$column} = {
1474							'name' => $name,
1475							'index' => $column,
1476							};
1477	29					66	$self->{data_table}->[0][$column] = $name;
1478							}
1479							else {
1480	0					0	cluck "unrecognized reference '$name_ref'! pass a scalar value or array reference";
1481	0					0	return;
1482							}
1483
1484	31					40	$self->{number_columns}++;
1485	31	50				56	delete $self->{column_indices} if exists $self->{column_indices};
1486	31	50	33			77	if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
			33
			33
1487							# check if we maintain integrity, at least insofar what we test
1488	0					0	$self->verify(1); # silence so user doesn't get these messages
1489							}
1490	31					55	return $column;
1491							}
1492
1493							sub copy_column {
1494	1			1	1	2	my $self = shift;
1495	1					1	my $index = shift;
1496	1	50				4	return unless defined $index;
1497	1					2	my $data = $self->column_values($index);
1498	1					2	my $new_index = $self->add_column($data);
1499	1					6	$self->copy_metadata($index, $new_index);
1500	1					5	return $new_index;
1501							}
1502
1503
1504
1505							### Rows and Data access
1506
1507							sub add_row {
1508	353			353	1	346	my $self = shift;
1509	353					337	my @row_data;
1510	353	100	66			826	if ($_[0] and ref $_[0] eq 'ARRAY') {
		50	33
		0	0
1511	313					292	@row_data = @{ $_[0] };
	313					531
1512							}
1513							elsif ($_[0] and ref $_[0] eq 'Bio::ToolBox::Data::Feature') {
1514	40					76	@row_data = $_[0]->row_values;
1515							}
1516							elsif ($_[0] and $_[0] =~ /\t/) {
1517	0					0	@row_data = split /\t/, $_[0];
1518							}
1519							else {
1520	0					0	@row_data = map {'.'} (1 .. $self->{number_columns});
	0					0
1521							}
1522	353	50				548	if (scalar @row_data > $self->{number_columns}) {
1523	0					0	cluck "row added has more elements than table columns! truncating row elements\n";
1524	0					0	splice @row_data, 0, $self->{number_columns};
1525							}
1526	353					512	until (scalar @row_data == $self->{number_columns}) {
1527	0					0	push @row_data, '.';
1528							}
1529	353					362	my $row_index = $self->{last_row} + 1;
1530	353					441	$self->{data_table}->[$row_index] = \@row_data;
1531	353					622	$self->{last_row}++;
1532	353					663	return $row_index;
1533							}
1534
1535							sub get_row {
1536	9			9	1	502	my ($self, $row_number) = @_;
1537	9	50				36	return unless defined $self->{data_table}->[$row_number];
1538	9					24	my @options = (
1539							'data' => $self,
1540							'index' => $row_number,
1541							);
1542	9	100	66			48	if (
1543							exists $self->{SeqFeatureObjects} and
1544							defined $self->{SeqFeatureObjects}->[$row_number]
1545							) {
1546	5					14	push @options, 'feature', $self->{SeqFeatureObjects}->[$row_number];
1547							}
1548	9					70	return Bio::ToolBox::Data::Feature->new(@options);
1549							}
1550
1551							sub delete_row {
1552	1			1	1	2	my $self = shift;
1553	1					4	my @deleted = sort {$b <=> $a} @_;
	0					0
1554	1					3	while (@deleted) {
1555	1					1	my $d = shift @deleted;
1556	1					2	splice( @{ $self->{data_table} }, $d, 1);
	1					2
1557	1					3	$self->{last_row}--;
1558	1	50				3	if (exists $self->{SeqFeatureObjects}) {
1559	0					0	splice( @{ $self->{SeqFeatureObjects} }, $d, 1);
	0					0
1560							}
1561							}
1562	1					2	return 1;
1563							}
1564
1565							sub row_values {
1566	237			237	1	263	my ($self, $row) = @_;
1567	237					222	my @data = @{ $self->{data_table}->[$row] };
	237					436
1568	237	50				597	return wantarray ? @data : \@data;
1569							}
1570
1571							sub value {
1572	272			272	1	1085	my ($self, $row, $column, $value) = @_;
1573	272	50	33			570	return unless (defined $row and defined $column);
1574	272	50				314	if (defined $value) {
1575	0					0	$self->{data_table}->[$row][$column] = $value;
1576							}
1577	272					444	return $self->{data_table}->[$row][$column];
1578							}
1579
1580							sub row_stream {
1581	9			9	1	1454	my $self = shift;
1582	9					63	return Bio::ToolBox::Data::Iterator->new($self);
1583							}
1584
1585							sub iterate {
1586	6			6	1	462	my $self = shift;
1587	6					63	my $code = shift;
1588	6	50				24	unless (ref $code eq 'CODE') {
1589	0					0	confess "iterate_function() method requires a code reference!";
1590							}
1591	6					18	my $stream = $self->row_stream;
1592	6					23	while (my $row = $stream->next_row) {
1593	143					172	&$code($row);
1594							}
1595	6					23	return 1;
1596							}
1597
1598
1599							#### Stored SeqFeature manipulation
1600
1601							sub store_seqfeature {
1602	178			178	1	227	my ($self, $row_i, $seqfeature) = @_;
1603	178	50	33			424	unless (defined $row_i and ref($seqfeature)) {
1604	0					0	confess "must provide a row index and SeqFeature object!";
1605							}
1606	178	50				271	confess "invalid row index" unless ($row_i <= $self->last_row);
1607	178		100			288	$self->{SeqFeatureObjects} \|\|= [];
1608	178					206	$self->{SeqFeatureObjects}->[$row_i] = $seqfeature;
1609	178					297	return 1;
1610							}
1611
1612							sub delete_seqfeature {
1613	0			0	1	0	my ($self, $row_i) = @_;
1614	0	0				0	confess "invalid row index" unless ($row_i <= $self->last_row);
1615	0	0				0	return unless $self->{SeqFeatureObjects};
1616	0					0	undef $self->{SeqFeatureObjects}->[$row_i];
1617							}
1618
1619							sub collapse_gene_transcripts {
1620	0			0	1	0	my $self = shift;
1621	0	0				0	unless ($self->feature_type eq 'named') {
1622	0					0	carp "Table does not contain named features!";
1623	0					0	return;
1624							}
1625
1626							# load module
1627	0					0	my $class = "Bio::ToolBox::GeneTools";
1628	0					0	eval {load $class, qw(collapse_transcripts)};
	0					0
1629	0	0				0	if ($@) {
1630	0					0	carp "unable to load $class! cannot collapse transcripts!";
1631	0					0	return;
1632							}
1633
1634							# collapse the transcripts
1635	0					0	my $success = 0;
1636	0	0				0	if (exists $self->{SeqFeatureObjects}) {
1637							# we should have stored SeqFeature objects, probably from parsed table
1638	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
1639	0	0				0	my $feature = $self->{SeqFeatureObjects}->[$i] or next;
1640	0	0				0	my $collSeqFeat = collapse_transcripts($feature) or next;
1641	0					0	$success += $self->store_seqfeature($i, $collSeqFeat);
1642							}
1643							}
1644							else {
1645							# no stored SeqFeature objects, probably names pointing to a database
1646							# we will have to fetch the feature from a database
1647	0	0				0	my $db = $self->open_meta_database(1) or # force open a new db connection
1648							confess "No SeqFeature objects stored and no database connection!";
1649	0					0	my $name_i = $self->name_column;
1650	0					0	my $id_i = $self->id_column;
1651	0					0	my $type_i = $self->type_column;
1652	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
1653	0	0	0			0	my $feature = get_db_feature(
			0
			0
1654							db => $db,
1655							name => $self->value($i, $name_i) \|\| undef,
1656							type => $self->value($i, $type_i) \|\| undef,
1657							id => $self->value($i, $id_i) \|\| undef,
1658							) or next;
1659	0	0				0	my $collSeqFeat = collapse_transcripts($feature) or next;
1660	0					0	$success += $self->store_seqfeature($i, $collSeqFeat);
1661							}
1662							}
1663
1664	0					0	return $success;
1665							}
1666
1667							sub add_transcript_length {
1668	0			0	1	0	my $self = shift;
1669	0		0			0	my $subfeature = shift \|\| 'exon';
1670	0	0				0	unless (exists $self->{SeqFeatureObjects}) {
1671	0					0	carp "no SeqFeature objects stored for collapsing!";
1672	0					0	return;
1673							}
1674
1675							# load module
1676	0					0	eval {load "Bio::ToolBox::GeneTools", qw(get_transcript_length get_transcript_cds_length
	0					0
1677							get_transcript_5p_utr_length get_transcript_3p_utr_length)};
1678	0	0				0	if ($@) {
1679	0					0	carp "unable to load Bio::ToolBox::GeneTools! cannot collapse transcripts!";
1680	0					0	return;
1681							}
1682
1683							# determine name and calculation method
1684	0					0	my $length_calculator;
1685							my $length_name;
1686	0	0				0	if ($subfeature eq 'exon') {
		0
		0
		0
1687	0					0	$length_calculator = \&get_transcript_length;
1688	0					0	$length_name= 'Merged_Transcript_Length';
1689							}
1690							elsif ($subfeature eq 'cds') {
1691	0					0	$length_calculator = \&get_transcript_cds_length;
1692	0					0	$length_name= 'Transcript_CDS_Length';
1693							}
1694							elsif ($subfeature eq '5p_utr') {
1695	0					0	$length_calculator = \&get_transcript_5p_utr_length;
1696	0					0	$length_name= 'Transcript_5p_UTR_Length';
1697							}
1698							elsif ($subfeature eq '3p_utr') {
1699	0					0	$length_calculator = \&get_transcript_3p_utr_length;
1700	0					0	$length_name= 'Transcript_3p_UTR_Length';
1701							}
1702							else {
1703	0					0	carp "unrecognized subfeature type '$subfeature'! No length calculated";
1704	0					0	return;
1705							}
1706
1707							# add new column and calculate
1708	0					0	my $length_i = $self->add_column($length_name);
1709	0	0				0	if (exists $self->{SeqFeatureObjects}) {
1710							# we should have stored SeqFeature objects, probably from parsed table
1711	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
1712	0	0				0	my $feature = $self->{SeqFeatureObjects}->[$i] or next;
1713	0					0	my $length = &$length_calculator($feature);
1714	0		0			0	$self->{data_table}->[$i][$length_i] = $length \|\| $feature->length;
1715							}
1716							}
1717							else {
1718							# no stored SeqFeature objects, probably names pointing to a database
1719							# we will have to fetch the feature from a database
1720	0	0				0	my $db = $self->open_meta_database(1) or # force open a new db connection
1721							confess "No SeqFeature objects stored and no database connection!";
1722	0					0	my $name_i = $self->name_column;
1723	0					0	my $id_i = $self->id_column;
1724	0					0	my $type_i = $self->type_column;
1725	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
1726	0	0	0			0	my $feature = get_db_feature(
			0
			0
1727							db => $db,
1728							name => $self->value($i, $name_i) \|\| undef,
1729							type => $self->value($i, $type_i) \|\| undef,
1730							id => $self->value($i, $id_i) \|\| undef,
1731							) or next;
1732	0					0	my $length = &$length_calculator($feature);
1733	0		0			0	$self->{data_table}->[$i][$length_i] = $length \|\| $feature->length;
1734	0					0	$self->store_seqfeature($i, $feature);
1735							# to store or not to store? May explode memory, but could save from
1736							# expensive db calls later
1737							}
1738							}
1739
1740	0					0	return $length_i;
1741							}
1742
1743
1744
1745							### Data structure manipulation
1746
1747							sub sort_data {
1748	1			1	1	1334	my $self = shift;
1749	1					2	my $index = shift;
1750	1		50			3	my $direction = shift \|\| 'i';
1751
1752							# confirm options
1753	1	50				3	return unless exists $self->{$index}{name};
1754	1	50				4	unless ($direction =~ /^[id]/i) {
1755	0					0	carp "unrecognized sort order '$direction'! Must be i or d";
1756	0					0	return;
1757							}
1758
1759							# Sample the dataset values
1760							# this will be used to guess the sort method, below
1761	1					1	my $example; # an example of the dataset
1762	1					4	foreach (my $i = 1; $i <= $self->last_row; $i++) {
1763							# we want to avoid null values, so keep trying
1764							# null being . or any variation of N/A, NaN, inf
1765	1					3	my $v = $self->value($i, $index);
1766	1	50	33			8	if (defined $v and $v !~ /^(?:\.\|n\/?a\|nan\|\-?inf)$/i) {
1767							# a non-null value, take it
1768	1					2	$example = $v;
1769	1					1	last;
1770							}
1771							}
1772
1773							# Determine sort method, either numerical or alphabetical
1774	1					2	my $sortmethod;
1775	1	50				3	if ($example =~ /[a-z]/i) {
		0
1776							# there are detectable letters
1777	1					1	$sortmethod = 'ascii';
1778							}
1779							elsif ($example =~ /^\-?\d+\.?\d*$/) {
1780							# there are only digits, allowing for minus sign and a decimal point
1781							# I don't think this allows for exponents, though
1782	0					0	$sortmethod = 'numeric';
1783							}
1784							else {
1785							# unable to determine (probably alphanumeric), sort asciibetical
1786	0					0	$sortmethod = 'ascii';
1787							}
1788
1789							# Re-order the datasets
1790							# Directly sorting the @data array is proving difficult. It keeps giving me
1791							# a segmentation fault. So I'm using a different approach by copying the
1792							# @data_table into a temporary hash.
1793							# put data_table array into a temporary hash
1794							# the hash key will the be dataset value,
1795							# the hash value will be the reference the row data
1796	1					2	my %datahash;
1797
1798							# reorder numerically
1799	1	50				4	if ($sortmethod eq 'numeric') {
		50
1800	0					0	for my $row (1..$self->last_row) {
1801	0					0	my $value = $self->value($row, $index);
1802
1803							# check to see whether this value exists or not
1804	0					0	while (exists $datahash{$value}) {
1805							# add a really small number to bump it up and make it unique
1806							# this, of course, presumes that none of the dataset values
1807							# are really this small - this may be an entirely bad
1808							# assumption!!!!! I suppose we could somehow calculate an
1809							# appropriate value.... nah.
1810							# don't worry, we're only modifying the value used for sorting,
1811							# not the actual value
1812	0					0	$value += 0.00000001;
1813							}
1814							# store the row data reference
1815	0					0	$datahash{$value} = $self->{data_table}->[$row];
1816							}
1817
1818							# re-fill the array based on the sort direction
1819	0	0				0	if ($direction =~ /^i/i) {
1820							# increasing sort
1821	0					0	my $i = 1; # keep track of the row, skip the header
1822	0					0	foreach (sort {$a <=> $b} keys %datahash) {
	0					0
1823							# put back the reference to the anonymous array of row data
1824	0					0	$self->{data_table}->[$i] = $datahash{$_};
1825	0					0	$i++; # increment for next row
1826							}
1827							}
1828							else {
1829							# decreasing sort
1830	0					0	my $i = 1; # keep track of the row, skip the header
1831	0					0	foreach (sort {$b <=> $a} keys %datahash) {
	0					0
1832							# put back the reference to the anonymous array of row data
1833	0					0	$self->{data_table}->[$i] = $datahash{$_};
1834	0					0	$i++; # increment for next row
1835							}
1836							}
1837
1838							# summary prompt
1839	0					0	printf " Data table sorted numerically by the contents of %s\n",
1840							$self->name($index);
1841							}
1842
1843							# reorder asciibetically
1844							elsif ($sortmethod eq 'ascii') {
1845	1					3	for my $row (1..$self->last_row) {
1846							# get the value to sort by
1847	78					83	my $value = $self->value($row, $index);
1848	78	50				92	if (exists $datahash{$value}) {
1849							# not unique
1850	0					0	my $n = 1;
1851	0					0	my $lookup = $value . sprintf("03%d", $n);
1852							# we'll try to make a unique value by appending
1853							# a number to the original value
1854	0					0	while (exists $datahash{$lookup}) {
1855							# keep bumping up the number till it's unique
1856	0					0	$n++;
1857	0					0	$lookup = $value . sprintf("03%d", $n);
1858							}
1859	0					0	$datahash{$lookup} = $self->{data_table}->[$row];
1860							}
1861							else {
1862							# unique
1863	78					119	$datahash{$value} = $self->{data_table}->[$row];
1864							}
1865							}
1866
1867							# re-fill the array based on the sort direction
1868	1	50	33			6	if ($direction eq 'i' or $direction eq 'I') {
		50	33
1869							# increasing
1870	0					0	my $i = 1; # keep track of the row
1871	0					0	foreach (sort {$a cmp $b} keys %datahash) {
	0					0
1872							# put back the reference to the anonymous array of row data
1873	0					0	$self->{data_table}->[$i] = $datahash{$_};
1874	0					0	$i++; # increment for next row
1875							}
1876							}
1877							elsif ($direction eq 'd' or $direction eq 'D') {
1878							# decreasing
1879	1					2	my $i = 1; # keep track of the row
1880	1					18	foreach (sort {$b cmp $a} keys %datahash) {
	401					340
1881							# put back the reference to the anonymous array of row data
1882	78					74	$self->{data_table}->[$i] = $datahash{$_};
1883	78					69	$i++; # increment for next row
1884							}
1885							}
1886
1887							# summary prompt
1888	1					5	printf " Data table sorted asciibetically by the contents of '%s'\n",
1889							$self->name($index);
1890							}
1891
1892	1					11	return 1;
1893							}
1894
1895							sub gsort_data {
1896	1			1	1	3	my $self = shift;
1897
1898							# identify indices
1899	1	50				2	unless ($self->feature_type eq 'coordinate') {
1900	0					0	carp "no chromosome and start/position columns to sort!\n";
1901	0					0	return;
1902							}
1903	1					3	my $chromo_i = $self->chromo_column;
1904	1					3	my $start_i = $self->start_column;
1905	1		33			3	my $stop_i = $self->stop_column \|\| $start_i;
1906
1907							# collect the data by chromosome: start and end
1908	1					1	my %chrom2row;
1909	1					3	for my $row (1 .. $self->last_row) {
1910	78					85	my $c = $self->{data_table}->[$row]->[$chromo_i];
1911	78		100			94	$chrom2row{$c} \|\|= [];
1912	78					140	push @{ $chrom2row{$c} }, [
1913							$self->{data_table}->[$row]->[$start_i],
1914							$self->{data_table}->[$row]->[$stop_i],
1915	78					67	$self->{data_table}->[$row]
1916							];
1917							}
1918
1919							# get sane chromosome order
1920	1					6	my @chroms = sane_chromo_sort(keys %chrom2row);
1921
1922							# sort the table
1923	1					1	my @sorted_data;
1924	1					3	push @sorted_data, $self->{data_table}->[0];
1925	1	50				3	if ($self->gff) {
1926							# sort by increasing start and decreasing end, i.e. decreasing length
1927							# this should mostly handle putting genes/transcripts before exons
1928	0					0	foreach my $chr (@chroms) {
1929							push @sorted_data,
1930	0					0	map { $_->[2] }
1931	0	0				0	sort { $a->[0] <=> $b->[0] or $b->[1] <=> $a->[1] }
1932	0					0	@{ $chrom2row{$chr} };
	0					0
1933							}
1934							}
1935							else {
1936							# sort by increasing start followed by increasing end, i.e. increasing length
1937	1					2	foreach my $chr (@chroms) {
1938							push @sorted_data,
1939	78					84	map { $_->[2] }
1940	245	50				302	sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] }
1941	1					1	@{ $chrom2row{$chr} };
	1					4
1942							}
1943							}
1944	1					4	$self->{data_table} = \@sorted_data;
1945	1					10	return 1;
1946							}
1947
1948							sub splice_data {
1949	1			1	1	3	my ($self, $part, $total_parts) = @_;
1950
1951	1	50	33			5	unless ($part and $total_parts) {
1952	0					0	confess "ordinal part and total number of parts not passed\n";
1953							}
1954	1					3	my $part_length = int($self->last_row / $total_parts);
1955
1956							# check for SeqFeatureObjects array
1957	1	50				2	if (exists $self->{SeqFeatureObjects}) {
1958							# it needs to be the same length as the data table, it should be
1959	0					0	while (scalar @{$self->{SeqFeatureObjects}} < scalar @{$self->{data_table}}) {
	0					0
	0					0
1960	0					0	push @{$self->{SeqFeatureObjects}}, undef;
	0					0
1961							}
1962							}
1963
1964							# splicing based on which part we do
1965	1	50				4	if ($part == 1) {
		50
1966							# remove all but the first part
1967							splice(
1968	0					0	@{$self->{'data_table'}},
	0					0
1969							$part_length + 1
1970							);
1971	0	0				0	if (exists $self->{SeqFeatureObjects}) {
1972							splice(
1973	0					0	@{$self->{SeqFeatureObjects}},
	0					0
1974							$part_length + 1
1975							);
1976							}
1977							}
1978							elsif ($part == $total_parts) {
1979							# remove all but the last part
1980							splice(
1981	1					2	@{$self->{'data_table'}},
	1					10
1982							1,
1983							$part_length * ($total_parts - 1)
1984							);
1985	1	50				3	if (exists $self->{SeqFeatureObjects}) {
1986							splice(
1987	0					0	@{$self->{SeqFeatureObjects}},
	0					0
1988							1,
1989							$part_length * ($total_parts - 1)
1990							);
1991							}
1992							}
1993							else {
1994							# splicing in the middle requires two rounds
1995
1996							# remove the last parts
1997							splice(
1998	0					0	@{$self->{'data_table'}},
	0					0
1999							($part * $part_length) + 1
2000							);
2001
2002							# remove the first parts
2003							splice(
2004	0					0	@{$self->{'data_table'}},
	0					0
2005							1,
2006							$part_length * ($part - 1)
2007							);
2008
2009	0	0				0	if (exists $self->{SeqFeatureObjects}) {
2010							splice(
2011	0					0	@{$self->{SeqFeatureObjects}},
	0					0
2012							($part * $part_length) + 1
2013							);
2014							splice(
2015	0					0	@{$self->{SeqFeatureObjects}},
	0					0
2016							1,
2017							$part_length * ($part - 1)
2018							);
2019							}
2020							}
2021
2022							# update last row metadata
2023	1					1	$self->{'last_row'} = scalar(@{$self->{'data_table'}}) - 1;
	1					3
2024
2025							# re-open a new un-cached database connection
2026	1	50				2	if (exists $self->{db_connection}) {
2027	0					0	delete $self->{db_connection};
2028							}
2029	1					2	return 1;
2030							}
2031
2032
2033
2034							### File functions
2035
2036							sub reload_children {
2037	1			1	1	3	my $self = shift;
2038	1					3	my @files = @_;
2039	1	50				3	return unless @files;
2040
2041							# prepare Stream
2042	1					2	my $class = "Bio::ToolBox::Data::Stream";
2043	1					1	eval {load $class};
	1					4
2044	1	50				52	if ($@) {
2045	0					0	carp "unable to load $class! can't reload children!";
2046	0					0	return;
2047							}
2048
2049							# open first stream
2050	1					1	my $first = shift @files;
2051	1					5	my $Stream = $class->new(in => $first);
2052	1	50				4	unless ($Stream) {
2053	0					0	carp "unable to load first child file $first";
2054	0					0	return;
2055							}
2056
2057							# destroy current data table and metadata
2058	1					2	$self->{data_table} = [];
2059	1					3	for (my $i = 0; $i < $self->number_columns; $i++) {
2060	0					0	delete $self->{$i};
2061							}
2062
2063							# copy the metadata
2064	1					2	foreach (qw(program feature db bed gff ucsc headers number_columns last_row)) {
2065							# various keys
2066	9					12	$self->{$_} = $Stream->{$_};
2067							}
2068	1					4	for (my $i = 0; $i < $Stream->number_columns; $i++) {
2069							# column metadata
2070	4					9	my %md = $Stream->metadata($i);
2071	4					10	$self->{$i} = \%md;
2072							}
2073	1					3	my @comments = $Stream->comments;
2074	1					1	push @{$self->{other}}, @comments;
	1					12
2075
2076							# first row column headers
2077	1					2	$self->{data_table}->[0] = [ @{ $Stream->{data_table}->[0] } ];
	1					3
2078
2079							# load the data
2080	1					4	while (my $row = $Stream->next_row) {
2081	39					51	$self->add_row($row);
2082							}
2083	1					31	$Stream->close_fh;
2084
2085							# load remaining files
2086	1					14	foreach my $file (@files) {
2087	2					21	my $Stream = $class->new(in => $file);
2088	2	50				5	if ($Stream->number_columns != $self->number_columns) {
2089	0					0	confess "fatal error: child file $file has a different number of columns!";
2090							}
2091	2					4	while (my $row = $Stream->next_row) {
2092	78					107	my $a = $row->row_values;
2093	78					110	$self->add_row($a);
2094							}
2095	2					59	$Stream->close_fh;
2096							}
2097	1					15	$self->verify;
2098
2099							# clean up
2100	1					112	unlink($first, @files);
2101	1					6	return $self->last_row;
2102							}
2103
2104
2105
2106
2107							### Export summary data file
2108							sub summary_file {
2109	0			0	1	0	my $self = shift;
2110
2111							# Collect passed arguments
2112	0					0	my %args = @_;
2113
2114							# parameters
2115							# either one or more datasets can be summarized
2116	0		0			0	my $outfile = $args{'filename'} \|\| undef;
2117	0					0	my @datasets;
2118	0	0	0			0	if ($args{dataset} and ref $args{dataset} eq 'ARRAY') {
		0	0
2119	0					0	@datasets = @{ $args{dataset} };
	0					0
2120							}
2121							elsif ($args{dataset} and ref $args{dataset} eq 'SCALAR') {
2122	0					0	push @datasets, $args{dataset};
2123							}
2124	0					0	my @startcolumns;
2125	0	0	0			0	if ($args{startcolumn} and ref $args{startcolumn} eq 'ARRAY') {
		0	0
2126	0					0	@startcolumns = @{ $args{startcolumn} };
	0					0
2127							}
2128							elsif ($args{startcolumn} and ref $args{startcolumn} eq 'SCALAR') {
2129	0					0	push @startcolumns, $args{startcolumn};
2130							}
2131	0					0	my @endcolumns;
2132	0		0			0	$args{endcolumn} \|\|= $args{stopcolumn};
2133	0	0	0			0	if ($args{endcolumn} and ref $args{endcolumn} eq 'ARRAY') {
		0	0
2134	0					0	@endcolumns = @{ $args{endcolumn} };
	0					0
2135							}
2136							elsif ($args{endcolumn} and ref $args{endcolumn} eq 'SCALAR') {
2137	0					0	push @endcolumns, $args{endcolumn};
2138							}
2139
2140
2141							# Check required values
2142	0	0				0	unless ($self->verify) {
2143	0					0	cluck "bad data structure!";
2144	0					0	return;
2145							}
2146	0	0				0	unless (defined $outfile) {
2147	0	0				0	if ($self->basename) {
2148							# use the opened file's filename if it exists
2149							# prepend the path if it exists
2150							# the extension will be added later
2151	0					0	$outfile = $self->path . $self->basename;
2152							}
2153							else {
2154	0					0	cluck "no filename passed to write_summary_data!\n";
2155	0					0	return;
2156							}
2157							}
2158
2159							# Identify possible dataset columns
2160	0					0	my %possibles;
2161	0					0	my %skip = map {$_ => 1} qw (systematicname name id alias aliases type class
	0					0
2162							geneclass chromosome chromo seq_id seqid start stop end gene strand
2163							length primary_id merged_transcript_length transcript_cds_length
2164							transcript_5p_utr_length transcript_3p_utr_length);
2165							# walk through the dataset names
2166	0					0	$possibles{unknown} = [];
2167	0					0	for (my $i = 0; $i < $self->number_columns; $i++) {
2168	0	0				0	if (not exists $skip{ lc $self->{$i}{'name'} }) {
2169	0		0			0	my $d = $self->metadata($i, 'dataset') \|\| undef;
2170	0	0				0	if (defined $d) {
2171							# we have what appears to be a dataset column
2172	0		0			0	$possibles{$d} \|\|= [];
2173	0					0	push @{$possibles{$d}}, $i;
	0					0
2174							}
2175							else {
2176							# still an unknown possibility
2177	0					0	push @{$possibles{unknown}}, $i;
	0					0
2178							}
2179							}
2180							}
2181
2182							# check datasets
2183	0	0				0	unless (@datasets) {
2184	0	0				0	if (scalar keys %possibles > 1) {
2185							# we will always have the unknown category, so anything more than one
2186							# means we found legitimate dataset columns
2187	0					0	delete $possibles{unknown};
2188							}
2189	0					0	@datasets = sort {$a cmp $b} keys %possibles;
	0					0
2190							}
2191
2192							# check starts
2193	0	0				0	if (scalar @startcolumns != scalar @datasets) {
2194	0					0	@startcolumns = (); # ignore what we were given?
2195	0					0	foreach my $d (@datasets) {
2196							# take the first column with this dataset
2197	0					0	push @startcolumns, $possibles{$d}->[0];
2198							}
2199							}
2200
2201							# check stops
2202	0	0				0	if (scalar @endcolumns != scalar @datasets) {
2203	0					0	@endcolumns = (); # ignore what we were given?
2204	0					0	foreach my $d (@datasets) {
2205							# take the last column with this dataset
2206	0					0	push @endcolumns, $possibles{$d}->[-1];
2207							}
2208							}
2209
2210
2211							# Prepare Data object to store the summed data
2212	0					0	my $summed_data = $self->new(
2213							feature => 'averaged_windows',
2214							columns => ['Window','Midpoint'],
2215							);
2216
2217							# Go through each dataset
2218	0					0	foreach my $d (0 .. $#datasets) {
2219
2220							# Prepare score column name
2221	0					0	my $data_name = simplify_dataset_name($datasets[$d]);
2222
2223							# add column
2224	0					0	my $i = $summed_data->add_column($data_name);
2225	0					0	$summed_data->metadata($i, 'dataset', $datasets[$d]);
2226
2227							# tag for remembering we're working with percentile bins
2228	0					0	my $do_percentile = 0;
2229
2230							# remember the row
2231	0					0	my $row = 1;
2232
2233							# Collect summarized data
2234	0					0	for (
2235							my $column = $startcolumns[$d];
2236							$column <= $endcolumns[$d];
2237							$column++
2238							) {
2239
2240							# determine the midpoint position of the window
2241							# this assumes the column metadata has start and stop
2242	0					0	my $midpoint = int(sum0($self->metadata($column, 'start'),
2243							$self->metadata($column, 'stop')) / 2);
2244
2245							# convert midpoint to fraction of 1000 for plotting if necessary
2246	0	0				0	if (substr($self->name($column), -1) eq '%') {
2247	0					0	$midpoint = 10; # midpoint 0.01 * 1000 bp
2248	0					0	$do_percentile++;
2249							}
2250	0	0	0			0	if ($do_percentile and substr($self->name($column), -2) eq 'bp') {
2251							# working on the extension after the percentile bins
2252	0					0	$midpoint += 1000;
2253							}
2254
2255							# collect the values in the column
2256	0					0	my @values;
2257	0					0	for my $row (1..$self->last_row) {
2258	0					0	my $v = $self->value($row, $column);
2259	0	0				0	push @values, $v eq '.' ? 0 : $v; # treat nulls as zero
2260							}
2261
2262							# adjust if log value
2263	0		0			0	my $log = $self->metadata($column, 'log2') \|\| 0;
2264	0	0				0	if ($log) {
2265	0					0	@values = map { 2 ** $_ } @values;
	0					0
2266							}
2267
2268							# determine mean value
2269	0					0	my $window_mean = sum0(@values) / scalar(@values);
2270	0	0				0	if ($log) {
2271	0					0	$window_mean = log($window_mean) / log(2);
2272							}
2273
2274							# push to summed output
2275	0	0				0	if ($d == 0) {
2276							# this is the first dataset, so we need to add a row
2277	0					0	$summed_data->add_row( [ $self->{$column}{'name'}, $midpoint, $window_mean ] );
2278							}
2279							else {
2280							# we're summarizing multiple datasets, we already have name midpoint
2281							# first do sanity check
2282	0	0				0	if ($summed_data->value($row, 1) != $midpoint) {
2283	0					0	carp("unable to summarize multiple datasets with nonequal columns of data!");
2284	0					0	return;
2285							}
2286	0					0	$summed_data->value($row, $i, $window_mean);
2287							}
2288	0					0	$row++;
2289							}
2290							}
2291
2292
2293
2294							# Write summed data
2295	0					0	$outfile =~ s/\.txt(\.gz)?$//i; # strip any .txt or .gz extensions if present
2296	0					0	my $written_file = $summed_data->write_file(
2297							'filename' => $outfile . '_summary.txt',
2298							'gz' => 0,
2299							);
2300	0					0	return $written_file;
2301							}
2302
2303
2304
2305
2306
2307							####################################################
2308
2309							package Bio::ToolBox::Data::Iterator;
2310	3			3		15620	use Bio::ToolBox::Data::Feature;
	3					5
	3					80
2311	3			3		16	use Carp;
	3					6
	3					579
2312
2313							sub new {
2314	9			9		18	my ($class, $data) = @_;
2315	9					30	my %iterator = (
2316							'index' => 1,
2317							'data' => $data,
2318							);
2319	9					26	return bless \%iterator, $class;
2320							}
2321
2322							sub next_row {
2323	154			154		1190	my $self = shift;
2324	154	100				264	return if $self->{'index'} > $self->{data}->{last_row}; # no more
2325	148					146	my $i = $self->{'index'};
2326	148					142	$self->{'index'}++;
2327							my @options = (
2328							'data' => $self->{data},
2329	148					217	'index' => $i,
2330							);
2331	148	50	66			271	if (
2332							exists $self->{data}->{SeqFeatureObjects} and
2333							defined $self->{data}->{SeqFeatureObjects}->[$i]
2334							) {
2335	0					0	push @options, 'feature', $self->{data}->{SeqFeatureObjects}->[$i];
2336							}
2337	148					292	return Bio::ToolBox::Data::Feature->new(@options);
2338							}
2339
2340							sub row_index {
2341	0			0			my $self = shift;
2342	0	0					carp "row_index is a read only method" if @_;
2343	0						return $self->{'index'};
2344							}
2345
2346
2347
2348							####################################################
2349
2350							__END__