File Coverage

blib/lib/Bio/ToolBox/Data.pm

Criterion	Covered	Total	%
statement	321	614	52.2
branch	108	290	37.2
condition	59	168	35.1
subroutine	30	35	85.7
pod	22	22	100.0
total	540	1129	47.8

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::ToolBox::Data;
2							our $VERSION = '1.68';
3
4							=head1 NAME
5
6							Bio::ToolBox::Data - Reading, writing, and manipulating data structure
7
8							=head1 SYNOPSIS
9
10							use Bio::ToolBox::Data;
11
12							### Open a pre-existing file
13							# a data table with same columns as input file
14							my $Data = Bio::ToolBox::Data->new(
15							file => 'coordinates.bed',
16							);
17
18							### Parse a GTF, GFF3, refFlat, or genePred gene table
19							# data table with names and references to fully parsed
20							# SeqFeature transcript objects with exon SeqFeature subfeatures
21							my $Data = Bio::ToolBox::Data->new(
22							file => 'annotation.gtf.gz',
23							parse => 1,
24							feature => 'transcript',
25							subfeature => 'exon'
26							);
27
28							### New gene list from a Bio::DB::SeqFeature::Store database
29							# data table with name and reference ID to database SeqFeature objects
30							my $Data = Bio::ToolBox::Data->new(
31							db => 'hg19.sqlite',
32							feature => 'gene:ensGene',
33							);
34
35
36							### Get a specific value
37							my $value = $Data->value($row, $column);
38
39							### Replace or add a value
40							$Data->value($row, $column, $new_value);
41
42							### Add a column
43							my $new_index = $Data->add_column('Data2');
44
45							### Find a column
46							my $old_index = $Data->find_column('Data1');
47
48							### Return a specific row as an object
49							my $row = $Data->get_row($i); # Bio::ToolBox::Data::Feature
50
51							### Iterate through a Data structure one row at a time
52							my $stream = $Data->row_stream;
53							while (my $row = $stream->next_row) {
54							# each row is returned as Bio::ToolBox::Data::Feature object
55							# get the positional information from the file data
56							# assuming that the input file had these identifiable columns
57							my $seq_id = $row->seq_id;
58							my $start = $row->start;
59							my $stop = $row->end;
60
61							# work with the referenced SeqFeature object
62							my $seqf = $row->seqfeature;
63
64							# generate a new Bio::Seq object from the database using
65							# these coordinates
66							my $region = $db->segment($seq_id, $start, $stop);
67
68							# modify a row value
69							my $value = $row->value($old_index);
70							my $new_value = $value + 1;
71							$row->value($new_index, $new_value);
72							}
73
74							### Iterate through a Data table with a code reference
75							$Data->iterate(\&my_code);
76
77
78							### write the data to file
79							my $success = $Data->write_file(
80							filename => 'new_data.txt',
81							gz => 1,
82							);
83							print "wrote new file $success\n"; # file is new_data.txt.gz
84
85							=head1 DESCRIPTION
86
87							This module works with the primary Bio::ToolBox Data structure. Simply, it
88							is a complex data structure representing a tabbed-delimited table (array
89							of arrays), with plenty of options for metadata. Many common bioinformatic
90							file formats are simply tabbed-delimited text files (think BED, GFF, VCF).
91							Each row is a feature or genomic interval, and each column is a piece of
92							information about that feature, such as name, type, and/or coordinates.
93							We can append to that file additional columns of information, perhaps
94							scores from genomic data sets. We can record metadata regarding how
95							and where we obtained that data. Finally, we can write the updated
96							table to a new file.
97
98							=head1 METHODS
99
100							=head2 Initializing the structure
101
102							=over 4
103
104							=item new
105
106							Initialize a new Data structure. This generally requires options,
107							provided as an array of key =E values. A new list of features
108							may be obtained from an annotation database or an existing file
109							may be loaded. If you do not pass any options, a new empty
110							structure will be generated for you to populate.
111
112							These are the options available.
113
114							=over 4
115
116							=item file
117
118							=item in
119
120							my $Data = Bio::ToolBox::Data->new(file => $file);
121
122							Provide the path and name to an existing tabbed-delimited text
123							file from which to load the contents. This is a shortcut to the
124							load_file() method. See that method for more details.
125
126							=item stream
127
128							my $Data = Bio::ToolBox::Data->new(file => $file, stream => 1);
129
130							Boolean option indicating that the file should be opened as a file
131							stream. A L object will be returned. This
132							is a convenience method.
133
134							=item noheader
135
136							my $Data = Bio::ToolBox::Data->new(file => $file, noheader => 1);
137
138							Boolean option indicating that the file does not have file headers,
139							in which case dummy headers are provided. This is not necessary for
140							defined file types that don't normally have file headers, such as
141							BED, GFF, or UCSC files.
142
143							=item parse
144
145							my $Data = Bio::ToolBox::Data->new(file => $file, parse => 1);
146
147							Boolean option indicating that a gene annotation table or file should
148							be parsed into SeqFeature objects and a general table of names and IDs
149							representing those objects be generated. The annotation file may
150							be specified in one of two ways: Through the file option above,
151							or in the database metadata of an existing table file representing
152							previously parsed objects.
153
154							=item db
155
156							my $Data = Bio::ToolBox::Data->new(db => 'hg19', feature => 'gene');
157
158							Provide the name of the database from which to collect the
159							features. It may be a short name, whereupon it is checked in
160							the L configuration file F<.biotoolbox.cfg> for
161							connection information. Alternatively, a path to a database
162							file or directory may be given.
163
164							If you already have an opened L database
165							object, you can simply pass that. See L for
166							more information. However, this in general should be discouraged,
167							since the name of the database will not be properly recorded when
168							saving to file. It is better to simply pass the name of database
169							again; multiple connections to the same database are smartly handled
170							in the background.
171
172							=item feature
173
174							my $Data = Bio::ToolBox::Data->new(file => $filename, feature => 'gene');
175
176							For de novo lists from an annotation database, provide the GFF
177							type or type:source (columns 3 and 2) for collection. A comma
178							delimited string may be accepted for databases. For parsed files,
179							only a simple string is accepted.
180
181							For a list of genomic intervals across the genome, specify a
182							feature of 'genome' with a database object.
183
184							=item subfeature
185
186							When parsing annotation files such as GTF, one or more subfeature
187							types, e.g. C or C, may be specified as a comma-delimited
188							string. This ensures that the subfeatures will be parsed into
189							SeqFeature objects. Otherwise, only the top level features will be
190							parsed. This expedites parsing by skipping unwanted features.
191
192							=item win
193
194							=item step
195
196							my $Data = Bio::ToolBox::Data->new(db => $dbname, win => $integer);
197
198							If generating a list of genomic intervals, optionally provide
199							the window and step values. The default is 500 bp.
200
201							=item chrskip
202
203							Provide a regular expression compatible or C string for skipping or
204							excluding specific or classes of chromosomes, for example the mitochondrial
205							chromosome or unmapped contigs. This works with both feature collection
206							and genomic intervals. The default is to take all chromosomes.
207
208							=back
209
210							When no file is given or database given to search, then a new,
211							empty Data object is returned. In this case, you may optionally
212							specify the names of the columns or indicate a specific file
213							format structure to assign column names. The following options can
214							then be provided.
215
216							=over 4
217
218							=item columns
219
220							=item datasets
221
222							my $Data = Bio::ToolBox::Data->new(columns => [qw(Column1 Column2 ...)] );
223
224							Provide the column names in advance as an anonymous array.
225
226							=item gff
227
228							Pass the GFF version of the file: 1, 2, 2.5 (GTF), or 3.
229
230							=item bed
231
232							Pass the number of BED columns (3-12).
233
234							=item ucsc
235
236							Pass the number of columns to indicate the type of UCSC format. These
237							include 10 (refFlat without gene names), 11 (refFlat with gene names),
238							12 (knownGene gene prediction table), and 15
239							(an extended gene prediction or genePredExt table).
240
241							=back
242
243							If successful, the method will return a Bio::ToolBox::Data object.
244
245							=item duplicate
246
247							This will create a new Data object containing the same column
248							headers and metadata, but lacking the table content, i.e. no
249							rows of data. File name metadata, if present in the original, is
250							not preserved. The purpose here, for example, is to allow one
251							to selectively copy rows from one Data object to another.
252
253							=item parse_table
254
255							$Data->parse_table($file)
256							$Data->parse_table( {
257							file => $file,
258							feature => 'gene',
259							subfeature => 'exon',
260							chrskip => 'chrM\|contig',
261							} );
262
263							This will parse a gene annotation table into SeqFeature objects.
264							If this is called from an empty Data object, then the table will
265							be filled with the SeqFeature object names and IDs. If this is
266							called from a non-empty Data object, then the table's contents
267							will be associated with the SeqFeature objects using their name and
268							ID. The stored SeqFeature objects can be retrieved using the
269							L method.
270
271							Pass the method a single argument. This may be either a simple
272							scalar to a filename to be parsed, or a reference to hash of
273							one or more argument options. Possible options include:
274
275							=over 4
276
277							=item file
278
279							The file name of the gene table to be parsed. This may
280							be a GFF, GFF3, GTF, or any of the UCSC gene table formats.
281							These will be parsed using Bio::ToolBox::parser::* adapters.
282
283							=item feature
284
285							A regular expression compatible string or C string to match
286							the top features C to keep. The C is not
287							checked. The default is 'gene', although a transcript type could
288							alternatively be specified (in which case transcripts are not
289							parsed in gene features).
290
291							=item subfeature
292
293							A regular expression compatible string or C string to match
294							any sub features C to parse. The C is not checked.
295							Typically these include exon, CDS, or UTR. The default is nothing.
296
297							=item chrskip
298
299							A regular expression compatible string or C string to match
300							chromosomes to be skipped or excluded. Any feature with a matching
301							C chromosome name will be skipped.
302
303							=back
304
305							=back
306
307							=head2 General Metadata
308
309							There is a variety of general metadata regarding the Data
310							structure. The following methods may be used to access or set these
311							metadata properties. Some of these are stored as comment lines at
312							the beginning of the file, and will be read or set when the file is
313							loaded.
314
315							=over
316
317							=item feature
318
319							$Data->feature('something');
320							my $feature = $Data->feature;
321
322							Returns or sets the name of the features used to collect
323							the list of features. The actual feature types are listed
324							in the table, so this metadata is merely descriptive.
325
326							=item feature_type
327
328							Returns one of three specific values describing the contents
329							of the data table inferred by the presence of specific column
330							names. This provides a clue as to whether the table features
331							represent genomic regions (defined by coordinate positions) or
332							named database features. The return values include:
333
334							=over 4
335
336							=item * coordinate
337
338							Table includes at least chromosome and start columns.
339
340							=item * named
341
342							Table includes name, type, and/or Primary_ID, possibly
343							referring to named database features.
344
345							=item * unknown
346
347							Table is unrecognized format.
348
349							=back
350
351							=item program
352
353							Returns or sets the name of the program generating the list.
354
355							=item database
356
357							Returns or sets the name or path of the database from which the
358							features were derived.
359
360							=item gff
361
362							Returns or sets the version of loaded GFF files. Supported versions
363							included 1, 2, 2.5 (GTF), and 3.
364
365							=item bed
366
367							Returns or sets the BED file version. Here, the BED version is simply
368							the number of columns.
369
370							=item ucsc
371
372							Returns or sets the UCSC file format version. Here, the version is
373							simply the number of columns. Supported versions include 10 (gene
374							prediction), 11 (refFlat, or gene prediction with gene name), 12
375							(knownGene table), 15 (extended gene prediction), or 16 (extended
376							gene prediction with bin).
377
378							=item vcf
379
380							Returns or sets the VCF file version number. VCF support is limited.
381
382							=back
383
384							=head2 File information
385
386							These methods provide information about the file from which the
387							data table was loaded. This does not include parsed annotation tables.
388
389							=over 4
390
391							=item filename
392
393							=item path
394
395							=item basename
396
397							=item extension
398
399							Returns the filename, full path, basename, and extension of
400							the filename. Concatenating the last three values will reconstitute
401							the first original filename.
402
403							=item add_file_metadata
404
405							$Data->add_file_metadata('/path/to/file.txt');
406
407							Add filename metadata. This will automatically parse the path,
408							basename, and recognized extension from the passed filename and
409							set the appropriate metadata attributes.
410
411							=back
412
413							=head2 Metadata comments
414
415							Comments are any other commented lines from a text file (lines
416							beginning with a #) that were not parsed as general metadata.
417
418							=over 4
419
420							=item comments
421
422							Returns a copy of the array containing commented lines. Each
423							comment line becomes an element in the array.
424
425							=item add_comment
426
427							Appends the text string to the comment array.
428
429							=item delete_comment
430
431							Deletes a comment. Provide the array index of the comment to
432							delete. If an index is not provided, ALL comments will be deleted!
433
434							=item vcf_headers
435
436							For VCF files, this will partially parse the VCF headers into a
437							hash structure that can be queried or manipulated. Each header
438							line is parsed for the primary key, being the first word after the
439							## prefix, e.g. INFO, FORMAT, FILTER, contig, etc. For the simple
440							values, they are stored as the value. For complex entries, such as
441							with INFO and FORMAT, a second level hash is created with the ID
442							extracted and used as the second level key. The value is always the
443							always the remainder of the string.
444
445							For example, the following would be a simple parsed vcf header in
446							code representation.
447
448							$vcf_header = {
449							FORMAT => {
450							GT = q(ID=GT,Number=1,Type=String,Description="Genotype"),
451							AD = q(ID=AD,Number=.,Type=Integer,Description="ref,alt Allelic depths"),
452							},
453							fileDate => 20150715,
454							}
455
456							=item rewrite_vcf_headers
457
458							If you have altered the vcf headers exported by the vcf_headers()
459							method, then this method will rewrite the hash structure as new
460							comment lines. Do this prior to writing or saving the Data sturcture
461							or else you will lose your changed VCF header metadata.
462
463							=back
464
465							=head2 The Data table
466
467							The Data table is the array of arrays containing all of the
468							actual information. Rows and columns are indexed using 0-based
469							indexing as with all Perl arrays. Row 0 is always the column
470							header row containing the column names, regardless whether an
471							actual header name existed in the original file format (e.g.
472							BED or GFF formats). Any individual table "cell" can be
473							specified as C<[$row][$column]>.
474
475							=over 4
476
477							=item list_columns
478
479							Returns an array or array reference of the column names
480							in ascending (left to right) order.
481
482							=item number_columns
483
484							Returns the number of columns in the Data table.
485
486							=item last_column
487
488							Returns the array index number for the last (right most)
489							column. This number is always 1 less than the value
490							returned by number_columns() due to 0-based indexing.
491
492							=item last_row
493
494							Returns the array index number of the last row.
495							Since the header row is index 0, this is also the
496							number of actual content rows.
497
498							=item column_values
499
500							my $values = $Data->column_values($i);
501
502							Returns an array or array reference representing the values
503							in the specified column. This includes the column header as
504							the first element. Pass the method the column index.
505
506							=item add_column
507
508							# add empty column with name
509							my $i = $Data->add_column($name);
510
511							# add entire column
512							my $i = $Data->add_column(\@array);
513
514							Appends a new column to the Data table at the
515							rightmost position (highest index). It adds the column
516							header name and creates a new column metadata hash.
517							Pass the method one of two possibilities. Pass a text
518							string representing the new column name, in which case
519							no data will be added to the column. Alternatively, pass
520							an array reference, and the contents of the array will
521							become the column data. If the Data table already has
522							rows, then the passed array reference must have the same
523							number of elements.
524
525							It returns the new column index if successful.
526
527							=item copy_column
528
529							my $j = $Data->copy_column($i);
530
531							This will copy a column, appending the duplicate column at
532							the rightmost position (highest index). It will duplicate
533							column metadata as well. It will return the new index
534							position.
535
536							=item delete_column
537
538							$Data->delete_column($i, $j);
539
540							Deletes one or more specified columns. Any remaining
541							columns rightwards will have their indices shifted
542							down appropriately. If you had identified one of the
543							shifted columns, you may need to re-find or calculate
544							its new index.
545
546							=item reorder_column
547
548							$Data->reorder_column($c,$b,$a,$a);
549
550							Reorders columns into the specified order. Provide the
551							new desired order of indices. Columns could be duplicated
552							or deleted using this method. The columns will adopt their
553							new index numbers.
554
555							=item add_row
556
557							$Data->add_row(\@values);
558							$Data->add_row($Row); # Bio::ToolBox::Data::Feature object
559
560							Add a new row of data values to the end of the Data table.
561							Optionally provide either a reference to an array of values
562							to put in the row, or pass a L
563							Row object, such as one obtained from another Data object.
564							If the number of columns do not match, the array is filled
565							up with null values for missing columns, or excess values
566							are dropped.
567
568							=item get_row
569
570							$Data->get_row($i); # Bio::ToolBox::Data::Feature object
571
572							Returns the specified row as a L
573							object.
574
575							=item delete_row
576
577							$Data->delete_row($i, $j);
578
579							Deletes one or more specified rows. Rows are spliced out
580							highest to lowest index to avoid issues. Be very careful
581							deleting rows while simultaneously iterating through the
582							table!
583
584							=item row_values
585
586							my $row_values = $Data->row_values($i);
587
588							Returns a copy of an array for the specified row index.
589							Modifying this returned array does not migrate back to the
590							Data table; Use the L method below instead.
591
592							=item value
593
594							my $value = $Data->value($row, $column);
595							$Data->value($row, $column, $new_value);
596
597							Returns or sets the value at a specific row or column index.
598							Index positions are 0-based (header row is index 0).
599
600							=back
601
602							=head2 Column Metadata
603
604							Each column has metadata. Each metadata is a series of key =E
605							value pairs. The minimum keys are 'index' (the 0-based index
606							of the column) and 'name' (the column header name). Additional
607							keys and values may be queried or set as appropriate. When the
608							file is written, these are stored as commented metadata lines at
609							the beginning of the file.
610
611							=over 4
612
613							=item name
614
615							$Data->name($index, $new_name);
616							my $name = $Data->name($i);
617
618							Convenient method to return the name of the column given the
619							index number. A column may also be renamed by passing a new name.
620
621							=item metadata
622
623							$Data->metadata($index, $key, $new_value);
624							my $value = $Data->metadata($index, $key)
625
626							Returns or sets the metadata value for a specific $key for a
627							specific column $index.
628
629							This may also be used to add a new metadata key. Simply provide
630							the name of a new $key that is not present
631
632							If no key is provided, then a hash or hash reference is returned
633							representing the entire metadata for that column.
634
635							=item copy_metadata
636
637							$Data->copy_metadata($source, $target);
638
639							This method will copy the metadata (everything except name and
640							index) between the source column and target column. Returns 1 if
641							successful.
642
643							=item delete_metadata
644
645							$Data->delete_metadata($index, $key);
646
647							Deletes a column-specific metadata $key and value for a specific
648							column $index. If a $key is not provided, then all metadata keys
649							for that index will be deleted.
650
651							=item find_column
652
653							my $i = $Data->find_column('Gene');
654							my $i = $Data->find_column('^Gene$')
655
656							Searches the column names for the specified column name. This
657							employs a case-insensitive grep search, so simple substitutions
658							may be made.
659
660							=item chromo_column
661
662							=item start_column
663
664							=item stop_column
665
666							=item strand_column
667
668							=item name_column
669
670							=item type_column
671
672							=item id_column
673
674							These methods will return the identified column best matching
675							the description. Returns C if that column is not present.
676							These use the L method with a predefined list of
677							aliases.
678
679							=back
680
681							=head2 Working with databases
682
683							These are methods for working primarily with annotation databases.
684
685							=over 4
686
687							=item open_database
688
689							This is wrapper method that tries to do the right thing and passes
690							on to either L or L methods.
691							Basically a legacy method for L.
692
693							=item open_meta_database
694
695							Open the database that is listed in the metadata. Returns the
696							database connection. Pass a true value to force a new database
697							connection to be opened, rather than returning a cached connection
698							object (useful when forking).
699
700							=item open_new_database
701
702							Convenience method for opening a second or new database that is
703							not specified in the metadata, useful for data collection. This
704							is a shortcut to L.
705							Pass the database name.
706
707							=back
708
709							=head2 SeqFeature Objects
710
711							SeqFeature objects corresponding to data rows can be stored in the Data
712							object. This can be useful if the SeqFeature object is not readily
713							available from a database or is processor intensive in generating or
714							parsing. Note that storing large numbers of objects will increase memory
715							usage.
716
717							SeqFeature objects are usually either L,
718							L, or L objects, depending
719							upon their source. More information can obtained from the L
720							abstract API.
721
722							=over 4
723
724							=item store_seqfeature
725
726							$Data->store_seqfeature($row_index, $seqfeature);
727
728							Stores the SeqFeature object for the given row index. Only one SeqFeature
729							object can be stored per row.
730
731							=item get_seqfeature
732
733							my $feature = $Data->get_seqfeature($row_index);
734
735							Retrieves the SeqFeature object for the given row index.
736
737							=item delete_seqfeature
738
739							$Data->store_seqfeature($row_index);
740
741							Removes the SeqFeature object for the given row index.
742
743							=item collapse_gene_transcripts
744
745							my $success = $Data->collapse_gene_transcripts;
746
747							This method will iterate through a Data table and collapse multiple alternative
748							transcript subfeatures of stored gene SeqFeature objects in the table. Exons
749							of multiple transcripts will be merged, maximizing exon size and minimizing
750							intron size. Genes with only one transcript will not be affected. Stored
751							SeqFeature objects that are do not have a C of "gene" are silently
752							skipped. Refer to the L method
753							for more details. The number of rows successfully collapsed is returned.
754
755							=item add_transcript_length
756
757							my $length_index = $Data->add_transcript_length;
758							my $length_index = $Data->add_transcript_length('cds');
759
760							This method will generate a new column in the Data table representing the
761							length of a transcript, i.e. the sum of the length of subfeatures for
762							the stored SeqFeature object in the Data table. The default subfeature is
763							C; however, alternative subfeature types may be passed to the method
764							and used. These include C, C<5p_utr>, and C<3p_utr> for CDS, the 5C<'> UTR,
765							and the 3C<'> UTR, respectively. See the corresponding transcript length
766							methods in L for more information. If a length
767							is not calculated, for example the feature C is a "gene",
768							then the simple length of the feature is recorded.
769
770							The name of the new column is one of "Merged_Transcript_Length" for exon,
771							"Transcript_CDS_Length", "Transcript_5p_UTR_Length", or "Transcript_3p_UTR_Length".
772							The index of the new length column is returned.
773
774							=back
775
776							=head2 Data Table Functions
777
778							These methods alter the Data table en masse.
779
780							=over 4
781
782							=item verify
783
784							This method will verify the Data structure, including the metadata and the
785							Data table. It ensures that the table has the correct number of rows and
786							columns as described in the metadata, and that each column has the basic
787							metadata.
788
789							If the Data structure is marked as a GFF or BED structure, then the table
790							is checked that the structure matches the proper format. If not, for
791							example when additional columns have been added, then the GFF or BED value
792							is set to null.
793
794							This method is automatically called prior to writing the Data table to file.
795
796							=item sort_data
797
798							$Data->sort_data($index, 'i'); # increasing sort
799
800							This method will sort the Data table by the values in the indicated column.
801							It will automatically determine whether the contents of the column are
802							numbers or alphanumeric, and will sort accordingly, either numerically or
803							asciibetically. The first non-null value in the column is used to determine.
804							The sort may fail if the values are not consistent. Pass a second optional
805							value to indicate the direction of the sort. The value should be either
806							'i' for 'increasing' or 'd' for 'decreasing'. The default order is
807							increasing.
808
809							=item gsort_data
810
811							This method will sort the Data table by increasing genomic coordinates. It
812							requires the presence of chromosome and start (or position) columns,
813							identified by their column names. These are automatically identified.
814							Failure to find these columns mean a failure to sort the table. Chromosome
815							names are sorted first by their digits (e.g. chr2 before chr10), and then
816							alphanumerically. Base coordinates are sorted by increasing value.
817							Identical positions are kept in their original order.
818
819							=item splice_data
820
821							my $Data = Bio::ToolBox::Data->new(file => $file);
822							my $pm = Parallel::ForkManager->new(4);
823							for my $i (1..4) {
824							$pm->start and next;
825							### in child ###
826							$Data->splice_data($i, 4);
827							# do something with this portion
828							# then save to a temporary unique file
829							$Data->save("$file_$i");
830							$pm->finish;
831							}
832							$pm->wait_all_children;
833							# reload children files
834							$Data->reload_children(glob "$file_*");
835
836							This method will splice the Data table into C<$total_parts> number of pieces,
837							retaining the C<$current_part> piece. The other parts are discarded. This
838							method is intended to be used when a program is forked into separate
839							processes, allowing each child process to work on a subset of the original
840							Data table.
841
842							Two values are passed to the method. The first is the current part number,
843							1-based. The second value is the total number of parts that the table
844							should be divided, corresponding to the number of concurrent processes.
845							One easy approach to forking is to use L. The
846							above example shows how to fork into four concurrent processes.
847
848							Since each forked child process is separate from their parent process,
849							their contents must be reloaded into the current Data object. The
850							L documentation recommends going through a disk
851							file intermediate. Therefore, write each child Data object to file using
852							a unique name. Once all children have been reaped, they can be reloaded
853							into the current Data object using the L method.
854
855							Remember that if you fork your script into child processes, any database
856							connections must be re-opened; they are typically not clone safe. If you
857							have an existing database connection by using the L method,
858							it should be automatically re-opened for you when you use the L
859							method, but you will need to call L again in the child
860							process to obtain the new database object.
861
862							=item reload_children
863
864							Discards the current data table in memory and reloads two or more files
865							written from forked children processes. Provide the name of the child
866							files in the order you want them loaded. The files will be automatically
867							deleted if successfully loaded. Returns the number of lines reloaded on
868							success.
869
870							=back
871
872							=head2 File Functions
873
874							The Data table may be read in from a file or written out as a file. In
875							all cases, it is a tab-delimited text file, whether as an ad hoc table
876							or a specific bioinformatic format, e.g. BED, GFF, etc. Multiple common
877							file formats are supported. Column headers are assumed, except in those
878							cases where it is not, e.g. BED, GFF, etc. Metadata may be included as
879							commented lines at the beginning of the file, prefixed with a C<#> symbol.
880							Reading and writing gzip compressed files is fully supported.
881
882							=over 4
883
884							=item load_file
885
886							$Data->load_file($filename);
887
888							This will load a file into a new, empty Data table. This function is
889							called automatically when a filename is provided to the L function.
890							The existence of the file is first checked (appending common missing
891							extensions as necessary), metadata and column headers processed and/or
892							generated from default settings, the content loaded into the table, and
893							the structure verified. Error messages may be printed if the structure or
894							format is inconsistent or doesn't match the expected format, e.g a file
895							with a F<.bed> extension doesn't match the UCSC specification.
896							Pass the name of the filename.
897
898							=item taste_file
899
900							my $flavor = $Data->taste_file($filename);
901							# returns gff, bed, ucsc, or undef
902
903							Tastes, or checks, a file for a certain flavor, or known gene file formats.
904							This is based on file extension, metadata headers, and/or file content
905							in the first 10 lines or so. Returns a string based on the file format.
906							Values include gff, bed, ucsc, or undefined. Useful for determining if
907							the file represents a known gene table format that lacks a defined file
908							extension, e.g. UCSC formats.
909
910							=item save
911
912							=item write_file
913
914							my $success = $Data->save;
915							my $success = $Data->save('my_file.txt');
916							my $success = $Data->save(filename => $file, gz => 1);
917							print "file $success was saved!\n";
918
919							Pass the file name to be written. If no file name is passed, then
920							the filename and path stored in the metadata are used, if present.
921
922							These methods will write the Data structure out to file. It will
923							be first verified as to proper structure. Opened BED and GFF files
924							are checked to see if their structure is maintained. If so, they
925							are written in the same format; if not, they are written as regular
926							tab-delimited text files.
927
928							You may pass additional options.
929
930							=over 4
931
932							=item filename
933
934							Optionally pass a new filename. Required for new objects; previous
935							opened files may be overwritten if a new name is not provided. If
936							necessary, the file extension may be changed; for example, BED files
937							that no longer match the defined format lose the .bed and gain a .txt
938							extension. Compression may or add or strip .gz as appropriate. If
939							a path is not provided, the current working directory is used.
940
941							=item gz
942
943							Boolean value to change the compression status of the output file. If
944							overwriting an input file, the default is maintain the compression status,
945							otherwise no compression. Pass a 0 for no compression, 1 for standard
946							gzip compression, or 2 for block gzip (bgzip) compression for tabix
947							compatibility.
948
949							=back
950
951							If the file save is successful, it will return the full path and
952							name of the saved file, complete with any changes to the file extension.
953
954							=item summary_file
955
956							Write a separate file summarizing columns of data (mean values).
957							The mean value of each column becomes a row value, and each column
958							header becomes a row identifier (i.e. the table is transposed). The
959							best use of this is to summarize the mean profile of windowed data
960							collected across a feature. See the Bio::ToolBox scripts
961							L and L as examples.
962							For data from L where the columns are expressed
963							as percentile bins, the reported midpoint column is automatically
964							converted based on a length of 1000 bp.
965
966							You may pass these options. They are optional.
967
968							=over 4
969
970							=item filename
971
972							Pass an optional new filename. The default is to take the basename
973							and append "_summed" to it.
974
975							=item startcolumn
976
977							=item stopcolumn
978
979							Provide the starting and ending columns to summarize. The default
980							start is the leftmost column without a recognized standard name.
981							The default ending column is the last rightmost column. Indexes are
982							0-based.
983
984							=item dataset
985
986							Pass a string that is the name of the dataset. This could be collected
987							from the metadata, if present. This will become the name of the score
988							column if defined.
989
990							=back
991
992							The name of the summarized column is either the provided dataset name,
993							the defined basename in the metadata of the Data structure, or a generic
994							name. If successful, it will return the name of the file saved.
995
996							=back
997
998							=head2 Verifying Datasets
999
1000							When working with row Features and collecting scores, the dataset
1001							from which you are collecting must be verified prior to collection.
1002							This ensures that the proper database adaptor is available and loaded,
1003							and that the dataset is correctly specified (otherwise nothing would be
1004							collected). This verification is normally performed transparently when
1005							you call L or
1006							L.
1007							However, datasets may be explicitly verified prior to calling the score
1008							methods.
1009
1010							=over 4
1011
1012							=item verify_dataset
1013
1014							my $dataset = $Data->verify_dataset($dataset, $database);
1015
1016							Pass the name of the dataset (GFF type or type:source) for a GFF3-based
1017							database, e.g. , or path and file name for a
1018							data file, e.g. Bam, BigWig, BigBed, or USeq file. If a separate database
1019							is being used, pass the name or opened database object as a second
1020							parameter. For more advance options, see
1021							L.
1022
1023							The name of the verified dataset, with a prefix if necessary, is returned.
1024
1025							=back
1026
1027							=head2 Efficient Data Access
1028
1029							Most of the time we need to iterate over the Data table, one row
1030							at a time, collecting data or processing information. These methods
1031							simplify the process.
1032
1033							=over 4
1034
1035							=item iterate
1036
1037							$Data->iterate( sub {
1038							my $row = shift;
1039							my $number = $row->value($index);
1040							my $log_number = log($number);
1041							$row->value($index, $log_number);
1042							} );
1043
1044							This method will process a code reference on every row in the data
1045							table. Pass a subroutine or code reference. The subroutine will
1046							receive the row as a L object. With this
1047							object, you can retrieve values, set values, and add new values.
1048
1049							=item row_stream
1050
1051							This returns an C object, which has one
1052							method, C. Call this method repeatedly until it returns
1053							C to work through each row of data.
1054
1055							Users of the C family of database adaptors may recognize the
1056							analogy to the C method.
1057
1058							=item next_row
1059
1060							my $stream = $Data->row_stream;
1061							while (my $row = $stream->next_row) {
1062							# each $row is a Bio::ToolBox::Data::Feature object
1063							# representing the row in the data table
1064							my $value = $row->value($index);
1065							# do something with $value
1066							}
1067
1068							Called from a C object, it returns a
1069							L row object. If SeqFeature objects are
1070							associated with the row, perhaps from a parsed input annotation file,
1071							then they are automatically associated with the row object. (Previous
1072							versions required separately calling the seqfeature() row method to
1073							perform this.)
1074
1075							=back
1076
1077							=head1 SEE ALSO
1078
1079							L, L, L,
1080							L, L, L, L,
1081							L, L
1082
1083							=cut
1084
1085	3			3		4197	use strict;
	3					7
	3					118
1086	3			3		21	use Carp qw(carp cluck croak confess);
	3					7
	3					233
1087	3			3		20	use List::Util qw(sum0);
	3					5
	3					208
1088	3			3		19	use base 'Bio::ToolBox::Data::core';
	3					6
	3					1680
1089	3					204	use Bio::ToolBox::db_helper qw(
1090							get_new_feature_list
1091							get_new_genome_list
1092							get_db_feature
1093	3			3		26	);
	3					6
1094	3			3		22	use Bio::ToolBox::utility qw(simplify_dataset_name);
	3					7
	3					155
1095	3			3		22	use Module::Load;
	3					7
	3					17
1096
1097							1;
1098
1099
1100							### Initialize
1101
1102							sub new {
1103	45			45	1	7983	my $class = shift;
1104	45					120	my %args = @_;
1105
1106							# check for important arguments
1107	45		100			411	$args{features} \|\|= $args{feature} \|\| 'gene';
			33
1108	45		50			272	$args{stream} \|\|= $args{Stream} \|\| 0;
			66
1109	45		100			245	$args{file} \|\|= $args{in} \|\| undef;
			100
1110	45		100			172	$args{parse} \|\|= 0;
1111	45		50			217	$args{noheader} \|\|= 0;
1112
1113							# check for stream
1114	45	100				112	if ($args{stream}) {
1115	1					2	$class = "Bio::ToolBox::Data::Stream";
1116	1					6	load($class);
1117	1					156	return $class->new(@_);
1118							}
1119
1120							# initialize
1121	44					179	my $self = $class->SUPER::new();
1122
1123							# prepare a new table based on the provided arguments
1124	44	100	100			257	if ($args{file} and $args{parse}) {
		100	33
		50
1125							# parse from file
1126	7		100			39	$args{subfeature} \|\|= '';
1127	7	100				24	unless ( $self->parse_table(\%args) ) {
1128	5					104	my $l = $self->load_file($args{file});
1129	5	50				23	return unless $l;
1130	5	50	33			19	if ($self->database =~ /^Parsed:(.+)$/ and $self->feature_type eq 'named') {
1131							# looks like the loaded file was from a previously parsed table
1132							# let's try this again
1133							# this may die if it doesn't work
1134	5					19	$args{file} = $1;
1135	5					16	$args{feature} = $self->feature;
1136	5					15	$self->parse_table(\%args);
1137							}
1138							}
1139							}
1140							elsif ($args{file}) {
1141							# load from file
1142	5	50				52	unless ( $self->load_file($args{file}, $args{noheader}) ) {
1143	0					0	return;
1144							}
1145							}
1146							elsif (exists $args{db} and $args{features}) {
1147							# generate new list
1148	0					0	$self->feature($args{features});
1149	0					0	$self->database($args{db});
1150	0					0	$args{data} = $self;
1151	0					0	my $result;
1152	0	0				0	if ($args{features} eq 'genome') {
1153	0					0	$result = get_new_genome_list(%args);
1154							}
1155							else {
1156	0					0	$result = get_new_feature_list(%args);
1157							}
1158	0	0				0	unless ($result) {
1159	0					0	carp " Cannot generate new $args{features} list!";
1160	0					0	return;
1161							}
1162							}
1163							else {
1164							# a new empty structure
1165
1166							# check to see if user provided column names
1167	32		50			192	$args{columns} \|\|= $args{datasets} \|\| undef;
			66
1168	32	100	33			203	if (defined $args{columns}) {
		50	33
		50	33
		50
1169	1					2	foreach my $c ( @{ $args{columns} } ) {
	1					4
1170	9					19	$self->add_column($c);
1171							}
1172	1	50				6	$self->{feature} = $args{feature} if exists $args{feature};
1173							}
1174
1175							# or possibly a specified format structure
1176							elsif (exists $args{gff} and $args{gff}) {
1177							# use standard names for the number of columns indicated
1178							# we trust that the user knows the subtle difference between gff versions
1179	0					0	$self->add_gff_metadata($args{gff});
1180	0	0				0	unless ($self->extension =~ /g[tf]f/) {
1181							$self->{extension} = $args{gff} == 2.5 ? '.gtf' :
1182	0	0				0	$args{gff} == 3 ? '.gff3' : '.gff';
		0
1183							}
1184							}
1185							elsif (exists $args{bed} and $args{bed}) {
1186							# use standard names for the number of columns indicated
1187	0	0	0			0	unless ($args{bed} =~ /^\d{1,2}$/ and $args{bed} >= 3) {
1188	0					0	carp "bed parameter must be an integer 3-12!";
1189	0					0	return;
1190							}
1191	0					0	$self->add_bed_metadata($args{bed});
1192	0	0				0	unless ($self->extension =~ /bed\|peak/) {
1193	0					0	$self->{extension} = '.bed';
1194							}
1195							}
1196							elsif (exists $args{ucsc} and $args{ucsc}) {
1197							# a ucsc format such as refFlat, genePred, or genePredExt
1198	0					0	my $u = $self->add_ucsc_metadata($args{ucsc});
1199	0	0				0	unless ($u) {
1200	0					0	carp "unrecognized number of columns for ucsc format!";
1201	0					0	return;
1202							};
1203	0	0				0	unless ($self->extension =~ /ucsc\|ref+lat\|genepred/) {
1204	0					0	$self->{extension} = '.ucsc';
1205							}
1206							}
1207							}
1208
1209	44					264	return $self;
1210							}
1211
1212							sub duplicate {
1213	1			1	1	1578	my $self = shift;
1214
1215							# duplicate the data structure
1216	1					6	my $columns = $self->list_columns;
1217	1	50				7	my $Dupe = $self->new(
1218							'columns' => $columns,
1219							) or return;
1220
1221							# copy the metadata
1222	1					5	for (my $i = 0; $i < $self->number_columns; $i++) {
1223							# column metadata
1224	9					21	my %md = $self->metadata($i);
1225	9					31	$Dupe->{$i} = \%md;
1226							}
1227	1					4	foreach (qw(feature program db bed gff ucsc vcf headers)) {
1228							# various keys
1229	8					16	$Dupe->{$_} = $self->{$_};
1230							}
1231	1					4	my @comments = $self->comments;
1232	1					2	push @{$Dupe->{comments}}, @comments;
	1					3
1233
1234	1					15	return $Dupe;
1235							}
1236
1237							sub parse_table {
1238	18			18	1	4210	my $self = shift;
1239	18					37	my $args = shift;
1240	18					30	my ($file, $feature, $subfeature, $simplify);
1241	18	100				51	if (ref $args) {
1242	12		50			43	$file = $args->{file} \|\| '';
1243	12		100			53	$feature = $args->{feature} \|\| '';
1244	12		100			50	$subfeature = $args->{subfeature} \|\| '';
1245							$simplify = (exists $args->{simplify} and defined $args->{simplify}) ?
1246	12	50	33			40	$args->{simplify} : 1; # default is to simplify
1247							}
1248							else {
1249							# no hash reference, assume just a file name
1250	6					14	$file = $args;
1251	6					14	undef $args;
1252	6					8	$feature = undef;
1253	6					13	$subfeature = '';
1254	6					10	$simplify = 1;
1255							}
1256	18	50				49	unless ($file) {
1257	0					0	carp "no annotation file provided to parse!";
1258	0					0	return;
1259							}
1260
1261							# the file format determines the parser class
1262	18	100				61	my $flavor = $self->taste_file($file) or return;
1263	13					41	my $class = 'Bio::ToolBox::parser::' . $flavor;
1264	13					27	eval {load $class};
	13					56
1265	13	50				1063	if ($@) {
1266	0					0	carp "unable to load $class! cannot parse $flavor!";
1267	0					0	return;
1268							}
1269
1270							# open parser
1271	13	50				116	my $parser = $class->new() or return;
1272	13	50				45	$parser->open_file($file) or return;
1273	13					52	my $typelist = $parser->typelist;
1274
1275							# set feature based on the type list from the parser
1276	13	100				37	unless ($feature) {
1277	6	100				49	if ($typelist =~ /gene/i) {
		100
1278	2					12	$feature = 'gene';
1279							}
1280							elsif ($typelist eq 'region') {
1281	3					6	$feature = 'region';
1282							}
1283							else {
1284	1					2	$feature = 'rna'; # generic RNA
1285							}
1286							}
1287
1288							# set parser parameters
1289	13					46	$parser->simplify($simplify);
1290	13	100				34	if ($subfeature) {
1291	2	100				14	$parser->do_exon(1) if $subfeature =~ /exon/i;
1292	2	100				26	$parser->do_cds(1) if $subfeature =~ /cds/i;
1293	2	50				19	$parser->do_utr(1) if $subfeature =~ /utr\|untranslated/i;
1294	2	50				8	$parser->do_codon(1) if $subfeature =~/codon/i;
1295							}
1296	13	100				57	if ($feature =~ /gene$/i) {
1297	4					17	$parser->do_gene(1);
1298							}
1299							else {
1300	9					27	$parser->do_gene(0);
1301							}
1302	13					23	my $mrna_check = 0;
1303	13	50	66			49	if (lc($feature) eq 'mrna' and $parser->typelist !~ /mrna/i and not $self->last_row) {
			33
1304							# user requested mRNA for a new data file but it's not present in the type list
1305							# look for it the hard way by parsing CDS too - sigh
1306	0					0	load('Bio::ToolBox::GeneTools', 'is_coding');
1307	0					0	$parser->do_cds(1);
1308	0					0	$mrna_check = 1;
1309							}
1310
1311							# parse the table
1312	13	50				40	$parser->parse_file or return;
1313
1314							# store the SeqFeature objects
1315	13	100				83	if ($self->last_row > 0) {
1316							# we already have a table, presumably representing the features
1317	5					14	my $count = 0;
1318							$self->iterate( sub {
1319	65			65		85	my $row = shift;
1320	65					170	my $f = $parser->find_gene(
1321							name => $row->name,
1322							id => $row->id,
1323							);
1324	65	50				144	if ($f) {
1325	65					155	$self->store_seqfeature($row->row_index, $f);
1326	65					200	$count++;
1327							}
1328	5					51	} );
1329	5	50				47	unless ($count == $self->last_row) {
1330	0					0	die <
1331							Not all features in the input file could be matched to a corresponding SeqFeature
1332							object in the annotation file $file.
1333							Double check your input and annotation files. You can create a new table by just
1334							providing your annotation file.
1335							PARSEFAIL
1336							}
1337							}
1338							else {
1339							# create a new table
1340	8					37	$self->add_column('Primary_ID');
1341	8					22	$self->add_column('Name');
1342
1343							# check for chromosome exclude
1344	8					14	my $chr_exclude;
1345	8	100				23	if ($args) {
1346	2		50			12	$chr_exclude = $args->{chrskip} \|\| undef;
1347							}
1348
1349							# fill table with features
1350	8					35	while (my $f = $parser->next_top_feature) {
1351	120	50				206	if ($chr_exclude) {
1352	0	0				0	next if $f->seq_id =~ /$chr_exclude/i;
1353							}
1354	120					243	my $type = $f->type;
1355	120	100	33			243	if ($f->type =~ /$feature/i or ($mrna_check and is_coding($f)) ) {
			66
1356	113					261	my $index = $self->add_row([ $f->primary_id, $f->display_name ]);
1357	113					255	$self->store_seqfeature($index, $f);
1358							}
1359							}
1360	8	50				27	unless ($self->last_row) {
1361	0					0	printf " Zero '%s' features found!\n Check your feature or try generic features like gene, mRNA, or transcript\n",
1362							$feature;
1363							}
1364	8					52	$self->database("Parsed:$file");
1365	8					33	$self->feature($feature);
1366	8	50				15	$self->add_comment("Chromosomes excluded: $chr_exclude") if $chr_exclude;
1367
1368							# add input parsed file metadata
1369	8					44	$self->add_file_metadata($file);
1370							# but delete some stuff, just want basename
1371	8					16	undef $self->{extension};
1372	8					18	undef $self->{filename};
1373	8					14	undef $self->{path};
1374							}
1375	13					300	return 1;
1376							}
1377
1378
1379
1380
1381							### Column manipulation
1382
1383							sub column_values {
1384	2			2	1	1360	my ($self, $column) = @_;
1385	2	50				9	return unless defined $column;
1386	2	50				8	return unless exists $self->{$column}{name};
1387	2					10	my @values = map {$self->value($_, $column)} (0 .. $self->last_row);
	159					240
1388	2	50				24	return wantarray ? @values : \@values;
1389							}
1390
1391							sub add_column {
1392	28			28	1	71	my ($self, $name) = @_;
1393	28	50				59	return unless $name;
1394	28					48	my $column = $self->{number_columns};
1395
1396							# check for array of column data
1397	28					47	my $name_ref = ref $name;
1398	28	100				94	if ($name_ref eq 'ARRAY') {
		50
		50
1399	2	50				16	if ($self->last_row > 1) {
1400							# table has existing data beyond column headers
1401	2	50				6	if ($self->last_row == (scalar @$name - 1)) {
1402							# same number of elements, add it the table
1403	2					10	$self->{$column} = {
1404							'name' => $name->[0],
1405							'index' => $column,
1406							};
1407	2					8	for (my $r = 0; $r <= $self->last_row; $r++) {
1408	158					368	$self->{data_table}->[$r][$column] = $name->[$r];
1409							}
1410							}
1411							else {
1412							# different number of elements
1413	0					0	cluck "array has different number of elements than Data table!\n";
1414	0					0	return;
1415							}
1416							}
1417							else {
1418							# table has no existing data
1419	0					0	$self->{$column} = {
1420							'name' => $name->[0],
1421							'index' => $column,
1422							};
1423	0					0	for (my $i = 0; $i < scalar @$name; $i++) {
1424	0					0	$self->value($i, $column, $name->[$i]);
1425							}
1426	0					0	$self->{last_row} = scalar @$name - 1;
1427	0					0	$self->{headers} = 1; # boolean to indicate the table now has headers
1428							}
1429							}
1430							elsif ($name_ref eq 'Bio::DB::GFF::Typename') {
1431							# a Typename object that was selected from a SeqFeature::Store database
1432	0					0	$self->{$column} = {
1433							'name' => $name->asString,
1434							'index' => $column,
1435							};
1436	0					0	$self->{data_table}->[0][$column] = $name->asString;
1437							}
1438							elsif ($name_ref eq '') {
1439							# just a name
1440	26					98	$self->{$column} = {
1441							'name' => $name,
1442							'index' => $column,
1443							};
1444	26					75	$self->{data_table}->[0][$column] = $name;
1445							}
1446							else {
1447	0					0	cluck "unrecognized reference '$name_ref'! pass a scalar value or array reference";
1448	0					0	return;
1449							}
1450
1451	28					46	$self->{number_columns}++;
1452	28	50				59	delete $self->{column_indices} if exists $self->{column_indices};
1453	28	50	33			70	if ($self->gff or $self->bed or $self->ucsc or $self->vcf) {
			33
			33
1454							# check if we maintain integrity, at least insofar what we test
1455	0					0	$self->verify(1); # silence so user doesn't get these messages
1456							}
1457	28					59	return $column;
1458							}
1459
1460							sub copy_column {
1461	1			1	1	4	my $self = shift;
1462	1					3	my $index = shift;
1463	1	50				5	return unless defined $index;
1464	1					5	my $data = $self->column_values($index);
1465	1					5	my $new_index = $self->add_column($data);
1466	1					12	$self->copy_metadata($index, $new_index);
1467	1					6	return $new_index;
1468							}
1469
1470
1471
1472							### Rows and Data access
1473
1474							sub add_row {
1475	231			231	1	331	my $self = shift;
1476	231					310	my @row_data;
1477	231	100	66			918	if ($_[0] and ref $_[0] eq 'ARRAY') {
		50	33
		0	0
1478	191					270	@row_data = @{ $_[0] };
	191					565
1479							}
1480							elsif ($_[0] and ref $_[0] eq 'Bio::ToolBox::Data::Feature') {
1481	40					441	@row_data = $_[0]->row_values;
1482							}
1483							elsif ($_[0] and $_[0] =~ /\t/) {
1484	0					0	@row_data = split /\t/, $_[0];
1485							}
1486							else {
1487	0					0	@row_data = map {'.'} (1 .. $self->{number_columns});
	0					0
1488							}
1489	231	50				546	if (scalar @row_data > $self->{number_columns}) {
1490	0					0	cluck "row added has more elements than table columns! truncating row elements\n";
1491	0					0	splice @row_data, 0, $self->{number_columns};
1492							}
1493	231					520	until (scalar @row_data == $self->{number_columns}) {
1494	0					0	push @row_data, '.';
1495							}
1496	231					451	my $row_index = $self->{last_row} + 1;
1497	231					443	$self->{data_table}->[$row_index] = \@row_data;
1498	231					332	$self->{last_row}++;
1499	231					631	return $row_index;
1500							}
1501
1502							sub get_row {
1503	5			5	1	692	my ($self, $row_number) = @_;
1504	5	50				23	return unless defined $self->{data_table}->[$row_number];
1505	5					19	my @options = (
1506							'data' => $self,
1507							'index' => $row_number,
1508							);
1509	5	100	66			30	if (
1510							exists $self->{SeqFeatureObjects} and
1511							defined $self->{SeqFeatureObjects}->[$row_number]
1512							) {
1513	4					15	push @options, 'feature', $self->{SeqFeatureObjects}->[$row_number];
1514							}
1515	5					47	return Bio::ToolBox::Data::Feature->new(@options);
1516							}
1517
1518							sub delete_row {
1519	1			1	1	4	my $self = shift;
1520	1					6	my @deleted = sort {$b <=> $a} @_;
	0					0
1521	1					5	while (@deleted) {
1522	1					3	my $d = shift @deleted;
1523	1					3	splice( @{ $self->{data_table} }, $d, 1);
	1					4
1524	1					6	$self->{last_row}--;
1525	1	50				5	if (exists $self->{SeqFeatureObjects}) {
1526	0					0	splice( @{ $self->{SeqFeatureObjects} }, $d, 1);
	0					0
1527							}
1528							}
1529	1					3	return 1;
1530							}
1531
1532							sub row_values {
1533	237			237	1	371	my ($self, $row) = @_;
1534	237					327	my @data = @{ $self->{data_table}->[$row] };
	237					602
1535	237	50				811	return wantarray ? @data : \@data;
1536							}
1537
1538							sub value {
1539	508			508	1	1978	my ($self, $row, $column, $value) = @_;
1540	508	50	33			1518	return unless (defined $row and defined $column);
1541	508	50				808	if (defined $value) {
1542	0					0	$self->{data_table}->[$row][$column] = $value;
1543							}
1544	508					1162	return $self->{data_table}->[$row][$column];
1545							}
1546
1547							sub row_stream {
1548	9			9	1	2430	my $self = shift;
1549	9					71	return Bio::ToolBox::Data::Iterator->new($self);
1550							}
1551
1552							sub iterate {
1553	6			6	1	654	my $self = shift;
1554	6					14	my $code = shift;
1555	6	50				25	unless (ref $code eq 'CODE') {
1556	0					0	confess "iterate_function() method requires a code reference!";
1557							}
1558	6					20	my $stream = $self->row_stream;
1559	6					22	while (my $row = $stream->next_row) {
1560	143					293	&$code($row);
1561							}
1562	6					28	return 1;
1563							}
1564
1565
1566							#### Stored SeqFeature manipulation
1567
1568							sub store_seqfeature {
1569	178			178	1	300	my ($self, $row_i, $seqfeature) = @_;
1570	178	50	33			614	unless (defined $row_i and ref($seqfeature)) {
1571	0					0	confess "must provide a row index and SeqFeature object!";
1572							}
1573	178	50				386	confess "invalid row index" unless ($row_i <= $self->last_row);
1574	178		100			364	$self->{SeqFeatureObjects} \|\|= [];
1575	178					348	$self->{SeqFeatureObjects}->[$row_i] = $seqfeature;
1576	178					383	return 1;
1577							}
1578
1579							sub delete_seqfeature {
1580	0			0	1	0	my ($self, $row_i) = @_;
1581	0	0				0	confess "invalid row index" unless ($row_i <= $self->last_row);
1582	0	0				0	return unless $self->{SeqFeatureObjects};
1583	0					0	undef $self->{SeqFeatureObjects}->[$row_i];
1584							}
1585
1586							sub collapse_gene_transcripts {
1587	0			0	1	0	my $self = shift;
1588	0	0				0	unless ($self->feature_type eq 'named') {
1589	0					0	carp "Table does not contain named features!";
1590	0					0	return;
1591							}
1592
1593							# load module
1594	0					0	my $class = "Bio::ToolBox::GeneTools";
1595	0					0	eval {load $class, qw(collapse_transcripts)};
	0					0
1596	0	0				0	if ($@) {
1597	0					0	carp "unable to load $class! cannot collapse transcripts!";
1598	0					0	return;
1599							}
1600
1601							# collapse the transcripts
1602	0					0	my $success = 0;
1603	0	0				0	if (exists $self->{SeqFeatureObjects}) {
1604							# we should have stored SeqFeature objects, probably from parsed table
1605	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
1606	0	0				0	my $feature = $self->{SeqFeatureObjects}->[$i] or next;
1607	0	0				0	my $collSeqFeat = collapse_transcripts($feature) or next;
1608	0					0	$success += $self->store_seqfeature($i, $collSeqFeat);
1609							}
1610							}
1611							else {
1612							# no stored SeqFeature objects, probably names pointing to a database
1613							# we will have to fetch the feature from a database
1614	0	0				0	my $db = $self->open_meta_database(1) or # force open a new db connection
1615							confess "No SeqFeature objects stored and no database connection!";
1616	0					0	my $name_i = $self->name_column;
1617	0					0	my $id_i = $self->id_column;
1618	0					0	my $type_i = $self->type_column;
1619	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
1620	0	0	0			0	my $feature = get_db_feature(
			0
			0
1621							db => $db,
1622							name => $self->value($i, $name_i) \|\| undef,
1623							type => $self->value($i, $type_i) \|\| undef,
1624							id => $self->value($i, $id_i) \|\| undef,
1625							) or next;
1626	0	0				0	my $collSeqFeat = collapse_transcripts($feature) or next;
1627	0					0	$success += $self->store_seqfeature($i, $collSeqFeat);
1628							}
1629							}
1630
1631	0					0	return $success;
1632							}
1633
1634							sub add_transcript_length {
1635	0			0	1	0	my $self = shift;
1636	0		0			0	my $subfeature = shift \|\| 'exon';
1637	0	0				0	unless (exists $self->{SeqFeatureObjects}) {
1638	0					0	carp "no SeqFeature objects stored for collapsing!";
1639	0					0	return;
1640							}
1641
1642							# load module
1643	0					0	eval {load "Bio::ToolBox::GeneTools", qw(get_transcript_length get_transcript_cds_length
	0					0
1644							get_transcript_5p_utr_length get_transcript_3p_utr_length)};
1645	0	0				0	if ($@) {
1646	0					0	carp "unable to load Bio::ToolBox::GeneTools! cannot collapse transcripts!";
1647	0					0	return;
1648							}
1649
1650							# determine name and calculation method
1651	0					0	my $length_calculator;
1652							my $length_name;
1653	0	0				0	if ($subfeature eq 'exon') {
		0
		0
		0
1654	0					0	$length_calculator = \&get_transcript_length;
1655	0					0	$length_name= 'Merged_Transcript_Length';
1656							}
1657							elsif ($subfeature eq 'cds') {
1658	0					0	$length_calculator = \&get_transcript_cds_length;
1659	0					0	$length_name= 'Transcript_CDS_Length';
1660							}
1661							elsif ($subfeature eq '5p_utr') {
1662	0					0	$length_calculator = \&get_transcript_5p_utr_length;
1663	0					0	$length_name= 'Transcript_5p_UTR_Length';
1664							}
1665							elsif ($subfeature eq '3p_utr') {
1666	0					0	$length_calculator = \&get_transcript_3p_utr_length;
1667	0					0	$length_name= 'Transcript_3p_UTR_Length';
1668							}
1669							else {
1670	0					0	carp "unrecognized subfeature type '$subfeature'! No length calculated";
1671	0					0	return;
1672							}
1673
1674							# add new column and calculate
1675	0					0	my $length_i = $self->add_column($length_name);
1676	0	0				0	if (exists $self->{SeqFeatureObjects}) {
1677							# we should have stored SeqFeature objects, probably from parsed table
1678	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
1679	0	0				0	my $feature = $self->{SeqFeatureObjects}->[$i] or next;
1680	0					0	my $length = &$length_calculator($feature);
1681	0		0			0	$self->{data_table}->[$i][$length_i] = $length \|\| $feature->length;
1682							}
1683							}
1684							else {
1685							# no stored SeqFeature objects, probably names pointing to a database
1686							# we will have to fetch the feature from a database
1687	0	0				0	my $db = $self->open_meta_database(1) or # force open a new db connection
1688							confess "No SeqFeature objects stored and no database connection!";
1689	0					0	my $name_i = $self->name_column;
1690	0					0	my $id_i = $self->id_column;
1691	0					0	my $type_i = $self->type_column;
1692	0					0	for (my $i = 1; $i <= $self->last_row; $i++) {
1693	0	0	0			0	my $feature = get_db_feature(
			0
			0
1694							db => $db,
1695							name => $self->value($i, $name_i) \|\| undef,
1696							type => $self->value($i, $type_i) \|\| undef,
1697							id => $self->value($i, $id_i) \|\| undef,
1698							) or next;
1699	0					0	my $length = &$length_calculator($feature);
1700	0		0			0	$self->{data_table}->[$i][$length_i] = $length \|\| $feature->length;
1701	0					0	$self->store_seqfeature($i, $feature);
1702							# to store or not to store? May explode memory, but could save from
1703							# expensive db calls later
1704							}
1705							}
1706
1707	0					0	return $length_i;
1708							}
1709
1710
1711
1712							### Data structure manipulation
1713
1714							sub sort_data {
1715	1			1	1	1860	my $self = shift;
1716	1					3	my $index = shift;
1717	1		50			5	my $direction = shift \|\| 'i';
1718
1719							# confirm options
1720	1	50				5	return unless exists $self->{$index}{name};
1721	1	50				7	unless ($direction =~ /^[id]/i) {
1722	0					0	carp "unrecognized sort order '$direction'! Must be i or d";
1723	0					0	return;
1724							}
1725
1726							# Sample the dataset values
1727							# this will be used to guess the sort method, below
1728	1					3	my $example; # an example of the dataset
1729	1					6	foreach (my $i = 1; $i <= $self->last_row; $i++) {
1730							# we want to avoid null values, so keep trying
1731							# null being . or any variation of N/A, NaN, inf
1732	1					7	my $v = $self->value($i, $index);
1733	1	50	33			10	if (defined $v and $v !~ /^(?:\.\|n\/?a\|nan\|\-?inf)$/i) {
1734							# a non-null value, take it
1735	1					3	$example = $v;
1736	1					2	last;
1737							}
1738							}
1739
1740							# Determine sort method, either numerical or alphabetical
1741	1					2	my $sortmethod;
1742	1	50				8	if ($example =~ /[a-z]/i) {
		0
1743							# there are detectable letters
1744	1					2	$sortmethod = 'ascii';
1745							}
1746							elsif ($example =~ /^\-?\d+\.?\d*$/) {
1747							# there are only digits, allowing for minus sign and a decimal point
1748							# I don't think this allows for exponents, though
1749	0					0	$sortmethod = 'numeric';
1750							}
1751							else {
1752							# unable to determine (probably alphanumeric), sort asciibetical
1753	0					0	$sortmethod = 'ascii';
1754							}
1755
1756							# Re-order the datasets
1757							# Directly sorting the @data array is proving difficult. It keeps giving me
1758							# a segmentation fault. So I'm using a different approach by copying the
1759							# @data_table into a temporary hash.
1760							# put data_table array into a temporary hash
1761							# the hash key will the be dataset value,
1762							# the hash value will be the reference the row data
1763	1					2	my %datahash;
1764
1765							# reorder numerically
1766	1	50				8	if ($sortmethod eq 'numeric') {
		50
1767	0					0	for my $row (1..$self->last_row) {
1768	0					0	my $value = $self->value($row, $index);
1769
1770							# check to see whether this value exists or not
1771	0					0	while (exists $datahash{$value}) {
1772							# add a really small number to bump it up and make it unique
1773							# this, of course, presumes that none of the dataset values
1774							# are really this small - this may be an entirely bad
1775							# assumption!!!!! I suppose we could somehow calculate an
1776							# appropriate value.... nah.
1777							# don't worry, we're only modifying the value used for sorting,
1778							# not the actual value
1779	0					0	$value += 0.00000001;
1780							}
1781							# store the row data reference
1782	0					0	$datahash{$value} = $self->{data_table}->[$row];
1783							}
1784
1785							# re-fill the array based on the sort direction
1786	0	0				0	if ($direction =~ /^i/i) {
1787							# increasing sort
1788	0					0	my $i = 1; # keep track of the row, skip the header
1789	0					0	foreach (sort {$a <=> $b} keys %datahash) {
	0					0
1790							# put back the reference to the anonymous array of row data
1791	0					0	$self->{data_table}->[$i] = $datahash{$_};
1792	0					0	$i++; # increment for next row
1793							}
1794							}
1795							else {
1796							# decreasing sort
1797	0					0	my $i = 1; # keep track of the row, skip the header
1798	0					0	foreach (sort {$b <=> $a} keys %datahash) {
	0					0
1799							# put back the reference to the anonymous array of row data
1800	0					0	$self->{data_table}->[$i] = $datahash{$_};
1801	0					0	$i++; # increment for next row
1802							}
1803							}
1804
1805							# summary prompt
1806	0					0	printf " Data table sorted numerically by the contents of %s\n",
1807							$self->name($index);
1808							}
1809
1810							# reorder asciibetically
1811							elsif ($sortmethod eq 'ascii') {
1812	1					4	for my $row (1..$self->last_row) {
1813							# get the value to sort by
1814	78					126	my $value = $self->value($row, $index);
1815	78	50				127	if (exists $datahash{$value}) {
1816							# not unique
1817	0					0	my $n = 1;
1818	0					0	my $lookup = $value . sprintf("03%d", $n);
1819							# we'll try to make a unique value by appending
1820							# a number to the original value
1821	0					0	while (exists $datahash{$lookup}) {
1822							# keep bumping up the number till it's unique
1823	0					0	$n++;
1824	0					0	$lookup = $value . sprintf("03%d", $n);
1825							}
1826	0					0	$datahash{$lookup} = $self->{data_table}->[$row];
1827							}
1828							else {
1829							# unique
1830	78					218	$datahash{$value} = $self->{data_table}->[$row];
1831							}
1832							}
1833
1834							# re-fill the array based on the sort direction
1835	1	50	33			12	if ($direction eq 'i' or $direction eq 'I') {
		50	33
1836							# increasing
1837	0					0	my $i = 1; # keep track of the row
1838	0					0	foreach (sort {$a cmp $b} keys %datahash) {
	0					0
1839							# put back the reference to the anonymous array of row data
1840	0					0	$self->{data_table}->[$i] = $datahash{$_};
1841	0					0	$i++; # increment for next row
1842							}
1843							}
1844							elsif ($direction eq 'd' or $direction eq 'D') {
1845							# decreasing
1846	1					4	my $i = 1; # keep track of the row
1847	1					25	foreach (sort {$b cmp $a} keys %datahash) {
	392					499
1848							# put back the reference to the anonymous array of row data
1849	78					109	$self->{data_table}->[$i] = $datahash{$_};
1850	78					93	$i++; # increment for next row
1851							}
1852							}
1853
1854							# summary prompt
1855	1					9	printf " Data table sorted asciibetically by the contents of '%s'\n",
1856							$self->name($index);
1857							}
1858
1859	1					17	return 1;
1860							}
1861
1862							sub gsort_data {
1863	1			1	1	4	my $self = shift;
1864
1865							# identify indices
1866	1	50				4	unless ($self->feature_type eq 'coordinate') {
1867	0					0	carp "no chromosome and start/position columns to sort!\n";
1868	0					0	return;
1869							}
1870	1					4	my $chromo_i = $self->chromo_column;
1871	1					4	my $start_i = $self->start_column;
1872
1873							# Load the data into a temporary hash
1874							# The datalines will be put into a hash of hashes: The first key will be
1875							# the chromosome name, the second hash will be the start value.
1876							#
1877							# To deal with some chromosomes that don't have numbers (e.g. chrM), we'll
1878							# use two separate hashes: one is for numbers, the other for strings
1879							# when it comes time to sort, we'll put the numbers first, then strings
1880
1881	1					3	my %num_datahash;
1882							my %str_datahash;
1883	1					5	for my $row (1 .. $self->last_row) {
1884
1885	78					137	my $startvalue = $self->value($row, $start_i);
1886
1887							# put the dataline into the appropriate temporary hash
1888	78	50				134	if ($self->value($row, $chromo_i) =~ /^(?:chr)?(\d+)$/) {
1889							# dealing with a numeric chromosome name
1890							# restricting to either chr2 or just 2 but not 2-micron
1891	0					0	my $chromovalue = $1;
1892	0					0	while (exists $num_datahash{$chromovalue}{$startvalue}) {
1893							# if another item already exists at this location
1894							# add a really small number to bump it up and make it unique
1895	0					0	$startvalue += 0.001;
1896							}
1897	0					0	$num_datahash{$chromovalue}{$startvalue} = $self->{data_table}->[$row];
1898							}
1899							else {
1900							# dealing with a non-numeric chromosome name
1901	78					125	my $chromovalue = $self->value($row, $chromo_i);
1902							# use the entire chromosome name as key
1903	78					164	while (exists $str_datahash{$chromovalue}{$startvalue}) {
1904							# if another item already exists at this location
1905							# add a really small number to bump it up and make it unique
1906	36					174	$startvalue += 0.001;
1907							}
1908	78					281	$str_datahash{$chromovalue}{$startvalue} = $self->{data_table}->[$row];
1909							}
1910							}
1911
1912
1913							# Now re-load the data array with sorted data
1914							# put the numeric chromosome data back first
1915	1					3	my $i = 1; # keep track of the row
1916	1					5	foreach my $chromovalue (sort {$a <=> $b} keys %num_datahash) {
	0					0
1917							# first, numeric sort on increasing chromosome number
1918	0					0	foreach my $startvalue (
1919	0					0	sort {$a <=> $b} keys %{ $num_datahash{$chromovalue} }
	0					0
1920							) {
1921							# second, numeric sort on increasing position value
1922	0					0	$self->{data_table}->[$i] = $num_datahash{$chromovalue}{$startvalue};
1923	0					0	$i++; # increment for next row
1924							}
1925							}
1926							# next put the string chromosome data back
1927	1					6	foreach my $chromovalue (sort {$a cmp $b} keys %str_datahash) {
	0					0
1928							# first, ascii sort on increasing chromosome name
1929	1					2	foreach my $startvalue (
1930	396					504	sort {$a <=> $b} keys %{ $str_datahash{$chromovalue} }
	1					13
1931							) {
1932							# second, numeric sort on increasing position value
1933	78					128	$self->{data_table}->[$i] = $str_datahash{$chromovalue}{$startvalue};
1934	78					101	$i++; # increment for next row
1935							}
1936							}
1937
1938	1					13	return 1;
1939							}
1940
1941							sub splice_data {
1942	1			1	1	4	my ($self, $part, $total_parts) = @_;
1943
1944	1	50	33			10	unless ($part and $total_parts) {
1945	0					0	confess "ordinal part and total number of parts not passed\n";
1946							}
1947	1					6	my $part_length = int($self->last_row / $total_parts);
1948
1949							# check for SeqFeatureObjects array
1950	1	50				4	if (exists $self->{SeqFeatureObjects}) {
1951							# it needs to be the same length as the data table, it should be
1952	0					0	while (scalar @{$self->{SeqFeatureObjects}} < scalar @{$self->{data_table}}) {
	0					0
	0					0
1953	0					0	push @{$self->{SeqFeatureObjects}}, undef;
	0					0
1954							}
1955							}
1956
1957							# splicing based on which part we do
1958	1	50				6	if ($part == 1) {
		50
1959							# remove all but the first part
1960							splice(
1961	0					0	@{$self->{'data_table'}},
	0					0
1962							$part_length + 1
1963							);
1964	0	0				0	if (exists $self->{SeqFeatureObjects}) {
1965							splice(
1966	0					0	@{$self->{SeqFeatureObjects}},
	0					0
1967							$part_length + 1
1968							);
1969							}
1970							}
1971							elsif ($part == $total_parts) {
1972							# remove all but the last part
1973							splice(
1974	1					2	@{$self->{'data_table'}},
	1					15
1975							1,
1976							$part_length * ($total_parts - 1)
1977							);
1978	1	50				4	if (exists $self->{SeqFeatureObjects}) {
1979							splice(
1980	0					0	@{$self->{SeqFeatureObjects}},
	0					0
1981							1,
1982							$part_length * ($total_parts - 1)
1983							);
1984							}
1985							}
1986							else {
1987							# splicing in the middle requires two rounds
1988
1989							# remove the last parts
1990							splice(
1991	0					0	@{$self->{'data_table'}},
	0					0
1992							($part * $part_length) + 1
1993							);
1994
1995							# remove the first parts
1996							splice(
1997	0					0	@{$self->{'data_table'}},
	0					0
1998							1,
1999							$part_length * ($part - 1)
2000							);
2001
2002	0	0				0	if (exists $self->{SeqFeatureObjects}) {
2003							splice(
2004	0					0	@{$self->{SeqFeatureObjects}},
	0					0
2005							($part * $part_length) + 1
2006							);
2007							splice(
2008	0					0	@{$self->{SeqFeatureObjects}},
	0					0
2009							1,
2010							$part_length * ($part - 1)
2011							);
2012							}
2013							}
2014
2015							# update last row metadata
2016	1					2	$self->{'last_row'} = scalar(@{$self->{'data_table'}}) - 1;
	1					4
2017
2018							# re-open a new un-cached database connection
2019	1	50				4	if (exists $self->{db_connection}) {
2020	0					0	delete $self->{db_connection};
2021							}
2022	1					4	return 1;
2023							}
2024
2025
2026
2027							### File functions
2028
2029							sub reload_children {
2030	1			1	1	3	my $self = shift;
2031	1					4	my @files = @_;
2032	1	50				5	return unless @files;
2033
2034							# prepare Stream
2035	1					3	my $class = "Bio::ToolBox::Data::Stream";
2036	1					3	eval {load $class};
	1					6
2037	1	50				98	if ($@) {
2038	0					0	carp "unable to load $class! can't reload children!";
2039	0					0	return;
2040							}
2041
2042							# open first stream
2043	1					4	my $first = shift @files;
2044	1					9	my $Stream = $class->new(in => $first);
2045	1	50				5	unless ($Stream) {
2046	0					0	carp "unable to load first child file $first";
2047	0					0	return;
2048							}
2049
2050							# destroy current data table and metadata
2051	1					4	$self->{data_table} = [];
2052	1					5	for (my $i = 0; $i < $self->number_columns; $i++) {
2053	0					0	delete $self->{$i};
2054							}
2055
2056							# copy the metadata
2057	1					5	foreach (qw(program feature db bed gff ucsc headers number_columns last_row)) {
2058							# various keys
2059	9					19	$self->{$_} = $Stream->{$_};
2060							}
2061	1					5	for (my $i = 0; $i < $Stream->number_columns; $i++) {
2062							# column metadata
2063	4					12	my %md = $Stream->metadata($i);
2064	4					15	$self->{$i} = \%md;
2065							}
2066	1					4	my @comments = $Stream->comments;
2067	1					3	push @{$self->{other}}, @comments;
	1					5
2068
2069							# first row column headers
2070	1					3	$self->{data_table}->[0] = [ @{ $Stream->{data_table}->[0] } ];
	1					15
2071
2072							# load the data
2073	1					7	while (my $row = $Stream->next_row) {
2074	39					81	$self->add_row($row);
2075							}
2076	1					45	$Stream->close_fh;
2077
2078							# load remaining files
2079	1					24	foreach my $file (@files) {
2080	2					28	my $Stream = $class->new(in => $file);
2081	2	50				8	if ($Stream->number_columns != $self->number_columns) {
2082	0					0	confess "fatal error: child file $file has a different number of columns!";
2083							}
2084	2					7	while (my $row = $Stream->next_row) {
2085	78					158	my $a = $row->row_values;
2086	78					164	$self->add_row($a);
2087							}
2088	2					75	$Stream->close_fh;
2089							}
2090	1					25	$self->verify;
2091
2092							# clean up
2093	1					224	unlink($first, @files);
2094	1					10	return $self->last_row;
2095							}
2096
2097
2098
2099
2100							### Export summary data file
2101							sub summary_file {
2102	0			0	1	0	my $self = shift;
2103
2104							# Collect passed arguments
2105	0					0	my %args = @_;
2106
2107							# parameters
2108							# either one or more datasets can be summarized
2109	0		0			0	my $outfile = $args{'filename'} \|\| undef;
2110	0					0	my @datasets;
2111	0	0	0			0	if ($args{dataset} and ref $args{dataset} eq 'ARRAY') {
		0	0
2112	0					0	@datasets = @{ $args{dataset} };
	0					0
2113							}
2114							elsif ($args{dataset} and ref $args{dataset} eq 'SCALAR') {
2115	0					0	push @datasets, $args{dataset};
2116							}
2117	0					0	my @startcolumns;
2118	0	0	0			0	if ($args{startcolumn} and ref $args{startcolumn} eq 'ARRAY') {
		0	0
2119	0					0	@startcolumns = @{ $args{startcolumn} };
	0					0
2120							}
2121							elsif ($args{startcolumn} and ref $args{startcolumn} eq 'SCALAR') {
2122	0					0	push @startcolumns, $args{startcolumn};
2123							}
2124	0					0	my @endcolumns;
2125	0		0			0	$args{endcolumn} \|\|= $args{stopcolumn};
2126	0	0	0			0	if ($args{endcolumn} and ref $args{endcolumn} eq 'ARRAY') {
		0	0
2127	0					0	@endcolumns = @{ $args{endcolumn} };
	0					0
2128							}
2129							elsif ($args{endcolumn} and ref $args{endcolumn} eq 'SCALAR') {
2130	0					0	push @endcolumns, $args{endcolumn};
2131							}
2132
2133
2134							# Check required values
2135	0	0				0	unless ($self->verify) {
2136	0					0	cluck "bad data structure!";
2137	0					0	return;
2138							}
2139	0	0				0	unless (defined $outfile) {
2140	0	0				0	if ($self->basename) {
2141							# use the opened file's filename if it exists
2142							# prepend the path if it exists
2143							# the extension will be added later
2144	0					0	$outfile = $self->path . $self->basename;
2145							}
2146							else {
2147	0					0	cluck "no filename passed to write_summary_data!\n";
2148	0					0	return;
2149							}
2150							}
2151
2152							# Identify possible dataset columns
2153	0					0	my %possibles;
2154	0					0	my %skip = map {$_ => 1} qw (systematicname name id alias aliases type class
	0					0
2155							geneclass chromosome chromo seq_id seqid start stop end gene strand
2156							length primary_id merged_transcript_length transcript_cds_length
2157							transcript_5p_utr_length transcript_3p_utr_length);
2158							# walk through the dataset names
2159	0					0	$possibles{unknown} = [];
2160	0					0	for (my $i = 0; $i < $self->number_columns; $i++) {
2161	0	0				0	if (not exists $skip{ lc $self->{$i}{'name'} }) {
2162	0		0			0	my $d = $self->metadata($i, 'dataset') \|\| undef;
2163	0	0				0	if (defined $d) {
2164							# we have what appears to be a dataset column
2165	0		0			0	$possibles{$d} \|\|= [];
2166	0					0	push @{$possibles{$d}}, $i;
	0					0
2167							}
2168							else {
2169							# still an unknown possibility
2170	0					0	push @{$possibles{unknown}}, $i;
	0					0
2171							}
2172							}
2173							}
2174
2175							# check datasets
2176	0	0				0	unless (@datasets) {
2177	0	0				0	if (scalar keys %possibles > 1) {
2178							# we will always have the unknown category, so anything more than one
2179							# means we found legitimate dataset columns
2180	0					0	delete $possibles{unknown};
2181							}
2182	0					0	@datasets = sort {$a cmp $b} keys %possibles;
	0					0
2183							}
2184
2185							# check starts
2186	0	0				0	if (scalar @startcolumns != scalar @datasets) {
2187	0					0	@startcolumns = (); # ignore what we were given?
2188	0					0	foreach my $d (@datasets) {
2189							# take the first column with this dataset
2190	0					0	push @startcolumns, $possibles{$d}->[0];
2191							}
2192							}
2193
2194							# check stops
2195	0	0				0	if (scalar @endcolumns != scalar @datasets) {
2196	0					0	@endcolumns = (); # ignore what we were given?
2197	0					0	foreach my $d (@datasets) {
2198							# take the last column with this dataset
2199	0					0	push @endcolumns, $possibles{$d}->[-1];
2200							}
2201							}
2202
2203
2204							# Prepare Data object to store the summed data
2205	0					0	my $summed_data = $self->new(
2206							feature => 'averaged_windows',
2207							columns => ['Window','Midpoint'],
2208							);
2209
2210							# Go through each dataset
2211	0					0	foreach my $d (0 .. $#datasets) {
2212
2213							# Prepare score column name
2214	0					0	my $data_name = simplify_dataset_name($datasets[$d]);
2215
2216							# add column
2217	0					0	my $i = $summed_data->add_column($data_name);
2218	0					0	$summed_data->metadata($i, 'dataset', $datasets[$d]);
2219
2220							# tag for remembering we're working with percentile bins
2221	0					0	my $do_percentile = 0;
2222
2223							# remember the row
2224	0					0	my $row = 1;
2225
2226							# Collect summarized data
2227	0					0	for (
2228							my $column = $startcolumns[$d];
2229							$column <= $endcolumns[$d];
2230							$column++
2231							) {
2232
2233							# determine the midpoint position of the window
2234							# this assumes the column metadata has start and stop
2235	0					0	my $midpoint = int(sum0($self->metadata($column, 'start'),
2236							$self->metadata($column, 'stop')) / 2);
2237
2238							# convert midpoint to fraction of 1000 for plotting if necessary
2239	0	0				0	if (substr($self->name($column), -1) eq '%') {
2240	0					0	$midpoint = 10; # midpoint 0.01 * 1000 bp
2241	0					0	$do_percentile++;
2242							}
2243	0	0	0			0	if ($do_percentile and substr($self->name($column), -2) eq 'bp') {
2244							# working on the extension after the percentile bins
2245	0					0	$midpoint += 1000;
2246							}
2247
2248							# collect the values in the column
2249	0					0	my @values;
2250	0					0	for my $row (1..$self->last_row) {
2251	0					0	my $v = $self->value($row, $column);
2252	0	0				0	push @values, $v eq '.' ? 0 : $v; # treat nulls as zero
2253							}
2254
2255							# adjust if log value
2256	0		0			0	my $log = $self->metadata($column, 'log2') \|\| 0;
2257	0	0				0	if ($log) {
2258	0					0	@values = map { 2 ** $_ } @values;
	0					0
2259							}
2260
2261							# determine mean value
2262	0					0	my $window_mean = sum0(@values) / scalar(@values);
2263	0	0				0	if ($log) {
2264	0					0	$window_mean = log($window_mean) / log(2);
2265							}
2266
2267							# push to summed output
2268	0	0				0	if ($d == 0) {
2269							# this is the first dataset, so we need to add a row
2270	0					0	$summed_data->add_row( [ $self->{$column}{'name'}, $midpoint, $window_mean ] );
2271							}
2272							else {
2273							# we're summarizing multiple datasets, we already have name midpoint
2274							# first do sanity check
2275	0	0				0	if ($summed_data->value($row, 1) != $midpoint) {
2276	0					0	carp("unable to summarize multiple datasets with nonequal columns of data!");
2277	0					0	return;
2278							}
2279	0					0	$summed_data->value($row, $i, $window_mean);
2280							}
2281	0					0	$row++;
2282							}
2283							}
2284
2285
2286
2287							# Write summed data
2288	0					0	$outfile =~ s/\.txt(\.gz)?$//i; # strip any .txt or .gz extensions if present
2289	0					0	my $written_file = $summed_data->write_file(
2290							'filename' => $outfile . '_summary.txt',
2291							'gz' => 0,
2292							);
2293	0					0	return $written_file;
2294							}
2295
2296
2297
2298
2299
2300							####################################################
2301
2302							package Bio::ToolBox::Data::Iterator;
2303	3			3		22464	use Bio::ToolBox::Data::Feature;
	3					12
	3					142
2304	3			3		34	use Carp;
	3					8
	3					926
2305
2306							sub new {
2307	9			9		24	my ($class, $data) = @_;
2308	9					35	my %iterator = (
2309							'index' => 1,
2310							'data' => $data,
2311							);
2312	9					39	return bless \%iterator, $class;
2313							}
2314
2315							sub next_row {
2316	154			154		1985	my $self = shift;
2317	154	100				357	return if $self->{'index'} > $self->{data}->{last_row}; # no more
2318	148					199	my $i = $self->{'index'};
2319	148					207	$self->{'index'}++;
2320							my @options = (
2321							'data' => $self->{data},
2322	148					294	'index' => $i,
2323							);
2324	148	50	66			456	if (
2325							exists $self->{data}->{SeqFeatureObjects} and
2326							defined $self->{data}->{SeqFeatureObjects}->[$i]
2327							) {
2328	0					0	push @options, 'feature', $self->{data}->{SeqFeatureObjects}->[$i];
2329							}
2330	148					388	return Bio::ToolBox::Data::Feature->new(@options);
2331							}
2332
2333							sub row_index {
2334	0			0			my $self = shift;
2335	0	0					carp "row_index is a read only method" if @_;
2336	0						return $self->{'index'};
2337							}
2338
2339
2340
2341							####################################################
2342
2343							__END__