File Coverage

Bio/PopGen/IO/csv.pm
Criterion Covered Total %
statement 84 95 88.4
branch 20 28 71.4
condition 13 21 61.9
subroutine 12 12 100.0
pod 5 5 100.0
total 134 161 83.2


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::PopGen::IO::csv
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::PopGen::IO::csv -Extract individual allele data from a CSV parser
17              
18             =head1 SYNOPSIS
19              
20             #Do not use directly, use through the Bio::PopGen::IO driver
21              
22             use Bio::PopGen::IO;
23             my $io = Bio::PopGen::IO->new(-format => 'csv',
24             -file => 'data.csv');
25              
26             # Some IO might support reading in a population at a time
27              
28             my @population;
29             while( my $ind = $io->next_individual ) {
30             push @population, $ind;
31             }
32              
33             =head1 DESCRIPTION
34              
35             This object will parse comma delimited format (CSV) or whatever
36             delimiter you specify. It currently doesn't handle the more complex
37             quote escaped CSV format. There are 3 initialization parameters,
38             the delimiter (-field_delimiter) [default ','], (-allele_delimiter)
39             [default ' ']. The third initialization parameter is a boolean
40             -no_header which specifies if there is no header line to read in. All lines starting with '#' will be skipped
41              
42             When no_header is not specific the data is assumed to be of the following form.
43             Having a header line this
44             SAMPLE,MARKERNAME1,MARKERNAME2,...
45              
46             and each data line having the form (diploid data)
47             SAMP1,101 102,100 90,a b
48             or for haploid data
49             SAMP1,101,100,a
50              
51             =head1 FEEDBACK
52              
53             =head2 Mailing Lists
54              
55             User feedback is an integral part of the evolution of this and other
56             Bioperl modules. Send your comments and suggestions preferably to
57             the Bioperl mailing list. Your participation is much appreciated.
58              
59             bioperl-l@bioperl.org - General discussion
60             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
61              
62             =head2 Support
63              
64             Please direct usage questions or support issues to the mailing list:
65              
66             I
67              
68             rather than to the module maintainer directly. Many experienced and
69             reponsive experts will be able look at the problem and quickly
70             address it. Please include a thorough description of the problem
71             with code and data examples if at all possible.
72              
73             =head2 Reporting Bugs
74              
75             Report bugs to the Bioperl bug tracking system to help us keep track
76             of the bugs and their resolution. Bug reports can be submitted via
77             the web:
78              
79             https://github.com/bioperl/bioperl-live/issues
80              
81             =head1 AUTHOR - Jason Stajich
82              
83             Email jason-at-bioperl.org
84              
85             =head1 CONTRIBUTORS
86              
87             Matthew Hahn, matthew.hahn-at-duke.edu
88              
89             =head1 APPENDIX
90              
91             The rest of the documentation details each of the object methods.
92             Internal methods are usually preceded with a _
93              
94             =cut
95              
96              
97             # Let the code begin...
98              
99              
100             package Bio::PopGen::IO::csv;
101 1     1   7 use vars qw($FieldDelim $AlleleDelim $NoHeader);
  1         2  
  1         59  
102 1     1   4 use strict;
  1         2  
  1         36  
103              
104             ($FieldDelim,$AlleleDelim,$NoHeader) =( ',', '\s+',0);
105              
106             # Object preamble - inherits from Bio::Root::Root
107              
108              
109 1     1   4 use Bio::PopGen::Individual;
  1         1  
  1         21  
110 1     1   4 use Bio::PopGen::Population;
  1         1  
  1         18  
111 1     1   3 use Bio::PopGen::Genotype;
  1         1  
  1         20  
112              
113 1     1   4 use base qw(Bio::PopGen::IO);
  1         1  
  1         871  
114              
115             =head2 new
116              
117             Title : new
118             Usage : my $obj = Bio::PopGen::IO::csv->new();
119             Function: Builds a new Bio::PopGen::IO::csv object
120             Returns : an instance of Bio::PopGen::IO::csv
121             Args : [optional, these are the current defaults]
122             -field_delimiter => ','
123             -allele_delimiter=> '\s+'
124             -no_header => 0,
125              
126              
127             =cut
128              
129             sub _initialize {
130 5     5   16 my($self, @args) = @_;
131 5         18 my ($fieldsep,$all_sep,
132             $noheader) = $self->_rearrange([qw(FIELD_DELIMITER
133             ALLELE_DELIMITER
134             NO_HEADER)],@args);
135              
136              
137 5 50       23 $self->flag('no_header', defined $noheader ? $noheader : $NoHeader);
138 5 50       16 $self->flag('field_delimiter',defined $fieldsep ? $fieldsep : $FieldDelim);
139 5 50       11 $self->flag('allele_delimiter',defined $all_sep ? $all_sep : $AlleleDelim);
140              
141 5         10 $self->{'_header'} = undef;
142 5         12 return 1;
143             }
144              
145             =head2 flag
146              
147             Title : flag
148             Usage : $obj->flag($flagname,$newval)
149             Function: Get/Set the flag value
150             Returns : value of a flag (a boolean)
151             Args : A flag name, currently we expect
152             'no_header', 'field_delimiter', or 'allele_delimiter'
153             on set, new value (a boolean or undef, optional)
154              
155              
156             =cut
157              
158             sub flag{
159 766     766 1 742 my $self = shift;
160 766         671 my $fieldname = shift;
161 766 50       980 return unless defined $fieldname;
162            
163 766 100       1022 return $self->{'_flag'}->{$fieldname} = shift if @_;
164 749         4237 return $self->{'_flag'}->{$fieldname};
165             }
166              
167              
168             =head2 next_individual
169              
170             Title : next_individual
171             Usage : my $ind = $popgenio->next_individual;
172             Function: Retrieve the next individual from a dataset
173             Returns : L object
174             Args : none
175              
176              
177             =cut
178              
179             sub next_individual{
180 56     56 1 200 my ($self) = @_;
181 56         112 while( defined( $_ = $self->_readline) ) {
182 60 100 100     287 next if( /^\s*\#/ || /^\s+$/ || ! length($_) );
      66        
183 53         58 last;
184             }
185 56 100       78 return if ! defined $_;
186 53 100 66     87 if( $self->flag('no_header') ||
187             defined $self->{'_header'} ) {
188              
189             #########new (allows field delim to be the same as the allele delim
190              
191 50         63 my ($samp,@marker_results);
192              
193 50 50       67 if($self->flag('field_delimiter') ne $self->flag('allele_delimiter')){
194              
195 50         67 ($samp,@marker_results) = split($self->flag('field_delimiter'),$_);
196             }
197             else{
198              
199 0         0 my $fielddelim = $self->flag('field_delimiter');
200 0         0 my $alleledelim = $self->flag('allele_delimiter');
201              
202 0         0 ($samp) = /(^.+?)$fielddelim/;
203 0         0 s/^.+?$fielddelim//;
204            
205 0         0 (@marker_results) = /([\d|\w]+$alleledelim[\d|\w]+)/g;
206            
207             }
208              
209             #########end new
210              
211 50         103 my $i = 1;
212 50         60 foreach my $m ( @marker_results ) {
213 492         790 $m =~ s/^\s+//;
214 492         695 $m =~ s/\s+$//;
215 492         467 my $markername;
216 492 50       645 if( defined $self->{'_header'} ) {
217 492         582 $markername = $self->{'_header'}->[$i];
218             } else {
219 0         0 $markername = "Marker$i";
220             }
221 492         1335 $self->debug( "markername is $markername alleles are $m\n");
222              
223 492         736 my @alleles = split($self->flag('allele_delimiter'), $m);
224            
225 492         1462 $m = Bio::PopGen::Genotype->new(-alleles => \@alleles,
226             -marker_name => $markername,
227             -individual_id=> $samp);
228 492         714 $i++;
229             }
230 50         144 return Bio::PopGen::Individual->new(-unique_id => $samp,
231             -genotypes => \@marker_results);
232             } else {
233 3         6 chomp;
234 3         6 $self->{'_header'} = [split($self->flag('field_delimiter'),$_)];
235 3         13 return $self->next_individual; # rerun loop again
236             }
237 0         0 return;
238             }
239              
240              
241             =head2 next_population
242              
243             Title : next_population
244             Usage : my $ind = $popgenio->next_population;
245             Function: Retrieve the next population from a dataset
246             Returns : L object
247             Args : none
248             Note : Many implementation will not implement this
249              
250             =cut
251              
252             # Plan is to just return the whole dataset as a single population by
253             # default I think - people would then have each population in a separate
254             # file.
255              
256             sub next_population{
257 1     1 1 7 my ($self) = @_;
258 1         1 my @inds;
259 1         6 while( my $ind = $self->next_individual ) {
260 4         11 push @inds, $ind;
261             }
262 1         9 Bio::PopGen::Population->new(-individuals => \@inds);
263             }
264              
265              
266              
267              
268             =head2 write_individual
269              
270             Title : write_individual
271             Usage : $popgenio->write_individual($ind);
272             Function: Write an individual out in the file format
273             Returns : none
274             Args : L object(s)
275              
276             =cut
277              
278             sub write_individual{
279 1     1 1 9 my ($self,@inds) = @_;
280 1         3 my $fielddelim = $self->flag('field_delimiter');
281 1         4 my $alleledelim= $self->flag('allele_delimiter');
282            
283 1         3 foreach my $ind ( @inds ) {
284 22 50 33     85 if (! ref($ind) || ! $ind->isa('Bio::PopGen::IndividualI') ) {
285 0         0 $self->warn("Cannot write an object that is not a Bio::PopGen::IndividualI object ($ind)");
286 0         0 next;
287             }
288             # we'll go ahead and sort these until
289             # we have a better way to insure a consistent order
290 22         38 my @marker_names = sort $ind->get_marker_names;
291 22 100 66     49 if( ! $self->flag('no_header') &&
292             ! $self->flag('header_written') ) {
293 1         14 $self->_print(join($fielddelim, ('SAMPLE', @marker_names)), "\n");
294 1         3 $self->flag('header_written',1);
295             }
296             $self->_print( join($fielddelim, $ind->unique_id,
297             # we're chaining map here, pay attention and read
298             # starting with the last map
299            
300             # we'll turn genotypes into allele pairs
301             # which will be separated by the allele delimiter
302 440         602 map { join($alleledelim,$_->get_Alleles) }
303             # marker names will be sorted so we don't
304             # have to worry about this between individuals
305             # unless the individual set you pass in has
306             # a mixed set of markers...
307             # this will turn marker names into Genotypes
308 22         37 map {$ind->get_Genotypes(-marker => $_)}
  440         703  
309             @marker_names), "\n")
310             }
311             }
312              
313             =head2 write_population
314              
315             Title : write_population
316             Usage : $popgenio->write_population($pop);
317             Function: Write a population out in the file format
318             Returns : none
319             Args : L object(s)
320             Note : Many implementation will not implement this
321              
322             =cut
323              
324             sub write_population{
325 1     1 1 8 my ($self,@pops) = @_;
326 1         3 my $fielddelim = $self->flag('field_delimiter');
327             # my $alleledelim= $self->flag('allele_delimiter');
328 1         2 my $alleledelim = ' ';
329 1         3 foreach my $pop ( @pops ) {
330 2 50 33     18 if (! ref($pop) || ! $pop->isa('Bio::PopGen::PopulationI') ) {
331 0         0 $self->warn("Cannot write an object that is not a Bio::PopGen::PopulationI object");
332 0         0 next;
333             }
334             # we'll go ahead and sort these until
335             # we have a better way to insure a consistent order
336 2         8 my @marker_names = sort $pop->get_marker_names;
337 2 100 66     5 if( ! $self->flag('no_header') &&
338             ! $self->flag('header_written') ) {
339 1         7 $self->_print( join($fielddelim, ('SAMPLE', @marker_names)),
340             "\n");
341 1         3 $self->flag('header_written',1);
342             }
343 2         5 foreach my $ind ( $pop->get_Individuals ) {
344             $self->_print( join($fielddelim, $ind->unique_id,
345             # we're chaining map here, pay attention
346             # and read starting with the last map
347            
348             # we'll turn genotypes into allele pairs
349             # which will be separated by the allele
350             # delimiter
351 340         500 map { join($alleledelim,$_->get_Alleles) }
352             # marker names will be sorted so we don't
353             # have to worry about this between individuals
354             # unless the individual set you pass in has
355             # a mixed set of markers...
356             # this will turn marker names into Genotypes
357 17         35 map {$ind->get_Genotypes(-marker => $_)}
  340         555  
358             @marker_names), "\n");
359             }
360             }
361             }
362              
363             1;