File Coverage

blib/lib/RandomJungle/File/RAW.pm
Criterion Covered Total %
statement 109 109 100.0
branch 20 20 100.0
condition 5 5 100.0
subroutine 20 20 100.0
pod 13 14 92.8
total 167 168 99.4


line stmt bran cond sub pod time code
1             package RandomJungle::File::RAW;
2              
3             =head1 NAME
4              
5             RandomJungle::File::RAW - Low level access to the data in the RandomJungle RAW input file
6              
7             =cut
8              
9 1     1   1051 use strict;
  1         2  
  1         36  
10 1     1   5 use warnings;
  1         2  
  1         28  
11              
12 1     1   6 use Carp;
  1         2  
  1         59  
13 1     1   6 use Data::Dumper;
  1         2  
  1         45  
14 1     1   1361 use Devel::StackTrace;
  1         4253  
  1         1827  
15              
16             =head1 VERSION
17              
18             Version 0.03
19              
20             =cut
21              
22             our $VERSION = 0.03;
23             our $ERROR; # used if new() fails
24              
25             =head1 SYNOPSIS
26              
27             RandomJungle::File::RAW provides access to the data contained within the RAW file used as input
28             for RandomJungle. This module was developed to support files in ped format only.
29             See RandomJungle::Jungle for higher-level methods.
30              
31             use RandomJungle::File::RAW;
32              
33             my $raw = RandomJungle::File::RAW->new( filename => $rawfile ) || die $RandomJungle::File::RAW::ERROR;
34             $raw->parse || die $raw->err_str;
35              
36             my $file = $raw->get_filename; # returns the filename of the RAW file
37             my $aref = $raw->get_header_labels; # FID, IID, PAT, MAT
38             my $aref = $raw->get_variable_labels; # SEX, PHENOTYPE, rs... (not incl FID, IID, PAT, MAT)
39             my $aref = $raw->get_sample_labels; # from the IID column in the RAW file (ordered by line in the file)
40             my $href = $raw->get_sample_data; # all sample data records (convience method for RJ::File::DB)
41              
42             # Retrieve information by sample ($iid, from get_sample_labels)
43             # These methods set err_str and return undef on error ($iid not specified or invalid)
44             my $val = $raw->get_phenotype_for_sample( $iid ); # $iid is from get_sample_labels()
45             my $aref = $raw->get_data_for_sample( $iid ); # variable data, suitable for classification (split, spliced)
46             my $line = $raw->get_data_for_sample( $iid, orig => 1 ); # original line (unsplit, unspliced) from the RAW file
47              
48             my $href = $raw->get_data; # for debugging only; returns raw data structs
49              
50             # Error handling
51             $raw->set_err( 'Something went boom' );
52             my $msg = $raw->err_str;
53             my $trace = $raw->err_trace;
54              
55             =cut
56              
57             #*********************************************************************
58             # Public Methods
59             #*********************************************************************
60              
61             =head1 METHODS
62              
63             =head2 new()
64              
65             Creates and returns a new RandomJungle::File::RAW object:
66              
67             my $raw = RandomJungle::File::RAW->new( filename => $rawfile );
68              
69             The 'filename' parameter is required.
70             Sets $ERROR and returns undef on failure.
71              
72             =cut
73              
74             sub new
75             {
76             # Returns RJ::File::RAW object on success
77             # Sets $ERROR and returns undef on failure (e.g., 'filename' param not set)
78 10     10 1 5832 my ( $class, %args ) = @_;
79              
80 10         25 my $obj = {};
81 10         23 bless $obj, $class;
82 10 100       36 $obj->_init( %args ) || return; # $ERROR set by _init()
83              
84 8         24 return $obj;
85             }
86              
87             =head2 parse()
88              
89             Parses the RAW file specified in new():
90              
91             my $retval = $raw->parse;
92              
93             Returns a true value on success. Sets err_str and returns undef on failure.
94              
95             =cut
96              
97             sub parse
98             {
99             # Sets err_str and returns undef if error opening RAW file
100             # Sets err_str and returns undef if unexpected header row format
101             # Returns true on success
102              
103 8     8 1 966 my ( $self ) = @_;
104              
105             open( my $infh, '<', $self->{raw_file}{filename} ) or
106             do
107 8 100       375 {
108 1         7 my $err = 'Error opening ' . $self->{raw_file}{filename} . "\n" . $!;
109 1         3 $self->set_err( $err );
110 1         202 return;
111             };
112              
113             # FID IID PAT MAT SEX PHENOTYPE rs6813086_T ...
114              
115             # expected column names and indices
116 7         33 my %col_names =
117             (
118             IID => 1,
119             SEX => 4,
120             PHENOTYPE => 5,
121             );
122              
123 7         128 my $header = <$infh>;
124 7         21 $header = localize_string( $header ); # for cross-platform compatibility
125 7         14 chomp $header;
126 7         56 my @labels = split( / /, $header );
127              
128 7         29 while( my ( $name, $i ) = each %col_names )
129             {
130 17 100       58 if( $name ne $labels[$i] )
131             {
132 3         10 my $err = "Warning: unexpected name in column $i (expected $name, got $labels[$i])";
133 3         7 $self->set_err( $err );
134 3         529 return;
135             }
136             }
137              
138             # sex and phenotype are included as variables
139 4         24 $self->{raw_data}{header_labels} = [ splice( @labels, 0, $col_names{SEX} ) ];
140 4         25 $self->{raw_data}{variable_labels} = [ @labels ];
141              
142 4         16 while( my $line = <$infh> )
143             {
144 80         122 $line = localize_string( $line ); # for cross-platform compatibility
145 80         100 chomp $line;
146 80         432 my @data = split( / /, $line );
147              
148 80         174 my $iid = $data[ $col_names{IID} ];
149              
150 80         392 $self->{raw_data}{data}{$iid} = { SEX => $data[ $col_names{SEX} ],
151             PHENOTYPE => $data[ $col_names{PHENOTYPE} ],
152             orig_data => $line, };
153              
154 80         96 push( @{ $self->{raw_data}{samples} }, $iid );
  80         452  
155             }
156              
157 4         88 close $infh;
158              
159 4         27 return 1;
160             }
161              
162             =head2 get_filename()
163              
164             Returns the name of the RAW file specified in new():
165              
166             my $file = $raw->get_filename;
167              
168             =cut
169              
170             sub get_filename
171             {
172 1     1 1 6 my ( $self ) = @_;
173 1         5 return $self->{raw_file}{filename};
174             }
175              
176             =head2 get_variable_labels()
177              
178             Returns an array ref containing the labels for the variables in the input file. Note that the first
179             four columns in a ped formatted file (FID, IID, PAT, MAT) are not considered variables and therefore
180             they are not included in the results. The array will likely contain SEX and PHENOTYPE, followed
181             by a list of SNP IDs.
182              
183             my $aref = $raw->get_variable_labels; # SEX, PHENOTYPE, rs...
184              
185             =cut
186              
187             sub get_variable_labels
188             {
189             # returns an aref of variable labels ( SEX, PHENOTYPE, rs... )
190             # this does not include the header labels ( FID, IID, PAT, MAT )
191 1     1 1 2 my ( $self ) = @_;
192 1         2 my @labels = @{ $self->{raw_data}{variable_labels} }; # copy
  1         5  
193 1         3 return \@labels;
194             }
195              
196             =head2 get_header_labels()
197              
198             Returns an array ref containing the header labels from the input file, corresponding to the first
199             four columns of a ped formatted file (FID, IID, PAT, MAT):
200              
201             my $aref = $raw->get_header_labels; # FID, IID, PAT, MAT
202              
203             =cut
204              
205             sub get_header_labels
206             {
207             # returns an aref of header labels ( FID, IID, PAT, MAT )
208 1     1 1 6896 my ( $self ) = @_;
209 1         2 my @labels = @{ $self->{raw_data}{header_labels} }; # copy
  1         6  
210 1         4 return \@labels;
211             }
212              
213             =head2 get_sample_labels()
214              
215             Returns an array ref containing a list of sample labels, ordered according to line number in the
216             input file. The labels are taken from the IID column.
217              
218             my $aref = $raw->get_sample_labels;
219              
220             =cut
221              
222             sub get_sample_labels
223             {
224             # returns an aref of sample labels, in the order they appeared in the RAW file (by row)
225             # these labels correspond to the IID column in the RAW file (ped format)
226 1     1 1 2945 my ( $self ) = @_;
227 1         3 my @labels = @{ $self->{raw_data}{samples} }; # copy
  1         9  
228 1         4 return \@labels;
229             }
230              
231             =head2 get_phenotype_for_sample()
232              
233             Returns the phenotype value for a given sample (specified using the sample label from the IID column):
234              
235             my $val = $raw->get_phenotype_for_sample( $iid ); # see get_sample_labels()
236              
237             Sets err_str and returns undef on error ($iid not specified or invalid).
238              
239             =cut
240              
241             sub get_phenotype_for_sample
242             {
243             # returns the value from the PHENOTYPE column for the specified sample (IID)
244             # sets err_str and returns undef if $iid is not specified or if it is invalid
245 3     3 1 43 my ( $self, $iid ) = @_;
246              
247 3 100       12 if( ! defined $iid )
248             {
249 1         5 $self->set_err( 'No sample specified' );
250 1         206 return;
251             }
252              
253 2 100       10 if( ! exists $self->{raw_data}{data}{$iid} )
254             {
255 1         8 $self->set_err( "Invalid sample [$iid]" );
256 1         161 return;
257             }
258              
259 1         5 return $self->{raw_data}{data}{$iid}{PHENOTYPE};
260             }
261              
262             =head2 get_sample_data()
263              
264             Returns a hash ref containing data for each sample in the input file. This is a convenience method
265             for RandomJungle::File::DB and probably should not be called directly outside of that module, as the
266             interface and return structure is not guaranteed to be stable.
267              
268             my $href = $raw->get_sample_data; # convenience method for RandomJungle::File::DB
269              
270             =cut
271              
272             sub get_sample_data
273             {
274             # returns an href to all the variable data records
275             # this is a convience method for RJ::File::DB
276 1     1 1 12668 my ( $self ) = @_;
277 1         5 return $self->{raw_data}{data};
278             }
279              
280             =head2 get_data_for_sample()
281              
282             This method retrieves data for a given sample, specified using the sample label from the IID column.
283             If called with only a single parameter (the sample label), an array ref will be returned that contains
284             the sample's variable data, suitable for classification by a RandomJungle::Tree object:
285              
286             my $aref = $raw->get_data_for_sample( $iid ); # variable data, suitable for classification
287              
288             If called with 'orig => 1', the original line from the input file (unsplit, unspliced) will be returned:
289              
290             my $line = $raw->get_data_for_sample( $iid, orig => 1 ); # original line from the RAW file
291              
292             Sets err_str and returns undef on error ($iid not specified or invalid).
293              
294             =cut
295              
296             sub get_data_for_sample
297             {
298             # returns an aref to the variable data, suitable for classification (split, spliced)
299             # returns the original (unsplit, unspliced) line from the RAW file if orig => 1 is specified
300             # sets err_str and returns undef if $sample_iid is not specified or if it is invalid
301 5     5 1 8317 my ( $self, $sample_iid, %params ) = @_;
302              
303 5 100       16 if( ! defined $sample_iid )
304             {
305 1         3 $self->set_err( 'No sample specified' );
306 1         122 return;
307             }
308              
309 4 100       14 if( ! exists $self->{raw_data}{data}{$sample_iid} )
310             {
311 1         6 $self->set_err( "Invalid sample [$sample_iid]" );
312 1         175 return;
313             }
314              
315             # $orig_line is unsplit, unspliced: FID IID PAT MAT SEX PHENOTYPE ...
316 3         9 my $orig_line = $self->{raw_data}{data}{$sample_iid}{orig_data};
317              
318 3 100 100     18 if( exists $params{orig} && $params{orig} == 1 )
319             {
320 1         5 return $orig_line;
321             }
322              
323 2         15 my @data = split( / /, $orig_line );
324 2         8 my ( $fid, $iid, $pat, $mat ) = splice( @data, 0, 4 ); # remove for classification
325              
326 2         7 return \@data;
327             }
328              
329             =head2 get_data()
330              
331             Returns the data structures contained in $self:
332              
333             my $href = $raw->get_data;
334              
335             This method is for debugging only and should not be used in production code.
336              
337             =cut
338              
339             sub get_data
340             {
341             # deprecated
342 1     1 1 6 my ( $self ) = @_;
343              
344 1         5 my %h = (
345             raw_file => $self->{raw_file},
346             raw_data => $self->{raw_data},
347             );
348              
349 1         3 return \%h;
350             }
351              
352             =head2 set_err()
353              
354             Sets the error message (provided as a parameter) and creates a stack trace:
355              
356             $raw->set_err( 'Something went boom' );
357              
358             =cut
359              
360             sub set_err
361             {
362 10     10 1 20 my ( $self, $errstr ) = @_;
363              
364 10   100     35 $self->{err_str} = $errstr || '';
365 10         54 $self->{err_trace} = Devel::StackTrace->new;
366             }
367              
368             =head2 err_str()
369              
370             Returns the last error message that was set:
371              
372             my $msg = $raw->err_str;
373              
374             =cut
375              
376             sub err_str
377             {
378 10     10 1 3698 my ( $self ) = @_;
379              
380 10         97 return $self->{err_str};
381             }
382              
383             =head2 err_trace()
384              
385             Returns a backtrace for the last error that was encountered:
386              
387             my $trace = $raw->err_trace;
388              
389             =cut
390              
391             sub err_trace
392             {
393 1     1 1 3 my ( $self ) = @_;
394              
395 1         7 return $self->{err_trace}->as_string;
396             }
397              
398             #*********************************************************************
399             # Private Methods and Routines
400             #*********************************************************************
401              
402             sub _init
403             {
404             # sets $ERROR and returns undef if $args{filename} is not defined or file does not exist
405              
406 10     10   22 my ( $self, %args ) = @_;
407              
408 10         20 @{ $self }{ keys %args } = values %args;
  10         30  
409              
410             # filename (mandatory)
411              
412 10 100       343 if( ! defined $self->{filename} )
    100          
413             {
414 1         2 $ERROR = "'filename' is not defined";
415 1         11 return;
416             }
417             elsif( ! -e $self->{filename} )
418             {
419 1         3 $ERROR = "$self->{filename} does not exist";
420 1         5 return;
421             }
422              
423 8         49 $self->{raw_file}{filename} = $self->{filename};
424             }
425              
426             sub localize_string
427             {
428 87     87 0 185 my ( $content ) = @_;
429              
430 87         107 my $localized = $content;
431 87         124 $localized =~ s[\012\015][\n]g; # added to support x0Ax0D
432 87         101 $localized =~ s[\015{1,2}\012][\n]sg; # from File::LocalizeNewlines, collapse x0Dx0A and x0Dx0Dx0A
433 87         266 $localized =~ s[\015|\012][\n]sg; # from File::LocalizeNewlines, normalize remaining
434              
435 87         190 return $localized;
436             }
437              
438             =head1 SEE ALSO
439              
440             RandomJungle::Jungle, RandomJungle::Tree, RandomJungle::Tree::Node,
441             RandomJungle::XML, RandomJungle::OOB, RandomJungle::RAW,
442             RandomJungle::DB, RandomJungle::Classification_DB
443              
444             =head1 AUTHOR
445              
446             Robert R. Freimuth
447              
448             =head1 COPYRIGHT
449              
450             Copyright (c) 2011 Mayo Foundation for Medical Education and Research. All rights reserved.
451              
452             This program is free software; you can redistribute it and/or modify
453             it under the same terms as Perl itself.
454              
455             The full text of the license can be found in the
456             LICENSE file included with this module.
457              
458             =cut
459              
460             #*********************************************************************
461             # Guts
462             #*********************************************************************
463              
464             =begin guts
465              
466             Guts:
467              
468             $self
469             raw_file
470             filename => $filename
471             raw_data
472             header_labels => [ FID, IID, PAT, MAT ] (expected)
473             variable_labels => [ SEX, PHENOTYPE, rs... ]
474             data
475             $iid
476             SEX => $val
477             PHENOTYPE => $val
478             orig_data => $line (unsplit, unspliced) # use for classifying sample with tree (but not vals for header labels!)
479             samples => [ $iid, ... ]
480             err_str => $errstr
481             err_trace => Devel::StackTrace object
482              
483             =cut
484              
485             1;