File Coverage

blib/lib/Bio/SNP/Inherit.pm
Criterion Covered Total %
statement 5 7 71.4
branch n/a
condition n/a
subroutine 3 3 100.0
pod n/a
total 8 10 80.0


line stmt bran cond sub pod time code
1             package Bio::SNP::Inherit;
2             BEGIN {
3 11     11   302951 $Bio::SNP::Inherit::VERSION = '0.002';
4             }
5 11     11   104 use strict;
  11         19  
  11         398  
6             #ABSTRACT: Module for determining the parental origin of specific SNPs based on genotype data.
7 11     11   5707 use Moose; #Obect system. Also turns on strict and warnings.
  0            
  0            
8             use namespace::autoclean; #"Keep imports out of your namespace"
9             use IO::File; #Provides file methods (in lieu of <> )
10             use Carp qw{ confess }; #Die with stack trace
11             use List::MoreUtils qw{ all any none uniq zip }; #list-related functions
12             #use Smart::Comments '###'; #Used for debugging
13              
14             #****************************************************************************#
15             # BUILD (START)#
16             # (This allows us to perform some tasks just after the object is created #
17             # but before it is returned to the user.) #
18             #----------------------------------------------------------------------------#
19             sub BUILD{
20             my $self = shift;
21              
22             #Get manifest data. Should die if there is an error.
23             $self->_process_manifest_file();
24              
25             #Process data file and output summary and abh results
26             $self->_output_summary_and_abh_files();
27              
28             return;
29             }
30             #----------------------------------------------------------------------------#
31             # BUILD (END)#
32             #****************************************************************************#
33              
34              
35             #****************************************************************************#
36             # CONSTANTS (START)#
37             #----------------------------------------------------------------------------#
38             my $EMPTY_STRING = q{};
39              
40             #Inheritance constants
41             my $MISSING = q{-};
42             my $MISSING_PARENT = $MISSING x 2;
43             my $NONPARENTAL = q{!};
44             my $NONPARENTAL_F1 = $NONPARENTAL x 2;
45             my $POLYM = q{%};
46             my $ALLELE_A = q{A};
47             my $ALLELE_B = q{B};
48             my $HETEROZYGOUS = q{H};
49             my $NONPOLYM = q{~};
50             my $NONPOLYM_CONFLICT = q{#};
51             #----------------------------------------------------------------------------#
52             # CONSTANTS (END)#
53             #****************************************************************************#
54              
55              
56             #****************************************************************************#
57             # MANIFEST FILE (START)#
58             #----------------------------------------------------------------------------#
59             has 'manifest_filename' => (
60             is=>'ro',
61             isa=>'Str',
62             required => 1,
63             );
64              
65             has '_sample_for' => (
66             traits => ['Hash'],
67             is => 'ro',
68             isa => 'HashRef',
69             default => sub{ {} },
70             handles =>{
71             _get__sample_for => 'get',
72             _sample_ids_from_manifest => 'keys',
73             },
74             );
75              
76             sub _process_manifest_file {
77             my $self = shift;
78             my %sample_for;
79              
80             #Open filehandle from manifest filename
81             my $fh = IO::File->new($self->manifest_filename, '<') or die $@;
82              
83             #discard first line
84             my $first_line = <$fh>;
85              
86             #read in each id with its associated data
87             while ( my $line = <$fh> ) {
88              
89             #remove newline character
90             $line = _sans_newlines($line);
91              
92             #load each value from the line into appropriate variables
93             my ( $id, $name, $group, $parentA, $parentB, $replicate_of,
94             $F1_ancestor )
95             = split /\t/, $line;
96              
97             $id = _trim($id);
98              
99             #confess or next . . .
100             #If $id is an empty string, throw an error if anything but
101             # whitespace exists on the line
102             if ( $id eq $EMPTY_STRING ) {
103             if ( $line =~ m{ \A \s+ \z}xms ) {
104             #Skip this empty line
105             next;
106             }
107             else {
108             confess "Empty id in manifest file: '$line'";
109             }
110             }
111              
112             #confess if hash for this given id already exists
113             confess 'duplicate sample id in manifest file' . "'$id'"
114             if exists $sample_for{$id};
115              
116             #remove leading/trailing whitespace
117             foreach ( $id, $name, $group, $parentA, $parentB, $replicate_of, $F1_ancestor ) {
118             $_ = _trim($_);
119             }
120              
121             $sample_for{$id} = {};
122             my $sample = $sample_for{$id};
123              
124             #only store values that aren't empty
125             _add_if_not_empty( $sample, name => $name );
126             _add_if_not_empty( $sample, group => $group );
127             _add_if_not_empty( $sample, parentA => $parentA );
128             _add_if_not_empty( $sample, parentB => $parentB );
129             _add_if_not_empty( $sample, replicate_of => $replicate_of );
130             _add_if_not_empty( $sample, F1_ancestor => $F1_ancestor );
131              
132             #If this is a replicate of another sample, then store id in
133             # replicates array of the proper sample
134             if($replicate_of){
135             push @{ $sample_for{$replicate_of}{replicates} }, $id;
136             }
137             }
138              
139             #Finished reading data, so close the filehandle
140             $fh->close();
141              
142             #Store reference to the hash containing all of the data
143             $self->{_sample_for} = \%sample_for;
144              
145             return;
146             }
147              
148             #Trim away leading and trailing whitespace
149             sub _trim{
150             my $string = shift;
151              
152             #Don't try substitutions on undef
153             return if ! defined( $string);
154              
155             #Trim all leading whitespace
156             $string =~ s/\A \s+//xms;
157              
158             #Trim all trailing whitespace
159             $string =~ s/\s+ \z//xms;
160              
161             #Return trimmed string
162             return $string;
163             }
164              
165             #Adds key, value pairs to a hash if the value is not empty.
166             sub _add_if_not_empty{
167             my $hashref = shift;
168             my $key = shift;
169             my $value = shift;
170              
171             #Only assign to hash for keys that are defined and not empty
172             if(defined($value) && $value ne $EMPTY_STRING){
173             ${$hashref}{$key} = $value;
174             }
175             return $hashref;
176             }
177             #----------------------------------------------------------------------------#
178             # MANIFEST FILE (END)#
179             #****************************************************************************#
180              
181              
182             #****************************************************************************#
183             # SUMMARIZE DATA (START)#
184             #----------------------------------------------------------------------------#
185              
186             #Must be specified in the constructor. This is indicated here by (1) making it
187             # required and by (2) not giving it a default nor a builder
188             has 'data_filename' => (
189             is => 'ro',
190             isa => 'Str',
191             required => 1,
192             );
193              
194             # leading underscore indicates a private attribute by convention
195             has '_data_fh' => (
196             is => 'ro',
197             isa => 'IO::File',
198             lazy => 1,
199             required => 1,
200             builder => '_build__data_fh',
201             );
202              
203             sub _get_data_line{
204             my $self = shift;
205             return $self->_data_fh->getline();
206             }
207              
208             #sub name = '_build_' . '_data_fh'
209             sub _build__data_fh {
210             my $self = shift;
211             my $fh = IO::File->new( $self->data_filename(), '<' ) or die $@;
212             return $fh;
213             }
214              
215             has 'sample_ids' => (
216             traits => ['Array'],
217             is => 'ro',
218             isa => 'ArrayRef',
219             lazy => 1,
220             required => 1,
221             builder => '_build_sample_ids',
222             handles => {
223             _ids_from_data_file => 'elements',
224             },
225             );
226              
227             sub _build_sample_ids {
228             my $self = shift;
229              
230             #skip all lines until finding the [Data] label.
231             while ( my $line = $self->_data_fh->getline() ) {
232             last if $line =~ /\[ data \]/xmsi; #i makes it case insensitive
233             }
234              
235             #read in line containing ids
236             my $line_of_ids = $self->_data_fh->getline();
237             if ( !defined $line_of_ids ) {
238             my $dying_message =
239             'either no sample ids or no "[Data]" tag found in file: '
240             . $self->data_filename;
241             confess $dying_message;
242             }
243              
244             #remove newline(s)
245             $line_of_ids = _sans_newlines($line_of_ids);
246              
247             #extract ids from line, ignoring blank field at the beginning
248             my ( undef, @ids ) = split /\t/, $line_of_ids;
249             return \@ids;
250             }
251              
252             has '_columns_for' => (
253             traits => ['Hash'],
254             is => 'ro',
255             isa => 'HashRef',
256             lazy => 1,
257             builder => '_build__columns_for',
258             handles => { _columns_for_id_aref => 'get', },
259             );
260              
261             #getting columns corresponding to given id.
262             sub _get_columns_for{
263             my $self = shift;
264             my $id = shift;
265             confess if ! defined($id);
266             return @{ $self->_columns_for_id_aref($id) };
267             }
268              
269             sub _build__columns_for{
270             my $self = shift;
271              
272             #get all of the sample ids, in their column order
273             my @data_ids = $self->_ids_from_data_file();
274              
275             #get all of the column indices
276             my @indices = (0 .. $#data_ids);
277              
278             #store column indicdes
279             my @columns_aref;
280             for my $index (0 .. $#indices){
281             push @columns_aref, [ $index ];
282             }
283              
284             #Create hash to hold column info
285             my %columns_for = zip(@data_ids, @columns_aref);
286              
287             #-------------------------------------------------------------------------
288             #For each sample that has replicates, add column indices of the replicates
289             # This just checks ids from the manifest file, since they are the only
290             # ones that have replicate information.
291             for my $manifest_id ( $self->_sample_ids_from_manifest() ) {
292              
293             my $sample = $self->{_sample_for}->{$manifest_id};
294              
295             #if sample has replicates, add column indices of replicates
296             if ( defined $sample->{replicates} ) {
297             for my $replicate ( @{ $sample->{replicates} } ) {
298             if( defined $columns_for{$replicate}){
299             push @{ $columns_for{$manifest_id} }, @{ $columns_for{$replicate} };
300             }
301             }
302             }
303             }
304             #-------------------------------------------------------------------------
305              
306             return \%columns_for;
307             }
308              
309             sub _in_group{
310             my $self = shift;
311             my $group_regex = shift;
312             my $id = shift;
313             my $sample = $self->{_sample_for}->{$id};
314             return 0 if(!defined $sample);
315             if($sample->{group} =~ / $group_regex /xms){
316             return 1;
317             }else{
318             return 0;
319             }
320             }
321              
322             sub _parented_href{
323             my $self = shift;
324             my %parented = (
325             names => [],
326             ids => [],
327             groups => [],
328             parentA_names => [],
329             parentB_names => [],
330             F1_ancestors => [],
331             );
332              
333              
334             GROUPS_OF_CODES:
335             for my $id ($self->_ids_from_data_file()) {
336             my $sample = $self->{_sample_for}->{$id};
337              
338             #Skip this id, if no parentA
339             next if ! defined( $sample->{parentA});
340              
341             if ( defined( $sample->{parentB} ) || defined( $sample->{F1_ancestor} )) {
342              
343             my $parentA_id = $sample->{parentA};
344             my $parentA_name = $self->{_sample_for}->{$parentA_id}->{name};
345              
346             #Get $parentB_id.
347             #Change it to undef if it is the empty string
348             my $parentB_id = $sample->{parentB} || undef;
349              
350             my $F1_ancestor_id = $sample->{F1_ancestor} || undef;
351              
352             my $parentB_name;
353             if ($parentB_id) {
354             $parentB_name = $self->{_sample_for}->{$parentB_id}->{name};
355             }
356             elsif ($F1_ancestor_id) {
357             $parentB_name =
358             '(F1)' . $self->{_sample_for}->{$F1_ancestor_id}->{name};
359             }
360              
361             push @{ $parented{ids} }, $id;
362             push @{ $parented{parentA_names} }, $parentA_name;
363             push @{ $parented{parentB_names} }, $parentB_name;
364             push @{ $parented{groups} }, $sample->{group};
365             push @{ $parented{names} }, $sample->{name};
366             }
367             else{
368             #Skip this one since it doesn't have a parentB or an F1 ancestor
369             next;
370             }
371             }
372             return \%parented;
373             }
374              
375             sub _output_summary_and_abh_files {
376             my $self = shift;
377              
378             #Column headers for Summary output file
379             $self->_summary_fh->print($self->_summary_header());
380              
381             my @all_sample_ids = $self->_ids_from_data_file;
382              
383             my $nam_regex = qr{ NAM [ ]* F1}xms;
384             my @nam_f1_ids =
385             grep { $self->_in_group( $nam_regex, $_ ) } @all_sample_ids;
386              
387             my %parented = %{ $self->_parented_href() };
388              
389             #Columns headers for ABH output file
390             for my $item (qw{ ids parentA_names parentB_names groups names}) {
391             my $abh_header_row = "\t" . join "\t", @{ $parented{$item} };
392             $self->_abh_fh->print($abh_header_row . "\n");
393             }
394              
395             #Read data from file and analyze it one line at a time
396             # (if already done, this will be skipped due to filehandle
397             # already being at the end of the file)
398             while ( my $line = $self->_get_data_line() ) {
399              
400             #Remove newline character
401             $line = _sans_newlines($line);
402              
403             #Extract SNP name and the data from this line
404             my ( $snp, @genotypes ) = split /\t/, $line;
405              
406             #Remove whitespace at beginning and end of SNP name
407             $snp = _trim($snp);
408              
409             #Skip blank lines
410             next if $snp eq $EMPTY_STRING;
411              
412             #Print SNP name in first column of current line of ABH output file
413             $self->_abh_fh->print($snp);
414              
415             #Tally occurrences of eacy genotype
416             my @summarized_genotypes = _summarize_genotypes(@genotypes);
417              
418             #Associated data ids with genotypes
419             my %genotype_for = zip( @all_sample_ids, @genotypes );
420              
421             #Initialize counters with zeros. Thus zeros will show up in reports.
422             my $inconsistent_count = 0;
423             my %num_of = (
424             parentA => {
425             $POLYM => 0,
426             $MISSING => 0,
427             },
428             parentB => {
429             $POLYM => 0,
430             $MISSING => 0,
431             },
432             F1 => { $MISSING => 0, },
433             );
434              
435             #----------------------------------------------------------------START
436             #Summarize this one line of data for all of the NAM F1's
437             #---------------------------------------------------------------------
438             NAM_F1_LOOP:
439             for my $sample_id (@nam_f1_ids) {
440              
441             my $sample = $self->_get__sample_for($sample_id);
442             my $sample_gt = $genotype_for{$sample_id};
443              
444             #For our current purposes, we will not evaluate an F1 unless there is data for both parents.
445             my $both_parents_present =
446             defined $sample->{parentA} && defined $sample->{parentB};
447              
448             next NAM_F1_LOOP if (! $both_parents_present);
449              
450             my @genotype_chars;
451             my $parents_missing_data_for_this_sample_count;
452             my %gt_chars_of;
453             for my $parent (qw{ parentA parentB}) {
454             my @parent_columns =
455             $self->_get_columns_for( $sample->{$parent} );
456             my @parent_genotypes = @genotypes[@parent_columns];
457              
458             #_nonredundant_chars
459             my @parent_gt_chars = @{ $gt_chars_of{$parent} } =
460             _nonredundant_chars(@parent_genotypes);
461             push @genotype_chars, @parent_gt_chars;
462              
463             my $num_parent_char = @parent_gt_chars;
464             if ( $num_parent_char > 1 ) { $num_of{$parent}{$POLYM}++; }
465             elsif ( $num_parent_char == 1 ) {
466             if ( $parent_gt_chars[0] eq $MISSING ) {
467             $num_of{$parent}{$MISSING}++;
468             $parents_missing_data_for_this_sample_count++;
469             }
470             }
471             elsif ( $_ < 1 ) {
472             confess
473             '_nonredundant_chars should return at least one char';
474             }
475             }
476              
477             #Keep track of how many F1s are missing data
478             if ( $sample_gt eq $MISSING x 2 ) {
479             $num_of{F1}{$MISSING}++;
480             }
481             elsif (
482             !_can_be_F1_of(
483             $sample_gt, $gt_chars_of{parentA},
484             $gt_chars_of{parentB}
485             )
486             )
487             {
488             $inconsistent_count++;
489             }
490             }
491             my $sum_of_odds_and_ends = $inconsistent_count
492             + $num_of{parentA}{$POLYM}
493             + $num_of{parentB}{$POLYM}
494             + $num_of{parentA}{$MISSING}
495             + $num_of{parentB}{$MISSING};
496              
497             my $summary_line = join "\t", $snp, @summarized_genotypes,
498             $inconsistent_count, $num_of{parentA}{$POLYM},
499             $num_of{parentB}{$POLYM}, $num_of{parentA}{$MISSING},
500             $num_of{parentB}{$MISSING}, $sum_of_odds_and_ends,
501             $num_of{F1}{$MISSING};
502              
503             $self->_summary_fh->print($summary_line . "\n");
504             #---------------------------------------------------------------------
505             #Summarize this one line of data for all of the NAM F1's
506             #------------------------------------------------------------------END
507              
508              
509             #Output inheritance codes
510             #First step: calculate for those with two parents
511             #Second step: calculate for those with parentA and one F1 ancestor
512             my $abh_line = $EMPTY_STRING;
513              
514             #----------------------------------------------------------------START
515             #Calculate inheritance codes for everything with two parents
516             #---------------------------------------------------------------------
517             for my $sample_id ( @{ $parented{ids} } ) {
518              
519             my $sample = $self->_get__sample_for($sample_id);
520             my $sample_gt = $genotype_for{$sample_id};
521             my $parentA_id = $sample->{parentA};
522             my @parentA_columns = $self->_get_columns_for($parentA_id);
523             my @parentA_genotypes = @genotypes[@parentA_columns];
524              
525             my $sample_abh;
526             if ( $sample->{parentB} ) {
527             my $parentB_id = $sample->{parentB};
528             my @parentB_columns = $self->_get_columns_for($parentB_id);
529             my @parentB_genotypes = @genotypes[@parentB_columns];
530              
531             $sample_abh = _calculate_abh( $sample_gt, \@parentA_genotypes,
532             \@parentB_genotypes );
533             }elsif($sample->{F1_ancestor}){
534             my $F1_ancestor_id = $sample->{F1_ancestor};
535             my @F1_columns = $self->_get_columns_for($F1_ancestor_id);
536             my @F1_genotypes = @genotypes[@F1_columns];
537             $sample_abh = _calculate_abh_from_F1( $sample_gt, \@parentA_genotypes,
538             \@F1_genotypes);
539             }
540             $abh_line .= "\t" . $sample_abh;
541             }
542              
543             #print abh_line to file and add newline
544             $self->_abh_fh->print($abh_line ."\n");
545             #------------------------------------------------------------------END
546              
547              
548             }
549             $self->_summary_fh->flush();
550             $self->_summary_fh->close();
551             $self->_abh_fh->flush();
552             $self->_abh_fh->close();
553              
554             return;
555             }
556              
557              
558             has '_abh_fh' => (
559             is => 'ro',
560             isa => 'IO::File',
561             lazy => 1,
562             required => 1,
563             builder => '_build__abh_fh',
564             );
565              
566             sub _build__abh_fh {
567             my $self = shift;
568             my $filename = $self->data_filename() . '_abh.tab';
569             my $fh = IO::File->new( $filename, '>' ) or die $@;
570             return $fh;
571             }
572              
573             has '_summary_fh' => (
574             is => 'ro',
575             isa => 'IO::File',
576             lazy => 1,
577             required => 1,
578             builder => '_build__summary_fh',
579             );
580              
581             sub _build__summary_fh {
582             my $self = shift;
583             my $filename = $self->data_filename() . '_summary.tab';
584             my $fh = IO::File->new( $filename, '>' ) or die $@;
585             return $fh;
586             }
587              
588             sub _sorted_characters{
589             my $string = shift;
590             $string =~ s/ \s //xms;
591             #mea culpa
592             return join $EMPTY_STRING, sort split $EMPTY_STRING, $string;
593              
594             }
595              
596             sub _sort_and_join{
597             my @chars = @_;
598             return join $EMPTY_STRING, sort @chars;
599             }
600              
601             sub _can_be_F1_of{
602             my $sample_gt = shift;
603             my $parentA_aref = shift; # 'aref' means array reference
604             my $parentB_aref = shift;
605             my @parentA_chars = _nonredundant_chars( @{$parentA_aref} );
606             my @parentB_chars = _nonredundant_chars( @{$parentB_aref} );
607              
608             #If parentA or parentB only have missing data, then they could be
609             # anything
610             if (all { $_ eq $MISSING } @parentA_chars){
611             @parentA_chars = qw{ A C G T - };
612             }
613             if (all { $_ eq $MISSING } @parentB_chars){
614             @parentB_chars = qw{ A C G T - };
615             }
616              
617             my $sorted_sample_gt = _sorted_characters($sample_gt);
618             return 1 if $sorted_sample_gt eq $MISSING x 2;
619              
620             for my $parentA_char (@parentA_chars){
621             for my $parentB_char (@parentB_chars){
622             my $expected_gt = _sort_and_join($parentA_char, $parentB_char);
623             return 1 if($expected_gt eq $sorted_sample_gt);
624             }
625             }
626              
627             #If nothing matched, return false
628             return 0;
629             }
630              
631              
632             sub _calculate_abh_from_F1 {
633             my $sample_gt = shift;
634             my $parentA_aref = shift;
635             my $F1_aref = shift;
636              
637             _calculate_abh($sample_gt, $parentA_aref, $F1_aref, 'F1');
638             }
639              
640             sub _remove_matching_char{
641             my $char = shift;
642             my @array = @_;
643             my @final_array;
644             for(@array){
645             next if $_ eq $char;
646             push @final_array, $_;
647             }
648             return @final_array;
649             }
650              
651             sub _equivalent_arrays{
652             my $first_aref = shift;
653             my $second_aref = shift;
654             my @first = @{ $first_aref };
655             my @second = @{ $second_aref };
656              
657             my $same_size = scalar @first == scalar @second;
658             return if ! $same_size;
659              
660             for (0 .. $#first ){
661             return if $first[$_] ne $second[$_] ;
662             }
663             return 1;
664             }
665              
666             sub _calculate_abh {
667             my $sample_gt = shift; # 'gt' means genotype
668             my $parentA_aref = shift; # 'aref' means array reference
669             my $parentB_aref = shift;
670             my $parentB_is_F1 = shift;
671             my @parentA_chars = _nonredundant_chars( @{$parentA_aref} );
672             my @parentB_chars = _nonredundant_chars( @{$parentB_aref} );
673              
674             #If genotype has one or more $MISSING alleles, then return $MISSING
675             return $MISSING if ( $sample_gt =~ m{ $MISSING }xms );
676              
677             #Find out if the parents have exactly the same genotype
678             my $parents_same;
679             if ( _equivalent_arrays(\@parentA_chars, \@parentB_chars )) {
680             $parents_same = 1;
681             }
682              
683             #Return 'missing' if one of the parents have only missing data
684             #Return error code if sample has alleles not found in parents
685             #Must check in this order, otherwise having no parental info will be
686             # misdiagnosed as having a nonparental allele
687             if ( @parentA_chars == 1 && $parentA_chars[0] eq $MISSING ) {
688             return $MISSING_PARENT;
689             }
690             elsif ( @parentB_chars == 1 && $parentB_chars[0] eq $MISSING ) {
691             return $MISSING_PARENT;
692             }
693             else {
694              
695             #combine parental character sets
696             my @both_parent_chars =
697             _nonredundant_chars( @parentA_chars, @parentB_chars );
698              
699             #Check if the sample genotype can have descended from parents
700             if ( !_is_comprised_from( $sample_gt, @both_parent_chars ) ) {
701             if ($parents_same) {
702             return $NONPOLYM_CONFLICT;
703             }
704             else {
705             return $NONPARENTAL;
706             }
707             }
708             else {
709              
710             #Good! No errors and no missing data.
711             }
712             }
713              
714             if ( !$parentB_is_F1 ) {
715              
716             #If genotype only contains alleles from parents, then assign A, B, or H
717             my $allele_A_count = 0;
718             my $allele_B_count = 0;
719              
720             #Check if both parents are nonpolymorphic
721             if ( @parentA_chars == 1 && @parentB_chars == 1 ) {
722             my $parentA_char = $parentA_chars[0];
723             my $parentB_char = $parentB_chars[0];
724              
725             #Indicate noninformative if parents are the same
726             return $NONPOLYM if ($parents_same);
727              
728             #Extract characters from sample genotype
729             my @sample_chars = _chars_from($sample_gt);
730              
731             #Tally how many are from which parent
732             for my $sample_char (@sample_chars) {
733             if($sample_char eq $parentA_char){
734             $allele_A_count++;
735             }
736             elsif ($sample_char eq $parentB_char) { $allele_B_count++;}
737             }
738              
739             if ( $allele_A_count == 2 ) {
740             return $ALLELE_A;
741             }
742             elsif ( $allele_B_count == 2 ) {
743             return $ALLELE_B;
744             }
745             elsif ( $allele_A_count == 1 && $allele_B_count == 1 ) {
746             return $HETEROZYGOUS;
747             }
748             else {
749              
750             #Since we already checked for values that were missing or
751             # nonparental, this should be unreachable
752             my $dying_message = "error calculating 'ABH' from:\n"
753             . join "'\n",
754             q{parentA => '} . join( "' '", @{$parentA_aref} ),
755             q{parentB => '} . join( "' '", @{$parentB_aref} ),
756             q{sample genotype =>'} . $sample_gt;
757              
758             confess $dying_message;
759             }
760             }
761             else {
762              
763             #indicate that it is noninformative because of a polymorphic parent
764             return $POLYM;
765             }
766             }
767             else {
768             for my $F1_gt ( @{$parentB_aref} ) {
769             return $NONPARENTAL_F1
770             if !_can_be_F1_of( $F1_gt, $parentA_aref, [$MISSING] );
771             }
772              
773             return $POLYM if @parentA_chars > 1;
774              
775             if (@parentB_chars == 2) {
776             my $parentA_char = $parentA_chars[0];
777             my @revised_parentB_chars =
778             _remove_matching_char( $parentA_char, @parentB_chars );
779             return _calculate_abh( $sample_gt, $parentA_aref,
780             \@revised_parentB_chars );
781             }
782             elsif (@parentB_chars == 1) {
783             if ( _equivalent_arrays(\@parentB_chars, \@parentA_chars )) {
784             return _calculate_abh( $sample_gt, $parentA_aref,
785             $parentB_aref );
786             }
787             else {
788             ### $sample_gt
789             ### @parentA_chars
790             ### @parentB_chars
791             confess "I didn't think this was reachable";
792             }
793             }
794             }
795             }
796              
797             sub _summary_header{
798             return "\t" . join("\t", qw{ AA AC AG AT CC CG CT GG GT TT --
799             inconsistent parentA_polymorphic
800             parentB_polymorphic parentA_unknown
801             parentB_unknown sum_of_odds_and_ends
802             F1_missing }) . "\n";
803             }
804              
805             sub _summarize_genotypes {
806             my @data = @_;
807             my @genotypes = qw{ AA AC AG AT CC CG CT GG GT TT -- };
808             my %count;
809              
810             #Initialize hash %count, so that zeros show up in the summary
811             for my $genotype (@genotypes) {
812             $count{$genotype} = 0;
813             }
814              
815             #Count occurrences of each genotype
816             for my $datum (@data){
817              
818             #Remove any leading or trailing whitespace
819             $datum = _trim($datum);
820              
821             #Get just the first two characters of each data string
822             my $sorted_char_datum = _sorted_first_two_char($datum);
823              
824             #Add one to the count for this genotype
825             $count{$sorted_char_datum}++;
826             }
827              
828             #Return an array of just the specified genotypes
829             # (this is a 'hash slice')
830             return @count{ @genotypes };
831             }
832             #----------------------------------------------------------------------------#
833             # SUMMARIZE DATA (END)#
834             #****************************************************************************#
835              
836             #****************************************************************************#
837             # GENERAL UTILITIES #
838             #----------------------------------------------------------------------------#
839              
840             sub _nonredundant_chars {
841             my @strings = @_;
842             confess 'arguments required' if( @strings < 1);
843             for( 0 .. $#strings){
844             $strings[$_] = _sans_newlines($strings[$_]);
845             }
846              
847             #Get just the unique characters out of the strings
848             my $combined = join $EMPTY_STRING, @strings;
849             my @single_chars = uniq _chars_from($combined);
850              
851             #Remove 'missing data' and empty characters
852             my @filtered_chars =
853             grep { $_ ne $MISSING && $_ ne $EMPTY_STRING } @single_chars;
854              
855             #Return $MISSING if there aren't any left
856             return $MISSING if(@filtered_chars == 0);
857              
858             #Check that the characters are all DNA bases
859             my $DNA_BASES = qr{ [ACGT] }xms;
860             for my $filtered_char(@filtered_chars){
861              
862             #Throw exception if character is not a valid DNA base
863             if($filtered_char !~ m{\A $DNA_BASES \z}xms){
864             confess "Unknown character: '$filtered_char'";
865             }
866             }
867              
868             #Return nonredundant characters, now that they've all been checked
869             return @filtered_chars;
870             }
871              
872             # Are all of the characters in a string a subset of the characters found in
873             # this array of strings?
874             sub _is_comprised_from{
875             #Get the string to be analyzed, followed by strings with acceptable chars
876             my ($string, @acceptable_strings) = @_;
877              
878             #Put all of the characters from all of these strings into one array
879             my @acceptable_chars;
880             for my $acceptable_string( @acceptable_strings){
881             push @acceptable_chars, uniq _chars_from($acceptable_string);
882             }
883              
884             confess 'Empty string found'
885             if _is_an_element_of($EMPTY_STRING, @acceptable_chars);
886              
887             #Get all of the characters out of the string to be analyzed
888             my @chars = uniq _chars_from($string);
889              
890             #If any character is not acceptable, return false
891             for my $char (@chars){
892             if(! _is_an_element_of($char, @acceptable_chars)){
893             return 0;
894             }
895             }
896              
897             #Since no unacceptable character was found, return true
898             return 1;
899             }
900              
901             sub _is_an_element_of{
902              
903             my $query = shift;
904             my @elements = @_;
905             for my $element( @elements){
906             return 1 if $query eq $element;
907             }
908              
909             return 0;
910             }
911              
912             sub _chars_from{
913             my @strings = shift;
914              
915             my @string_chars;
916             for my $string (@strings){
917             push @string_chars, split $EMPTY_STRING, $string;
918             }
919              
920             #Return the individual characters from the string as an array
921             return @string_chars;
922             }
923              
924             sub _sorted_first_two_char {
925             my $string = shift;
926              
927             #Get first two characters from the string
928             my $first_two_char = substr $string, 0, 2;
929              
930             #Sort these two characters alphabetically
931             my @sorted_char = sort { $a cmp $b } split $EMPTY_STRING, $first_two_char;
932              
933             #Concatenate and return the characters
934             return join $EMPTY_STRING, @sorted_char;
935             }
936              
937             sub _sans_newlines{
938             my $string = shift;
939             chomp $string;
940              
941             $string =~ s{ \r }{}xms;
942              
943             my $string_sans_newlines = $string;
944              
945             return $string_sans_newlines;
946             }
947              
948              
949             __PACKAGE__->meta->make_immutable;
950              
951             1; # End of Bio::SNP::Inherit
952              
953             __END__
954             =head1 NAME
955              
956             =head1 SYNOPSIS
957              
958             =head1 VERSION
959              
960             version 0.002
961              
962             my $foo = Bio::SNP::Inherit->new(
963             manifest_filename => 'manifest.tab',
964             data_filename => 'data.tab'
965             );
966              
967             #Upon object construction, this outputs a summary file
968             # 'data.tab_summary.tab' and a detailed file 'data.tab_abh.tab'
969             # containing parental allele designations for each sample that has
970             # parents defined for it in the manifest file
971              
972             =head1 DESCRIPTION
973              
974             This is a module for converting Single Nucleotide Polymorphism (SNP) genotype
975             data to parental allele designations. This helps with creating files suitable
976             for mapping, identifying and characterizing crossovers, and also helps with
977             quality control.
978              
979             =head1 SUBROUTINES/METHODS
980              
981             =head2 BUILD
982              
983             Since the integrity of the data in the manifest file is absolutely vital,
984             building an object fails if there are duplicate sample ids in the
985             manifest file.
986              
987             =head1 ATTRIBUTES
988              
989             =head2 manifest_filename
990              
991             Name of the file containing information for each sample id
992              
993             Required in the constructor
994              
995             The first line contains headers and the remaining lines contain
996             tab-delimited fields in the following order:
997              
998             sample id or "Institute Sample Label" (e.g. "WG0096796-DNAA05" )
999             sample name or "Sample name" (e.g. "B73xB97" )
1000             group name or "Group" (e.g. "NAM F1" )
1001             parentA or "Mother" (e.g. "WG0096795-DNAA01" )
1002             parentB or "Father" (e.g. "WG0096796-DNAF01" )
1003             replicate of or "Replicate(s)" (id of sample that this replicates
1004             e.g. "WG0096796-DNAA05" )
1005             AxB F1 or "F1 of parentA and parentB" (e.g. "WG0096795-DNAA02" )
1006              
1007             The last four fields can be blank, if they are not applicable. However,
1008             being blank when they are applicable will result in failure of the
1009             program to analyze the data properly
1010              
1011             =head2 data_filename
1012              
1013             Name of the tab-delimited file containing the data to be processed.
1014              
1015             Required in the constructor.
1016              
1017             The text '[Data]' in a line indicates that remaining lines are all data.
1018             The next line contains column headers, which are in fact the sample ids.
1019             Sample ids missing from the manifest file will not be processed.
1020             The next line contains the name of the SNP in the first field and data in
1021             the remaining fields.
1022              
1023             Data must be in the format of SNP_name{tab}AA{tab}GG{tab}.
1024              
1025             =head1 OUTPUT FILES
1026              
1027             Upon object construction, two files are produced: one that summarizes the
1028             input and another that that describes the genotypes of samples in terms of
1029             their "parents". For example, a sample with a genotype of "CG" whose
1030             'parentA' has a genotype of "CC" and whose 'parentB' has a genotype of
1031             "GG" would have a heterozygous genotype, labeled as 'H'.
1032              
1033             Here are the possible allele designations that result:
1034              
1035             Allele designations for informative genotypes:
1036             A = parentA genotype
1037             B = parentB genotype
1038             H = heterozygous genotype
1039              
1040             Allele designations for noninformative genotypes:
1041             ~ = nonpolymorphic parents (i.e. both parents have same genotype)
1042             - = missing data
1043             -- = missing data for at least one parental
1044             % = polymorphic parent
1045              
1046             Error codes:
1047             # = conflict of nonpolymorphic expectation, meaning both parents
1048             have the same genotype, but the sample has a different
1049             genotype. For example, parentA and parentB both have the
1050             genotype 'CC', but the sample has a genotype of 'TT'.
1051              
1052             ! = nonparental genotype, meaning each parent has a different
1053             genotype, but the sample has at least one allele not seen
1054             in either parent. For example, getting 'AG' for the
1055             offspring when the parents have 'GG' and 'TT'.
1056             (This should not even be seen when the data was obtained
1057             from a biallelic assay.)
1058              
1059             !! = genotype of the F1 for parentA x parentB is incongruent with
1060             the genotype for parentA
1061              
1062             See the bundled tests for examples.
1063              
1064             =head1 TODO
1065              
1066             Output report detailing which samples have been processed and in what way.
1067             Also give descendents and ancestor relationships.
1068              
1069             Document ability to process files using F1 and parentA info (i.e. in the
1070             absence of parentB info).
1071              
1072             Add simple means of adding map info so that distances and chromosomes are
1073             output along with the marker names.
1074              
1075             Give crossover info?
1076              
1077             Give introgressions/regions attributable to specific ancestor(s).
1078              
1079             Use benchmarking to find out which (if any) to memoize:
1080             _nonredundant_chars
1081             _trim
1082             _is_comprised_from
1083             _sorted_characters
1084             _sort_and_join
1085             _chars_from
1086             _sorted_first_two_char
1087              
1088             Test bad file names
1089              
1090             =head1 DIAGNOSTICS
1091              
1092             TODO
1093              
1094             =head1 CONFIGURATION AND ENVIRONMENT
1095              
1096             TODO
1097              
1098             =head1 DEPENDENCIES
1099              
1100             TODO
1101              
1102             =head1 INCOMPATIBILITIES
1103              
1104             TODO
1105              
1106             =head1 BUGS
1107              
1108             Please report any you find. None have been reported as of the current release.
1109              
1110             =head1 LIMITATIONS
1111              
1112             This is ALPHA code. Use at your own risk. There are some major changes
1113             that I want to do to it.
1114              
1115             Be consciencious with the preparation of your input files (i.e. manifest file
1116             and data file). Correct results depend on correct input files.
1117              
1118             =head1 AUTHOR
1119              
1120             Christopher Bottoms, C<< <molecules at cpan.org> >>
1121              
1122             =head1 SUPPORT
1123              
1124             You can find documentation for this module with the perldoc command.
1125              
1126             perldoc Bio::SNP::Inherit
1127              
1128             =head1 ACKNOWLEDGEMENTS
1129              
1130             =head1 LICENSE AND COPYRIGHT
1131              
1132             This program is free software; you can redistribute it and/or modify it
1133             under the terms of either: the GNU General Public License as published
1134             by the Free Software Foundation; or the Artistic License.
1135              
1136             See http://dev.perl.org/licenses/ for more information.
1137              
1138             Copyright 2010 Christopher Bottoms.
1139              
1140             =cut