File Coverage

blib/lib/Bio/SNP/Inherit.pm

Criterion	Covered	Total	%
statement	5	7	71.4
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	8	10	80.0

line	stmt	sub	time	code
1				package Bio::SNP::Inherit;
2				BEGIN {
3	11	11	302951	$Bio::SNP::Inherit::VERSION = '0.002';
4				}
5	11	11	104	use strict;
	11		19
	11		398
6				#ABSTRACT: Module for determining the parental origin of specific SNPs based on genotype data.
7	11	11	5707	use Moose; #Obect system. Also turns on strict and warnings.
	0
	0
8				use namespace::autoclean; #"Keep imports out of your namespace"
9				use IO::File; #Provides file methods (in lieu of <> )
10				use Carp qw{ confess }; #Die with stack trace
11				use List::MoreUtils qw{ all any none uniq zip }; #list-related functions
12				#use Smart::Comments '###'; #Used for debugging
13
14				#****************************************************************************#
15				# BUILD (START)#
16				# (This allows us to perform some tasks just after the object is created #
17				# but before it is returned to the user.) #
18				#----------------------------------------------------------------------------#
19				sub BUILD{
20				my $self = shift;
21
22				#Get manifest data. Should die if there is an error.
23				$self->_process_manifest_file();
24
25				#Process data file and output summary and abh results
26				$self->_output_summary_and_abh_files();
27
28				return;
29				}
30				#----------------------------------------------------------------------------#
31				# BUILD (END)#
32				#****************************************************************************#
33
34
35				#****************************************************************************#
36				# CONSTANTS (START)#
37				#----------------------------------------------------------------------------#
38				my $EMPTY_STRING = q{};
39
40				#Inheritance constants
41				my $MISSING = q{-};
42				my $MISSING_PARENT = $MISSING x 2;
43				my $NONPARENTAL = q{!};
44				my $NONPARENTAL_F1 = $NONPARENTAL x 2;
45				my $POLYM = q{%};
46				my $ALLELE_A = q{A};
47				my $ALLELE_B = q{B};
48				my $HETEROZYGOUS = q{H};
49				my $NONPOLYM = q{~};
50				my $NONPOLYM_CONFLICT = q{#};
51				#----------------------------------------------------------------------------#
52				# CONSTANTS (END)#
53				#****************************************************************************#
54
55
56				#****************************************************************************#
57				# MANIFEST FILE (START)#
58				#----------------------------------------------------------------------------#
59				has 'manifest_filename' => (
60				is=>'ro',
61				isa=>'Str',
62				required => 1,
63				);
64
65				has '_sample_for' => (
66				traits => ['Hash'],
67				is => 'ro',
68				isa => 'HashRef',
69				default => sub{ {} },
70				handles =>{
71				_get__sample_for => 'get',
72				_sample_ids_from_manifest => 'keys',
73				},
74				);
75
76				sub _process_manifest_file {
77				my $self = shift;
78				my %sample_for;
79
80				#Open filehandle from manifest filename
81				my $fh = IO::File->new($self->manifest_filename, '<') or die $@;
82
83				#discard first line
84				my $first_line = <$fh>;
85
86				#read in each id with its associated data
87				while ( my $line = <$fh> ) {
88
89				#remove newline character
90				$line = _sans_newlines($line);
91
92				#load each value from the line into appropriate variables
93				my ( $id, $name, $group, $parentA, $parentB, $replicate_of,
94				$F1_ancestor )
95				= split /\t/, $line;
96
97				$id = _trim($id);
98
99				#confess or next . . .
100				#If $id is an empty string, throw an error if anything but
101				# whitespace exists on the line
102				if ( $id eq $EMPTY_STRING ) {
103				if ( $line =~ m{ \A \s+ \z}xms ) {
104				#Skip this empty line
105				next;
106				}
107				else {
108				confess "Empty id in manifest file: '$line'";
109				}
110				}
111
112				#confess if hash for this given id already exists
113				confess 'duplicate sample id in manifest file' . "'$id'"
114				if exists $sample_for{$id};
115
116				#remove leading/trailing whitespace
117				foreach ( $id, $name, $group, $parentA, $parentB, $replicate_of, $F1_ancestor ) {
118				$_ = _trim($_);
119				}
120
121				$sample_for{$id} = {};
122				my $sample = $sample_for{$id};
123
124				#only store values that aren't empty
125				_add_if_not_empty( $sample, name => $name );
126				_add_if_not_empty( $sample, group => $group );
127				_add_if_not_empty( $sample, parentA => $parentA );
128				_add_if_not_empty( $sample, parentB => $parentB );
129				_add_if_not_empty( $sample, replicate_of => $replicate_of );
130				_add_if_not_empty( $sample, F1_ancestor => $F1_ancestor );
131
132				#If this is a replicate of another sample, then store id in
133				# replicates array of the proper sample
134				if($replicate_of){
135				push @{ $sample_for{$replicate_of}{replicates} }, $id;
136				}
137				}
138
139				#Finished reading data, so close the filehandle
140				$fh->close();
141
142				#Store reference to the hash containing all of the data
143				$self->{_sample_for} = \%sample_for;
144
145				return;
146				}
147
148				#Trim away leading and trailing whitespace
149				sub _trim{
150				my $string = shift;
151
152				#Don't try substitutions on undef
153				return if ! defined( $string);
154
155				#Trim all leading whitespace
156				$string =~ s/\A \s+//xms;
157
158				#Trim all trailing whitespace
159				$string =~ s/\s+ \z//xms;
160
161				#Return trimmed string
162				return $string;
163				}
164
165				#Adds key, value pairs to a hash if the value is not empty.
166				sub _add_if_not_empty{
167				my $hashref = shift;
168				my $key = shift;
169				my $value = shift;
170
171				#Only assign to hash for keys that are defined and not empty
172				if(defined($value) && $value ne $EMPTY_STRING){
173				${$hashref}{$key} = $value;
174				}
175				return $hashref;
176				}
177				#----------------------------------------------------------------------------#
178				# MANIFEST FILE (END)#
179				#****************************************************************************#
180
181
182				#****************************************************************************#
183				# SUMMARIZE DATA (START)#
184				#----------------------------------------------------------------------------#
185
186				#Must be specified in the constructor. This is indicated here by (1) making it
187				# required and by (2) not giving it a default nor a builder
188				has 'data_filename' => (
189				is => 'ro',
190				isa => 'Str',
191				required => 1,
192				);
193
194				# leading underscore indicates a private attribute by convention
195				has '_data_fh' => (
196				is => 'ro',
197				isa => 'IO::File',
198				lazy => 1,
199				required => 1,
200				builder => '_build__data_fh',
201				);
202
203				sub _get_data_line{
204				my $self = shift;
205				return $self->_data_fh->getline();
206				}
207
208				#sub name = '_build_' . '_data_fh'
209				sub _build__data_fh {
210				my $self = shift;
211				my $fh = IO::File->new( $self->data_filename(), '<' ) or die $@;
212				return $fh;
213				}
214
215				has 'sample_ids' => (
216				traits => ['Array'],
217				is => 'ro',
218				isa => 'ArrayRef',
219				lazy => 1,
220				required => 1,
221				builder => '_build_sample_ids',
222				handles => {
223				_ids_from_data_file => 'elements',
224				},
225				);
226
227				sub _build_sample_ids {
228				my $self = shift;
229
230				#skip all lines until finding the [Data] label.
231				while ( my $line = $self->_data_fh->getline() ) {
232				last if $line =~ /\[ data \]/xmsi; #i makes it case insensitive
233				}
234
235				#read in line containing ids
236				my $line_of_ids = $self->_data_fh->getline();
237				if ( !defined $line_of_ids ) {
238				my $dying_message =
239				'either no sample ids or no "[Data]" tag found in file: '
240				. $self->data_filename;
241				confess $dying_message;
242				}
243
244				#remove newline(s)
245				$line_of_ids = _sans_newlines($line_of_ids);
246
247				#extract ids from line, ignoring blank field at the beginning
248				my ( undef, @ids ) = split /\t/, $line_of_ids;
249				return \@ids;
250				}
251
252				has '_columns_for' => (
253				traits => ['Hash'],
254				is => 'ro',
255				isa => 'HashRef',
256				lazy => 1,
257				builder => '_build__columns_for',
258				handles => { _columns_for_id_aref => 'get', },
259				);
260
261				#getting columns corresponding to given id.
262				sub _get_columns_for{
263				my $self = shift;
264				my $id = shift;
265				confess if ! defined($id);
266				return @{ $self->_columns_for_id_aref($id) };
267				}
268
269				sub _build__columns_for{
270				my $self = shift;
271
272				#get all of the sample ids, in their column order
273				my @data_ids = $self->_ids_from_data_file();
274
275				#get all of the column indices
276				my @indices = (0 .. $#data_ids);
277
278				#store column indicdes
279				my @columns_aref;
280				for my $index (0 .. $#indices){
281				push @columns_aref, [ $index ];
282				}
283
284				#Create hash to hold column info
285				my %columns_for = zip(@data_ids, @columns_aref);
286
287				#-------------------------------------------------------------------------
288				#For each sample that has replicates, add column indices of the replicates
289				# This just checks ids from the manifest file, since they are the only
290				# ones that have replicate information.
291				for my $manifest_id ( $self->_sample_ids_from_manifest() ) {
292
293				my $sample = $self->{_sample_for}->{$manifest_id};
294
295				#if sample has replicates, add column indices of replicates
296				if ( defined $sample->{replicates} ) {
297				for my $replicate ( @{ $sample->{replicates} } ) {
298				if( defined $columns_for{$replicate}){
299				push @{ $columns_for{$manifest_id} }, @{ $columns_for{$replicate} };
300				}
301				}
302				}
303				}
304				#-------------------------------------------------------------------------
305
306				return \%columns_for;
307				}
308
309				sub _in_group{
310				my $self = shift;
311				my $group_regex = shift;
312				my $id = shift;
313				my $sample = $self->{_sample_for}->{$id};
314				return 0 if(!defined $sample);
315				if($sample->{group} =~ / $group_regex /xms){
316				return 1;
317				}else{
318				return 0;
319				}
320				}
321
322				sub _parented_href{
323				my $self = shift;
324				my %parented = (
325				names => [],
326				ids => [],
327				groups => [],
328				parentA_names => [],
329				parentB_names => [],
330				F1_ancestors => [],
331				);
332
333
334				GROUPS_OF_CODES:
335				for my $id ($self->_ids_from_data_file()) {
336				my $sample = $self->{_sample_for}->{$id};
337
338				#Skip this id, if no parentA
339				next if ! defined( $sample->{parentA});
340
341				if ( defined( $sample->{parentB} ) \|\| defined( $sample->{F1_ancestor} )) {
342
343				my $parentA_id = $sample->{parentA};
344				my $parentA_name = $self->{_sample_for}->{$parentA_id}->{name};
345
346				#Get $parentB_id.
347				#Change it to undef if it is the empty string
348				my $parentB_id = $sample->{parentB} \|\| undef;
349
350				my $F1_ancestor_id = $sample->{F1_ancestor} \|\| undef;
351
352				my $parentB_name;
353				if ($parentB_id) {
354				$parentB_name = $self->{_sample_for}->{$parentB_id}->{name};
355				}
356				elsif ($F1_ancestor_id) {
357				$parentB_name =
358				'(F1)' . $self->{_sample_for}->{$F1_ancestor_id}->{name};
359				}
360
361				push @{ $parented{ids} }, $id;
362				push @{ $parented{parentA_names} }, $parentA_name;
363				push @{ $parented{parentB_names} }, $parentB_name;
364				push @{ $parented{groups} }, $sample->{group};
365				push @{ $parented{names} }, $sample->{name};
366				}
367				else{
368				#Skip this one since it doesn't have a parentB or an F1 ancestor
369				next;
370				}
371				}
372				return \%parented;
373				}
374
375				sub _output_summary_and_abh_files {
376				my $self = shift;
377
378				#Column headers for Summary output file
379				$self->_summary_fh->print($self->_summary_header());
380
381				my @all_sample_ids = $self->_ids_from_data_file;
382
383				my $nam_regex = qr{ NAM [ ]* F1}xms;
384				my @nam_f1_ids =
385				grep { $self->_in_group( $nam_regex, $_ ) } @all_sample_ids;
386
387				my %parented = %{ $self->_parented_href() };
388
389				#Columns headers for ABH output file
390				for my $item (qw{ ids parentA_names parentB_names groups names}) {
391				my $abh_header_row = "\t" . join "\t", @{ $parented{$item} };
392				$self->_abh_fh->print($abh_header_row . "\n");
393				}
394
395				#Read data from file and analyze it one line at a time
396				# (if already done, this will be skipped due to filehandle
397				# already being at the end of the file)
398				while ( my $line = $self->_get_data_line() ) {
399
400				#Remove newline character
401				$line = _sans_newlines($line);
402
403				#Extract SNP name and the data from this line
404				my ( $snp, @genotypes ) = split /\t/, $line;
405
406				#Remove whitespace at beginning and end of SNP name
407				$snp = _trim($snp);
408
409				#Skip blank lines
410				next if $snp eq $EMPTY_STRING;
411
412				#Print SNP name in first column of current line of ABH output file
413				$self->_abh_fh->print($snp);
414
415				#Tally occurrences of eacy genotype
416				my @summarized_genotypes = _summarize_genotypes(@genotypes);
417
418				#Associated data ids with genotypes
419				my %genotype_for = zip( @all_sample_ids, @genotypes );
420
421				#Initialize counters with zeros. Thus zeros will show up in reports.
422				my $inconsistent_count = 0;
423				my %num_of = (
424				parentA => {
425				$POLYM => 0,
426				$MISSING => 0,
427				},
428				parentB => {
429				$POLYM => 0,
430				$MISSING => 0,
431				},
432				F1 => { $MISSING => 0, },
433				);
434
435				#----------------------------------------------------------------START
436				#Summarize this one line of data for all of the NAM F1's
437				#---------------------------------------------------------------------
438				NAM_F1_LOOP:
439				for my $sample_id (@nam_f1_ids) {
440
441				my $sample = $self->_get__sample_for($sample_id);
442				my $sample_gt = $genotype_for{$sample_id};
443
444				#For our current purposes, we will not evaluate an F1 unless there is data for both parents.
445				my $both_parents_present =
446				defined $sample->{parentA} && defined $sample->{parentB};
447
448				next NAM_F1_LOOP if (! $both_parents_present);
449
450				my @genotype_chars;
451				my $parents_missing_data_for_this_sample_count;
452				my %gt_chars_of;
453				for my $parent (qw{ parentA parentB}) {
454				my @parent_columns =
455				$self->_get_columns_for( $sample->{$parent} );
456				my @parent_genotypes = @genotypes[@parent_columns];
457
458				#_nonredundant_chars
459				my @parent_gt_chars = @{ $gt_chars_of{$parent} } =
460				_nonredundant_chars(@parent_genotypes);
461				push @genotype_chars, @parent_gt_chars;
462
463				my $num_parent_char = @parent_gt_chars;
464				if ( $num_parent_char > 1 ) { $num_of{$parent}{$POLYM}++; }
465				elsif ( $num_parent_char == 1 ) {
466				if ( $parent_gt_chars[0] eq $MISSING ) {
467				$num_of{$parent}{$MISSING}++;
468				$parents_missing_data_for_this_sample_count++;
469				}
470				}
471				elsif ( $_ < 1 ) {
472				confess
473				'_nonredundant_chars should return at least one char';
474				}
475				}
476
477				#Keep track of how many F1s are missing data
478				if ( $sample_gt eq $MISSING x 2 ) {
479				$num_of{F1}{$MISSING}++;
480				}
481				elsif (
482				!_can_be_F1_of(
483				$sample_gt, $gt_chars_of{parentA},
484				$gt_chars_of{parentB}
485				)
486				)
487				{
488				$inconsistent_count++;
489				}
490				}
491				my $sum_of_odds_and_ends = $inconsistent_count
492				+ $num_of{parentA}{$POLYM}
493				+ $num_of{parentB}{$POLYM}
494				+ $num_of{parentA}{$MISSING}
495				+ $num_of{parentB}{$MISSING};
496
497				my $summary_line = join "\t", $snp, @summarized_genotypes,
498				$inconsistent_count, $num_of{parentA}{$POLYM},
499				$num_of{parentB}{$POLYM}, $num_of{parentA}{$MISSING},
500				$num_of{parentB}{$MISSING}, $sum_of_odds_and_ends,
501				$num_of{F1}{$MISSING};
502
503				$self->_summary_fh->print($summary_line . "\n");
504				#---------------------------------------------------------------------
505				#Summarize this one line of data for all of the NAM F1's
506				#------------------------------------------------------------------END
507
508
509				#Output inheritance codes
510				#First step: calculate for those with two parents
511				#Second step: calculate for those with parentA and one F1 ancestor
512				my $abh_line = $EMPTY_STRING;
513
514				#----------------------------------------------------------------START
515				#Calculate inheritance codes for everything with two parents
516				#---------------------------------------------------------------------
517				for my $sample_id ( @{ $parented{ids} } ) {
518
519				my $sample = $self->_get__sample_for($sample_id);
520				my $sample_gt = $genotype_for{$sample_id};
521				my $parentA_id = $sample->{parentA};
522				my @parentA_columns = $self->_get_columns_for($parentA_id);
523				my @parentA_genotypes = @genotypes[@parentA_columns];
524
525				my $sample_abh;
526				if ( $sample->{parentB} ) {
527				my $parentB_id = $sample->{parentB};
528				my @parentB_columns = $self->_get_columns_for($parentB_id);
529				my @parentB_genotypes = @genotypes[@parentB_columns];
530
531				$sample_abh = _calculate_abh( $sample_gt, \@parentA_genotypes,
532				\@parentB_genotypes );
533				}elsif($sample->{F1_ancestor}){
534				my $F1_ancestor_id = $sample->{F1_ancestor};
535				my @F1_columns = $self->_get_columns_for($F1_ancestor_id);
536				my @F1_genotypes = @genotypes[@F1_columns];
537				$sample_abh = _calculate_abh_from_F1( $sample_gt, \@parentA_genotypes,
538				\@F1_genotypes);
539				}
540				$abh_line .= "\t" . $sample_abh;
541				}
542
543				#print abh_line to file and add newline
544				$self->_abh_fh->print($abh_line ."\n");
545				#------------------------------------------------------------------END
546
547
548				}
549				$self->_summary_fh->flush();
550				$self->_summary_fh->close();
551				$self->_abh_fh->flush();
552				$self->_abh_fh->close();
553
554				return;
555				}
556
557
558				has '_abh_fh' => (
559				is => 'ro',
560				isa => 'IO::File',
561				lazy => 1,
562				required => 1,
563				builder => '_build__abh_fh',
564				);
565
566				sub _build__abh_fh {
567				my $self = shift;
568				my $filename = $self->data_filename() . '_abh.tab';
569				my $fh = IO::File->new( $filename, '>' ) or die $@;
570				return $fh;
571				}
572
573				has '_summary_fh' => (
574				is => 'ro',
575				isa => 'IO::File',
576				lazy => 1,
577				required => 1,
578				builder => '_build__summary_fh',
579				);
580
581				sub _build__summary_fh {
582				my $self = shift;
583				my $filename = $self->data_filename() . '_summary.tab';
584				my $fh = IO::File->new( $filename, '>' ) or die $@;
585				return $fh;
586				}
587
588				sub _sorted_characters{
589				my $string = shift;
590				$string =~ s/ \s //xms;
591				#mea culpa
592				return join $EMPTY_STRING, sort split $EMPTY_STRING, $string;
593
594				}
595
596				sub _sort_and_join{
597				my @chars = @_;
598				return join $EMPTY_STRING, sort @chars;
599				}
600
601				sub _can_be_F1_of{
602				my $sample_gt = shift;
603				my $parentA_aref = shift; # 'aref' means array reference
604				my $parentB_aref = shift;
605				my @parentA_chars = _nonredundant_chars( @{$parentA_aref} );
606				my @parentB_chars = _nonredundant_chars( @{$parentB_aref} );
607
608				#If parentA or parentB only have missing data, then they could be
609				# anything
610				if (all { $_ eq $MISSING } @parentA_chars){
611				@parentA_chars = qw{ A C G T - };
612				}
613				if (all { $_ eq $MISSING } @parentB_chars){
614				@parentB_chars = qw{ A C G T - };
615				}
616
617				my $sorted_sample_gt = _sorted_characters($sample_gt);
618				return 1 if $sorted_sample_gt eq $MISSING x 2;
619
620				for my $parentA_char (@parentA_chars){
621				for my $parentB_char (@parentB_chars){
622				my $expected_gt = _sort_and_join($parentA_char, $parentB_char);
623				return 1 if($expected_gt eq $sorted_sample_gt);
624				}
625				}
626
627				#If nothing matched, return false
628				return 0;
629				}
630
631
632				sub _calculate_abh_from_F1 {
633				my $sample_gt = shift;
634				my $parentA_aref = shift;
635				my $F1_aref = shift;
636
637				_calculate_abh($sample_gt, $parentA_aref, $F1_aref, 'F1');
638				}
639
640				sub _remove_matching_char{
641				my $char = shift;
642				my @array = @_;
643				my @final_array;
644				for(@array){
645				next if $_ eq $char;
646				push @final_array, $_;
647				}
648				return @final_array;
649				}
650
651				sub _equivalent_arrays{
652				my $first_aref = shift;
653				my $second_aref = shift;
654				my @first = @{ $first_aref };
655				my @second = @{ $second_aref };
656
657				my $same_size = scalar @first == scalar @second;
658				return if ! $same_size;
659
660				for (0 .. $#first ){
661				return if $first[$_] ne $second[$_] ;
662				}
663				return 1;
664				}
665
666				sub _calculate_abh {
667				my $sample_gt = shift; # 'gt' means genotype
668				my $parentA_aref = shift; # 'aref' means array reference
669				my $parentB_aref = shift;
670				my $parentB_is_F1 = shift;
671				my @parentA_chars = _nonredundant_chars( @{$parentA_aref} );
672				my @parentB_chars = _nonredundant_chars( @{$parentB_aref} );
673
674				#If genotype has one or more $MISSING alleles, then return $MISSING
675				return $MISSING if ( $sample_gt =~ m{ $MISSING }xms );
676
677				#Find out if the parents have exactly the same genotype
678				my $parents_same;
679				if ( _equivalent_arrays(\@parentA_chars, \@parentB_chars )) {
680				$parents_same = 1;
681				}
682
683				#Return 'missing' if one of the parents have only missing data
684				#Return error code if sample has alleles not found in parents
685				#Must check in this order, otherwise having no parental info will be
686				# misdiagnosed as having a nonparental allele
687				if ( @parentA_chars == 1 && $parentA_chars[0] eq $MISSING ) {
688				return $MISSING_PARENT;
689				}
690				elsif ( @parentB_chars == 1 && $parentB_chars[0] eq $MISSING ) {
691				return $MISSING_PARENT;
692				}
693				else {
694
695				#combine parental character sets
696				my @both_parent_chars =
697				_nonredundant_chars( @parentA_chars, @parentB_chars );
698
699				#Check if the sample genotype can have descended from parents
700				if ( !_is_comprised_from( $sample_gt, @both_parent_chars ) ) {
701				if ($parents_same) {
702				return $NONPOLYM_CONFLICT;
703				}
704				else {
705				return $NONPARENTAL;
706				}
707				}
708				else {
709
710				#Good! No errors and no missing data.
711				}
712				}
713
714				if ( !$parentB_is_F1 ) {
715
716				#If genotype only contains alleles from parents, then assign A, B, or H
717				my $allele_A_count = 0;
718				my $allele_B_count = 0;
719
720				#Check if both parents are nonpolymorphic
721				if ( @parentA_chars == 1 && @parentB_chars == 1 ) {
722				my $parentA_char = $parentA_chars[0];
723				my $parentB_char = $parentB_chars[0];
724
725				#Indicate noninformative if parents are the same
726				return $NONPOLYM if ($parents_same);
727
728				#Extract characters from sample genotype
729				my @sample_chars = _chars_from($sample_gt);
730
731				#Tally how many are from which parent
732				for my $sample_char (@sample_chars) {
733				if($sample_char eq $parentA_char){
734				$allele_A_count++;
735				}
736				elsif ($sample_char eq $parentB_char) { $allele_B_count++;}
737				}
738
739				if ( $allele_A_count == 2 ) {
740				return $ALLELE_A;
741				}
742				elsif ( $allele_B_count == 2 ) {
743				return $ALLELE_B;
744				}
745				elsif ( $allele_A_count == 1 && $allele_B_count == 1 ) {
746				return $HETEROZYGOUS;
747				}
748				else {
749
750				#Since we already checked for values that were missing or
751				# nonparental, this should be unreachable
752				my $dying_message = "error calculating 'ABH' from:\n"
753				. join "'\n",
754				q{parentA => '} . join( "' '", @{$parentA_aref} ),
755				q{parentB => '} . join( "' '", @{$parentB_aref} ),
756				q{sample genotype =>'} . $sample_gt;
757
758				confess $dying_message;
759				}
760				}
761				else {
762
763				#indicate that it is noninformative because of a polymorphic parent
764				return $POLYM;
765				}
766				}
767				else {
768				for my $F1_gt ( @{$parentB_aref} ) {
769				return $NONPARENTAL_F1
770				if !_can_be_F1_of( $F1_gt, $parentA_aref, [$MISSING] );
771				}
772
773				return $POLYM if @parentA_chars > 1;
774
775				if (@parentB_chars == 2) {
776				my $parentA_char = $parentA_chars[0];
777				my @revised_parentB_chars =
778				_remove_matching_char( $parentA_char, @parentB_chars );
779				return _calculate_abh( $sample_gt, $parentA_aref,
780				\@revised_parentB_chars );
781				}
782				elsif (@parentB_chars == 1) {
783				if ( _equivalent_arrays(\@parentB_chars, \@parentA_chars )) {
784				return _calculate_abh( $sample_gt, $parentA_aref,
785				$parentB_aref );
786				}
787				else {
788				### $sample_gt
789				### @parentA_chars
790				### @parentB_chars
791				confess "I didn't think this was reachable";
792				}
793				}
794				}
795				}
796
797				sub _summary_header{
798				return "\t" . join("\t", qw{ AA AC AG AT CC CG CT GG GT TT --
799				inconsistent parentA_polymorphic
800				parentB_polymorphic parentA_unknown
801				parentB_unknown sum_of_odds_and_ends
802				F1_missing }) . "\n";
803				}
804
805				sub _summarize_genotypes {
806				my @data = @_;
807				my @genotypes = qw{ AA AC AG AT CC CG CT GG GT TT -- };
808				my %count;
809
810				#Initialize hash %count, so that zeros show up in the summary
811				for my $genotype (@genotypes) {
812				$count{$genotype} = 0;
813				}
814
815				#Count occurrences of each genotype
816				for my $datum (@data){
817
818				#Remove any leading or trailing whitespace
819				$datum = _trim($datum);
820
821				#Get just the first two characters of each data string
822				my $sorted_char_datum = _sorted_first_two_char($datum);
823
824				#Add one to the count for this genotype
825				$count{$sorted_char_datum}++;
826				}
827
828				#Return an array of just the specified genotypes
829				# (this is a 'hash slice')
830				return @count{ @genotypes };
831				}
832				#----------------------------------------------------------------------------#
833				# SUMMARIZE DATA (END)#
834				#****************************************************************************#
835
836				#****************************************************************************#
837				# GENERAL UTILITIES #
838				#----------------------------------------------------------------------------#
839
840				sub _nonredundant_chars {
841				my @strings = @_;
842				confess 'arguments required' if( @strings < 1);
843				for( 0 .. $#strings){
844				$strings[$_] = _sans_newlines($strings[$_]);
845				}
846
847				#Get just the unique characters out of the strings
848				my $combined = join $EMPTY_STRING, @strings;
849				my @single_chars = uniq _chars_from($combined);
850
851				#Remove 'missing data' and empty characters
852				my @filtered_chars =
853				grep { $_ ne $MISSING && $_ ne $EMPTY_STRING } @single_chars;
854
855				#Return $MISSING if there aren't any left
856				return $MISSING if(@filtered_chars == 0);
857
858				#Check that the characters are all DNA bases
859				my $DNA_BASES = qr{ [ACGT] }xms;
860				for my $filtered_char(@filtered_chars){
861
862				#Throw exception if character is not a valid DNA base
863				if($filtered_char !~ m{\A $DNA_BASES \z}xms){
864				confess "Unknown character: '$filtered_char'";
865				}
866				}
867
868				#Return nonredundant characters, now that they've all been checked
869				return @filtered_chars;
870				}
871
872				# Are all of the characters in a string a subset of the characters found in
873				# this array of strings?
874				sub _is_comprised_from{
875				#Get the string to be analyzed, followed by strings with acceptable chars
876				my ($string, @acceptable_strings) = @_;
877
878				#Put all of the characters from all of these strings into one array
879				my @acceptable_chars;
880				for my $acceptable_string( @acceptable_strings){
881				push @acceptable_chars, uniq _chars_from($acceptable_string);
882				}
883
884				confess 'Empty string found'
885				if _is_an_element_of($EMPTY_STRING, @acceptable_chars);
886
887				#Get all of the characters out of the string to be analyzed
888				my @chars = uniq _chars_from($string);
889
890				#If any character is not acceptable, return false
891				for my $char (@chars){
892				if(! _is_an_element_of($char, @acceptable_chars)){
893				return 0;
894				}
895				}
896
897				#Since no unacceptable character was found, return true
898				return 1;
899				}
900
901				sub _is_an_element_of{
902
903				my $query = shift;
904				my @elements = @_;
905				for my $element( @elements){
906				return 1 if $query eq $element;
907				}
908
909				return 0;
910				}
911
912				sub _chars_from{
913				my @strings = shift;
914
915				my @string_chars;
916				for my $string (@strings){
917				push @string_chars, split $EMPTY_STRING, $string;
918				}
919
920				#Return the individual characters from the string as an array
921				return @string_chars;
922				}
923
924				sub _sorted_first_two_char {
925				my $string = shift;
926
927				#Get first two characters from the string
928				my $first_two_char = substr $string, 0, 2;
929
930				#Sort these two characters alphabetically
931				my @sorted_char = sort { $a cmp $b } split $EMPTY_STRING, $first_two_char;
932
933				#Concatenate and return the characters
934				return join $EMPTY_STRING, @sorted_char;
935				}
936
937				sub _sans_newlines{
938				my $string = shift;
939				chomp $string;
940
941				$string =~ s{ \r }{}xms;
942
943				my $string_sans_newlines = $string;
944
945				return $string_sans_newlines;
946				}
947
948
949				__PACKAGE__->meta->make_immutable;
950
951				1; # End of Bio::SNP::Inherit
952
953				__END__
954				=head1 NAME
955
956				=head1 SYNOPSIS
957
958				=head1 VERSION
959
960				version 0.002
961
962				my $foo = Bio::SNP::Inherit->new(
963				manifest_filename => 'manifest.tab',
964				data_filename => 'data.tab'
965				);
966
967				#Upon object construction, this outputs a summary file
968				# 'data.tab_summary.tab' and a detailed file 'data.tab_abh.tab'
969				# containing parental allele designations for each sample that has
970				# parents defined for it in the manifest file
971
972				=head1 DESCRIPTION
973
974				This is a module for converting Single Nucleotide Polymorphism (SNP) genotype
975				data to parental allele designations. This helps with creating files suitable
976				for mapping, identifying and characterizing crossovers, and also helps with
977				quality control.
978
979				=head1 SUBROUTINES/METHODS
980
981				=head2 BUILD
982
983				Since the integrity of the data in the manifest file is absolutely vital,
984				building an object fails if there are duplicate sample ids in the
985				manifest file.
986
987				=head1 ATTRIBUTES
988
989				=head2 manifest_filename
990
991				Name of the file containing information for each sample id
992
993				Required in the constructor
994
995				The first line contains headers and the remaining lines contain
996				tab-delimited fields in the following order:
997
998				sample id or "Institute Sample Label" (e.g. "WG0096796-DNAA05" )
999				sample name or "Sample name" (e.g. "B73xB97" )
1000				group name or "Group" (e.g. "NAM F1" )
1001				parentA or "Mother" (e.g. "WG0096795-DNAA01" )
1002				parentB or "Father" (e.g. "WG0096796-DNAF01" )
1003				replicate of or "Replicate(s)" (id of sample that this replicates
1004				e.g. "WG0096796-DNAA05" )
1005				AxB F1 or "F1 of parentA and parentB" (e.g. "WG0096795-DNAA02" )
1006
1007				The last four fields can be blank, if they are not applicable. However,
1008				being blank when they are applicable will result in failure of the
1009				program to analyze the data properly
1010
1011				=head2 data_filename
1012
1013				Name of the tab-delimited file containing the data to be processed.
1014
1015				Required in the constructor.
1016
1017				The text '[Data]' in a line indicates that remaining lines are all data.
1018				The next line contains column headers, which are in fact the sample ids.
1019				Sample ids missing from the manifest file will not be processed.
1020				The next line contains the name of the SNP in the first field and data in
1021				the remaining fields.
1022
1023				Data must be in the format of SNP_name{tab}AA{tab}GG{tab}.
1024
1025				=head1 OUTPUT FILES
1026
1027				Upon object construction, two files are produced: one that summarizes the
1028				input and another that that describes the genotypes of samples in terms of
1029				their "parents". For example, a sample with a genotype of "CG" whose
1030				'parentA' has a genotype of "CC" and whose 'parentB' has a genotype of
1031				"GG" would have a heterozygous genotype, labeled as 'H'.
1032
1033				Here are the possible allele designations that result:
1034
1035				Allele designations for informative genotypes:
1036				A = parentA genotype
1037				B = parentB genotype
1038				H = heterozygous genotype
1039
1040				Allele designations for noninformative genotypes:
1041				~ = nonpolymorphic parents (i.e. both parents have same genotype)
1042				- = missing data
1043				-- = missing data for at least one parental
1044				% = polymorphic parent
1045
1046				Error codes:
1047				# = conflict of nonpolymorphic expectation, meaning both parents
1048				have the same genotype, but the sample has a different
1049				genotype. For example, parentA and parentB both have the
1050				genotype 'CC', but the sample has a genotype of 'TT'.
1051
1052				! = nonparental genotype, meaning each parent has a different
1053				genotype, but the sample has at least one allele not seen
1054				in either parent. For example, getting 'AG' for the
1055				offspring when the parents have 'GG' and 'TT'.
1056				(This should not even be seen when the data was obtained
1057				from a biallelic assay.)
1058
1059				!! = genotype of the F1 for parentA x parentB is incongruent with
1060				the genotype for parentA
1061
1062				See the bundled tests for examples.
1063
1064				=head1 TODO
1065
1066				Output report detailing which samples have been processed and in what way.
1067				Also give descendents and ancestor relationships.
1068
1069				Document ability to process files using F1 and parentA info (i.e. in the
1070				absence of parentB info).
1071
1072				Add simple means of adding map info so that distances and chromosomes are
1073				output along with the marker names.
1074
1075				Give crossover info?
1076
1077				Give introgressions/regions attributable to specific ancestor(s).
1078
1079				Use benchmarking to find out which (if any) to memoize:
1080				_nonredundant_chars
1081				_trim
1082				_is_comprised_from
1083				_sorted_characters
1084				_sort_and_join
1085				_chars_from
1086				_sorted_first_two_char
1087
1088				Test bad file names
1089
1090				=head1 DIAGNOSTICS
1091
1092				TODO
1093
1094				=head1 CONFIGURATION AND ENVIRONMENT
1095
1096				TODO
1097
1098				=head1 DEPENDENCIES
1099
1100				TODO
1101
1102				=head1 INCOMPATIBILITIES
1103
1104				TODO
1105
1106				=head1 BUGS
1107
1108				Please report any you find. None have been reported as of the current release.
1109
1110				=head1 LIMITATIONS
1111
1112				This is ALPHA code. Use at your own risk. There are some major changes
1113				that I want to do to it.
1114
1115				Be consciencious with the preparation of your input files (i.e. manifest file
1116				and data file). Correct results depend on correct input files.
1117
1118				=head1 AUTHOR
1119
1120				Christopher Bottoms, C<< <molecules at cpan.org> >>
1121
1122				=head1 SUPPORT
1123
1124				You can find documentation for this module with the perldoc command.
1125
1126				perldoc Bio::SNP::Inherit
1127
1128				=head1 ACKNOWLEDGEMENTS
1129
1130				=head1 LICENSE AND COPYRIGHT
1131
1132				This program is free software; you can redistribute it and/or modify it
1133				under the terms of either: the GNU General Public License as published
1134				by the Free Software Foundation; or the Artistic License.
1135
1136				See http://dev.perl.org/licenses/ for more information.
1137
1138				Copyright 2010 Christopher Bottoms.
1139
1140				=cut