File Coverage

blib/lib/Shatterproof.pm
Criterion Covered Total %
statement 797 1818 43.8
branch 174 490 35.5
condition 42 180 23.3
subroutine 34 51 66.6
pod 0 27 0.0
total 1047 2566 40.8


line stmt bran cond sub pod time code
1             #!/usr/local/bin/perl
2             #The ShatterProof package is copyright (c) 2013 Ontario Institute for Cancer Research (OICR).
3             #
4             #This package and its accompanying libraries is free software; you can redistribute it and/or modify it under the terms of the GPL (either version 1, or at your option, any later version) or the Artistic License 2.0. Refer to LICENSE for the full license text.
5              
6             #OICR makes no representations whatsoever as to the SOFTWARE contained herein. It is experimental in nature and is provided WITHOUT WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE OR ANY OTHER WARRANTY, EXPRESS OR IMPLIED. CSHL MAKES NO REPRESENTATION OR WARRANTY THAT THE USE OF THIS SOFTWARE WILL NOT INFRINGE ANY PATENT OR OTHER PROPRIETARY RIGHT.
7              
8             #By downloading this SOFTWARE, your Institution hereby indemnifies OICR against any loss, claim, damage or liability, of whatsoever kind or
9             #nature, which may arise from your Institution's respective use, handling or storage of the SOFTWARE.
10              
11             #If publications result from research using this SOFTWARE, we ask that the Ontario Institute for Cancer Research be acknowledged and/or credit be given to OICR scientists, as scientifically appropriate.
12              
13             ### Shatterproof.pm ###############################################################################
14             # ShatterProof is a tool that can be used to analyze next generation sequencing data for signs
15             # of chromothripsis.
16             # See POD at end of file for more description
17             #
18              
19             ### INCLUDES ######################################################################################
20 9     9   384330 use strict;
  9         21  
  9         317  
21 9     9   43 use warnings;
  9         12  
  9         260  
22 9     9   37 use Carp;
  9         18  
  9         801  
23              
24 9     9   38 use vars qw($VERSIONS);
  9         12  
  9         420  
25              
26 9     9   40 use feature 'switch';
  9         11  
  9         873  
27             #use Switch;
28 9     9   38 use File::Basename;
  9         14  
  9         532  
29 9     9   41 use List::Util qw[min max];
  9         12  
  9         759  
30 9     9   5065 use Statistics::Distributions;
  9         29320  
  9         620  
31 9     9   6056 use POSIX;
  9         62493  
  9         61  
32             ###################################################################################################
33              
34             ### HISTORY #######################################################################################
35             # Version Date Coder Comments
36             # 0.001 2012/03/19 sgovind Versioning start point
37             # 0.002 2012/04/03 sgovind moved input validation methods and run methods
38             # from shatterproof.pl to here
39             # 0.003 2012/10/08 sgovind Updated POD
40             # 0.04 2012/11/25 sgovind Stable build before changing translocation scoring equation
41             # 0.05 2012/12/26 sgovind See change log for details
42             # 0.06 2012/12/27 sgovind Added additional documentation for new config variable
43             # 0.07 2012/12/27 sgovind Added example guide for provided sample data
44             # 0.08 2013/05/22 sgovind Added EXPORT code for test case, added minor error checking
45             # 0.09 2013/06/10 sgovind Minor changes to accomodate testing
46             # 0.10 2013/06/19 sgovind Minor changes to accomodate testing
47             # 0.11 2013/06/24 sgovind Changed all sorts to stable sorts. Reduced number of posix
48             # calculations.
49             # 0.12 2013/06/28 sgovind Changed sort order of interchromosomal translocation output
50             # 0.13 2013/0716 sgovind Corrected logical error in calculate_loh_score
51              
52             our $VERSION = '0.14';
53              
54             package Shatterproof;
55 9     9   24114 use Exporter;
  9         62  
  9         78034  
56             our @ISA = 'Exporter';
57             our @EXPORT = qw(
58             $bin_size
59             $genome_cnv_data_hash_ref
60             $chromosome_copy_number_count_hash_ref
61             $chromosome_cnv_breakpoints_hash_ref
62             $tp53_mutation_found
63             $genome_trans_data_hash_ref
64             $chromosome_translocation_count_hash_ref
65             $genome_trans_breakpoints_hash_ref
66             $genome_mutation_density_hash_ref
67             $suspect_regions_array_ref
68             $likely_regions_array_ref
69             $genome_cnv_data_windows_hash_ref
70             $genome_trans_data_windows_hash_ref
71             $genome_mutation_data_windows_hash_ref
72             $localization_window_size
73             );
74              
75             ### Global Variables ##############################################################################
76             my $pos = 0; #used to parse command line variables
77             my $ARGC; #stores the number of command line arguments provided
78              
79             my %chromosome_length = ( #stores the sequence length of each chromosome
80             X => 154913754,
81             Y => 57741652,
82             1 => 247199719,
83             2 => 242751149,
84             3 => 199446827,
85             4 => 191263063,
86             5 => 180837866,
87             6 => 170896993,
88             7 => 158821424,
89             8 => 146274826,
90             9 => 140442298,
91             10 => 135374737,
92             11 => 134452384,
93             12 => 132289534,
94             13 => 114127980,
95             14 => 106360585,
96             15 => 100338915,
97             16 => 88822254,
98             17 => 78654742,
99             18 => 76117153,
100             19 => 63806651,
101             20 => 62435965,
102             21 => 46944323,
103             22 => 49528953
104             );
105              
106             my $TP53_start = 1000000*7.57; #start base pair of the TP53 gene
107             my $TP53_end = 1000000*7.59; #end base pair of the TP53 gene
108              
109             my $insertion_data_present = 0;
110             my $LOH_data_present = 0;
111              
112              
113             #The values for the following 13 variables are defined in the config.pl file
114             our $bin_size; #number of bases pairs that will be compressed into 1 region when analyzing the genome
115             #this value defines how many base pairs are included in one array element in the data_hash_ref varaibles
116              
117             our $localization_window_size; #number of regions to sum together when performing sliding window analysis of the genome
118              
119             our $expected_mutation_density; #the expected mutation density of translocations in a highly mutated region
120             #used to calculate spread factor of translocations
121             our $low_mutation_density_threshold; #the mutation density that will be used to call likely regions
122              
123             our $collapse_regions; #flag variable
124             #value 1: merge overlapping CNV regions that have the same copy number
125             #value 0: do not merge overlapping CNV regions that have the same copy number. If such
126             # regions are found an error is thrown
127              
128             our $outlier_deviation; #the number of standard deviations away from the mean a value has to be in order to be considered non-significant
129              
130             our $translocation_cut_off_count; #the max number of translocation chromosomes that will be tolerated before translocation score is set to 0
131              
132             our $chromosome_localization_weight; #the scoring formula weight given to the localization of mutations to a specific region on the chromosome
133             our $genome_localization_weight; #the scoring formula weight given to the localization of mutations to a specific chromosome
134             our $cnv_weight; #the scoring formula weight given to the aberrant CNV hallmark
135             our $translocation_weight; #the scoring formula weight given to the localization of translocations
136             our $insertion_breakpoint_weight; #the scoring formula weight given to the number of insertions found at translocation breakpoints
137             our $loh_weight; #the scoring formula weight given to the amount of heterozygosity that is retained in a mutated region
138             our $tp53_mutated_weight; #the scoring formula weight given to the presents or absence of a TP53 mutation
139              
140             ### SUB METHODS ###################################################################################
141              
142             #=head2 Sub-Method: run
143              
144             ### run ###########################################################################################
145             # Description:
146             # Main method called by shatterproof.pl
147             # Calls primary sub methods
148             #
149             # Input variables:
150             # $argv_ref: reference to @ARGV
151              
152             #=cut
153             sub run {
154              
155 0     0 0 0 my $argv_ref = shift; #parse parameters
156 0         0 my @argv = @{$argv_ref}; #dereference array reference
  0         0  
157              
158 0         0 my $cnv_directory; #stores the path to the directory where the CNV input files are found
159             my $trans_directory; #stores the path to the directory where the translocation input files are found
160 0         0 my $insertion_directory; #stores the path to the directory where the insertion input files are found
161 0         0 my $loh_directory; #stores the path to the directory where the loss of hetrozygosity input files are found
162 0         0 my $output_directory; #stores the path to the directory where output files will be placed
163              
164 0         0 my $config_file_path; #stores the path to the configuration file
165              
166 0         0 my $tp53_mutated = 0; #flag variable to indicate if the TP53 gene should be considered to be mutated
167 0         0 my $tp53_mutation_found = 0; #flag variable to indicate if a mutation was found in the TP53 region. This does not affect scoring
168              
169 0         0 my @cnv_files; #list of CNV input files
170             my @trans_files; #list of translocation input files
171 0         0 my @insertion_files; #list of insertion input files
172 0         0 my @loh_files; #list of LOH input files
173              
174            
175 0         0 my $chromosome_copy_number_count_hash_ref; #hash
176             #key1: chromosome eg. 1,2,X,Y
177             #key2: copy-number state eg. 0,1,3,20
178             #value: number of regions with copy number key2
179              
180 0         0 my $chromosome_cnv_breakpoints_hash_ref; #hash
181             #key: chromosome eg. 1,2,X,Y
182             #value: an array storing the start and end points of all cnv regions on key
183              
184 0         0 my $chromosome_translocation_count_hash_ref; #hash
185             #key1: chromosome eg. 1,2,X,Y
186             #key2: chromosome eg. 1,2,X,Y
187             #value: number of translocations between key1 and key2
188              
189 0         0 my $chromosome_insertion_count_hash_ref; #hash
190             #key: chromosome eg. 1,2,X,Y
191             #value: number of insertions on key
192              
193 0         0 my $chromosome_loh_breakpoints_hash_ref; #hash
194             #key: chromosome eg. 1,2,X,Y
195             #value: an array storing the start and end points of all loh regions on key
196              
197              
198 0         0 my $genome_trans_breakpoints_hash_ref; #hash
199             #key: chromosome eg. 1,2,X,Y
200             #value: an array storing all the translocation breakpoints on key
201            
202 0         0 my $genome_trans_insertion_breakpoints_hash_ref; #hash
203             #key: chromosome eg. 1,2,X,Y
204             #value: an array storing only the translocation breakpoints on key that have a insertion with 10 base pairs
205              
206 0         0 my $genome_mutation_density_hash_ref; #hash
207             #key: chromosome eg. 1,2,X,Y
208             #value: the total number of mutation on key divided by the sequence length of key
209              
210              
211 0         0 my $genome_cnv_data_hash_ref = initialize_genome_hash(); #hash {key1}[index]{key2}
212             #key1: chromosome eg. 1,2,X,Y
213             #value: an array storing references to hashes which contain information about the
214             # CNVs in each region of key1. The index of the array indicated the region
215             #key2: 'BPcount' -> gives the number of CNV breakpoints in the region.
216             # a number, eg. '1' -> gives the number of subregions within the region that
217             # have a copy number of 1
218            
219 0         0 my $genome_trans_data_hash_ref = initialize_genome_hash(); #hash {key1}[index]{key2}{key3}
220             #key1: chromosome eg. 1,2,X,Y
221             #value1:an array storing references to hashes which contain information about the
222             # translocations in each region of key1. The index of the array indicated the region.
223             #key2: 'BPcount' -> gives the number of translocation breakpoints in the region
224             # 'in' -> gives a reference to a hash that contains information about translocations
225             # into the region
226             # 'out' -> gives a reference to a hash that contains information about translocations
227             # out of the region
228             #key3: chromosome eg. 1,2,X,Y
229             #value2:the number of subregions in the region that were translocated to key1 from key3 if key2 = 'in'
230             # or
231             # the number of subregions in the region that were translocated from key1 to key3 if key2 = 'out'
232            
233 0         0 my $genome_insertion_data_hash_ref = initialize_genome_hash(); #hash {key1}[index]
234             #key1: chromosome eg. 1,2,X,Y
235             #value: an array storing the number of insertions in each region of key1
236             # The index of the array indicated the region
237              
238              
239 0         0 my $genome_cnv_data_windows_hash_ref; #hash {key1}[index]
240             #key1: chromosome eg. 1,2,X,Y
241             #value: an array storing the number of CNV breakpoints in each window of the genome.
242             # Each window begins at the region indicated by the array index
243            
244             my $genome_trans_data_windows_hash_ref; #hash {key1}[index]
245             #key1: chromosome eg. 1,2,X,Y
246             #value: an array storing the number of translocation breakpoints in each window of the genome.
247             # Each window begins at the region indicated by the array index
248            
249              
250 0         0 my $genome_mutation_data_windows_hash_ref; #hash {key1}[index]
251             #key1: chromosome eg. 1,2,X,Y
252             #value: an array storing the total number of mutation breakpoints in each window of the genome.
253             # Each window begins at the region indicated by the array index
254              
255 0         0 my $suspect_regions_array_ref; #reference to array that stores regions where chromothripsis most likely occured. Format: chr start end
256 0         0 my $likely_regions_array_ref; #reference to array that stores regions where chromothripsis may have occured. Format: chr start end
257              
258              
259             #Validate input arguements and parse them to the correct variables
260 0         0 validate_input(\@argv, \$cnv_directory, \$trans_directory, \$insertion_directory, \$loh_directory, \$tp53_mutated, \$output_directory, \$config_file_path);
261              
262             #Load the values from the config file
263 0 0       0 if(load_config_file($config_file_path)!=1){
264 0         0 die("ERROR - could not load config file\n");
265             }
266              
267 0         0 print "CNV dir:\t$cnv_directory\n";
268 0         0 print "Trans dir:\t$trans_directory\n";
269              
270 0 0       0 if(defined($insertion_directory)){
271 0         0 print "insertion dir:\t$insertion_directory\n";
272             }
273              
274 0 0       0 if(defined($loh_directory)){
275 0         0 print "LOH dir:\t$loh_directory\n";
276             }
277              
278 0         0 print "Output dir:\t$output_directory\n";
279              
280 0         0 print "Force TP53 Mutation:\t$tp53_mutated\n\n";
281              
282             #Get a list of files for each of the provided input directories
283 0         0 @cnv_files = glob ("$cnv_directory"."*.spc");
284 0         0 @trans_files = glob ("$trans_directory"."*.spt");
285              
286 0 0 0     0 if(scalar(@cnv_files)==0 || scalar(@trans_files)==0){
287 0         0 die "ERROR: no CNV or translocation input files found\n";
288             }
289              
290 0 0       0 if(defined($insertion_directory)){
291 0         0 @insertion_files = glob ("$insertion_directory"."*.vcf");
292             }
293              
294 0 0       0 if(defined($loh_directory)){
295 0         0 @loh_files = glob ("$loh_directory"."*.spl");
296             }
297              
298             #Echo a list of all the input files
299 0         0 $" = "\n\t\t";
300 0         0 print "CNV files:\t@cnv_files\n\n";
301 0         0 print "Trans files:\t@trans_files\n\n";
302 0         0 $" = "\n\t\t";
303 0 0       0 if(scalar(@insertion_files)==0){
304 0         0 print "Indel files:\t-none\n\n";
305             }
306             else{
307 0         0 print "Indel files:\t@insertion_files\n\n";
308             }
309 0         0 $" = "\n\t";
310 0 0       0 if(scalar(@loh_files)==0){
311 0         0 print "LOH files:\t-none\n\n";
312             }
313             else{
314 0         0 print "LOH files:\t@loh_files\n\n";
315             }
316 0         0 $" = " ";
317              
318             #Create the output directory if it does not exist
319 0 0       0 mkdir ("$output_directory",0770) unless (-d "$output_directory");
320              
321             #Check that the output directory exists
322 0 0       0 if(!(-e $output_directory)){
323 0         0 die "ERROR: could not create directory: $output_directory\n";
324             }
325              
326 0         0 print "\n--analyzing CNV data\n";
327 0         0 ($genome_cnv_data_hash_ref, $chromosome_copy_number_count_hash_ref, $chromosome_cnv_breakpoints_hash_ref) = analyze_cnv_data($output_directory, \@cnv_files, $bin_size, \$tp53_mutation_found);
328 0         0 print "---done analyzing CNV data\n\n";
329              
330 0         0 print "--analyzing translocation data\n";
331 0         0 ($genome_trans_data_hash_ref, $chromosome_translocation_count_hash_ref, $genome_trans_breakpoints_hash_ref) = analyze_trans_data($output_directory, \@trans_files, $bin_size, \$tp53_mutation_found);
332 0         0 print "---done analyzing translocation data\n\n";
333              
334             #If insertion data was provided then analyze it
335 0 0       0 if(defined($insertion_directory)){
336 0         0 print "--analyzing insertion data\n";
337 0         0 ($genome_insertion_data_hash_ref, $chromosome_insertion_count_hash_ref, $genome_trans_insertion_breakpoints_hash_ref) = analyze_insertion_data($output_directory, \@insertion_files, $bin_size, $genome_trans_breakpoints_hash_ref, \$tp53_mutation_found);
338 0         0 print "---done analyzing insertion data\n\n";
339             }
340              
341             #Delete intermediate storage
342 0         0 %$genome_trans_breakpoints_hash_ref = ();
343 0         0 undef $genome_trans_breakpoints_hash_ref;
344              
345             #If LOH data was provided then analyze it
346 0 0       0 if(defined($loh_directory)){
347 0         0 print "--analyzing LOH data\n";
348 0         0 ($chromosome_loh_breakpoints_hash_ref) = analyze_loh_data($output_directory, \@loh_files, \$tp53_mutation_found);
349 0         0 print "---done analyzing LOH data\n\n";
350              
351             #Check that the correct format of the LOH hash has been preserved
352 0         0 my %loh_hash = %{$chromosome_loh_breakpoints_hash_ref};
  0         0  
353 0         0 for my $key1 (keys %loh_hash){
354 0         0 my @a = @{$loh_hash{$key1}};
  0         0  
355 0         0 my $size = @a;
356              
357 0 0       0 if($size % 2 != 0){
358 0         0 die "ERROR: odd number of loh breakpoints recorded for chromosome $key1\n";
359             }
360             }
361              
362             }
363              
364 0         0 print "--calculating chromosome mutation densities\n";
365 0         0 $genome_mutation_density_hash_ref = calculate_genome_localization($output_directory, $chromosome_copy_number_count_hash_ref, $chromosome_translocation_count_hash_ref);
366 0         0 print "---done calculating chromosome mutation densities\n\n";
367              
368 0         0 print "--calculating chromosome region mutation densities\n";
369 0         0 ($suspect_regions_array_ref, $likely_regions_array_ref, $genome_cnv_data_windows_hash_ref, $genome_trans_data_windows_hash_ref, $genome_mutation_data_windows_hash_ref) = calculate_chromosome_localization($output_directory, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $bin_size, $localization_window_size);
370 0         0 print "---done calculating chromosome region mutation densities\n\n";
371              
372 0         0 print "--analyzing suspect regions\n";
373 0         0 analyze_suspect_regions($output_directory, $suspect_regions_array_ref, $genome_mutation_density_hash_ref, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $bin_size, $localization_window_size, $tp53_mutated, $tp53_mutation_found, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref);
374 0         0 print "---done analyzing suspect regions\n\n";
375              
376 0         0 print "--analyzing likely regions\n";
377 0         0 analyze_likely_regions($output_directory, $likely_regions_array_ref, $genome_mutation_density_hash_ref, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $bin_size, $localization_window_size);
378 0         0 print "---done analyzing likely regions\n\n";
379              
380 0         0 print "--calculating copy number count\n";
381 0         0 check_copy_number_count($output_directory, $chromosome_copy_number_count_hash_ref);
382 0         0 print "---done calculating copy number count\n\n";
383              
384 0         0 print "--calculating switch count\n";
385 0         0 check_copy_number_switches($output_directory, $chromosome_copy_number_count_hash_ref);
386 0         0 print "---done calculating switch count\n\n";
387              
388 0         0 print "--calculating interchromosomal translocation rate\n";
389 0         0 calculate_interchromosomal_translocation_rate($output_directory, $chromosome_translocation_count_hash_ref);
390 0         0 print "---done calculating interchromosomal translocation rate\n";
391              
392             }#sub run
393              
394             #=head2 Sub-Method: validate_input
395              
396             ### validate_input ################################################################################
397             # Description:
398             # Validates command line arguments. Prints error messages if some input if invalid.
399             #
400             # Input variables:
401             # $argv_ref: reference to @ARGV
402             # $cnv_directory_ref: reference to variable storing the CNV input directory
403             # $trans_directory_ref: reference to variable storing the translocation input directory
404             # $insertion_directory_ref: reference to variable storing the insertion input directory
405             # $loh_directory_ref: reference to variable storing the LOH input directory
406             # $tp53_mutated_ref: reference to variable storing the tp53 mutated flag
407             # $output_directory_ref: reference to variable storing the output directory
408             # $config_file_path_ref: reference to variable storing the path to the config file
409              
410             #=cut
411             sub validate_input {
412              
413             #Parse parameters
414 0     0 0 0 my $argv_ref = shift;
415 0         0 my @argv = @{$argv_ref};
  0         0  
416              
417 0         0 my $cnv_directory_ref = shift;
418 0         0 my $trans_directory_ref = shift;
419 0         0 my $insertion_directory_ref = shift;
420 0         0 my $loh_directory_ref = shift;
421 0         0 my $tp53_mutated_ref = shift;
422 0         0 my $output_directory_ref = shift;
423 0         0 my $config_file_path_ref = shift;
424            
425             #Determine number of command line arguements
426 0         0 $ARGC = @argv;
427              
428             #Parse the command line arguements
429 0         0 given ($ARGC) {
430 0         0 when (/^0$/) { usage(0); } #Print error message if no arguements were entered
  0         0  
431              
432 0         0 when (/^1$/) { #Check for help option
433 0 0       0 if($argv[0] eq "--help"){
434 0         0 man_text();
435             }
436             else{
437 0         0 usage(1);
438             }
439             }#case 1
440              
441 0         0 default {
442              
443 0 0       0 if($argv[$pos] eq "--cnv"){ #Check for the cnv input directory option, this field is mandatory
444 0         0 next_arg(2);
445 0         0 $$cnv_directory_ref = $argv[$pos];
446 0 0       0 if(!(substr($$cnv_directory_ref,-1,1) eq '/')){
447 0         0 $$cnv_directory_ref = $$cnv_directory_ref.'/';
448             }
449 0         0 next_arg(3);
450             }
451             else {
452 0         0 usage(4);
453             }
454              
455              
456 0 0       0 if($argv[$pos] eq "--trans"){ #Check for the translocation input directory option, this field is mandatory
457 0         0 next_arg(5);
458 0         0 $$trans_directory_ref = $argv[$pos];
459 0 0       0 if(!(substr($$trans_directory_ref,-1,1) eq '/')){
460 0         0 $$trans_directory_ref = $$trans_directory_ref.'/';
461             }
462 0         0 next_arg(6);
463             }
464             else {
465 0         0 usage(7);
466             }
467              
468 0 0       0 if($argv[$pos] eq "--insrt"){ #Check for the insertion input directory option
469 0         0 next_arg(8);
470 0         0 $$insertion_directory_ref = $argv[$pos];
471 0 0       0 if(!(substr($$insertion_directory_ref,-1,1) eq '/')){
472 0         0 $$insertion_directory_ref = $$insertion_directory_ref.'/';
473             }
474 0         0 $insertion_data_present = 1;
475 0         0 next_arg(9);
476             }
477              
478 0 0       0 if($argv[$pos] eq "--loh"){ #Check for the LOH input directory option
479 0         0 next_arg(10);
480 0         0 $$loh_directory_ref = $argv[$pos];
481 0 0       0 if(!(substr($$loh_directory_ref,-1,1) eq '/')){
482 0         0 $$loh_directory_ref = $$loh_directory_ref.'/';
483             }
484 0         0 $LOH_data_present = 1;
485 0         0 next_arg(11);
486             }
487              
488 0 0       0 if($argv[$pos] eq "--tp53"){ #Check for the TP53 gene mutation check option
489 0         0 $$tp53_mutated_ref = 1;
490 0         0 next_arg(12);
491             }
492              
493 0 0       0 if($argv[$pos] eq "--config"){ #Check for the config file option, this field is mandatory
494 0         0 next_arg(13);
495 0         0 $$config_file_path_ref = $argv[$pos];
496 0         0 next_arg(14);
497             }
498             else{
499 0         0 usage(15);
500             }
501              
502 0 0       0 if($argv[$pos] eq "--output"){ #Check for the output directory option, this field is mandatory
503 0         0 next_arg(16);
504 0         0 $$output_directory_ref = $argv[$pos];
505 0 0       0 if(!(substr($$output_directory_ref,-1,1) eq '/')){
506 0         0 $$output_directory_ref = $$output_directory_ref.'/';
507             }
508             }
509             else {
510 0         0 usage(17);
511             }
512              
513             #Check that there are no other command line arguments
514 0 0       0 if($pos != $ARGC-1){
515 0         0 usage(18);
516             }
517             }#default case
518             }#given ($ARGC)
519              
520             }#sub validate_input
521              
522             #=head2 Sub-Method: analyze_cnv_data
523              
524             ### analyze_cnv_data ##############################################################################
525             # Description:
526             # Reads data from files located in the CNV input directory and populates:
527             # $genome_cnv_data_hash_ref
528             # $chromosome_copy_number_count_hash_ref
529             # $chromosome_cnv_breakpoints_hash_ref
530             #
531             # Input variables:
532             # $output_directory: stores the path to the output directory
533             # $cnv_files_array_ref: reference to array containing all the CNV input files
534             # $bin_size: stores the size of the bins which the chromosome will be divided into
535             # $tp53_mutation_found_ref: reference to the tp53 mutation found flag
536              
537             #=cut
538             sub analyze_cnv_data {
539              
540             #Parse the parameters
541 11     11 0 16050574 my $output_directory = shift;
542              
543 11         22 my $cnv_files_array_ref = shift;
544 11         26 my @cnv_files = @$cnv_files_array_ref;
545              
546 11         21 my $bin_size = shift;
547 11         18 my $tp53_mutation_found_ref = shift;
548              
549 11         19 my %genome_cnv_data = (); #hash
550             #key: chromosome eg. 1,2,X,Y
551             #value: a reference to an array where each element corresponds to a bin along the
552             # chromosome
553              
554 11         19 my @file_data; #an array storing all the entries from every input file
555              
556             my $CURRENT_FILE; #file handle to the current file that is open
557 0         0 my $TP53_FILE; #file handle to the TP53 CNV mutation output file
558              
559 0         0 my $line; #stores raw line read in from file
560 0         0 my @line_data; #stores tokenized line read in from file
561              
562 11         22 my %chromosome_copy_number_count = (); #hash {chr}{copy number}{count}
563             #key1: chromosome eg. 1,2,X,Y
564             #key2: a copy-number state eg 0,1,3,15
565             #value: the number of region on key1 that have a copy number of key2
566              
567 11         205 my %chromsome_cnv_breakpoints = ( #hash {chr}[start and end pairs]
568             X => [], #key: chromosome eg. 1,2,X,Y
569             Y => [], #value: an array that stored an ordered list of CNV breakpoints on key
570             1 => [],
571             2 => [],
572             3 => [],
573             4 => [],
574             5 => [],
575             6 => [],
576             7 => [],
577             8 => [],
578             9 => [],
579             10 => [],
580             11 => [],
581             12 => [],
582             13 => [],
583             14 => [],
584             15 => [],
585             16 => [],
586             17 => [],
587             18 => [],
588             19 => [],
589             20 => [],
590             21 => [],
591             22 => []
592             );
593              
594             #Read the contents of the cnv files into memory
595 11 100       50 if($#cnv_files==-1){
596 1         7 die "ERROR: no cnv files found in analyze_cnv_data\n";
597             }
598              
599 10         28 foreach my $file (@cnv_files){
600             #Open the file
601 16 100       559 open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";
602              
603             #Check that the file is not empty
604 15 50       6695 if(eof($CURRENT_FILE)){
605 0         0 close ($CURRENT_FILE);
606 0         0 die "ERROR: $file is empty\n";
607             }
608              
609             #Read header line and validate
610 15         58 $line = <$CURRENT_FILE>;
611 15         29 chomp($line);
612              
613             #Check that the format of the header line is correct
614 15 100       92 if(!($line =~ m/^#chr\tstart\tend\tnumber\tquality$/)){
615 1         14 close ($CURRENT_FILE);
616 1         38 die "ERROR: header of cnv file $file is invalid\n";
617             }
618              
619             #Read all the data lines in the file
620 14         40 while( !(eof($CURRENT_FILE)) ){
621             #read data line
622 836         1273 $line = <$CURRENT_FILE>;
623 836         643 chomp($line);
624              
625             #Validate the data line
626 836 100       2702 if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
627 1         32 die "ERROR: invalid line found ($line) in file $file\n";
628             }
629              
630             #Split the data line and add it to the file_data array
631 835         2515 @line_data = (split (/\t/,$line));
632              
633 835         2396 push(@file_data,[@line_data]);
634             }
635              
636 13         155 close ($CURRENT_FILE);
637              
638             }#foreach my $file (@cnv_files)
639              
640             #Check that there are no overlapping CNV regions with different copy-numbers
641 7         40 @file_data = check_for_overlaps("cnv", \@file_data);
642              
643              
644             #Create TP53 directory and output folder
645 7 100       963 mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
646 7 50       115 if(!(-e "$output_directory"."TP53")){
647 0         0 die "ERROR: could not create folder $output_directory"."TP53\n";
648             }
649              
650             #Create the TP53 CNV mutation file
651 7 50       674 open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spc") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spc";
652              
653             #Print the header of the file (same format as a .spc file)
654 7         134 print $TP53_FILE "#chr\tstart\tend\tnumber\tquality";
655              
656             #For every data line that was read in from an input file,
657             #record the CNV mutation in the genome_cnv_data_hash,
658             #record the exact breakpoints of the CNV,
659             #update the chromosome_copy_number_count hash and
660             #check if the CNV is in the TP53 region
661 7         42 for (my $n = 0; $n < scalar(@file_data); $n++){
662 739         829 my $hash = {};
663              
664             #Ensure that the chromosome value is valid
665 739 50       3693 if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
666 0         0 die "ERROR: invalid line found in CNV input file: @{$file_data[$n]}\n";
  0         0  
667             }
668              
669             #Parse the chromosome
670 739         1259 my $chr = $2;
671              
672             #Increment the copy-number count hash based on the line data
673 739         1616 $chromosome_copy_number_count {$chr}{$file_data[$n][3]}++;
674              
675             #Record the exact breakpoints of the CNV
676 739         637 push (@{$chromsome_cnv_breakpoints{$chr}}, ($file_data[$n][1],$file_data[$n][2]));
  739         2005  
677              
678             #Calculate the bin for the start and end breakpoint
679 739         1289 my $start_index = int($file_data[$n][1]/$bin_size);
680 739         946 my $end_index = int($file_data[$n][2]/$bin_size);
681              
682             my $update_bin = sub {
683 0     0   0 my $source_chr = shift;
684 0         0 my $index = shift;
685 0         0 my $copy_num = shift;
686              
687 0         0 my $genome_hash_ref = shift;
688 0         0 my %genome_hash = %{$genome_hash_ref};
  0         0  
689            
690 0         0 my $hash = ();
691              
692 0 0       0 if(!defined(@{$genome_hash{$source_chr}}[$index])){
  0         0  
693 0         0 $hash->{'BPcount'} = 1;
694 0         0 $hash->{$copy_num} = 0.5;
695 0         0 @{$genome_hash{$source_chr}}[$index] = $hash;
  0         0  
696             }
697             else{ #if a bin does exist then increment the counts
698 0         0 ${@{$genome_hash{$source_chr}}[$index]}{'BPcount'}++;
  0         0  
  0         0  
699 0         0 ${@{$genome_hash{$source_chr}}[$index]}{$copy_num}+=0.5;
  0         0  
  0         0  
700             }
701 739         3330 };
702              
703             #Check if a bin exists at the start index
704             #if not, create one
705 739 50       690 if(!defined(@{$genome_cnv_data{$chr}}[$start_index])){
  739         1428  
706 739         1053 $hash->{'BPcount'} = 1;
707 739         986 $hash->{$file_data[$n][3]} = 0.5;
708 739         621 @{$genome_cnv_data{$chr}}[$start_index] = $hash;
  739         84957  
709             }
710             else{ #If one does exist increment the counts
711 0         0 ${@{$genome_cnv_data{$chr}}[$start_index]}{'BPcount'}++;
  0         0  
  0         0  
712 0         0 ${@{$genome_cnv_data{$chr}}[$start_index]}{$file_data[$n][3]}+=0.5;
  0         0  
  0         0  
713             }
714              
715 739         1268 $hash = {};
716              
717             #Check if a bin exists at the end index
718             #if not, create one
719 739 100       693 if(!defined(@{$genome_cnv_data{$chr}}[$end_index])){
  739         1432  
720 738         1158 $hash->{'BPcount'} = 1;
721 738         1049 $hash->{$file_data[$n][3]} = 0.5;
722 738         646 @{$genome_cnv_data{$chr}}[$end_index] = $hash;
  738         4641  
723             }
724             else{ #If one does exist increment the counts
725 1         2 ${@{$genome_cnv_data{$chr}}[$end_index]}{'BPcount'}++;
  1         3  
  1         3  
726 1         2 ${@{$genome_cnv_data{$chr}}[$end_index]}{$file_data[$n][3]}+=0.5;
  1         1  
  1         6  
727             }
728              
729              
730             #Check if the variation was in the TP53 gene
731 739 100 100     8074 if(
      100        
732             ( $chr ne 'X' && $chr ne 'Y' ) &&
733             ( $chr==17 )
734             ){
735 37 100 66     622 if(
      33        
736             ( $file_data[$n][3] != 2 ) &&
737             ( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) )
738             ){
739             #If a CNV was found in the TP53 region
740 1         2 $$tp53_mutation_found_ref = 1;
741              
742             #Record the mutation in the TP53 CNV file
743 1         3 print $TP53_FILE "\n";
744 1         2 for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
  6         22  
745 5         36 print $TP53_FILE "$file_data[$n][$i]";
746 5 100       3 if($i != scalar(@{$file_data[$n]})-1){
  5         13  
747 4         5 print $TP53_FILE "\t";
748             }#if
749             }#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
750             }#if
751             }#if
752             }#for (my $n = 0; $n < scalar(@file_data); $n++)
753              
754 7         1952 close($TP53_FILE);
755              
756             #return hash
757 7         562 return(\%genome_cnv_data, \%chromosome_copy_number_count, \%chromsome_cnv_breakpoints);
758              
759             }#sub analyze_cnv_data
760              
761             #=head2 Sub-Method: check_for_overlaps
762              
763             ### check_for_overlaps ############################################################################
764             # Description:
765             # Checks if there were overlapping CNV regions with different copy-numbers in the input files.
766             # Also checks if there are any overlapping translocation destinations or overlapping LOH
767             # regions.
768             #
769             # Input variables:
770             # $type: flag variable indicating which type of overlap to check for "cnv", "trans",
771             # or "loh"
772             # $file_data_ref: reference to an array storing all the data lines read in from the specific
773             # type of input file
774              
775             #=cut
776             sub check_for_overlaps {
777              
778 7     7 0 59 my $type = shift;
779 7         13 my $file_data_ref = shift;
780 7         70 my @file_data = @$file_data_ref;
781              
782 7         39 my $start_overlap = 0; #Flag variable indicating if the start position of one region is overlapping with
783             #another region
784              
785 7         22 my $end_overlap = 0; #Flag variable indicating if the end position of one region is overlapping with
786             #another region
787              
788             #Check for overlapping regions
789             #Compares each entry in the array to every following entry
790 7         45 for (my $n = 0; $n < scalar(@file_data); $n++){
791 739         1232 for (my $k = $n+1; $k < scalar(@file_data); $k++){
792              
793             #Check if the 2 regions in question are on the same chromosome
794 55656 100       124506 if($file_data[$n][0] eq $file_data[$k][0]) {
795              
796             #Check if the end point of region 1 is within region 2
797 2862 100 100     7298 if($file_data[$n][2]>=$file_data[$k][1] && $file_data[$n][2]<=$file_data[$k][2]){
798 96         103 $end_overlap = 1;
799             }
800              
801             #Check if the start point of region 1 is within region 2
802 2862 100 100     6389 if($file_data[$n][1]>=$file_data[$k][1] && $file_data[$n][1]<=$file_data[$k][2]){
803 78         95 $start_overlap = 1;
804             }
805              
806             #If an overlap was detected
807 2862 100 100     16682 if($start_overlap==1 || $end_overlap==1) {
    50 66        
808              
809             #if it was a translocation overlap then throw an error
810 96 50       200 if($type eq "trans"){
811 0         0 die "ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
  0         0  
  0         0  
812             }
813              
814             #if it was a LOH overlap then throw an error
815 96 50       171 if($type eq "loh"){
816 0         0 die "ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
  0         0  
  0         0  
817             }
818              
819             #if it was a CNV overlap check if the copy numbers are the same
820             #If they are different throw an error
821 96 50 33     597 if(
    50          
    50          
822             ( $type eq "cnv") &&
823             ( $file_data[$n][3] != $file_data[$k][3] )
824             ){
825 0         0 die "ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
  0         0  
  0         0  
826             }
827              
828             #if they are the same
829             #and the user does not wish to collapse overlapping regions with the same copy number then throw an error
830             elsif ($collapse_regions==0) {
831 0         0 die "ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
  0         0  
  0         0  
832             }
833              
834             #if the user wishes to collapses overlapping regions with the same copy number then do so
835             elsif ($collapse_regions==1) {
836             #Region 2 completely encompasses region 1
837             #So replace region 1 with region 2
838 96 100 66     340 if($start_overlap==1 && $end_overlap==1){
    50          
    50          
839 78         158 $file_data[$n][1] = $file_data[$k][1];
840 78         102 $file_data[$n][2] = $file_data[$k][2];
841             }
842             #The start point of region 1 is within region 2
843             #So replace the start point of region 1 with the start point of region 2
844             elsif($start_overlap==1){
845 0         0 $file_data[$n][1] = $file_data[$k][1];
846             }
847             #The end point of region 1 is within region 2
848             #So replace the end point of region 1 with the end point of region 2
849             elsif($end_overlap==1){
850 18         41 $file_data[$n][2] = $file_data[$k][2];
851             }
852             #If region 1 was modified then remove region 2 and re-check for overlaps
853 96 50 66     377 if($start_overlap==1 || $end_overlap==1){
854 96         2482 @file_data = (@file_data[0..($k-1),($k+1)..(scalar(@file_data)-1)]);
855 96         539 $start_overlap = 0;
856 96         88 $end_overlap = 0;
857 96         92 $k = $n+1;
858 96         129 redo;
859             }
860              
861             }#elsif ($collapse_regions==1)
862             }#if($start_overlap==1 || $end_overlap==1)
863              
864             #Check if region 1 completely encompasses region 2
865             elsif($file_data[$n][1]<=$file_data[$k][1] && $file_data[$n][2]>=$file_data[$k][2]){
866              
867 0 0       0 if($type eq "trans"){
868 0         0 die "ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
  0         0  
  0         0  
869             }
870              
871 0 0       0 if($type eq "loh"){
872 0         0 die "ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
  0         0  
  0         0  
873             }
874              
875             #If the copy numbers are different throw an error
876 0 0 0     0 if(
    0          
    0          
877             ( $type eq "cnv") &&
878             ( $file_data[$n][3] != $file_data[$k][3] )
879             ){
880 0         0 die "ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
  0         0  
  0         0  
881             }
882              
883             #If the copy numbers are the same but the user does not want to collapse then throw an error
884             elsif ($collapse_regions==0) {
885 0         0 die "ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
  0         0  
  0         0  
886             }
887              
888             #If the user does wish to collapse then remove the second region
889             elsif ($collapse_regions==1) {
890 0         0 @file_data = (@file_data[0..($k-1),($k+1)..(scalar(@file_data)-1)]);
891             }
892             }#elsif($file_data[$n][1]<=$file_data[$k][1] && $file_data[$n][2]>=$file_data[$k][2])
893             }#if($file_data[$n][0] eq $file_data[$k][0])
894             }#for (my $n = 0; $n < scalar(@file_data); $n++)
895             }#for (my $n = 0; $n < scalar(@file_data); $n++)
896              
897             #Return the updated entries
898 7         278 return (@file_data);
899              
900             }#sub check_for_overlaps
901              
902             #=head2 Sub-Method: analyze_trans_data
903              
904             ### analyze_trans_data ############################################################################
905             # Description:
906             # Reads data from files located in the trans input directory and popultates:
907             # $genome_trans_data_hash_ref
908             # $chromosome_translocation_count_hash_ref
909             # $genome_trans_breakpoints_hash_ref
910             #
911             # Input variables:
912             # $output_directory: stores the path to the output directory
913             # $trans_files_array_ref: reference to array containing all the translocation
914             # input files
915             # $bin_size: stores the size of the bins which the chromosome will be divided into
916             # $tp53_mutation_found_ref: reference to the tp53 mutation found flag
917              
918             #=cut
919             sub analyze_trans_data {
920              
921             #Parse the parameters
922 10     10 0 8172214 my $output_directory = shift;
923 10         57 my $trans_files_array_ref = shift;
924 10         26 my @trans_files = @$trans_files_array_ref;
925              
926 10         13 my $bin_size = shift;
927              
928 10         14 my $tp53_mutation_found_ref = shift;
929              
930 10         19 my %genome_trans_data = (); #hash
931             #key: chromosome eg. 1,2,X,Y
932             #value: a reference to an array where each element corresponds to a bin along the
933             # chromosome
934              
935 10         14 my @file_data; #an array storing all the entries from every input file
936              
937             my $CURRENT_FILE; #file handle to the current file that is open
938 0         0 my $TP53_FILE; #file handle to the TP53 translocation mutation output file
939              
940 0         0 my $line; #stores raw line read in from file
941 0         0 my @line_data; #stores tokenized line read in from file
942              
943 0         0 my $chr1;
944 0         0 my $chr2;
945              
946 10         16 my %chromosome_trans_count = (); #hash {chr}{chr}{count}
947             #key1: chromosome eg. 1,2,X,Y
948             #key2: chromosome eg. 1,2,X,Y
949             #value: the number of translocations between key1 and key2
950              
951 10         175 my %genome_trans_breakpoints = ( #hash {chr}[array]
952             X => [], #key: chromosome eg. 1,2,X,Y
953             Y => [], #value: an array storing all the translocation breakpoints
954             1 => [], # on key
955             2 => [],
956             3 => [],
957             4 => [],
958             5 => [],
959             6 => [],
960             7 => [],
961             8 => [],
962             9 => [],
963             10 => [],
964             11 => [],
965             12 => [],
966             13 => [],
967             14 => [],
968             15 => [],
969             16 => [],
970             17 => [],
971             18 => [],
972             19 => [],
973             20 => [],
974             21 => [],
975             22 => []
976             );
977              
978              
979             #Read the contents of the cnv files into memory
980 10 100       43 if($#trans_files==-1){
981 1         8 die "ERROR: no trans files found in analyze_trans_data\n";
982             }
983              
984 9         23 foreach my $file (@trans_files){
985             #open the file
986 14 50       582 open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";
987              
988             #check that the file is not empty
989 14 100       2187 if(eof($CURRENT_FILE)){
990 1         6 close ($CURRENT_FILE);
991 1         24 die "ERROR: $file is empty\n";
992             }
993              
994             #read header line and validate
995 13         53 $line = <$CURRENT_FILE>;
996 13         25 chomp($line);
997              
998             #validate the header line
999 13 100       81 if(!($line =~ m/^#chr1\tstart\tend\tchr2\tstart\tend\tquality$/)){
1000 1         11 close ($CURRENT_FILE);
1001 1         24 die "ERROR: header of translocation file $file is invalid\n";
1002             }
1003              
1004             #read in every data line
1005 12         37 while( !(eof($CURRENT_FILE)) ){
1006             #read data line
1007 742         1245 $line = <$CURRENT_FILE>;
1008 742         677 chomp($line);
1009              
1010             #validate the format of the data line
1011 742 100       3183 if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
1012 1         33 die "ERROR: invalid line found in translocation input file: ($line) in file $file\n";
1013 0         0 next;
1014             }
1015              
1016             #Split the data line and add it to the array
1017 741         2793 @line_data = (split (/\t/,$line));
1018              
1019             #if the start position is greater than the end position for either the source or destination throw an error
1020 741 100 66     2797 if(
1021             ( $line_data[1] >= $line_data[2] ) ||
1022             ( $line_data[4] >= $line_data[5] )
1023             ){
1024             #warn "ERROR: invalid line found ($line) in file $file. Start or end values invalid\n";
1025             #next;
1026             }
1027              
1028             #Add the data line to the file_data array
1029 741         2566 push(@file_data,[@line_data]);
1030             }
1031              
1032 11         104 close ($CURRENT_FILE);
1033              
1034             }#foreach my $file (@trans_files)
1035              
1036             #ignoring overlapping translocations for now
1037             #@file_data = check_for_overlaps("trans", \@file_data);
1038              
1039             #Create TP53 directory and output folder
1040 6 100       265 mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
1041 6 50       69 if(!(-e "$output_directory"."TP53")){
1042 0         0 die "ERROR: could not create folder $output_directory"."TP53\n";
1043             }
1044              
1045             #create TP53 translocation mutation file
1046 6 50       510 open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spt") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spt";
1047            
1048             #Print the header of the file (same format as a .spt file)
1049 6         36 print $TP53_FILE "#chr1\tstart\tend\tchr2\tstart\tend\tquality";
1050              
1051             #For every data line that was read in from an input file,
1052             #record the translocation mutation in the genome_trans_data_hash,
1053             #record the exact breakpoints of the translocation,
1054             #update the chromosome_trans_count hash and
1055             #check if the translocation is in the TP53 region
1056 6         116 for (my $n = 0; $n < scalar(@file_data); $n++){
1057            
1058 741         831 my $hash = {};
1059              
1060             #verify that the chromosome 1 is valid
1061 741 50       3569 if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
1062 0         0 die "ERROR: invalid chromosome field detected in translocation file\n";
1063             }
1064              
1065             #parse out chromosome 1
1066 741         1400 my $chr1 = $2;
1067              
1068             #verify that the chromosome 2 is valid
1069 741 50       1892 if(!($file_data[$n][3] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
1070 0         0 die "ERROR: invalid chromosome field detected in translocation file\n";
1071             }
1072              
1073             #parse out chromosome 2
1074 741         825 my $chr2 = $2;
1075              
1076             #calculate the bin where each breakpoint will be placed
1077 741         1397 my $start_index1 = int($file_data[$n][1]/$bin_size);
1078 741         801 my $end_index1 = int($file_data[$n][2]/$bin_size);
1079              
1080 741         950 my $start_index2 = int($file_data[$n][4]/$bin_size);
1081 741         848 my $end_index2 = int($file_data[$n][5]/$bin_size);
1082              
1083             my $update_bin = sub {
1084 0     0   0 my $source_chr = shift;
1085 0         0 my $dest_chr = shift;
1086 0         0 my $index = shift;
1087 0         0 my $data = shift;
1088 0         0 my $type = shift;
1089              
1090 0         0 my $genome_hash_ref = shift;
1091 0         0 my %genome_hash = %{$genome_hash_ref};
  0         0  
1092            
1093 0         0 my $hash = ();
1094              
1095 0 0       0 if(!defined(@{$genome_hash{$source_chr}}[$index])){
  0         0  
1096 0         0 $hash->{'BPcount'} = 1;
1097 0         0 push (@{$hash->{$type}{$dest_chr}}, $data);
  0         0  
1098 0         0 @{$genome_hash{$source_chr}}[$index] = $hash;
  0         0  
1099             }
1100             else{ #if a bin does exist then increment the counts
1101 0         0 ${@{$genome_hash{$source_chr}}[$index]}{'BPcount'}++;
  0         0  
  0         0  
1102 0         0 push (@{${@{$genome_hash{$source_chr}}[$index]}{$type}{$dest_chr}}, $data);
  0         0  
  0         0  
  0         0  
1103             }
1104              
1105 741         2560 };
1106              
1107             #check if a bin exists at $start_index1
1108             #if not, create one
1109 741 100       713 if(!defined(@{$genome_trans_data{$chr1}}[$start_index1])){
  741         1665  
1110 516         741 $hash->{'BPcount'} = 1;
1111 516         446 push (@{$hash->{'out'}{$chr2}}, $file_data[$n][4]);
  516         1283  
1112 516         495 @{$genome_trans_data{$chr1}}[$start_index1] = $hash;
  516         39736  
1113             }
1114             else{ #if a bin does exist then increment the counts
1115 225         176 ${@{$genome_trans_data{$chr1}}[$start_index1]}{'BPcount'}++;
  225         164  
  225         341  
1116 225         189 push (@{${@{$genome_trans_data{$chr1}}[$start_index1]}{'out'}{$chr2}}, $file_data[$n][4]);
  225         163  
  225         168  
  225         531  
1117             }
1118              
1119 741         1103 $hash = {};
1120 741 100       618 if(!defined(@{$genome_trans_data{$chr1}}[$end_index1])){
  741         1328  
1121 275         405 $hash->{'BPcount'} = 1;
1122 275         214 push (@{$hash->{'out'}{$chr2}}, $file_data[$n][5]);
  275         617  
1123 275         263 @{$genome_trans_data{$chr1}}[$end_index1] = $hash;
  275         1026  
1124             }
1125             else{
1126 466         320 ${@{$genome_trans_data{$chr1}}[$end_index1]}{'BPcount'}++;
  466         350  
  466         614  
1127 466         389 push (@{${@{$genome_trans_data{$chr1}}[$end_index1]}{'out'}{$chr2}}, $file_data[$n][5]);
  466         328  
  466         349  
  466         945  
1128             }
1129              
1130 741         817 $hash = {};
1131 741 100       676 if(!defined(@{$genome_trans_data{$chr2}}[$start_index2])){
  741         1168  
1132 361         548 $hash->{'BPcount'} = 1;
1133 361         271 push (@{$hash->{'in'}{$chr1}}, $file_data[$n][1]);
  361         785  
1134 361         332 @{$genome_trans_data{$chr2}}[$start_index2] = $hash;
  361         3532  
1135             }
1136             else{
1137 380         276 ${@{$genome_trans_data{$chr2}}[$start_index2]}{'BPcount'}++;
  380         399  
  380         549  
1138 380         346 push (@{${@{$genome_trans_data{$chr2}}[$start_index2]}{'in'}{$chr1}}, $file_data[$n][1]);
  380         255  
  380         285  
  380         980  
1139             }
1140              
1141 741         788 $hash = {};
1142 741 100       675 if(!defined(@{$genome_trans_data{$chr2}}[$end_index2])){
  741         1134  
1143 160         207 $hash->{'BPcount'} = 1;
1144 160         131 push (@{$hash->{'in'}{$chr1}}, $file_data[$n][2]);
  160         360  
1145 160         144 @{$genome_trans_data{$chr2}}[$end_index2] = $hash;
  160         573  
1146             }
1147             else{
1148 581         448 ${@{$genome_trans_data{$chr2}}[$end_index2]}{'BPcount'}++;
  581         426  
  581         739  
1149 581         521 push (@{${@{$genome_trans_data{$chr2}}[$end_index2]}{'in'}{$chr1}}, $file_data[$n][2]);
  581         411  
  581         420  
  581         1222  
1150             }
1151            
1152             #Increment hash translocation counts
1153 741         992 $chromosome_trans_count{$chr1}{$chr2}++;
1154             #if the translocation is intra-chromosomal then don't count it twice
1155 741 100       1294 if($chr1 ne $chr2){
1156 316         390 $chromosome_trans_count{$chr2}{$chr1}++;
1157             }
1158              
1159             #store the breakpoints in their bins
1160 741         563 push (@{$genome_trans_breakpoints{$chr1}}, $file_data[$n][1]);
  741         1240  
1161 741         665 push (@{$genome_trans_breakpoints{$chr1}}, $file_data[$n][2]);
  741         989  
1162              
1163 741         567 push (@{$genome_trans_breakpoints{$chr2}}, $file_data[$n][4]);
  741         1028  
1164 741         570 push (@{$genome_trans_breakpoints{$chr2}}, $file_data[$n][5]);
  741         995  
1165              
1166             #Check if the translocation origin was in the TP53 gene
1167 741 100 100     3770 if(
      100        
1168             ( $chr1 ne 'X' && $chr1 ne 'Y' ) &&
1169             ( $chr1==17 )
1170             ){
1171 1 50 33     12 if(
      0        
      33        
1172             ( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) )
1173             ){
1174             #if a mutation was found, set the TP53 mutated flag
1175 1         2 $$tp53_mutation_found_ref = 1;
1176 1         3 print $TP53_FILE "\n";
1177             #print the translocation data line to the TP53 translocation mutation output file
1178 1         2 for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
  8         15  
1179 7         7 print $TP53_FILE "$file_data[$n][$i]";
1180 7 100       6 if($i != scalar(@{$file_data[$n]})-1){
  7         13  
1181 6         8 print $TP53_FILE "\t";
1182             }
1183             }#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
1184             }#if
1185             }#if
1186              
1187             #Check if the translocation destination was in the TP53 gene
1188 741 50 66     7231 if(
      66        
1189             ( $chr2 ne 'X' && $chr2 ne 'Y' ) &&
1190             ( $chr2==17 )
1191             ){
1192 0 0 0     0 if(
      0        
      0        
1193             ( ( $file_data[$n][4] >= $TP53_start && $file_data[$n][4] <= $TP53_end ) || ( $file_data[$n][5] >= $TP53_start && $file_data[$n][5] <= $TP53_end ) )
1194             ){
1195 0         0 $$tp53_mutation_found_ref = 1;
1196 0         0 print $TP53_FILE "\n";
1197 0         0 for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
  0         0  
1198 0         0 print $TP53_FILE "$file_data[$n][$i]";
1199 0 0       0 if($i != scalar(@{$file_data[$n]})-1){
  0         0  
1200 0         0 print $TP53_FILE "\t";
1201             }
1202             }#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
1203             }#if
1204             }#if
1205             }#for (my $n = 0; $n < scalar(@file_data); $n++)
1206              
1207 6         533 close($TP53_FILE);
1208              
1209             #return hash
1210 6         680 return(\%genome_trans_data, \%chromosome_trans_count, \%genome_trans_breakpoints);
1211              
1212             }#sub analyze_trans_data
1213              
1214              
1215             #=head2 Sub-Method: analyze_insertion_data
1216              
1217             ### analyze_insertion_data ##############################################################################
1218             # Description:
1219             # Reads data from files located in the insertion input directory and populates:
1220             # $genome_insertion_data_hash_ref
1221             # $chromosome_insertion_count_hash_ref
1222             # $genome_trans_insertion_breakpoints_hash_ref
1223             #
1224             # Input variables:
1225             # $output_directory: stores the path to the output directory
1226             # $insertion_files_array_ref: reference to array containing all the insertion input files
1227             # $bin_size: stores the size of the bins which the chromosome will be divided into
1228             # $genome_trans_breakpoints_hash_ref: store reference to hash that contains the translocation breakpoints on
1229             # each chromosome
1230             # $tp53_mutation_found_ref: reference to the tp53 mutation found flag
1231             #
1232              
1233             #=cut
1234             sub analyze_insertion_data {
1235              
1236             #Parse Parameters
1237 0     0 0 0 my $output_directory = shift;
1238 0         0 my $insertion_files_array_ref = shift;
1239 0         0 my @insertion_files = @$insertion_files_array_ref;
1240              
1241 0         0 my $bin_size = shift;
1242              
1243 0         0 my $genome_trans_breakpoints_hash_ref = shift;
1244 0         0 my %genome_trans_breakpoints = %$genome_trans_breakpoints_hash_ref;
1245              
1246 0         0 my $tp53_mutation_found_ref = shift;
1247              
1248 0         0 my %genome_insertion_data = (); #hash
1249             #key: chromosome eg. 1,2,X,Y
1250             #value: a reference to an array where each element corresponds to a bin along the
1251             # chromosome
1252              
1253 0         0 my $CURRENT_FILE; #file handle to the current file that is open
1254             my $TP53_FILE; #file handle to the TP53 insertion mutation output file
1255              
1256 0         0 my $line; #stores raw line read in from file
1257 0         0 my @line_data; #stores tokenized line read in from file
1258              
1259 0         0 my $chr;
1260              
1261 0         0 my $file_name;
1262 0         0 my $path;
1263 0         0 my $suffix;
1264              
1265 0         0 my $rm_insertion_file_result;
1266              
1267 0         0 my $insertion_found = 0;
1268              
1269 0         0 my %chromosome_insertion_count = (); #hash {chr}{count}
1270             #key: chromosome eg. 1,2,X,Y
1271             #value: the number of insertions found on key
1272              
1273 0         0 my %genome_trans_insertion_breakpoints = ( #hash
1274             X => [], #key: chromosome eg. 1,2,X,Y
1275             Y => [], #value: an array storing all the insertion start positions on key
1276             1 => [],
1277             2 => [],
1278             3 => [],
1279             4 => [],
1280             5 => [],
1281             6 => [],
1282             7 => [],
1283             8 => [],
1284             9 => [],
1285             10 => [],
1286             11 => [],
1287             12 => [],
1288             13 => [],
1289             14 => [],
1290             15 => [],
1291             16 => [],
1292             17 => [],
1293             18 => [],
1294             19 => [],
1295             20 => [],
1296             21 => [],
1297             22 => []
1298             );
1299              
1300             #Create TP53 directory and output folder
1301 0 0       0 mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
1302 0 0       0 if(!(-e "$output_directory"."TP53")){
1303 0         0 die "ERROR: could not create folder $output_directory"."TP53\n";
1304             }
1305              
1306             #for each file in the insertion input file array
1307 0         0 foreach my $file (@insertion_files){
1308              
1309             #Parse the file name, path and file type
1310 0         0 ( $file_name, $path, $suffix ) = File::Basename::fileparse( $file, "\.[^.]*");
1311              
1312             #open the file
1313 0 0       0 open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";
1314              
1315             #ensure that the file is not empty
1316 0 0       0 if(eof($CURRENT_FILE)){
1317 0         0 die "ERROR: $file is empty\n";
1318             }
1319              
1320             #create the TP53 insertion mutation output file
1321 0 0       0 open ($TP53_FILE, ">", "$output_directory"."TP53/$file_name"."$suffix") or die "ERROR: could not create file: $output_directory"."TP53/$file_name"."$suffix";
1322              
1323             #read header lines
1324 0         0 $line = <$CURRENT_FILE>;
1325 0         0 chomp($line);
1326              
1327             #print the VCF header lines to the TP53 insertion mutation output file
1328 0         0 while ($line =~ m/^#(.*?)/){
1329 0         0 print $TP53_FILE "$line\n";
1330 0         0 $line = <$CURRENT_FILE>;
1331 0         0 chomp($line);
1332             }
1333              
1334             #read all the data lines in the file
1335 0         0 while(1){
1336 0         0 @line_data = (split (/\t/,$line));
1337              
1338             #verify that the chromosome is valid and that the mutation is an insertion type
1339 0 0 0     0 if(
      0        
1340             ( !($line_data[1] =~ m/^[0-9]+$/) ) ||
1341             ( !($line_data[0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|x|y|[1-9])/) ) ||
1342             ( length($line_data[4]) <= length($line_data[3]) )
1343             ){
1344 0         0 warn "ERROR: invalid chromosome or non-insertion VCF data line found and skipped:\t$line\n";
1345 0         0 $line = <$CURRENT_FILE>;
1346 0 0       0 unless($line){last;}
  0         0  
1347 0         0 chomp($line);
1348 0         0 next;
1349             }
1350              
1351             #parse the chromosome
1352 0         0 $chr = $2;
1353              
1354             #change to uppercase if 'x' or 'y' is found
1355 0 0       0 if($chr eq 'x'){
1356 0         0 $chr = 'X';
1357             }
1358 0 0       0 if($chr eq 'y'){
1359 0         0 $chr = 'Y';
1360             }
1361              
1362             #increment the insertion count of the chromosome
1363 0         0 $chromosome_insertion_count{$chr}++;
1364              
1365             #check if a bin exists at the insertion start position
1366             #if one does not, then create one
1367 0 0       0 if(!defined(@{$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)])){
  0         0  
1368 0         0 ${$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)] = 1;
  0         0  
1369             }
1370             else{ #if one does, then increment the count
1371 0         0 ${$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)]++;
  0         0  
1372             }
1373              
1374             #Search through the list of translocation breakpoints on the same chromosome
1375 0         0 foreach my $bp (@{$genome_trans_breakpoints{$chr}}){
  0         0  
1376             #if the insertion is within 10bps of the breakpoints and the insertion to the list stored in
1377             #the genome_trans_insertion_breakpoints hash
1378 0 0 0     0 if( $line_data[1] < $bp+10 && $line_data[1] > $bp-10){
1379 0         0 push (@{$genome_trans_insertion_breakpoints{$chr}}, $bp);
  0         0  
1380             }
1381             }
1382              
1383             #Check if the insertion was in the TP53 gene
1384 0 0 0     0 if(
      0        
1385             ( $chr ne 'X' && $chr ne 'Y' ) &&
1386             ( $chr==17 )
1387             ){
1388 0 0 0     0 if($line_data[1] >= $TP53_start && $line_data[1] <= $TP53_end){
1389 0         0 $$tp53_mutation_found_ref = 1; #if a mutation was found in the region set the TP53 mutated flag
1390 0         0 $insertion_found = 1;
1391 0         0 print $TP53_FILE "$line\n"; #print the culprit data line to the TP53 insertion
1392             #mutation output file
1393             }
1394             }
1395              
1396             #read the next line
1397 0         0 $line = <$CURRENT_FILE>;
1398             #check that the end of file has not been reached
1399 0 0       0 unless($line){last;}
  0         0  
1400 0         0 chomp($line);
1401             }#while(1)
1402              
1403             #close file
1404 0         0 close($CURRENT_FILE);
1405 0         0 close ($TP53_FILE);
1406              
1407             #if an insertion was not found in the current file then delete the created TP53 insertion mutation output file
1408 0 0       0 if($insertion_found!=1){
1409 0         0 my $dir = "$output_directory"."TP53/$file_name"."$suffix";
1410 0         0 $rm_insertion_file_result = `rm $dir`;
1411             }
1412              
1413 0         0 $insertion_found = 0;
1414             }#foreach my $file (@insertion_files)
1415              
1416             #return hash
1417 0         0 return(\%genome_insertion_data, \%chromosome_insertion_count, \%genome_trans_insertion_breakpoints);
1418              
1419             }#sub analyze_insertion_data
1420              
1421              
1422             #=head2 Sub-Method: analyze_loh_data
1423              
1424             ### analyze_loh_data ##############################################################################
1425             # Description:
1426             # Reads data from files located in the LOH input directory and populates:
1427             # $chromosome_loh_breakpoints_hash_ref
1428             #
1429             # Input variables:
1430             # $output_directory: stores the path to the output directory
1431             # $loh_files_array_ref: reference to array containing all the LOH input files
1432             # $tp53_mutation_found_ref: reference to tp53 mutation found flag
1433             #
1434              
1435             #=cut
1436              
1437             sub analyze_loh_data {
1438              
1439             #parse the parameters
1440 0     0 0 0 my $output_directory = shift;
1441 0         0 my $loh_files_array_ref = shift;
1442 0         0 my @loh_files = @$loh_files_array_ref;
1443              
1444 0         0 my $tp53_mutation_found_ref = shift;
1445              
1446 0         0 my @file_data; #an array storing all the entries from every input file
1447              
1448             my $CURRENT_FILE; #file handle to the current file that is open
1449 0         0 my $TP53_FILE; #file handle to the TP53 translocation mutation output file
1450              
1451 0         0 my $line; #stores raw line read in from file
1452 0         0 my @line_data; #stores tokenized line read in from files
1453              
1454 0         0 my %chromsome_loh_breakpoints = ( #hash {chr}[start and end pairs]
1455             X => [], #key: chromosome eg. 1,2,X,Y
1456             Y => [], #value: an array that stores all the LOH breakpoints on key
1457             1 => [],
1458             2 => [],
1459             3 => [],
1460             4 => [],
1461             5 => [],
1462             6 => [],
1463             7 => [],
1464             8 => [],
1465             9 => [],
1466             10 => [],
1467             11 => [],
1468             12 => [],
1469             13 => [],
1470             14 => [],
1471             15 => [],
1472             16 => [],
1473             17 => [],
1474             18 => [],
1475             19 => [],
1476             20 => [],
1477             21 => [],
1478             22 => []
1479             );
1480              
1481             #Read the contents of the cnv files into memory
1482 0         0 foreach my $file (@loh_files){
1483             #open the file
1484 0 0       0 open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";
1485              
1486             #Ensure that the file is not empty
1487 0 0       0 if(eof($CURRENT_FILE)){
1488 0         0 close ($CURRENT_FILE);
1489 0         0 die "ERROR: $file is empty\n";
1490             }
1491              
1492             #read header line and validate
1493 0         0 $line = <$CURRENT_FILE>;
1494 0         0 chomp($line);
1495              
1496             #Validate the header line
1497 0 0       0 if(!($line =~ m/^#chr\tstart\tend\tquality$/)){
1498 0         0 close ($CURRENT_FILE);
1499 0         0 die "ERROR: header of loh file $file is invalid\n";
1500             }
1501              
1502             #Read all the data lines
1503 0         0 while( !(eof($CURRENT_FILE)) ){
1504            
1505             #read data line
1506 0         0 $line = <$CURRENT_FILE>;
1507 0         0 chomp($line);
1508              
1509             #validate the data line
1510 0 0       0 if(!($line =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
1511 0         0 die "ERROR: invalid line found ($line) in file $file\n";
1512             }
1513              
1514             #Split the data line and add it to the array
1515 0         0 @line_data = (split (/\t/,$line));
1516 0         0 push(@file_data,[@line_data]);
1517             }
1518              
1519 0         0 close ($CURRENT_FILE);
1520              
1521             }#foreach my $file (@cnv_files)
1522              
1523             #Ensure that there are no overlapping LOH regions, or join them if the user indicated
1524 0         0 @file_data = check_for_overlaps("loh", \@file_data);
1525              
1526             #Create TP53 directory and output folder
1527 0 0       0 mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
1528 0 0       0 if(!(-e "$output_directory"."TP53")){
1529 0         0 die "ERROR: could not create folder $output_directory"."TP53\n";
1530             }
1531              
1532             #Create the TP53 LOH mutation output data file
1533 0 0       0 open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spl") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spl";
1534             #Print the header for the output file (same format as a .spl file)
1535 0         0 print $TP53_FILE "#chr\tstart\tend\tquality";
1536              
1537             #For every data line that was read in
1538 0         0 for (my $n = 0; $n < scalar(@file_data); $n++){
1539              
1540             #Validate the chromosome field
1541 0 0       0 if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
1542 0         0 die "ERROR: invalid chromosome field detected\n";
1543             }
1544              
1545             #Parse the chromosome
1546 0         0 my $chr = $2;
1547              
1548             #Add the breakpoints to the array for the chromosome
1549 0         0 push (@{$chromsome_loh_breakpoints{$chr}}, ($file_data[$n][1],$file_data[$n][2]));
  0         0  
1550              
1551              
1552             #Check if the variation was in the TP53 gene
1553 0 0 0     0 if(
      0        
1554             ( $chr ne 'X' && $chr ne 'Y' ) &&
1555             ( $chr==17 )
1556             ){
1557 0 0 0     0 if(
      0        
      0        
1558             ( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) || ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) )
1559             ){
1560             #If a mutation was found in the TP53 region then set the TP53 mutated flag
1561 0         0 $$tp53_mutation_found_ref = 1;
1562              
1563             #Print the LOH data line to the TP53 LOH mutation output file
1564 0         0 print $TP53_FILE "\n";
1565 0         0 for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
  0         0  
1566 0         0 print $TP53_FILE "$file_data[$n][$i]";
1567 0 0       0 if($i != scalar(@{$file_data[$n]})-1){
  0         0  
1568 0         0 print $TP53_FILE "\t";
1569             }#if
1570             }#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
1571             }#if
1572             }#if
1573              
1574             }#for (my $n = 0; $n < scalar(@file_data); $n++)
1575              
1576 0         0 close($TP53_FILE);
1577              
1578             #return hash
1579 0         0 return(\%chromsome_loh_breakpoints);
1580              
1581             }#sub analyze_loh_data
1582              
1583              
1584              
1585             #=head2 Sub-Method: calculate_genome_localization
1586              
1587             ### calculate_genome_localization #################################################################
1588             # Description:
1589             # Caculates the mutation density for each chromosome
1590             #
1591             # Input variables:
1592             # $output_directory: stores the path to the output directory
1593             # $chromosome_copy_number_count_hash_ref: stores a reference to the hash storing the number
1594             # of CNV events on each chromosome
1595             # #chromosome_translocation_count_hash_ref: stores a reference to the hash storing the number
1596             # of translocation events on each chromosome
1597             #
1598              
1599             #=cut
1600              
1601             sub calculate_genome_localization {
1602              
1603             #parse the parameters
1604 2     2 0 22 my $output_directory = shift;
1605 2         5 my $chromosome_copy_number_count_hash_ref = shift;
1606 2         4 my $chromosome_translocation_count_hash_ref = shift;
1607              
1608 2         4 my %chromosome_mutation_count; #hash
1609             #key: chromosome eg. 1,2,X,Y
1610             #value: the density of translocation and CNV events on key
1611            
1612             my $density; #store the mutation density for a chromosome
1613              
1614 0         0 my $OUTPUT_FILE; #file handle to the output file
1615              
1616             #initialize all the counts to 0
1617 2         13 for (my $i=1; $i<23; $i++){
1618 44         134 $chromosome_mutation_count{$i} = 0;
1619             }
1620 2         6 $chromosome_mutation_count{'X'} = 0;
1621 2         4 $chromosome_mutation_count{'Y'} = 0;
1622              
1623             #add the number of CNV events on each chromosome
1624 2         19 for my $cnv_key1 ( keys %$chromosome_copy_number_count_hash_ref){
1625 40         29 for my $cnv_key2 (keys %{$chromosome_copy_number_count_hash_ref->{$cnv_key1}}){
  40         99  
1626 160         217 $chromosome_mutation_count{$cnv_key1} += $chromosome_copy_number_count_hash_ref->{$cnv_key1}->{$cnv_key2};
1627             }
1628             }
1629            
1630             #add the number of translocation events on each chromosome
1631 2         10 for my $trans_key1 ( keys %$chromosome_translocation_count_hash_ref){
1632 26         19 for my $trans_key2 (keys %{$chromosome_translocation_count_hash_ref->{$trans_key1}}){
  26         46  
1633 52         62 $chromosome_mutation_count{$trans_key1} += $chromosome_translocation_count_hash_ref->{$trans_key1}->{$trans_key2};
1634             }
1635             }
1636              
1637             #Create the output file
1638 2 50       238 open ($OUTPUT_FILE, ">", "$output_directory/genome_localization.log") or die "ERROR: could not create file $output_directory/genome_localization.log\n";
1639             #Print the header
1640 2         15 print $OUTPUT_FILE "#chr\tcount\tdensity\n";
1641              
1642             #for each chromosome print the count and overall density
1643 9     9   6922 {use sort 'stable';
  9         5236  
  9         50  
  2         3  
1644 2         24 for my $chr ( sort keys %chromosome_mutation_count){
1645 48         61 $density = $chromosome_mutation_count{$chr}/$chromosome_length{$chr};
1646            
1647 48         37 print $OUTPUT_FILE "$chr";
1648 48         42 print $OUTPUT_FILE "\t";
1649 48         48 print $OUTPUT_FILE $chromosome_mutation_count{$chr};
1650 48         34 print $OUTPUT_FILE "\t";
1651 48         114 print $OUTPUT_FILE "$density";
1652 48         49 print $OUTPUT_FILE "\n";
1653              
1654             #Replace the count with the density
1655 48         47 $chromosome_mutation_count{$chr} = $density;
1656             }
1657             }#use sort 'stable'
1658              
1659 2         72 close ($OUTPUT_FILE);
1660              
1661             #return the hash containing the densities
1662 2         15 return(\%chromosome_mutation_count);
1663              
1664             }#sub calculate_genome_localization
1665              
1666              
1667             #=head2 Sub-Method: calculate_chromosome_localization
1668              
1669             ### calculate_chromosome_localization #############################################################
1670             # Description:
1671             # Performs a sliding window analysis on the CNV and translocation data. Identifies regions
1672             # that have a density of mutation much greater than the average rate of mutation of the
1673             # genome.
1674             #
1675             # Input variables:
1676             # $output_directory: stores the directory where output files are created
1677             # $genome_cnv_data_hash_ref: reference to hash that stores position of all CNV breakpoints in
1678             # the genome
1679             # $genome_trans_data_hash_ref: reference to hash that stores position of all the
1680             # translocation breakpoints in the genome
1681             # $bin_size: size of the bins that divide up the genome
1682             # $window_size: number of bins to evaluate in each window
1683             #
1684              
1685             #=cut
1686             sub calculate_chromosome_localization {
1687              
1688             #parse parameters
1689 2     2 0 15 my $output_directory = shift;
1690 2         3 my $genome_cnv_data_hash_ref = shift;
1691 2         4 my $genome_trans_data_hash_ref = shift;
1692 2         2 my $bin_size = shift;
1693 2         4 my $window_size = shift;
1694              
1695 2         3 my @suspect_regions; #array storing the start position, end position and chromosome
1696             #of very highly mutated regions
1697             my @likely_regions; #array storing the start position, end position and chromosome
1698             #of somewhat highly mutated regions
1699              
1700 2         3 my $in_suspect_region = 0; #flag variables used in identifying highly mutated regions
1701 2         3 my $in_likely_region = 0;
1702              
1703 2         2 my $suspect_chr = -1;
1704 2         4 my $suspect_start = -1;
1705 2         2 my $suspect_end = -1;
1706            
1707 2         4 my $likely_chr = -1;
1708 2         2 my $likely_start = -1;
1709 2         3 my $likely_end = -1;
1710              
1711 2         45 my %genome_cnv_data_windows = ( #hash
1712             X => [], #key: chromosome eg. 1,2,X,Y
1713             Y => [], #value: an array storing the count of CNVs
1714             1 => [], # in each window along the chromosome
1715             2 => [],
1716             3 => [],
1717             4 => [],
1718             5 => [],
1719             6 => [],
1720             7 => [],
1721             8 => [],
1722             9 => [],
1723             10 => [],
1724             11 => [],
1725             12 => [],
1726             13 => [],
1727             14 => [],
1728             15 => [],
1729             16 => [],
1730             17 => [],
1731             18 => [],
1732             19 => [],
1733             20 => [],
1734             21 => [],
1735             22 => []
1736             );
1737              
1738 2         23 my %genome_trans_data_windows = ( #hash
1739             X => [], #key: chromosome eg. 1,2,X,Y
1740             Y => [], #value: an array storing a count of translocation
1741             1 => [], # in each window along the chromosome
1742             2 => [],
1743             3 => [],
1744             4 => [],
1745             5 => [],
1746             6 => [],
1747             7 => [],
1748             8 => [],
1749             9 => [],
1750             10 => [],
1751             11 => [],
1752             12 => [],
1753             13 => [],
1754             14 => [],
1755             15 => [],
1756             16 => [],
1757             17 => [],
1758             18 => [],
1759             19 => [],
1760             20 => [],
1761             21 => [],
1762             22 => []
1763             );
1764              
1765 2         23 my %genome_mutation_data_windows = ( #hash
1766             X => [], #key: chromosome eg. 1,2,X,Y
1767             Y => [], #value: an array storing a count of all mutations
1768             1 => [], # in each window along the chromosome
1769             2 => [],
1770             3 => [],
1771             4 => [],
1772             5 => [],
1773             6 => [],
1774             7 => [],
1775             8 => [],
1776             9 => [],
1777             10 => [],
1778             11 => [],
1779             12 => [],
1780             13 => [],
1781             14 => [],
1782             15 => [],
1783             16 => [],
1784             17 => [],
1785             18 => [],
1786             19 => [],
1787             20 => [],
1788             21 => [],
1789             22 => []
1790             );
1791              
1792              
1793 2         4 my $current_chr; #current chromosome being analyzed
1794 2         5 my @current_chr_data = (); #array storing the bins for the current chromosome
1795              
1796 2         3 my $genome_mean_mutation_density = 0; #average density of all the windows across the genome
1797 2         3 my $total_genome_windows = 0; #total number of windows across the genome
1798 2         1 my $genome_mutation_density_standard_deviation = 0; #standard deviation of the mutation densities for
1799             #all the windows
1800              
1801 2         3 my $OUTPUT_FILE; #file handle to output file
1802              
1803 2         4 $output_directory = $output_directory."mutation_clustering";
1804              
1805             #create output directories
1806 2 50       162 mkdir ("$output_directory",0770) unless (-d "$output_directory");
1807 2 50       25 if(!(-e "$output_directory")){
1808 0         0 die "ERROR: could not create folder $output_directory\n";
1809             }
1810              
1811 2 50       92 mkdir ("$output_directory/cnv",0770) unless (-d "$output_directory/cnv");
1812 2 50       21 if(!(-e "$output_directory")){
1813 0         0 die "ERROR: could not create folder $output_directory/cnv\n";
1814             }
1815            
1816 2 50       85 mkdir ("$output_directory/translocations",0770) unless (-d "$output_directory/translocations");
1817 2 50       20 if(!(-e "$output_directory")){
1818 0         0 die "ERROR: could not create folder $output_directory/translocations\n";
1819             }
1820              
1821 2 50       88 mkdir ("$output_directory/all_types",0770) unless (-d "$output_directory/all_types");
1822 2 50       30 if(!(-e "$output_directory")){
1823 0         0 die "ERROR: could not create folder $output_directory/all_types\n";
1824             }
1825              
1826             #compute the density of CNV mutations in each window
1827 2         19 for my $cnv_key ( keys %$genome_cnv_data_hash_ref){
1828             #get the array storing the CNV bins for the current chromosome
1829 40         122 @current_chr_data = @{$genome_cnv_data_hash_ref->{$cnv_key}};
  40         161833  
1830              
1831             #check that the array is not empty
1832 40 50       321 if(scalar(@current_chr_data) > 0){
1833              
1834             #create an output file for this chromosome
1835 40 50       5776 open ($OUTPUT_FILE, ">", "$output_directory/cnv/chr$cnv_key"."_cnv_localization.log") or die "ERROR: could not create file $output_directory/cnv/chr$cnv_key"."_cnv_localization.log";
1836             #print the header for the output file
1837 40         369 print $OUTPUT_FILE "#chr\tstart\tend\tdensity";
1838              
1839 40         82 @{$genome_cnv_data_windows{$cnv_key}}[0] = 0; #initialize the count in the first window
  40         350  
1840              
1841             #Calculate the mutation count in the first window
1842 40         167 for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++){
1843 400000         252297 my %region_hash;
1844 400000 100       497514 if(!defined($current_chr_data[$chr_pos])){
1845 399924         554999 next;
1846             }
1847 76         85 %region_hash = %{$current_chr_data[$chr_pos]};
  76         523  
1848              
1849 76         124 @{$genome_cnv_data_windows{$cnv_key}}[0] += $region_hash{'BPcount'};
  76         273  
1850             }
1851              
1852             #print the values from the first window to the output file
1853 40         293 print $OUTPUT_FILE "\n";
1854 40         141 print $OUTPUT_FILE "$cnv_key";
1855 40         82 print $OUTPUT_FILE "\t";
1856 40         90 print $OUTPUT_FILE "0";
1857 40         72 print $OUTPUT_FILE "\t";
1858 40         444 print $OUTPUT_FILE ($window_size)*$bin_size;
1859 40         104 print $OUTPUT_FILE "\t";
1860              
1861 40 50       51 if(!defined(@{$genome_cnv_data_windows{$cnv_key}}[0])){
  40         359  
1862 0         0 print $OUTPUT_FILE "0";
1863             }
1864             else{
1865 40         74 my $rounded = POSIX::ceil((@{$genome_cnv_data_windows{$cnv_key}}[0])/2);
  40         350  
1866 40         377 print $OUTPUT_FILE ($rounded)/($window_size*$bin_size);
1867             #add the cnv count to the total mutation count for the region
1868 40         75 @{$genome_mutation_data_windows{$cnv_key}}[0] += $rounded;
  40         177  
1869             }
1870              
1871             #perform the sliding window analysis for the rest of the chromosome
1872 40         176 for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++){
1873              
1874             #check that the window will not overshoot the length of the chromosome
1875 4391064 100       7110032 if( (($chr_pos+($window_size-1))*$bin_size) > $chromosome_length{$cnv_key} ){
1876 24         63 last;
1877             }
1878              
1879 4391040         3328028 @{$genome_cnv_data_windows{$cnv_key}}[$chr_pos] = 0; #initialize the count for the current window
  4391040         5322717  
1880              
1881 4391040         3337689 my %past_region_hash;
1882             my %next_region_hash;
1883              
1884 4391040         3266511 my $prev_value = 0;
1885 4391040         3032145 my $next_value = 0;
1886              
1887             #get the count of the from the first bin from the previous window
1888 4391040 100       6563697 if(defined($current_chr_data[$chr_pos-1])){
1889 392         444 %past_region_hash = %{$current_chr_data[$chr_pos-1]};
  392         1890  
1890 392         662 $prev_value = $past_region_hash{'BPcount'};
1891             }
1892              
1893             #get the count from the bin following the last bin in the previous window
1894 4391040 100       6265114 if(defined($current_chr_data[$chr_pos+($window_size-1)])){
1895 392         389 %next_region_hash = %{$current_chr_data[$chr_pos+($window_size-1)]};
  392         2295  
1896 392         767 $next_value = $next_region_hash{'BPcount'};
1897             }
1898              
1899             #the count for the current window = the count from the previous window - the first bin of the previous window + the next bin along the chromosome
1900 4391040         3148944 @{$genome_cnv_data_windows{$cnv_key}}[$chr_pos] += (@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos-1]) - ($prev_value) + ($next_value);
  4391040         4679775  
  4391040         4701707  
1901              
1902             #print the values for this window
1903 4391040         4426893 print $OUTPUT_FILE "\n";
1904 4391040         3828238 print $OUTPUT_FILE "$cnv_key";
1905 4391040         3367321 print $OUTPUT_FILE "\t";
1906 4391040         9612242 print $OUTPUT_FILE $chr_pos*$bin_size;
1907 4391040         3683462 print $OUTPUT_FILE "\t";
1908 4391040         8420479 print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
1909 4391040         4298892 print $OUTPUT_FILE "\t";
1910              
1911 4391040 50       3274786 if(!defined(@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos])){
  4391040         6625449  
1912 0         0 print $OUTPUT_FILE "0";
1913             }
1914             else{
1915 4391040         3171268 my $rounded = POSIX::ceil((@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos])/2);
  4391040         7883313  
1916 4391040         7366705 print $OUTPUT_FILE ($rounded)/($window_size*$bin_size);
1917             #add the cnv count to the total mutation count for the region
1918 4391040         3276286 @{$genome_mutation_data_windows{$cnv_key}}[$chr_pos] += $rounded;
  4391040         11546829  
1919             }
1920             }#for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++)
1921              
1922 40         5548 close ($OUTPUT_FILE);
1923             }#if(scalar(@current_chr_data) > 0)
1924              
1925 40         73689 @current_chr_data = ();
1926             }#for my $cnv_key ( keys %$genome_cnv_data_hash_ref)
1927              
1928             #perform the sliding window analysis on the translocation mutation data
1929 2         31 for my $trans_key ( keys %$genome_trans_data_hash_ref){
1930              
1931             #get the array storing the translocation bins for the current chromosome
1932 26         72 @current_chr_data = @{$genome_trans_data_hash_ref->{$trans_key}};
  26         81590  
1933              
1934             #check that the array is not empty
1935 26 50       189 if(scalar(@current_chr_data) > 0){
1936              
1937             #create the output file
1938 26 50       3939 open ($OUTPUT_FILE, ">", "$output_directory/translocations/chr$trans_key"."_translocation_localization.log") or die "ERROR: could not create file $output_directory/translocations/chr$trans_key"."_translocation_localization.log";
1939             #print the header for the output file
1940 26         217 print $OUTPUT_FILE "#chr\tstart\tend\tdensity";
1941              
1942             #initialize the translocation count for the first window
1943 26         59 @{$genome_trans_data_windows{$trans_key}}[0] = 0;
  26         164  
1944              
1945             #calculate the translocation mutation count in the first window
1946 26         125 for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++){
1947 260000         157701 my %region_hash;
1948 260000 100       318639 if(!defined($current_chr_data[$chr_pos])){
1949 259970         359428 next;
1950             }
1951 30         34 %region_hash = %{$current_chr_data[$chr_pos]};
  30         152  
1952              
1953 30         41 my %trans_hash_in;
1954             my %trans_hash_out;
1955 30         27 my $size = 0;
1956              
1957             #calculate the number of inbound translocation breakpoints
1958 30 100       62 if(defined($region_hash{'in'})){
1959 20         23 %trans_hash_in = %{$region_hash{'in'}};
  20         46  
1960              
1961 20         33 for my $key (keys %trans_hash_in){
1962 20         18 $size = @{$trans_hash_in{$key}};
  20         20  
1963 20         33 $size = $size/2;
1964 20         19 @{$genome_trans_data_windows{$trans_key}}[0] += $size;
  20         57  
1965             }
1966             }
1967              
1968             #calculate the number of outbound translocation breakpoints
1969 30 100       77 if(defined($region_hash{'out'})){
1970 18         17 %trans_hash_out = %{$region_hash{'out'}};
  18         47  
1971              
1972 18         37 for my $key (keys %trans_hash_out){
1973 18 50       34 if($key eq $trans_key){
1974 18         67 next;
1975             }
1976 0         0 $size = @{$trans_hash_out{$key}};
  0         0  
1977 0         0 $size = $size/2;
1978 0         0 @{$genome_trans_data_windows{$trans_key}}[0] += $size;
  0         0  
1979             }
1980             }
1981              
1982             }#for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++)
1983              
1984             #print the values from the first window to the output file
1985 26         204 print $OUTPUT_FILE "\n";
1986 26         85 print $OUTPUT_FILE "$trans_key";
1987 26         48 print $OUTPUT_FILE "\t";
1988 26         47 print $OUTPUT_FILE "0";
1989 26         158 print $OUTPUT_FILE "\t";
1990 26         333 print $OUTPUT_FILE ($window_size)*$bin_size;
1991 26         52 print $OUTPUT_FILE "\t";
1992              
1993 26 50       63 if(!defined(@{$genome_trans_data_windows{$trans_key}}[0])){
  26         193  
1994 0         0 print $OUTPUT_FILE "0";
1995             }
1996             else{
1997 26         40 my $rounded = POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[0]);
  26         218  
1998 26         158 print $OUTPUT_FILE ($rounded)/($window_size*$bin_size);
1999             #add the translocation mutation count to the total mutation count for the region
2000 26         56 @{$genome_mutation_data_windows{$trans_key}}[0] += $rounded;
  26         113  
2001             }
2002              
2003             #perform the sliding window analysis for the rest of the chromosome
2004 26         120 for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++){
2005              
2006 2520292 100       3952139 if( (($chr_pos+($window_size-1))*$bin_size) > $chromosome_length{$trans_key} ){
2007 10         28 last;
2008             }
2009              
2010 2520282         1805582 @{$genome_trans_data_windows{$trans_key}}[$chr_pos] = 0;
  2520282         2989656  
2011 2520282         1860985 my %prev_region_hash;
2012             my %next_region_hash;
2013              
2014 2520282         1823580 my $prev_value = 0;
2015 2520282         1624361 my $next_value = 0;
2016              
2017             #Caculate the number of mutations in the first bin of the previous window
2018 2520282 100       3578042 if(defined($current_chr_data[$chr_pos-1])){
2019 428         408 %prev_region_hash = %{$current_chr_data[$chr_pos-1]};
  428         1520  
2020              
2021 428         479 my $size = 0;
2022 428         387 my %prev_trans_hash_in;
2023             my %prev_trans_hash_out;
2024              
2025 428 100       888 if(defined($prev_region_hash{'in'})){
2026 234         240 %prev_trans_hash_in = %{$prev_region_hash{'in'}};
  234         495  
2027              
2028 234         385 for my $key (keys %prev_trans_hash_in){
2029 262         217 $size = @{$prev_trans_hash_in{$key}};
  262         289  
2030 262         289 $size = $size/2;
2031 262         468 $prev_value += $size;
2032             }
2033             }
2034              
2035 428 100       862 if(defined($prev_region_hash{'out'})){
2036 278         281 %prev_trans_hash_out = %{$prev_region_hash{'out'}};
  278         618  
2037              
2038 278         518 for my $key (keys %prev_trans_hash_out){
2039 284 100       625 if($key eq $trans_key){
2040 184         546 next;
2041             }
2042 100         81 $size = @{$prev_trans_hash_out{$key}};
  100         112  
2043 100         107 $size = $size/2;
2044 100         241 $prev_value += $size;
2045             }
2046             }
2047              
2048             }
2049              
2050             #Caculate the number of mutations in the last bin of the current window
2051 2520282 100       3544084 if(defined($current_chr_data[$chr_pos+($window_size-1)])){
2052 456         393 %next_region_hash = %{$current_chr_data[$chr_pos+($window_size-1)]};
  456         2045  
2053              
2054 456         516 my $size = 0;
2055 456         437 my %next_trans_hash_in;
2056             my %next_trans_hash_out;
2057              
2058 456 100       849 if(defined($next_region_hash{'in'})){
2059 238         253 %next_trans_hash_in = %{$next_region_hash{'in'}};
  238         573  
2060              
2061 238         421 for my $key (keys %next_trans_hash_in){
2062 266         197 $size = @{$next_trans_hash_in{$key}};
  266         363  
2063 266         288 $size = $size/2;
2064 266         485 $next_value += $size;
2065             }
2066             }
2067              
2068 456 100       912 if(defined($next_region_hash{'out'})){
2069 306         250 %next_trans_hash_out = %{$next_region_hash{'out'}};
  306         804  
2070              
2071 306         507 for my $key (keys %next_trans_hash_out){
2072 312 100       678 if($key eq $trans_key){
2073 192         540 next;
2074             }
2075 120         107 $size = @{$next_trans_hash_out{$key}};
  120         147  
2076 120         128 $size = $size/2;
2077 120         286 $next_value += $size;
2078             }
2079             }
2080              
2081              
2082             }
2083              
2084             #total number of translocation mutations in the current window = number of mutations in previous window - the first bin of the previous window + next bin along the chromosome
2085 2520282         1706221 @{$genome_trans_data_windows{$trans_key}}[$chr_pos] += (@{$genome_trans_data_windows{$trans_key}}[$chr_pos-1]) - ($prev_value) + ($next_value);
  2520282         2618698  
  2520282         2522535  
2086              
2087             #print values from this window
2088 2520282         2412434 print $OUTPUT_FILE "\n";
2089 2520282         4826823 print $OUTPUT_FILE "$trans_key";
2090 2520282         1946175 print $OUTPUT_FILE "\t";
2091 2520282         7425819 print $OUTPUT_FILE $chr_pos*$bin_size;
2092 2520282         1903350 print $OUTPUT_FILE "\t";
2093 2520282         4044352 print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
2094 2520282         2127756 print $OUTPUT_FILE "\t";
2095              
2096 2520282 50       1771966 if(!defined(@{$genome_trans_data_windows{$trans_key}}[$chr_pos])){
  2520282         3549301  
2097 0         0 print $OUTPUT_FILE "0";
2098             }
2099             else{
2100 2520282         1853634 my $rounded = POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[$chr_pos]);
  2520282         3837198  
2101 2520282         3853291 print $OUTPUT_FILE ($rounded)/($window_size*$bin_size);
2102 2520282         1811317 @{$genome_mutation_data_windows{$trans_key}}[$chr_pos] += $rounded;
  2520282         6147554  
2103             }
2104             }#for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++)
2105              
2106              
2107 26         2682 close ($OUTPUT_FILE);
2108             }
2109 26         40244 @current_chr_data = ();
2110             }
2111              
2112             #calculate the density of both types of mutations in each window in the genome
2113 2         27 for my $mutation_key ( keys %genome_mutation_data_windows){
2114            
2115             #check that some data exisits for the current chromosome
2116 48 100       100 if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){
  48         280  
2117              
2118             #create the output file
2119 42 50       277793 open ($OUTPUT_FILE, ">", "$output_directory/all_types/chr$mutation_key"."_mutation_localization.log") or die "ERROR: could not create file $output_directory/all_types/chr$current_chr"."_mutation_localization.log";
2120             #print the header for the output file
2121 42         176 print $OUTPUT_FILE "#chr\tstart\tend\tdensity";
2122            
2123 42         61 my $density;
2124              
2125             #for every bin along the chromosome
2126 42         109 for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
  4776020         7680664  
2127 4775978         3117194 $total_genome_windows++; #increment the total number of windows in the genome
2128            
2129             #calculate the density of mutations in the window
2130 4775978 50       3148584 if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
  4775978         6429662  
2131 0         0 $density = 0;
2132             }
2133             else{
2134 4775978         3119126 $density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
  4775978         5690124  
2135             }
2136              
2137             #sum the density values to calculate the mean
2138 4775978         3950367 $genome_mean_mutation_density += $density;
2139              
2140             #print the values for the window
2141 4775978         3822716 print $OUTPUT_FILE "\n";
2142 4775978         3806028 print $OUTPUT_FILE "$mutation_key";
2143 4775978         3519477 print $OUTPUT_FILE "\t";
2144 4775978         9288168 print $OUTPUT_FILE $chr_pos*$bin_size;
2145 4775978         3564978 print $OUTPUT_FILE "\t";
2146 4775978         6235293 print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
2147 4775978         4516896 print $OUTPUT_FILE "\t";
2148 4775978         6815745 print $OUTPUT_FILE $density;
2149              
2150             }#for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++)
2151              
2152 42         3911 close ($OUTPUT_FILE);
2153             }
2154             }#for my $mutation_key ( keys %genome_mutation_data_windows)
2155              
2156             #calculate the mean mutation density for the windows in the genome
2157 2         23 $genome_mean_mutation_density = $genome_mean_mutation_density/$total_genome_windows;
2158              
2159             #find the sum of squared difference between the density of mutation of each window and the mean density of mutation
2160 2         20 for my $mutation_key ( keys %genome_mutation_data_windows){
2161 48 100       79 if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){
  48         221  
2162 42         42 my $density;
2163              
2164 42         93 for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
  4776020         7097656  
2165 4775978 50       3122829 if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
  4775978         6280127  
2166 0         0 $density = 0;
2167             }
2168             else{
2169 4775978         3149281 $density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
  4775978         4898174  
2170             }
2171              
2172             #sum the squared differences
2173 4775978         5092304 $genome_mutation_density_standard_deviation += ($density-$genome_mean_mutation_density)**2;
2174             }
2175             }
2176             }#for my $mutation_key ( keys %genome_mutation_data_windows)
2177              
2178             #divided the sum of the squared differnces by the total number of windows and take the square root
2179 2         39 $genome_mutation_density_standard_deviation = ($genome_mutation_density_standard_deviation/$total_genome_windows)**0.5;
2180              
2181             #calculate z scores for each window and check if the window is greater than 2 SDs away from genome mean
2182             #use this value to identify highly mutated regions
2183 2         14 for my $mutation_key ( keys %genome_mutation_data_windows){
2184 48 100       64 if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){
  48         198  
2185 42         44 my $density;
2186 42         72 my $region_z_score = 0;
2187              
2188 42         58 for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
  4776020         7101599  
2189            
2190 4775978 50       3201243 if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
  4775978         6496156  
2191 0         0 $density = 0;
2192             }
2193             else{
2194 4775978         3127809 $density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
  4775978         5265502  
2195             }
2196              
2197             #calculate z score for the window
2198 4775978         3629181 $region_z_score = ($density-$genome_mean_mutation_density)/$genome_mutation_density_standard_deviation;
2199              
2200             #check if the z score is above the threshold
2201 4775978 100       8215524 if( $region_z_score >= $outlier_deviation ) {
    100          
2202 88404 100       112082 if($in_suspect_region!=1){
2203 8         12 $suspect_start = $chr_pos*$bin_size;
2204 8         12 $in_suspect_region = 1;
2205             }
2206 88404         64480 $suspect_chr = $mutation_key;
2207 88404         68734 $suspect_end = ($chr_pos+$window_size)*$bin_size;
2208             }
2209             elsif ($in_suspect_region==1){
2210             #once a region has been called push the chromosome, start and end positions into the suspect region array
2211 8         53 push (@suspect_regions, ($suspect_chr,$suspect_start,$suspect_end));
2212 8         12 $suspect_chr = -1;
2213 8         10 $suspect_start = -1;
2214 8         10 $suspect_end = -1;
2215 8         8 $in_suspect_region = 0;
2216             }
2217              
2218             #check if the z score is below the threshold but still suspicously high
2219 4775978 100 100     15196012 if(
    100          
2220             ( $region_z_score < $outlier_deviation ) &&
2221             ( $region_z_score >= ($outlier_deviation-1) )
2222             ){
2223 66812 100       84473 if($in_likely_region!=1){
2224 26         33 $likely_start = $chr_pos*$bin_size;
2225 26         42 $in_likely_region = 1;
2226             }
2227 66812         53962 $likely_chr = $mutation_key;
2228 66812         65530 $likely_end = ($chr_pos+$window_size)*$bin_size;
2229             }
2230             elsif ($in_likely_region==1){
2231 24         105 push (@likely_regions, ($likely_chr,$likely_start,$likely_end));
2232 24         29 $likely_chr = -1;
2233 24         29 $likely_start = -1;
2234 24         23 $likely_end = -1;
2235 24         35 $in_likely_region = 0;
2236             }
2237              
2238             }#for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++)
2239              
2240             #check if the last region analyzed was suspicious and push its data into the appropriate array
2241 42 50       156 if ($in_suspect_region==1){
2242 0         0 push (@suspect_regions, ($suspect_chr,$suspect_start,$suspect_end));
2243 0         0 $suspect_chr = -1;
2244 0         0 $suspect_start = -1;
2245 0         0 $suspect_end = -1;
2246 0         0 $in_suspect_region = 0;
2247             }
2248              
2249 42 100       220 if ($in_likely_region==1){
2250 2         18 push (@likely_regions, ($likely_chr,$likely_start,$likely_end));
2251 2         4 $likely_chr = -1;
2252 2         6 $likely_start = -1;
2253 2         4 $likely_end = -1;
2254 2         8 $in_likely_region = 0;
2255             }
2256             }
2257             }#for my $mutation_key ( keys %genome_mutation_data_windows)
2258              
2259              
2260 2         50 return (\@suspect_regions, \@likely_regions, \%genome_cnv_data_windows, \%genome_trans_data_windows, \%genome_mutation_data_windows);
2261              
2262             }#sub calculate_chromosome_localization
2263              
2264              
2265             #=head2 Sub-Method: check_copy_number_count
2266              
2267             ### check_copy_number_count #######################################################################
2268             # Description:
2269             # Produces an output file that records the number of regions of copy-number variation that
2270             # are present in each chromosome.
2271             #
2272             # Input variables:
2273             # $output_directory: stores the path to the output directory
2274             # $chromosome_copy_number_count_hash_ref: reference to hash that stores the count of regions
2275             # of copy-number variation on each chromosome
2276             #
2277              
2278             #=cut
2279              
2280             sub check_copy_number_count {
2281              
2282             #parse parameters
2283 1     1 0 9 my $output_directory = shift;
2284 1         39 my $chromosome_copy_number_count_hash_ref = shift;
2285              
2286 1         1 my $OUTPUT_FILE; #file handle to output file
2287              
2288             #open the output file
2289 1 50       145 open ($OUTPUT_FILE, ">", "$output_directory/copy_number_count.log") or die "ERROR: could not create file $output_directory/copy_number_count.log\n";
2290              
2291             #print the header
2292 1         5 print $OUTPUT_FILE "#chr\tcopy_number\tnumber_of_regions";
2293              
2294             #for each chromosome
2295             #print out the number of regions with the given copy-number
2296 9     9   35802 {use sort 'stable';
  9         23  
  9         69  
  1         1  
2297 1         44 for my $chr (sort keys %$chromosome_copy_number_count_hash_ref){
2298 20         12 my %intermediate_hash = %{$chromosome_copy_number_count_hash_ref->{$chr}};
  20         63  
2299 20         35 for my $CN (sort {$a <=> $b} keys %intermediate_hash){
  103         106  
2300 80         108 print $OUTPUT_FILE "\n";
2301 80         57 print $OUTPUT_FILE "$chr"; #chromosome
2302 80         53 print $OUTPUT_FILE "\t";
2303 80         57 print $OUTPUT_FILE "$CN"; #copy-number
2304 80         56 print $OUTPUT_FILE "\t";
2305 80         131 print $OUTPUT_FILE $chromosome_copy_number_count_hash_ref->{$chr}->{$CN}; #number of regions with copy-number $CN
2306             }
2307             }
2308             }#use sort 'stable'
2309 1         96 close ($OUTPUT_FILE);
2310            
2311             }#sub check_copy_number_count
2312              
2313             #=head2 Sub-Method: check_copy_number_switches
2314              
2315             ### check_copy_number_switches ####################################################################
2316             # Description:
2317             # Creates an output file that records the number of breakpoints between CNV regions on each
2318             # chromosome
2319             #
2320             # Input variables:
2321             # $output_directory: stores path to output directory
2322             # $chromosome_copy_number_count_hash_ref: reference to hash that stores the count of regions
2323             # of copy-number variation on each chromosome
2324             #
2325              
2326             #=cut
2327              
2328             sub check_copy_number_switches {
2329              
2330             #parse parameters
2331 1     1 0 45 my $output_directory = shift;
2332 1         2 my $chromosome_copy_number_count_hash_ref = shift;
2333              
2334 1         1 my $switch_count = 0;
2335              
2336 1         2 my $OUTPUT_FILE; #file handle to output file
2337              
2338             #open output file
2339 1 50       102 open ($OUTPUT_FILE, ">", "$output_directory/copy_number_switches.log") or die "ERROR: could not create file $output_directory/copy_number_switches.log\n";
2340              
2341             #print header
2342 1         4 print $OUTPUT_FILE "#chr\tswitch_count";
2343              
2344             #for each chromosome
2345             #sum the total number of CNV events
2346             #multiply the sum by 2 to get the number of CNV breakpoints
2347 9     9   2627 {use sort 'stable';
  9         16  
  9         37  
  1         2  
2348 1         15 for my $chr (sort keys %$chromosome_copy_number_count_hash_ref){
2349 20         17 my %intermediate_hash = %{$chromosome_copy_number_count_hash_ref->{$chr}};
  20         53  
2350 20         37 for my $CN (sort {$intermediate_hash{$b} <=> $intermediate_hash{$a} } keys %intermediate_hash){
  106         94  
2351             #only count regions that have abberant copy numbers
2352 80 50       103 if($CN != 2){
2353 80         91 $switch_count += ($chromosome_copy_number_count_hash_ref->{$chr}->{$CN} * 2);
2354             }
2355             }
2356              
2357             #print values to output file
2358 20         24 print $OUTPUT_FILE "\n";
2359 20         16 print $OUTPUT_FILE "$chr";
2360 20         14 print $OUTPUT_FILE "\t";
2361 20         30 print $OUTPUT_FILE $switch_count;
2362              
2363 20         23 $switch_count = 0;
2364             }
2365             }#use sort 'stable'
2366              
2367 1         33 close ($OUTPUT_FILE);
2368            
2369             }#sub check_copy_number_switches
2370              
2371             #=head2 Sub-Method: calculate_interchromosomal_translocation_rate
2372              
2373             ### calculate_interchromosomal_translocation_rate #################################################
2374             # Description:
2375             # Create an output file that records the number of translocations between each and every
2376             # chromosome
2377             #
2378             # Input variables:
2379             # $output_directory: stores path to output directory
2380             # $chromosome_translocation_count_hash_ref: reference to hash that stores the count of
2381             # translocations between each chromosome
2382             #
2383              
2384             #=cut
2385              
2386             sub calculate_interchromosomal_translocation_rate {
2387              
2388             #parse parameters
2389 1     1 0 9 my $output_directory = shift;
2390 1         3 my $chromosome_translocation_count_hash_ref = shift;
2391              
2392 1         2 my $OUTPUT_FILE; #file handle to output file
2393              
2394             #open output file
2395 1 50       153 open ($OUTPUT_FILE, ">", "$output_directory/interchromosomal_translocation_rate.log") or die "ERROR: could not create file $output_directory/interchromosomal_translocation_rate.log\n";
2396              
2397             #print header
2398 1         19 print $OUTPUT_FILE "#chr1\tchr2\tcount";
2399              
2400             #for each chromosome
2401             #print the number of translocations between every other chromosome
2402 9     9   2679 {use sort 'stable';
  9         18  
  9         45  
  1         2  
2403 1         14 for my $chr1 (sort keys %$chromosome_translocation_count_hash_ref){
2404 13         12 my %intermediate_hash = %{$chromosome_translocation_count_hash_ref->{$chr1}};
  13         34  
2405              
2406 13         23 for my $chr2 (sort keys %intermediate_hash){
2407 26         24 print $OUTPUT_FILE "\n";
2408 26         18 print $OUTPUT_FILE $chr1;
2409 26         25 print $OUTPUT_FILE "\t";
2410 26         19 print $OUTPUT_FILE $chr2;
2411 26         19 print $OUTPUT_FILE "\t";
2412 26         58 print $OUTPUT_FILE $chromosome_translocation_count_hash_ref->{$chr1}->{$chr2};
2413             }
2414             }
2415             }#use sort 'stable'
2416 1         34 close ($OUTPUT_FILE);
2417            
2418             }#sub calculate_interchromosomal_translocation_rate
2419              
2420              
2421             #=head2 Sub-Method: analyze_suspect_regions
2422              
2423             ### analyze_suspect_regions #######################################################################
2424             # Description:
2425             # Produces the final report output file, that includes the chromothriptic scores for each of
2426             # the highly mutated regions
2427             #
2428             # Input variables:
2429             # $output_directory: stores path to the output directory
2430             # $suspect_regions_array_ref: reference to array storing the chromosome,
2431             # start, and end position of highly mutated
2432             # regions
2433             # $genome_mutation_density_hash_ref: stores the average mutation density of each
2434             # chromosome
2435             # $genome_cnv_data_hash_ref: stores the position of CNV mutations on
2436             # each chromosome
2437             # $genome_trans_data_hash_ref: stores the position of translocation events
2438             # on each chromosome
2439             # $genome_trans_insertion_breakpoints_hash_ref: stores the position of insertions on each
2440             # chromosome
2441             # $bin_size: stores the size of a single bin
2442             # $localization_window_size: stores the number of bins to include in a
2443             # window
2444             # $tp53_mutated: stores whether the TP53 gene is mutatated
2445             # or not
2446             # $tp53_mutation_found: stores whether or not a mutation was found
2447             # in the TP53 loci
2448             # $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on
2449             # each chromosome
2450             # $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on
2451             # each chromosome
2452             #
2453              
2454             #=cut
2455              
2456             sub analyze_suspect_regions {
2457              
2458             #parse parameters
2459 0     0 0 0 my $output_directory = shift;
2460              
2461 0         0 my $suspect_regions_array_ref = shift;
2462 0         0 my @suspect_regions = @$suspect_regions_array_ref;
2463              
2464 0         0 my $genome_mutation_density_hash_ref = shift;
2465 0         0 my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref};
  0         0  
2466            
2467 0         0 my $genome_cnv_data_hash_ref = shift;
2468 0         0 my $genome_trans_data_hash_ref = shift;
2469 0         0 my $genome_trans_insertion_breakpoints_hash_ref = shift;
2470              
2471 0         0 my $bin_size = shift;
2472 0         0 my $localization_window_size = shift;
2473              
2474 0         0 my $tp53_mutated = shift;
2475 0         0 my $tp53_mutation_found = shift;
2476              
2477 0         0 my $chromosome_cnv_breakpoints_hash_ref = shift;
2478 0         0 my $chromosome_loh_breakpoints_hash_ref = shift;
2479              
2480              
2481 0         0 my $suspect_regions_size = @suspect_regions;
2482              
2483 0         0 my $OUTPUT_FILE;
2484              
2485 0         0 my @suspect_region_data = ();
2486              
2487 0         0 my $header_string;
2488              
2489             #check that the suspect region data array is not malformed, should contain sets of 3 elements
2490 0 0       0 if($suspect_regions_size % 3 != 0){
2491 0         0 die "ERROR: suspect_regions_array has $suspect_regions_size entries. Value must be divisible by 3.\n";
2492             }
2493              
2494             #create output directory for report files
2495 0 0       0 mkdir ("$output_directory"."suspect_regions",0770) unless (-d "$output_directory"."suspect_regions");
2496 0 0       0 if(!(-e "$output_directory"."suspect_regions")){
2497 0         0 die "ERROR: could not create folder $output_directory"."suspect_regions";
2498             }
2499              
2500             #open final report output file
2501 0 0       0 open ($OUTPUT_FILE, ">", "$output_directory"."suspect_regions/suspect_regions.yml") or die "ERROR: could not create file: $output_directory"."suspect_regions/suspect_regions.yml\n";
2502              
2503             #construct and print header
2504 0         0 $header_string = "file: Suspect Chromothriptic Regions\n";
2505 0         0 $header_string .= "bin_size:\t\t\t$bin_size\n";
2506 0         0 $header_string .= "localization_window_size:\t$localization_window_size\n";
2507 0         0 $header_string .= "\n";
2508 0         0 $header_string .= "genome_localization_score_weight:\t$genome_localization_weight\n";
2509 0         0 $header_string .= "chromosome_localization_score_weight:\t$chromosome_localization_weight\n";
2510 0         0 $header_string .= "cnv_score_weight:\t\t\t$cnv_weight\n";
2511 0         0 $header_string .= "translocation_score_weight:\t\t$translocation_weight\n";
2512 0         0 $header_string .= "insertion_breakpoint_score_weight:\t$insertion_breakpoint_weight\n";
2513 0         0 $header_string .= "loh_score_weight:\t\t\t$loh_weight\n";
2514 0         0 $header_string .= "tp53_mutation_score_weight:\t\t$tp53_mutated_weight\n";
2515 0         0 $header_string .= "\n";
2516 0         0 $header_string .= "min_mutation_density_z_score:\t$outlier_deviation\n";
2517 0         0 $header_string .= "---\n";
2518 0         0 $header_string .= "\n";
2519 0         0 print $OUTPUT_FILE $header_string;
2520              
2521             #calculate a chromothripsis score for each region that was present in the suspect region array
2522             #store the results of the score calculation in a 2d array where elements in the first dimension correspond to each suspect region
2523             #and the elements in the second dimension are the results of the score calculation
2524 0         0 for (my $i = 0; $i < $suspect_regions_size; $i+=3){
2525 0         0 my @region_data = ();
2526 0         0 $region_data[0] = $suspect_regions[$i]; #chr
2527 0         0 $region_data[1] = $suspect_regions[$i+1]; #start
2528 0         0 $region_data[2] = $suspect_regions[$i+2]; #end
2529              
2530 0         0 ($region_data[3], $region_data[4], $region_data[5], $region_data[6], $region_data[7], $region_data[8], $region_data[9], $region_data[10], $region_data[11], $region_data[12], $region_data[13]) = calculate_score($region_data[0], $region_data[1], $region_data[2], $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $tp53_mutated, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref, $bin_size);
2531              
2532             #add the results of the score calculation for this region to the array storing all the results
2533 0         0 push @suspect_region_data, [@region_data];
2534             }
2535              
2536             #sort the results so that the region with the highest chromothriptic score will be printed
2537             #to the final report output file first
2538 9     9   6023 {use sort 'stable';
  9         18  
  9         39  
  0         0  
2539 0         0 @suspect_region_data = sort {$b->[3] <=> $a->[3] } @suspect_region_data;
  0         0  
2540             }#use sort 'stable'A
2541              
2542             #for each score that is generated print the score and the related statistics for that region
2543 0         0 foreach my $score_data (@suspect_region_data){
2544 0         0 my $chr = $score_data->[0]; #chr
2545 0         0 my $start = $score_data->[1]; #start
2546 0         0 my $end = $score_data->[2]; #end
2547            
2548 0         0 my $score = sprintf("%.5f",$score_data->[3]);
2549 0         0 my $chr_z_score = $score_data->[4];
2550 0         0 my $region_density = sprintf("%e",$score_data->[5]);
2551              
2552 0         0 my $cnv_number_hash_ref = $score_data->[6];
2553 0         0 my %cnv_number_hash;
2554             my $num_copy_num;
2555 0 0       0 if(defined($cnv_number_hash_ref)){
2556 0         0 %cnv_number_hash = %{$cnv_number_hash_ref};
  0         0  
2557 0         0 $num_copy_num = keys %cnv_number_hash;
2558             }
2559             else{
2560 0         0 $num_copy_num = 0;
2561             }
2562              
2563 0         0 my $cnv_density = sprintf("%e",$score_data->[7]);
2564              
2565 0         0 my $intertranslocation_hash_ref = $score_data->[8];
2566 0         0 my $translocation_density = $score_data->[9];
2567 0         0 my %intertranslocation_hash;
2568             my $num_trans_chr;
2569 0 0       0 if(defined($intertranslocation_hash_ref)){
2570 0         0 %intertranslocation_hash = %{$intertranslocation_hash_ref};
  0         0  
2571 0         0 $num_trans_chr = keys %intertranslocation_hash;
2572             }
2573             else{
2574 0         0 $num_trans_chr = 0;
2575             }
2576              
2577 0         0 my $breakpoint_insertions_array_ref = $score_data->[10];
2578 0         0 my @breakpoint_insertions_array;
2579             my $breakpoint_percentage;
2580 0 0       0 if(defined($breakpoint_insertions_array_ref)){
2581 0         0 @breakpoint_insertions_array = @$breakpoint_insertions_array_ref;
2582 0         0 $breakpoint_percentage = sprintf("%.2f",($breakpoint_insertions_array[0]/$breakpoint_insertions_array[1])*100);
2583             }
2584              
2585 0         0 my $loh_size = $score_data->[11];
2586 0         0 my $hz_size = $score_data->[12];
2587 0         0 my $percent_hz_lost;
2588 0 0 0     0 if(defined($loh_size) && defined($hz_size)){
2589 0         0 $percent_hz_lost = sprintf("%.2f",($loh_size/$hz_size)*100);
2590             }
2591              
2592 0         0 my @score_array = @{$score_data->[13]};
  0         0  
2593              
2594 0         0 my $chr_density = $genome_mutation_density_hash{$chr};
2595              
2596 0         0 my $print_string;
2597              
2598 0         0 $print_string = "chromosome:\t$chr\n";
2599 0         0 $print_string .= "start:\t\t$start\n";
2600 0         0 $print_string .= "end:\t\t$end\n";
2601 0         0 $print_string .= "\n";
2602              
2603 0         0 $print_string .= "final_score:\t\t\t$score\n";
2604 0         0 $print_string .= "genome_localization_score:\t".$score_array[2]*$genome_localization_weight."\t(".$score_array[2].")"."\n";
2605 0         0 $print_string .= "chromosome_localization_score:\t".$score_array[1]*$chromosome_localization_weight."\t(".$score_array[1].")"."\n";
2606 0         0 $print_string .= "cnv_score:\t\t\t".$score_array[0]*$cnv_weight."\t(".$score_array[0].")"."\n";
2607 0         0 $print_string .= "translocation_score:\t\t".$score_array[3]*$translocation_weight."\t(".$score_array[3].")"."\n";
2608 0         0 $print_string .= "insertion_breakpoint_score:\t".$score_array[4]*$insertion_breakpoint_weight."\t(".$score_array[4].")"."\n";
2609 0         0 $print_string .= "loh_score:\t\t\t".$score_array[5]*$loh_weight."\t(".$score_array[5].")"."\n";
2610 0         0 $print_string .= "tp53_score:\t\t\t".$score_array[6]*$tp53_mutated_weight."\t(".$score_array[6].")\n";
2611            
2612 0         0 $print_string .= "\n";
2613              
2614 0         0 $print_string .= "mutation_density_of_region:\t$region_density\n";
2615 0         0 $print_string .= "mutation_density_of_chromosome:\t$chr_density\n";
2616 0         0 $print_string .= "standard_deviations_from_mean_of_chromosome_mutation_density:\t$chr_z_score\n";
2617 0         0 $print_string .= "\n";
2618              
2619 0         0 $print_string .= "density_of_copy_number_switches: $cnv_density\n";
2620 0         0 $print_string .= "number_of_aberrant_copy_number_states:\t$num_copy_num\n";
2621 0 0       0 if($num_copy_num>0){
2622 0         0 $print_string .= "aberrant_copy_number_states:\n";
2623 9     9   7085 {use sort 'stable';
  9         30  
  9         47  
  0         0  
2624 0         0 foreach my $key (sort {$cnv_number_hash{$b} <=> $cnv_number_hash{$a} } keys %cnv_number_hash){
  0         0  
2625 0         0 $print_string .= "\t$key:\t$cnv_number_hash{$key}\n";
2626             }
2627             }#use sort 'stable'
2628             }
2629              
2630 0         0 $print_string .= "\n";
2631              
2632 0         0 $print_string .= "density_of_translocation_breakpoints: $translocation_density\n";
2633 0         0 $print_string .= "number_of_translocation_chromosomes:\t$num_trans_chr\n";
2634              
2635 0 0       0 if($num_trans_chr>0){
2636 0         0 $print_string .= "translocation_chromosomes:\n";
2637 9     9   1488 {use sort 'stable';
  9         14  
  9         34  
  0         0  
2638 0         0 foreach my $key (sort {$intertranslocation_hash{$b} <=> $intertranslocation_hash{$a} } keys %intertranslocation_hash){
  0         0  
2639 0         0 $print_string .= "\t$key:\t$intertranslocation_hash{$key}\n";
2640             }
2641             }#use sort 'stable'
2642             }
2643 0         0 $print_string .= "\n";
2644              
2645 0 0       0 if(defined($breakpoint_insertions_array_ref)){
2646 0         0 $print_string .= "insertion_data:\n";
2647 0         0 $print_string .= "\tinsertions_found_at_translocation_breakpoints:\t$breakpoint_insertions_array[0]\n";
2648 0         0 $print_string .= "\ttotal_translocation_breakpoints:\t$breakpoint_insertions_array[1]\n";
2649 0         0 $print_string .= "\tpercentage:\t$breakpoint_percentage"."%\n";
2650 0         0 $print_string .= "\n";
2651             }
2652              
2653 0 0       0 if($loh_size!=-1){
2654 0         0 $print_string .= "loh_data:\n";
2655 0         0 $print_string .= "\ttotal_size_of_loh:\t$loh_size\n";
2656 0         0 $print_string .= "\ttotal_size_of_original_heterozygosity:\t$hz_size\n";
2657 0         0 $print_string .= "\tpercent_heterozygosity_lost:\t$percent_hz_lost"."%\n";
2658 0         0 $print_string .= "\n";
2659             }
2660              
2661 0 0 0     0 if($tp53_mutated && $tp53_mutation_found){
    0          
    0          
2662 0         0 $print_string .= "tp53_mutation_present:\t1 (forced and mutations found)\n";
2663             }
2664             elsif($tp53_mutated){
2665 0         0 $print_string .= "tp53_mutation_present:\t1 (forced)\n";
2666             }
2667             elsif($tp53_mutation_found){
2668 0         0 $print_string .= "tp53_mutation_present:\t1 (mutations found)\n";
2669             }
2670             else{
2671 0         0 $print_string .= "tp53_mutation_present:\t0\n";
2672             }
2673              
2674              
2675 0         0 $print_string .= "---\n";
2676 0         0 $print_string .= "\n";
2677 0         0 print $OUTPUT_FILE $print_string;
2678 0         0 $print_string = "";
2679             }
2680              
2681 0         0 print $OUTPUT_FILE "...";
2682 0         0 close($OUTPUT_FILE);
2683             }#sub analyze_suspect_regions
2684              
2685              
2686             #=head2 Sub-Method: analyze_likely_regions
2687              
2688             ### analyze_likely_regions ########################################################################
2689             # Description:
2690             # Generates an output file that lists the regions that have a mutation density that is less
2691             # than the outlier cut off but greater than 1 - the outlier cut off
2692             #
2693             # Input variables:
2694             # $output_directory: stores path to the output directory
2695             # $likely_regions_array_ref: reference to array storing the chromosome, start,
2696             # and end position of highly mutated regions
2697             # $genome_mutation_density_hash_ref: stores the average mutation density of each
2698             # chromosome
2699             # $genome_cnv_data_hash_ref: stores the position of CNV mutations on each
2700             # chromosome
2701             # $genome_trans_data_hash_ref: stores the position of translocation events on each
2702             # chromosome
2703             # $bin_size: stores the size of a single bin
2704             #
2705              
2706             #=cut
2707              
2708             sub analyze_likely_regions {
2709              
2710             #parse parameters
2711 1     1 0 9 my $output_directory = shift;
2712              
2713 1         2 my $likely_regions_array_ref = shift;
2714 1         18 my @likely_regions = @$likely_regions_array_ref;
2715              
2716 1         1 my $genome_mutation_density_hash_ref = shift;
2717 1         3 my $genome_cnv_data_hash_ref = shift;
2718 1         1 my $genome_trans_data_hash_ref = shift;
2719 1         1 my $bin_size = shift;
2720              
2721 1         2 my $likely_regions_size = @likely_regions;
2722              
2723 1         1 my $OUTPUT_FILE;
2724              
2725             my @return_vals;
2726              
2727 1         2 my @likely_region_data = (); #stores start, end, chromsome and mutation density for each region
2728              
2729             #check that the likely region array is not malformed, should contain sets of 3 elements
2730 1 50       4 if($likely_regions_size % 3 != 0){
2731 0         0 die "ERROR: suspect_regions_array has $likely_regions_size entries. Value must be divisible by 3.\n";
2732             }
2733              
2734              
2735             #create output directory
2736 1 50       116 mkdir ("$output_directory"."suspect_regions",0770) unless (-d "$output_directory"."suspect_regions");
2737 1 50       13 if(!(-e "$output_directory"."suspect_regions")){
2738 0         0 die "ERROR: could not create folder $output_directory"."suspect_regions";
2739             }
2740              
2741             #create output file
2742 1 50       75 open ($OUTPUT_FILE, ">", "$output_directory"."suspect_regions/likely_regions.log") or die "ERROR: could not create file: $output_directory"."suspect_regions/likely_regions.log\n";
2743              
2744             #print file header
2745 1         11 print $OUTPUT_FILE "Likely Chromothriptic Regions\n";
2746 1         4 print $OUTPUT_FILE "High Mutation Density Z-Score:\t$outlier_deviation\n";
2747 1         2 print $OUTPUT_FILE "Min Mutation Density Z-Score:\t$outlier_deviation-1\n";
2748 1         2 print $OUTPUT_FILE "---------------------------------------\n";
2749 1         1 print $OUTPUT_FILE "#chr\tstart\tend\tmutation_density";
2750              
2751             #for each likely region calculate the mutation density for the region and store it in the likely_region_data array
2752 1         5 for (my $i = 0; $i < $likely_regions_size; $i+=3){
2753 13         22 my @region_data = ();
2754 13         34 $region_data[0] = $likely_regions[$i]; #chr
2755 13         21 $region_data[1] = $likely_regions[$i+1]; #start
2756 13         23 $region_data[2] = $likely_regions[$i+2]; #end
2757              
2758 13         45 ($region_data[3],$region_data[4]) = calculate_region_mutation_density_score($region_data[0], $region_data[1], $region_data[2], $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $bin_size);
2759              
2760 13         161 push @likely_region_data, [@region_data];
2761             }
2762              
2763             #sort the regions by density, largest to smallest
2764 9     9   7599 {use sort 'stable';
  9         24  
  9         49  
  1         4  
2765 1         8 @likely_region_data = sort {$b->[3] <=> $a->[3] } @likely_region_data;
  34         51  
2766             }#use sort 'stable'
2767              
2768             #print the density for each region to the output file
2769 1         5 foreach my $i (@likely_region_data){
2770 13         17 my $chr = $i->[0]; #chr
2771 13         9 my $start = $i->[1]; #start
2772 13         11 my $end = $i->[2]; #end
2773            
2774 13         12 my $region_density = $i->[3]; #mutation density
2775              
2776 13         13 print $OUTPUT_FILE "\n";
2777 13         45 print $OUTPUT_FILE "$chr\t$start\t$end\t$region_density";
2778 13         33 push (@return_vals,$chr,$start,$end,$region_density);
2779             }
2780              
2781 1         98 close($OUTPUT_FILE);
2782 1         23 return(\@return_vals);
2783             }#sub analyze_likely_regions
2784              
2785              
2786             #=head2 Sub-Method: calculate_score
2787              
2788             ### calculate_score ###############################################################################
2789             # Description:
2790             # Calculates the chromothripic score for the given region. Calls sub methods to generate the
2791             # score for each hallmark
2792             #
2793             # Input variables:
2794             # $chr: stores the chromosome on which the region
2795             # is found
2796             # $start: stores the start base pair of the region
2797             # $end: stores the end base pair of the region
2798             # $genome_cnv_data_hash_ref: stores the position of CNV mutations on
2799             # each chromosome
2800             # $genome_trans_data_hash_ref: stores the position of translocation events
2801             # on each chromosome
2802             # $genome_mutation_density_hash_ref: stores the average mutation density of each
2803             # chromosome
2804             # $genome_trans_insertion_breakpoints_hash_ref: stores the position of insertions on each
2805             # chromosome
2806             # $tp53_mutated: stores whether the TP53 gene is mutatated
2807             # or not
2808             # $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on
2809             # each chromosome
2810             # $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on
2811             # each chromosome
2812             # $bin_size: stores the size of a single bin
2813             #
2814              
2815             #=cut
2816              
2817             sub calculate_score{
2818              
2819             #parse parameters
2820 0     0 0 0 my $chr = shift;
2821 0         0 my $start = shift;
2822 0         0 my $end = shift;
2823              
2824 0         0 my $genome_cnv_data_hash_ref = shift;
2825 0         0 my $genome_trans_data_hash_ref = shift;
2826 0         0 my $genome_mutation_density_hash_ref = shift;
2827 0         0 my $genome_trans_insertion_breakpoints_hash_ref = shift;
2828 0         0 my $tp53_mutated = shift;
2829 0         0 my $chromosome_cnv_breakpoints_hash_ref = shift;
2830 0         0 my $chromosome_loh_breakpoints_hash_ref = shift;
2831 0         0 my $bin_size = shift;
2832              
2833             #initialize variable to store scores for each hallmark
2834 0         0 my $cnv_score = 0;
2835 0         0 my $mutation_density_score = 0;
2836 0         0 my $genome_localization_score = 0;
2837 0         0 my $translocation_score = 0;
2838 0         0 my $insertion_breakpoint_score = 0;
2839 0         0 my $loh_score = 0;
2840 0         0 my $final_score = 0;
2841              
2842 0         0 my @score_array; #array in which hallmark scores will be returned
2843              
2844             my $chr_mutation_density; #stores the average mutation density of the chromosome where the region is found
2845 0         0 my $chr_z_score; #stores the z_score of the mutation density of the chromosome where the region is found vs
2846             #all the other chromosomes
2847 0         0 my $cnv_number_hash_ref; #stores a hash that contains the number of regions of each abberant copy-number
2848 0         0 my $cnv_density; #stores the density of cnv mutations in the region
2849 0         0 my $translocation_density; #stores the density of translocation mutations in the region
2850 0         0 my $mutation_density; #stores the density of all mutations in the region
2851 0         0 my $intertranslocation_hash_ref; #stores the number of translocations between all other chromosomes and the region
2852 0         0 my $breakpoint_insertions_array_ref; #stores the total number of translocation breakpoints, and the number that have insertions nearby
2853 0         0 my $loh_size = -1; #stores the amount of heterozygosity that was lost in the region
2854 0         0 my $heterozygous_size; #stores the original amount of heterozygosity in the region
2855            
2856 0         0 ($cnv_score, $cnv_number_hash_ref, $cnv_density) = calculate_copy_number_scores($chr, $start, $end, $genome_cnv_data_hash_ref, $bin_size);
2857 0         0 ($genome_localization_score, $chr_z_score, $chr_mutation_density) = calculate_genome_localization_score($chr, $genome_mutation_density_hash_ref);
2858 0         0 ($translocation_score, $intertranslocation_hash_ref, $translocation_density) = calculate_translocation_score($chr, $start, $end, $genome_trans_data_hash_ref, $bin_size);
2859              
2860 0 0       0 if(defined($genome_trans_insertion_breakpoints_hash_ref)){
2861 0         0 ($insertion_breakpoint_score, $breakpoint_insertions_array_ref) = calculate_insertion_breakpoint_score($chr, $start, $end, $genome_trans_data_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $bin_size);
2862             }
2863              
2864 0         0 ($mutation_density, $mutation_density_score) = calculate_region_mutation_density_score($chr, $start, $end, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $bin_size);
2865              
2866 0 0       0 if(defined($chromosome_loh_breakpoints_hash_ref)){
2867 0         0 ($loh_score, $loh_size, $heterozygous_size) = calculate_loh_score($chr, $start, $end, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref);
2868             }
2869              
2870             #calculate overall score for region based on hallmark weights and scores
2871 0         0 $final_score = ($cnv_score*$cnv_weight) + ($mutation_density_score*$chromosome_localization_weight) + ($genome_localization_score*$genome_localization_weight) + ($translocation_score*$translocation_weight) + ($insertion_breakpoint_score*$insertion_breakpoint_weight) + ($tp53_mutated*$tp53_mutated_weight) + ($loh_score*$loh_weight);
2872              
2873             #push the hallmark scores into the score array
2874 0         0 push (@score_array, ($cnv_score, $mutation_density_score, $genome_localization_score, $translocation_score, $insertion_breakpoint_score, $loh_score, $tp53_mutated));
2875              
2876             #return the scores and other region statistics
2877 0         0 return ($final_score, $chr_z_score, $mutation_density, $cnv_number_hash_ref, $cnv_density, $intertranslocation_hash_ref , $translocation_density, $breakpoint_insertions_array_ref, $loh_size, $heterozygous_size, \@score_array);
2878             }#sub calculate_score
2879              
2880              
2881              
2882             #=head2 Sub-Method: calculate_copy_number_score
2883              
2884             ### calculate_copy_number_score ##################################################################
2885             # Description:
2886             # Calculates the score for the copy-number variation hallmark
2887             #
2888             # Input variables:
2889             # $chr: stores the chromsome where the region is located
2890             # $start: stores the starting location of the region
2891             # $end: stores the end location of the region
2892             # $genome_cnv_data_hash_ref: stores the position of CNV mutations on each chromosome
2893             # $bin_size: stores the size of single bin
2894             #
2895              
2896             #=cut
2897              
2898             sub calculate_copy_number_scores {
2899              
2900             #parse parameters
2901 0     0 0 0 my $chr = shift;
2902 0         0 my $start = shift;
2903 0         0 my $end = shift;
2904              
2905 0         0 my $genome_cnv_data_hash_ref = shift;
2906 0         0 my %genome_cnv_data_hash = %$genome_cnv_data_hash_ref;
2907              
2908 0         0 my $bin_size = shift;
2909              
2910             #calculate array index where the data for the first and last bins of the region are located
2911 0         0 my $start_index = $start / ($bin_size);
2912 0         0 my $end_index = $end / ($bin_size);
2913              
2914 0         0 my $cnv_score = 0; #stores final score to return
2915              
2916 0         0 my %cnv_number_hash; #hash
2917             #key: copy number eg 0,1,3,4
2918             #value: the number of regions with the given copy number
2919              
2920 0         0 my $cnv_switch_count = 0; #number of switches between different copy numbers
2921 0         0 my $cnv_switch_density = 0; #density of cnv events in the region
2922              
2923 0         0 my @chr_data; #stores all the bins for the chromosome where the region is located
2924             my %cnv_hash; #stores the cnv hash from each bin
2925              
2926 0         0 my $mean = 0; #stores the average number of regions of aberrant copy-number
2927 0         0 my $SD = 0; #stores the standard deviation of the number of regions of aberrant copy-number
2928              
2929 0         0 my %cnv_significant; #hash
2930             #key: copy number (but only significant ones are stored)
2931             #value: the number of regions with the given copy number
2932              
2933 0         0 my $significant_count = 0; #stores the number of unique significant copy-numbers
2934              
2935             #check if there is cnv data for the chromosome
2936 0 0       0 if(defined($genome_cnv_data_hash{$chr})){
2937              
2938             #extract the bin data for the chromosome
2939 0         0 @chr_data = @{$genome_cnv_data_hash{$chr}};
  0         0  
2940              
2941             #collect the data from the bins that contain the region
2942 0         0 for (my $i = $start_index; $i < $end_index+1; $i++){
2943 0 0       0 if(!defined($chr_data[$i])){
2944 0         0 next;
2945             }
2946 0         0 %cnv_hash = %{$chr_data[$i]};
  0         0  
2947              
2948 0         0 for my $key (keys %cnv_hash){
2949 0 0       0 if($key eq 'BPcount'){
2950 0         0 $cnv_switch_count += $cnv_hash{$key};
2951             }
2952             else{
2953 0         0 $cnv_number_hash{$key}+= $cnv_hash{$key};
2954             }
2955             }
2956             }
2957              
2958             #calculate the breakpoint density of cnv mutations for the region
2959 0         0 $cnv_switch_density = $cnv_switch_count/ ($end-$start);
2960              
2961             #calculate the number of cnv events in the region (half the number of breakpoints)
2962 0         0 for my $key (keys %cnv_number_hash){
2963 0         0 $cnv_number_hash{$key} = POSIX::ceil($cnv_number_hash{$key});
2964 0         0 $mean += $cnv_number_hash{$key};
2965             }
2966              
2967             #if no cnv mutations were found return a score of 0
2968 0 0       0 if(scalar(keys %cnv_number_hash)==0){
2969 0         0 $cnv_score = 0;
2970 0         0 return ($cnv_score, \%cnv_number_hash, $cnv_switch_density);
2971             }
2972              
2973             #calculate the mean of the number regions of each copy-number
2974 0         0 $mean = $mean/(scalar(keys %cnv_number_hash));
2975              
2976             #calculate the standard deviation of the number of regions of each copy-number
2977 0         0 for my $key (keys %cnv_number_hash){
2978 0         0 $SD += ($cnv_number_hash{$key}-$mean)**2;
2979             }
2980 0         0 $SD = $SD/(scalar(keys %cnv_number_hash));
2981 0         0 $SD = $SD**0.5;
2982              
2983             #determine which copy-numbers are significant (ie are not low out liers)
2984 0         0 for my $key (keys %cnv_number_hash){
2985 0 0 0     0 if(
2986             ( $SD==0 )||
2987             ( (($cnv_number_hash{$key}-$mean)/$SD) >= -1*$outlier_deviation )
2988             ){
2989 0         0 $cnv_significant{$key} = $cnv_number_hash{$key};
2990 0         0 $cnv_score = $cnv_score + $cnv_significant{$key}**2;
2991             }
2992             }
2993              
2994             #score calculation
2995 0         0 $cnv_score = $cnv_score / (scalar(keys %cnv_significant));
2996 0         0 $cnv_score = log($cnv_score)/log(2);
2997 0         0 $cnv_score += 1;
2998 0         0 $cnv_score = 1 - (1/$cnv_score);
2999 0         0 $cnv_score = $cnv_score/(scalar(keys %cnv_significant));
3000             #$cnv_score = (1/(scalar(keys %cnv_significant)))*(0.25) + (1-(1/(1+log($cnv_score/(scalar(keys %cnv_significant)))/log(2))))*(0.75);
3001             }
3002              
3003 0         0 return ($cnv_score, \%cnv_number_hash, $cnv_switch_density);
3004              
3005             }#sub calculate_copy_number_scores
3006              
3007              
3008             #=head2 Sub-Method: calculate_genome_localization_score
3009              
3010             ### calculate_genome_localization_score ##########################################################
3011             # Description:
3012             # Calculates the genome localization hallmark score
3013             #
3014             # Input variables:
3015             # $chr: store the chromosome where the region is located
3016             # $genome_mutation_density_hash_ref: stores the average mutation density of each
3017             # chromosome
3018             #
3019              
3020             #=cut
3021              
3022             sub calculate_genome_localization_score {
3023              
3024             #parse parameters
3025 0     0 0 0 my $chr = shift;
3026              
3027 0         0 my $genome_mutation_density_hash_ref = shift;
3028 0         0 my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref};
  0         0  
3029              
3030             #read mutation density for the chromosome
3031 0         0 my $chr_mutation_density = $genome_mutation_density_hash{$chr};
3032              
3033 0         0 my $mean_density = 0; #stores the average density of mutations for the chromosomes
3034              
3035 0         0 my $standard_deviation = 0; #stores the standard deviation of the mutation densities of the chromosomes
3036              
3037 0         0 my $z_score = 0; #stores the z-score for the suspect chromosome
3038 0         0 my $p_val = 0; #stores the p-value calculated from the z-score for the suspect chromosome
3039              
3040             #sum mutation densities of all the chromosomes
3041 0         0 for my $key (keys %genome_mutation_density_hash){
3042 0         0 $mean_density += $genome_mutation_density_hash{$key}/$chromosome_length{$chr};
3043             }
3044              
3045             #calculate the mean
3046 0         0 $mean_density = $mean_density / 24;
3047              
3048             #calculate the standard deviation
3049 0         0 for my $key (keys %genome_mutation_density_hash){
3050 0         0 $standard_deviation += ((($genome_mutation_density_hash{$key}) - ($mean_density) )**2);
3051             }
3052 0         0 $standard_deviation = $standard_deviation / 24;
3053 0         0 $standard_deviation = ($standard_deviation)**0.5;
3054              
3055             #check for case where the standard deviation or mean comes back as 0
3056 0 0 0     0 if($mean_density == 0 || $standard_deviation == 0){
3057 0         0 return (0, 0, $chr_mutation_density);
3058             }
3059              
3060             #calculate the z-score for suspect chromosome
3061 0         0 $z_score = ($chr_mutation_density - $mean_density) / $standard_deviation;
3062              
3063             #calculate the p-value from the z-score
3064 0         0 $p_val = Statistics::Distributions::uprob($z_score);
3065             #only consider top tail
3066 0         0 $p_val = 0.5-$p_val;
3067 0         0 $p_val = $p_val/0.5;
3068              
3069             #check for case where z-score comes back as 0
3070 0 0       0 if($z_score < 0){
3071 0         0 $p_val = 0;
3072             }
3073              
3074             #p_val is the score for this hallmark
3075 0         0 return ($p_val, $z_score, $chr_mutation_density);
3076              
3077             }#sub calculate_genome_localization_score
3078              
3079              
3080             #=head2 Sub-Method: calculate_region_mutation_density_score
3081              
3082             ### calculate_region_mutation_density_score ######################################################
3083             # Description:
3084             # Calculates the chromosome localization hallmark score
3085             #
3086             # Input variables:
3087             # $chr: chromosome where the region is located
3088             # $start: starting location of the region
3089             # $end: end location of the region
3090             # $genome_cnv_data_hash_ref: stores the position of CNV mutations on each
3091             # chromosome
3092             # $genome_trans_data_hash_ref: stores the position of translocation events
3093             # on each chromosome
3094             # $genome_mutation_density_hash_ref: stores the average mutation density of each
3095             # chromosome
3096             # $bin_size: stores the size of single bin
3097             #
3098              
3099             #=cut
3100              
3101             sub calculate_region_mutation_density_score {
3102              
3103             #parse parameters
3104 13     13 0 25 my $chr = shift;
3105 13         22 my $start = shift;
3106 13         21 my $end = shift;
3107              
3108 13         14 my $genome_cnv_data_hash_ref = shift;
3109 13         18 my %genome_cnv_data_hash = %{$genome_cnv_data_hash_ref};
  13         189  
3110              
3111 13         25 my $genome_trans_data_hash_ref = shift;
3112 13         21 my %genome_trans_data_hash = %{$genome_trans_data_hash_ref};
  13         88  
3113              
3114 13         19 my $genome_mutation_density_hash_ref = shift;
3115 13         21 my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref};
  13         161  
3116              
3117 13         24 my $bin_size = shift;
3118              
3119             #calculate array index where the data for the first and last bins of the region are located
3120 13         29 my $start_index = $start / $bin_size;
3121 13         16 my $end_index = $end / $bin_size;
3122              
3123             #get the mean mutation density for the suspect chromosome
3124 13         25 my $mean_chr_mutation_density = $genome_mutation_density_hash{$chr};
3125              
3126 13         11 my $chis_stat; #stores the chi squared statistic value
3127              
3128 13         14 my $mutation_count = 0; #stores the total mutation count in the region
3129 13         14 my $cnv_count = 0; #stores the number of cnv breakpoints in the region
3130 13         18 my $trans_count = 0; #stores the number of translocation breakpoints in the region
3131 13         13 my $mutation_density = 0; #store the mutation density of the region
3132 13         14 my $mutation_density_score; #stores the final score for the hallmark
3133              
3134             my @chr_data; #stores the bin data for the chromosome
3135 0         0 my %cnv_hash; #stores the cnv hash for each bin
3136 0         0 my %trans_hash; #stores the translocation hash for each bin
3137              
3138             #check that there is cnv data for the chromosome
3139 13 100       59 if(defined($genome_cnv_data_hash{$chr})){
3140             #get the cnv data for the chromosome
3141 12         10 @chr_data = @{$genome_cnv_data_hash{$chr}};
  12         41384  
3142              
3143             #sum the number of cnv breakpoints in the suspect region
3144 12         115 for (my $i = $start_index; $i < $end_index+1; $i++){
3145 145335 100       175498 if(!defined($chr_data[$i])){
3146 145301         181774 next;
3147             }
3148              
3149 34         38 %cnv_hash = %{$chr_data[$i]};
  34         186  
3150 34         89 $cnv_count += $cnv_hash{'BPcount'};
3151             }
3152             }
3153              
3154 13         20176 @chr_data = ();
3155              
3156             #check that there is translocation data for the chromosome
3157 13 50       155 if(defined($genome_trans_data_hash{$chr})){
3158             #get the translocation data for the chromosome
3159 13         21 @chr_data = @{$genome_trans_data_hash{$chr}};
  13         25997  
3160              
3161             #sum the number of translocation breakpoints in the suspect region
3162 13         108 for (my $i = $start_index; $i < $end_index+1; $i++){
3163 163406 100       201253 if(!defined($chr_data[$i])){
3164 163313         211797 next;
3165             }
3166 93         68 %trans_hash = %{$chr_data[$i]};
  93         404  
3167 93         214 $trans_count += $trans_hash{'BPcount'};
3168             }
3169             }
3170              
3171             #divide the count by 2 to get the number of translocation events
3172 13         116 $trans_count = POSIX::ceil($trans_count/2);
3173              
3174             #calculate the total number events in the region
3175 13         37 $mutation_count = $cnv_count + $trans_count;
3176              
3177             #calculate the mutation density for the region
3178 13         34 $mutation_density = $mutation_count / ($end-$start);
3179              
3180             #calculate the chi squared statistic
3181 13         101 $chis_stat = abs(((log($mutation_density)-log($mean_chr_mutation_density))**2)/(log($mean_chr_mutation_density)));
3182              
3183             #generate a p-value using the chi squared test
3184 13         97 $mutation_density_score = 1-(Statistics::Distributions::chisqrprob(1,$chis_stat));
3185              
3186 13         17643 return ($mutation_density, $mutation_density_score);
3187             }#calculate_region_mutation_density_score
3188              
3189              
3190             #=head2 Sub-Method: calculate_translocation_score
3191              
3192             ### calculate_translocation_score #################################################################
3193             # Description:
3194             # Calculates the translocation hallmark score
3195             #
3196             # Input variables:
3197             # $chr: chromosome where the region is located
3198             # $start: starting location of the region
3199             # $end: end location of the region
3200             # $genome_trans_data_hash_ref: stores the position of translocation events on each
3201             # chromosome
3202             # $bin_size: stores the size of single bin
3203             #
3204              
3205             #=cut
3206              
3207             sub calculate_translocation_score {
3208              
3209             #parse parameters
3210 0     0 0 0 my $chr = shift;
3211 0         0 my $start = shift;
3212 0         0 my $end = shift;
3213              
3214 0         0 my $genome_trans_data_hash_ref = shift;
3215 0         0 my %genome_trans_data_hash = %{$genome_trans_data_hash_ref};
  0         0  
3216              
3217 0         0 my $bin_size = shift;
3218              
3219             #calculate array index where the data for the first and last bins of the region are located
3220 0         0 my $start_index = $start / ($bin_size);
3221 0         0 my $end_index = $end / ($bin_size);
3222              
3223 0         0 my $translocation_density = 0; #stores the density of translocation events in the region
3224              
3225 0         0 my @chr_data; #stores the bin data for the chromosome
3226             my %trans_breakpoints; #hash
3227             #key: chromosome eg 1,2,X,Y
3228             #value: an array storing the position of the translocation breakpoints
3229              
3230 0         0 my %trans_breakpoint_spreads; #stores the average distance between translocation breakpoints
3231 0         0 my @significant_chrs; #stores a list of chromosomes that have a significant number of
3232             #translocation to or from the region
3233 0         0 my @diffs; #stores the distance between adjacent translocation breakpoints on
3234             #one chromosome
3235 0         0 my $diff_sum; #stores the sum of the distances
3236 0         0 my $diff_count; #store the number regions between translocation breakpoints
3237              
3238 0         0 my %trans_number_hash; #hash
3239             #key: chromosome eg 1,2,X,Y
3240             #value: the number of events between the chromosome and the region
3241              
3242 0         0 my $mean = 0; #stores the average number of translocations between each chromosome
3243             #and the region
3244 0         0 my $SD = 0; #stores the standard deviation of the above value
3245              
3246 0         0 my $weighted_sum = 0.00; #component of the score calculation
3247              
3248 0         0 my $size = 0.00; #intermediate variable used to collect sums
3249 0         0 my $count = 0; #intermeidate variable
3250              
3251 0         0 my $translocation_score = 0; #final hallmark score
3252              
3253 0         0 my $spread_factor = 0;
3254              
3255 0         0 my $translocation_count = 0; #stores the total number of translocations from significant chromosomes
3256              
3257             #check that there is translocation data for the chromosome
3258 0 0       0 if(defined($genome_trans_data_hash{$chr})){
3259             #get the translocation data for the chromosome
3260 0         0 @chr_data = @{$genome_trans_data_hash{$chr}};
  0         0  
3261              
3262             #for each bin sum the number of translocation events
3263             #and record the position of breakpoints in the trans_breakpoints hash
3264 0         0 for (my $i = $start_index; $i < $end_index+1; $i++){
3265 0 0       0 if(!defined($chr_data[$i])){
3266 0         0 next;
3267             }
3268 0         0 my %trans_hash = %{$chr_data[$i]};
  0         0  
3269 0         0 my %trans_hash_in;
3270             my %trans_hash_out;
3271              
3272             #analyze the breakpoints from translocation into the region
3273 0 0       0 if(defined($trans_hash{'in'})){
3274 0         0 %trans_hash_in = %{$trans_hash{'in'}};
  0         0  
3275              
3276 0         0 for my $key (keys %trans_hash_in){
3277 0         0 $size = @{$trans_hash_in{$key}};
  0         0  
3278              
3279             #calculate the number of translocation events
3280 0         0 $size = $size/2;
3281              
3282             #add the count to the appropriate hash
3283 0         0 $trans_number_hash{$key} += $size;
3284              
3285             #add the breakpoints to the trans_breakpoints hash
3286 0         0 push(@{$trans_breakpoints{$key}},(@{$trans_hash_in{$key}}));
  0         0  
  0         0  
3287             }
3288             }
3289              
3290             #analyze the breakpoints from translocation out of the region
3291 0 0       0 if(defined($trans_hash{'out'})){
3292 0         0 %trans_hash_out = %{$trans_hash{'out'}};
  0         0  
3293              
3294 0         0 for my $key (keys %trans_hash_out){
3295 0         0 $size = @{$trans_hash_out{$key}};
  0         0  
3296 0         0 $count = $size;
3297 0 0       0 if($key eq $chr){
3298 0         0 foreach my $val (@{$trans_hash_out{$key}}){
  0         0  
3299 0 0 0     0 if($val > $start && $val < $end){
3300 0         0 $count--;
3301             }
3302             }
3303             }
3304             #calculate the number of translocation events
3305 0         0 $count = $count/2;
3306              
3307             #add the count to the appropriate hash
3308 0         0 $trans_number_hash{$key} += $count;
3309              
3310             #add the breakpoints to the trans_breakpoints hash
3311 0         0 push(@{$trans_breakpoints{$key}},(@{$trans_hash_out{$key}}));
  0         0  
  0         0  
3312             }
3313             }
3314             }
3315              
3316 0         0 $count = 0;
3317              
3318             #check that some translocation events were found else return a score of 0
3319 0 0       0 if (keys(%trans_number_hash) == 0){
3320 0         0 $translocation_score = 0;
3321 0         0 $translocation_density = 0;
3322 0         0 return ($translocation_score, \%trans_number_hash, $translocation_density);
3323             }
3324              
3325 0         0 for my $key (keys %trans_number_hash){
3326             #round the event count up since if only one breakpoint was present at the end
3327             #of the region we still count that as a whole translocation event
3328 0         0 $trans_number_hash{$key} = POSIX::ceil($trans_number_hash{$key});
3329              
3330             #sum the number of translocation events in the region
3331 0         0 $count += $trans_number_hash{$key};
3332             }
3333              
3334             #calculate the translocation density for the region
3335 0         0 $translocation_density = $count / ($end-$start);
3336              
3337             #calculate the mean and standard deviation of the number of translocation between the region
3338             #and each chromosome
3339 0         0 ($SD, $mean) = standard_deviation_and_mean(\%trans_number_hash,0);
3340            
3341             #identify chromosomes that have a high number of translocations to or from the region and
3342             #add them to the significant chromosome list
3343 0         0 for my $key (keys %trans_number_hash){
3344 0 0 0     0 if(
3345             ( $SD==0 )||
3346             #( (($trans_number_hash{$key}-$mean)/$SD)>-2*$outlier_deviation)
3347             ( (($trans_number_hash{$key}-$mean)/$SD)>-1*$outlier_deviation)
3348             ){
3349 0         0 push (@significant_chrs, $key);
3350             }
3351             }
3352              
3353             #calculate the total number of translocations from significant chromosomes
3354 0         0 foreach my $key (@significant_chrs){
3355 0         0 $translocation_count += $trans_number_hash{$key};
3356             }
3357              
3358             #for each significant chromosome calculate the spread between translocation events
3359 0         0 foreach my $key (@significant_chrs){
3360             #sort the breakpoints
3361 9     9   22217 {use sort 'stable';
  9         21  
  9         51  
  0         0  
3362 0         0 @{$trans_breakpoints{$key}} = sort {$a <=> $b} @{$trans_breakpoints{$key}};
  0         0  
  0         0  
  0         0  
3363             }#use sort 'stable'
3364              
3365 0         0 $size = @{$trans_breakpoints{$key}};
  0         0  
3366 0         0 @diffs = ();
3367 0         0 $diff_sum = 0;
3368 0         0 $diff_count = 0;
3369              
3370             #calculate and store the distance between adjacent breakpoints
3371 0         0 for (my $i = 1; $i<$size; $i++){
3372 0         0 push (@diffs, @{$trans_breakpoints{$key}}[$i]-@{$trans_breakpoints{$key}}[$i-1]);
  0         0  
  0         0  
3373             }
3374              
3375             #check that more that one distance was calculated
3376 0 0       0 if($size==1){
3377 0         0 $trans_breakpoint_spreads{$key} = 0;
3378 0         0 $diff_count = 1;
3379             }
3380             else{ #calculate the standard deviation and mean for the distance between breakpoints
3381 0         0 ($SD,$mean) = standard_deviation_and_mean(\@diffs,1);
3382              
3383             #sum the distances that are not high outliers, indicating distance between 2 translocation
3384             #events and not distance between breakpoints of the same event
3385 0         0 foreach my $val (@diffs){
3386 0 0 0     0 if(
3387             ( $SD==0 )||
3388             ( (($val-$mean)/$SD)<$outlier_deviation )
3389             ){
3390 0         0 $diff_sum += $val;
3391 0         0 $diff_count++;
3392             }
3393             }
3394             }
3395              
3396             #calculate the average spread of translocation breakpoints
3397 0         0 $trans_breakpoint_spreads{$key} = $diff_sum / $diff_count;
3398              
3399             #calculate the spread factor for the chromosome
3400             #my $spread_factor = (log($trans_breakpoint_spreads{$key}+1)/log(10))/((log($expected_mutation_density)/log(10))*-1);
3401 0         0 $spread_factor = (log($trans_breakpoint_spreads{$key}+1)/log(10))/((log($expected_mutation_density)/log(10))*-1);
3402 0 0       0 if($spread_factor==0){
3403 0         0 $spread_factor = 1;
3404             }
3405              
3406              
3407             #increase the weighted sum based on the number of translocation events and their spread multiplied by the proportion of translocations
3408             #from this specific chromosome relative to the total number of translocation events
3409 0         0 $weighted_sum += ($trans_number_hash{$key}/$spread_factor)*($trans_number_hash{$key}/$translocation_count);
3410             }#foreach my $key (@significant_chrs)
3411              
3412 0         0 my $t2 = (1-(1/(log(1+$weighted_sum)/log(2))));
3413             #final hallmark score calculation
3414 0         0 $size = @significant_chrs;
3415              
3416             #calculate second term of score
3417 0         0 $translocation_score = (1-(1/(log(1+$weighted_sum)/log(2))));
3418              
3419             #calculate first term of score
3420 0 0 0     0 if($size<$translocation_cut_off_count && $size>2){
3421 0         0 $translocation_score = (1-(0.10*($size-2)))*$translocation_score;
3422             }
3423 0 0       0 if($size>=$translocation_cut_off_count){
3424 0         0 $translocation_score = 0;
3425             }
3426              
3427 0 0       0 if($translocation_score>=1){
3428 0         0 print "score: ".$translocation_score."\n";
3429 0         0 print "ws: ".$weighted_sum."\n";
3430 0         0 print "sf: ".$spread_factor."\n";
3431 0         0 print "term 1: ";
3432 0         0 print (1/(1+(log($size)/log(4))));
3433 0         0 print "\n";
3434 0         0 print "term 2: ";
3435 0         0 print (1-(1/(log($weighted_sum)/log(2))));
3436 0         0 print "\n";
3437             }
3438              
3439             }#if(defined($genome_trans_data_hash{$chr}))
3440              
3441 0         0 return ($translocation_score, \%trans_number_hash, $translocation_density);
3442              
3443             }#sub calculate_translocation_score
3444              
3445              
3446             #=head2 Sub-Method: calculate_insertion_breakpoint_score
3447              
3448             ### calculate_insertion_breakpoint_score ##########################################################
3449             # Description:
3450             # Calculates the insertions at translocation breakpoints hallmark score
3451             #
3452             # Input variables:
3453             # $chr: chromosome where the region is located
3454             # $start: starting location of the region
3455             # $end: end location of the region
3456             # $genome_trans_data_hash_ref: stores the position of translocation events
3457             # on each chromosome
3458             # $genome_trans_insertion_breakpoints_hash_ref: stores the position of breakpoints with
3459             # insertions nearby
3460             # $bin_size: stores the size of single bin
3461             #
3462              
3463             #=cut
3464              
3465             sub calculate_insertion_breakpoint_score {
3466              
3467             #parse parameters
3468 0     0 0 0 my $chr = shift;
3469 0         0 my $start = shift;
3470 0         0 my $end = shift;
3471              
3472 0         0 my $genome_trans_data_hash_ref = shift;
3473 0         0 my %genome_trans_data_hash = %{$genome_trans_data_hash_ref};
  0         0  
3474              
3475 0         0 my $genome_trans_insertion_breakpoints_hash_ref = shift;
3476 0         0 my %genome_trans_insertion_breakpoints_hash = %{$genome_trans_insertion_breakpoints_hash_ref};
  0         0  
3477              
3478 0         0 my $bin_size = shift;
3479              
3480             #calculate array index where the data for the first and last bins of the region are located
3481 0         0 my $start_index = $start / ($bin_size);
3482 0         0 my $end_index = $end / ($bin_size);
3483              
3484 0         0 my $total_breakpoints = 0; #total number of breakpoints in the region
3485 0         0 my $inserted_breakpoints = 0; #total number of breakpoints with nearby insertions in the region
3486              
3487 0         0 my $insertion_breakpoint_score = 0;
3488              
3489 0         0 my @chr_data; #stores the bin data for the chromosome
3490             my %trans_hash; #stores translocation hash for each bin
3491              
3492 0         0 my @inserted_breakpoint_list; #stores the breakpoints that have insertions nearby
3493 0         0 my @breakpoint_data; #stores $total_breakpoints and $inserted_breakpoints for return
3494              
3495             #check if there is translocation data for the region
3496 0 0       0 if(defined($genome_trans_data_hash{$chr})){
3497              
3498             #get the translocation data
3499 0         0 @chr_data = @{$genome_trans_data_hash{$chr}};
  0         0  
3500              
3501             #for each bin in the region sum the number of breakpoints
3502 0         0 for (my $i = $start_index; $i < $end_index+1; $i++){
3503 0 0       0 if(!defined($chr_data[$i])){
3504 0         0 next;
3505             }
3506 0         0 %trans_hash = %{$chr_data[$i]};
  0         0  
3507 0         0 $total_breakpoints += $trans_hash{'BPcount'};
3508             }
3509              
3510             #get the list of breakpoints with insertions nearby on the chromosome
3511 0         0 @inserted_breakpoint_list = @{$genome_trans_insertion_breakpoints_hash{$chr}};
  0         0  
3512              
3513             #sort the above list
3514 9     9   7784 {use sort 'stable';
  9         22  
  9         39  
  0         0  
3515 0         0 @inserted_breakpoint_list = sort {$a <=> $b} @inserted_breakpoint_list;
  0         0  
3516             }#use sort 'stable'
3517              
3518             #calculate how many of the breakpoints with insertions are in the region
3519 0         0 foreach my $breakpoint (@inserted_breakpoint_list){
3520 0 0       0 if($breakpoint > $end){
3521 0         0 last;
3522             }
3523 0 0       0 if($breakpoint > $start){
3524 0         0 $inserted_breakpoints++;
3525             }
3526             }
3527              
3528             #calculate the hallmark score
3529 0 0       0 if ($total_breakpoints > 0) {
3530 0         0 $insertion_breakpoint_score = $inserted_breakpoints/$total_breakpoints;
3531             }
3532              
3533 0 0       0 if($insertion_breakpoint_score > 1){
3534 0         0 die "ERROR: found a insertion_breakpoint_score greater than 1\n";
3535             }
3536              
3537             }
3538              
3539 0         0 push (@breakpoint_data, $inserted_breakpoints);
3540 0         0 push (@breakpoint_data, $total_breakpoints);
3541              
3542 0         0 return ($insertion_breakpoint_score, \@breakpoint_data);
3543             }
3544              
3545              
3546             #=head2 Sub-Method: calculate_loh_score
3547              
3548             ### calculate_loh_score ###########################################################################
3549             # Description:
3550             # Calculates the loss of heterozgozity hallmark score
3551             #
3552             # Input variables:
3553             # $chr: chromosome where the region is located
3554             # $start: starting location of the region
3555             # $end: end location of the region
3556             # $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on each
3557             # chromosome
3558             # $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on each
3559             # chromosome
3560             #
3561              
3562             #=cut
3563              
3564             sub calculate_loh_score {
3565              
3566             #parse parameters
3567 0     0 0 0 my $chr = shift;
3568 0         0 my $start = shift;
3569 0         0 my $end = shift;
3570              
3571 0         0 my $chromosome_cnv_breakpoints_hash_ref = shift;
3572 0         0 my %chromosome_cnv_breakpoints_hash = %{$chromosome_cnv_breakpoints_hash_ref};
  0         0  
3573              
3574 0         0 my $chromosome_loh_breakpoints_hash_ref = shift;
3575 0         0 my %chromosome_loh_breakpoints_hash = %{$chromosome_loh_breakpoints_hash_ref};
  0         0  
3576              
3577 0         0 my @cnv_breakpoints; #stores cnv breakpoints in the region
3578             my $cnv_breakpoints_size; #stores the number of cnv breakpoints in the region
3579              
3580 0         0 my @loh_breakpoints; #stores the LOH breakpoints in the region
3581 0         0 my $loh_breakpoints_size; #stores the number of LOH breakpoints in the region
3582              
3583             #calculate maximum potential amount of heterozygosity
3584 0         0 my $original_heterozygous_size = $end - $start;
3585              
3586             #calculate maximum pontentail amount of heterozygosity that can remain
3587 0         0 my $remaining_heterozygous_size = $end - $start;
3588              
3589 0         0 my $loh_size = 0; #stores the size of all LOH regions in the region
3590              
3591 0         0 my $loh_score = 0; #final hallmark score
3592              
3593             #check if there is any cnv data for the chromosome
3594 0 0 0     0 if(
3595             ( !defined($chromosome_cnv_breakpoints_hash{$chr}) ) ||
3596             ( !defined($chromosome_loh_breakpoints_hash{$chr}) )
3597             ){
3598 0         0 $loh_score = 0;
3599 0         0 $loh_size = -1;
3600 0         0 $remaining_heterozygous_size = -1;
3601 0         0 return($loh_score, $loh_size, $remaining_heterozygous_size);
3602             }
3603              
3604             #get the cnv breakpoints for the chromosome
3605 0         0 @cnv_breakpoints = @{$chromosome_cnv_breakpoints_hash{$chr}};
  0         0  
3606              
3607             #sort the list
3608 9     9   3714 {use sort 'stable';
  9         21  
  9         36  
  0         0  
3609 0         0 @cnv_breakpoints = sort {$a <=> $b} @cnv_breakpoints;
  0         0  
3610             }#use sort 'stable'
3611              
3612             #get the number of breakpoints
3613 0         0 $cnv_breakpoints_size = @cnv_breakpoints;
3614              
3615             #find all the cnv events that occur in the region and subtract the size of these regions
3616             #from the $original_heterozygous_size value
3617 0         0 for (my $i = 0; $i< $cnv_breakpoints_size; $i+=2){
3618 0         0 my $cnv_start = $i;
3619 0         0 my $cnv_end = $i+1;
3620              
3621 0         0 my $end_overlap = 0;
3622 0         0 my $start_overlap = 0;
3623              
3624 0 0       0 if($cnv_breakpoints[$cnv_start] > $end){
3625 0         0 last;
3626             }
3627              
3628             #Check if the end point of the cnv region is within the suspect region
3629 0 0 0     0 if($cnv_breakpoints[$cnv_end] >= $start && $cnv_breakpoints[$cnv_end] <= $end){
3630 0         0 $end_overlap = 1;
3631             }
3632              
3633             #Check if the start point of the region cnv is within the suspect region
3634 0 0 0     0 if($cnv_breakpoints[$cnv_start] >= $start && $cnv_breakpoints[$cnv_start] <= $end){
3635 0         0 $start_overlap = 1;
3636             }
3637              
3638             #If an overlap was detected
3639 0 0 0     0 if($start_overlap==1 && $end_overlap==1) {
    0 0        
    0          
    0          
3640 0         0 $remaining_heterozygous_size -= ($cnv_breakpoints[$cnv_end] - $cnv_breakpoints[$cnv_start]);
3641             }
3642             elsif($start_overlap==1){
3643 0         0 $remaining_heterozygous_size -= ($end-$cnv_breakpoints[$cnv_start]);
3644             }
3645             elsif($end_overlap==1){
3646 0         0 $remaining_heterozygous_size -= ($cnv_breakpoints[$cnv_end]-$start);
3647             }
3648             elsif($cnv_breakpoints[$cnv_start] < $start && $cnv_breakpoints[$cnv_end] > $end){
3649 0         0 $remaining_heterozygous_size = 0;
3650             }
3651             }
3652              
3653             #check if there is no potential heterozygous regions in the suspect region or if there are no cnv events
3654             #in the suspect region, if either is the case return a score of 0
3655 0 0 0     0 if($remaining_heterozygous_size == 0 || $remaining_heterozygous_size == $original_heterozygous_size){
3656 0         0 $loh_score = 0;
3657 0         0 $loh_size = -1;
3658 0         0 $remaining_heterozygous_size = -1;
3659 0         0 return($loh_score, $loh_size, $remaining_heterozygous_size);
3660             }
3661            
3662             #get a list of the LOH breakpoints on the chromosome
3663 0         0 @loh_breakpoints = @{$chromosome_loh_breakpoints_hash{$chr}};
  0         0  
3664              
3665             #sort the list
3666 9     9   2594 {use sort 'stable';
  9         19  
  9         38  
  0         0  
3667 0         0 @loh_breakpoints = sort {$a <=> $b} @loh_breakpoints;
  0         0  
3668             }#use sort 'stable'
3669              
3670             #get the number of breakpoints
3671 0         0 $loh_breakpoints_size = @loh_breakpoints;
3672              
3673             #determine which LOH regions are in the suspect region
3674 0         0 for (my $i = 0; $i< $loh_breakpoints_size; $i+=2){
3675 0         0 my $start_overlap_region_loh = 0;
3676 0         0 my $end_overlap_region_loh = 0;
3677              
3678 0         0 my $loh_start = $i;
3679 0         0 my $loh_end = $i+1;
3680              
3681 0         0 my $loh_start_breakpoint = $loh_breakpoints[$loh_start];
3682 0         0 my $loh_end_breakpoint = $loh_breakpoints[$loh_end];
3683 0         0 my $loh_region_size;
3684              
3685 0 0       0 if($loh_breakpoints[$loh_start] > $end){
3686 0         0 last;
3687             }
3688              
3689             #Check if the end point of the cnv region is within the loh region
3690 0 0 0     0 if($loh_breakpoints[$loh_end] >= $start && $loh_breakpoints[$loh_end] <= $end){
3691 0         0 $end_overlap_region_loh = 1;
3692             }
3693              
3694             #Check if the start point of region 1 is within region 2
3695 0 0 0     0 if($loh_breakpoints[$loh_start] >= $start && $loh_breakpoints[$loh_start] <= $end){
3696 0         0 $start_overlap_region_loh = 1;
3697             }
3698              
3699             #If an overlap was detected
3700 0 0 0     0 if($start_overlap_region_loh==1 && $end_overlap_region_loh!=1){
    0 0        
    0 0        
3701 0         0 $loh_end_breakpoint = $end;
3702             }
3703             elsif($end_overlap_region_loh==1 && $start_overlap_region_loh!=1){
3704 0         0 $loh_start_breakpoint = $start;
3705             }
3706             elsif($loh_breakpoints[$loh_start] < $start && $loh_breakpoints[$loh_end] > $end){
3707 0         0 $loh_start_breakpoint = $start;
3708 0         0 $loh_end_breakpoint = $end;
3709 0         0 $start_overlap_region_loh=1;
3710 0         0 $end_overlap_region_loh=1;
3711             }
3712 0 0 0     0 if($start_overlap_region_loh != 1 && $end_overlap_region_loh != 1){ #if the loh region is not in the suspect region go to the next loh region
3713 0         0 next;
3714             }
3715              
3716             #if the loh region is in the suspect region then reduce the size of the loh by subtracting the size of cnv regions that over lap with it
3717             #this will tell us how much original heterozygosity remains
3718 0         0 $loh_region_size = $loh_end_breakpoint - $loh_start_breakpoint;
3719              
3720             #check for overlaps between the LOH region and cnv regions
3721 0         0 for (my $k = 0; $k< $cnv_breakpoints_size; $k+=2){
3722 0         0 my $start_overlap_loh_cnv = 0;
3723 0         0 my $end_overlap_loh_cnv = 0;
3724              
3725 0         0 my $cnv_start = $k;
3726 0         0 my $cnv_end = $k+1;
3727              
3728 0 0       0 if($cnv_breakpoints[$cnv_start] > $loh_end_breakpoint){
3729 0         0 last;
3730             }
3731              
3732             #Check if the end point of the cnv region is within the loh region
3733 0 0 0     0 if($cnv_breakpoints[$cnv_end] >= $loh_start_breakpoint && $cnv_breakpoints[$cnv_end] <= $loh_end_breakpoint){
3734 0         0 $end_overlap_loh_cnv = 1;
3735             }
3736              
3737             #Check if the start point of region 1 is within region 2
3738 0 0 0     0 if($cnv_breakpoints[$cnv_start] >= $loh_start_breakpoint && $cnv_breakpoints[$cnv_start] <= $loh_end_breakpoint){
3739 0         0 $start_overlap_loh_cnv = 1;
3740             }
3741              
3742             #If an overlap was detected
3743 0 0 0     0 if($start_overlap_loh_cnv==1 && $end_overlap_loh_cnv==1) {
    0 0        
    0          
    0          
3744 0         0 $loh_region_size -= ($cnv_breakpoints[$cnv_end] - $cnv_breakpoints[$cnv_start]);
3745             }
3746             elsif($start_overlap_loh_cnv==1){
3747 0         0 $loh_region_size -= ($loh_end_breakpoint-$cnv_breakpoints[$cnv_start]);
3748             }
3749             elsif($end_overlap_loh_cnv==1){
3750 0         0 $loh_region_size -= ($cnv_breakpoints[$cnv_end]-$loh_start_breakpoint);
3751             }
3752             elsif($cnv_breakpoints[$cnv_start] < $loh_start_breakpoint && $cnv_breakpoints[$cnv_end] > $loh_end_breakpoint){
3753 0         0 $loh_region_size = 0;
3754             }
3755             }
3756 0         0 $loh_size += $loh_region_size;
3757             }
3758              
3759              
3760             #calculate the LOH score
3761 0         0 $loh_score = 1 - ($loh_size/$remaining_heterozygous_size);
3762              
3763 0 0       0 if($loh_size> $remaining_heterozygous_size){
3764 0         0 die "ERROR: invalid LOH size value found\n";
3765             }
3766              
3767 0         0 return($loh_score, $loh_size, $remaining_heterozygous_size);
3768              
3769             }#sub calculate_loh_score
3770              
3771              
3772             #=head2 Sub-Method: standard_deviation_and_mean
3773              
3774             ### standard_deviation_and_mean ###################################################################
3775             # Description:
3776             # Calculates the standard deviation and mean for a given set of values
3777             #
3778             # Input variables:
3779             # $data_ref: reference to either a hash or an array
3780             # $type: 0 indicates a hash, 1 indicates an array
3781             #
3782              
3783             #=cut
3784              
3785             sub standard_deviation_and_mean{
3786              
3787             #parse parameters
3788 11     11 0 2881 my $data_ref = shift;
3789 11         12 my $type = shift;
3790              
3791 11         9 my %hash;
3792             my @array;
3793 0         0 my $size;
3794              
3795 11         9 my $mean = 0;
3796 11         11 my $SD = 0;
3797              
3798 11 100       24 if($type==0){
    100          
3799 5         5 %hash = %{$data_ref};
  5         27  
3800              
3801 3 100       10 if((scalar(keys %hash))==0){
3802 1         5 die"Found sample size of 0 when calculating SD-hash\n";
3803             }
3804              
3805             #calculate mean
3806 2         6 for my $key (keys %hash){
3807 11         12 $mean += $hash{$key};
3808             }
3809            
3810 2         5 $mean = $mean/(scalar(keys %hash));
3811              
3812             #calculate sum of squared differences
3813 2         3 for my $key (keys %hash){
3814 11         16 $SD += ($hash{$key}-$mean)**2;
3815             }
3816            
3817             #calculate final standard deviation value
3818 2         4 $SD = $SD/(scalar(keys %hash));
3819 2         4 $SD = $SD**0.5;
3820             }
3821             elsif($type==1){
3822 5         5 @array = @{$data_ref};
  5         24  
3823 3         2 $size = @array;
3824              
3825 3 100       8 if($size==0){
3826 1         8 die"Found sample size of 0 when calculating SD-array\n";
3827             }
3828            
3829             #calculate mean
3830 2         3 foreach my $val (@array){
3831 11         12 $mean += $val;
3832             }
3833              
3834 2         3 $mean = $mean/$size;
3835              
3836             #calculate sum of squared differences
3837 2         3 foreach my $val (@array){
3838 11         22 $SD += ($val-$mean)**2;
3839             }
3840              
3841             #calculate final standard deviation value
3842 2         4 $SD = $SD/$size;
3843 2         10 $SD = $SD**0.5;
3844            
3845             }
3846             else{
3847 1         6 die"ERROR: invalid SD/mean type found\n";
3848             }
3849 4         13 return ($SD, $mean);
3850              
3851             }#sub standard_deviation_and_mean
3852              
3853              
3854             ### next_arg ######################################################################################
3855             # Parse the next arguement from the command line
3856             #
3857             sub next_arg {
3858 0     0 0 0 my $code = shift;
3859 0         0 $pos++;
3860 0 0       0 if($pos == $ARGC){
3861 0         0 usage($code);
3862             }
3863             }#sub next_arg
3864              
3865             ### man_text ######################################################################################
3866             # Print the manual help text
3867             #
3868             sub man_text {
3869 0     0 0 0 print "Main Usage:\n";
3870 0         0 print "\tperl -w shatterproof.pl --cnv --trans [--insrt ] [--loh ] [--tp53] --config --output \n";
3871 0         0 print "\n";
3872 0         0 print "\tArguments:\n";
3873 0         0 print "\t\t--cnv\t\tDefine the path to the directory containing the CNV input files\n";
3874 0         0 print "\t\t--trans\t\tDefine the path to the directory containing the Translocation input files\n";
3875 0         0 print "\t\t--insrt\t\tDefine the path to the directory containing the insertion VCF input files\n";
3876 0         0 print "\t\t--loh\t\tDefine the path to the directory containing the LOH input files\n";
3877 0         0 print "\t\t--tp53\t\tIndicate that TP53 should be considered mutated, regardless of data\n";
3878 0         0 print "\t\t--config\tDefine the path to the ShatterProof config file\n";
3879 0         0 print "\t\t--output\tDefine the path to the directory where output should be placed\n";
3880 0         0 print "\t\tdir\t\tPath to a directory\n";
3881 0         0 print "\t\tpath\t\tPath to a file\n";
3882 0         0 print "\n";
3883 0         0 print "Help Usage:\n";
3884 0         0 print "\tperl -w shatterproof.pl --help\t\tThis help message.\n";
3885 0         0 print "\n";
3886 0         0 exit 0;
3887             }#sub man_text
3888              
3889             ### usage #########################################################################################
3890             # Prints an error message when invalid command line arguements are found
3891             #
3892             sub usage {
3893 0     0 0 0 my $usage_msg = shift;
3894              
3895 0         0 print "u $usage_msg \n";
3896            
3897 0         0 given($usage_msg){
3898 0         0 when (/^0/) { print "ERROR: missing arguments\n"; }
  0         0  
3899 0         0 when (/^1/) { print "ERROR: 2nd argument missing\n"; }
  0         0  
3900 0         0 when (/^2/) { print "ERROR: CNV directory missing\n"; }
  0         0  
3901 0         0 when (/^3/) { print "ERROR: --trans option missing\n" }
  0         0  
3902 0         0 when (/^4/) { print "ERROR: --cnv option missing\n" }
  0         0  
3903 0         0 when (/^5/) { print "ERROR: Translocation directory missing\n" }
  0         0  
3904 0         0 when (/^6/) { print "ERROR: --config option missing\n" }
  0         0  
3905 0         0 when (/^7/) { print "ERROR: --trans option missing\n" }
  0         0  
3906 0         0 when (/^8/) { print "ERROR: insertion directory missing\n" }
  0         0  
3907 0         0 when (/^9/) { print "ERROR: --config option missing\n" }
  0         0  
3908 0         0 when (/^10/) { print "ERROR: LOH directory missing \n" }
  0         0  
3909 0         0 when (/^11/) { print "ERROR: --config option missing\n" }
  0         0  
3910 0         0 when (/^12/) { print "ERROR: --config option missing\n" }
  0         0  
3911 0         0 when (/^13/) { print "ERROR: Path to config file missing\n" }
  0         0  
3912 0         0 when (/^14/) { print "ERROR: --output option missing\n" }
  0         0  
3913 0         0 when (/^15/) { print "ERROR: --config option missing\n" }
  0         0  
3914 0         0 when (/^16/) { print "ERROR: Output directory missing\n" }
  0         0  
3915 0         0 when (/^17/) { print "ERROR: --output option missing\n" }
  0         0  
3916 0         0 when (/^18/) { print "ERROR: too many arguments\n" }
  0         0  
3917             }
3918 0         0 print "Try perl -w shatteproof.pl --help\n";
3919 0         0 exit 0;
3920             }#sub usage
3921              
3922             ### initialize_genome_hash ########################################################################
3923             # Description:
3924             # Initializes a hash to store an array for each chromosome
3925             #
3926             sub initialize_genome_hash {
3927              
3928 0     0 0 0 my %genome_region_data = ( #{chr}[region_num]->%region_data
3929             X => [],
3930             Y => [],
3931             1 => [],
3932             2 => [],
3933             3 => [],
3934             4 => [],
3935             5 => [],
3936             6 => [],
3937             7 => [],
3938             8 => [],
3939             9 => [],
3940             10 => [],
3941             11 => [],
3942             12 => [],
3943             13 => [],
3944             14 => [],
3945             15 => [],
3946             16 => [],
3947             17 => [],
3948             18 => [],
3949             19 => [],
3950             20 => [],
3951             21 => [],
3952             22 => []
3953             );
3954              
3955 0         0 return (\%genome_region_data);
3956             }#sub initialize_genome_hash
3957              
3958             ### load_config_file #########################################################################################
3959             # Description:
3960             # Opens the config file and reads the parameter values from it
3961             #
3962             # Input variables:
3963             # $path: path to config file
3964             #
3965             sub load_config_file {
3966            
3967 8     8 0 1730 my $path = shift;
3968 8         520 print "\nLoading configuration file";
3969              
3970             #Load the configuration file config.pl
3971 8         19 my $CONFIG;
3972 8 50       386 open($CONFIG, "<","$path") or die "COULD NOT OPEN CONFIG FILE at path: $path \n";
3973 8         1009 eval (<$CONFIG>) while (!eof($CONFIG));
3974 8         93 close($CONFIG);
3975              
3976 8         728 print " - Done\n";
3977 8         87 1;
3978             }#sub load_config_file
3979              
3980             =head1 NAME
3981              
3982             ShatterProof - a script for analyzing next-generation sequencing data
3983              
3984             =head1 SYNOPSIS
3985              
3986             use Shatterproof
3987              
3988             See "shatterproof.pl" in the scripts directory for a simple perl script which calls the ShatterProof module
3989              
3990             Call ShatterProof via:
3991              
3992             ShatterProof::run(\@ARGV);
3993              
3994             =head1 DESCRIPTION
3995              
3996             ShatterProof is a tool that can be used to analyze next generation sequencing data for signs of chromothripsis. ShatterProof is implemented as a Perl module that processes input files and produces output files in both tab-delimited and YAML format. Perl version 5.0 or greater is required to run ShatterProof. Link to publication will be posted soon.
3997              
3998             =head1 README
3999              
4000             =head2 Installing ShatterProof
4001              
4002             To install this module type the following:
4003              
4004             perl Makefile.PL
4005             make
4006             make test
4007             make install
4008              
4009             Make sure that you have admin permission rights when running the previous commands.
4010              
4011             =head2 Input File Types
4012              
4013             ShatterProof bases its analysis of genomic data on calls of translocations, copy number variations (CNV), loss of heterozygosity (LOH) and insertions.
4014             ShatterProof can takes as input 4 different types of input files. See the scripts/conversion_scripts directory for some Perl scripts which will convert some common tools' output to the required input formats.
4015              
4016             =head3 Translocation Input Files (.spt)
4017              
4018             Tab delimited columns
4019             First line is header line:
4020             #chr1 start end chr2 start end quality
4021              
4022             Example data entry line:
4023              
4024             1 1000 2000 4 4000 5000 78
4025              
4026             If no value is available for quality, use a "." eg.:
4027              
4028             1 1000 2000 4 4000 5000 .
4029              
4030              
4031             =head3 Copy-Number Input Files (.spc)
4032              
4033             Tab delimited columns
4034             First line is header line:
4035             #chr start end number quality
4036              
4037              
4038             Example data entry line:
4039             12 2000 3000 2 63
4040              
4041             If no value is available for quality, use a "." eg.:
4042              
4043             12 2000 3000 2 .
4044              
4045             =head3 Loss of Heterozygozity Input Files (.spl)
4046              
4047             Tab delimited columns
4048             First line is header line:
4049             #chr start end quality
4050              
4051              
4052             Example data entry line:
4053              
4054             12 2000 3000 63
4055              
4056             If no value is available for quality, use a "." eg.:
4057              
4058             12 2000 3000 .
4059              
4060             =head3 Insertion Input Files (.vcf)
4061              
4062             Additionally, ShatterProof accepts insertion calls in VCF files as input. See http://www.1000genomes.org/node/101 for details on the VCF file format.
4063             ShatterProof analyzes the CHROM and POS fields of these files.
4064            
4065              
4066             =head2 Configuring ShatterProof
4067              
4068             See the config.pl file in the scripts directory for a sample ShatterProof configuration file.
4069              
4070             $bin_size: number (integer) of base pairs to include in each bin of the sliding window analysis
4071              
4072             $localization_window_size: number (integer) of bins to include in each window of the sliding window analysis
4073              
4074             $expected_mutation_density: a reference value (double) used in determining if the concentration of translocation events on a particular chromosome is higher than expected.
4075              
4076             $collapse_regions:
4077              
4078             flag variable
4079              
4080             value 1: merge overlapping CNV regions that have the same copy number
4081              
4082             value 0: do not merge overlapping CNV regions that have the same copy number. If such regions are found an error is thrown
4083              
4084             $outlier_deviations: the number of standard deviations away from the mean a value has to be in order to be considered non-significant. Used to identify highly mutated regions.
4085              
4086             $translocation_cut_off_count: the maximum number of translocation chromosomes to tolerate before the translocation score for a region is set to 0.
4087              
4088             $genome_localization_weight: weight given to the localization of mutations to one chromosome hallmark
4089              
4090             $chromosome_localization_weight: weight given to the localization of mutations to one area of a particular chromosome hallmark
4091              
4092             $cnv_weight: weight given to the concentrated CNV hallmark
4093              
4094             $translocation_weight: weight give to the concentrated translocations hallmark
4095              
4096             $insertion_breakpoint_weight: weight given the the short breakpoint insertions hallmark
4097              
4098             $loh_weight: weight given to the loss/retention of heterozygosity hallmark
4099              
4100             $tp53_mutated_weight: weight given to the TP53 mutation hallmark
4101              
4102              
4103             =head2 Running ShatterProof
4104              
4105             From the scripts directory run execute the shatterproof.pl file using Perl.
4106              
4107             Main Usage:
4108              
4109             perl -w shatterproof.pl --cnv --trans [--insrt ] [--loh ] [--tp53] --config --output
4110              
4111             Arguments:
4112              
4113             --cnv Define the path to the directory containing the CNV input files
4114              
4115             --trans Define the path to the directory containing the Translocation input files
4116              
4117             --insrt Define the path to the directory containing the insertion VCF input files
4118              
4119             --loh Define the path to the directory containing the LOH input files
4120              
4121             --tp53 Indicate that TP53 should be considered mutated, regardless of data
4122              
4123             --config Define the path to the ShatterProof config file
4124              
4125             --output Define the path to the directory where output should be placed
4126              
4127             dir Path to a directory
4128              
4129             path Path to a file
4130              
4131             =head1 PREREQUISITES
4132              
4133             strict;
4134             warnings;
4135             Carp;
4136             Switch;
4137             File::Basename;
4138             List::Util qw[min max];
4139             Statistics::Distributions;
4140             POSIX
4141              
4142             =pod OSNAMES
4143              
4144             any
4145              
4146             =pod SCRIPT CATEGORIES
4147              
4148             CPAN
4149              
4150             =cut
4151              
4152             1;
4153             __END__