File Coverage

blib/lib/Shatterproof.pm

Criterion	Covered	Total	%
statement	797	1818	43.8
branch	174	490	35.5
condition	42	180	23.3
subroutine	34	51	66.6
pod	0	27	0.0
total	1047	2566	40.8

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/local/bin/perl
2							#The ShatterProof package is copyright (c) 2013 Ontario Institute for Cancer Research (OICR).
3							#
4							#This package and its accompanying libraries is free software; you can redistribute it and/or modify it under the terms of the GPL (either version 1, or at your option, any later version) or the Artistic License 2.0. Refer to LICENSE for the full license text.
5
6							#OICR makes no representations whatsoever as to the SOFTWARE contained herein. It is experimental in nature and is provided WITHOUT WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE OR ANY OTHER WARRANTY, EXPRESS OR IMPLIED. CSHL MAKES NO REPRESENTATION OR WARRANTY THAT THE USE OF THIS SOFTWARE WILL NOT INFRINGE ANY PATENT OR OTHER PROPRIETARY RIGHT.
7
8							#By downloading this SOFTWARE, your Institution hereby indemnifies OICR against any loss, claim, damage or liability, of whatsoever kind or
9							#nature, which may arise from your Institution's respective use, handling or storage of the SOFTWARE.
10
11							#If publications result from research using this SOFTWARE, we ask that the Ontario Institute for Cancer Research be acknowledged and/or credit be given to OICR scientists, as scientifically appropriate.
12
13							### Shatterproof.pm ###############################################################################
14							# ShatterProof is a tool that can be used to analyze next generation sequencing data for signs
15							# of chromothripsis.
16							# See POD at end of file for more description
17							#
18
19							### INCLUDES ######################################################################################
20	9			9		384330	use strict;
	9					21
	9					317
21	9			9		43	use warnings;
	9					12
	9					260
22	9			9		37	use Carp;
	9					18
	9					801
23
24	9			9		38	use vars qw($VERSIONS);
	9					12
	9					420
25
26	9			9		40	use feature 'switch';
	9					11
	9					873
27							#use Switch;
28	9			9		38	use File::Basename;
	9					14
	9					532
29	9			9		41	use List::Util qw[min max];
	9					12
	9					759
30	9			9		5065	use Statistics::Distributions;
	9					29320
	9					620
31	9			9		6056	use POSIX;
	9					62493
	9					61
32							###################################################################################################
33
34							### HISTORY #######################################################################################
35							# Version Date Coder Comments
36							# 0.001 2012/03/19 sgovind Versioning start point
37							# 0.002 2012/04/03 sgovind moved input validation methods and run methods
38							# from shatterproof.pl to here
39							# 0.003 2012/10/08 sgovind Updated POD
40							# 0.04 2012/11/25 sgovind Stable build before changing translocation scoring equation
41							# 0.05 2012/12/26 sgovind See change log for details
42							# 0.06 2012/12/27 sgovind Added additional documentation for new config variable
43							# 0.07 2012/12/27 sgovind Added example guide for provided sample data
44							# 0.08 2013/05/22 sgovind Added EXPORT code for test case, added minor error checking
45							# 0.09 2013/06/10 sgovind Minor changes to accommodate testing
46							# 0.10 2013/06/19 sgovind Minor changes to accommodate testing
47							# 0.11 2013/06/24 sgovind Changed all sorts to stable sorts. Reduced number of posix
48							# calculations.
49							# 0.12 2013/06/28 sgovind Changed sort order of interchromosomal translocation output
50							# 0.13 2013/07/16 sgovind Corrected logical error in calculate_loh_score
51
package Shatterproof;

# The version has to be declared AFTER the package statement so that it is
# stored in $Shatterproof::VERSION, which is where Shatterproof->VERSION and
# "use Shatterproof <version>" look for it.
our $VERSION = '0.14';

# Earlier releases declared the version before the package statement, which
# set $main::VERSION instead. That assignment is kept for backward
# compatibility with anything that may have relied on it.
$main::VERSION = $VERSION;

use Exporter;
our @ISA = 'Exporter';

# Package variables exported by default (used by the test cases, see the
# 0.08 history entry). Exporter exports the *package* variables of these
# names.
# NOTE(review): within this part of the file only $bin_size and
# $localization_window_size are declared with "our"; confirm where the other
# exported package variables are populated.
our @EXPORT = qw(
    $bin_size
    $genome_cnv_data_hash_ref
    $chromosome_copy_number_count_hash_ref
    $chromosome_cnv_breakpoints_hash_ref
    $tp53_mutation_found
    $genome_trans_data_hash_ref
    $chromosome_translocation_count_hash_ref
    $genome_trans_breakpoints_hash_ref
    $genome_mutation_density_hash_ref
    $suspect_regions_array_ref
    $likely_regions_array_ref
    $genome_cnv_data_windows_hash_ref
    $genome_trans_data_windows_hash_ref
    $genome_mutation_data_windows_hash_ref
    $localization_window_size
);

### Global Variables ##############################################################################
my $pos = 0;    #index of the command line argument currently being parsed
my $ARGC;       #stores the number of command line arguments provided

my %chromosome_length = (    #stores the sequence length of each chromosome
    X  => 154913754,
    Y  => 57741652,
    1  => 247199719,
    2  => 242751149,
    3  => 199446827,
    4  => 191263063,
    5  => 180837866,
    6  => 170896993,
    7  => 158821424,
    8  => 146274826,
    9  => 140442298,
    10 => 135374737,
    11 => 134452384,
    12 => 132289534,
    13 => 114127980,
    14 => 106360585,
    15 => 100338915,
    16 => 88822254,
    17 => 78654742,
    18 => 76117153,
    19 => 63806651,
    20 => 62435965,
    21 => 46944323,
    22 => 49528953
);

my $TP53_start = 1000000*7.57;    #start base pair of the TP53 gene
my $TP53_end   = 1000000*7.59;    #end base pair of the TP53 gene

my $insertion_data_present = 0;   #set to 1 by validate_input when --insrt is given
my $LOH_data_present       = 0;   #set to 1 by validate_input when --loh is given


#The values for the following variables are defined in the config.pl file
our $bin_size;    #number of base pairs that will be compressed into 1 region when analyzing the genome
                  #this value defines how many base pairs are included in one array element in the data_hash_ref variables

our $localization_window_size;    #number of regions to sum together when performing sliding window analysis of the genome

our $expected_mutation_density;         #the expected mutation density of translocations in a highly mutated region
                                        #used to calculate spread factor of translocations
our $low_mutation_density_threshold;    #the mutation density that will be used to call likely regions

our $collapse_regions;    #flag variable
                          #value 1: merge overlapping CNV regions that have the same copy number
                          #value 0: do not merge overlapping CNV regions that have the same copy number. If such
                          #         regions are found an error is thrown

our $outlier_deviation;    #the number of standard deviations away from the mean a value has to be in order to be considered non-significant

our $translocation_cut_off_count;    #the max number of translocation chromosomes that will be tolerated before translocation score is set to 0

our $chromosome_localization_weight;    #the scoring formula weight given to the localization of mutations to a specific region on the chromosome
our $genome_localization_weight;        #the scoring formula weight given to the localization of mutations to a specific chromosome
our $cnv_weight;                        #the scoring formula weight given to the aberrant CNV hallmark
our $translocation_weight;              #the scoring formula weight given to the localization of translocations
our $insertion_breakpoint_weight;       #the scoring formula weight given to the number of insertions found at translocation breakpoints
our $loh_weight;                        #the scoring formula weight given to the amount of heterozygosity that is retained in a mutated region
our $tp53_mutated_weight;               #the scoring formula weight given to the presence or absence of a TP53 mutation
139
140							### SUB METHODS ###################################################################################
141
#=head2 Sub-Method: run

### run ###########################################################################################
# Description:
#	Main method called by shatterproof.pl
#	Validates the command line, loads the config file, locates the input files and then
#	calls the primary sub methods in order: CNV, translocation, (optional) insertion and
#	(optional) LOH analysis, followed by the localization, region analysis and summary steps.
#	Progress is echoed to STDOUT. Dies with an "ERROR..." message if the config file cannot
#	be loaded, no CNV/translocation input files are found, the output directory cannot be
#	created, or the LOH breakpoint list of a chromosome has an odd length.
#
# Input variables:
#	$argv_ref: reference to @ARGV

#=cut
sub run {

    my $argv_ref = shift;        #parse parameters
    my @argv     = @{$argv_ref}; #work on a copy of the argument list

    my $cnv_directory;       #stores the path to the directory where the CNV input files are found
    my $trans_directory;     #stores the path to the directory where the translocation input files are found
    my $insertion_directory; #stores the path to the directory where the insertion input files are found
    my $loh_directory;       #stores the path to the directory where the loss of heterozygosity input files are found
    my $output_directory;    #stores the path to the directory where output files will be placed

    my $config_file_path;    #stores the path to the configuration file

    my $tp53_mutated        = 0; #flag variable to indicate if the TP53 gene should be considered to be mutated
    my $tp53_mutation_found = 0; #flag variable to indicate if a mutation was found in the TP53 region. This does not affect scoring

    my @cnv_files;       #list of CNV input files
    my @trans_files;     #list of translocation input files
    my @insertion_files; #list of insertion input files
    my @loh_files;       #list of LOH input files


    my $chromosome_copy_number_count_hash_ref; #hash
        #key1: chromosome eg. 1,2,X,Y
        #key2: copy-number state eg. 0,1,3,20
        #value: number of regions with copy number key2

    my $chromosome_cnv_breakpoints_hash_ref; #hash
        #key: chromosome eg. 1,2,X,Y
        #value: an array storing the start and end points of all cnv regions on key

    my $chromosome_translocation_count_hash_ref; #hash
        #key1: chromosome eg. 1,2,X,Y
        #key2: chromosome eg. 1,2,X,Y
        #value: number of translocations between key1 and key2

    my $chromosome_insertion_count_hash_ref; #hash
        #key: chromosome eg. 1,2,X,Y
        #value: number of insertions on key

    my $chromosome_loh_breakpoints_hash_ref; #hash
        #key: chromosome eg. 1,2,X,Y
        #value: an array storing the start and end points of all loh regions on key


    my $genome_trans_breakpoints_hash_ref; #hash
        #key: chromosome eg. 1,2,X,Y
        #value: an array storing all the translocation breakpoints on key

    my $genome_trans_insertion_breakpoints_hash_ref; #hash
        #key: chromosome eg. 1,2,X,Y
        #value: an array storing only the translocation breakpoints on key that have an insertion within 10 base pairs

    my $genome_mutation_density_hash_ref; #hash
        #key: chromosome eg. 1,2,X,Y
        #value: the total number of mutations on key divided by the sequence length of key


    my $genome_cnv_data_hash_ref = initialize_genome_hash(); #hash {key1}[index]{key2}
        #key1: chromosome eg. 1,2,X,Y
        #value: an array storing references to hashes which contain information about the
        #       CNVs in each region of key1. The index of the array indicates the region
        #key2: 'BPcount' -> gives the number of CNV breakpoints in the region.
        #      a number, eg. '1' -> gives the number of subregions within the region that
        #      have a copy number of 1

    my $genome_trans_data_hash_ref = initialize_genome_hash(); #hash {key1}[index]{key2}{key3}
        #key1: chromosome eg. 1,2,X,Y
        #value1:an array storing references to hashes which contain information about the
        #       translocations in each region of key1. The index of the array indicates the region.
        #key2: 'BPcount' -> gives the number of translocation breakpoints in the region
        #      'in' -> gives a reference to a hash that contains information about translocations
        #              into the region
        #      'out' -> gives a reference to a hash that contains information about translocations
        #               out of the region
        #key3: chromosome eg. 1,2,X,Y
        #value2:the number of subregions in the region that were translocated to key1 from key3 if key2 = 'in'
        #       or
        #       the number of subregions in the region that were translocated from key1 to key3 if key2 = 'out'

    my $genome_insertion_data_hash_ref = initialize_genome_hash(); #hash {key1}[index]
        #key1: chromosome eg. 1,2,X,Y
        #value: an array storing the number of insertions in each region of key1
        #       The index of the array indicates the region


    my $genome_cnv_data_windows_hash_ref; #hash {key1}[index]
        #key1: chromosome eg. 1,2,X,Y
        #value: an array storing the number of CNV breakpoints in each window of the genome.
        #       Each window begins at the region indicated by the array index

    my $genome_trans_data_windows_hash_ref; #hash {key1}[index]
        #key1: chromosome eg. 1,2,X,Y
        #value: an array storing the number of translocation breakpoints in each window of the genome.
        #       Each window begins at the region indicated by the array index


    my $genome_mutation_data_windows_hash_ref; #hash {key1}[index]
        #key1: chromosome eg. 1,2,X,Y
        #value: an array storing the total number of mutation breakpoints in each window of the genome.
        #       Each window begins at the region indicated by the array index

    my $suspect_regions_array_ref; #reference to array that stores regions where chromothripsis most likely occurred. Format: chr start end
    my $likely_regions_array_ref;  #reference to array that stores regions where chromothripsis may have occurred. Format: chr start end


    #Validate input arguments and parse them to the correct variables
    validate_input(\@argv, \$cnv_directory, \$trans_directory, \$insertion_directory, \$loh_directory, \$tp53_mutated, \$output_directory, \$config_file_path);

    #Load the values from the config file
    if(load_config_file($config_file_path) != 1){
        die("ERROR - could not load config file\n");
    }

    print "CNV dir:\t$cnv_directory\n";
    print "Trans dir:\t$trans_directory\n";

    if(defined($insertion_directory)){
        print "insertion dir:\t$insertion_directory\n";
    }

    if(defined($loh_directory)){
        print "LOH dir:\t$loh_directory\n";
    }

    print "Output dir:\t$output_directory\n";

    print "Force TP53 Mutation:\t$tp53_mutated\n\n";

    #Get a list of files for each of the provided input directories
    @cnv_files   = glob ("$cnv_directory"."*.spc");
    @trans_files = glob ("$trans_directory"."*.spt");

    if(scalar(@cnv_files)==0 || scalar(@trans_files)==0){
        die "ERROR: no CNV or translocation input files found\n";
    }

    if(defined($insertion_directory)){
        @insertion_files = glob ("$insertion_directory"."*.vcf");
    }

    if(defined($loh_directory)){
        @loh_files = glob ("$loh_directory"."*.spl");
    }

    #Echo a list of all the input files, one file per line.
    #join() is used instead of assigning to the global list separator $" so that
    #no global state is modified (and none has to be restored afterwards).
    print "CNV files:\t" . join("\n\t\t", @cnv_files) . "\n\n";
    print "Trans files:\t" . join("\n\t\t", @trans_files) . "\n\n";

    if(scalar(@insertion_files)==0){
        print "Indel files:\t-none\n\n";
    }
    else{
        print "Indel files:\t" . join("\n\t\t", @insertion_files) . "\n\n";
    }

    if(scalar(@loh_files)==0){
        print "LOH files:\t-none\n\n";
    }
    else{
        print "LOH files:\t" . join("\n\t", @loh_files) . "\n\n";
    }

    #Create the output directory if it does not exist
    mkdir ($output_directory, 0770) unless (-d $output_directory);

    #Check that the output directory exists. -d (rather than -e) is used so that a
    #regular file sitting at the output path is reported here instead of causing
    #confusing failures when output files are written later on.
    if(!(-d $output_directory)){
        die "ERROR: could not create directory: $output_directory\n";
    }

    print "\n--analyzing CNV data\n";
    ($genome_cnv_data_hash_ref, $chromosome_copy_number_count_hash_ref, $chromosome_cnv_breakpoints_hash_ref) = analyze_cnv_data($output_directory, \@cnv_files, $bin_size, \$tp53_mutation_found);
    print "---done analyzing CNV data\n\n";

    print "--analyzing translocation data\n";
    ($genome_trans_data_hash_ref, $chromosome_translocation_count_hash_ref, $genome_trans_breakpoints_hash_ref) = analyze_trans_data($output_directory, \@trans_files, $bin_size, \$tp53_mutation_found);
    print "---done analyzing translocation data\n\n";

    #If insertion data was provided then analyze it
    if(defined($insertion_directory)){
        print "--analyzing insertion data\n";
        ($genome_insertion_data_hash_ref, $chromosome_insertion_count_hash_ref, $genome_trans_insertion_breakpoints_hash_ref) = analyze_insertion_data($output_directory, \@insertion_files, $bin_size, $genome_trans_breakpoints_hash_ref, \$tp53_mutation_found);
        print "---done analyzing insertion data\n\n";
    }

    #Delete intermediate storage (only needed by the insertion analysis above)
    %$genome_trans_breakpoints_hash_ref = ();
    undef $genome_trans_breakpoints_hash_ref;

    #If LOH data was provided then analyze it
    if(defined($loh_directory)){
        print "--analyzing LOH data\n";
        ($chromosome_loh_breakpoints_hash_ref) = analyze_loh_data($output_directory, \@loh_files, \$tp53_mutation_found);
        print "---done analyzing LOH data\n\n";

        #Check that the correct format of the LOH hash has been preserved:
        #breakpoints are stored as start/end pairs, so every list must have an even length
        my %loh_hash = %{$chromosome_loh_breakpoints_hash_ref};
        for my $chr (keys %loh_hash){
            my $breakpoint_count = scalar @{$loh_hash{$chr}};

            if($breakpoint_count % 2 != 0){
                die "ERROR: odd number of loh breakpoints recorded for chromosome $chr\n";
            }
        }

    }

    print "--calculating chromosome mutation densities\n";
    $genome_mutation_density_hash_ref = calculate_genome_localization($output_directory, $chromosome_copy_number_count_hash_ref, $chromosome_translocation_count_hash_ref);
    print "---done calculating chromosome mutation densities\n\n";

    print "--calculating chromosome region mutation densities\n";
    ($suspect_regions_array_ref, $likely_regions_array_ref, $genome_cnv_data_windows_hash_ref, $genome_trans_data_windows_hash_ref, $genome_mutation_data_windows_hash_ref) = calculate_chromosome_localization($output_directory, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $bin_size, $localization_window_size);
    print "---done calculating chromosome region mutation densities\n\n";

    print "--analyzing suspect regions\n";
    analyze_suspect_regions($output_directory, $suspect_regions_array_ref, $genome_mutation_density_hash_ref, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $bin_size, $localization_window_size, $tp53_mutated, $tp53_mutation_found, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref);
    print "---done analyzing suspect regions\n\n";

    print "--analyzing likely regions\n";
    analyze_likely_regions($output_directory, $likely_regions_array_ref, $genome_mutation_density_hash_ref, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $bin_size, $localization_window_size);
    print "---done analyzing likely regions\n\n";

    print "--calculating copy number count\n";
    check_copy_number_count($output_directory, $chromosome_copy_number_count_hash_ref);
    print "---done calculating copy number count\n\n";

    print "--calculating switch count\n";
    check_copy_number_switches($output_directory, $chromosome_copy_number_count_hash_ref);
    print "---done calculating switch count\n\n";

    print "--calculating interchromosomal translocation rate\n";
    calculate_interchromosomal_translocation_rate($output_directory, $chromosome_translocation_count_hash_ref);
    print "---done calculating interchromosomal translocation rate\n";

}#sub run
393
#=head2 Sub-Method: validate_input

### validate_input ################################################################################
# Description:
#	Validates command line arguments and stores the parsed values through the supplied
#	references. The options are expected in this fixed order:
#		--cnv <dir> --trans <dir> [--insrt <dir>] [--loh <dir>] [--tp53] --config <file> --output <dir>
#	Every directory value is normalised so that it ends with a '/'.
#	usage(<code>) is called when the input is invalid (no arguments, a single argument other
#	than --help, a missing mandatory option, or trailing arguments); man_text() is called for
#	a lone --help. next_arg(<code>) is called to move to the next argument.
#	Also sets the file-level flags $insertion_data_present / $LOH_data_present and $ARGC.
#
# Input variables:
#	$argv_ref: reference to @ARGV
#	$cnv_directory_ref: reference to variable storing the CNV input directory
#	$trans_directory_ref: reference to variable storing the translocation input directory
#	$insertion_directory_ref: reference to variable storing the insertion input directory
#	$loh_directory_ref: reference to variable storing the LOH input directory
#	$tp53_mutated_ref: reference to variable storing the tp53 mutated flag
#	$output_directory_ref: reference to variable storing the output directory
#	$config_file_path_ref: reference to variable storing the path to the config file

#=cut
sub validate_input {

    #Parse parameters
    my $argv_ref = shift;
    my @argv = @{$argv_ref};

    my $cnv_directory_ref       = shift;
    my $trans_directory_ref     = shift;
    my $insertion_directory_ref = shift;
    my $loh_directory_ref       = shift;
    my $tp53_mutated_ref        = shift;
    my $output_directory_ref    = shift;
    my $config_file_path_ref    = shift;

    #Appends a '/' to a directory path unless it already ends with one
    my $with_trailing_slash = sub {
        my $path = shift;
        if(!(substr($path,-1,1) eq '/')){
            $path = $path.'/';
        }
        return $path;
    };

    #Determine number of command line arguments
    $ARGC = @argv;

    #Always start parsing at the first argument. $pos is a file-level variable, so
    #without this reset a second call would continue from a stale position.
    $pos = 0;

    #Parse the command line arguments.
    #A plain if/elsif chain is used instead of given/when, which is an experimental
    #feature that triggers warnings on modern perls.
    if($ARGC == 0){ #Print error message if no arguments were entered
        usage(0);
    }
    elsif($ARGC == 1){ #Check for help option
        if($argv[0] eq "--help"){
            man_text();
        }
        else{
            usage(1);
        }
    }
    else{

        if($argv[$pos] eq "--cnv"){ #Check for the cnv input directory option, this field is mandatory
            next_arg(2);
            $$cnv_directory_ref = $with_trailing_slash->($argv[$pos]);
            next_arg(3);
        }
        else {
            usage(4);
        }


        if($argv[$pos] eq "--trans"){ #Check for the translocation input directory option, this field is mandatory
            next_arg(5);
            $$trans_directory_ref = $with_trailing_slash->($argv[$pos]);
            next_arg(6);
        }
        else {
            usage(7);
        }

        if($argv[$pos] eq "--insrt"){ #Check for the insertion input directory option
            next_arg(8);
            $$insertion_directory_ref = $with_trailing_slash->($argv[$pos]);
            $insertion_data_present = 1;
            next_arg(9);
        }

        if($argv[$pos] eq "--loh"){ #Check for the LOH input directory option
            next_arg(10);
            $$loh_directory_ref = $with_trailing_slash->($argv[$pos]);
            $LOH_data_present = 1;
            next_arg(11);
        }

        if($argv[$pos] eq "--tp53"){ #Check for the TP53 gene mutation check option
            $$tp53_mutated_ref = 1;
            next_arg(12);
        }

        if($argv[$pos] eq "--config"){ #Check for the config file option, this field is mandatory
            next_arg(13);
            $$config_file_path_ref = $argv[$pos];
            next_arg(14);
        }
        else{
            usage(15);
        }

        if($argv[$pos] eq "--output"){ #Check for the output directory option, this field is mandatory
            next_arg(16);
            $$output_directory_ref = $with_trailing_slash->($argv[$pos]);
        }
        else {
            usage(17);
        }

        #Check that there are no other command line arguments
        if($pos != $ARGC-1){
            usage(18);
        }
    }#two or more arguments

}#sub validate_input
521
522							#=head2 Sub-Method: analyze_cnv_data
523
524							### analyze_cnv_data ##############################################################################
525							# Description:
526							# Reads data from files located in the CNV input directory and populates:
527							# $genome_cnv_data_hash_ref
528							# $chromosome_copy_number_count_hash_ref
529							# $chromosome_cnv_breakpoints_hash_ref
530							#
531							# Input variables:
532							# $output_directory: stores the path to the output directory
533							# $cnv_files_array_ref: reference to array containing all the CNV input files
534							# $bin_size: stores the size of the bins which the chromosome will be divided into
535							# $tp53_mutation_found_ref: reference to the tp53 mutation found flag
536
537							#=cut
538							sub analyze_cnv_data {
539
540							#Parse the parameters
541	11			11	0	16050574	my $output_directory = shift;
542
543	11					22	my $cnv_files_array_ref = shift;
544	11					26	my @cnv_files = @$cnv_files_array_ref;
545
546	11					21	my $bin_size = shift;
547	11					18	my $tp53_mutation_found_ref = shift;
548
549	11					19	my %genome_cnv_data = (); #hash
550							#key: chromosome eg. 1,2,X,Y
551							#value: a reference to an array where each element corresponds to a bin along the
552							# chromosome
553
554	11					19	my @file_data; #an array storing all the entries from every input file
555
556							my $CURRENT_FILE; #file handle to the current file that is open
557	0					0	my $TP53_FILE; #file handle to the TP53 CNV mutation output file
558
559	0					0	my $line; #stores raw line read in from file
560	0					0	my @line_data; #stores tokenized line read in from file
561
562	11					22	my %chromosome_copy_number_count = (); #hash {chr}{copy number}{count}
563							#key1: chromosome eg. 1,2,X,Y
564							#key2: a copy-number state eg 0,1,3,15
565							#value: the number of region on key1 that have a copy number of key2
566
567	11					205	my %chromsome_cnv_breakpoints = ( #hash {chr}[start and end pairs]
568							X => [], #key: chromosome eg. 1,2,X,Y
569							Y => [], #value: an array that stored an ordered list of CNV breakpoints on key
570							1 => [],
571							2 => [],
572							3 => [],
573							4 => [],
574							5 => [],
575							6 => [],
576							7 => [],
577							8 => [],
578							9 => [],
579							10 => [],
580							11 => [],
581							12 => [],
582							13 => [],
583							14 => [],
584							15 => [],
585							16 => [],
586							17 => [],
587							18 => [],
588							19 => [],
589							20 => [],
590							21 => [],
591							22 => []
592							);
593
594							#Read the contents of the cnv files into memory
595	11	100				50	if($#cnv_files==-1){
596	1					7	die "ERROR: no cnv files found in analyze_cnv_data\n";
597							}
598
599	10					28	foreach my $file (@cnv_files){
600							#Open the file
601	16	100				559	open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";
602
603							#Check that the file is not empty
604	15	50				6695	if(eof($CURRENT_FILE)){
605	0					0	close ($CURRENT_FILE);
606	0					0	die "ERROR: $file is empty\n";
607							}
608
609							#Read header line and validate
610	15					58	$line = <$CURRENT_FILE>;
611	15					29	chomp($line);
612
613							#Check that the format of the header line is correct
614	15	100				92	if(!($line =~ m/^#chr\tstart\tend\tnumber\tquality$/)){
615	1					14	close ($CURRENT_FILE);
616	1					38	die "ERROR: header of cnv file $file is invalid\n";
617							}
618
619							#Read all the data lines in the file
620	14					40	while( !(eof($CURRENT_FILE)) ){
621							#read data line
622	836					1273	$line = <$CURRENT_FILE>;
623	836					643	chomp($line);
624
625							#Validate the data line
626	836	100				2702	if(!($line =~ m/^(chr)?(1[0-9]\|2[0-2]\|X\|Y\|[1-9])\t[0-9]+\t[0-9]+\t[0-9]+\t([0-9]+\|\.)$/)){
627	1					32	die "ERROR: invalid line found ($line) in file $file\n";
628							}
629
630							#Split the data line and add it to the file_data array
631	835					2515	@line_data = (split (/\t/,$line));
632
633	835					2396	push(@file_data,[@line_data]);
634							}
635
636	13					155	close ($CURRENT_FILE);
637
638							}#foreach my $file (@cnv_files)
639
640							#Check that there are no overlapping CNV regions with different copy-numbers
641	7					40	@file_data = check_for_overlaps("cnv", \@file_data);
642
643
644							#Create TP53 directory and output folder
645	7	100				963	mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
646	7	50				115	if(!(-e "$output_directory"."TP53")){
647	0					0	die "ERROR: could not create folder $output_directory"."TP53\n";
648							}
649
650							#Create the TP53 CNV mutation file
651	7	50				674	open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spc") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spc";
652
653							#Print the header of the file (same format as a .spc file)
654	7					134	print $TP53_FILE "#chr\tstart\tend\tnumber\tquality";
655
656							#For every data line that was read in from an input file,
657							#record the CNV mutation in the genome_cnv_data_hash,
658							#record the exact breakpoints of the CNV,
659							#update the chromosome_copy_number_count hash and
660							#check if the CNV is in the TP53 region
661	7					42	for (my $n = 0; $n < scalar(@file_data); $n++){
662	739					829	my $hash = {};
663
664							#Ensure that the chromosome value is valid
665	739	50				3693	if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]\|2[0-2]\|X\|Y\|[1-9])$/)){
666	0					0	die "ERROR: invalid line found in CNV input file: @{$file_data[$n]}\n";
	0					0
667							}
668
669							#Parse the chromosome
670	739					1259	my $chr = $2;
671
672							#Increment the copy-number count hash based on the line data
673	739					1616	$chromosome_copy_number_count {$chr}{$file_data[$n][3]}++;
674
675							#Record the exact breakpoints of the CNV
676	739					637	push (@{$chromsome_cnv_breakpoints{$chr}}, ($file_data[$n][1],$file_data[$n][2]));
	739					2005
677
678							#Calculate the bin for the start and end breakpoint
679	739					1289	my $start_index = int($file_data[$n][1]/$bin_size);
680	739					946	my $end_index = int($file_data[$n][2]/$bin_size);
681
682							my $update_bin = sub {
683	0			0		0	my $source_chr = shift;
684	0					0	my $index = shift;
685	0					0	my $copy_num = shift;
686
687	0					0	my $genome_hash_ref = shift;
688	0					0	my %genome_hash = %{$genome_hash_ref};
	0					0
689
690	0					0	my $hash = ();
691
692	0	0				0	if(!defined(@{$genome_hash{$source_chr}}[$index])){
	0					0
693	0					0	$hash->{'BPcount'} = 1;
694	0					0	$hash->{$copy_num} = 0.5;
695	0					0	@{$genome_hash{$source_chr}}[$index] = $hash;
	0					0
696							}
697							else{ #if a bin does exist then increment the counts
698	0					0	${@{$genome_hash{$source_chr}}[$index]}{'BPcount'}++;
	0					0
	0					0
699	0					0	${@{$genome_hash{$source_chr}}[$index]}{$copy_num}+=0.5;
	0					0
	0					0
700							}
701	739					3330	};
702
703							#Check if a bin exists at the start index
704							#if not, create one
705	739	50				690	if(!defined(@{$genome_cnv_data{$chr}}[$start_index])){
	739					1428
706	739					1053	$hash->{'BPcount'} = 1;
707	739					986	$hash->{$file_data[$n][3]} = 0.5;
708	739					621	@{$genome_cnv_data{$chr}}[$start_index] = $hash;
	739					84957
709							}
710							else{ #If one does exist increment the counts
711	0					0	${@{$genome_cnv_data{$chr}}[$start_index]}{'BPcount'}++;
	0					0
	0					0
712	0					0	${@{$genome_cnv_data{$chr}}[$start_index]}{$file_data[$n][3]}+=0.5;
	0					0
	0					0
713							}
714
715	739					1268	$hash = {};
716
717							#Check if a bin exists at the end index
718							#if not, create one
719	739	100				693	if(!defined(@{$genome_cnv_data{$chr}}[$end_index])){
	739					1432
720	738					1158	$hash->{'BPcount'} = 1;
721	738					1049	$hash->{$file_data[$n][3]} = 0.5;
722	738					646	@{$genome_cnv_data{$chr}}[$end_index] = $hash;
	738					4641
723							}
724							else{ #If one does exist increment the counts
725	1					2	${@{$genome_cnv_data{$chr}}[$end_index]}{'BPcount'}++;
	1					3
	1					3
726	1					2	${@{$genome_cnv_data{$chr}}[$end_index]}{$file_data[$n][3]}+=0.5;
	1					1
	1					6
727							}
728
729
730							#Check if the variation was in the TP53 gene
731	739	100	100			8074	if(
			100
732							( $chr ne 'X' && $chr ne 'Y' ) &&
733							( $chr==17 )
734							){
735	37	100	66			622	if(
			33
736							( $file_data[$n][3] != 2 ) &&
737							( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) \|\| ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) )
738							){
739							#If a CNV was found in the TP53 region
740	1					2	$$tp53_mutation_found_ref = 1;
741
742							#Record the mutation in the TP53 CNV file
743	1					3	print $TP53_FILE "\n";
744	1					2	for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
	6					22
745	5					36	print $TP53_FILE "$file_data[$n][$i]";
746	5	100				3	if($i != scalar(@{$file_data[$n]})-1){
	5					13
747	4					5	print $TP53_FILE "\t";
748							}#if
749							}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
750							}#if
751							}#if
752							}#for (my $n = 0; $n < scalar(@file_data); $n++)
753
754	7					1952	close($TP53_FILE);
755
756							#return hash
757	7					562	return(\%genome_cnv_data, \%chromosome_copy_number_count, \%chromsome_cnv_breakpoints);
758
759							}#sub analyze_cnv_data
760
761							#=head2 Sub-Method: check_for_overlaps
762
763							### check_for_overlaps ############################################################################
764							# Description:
765							# Checks if there were overlapping CNV regions with different copy-numbers in the input files.
766							# Also checks if there are any overlapping translocation destinations or overlapping LOH
767							# regions.
768							#
769							# Input variables:
770							# $type: flag variable indicating which type of overlap to check for "cnv", "trans",
771							# or "loh"
772							# $file_data_ref: reference to an array storing all the data lines read in from the specific
773							# type of input file
774
775							#=cut
776							sub check_for_overlaps {
777
778	7			7	0	59	my $type = shift;
779	7					13	my $file_data_ref = shift;
780	7					70	my @file_data = @$file_data_ref;
781
782	7					39	my $start_overlap = 0; #Flag variable indicating if the start position of one region is overlapping with
783							#another region
784
785	7					22	my $end_overlap = 0; #Flag variable indicating if the end position of one region is overlapping with
786							#another region
787
788							#Check for overlapping regions
789							#Compares each entry in the array to every following entry
790	7					45	for (my $n = 0; $n < scalar(@file_data); $n++){
791	739					1232	for (my $k = $n+1; $k < scalar(@file_data); $k++){
792
793							#Check if the 2 regions in question are on the same chromosome
794	55656	100				124506	if($file_data[$n][0] eq $file_data[$k][0]) {
795
796							#Check if the end point of region 1 is within region 2
797	2862	100	100			7298	if($file_data[$n][2]>=$file_data[$k][1] && $file_data[$n][2]<=$file_data[$k][2]){
798	96					103	$end_overlap = 1;
799							}
800
801							#Check if the start point of region 1 is within region 2
802	2862	100	100			6389	if($file_data[$n][1]>=$file_data[$k][1] && $file_data[$n][1]<=$file_data[$k][2]){
803	78					95	$start_overlap = 1;
804							}
805
806							#If an overlap was detected
807	2862	100	100			16682	if($start_overlap==1 \|\| $end_overlap==1) {
		50	66
808
809							#if it was a translocation overlap then throw an error
810	96	50				200	if($type eq "trans"){
811	0					0	die "ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
	0					0
	0					0
812							}
813
814							#if it was a LOH overlap then throw an error
815	96	50				171	if($type eq "loh"){
816	0					0	die "ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
	0					0
	0					0
817							}
818
819							#if it was a CNV overlap check if the copy numbers are the same
820							#If they are different throw an error
821	96	50	33			597	if(
		50
		50
822							( $type eq "cnv") &&
823							( $file_data[$n][3] != $file_data[$k][3] )
824							){
825	0					0	die "ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
	0					0
	0					0
826							}
827
828							#if they are the same
829							#and the user does not wish to collapse overlapping regions with the same copy number then throw an error
830							elsif ($collapse_regions==0) {
831	0					0	die "ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
	0					0
	0					0
832							}
833
834							#if the user wishes to collapse overlapping regions with the same copy number then do so
835							elsif ($collapse_regions==1) {
836							#Region 2 completely encompasses region 1
837							#So replace region 1 with region 2
838	96	100	66			340	if($start_overlap==1 && $end_overlap==1){
		50
		50
839	78					158	$file_data[$n][1] = $file_data[$k][1];
840	78					102	$file_data[$n][2] = $file_data[$k][2];
841							}
842							#The start point of region 1 is within region 2
843							#So replace the start point of region 1 with the start point of region 2
844							elsif($start_overlap==1){
845	0					0	$file_data[$n][1] = $file_data[$k][1];
846							}
847							#The end point of region 1 is within region 2
848							#So replace the end point of region 1 with the end point of region 2
849							elsif($end_overlap==1){
850	18					41	$file_data[$n][2] = $file_data[$k][2];
851							}
852							#If region 1 was modified then remove region 2 and re-check for overlaps
853	96	50	66			377	if($start_overlap==1 \|\| $end_overlap==1){
854	96					2482	@file_data = (@file_data[0..($k-1),($k+1)..(scalar(@file_data)-1)]);
855	96					539	$start_overlap = 0;
856	96					88	$end_overlap = 0;
857	96					92	$k = $n+1;
858	96					129	redo;
859							}
860
861							}#elsif ($collapse_regions==1)
862							}#if($start_overlap==1 \|\| $end_overlap==1)
863
864							#Check if region 1 completely encompasses region 2
865							elsif($file_data[$n][1]<=$file_data[$k][1] && $file_data[$n][2]>=$file_data[$k][2]){
866
867	0	0				0	if($type eq "trans"){
868	0					0	die "ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
	0					0
	0					0
869							}
870
871	0	0				0	if($type eq "loh"){
872	0					0	die "ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
	0					0
	0					0
873							}
874
875							#If the copy numbers are different throw an error
876	0	0	0			0	if(
		0
		0
877							( $type eq "cnv") &&
878							( $file_data[$n][3] != $file_data[$k][3] )
879							){
880	0					0	die "ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
	0					0
	0					0
881							}
882
883							#If the copy numbers are the same but the user does not want to collapse then throw an error
884							elsif ($collapse_regions==0) {
885	0					0	die "ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n";
	0					0
	0					0
886							}
887
888							#If the user does wish to collapse then remove the second region
889							elsif ($collapse_regions==1) {
890	0					0	@file_data = (@file_data[0..($k-1),($k+1)..(scalar(@file_data)-1)]);
891							}
892							}#elsif($file_data[$n][1]<=$file_data[$k][1] && $file_data[$n][2]>=$file_data[$k][2])
893							}#if($file_data[$n][0] eq $file_data[$k][0])
894							}#for (my $k = $n+1; $k < scalar(@file_data); $k++)
895							}#for (my $n = 0; $n < scalar(@file_data); $n++)
896
897							#Return the updated entries
898	7					278	return (@file_data);
899
900							}#sub check_for_overlaps
901
902							#=head2 Sub-Method: analyze_trans_data
903
904							### analyze_trans_data ############################################################################
905							# Description:
906							# Reads data from files located in the trans input directory and populates:
907							# $genome_trans_data_hash_ref
908							# $chromosome_translocation_count_hash_ref
909							# $genome_trans_breakpoints_hash_ref
910							#
911							# Input variables:
912							# $output_directory: stores the path to the output directory
913							# $trans_files_array_ref: reference to array containing all the translocation
914							# input files
915							# $bin_size: stores the size of the bins which the chromosome will be divided into
916							# $tp53_mutation_found_ref: reference to the tp53 mutation found flag
917
918							#=cut
919							sub analyze_trans_data {
920
921							#Parse the parameters
922	10			10	0	8172214	my $output_directory = shift;
923	10					57	my $trans_files_array_ref = shift;
924	10					26	my @trans_files = @$trans_files_array_ref;
925
926	10					13	my $bin_size = shift;
927
928	10					14	my $tp53_mutation_found_ref = shift;
929
930	10					19	my %genome_trans_data = (); #hash
931							#key: chromosome eg. 1,2,X,Y
932							#value: a reference to an array where each element corresponds to a bin along the
933							# chromosome
934
935	10					14	my @file_data; #an array storing all the entries from every input file
936
937							my $CURRENT_FILE; #file handle to the current file that is open
938	0					0	my $TP53_FILE; #file handle to the TP53 translocation mutation output file
939
940	0					0	my $line; #stores raw line read in from file
941	0					0	my @line_data; #stores tokenized line read in from file
942
943	0					0	my $chr1;
944	0					0	my $chr2;
945
946	10					16	my %chromosome_trans_count = (); #hash {chr}{chr}{count}
947							#key1: chromosome eg. 1,2,X,Y
948							#key2: chromosome eg. 1,2,X,Y
949							#value: the number of translocations between key1 and key2
950
951	10					175	my %genome_trans_breakpoints = ( #hash {chr}[array]
952							X => [], #key: chromosome eg. 1,2,X,Y
953							Y => [], #value: an array storing all the translocation breakpoints
954							1 => [], # on key
955							2 => [],
956							3 => [],
957							4 => [],
958							5 => [],
959							6 => [],
960							7 => [],
961							8 => [],
962							9 => [],
963							10 => [],
964							11 => [],
965							12 => [],
966							13 => [],
967							14 => [],
968							15 => [],
969							16 => [],
970							17 => [],
971							18 => [],
972							19 => [],
973							20 => [],
974							21 => [],
975							22 => []
976							);
977
978
979							#Read the contents of the translocation files into memory
980	10	100				43	if($#trans_files==-1){
981	1					8	die "ERROR: no trans files found in analyze_trans_data\n";
982							}
983
984	9					23	foreach my $file (@trans_files){
985							#open the file
986	14	50				582	open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";
987
988							#check that the file is not empty
989	14	100				2187	if(eof($CURRENT_FILE)){
990	1					6	close ($CURRENT_FILE);
991	1					24	die "ERROR: $file is empty\n";
992							}
993
994							#read header line and validate
995	13					53	$line = <$CURRENT_FILE>;
996	13					25	chomp($line);
997
998							#validate the header line
999	13	100				81	if(!($line =~ m/^#chr1\tstart\tend\tchr2\tstart\tend\tquality$/)){
1000	1					11	close ($CURRENT_FILE);
1001	1					24	die "ERROR: header of translocation file $file is invalid\n";
1002							}
1003
1004							#read in every data line
1005	12					37	while( !(eof($CURRENT_FILE)) ){
1006							#read data line
1007	742					1245	$line = <$CURRENT_FILE>;
1008	742					677	chomp($line);
1009
1010							#validate the format of the data line
1011	742	100				3183	if(!($line =~ m/^(chr)?(1[0-9]\|2[0-2]\|X\|Y\|[1-9])\t[0-9]+\t[0-9]+\t(chr)?(1[0-9]\|2[0-2]\|X\|Y\|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+\|\.)$/)){
1012	1					33	die "ERROR: invalid line found in translocation input file: ($line) in file $file\n";
1013	0					0	next;
1014							}
1015
1016							#Split the data line and add it to the array
1017	741					2793	@line_data = (split (/\t/,$line));
1018
1019							#check whether the start position is greater than or equal to the end position for either the source or destination (currently a no-op: the warning and skip below are commented out)
1020	741	100	66			2797	if(
1021							( $line_data[1] >= $line_data[2] ) \|\|
1022							( $line_data[4] >= $line_data[5] )
1023							){
1024							#warn "ERROR: invalid line found ($line) in file $file. Start or end values invalid\n";
1025							#next;
1026							}
1027
1028							#Add the data line to the file_data array
1029	741					2566	push(@file_data,[@line_data]);
1030							}
1031
1032	11					104	close ($CURRENT_FILE);
1033
1034							}#foreach my $file (@trans_files)
1035
1036							#ignoring overlapping translocations for now
1037							#@file_data = check_for_overlaps("trans", \@file_data);
1038
1039							#Create TP53 directory and output folder
1040	6	100				265	mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
1041	6	50				69	if(!(-e "$output_directory"."TP53")){
1042	0					0	die "ERROR: could not create folder $output_directory"."TP53\n";
1043							}
1044
1045							#create TP53 translocation mutation file
1046	6	50				510	open ($TP53_FILE, ">", "$output_directory"."TP53/TP53.spt") or die "ERROR: could not create file: $output_directory"."TP53/TP53.spt";
1047
1048							#Print the header of the file (same format as a .spt file)
1049	6					36	print $TP53_FILE "#chr1\tstart\tend\tchr2\tstart\tend\tquality";
1050
1051							#For every data line that was read in from an input file,
1052							#record the translocation mutation in the genome_trans_data_hash,
1053							#record the exact breakpoints of the translocation,
1054							#update the chromosome_trans_count hash and
1055							#check if the translocation is in the TP53 region
1056	6					116	for (my $n = 0; $n < scalar(@file_data); $n++){
1057
1058	741					831	my $hash = {};
1059
1060							#verify that the chromosome 1 is valid
1061	741	50				3569	if(!($file_data[$n][0] =~ m/^(chr)?(1[0-9]\|2[0-2]\|X\|Y\|[1-9])$/)){
1062	0					0	die "ERROR: invalid chromosome field detected in translocation file\n";
1063							}
1064
1065							#parse out chromosome 1
1066	741					1400	my $chr1 = $2;
1067
1068							#verify that the chromosome 2 is valid
1069	741	50				1892	if(!($file_data[$n][3] =~ m/^(chr)?(1[0-9]\|2[0-2]\|X\|Y\|[1-9])$/)){
1070	0					0	die "ERROR: invalid chromosome field detected in translocation file\n";
1071							}
1072
1073							#parse out chromosome 2
1074	741					825	my $chr2 = $2;
1075
1076							#calculate the bin where each breakpoint will be placed
1077	741					1397	my $start_index1 = int($file_data[$n][1]/$bin_size);
1078	741					801	my $end_index1 = int($file_data[$n][2]/$bin_size);
1079
1080	741					950	my $start_index2 = int($file_data[$n][4]/$bin_size);
1081	741					848	my $end_index2 = int($file_data[$n][5]/$bin_size);
1082
1083							my $update_bin = sub {
1084	0			0		0	my $source_chr = shift;
1085	0					0	my $dest_chr = shift;
1086	0					0	my $index = shift;
1087	0					0	my $data = shift;
1088	0					0	my $type = shift;
1089
1090	0					0	my $genome_hash_ref = shift;
1091	0					0	my %genome_hash = %{$genome_hash_ref};
	0					0
1092
1093	0					0	my $hash = ();
1094
1095	0	0				0	if(!defined(@{$genome_hash{$source_chr}}[$index])){
	0					0
1096	0					0	$hash->{'BPcount'} = 1;
1097	0					0	push (@{$hash->{$type}{$dest_chr}}, $data);
	0					0
1098	0					0	@{$genome_hash{$source_chr}}[$index] = $hash;
	0					0
1099							}
1100							else{ #if a bin does exist then increment the counts
1101	0					0	${@{$genome_hash{$source_chr}}[$index]}{'BPcount'}++;
	0					0
	0					0
1102	0					0	push (@{${@{$genome_hash{$source_chr}}[$index]}{$type}{$dest_chr}}, $data);
	0					0
	0					0
	0					0
1103							}
1104
1105	741					2560	};
1106
1107							#check if a bin exists at $start_index1
1108							#if not, create one
1109	741	100				713	if(!defined(@{$genome_trans_data{$chr1}}[$start_index1])){
	741					1665
1110	516					741	$hash->{'BPcount'} = 1;
1111	516					446	push (@{$hash->{'out'}{$chr2}}, $file_data[$n][4]);
	516					1283
1112	516					495	@{$genome_trans_data{$chr1}}[$start_index1] = $hash;
	516					39736
1113							}
1114							else{ #if a bin does exist then increment the counts
1115	225					176	${@{$genome_trans_data{$chr1}}[$start_index1]}{'BPcount'}++;
	225					164
	225					341
1116	225					189	push (@{${@{$genome_trans_data{$chr1}}[$start_index1]}{'out'}{$chr2}}, $file_data[$n][4]);
	225					163
	225					168
	225					531
1117							}
1118
1119	741					1103	$hash = {};
1120	741	100				618	if(!defined(@{$genome_trans_data{$chr1}}[$end_index1])){
	741					1328
1121	275					405	$hash->{'BPcount'} = 1;
1122	275					214	push (@{$hash->{'out'}{$chr2}}, $file_data[$n][5]);
	275					617
1123	275					263	@{$genome_trans_data{$chr1}}[$end_index1] = $hash;
	275					1026
1124							}
1125							else{
1126	466					320	${@{$genome_trans_data{$chr1}}[$end_index1]}{'BPcount'}++;
	466					350
	466					614
1127	466					389	push (@{${@{$genome_trans_data{$chr1}}[$end_index1]}{'out'}{$chr2}}, $file_data[$n][5]);
	466					328
	466					349
	466					945
1128							}
1129
1130	741					817	$hash = {};
1131	741	100				676	if(!defined(@{$genome_trans_data{$chr2}}[$start_index2])){
	741					1168
1132	361					548	$hash->{'BPcount'} = 1;
1133	361					271	push (@{$hash->{'in'}{$chr1}}, $file_data[$n][1]);
	361					785
1134	361					332	@{$genome_trans_data{$chr2}}[$start_index2] = $hash;
	361					3532
1135							}
1136							else{
1137	380					276	${@{$genome_trans_data{$chr2}}[$start_index2]}{'BPcount'}++;
	380					399
	380					549
1138	380					346	push (@{${@{$genome_trans_data{$chr2}}[$start_index2]}{'in'}{$chr1}}, $file_data[$n][1]);
	380					255
	380					285
	380					980
1139							}
1140
1141	741					788	$hash = {};
1142	741	100				675	if(!defined(@{$genome_trans_data{$chr2}}[$end_index2])){
	741					1134
1143	160					207	$hash->{'BPcount'} = 1;
1144	160					131	push (@{$hash->{'in'}{$chr1}}, $file_data[$n][2]);
	160					360
1145	160					144	@{$genome_trans_data{$chr2}}[$end_index2] = $hash;
	160					573
1146							}
1147							else{
1148	581					448	${@{$genome_trans_data{$chr2}}[$end_index2]}{'BPcount'}++;
	581					426
	581					739
1149	581					521	push (@{${@{$genome_trans_data{$chr2}}[$end_index2]}{'in'}{$chr1}}, $file_data[$n][2]);
	581					411
	581					420
	581					1222
1150							}
1151
1152							#Increment hash translocation counts
1153	741					992	$chromosome_trans_count{$chr1}{$chr2}++;
1154							#if the translocation is intra-chromosomal then don't count it twice
1155	741	100				1294	if($chr1 ne $chr2){
1156	316					390	$chromosome_trans_count{$chr2}{$chr1}++;
1157							}
1158
1159							#store the breakpoints in the per-chromosome breakpoint arrays
1160	741					563	push (@{$genome_trans_breakpoints{$chr1}}, $file_data[$n][1]);
	741					1240
1161	741					665	push (@{$genome_trans_breakpoints{$chr1}}, $file_data[$n][2]);
	741					989
1162
1163	741					567	push (@{$genome_trans_breakpoints{$chr2}}, $file_data[$n][4]);
	741					1028
1164	741					570	push (@{$genome_trans_breakpoints{$chr2}}, $file_data[$n][5]);
	741					995
1165
1166							#Check if the translocation origin was in the TP53 gene
1167	741	100	100			3770	if(
			100
1168							( $chr1 ne 'X' && $chr1 ne 'Y' ) &&
1169							( $chr1==17 )
1170							){
1171	1	50	33			12	if(
			0
			33
1172							( ( $file_data[$n][1] >= $TP53_start && $file_data[$n][1] <= $TP53_end ) \|\| ( $file_data[$n][2] >= $TP53_start && $file_data[$n][2] <= $TP53_end ) )
1173							){
1174							#if a mutation was found, set the TP53 mutated flag
1175	1					2	$$tp53_mutation_found_ref = 1;
1176	1					3	print $TP53_FILE "\n";
1177							#print the translocation data line to the TP53 translocation mutation output file
1178	1					2	for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
	8					15
1179	7					7	print $TP53_FILE "$file_data[$n][$i]";
1180	7	100				6	if($i != scalar(@{$file_data[$n]})-1){
	7					13
1181	6					8	print $TP53_FILE "\t";
1182							}
1183							}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
1184							}#if
1185							}#if
1186
1187							#Check if the translocation destination was in the TP53 gene
1188	741	50	66			7231	if(
			66
1189							( $chr2 ne 'X' && $chr2 ne 'Y' ) &&
1190							( $chr2==17 )
1191							){
1192	0	0	0			0	if(
			0
			0
1193							( ( $file_data[$n][4] >= $TP53_start && $file_data[$n][4] <= $TP53_end ) \|\| ( $file_data[$n][5] >= $TP53_start && $file_data[$n][5] <= $TP53_end ) )
1194							){
1195	0					0	$$tp53_mutation_found_ref = 1;
1196	0					0	print $TP53_FILE "\n";
1197	0					0	for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++){
	0					0
1198	0					0	print $TP53_FILE "$file_data[$n][$i]";
1199	0	0				0	if($i != scalar(@{$file_data[$n]})-1){
	0					0
1200	0					0	print $TP53_FILE "\t";
1201							}
1202							}#for (my $i = 0; $i < scalar(@{$file_data[$n]}); $i++)
1203							}#if
1204							}#if
1205							}#for (my $n = 0; $n < scalar(@file_data); $n++)
1206
1207	6					533	close($TP53_FILE);
1208
1209							#return hash
1210	6					680	return(\%genome_trans_data, \%chromosome_trans_count, \%genome_trans_breakpoints);
1211
1212							}#sub analyze_trans_data
1213
1214
1215							#=head2 Sub-Method: analyze_insertion_data
1216
1217							### analyze_insertion_data ##############################################################################
1218							# Description:
1219							# Reads data from files located in the insertion input directory and populates:
1220							# $genome_insertion_data_hash_ref
1221							# $chromosome_insertion_count_hash_ref
1222							# $genome_trans_insertion_breakpoints_hash_ref
1223							#
1224							# Input variables:
1225							# $output_directory: stores the path to the output directory
1226							# $insertion_files_array_ref: reference to array containing all the insertion input files
1227							# $bin_size: stores the size of the bins which the chromosome will be divided into
1228							# $genome_trans_breakpoints_hash_ref: store reference to hash that contains the translocation breakpoints on
1229							# each chromosome
1230							# $tp53_mutation_found_ref: reference to the tp53 mutation found flag
1231							#
1232
1233							#=cut
1234							sub analyze_insertion_data {
1235
1236							#Parse Parameters
1237	0			0	0	0	my $output_directory = shift;
1238	0					0	my $insertion_files_array_ref = shift;
1239	0					0	my @insertion_files = @$insertion_files_array_ref;
1240
1241	0					0	my $bin_size = shift;
1242
1243	0					0	my $genome_trans_breakpoints_hash_ref = shift;
1244	0					0	my %genome_trans_breakpoints = %$genome_trans_breakpoints_hash_ref;
1245
1246	0					0	my $tp53_mutation_found_ref = shift;
1247
1248	0					0	my %genome_insertion_data = (); #hash
1249							#key: chromosome eg. 1,2,X,Y
1250							#value: a reference to an array where each element corresponds to a bin along the
1251							# chromosome
1252
1253	0					0	my $CURRENT_FILE; #file handle to the current file that is open
1254							my $TP53_FILE; #file handle to the TP53 insertion mutation output file
1255
1256	0					0	my $line; #stores raw line read in from file
1257	0					0	my @line_data; #stores tokenized line read in from file
1258
1259	0					0	my $chr;
1260
1261	0					0	my $file_name;
1262	0					0	my $path;
1263	0					0	my $suffix;
1264
1265	0					0	my $rm_insertion_file_result;
1266
1267	0					0	my $insertion_found = 0;
1268
1269	0					0	my %chromosome_insertion_count = (); #hash {chr}{count}
1270							#key: chromosome eg. 1,2,X,Y
1271							#value: the number of insertions found on key
1272
1273	0					0	my %genome_trans_insertion_breakpoints = ( #hash
1274							X => [], #key: chromosome eg. 1,2,X,Y
1275							Y => [], #value: an array storing the translocation breakpoints on key that lie within 10bps of an insertion
1276							1 => [],
1277							2 => [],
1278							3 => [],
1279							4 => [],
1280							5 => [],
1281							6 => [],
1282							7 => [],
1283							8 => [],
1284							9 => [],
1285							10 => [],
1286							11 => [],
1287							12 => [],
1288							13 => [],
1289							14 => [],
1290							15 => [],
1291							16 => [],
1292							17 => [],
1293							18 => [],
1294							19 => [],
1295							20 => [],
1296							21 => [],
1297							22 => []
1298							);
1299
1300							#Create TP53 directory and output folder
1301	0	0				0	mkdir ("$output_directory"."TP53",0770) unless (-d "$output_directory"."TP53");
1302	0	0				0	if(!(-e "$output_directory"."TP53")){
1303	0					0	die "ERROR: could not create folder $output_directory"."TP53\n";
1304							}
1305
1306							#for each file in the insertion input file array
1307	0					0	foreach my $file (@insertion_files){
1308
1309							#Parse the file name, path and file type
1310	0					0	( $file_name, $path, $suffix ) = File::Basename::fileparse( $file, "\.[^.]*");
1311
1312							#open the file
1313	0	0				0	open ($CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";
1314
1315							#ensure that the file is not empty
1316	0	0				0	if(eof($CURRENT_FILE)){
1317	0					0	die "ERROR: $file is empty\n";
1318							}
1319
1320							#create the TP53 insertion mutation output file
1321	0	0				0	open ($TP53_FILE, ">", "$output_directory"."TP53/$file_name"."$suffix") or die "ERROR: could not create file: $output_directory"."TP53/$file_name"."$suffix";
1322
1323							#read header lines
1324	0					0	$line = <$CURRENT_FILE>;
1325	0					0	chomp($line);
1326
1327							#print the VCF header lines to the TP53 insertion mutation output file
1328	0					0	while ($line =~ m/^#(.*?)/){
1329	0					0	print $TP53_FILE "$line\n";
1330	0					0	$line = <$CURRENT_FILE>;
1331	0					0	chomp($line);
1332							}
1333
1334							#read all the data lines in the file
1335	0					0	while(1){
1336	0					0	@line_data = (split (/\t/,$line));
1337
1338							#verify that the chromosome is valid and that the mutation is an insertion type
1339	0	0	0			0	if(
			0
1340							( !($line_data[1] =~ m/^[0-9]+$/) ) \|\|
1341							( !($line_data[0] =~ m/^(chr)?(1[0-9]\|2[0-2]\|X\|Y\|x\|y\|[1-9])/) ) \|\|
1342							( length($line_data[4]) <= length($line_data[3]) )
1343							){
1344	0					0	warn "ERROR: invalid chromosome or non-insertion VCF data line found and skipped:\t$line\n";
1345	0					0	$line = <$CURRENT_FILE>;
1346	0	0				0	unless($line){last;}
	0					0
1347	0					0	chomp($line);
1348	0					0	next;
1349							}
1350
1351							#parse the chromosome
1352	0					0	$chr = $2;
1353
1354							#change to uppercase if 'x' or 'y' is found
1355	0	0				0	if($chr eq 'x'){
1356	0					0	$chr = 'X';
1357							}
1358	0	0				0	if($chr eq 'y'){
1359	0					0	$chr = 'Y';
1360							}
1361
1362							#increment the insertion count of the chromosome
1363	0					0	$chromosome_insertion_count{$chr}++;
1364
1365							#check if a bin exists at the insertion start position
1366							#if one does not, then create one
1367	0	0				0	if(!defined(@{$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)])){
	0					0
1368	0					0	${$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)] = 1;
	0					0
1369							}
1370							else{ #if one does, then increment the count
1371	0					0	${$genome_insertion_data{$chr}}[int($line_data[1]/$bin_size)]++;
	0					0
1372							}
1373
1374							#Search through the list of translocation breakpoints on the same chromosome
1375	0					0	foreach my $bp (@{$genome_trans_breakpoints{$chr}}){
	0					0
1376							#if the insertion is within 10bps of the breakpoint, add the breakpoint to the list stored in
1377							#the genome_trans_insertion_breakpoints hash
1378	0	0	0			0	if( $line_data[1] < $bp+10 && $line_data[1] > $bp-10){
1379	0					0	push (@{$genome_trans_insertion_breakpoints{$chr}}, $bp);
	0					0
1380							}
1381							}
1382
1383							#Check if the insertion was in the TP53 gene
1384	0	0	0			0	if(
			0
1385							( $chr ne 'X' && $chr ne 'Y' ) &&
1386							( $chr==17 )
1387							){
1388	0	0	0			0	if($line_data[1] >= $TP53_start && $line_data[1] <= $TP53_end){
1389	0					0	$$tp53_mutation_found_ref = 1; #if a mutation was found in the region set the TP53 mutated flag
1390	0					0	$insertion_found = 1;
1391	0					0	print $TP53_FILE "$line\n"; #print the culprit data line to the TP53 insertion
1392							#mutation output file
1393							}
1394							}
1395
1396							#read the next line
1397	0					0	$line = <$CURRENT_FILE>;
1398							#check that the end of file has not been reached
1399	0	0				0	unless($line){last;}
	0					0
1400	0					0	chomp($line);
1401							}#while(1)
1402
1403							#close file
1404	0					0	close($CURRENT_FILE);
1405	0					0	close ($TP53_FILE);
1406
1407							#if an insertion was not found in the current file then delete the created TP53 insertion mutation output file
1408	0	0				0	if($insertion_found!=1){
1409	0					0	my $dir = "$output_directory"."TP53/$file_name"."$suffix";
1410	0					0	$rm_insertion_file_result = `rm $dir`;
1411							}
1412
1413	0					0	$insertion_found = 0;
1414							}#foreach my $file (@insertion_files)
1415
1416							#return hash
1417	0					0	return(\%genome_insertion_data, \%chromosome_insertion_count, \%genome_trans_insertion_breakpoints);
1418
1419							}#sub analyze_insertion_data
1420
1421
1422							#=head2 Sub-Method: analyze_loh_data
1423
1424							### analyze_loh_data ##############################################################################
1425							# Description:
1426							# Reads data from files located in the LOH input directory and populates:
1427							# $chromosome_loh_breakpoints_hash_ref
1428							#
1429							# Input variables:
1430							# $output_directory: stores the path to the output directory
1431							# $loh_files_array_ref: reference to array containing all the LOH input files
1432							# $tp53_mutation_found_ref: reference to tp53 mutation found flag
1433							#
1434
1435							#=cut
1436
sub analyze_loh_data {

    # Purpose:
    #   Reads and validates every LOH input file, records the LOH breakpoints
    #   per chromosome, and writes every LOH region that overlaps the TP53
    #   gene to <output_directory>TP53/TP53.spl.
    #
    # Parameters (positional):
    #   $output_directory        path to the output directory; it is
    #                            concatenated directly with "TP53", so it is
    #                            expected to end with a path separator
    #   $loh_files_array_ref     reference to an array of LOH input file paths
    #   $tp53_mutation_found_ref reference to a scalar flag that is set to 1
    #                            when an LOH region overlaps TP53
    #
    # Returns:
    #   Reference to a hash keyed by chromosome (1..22, X, Y); each value is
    #   an array reference holding the flat list start, end, start, end, ...
    #
    # Dies when an input file cannot be opened, is empty, has an invalid
    # header or data line, when the TP53 folder or output file cannot be
    # created or written, or when a chromosome field is invalid.
    #
    # Relies on check_for_overlaps(), $TP53_start and $TP53_end, which are
    # defined elsewhere in this file.

    #parse the parameters
    my $output_directory        = shift;
    my $loh_files_array_ref     = shift;
    my $tp53_mutation_found_ref = shift;

    my @file_data;    #one [chr, start, end, quality] entry per data line of every input file

    #hash {chr}[start and end pairs]
    #key:   chromosome eg. 1,2,X,Y
    #value: an array that stores all the LOH breakpoints on key
    my %chromosome_loh_breakpoints = map { $_ => [] } (1 .. 22, 'X', 'Y');

    #Read the contents of the LOH files into memory
    foreach my $file (@$loh_files_array_ref) {

        open(my $CURRENT_FILE, "<", $file) or die "ERROR: could not open file at path $file\n";

        #Ensure that the file is not empty
        if (eof($CURRENT_FILE)) {
            close($CURRENT_FILE);
            die "ERROR: $file is empty\n";
        }

        #Read and validate the header line
        my $header = <$CURRENT_FILE>;
        chomp($header);
        if ($header !~ m/^#chr\tstart\tend\tquality$/) {
            close($CURRENT_FILE);
            die "ERROR: header of loh file $file is invalid\n";
        }

        #Read, validate and tokenize all the data lines
        while (defined(my $line = <$CURRENT_FILE>)) {
            chomp($line);

            if ($line !~ m/^(chr)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/) {
                close($CURRENT_FILE);
                die "ERROR: invalid line found ($line) in file $file\n";
            }

            push(@file_data, [split(/\t/, $line)]);
        }

        close($CURRENT_FILE);

    }#foreach my $file (@$loh_files_array_ref)

    #Ensure that there are no overlapping LOH regions, or join them if the user indicated
    @file_data = check_for_overlaps("loh", \@file_data);

    #Create the TP53 output folder
    my $tp53_directory = $output_directory . "TP53";
    mkdir($tp53_directory, 0770) unless (-d $tp53_directory);
    if (!(-e $tp53_directory)) {
        die "ERROR: could not create folder $tp53_directory\n";
    }

    #Create the TP53 LOH mutation output data file
    my $tp53_path = $tp53_directory . "/TP53.spl";
    open(my $TP53_FILE, ">", $tp53_path) or die "ERROR: could not create file: $tp53_path";

    #Print the header for the output file (same format as a .spl file).
    #No trailing newline here: every data row is written with a leading "\n".
    print $TP53_FILE "#chr\tstart\tend\tquality";

    #For every data line that was read in
    foreach my $entry (@file_data) {

        #Validate the chromosome field and strip the optional "chr" prefix.
        #The capture is taken in list context so it cannot be confused with a
        #stale capture variable left over from an earlier match.
        my ($chr) = ($entry->[0] =~ m/^(?:chr)?(1[0-9]|2[0-2]|X|Y|[1-9])$/);
        if (!defined($chr)) {
            close($TP53_FILE);
            die "ERROR: invalid chromosome field detected\n";
        }

        my $start = $entry->[1];
        my $end   = $entry->[2];

        #Add the breakpoints to the array for the chromosome
        push(@{$chromosome_loh_breakpoints{$chr}}, ($start, $end));

        #Only chromosome 17 is compared against the TP53 coordinates
        next unless ($chr eq '17');

        #Normalise the endpoints so the overlap test also holds for a region
        #whose start is greater than its end.
        my ($region_low, $region_high) = ($start <= $end) ? ($start, $end) : ($end, $start);

        #Interval overlap test: true when either endpoint lies inside TP53 and
        #also when the LOH region spans the whole gene (which an endpoint-only
        #test would miss).
        if ($region_low <= $TP53_end && $region_high >= $TP53_start) {

            #If a mutation was found in the TP53 region then set the TP53 mutated flag
            $$tp53_mutation_found_ref = 1;

            #Print the LOH data line to the TP53 LOH mutation output file
            print $TP53_FILE "\n" . join("\t", @{$entry});
        }

    }#foreach my $entry (@file_data)

    #Buffered write errors only surface at close, so check it
    close($TP53_FILE) or die "ERROR: could not write file: $tp53_path\n";

    #return hash
    return (\%chromosome_loh_breakpoints);

}#sub analyze_loh_data
1582
1583
1584
1585							#=head2 Sub-Method: calculate_genome_localization
1586
1587							### calculate_genome_localization #################################################################
1588							# Description:
1589							# Calculates the mutation density for each chromosome
1590							#
1591							# Input variables:
1592							# $output_directory: stores the path to the output directory
1593							# $chromosome_copy_number_count_hash_ref: stores a reference to the hash storing the number
1594							# of CNV events on each chromosome
1595							# $chromosome_translocation_count_hash_ref: stores a reference to the hash storing the number
1596							# of translocation events on each chromosome
1597							#
1598
1599							#=cut
1600
sub calculate_genome_localization {

    # Purpose:
    #   Sums the CNV and translocation event counts for each chromosome,
    #   divides each sum by the chromosome's length, and writes one
    #   "chr<TAB>count<TAB>density" row per chromosome to
    #   <output_directory>/genome_localization.log.
    #
    # Parameters (positional):
    #   $output_directory                        path to the output directory
    #   $chromosome_copy_number_count_hash_ref   reference to a two-level hash
    #                                            {chr}{sub key} => CNV count
    #   $chromosome_translocation_count_hash_ref reference to a two-level hash
    #                                            {chr}{sub key} => translocation count
    #
    # Returns:
    #   Reference to a hash keyed by chromosome whose value is the mutation
    #   density (total count / chromosome length).
    #
    # Dies when the log file cannot be created or written, or when a
    # chromosome has no usable length in %chromosome_length (defined
    # elsewhere in this file).

    #parse the parameters
    my $output_directory                        = shift;
    my $chromosome_copy_number_count_hash_ref   = shift;
    my $chromosome_translocation_count_hash_ref = shift;

    #hash
    #key:   chromosome eg. 1,2,X,Y
    #value: the count (later replaced by the density) of translocation and CNV events on key
    my %chromosome_mutation_count = map { $_ => 0 } (1 .. 22, 'X', 'Y');

    #add the number of CNV events, then the number of translocation events, on each chromosome
    for my $count_hash_ref ($chromosome_copy_number_count_hash_ref, $chromosome_translocation_count_hash_ref) {
        for my $chr (keys %$count_hash_ref) {
            for my $sub_key (keys %{$count_hash_ref->{$chr}}) {
                $chromosome_mutation_count{$chr} += $count_hash_ref->{$chr}->{$sub_key};
            }
        }
    }

    #Create the output file
    my $log_path = "$output_directory/genome_localization.log";
    open(my $OUTPUT_FILE, ">", $log_path) or die "ERROR: could not create file $log_path\n";

    #Print the header
    print $OUTPUT_FILE "#chr\tcount\tdensity\n";

    #for each chromosome print the count and overall density.
    #The keys are unique, so a plain (string-order) sort is already deterministic.
    for my $chr (sort keys %chromosome_mutation_count) {

        #A key supplied by the input hashes that has no known length would
        #otherwise end in a bare "Illegal division by zero".
        my $length = $chromosome_length{$chr};
        if (!$length) {
            close($OUTPUT_FILE);
            die "ERROR: no chromosome length available for chromosome $chr\n";
        }

        my $density = $chromosome_mutation_count{$chr} / $length;

        print $OUTPUT_FILE join("\t", $chr, $chromosome_mutation_count{$chr}, $density) . "\n";

        #Replace the count with the density
        $chromosome_mutation_count{$chr} = $density;
    }

    #Buffered write errors only surface at close, so check it
    close($OUTPUT_FILE) or die "ERROR: could not write file $log_path\n";

    #return the hash containing the densities
    return (\%chromosome_mutation_count);

}#sub calculate_genome_localization
1665
1666
1667							#=head2 Sub-Method: calculate_chromosome_localization
1668
1669							### calculate_chromosome_localization #############################################################
1670							# Description:
1671							# Performs a sliding window analysis on the CNV and translocation data. Identifies regions
1672							# that have a density of mutation much greater than the average rate of mutation of the
1673							# genome.
1674							#
1675							# Input variables:
1676							# $output_directory: stores the directory where output files are created
1677							# $genome_cnv_data_hash_ref: reference to hash that stores position of all CNV breakpoints in
1678							# the genome
1679							# $genome_trans_data_hash_ref: reference to hash that stores position of all the
1680							# translocation breakpoints in the genome
1681							# $bin_size: size of the bins that divide up the genome
1682							# $window_size: number of bins to evaluate in each window
1683							#
1684
1685							#=cut
1686							sub calculate_chromosome_localization {
1687
1688							#parse parameters
1689	2			2	0	15	my $output_directory = shift;
1690	2					3	my $genome_cnv_data_hash_ref = shift;
1691	2					4	my $genome_trans_data_hash_ref = shift;
1692	2					2	my $bin_size = shift;
1693	2					4	my $window_size = shift;
1694
1695	2					3	my @suspect_regions; #array storing the start position, end position and chromosome
1696							#of very highly mutated regions
1697							my @likely_regions; #array storing the start position, end position and chromosome
1698							#of somewhat highly mutated regions
1699
1700	2					3	my $in_suspect_region = 0; #flag variables used in identifying highly mutated regions
1701	2					3	my $in_likely_region = 0;
1702
1703	2					2	my $suspect_chr = -1;
1704	2					4	my $suspect_start = -1;
1705	2					2	my $suspect_end = -1;
1706
1707	2					4	my $likely_chr = -1;
1708	2					2	my $likely_start = -1;
1709	2					3	my $likely_end = -1;
1710
1711	2					45	my %genome_cnv_data_windows = ( #hash
1712							X => [], #key: chromosome eg. 1,2,X,Y
1713							Y => [], #value: an array storing the count of CNVs
1714							1 => [], # in each window along the chromosome
1715							2 => [],
1716							3 => [],
1717							4 => [],
1718							5 => [],
1719							6 => [],
1720							7 => [],
1721							8 => [],
1722							9 => [],
1723							10 => [],
1724							11 => [],
1725							12 => [],
1726							13 => [],
1727							14 => [],
1728							15 => [],
1729							16 => [],
1730							17 => [],
1731							18 => [],
1732							19 => [],
1733							20 => [],
1734							21 => [],
1735							22 => []
1736							);
1737
1738	2					23	my %genome_trans_data_windows = ( #hash
1739							X => [], #key: chromosome eg. 1,2,X,Y
1740							Y => [], #value: an array storing a count of translocation
1741							1 => [], # in each window along the chromosome
1742							2 => [],
1743							3 => [],
1744							4 => [],
1745							5 => [],
1746							6 => [],
1747							7 => [],
1748							8 => [],
1749							9 => [],
1750							10 => [],
1751							11 => [],
1752							12 => [],
1753							13 => [],
1754							14 => [],
1755							15 => [],
1756							16 => [],
1757							17 => [],
1758							18 => [],
1759							19 => [],
1760							20 => [],
1761							21 => [],
1762							22 => []
1763							);
1764
1765	2					23	my %genome_mutation_data_windows = ( #hash
1766							X => [], #key: chromosome eg. 1,2,X,Y
1767							Y => [], #value: an array storing a count of all mutations
1768							1 => [], # in each window along the chromosome
1769							2 => [],
1770							3 => [],
1771							4 => [],
1772							5 => [],
1773							6 => [],
1774							7 => [],
1775							8 => [],
1776							9 => [],
1777							10 => [],
1778							11 => [],
1779							12 => [],
1780							13 => [],
1781							14 => [],
1782							15 => [],
1783							16 => [],
1784							17 => [],
1785							18 => [],
1786							19 => [],
1787							20 => [],
1788							21 => [],
1789							22 => []
1790							);
1791
1792
1793	2					4	my $current_chr; #current chromosome being analyzed
1794	2					5	my @current_chr_data = (); #array storing the bins for the current chromosome
1795
1796	2					3	my $genome_mean_mutation_density = 0; #average density of all the windows across the genome
1797	2					3	my $total_genome_windows = 0; #total number of windows across the genome
1798	2					1	my $genome_mutation_density_standard_deviation = 0; #standard deviation of the mutation densities for
1799							#all the windows
1800
1801	2					3	my $OUTPUT_FILE; #file handle to output file
1802
1803	2					4	$output_directory = $output_directory."mutation_clustering";
1804
1805							#create output directories
1806	2	50				162	mkdir ("$output_directory",0770) unless (-d "$output_directory");
1807	2	50				25	if(!(-e "$output_directory")){
1808	0					0	die "ERROR: could not create folder $output_directory\n";
1809							}
1810
1811	2	50				92	mkdir ("$output_directory/cnv",0770) unless (-d "$output_directory/cnv");
1812	2	50				21	if(!(-e "$output_directory")){
1813	0					0	die "ERROR: could not create folder $output_directory/cnv\n";
1814							}
1815
1816	2	50				85	mkdir ("$output_directory/translocations",0770) unless (-d "$output_directory/translocations");
1817	2	50				20	if(!(-e "$output_directory")){
1818	0					0	die "ERROR: could not create folder $output_directory/translocations\n";
1819							}
1820
1821	2	50				88	mkdir ("$output_directory/all_types",0770) unless (-d "$output_directory/all_types");
1822	2	50				30	if(!(-e "$output_directory")){
1823	0					0	die "ERROR: could not create folder $output_directory/all_types\n";
1824							}
1825
1826							#compute the density of CNV mutations in each window
1827	2					19	for my $cnv_key ( keys %$genome_cnv_data_hash_ref){
1828							#get the array storing the CNV bins for the current chromosome
1829	40					122	@current_chr_data = @{$genome_cnv_data_hash_ref->{$cnv_key}};
	40					161833
1830
1831							#check that the array is not empty
1832	40	50				321	if(scalar(@current_chr_data) > 0){
1833
1834							#create an output file for this chromosome
1835	40	50				5776	open ($OUTPUT_FILE, ">", "$output_directory/cnv/chr$cnv_key"."_cnv_localization.log") or die "ERROR: could not create file $output_directory/cnv/chr$cnv_key"."_cnv_localization.log";
1836							#print the header for the output file
1837	40					369	print $OUTPUT_FILE "#chr\tstart\tend\tdensity";
1838
1839	40					82	@{$genome_cnv_data_windows{$cnv_key}}[0] = 0; #initialize the count in the first window
	40					350
1840
1841							#Calculate the mutation count in the first window
1842	40					167	for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++){
1843	400000					252297	my %region_hash;
1844	400000	100				497514	if(!defined($current_chr_data[$chr_pos])){
1845	399924					554999	next;
1846							}
1847	76					85	%region_hash = %{$current_chr_data[$chr_pos]};
	76					523
1848
1849	76					124	@{$genome_cnv_data_windows{$cnv_key}}[0] += $region_hash{'BPcount'};
	76					273
1850							}
1851
1852							#print the values from the first window to the output file
1853	40					293	print $OUTPUT_FILE "\n";
1854	40					141	print $OUTPUT_FILE "$cnv_key";
1855	40					82	print $OUTPUT_FILE "\t";
1856	40					90	print $OUTPUT_FILE "0";
1857	40					72	print $OUTPUT_FILE "\t";
1858	40					444	print $OUTPUT_FILE ($window_size)*$bin_size;
1859	40					104	print $OUTPUT_FILE "\t";
1860
1861	40	50				51	if(!defined(@{$genome_cnv_data_windows{$cnv_key}}[0])){
	40					359
1862	0					0	print $OUTPUT_FILE "0";
1863							}
1864							else{
1865	40					74	my $rounded = POSIX::ceil((@{$genome_cnv_data_windows{$cnv_key}}[0])/2);
	40					350
1866	40					377	print $OUTPUT_FILE ($rounded)/($window_size*$bin_size);
1867							#add the cnv count to the total mutation count for the region
1868	40					75	@{$genome_mutation_data_windows{$cnv_key}}[0] += $rounded;
	40					177
1869							}
1870
1871							#perform the sliding window analysis for the rest of the chromosome
1872	40					176	for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++){
1873
1874							#check that the window will not overshoot the length of the chromosome
1875	4391064	100				7110032	if( (($chr_pos+($window_size-1))*$bin_size) > $chromosome_length{$cnv_key} ){
1876	24					63	last;
1877							}
1878
1879	4391040					3328028	@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos] = 0; #initialize the count for the current window
	4391040					5322717
1880
1881	4391040					3337689	my %past_region_hash;
1882							my %next_region_hash;
1883
1884	4391040					3266511	my $prev_value = 0;
1885	4391040					3032145	my $next_value = 0;
1886
1887							#get the count from the first bin of the previous window
1888	4391040	100				6563697	if(defined($current_chr_data[$chr_pos-1])){
1889	392					444	%past_region_hash = %{$current_chr_data[$chr_pos-1]};
	392					1890
1890	392					662	$prev_value = $past_region_hash{'BPcount'};
1891							}
1892
1893							#get the count from the bin following the last bin in the previous window
1894	4391040	100				6265114	if(defined($current_chr_data[$chr_pos+($window_size-1)])){
1895	392					389	%next_region_hash = %{$current_chr_data[$chr_pos+($window_size-1)]};
	392					2295
1896	392					767	$next_value = $next_region_hash{'BPcount'};
1897							}
1898
1899							#the count for the current window = the count from the previous window - the first bin of the previous window + the next bin along the chromosome
1900	4391040					3148944	@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos] += (@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos-1]) - ($prev_value) + ($next_value);
	4391040					4679775
	4391040					4701707
1901
1902							#print the values for this window
1903	4391040					4426893	print $OUTPUT_FILE "\n";
1904	4391040					3828238	print $OUTPUT_FILE "$cnv_key";
1905	4391040					3367321	print $OUTPUT_FILE "\t";
1906	4391040					9612242	print $OUTPUT_FILE $chr_pos*$bin_size;
1907	4391040					3683462	print $OUTPUT_FILE "\t";
1908	4391040					8420479	print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
1909	4391040					4298892	print $OUTPUT_FILE "\t";
1910
1911	4391040	50				3274786	if(!defined(@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos])){
	4391040					6625449
1912	0					0	print $OUTPUT_FILE "0";
1913							}
1914							else{
1915	4391040					3171268	my $rounded = POSIX::ceil((@{$genome_cnv_data_windows{$cnv_key}}[$chr_pos])/2);
	4391040					7883313
1916	4391040					7366705	print $OUTPUT_FILE ($rounded)/($window_size*$bin_size);
1917							#add the cnv count to the total mutation count for the region
1918	4391040					3276286	@{$genome_mutation_data_windows{$cnv_key}}[$chr_pos] += $rounded;
	4391040					11546829
1919							}
1920							}#for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++)
1921
1922	40					5548	close ($OUTPUT_FILE);
1923							}#if(scalar(@current_chr_data) > 0)
1924
1925	40					73689	@current_chr_data = ();
1926							}#for my $cnv_key ( keys %$genome_cnv_data_hash_ref)
1927
1928							#perform the sliding window analysis on the translocation mutation data
1929	2					31	for my $trans_key ( keys %$genome_trans_data_hash_ref){
1930
1931							#get the array storing the translocation bins for the current chromosome
1932	26					72	@current_chr_data = @{$genome_trans_data_hash_ref->{$trans_key}};
	26					81590
1933
1934							#check that the array is not empty
1935	26	50				189	if(scalar(@current_chr_data) > 0){
1936
1937							#create the output file
1938	26	50				3939	open ($OUTPUT_FILE, ">", "$output_directory/translocations/chr$trans_key"."_translocation_localization.log") or die "ERROR: could not create file $output_directory/translocations/chr$trans_key"."_translocation_localization.log";
1939							#print the header for the output file
1940	26					217	print $OUTPUT_FILE "#chr\tstart\tend\tdensity";
1941
1942							#initialize the translocation count for the first window
1943	26					59	@{$genome_trans_data_windows{$trans_key}}[0] = 0;
	26					164
1944
1945							#calculate the translocation mutation count in the first window
1946	26					125	for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++){
1947	260000					157701	my %region_hash;
1948	260000	100				318639	if(!defined($current_chr_data[$chr_pos])){
1949	259970					359428	next;
1950							}
1951	30					34	%region_hash = %{$current_chr_data[$chr_pos]};
	30					152
1952
1953	30					41	my %trans_hash_in;
1954							my %trans_hash_out;
1955	30					27	my $size = 0;
1956
1957							#calculate the number of inbound translocation breakpoints
1958	30	100				62	if(defined($region_hash{'in'})){
1959	20					23	%trans_hash_in = %{$region_hash{'in'}};
	20					46
1960
1961	20					33	for my $key (keys %trans_hash_in){
1962	20					18	$size = @{$trans_hash_in{$key}};
	20					20
1963	20					33	$size = $size/2;
1964	20					19	@{$genome_trans_data_windows{$trans_key}}[0] += $size;
	20					57
1965							}
1966							}
1967
1968							#calculate the number of outbound translocation breakpoints
1969	30	100				77	if(defined($region_hash{'out'})){
1970	18					17	%trans_hash_out = %{$region_hash{'out'}};
	18					47
1971
1972	18					37	for my $key (keys %trans_hash_out){
1973	18	50				34	if($key eq $trans_key){
1974	18					67	next;
1975							}
1976	0					0	$size = @{$trans_hash_out{$key}};
	0					0
1977	0					0	$size = $size/2;
1978	0					0	@{$genome_trans_data_windows{$trans_key}}[0] += $size;
	0					0
1979							}
1980							}
1981
1982							}#for(my $chr_pos = 0; $chr_pos < $window_size; $chr_pos++)
1983
1984							#print the values from the first window to the output file
1985	26					204	print $OUTPUT_FILE "\n";
1986	26					85	print $OUTPUT_FILE "$trans_key";
1987	26					48	print $OUTPUT_FILE "\t";
1988	26					47	print $OUTPUT_FILE "0";
1989	26					158	print $OUTPUT_FILE "\t";
1990	26					333	print $OUTPUT_FILE ($window_size)*$bin_size;
1991	26					52	print $OUTPUT_FILE "\t";
1992
1993	26	50				63	if(!defined(@{$genome_trans_data_windows{$trans_key}}[0])){
	26					193
1994	0					0	print $OUTPUT_FILE "0";
1995							}
1996							else{
1997	26					40	my $rounded = POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[0]);
	26					218
1998	26					158	print $OUTPUT_FILE ($rounded)/($window_size*$bin_size);
1999							#add the translocation mutation count to the total mutation count for the region
2000	26					56	@{$genome_mutation_data_windows{$trans_key}}[0] += $rounded;
	26					113
2001							}
2002
2003							#perform the sliding window analysis for the rest of the chromosome
2004	26					120	for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++){
2005
2006	2520292	100				3952139	if( (($chr_pos+($window_size-1))*$bin_size) > $chromosome_length{$trans_key} ){
2007	10					28	last;
2008							}
2009
2010	2520282					1805582	@{$genome_trans_data_windows{$trans_key}}[$chr_pos] = 0;
	2520282					2989656
2011	2520282					1860985	my %prev_region_hash;
2012							my %next_region_hash;
2013
2014	2520282					1823580	my $prev_value = 0;
2015	2520282					1624361	my $next_value = 0;
2016
2017							#Calculate the number of mutations in the first bin of the previous window
2018	2520282	100				3578042	if(defined($current_chr_data[$chr_pos-1])){
2019	428					408	%prev_region_hash = %{$current_chr_data[$chr_pos-1]};
	428					1520
2020
2021	428					479	my $size = 0;
2022	428					387	my %prev_trans_hash_in;
2023							my %prev_trans_hash_out;
2024
2025	428	100				888	if(defined($prev_region_hash{'in'})){
2026	234					240	%prev_trans_hash_in = %{$prev_region_hash{'in'}};
	234					495
2027
2028	234					385	for my $key (keys %prev_trans_hash_in){
2029	262					217	$size = @{$prev_trans_hash_in{$key}};
	262					289
2030	262					289	$size = $size/2;
2031	262					468	$prev_value += $size;
2032							}
2033							}
2034
2035	428	100				862	if(defined($prev_region_hash{'out'})){
2036	278					281	%prev_trans_hash_out = %{$prev_region_hash{'out'}};
	278					618
2037
2038	278					518	for my $key (keys %prev_trans_hash_out){
2039	284	100				625	if($key eq $trans_key){
2040	184					546	next;
2041							}
2042	100					81	$size = @{$prev_trans_hash_out{$key}};
	100					112
2043	100					107	$size = $size/2;
2044	100					241	$prev_value += $size;
2045							}
2046							}
2047
2048							}
2049
2050							#Calculate the number of mutations in the last bin of the current window
2051	2520282	100				3544084	if(defined($current_chr_data[$chr_pos+($window_size-1)])){
2052	456					393	%next_region_hash = %{$current_chr_data[$chr_pos+($window_size-1)]};
	456					2045
2053
2054	456					516	my $size = 0;
2055	456					437	my %next_trans_hash_in;
2056							my %next_trans_hash_out;
2057
2058	456	100				849	if(defined($next_region_hash{'in'})){
2059	238					253	%next_trans_hash_in = %{$next_region_hash{'in'}};
	238					573
2060
2061	238					421	for my $key (keys %next_trans_hash_in){
2062	266					197	$size = @{$next_trans_hash_in{$key}};
	266					363
2063	266					288	$size = $size/2;
2064	266					485	$next_value += $size;
2065							}
2066							}
2067
2068	456	100				912	if(defined($next_region_hash{'out'})){
2069	306					250	%next_trans_hash_out = %{$next_region_hash{'out'}};
	306					804
2070
2071	306					507	for my $key (keys %next_trans_hash_out){
2072	312	100				678	if($key eq $trans_key){
2073	192					540	next;
2074							}
2075	120					107	$size = @{$next_trans_hash_out{$key}};
	120					147
2076	120					128	$size = $size/2;
2077	120					286	$next_value += $size;
2078							}
2079							}
2080
2081
2082							}
2083
2084							#total number of translocation mutations in the current window = number of mutations in previous window - the first bin of the previous window + next bin along the chromosome
2085	2520282					1706221	@{$genome_trans_data_windows{$trans_key}}[$chr_pos] += (@{$genome_trans_data_windows{$trans_key}}[$chr_pos-1]) - ($prev_value) + ($next_value);
	2520282					2618698
	2520282					2522535
2086
2087							#print values from this window
2088	2520282					2412434	print $OUTPUT_FILE "\n";
2089	2520282					4826823	print $OUTPUT_FILE "$trans_key";
2090	2520282					1946175	print $OUTPUT_FILE "\t";
2091	2520282					7425819	print $OUTPUT_FILE $chr_pos*$bin_size;
2092	2520282					1903350	print $OUTPUT_FILE "\t";
2093	2520282					4044352	print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
2094	2520282					2127756	print $OUTPUT_FILE "\t";
2095
2096	2520282	50				1771966	if(!defined(@{$genome_trans_data_windows{$trans_key}}[$chr_pos])){
	2520282					3549301
2097	0					0	print $OUTPUT_FILE "0";
2098							}
2099							else{
2100	2520282					1853634	my $rounded = POSIX::ceil(@{$genome_trans_data_windows{$trans_key}}[$chr_pos]);
	2520282					3837198
2101	2520282					3853291	print $OUTPUT_FILE ($rounded)/($window_size*$bin_size);
2102	2520282					1811317	@{$genome_mutation_data_windows{$trans_key}}[$chr_pos] += $rounded;
	2520282					6147554
2103							}
2104							}#for(my $chr_pos = 1; $chr_pos < scalar(@current_chr_data); $chr_pos++)
2105
2106
2107	26					2682	close ($OUTPUT_FILE);
2108							}
2109	26					40244	@current_chr_data = ();
2110							}
2111
2112							#calculate the density of both types of mutations in each window in the genome
2113	2					27	for my $mutation_key ( keys %genome_mutation_data_windows){
2114
2115							#check that some data exists for the current chromosome
2116	48	100				100	if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){
	48					280
2117
2118							#create the output file
2119	42	50				277793	open ($OUTPUT_FILE, ">", "$output_directory/all_types/chr$mutation_key"."_mutation_localization.log") or die "ERROR: could not create file $output_directory/all_types/chr$current_chr"."_mutation_localization.log";
2120							#print the header for the output file
2121	42					176	print $OUTPUT_FILE "#chr\tstart\tend\tdensity";
2122
2123	42					61	my $density;
2124
2125							#for every bin along the chromosome
2126	42					109	for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
	4776020					7680664
2127	4775978					3117194	$total_genome_windows++; #increment the total number of windows in the genome
2128
2129							#calculate the density of mutations in the window
2130	4775978	50				3148584	if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
	4775978					6429662
2131	0					0	$density = 0;
2132							}
2133							else{
2134	4775978					3119126	$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
	4775978					5690124
2135							}
2136
2137							#sum the density values to calculate the mean
2138	4775978					3950367	$genome_mean_mutation_density += $density;
2139
2140							#print the values for the window
2141	4775978					3822716	print $OUTPUT_FILE "\n";
2142	4775978					3806028	print $OUTPUT_FILE "$mutation_key";
2143	4775978					3519477	print $OUTPUT_FILE "\t";
2144	4775978					9288168	print $OUTPUT_FILE $chr_pos*$bin_size;
2145	4775978					3564978	print $OUTPUT_FILE "\t";
2146	4775978					6235293	print $OUTPUT_FILE ($chr_pos+$window_size)*$bin_size;
2147	4775978					4516896	print $OUTPUT_FILE "\t";
2148	4775978					6815745	print $OUTPUT_FILE $density;
2149
2150							}#for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++)
2151
2152	42					3911	close ($OUTPUT_FILE);
2153							}
2154							}#for my $mutation_key ( keys %genome_mutation_data_windows)
2155
2156							#calculate the mean mutation density for the windows in the genome
2157	2					23	$genome_mean_mutation_density = $genome_mean_mutation_density/$total_genome_windows;
2158
2159							#find the sum of squared difference between the density of mutation of each window and the mean density of mutation
2160	2					20	for my $mutation_key ( keys %genome_mutation_data_windows){
2161	48	100				79	if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){
	48					221
2162	42					42	my $density;
2163
2164	42					93	for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
	4776020					7097656
2165	4775978	50				3122829	if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
	4775978					6280127
2166	0					0	$density = 0;
2167							}
2168							else{
2169	4775978					3149281	$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
	4775978					4898174
2170							}
2171
2172							#sum the squared differences
2173	4775978					5092304	$genome_mutation_density_standard_deviation += ($density-$genome_mean_mutation_density)**2;
2174							}
2175							}
2176							}#for my $mutation_key ( keys %genome_mutation_data_windows)
2177
2178							#divide the sum of the squared differences by the total number of windows and take the square root
2179	2					39	$genome_mutation_density_standard_deviation = ($genome_mutation_density_standard_deviation/$total_genome_windows)**0.5;
2180
2181							#calculate z scores for each window and check if the window is greater than 2 SDs away from genome mean
2182							#use this value to identify highly mutated regions
2183	2					14	for my $mutation_key ( keys %genome_mutation_data_windows){
2184	48	100				64	if(scalar(@{$genome_mutation_data_windows{$mutation_key}}) > 0){
	48					198
2185	42					44	my $density;
2186	42					72	my $region_z_score = 0;
2187
2188	42					58	for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++){
	4776020					7101599
2189
2190	4775978	50				3201243	if(!defined(@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])){
	4775978					6496156
2191	0					0	$density = 0;
2192							}
2193							else{
2194	4775978					3127809	$density = (@{$genome_mutation_data_windows{$mutation_key}}[$chr_pos])/($window_size*$bin_size);
	4775978					5265502
2195							}
2196
2197							#calculate z score for the window
2198	4775978					3629181	$region_z_score = ($density-$genome_mean_mutation_density)/$genome_mutation_density_standard_deviation;
2199
2200							#check if the z score is above the threshold
2201	4775978	100				8215524	if( $region_z_score >= $outlier_deviation ) {
		100
2202	88404	100				112082	if($in_suspect_region!=1){
2203	8					12	$suspect_start = $chr_pos*$bin_size;
2204	8					12	$in_suspect_region = 1;
2205							}
2206	88404					64480	$suspect_chr = $mutation_key;
2207	88404					68734	$suspect_end = ($chr_pos+$window_size)*$bin_size;
2208							}
2209							elsif ($in_suspect_region==1){
2210							#once a region has been called push the chromosome, start and end positions into the suspect region array
2211	8					53	push (@suspect_regions, ($suspect_chr,$suspect_start,$suspect_end));
2212	8					12	$suspect_chr = -1;
2213	8					10	$suspect_start = -1;
2214	8					10	$suspect_end = -1;
2215	8					8	$in_suspect_region = 0;
2216							}
2217
2218							#check if the z score is below the threshold but still suspiciously high
2219	4775978	100	100			15196012	if(
		100
2220							( $region_z_score < $outlier_deviation ) &&
2221							( $region_z_score >= ($outlier_deviation-1) )
2222							){
2223	66812	100				84473	if($in_likely_region!=1){
2224	26					33	$likely_start = $chr_pos*$bin_size;
2225	26					42	$in_likely_region = 1;
2226							}
2227	66812					53962	$likely_chr = $mutation_key;
2228	66812					65530	$likely_end = ($chr_pos+$window_size)*$bin_size;
2229							}
2230							elsif ($in_likely_region==1){
2231	24					105	push (@likely_regions, ($likely_chr,$likely_start,$likely_end));
2232	24					29	$likely_chr = -1;
2233	24					29	$likely_start = -1;
2234	24					23	$likely_end = -1;
2235	24					35	$in_likely_region = 0;
2236							}
2237
2238							}#for(my $chr_pos = 0; $chr_pos < scalar(@{$genome_mutation_data_windows{$mutation_key}}); $chr_pos++)
2239
2240							#check if the last region analyzed was suspicious and push its data into the appropriate array
2241	42	50				156	if ($in_suspect_region==1){
2242	0					0	push (@suspect_regions, ($suspect_chr,$suspect_start,$suspect_end));
2243	0					0	$suspect_chr = -1;
2244	0					0	$suspect_start = -1;
2245	0					0	$suspect_end = -1;
2246	0					0	$in_suspect_region = 0;
2247							}
2248
2249	42	100				220	if ($in_likely_region==1){
2250	2					18	push (@likely_regions, ($likely_chr,$likely_start,$likely_end));
2251	2					4	$likely_chr = -1;
2252	2					6	$likely_start = -1;
2253	2					4	$likely_end = -1;
2254	2					8	$in_likely_region = 0;
2255							}
2256							}
2257							}#for my $mutation_key ( keys %genome_mutation_data_windows)
2258
2259
2260	2					50	return (\@suspect_regions, \@likely_regions, \%genome_cnv_data_windows, \%genome_trans_data_windows, \%genome_mutation_data_windows);
2261
2262							}#sub calculate_chromosome_localization
2263
2264
2265							#=head2 Sub-Method: check_copy_number_count
2266
2267							### check_copy_number_count #######################################################################
2268							# Description:
2269							# Produces an output file that records the number of regions of copy-number variation that
2270							# are present in each chromosome.
2271							#
2272							# Input variables:
2273							# $output_directory: stores the path to the output directory
2274							# $chromosome_copy_number_count_hash_ref: reference to hash that stores the count of regions
2275							# of copy-number variation on each chromosome
2276							#
2277
2278							#=cut
2279
2280							sub check_copy_number_count {
2281
2282							#parse parameters
2283	1			1	0	9	my $output_directory = shift;
2284	1					39	my $chromosome_copy_number_count_hash_ref = shift;
2285
2286	1					1	my $OUTPUT_FILE; #file handle to output file
2287
2288							#open the output file
2289	1	50				145	open ($OUTPUT_FILE, ">", "$output_directory/copy_number_count.log") or die "ERROR: could not create file $output_directory/copy_number_count.log\n";
2290
2291							#print the header
2292	1					5	print $OUTPUT_FILE "#chr\tcopy_number\tnumber_of_regions";
2293
2294							#for each chromosome
2295							#print out the number of regions with the given copy-number
2296	9			9		35802	{use sort 'stable';
	9					23
	9					69
	1					1
2297	1					44	for my $chr (sort keys %$chromosome_copy_number_count_hash_ref){
2298	20					12	my %intermediate_hash = %{$chromosome_copy_number_count_hash_ref->{$chr}};
	20					63
2299	20					35	for my $CN (sort {$a <=> $b} keys %intermediate_hash){
	103					106
2300	80					108	print $OUTPUT_FILE "\n";
2301	80					57	print $OUTPUT_FILE "$chr"; #chromosome
2302	80					53	print $OUTPUT_FILE "\t";
2303	80					57	print $OUTPUT_FILE "$CN"; #copy-number
2304	80					56	print $OUTPUT_FILE "\t";
2305	80					131	print $OUTPUT_FILE $chromosome_copy_number_count_hash_ref->{$chr}->{$CN}; #number of regions with copy-number $CN
2306							}
2307							}
2308							}#use sort 'stable'
2309	1					96	close ($OUTPUT_FILE);
2310
2311							}#sub check_copy_number_count
2312
2313							#=head2 Sub-Method: check_copy_number_switches
2314
2315							### check_copy_number_switches ####################################################################
2316							# Description:
2317							# Creates an output file that records the number of breakpoints between CNV regions on each
2318							# chromosome
2319							#
2320							# Input variables:
2321							# $output_directory: stores path to output directory
2322							# $chromosome_copy_number_count_hash_ref: reference to hash that stores the count of regions
2323							# of copy-number variation on each chromosome
2324							#
2325
2326							#=cut
2327
2328							sub check_copy_number_switches {
2329
2330							#parse parameters
2331	1			1	0	45	my $output_directory = shift;
2332	1					2	my $chromosome_copy_number_count_hash_ref = shift;
2333
2334	1					1	my $switch_count = 0;
2335
2336	1					2	my $OUTPUT_FILE; #file handle to output file
2337
2338							#open output file
2339	1	50				102	open ($OUTPUT_FILE, ">", "$output_directory/copy_number_switches.log") or die "ERROR: could not create file $output_directory/copy_number_switches.log\n";
2340
2341							#print header
2342	1					4	print $OUTPUT_FILE "#chr\tswitch_count";
2343
2344							#for each chromosome
2345							#sum the total number of CNV events
2346							#multiply the sum by 2 to get the number of CNV breakpoints
2347	9			9		2627	{use sort 'stable';
	9					16
	9					37
	1					2
2348	1					15	for my $chr (sort keys %$chromosome_copy_number_count_hash_ref){
2349	20					17	my %intermediate_hash = %{$chromosome_copy_number_count_hash_ref->{$chr}};
	20					53
2350	20					37	for my $CN (sort {$intermediate_hash{$b} <=> $intermediate_hash{$a} } keys %intermediate_hash){
	106					94
2351							#only count regions that have abberant copy numbers
2352	80	50				103	if($CN != 2){
2353	80					91	$switch_count += ($chromosome_copy_number_count_hash_ref->{$chr}->{$CN} * 2);
2354							}
2355							}
2356
2357							#print values to output file
2358	20					24	print $OUTPUT_FILE "\n";
2359	20					16	print $OUTPUT_FILE "$chr";
2360	20					14	print $OUTPUT_FILE "\t";
2361	20					30	print $OUTPUT_FILE $switch_count;
2362
2363	20					23	$switch_count = 0;
2364							}
2365							}#use sort 'stable'
2366
2367	1					33	close ($OUTPUT_FILE);
2368
2369							}#sub check_copy_number_switches
2370
2371							#=head2 Sub-Method: calculate_interchromosomal_translocation_rate
2372
2373							### calculate_interchromosomal_translocation_rate #################################################
2374							# Description:
2375							# Create an output file that records the number of translocations between each and every
2376							# chromosome
2377							#
2378							# Input variables:
2379							# $output_directory: stores path to output directory
2380							# $chromosome_translocation_count_hash_ref: reference to hash that stores the count of
2381							# translocations between each chromosome
2382							#
2383
2384							#=cut
2385
2386							sub calculate_interchromosomal_translocation_rate {
2387
2388							#parse parameters
2389	1			1	0	9	my $output_directory = shift;
2390	1					3	my $chromosome_translocation_count_hash_ref = shift;
2391
2392	1					2	my $OUTPUT_FILE; #file handle to output file
2393
2394							#open output file
2395	1	50				153	open ($OUTPUT_FILE, ">", "$output_directory/interchromosomal_translocation_rate.log") or die "ERROR: could not create file $output_directory/interchromosomal_translocation_rate.log\n";
2396
2397							#print header
2398	1					19	print $OUTPUT_FILE "#chr1\tchr2\tcount";
2399
2400							#for each chromosome
2401							#print the number of translocations between every other chromosome
2402	9			9		2679	{use sort 'stable';
	9					18
	9					45
	1					2
2403	1					14	for my $chr1 (sort keys %$chromosome_translocation_count_hash_ref){
2404	13					12	my %intermediate_hash = %{$chromosome_translocation_count_hash_ref->{$chr1}};
	13					34
2405
2406	13					23	for my $chr2 (sort keys %intermediate_hash){
2407	26					24	print $OUTPUT_FILE "\n";
2408	26					18	print $OUTPUT_FILE $chr1;
2409	26					25	print $OUTPUT_FILE "\t";
2410	26					19	print $OUTPUT_FILE $chr2;
2411	26					19	print $OUTPUT_FILE "\t";
2412	26					58	print $OUTPUT_FILE $chromosome_translocation_count_hash_ref->{$chr1}->{$chr2};
2413							}
2414							}
2415							}#use sort 'stable'
2416	1					34	close ($OUTPUT_FILE);
2417
2418							}#sub calculate_interchromosomal_translocation_rate
2419
2420
2421							#=head2 Sub-Method: analyze_suspect_regions
2422
2423							### analyze_suspect_regions #######################################################################
2424							# Description:
2425							# Produces the final report output file, that includes the chromothriptic scores for each of
2426							# the highly mutated regions
2427							#
2428							# Input variables:
2429							# $output_directory: stores path to the output directory
2430							# $suspect_regions_array_ref: reference to array storing the chromosome,
2431							# start, and end position of highly mutated
2432							# regions
2433							# $genome_mutation_density_hash_ref: stores the average mutation density of each
2434							# chromosome
2435							# $genome_cnv_data_hash_ref: stores the position of CNV mutations on
2436							# each chromosome
2437							# $genome_trans_data_hash_ref: stores the position of translocation events
2438							# on each chromosome
2439							# $genome_trans_insertion_breakpoints_hash_ref: stores the position of insertions on each
2440							# chromosome
2441							# $bin_size: stores the size of a single bin
2442							# $localization_window_size: stores the number of bins to include in a
2443							# window
2444							# $tp53_mutated: stores whether the TP53 gene is mutated
2445							# or not
2446							# $tp53_mutation_found: stores whether or not a mutation was found
2447							# in the TP53 loci
2448							# $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on
2449							# each chromosome
2450							# $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on
2451							# each chromosome
2452							#
2453
2454							#=cut
2455
2456							sub analyze_suspect_regions {
2457
2458							#parse parameters
2459	0			0	0	0	my $output_directory = shift;
2460
2461	0					0	my $suspect_regions_array_ref = shift;
2462	0					0	my @suspect_regions = @$suspect_regions_array_ref;
2463
2464	0					0	my $genome_mutation_density_hash_ref = shift;
2465	0					0	my %genome_mutation_density_hash = %{$genome_mutation_density_hash_ref};
	0					0
2466
2467	0					0	my $genome_cnv_data_hash_ref = shift;
2468	0					0	my $genome_trans_data_hash_ref = shift;
2469	0					0	my $genome_trans_insertion_breakpoints_hash_ref = shift;
2470
2471	0					0	my $bin_size = shift;
2472	0					0	my $localization_window_size = shift;
2473
2474	0					0	my $tp53_mutated = shift;
2475	0					0	my $tp53_mutation_found = shift;
2476
2477	0					0	my $chromosome_cnv_breakpoints_hash_ref = shift;
2478	0					0	my $chromosome_loh_breakpoints_hash_ref = shift;
2479
2480
2481	0					0	my $suspect_regions_size = @suspect_regions;
2482
2483	0					0	my $OUTPUT_FILE;
2484
2485	0					0	my @suspect_region_data = ();
2486
2487	0					0	my $header_string;
2488
2489							#check that the suspect region data array is not malformed, should contain sets of 3 elements
2490	0	0				0	if($suspect_regions_size % 3 != 0){
2491	0					0	die "ERROR: suspect_regions_array has $suspect_regions_size entries. Value must be divisible by 3.\n";
2492							}
2493
2494							#create output directory for report files
2495	0	0				0	mkdir ("$output_directory"."suspect_regions",0770) unless (-d "$output_directory"."suspect_regions");
2496	0	0				0	if(!(-e "$output_directory"."suspect_regions")){
2497	0					0	die "ERROR: could not create folder $output_directory"."suspect_regions";
2498							}
2499
2500							#open final report output file
2501	0	0				0	open ($OUTPUT_FILE, ">", "$output_directory"."suspect_regions/suspect_regions.yml") or die "ERROR: could not create file: $output_directory"."suspect_regions/suspect_regions.yml\n";
2502
2503							#construct and print header
2504	0					0	$header_string = "file: Suspect Chromothriptic Regions\n";
2505	0					0	$header_string .= "bin_size:\t\t\t$bin_size\n";
2506	0					0	$header_string .= "localization_window_size:\t$localization_window_size\n";
2507	0					0	$header_string .= "\n";
2508	0					0	$header_string .= "genome_localization_score_weight:\t$genome_localization_weight\n";
2509	0					0	$header_string .= "chromosome_localization_score_weight:\t$chromosome_localization_weight\n";
2510	0					0	$header_string .= "cnv_score_weight:\t\t\t$cnv_weight\n";
2511	0					0	$header_string .= "translocation_score_weight:\t\t$translocation_weight\n";
2512	0					0	$header_string .= "insertion_breakpoint_score_weight:\t$insertion_breakpoint_weight\n";
2513	0					0	$header_string .= "loh_score_weight:\t\t\t$loh_weight\n";
2514	0					0	$header_string .= "tp53_mutation_score_weight:\t\t$tp53_mutated_weight\n";
2515	0					0	$header_string .= "\n";
2516	0					0	$header_string .= "min_mutation_density_z_score:\t$outlier_deviation\n";
2517	0					0	$header_string .= "---\n";
2518	0					0	$header_string .= "\n";
2519	0					0	print $OUTPUT_FILE $header_string;
2520
2521							#calculate a chromothripsis score for each region that was present in the suspect region array
2522							#store the results of the score calculation in a 2d array where elements in the first dimension correspond to each suspect region
2523							#and the elements in the second dimension are the results of the score calculation
2524	0					0	for (my $i = 0; $i < $suspect_regions_size; $i+=3){
2525	0					0	my @region_data = ();
2526	0					0	$region_data[0] = $suspect_regions[$i]; #chr
2527	0					0	$region_data[1] = $suspect_regions[$i+1]; #start
2528	0					0	$region_data[2] = $suspect_regions[$i+2]; #end
2529
2530	0					0	($region_data[3], $region_data[4], $region_data[5], $region_data[6], $region_data[7], $region_data[8], $region_data[9], $region_data[10], $region_data[11], $region_data[12], $region_data[13]) = calculate_score($region_data[0], $region_data[1], $region_data[2], $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $tp53_mutated, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref, $bin_size);
2531
2532							#add the results of the score calculation for this region to the array storing all the results
2533	0					0	push @suspect_region_data, [@region_data];
2534							}
2535
2536							#sort the results so that the region with the highest chromothriptic score will be printed
2537							#to the final report output file first
2538	9			9		6023	{use sort 'stable';
	9					18
	9					39
	0					0
2539	0					0	@suspect_region_data = sort {$b->[3] <=> $a->[3] } @suspect_region_data;
	0					0
2540							}#use sort 'stable'
2541
2542							#for each score that is generated print the score and the related statistics for that region
2543	0					0	foreach my $score_data (@suspect_region_data){
2544	0					0	my $chr = $score_data->[0]; #chr
2545	0					0	my $start = $score_data->[1]; #start
2546	0					0	my $end = $score_data->[2]; #end
2547
2548	0					0	my $score = sprintf("%.5f",$score_data->[3]);
2549	0					0	my $chr_z_score = $score_data->[4];
2550	0					0	my $region_density = sprintf("%e",$score_data->[5]);
2551
2552	0					0	my $cnv_number_hash_ref = $score_data->[6];
2553	0					0	my %cnv_number_hash;
2554							my $num_copy_num;
2555	0	0				0	if(defined($cnv_number_hash_ref)){
2556	0					0	%cnv_number_hash = %{$cnv_number_hash_ref};
	0					0
2557	0					0	$num_copy_num = keys %cnv_number_hash;
2558							}
2559							else{
2560	0					0	$num_copy_num = 0;
2561							}
2562
2563	0					0	my $cnv_density = sprintf("%e",$score_data->[7]);
2564
2565	0					0	my $intertranslocation_hash_ref = $score_data->[8];
2566	0					0	my $translocation_density = $score_data->[9];
2567	0					0	my %intertranslocation_hash;
2568							my $num_trans_chr;
2569	0	0				0	if(defined($intertranslocation_hash_ref)){
2570	0					0	%intertranslocation_hash = %{$intertranslocation_hash_ref};
	0					0
2571	0					0	$num_trans_chr = keys %intertranslocation_hash;
2572							}
2573							else{
2574	0					0	$num_trans_chr = 0;
2575							}
2576
2577	0					0	my $breakpoint_insertions_array_ref = $score_data->[10];
2578	0					0	my @breakpoint_insertions_array;
2579							my $breakpoint_percentage;
2580	0	0				0	if(defined($breakpoint_insertions_array_ref)){
2581	0					0	@breakpoint_insertions_array = @$breakpoint_insertions_array_ref;
2582	0					0	$breakpoint_percentage = sprintf("%.2f",($breakpoint_insertions_array[0]/$breakpoint_insertions_array[1])*100);
2583							}
2584
2585	0					0	my $loh_size = $score_data->[11];
2586	0					0	my $hz_size = $score_data->[12];
2587	0					0	my $percent_hz_lost;
2588	0	0	0			0	if(defined($loh_size) && defined($hz_size)){
2589	0					0	$percent_hz_lost = sprintf("%.2f",($loh_size/$hz_size)*100);
2590							}
2591
2592	0					0	my @score_array = @{$score_data->[13]};
	0					0
2593
2594	0					0	my $chr_density = $genome_mutation_density_hash{$chr};
2595
2596	0					0	my $print_string;
2597
2598	0					0	$print_string = "chromosome:\t$chr\n";
2599	0					0	$print_string .= "start:\t\t$start\n";
2600	0					0	$print_string .= "end:\t\t$end\n";
2601	0					0	$print_string .= "\n";
2602
2603	0					0	$print_string .= "final_score:\t\t\t$score\n";
2604	0					0	$print_string .= "genome_localization_score:\t".$score_array[2]*$genome_localization_weight."\t(".$score_array[2].")"."\n";
2605	0					0	$print_string .= "chromosome_localization_score:\t".$score_array[1]*$chromosome_localization_weight."\t(".$score_array[1].")"."\n";
2606	0					0	$print_string .= "cnv_score:\t\t\t".$score_array[0]*$cnv_weight."\t(".$score_array[0].")"."\n";
2607	0					0	$print_string .= "translocation_score:\t\t".$score_array[3]*$translocation_weight."\t(".$score_array[3].")"."\n";
2608	0					0	$print_string .= "insertion_breakpoint_score:\t".$score_array[4]*$insertion_breakpoint_weight."\t(".$score_array[4].")"."\n";
2609	0					0	$print_string .= "loh_score:\t\t\t".$score_array[5]*$loh_weight."\t(".$score_array[5].")"."\n";
2610	0					0	$print_string .= "tp53_score:\t\t\t".$score_array[6]*$tp53_mutated_weight."\t(".$score_array[6].")\n";
2611
2612	0					0	$print_string .= "\n";
2613
2614	0					0	$print_string .= "mutation_density_of_region:\t$region_density\n";
2615	0					0	$print_string .= "mutation_density_of_chromosome:\t$chr_density\n";
2616	0					0	$print_string .= "standard_deviations_from_mean_of_chromosome_mutation_density:\t$chr_z_score\n";
2617	0					0	$print_string .= "\n";
2618
2619	0					0	$print_string .= "density_of_copy_number_switches: $cnv_density\n";
2620	0					0	$print_string .= "number_of_aberrant_copy_number_states:\t$num_copy_num\n";
2621	0	0				0	if($num_copy_num>0){
2622	0					0	$print_string .= "aberrant_copy_number_states:\n";
2623	9			9		7085	{use sort 'stable';
	9					30
	9					47
	0					0
2624	0					0	foreach my $key (sort {$cnv_number_hash{$b} <=> $cnv_number_hash{$a} } keys %cnv_number_hash){
	0					0
2625	0					0	$print_string .= "\t$key:\t$cnv_number_hash{$key}\n";
2626							}
2627							}#use sort 'stable'
2628							}
2629
2630	0					0	$print_string .= "\n";
2631
2632	0					0	$print_string .= "density_of_translocation_breakpoints: $translocation_density\n";
2633	0					0	$print_string .= "number_of_translocation_chromosomes:\t$num_trans_chr\n";
2634
2635	0	0				0	if($num_trans_chr>0){
2636	0					0	$print_string .= "translocation_chromosomes:\n";
2637	9			9		1488	{use sort 'stable';
	9					14
	9					34
	0					0
2638	0					0	foreach my $key (sort {$intertranslocation_hash{$b} <=> $intertranslocation_hash{$a} } keys %intertranslocation_hash){
	0					0
2639	0					0	$print_string .= "\t$key:\t$intertranslocation_hash{$key}\n";
2640							}
2641							}#use sort 'stable'
2642							}
2643	0					0	$print_string .= "\n";
2644
2645	0	0				0	if(defined($breakpoint_insertions_array_ref)){
2646	0					0	$print_string .= "insertion_data:\n";
2647	0					0	$print_string .= "\tinsertions_found_at_translocation_breakpoints:\t$breakpoint_insertions_array[0]\n";
2648	0					0	$print_string .= "\ttotal_translocation_breakpoints:\t$breakpoint_insertions_array[1]\n";
2649	0					0	$print_string .= "\tpercentage:\t$breakpoint_percentage"."%\n";
2650	0					0	$print_string .= "\n";
2651							}
2652
2653	0	0				0	if($loh_size!=-1){
2654	0					0	$print_string .= "loh_data:\n";
2655	0					0	$print_string .= "\ttotal_size_of_loh:\t$loh_size\n";
2656	0					0	$print_string .= "\ttotal_size_of_original_heterozygosity:\t$hz_size\n";
2657	0					0	$print_string .= "\tpercent_heterozygosity_lost:\t$percent_hz_lost"."%\n";
2658	0					0	$print_string .= "\n";
2659							}
2660
2661	0	0	0			0	if($tp53_mutated && $tp53_mutation_found){
		0
		0
2662	0					0	$print_string .= "tp53_mutation_present:\t1 (forced and mutations found)\n";
2663							}
2664							elsif($tp53_mutated){
2665	0					0	$print_string .= "tp53_mutation_present:\t1 (forced)\n";
2666							}
2667							elsif($tp53_mutation_found){
2668	0					0	$print_string .= "tp53_mutation_present:\t1 (mutations found)\n";
2669							}
2670							else{
2671	0					0	$print_string .= "tp53_mutation_present:\t0\n";
2672							}
2673
2674
2675	0					0	$print_string .= "---\n";
2676	0					0	$print_string .= "\n";
2677	0					0	print $OUTPUT_FILE $print_string;
2678	0					0	$print_string = "";
2679							}
2680
2681	0					0	print $OUTPUT_FILE "...";
2682	0					0	close($OUTPUT_FILE);
2683							}#sub analyze_suspect_regions
2684
2685
2686							#=head2 Sub-Method: analyze_likely_regions
2687
2688							### analyze_likely_regions ########################################################################
2689							# Description:
2690							# Generates an output file that lists the regions that have a mutation density that is less
2691							# than the outlier cut off but greater than or equal to the outlier cut off - 1
2692							#
2693							# Input variables:
2694							# $output_directory: stores path to the output directory
2695							# $likely_regions_array_ref: reference to array storing the chromosome, start,
2696							# and end position of highly mutated regions
2697							# $genome_mutation_density_hash_ref: stores the average mutation density of each
2698							# chromosome
2699							# $genome_cnv_data_hash_ref: stores the position of CNV mutations on each
2700							# chromosome
2701							# $genome_trans_data_hash_ref: stores the position of translocation events on each
2702							# chromosome
2703							# $bin_size: stores the size of a single bin
2704							#
2705
2706							#=cut
2707
2708							sub analyze_likely_regions {
2709
2710							#parse parameters
2711	1			1	0	9	my $output_directory = shift;
2712
2713	1					2	my $likely_regions_array_ref = shift;
2714	1					18	my @likely_regions = @$likely_regions_array_ref;
2715
2716	1					1	my $genome_mutation_density_hash_ref = shift;
2717	1					3	my $genome_cnv_data_hash_ref = shift;
2718	1					1	my $genome_trans_data_hash_ref = shift;
2719	1					1	my $bin_size = shift;
2720
2721	1					2	my $likely_regions_size = @likely_regions;
2722
2723	1					1	my $OUTPUT_FILE;
2724
2725							my @return_vals;
2726
2727	1					2	my @likely_region_data = (); #stores start, end, chromsome and mutation density for each region
2728
2729							#check that the likely region array is not malformed, should contain sets of 3 elements
2730	1	50				4	if($likely_regions_size % 3 != 0){
2731	0					0	die "ERROR: suspect_regions_array has $likely_regions_size entries. Value must be divisible by 3.\n";
2732							}
2733
2734
2735							#create output directory
2736	1	50				116	mkdir ("$output_directory"."suspect_regions",0770) unless (-d "$output_directory"."suspect_regions");
2737	1	50				13	if(!(-e "$output_directory"."suspect_regions")){
2738	0					0	die "ERROR: could not create folder $output_directory"."suspect_regions";
2739							}
2740
2741							#create output file
2742	1	50				75	open ($OUTPUT_FILE, ">", "$output_directory"."suspect_regions/likely_regions.log") or die "ERROR: could not create file: $output_directory"."suspect_regions/likely_regions.log\n";
2743
2744							#print file header
2745	1					11	print $OUTPUT_FILE "Likely Chromothriptic Regions\n";
2746	1					4	print $OUTPUT_FILE "High Mutation Density Z-Score:\t$outlier_deviation\n";
2747	1					2	print $OUTPUT_FILE "Min Mutation Density Z-Score:\t$outlier_deviation-1\n";
2748	1					2	print $OUTPUT_FILE "---------------------------------------\n";
2749	1					1	print $OUTPUT_FILE "#chr\tstart\tend\tmutation_density";
2750
2751							#for each likely region calculate the mutation density for the region and store it in the likely_region_data array
2752	1					5	for (my $i = 0; $i < $likely_regions_size; $i+=3){
2753	13					22	my @region_data = ();
2754	13					34	$region_data[0] = $likely_regions[$i]; #chr
2755	13					21	$region_data[1] = $likely_regions[$i+1]; #start
2756	13					23	$region_data[2] = $likely_regions[$i+2]; #end
2757
2758	13					45	($region_data[3],$region_data[4]) = calculate_region_mutation_density_score($region_data[0], $region_data[1], $region_data[2], $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $bin_size);
2759
2760	13					161	push @likely_region_data, [@region_data];
2761							}
2762
2763							#sort the regions by density, largest to smallest
2764	9			9		7599	{use sort 'stable';
	9					24
	9					49
	1					4
2765	1					8	@likely_region_data = sort {$b->[3] <=> $a->[3] } @likely_region_data;
	34					51
2766							}#use sort 'stable'
2767
2768							#print the density for each region to the output file
2769	1					5	foreach my $i (@likely_region_data){
2770	13					17	my $chr = $i->[0]; #chr
2771	13					9	my $start = $i->[1]; #start
2772	13					11	my $end = $i->[2]; #end
2773
2774	13					12	my $region_density = $i->[3]; #mutation density
2775
2776	13					13	print $OUTPUT_FILE "\n";
2777	13					45	print $OUTPUT_FILE "$chr\t$start\t$end\t$region_density";
2778	13					33	push (@return_vals,$chr,$start,$end,$region_density);
2779							}
2780
2781	1					98	close($OUTPUT_FILE);
2782	1					23	return(\@return_vals);
2783							}#sub analyze_likely_regions
2784
2785
2786							#=head2 Sub-Method: calculate_score
2787
2788							### calculate_score ###############################################################################
2789							# Description:
2790							# Calculates the chromothriptic score for the given region. Calls sub methods to generate the
2791							# score for each hallmark
2792							#
2793							# Input variables:
2794							# $chr: stores the chromosome on which the region
2795							# is found
2796							# $start: stores the start base pair of the region
2797							# $end: stores the end base pair of the region
2798							# $genome_cnv_data_hash_ref: stores the position of CNV mutations on
2799							# each chromosome
2800							# $genome_trans_data_hash_ref: stores the position of translocation events
2801							# on each chromosome
2802							# $genome_mutation_density_hash_ref: stores the average mutation density of each
2803							# chromosome
2804							# $genome_trans_insertion_breakpoints_hash_ref: stores the position of insertions on each
2805							# chromosome
2806							# $tp53_mutated: stores whether the TP53 gene is mutated
2807							# or not
2808							# $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on
2809							# each chromosome
2810							# $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on
2811							# each chromosome
2812							# $bin_size: stores the size of a single bin
2813							#
2814
2815							#=cut
2816
2817							sub calculate_score{
2818
2819							#parse parameters
2820	0			0	0	0	my $chr = shift;
2821	0					0	my $start = shift;
2822	0					0	my $end = shift;
2823
2824	0					0	my $genome_cnv_data_hash_ref = shift;
2825	0					0	my $genome_trans_data_hash_ref = shift;
2826	0					0	my $genome_mutation_density_hash_ref = shift;
2827	0					0	my $genome_trans_insertion_breakpoints_hash_ref = shift;
2828	0					0	my $tp53_mutated = shift;
2829	0					0	my $chromosome_cnv_breakpoints_hash_ref = shift;
2830	0					0	my $chromosome_loh_breakpoints_hash_ref = shift;
2831	0					0	my $bin_size = shift;
2832
2833							#initialize variable to store scores for each hallmark
2834	0					0	my $cnv_score = 0;
2835	0					0	my $mutation_density_score = 0;
2836	0					0	my $genome_localization_score = 0;
2837	0					0	my $translocation_score = 0;
2838	0					0	my $insertion_breakpoint_score = 0;
2839	0					0	my $loh_score = 0;
2840	0					0	my $final_score = 0;
2841
2842	0					0	my @score_array; #array in which hallmark scores will be returned
2843
2844							my $chr_mutation_density; #stores the average mutation density of the chromosome where the region is found
2845	0					0	my $chr_z_score; #stores the z_score of the mutation density of the chromosome where the region is found vs
2846							#all the other chromosomes
2847	0					0	my $cnv_number_hash_ref; #stores a hash that contains the number of regions of each abberant copy-number
2848	0					0	my $cnv_density; #stores the density of cnv mutations in the region
2849	0					0	my $translocation_density; #stores the density of translocation mutations in the region
2850	0					0	my $mutation_density; #stores the density of all mutations in the region
2851	0					0	my $intertranslocation_hash_ref; #stores the number of translocations between all other chromosomes and the region
2852	0					0	my $breakpoint_insertions_array_ref; #stores the total number of translocation breakpoints, and the number that have insertions nearby
2853	0					0	my $loh_size = -1; #stores the amount of heterozygosity that was lost in the region
2854	0					0	my $heterozygous_size; #stores the original amount of heterozygosity in the region
2855
2856	0					0	($cnv_score, $cnv_number_hash_ref, $cnv_density) = calculate_copy_number_scores($chr, $start, $end, $genome_cnv_data_hash_ref, $bin_size);
2857	0					0	($genome_localization_score, $chr_z_score, $chr_mutation_density) = calculate_genome_localization_score($chr, $genome_mutation_density_hash_ref);
2858	0					0	($translocation_score, $intertranslocation_hash_ref, $translocation_density) = calculate_translocation_score($chr, $start, $end, $genome_trans_data_hash_ref, $bin_size);
2859
2860	0	0				0	if(defined($genome_trans_insertion_breakpoints_hash_ref)){
2861	0					0	($insertion_breakpoint_score, $breakpoint_insertions_array_ref) = calculate_insertion_breakpoint_score($chr, $start, $end, $genome_trans_data_hash_ref, $genome_trans_insertion_breakpoints_hash_ref, $bin_size);
2862							}
2863
2864	0					0	($mutation_density, $mutation_density_score) = calculate_region_mutation_density_score($chr, $start, $end, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref, $genome_mutation_density_hash_ref, $bin_size);
2865
2866	0	0				0	if(defined($chromosome_loh_breakpoints_hash_ref)){
2867	0					0	($loh_score, $loh_size, $heterozygous_size) = calculate_loh_score($chr, $start, $end, $chromosome_cnv_breakpoints_hash_ref, $chromosome_loh_breakpoints_hash_ref);
2868							}
2869
2870							#calculate overall score for region based on hallmark weights and scores
2871	0					0	$final_score = ($cnv_score$cnv_weight) + ($mutation_density_score$chromosome_localization_weight) + ($genome_localization_score$genome_localization_weight) + ($translocation_score$translocation_weight) + ($insertion_breakpoint_score$insertion_breakpoint_weight) + ($tp53_mutated$tp53_mutated_weight) + ($loh_score*$loh_weight);
2872
2873							#push the hallmark scores into the score array
2874	0					0	push (@score_array, ($cnv_score, $mutation_density_score, $genome_localization_score, $translocation_score, $insertion_breakpoint_score, $loh_score, $tp53_mutated));
2875
2876							#return the scores and other region statistics
2877	0					0	return ($final_score, $chr_z_score, $mutation_density, $cnv_number_hash_ref, $cnv_density, $intertranslocation_hash_ref , $translocation_density, $breakpoint_insertions_array_ref, $loh_size, $heterozygous_size, \@score_array);
2878							}#sub calculate_score
2879
2880
2881
2882							#=head2 Sub-Method: calculate_copy_number_scores
2883
2884							### calculate_copy_number_scores #################################################################
2885							# Description:
2886							# Calculates the score for the copy-number variation hallmark
2887							#
2888							# Input variables:
2889							# $chr: stores the chromosome where the region is located
2890							# $start: stores the starting location of the region
2891							# $end: stores the end location of the region
2892							# $genome_cnv_data_hash_ref: stores the position of CNV mutations on each chromosome
2893							# $bin_size: stores the size of single bin
2894							#
2895
2896							#=cut
2897
sub calculate_copy_number_scores {
    # Scores the copy-number variation hallmark for one region.
    #
    # Parameters (positional):
    #   $chr                      - chromosome where the region is located
    #   $start, $end              - first and last position of the region
    #   $genome_cnv_data_hash_ref - hash ref: chromosome => array ref of bins; each
    #                               bin is a hash ref holding 'BPcount' plus one
    #                               entry per copy number
    #   $bin_size                 - size of a single bin
    #
    # Returns a three element list:
    #   (score, hash ref of copy number => number of regions, CNV breakpoint density)
    #
    # Degenerate inputs (no data for the chromosome, no CNV entries in the region,
    # zero-length region, or a score term whose logarithm/reciprocal is undefined)
    # produce a score/density of 0 instead of a fatal arithmetic error.

    #parse parameters
    my ($chr, $start, $end, $genome_cnv_data_hash_ref, $bin_size) = @_;

    my $cnv_score = 0;          #final score to return
    my %cnv_number_hash;        #key: copy number, value: number of regions with that copy number
    my $cnv_switch_count = 0;   #number of switches between different copy numbers
    my $cnv_switch_density = 0; #density of cnv events in the region

    #work on the stored structure through its reference; copying the whole
    #genome hash and the chromosome's bin array on every call is pure overhead
    my $chr_bins = $genome_cnv_data_hash_ref->{$chr};

    #no cnv data for the chromosome
    if (!defined($chr_bins)) {
        return ($cnv_score, \%cnv_number_hash, $cnv_switch_density);
    }

    #index of the first and last bin touched by the region; truncating to whole
    #indices keeps a region that is not bin-aligned from stepping into the bin
    #after its last one (fractional indices did that)
    my $first_bin = int($start / $bin_size);
    my $last_bin  = int($end / $bin_size);
    if ($last_bin > $#{$chr_bins}) {
        $last_bin = $#{$chr_bins}; #bins past the end of the array hold no data
    }

    #collect the data from the bins that contain the region
    for my $i ($first_bin .. $last_bin) {
        my $bin = $chr_bins->[$i];
        next if !defined($bin);

        for my $key (keys %{$bin}) {
            if ($key eq 'BPcount') {
                $cnv_switch_count += $bin->{$key};
            }
            else {
                $cnv_number_hash{$key} += $bin->{$key};
            }
        }
    }

    #calculate the breakpoint density of cnv mutations for the region
    my $region_length = $end - $start;
    if ($region_length != 0) {
        $cnv_switch_density = $cnv_switch_count / $region_length;
    }

    #if no cnv mutations were found return a score of 0
    my $copy_number_kinds = scalar(keys %cnv_number_hash);
    if ($copy_number_kinds == 0) {
        return (0, \%cnv_number_hash, $cnv_switch_density);
    }

    #round the per-copy-number counts up and accumulate them for the mean
    my $mean = 0;
    for my $key (keys %cnv_number_hash) {
        $cnv_number_hash{$key} = POSIX::ceil($cnv_number_hash{$key});
        $mean += $cnv_number_hash{$key};
    }
    $mean = $mean / $copy_number_kinds;

    #population standard deviation of the number of regions of each copy-number
    my $SD = 0;
    for my $key (keys %cnv_number_hash) {
        $SD += ($cnv_number_hash{$key} - $mean)**2;
    }
    $SD = ($SD / $copy_number_kinds)**0.5;

    #determine which copy-numbers are significant (ie are not low outliers)
    my %cnv_significant;
    my $square_sum = 0;
    for my $key (keys %cnv_number_hash) {
        if (
            ( $SD == 0 ) ||
            ( (($cnv_number_hash{$key} - $mean) / $SD) >= -1 * $outlier_deviation )
        ) {
            $cnv_significant{$key} = $cnv_number_hash{$key};
            $square_sum += $cnv_significant{$key}**2;
        }
    }

    #score calculation
    my $significant_kinds = scalar(keys %cnv_significant);
    if ($significant_kinds == 0) {
        return (0, \%cnv_number_hash, $cnv_switch_density);
    }

    my $mean_square = $square_sum / $significant_kinds;
    if ($mean_square <= 0) {
        #log() of a non-positive value is undefined
        return (0, \%cnv_number_hash, $cnv_switch_density);
    }

    my $log_term = (log($mean_square) / log(2)) + 1;
    if ($log_term == 0) {
        #the reciprocal below would divide by zero
        return (0, \%cnv_number_hash, $cnv_switch_density);
    }

    $cnv_score = (1 - (1 / $log_term)) / $significant_kinds;

    return ($cnv_score, \%cnv_number_hash, $cnv_switch_density);

}#sub calculate_copy_number_scores
3006
3007
3008							#=head2 Sub-Method: calculate_genome_localization_score
3009
3010							### calculate_genome_localization_score ##########################################################
3011							# Description:
3012							# Calculates the genome localization hallmark score
3013							#
3014							# Input variables:
3015							# $chr: store the chromosome where the region is located
3016							# $genome_mutation_density_hash_ref: stores the average mutation density of each
3017							# chromosome
3018							#
3019
3020							#=cut
3021
sub calculate_genome_localization_score {
    # Scores the genome localization hallmark: how far the mutation density of
    # the suspect chromosome lies above the mean density of the chromosomes.
    #
    # Parameters (positional):
    #   $chr                              - chromosome where the region is located
    #   $genome_mutation_density_hash_ref - hash ref: chromosome => mutation density
    #
    # Returns a three element list:
    #   (score, z-score, mutation density of $chr)
    # The score is the upper-tail p-value rescaled to 0..1; it is 0 when the
    # chromosome is at or below the mean, or when the mean or standard deviation
    # is 0.
    #
    # The mean and the standard deviation are now both computed from the stored
    # densities. Previously every density was additionally divided by the length
    # of the *suspect* chromosome while summing the mean only, which put the mean
    # in different units from the deviation and the z-score and made the
    # "z-score below 0" case practically unreachable.

    #parse parameters
    my ($chr, $genome_mutation_density_hash_ref) = @_;

    #the divisor is fixed at 24 rather than the number of keys present
    #NOTE(review): presumably the 24 human chromosomes (1-22, X, Y) - confirm
    my $chromosome_count = 24;

    #read mutation density for the chromosome (undef when the chromosome has no entry)
    my $chr_mutation_density = $genome_mutation_density_hash_ref->{$chr};

    #mean mutation density of the chromosomes
    my $mean_density = 0;
    for my $density (values %{$genome_mutation_density_hash_ref}) {
        $mean_density += $density;
    }
    $mean_density = $mean_density / $chromosome_count;

    #standard deviation of the mutation densities of the chromosomes
    my $standard_deviation = 0;
    for my $density (values %{$genome_mutation_density_hash_ref}) {
        $standard_deviation += ($density - $mean_density)**2;
    }
    $standard_deviation = ($standard_deviation / $chromosome_count)**0.5;

    #check for case where the standard deviation or mean comes back as 0
    if ($mean_density == 0 || $standard_deviation == 0) {
        return (0, 0, $chr_mutation_density);
    }

    #calculate the z-score for suspect chromosome; a chromosome without an entry
    #is treated as having density 0 for the comparison
    my $observed_density = defined($chr_mutation_density) ? $chr_mutation_density : 0;
    my $z_score = ($observed_density - $mean_density) / $standard_deviation;

    #only the top tail counts: below-average chromosomes score 0
    if ($z_score < 0) {
        return (0, $z_score, $chr_mutation_density);
    }

    #calculate the p-value from the z-score and rescale the upper tail to 0..1
    my $p_val = Statistics::Distributions::uprob($z_score);
    $p_val = (0.5 - $p_val) / 0.5;

    #p_val is the score for this hallmark
    return ($p_val, $z_score, $chr_mutation_density);

}#sub calculate_genome_localization_score
3078
3079
3080							#=head2 Sub-Method: calculate_region_mutation_density_score
3081
3082							### calculate_region_mutation_density_score ######################################################
3083							# Description:
3084							# Calculates the chromosome localization hallmark score
3085							#
3086							# Input variables:
3087							# $chr: chromosome where the region is located
3088							# $start: starting location of the region
3089							# $end: end location of the region
3090							# $genome_cnv_data_hash_ref: stores the position of CNV mutations on each
3091							# chromosome
3092							# $genome_trans_data_hash_ref: stores the position of translocation events
3093							# on each chromosome
3094							# $genome_mutation_density_hash_ref: stores the average mutation density of each
3095							# chromosome
3096							# $bin_size: stores the size of single bin
3097							#
3098
3099							#=cut
3100
sub calculate_region_mutation_density_score {
    # Scores the chromosome localization hallmark: compares the mutation density
    # of the region with the mean mutation density of its chromosome.
    #
    # Parameters (positional):
    #   $chr                              - chromosome where the region is located
    #   $start, $end                      - first and last position of the region
    #   $genome_cnv_data_hash_ref         - hash ref: chromosome => array ref of CNV bins
    #   $genome_trans_data_hash_ref       - hash ref: chromosome => array ref of
    #                                       translocation bins
    #   $genome_mutation_density_hash_ref - hash ref: chromosome => mean mutation density
    #   $bin_size                         - size of a single bin
    # Each bin is a hash ref whose 'BPcount' entry is the number of breakpoints.
    #
    # Returns (mutation density of the region, hallmark score).
    #
    # When the statistic cannot be computed (no mutations in the region, zero
    # length region, missing or non-positive chromosome mean, or a mean of exactly
    # 1 whose logarithm is 0) the score is 0 instead of a fatal log()/division error.

    #parse parameters
    my ($chr, $start, $end, $genome_cnv_data_hash_ref, $genome_trans_data_hash_ref,
        $genome_mutation_density_hash_ref, $bin_size) = @_;

    #index of the first and last bin touched by the region (whole indices, so an
    #unaligned region cannot step into the bin after its last one)
    my $first_bin = int($start / $bin_size);
    my $last_bin  = int($end / $bin_size);

    #get the mean mutation density for the suspect chromosome
    my $mean_chr_mutation_density = $genome_mutation_density_hash_ref->{$chr};

    #sum the breakpoints straight from the stored bins; the genome-wide hashes
    #and per-chromosome arrays are not copied
    my $cnv_count   = _count_region_breakpoints($genome_cnv_data_hash_ref->{$chr}, $first_bin, $last_bin);
    my $trans_count = _count_region_breakpoints($genome_trans_data_hash_ref->{$chr}, $first_bin, $last_bin);

    #divide the count by 2 to get the number of translocation events
    $trans_count = POSIX::ceil($trans_count / 2);

    #calculate the total number events in the region
    my $mutation_count = $cnv_count + $trans_count;

    #calculate the mutation density for the region
    my $region_length = $end - $start;
    my $mutation_density = 0;
    if ($region_length != 0) {
        $mutation_density = $mutation_count / $region_length;
    }

    #the statistic needs log() of both densities and divides by log(mean)
    if (
        ( $mutation_density <= 0 ) ||
        ( !defined($mean_chr_mutation_density) ) ||
        ( $mean_chr_mutation_density <= 0 ) ||
        ( $mean_chr_mutation_density == 1 )
    ) {
        return ($mutation_density, 0);
    }

    #calculate the chi squared statistic
    my $log_mean = log($mean_chr_mutation_density);
    my $chis_stat = abs(((log($mutation_density) - $log_mean)**2) / $log_mean);

    #generate a p-value using the chi squared test
    my $mutation_density_score = 1 - (Statistics::Distributions::chisqrprob(1, $chis_stat));

    return ($mutation_density, $mutation_density_score);
}#calculate_region_mutation_density_score

# Private helper: sums the 'BPcount' entries of the bins $first_bin..$last_bin
# of one chromosome's bin array. Returns 0 when the chromosome has no data.
sub _count_region_breakpoints {
    my ($bins_ref, $first_bin, $last_bin) = @_;

    return 0 if !defined($bins_ref);

    #bins past the end of the array hold no data
    if ($last_bin > $#{$bins_ref}) {
        $last_bin = $#{$bins_ref};
    }

    my $breakpoint_count = 0;
    for my $i ($first_bin .. $last_bin) {
        my $bin = $bins_ref->[$i];
        next if !defined($bin);
        $breakpoint_count += ($bin->{'BPcount'} || 0);
    }

    return $breakpoint_count;
}#sub _count_region_breakpoints
3188
3189
3190							#=head2 Sub-Method: calculate_translocation_score
3191
3192							### calculate_translocation_score #################################################################
3193							# Description:
3194							# Calculates the translocation hallmark score
3195							#
3196							# Input variables:
3197							# $chr: chromosome where the region is located
3198							# $start: starting location of the region
3199							# $end: end location of the region
3200							# $genome_trans_data_hash_ref: stores the position of translocation events on each
3201							# chromosome
3202							# $bin_size: stores the size of single bin
3203							#
3204
3205							#=cut
3206
sub calculate_translocation_score {
    # Scores the translocation hallmark for one region.
    #
    # Parameters (positional):
    #   $chr                        - chromosome where the region is located
    #   $start, $end                - first and last position of the region
    #   $genome_trans_data_hash_ref - hash ref: chromosome => array ref of bins; each
    #                                 bin is a hash ref that may hold 'in' and 'out',
    #                                 each a hash ref: partner chromosome => array ref
    #                                 of breakpoint positions
    #   $bin_size                   - size of a single bin
    #
    # Uses the file-level settings $outlier_deviation, $expected_mutation_density
    # and $translocation_cut_off_count and the helper standard_deviation_and_mean().
    #
    # Returns a three element list:
    #   (score, hash ref of partner chromosome => number of events, translocation density)
    #
    # Divisions that used to be fatal for degenerate regions (zero length, no
    # countable events, no usable breakpoint distances, weighted sum of 0) now
    # contribute 0.

    #parse parameters
    my ($chr, $start, $end, $genome_trans_data_hash_ref, $bin_size) = @_;

    my $translocation_score = 0;   #final hallmark score
    my $translocation_density = 0; #density of translocation events in the region

    my %trans_number_hash; #key: chromosome eg 1,2,X,Y
                           #value: the number of events between the chromosome and the region
    my %trans_breakpoints; #key: chromosome eg 1,2,X,Y
                           #value: array of the translocation breakpoint positions

    #read the bins through the stored reference instead of copying them
    my $chr_bins = $genome_trans_data_hash_ref->{$chr};

    #no translocation data for the chromosome
    if (!defined($chr_bins)) {
        return ($translocation_score, \%trans_number_hash, $translocation_density);
    }

    #index of the first and last bin touched by the region (whole indices, so an
    #unaligned region cannot step into the bin after its last one)
    my $first_bin = int($start / $bin_size);
    my $last_bin  = int($end / $bin_size);
    if ($last_bin > $#{$chr_bins}) {
        $last_bin = $#{$chr_bins}; #bins past the end of the array hold no data
    }

    #for each bin sum the number of translocation events
    #and record the position of breakpoints in the trans_breakpoints hash
    for my $i ($first_bin .. $last_bin) {
        my $bin = $chr_bins->[$i];
        next if !defined($bin);

        #analyze the breakpoints from translocation into the region
        if (defined($bin->{'in'})) {
            my $incoming = $bin->{'in'};
            for my $key (keys %{$incoming}) {
                my @positions = @{$incoming->{$key}};

                #two breakpoints make one translocation event
                $trans_number_hash{$key} += scalar(@positions) / 2;
                push(@{$trans_breakpoints{$key}}, @positions);
            }
        }

        #analyze the breakpoints from translocation out of the region
        if (defined($bin->{'out'})) {
            my $outgoing = $bin->{'out'};
            for my $key (keys %{$outgoing}) {
                my @positions = @{$outgoing->{$key}};
                my $breakpoints = scalar(@positions);

                #breakpoints on the region's own chromosome that fall strictly
                #inside the region are not counted
                if ($key eq $chr) {
                    foreach my $val (@positions) {
                        if ($val > $start && $val < $end) {
                            $breakpoints--;
                        }
                    }
                }

                #two breakpoints make one translocation event
                $trans_number_hash{$key} += $breakpoints / 2;
                push(@{$trans_breakpoints{$key}}, @positions);
            }
        }
    }

    #check that some translocation events were found else return a score of 0
    if (keys(%trans_number_hash) == 0) {
        return (0, \%trans_number_hash, 0);
    }

    my $event_count = 0;
    for my $key (keys %trans_number_hash) {
        #round the event count up since if only one breakpoint was present at the end
        #of the region we still count that as a whole translocation event
        $trans_number_hash{$key} = POSIX::ceil($trans_number_hash{$key});

        #sum the number of translocation events in the region
        $event_count += $trans_number_hash{$key};
    }

    #calculate the translocation density for the region
    my $region_length = $end - $start;
    if ($region_length != 0) {
        $translocation_density = $event_count / $region_length;
    }

    #calculate the mean and standard deviation of the number of translocation between the region
    #and each chromosome
    my ($SD, $mean) = standard_deviation_and_mean(\%trans_number_hash, 0);

    #identify chromosomes that have a high number of translocations to or from the region and
    #add them to the significant chromosome list
    my @significant_chrs;
    for my $key (keys %trans_number_hash) {
        if (
            ( $SD == 0 ) ||
            ( (($trans_number_hash{$key} - $mean) / $SD) > -1 * $outlier_deviation )
        ) {
            push(@significant_chrs, $key);
        }
    }

    #calculate the total number of translocations from significant chromosomes
    my $translocation_count = 0;
    foreach my $key (@significant_chrs) {
        $translocation_count += $trans_number_hash{$key};
    }

    my $weighted_sum = 0.00;  #component of the score calculation
    my $spread_factor = 0;    #kept outside the loop: the diagnostics below print the last value

    #for each significant chromosome calculate the spread between translocation events
    foreach my $key (@significant_chrs) {
        #sort the breakpoints numerically (plain numbers, so stability is irrelevant)
        my @sorted = sort { $a <=> $b } @{$trans_breakpoints{$key}};
        @{$trans_breakpoints{$key}} = @sorted;

        #calculate and store the distance between adjacent breakpoints
        my @diffs;
        for my $i (1 .. $#sorted) {
            push(@diffs, $sorted[$i] - $sorted[$i - 1]);
        }

        my $diff_sum = 0;
        my $diff_count = 0;

        #with fewer than two breakpoints there is no distance to measure
        if (@diffs) {
            #calculate the standard deviation and mean for the distance between breakpoints
            my ($diff_SD, $diff_mean) = standard_deviation_and_mean(\@diffs, 1);

            #sum the distances that are not high outliers, indicating distance between 2 translocation
            #events and not distance between breakpoints of the same event
            foreach my $val (@diffs) {
                if (
                    ( $diff_SD == 0 ) ||
                    ( (($val - $diff_mean) / $diff_SD) < $outlier_deviation )
                ) {
                    $diff_sum += $val;
                    $diff_count++;
                }
            }
        }

        #calculate the average spread of translocation breakpoints (0 when no distance qualified)
        my $spread = ($diff_count > 0) ? $diff_sum / $diff_count : 0;

        #calculate the spread factor for the chromosome
        $spread_factor = (log($spread + 1) / log(10)) / ((log($expected_mutation_density) / log(10)) * -1);
        if ($spread_factor == 0) {
            $spread_factor = 1;
        }

        #increase the weighted sum based on the number of translocation events and their spread multiplied by the proportion of translocations
        #from this specific chromosome relative to the total number of translocation events
        if ($translocation_count != 0) {
            $weighted_sum += ($trans_number_hash{$key} / $spread_factor) * ($trans_number_hash{$key} / $translocation_count);
        }
    }#foreach my $key (@significant_chrs)

    my $size = @significant_chrs;

    #calculate second term of score; a weighted sum of 0 (or one for which the
    #logarithm is undefined) scores 0 instead of dividing by zero
    my $log_term = ($weighted_sum > -1) ? log(1 + $weighted_sum) / log(2) : 0;
    $translocation_score = ($log_term != 0) ? (1 - (1 / $log_term)) : 0;

    #calculate first term of score: each significant chromosome beyond the second
    #reduces the score by 10%, and reaching the cut-off count removes it entirely
    if ($size < $translocation_cut_off_count && $size > 2) {
        $translocation_score = (1 - (0.10 * ($size - 2))) * $translocation_score;
    }
    if ($size >= $translocation_cut_off_count) {
        $translocation_score = 0;
    }

    #diagnostics for a score outside the expected range
    if ($translocation_score >= 1) {
        print "score: ".$translocation_score."\n";
        print "ws: ".$weighted_sum."\n";
        print "sf: ".$spread_factor."\n";
        print "term 1: ";
        print( ($size > 0) ? (1 / (1 + (log($size) / log(4)))) : 'undefined' );
        print "\n";
        print "term 2: ";
        #log() is only defined for a positive weighted sum, and a sum of 1 would divide by zero
        print( ($weighted_sum > 0 && $weighted_sum != 1) ? (1 - (1 / (log($weighted_sum) / log(2)))) : 'undefined' );
        print "\n";
    }

    return ($translocation_score, \%trans_number_hash, $translocation_density);

}#sub calculate_translocation_score
3444
3445
3446							#=head2 Sub-Method: calculate_insertion_breakpoint_score
3447
3448							### calculate_insertion_breakpoint_score ##########################################################
3449							# Description:
3450							# Calculates the insertions at translocation breakpoints hallmark score
3451							#
3452							# Input variables:
3453							# $chr: chromosome where the region is located
3454							# $start: starting location of the region
3455							# $end: end location of the region
3456							# $genome_trans_data_hash_ref: stores the position of translocation events
3457							# on each chromosome
3458							# $genome_trans_insertion_breakpoints_hash_ref: stores the position of breakpoints with
3459							# insertions nearby
3460							# $bin_size: stores the size of single bin
3461							#
3462
3463							#=cut
3464
sub calculate_insertion_breakpoint_score {
    # Scores the "insertions at translocation breakpoints" hallmark: the fraction
    # of the region's translocation breakpoints that have an insertion nearby.
    #
    # Parameters (positional):
    #   $chr                        - chromosome where the region is located
    #   $start, $end                - first and last position of the region
    #   $genome_trans_data_hash_ref - hash ref: chromosome => array ref of bins; each
    #                                 bin is a hash ref whose 'BPcount' entry is the
    #                                 number of breakpoints in the bin
    #   $genome_trans_insertion_breakpoints_hash_ref
    #                               - hash ref: chromosome => array ref of positions
    #                                 of breakpoints with insertions nearby
    #   $bin_size                   - size of a single bin
    #
    # Returns (score, array ref [inserted breakpoints, total breakpoints]).
    # Dies if the score would exceed 1.
    #
    # A chromosome that has translocation data but no insertion breakpoint list is
    # treated as having no inserted breakpoints; dereferencing the missing entry
    # is fatal under "use strict".

    #parse parameters
    my ($chr, $start, $end, $genome_trans_data_hash_ref,
        $genome_trans_insertion_breakpoints_hash_ref, $bin_size) = @_;

    my $total_breakpoints = 0;    #total number of breakpoints in the region
    my $inserted_breakpoints = 0; #number of breakpoints with nearby insertions in the region
    my $insertion_breakpoint_score = 0;

    #read the bins through the stored reference instead of copying them
    my $chr_bins = $genome_trans_data_hash_ref->{$chr};

    #check if there is translocation data for the region
    if (defined($chr_bins)) {

        #index of the first and last bin touched by the region
        my $first_bin = int($start / $bin_size);
        my $last_bin  = int($end / $bin_size);
        if ($last_bin > $#{$chr_bins}) {
            $last_bin = $#{$chr_bins}; #bins past the end of the array hold no data
        }

        #for each bin in the region sum the number of breakpoints
        for my $i ($first_bin .. $last_bin) {
            my $bin = $chr_bins->[$i];
            next if !defined($bin);
            $total_breakpoints += ($bin->{'BPcount'} || 0);
        }

        #count the breakpoints with insertions that lie in the region: after
        #$start and up to and including $end (one pass, no sorting needed)
        my $inserted_list_ref = $genome_trans_insertion_breakpoints_hash_ref->{$chr};
        if (defined($inserted_list_ref)) {
            $inserted_breakpoints = scalar(grep { $_ > $start && $_ <= $end } @{$inserted_list_ref});
        }

        #calculate the hallmark score
        if ($total_breakpoints > 0) {
            $insertion_breakpoint_score = $inserted_breakpoints / $total_breakpoints;
        }

        if ($insertion_breakpoint_score > 1) {
            die "ERROR: found a insertion_breakpoint_score greater than 1\n";
        }
    }

    #inserted count first, total second
    my @breakpoint_data = ($inserted_breakpoints, $total_breakpoints);

    return ($insertion_breakpoint_score, \@breakpoint_data);
}#sub calculate_insertion_breakpoint_score
3544
3545
3546							#=head2 Sub-Method: calculate_loh_score
3547
3548							### calculate_loh_score ###########################################################################
3549							# Description:
3550							# Calculates the loss of heterozygosity hallmark score
3551							#
3552							# Input variables:
3553							# $chr: chromosome where the region is located
3554							# $start: starting location of the region
3555							# $end: end location of the region
3556							# $chromosome_cnv_breakpoints_hash_ref: stores the breakpoints of CNV mutations on each
3557							# chromosome
3558							# $chromosome_loh_breakpoints_hash_ref: stores the breakpoints of LOH regions on each
3559							# chromosome
3560							#
3561
3562							#=cut
3563
3564							sub calculate_loh_score {
3565
3566							#parse parameters
3567	0			0	0	0	my $chr = shift;
3568	0					0	my $start = shift;
3569	0					0	my $end = shift;
3570
3571	0					0	my $chromosome_cnv_breakpoints_hash_ref = shift;
3572	0					0	my %chromosome_cnv_breakpoints_hash = %{$chromosome_cnv_breakpoints_hash_ref};
	0					0
3573
3574	0					0	my $chromosome_loh_breakpoints_hash_ref = shift;
3575	0					0	my %chromosome_loh_breakpoints_hash = %{$chromosome_loh_breakpoints_hash_ref};
	0					0
3576
3577	0					0	my @cnv_breakpoints; #stores cnv breakpoints in the region
3578							my $cnv_breakpoints_size; #stores the number of cnv breakpoints in the region
3579
3580	0					0	my @loh_breakpoints; #stores the LOH breakpoints in the region
3581	0					0	my $loh_breakpoints_size; #stores the number of LOH breakpoints in the region
3582
3583							#calculate maximum potential amount of heterozygosity
3584	0					0	my $original_heterozygous_size = $end - $start;
3585
3586							#calculate maximum pontentail amount of heterozygosity that can remain
3587	0					0	my $remaining_heterozygous_size = $end - $start;
3588
3589	0					0	my $loh_size = 0; #stores the size of all LOH regions in the region
3590
3591	0					0	my $loh_score = 0; #final hallmark score
3592
3593							#check if there is any cnv data for the chromosome
3594	0	0	0			0	if(
3595							( !defined($chromosome_cnv_breakpoints_hash{$chr}) ) \|\|
3596							( !defined($chromosome_loh_breakpoints_hash{$chr}) )
3597							){
3598	0					0	$loh_score = 0;
3599	0					0	$loh_size = -1;
3600	0					0	$remaining_heterozygous_size = -1;
3601	0					0	return($loh_score, $loh_size, $remaining_heterozygous_size);
3602							}
3603
3604							#get the cnv breakpoints for the chromosome
3605	0					0	@cnv_breakpoints = @{$chromosome_cnv_breakpoints_hash{$chr}};
	0					0
3606
3607							#sort the list
3608	9			9		3714	{use sort 'stable';
	9					21
	9					36
	0					0
3609	0					0	@cnv_breakpoints = sort {$a <=> $b} @cnv_breakpoints;
	0					0
3610							}#use sort 'stable'
3611
3612							#get the number of breakpoints
3613	0					0	$cnv_breakpoints_size = @cnv_breakpoints;
3614
3615							#find all the cnv events that occur in the region and subtract the size of these regions
3616							#from the $remaining_heterozygous_size value
3617	0					0	for (my $i = 0; $i< $cnv_breakpoints_size; $i+=2){
3618	0					0	my $cnv_start = $i;
3619	0					0	my $cnv_end = $i+1;
3620
3621	0					0	my $end_overlap = 0;
3622	0					0	my $start_overlap = 0;
3623
3624	0	0				0	if($cnv_breakpoints[$cnv_start] > $end){
3625	0					0	last;
3626							}
3627
3628							#Check if the end point of the cnv region is within the suspect region
3629	0	0	0			0	if($cnv_breakpoints[$cnv_end] >= $start && $cnv_breakpoints[$cnv_end] <= $end){
3630	0					0	$end_overlap = 1;
3631							}
3632
3633							#Check if the start point of the cnv region is within the suspect region
3634	0	0	0			0	if($cnv_breakpoints[$cnv_start] >= $start && $cnv_breakpoints[$cnv_start] <= $end){
3635	0					0	$start_overlap = 1;
3636							}
3637
3638							#If an overlap was detected
3639	0	0	0			0	if($start_overlap==1 && $end_overlap==1) {
		0	0
		0
		0
3640	0					0	$remaining_heterozygous_size -= ($cnv_breakpoints[$cnv_end] - $cnv_breakpoints[$cnv_start]);
3641							}
3642							elsif($start_overlap==1){
3643	0					0	$remaining_heterozygous_size -= ($end-$cnv_breakpoints[$cnv_start]);
3644							}
3645							elsif($end_overlap==1){
3646	0					0	$remaining_heterozygous_size -= ($cnv_breakpoints[$cnv_end]-$start);
3647							}
3648							elsif($cnv_breakpoints[$cnv_start] < $start && $cnv_breakpoints[$cnv_end] > $end){
3649	0					0	$remaining_heterozygous_size = 0;
3650							}
3651							}
3652
3653							#check if there are no potential heterozygous regions in the suspect region or if there are no cnv events
3654							#in the suspect region; if either is the case, return a score of 0
3655	0	0	0			0	if($remaining_heterozygous_size == 0 \|\| $remaining_heterozygous_size == $original_heterozygous_size){
3656	0					0	$loh_score = 0;
3657	0					0	$loh_size = -1;
3658	0					0	$remaining_heterozygous_size = -1;
3659	0					0	return($loh_score, $loh_size, $remaining_heterozygous_size);
3660							}
3661
3662							#get a list of the LOH breakpoints on the chromosome
3663	0					0	@loh_breakpoints = @{$chromosome_loh_breakpoints_hash{$chr}};
	0					0
3664
3665							#sort the list
3666	9			9		2594	{use sort 'stable';
	9					19
	9					38
	0					0
3667	0					0	@loh_breakpoints = sort {$a <=> $b} @loh_breakpoints;
	0					0
3668							}#use sort 'stable'
3669
3670							#get the number of breakpoints
3671	0					0	$loh_breakpoints_size = @loh_breakpoints;
3672
3673							#determine which LOH regions are in the suspect region
3674	0					0	for (my $i = 0; $i< $loh_breakpoints_size; $i+=2){
3675	0					0	my $start_overlap_region_loh = 0;
3676	0					0	my $end_overlap_region_loh = 0;
3677
3678	0					0	my $loh_start = $i;
3679	0					0	my $loh_end = $i+1;
3680
3681	0					0	my $loh_start_breakpoint = $loh_breakpoints[$loh_start];
3682	0					0	my $loh_end_breakpoint = $loh_breakpoints[$loh_end];
3683	0					0	my $loh_region_size;
3684
3685	0	0				0	if($loh_breakpoints[$loh_start] > $end){
3686	0					0	last;
3687							}
3688
3689							#Check if the end point of the loh region is within the suspect region
3690	0	0	0			0	if($loh_breakpoints[$loh_end] >= $start && $loh_breakpoints[$loh_end] <= $end){
3691	0					0	$end_overlap_region_loh = 1;
3692							}
3693
3694							#Check if the start point of the loh region is within the suspect region
3695	0	0	0			0	if($loh_breakpoints[$loh_start] >= $start && $loh_breakpoints[$loh_start] <= $end){
3696	0					0	$start_overlap_region_loh = 1;
3697							}
3698
3699							#If an overlap was detected
3700	0	0	0			0	if($start_overlap_region_loh==1 && $end_overlap_region_loh!=1){
		0	0
		0	0
3701	0					0	$loh_end_breakpoint = $end;
3702							}
3703							elsif($end_overlap_region_loh==1 && $start_overlap_region_loh!=1){
3704	0					0	$loh_start_breakpoint = $start;
3705							}
3706							elsif($loh_breakpoints[$loh_start] < $start && $loh_breakpoints[$loh_end] > $end){
3707	0					0	$loh_start_breakpoint = $start;
3708	0					0	$loh_end_breakpoint = $end;
3709	0					0	$start_overlap_region_loh=1;
3710	0					0	$end_overlap_region_loh=1;
3711							}
3712	0	0	0			0	if($start_overlap_region_loh != 1 && $end_overlap_region_loh != 1){ #if the loh region is not in the suspect region go to the next loh region
3713	0					0	next;
3714							}
3715
3716							#if the loh region is in the suspect region then reduce the size of the loh by subtracting the size of cnv regions that overlap with it
3717							#this will tell us how much original heterozygosity remains
3718	0					0	$loh_region_size = $loh_end_breakpoint - $loh_start_breakpoint;
3719
3720							#check for overlaps between the LOH region and cnv regions
3721	0					0	for (my $k = 0; $k< $cnv_breakpoints_size; $k+=2){
3722	0					0	my $start_overlap_loh_cnv = 0;
3723	0					0	my $end_overlap_loh_cnv = 0;
3724
3725	0					0	my $cnv_start = $k;
3726	0					0	my $cnv_end = $k+1;
3727
3728	0	0				0	if($cnv_breakpoints[$cnv_start] > $loh_end_breakpoint){
3729	0					0	last;
3730							}
3731
3732							#Check if the end point of the cnv region is within the loh region
3733	0	0	0			0	if($cnv_breakpoints[$cnv_end] >= $loh_start_breakpoint && $cnv_breakpoints[$cnv_end] <= $loh_end_breakpoint){
3734	0					0	$end_overlap_loh_cnv = 1;
3735							}
3736
3737							#Check if the start point of the cnv region is within the loh region
3738	0	0	0			0	if($cnv_breakpoints[$cnv_start] >= $loh_start_breakpoint && $cnv_breakpoints[$cnv_start] <= $loh_end_breakpoint){
3739	0					0	$start_overlap_loh_cnv = 1;
3740							}
3741
3742							#If an overlap was detected
3743	0	0	0			0	if($start_overlap_loh_cnv==1 && $end_overlap_loh_cnv==1) {
		0	0
		0
		0
3744	0					0	$loh_region_size -= ($cnv_breakpoints[$cnv_end] - $cnv_breakpoints[$cnv_start]);
3745							}
3746							elsif($start_overlap_loh_cnv==1){
3747	0					0	$loh_region_size -= ($loh_end_breakpoint-$cnv_breakpoints[$cnv_start]);
3748							}
3749							elsif($end_overlap_loh_cnv==1){
3750	0					0	$loh_region_size -= ($cnv_breakpoints[$cnv_end]-$loh_start_breakpoint);
3751							}
3752							elsif($cnv_breakpoints[$cnv_start] < $loh_start_breakpoint && $cnv_breakpoints[$cnv_end] > $loh_end_breakpoint){
3753	0					0	$loh_region_size = 0;
3754							}
3755							}
3756	0					0	$loh_size += $loh_region_size;
3757							}
3758
3759
3760							#calculate the LOH score
3761	0					0	$loh_score = 1 - ($loh_size/$remaining_heterozygous_size);
3762
3763	0	0				0	if($loh_size> $remaining_heterozygous_size){
3764	0					0	die "ERROR: invalid LOH size value found\n";
3765							}
3766
3767	0					0	return($loh_score, $loh_size, $remaining_heterozygous_size);
3768
3769							}#sub calculate_loh_score
3770
3771
3772							#=head2 Sub-Method: standard_deviation_and_mean
3773
3774							### standard_deviation_and_mean ###################################################################
3775							# Description:
3776							# Calculates the standard deviation and mean for a given set of values
3777							#
3778							# Input variables:
3779							# $data_ref: reference to either a hash or an array
3780							# $type: 0 indicates a hash, 1 indicates an array
3781							#
3782
3783							#=cut
3784
3785							sub standard_deviation_and_mean{
3786
3787							#parse parameters
3788	11			11	0	2881	my $data_ref = shift;
3789	11					12	my $type = shift;
3790
3791	11					9	my %hash;
3792							my @array;
3793	0					0	my $size;
3794
3795	11					9	my $mean = 0;
3796	11					11	my $SD = 0;
3797
3798	11	100				24	if($type==0){
		100
3799	5					5	%hash = %{$data_ref};
	5					27
3800
3801	3	100				10	if((scalar(keys %hash))==0){
3802	1					5	die"Found sample size of 0 when calculating SD-hash\n";
3803							}
3804
3805							#calculate mean
3806	2					6	for my $key (keys %hash){
3807	11					12	$mean += $hash{$key};
3808							}
3809
3810	2					5	$mean = $mean/(scalar(keys %hash));
3811
3812							#calculate sum of squared differences
3813	2					3	for my $key (keys %hash){
3814	11					16	$SD += ($hash{$key}-$mean)**2;
3815							}
3816
3817							#calculate final standard deviation value
3818	2					4	$SD = $SD/(scalar(keys %hash));
3819	2					4	$SD = $SD**0.5;
3820							}
3821							elsif($type==1){
3822	5					5	@array = @{$data_ref};
	5					24
3823	3					2	$size = @array;
3824
3825	3	100				8	if($size==0){
3826	1					8	die"Found sample size of 0 when calculating SD-array\n";
3827							}
3828
3829							#calculate mean
3830	2					3	foreach my $val (@array){
3831	11					12	$mean += $val;
3832							}
3833
3834	2					3	$mean = $mean/$size;
3835
3836							#calculate sum of squared differences
3837	2					3	foreach my $val (@array){
3838	11					22	$SD += ($val-$mean)**2;
3839							}
3840
3841							#calculate final standard deviation value
3842	2					4	$SD = $SD/$size;
3843	2					10	$SD = $SD**0.5;
3844
3845							}
3846							else{
3847	1					6	die"ERROR: invalid SD/mean type found\n";
3848							}
3849	4					13	return ($SD, $mean);
3850
3851							}#sub standard_deviation_and_mean
3852
3853
3854							### next_arg ######################################################################################
3855							# Parse the next argument from the command line
3856							#
3857							sub next_arg {
3858	0			0	0	0	my $code = shift;
3859	0					0	$pos++;
3860	0	0				0	if($pos == $ARGC){
3861	0					0	usage($code);
3862							}
3863							}#sub next_arg
3864
3865							### man_text ######################################################################################
3866							# Print the manual help text
3867							#
3868							sub man_text {
3869	0			0	0	0	print "Main Usage:\n";
3870	0					0	print "\tperl -w shatterproof.pl --cnv --trans [--insrt ] [--loh ] [--tp53] --config --output \n";
3871	0					0	print "\n";
3872	0					0	print "\tArguments:\n";
3873	0					0	print "\t\t--cnv\t\tDefine the path to the directory containing the CNV input files\n";
3874	0					0	print "\t\t--trans\t\tDefine the path to the directory containing the Translocation input files\n";
3875	0					0	print "\t\t--insrt\t\tDefine the path to the directory containing the insertion VCF input files\n";
3876	0					0	print "\t\t--loh\t\tDefine the path to the directory containing the LOH input files\n";
3877	0					0	print "\t\t--tp53\t\tIndicate that TP53 should be considered mutated, regardless of data\n";
3878	0					0	print "\t\t--config\tDefine the path to the ShatterProof config file\n";
3879	0					0	print "\t\t--output\tDefine the path to the directory where output should be placed\n";
3880	0					0	print "\t\tdir\t\tPath to a directory\n";
3881	0					0	print "\t\tpath\t\tPath to a file\n";
3882	0					0	print "\n";
3883	0					0	print "Help Usage:\n";
3884	0					0	print "\tperl -w shatterproof.pl --help\t\tThis help message.\n";
3885	0					0	print "\n";
3886	0					0	exit 0;
3887							}#sub man_text
3888
3889							### usage #########################################################################################
3890							# Prints an error message when invalid command line arguments are found
3891							#
3892							sub usage {
3893	0			0	0	0	my $usage_msg = shift;
3894
3895	0					0	print "u $usage_msg \n";
3896
3897	0					0	given($usage_msg){
3898	0					0	when (/^0/) { print "ERROR: missing arguments\n"; }
	0					0
3899	0					0	when (/^1/) { print "ERROR: 2nd argument missing\n"; }
	0					0
3900	0					0	when (/^2/) { print "ERROR: CNV directory missing\n"; }
	0					0
3901	0					0	when (/^3/) { print "ERROR: --trans option missing\n" }
	0					0
3902	0					0	when (/^4/) { print "ERROR: --cnv option missing\n" }
	0					0
3903	0					0	when (/^5/) { print "ERROR: Translocation directory missing\n" }
	0					0
3904	0					0	when (/^6/) { print "ERROR: --config option missing\n" }
	0					0
3905	0					0	when (/^7/) { print "ERROR: --trans option missing\n" }
	0					0
3906	0					0	when (/^8/) { print "ERROR: insertion directory missing\n" }
	0					0
3907	0					0	when (/^9/) { print "ERROR: --config option missing\n" }
	0					0
3908	0					0	when (/^10/) { print "ERROR: LOH directory missing \n" }
	0					0
3909	0					0	when (/^11/) { print "ERROR: --config option missing\n" }
	0					0
3910	0					0	when (/^12/) { print "ERROR: --config option missing\n" }
	0					0
3911	0					0	when (/^13/) { print "ERROR: Path to config file missing\n" }
	0					0
3912	0					0	when (/^14/) { print "ERROR: --output option missing\n" }
	0					0
3913	0					0	when (/^15/) { print "ERROR: --config option missing\n" }
	0					0
3914	0					0	when (/^16/) { print "ERROR: Output directory missing\n" }
	0					0
3915	0					0	when (/^17/) { print "ERROR: --output option missing\n" }
	0					0
3916	0					0	when (/^18/) { print "ERROR: too many arguments\n" }
	0					0
3917							}
3918	0					0	print "Try perl -w shatteproof.pl --help\n";
3919	0					0	exit 0;
3920							}#sub usage
3921
3922							### initialize_genome_hash ########################################################################
3923							# Description:
3924							# Initializes a hash to store an array for each chromosome
3925							#
3926							sub initialize_genome_hash {
3927
3928	0			0	0	0	my %genome_region_data = ( #{chr}[region_num]->%region_data
3929							X => [],
3930							Y => [],
3931							1 => [],
3932							2 => [],
3933							3 => [],
3934							4 => [],
3935							5 => [],
3936							6 => [],
3937							7 => [],
3938							8 => [],
3939							9 => [],
3940							10 => [],
3941							11 => [],
3942							12 => [],
3943							13 => [],
3944							14 => [],
3945							15 => [],
3946							16 => [],
3947							17 => [],
3948							18 => [],
3949							19 => [],
3950							20 => [],
3951							21 => [],
3952							22 => []
3953							);
3954
3955	0					0	return (\%genome_region_data);
3956							}#sub initialize_genome_hash
3957
3958							### load_config_file #########################################################################################
3959							# Description:
3960							# Opens the config file and reads the parameter values from it
3961							#
3962							# Input variables:
3963							# $path: path to config file
3964							#
3965							sub load_config_file {
3966
3967	8			8	0	1730	my $path = shift;
3968	8					520	print "\nLoading configuration file";
3969
3970							#Load the configuration file config.pl
3971	8					19	my $CONFIG;
3972	8	50				386	open($CONFIG, "<","$path") or die "COULD NOT OPEN CONFIG FILE at path: $path \n";
3973	8					1009	eval (<$CONFIG>) while (!eof($CONFIG));
3974	8					93	close($CONFIG);
3975
3976	8					728	print " - Done\n";
3977	8					87	1;
3978							}#sub load_config_file
3979
3980							=head1 NAME
3981
3982							ShatterProof - a script for analyzing next-generation sequencing data
3983
3984							=head1 SYNOPSIS
3985
3986							use Shatterproof
3987
3988							See "shatterproof.pl" in the scripts directory for a simple perl script which calls the ShatterProof module
3989
3990							Call ShatterProof via:
3991
3992							ShatterProof::run(\@ARGV);
3993
3994							=head1 DESCRIPTION
3995
3996							ShatterProof is a tool that can be used to analyze next generation sequencing data for signs of chromothripsis. ShatterProof is implemented as a Perl module that processes input files and produces output files in both tab-delimited and YAML format. Perl version 5.0 or greater is required to run ShatterProof. Link to publication will be posted soon.
3997
3998							=head1 README
3999
4000							=head2 Installing ShatterProof
4001
4002							To install this module type the following:
4003
4004							perl Makefile.PL
4005							make
4006							make test
4007							make install
4008
4009							Make sure that you have admin permission rights when running the previous commands.
4010
4011							=head2 Input File Types
4012
4013							ShatterProof bases its analysis of genomic data on calls of translocations, copy number variations (CNV), loss of heterozygosity (LOH) and insertions.
4014							ShatterProof can take 4 different types of input files. See the scripts/conversion_scripts directory for some Perl scripts which will convert some common tools' output to the required input formats.
4015
4016							=head3 Translocation Input Files (.spt)
4017
4018							Tab delimited columns
4019							First line is header line:
4020							#chr1 start end chr2 start end quality
4021
4022							Example data entry line:
4023
4024							1 1000 2000 4 4000 5000 78
4025
4026							If no value is available for quality, use a "." eg.:
4027
4028							1 1000 2000 4 4000 5000 .
4029
4030
4031							=head3 Copy-Number Input Files (.spc)
4032
4033							Tab delimited columns
4034							First line is header line:
4035							#chr start end number quality
4036
4037
4038							Example data entry line:
4039							12 2000 3000 2 63
4040
4041							If no value is available for quality, use a "." eg.:
4042
4043							12 2000 3000 2 .
4044
4045							=head3 Loss of Heterozygosity Input Files (.spl)
4046
4047							Tab delimited columns
4048							First line is header line:
4049							#chr start end quality
4050
4051
4052							Example data entry line:
4053
4054							12 2000 3000 63
4055
4056							If no value is available for quality, use a "." eg.:
4057
4058							12 2000 3000 .
4059
4060							=head3 Insertion Input Files (.vcf)
4061
4062							Additionally, ShatterProof accepts insertion calls in VCF files as input. See http://www.1000genomes.org/node/101 for details on the VCF file format.
4063							ShatterProof analyzes the CHROM and POS fields of these files.
4064
4065
4066							=head2 Configuring ShatterProof
4067
4068							See the config.pl file in the scripts directory for a sample ShatterProof configuration file.
4069
4070							$bin_size: number (integer) of base pairs to include in each bin of the sliding window analysis
4071
4072							$localization_window_size: number (integer) of bins to include in each window of the sliding window analysis
4073
4074							$expected_mutation_density: a reference value (double) used in determining if the concentration of translocation events on a particular chromosome is higher than expected.
4075
4076							$collapse_regions:
4077
4078							flag variable
4079
4080							value 1: merge overlapping CNV regions that have the same copy number
4081
4082							value 0: do not merge overlapping CNV regions that have the same copy number. If such regions are found an error is thrown
4083
4084							$outlier_deviations: the number of standard deviations away from the mean a value has to be in order to be considered non-significant. Used to identify highly mutated regions.
4085
4086							$translocation_cut_off_count: the maximum number of translocation chromosomes to tolerate before the translocation score for a region is set to 0.
4087
4088							$genome_localization_weight: weight given to the localization of mutations to one chromosome hallmark
4089
4090							$chromosome_localization_weight: weight given to the localization of mutations to one area of a particular chromosome hallmark
4091
4092							$cnv_weight: weight given to the concentrated CNV hallmark
4093
4094							$translocation_weight: weight given to the concentrated translocations hallmark
4095
4096							$insertion_breakpoint_weight: weight given to the short breakpoint insertions hallmark
4097
4098							$loh_weight: weight given to the loss/retention of heterozygosity hallmark
4099
4100							$tp53_mutated_weight: weight given to the TP53 mutation hallmark
4101
4102
4103							=head2 Running ShatterProof
4104
4105							From the scripts directory, execute the shatterproof.pl file using Perl.
4106
4107							Main Usage:
4108
4109							perl -w shatterproof.pl --cnv <dir> --trans <dir> [--insrt <dir>] [--loh <dir>] [--tp53] --config <path> --output <dir>
4110
4111							Arguments:
4112
4113							--cnv Define the path to the directory containing the CNV input files
4114
4115							--trans Define the path to the directory containing the Translocation input files
4116
4117							--insrt Define the path to the directory containing the insertion VCF input files
4118
4119							--loh Define the path to the directory containing the LOH input files
4120
4121							--tp53 Indicate that TP53 should be considered mutated, regardless of data
4122
4123							--config Define the path to the ShatterProof config file
4124
4125							--output Define the path to the directory where output should be placed
4126
4127							dir Path to a directory
4128
4129							path Path to a file
4130
4131							=head1 PREREQUISITES
4132
4133							strict;
4134							warnings;
4135							Carp;
4136							Switch;
4137							File::Basename;
4138							List::Util qw[min max];
4139							Statistics::Distributions;
4140							POSIX
4141
4142							=pod OSNAMES
4143
4144							any
4145
4146							=pod SCRIPT CATEGORIES
4147
4148							CPAN
4149
4150							=cut
4151
4152							1;
4153							__END__