File Coverage

lib/Bio/App/SELEX/RNAmotifAnalysis.pm

Criterion	Covered	Total	%
statement	174	287	60.6
branch	27	50	54.0
condition	13	50	26.0
subroutine	19	32	59.3
pod	0	14	0.0
total	233	433	53.8

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2							package Bio::App::SELEX::RNAmotifAnalysis;
3							# ABSTRACT: Cluster SELEX sequences and calculate their structures
4	2			2		214402	use 5.008;
	2					6
	2					65
5	2			2		10	use strict;
	2					2
	2					49
6	2			2		9	use warnings;
	2					5
	2					65
7	2			2		1248	use Text::LevenshteinXS qw( distance );
	2					8201
	2					131
8	2			2		1913	use Config::Tiny;
	2					2438
	2					85
9	2			2		1388	use autodie;
	2					25022
	2					15
10	2			2		13933	use Hash::Util qw( lock_keys );
	2					5516
	2					12
11	2			2		197	use List::Util qw( min );
	2					4
	2					185
12	2			2		2568	use Getopt::Long;
	2					30735
	2					19
13	2			2		420	use Carp qw( croak confess);
	2					4
	2					7800
14
15							my $DEFAULT_CONFIG = 'cluster.cfg';
16
17							# CONSTANTS
18							my $TRUE = 1;
19							my $FALSE = 0;
20							my $SPACE = q{ };
21							my $EMPTY_STRING = q{};
22							my $VERBOSE = 1;
23
24							my $FASTQ_TYPE = 'fastq';
25							my $SIMPLE_TYPE = 'simple';
26
27							# Act like a script if called as one
28							unless ( caller() ) { main(); }
29
30							sub main {
31
32	0			0	0	0	my $max_clusters = 10;
33	0					0	my $num_cpus = 5;
34	0					0	my $max_distance = 5;
35	0					0	my $max_top_seqs = 300;
36	0					0	my $config_filename = $DEFAULT_CONFIG;
37	0					0	my $options = GetOptions(
38
39							# Required (one of these)
40							"$FASTQ_TYPE=s" => \my $fastq,
41							"$SIMPLE_TYPE=s" => \my $simple,
42
43							# Optional
44							'max_distance=i' => \$max_distance,
45							'max_clusters=i' => \$max_clusters,
46							'max_top_seqs=i' => \$max_top_seqs,
47							'seed=s' => \my $seed_filename,
48							'cpus=i' => \$num_cpus,
49							'config=s' => \$config_filename,
50							'run' => \my $run_scripts,
51							);
52
53	0					0	my $file_type;
54							my $infile;
55	0	0				0	if(defined $fastq){
		0
56	0	0				0	if(defined $simple){
57	0					0	warn "--$FASTQ_TYPE and --$SIMPLE_TYPE flags are mutually exclusive!\n";
58	0					0	help();
59							}
60	0					0	$infile = $fastq;
61	0					0	$file_type = $FASTQ_TYPE;
62							}elsif(defined $simple){
63	0					0	$infile = $simple;
64	0					0	$file_type = $SIMPLE_TYPE;
65							}else{
66	0					0	warn "either --$FASTQ_TYPE or --$SIMPLE_TYPE must be used!\n";
67	0					0	help();
68							}
69
70							# Eliminate case sensitivity for 'type'
71	0					0	$file_type = lc $file_type;
72
73	0					0	my $config = get_config($config_filename);
74
75	0					0	open( my $fh_in, '<', $infile );
76
77	0					0	my $seed_fh;
78	0	0	0			0	if ( defined $seed_filename && -e $seed_filename ) {
79	0					0	open( $seed_fh, '<', $seed_filename );
80							}
81
82	0					0	my ($cluster_href, $distance_href) = cluster(
83							fh => $fh_in,
84							max_distance => $max_distance,
85							max_clusters => $max_clusters,
86							seed_fh => $seed_fh,
87							file_type => $file_type,
88							);
89
90	0					0	open( my $fh_all_clusters, '>', 'clusters.txt' );
91	0					0	write_out_clusters(
92							cluster_href => $cluster_href,
93							distance_href => $distance_href,
94							fh_all_clusters => $fh_all_clusters,
95							max_top_seqs => $max_top_seqs,
96							);
97	0					0	create_batch_files( $config, $num_cpus, $run_scripts );
98	0					0	return;
99							}
100
101
102							sub get_config {
103	0			0	0	0	my $config_filename = shift;
104
105	0					0	my $config;
106	0	0				0	if ( ! -e $config_filename ) {
107	0					0	my $home_dir = $ENV{'HOME'};
108	0					0	$config = Config::Tiny->new();
109	0					0	$config->{executables} = {
110							mafft => 'mafft',
111							RNAalifold => 'RNAalifold',
112							cmalign => 'cmalign',
113							cmbuild => 'cmbuild',
114							cmcalibrate => 'cmcalibrate',
115							cmsearch => 'cmsearch',
116							CreateStockholm => 'selex_clustal2stockholm.pl',
117							stock2fasta => 'selex_stock2fasta.pl',
118							};
119	0					0	$config->{Flags_for} = {
120							RNAalifold => '-r -cv 0.6 -nc 10 -p -d2 -noLP -color -aln',
121							mafft => '--preservecase --clustalout',
122							};
123	0					0	$config->write($config_filename);
124	0					0	warn "\nNo configuration file found. Creating new configuration file '$config_filename'\n";
125	0					0	warn external_dependecnies();
126	0					0	warn <<"MSG";
127
128							If you have problems, you may need to ensure that each executable
129							listed in '$config_filename' is located in a directory that is found
130							in your PATH environment variable.\n";
131							MSG
132							}
133	0					0	$config = Config::Tiny->read($config_filename);
134	0					0	return $config;
135							}
136
137							sub create_batch_files {
138	0			0	0	0	my $config = shift;
139	0					0	my $num_cpus = shift;
140	0					0	my $run_scripts = shift;
141
142							# Get all the file names to be processed
143	0					0	my @fasta_filenames = glob '*_top.fasta';
144
145							# Reduce number of cpus if there are fewer files
146	0					0	$num_cpus = min(scalar @fasta_filenames, $num_cpus);
147
148							# Create a batch of commands for each CPU to work on
149	0					0	my @workload = map { $EMPTY_STRING } 1 .. $num_cpus;
	0					0
150	0					0	my $add_work = __add_work(
151							{
152							workload_aref => \@workload,
153							num_cpus => $num_cpus,
154							config => $config,
155							}
156							);
157	0					0	$add_work->($_) for @fasta_filenames;
158
159							# Execute the commands for each CPU
160	0					0	for my $batch_num ( 1 .. $num_cpus ){
161
162							# name script file for each batch
163	0					0	my $batch_filename = "batch_$batch_num";
164
165							# Create a directory for each batch
166	0					0	system("mkdir $batch_filename.dir");
167
168							# Move into batch directory
169	0					0	chdir "$batch_filename.dir";
170
171							# Write batch instruction to script file
172	0					0	open( my $fh, '>', $batch_filename);
173	0					0	print {$fh} $workload[ $batch_num - 1];
	0					0
174	0					0	close $fh;
175
176							# Make script file executable
177	0					0	system("chmod u+x $batch_filename");
178
179							# Run the script, if desired
180	0	0				0	if($run_scripts){
181	0					0	system("./$batch_filename &");
182							}
183
184							# Return to directory about batch directory
185	0					0	chdir '..';
186							}
187	0					0	return;
188							}
189
190							sub __add_work {
191	0			0		0	my %opt = %{ shift() };
	0					0
192	0		0			0	my $workload_aref = $opt{workload_aref} \|\| croak "'workload_aref' required";
193	0		0			0	my $num_cpus = $opt{num_cpus} \|\| croak "'num_cpus' required";
194	0		0			0	my $config = $opt{config} \|\| croak "'config' required";
195	0					0	my $work_index = 0;
196	0					0	my @filenames_to_rename = _filenames_to_rename();
197
198	0					0	my $MAFFT_cmd = "$config->{executables}{mafft} $config->{Flags_for}{mafft}";
199	0					0	my $RNAalifold_cmd = "$config->{executables}{RNAalifold} $config->{Flags_for}{RNAalifold}";
200							return sub {
201	0			0		0	my $fasta_filename = shift;
202	0					0	my %file = file_name_hash($fasta_filename);
203
204							# Don't allow accidentally creating new keys
205	0					0	lock_keys %file;
206
207							# Cycle through the sets number of work orders
208	0	0				0	if ( $work_index >= $num_cpus ) {
209	0					0	$work_index = 0;
210							}
211
212	0					0	$workload_aref->[$work_index] .= join(
213							"\n",
214
215							# Pull fasta file into current directory
216							"mv ../$file{fasta} .",
217
218							# Do alignment of sequences against each other
219							"$MAFFT_cmd $file{fasta} > $file{aligned}",
220
221							# Calculate the secondary structure
222							"$RNAalifold_cmd < $file{aligned} > $file{sec_struct}",
223
224							# Rename resulting files
225	0					0	( map { "mv $_ $file{$_}" } @filenames_to_rename ),
226
227							# Convert secondary structure file to Stockholm format
228							"$config->{executables}{CreateStockholm} $file{aligned} $file{sec_struct} > $file{stock}",
229
230							# Determine covariance model
231							"$config->{executables}{cmbuild} $file{covar_model} $file{stock}",
232
233							) . "\n";
234
235							# Increment work index
236	0					0	$work_index++;
237
238	0					0	return;
239	0					0	};
240							}
241
242							sub _pairs_from_filenames_to_rename {
243	0			0		0	my $base_filename = shift;
244	0					0	my @pairs = _pairs_from_array( $base_filename . '_', _filenames_to_rename());
245	0					0	return @pairs;
246							}
247
248							sub _pairs_from_array {
249	0			0		0	my ($added_string, @array) = @_;
250	0					0	return map { $_ => $added_string . $_ } @array;
	0					0
251							}
252
253							sub _filenames_to_rename {
254	0			0		0	return qw( alirna.ps alidot.ps aln.ps alifold.out);
255							}
256
257							sub file_name_hash {
258	0			0	0	0	my $fasta_filename = shift;
259	0					0	my $base_filename = base_filename($fasta_filename);
260	0					0	my %file = (
261							fasta => $fasta_filename,
262							aligned => $base_filename . '.aln',
263							sec_struct => $base_filename . '.gc',
264							_pairs_from_filenames_to_rename($base_filename),
265							stock => $base_filename . '.sto', #stockholm format
266							covar_model => $base_filename . '.cm',
267							);
268	0					0	return %file;
269							}
270
271							sub base_filename {
272	0			0	0	0	my $filename = shift;
273	0					0	$filename =~ s/(\.\w+)\z//;
274	0					0	return $filename;
275							}
276
277							#: PUBLIC_SUBS
278							sub cluster {
279	7			7	0	23664	my %opt = @_;
280
281							# Required parameters
282	7		33			88	my $max_distance = $opt{max_distance} \|\| croak 'max_distance required';
283	7		33			21	my $fh = $opt{fh} \|\| croak 'fh required';
284	7		33			20	my $max_clusters = $opt{max_clusters} \|\| croak 'max_clusters required';
285	7		33			18	my $file_type = $opt{file_type} \|\| confess 'file_type required';
286
287							# Optional parameters
288	7					11	my $seed_fh = $opt{seed_fh};
289
290	7					10	my @seed_pairs;
291
292	7					19	my @seq_count_pairs = get_sequences_from($fh, $file_type);
293
294							# Add any seed sequences to the beginning of the sequence list
295							# Oops, this also sorts and counts the sequences in the seed list.
296							# Is that a problem?
297	7	100				23	if( defined $seed_fh){
298	2					15	@seed_pairs = get_sequences_from($seed_fh, $file_type);
299	2					6	unshift @seq_count_pairs, @seed_pairs;
300							}
301
302	7					9	my %cluster_for;
303							my %distance_for; # Hash that keeps track of edit distances (parallel to %cluster_for)
304	7					9	my $next_id = 1;
305
306							# Create the first cluster with the first sequence count pair.
307	7					14	my $first_pair = shift @seq_count_pairs;
308	7					22	$cluster_for{$next_id} = [ $first_pair];
309
310							# No distance from itself!
311	7					22	$distance_for{$next_id}{$first_pair->[0]} = 0;
312
313							# Increment next cluster id, since it has already been used.
314	7					10	$next_id++;
315
316							# Add sequences to existing cluster, or create new ones up to the maximum
317	7					22	while ( my $seq_count_pair = shift @seq_count_pairs ) {
318	34					75	my ($cluster_id, $distance) = matching_cluster_and_distance( $max_distance, \%cluster_for, $seq_count_pair );
319
320							# Create new cluster, if one wasn't found
321	34	100				82	if( ! defined $cluster_id){
322	13					13	$cluster_id = $next_id;
323	13					16	$distance = 0;
324
325							# Increment counter
326	13					16	$next_id++;
327							}
328
329							# Add sequence info to the cluster, if the cluster is within the maximum requested
330	34	100				269	if( $cluster_id <= $max_clusters){
331	31					33	push @{ $cluster_for{$cluster_id} }, $seq_count_pair ;
	31					66
332	31					119	$distance_for{$cluster_id}{$seq_count_pair->[0]} = $distance;
333							}
334							}
335
336	7					41	return (\%cluster_for, \%distance_for);
337							}
338
339							sub total_plus_cluster {
340	9			9	0	12	my $opt = shift;
341	9					14	my $cluster = $opt->{cluster};
342	9					9	my @seqs_with_count = @{$cluster};
	9					22
343	9					14	my $total_count = 0;
344
345	9					13	for my $seq_with_count (@seqs_with_count) {
346	21					36	$total_count += $seq_with_count->[1];
347							}
348
349							return {
350	9					55	total => $total_count,
351							cluster => \@seqs_with_count,
352							original_id => $opt->{original_id},
353							};
354							}
355
356							sub write_out_clusters {
357	3			3	0	28	my %opt = @_;
358
359							# Required input
360	3		33			17	my $cluster_href = $opt{cluster_href} \|\| croak 'cluster_href required';
361	3		33			12	my $distance_href = $opt{distance_href} \|\| croak 'distance_href required';
362	3		33			9	my $fh_all_clusters = $opt{fh_all_clusters} \|\| croak 'fh_all_clusters required';
363	3		33			10	my $max_top_seqs = $opt{max_top_seqs} \|\| croak 'max_top_seqs required';
364
365							# Optional input
366	3		50			9	my $fh_href = $opt{fh_href} \|\| {};
367
368							# Sort clusters by number of sequences they contain (including redundant
369							# ones).
370	9					40	my @total_plus_clusters =
371	3					8	map { total_plus_cluster( {original_id => $_, cluster => $cluster_href->{$_}} ) }
372	3					4	keys %{$cluster_href};
373
374	3					7	my $new_id = 1;
375							# Sort clusters by number of sequences they contain (including redundant ones).
376	3					9	for my $total_plus_cluster ( reverse sort { $a->{total} <=> $b->{total} } @total_plus_clusters ) {
	7					19
377
378	9					10	my $number_of_sequences_in_cluster = scalar @{ $total_plus_cluster->{cluster} };
	9					18
379
380	9					12	my $grouping;
381
382							# Call it a 'single' if only one unique sequence
383	9	100				22	if($number_of_sequences_in_cluster == 1){
384	3					6	$grouping = 'single';
385							}else{
386	6					10	$grouping = 'cluster';
387							};
388
389							# Print header for each cluster/single
390	9					10	print {$fh_all_clusters} "######## $grouping $new_id ########\n";
	9					37
391
392							# Use prescribed filehandle for each sequence, or create one
393	9					10	my $fh;
394	9	50				21	if ( defined $fh_href->{$new_id} ) {
395	9					18	$fh = $fh_href->{$new_id};
396							}
397							else {
398	0					0	my $filename = $grouping . '_' . $new_id . '_top.fasta';
399	0					0	open( $fh, '>', $filename );
400	0	0				0	print "created output file '$filename'\n" if $VERBOSE;
401							}
402
403							# Write cluster info to the "all" and individual cluster files
404	9					111	write_cluster(
405							{
406							fh_all_clusters => $fh_all_clusters,
407							fh => $fh,
408							cluster_aref => $total_plus_cluster->{cluster},
409							cluster_number => $new_id,
410							original_cluster_id => $total_plus_cluster->{original_id},
411							max_top_seqs => $max_top_seqs,
412							distance_href => $distance_href,
413							}
414							);
415
416							# Increment cluster id
417	9					31	$new_id++;
418							}
419	3					21	return;
420							}
421
422							sub write_cluster {
423	9			9	0	34	my $opt = shift;
424
425	9					14	my $fh_all_clusters = $opt->{fh_all_clusters};
426	9					13	my $fh = $opt->{fh};
427	9					14	my $cluster_aref = $opt->{cluster_aref};
428	9					13	my $distance_href = $opt->{distance_href};
429	9					16	my $cluster_number = $opt->{cluster_number};
430	9					13	my $max_top_seqs = $opt->{max_top_seqs};
431	9					14	my $original_id = $opt->{original_cluster_id};
432
433	9					12	my @seq_w_counts = @{$cluster_aref};
	9					22
434
435							# Remove total count, leaving just pairs with counts
436	9					12	my $internal_seq_id = 1;
437
438	9					11	my $is_single;
439	9	100				75	$is_single = $TRUE if @seq_w_counts == 1;
440
441	9					13	my $num_seqs = scalar @seq_w_counts;
442
443							# If there are more than max_top_seqs, then split them into two arrays:
444							# One that will be processed and the other that will simply be output
445	9					11	my @overage_seqs;
446	9	100				18	if ( $num_seqs > $max_top_seqs ) {
447	2					6	@overage_seqs = @seq_w_counts[ $max_top_seqs .. ( $num_seqs - 1 ) ];
448	2					9	@seq_w_counts = @seq_w_counts[ 0 .. ( $max_top_seqs - 1 ) ];
449							}
450
451							# SEQ_W_COUNT_LOOP
452	9					16	for my $seq_w_count (@seq_w_counts) {
453	19					18	my ( $seq, $count ) = @{$seq_w_count};
	19					43
454
455	19					38	my $distance = $distance_href->{$original_id}{$seq};
456
457	19					46	my $unique_id = join( '.', $cluster_number, $internal_seq_id, $count, $distance );
458
459							# Print to individual cluster file and all clusters file
460	19					23	print {$fh_all_clusters} "$unique_id\t$seq\n";
	19					98
461
462	19					24	print {$fh} ">$unique_id\n$seq\n";
	19					55
463
464							# Print second copy if this is a singleton (to make
465							# multi-sequence alignment behave well)
466	19	100				68	if ($is_single) {
467	3					6	$unique_id .= 'b';
468	3					5	print {$fh} ">$unique_id\n$seq\n";
	3					10
469							}
470	19					42	$internal_seq_id++;
471							}
472
473							# OVERAGE LOOP
474	9	100				22	if (@overage_seqs) {
475	2					6	my $filename = "cluster_${cluster_number}_overage.fasta";
476
477	2					12	open( my $overage_fh, '>', $filename );
478	2					3516	for my $seq_w_count (@overage_seqs) {
479	2					6	my ( $seq, $count ) = @{$seq_w_count};
	2					5
480
481	2					7	my $distance = $distance_href->{$original_id}{$seq};
482	2					8	my $unique_id = join( '.', $cluster_number, $internal_seq_id, $count, $distance );
483	2					3	print {$overage_fh} ">$unique_id\n$seq\n";
	2					16
484	2					2	print {$fh_all_clusters} "$unique_id\t$seq\n";
	2					9
485	2					151	$internal_seq_id++;
486							}
487							}
488
489	9					25	return;
490							}
491
492							sub matching_cluster_and_distance {
493	35			35	0	2119	my $max_distance = shift;
494	35					44	my $cluster_href = shift;
495	35					38	my $seq_aref = shift;
496
497	35					90	ID_LOOP:
498	35					34	for my $id ( keys %{$cluster_href} ) {
499	58					87	my $cluster_seq = $cluster_href->{$id}->[0];
500
501	58					303	my $distance = distance( $seq_aref->[0], $cluster_seq->[0] );
502
503							# Short circuit ID_LOOP when one is found (supposedly only one will match)
504	58	100				147	if ( $distance < $max_distance ) {
505	22					61	return ($id, $distance);
506							}
507							}
508	13					30	return;
509							}
510
511							sub get_sequences_from {
512	10		33	10	0	9257	my $fh = shift \|\| croak 'fh required';
513	10		33			32	my $type = shift \|\| confess 'file type required';
514	10					14	my %count_of;
515	10					31	my $next_seq = _next_sequence_for($fh, $type);
516	10					16	while (1) {
517	82					173	my $seq = $next_seq->();
518	82	100				157	last if ! defined $seq;
519	72	50				137	next if $seq eq $EMPTY_STRING;
520
521							# If sequence has not been seen before, start counting it. Otherwise, add to the count.
522	72	100				170	if( ! exists $count_of{$seq} ){
523	48					147	$count_of{$seq} = 1;
524							}else {
525	24					48	$count_of{$seq} = $count_of{$seq} + 1;
526							}
527							}
528
529							# Convert hash into an array containing paired values.
530							# First value in the pair is the sequence.
531							# \| The second value in the pair is the number of times (the
532							# \| \| count) that that sequence is seen in the file.
533							# v v
534	10					32	my @sequences = map { [ $_, $count_of{$_} ] } keys %count_of;
	48					124
535
536							# Sort the sequences so that the most abundant occur first in the array.
537	10					46	@sequences = sort { $b->[1] <=> $a->[1] } @sequences;
	76					110
538
539	10					66	return @sequences;
540							}
541
542							sub _next_sequence_for {
543	10		33	10		26	my $fh = shift \|\| croak 'fh (first positional parameter) required';
544	10		33			28	my $type = shift \|\| confess 'file_type (second positional parameter) required';
545
546							# Simply use next_line if we want each and every line
547	82			82		129	return sub { next_line($fh) }
548	10	50				65	if $type eq $SIMPLE_TYPE;
549
550							# Only other supported file type is 'fastq'
551	0	0				0	confess "Unrecognized type '$type'. Only '$SIMPLE_TYPE' and '$FASTQ_TYPE' are currently recognized."
552							if $type ne $FASTQ_TYPE;
553
554							# Skip first header line
555	0					0	readline $fh;
556
557							return sub {
558	0			0		0	my $line = readline $fh;
559	0	0				0	return if !defined $line;
560
561							# remove newline and carriage return
562	0					0	chomp $line;
563	0					0	$line =~ s/\r//g;
564
565							# Skip quality header, quality score, and next sequence header
566	0					0	readline $fh;
567	0					0	readline $fh;
568	0					0	readline $fh;
569
570	0					0	return $line;
571	0					0	};
572							}
573
574							sub next_line {
575	82			82	0	96	my $fh = shift;
576
577							# Get next line from file
578	82					144	my $line = readline $fh;
579
580							# return undef if nothing left to read.
581	82	100				161	return if ! defined $line;
582
583							#remove newline
584	72					82	chomp $line;
585
586							#remove carriage return
587	72					90	$line =~ s/\r//g;
588	72					161	return $line;
589							}
590
591							sub help {
592	0			0	0		print << "END";
593							$0 --$FASTQ_TYPE=FILENAME [OPTIONS]
594							$0 --$SIMPLE_TYPE=FILENAME [OPTIONS]
595
596							OPTIONS (showing defaults)
597							--max_distance 5
598							--cpus 5
599							--max_clusters 10
600							--max_top_seqs 300
601							--config cluster.cfg
602							END
603	0						exit();
604							}
605
606							sub external_dependecnies {
607	0			0	0		return <
608							External dependencies:
609							mafft (see http://mafft.cbrc.jp/alignment/software/)
610							Infernal (see http://infernal.janelia.org/), specifically:
611							cmalign
612							cmbuild
613							cmcalibrate
614							cmsearch
615							RNA Vienna Package (see http://www.tbi.univie.ac.at/~ivo/RNA/), specifically:
616							RNAalifold
617							These must be installed and in a directory that is your PATH environment variable.
618							END
619
620							};
621
622							1;
623
624							=pod
625
626							=head1 NAME
627
628							Bio::App::SELEX::RNAmotifAnalysis - Cluster SELEX sequences and calculate their structures
629
630							=head1 SYNOPSIS
631
632							RNAmotifAnalysis --fastq seqs.fq --cpus 4 --run
633
634							=head1 DESCRIPTION
635
636							This module pipelines steps in the analysis of SELEX (Systematic Evolution
637							of Ligands through EXponential enrichment) data.
638
639							This main module creates scripts to do the following:
640
641							(1) Cluster similar sequences based on edit distance.
642
643							(2) Align sequences within each cluster (using mafft).
644
645							(3) Calculate the secondary structure of the aligned sequences (using
646							RNAalifold, from the Vienna RNA package)
647
648							(4) Build covariance models using cmbuild from Infernal.
649
650							Another useful utility installed with this distribution is
651							"selex_covarianceSearch" for doing iterative refinements of
652							covariance models.
653
654							If you want to use files that simply list sequences, then use
655							the "--simple" flag instead of the "--fastq" flag.
656
657							This script assumes that you've already done all of the quality
658							control of your sequences beforehand. If the FASTQ format is
659							used, quality scores are ignored.
660
661							=head1 EXAMPLE USE
662
663							RNAmotifAnalysis --infile seqs.fq --cpus 4 --run
664
665							This will cluster the sequences found in 'seqs.fq' and create a FASTA file
666							for each one. The FASTA files will be grouped into batches (i.e. one per
667							cpu requested) that will be placed in a separate directory for each batch,
668							and processed within that directory. At the end of processing, for each
669							cluster there will be a covariance model and postscript illustration
670							files. The batch script used to process each batch will be located in the
671							respective batch directory. To produce the scripts without running them,
672							simply exclude the --run flag from the command line.
673
674							The output file contains names that contain four period delimited values
675							For example, 2.3.1.5 means
676							that this is the second cluster
677							this is the third sequence in the cluster
678							there is one copy of this sequences
679							it is an edit distance of 5 from the reference sequence
680
681							=head1 CONFIGURATION AND ENVIRONMENT
682
683							As written, this code makes heavy use of UNIX utilities and is
684							therefore only supported on UNIX-like environemnts (e.g. Linux, UNIX, Mac
685							OS X).
686
687							Install Infernal, MAFFT, and the RNA Vienna package ahead of time and add
688							the directories containing their executables to your PATH, so that the
689							first time you run RNAmotifAnalysis.pm the configuration file (cluster.cfg)
690							that is generated will have all of the correct parameters. Otherwise,
691							you'll need to update the configuration file manually.
692
693							To update the PATH environment variable with the directory '/usr/local/myapps/bin/',
694							update your .bashrc file, thus:
695
696							echo 'export PATH=/usr/local/myapps/bin:$PATH' >> ~/.bashrc.
697
698							Now, every time you open a new terminal window, the PATH environment
699							variable will contain '/usr/local/myapps/bin/'. To make your new .bashrc
700							file effective immediately (i.e. without having to open a new terminal
701							window), use the following command:
702
703							source ~/.bashrc
704
705							=head1 INSTALLATION
706
707							These installation instructions assume being able to open and use a
708							terminal window on Linux.
709
710							(0) Some systems need several dependencies installed ahead of time.
711
712							You may be able to skip this step. However, if subsequent steps don't
713							work, then be sure that some basic libraries are installed, as shown
714							below (or ask a system administrator to take care of it). For the
715							applicable distribution, open a terminal and then type the commands as
716							indicated:
717
718							For RedHat or CentOS 5.x systems (tested on CentOS 5.5)
719
720							sudo yum install gcc
721
722							For RedHat or CentOS 6.x systems (tested on "Minimal Desktop" CentOS 6.0)
723
724							sudo yum install gcc
725							sudo yum install perl-devel
726
727							For Ubuntu systems (tested on Ubuntu 12-04 LTS)
728
729							sudo apt-get install curl
730
731							For Debian 5.x systems:
732
733							sudo apt-get install gcc
734							sudo apt-get install make
735
736							(1) Install the non-Perl dependencies:
737							(Versions shown are those that we've tested. Please contact us if
738							newer versions do not work.)
739
740							Infernal 1.0.2 (http://infernal.janelia.org/)
741							MAFFT 6.849b (http://mafft.cbrc.jp/alignment/software/)
742							RNA Vienna package 1.8.4 (http://www.tbi.univie.ac.at/~ivo/RNA/)
743
744							After installing these, make sure all of the foloowing executables are
745							in directories within your PATH:
746
747							cmbuild
748							cmcalibrate
749							cmsearch
750							cmalign
751							mafft
752							RNAalifold
753
754							(2) Use a CPAN client to install Bio::App::SELEX::RNAmotifAnalysis.
755
756							Here we demonstrate the use of cpanminus to install it to a local Perl module directory. These instructions assume absolutely no experience with cpanminus.
757
758							1. Download cpanminus
759
760							curl -LOk http://xrl.us/cpanm
761
762
763							2. Make it executable
764
765							chmod u+x cpanm
766
767
768							3. Make a local lib/perl5 directory (if it doesn't already exist)
769
770							mkdir -p ~/lib/perl5
771
772
773							4. Add relevant directories to your PERL5LIB and PATH environment
774							variables by adding the following text to your ~/.bashrc
775							file:
776
777
778							# Set PERL5LIB if it doesn't already exist
779							: ${PERL5LIB:=~/lib/perl5}
780
781							# Prepend to PERL5LIB if directory not already found in PERL5LIB
782							if ! echo $PERL5LIB \| egrep -q "(^\|:)~/perl5/lib/perl5($\|:)"; then
783							export PERL5LIB=~/lib/perl5:$PERL5LIB;
784							fi
785
786							# Prepend to PATH if directory not already found in PATH
787							if ! echo $PATH \| egrep -q "(^\|:)~/perl5/bin($\|:)"; then
788							export PATH=~/bin:$PATH;
789							fi
790
791
792							5. Update environment variables immediately
793
794							source ~/.bashrc
795
796
797							6. Install Module::Build
798
799							./cpanm Module::Build
800
801
802							7. Install Text::LevenshteinXS (even if you already have it installed elsewhere)
803
804							./cpanm Text::LevenshteinXS
805
806
807							8. Install Bio::App::SELEX::RNAmotifAnalysis
808
809							./cpanm Bio::App::SELEX::RNAmotifAnalysis
810
811
812							Please contact the author if, after consulting this documentation and
813							searching Google with error messages, you still encounter difficulties
814							during the installation process.
815
816							=head1 INCOMPATIBILITIES
817
818							Windows: lacks necessary *nix utilities
819							SGI: problems with compiled dependency Text::LevenshteinXS
820							Sun/Solaris: problems with compiled dependency Text::LevenshteinXS
821							BSD: problems with compiled dependency Text::LevenshteinXS
822
823							=head1 BUGS AND LIMITATIONS
824
825							There are no known bugs in this module.
826							Please report problems to molecules cpan org
827							Patches are welcome.
828
829							=head1 RELATED PUBLICATIONS
830
831							Ditzler MA, Lange MJ, Bose D, Bottoms CA, Virkler KF, et al. (2013) High-
832							throughput sequence analysis reveals structural diversity and improved
833							potency among RNA inhibitors of HIV reverse transcriptase. Nucleic Acids
834							Res 41(3):1873-1884. doi: 10.1093/nar/gks1190
835
836							=cut