File Coverage

lib/Bio/App/SELEX/RNAmotifAnalysis.pm
Criterion Covered Total %
statement 174 287 60.6
branch 27 50 54.0
condition 13 50 26.0
subroutine 19 32 59.3
pod 0 14 0.0
total 233 433 53.8


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             package Bio::App::SELEX::RNAmotifAnalysis;
3             # ABSTRACT: Cluster SELEX sequences and calculate their structures
4 2     2   214402 use 5.008;
  2         6  
  2         65  
5 2     2   10 use strict;
  2         2  
  2         49  
6 2     2   9 use warnings;
  2         5  
  2         65  
7 2     2   1248 use Text::LevenshteinXS qw( distance );
  2         8201  
  2         131  
8 2     2   1913 use Config::Tiny;
  2         2438  
  2         85  
9 2     2   1388 use autodie;
  2         25022  
  2         15  
10 2     2   13933 use Hash::Util qw( lock_keys );
  2         5516  
  2         12  
11 2     2   197 use List::Util qw( min );
  2         4  
  2         185  
12 2     2   2568 use Getopt::Long;
  2         30735  
  2         19  
13 2     2   420 use Carp qw( croak confess);
  2         4  
  2         7800  
14              
15             my $DEFAULT_CONFIG = 'cluster.cfg';
16              
17             # CONSTANTS
18             my $TRUE = 1;
19             my $FALSE = 0;
20             my $SPACE = q{ };
21             my $EMPTY_STRING = q{};
22             my $VERBOSE = 1;
23              
24             my $FASTQ_TYPE = 'fastq';
25             my $SIMPLE_TYPE = 'simple';
26              
27             # Act like a script if called as one
28             unless ( caller() ) { main(); }
29              
30             sub main {
31              
32 0     0 0 0 my $max_clusters = 10;
33 0         0 my $num_cpus = 5;
34 0         0 my $max_distance = 5;
35 0         0 my $max_top_seqs = 300;
36 0         0 my $config_filename = $DEFAULT_CONFIG;
37 0         0 my $options = GetOptions(
38              
39             # Required (one of these)
40             "$FASTQ_TYPE=s" => \my $fastq,
41             "$SIMPLE_TYPE=s" => \my $simple,
42              
43             # Optional
44             'max_distance=i' => \$max_distance,
45             'max_clusters=i' => \$max_clusters,
46             'max_top_seqs=i' => \$max_top_seqs,
47             'seed=s' => \my $seed_filename,
48             'cpus=i' => \$num_cpus,
49             'config=s' => \$config_filename,
50             'run' => \my $run_scripts,
51             );
52              
53 0         0 my $file_type;
54             my $infile;
55 0 0       0 if(defined $fastq){
    0          
56 0 0       0 if(defined $simple){
57 0         0 warn "--$FASTQ_TYPE and --$SIMPLE_TYPE flags are mutually exclusive!\n";
58 0         0 help();
59             }
60 0         0 $infile = $fastq;
61 0         0 $file_type = $FASTQ_TYPE;
62             }elsif(defined $simple){
63 0         0 $infile = $simple;
64 0         0 $file_type = $SIMPLE_TYPE;
65             }else{
66 0         0 warn "either --$FASTQ_TYPE or --$SIMPLE_TYPE must be used!\n";
67 0         0 help();
68             }
69              
70             # Eliminate case sensitivity for 'type'
71 0         0 $file_type = lc $file_type;
72              
73 0         0 my $config = get_config($config_filename);
74              
75 0         0 open( my $fh_in, '<', $infile );
76              
77 0         0 my $seed_fh;
78 0 0 0     0 if ( defined $seed_filename && -e $seed_filename ) {
79 0         0 open( $seed_fh, '<', $seed_filename );
80             }
81              
82 0         0 my ($cluster_href, $distance_href) = cluster(
83             fh => $fh_in,
84             max_distance => $max_distance,
85             max_clusters => $max_clusters,
86             seed_fh => $seed_fh,
87             file_type => $file_type,
88             );
89              
90 0         0 open( my $fh_all_clusters, '>', 'clusters.txt' );
91 0         0 write_out_clusters(
92             cluster_href => $cluster_href,
93             distance_href => $distance_href,
94             fh_all_clusters => $fh_all_clusters,
95             max_top_seqs => $max_top_seqs,
96             );
97 0         0 create_batch_files( $config, $num_cpus, $run_scripts );
98 0         0 return;
99             }
100              
101              
102             sub get_config {
103 0     0 0 0 my $config_filename = shift;
104              
105 0         0 my $config;
106 0 0       0 if ( ! -e $config_filename ) {
107 0         0 my $home_dir = $ENV{'HOME'};
108 0         0 $config = Config::Tiny->new();
109 0         0 $config->{executables} = {
110             mafft => 'mafft',
111             RNAalifold => 'RNAalifold',
112             cmalign => 'cmalign',
113             cmbuild => 'cmbuild',
114             cmcalibrate => 'cmcalibrate',
115             cmsearch => 'cmsearch',
116             CreateStockholm => 'selex_clustal2stockholm.pl',
117             stock2fasta => 'selex_stock2fasta.pl',
118             };
119 0         0 $config->{Flags_for} = {
120             RNAalifold => '-r -cv 0.6 -nc 10 -p -d2 -noLP -color -aln',
121             mafft => '--preservecase --clustalout',
122             };
123 0         0 $config->write($config_filename);
124 0         0 warn "\nNo configuration file found. Creating new configuration file '$config_filename'\n";
125 0         0 warn external_dependecnies();
126 0         0 warn <<"MSG";
127              
128             If you have problems, you may need to ensure that each executable
129             listed in '$config_filename' is located in a directory that is found
130             in your PATH environment variable.\n";
131             MSG
132             }
133 0         0 $config = Config::Tiny->read($config_filename);
134 0         0 return $config;
135             }
136              
137             sub create_batch_files {
138 0     0 0 0 my $config = shift;
139 0         0 my $num_cpus = shift;
140 0         0 my $run_scripts = shift;
141              
142             # Get all the file names to be processed
143 0         0 my @fasta_filenames = glob '*_top.fasta';
144              
145             # Reduce number of cpus if there are fewer files
146 0         0 $num_cpus = min(scalar @fasta_filenames, $num_cpus);
147              
148             # Create a batch of commands for each CPU to work on
149 0         0 my @workload = map { $EMPTY_STRING } 1 .. $num_cpus;
  0         0  
150 0         0 my $add_work = __add_work(
151             {
152             workload_aref => \@workload,
153             num_cpus => $num_cpus,
154             config => $config,
155             }
156             );
157 0         0 $add_work->($_) for @fasta_filenames;
158              
159             # Execute the commands for each CPU
160 0         0 for my $batch_num ( 1 .. $num_cpus ){
161              
162             # name script file for each batch
163 0         0 my $batch_filename = "batch_$batch_num";
164              
165             # Create a directory for each batch
166 0         0 system("mkdir $batch_filename.dir");
167              
168             # Move into batch directory
169 0         0 chdir "$batch_filename.dir";
170              
171             # Write batch instruction to script file
172 0         0 open( my $fh, '>', $batch_filename);
173 0         0 print {$fh} $workload[ $batch_num - 1];
  0         0  
174 0         0 close $fh;
175              
176             # Make script file executable
177 0         0 system("chmod u+x $batch_filename");
178              
179             # Run the script, if desired
180 0 0       0 if($run_scripts){
181 0         0 system("./$batch_filename &");
182             }
183              
184             # Return to directory about batch directory
185 0         0 chdir '..';
186             }
187 0         0 return;
188             }
189              
190             sub __add_work {
191 0     0   0 my %opt = %{ shift() };
  0         0  
192 0   0     0 my $workload_aref = $opt{workload_aref} || croak "'workload_aref' required";
193 0   0     0 my $num_cpus = $opt{num_cpus} || croak "'num_cpus' required";
194 0   0     0 my $config = $opt{config} || croak "'config' required";
195 0         0 my $work_index = 0;
196 0         0 my @filenames_to_rename = _filenames_to_rename();
197              
198 0         0 my $MAFFT_cmd = "$config->{executables}{mafft} $config->{Flags_for}{mafft}";
199 0         0 my $RNAalifold_cmd = "$config->{executables}{RNAalifold} $config->{Flags_for}{RNAalifold}";
200             return sub {
201 0     0   0 my $fasta_filename = shift;
202 0         0 my %file = file_name_hash($fasta_filename);
203              
204             # Don't allow accidentally creating new keys
205 0         0 lock_keys %file;
206              
207             # Cycle through the sets number of work orders
208 0 0       0 if ( $work_index >= $num_cpus ) {
209 0         0 $work_index = 0;
210             }
211              
212 0         0 $workload_aref->[$work_index] .= join(
213             "\n",
214              
215             # Pull fasta file into current directory
216             "mv ../$file{fasta} .",
217              
218             # Do alignment of sequences against each other
219             "$MAFFT_cmd $file{fasta} > $file{aligned}",
220              
221             # Calculate the secondary structure
222             "$RNAalifold_cmd < $file{aligned} > $file{sec_struct}",
223              
224             # Rename resulting files
225 0         0 ( map { "mv $_ $file{$_}" } @filenames_to_rename ),
226              
227             # Convert secondary structure file to Stockholm format
228             "$config->{executables}{CreateStockholm} $file{aligned} $file{sec_struct} > $file{stock}",
229              
230             # Determine covariance model
231             "$config->{executables}{cmbuild} $file{covar_model} $file{stock}",
232              
233             ) . "\n";
234              
235             # Increment work index
236 0         0 $work_index++;
237              
238 0         0 return;
239 0         0 };
240             }
241              
242             sub _pairs_from_filenames_to_rename {
243 0     0   0 my $base_filename = shift;
244 0         0 my @pairs = _pairs_from_array( $base_filename . '_', _filenames_to_rename());
245 0         0 return @pairs;
246             }
247              
248             sub _pairs_from_array {
249 0     0   0 my ($added_string, @array) = @_;
250 0         0 return map { $_ => $added_string . $_ } @array;
  0         0  
251             }
252              
253             sub _filenames_to_rename {
254 0     0   0 return qw( alirna.ps alidot.ps aln.ps alifold.out);
255             }
256              
257             sub file_name_hash {
258 0     0 0 0 my $fasta_filename = shift;
259 0         0 my $base_filename = base_filename($fasta_filename);
260 0         0 my %file = (
261             fasta => $fasta_filename,
262             aligned => $base_filename . '.aln',
263             sec_struct => $base_filename . '.gc',
264             _pairs_from_filenames_to_rename($base_filename),
265             stock => $base_filename . '.sto', #stockholm format
266             covar_model => $base_filename . '.cm',
267             );
268 0         0 return %file;
269             }
270              
271             sub base_filename {
272 0     0 0 0 my $filename = shift;
273 0         0 $filename =~ s/(\.\w+)\z//;
274 0         0 return $filename;
275             }
276              
277             #: PUBLIC_SUBS
278             sub cluster {
279 7     7 0 23664 my %opt = @_;
280              
281             # Required parameters
282 7   33     88 my $max_distance = $opt{max_distance} || croak 'max_distance required';
283 7   33     21 my $fh = $opt{fh} || croak 'fh required';
284 7   33     20 my $max_clusters = $opt{max_clusters} || croak 'max_clusters required';
285 7   33     18 my $file_type = $opt{file_type} || confess 'file_type required';
286              
287             # Optional parameters
288 7         11 my $seed_fh = $opt{seed_fh};
289              
290 7         10 my @seed_pairs;
291              
292 7         19 my @seq_count_pairs = get_sequences_from($fh, $file_type);
293              
294             # Add any seed sequences to the beginning of the sequence list
295             # Oops, this also sorts and counts the sequences in the seed list.
296             # Is that a problem?
297 7 100       23 if( defined $seed_fh){
298 2         15 @seed_pairs = get_sequences_from($seed_fh, $file_type);
299 2         6 unshift @seq_count_pairs, @seed_pairs;
300             }
301              
302 7         9 my %cluster_for;
303             my %distance_for; # Hash that keeps track of edit distances (parallel to %cluster_for)
304 7         9 my $next_id = 1;
305              
306             # Create the first cluster with the first sequence count pair.
307 7         14 my $first_pair = shift @seq_count_pairs;
308 7         22 $cluster_for{$next_id} = [ $first_pair];
309              
310             # No distance from itself!
311 7         22 $distance_for{$next_id}{$first_pair->[0]} = 0;
312              
313             # Increment next cluster id, since it has already been used.
314 7         10 $next_id++;
315              
316             # Add sequences to existing cluster, or create new ones up to the maximum
317 7         22 while ( my $seq_count_pair = shift @seq_count_pairs ) {
318 34         75 my ($cluster_id, $distance) = matching_cluster_and_distance( $max_distance, \%cluster_for, $seq_count_pair );
319              
320             # Create new cluster, if one wasn't found
321 34 100       82 if( ! defined $cluster_id){
322 13         13 $cluster_id = $next_id;
323 13         16 $distance = 0;
324              
325             # Increment counter
326 13         16 $next_id++;
327             }
328              
329             # Add sequence info to the cluster, if the cluster is within the maximum requested
330 34 100       269 if( $cluster_id <= $max_clusters){
331 31         33 push @{ $cluster_for{$cluster_id} }, $seq_count_pair ;
  31         66  
332 31         119 $distance_for{$cluster_id}{$seq_count_pair->[0]} = $distance;
333             }
334             }
335              
336 7         41 return (\%cluster_for, \%distance_for);
337             }
338              
339             sub total_plus_cluster {
340 9     9 0 12 my $opt = shift;
341 9         14 my $cluster = $opt->{cluster};
342 9         9 my @seqs_with_count = @{$cluster};
  9         22  
343 9         14 my $total_count = 0;
344              
345 9         13 for my $seq_with_count (@seqs_with_count) {
346 21         36 $total_count += $seq_with_count->[1];
347             }
348              
349             return {
350 9         55 total => $total_count,
351             cluster => \@seqs_with_count,
352             original_id => $opt->{original_id},
353             };
354             }
355              
356             sub write_out_clusters {
357 3     3 0 28 my %opt = @_;
358              
359             # Required input
360 3   33     17 my $cluster_href = $opt{cluster_href} || croak 'cluster_href required';
361 3   33     12 my $distance_href = $opt{distance_href} || croak 'distance_href required';
362 3   33     9 my $fh_all_clusters = $opt{fh_all_clusters} || croak 'fh_all_clusters required';
363 3   33     10 my $max_top_seqs = $opt{max_top_seqs} || croak 'max_top_seqs required';
364              
365             # Optional input
366 3   50     9 my $fh_href = $opt{fh_href} || {};
367              
368             # Sort clusters by number of sequences they contain (including redundant
369             # ones).
370 9         40 my @total_plus_clusters =
371 3         8 map { total_plus_cluster( {original_id => $_, cluster => $cluster_href->{$_}} ) }
372 3         4 keys %{$cluster_href};
373              
374 3         7 my $new_id = 1;
375             # Sort clusters by number of sequences they contain (including redundant ones).
376 3         9 for my $total_plus_cluster ( reverse sort { $a->{total} <=> $b->{total} } @total_plus_clusters ) {
  7         19  
377              
378 9         10 my $number_of_sequences_in_cluster = scalar @{ $total_plus_cluster->{cluster} };
  9         18  
379              
380 9         12 my $grouping;
381              
382             # Call it a 'single' if only one unique sequence
383 9 100       22 if($number_of_sequences_in_cluster == 1){
384 3         6 $grouping = 'single';
385             }else{
386 6         10 $grouping = 'cluster';
387             };
388              
389             # Print header for each cluster/single
390 9         10 print {$fh_all_clusters} "######## $grouping $new_id ########\n";
  9         37  
391              
392             # Use prescribed filehandle for each sequence, or create one
393 9         10 my $fh;
394 9 50       21 if ( defined $fh_href->{$new_id} ) {
395 9         18 $fh = $fh_href->{$new_id};
396             }
397             else {
398 0         0 my $filename = $grouping . '_' . $new_id . '_top.fasta';
399 0         0 open( $fh, '>', $filename );
400 0 0       0 print "created output file '$filename'\n" if $VERBOSE;
401             }
402              
403             # Write cluster info to the "all" and individual cluster files
404 9         111 write_cluster(
405             {
406             fh_all_clusters => $fh_all_clusters,
407             fh => $fh,
408             cluster_aref => $total_plus_cluster->{cluster},
409             cluster_number => $new_id,
410             original_cluster_id => $total_plus_cluster->{original_id},
411             max_top_seqs => $max_top_seqs,
412             distance_href => $distance_href,
413             }
414             );
415              
416             # Increment cluster id
417 9         31 $new_id++;
418             }
419 3         21 return;
420             }
421              
422             sub write_cluster {
423 9     9 0 34 my $opt = shift;
424              
425 9         14 my $fh_all_clusters = $opt->{fh_all_clusters};
426 9         13 my $fh = $opt->{fh};
427 9         14 my $cluster_aref = $opt->{cluster_aref};
428 9         13 my $distance_href = $opt->{distance_href};
429 9         16 my $cluster_number = $opt->{cluster_number};
430 9         13 my $max_top_seqs = $opt->{max_top_seqs};
431 9         14 my $original_id = $opt->{original_cluster_id};
432              
433 9         12 my @seq_w_counts = @{$cluster_aref};
  9         22  
434              
435             # Remove total count, leaving just pairs with counts
436 9         12 my $internal_seq_id = 1;
437              
438 9         11 my $is_single;
439 9 100       75 $is_single = $TRUE if @seq_w_counts == 1;
440              
441 9         13 my $num_seqs = scalar @seq_w_counts;
442              
443             # If there are more than max_top_seqs, then split them into two arrays:
444             # One that will be processed and the other that will simply be output
445 9         11 my @overage_seqs;
446 9 100       18 if ( $num_seqs > $max_top_seqs ) {
447 2         6 @overage_seqs = @seq_w_counts[ $max_top_seqs .. ( $num_seqs - 1 ) ];
448 2         9 @seq_w_counts = @seq_w_counts[ 0 .. ( $max_top_seqs - 1 ) ];
449             }
450              
451             # SEQ_W_COUNT_LOOP
452 9         16 for my $seq_w_count (@seq_w_counts) {
453 19         18 my ( $seq, $count ) = @{$seq_w_count};
  19         43  
454              
455 19         38 my $distance = $distance_href->{$original_id}{$seq};
456              
457 19         46 my $unique_id = join( '.', $cluster_number, $internal_seq_id, $count, $distance );
458              
459             # Print to individual cluster file and all clusters file
460 19         23 print {$fh_all_clusters} "$unique_id\t$seq\n";
  19         98  
461              
462 19         24 print {$fh} ">$unique_id\n$seq\n";
  19         55  
463              
464             # Print second copy if this is a singleton (to make
465             # multi-sequence alignment behave well)
466 19 100       68 if ($is_single) {
467 3         6 $unique_id .= 'b';
468 3         5 print {$fh} ">$unique_id\n$seq\n";
  3         10  
469             }
470 19         42 $internal_seq_id++;
471             }
472              
473             # OVERAGE LOOP
474 9 100       22 if (@overage_seqs) {
475 2         6 my $filename = "cluster_${cluster_number}_overage.fasta";
476              
477 2         12 open( my $overage_fh, '>', $filename );
478 2         3516 for my $seq_w_count (@overage_seqs) {
479 2         6 my ( $seq, $count ) = @{$seq_w_count};
  2         5  
480              
481 2         7 my $distance = $distance_href->{$original_id}{$seq};
482 2         8 my $unique_id = join( '.', $cluster_number, $internal_seq_id, $count, $distance );
483 2         3 print {$overage_fh} ">$unique_id\n$seq\n";
  2         16  
484 2         2 print {$fh_all_clusters} "$unique_id\t$seq\n";
  2         9  
485 2         151 $internal_seq_id++;
486             }
487             }
488              
489 9         25 return;
490             }
491              
492             sub matching_cluster_and_distance {
493 35     35 0 2119 my $max_distance = shift;
494 35         44 my $cluster_href = shift;
495 35         38 my $seq_aref = shift;
496              
497 35         90 ID_LOOP:
498 35         34 for my $id ( keys %{$cluster_href} ) {
499 58         87 my $cluster_seq = $cluster_href->{$id}->[0];
500              
501 58         303 my $distance = distance( $seq_aref->[0], $cluster_seq->[0] );
502              
503             # Short circuit ID_LOOP when one is found (supposedly only one will match)
504 58 100       147 if ( $distance < $max_distance ) {
505 22         61 return ($id, $distance);
506             }
507             }
508 13         30 return;
509             }
510              
511             sub get_sequences_from {
512 10   33 10 0 9257 my $fh = shift || croak 'fh required';
513 10   33     32 my $type = shift || confess 'file type required';
514 10         14 my %count_of;
515 10         31 my $next_seq = _next_sequence_for($fh, $type);
516 10         16 while (1) {
517 82         173 my $seq = $next_seq->();
518 82 100       157 last if ! defined $seq;
519 72 50       137 next if $seq eq $EMPTY_STRING;
520              
521             # If sequence has not been seen before, start counting it. Otherwise, add to the count.
522 72 100       170 if( ! exists $count_of{$seq} ){
523 48         147 $count_of{$seq} = 1;
524             }else {
525 24         48 $count_of{$seq} = $count_of{$seq} + 1;
526             }
527             }
528              
529             # Convert hash into an array containing paired values.
530             # First value in the pair is the sequence.
531             # | The second value in the pair is the number of times (the
532             # | | count) that that sequence is seen in the file.
533             # v v
534 10         32 my @sequences = map { [ $_, $count_of{$_} ] } keys %count_of;
  48         124  
535              
536             # Sort the sequences so that the most abundant occur first in the array.
537 10         46 @sequences = sort { $b->[1] <=> $a->[1] } @sequences;
  76         110  
538              
539 10         66 return @sequences;
540             }
541              
542             sub _next_sequence_for {
543 10   33 10   26 my $fh = shift || croak 'fh (first positional parameter) required';
544 10   33     28 my $type = shift || confess 'file_type (second positional parameter) required';
545              
546             # Simply use next_line if we want each and every line
547 82     82   129 return sub { next_line($fh) }
548 10 50       65 if $type eq $SIMPLE_TYPE;
549              
550             # Only other supported file type is 'fastq'
551 0 0       0 confess "Unrecognized type '$type'. Only '$SIMPLE_TYPE' and '$FASTQ_TYPE' are currently recognized."
552             if $type ne $FASTQ_TYPE;
553              
554             # Skip first header line
555 0         0 readline $fh;
556              
557             return sub {
558 0     0   0 my $line = readline $fh;
559 0 0       0 return if !defined $line;
560              
561             # remove newline and carriage return
562 0         0 chomp $line;
563 0         0 $line =~ s/\r//g;
564              
565             # Skip quality header, quality score, and next sequence header
566 0         0 readline $fh;
567 0         0 readline $fh;
568 0         0 readline $fh;
569              
570 0         0 return $line;
571 0         0 };
572             }
573              
574             sub next_line {
575 82     82 0 96 my $fh = shift;
576              
577             # Get next line from file
578 82         144 my $line = readline $fh;
579              
580             # return undef if nothing left to read.
581 82 100       161 return if ! defined $line;
582              
583             #remove newline
584 72         82 chomp $line;
585              
586             #remove carriage return
587 72         90 $line =~ s/\r//g;
588 72         161 return $line;
589             }
590              
591             sub help {
592 0     0 0   print << "END";
593             $0 --$FASTQ_TYPE=FILENAME [OPTIONS]
594             $0 --$SIMPLE_TYPE=FILENAME [OPTIONS]
595              
596             OPTIONS (showing defaults)
597             --max_distance 5
598             --cpus 5
599             --max_clusters 10
600             --max_top_seqs 300
601             --config cluster.cfg
602             END
603 0           exit();
604             }
605              
606             sub external_dependecnies {
607 0     0 0   return <
608             External dependencies:
609             mafft (see http://mafft.cbrc.jp/alignment/software/)
610             Infernal (see http://infernal.janelia.org/), specifically:
611             cmalign
612             cmbuild
613             cmcalibrate
614             cmsearch
615             RNA Vienna Package (see http://www.tbi.univie.ac.at/~ivo/RNA/), specifically:
616             RNAalifold
617             These must be installed and in a directory that is your PATH environment variable.
618             END
619              
620             };
621              
622             1;
623              
624             =pod
625              
626             =head1 NAME
627              
628             Bio::App::SELEX::RNAmotifAnalysis - Cluster SELEX sequences and calculate their structures
629              
630             =head1 SYNOPSIS
631              
632             RNAmotifAnalysis --fastq seqs.fq --cpus 4 --run
633              
634             =head1 DESCRIPTION
635              
636             This module pipelines steps in the analysis of SELEX (Systematic Evolution
637             of Ligands through EXponential enrichment) data.
638              
639             This main module creates scripts to do the following:
640              
641             (1) Cluster similar sequences based on edit distance.
642              
643             (2) Align sequences within each cluster (using mafft).
644              
645             (3) Calculate the secondary structure of the aligned sequences (using
646             RNAalifold, from the Vienna RNA package)
647              
648             (4) Build covariance models using cmbuild from Infernal.
649              
650             Another useful utility installed with this distribution is
651             "selex_covarianceSearch" for doing iterative refinements of
652             covariance models.
653              
654             If you want to use files that simply list sequences, then use
655             the "--simple" flag instead of the "--fastq" flag.
656              
657             This script assumes that you've already done all of the quality
658             control of your sequences beforehand. If the FASTQ format is
659             used, quality scores are ignored.
660              
661             =head1 EXAMPLE USE
662              
663             RNAmotifAnalysis --infile seqs.fq --cpus 4 --run
664              
665             This will cluster the sequences found in 'seqs.fq' and create a FASTA file
666             for each one. The FASTA files will be grouped into batches (i.e. one per
667             cpu requested) that will be placed in a separate directory for each batch,
668             and processed within that directory. At the end of processing, for each
669             cluster there will be a covariance model and postscript illustration
670             files. The batch script used to process each batch will be located in the
671             respective batch directory. To produce the scripts without running them,
672             simply exclude the --run flag from the command line.
673              
674             The output file contains names that contain four period delimited values
675             For example, 2.3.1.5 means
676             that this is the second cluster
677             this is the third sequence in the cluster
678             there is one copy of this sequences
679             it is an edit distance of 5 from the reference sequence
680              
681             =head1 CONFIGURATION AND ENVIRONMENT
682              
683             As written, this code makes heavy use of UNIX utilities and is
684             therefore only supported on UNIX-like environemnts (e.g. Linux, UNIX, Mac
685             OS X).
686              
687             Install Infernal, MAFFT, and the RNA Vienna package ahead of time and add
688             the directories containing their executables to your PATH, so that the
689             first time you run RNAmotifAnalysis.pm the configuration file (cluster.cfg)
690             that is generated will have all of the correct parameters. Otherwise,
691             you'll need to update the configuration file manually.
692              
693             To update the PATH environment variable with the directory '/usr/local/myapps/bin/',
694             update your .bashrc file, thus:
695              
696             echo 'export PATH=/usr/local/myapps/bin:$PATH' >> ~/.bashrc.
697              
698             Now, every time you open a new terminal window, the PATH environment
699             variable will contain '/usr/local/myapps/bin/'. To make your new .bashrc
700             file effective immediately (i.e. without having to open a new terminal
701             window), use the following command:
702              
703             source ~/.bashrc
704              
705             =head1 INSTALLATION
706              
707             These installation instructions assume being able to open and use a
708             terminal window on Linux.
709              
710             (0) Some systems need several dependencies installed ahead of time.
711              
712             You may be able to skip this step. However, if subsequent steps don't
713             work, then be sure that some basic libraries are installed, as shown
714             below (or ask a system administrator to take care of it). For the
715             applicable distribution, open a terminal and then type the commands as
716             indicated:
717              
718             For RedHat or CentOS 5.x systems (tested on CentOS 5.5)
719              
720             sudo yum install gcc
721              
722             For RedHat or CentOS 6.x systems (tested on "Minimal Desktop" CentOS 6.0)
723              
724             sudo yum install gcc
725             sudo yum install perl-devel
726              
727             For Ubuntu systems (tested on Ubuntu 12-04 LTS)
728              
729             sudo apt-get install curl
730              
731             For Debian 5.x systems:
732              
733             sudo apt-get install gcc
734             sudo apt-get install make
735              
736             (1) Install the non-Perl dependencies:
737             (Versions shown are those that we've tested. Please contact us if
738             newer versions do not work.)
739              
740             Infernal 1.0.2 (http://infernal.janelia.org/)
741             MAFFT 6.849b (http://mafft.cbrc.jp/alignment/software/)
742             RNA Vienna package 1.8.4 (http://www.tbi.univie.ac.at/~ivo/RNA/)
743              
744             After installing these, make sure all of the foloowing executables are
745             in directories within your PATH:
746              
747             cmbuild
748             cmcalibrate
749             cmsearch
750             cmalign
751             mafft
752             RNAalifold
753              
754             (2) Use a CPAN client to install Bio::App::SELEX::RNAmotifAnalysis.
755              
756             Here we demonstrate the use of cpanminus to install it to a local Perl module directory. These instructions assume absolutely no experience with cpanminus.
757              
758             1. Download cpanminus
759              
760             curl -LOk http://xrl.us/cpanm
761              
762              
763             2. Make it executable
764              
765             chmod u+x cpanm
766              
767              
768             3. Make a local lib/perl5 directory (if it doesn't already exist)
769              
770             mkdir -p ~/lib/perl5
771              
772              
773             4. Add relevant directories to your PERL5LIB and PATH environment
774             variables by adding the following text to your ~/.bashrc
775             file:
776              
777              
778             # Set PERL5LIB if it doesn't already exist
779             : ${PERL5LIB:=~/lib/perl5}
780              
781             # Prepend to PERL5LIB if directory not already found in PERL5LIB
782             if ! echo $PERL5LIB | egrep -q "(^|:)~/perl5/lib/perl5($|:)"; then
783             export PERL5LIB=~/lib/perl5:$PERL5LIB;
784             fi
785              
786             # Prepend to PATH if directory not already found in PATH
787             if ! echo $PATH | egrep -q "(^|:)~/perl5/bin($|:)"; then
788             export PATH=~/bin:$PATH;
789             fi
790              
791              
792             5. Update environment variables immediately
793              
794             source ~/.bashrc
795              
796              
797             6. Install Module::Build
798              
799             ./cpanm Module::Build
800              
801              
802             7. Install Text::LevenshteinXS (even if you already have it installed elsewhere)
803              
804             ./cpanm Text::LevenshteinXS
805              
806              
807             8. Install Bio::App::SELEX::RNAmotifAnalysis
808              
809             ./cpanm Bio::App::SELEX::RNAmotifAnalysis
810              
811              
812             Please contact the author if, after consulting this documentation and
813             searching Google with error messages, you still encounter difficulties
814             during the installation process.
815              
816             =head1 INCOMPATIBILITIES
817              
818             Windows: lacks necessary *nix utilities
819             SGI: problems with compiled dependency Text::LevenshteinXS
820             Sun/Solaris: problems with compiled dependency Text::LevenshteinXS
821             BSD: problems with compiled dependency Text::LevenshteinXS
822              
823             =head1 BUGS AND LIMITATIONS
824              
825             There are no known bugs in this module.
826             Please report problems to molecules cpan org
827             Patches are welcome.
828              
829             =head1 RELATED PUBLICATIONS
830              
831             Ditzler MA, Lange MJ, Bose D, Bottoms CA, Virkler KF, et al. (2013) High-
832             throughput sequence analysis reveals structural diversity and improved
833             potency among RNA inhibitors of HIV reverse transcriptase. Nucleic Acids
834             Res 41(3):1873-1884. doi: 10.1093/nar/gks1190
835              
836             =cut