File Coverage

lib/SmotifTF/Psipred.pm
Criterion Covered Total %
statement 13 15 86.6
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             package SmotifTF::Psipred;
2              
3 1     1   20223 use v5.8.8;
  1         3  
  1         38  
4 1     1   4 use strict;
  1         1  
  1         22  
5 1     1   3 use warnings;
  1         5  
  1         30  
6              
7 1     1   568 use File::Spec::Functions qw(catfile catdir);
  1         727  
  1         69  
8 1     1   508 use SmotifTF::GeometricalCalculations;
  0            
  0            
9             use Carp;
10             use Proc::Simple;
11             use Data::Dumper;
12              
13             BEGIN {
14             use Exporter ();
15             our ( $VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
16             $VERSION = "0.01";
17              
18             @ISA = qw(Exporter);
19              
20             # name of the functions to export
21             @EXPORT = qw(
22             run
23             analyze_psipred
24             parse
25             write_outfile
26             count_smotifs
27             count_pdb_smotifs
28             );
29              
30             @EXPORT_OK = qw(
31             split_ss_string
32             getFirstResidueNumber
33             ); # symbols to export on request
34             }
35              
36             our @EXPORT_OK;
37              
38             use Config::Simple;
39             my $config_file = $ENV{'SMOTIFTF_CONFIG_FILE'};
40             croak "Environmental variable SMOTIFTF_CONFIG_FILE should be set"
41             unless $config_file;
42             my $cfg = new Config::Simple($config_file);
43              
44             use constant SMOTIFS_NUMBER_UPPER_LIMIT => 14;
45             use constant SMOTIFS_NUMBER_LOWER_LIMIT => 2;
46              
47             my $modeller = $cfg->param( -block => 'psipred' );
48             my $PSIPRED_PATH = $modeller->{'path'};
49             my $PSIPRED_EXEC = $modeller->{'exec'};
50              
51              
52             =head1 NAME
53              
54             Psipred
55              
56             =head1 VERSION
57              
58             Version 0.01
59              
60             =cut
61              
62             our $VERSION = '0.01';
63              
64             =head1 SYNOPSIS
65              
66             Set of routines to run and analyze Psipred
67              
68             Usage:
69              
70             use Psipred;
71              
72             run ($sequence, $directory);
73             analyze_psipred ($pdb, $chain, $directory, $havestructure);
74              
75             =cut
76              
77             =head2 run
78              
79             run psipred
80             /usr/local/bin/runpsipred 4wyq.fasta
81            
82             =cut
83              
84             sub run {
85             use Proc::Simple;
86              
87             my %args = (
88             sequence => '',
89             directory => '',
90             @_,
91             );
92             my $sequence = $args{'sequence'} || undef;
93             my $directory = $args{'directory'} || "./";
94              
95             croak "sequence in fasta format is required" unless $sequence;
96              
97             my $myproc = Proc::Simple->new();
98              
99             my $log_file = "$directory/psipred_log.txt";
100             my $error_file= "$directory/psipred_err.txt";
101             $myproc->redirect_output($log_file, $error_file);
102              
103             chdir $directory;
104             my $runpsipred = catfile( $PSIPRED_PATH, $PSIPRED_EXEC );
105             my $status = $myproc->start( "$runpsipred", $sequence );
106            
107             # Wait until process is done
108             my $exit_status = $myproc->wait();
109              
110             return $exit_status;
111              
112             }
113              
114              
115             =head2 analyze_psipred
116              
117             It will return the secondary structure prediction and sequence
118             from psipred output file.
119              
120             Script to get the Smotif defintions from PSIPRED output
121              
122             INPUT ARGUMENTS:
123             1) $pdb : pdb code (the PSIPRED output file should be in the subdirectory identified by the pdb code e.g. 1ptf/1ptf.horiz)
124             2) $chain : chain ID
125             3) If structure exists, 1, else, 0.
126              
127             INPUT FILES:
128             In the folder:
129             1)/.horiz (PSIPRED output file)
130             2)/pdb.ent (PDB file), if structure exists.
131              
132             OUTPUT FILES:
133             In the folder:
134             .out :File with smotif information: Protein code, Chain, Smotif type, Smotif start residue, Loop length, Length of first secondary structure, Length of second secondary structure, Sequence
135              
136             =cut
137              
138             sub analyze_psipred {
139            
140             my %args = (
141             pdb => '',
142             chain => '',
143             directory => '',
144             havestructure => '',
145             @_,
146             );
147             my $pdb = $args{'pdb'} || undef;
148             my $chain = $args{'chain'} || 'A';
149             my $directory = $args{'directory'} || "./";
150             my $havestructure= $args{'havestructure'} || 0;
151            
152             croak "pdb_id is required" unless $pdb;
153            
154             my %psipred = parse("$directory/$pdb.horiz");
155            
156             #print Dumper(\%psipred);
157            
158             croak "Error processing psipred out files. Pred was not found."
159             unless exists $psipred{"Pred"};
160            
161             croak "Error processing psipred out files. AA was not found."
162             unless exists $psipred{"AA"};
163            
164             my $sslist = $psipred{"Pred"};
165             my $seq = $psipred{"AA"};
166              
167             my $start = 0; #Initialize start residue
168             my @ss = split_ss_string( $sslist, \$start);
169              
170             # use Data::Dumper;
171             # print Dumper(\@ss);
172              
173             # find actual start residue in pdb file, if structure exists
174             my $shiftindex = 1;
175             if ($havestructure == 1) {
176             my $localpdb = SmotifTF::GeometricalCalculations::get_full_path_name_for_pdb_code($pdb, $chain);
177             unless (-e $localpdb) {croak "PDB structure file missing for $pdb $chain"};
178             $shiftindex = getFirstResidueNumber($localpdb);
179             }
180            
181             # Print smotif information to $pdb/$pdb.out file
182             $start = $start + $shiftindex;
183             my $filename = catfile ($directory, "$pdb.out");
184             write_outfile ($pdb,$chain,$filename,$start,$seq,@ss);
185              
186             # Check the number of Smotifs in the Query protein
187             my $num_motifs = count_smotifs ($filename);
188            
189             # print "Returning num_motifs = $num_motifs\n";
190             return ($seq, $num_motifs);
191             }
192              
193             =head2 write_outfile
194            
195             Writes the smotif definitions obtained from Psipred to the $pdb/$pdb.out file
196              
197             Input: $filename, $seq, @ss
198             This is the most important file and has to named $pdb/$pdb.out
199             and the format of this file is
200             print OUTFILE "Name\tChain\tType\tStart\tLooplength\tSS1length\tSS2length\tSequence\n";
201              
202             =cut
203            
204             sub write_outfile {
205            
206             my ($pdb,$chain,$filename,$start,$seq,@ss) = @_;
207            
208             croak "4-letter pdb code is required" unless $pdb;
209             # croak "Chain ID is required" unless $chain;
210             croak "Output filename is required" unless $filename;
211             croak "Protein sequence is required" unless $seq;
212             croak "SS array is requires" unless @ss;
213             croak "Smotif start residue number is required" unless $start;
214            
215             # Query chain-name is not manadatory for calculation further down.
216             # That's why we set it to A by default.
217             $chain = 'A' unless defined $chain;
218              
219             open(OUTFILE,">$filename");
220             print OUTFILE "Name\tChain\tType\tStart\tLooplength\tSS1length\tSS2length\tSequence\n";
221             for (my $aa=0;$aa<(scalar(@ss)-1)/2;$aa++) {
222             print OUTFILE "$pdb";
223             print OUTFILE ".pdb\t$chain\t";
224             print OUTFILE $ss[$aa*2][0];
225             print OUTFILE $ss[$aa*2+2][0];
226             print OUTFILE "\t",$start,"\t";
227             print OUTFILE scalar(@{$ss[$aa*2+1]}),"\t";
228             print OUTFILE scalar(@{$ss[$aa*2]}),"\t";
229             print OUTFILE scalar(@{$ss[$aa*2+2]}),"\t";
230             my $len=scalar(@{$ss[$aa*2]})+scalar(@{$ss[$aa*2+1]})+scalar(@{$ss[$aa*2+2]});
231             print OUTFILE substr($seq,$start,$len),"\n";
232             $start+=scalar(@{$ss[$aa*2]})+scalar(@{$ss[$aa*2+1]});
233             }
234             close(OUTFILE);
235              
236             }
237              
238             =head2 split_ss_string
239              
240             SUBROUTINE to convert a series of secondary structure types to a list of start and end points for smotifs
241             #Input 1: $ss - a string of secondary structure types,
242             where H=alpha helix and E=strand e.g. (HHHHLLLLEEEE)
243             #Input 2: $startpos - residue at which first smotif begins
244              
245             =cut
246             sub split_ss_string {
247             my ($ss, $startpos) = @_;
248              
249             croak "secondary structure string is needed" unless $ss;
250             croak "start position is not defined" unless defined $startpos;
251              
252             my @ss = split(//, $ss);
253             # convert all non-helix/non-strand positions to 'L'
254             foreach (@ss) {
255             if ( (($_) ne 'H') and (($_) ne 'E') ) {
256             $_='L'
257             }
258             }
259            
260             # convert strands/helices less than 2 residues long into loops
261             my $sslength = 1;
262             my $prev = $ss[0];
263             for (my $count=1; $count
264             if ($ss[$count] eq $prev) {
265             $sslength++;
266             } else {
267             if ((($sslength<2) and ($prev ne 'L')) or (($sslength<4) and ($prev eq 'H'))) {
268             for (my $aa = $count-$sslength; $aa<$count; $aa++) {
269             $ss[$aa]='L'
270             };
271             }
272             $prev = $ss[$count];
273             $sslength= 1;
274             }
275             }
276             my @out;
277             my $start = 0;
278             for (my $aa=0; $aa
279             if ($ss[$aa] ne $ss[$aa+1]) { #switch in secondary structure type implies a landmark
280             push( @out,[@ss[$start..$aa]]);
281             if (($ss[$aa] ne 'L') and ($ss[$aa+1] ne 'L')) {
282             $ss[$aa+1] = 'L';
283             }
284             $start = $aa + 1;
285             }
286             }
287             if ($out[0][0] eq 'L') {
288             $$startpos = scalar(@{$out[0]});
289             shift(@out);
290             }
291             if ($out[-1][0] eq 'L') {
292             pop(@out)
293             };
294             return @out;
295             }
296              
297             =head2 getFirstResidueNumber
298              
299             Subroutine to get starting residue number of a pdb file
300              
301             =cut
302              
303             sub getFirstResidueNumber {
304              
305             my ($localpdb) = @_;
306              
307             croak "PDB file name is required" unless $localpdb;
308            
309             my $resno = 1;
310             open PDB, "<$localpdb";
311             WLOOP: while (my $line = ) {
312             chomp $line;
313             #print "$line\n";
314             if ($line =~ /^ATOM/) {
315             $resno = substr($line, 22, 5); $resno =~ s/\s+//g;
316             last WLOOP;
317             } elsif ($line =~ /^HETATM/) {
318             $resno = substr($line, 22, 5); $resno =~ s/\s+//g;
319             last WLOOP;
320             }
321             }
322             $resno = 1 if ($resno =~ /[a-zA-Z]/);
323              
324             close PDB;
325             return $resno;
326             }
327              
328             =head2 parse
329              
330             It returns a hash containing the following keys:
331             $VAR1 = {
332             'AA' => 'MNLQPIFWIGLISSVCCVF...'
333             'Conf' => '9975402235541123446...'
334             'Pred' => 'CCCCCCHHHHHHHCEEEEE...'
335             };
336              
337             =cut
338             sub parse {
339            
340             my ($file) = @_;
341            
342             chomp $file if $file;
343            
344             open my $fh, "<". $file
345             or die "cannot open $file $!";
346            
347             croak "$file does end in .horiz"
348             unless $file =~ m/\.horiz$/;
349              
350             my %ppred;
351             while (my $line = <$fh>) {
352             if ($line =~ /^Conf:/) {
353             $line =~ s/^Conf://g;
354             $line =~ s/\s+//g;
355             if ( exists($ppred{'Conf'}) ){
356             $ppred{'Conf'} .= $line;
357             }
358             else {
359             $ppred{'Conf'} = $line;
360             }
361             }
362             if ($line =~ /^Pred:/) {
363             $line =~ s/^Pred://g;
364             $line =~ s/\s+//g;
365             if ( exists($ppred{'Pred'}) ){
366             $ppred{'Pred'} .= $line;
367             }
368             else {
369             $ppred{'Pred'} = $line;
370             }
371             }
372             if ($line =~ /^\s+AA:/) {
373             $line =~ s/^\s+AA://g;
374             $line =~ s/\s+//g;
375             if ( exists($ppred{Conf}) ){
376             $ppred{'AA'} .= $line;
377             }
378             else {
379             $ppred{'AA'} = $line;
380             }
381             }
382             }
383             close $fh;
384             return %ppred;
385             }
386              
387             =head2 count_smotifs
388              
389             Subroutine to count the number of Smotifs in a given Smotif definition file (typically $pdb/$pdb.out).
390             Input : Filename with Smotif definition
391             Output: Returns the number of Smotifs in the given file
392              
393             =cut
394              
395             sub count_smotifs {
396              
397             my ($filename) = @_;
398            
399             # print "opening $filename ...\n";
400             open(INFILE, $filename) or croak "Unable to open Smotif definition file $filename\n";
401             my $smotifs = 0;
402             my $line = ; #skip header line
403             while (my $line = ) {
404             chomp $line;
405             my @lin = split (/\s+/,$line);
406             # print Dumper(\@lin);
407             croak "Unrecognized file format in $filename\n"
408             if not (scalar(@lin) == 8);
409             $smotifs++;
410             }
411             close (INFILE);
412             # print "smotifs = $smotifs\n";
413             return $smotifs;
414             }
415              
416             =head2 count_pdb_smotifs
417              
418             Subroutine to count the number of Smotifs in a given Smotif definition file (typically $pdb/$pdb.out).
419             Input : Filename with Smotif definition
420             Output: Returns the number of Smotifs in the given file
421              
422             =cut
423              
424             sub count_pdb_smotifs {
425             my ( $pdb ) = @_;
426             use File::Spec::Functions qw(catfile);
427              
428             croak "4-letter pdb code is required" unless $pdb;
429              
430             # my $filename = "$pdb\/$pdb" . ".out";
431             my $filename = catfile( $pdb, "$pdb.out" );
432              
433             croak "$filename does not exists. First run 'Psipred' step 1."
434             unless ( -e $filename );
435              
436             my $smotifs = count_smotifs( $filename );
437             return $smotifs;
438             }
439              
440             =head1 AUTHORS
441              
442             Fiserlab Members , C<< >>
443              
444             =head1 BUGS
445              
446             Please report any bugs or feature requests to C, or through
447             the web interface at L.
448              
449             =head1 SUPPORT
450              
451             You can find documentation for this module with the perldoc command.
452              
453             perldoc Psipred
454              
455             =over 4
456              
457             =item * RT: CPAN's request tracker (report bugs here)
458              
459             L
460              
461             =item * AnnoCPAN: Annotated CPAN documentation
462              
463             L
464              
465             =item * CPAN Ratings
466              
467             L
468              
469             =item * Search CPAN
470              
471             L
472              
473             =back
474              
475             =head1 LICENSE AND COPYRIGHT
476              
477             Copyright 2015 Fiserlab Members .
478              
479             This program is free software; you can redistribute it and/or modify it
480             under the terms of the the Artistic License (2.0). You may obtain a
481             copy of the full license at:
482              
483             L
484              
485             Any use, modification, and distribution of the Standard or Modified
486             Versions is governed by this Artistic License. By using, modifying or
487             distributing the Package, you accept this license. Do not use, modify,
488             or distribute the Package, if you do not accept this license.
489              
490             If your Modified Version has been derived from a Modified Version made
491             by someone other than you, you are nevertheless required to ensure that
492             your Modified Version complies with the requirements of this license.
493              
494             This license does not grant you the right to use any trademark, service
495             mark, tradename, or logo of the Copyright Holder.
496              
497             This license includes the non-exclusive, worldwide, free-of-charge
498             patent license to make, have made, use, offer to sell, sell, import and
499             otherwise transfer the Package with respect to any patent claims
500             licensable by the Copyright Holder that are necessarily infringed by the
501             Package. If you institute patent litigation (including a cross-claim or
502             counterclaim) against any party alleging that the Package constitutes
503             direct or contributory patent infringement, then this Artistic License
504             to you shall terminate on the date that such litigation is filed.
505              
506             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
507             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
508             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
509             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
510             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
511             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
512             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
513             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
514              
515              
516             =cut
517              
518             1; # End of Psipred