File Coverage

blib/lib/Peptide/Pubmed.pm
Criterion Covered Total %
statement 511 525 97.3
branch 202 294 68.7
condition 75 140 53.5
subroutine 51 51 100.0
pod 37 37 100.0
total 876 1047 83.6


line stmt bran cond sub pod time code
1             package Peptide::Pubmed;
2              
3 4     4   93925 use vars qw($VERSION);
  4         12  
  4         681  
4             $VERSION = '1.02';
5              
6             =head1 NAME
7              
8             Peptide::Pubmed - extract peptide sequences from MEDLINE article abstracts.
9              
10             =head1 SYNOPSIS
11              
12             use Peptide::Pubmed;
13             $parser = Peptide::Pubmed->new;
14             $in = {
15             PMID => q[15527327],
16             Author => q[Doe JJ, Smith Q],
17             Journal => q[J Biological Foo. 2004;8(2):123-30.],
18             Title => q[Foo, bar and its significance in phage display.],
19             Abstract =>
20             q[Peptide sequences EYHHYNK and Arg-Gly-Asp, but not ACCCGTNA or VEGFRI.],
21             Mesh => q[Genes, p53/genetics; Humans; Bar],
22             Chemical => q[Multienzyme Complexes; Peptide Library; Foo],
23             };
24             $parser->parse_abstract($in);
25              
26             # get the peptide sequences in 1 letter symbols (select all words where the
27             # combined word/abstract score is above threshold:
28             # WordAbstScore >= WordAbstScoreMin):
29             @seqs = $parser->get_seqs;
30             print "@seqs\n"; # prints: 'EYHHYNK RGD'
31              
32             =head1 EXAMPLES
33              
34             # same as above, set threshold explicitly:
35             $parser->WordAbstScoreMin(0.4);
36             @seqs = $parser->get_seqs;
37              
38             # set low threshold to get more peptide sequences (but at a cost of getting
39             # more false positives)
40             $parser->WordAbstScoreMin(-1);
41             @seqs = $parser->get_seqs;
42             print "@seqs\n"; # prints: 'EYHHYNK RGD ACCCGTNA VEGFRI'
43              
44             # reset threshold back:
45             $parser->WordAbstScoreMin(0.4);
46              
47             # get more data for the abstract:
48             $abst = $parser->get_abst;
49             print "$abst->{AbstScore}\n"; # abstract score, in the [0,1] interval
50             print "$abst->{AbstMtext}\n"; # abstract with sequences marked up:
51             # 'Peptide sequences EYHHYNK and Arg-Gly-Asp,
52             # but not ACCCGTNA or VEGFRI.'
53              
54             # get more data for the words, in addition to peptide sequences:
55             @words = $parser->get_words;
56             for my $word (@words) {
57             # combined word/abstract score, in the [0,1] interval
58             print "$word->{WordAbstScore}\n";
59             # word as found in the abstract, eg 'Arg-Gly-Asp,'
60             print "$word->{WordOrig}\n";
61             # peptide sequence in 1 letter symbols, eg 'RGD'
62             print "$word->{WordSequence}\n";
63             }
64              
65             # There are no mandatory input fields. This will work too, but may give lower score.
66             $in = {
67             Abstract =>
68             q[Peptide sequences EYHHYNK and Arg-Gly-Asp, but not ACCCGTNA or VEGFRI.],
69             };
70             $parser->parse_abstract($in);
71             @words = $parser->get_words;
72              
73             # No peptide sequences are found in empty input:
74             $in = undef;
75             $parser->parse_abstract($in);
76             @words = $parser->get_words;
77              
78             =head1 DESCRIPTION
79              
80             Provides common methods to parse peptide sequences from Pubmed
81             abstracts. The computed variables can be used for classification in
82             external programs.
83              
84             For all variables below:
85              
86             Varables (Abst|Word)Is*: Allowed values: 1/0.
87              
88             Variables (Abst|Word)Num*: Allowed values: integer.
89              
90             Variables (Abst|Word)Prop*: Allowed values: real in [0,1].
91              
92             Variables (Abst|Word)Score*: Allowed values: real in [0,1]. Score near
93             1 corresponds to "more relevant" abstracts or words (that is, likely
94             to contain peptide sequences), score near 0 - to "less relevant"
95             abstracts or words.
96              
97             Variables (Abst|Word)* (all other): Allowed values: string, unless
98             otherwise specified below.
99              
100             =head2 Input variables
101              
102             Input variables can be used optionally without the 'Abst' prefix, so
103             'AbstPMID' and 'PMID' are treated identically.
104              
105             AbstPMID: PubMed ID.
106              
107             AbstAuthor: article authors.
108              
109             AbstJournal: journal citation, with year if available. For format, see
110             examples.
111              
112             AbstTitle: article title.
113              
114             AbstAbstract: abstract.
115              
116             AbstMesh: Medical Subject Headings (MeSH) terms.
117              
118             AbstChemical: chemical list.
119              
120             =head2 Output variables, for the abstract text
121              
122             AbstNumAb: number of matches to words like antibody, epitope, etc.
123              
124             AbstNumAllCap: number of all capitalized words in the abstract.
125              
126             AbstNumBind: number of matches to words like binds, interacts, etc.
127              
128             AbstNumDigest: number of matches to words like Edman, MS/MS, trypsin,
129             etc.
130              
131             AbstNumMHC: number of matches to words like MHC, TCR, etc.
132              
133             AbstNumPeptide: number of matches to words like peptide, oligopeptide,
134             motif, etc.
135              
136             AbstNumPhage: number of matches to words like phage, display, etc.
137              
138             AbstNumProtease: number of matches to words like peptidase, cutting,
139             etc.
140              
141             AbstNumWords: number of words. Allowed values: integer.
142              
143             AbstPropAllCap: proportion of all capitalized words in the abstract (=
144             AbstNumAllCap / AbstNumWords).
145              
146             AbstScore: heuristic score to predict whether the abstract contains a
147             peptide sequence, computed based on Abst* variables.
148              
149             Other variables:
150              
151             AbstComment: free text comment for debugging.
152              
153             AbstMtext: abstract with sequences marked using '...'
154             tags.
155              
156             =head2 Output variables, for the word
157              
158             WordIsDNA: is an all DNA sequence? Example: ACCTTG.
159              
160             WordIsDict: is in the dictionary (currently, of english words and of
161             scientific terms, software names and abbreviations)? Example:
162             MATERIALS, RT-PCR.
163              
164             WordIsGene: is a gene name, protein name, gene symbol, protein symbol,
165             etc? Example: TFIIA.
166              
167             WordSeqLen: peptide length, in amino acids.
168              
169             WordOrig: the word as found in the abstract text.
170              
171             WordPropDegen: proportion of degenerate amino acids, e.g, 0.6 for
172             AXXXC.
173              
174             WordPropProtein: a measure which is positive if a given word
175             composition looks more like a protein sequence than like an english
176             word, and negative otherwise. Computed using frequencies of
177             overlapping k-mers. It is defined as follows: WordPropProtein = sum
178             (over all overlapping k-mers within the word) of (log10Pp -
179             log10Pn). log10Pp is log base 10 of the proportion of the k-mer in the
180             database of known protein sequences, and log10Pn - same, for
181             non-sequences (here, english words from pubmed abstracts not related
182             to peptides). For a word with all k-mers equally frequent among
183             sequences and non-sequences, log10Pp = log10Pn, and WordPropProtein =
184             0. Allowed values: real, [-Inf,+Inf]
185              
186             WordScore: heuristic score to predict whether the word contains a
187             peptide sequence, computed based on Word* variables.
188              
189             WordAbstScore: heuristic score to predict whether the word contains a
190             peptide sequence, computed based on both Abst* and Word* variables.
191              
192             WordSequence: word converted to peptide sequence in 1-letter amino
193             acid symbols.
194              
195             Other variables:
196              
197             WordAaSymbols: amino acid code. Allowed values: 1 (1 letter), 3 (3
198             letter or full name). Note that separate handling of 3 letter symbols
199             and full names is currently not implemented.
200              
201             WordIdx: word index in the abstract. Allowed values: nonnegative
202             integer.
203              
204             =head1 NOTES
205              
206             =head2 WordPropProtein
207              
208             Note that to optimize classification using frequencies of k-mers, k
209             should be chosen so that for a given text, there are 'not too many'
210             empty cells, that is, 'not too many; k-mers that did not occur. For a
211             typical english text, with the combined text length of 100,000,
212             alphabet size of 26 (A-Z, case-insensitive), k=3 is a good choice,
213             because there are 26 ** 3 = 17576 different k-mers, and the expected
214             frequency of each k-mer is 5.7. Actually, the expected frequency is
215             somewhat less because of the effect of the word boundaries. That is,
216             text: 'foobletch' contains more 3-mers than 'foo bletch' because after
217             splitting on whitespace, 'oob', 'obl' do not occur in 'foo bletch'.
218              
219             =head2 Variable and method names
220              
221             By convention, method, variable and keys names like these:
222             VarNamesLikeThese are used for cases where the corresponding field
223             names may be printed in the output table, such as the rdb table in
224             parse_file(). For the rest of the names, var_names_like_these are
225             used.
226              
227             =head1 KNOWN BUGS
228              
229             =head2 False negatives
230              
231             =head3 Peptide length cutoff
232              
233             Peptide length cutoff is 20 amino acids. This is a somewhat arbitary
234             choice. In various sources, cutoffs between 15 and 50 amino acids are
235             used to define oligopeptides.
236              
237             =head2 False positives
238              
239             =head3 Gene symbols, english words, scientific terms and abbreviations
240              
241             Some of these were misclassified as peptide sequence, even though this
242             code uses several dictionaries to find such non-sequence words.
243              
244             =head2 Incorrectly parsed sequence
245              
246             The recommendations of IUPAC can be found in: Nomenclature and
247             Symbolism for Amino Acids and Peptides,
248              
249             http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html
250              
251             http://www.chem.qmul.ac.uk/iupac/AminoAcid/A1416.html
252              
253             They are not always followed in the published abstracts. More flexible
254             input rules are thus allowed for peptide sequences. However, the
255             following bugs may occur in parsed sequences.
256              
257             =head3 Amino acid position and the number of repetitions are not
258             resolved
259              
260             Y(n) usually means amino acid Y at position n, but sometimes also
261             means Y repeated n times. It is always resolved as Y, and n is
262             ignored. However, X(n), where X is 'X', 'Xaa', 'Xxx', etc, usually
263             means 'any amino acid, repeated n times. It is always resolved as X
264             repeated n times.
265              
266             =head1 REFERENCES
267              
268             =head2 ADAM
269              
270             This section refers to ADAM, on which Peptide::DictionaryAbbreviations
271             is based. See http://arrowsmith.psych.uic.edu. I would like to thank
272             ADAM authors (in particular, Neil Smalheiser) for graciously providing
273             ADAM.
274              
275             ADAM citation:
276              
277             Zhou W, Torvik VI, Smalheiser NR. ADAM: Another Database of
278             Abbreviations in MEDLINE. Bioinformatics 2006; 22(22): 2813-2818.
279              
280             ADAM OVERVIEW
281            
282             ADAM is an abbreviation database generated from the 2006
283             baseline of MEDLINE. It covers frequently used abbreviations
284             and their definitions (or long-forms), including both
285             acronyms and non-acronym abbreviations.
286             Reference
287            
288             Zhou W, Torvik VI, Smalheiser NR. ADAM: Another Database
289             of Abbreviations in MEDLINE. Bioinformatics 2006; 22(22): 2813-2818.
290              
291             ADAM Copyright
292            
293             University of Illinois at Chicago, 2006.
294            
295             ADAM License
296            
297             By using this software, you expressly agree that your use will be
298             noncommercial, that you will not use this software to make money, and that
299             you will not distribute the software to anyone else or let anyone else use
300             it. Moreover, you will give credit to the University of Illinois and Dr.
301             Smalheiser as the author of the software. The software is provided "as is"
302             and without warranties of any kind, express or implied, including but not
303             limited to the implied warranties of merchantability and fitness, and
304             statutory warranties of noninfringement.
305              
306             =head1 AUTHOR
307              
308             Timur Shtatland, tshtatland at mgh dot harvard dot edu
309              
310             Copyright (C) 2007 by The General Hospital Corporation.
311              
312             This program is free software; you can redistribute it and/or modify
313             it under the same terms as Perl itself.
314              
315             See http://www.perl.com/perl/misc/Artistic.html
316              
317             =head1 SEE ALSO
318              
319             RDB - a fast, portable, relational database management system without
320             arbitrary limits, (other than memory and processor speed) that runs
321             under, and interacts with, the UNIX Operating system, by Walter
322             V. Hobbs.
323              
324             =head1 APPENDIX
325              
326             The rest of the documentation details the methods.
327              
328             =cut
329              
330             # for printing diagnostic msgs when the module is loaded.
331             # use $obj->{verbose} for object related diagnostic msgs.
332             our $VERBOSE;
333             BEGIN {
334 4   50 4   185 $VERBOSE ||= $ENV{VERBOSE} || $ENV{TEST_VERBOSE} || 0;
      33        
335             }
336              
337 4     4   13368 use Peptide::DictionaryAbbreviations;
  4         9  
  4         179  
338 4     4   65719 use Peptide::DictionaryEnglish;
  4         13  
  4         171  
339 4     4   9878 use Peptide::DictionaryFrequent;
  4         11  
  4         133  
340 4     4   2507 use Peptide::DictionaryScientific;
  4         11  
  4         109  
341 4     4   21995 use Peptide::Gene;
  4         13  
  4         146  
342 4     4   29337 use Peptide::Sequence;
  4         13  
  4         212  
343 4     4   27601 use Peptide::NonSequence;
  4         14  
  4         164  
344 4     4   7349 use Data::Dumper;
  4         42794  
  4         298  
345 4     4   36 use Carp;
  4         8  
  4         316  
346 4     4   21 use warnings;
  4         9  
  4         134  
347 4     4   20 use strict;
  4         8  
  4         196  
348              
349 4     4   22 use constant LOG_10_2 => log(2) / log(10); # empirically derived weight for WordPropProtein
  4         7  
  4         43886  
350             our $MAX_PEPTIDE_LENGTH = 20; # Somewhat arbitrary cutoff, consistent with IUPAC. Find the peptides sequences not longer than this cutoff.
351              
352             # word / score assignment
353             #
354              
355             # for WordScore()
356              
357             my %Dict = map { $_ => 1 }
358             splitln(Peptide::DictionaryAbbreviations->words),
359             splitln(Peptide::DictionaryEnglish->words),
360             splitln(Peptide::DictionaryFrequent->words),
361             splitln(Peptide::DictionaryScientific->words);
362             my %Gene = map { $_ => 1 }
363             splitln(Peptide::Gene->words);
364              
365             # for WordPropProtein()
366              
367             my %log_prop_sequence = map /(\S+)\s+(\S+)\s*\n/g, Peptide::Sequence->log_prop_kmers;
368             my %log_prop_nonsequence = map /(\S+)\s+(\S+)\s*\n/g, Peptide::NonSequence->log_prop_kmers;
369              
370             if ($VERBOSE > 3) {
371             foreach (sort keys %log_prop_sequence) {
372             print join("\t", $_, $log_prop_sequence{$_}, $log_prop_nonsequence{$_}), "\n";
373             }
374             }
375              
376             {
377             my %args_default
378             = (
379             AbstScoreMin => 0.2,
380             WordScoreMin => 0.2,
381             WordAbstScoreMin => 0.2,
382             WordNumPrintMin => -1,
383             WordNumPrintMax => -1,
384             in_col => [ qw(PMID Author Journal Title Abstract Mesh Chemical) ],
385             kmer_length => 3,
386             print_col => [ qw(
387             AbstPMID AbstAuthor AbstJournal AbstTitle AbstAbstract AbstMesh AbstChemical
388              
389             AbstNumAb AbstNumAllCap AbstNumBind AbstNumDigest AbstNumMHC AbstNumPeptide AbstNumPhage AbstNumProtease AbstNumWords AbstPropAllCap AbstScore AbstComment AbstMtext
390              
391             WordAbstScore WordIsDNA WordIsDict WordIsGene WordSeqLen WordOrig WordPropDegen WordPropProtein WordScore WordSequence WordAaSymbols WordIdx
392              
393             )],
394             print_colf => undef,
395             verbose => 1,
396             );
397              
398             =head2 new
399              
400             Args : %named_parameters:
401              
402             AbstScoreMin : print abstract data if AbstScore >= this threshold.
403             WordScoreMin : print word data if WordScore >= this threshold.
404             WordAbstScoreMin : print word data if WordAbstScore >= this threshold.
405             Allowed values: real in [0,1] interval, or -1.
406             Set to -1 to undefine these, and print all that satisfy
407             *PrintMin*, *PrintMax* thresholds.
408              
409             WordNumPrintMin : min number of words to print per abstract.
410             Allowed values: 1 and -1
411             WordNumPrintMax : max number of words to print per abstract.
412             Allowed values: non-negative integer, or -1.
413             Set to -1 to undefine these, and print all that satisfy
414             *ScoreMin* thresholds.
415              
416             If WordNumPrintMin = 1, at least one line is printed
417             per each abstract for which AbstScore satisfies
418             AbstScoreMin threshold. That is, 1 line is printed
419             even if no sequences were found (many Word* vars will
420             be empty or 0 when this happens) or if no sequences
421             satisfy WordScore / WordScoreMin threshold. If there
422             are sequences found, at least 1 line is printed (which
423             may or may not satisfy WordScore / WordScoreMin
424             threshold), and the rest are printed only if they
425             satisfy it. If WordNumPrintMax = N, where N is a
426             positive integer, only the first N words are printed
427             (_not_ the best N words by score). A special case is
428             (WordNumPrintMin => 1, WordNumPrintMax => 1), which
429             prints exactly 1 line per abstract. Word* vars are not
430             computed, which make the code much faster. An even
431             more special case is (AbstScoreMin => -1, WordScoreMin
432             => -1, WordAbstScoreMin => -1, WordNumPrintMin => 1,
433             WordNumPrintMax => 1), which prints all abstracts, 1
434             abstract per line, and computes Abst*, but not Word*
435             vars, thus allowing to separate computation into 2
436             steps: compute and select abstracts with good
437             AbstScore, then compute and select abstracts and words
438             with good WordAbstScore, WordScore.
439              
440             in_col : columns for input
441             kmer_length : for computing WordPropProtein: => 3,
442            
443             print_col : columns for printing
444             print_colf : column formats for rdb table, eg [qw(50S 10N)];
445             default: undef, to be determined automatically from col names.
446             verbose : verbosity level for diagnostic msgs. Allowed values: 0..4.
447             Example : my $parser = Peptide::Pubmed->new(verbose => $verbose) or
448             carp "not ok: new Peptide::Pubmed" and return;
449             Description : bare constructor. All work done by init().
450             Returns : TRUE if successful, FALSE otherwise.
451              
452             =cut
453              
454             sub new {
455 9     9 1 11553 my $class = shift;
456 9         54 my %args = @_;
457 9         55 foreach (sort keys %args) {
458 39 50 0     101 exists $args_default{$_} or carp "not ok: arg=$_ is not allowed in new()" and
459             return;
460             }
461 9   33     149 my $self = bless { %args_default, %args }, ref($class) || $class;
462 9 50       41 $self->init or return;
463 9         37 return $self;
464             }
465             }
466              
467             =head2 get_abst
468              
469             Args : none
470             Example : $abst = $parser->get_abst;
471             Description : get the output data for Abst* variables.
472             Returns : a ref to hash with Abst* variables if successful,
473             ref to empty hash otherwise.
474              
475             =cut
476              
477             sub get_abst {
478 1     1 1 5 my ($self) = @_;
479 1         3 my $abst = {};
480 1         3 foreach my $key (grep /^Abst/, @{ $self->{abst_col} } ) {
  1         37  
481 20 50       88 $abst->{$key} = $self->{abst}{$key} if exists $self->{abst}{$key};
482             }
483 1         6 return $abst;
484             }
485              
486             =head2 get_words
487              
488             Args : none
489             Example : # get data for words likely to contain peptide sequences:
490             @words = $parser->get_words;
491             Description : get the output data for Words* variables.
492             Returns : an array where each element is a ref to hash with
493             Word* variables output data for 1 word. Only words
494             that satisfy the threshold: combined word/abstract score
495             WordAbstScore greater than or equal to WordAbstScoreMin,
496             are included, otherwise an empty array is returned.
497             To return all words, set WordAbstScoreMin to -1.
498              
499             =cut
500              
501             sub get_words {
502 2     2 1 1082 my ($self) = @_;
503 2         4 my @words = ();
504 2         3 foreach my $rh_word ( @{ $self->{words} } ) {
  2         6  
505 19 100       51 next unless $rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin};
506 2         5 my $rh_word_out = {};
507 2         3 foreach my $key (grep /^Word/, @{ $self->{words_col} } ) {
  2         22  
508 24 50       91 $rh_word_out->{$key} = $rh_word->{$key} if exists $rh_word->{$key};
509             }
510 2         9 push @words, $rh_word_out;
511             }
512 2         15 return @words;
513             }
514              
515              
516             =head2 get_seqs
517              
518             Args : none
519             Example : @seqs = $parser->get_seqs;
520             Description :
521             Returns : a list with sequences for all words in the abstract,
522             empty list if none found.
523              
524             =cut
525              
526             sub get_seqs {
527 3     3 1 12 my ($self) = @_;
528 3         6 my @seqs = ();
529 3         4 foreach my $rh_word ( @{ $self->{words} } ) {
  3         7  
530 57 100       131 next unless $rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin};
531 23 100       52 push @seqs, $rh_word->{WordSequence} if $rh_word->{WordSequence};
532             }
533 3         15 return @seqs;
534             }
535              
536              
537             =head2 WordAbstScoreMin
538              
539             Args : (optional) new value
540             Example : $parser->WordAbstScoreMin(0.4);
541             $parser->WordAbstScoreMin;
542             Description : assign to WordAbstScoreMin a new value, if called with an argument.
543             Returns : WordAbstScoreMin value (after assignment)
544              
545             =cut
546              
547             sub WordAbstScoreMin {
548 3     3 1 1994 my ($self, $val) = @_;
549 3 50       10 if (@_ == 2) {
550 3         7 $self->{WordAbstScoreMin} = $val;
551             }
552 3         7 return $self->{WordAbstScoreMin};
553             }
554              
555             =head2 init
556              
557             Args : none
558             Example : $parser->init or return;
559             Description : check to see if the entry is ok. Initialize several fields used for
560             printing, such as column definitions.
561             Returns : always TRUE
562              
563             =cut
564              
565             sub init {
566 9     9 1 20 my ($self) = @_;
567 9 50       44 unless (defined $self->{print_colf}) {
568             # assign to numeric or text
569 288 100       859 $self->{print_colf} = [
570             map {
571 9         24 /^(Abst|Word|AbstWord)(Idx|Is|Num|Prop|Score)([A-Z]|$)/ ?
572             "10N" : "50S"
573             }
574 9         14 @{ $self->{print_col} }
575             ]
576             }
577             # cols to be printed are either words cols, or abstract cols
578 9         30 $self->{abst_col} = [ grep { /^Abst/ } @{ $self->{print_col} } ];
  288         555  
  9         23  
579 9         16 $self->{words_col} = [ grep { /^Word/ } @{ $self->{print_col} } ];
  288         489  
  9         20  
580 9 50       35 if ($self->{verbose} > 2) {
581 0         0 foreach (sort keys %{ $self }) {
  0         0  
582 0         0 warn "$_ => $self->{$_}";
583             }
584             }
585 9         31 return 1;
586             }
587              
588             =head2 splitln
589              
590             Arg[1] : string of items, one item per line.
591             Arg[2] : (optional) %named_parameters:
592             lc : convert to lowercase? 1 = yes, 0 = no. Default = 1.
593             Example : my %words = map { $_ => 1 } splitln("foo\nbar\n", lc => 0);
594             my %words = map { $_ => 1 } $this->splitln("foo\nbar\n");
595             Description : parse input string into list of items, remove leading and trailing
596             whitespace, remove empty lines, convert to lc. Can be used as a
597             method call or direct call.
598             Returns : list of items
599              
600             =cut
601            
602             sub splitln {
603 20 50   20 1 135 ref $_[0] and shift;
604 20         57 my $str = shift;
605 20         106 my %args = (lc => 1, @_);
606 303664         529571 my @lines =
607 303664         832295 grep { $_ ne '' }
608 20 50       75369 map { s/^\s+|\s+$//g; $args{lc} ? lc $_ : $_ }
  303664         726980  
609             split /\n/, $str;
610 20         96838 return @lines;
611             }
612              
613             =head2 parse_file
614              
615             Args : %named_parameters:
616             mandatory:
617             in_fname : input file name.
618             out_fname : output file name.
619             Example : $parser->parse_file(in_fname => $in_fname, out_fname => $out_fname);
620             Description : reads input and write output, both in rdb
621             format. Input data is 1 abstract per line. Prints data
622             for all sequences found, 1 sequence per line. Skips
623             abstracts and words based on rules described in
624             new(). Because it prints data for 1 word per line, the
625             same abstract data is printed for all sequences from
626             the same abstract.
627             Returns : TRUE if successful, FALSE otherwise.
628              
629             =cut
630              
631             sub parse_file {
632 2     2 1 1224 my ($self, %args) = @_;
633 2         7 foreach (qw(in_fname out_fname)) {
634 4         13 $self->{$_} = $args{$_};
635             }
636 2         6 my $in_fh;
637             my $out_fh;
638 2 50 0     81 open $in_fh, $self->{in_fname} or carp "not ok: open $self->{in_fname}: $!" and return;
639 2 50 0     4659 open $out_fh, ">$self->{out_fname}" or
640             carp "not ok: open $self->{out_fname}: $!" and return;
641 2         13 my ($comment, $ra_col, $ra_colf) = read_rdb_header($in_fh);
642 2         12 print_rdb_header($out_fh, $comment, $self->{print_col}, $self->{print_colf});
643 2         28 while (<$in_fh>) { # for 1 pubmed entry (=abstract)
644 3         6 chomp;
645 3 50 33     15 print STDERR "processing line $.\n" if ($self->{verbose} > 1 and not $. % 500);
646 3         7 my $rh_pm_in = {}; # input data for 1 abstract
647 3         39 @{$rh_pm_in}{ @$ra_col } = split "\t", $_;
  3         26  
648 3 50 0     12 my $rh_pm_out = $self->parse_abstract($rh_pm_in) or
649             carp "not ok: parse_abstract" and return;
650 3 50       20 next unless $rh_pm_out->{abst}{AbstScore} >= $self->{AbstScoreMin};
651 3         5 my %seen;
652 3         5 my $num_words = 0;
653 3         7 foreach my $rh_word (@{ $rh_pm_out->{words} }) {
  3         7  
654 593         808 $num_words++;
655 593 50 66     2479 last if $self->{WordNumPrintMax} != -1 and $num_words > $self->{WordNumPrintMax};
656 593 100 33     2007 if ($self->{WordNumPrintMin} == -1 ||
      66        
657             ($self->{WordNumPrintMin} == 1 && $num_words > $self->{WordNumPrintMin}) ) {
658 591 50 33     6625 next unless $rh_word->{WordScore} >= $self->{WordScoreMin} &&
659             $rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin};
660 591 100 100     2052 next if defined $rh_word->{WordSequence} and
661             $seen{ $rh_word->{WordSequence} }++;
662             }
663 36992 100       77993 my %line = map { tr/\t/ /; $_ } # tab is used as the delimiter
  36992         76408  
  6936         22263  
664             (
665 578         1437 map( { $_ => defined $rh_word->{$_} ? $rh_word->{$_} : '' }
666 11560 50       33892 @{ $self->{words_col} }),
667 578         1240 map( { $_ => defined $rh_pm_out->{abst}{$_} ? $rh_pm_out->{abst}{$_} : '' }
668 578         916 @{ $self->{abst_col} }),
669             );
670 578         7012 print $out_fh join("\t", @line{ @{ $self->{print_col} } }), "\n";
  578         260511  
671             }
672             }
673 2 50 0     157 close $out_fh or carp "not ok: close $self->{out_fname}: $!" and return;
674 2 50 0     22 close $in_fh or carp "not ok: close $self->{in_fname}: $!" and return;
675 2         35 return 1;
676             }
677              
678             =head2 read_rdb_header
679              
680             Args : input rdb file handle
681             Example : my ($comment, $ra_col, $ra_colf) = read_rdb_header($in_fh);
682             Description : Reads header of an rdb table.
683             Returns : Returns comment (or empty string), ref to array of column names,
684             reference to array of column definitions.
685              
686             =cut
687              
688             sub read_rdb_header {
689 2     2 1 5 my ($in_fh) = @_;
690 2         5 my ($comment, $ra_col, $ra_colf);
691 2         64 while (<$in_fh>) {
692 2 50 0     10 $comment .= $_ and next if /^\#/; # table comments (optional)
693 2         7 chomp;
694 2         14 @$ra_col = split /\t/; # column names (eg, Hit, Expect)
695 2 50       11 last unless defined ($_ = <$in_fh> );
696 2         3 chomp;
697 2         11 @$ra_colf = split /\t/; # column formats (eg, 10N, 50S)
698 2         5 last;
699             }
700 2   50     20 return $comment || '', $ra_col, $ra_colf;
701             }
702              
703             =head2 print_rdb_header
704              
705             Args : output rdb file handle, comment (or undef), ref to array of
706             column names, reference to array of column definitions.
707             Example : print_rdb_header($out_fh, $comment, $self->{print_col},
708             $self->{print_colf});
709             Description : Prints header of an rdb table. If comment is undefined,
710             it is not printed.
711             Returns : always TRUE.
712              
713             =cut
714              
715             sub print_rdb_header {
716 2     2 1 5 my ($out_fh, $comment, $ra_col, $ra_colf) = @_;
717 2 50       11 print $out_fh $comment if defined $comment;
718 2         31 print $out_fh join( "\t", @$ra_col ), "\n";
719 2 50       18 print $out_fh join( "\t", @$ra_colf ), "\n" if defined $ra_colf;
720 2         6 return 1;
721             }
722              
723             =head2 parse_abstract
724              
725             Args : ref to hash with input data for one pubmed abstract
726             Example : $parser->parse_abstract($rh_pm_in);
727             Description : does all parsing for 1 abstract by calling init_abstract, AbstVars.
728             Returns : ref to hash with parsed data if successful, FALSE if error.
729             If no sequences are found, the hash will have a single sequence
730             consisting of an empty string (this is not considered an error).
731              
732             =cut
733              
734             sub parse_abstract {
735 12     12 1 3571 my ($self, $rh_pm_in) = @_;
736 12         21 my $rh_pm_out = {};
737 12 50       35 $self->init_abstract($rh_pm_in) or return;
738 12 50       33 $self->AbstVars or return;
739 12         24 %$rh_pm_out = map { $_ => $self->{$_} } qw(words abst);
  24         91  
740 12 50       42 carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self']))
741             if $self->{verbose} > 2;
742 12         45 return $rh_pm_out;
743             }
744              
745             {
746             my %abst_default =
747             (
748             AbstComment => undef,
749             AbstMtext => undef,
750             AbstNumAb => 0,
751             AbstNumAllCap => 0,
752             AbstNumBind => 0,
753             AbstNumDigest => 0,
754             AbstNumMHC => 0,
755             AbstNumPeptide => 0,
756             AbstNumPhage => 0,
757             AbstNumProtease => 0,
758             AbstNumWords => 0,
759             AbstPropAllCap => 0,
760             AbstScore => 0,
761             );
762              
763             =head2 init_abstract
764              
765             Args : ref to hash with input data with field names 'Title', 'Abstract', etc
766             (optionally, with prefix 'Abst', eg 'AbstTitle', 'AbstAbstract').
767             See field names listed in in_col.
768             Example : $rh_pm_in = { Title => q[Some title.], Abstract => q[Some abstract.] };
769             $self->init_abstract($rh_pm_in) or return;
770             Description : Reads the input data and stores in 'Abst*' fields
771             ('Abst' prefix is added if needed). Adds prefix 'Abst'
772             to field names if needed. Converts to lower case and
773             stores text in AbstLc* fields. Undefined fields in the
774             input are converted to empty strings and
775             stored. Currently, there are no mandatory fields:
776             undefined $rh_pm_in is not considered an error. In
777             such case, all fields will be empty strings. No
778             sequences will be found, and the default score (zero)
779             will be assigned.
780             Returns : TRUE if successful, FALSE otherwise.
781              
782             =cut
783              
784             sub init_abstract {
785 12     12 1 16 my ($self, $rh_pm_in) = @_;
786             # reset abst and words data from the prev parsed abstract, if any
787 12         127 $self->{abst} = { %abst_default };
788 12         89 $self->{words} = [];
789 12         93 my @in_col = @{ $self->{in_col} };
  12         46  
790 12         16 foreach (@{ $self->{in_col} }) {
  12         34  
791 84         294 $self->{abst}{"Abst$_"} = do {
792 84 100       213 if (defined $rh_pm_in->{$_}) {
    50          
793 40         127 $rh_pm_in->{$_}
794             } elsif (defined $rh_pm_in->{"Abst$_"}) {
795 0         0 $rh_pm_in->{"Abst$_"}
796             } else {
797 44         151 ''
798             }
799             };
800             }
801             # mandatory field:
802             #my @missing = grep { $self->{abst}{$_} eq '' } qw(AbstAbstract);
803             #@missing and carp "not ok: missing fields: @missing" and return;
804 12   50     190 $self->{abst}{AbstAllText} =
805             join(";; ",
806             map "$_: $self->{abst}{$_}",
807             qw(AbstTitle AbstAbstract AbstMesh AbstChemical)
808             ) || '';
809 12         84 $self->{abst}{AbstLcAllText} = lc $self->{abst}{AbstAllText};
810 12         55 $self->{abst}{AbstLcAbstract} = lc $self->{abst}{AbstAbstract};
811 12         47 return 1;
812             }
813             }
814              
815             =head2 AbstVars
816              
817             Args : none
818             Example : $parser->AbstVars or return;
819             Description : Calls all Abst* methods that compute the corresponding variables,
820             eg AbstNumAb. Calls Words which in turn calls Word* methods.
821             Returns : TRUE if successful, FALSE otherwise.
822              
823             =cut
824              
825             sub AbstVars {
826 12     12 1 18 my ($self) = @_;
827 12 50       66 $self->AbstNumAb or return;
828             # AbstNumAllCap is computed in AbstScore()
829 12 50       31 $self->AbstNumBind or return;
830 12 50       46 $self->AbstNumDigest or return;
831 12 50       36 $self->AbstNumMHC or return;
832 12 50       32 $self->AbstNumPeptide or return;
833 12 50       30 $self->AbstNumPhage or return;
834 12 50       30 $self->AbstNumProtease or return;
835 12 50       32 $self->AbstScore or return;
836 12 50       31 $self->Words or return;
837 12 50       45 $self->WordAbstScore or return;
838 12 50       52 $self->AbstMtext or return;
839 12         27 foreach (qw(AbstMaxWordScore AbstPropAllCap AbstScore)) {
840 36         98 $self->{abst}{$_} = formatScoreProp( $self->{abst}{$_} );
841             }
842 12         20 foreach my $rh_word ( @{ $self->{words} }) {
  12         26  
843 1126         1430 foreach (qw(WordScore WordAbstScore WordPropDNA WordPropDegen)) {
844 4504         8734 $rh_word->{$_} = formatScoreProp( $rh_word->{$_} );
845             }
846             }
847 12 50       48 if ($self->{verbose} > 0) {
848 61         220 $self->{abst}{AbstComment} .= join '; ', map {
849 12         42 defined $self->{abst}{"ra_$_"} &&
850             ref $self->{abst}{"ra_$_"} eq 'ARRAY' ?
851 84 100 66     560 "$_='" . join(", ", @{ $self->{abst}{"ra_$_"} }) . "'" : "$_=''"
852             }
853             qw(
854             AbstNumAb AbstNumBind AbstNumDigest AbstNumMHC AbstNumPeptide
855             AbstNumPhage AbstNumProtease
856             );
857             }
858 12         50 return 1;
859             }
860              
861             =head2 AbstNum*
862              
863             All AbstNum* methods use the same general scheme, unless specified otherwise. Each method refers to a specific class of patterns, eg AbstNumAb() refers to antibody related patterns. Each method looks searches the AbstLcAllText field (concatenated abstract, mesh terms, chemical terms) for matches to the class of patterns. The matches are stored in an array ref (eg $self->{abst}{ra_AbstNumAb}). The number of matches is stored in a scalar ref (eg, $self->{abst}{AbstNumAb}).
864              
865             Pattern matching is done in several steps: patterns that can match anywhere in the word, and patterns that must match the entire word (typically shorter patterns, like 'mab' or 'fab'). Often, there are additional steps. Step 1 pattern matches are mandatory - if there are no matches at step 1, then step 2 is skipped. If step 1 succeeds, then step 2 (optional) pattern matches are done. For example, for AbstNumPeptide(), step 1 patterns include 'peptid', step 2 - 'synthe'. This is because 'synthe' by itself often refers to non-peptides, but together with 'peptid' matches often refers to peptides.
866              
867             =cut
868              
869             =head2 AbstNumAb
870              
871             Args : none
872             Example : $parser->AbstNumAb or return;
873             Description : See package DESCRIPTION above.
874             Returns : TRUE if successful, FALSE otherwise.
875              
876             =cut
877              
878             sub AbstNumAb {
879 12     12 1 20 my ($self) = @_;
880 12 50       84 return unless defined $self->{abst}{AbstLcAllText};
881 12         219 (my $var = (caller(0))[3]) =~ s/.*:://;
882 12         28 my @matches;
883 12         32 for ($self->{abst}{AbstLcAllText}) {
884 12         663 push @matches,
885             m[
886             adjuvant|
887             antibod|
888             antigen|
889             antiser|
890             epitope|
891             freund|
892             heavy\ chain|
893             immun|
894             light\ chain|
895             mimotope|
896             monoclonal|
897             phagotope|
898             polyclonal|
899             recombinant\ fab|
900             vaccin|
901             variable\ region|
902             variable\ regions
903             ]gx,
904             m[
905             \b(
906             mab|
907             mabs|
908             rfab|
909             sera|
910             serum
911             )\b
912             ]gx;
913 12         31 $self->{abst}{$var} = @matches;
914 12         53 $self->{abst}{"ra_$var"} = [ @matches ];
915             }
916 12         42 return 1;
917             }
918              
919             =head2 AbstNumBind
920              
921             Args : none
922             Example : $parser->AbstNumBind or return;
923             Description : See package DESCRIPTION above.
924             Returns : TRUE if successful, FALSE otherwise.
925              
926             =cut
927              
928             sub AbstNumBind {
929 12     12 1 15 my ($self) = @_;
930 12 50       38 return unless defined $self->{abst}{AbstLcAllText};
931 12         90 (my $var = (caller(0))[3]) =~ s/.*:://;
932 12         21 my @matches;
933 12         27 for ($self->{abst}{AbstLcAllText}) {
934 12         652 push @matches,
935             m[
936             abolish|
937             activate|
938             adher|
939             adhes|
940             affinit|
941             agonise|
942             agonist|
943             agonize|
944             associat|
945             bind|
946             block|
947             bound|
948             compet|
949             disrupt|
950             dissociat|
951             inhibit|
952             interact|
953             interfere|
954             ligand|
955             prevent|
956             recogn|
957             regulat|
958             suppress
959             ]gx,
960             m[
961             \b(
962             ec50|
963             ic50|
964             kd|
965             ki
966             )\b
967             ]gx;
968 12         24 $self->{abst}{$var} = @matches;
969 12         51 $self->{abst}{"ra_$var"} = [ @matches ];
970             }
971 12         39 return 1;
972             }
973              
974             {
975             # older papers are more likely to have data on protein digestion
976             # into peptides which are relevant only to the protein,
977             # rather than to the peptides.
978             # for these time intervals, multiply by this factor
979             my %AbstNumDigestScoreForYear;
980             for (1900..2100) {
981             if ($_ > 1993) {
982             $AbstNumDigestScoreForYear{$_} = 1;
983             } elsif ($_ > 1991) {
984             $AbstNumDigestScoreForYear{$_} = 1.5;
985             } elsif ($_ > 1990) {
986             $AbstNumDigestScoreForYear{$_} = 2;
987             } elsif ($_ > 1989) {
988             $AbstNumDigestScoreForYear{$_} = 3;
989             } else { # <= 1989
990             $AbstNumDigestScoreForYear{$_} = 4;
991             } ## stopped here
992             }
993              
994             =head2 AbstNumDigest
995              
996             Args : none
997             Example : $parser->AbstNumDigest or return;
998             Description : See package DESCRIPTION above.
999             Returns : TRUE if successful, FALSE otherwise.
1000              
1001             =cut
1002              
1003             sub AbstNumDigest {
1004 12     12 1 17 my ($self) = @_;
1005 12 50       43 return unless defined $self->{abst}{AbstLcAllText};
1006 12         92 (my $var = (caller(0))[3]) =~ s/.*:://;
1007 12         23 my @matches;
1008 12         28 for ($self->{abst}{AbstLcAllText}) {
1009             # do not shorten to 'trypt' to avoid confusion with tryptophan
1010 12         364 push @matches,
1011             m[
1012             borohydride|
1013             complete\ amino\ acid\ sequence|
1014             chromatogra|
1015             cnbr|
1016             cyanogen\ bromide|
1017             dansyl|
1018             digest|
1019             edman|
1020             hplc|
1021             lsims|
1022             maldi-tof|
1023             mass\ spec|
1024             matrix-assisted\ laser\ desorption|
1025             nabh4|
1026             peptide\ map|
1027             protease\ digestion|
1028             proteolytic\ digestion|
1029             rplc|
1030             spectrometry,\ mass|
1031             spectrum analysis,\ mass|
1032             thermolysin|
1033             thermolytic|
1034             tryptic|
1035             trypsin|
1036             v-8\ prote|
1037             v8\ prote
1038             ]gx,
1039             m[
1040             \b(
1041             ms
1042             )\b
1043             ]gx;
1044 12         28 $self->{abst}{$var} = @matches;
1045 12         35 $self->{abst}{"ra_$var"} = [ @matches ];
1046 12         31 for ($self->{abst}{AbstJournal}) {
1047 12 100 66     145 $self->{abst}{$var} *= $AbstNumDigestScoreForYear{$1} if
      66        
1048             defined $_ and /\b((19|20)\d\d)\b/ and $AbstNumDigestScoreForYear{$1};
1049             }
1050 12         62 $self->{abst}{$var} = sprintf "%.0f", $self->{abst}{$var};
1051             }
1052 12         39 return 1;
1053             }
1054             }
1055              
1056             =head2 AbstNumMHC
1057              
1058             Args : none
1059             Example : $parser->AbstNumMHC or return;
1060             Description : See package DESCRIPTION above.
1061             Returns : TRUE if successful, FALSE otherwise.
1062              
1063             =cut
1064              
1065             sub AbstNumMHC {
1066 12     12 1 58 my ($self) = @_;
1067 12 50       36 return unless defined $self->{abst}{AbstLcAllText};
1068 12         89 (my $var = (caller(0))[3]) =~ s/.*:://;
1069 12         28 my @matches;
1070 12         29 for ($self->{abst}{AbstLcAllText}) {
1071 12         695 push @matches,
1072             m[
1073             alloge|
1074             allograft|
1075             antibod|
1076             antigen|
1077             b\ cell|
1078             dendri|
1079             epitope|
1080             histocompatibility|
1081             immun|
1082             present|
1083             restrict|
1084             tolero|
1085             t\ cell
1086             ]gx,
1087             m[
1088             \b(
1089             cd\d{1,2}|
1090             class\ i{1,2}|
1091             hla|
1092             mhc|
1093             self|
1094             tcr
1095             )\b
1096             ]gx;
1097 12         29 $self->{abst}{$var} = @matches;
1098 12         50 $self->{abst}{"ra_$var"} = [ @matches ];
1099             }
1100 12         40 return 1;
1101             }
1102              
1103             =head2 AbstNumPeptide
1104              
1105             Args : none
1106             Example : $parser->AbstNumPeptide or return;
1107             Description : See package DESCRIPTION above.
1108             Returns : TRUE if successful, FALSE otherwise.
1109              
1110             =cut
1111              
1112             sub AbstNumPeptide {
1113 12     12 1 18 my ($self) = @_;
1114 12 50       32 return unless defined $self->{abst}{AbstLcAllText};
1115 12         94 (my $var = (caller(0))[3]) =~ s/.*:://;
1116 12         24 my @matches;
1117             my @matches_neg;
1118             ##cc TODO: possibly add matches to /protein/ and /fragment/ in the same sentence.
1119 12         29 for ($self->{abst}{AbstLcAllText}) {
1120             # step 1, mandatory matches
1121 12         162 push @matches,
1122             m[
1123             benzyloxycarbonyl|
1124             hormone|
1125             peptid
1126             ]gx;
1127 12 100       38 next unless @matches;
1128 6         78 push @matches_neg,
1129             m[
1130             peptidase|
1131             polypeptide
1132             ]gx;
1133 6 50       36 next unless (scalar(@matches) - scalar(@matches_neg)) > 0;
1134             # step 2, optional matches
1135 6         1411 push @matches,
1136             m[
1137             consensus|
1138             constrain|
1139             \bcyclic|
1140             librar|
1141             linear|
1142             motif|
1143             sequenc|
1144             synthe
1145             ]gx,
1146             m[
1147             \b(
1148             ac|
1149             acetyl|
1150             amide|
1151             boc|
1152             ch2nh|
1153             conh|
1154             cyclo|
1155             dansyl|
1156             h2n|
1157             nh|
1158             nh2|
1159             ome|
1160             tos
1161             )\b
1162             ]gx;
1163 6         20 $self->{abst}{$var} = scalar(@matches) - scalar(@matches_neg);
1164 6         38 $self->{abst}{"ra_$var"} = [ @matches ];
1165             }
1166 12         40 return 1;
1167             }
1168              
1169             =head2 AbstNumPhage
1170              
1171             Args : none
1172             Example : $parser->AbstNumPhage or return;
1173             Description : See package DESCRIPTION above.
1174             Returns : TRUE if successful, FALSE otherwise.
1175              
1176             =cut
1177              
1178             sub AbstNumPhage {
1179 12     12 1 17 my ($self) = @_;
1180 12 50       36 return unless defined $self->{abst}{AbstLcAllText};
1181 12         85 (my $var = (caller(0))[3]) =~ s/.*:://;
1182 12         24 my @matches;
1183             my @matches_neg;
1184 12         31 for ($self->{abst}{AbstLcAllText}) {
1185             # step 1, mandatory matches
1186 12         123 push @matches,
1187             m[
1188             bio-pan|
1189             biopan|
1190             phagemid|
1191             phage\b|
1192             phages\b
1193             ]gx,
1194             m[
1195             \b(
1196             panned| # match panning related, but not spanning, pan-caspase, etc
1197             panning|
1198             pannings|
1199             pans
1200             )\b
1201             ]gx;
1202 12 100       36 next unless @matches;
1203 5         25 push @matches_neg,
1204             m[
1205             macrophage
1206             ]gx;
1207 5 50       18 next unless (scalar(@matches) - scalar(@matches_neg)) > 0;
1208             # step 2, optional matches
1209 5         162 push @matches,
1210             m[
1211             clone|
1212             cloni|
1213             display|
1214             librar|
1215             screen|
1216             select
1217             ]gx,
1218             m[
1219             \b(
1220             round|
1221             rounds
1222             )\b
1223             ]gx;
1224 5         14 $self->{abst}{$var} = scalar(@matches) - scalar(@matches_neg);
1225 5         30 $self->{abst}{"ra_$var"} = [ @matches ];
1226              
1227             }
1228 12         38 return 1;
1229             }
1230              
1231             =head2 AbstNumProtease
1232              
1233             Args : none
1234             Example : $parser->AbstNumProtease or return;
1235             Description : See package DESCRIPTION above.
1236             Returns : TRUE if successful, FALSE otherwise.
1237              
1238             =cut
1239              
1240             sub AbstNumProtease {
1241 12     12 1 19 my ($self) = @_;
1242 12 50       39 return unless defined $self->{abst}{AbstLcAllText};
1243 12         90 (my $var = (caller(0))[3]) =~ s/.*:://;
1244 12         23 my @matches;
1245 12         33 for ($self->{abst}{AbstLcAllText}) {
1246 12         199 push @matches,
1247             m[
1248             aminobenzoyl|
1249             caspase|
1250             cathepsin|
1251             convertase|
1252             dinitrophenyl|
1253             nitroanilide|
1254             peptidase|
1255             protease|
1256             proteasom|
1257             proteinase|
1258             proteol
1259             ]gx;
1260 12 100       41 next unless @matches;
1261             # match: MMP, MMPs MMP1, MMP-1, but not MMPFOO
1262             # match: Dnp-Met
1263 2         847 push @matches,
1264             m[
1265             (?:\b|[^a-z])(
1266             abz|
1267             amc
1268             dnp|
1269             dpa|
1270             eddnp|
1271             fmk|
1272             mca|
1273             mmp|
1274             mmps
1275             )(?:\b|[^a-z])
1276             ]gx;
1277 2         6 $self->{abst}{$var} = @matches;
1278 2         13 $self->{abst}{"ra_$var"} = [ @matches ];
1279             }
1280 12         36 return 1;
1281             }
1282              
1283             =head2 AbstScore
1284              
1285             Args : none
1286             Example : $parser->AbstScore or return;
1287             Description : Computes AbstScore based on the results of Abst* vars in the abstract.
1288             Returns : TRUE if successful, FALSE otherwise.
1289              
1290             =cut
1291              
1292             sub AbstScore {
1293 12     12 1 17 my ($self) = @_;
1294 12         19 my $rh_abst = $self->{abst};
1295             # Split with saving the original whitespace, which is then added back
1296             # during join after marking up is done, in AbstMtext().
1297             # This preserves the original character positions.
1298 12         596 @{ $rh_abst->{ra_WordOrig} } = split /(\s+)/, $rh_abst->{AbstAbstract};
  12         361  
1299            
1300 12         69 $rh_abst->{AbstNumWords} = grep /\S/, @{ $rh_abst->{ra_WordOrig} };
  12         572  
1301 12 100       16 $rh_abst->{AbstNumAllCap} = @{[grep { /[A-Z]/ and not /[a-z]/ }
  2160         5015  
  12         28  
1302 12         15 @{ $rh_abst->{ra_WordOrig} } ]};
1303             # initialize:
1304 12         31 $rh_abst->{AbstScore} = 0;
1305             # linear terms:
1306 12         41 $rh_abst->{AbstScore} +=
1307             (-1) * $rh_abst->{AbstNumDigest} + $rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage};
1308 12 50       40 warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2;
1309              
1310             # interaction terms (score increases more if both vars increase simultaneously)
1311 12         44 $rh_abst->{AbstScore} +=
1312             $rh_abst->{AbstNumBind} *
1313             ($rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage}) +
1314             $rh_abst->{AbstNumProtease} *
1315             ($rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage}) +
1316             $rh_abst->{AbstNumPeptide} * $rh_abst->{AbstNumPhage};
1317 12 50       35 warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2;
1318              
1319             # abstracts with too many all cap words are more likely to have all cap
1320             # non-sequence words rather than true sequences.
1321 12   100     54 $rh_abst->{AbstPropAllCap} = $rh_abst->{AbstNumAllCap} / ($rh_abst->{AbstNumWords} || 1);
1322 12 100 66     68 if ($rh_abst->{AbstPropAllCap} > 0.3 || $rh_abst->{AbstNumAllCap} > 30) {
1323 2         6 $rh_abst->{AbstScore} *= 0.01; # strong penalty if above threshold
1324             } else {
1325 10         23 $rh_abst->{AbstScore} *= (1 - $rh_abst->{AbstPropAllCap});
1326             }
1327 12 50       42 warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2;
1328             # AbstScore < 0 can happen for hi AbstNumDigest:
1329 12 50       31 $rh_abst->{AbstScore} = 0 if $rh_abst->{AbstScore} < 0;
1330             # transform (0, +inf) to (0,1):
1331 12         30 $rh_abst->{AbstScore} = $rh_abst->{AbstScore} / (1 + $rh_abst->{AbstScore});
1332 12 50       29 warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2;
1333 12 50       29 carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self']))
1334             if $self->{verbose} > 2;
1335 12         32 return 1;
1336             }
1337              
1338             {
1339            
1340             my %word_default =
1341             (
1342             WordAbstScore => 0,
1343             WordAaSymbols => 0,
1344             WordIsDNA => 0,
1345             WordIsDNALen => 0,
1346             WordIsDict => 0,
1347             WordIsGene => 0,
1348             WordIsRomanLen => 0,
1349             WordNumDegen => 0,
1350             WordOrig => undef,
1351             WordPropDNA => 0,
1352             WordPropDegen => 0,
1353             WordPropProtein => 0,
1354             WordSeqLen => 0,
1355             WordScore => 0,
1356             WordSequence => undef,
1357             WordSubLenMax => 0,
1358             ##cc provide defaults for all , to be used for 3 letter and full names of aa.
1359             );
1360              
1361             =head2 Words
1362            
1363             Args : none
1364             Example : $parser->Words or return;
1365             Description : Parses all peptide sequences from abstract text in AbstAbstract.
1366             Calls the necessary methods, and stores results in array ref
1367             $parser->{words}.
1368             Returns : TRUE if successful, FALSE otherwise.
1369              
1370             =cut
1371              
1372             sub Words {
1373 12     12 1 24 my ($self) = @_;
1374 12         16 my @words; # each el is a ref to hash with data
1375 12 100 66     48 if ($self->{WordNumPrintMin} == 1 and $self->{WordNumPrintMax} == 1) {
1376             # need exactly 1 empty word
1377 5         66 $self->{words} = [ { %word_default } ];
1378 5         24 return 1;
1379             }
1380 7         10 my $idx = 0;
1381 7         8 my @WordOrig = @{ $self->{abst}{ra_WordOrig} };
  7         154  
1382 7         18 foreach (@WordOrig) {
1383 1121         10325 my $rh_word = { %word_default };
1384 1121         3265 $rh_word->{WordOrig} = $_;
1385 1121         2034 $rh_word->{WordLcOrig} = lc $_;
1386 1121         1591 $rh_word->{WordIdx} = $idx++;
1387 1121 100       3921 if (not /\S/) {
    100          
    100          
1388             # do nothing
1389             }
1390             elsif ( $_ = $self->aa3_to_aa1( $rh_word->{WordOrig} ) ) {
1391 6         13 $rh_word->{WordAaSymbols} = 3; # 3 letter aa symbols (and full names, too)
1392 6         12 $rh_word->{WordSequence} = $_;
1393 6 50       18 $self->WordVars($rh_word) or return;
1394             }
1395             elsif ( $_ = $self->to_aa1( $rh_word->{WordOrig} ) ) {
1396 72         105 $rh_word->{WordAaSymbols} = 1; # 1 letter aa symbols:
1397 72         111 $rh_word->{WordSequence} = $_;
1398 72 50       157 $self->WordVars($rh_word) or return;
1399             }
1400             # else: has non-whitespace chars, but is not a sequence: do nothing
1401 1121 50       2274 $self->WordScore($rh_word) or return;
1402 1121         2051 push @words, $rh_word;
1403             }
1404 7 50 33     37 if ($self->{WordNumPrintMin} == 1 and not @words) { # need at least 1 word
1405 0         0 push @words, { %word_default };
1406             }
1407 7         239 $self->{words} = [ @words ];
1408 7         148 return 1;
1409             }
1410             }
1411              
1412             {
1413             # See more: Nomenclature and Symbolism for Amino Acids and Peptides
1414             # http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html
1415             # Amino acids below are converted to X for blasting, no need to worry here.
1416              
1417             # keep order for correct substitution below. Generally, longer patterns go first, so that less non-replaced chars remain, so that there is less to clean up later.
1418             my @aa3_to_aa1 =
1419             (
1420             # full names:
1421             # Omitted: B, U, Z
1422             alanine => 'A',
1423             cysteine => 'C',
1424             cystein => 'C', # alternative spelling
1425             aspartic => 'D',
1426             aspartate => 'D',
1427             glutamic => 'E',
1428             glutamate => 'E',
1429             phenylalanine => 'F',
1430             glycine => 'G',
1431             histidine => 'H',
1432             isoleucine => 'I',
1433             lysine => 'K',
1434             leucine => 'L',
1435             methionine => 'M',
1436             asparagine => 'N',
1437             proline => 'P',
1438             glutamine => 'Q',
1439             arginine => 'R',
1440             serine => 'S',
1441             threonine => 'T',
1442             selenocysteine => 'U',
1443             valine => 'V',
1444             tryptophane => 'W', # alternative spelling
1445             tryptophan => 'W',
1446             tyrosine => 'Y',
1447              
1448             # full names, 'yl' endings:
1449             # matches:
1450             # glycyl-L-prolyl-L-glutamic acid;
1451             # arginyl glycyl aspartyl serine
1452             alanyl => 'A',
1453             cysteinyl => 'C',
1454             aspartyl => 'D',
1455             glutamyl => 'E',
1456             phenylalanyl => 'F',
1457             glycyl => 'G',
1458             histidyl => 'H',
1459             isoleucyl => 'I',
1460             lysyl => 'K',
1461             leucyl => 'L',
1462             methionyl => 'M',
1463             asparaginyl => 'N',
1464             prolyl => 'P',
1465             glutaminyl => 'Q',
1466             arginyl => 'R',
1467             seryl => 'S',
1468             threonyl => 'T',
1469             selenocysteinyl => 'U',
1470             valyl => 'V',
1471             tryptophyl => 'W',
1472             tyrosyl => 'Y',
1473              
1474             # 3 letter aa symbols to 1 letter.
1475             Ala => 'A',
1476             Cys => 'C',
1477             Asp => 'D',
1478             Glu => 'E',
1479             Phe => 'F',
1480             Gly => 'G',
1481             His => 'H',
1482             Ile => 'I',
1483             Lys => 'K',
1484             Leu => 'L',
1485             Met => 'M',
1486             Asn => 'N',
1487             Pro => 'P',
1488             Gln => 'Q',
1489             Arg => 'R',
1490             Ser => 'S',
1491             Thr => 'T',
1492             Sec => 'U',
1493             Val => 'V',
1494             Trp => 'W',
1495             Tyr => 'Y',
1496              
1497             # non-standard amino acids are all converted to X
1498             Aad => 'X',
1499             Abu => 'X',
1500             Aib => 'X',
1501             Ahx => 'X',
1502             Ape => 'X',
1503             Apm => 'X',
1504             Asx => 'X', # B in http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html
1505             Cit => 'X',
1506             Dab => 'X',
1507             Dap => 'X',
1508             Dpm => 'X',
1509             Dpr => 'X',
1510             Glx => 'X', # Z in http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html
1511             Hyl => 'X',
1512             Hyp => 'X',
1513             Nle => 'X',
1514             Nva => 'X',
1515             Orn => 'X',
1516             Pyr => 'X',
1517              
1518             # unknown amino acids
1519             Xaa => 'X', # keep order as shown, so that these replacements are correct: 'Asn-x-Glu-x-x-(aromatic)-x-x-Gly',
1520             # unknown amino acids, not in IUPAC, but frequently used:
1521             Xxx => 'X', # such as Gly-Xxx-Gly, but some XXX or xxx may refer to X{3}, eg Gly-xxx-Gly
1522             Yaa => 'X',
1523             Zaa => 'X',
1524              
1525             );
1526              
1527             my %aa3_to_aa1 = @aa3_to_aa1;
1528             my @aa3_to_aa1_keys = grep exists $aa3_to_aa1{$_}, @aa3_to_aa1;
1529             # print to redo sort:
1530             # for (sort { $aa3_to_aa1{$a} cmp $aa3_to_aa1{$b} } keys %aa3_to_aa1) {
1531             # next unless length $_ == 3;
1532             # print "$_ => '$aa3_to_aa1{$_}',\n";
1533             # }
1534             my $aa3_to_aa1 = join "|", @aa3_to_aa1_keys;
1535             my $aa3_to_aa1_re = qr!$aa3_to_aa1!;
1536              
1537             # enable parsing lowercase 3 letter symbols, eg 'NAc-lys-gly-gln-OH'.
1538             my %aa3_to_aa1_lc = map { lc($_) => $aa3_to_aa1{$_} } keys %aa3_to_aa1;
1539             my @aa3_to_aa1_keys_lc = map { lc $_ } @aa3_to_aa1_keys;
1540             my $aa3_to_aa1_lc = join "|", @aa3_to_aa1_keys_lc;
1541             my $aa3_to_aa1_lc_re = qr!$aa3_to_aa1_lc!;
1542              
1543             =head2 aa3_to_aa1
1544              
1545             Args : string with a peptide sequence
1546             Example : $str = $self->aa3_to_aa1($str)
1547             Description : finds the first string that looks like protein
1548             sequence in 3 letter amino acid symbols or full
1549             names. Cleans (remove non-informative parens, -, /,
1550             etc), and returns the cleaned sequence. Currently,
1551             full names are also handled here, but this may change.
1552             Returns : sequence (TRUE) if a sequence is found, FALSE otherwise.
1553              
1554             =cut
1555              
1556             sub aa3_to_aa1 {
1557 605     605 1 789 my ($self, $str) = @_;
1558             # first try more frequent, uppercase, then lowercase.
1559             # tag with _BeginTag_ and _EndTag_ the replaced amino acids,
1560             # then use the tag to remove the rest
1561             # of the chars, except for a set of allowed chars that did not
1562             # need 3 letter to 1 letter conversion and thus did not get tagged.
1563             # proceed if > 1 replacements , eg > 1 aa.
1564             return unless
1565             (
1566 605 100 66     13258 ($_ = $str and s!($aa3_to_aa1_re)!_BeginTag_$aa3_to_aa1{$1}_EndTag_!g > 1) or
      66        
      66        
1567             ($_ = $str and s!\b($aa3_to_aa1_lc_re)\b!_BeginTag_$aa3_to_aa1_lc{$1}_EndTag_!g > 1)
1568             );
1569             # warn "tag aa3_to_aa1: \$str=$str;" if $self->{verbose} > 2;
1570             # warn "tag aa3_to_aa1: \$_=$_;" if $self->{verbose} > 2;
1571             # handle x and X here, so that the method returns TRUE only if 3 letter
1572             # symbols occurred. x and X do not count.
1573 49         116 s!\bx\b!_BeginTag_X_EndTag_!g;
1574             # neg lookbehind: tag each X, except those already tagged
1575             # (eg, converted from Xaa to X) (= preceded by _), or
1576             # those where X means something other than amino acid
1577             # (eg, Met(OX) means Met, oxidized) (= preceded by non-X upper case letter).
1578 49         103 s!(?<\![_A-WYZ])X!_BeginTag_X_EndTag_!g;
1579             # warn "tag X: \$_=$_;" if $self->{verbose} > 2;
1580             # change 'X(3)' to 'XXX'
1581             # repeat only Xs, because X(n) usually refers to X repeated n times,
1582             # while for other aa, such as G,
1583             # G(n) refers to the position within the protein.
1584             # Some cases where X(n) refers to position n are eliminated later if n is too large,
1585             # eg X(20) results in sequence length > 20 and is assigned lower score.
1586 49         99 s!_BeginTag_X_EndTag_\W*(\d)\W*!'_BeginTag_X_EndTag_' x $1!eg;
  8         33  
1587 49         530 $_ = join '', grep { defined $_ } m!([\-/()])|_BeginTag_(.)_EndTag_!g;
  1078         1470  
1588 49         150 return clean_seq($_);
1589             }
1590             }
1591              
1592              
1593             {
1594              
1595             # p = phospho, eg pY = phosphotyrosine
1596             # x and X are both frequently used in motifs as any aa.
1597             # B, Z, U are not included because they do not occur very frequently in aa sequences in abstracts.
1598             my $aa1_allowed = '\-/()ACDEFGHIKLMNPQRSTVWYXpx';
1599             my $aa1_re = qr!
1600             # not immed preceded by other letters
1601             # optional N-terminus
1602             (?:N\-|HN\-|H2N\-|H\(2\)N\-|[\W\d]|^)
1603             # sequence, >= 2 aa
1604             ([$aa1_allowed]{2,})
1605             # not immed followed by other letters
1606             (?:[\W\d]|\-.*|$)
1607             !x;
1608              
1609             =head2 to_aa1
1610              
1611             Args : string with a peptide sequence
1612             Example : $str = $self->to_aa1($str)
1613             Description : finds the first string that looks like peptide sequence in uppercase
1614             1 letter amino acid symbols. Cleans (remove non-informative parens,
1615             -, /, etc), and returns the cleaned sequence.
1616             Returns : sequence (TRUE) if a sequence is found, FALSE otherwise.
1617              
1618             =cut
1619              
1620             sub to_aa1 {
1621 576     576 1 813 my ($self, $str) = @_;
1622 576         853 for ($str) {
1623 576 100       3732 if (m!$aa1_re!) {
1624 91         188 return clean_seq($1);
1625             }
1626             }
1627 485         1369 return;
1628             }
1629             }
1630              
1631             =head2 clean_seq
1632              
1633             Args : string with a peptide sequence in 1 letter aa symbols
1634             Example : $str = clean_seq($str);
1635             Description : for sequence in 1 letter aa symbols, remove extra parens, etc.
1636             Simple motifs are ok: A(B/C/D)E...
1637             Returns : cleaned string.
1638              
1639             =cut
1640              
1641             sub clean_seq {
1642 140     140 1 281 my ($str) = @_;
1643 140         206 for ($str) {
1644 140         210 s!\-NH[()\d]*$!!; # remove C/N terminal mark
1645 140         420 s!(^[CN]'|[CN]'$)!!g;
1646 140         172 s!FMK\W*$!!; # remove aa modifications, eg fluorophores
1647 140         193 tr!\-!!d; # delete connecting '-', eg change 'G-E-T' to 'GET'
1648 140         160 tr!p!!d; # delete p=phospho, not P=proline. lowercase only.
1649 140         204 $_ = uc $_; # do this after resolving: p=phospho, not P=proline;
1650             # Xxx = X, not XXX = X{3}
1651 140         192 s!X\W*(\d)\W*!'X' x $1!eg;
  0         0  
1652 140         226 $_ = clean_orphan_parens($_);
1653 140         177 s!^/+!!; # clean up beginning and end
1654 140         168 s!/+$!!; # clean up beginning and end
1655 140         188 s!/+!/!g; # clean up repeated separator chars
1656 140         234 $_ = parse_slashes($_);
1657 140         238 $_ = clean_orphan_parens($_);
1658 140         677 return $_;
1659             }
1660             }
1661              
1662             =head2 clean_orphan_parens
1663              
1664             Args : string with a sequence to clean up.
1665             Example : $_ = clean_orphan_parens($_);
1666             Description : removes orphan parens from a string.
1667             Returns : cleaned string.
1668              
1669             =cut
1670              
1671             sub clean_orphan_parens {
1672 280     280 1 363 my ($str) = @_;
1673 280         616 for (my $i = 0; $i < 100; $i++) { # magic number 100 to prevent inf loops
1674 1903         14306 my $num_changes = 0;
1675 1903         2051 $num_changes += s!^\)+!!g; # delete wrong sided parens
1676 1903         2858 $num_changes += s!\(+$!!g; # delete wrong sided parens
1677 1903         4853 $num_changes += s!\(+!\(!g; # change '((GET)' to '(GET)'
1678 1903         4674 $num_changes += s!\)+!\)!g; # change '(GET))' to '(GET)'
1679 1903         3710 $num_changes += s!\([^A-Z]*\)!!g; # remove '(/)', '()'
1680 1903         2333 $num_changes += s!^\((.)\)!$1!; # change '(G)ETRAPL' to 'GETRAPL'
1681             # change 'GETR(A)PL' to 'GETRAPL'
1682             # change 'GET(RAP)L' to 'GETRAPL'
1683 1903         3027 $num_changes += s!\(([^()/]*)\)!$1!g;
1684             # change '(GETR(A)PL)' to 'GETR(A)PL' - delete outer parens in 2 steps:
1685 1903         2741 $num_changes += s!\(([^()]+)\(!$1\(!g; # change '(GETR(A)PL)' to 'GETR(A)PL)'
1686 1903         2603 $num_changes += s!\)([^()]+)\)!\)$1!g; # change 'GETR(A)PL)' to 'GETR(A)PL'
1687 1903         2068 my $num_parens = tr!()!!;
1688 1903 100       3433 if ($num_parens == 1) {
1689 13         71 $num_changes += tr!()!!d; # change 'GE(TRAPL' to 'GETRAPL'
1690             }
1691 1903 100       4769 last unless $num_changes; # clean stubborn sequences, eg:
1692             # 'G(ET(RA)PL' => 'G(ETRAPL' (after iteration 1), 'GETRAPL' (after iteration 2).
1693             }
1694 280         537 return $_;
1695             }
1696              
1697              
1698             =head2 parse_slashes
1699              
1700             Args : string with a sequence.
1701             Example : $_ = parse_slashes($_);
1702             Description : handles slashes. Changes A/B/C/etc to (A/B/C/etc)
1703             which means any of A, B, C, etc. Returns the resulting
1704             string. Exceptions are as follows, based on the the
1705             fact that the result would make no sense. If X occurs
1706             next to '/' or if the result would contain a repeated
1707             character. For exceptions, splits the string on
1708             slashes, and returns the first longest string.
1709             Returns : see above
1710              
1711             =cut
1712              
1713             sub parse_slashes {
1714 140     140 1 150 $_ = $_[0];
1715 140 100       601 return $_ unless m!/!;
1716 15 100 100     100 if (m!/X|X/! or has_repeats_at_slashes($_) ) {
1717 3         5 my $str = '';
1718 3         16 for (split m!/+!) {
1719 6 50       22 $str = $_ if length($str) < length($_); # change PXXP/PXPXP to PXPXP
1720             }
1721 3         8 return $str;
1722             } else {
1723 12         100 s!(\w(/\w)+)!($1)!g; # change PXXP/GXPXP to PXX(P/G)XPXP
1724 12         29 return $_;
1725             }
1726             }
1727              
1728              
1729             =head2 has_repeats_at_slashes
1730              
1731             Args : string with a sequence.
1732             Example : print 1 if has_repeats_at_slashes($_);
1733             Description : see below
1734             Returns : TRUE if the string has a repeated char next to a series of
1735             slashes and chars, FALSE otherwise.
1736              
1737             =cut
1738              
1739             sub has_repeats_at_slashes {
1740 14     14 1 32 for ($_[0]) {
1741 14         77 for (m!(\w(?:/\w)+)!g) {
1742 19         57 my %seen;
1743 19         78 for (/\w/g) {
1744 39 100       274 return 1 if $seen{$_}++;
1745             }
1746             }
1747             }
1748 12         54 return;
1749             }
1750              
1751             =head2 WordVars
1752              
1753             Args : $rh_word - ref to hash with data for the word
1754             Example : $parser->WordVars($rh_word) or return;
1755             Description : computes variables for WordScore, eg WordPropDegen, WordIsDNA, etc.
1756             Returns : TRUE if successful, FALSE otherwise.
1757              
1758             =cut
1759              
1760             sub WordVars {
1761 78     78 1 110 my ($self, $rh_word) = @_;
1762 78 50 33     388 return unless $rh_word and ref $rh_word eq 'HASH';
1763 78         154 $_ = $rh_word->{WordSequence};
1764             # keep allowed chars in kmers(): 'a'..'z' in sync with
1765             # WordPropProtein() args : lc tr/A-Z//cd;
1766 78         187 ( $rh_word->{WordAlpha} = $_ ) =~ tr/A-Z//cd;
1767            
1768 78 50       179 if ( $rh_word->{WordSequence} ) {
1769 78         119 $rh_word->{WordSeqLen} = length $rh_word->{WordAlpha};
1770 78         103 $rh_word->{WordNumDegen} = tr/X//;
1771 78         156 $rh_word->{WordNumNotDegen} = $rh_word->{WordSeqLen} - $rh_word->{WordNumDegen};
1772 78   50     274 $rh_word->{WordPropDegen} = $rh_word->{WordNumDegen} / ($rh_word->{WordSeqLen} || 1);
1773 78         174 $rh_word->{WordLcSequence} = lc $rh_word->{WordSequence};
1774             }
1775            
1776 78 100       181 if ($rh_word->{WordAaSymbols} == 1) {
1777             # Compute WordPropProtein only for 1 letter symbols, because
1778             # it is important for classification if 'GET' looks like an english word,
1779             # but not important if 'Gly-Glu-Thr' does.
1780 72         210 $rh_word->{WordPropProtein} = $self->WordPropProtein(lc $rh_word->{WordAlpha});
1781 72         141 $rh_word->{WordIsDNALen} = tr/ACTGN//;
1782 72 50       248 $rh_word->{WordPropDNA} = $rh_word->{WordIsDNALen} / $rh_word->{WordSeqLen}
1783             if $rh_word->{WordSeqLen};
1784             # if all or mostly dna, eg, '(A/C)TGG' or 'WCCGGGCGGCCCC'
1785 72 100       167 $rh_word->{WordIsDNA} = 1 if $rh_word->{WordPropDNA} >= 0.9;
1786             }
1787 78         211 return 1;
1788             }
1789              
1790             {
1791             my $re;
1792              
1793             =head2 WordPropProtein
1794              
1795             Args : $word : string
1796             Example : $rh_word->{WordPropProtein} = $self->WordPropProtein($word);
1797             Description : See package DESCRIPTION above.
1798             Returns : sum of log10(proportion(k-mers))
1799              
1800             =cut
1801            
1802             sub WordPropProtein {
1803 72     72 1 106 my ($self, $str) = @_;
1804 72         86 my $WordPropProtein = 0;
1805 72 100       201 return $WordPropProtein if (length($str) < $self->{kmer_length});
1806 66   66     174 $re ||= qr/(?=(.{$self->{kmer_length}}))/;
1807 66         662 foreach ($str =~ /$re/g) {
1808 218 50       597 unless (defined $log_prop_sequence{$_}) {
1809 0         0 carp "not ok: undefined log_prop_sequence for $_";
1810 0         0 next;
1811             }
1812 218 50       509 unless (defined $log_prop_nonsequence{$_}) {
1813 0         0 carp "not ok: undefined log_prop_nonsequence for $_";
1814 0         0 next;
1815             }
1816 218         768 $WordPropProtein += ($log_prop_sequence{$_} - $log_prop_nonsequence{$_});
1817             }
1818 66         419 $WordPropProtein = sprintf("%.1f", $WordPropProtein);
1819 66         206 return $WordPropProtein;
1820             }
1821             }
1822              
1823             {
1824             # WordScore factors for different values of vars, eg
1825             # for WordAaSymbols = 3, multiply WordScore by 1000.
1826              
1827             # WordScoreForAaSymbols:
1828             # Keep the ratio of 3 letter or full name symbols relative to 1 letter symbols high,
1829             # because even for very short words they are more likely to be a sequence, eg:
1830             # 'Pro-Ala-Pro' relative to 'PAP'.
1831             my %WordScoreForAaSymbols =
1832             (
1833             0 => 0,
1834             1 => 1,
1835             3 => 1000,
1836             5 => 10000, # full names (not implemented)
1837             );
1838             #
1839             my %WordScoreForSeqLen =
1840             (
1841             0 => 0,
1842             1 => 0,
1843             2 => 0.0001,
1844             3 => 0.1,
1845             4 => 0.5,
1846             5 => 1,
1847             );
1848              
1849             # as WordSeqLen increases between 5 and $MAX_PEPTIDE_LENGTH,
1850             # WordScore factor increases linearly from 1 to 5
1851             foreach (5..$MAX_PEPTIDE_LENGTH) {
1852             $WordScoreForSeqLen{$_} = 1 + 4 * ($_ - 5) / ($MAX_PEPTIDE_LENGTH - 5);
1853             }
1854             my $max_penalized_length = 6; # after this length, flat rate penalty applies:
1855             my $WordScoreForMatchLenMin = 3**(-$max_penalized_length);
1856             # up to $max_penalized_length, penalize by this factor (divide score by 3 for each char)
1857             my %WordScoreForMatchLen = map { ($_ => 3**(-$_)) } 0..$max_penalized_length;
1858              
1859             =head2 WordScore
1860              
1861             Args : $rh_word - ref to hash with word data.
1862             Example : $parser->WordScore($rh_word) or return;
1863             Description : See package DESCRIPTION above.
1864             Returns : TRUE if successful, FALSE otherwise.
1865              
1866             =cut
1867              
1868             sub WordScore {
1869 1121     1121 1 1475 my ($self, $rh_word) = @_;
1870 1121 50 0     2269 ref $rh_word eq 'HASH' or
1871             carp "not ok: expected WordScore arg[1] = hash ref, got $rh_word" and return;
1872 1121         1386 $rh_word->{WordScore} = 1; # initialize WordScore
1873 1121 100 66     2778 unless ($rh_word->{WordSequence} and $rh_word->{WordOrig}) {
1874 1043         1385 $rh_word->{WordScore} = 0;
1875             }
1876 1121 50       2140 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1877 1121 100       3400 return 1 unless $rh_word->{WordScore};
1878 78         185 $rh_word->{WordScore} *= $WordScoreForAaSymbols{ $rh_word->{WordAaSymbols} };
1879 78 50       167 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1880 78 50       207 if (exists $WordScoreForSeqLen{ $rh_word->{WordSeqLen} }) {
1881 78         165 $rh_word->{WordScore} *= $WordScoreForSeqLen{ $rh_word->{WordSeqLen} };
1882             } else { # if length > $MAX_PEPTIDE_LENGTH, discard regardless of anything else
1883 0         0 $rh_word->{WordScore} = 0;
1884             }
1885 78 50       160 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1886 78 100       154 return 1 unless $rh_word->{WordScore};
1887            
1888 77 100 33     874 if (
    100 100        
    50 33        
      100        
      33        
1889             $rh_word->{WordPropDegen} > 0.95 ||
1890             $rh_word->{WordNumNotDegen} <= 1 ||
1891             ($rh_word->{WordNumNotDegen} <= 2 && $rh_word->{WordSeqLen} >= 6) # <= 2 of 6 aa
1892             ) {
1893 2         4 $rh_word->{WordScore} = 0;
1894             }
1895             elsif (
1896             $rh_word->{WordPropDegen} > 0.7 ||
1897             ($rh_word->{WordNumNotDegen} <= 2 && $rh_word->{WordSeqLen} >= 5)
1898             ) {
1899 1         3 $rh_word->{WordScore} *= 0.001;
1900             }
1901             elsif ($rh_word->{WordPropDegen} > 0.6) {
1902 0         0 $rh_word->{WordScore} *= 0.01;
1903             }
1904 77 50       170 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1905 77 100       152 return 1 unless $rh_word->{WordScore};
1906              
1907 75 50       153 $self->WordScoreForMatch($rh_word) or return;
1908 75 50       185 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1909 75 50       197 return 1 unless $rh_word->{WordScore};
1910             # normalize for the ease of use, so that (after all transformations),
1911             # at WordScore = 0.5, approximately 50% of the words are
1912             # true positives by manual inspection
1913 75         125 $rh_word->{WordScore} = (1/6) * $rh_word->{WordScore};
1914 75 50       142 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1915             # transform (0, +inf) to (0,1):
1916 75         135 $rh_word->{WordScore} = $rh_word->{WordScore} / ( 1 + $rh_word->{WordScore} );
1917 75 50       144 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1918 75         175 return 1;
1919             }
1920              
1921             =head2 WordScoreForMatch
1922              
1923             Args : $rh_word - ref to hash with word data.
1924             Example : $parser->WordScoreForMatch($rh_word) or return;
1925             Description : Calls WordIsGeneOrDict to compute WordIsDict, WordIsGene.
1926             Changes WordScore based on these vars, as well as based on WordIsDNA,
1927             WordPropProtein, etc.
1928             Returns : TRUE if successful, FALSE otherwise.
1929              
1930             =cut
1931            
1932             sub WordScoreForMatch {
1933 75     75 1 92 my ($self, $rh_word) = @_;
1934 75 50 0     183 ref $rh_word eq 'HASH' or
1935             carp "not ok: expected WordScoreForMatch arg[1]: hash ref, got: $rh_word" and
1936             return;
1937             # no need to check this for 3 and full letter symbols:
1938 75 100       200 return 1 unless $rh_word->{WordAaSymbols} == 1;
1939 70 50       138 $self->WordIsGeneOrDict($rh_word) or return;
1940 70 50       170 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1941             # roman numeral < 50 that ends within 0 or 1 char of the end of word, eg
1942             # TFIII or TFIIA. WordIsRomanLen = length of the roman
1943             # numeral (minus 1 if there is 1 char between the numeral and the
1944             # end of the word).
1945 70         385 while ($rh_word->{WordOrig} =~ /(X{0,3})(I[XV]|V?I{0,3})([A-Z])?\b/g) {
1946 261 100       912 my $next_roman_len = length($1) + length($2) - ($3 ? length($3) : 0 );
1947 261 100       1420 next unless $next_roman_len > 0; # skip 0 or -1 (not a roman numeral).
1948 14         23 $rh_word->{WordIsRomanLen} += $next_roman_len;
1949 14 100       83 $rh_word->{WordIsRoman} = 1 if $rh_word->{WordIsRomanLen} >= 2; # eg, HYNKIIIA
1950             }
1951             # for these vars with values: 0/1, penalize based on length:
1952 70         117 foreach (qw(WordIsDNA WordIsDict WordIsGene WordIsRoman)) {
1953 280 100       619 next unless $rh_word->{$_};
1954             # limit the penalty by the sequence length, eg
1955             # if 'SDS-PAGE' is entered into dictionary twice,
1956             # as both 'SDS-PAGE' and 'SDSPAGE'
1957 95 100       261 $rh_word->{"${_}Len"} = $rh_word->{WordSeqLen}
1958             if $rh_word->{"${_}Len"} > $rh_word->{WordSeqLen};
1959 95   66     336 $rh_word->{WordScore} *=
1960             ( $WordScoreForMatchLen{ $rh_word->{"${_}Len"} } ||
1961             $WordScoreForMatchLenMin );
1962 95 50       226 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1963             }
1964             # if the word was not flagged by WordIsGene WordIsDict:
1965 70 100 100     296 unless ($rh_word->{WordIsGene} || $rh_word->{WordIsDict}) {
1966             # penalize for similarity to a gene symbol, eg KLF4.
1967 21         121 foreach ($rh_word->{WordOrig} =~ m!([^A-Z]|\b)[A-Z]{2,3}\d*\b!g) {
1968 5         10 $rh_word->{WordScore} *= 0.1;
1969 5 50       125 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1970             # penalize for similarity to abbreviations, or to gene/protein interactions, eg
1971             # 'GM-CSF', 'HPLC/FPLC', 'MAP/MAPKK'.
1972 5 100 100     28 $rh_word->{WordScore} *= 0.01
1973             if $rh_word->{WordSubLenMax} && # if the word could be split into subwords and
1974             $rh_word->{WordSubLenMax} <= 4; # all sub-words are short
1975 5 50       21 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1976             }
1977             }
1978             # WordPropProtein is a good predictor if abs(WordPropProtein) > 8.
1979             # WordPropProtein is very high for WordIsDNA, eg 'CGGTTAAAA', or
1980             # WordIsRoman, eg 'HYNKIIIA', thus not used.
1981 70 100 100     401 unless(
      66        
      100        
1982             $rh_word->{WordIsDNA} || $rh_word->{WordIsDict} ||
1983             $rh_word->{WordIsGene} || $rh_word->{WordIsRoman}
1984             ) {
1985 13         88 $rh_word->{WordScore} *=
1986             (10 ** (LOG_10_2 * $rh_word->{WordPropProtein}) );
1987             }
1988 70 50       165 warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1989 70 50       143 carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$rh_word], ['rh_word']))
1990             if $self->{verbose} > 2;
1991 70         170 return 1;
1992             }
1993             }
1994              
1995             =head2 WordIsGeneOrDict
1996              
1997             Args : $rh_word - ref to hash with word data.
1998             Example : $parser->WordIsGeneOrDict($rh_word) or return;
1999             Description : Computes WordIsDict, WordIsGene and related vars.
2000             Returns : TRUE if successful, FALSE otherwise.
2001              
2002             =cut
2003              
2004             sub WordIsGeneOrDict {
2005 70     70 1 85 my ($self, $rh_word) = @_;
2006 70 50 0     137 ref $rh_word eq 'HASH' or
2007             carp "not ok: expected WordIsGeneOrDict arg[1]: hash ref, got: $rh_word" and
2008             return;
2009             # select words for matching. Do not penalize more than once for the
2010             # same subword, unless it actually occurs more than once.
2011 70         323 my @subwords = split /[^A-Za-z]+/, lc $rh_word->{WordLcOrig};
2012 70         92 my @subwords_long;
2013 70 100       153 if (@subwords != 1) { # if could split; otherwise just handle $rh_word->{WordLcOrig} below.
2014             # select sub-words that are long in absolute terms and also long relative
2015             # to the original word
2016             # (eg, select 'RAS' in 'RAS/EGFR' , but not in 'RAS/GETRAPLGETRAPL'
2017 20         39 foreach (@subwords) {
2018 52         59 my $length = length $_;
2019 52 100 66     195 push @subwords_long, $_
2020             if $length > 2 and ( $length / $rh_word->{WordSeqLen} ) > 0.25;
2021             # WordSubLenMax = max sub-word length, if could split; otherwise 0.
2022 52 100       153 $rh_word->{WordSubLenMax} = $length if $rh_word->{WordSubLenMax} < $length;
2023             }
2024             }
2025 70 100       232 foreach my $subword ( $rh_word->{WordLcSequence},
2026             # do not penalize more than once for 1 occurrence:
2027             ( $rh_word->{WordLcOrig} eq $rh_word->{WordLcSequence} ?
2028             () : $rh_word->{WordLcOrig} ),
2029             @subwords_long ) {
2030 131         191 foreach my $bad (qw(Dict Gene)) {
2031             # if ( eval { ${$bad}{$subword} } ) { # eval does not work - ??
2032 262         246 my $is_bad = do {
2033 262 100       566 if ($bad eq 'Dict') { $Dict{$subword} }
  131 50       303  
2034 131         413 elsif ($bad eq 'Gene') { $Gene{$subword} }
2035             };
2036 262 100       681 if ($is_bad) {
2037 109         176 $rh_word->{"WordIs$bad"} = 1;
2038 109         341 $rh_word->{"WordIs${bad}Len"} += length($subword);
2039             }
2040             }
2041             }
2042 70         213 return 1;
2043             }
2044              
2045             =head2 WordAbstScore
2046              
2047             Args : none
2048             Example : $parser->WordAbstScore or return;
2049             Description : Computes WordAbstScore based on the results of
2050             AbstScore and WordScore vars for all words in the
2051             abstract. AbstMaxWordScore is high if either AbstScore
2052             is high or WordScore is high, and higher if both are
2053             high. Instead of using simply AbstMaxWordScore =
2054             AbstScore * WordScore, AbstMaxWordScore is also added
2055             to WordScore with a smaller weight. This is because
2056             probability that a given word is a sequence increases
2057             if another word in the same abstract is a
2058             sequence. The weight for AbstMaxWordScore is smaller
2059             than that for WordScore in order not to make all words
2060             have equal AbstMaxWordScore. WordAbstScore = 0 for
2061             WordScore = 0, to reduce obvious noise (otherwise, all
2062             words that are clearly not peptide sequences will get
2063             positive WordAbstScore when they occur in an abstract
2064             with at least one sequence). Skip transform to (0,1)
2065             because the scores below are all in (0,1), and the
2066             weights add to 1, so WordAbstScore is already in
2067             (0,1).
2068             Returns : TRUE if successful, FALSE otherwise.
2069              
2070             =cut
2071              
2072             sub WordAbstScore {
2073 12     12 1 16 my ($self) = @_;
2074 12         23 my $rh_abst = $self->{abst};
2075             # max WordScore for all words in this abstract:
2076 12         34 $rh_abst->{AbstMaxWordScore} = 0;
2077 12         18 foreach my $rh_word (@{ $self->{words} }) {
  12         29  
2078 1126 100       2418 $rh_abst->{AbstMaxWordScore} = $rh_word->{WordScore}
2079             if $rh_abst->{AbstMaxWordScore} < $rh_word->{WordScore};
2080             }
2081            
2082 12         21 foreach my $rh_word (@{ $self->{words} }) {
  12         27  
2083 1126 100       2110 $rh_word->{WordAbstScore} = $rh_word->{WordScore} > 0 ?
2084             $rh_abst->{AbstScore} *
2085             (0.1 * $rh_abst->{AbstMaxWordScore} + 0.9 * $rh_word->{WordScore})
2086             : 0;
2087             }
2088 12 50       36 if ($self->{verbose} > 0) {
2089 12         113 $rh_abst->{AbstComment} .=
2090             sprintf("AbstMaxWordScore=%.4f; ", $rh_abst->{AbstMaxWordScore});
2091             }
2092 12 50       38 carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self']))
2093             if $self->{verbose} > 2;
2094 12         32 return 1;
2095             }
2096              
2097             =head2 AbstMtext
2098              
2099             Args : none
2100             Example : $parser->AbstMtext
2101             Description : Marks all words that are putative sequences with
2102             '...' tags. This is done only for words
2103             for which WordScore is at least WordScoreMin.
2104             Returns : always TRUE
2105              
2106             =cut
2107              
2108             sub AbstMtext {
2109 12     12 1 17 my ($self) = @_;
2110 12 50 33     80 if (
2111             $self->{words} and
2112             ref $self->{words} eq 'ARRAY'
2113             ) {
2114             $self->{abst}{AbstMtext} =
2115             join '', map {
2116 1126 100 66     11811 if (! defined $_->{WordOrig} ) { () } # happens if WordNumPrintMin = 1
  5 100       16  
  12         35  
2117             elsif (
2118             $_->{WordScore} >= $self->{WordScoreMin} &&
2119             $_->{WordAbstScore} >= $self->{WordAbstScoreMin}
2120             )
2121 597         1873 { "$_->{WordOrig}"}
2122 524         1109 else { $_->{WordOrig} }
2123             }
2124 12         19 @{ $self->{words} } ;
2125             }
2126             else {
2127 0   0     0 $self->{abst}{AbstMtext} = $self->{abst}{AbstAbstract} || '';
2128             }
2129 12         120 return 1;
2130             }
2131              
2132             =head2 formatScoreProp
2133              
2134             Args[1] : $number - score or proportion.
2135             Example : $rh_word->{WordScore} = formatScoreProp($rh_word->{WordScore});
2136             Description : changes $number to be in (0,1) interval, formats $number
2137             for nice printing
2138             Returns : formatted score
2139              
2140             =cut
2141              
2142             sub formatScoreProp {
2143 4540     4540 1 4647 my ($number) = @_;
2144 4540 50       7768 return unless defined $number;
2145             # number should be between 0 and 1,
2146             # In methods in this package, the input $number is supposed to be always in (0,1) -
2147             # maybe issue warning otherwise?
2148 4540 50       10136 $number = 0 if $number < 0;
2149 4540 50       21907 $number = 1 if $number > 1;
2150 4540         19396 $number = sprintf "%.4f", $number;
2151             }
2152              
2153             =head2 find
2154              
2155             Args[1] : $key - field name
2156             Args[2] : $val - value
2157             Example : # print WordScore for the first GETRAPL sequence.
2158             # print $parser->find(WordSequence => 'GETRAPL')->{WordScore};
2159             Description : finds the first word for which the value of field $key
2160             is equal to $val (string eq, not numeric ==)
2161             Returns : returns this word (hash ref) if successful, FALSE otherwise.
2162              
2163             =cut
2164              
2165             sub find {
2166 40     40 1 104 my ($self, $key, $val) = @_;
2167 40         53 foreach ( @{ $self->{words} } ) {
  40         107  
2168 16962 100 66     72372 return $_ if defined $_->{$key} and $_->{$key} eq $val;
2169             }
2170 0           return;
2171             }
2172              
2173             1;