File Coverage

blib/lib/Peptide/Pubmed.pm

Criterion	Covered	Total	%
statement	511	525	97.3
branch	202	294	68.7
condition	75	140	53.5
subroutine	51	51	100.0
pod	37	37	100.0
total	876	1047	83.6

line	stmt	bran	cond	sub	pod	time	code
1							package Peptide::Pubmed;
2
3	4			4		93925	use vars qw($VERSION);
	4					12
	4					681
4							$VERSION = '1.02';
5
6							=head1 NAME
7
8							Peptide::Pubmed - extract peptide sequences from MEDLINE article abstracts.
9
10							=head1 SYNOPSIS
11
12							use Peptide::Pubmed;
13							$parser = Peptide::Pubmed->new;
14							$in = {
15							PMID => q[15527327],
16							Author => q[Doe JJ, Smith Q],
17							Journal => q[J Biological Foo. 2004;8(2):123-30.],
18							Title => q[Foo, bar and its significance in phage display.],
19							Abstract =>
20							q[Peptide sequences EYHHYNK and Arg-Gly-Asp, but not ACCCGTNA or VEGFRI.],
21							Mesh => q[Genes, p53/genetics; Humans; Bar],
22							Chemical => q[Multienzyme Complexes; Peptide Library; Foo],
23							};
24							$parser->parse_abstract($in);
25
26							# get the peptide sequences in 1 letter symbols (select all words where the
27							# combined word/abstract score is above threshold:
28							# WordAbstScore >= WordAbstScoreMin):
29							@seqs = $parser->get_seqs;
30							print "@seqs\n"; # prints: 'EYHHYNK RGD'
31
32							=head1 EXAMPLES
33
34							# same as above, set threshold explicitly:
35							$parser->WordAbstScoreMin(0.4);
36							@seqs = $parser->get_seqs;
37
38							# set low threshold to get more peptide sequences (but at a cost of getting
39							# more false positives)
40							$parser->WordAbstScoreMin(-1);
41							@seqs = $parser->get_seqs;
42							print "@seqs\n"; # prints: 'EYHHYNK RGD ACCCGTNA VEGFRI'
43
44							# reset threshold back:
45							$parser->WordAbstScoreMin(0.4);
46
47							# get more data for the abstract:
48							$abst = $parser->get_abst;
49							print "$abst->{AbstScore}\n"; # abstract score, in the [0,1] interval
50							print "$abst->{AbstMtext}\n"; # abstract with sequences marked up:
51							# 'Peptide sequences EYHHYNK and Arg-Gly-Asp,
52							# but not ACCCGTNA or VEGFRI.'
53
54							# get more data for the words, in addition to peptide sequences:
55							@words = $parser->get_words;
56							for my $word (@words) {
57							# combined word/abstract score, in the [0,1] interval
58							print "$word->{WordAbstScore}\n";
59							# word as found in the abstract, eg 'Arg-Gly-Asp,'
60							print "$word->{WordOrig}\n";
61							# peptide sequence in 1 letter symbols, eg 'RGD'
62							print "$word->{WordSequence}\n";
63							}
64
65							# There are no mandatory input fields. This will work too, but may give lower score.
66							$in = {
67							Abstract =>
68							q[Peptide sequences EYHHYNK and Arg-Gly-Asp, but not ACCCGTNA or VEGFRI.],
69							};
70							$parser->parse_abstract($in);
71							@words = $parser->get_words;
72
73							# No peptide sequences are found in empty input:
74							$in = undef;
75							$parser->parse_abstract($in);
76							@words = $parser->get_words;
77
78							=head1 DESCRIPTION
79
80							Provides common methods to parse peptide sequences from Pubmed
81							abstracts. The computed variables can be used for classification in
82							external programs.
83
84							For all variables below:
85
86							Varables (Abst\|Word)Is*: Allowed values: 1/0.
87
88							Variables (Abst\|Word)Num*: Allowed values: integer.
89
90							Variables (Abst\|Word)Prop*: Allowed values: real in [0,1].
91
92							Variables (Abst\|Word)Score*: Allowed values: real in [0,1]. Score near
93							1 corresponds to "more relevant" abstracts or words (that is, likely
94							to contain peptide sequences), score near 0 - to "less relevant"
95							abstracts or words.
96
97							Variables (Abst\|Word)* (all other): Allowed values: string, unless
98							otherwise specified below.
99
100							=head2 Input variables
101
102							Input variables can be used optionally without the 'Abst' prefix, so
103							'AbstPMID' and 'PMID' are treated identically.
104
105							AbstPMID: PubMed ID.
106
107							AbstAuthor: article authors.
108
109							AbstJournal: journal citation, with year if available. For format, see
110							examples.
111
112							AbstTitle: article title.
113
114							AbstAbstract: abstract.
115
116							AbstMesh: Medical Subject Headings (MeSH) terms.
117
118							AbstChemical: chemical list.
119
120							=head2 Output variables, for the abstract text
121
122							AbstNumAb: number of matches to words like antibody, epitope, etc.
123
124							AbstNumAllCap: number of all capitalized words in the abstract.
125
126							AbstNumBind: number of matches to words like binds, interacts, etc.
127
128							AbstNumDigest: number of matches to words like Edman, MS/MS, trypsin,
129							etc.
130
131							AbstNumMHC: number of matches to words like MHC, TCR, etc.
132
133							AbstNumPeptide: number of matches to words like peptide, oligopeptide,
134							motif, etc.
135
136							AbstNumPhage: number of matches to words like phage, display, etc.
137
138							AbstNumProtease: number of matches to words like peptidase, cutting,
139							etc.
140
141							AbstNumWords: number of words. Allowed values: integer.
142
143							AbstPropAllCap: proportion of all capitalized words in the abstract (=
144							AbstNumAllCap / AbstNumWords).
145
146							AbstScore: heuristic score to predict whether the abstract contains a
147							peptide sequence, computed based on Abst* variables.
148
149							Other variables:
150
151							AbstComment: free text comment for debugging.
152
153							AbstMtext: abstract with sequences marked using '...'
154							tags.
155
156							=head2 Output variables, for the word
157
158							WordIsDNA: is an all DNA sequence? Example: ACCTTG.
159
160							WordIsDict: is in the dictionary (currently, of english words and of
161							scientific terms, software names and abbreviations)? Example:
162							MATERIALS, RT-PCR.
163
164							WordIsGene: is a gene name, protein name, gene symbol, protein symbol,
165							etc? Example: TFIIA.
166
167							WordSeqLen: peptide length, in amino acids.
168
169							WordOrig: the word as found in the abstract text.
170
171							WordPropDegen: proportion of degenerate amino acids, e.g, 0.6 for
172							AXXXC.
173
174							WordPropProtein: a measure which is positive if a given word
175							composition looks more like a protein sequence than like an english
176							word, and negative otherwise. Computed using frequencies of
177							overlapping k-mers. It is defined as follows: WordPropProtein = sum
178							(over all overlapping k-mers within the word) of (log10Pp -
179							log10Pn). log10Pp is log base 10 of the proportion of the k-mer in the
180							database of known protein sequences, and log10Pn - same, for
181							non-sequences (here, english words from pubmed abstracts not related
182							to peptides). For a word with all k-mers equally frequent among
183							sequences and non-sequences, log10Pp = log10Pn, and WordPropProtein =
184							0. Allowed values: real, [-Inf,+Inf]
185
186							WordScore: heuristic score to predict whether the word contains a
187							peptide sequence, computed based on Word* variables.
188
189							WordAbstScore: heuristic score to predict whether the word contains a
190							peptide sequence, computed based on both Abst* and Word* variables.
191
192							WordSequence: word converted to peptide sequence in 1-letter amino
193							acid symbols.
194
195							Other variables:
196
197							WordAaSymbols: amino acid code. Allowed values: 1 (1 letter), 3 (3
198							letter or full name). Note that separate handling of 3 letter symbols
199							and full names is currently not implemented.
200
201							WordIdx: word index in the abstract. Allowed values: nonnegative
202							integer.
203
204							=head1 NOTES
205
206							=head2 WordPropProtein
207
208							Note that to optimize classification using frequencies of k-mers, k
209							should be chosen so that for a given text, there are 'not too many'
210							empty cells, that is, 'not too many; k-mers that did not occur. For a
211							typical english text, with the combined text length of 100,000,
212							alphabet size of 26 (A-Z, case-insensitive), k=3 is a good choice,
213							because there are 26 ** 3 = 17576 different k-mers, and the expected
214							frequency of each k-mer is 5.7. Actually, the expected frequency is
215							somewhat less because of the effect of the word boundaries. That is,
216							text: 'foobletch' contains more 3-mers than 'foo bletch' because after
217							splitting on whitespace, 'oob', 'obl' do not occur in 'foo bletch'.
218
219							=head2 Variable and method names
220
221							By convention, method, variable and keys names like these:
222							VarNamesLikeThese are used for cases where the corresponding field
223							names may be printed in the output table, such as the rdb table in
224							parse_file(). For the rest of the names, var_names_like_these are
225							used.
226
227							=head1 KNOWN BUGS
228
229							=head2 False negatives
230
231							=head3 Peptide length cutoff
232
233							Peptide length cutoff is 20 amino acids. This is a somewhat arbitary
234							choice. In various sources, cutoffs between 15 and 50 amino acids are
235							used to define oligopeptides.
236
237							=head2 False positives
238
239							=head3 Gene symbols, english words, scientific terms and abbreviations
240
241							Some of these were misclassified as peptide sequence, even though this
242							code uses several dictionaries to find such non-sequence words.
243
244							=head2 Incorrectly parsed sequence
245
246							The recommendations of IUPAC can be found in: Nomenclature and
247							Symbolism for Amino Acids and Peptides,
248
249							http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html
250
251							http://www.chem.qmul.ac.uk/iupac/AminoAcid/A1416.html
252
253							They are not always followed in the published abstracts. More flexible
254							input rules are thus allowed for peptide sequences. However, the
255							following bugs may occur in parsed sequences.
256
257							=head3 Amino acid position and the number of repetitions are not
258							resolved
259
260							Y(n) usually means amino acid Y at position n, but sometimes also
261							means Y repeated n times. It is always resolved as Y, and n is
262							ignored. However, X(n), where X is 'X', 'Xaa', 'Xxx', etc, usually
263							means 'any amino acid, repeated n times. It is always resolved as X
264							repeated n times.
265
266							=head1 REFERENCES
267
268							=head2 ADAM
269
270							This section refers to ADAM, on which Peptide::DictionaryAbbreviations
271							is based. See http://arrowsmith.psych.uic.edu. I would like to thank
272							ADAM authors (in particular, Neil Smalheiser) for graciously providing
273							ADAM.
274
275							ADAM citation:
276
277							Zhou W, Torvik VI, Smalheiser NR. ADAM: Another Database of
278							Abbreviations in MEDLINE. Bioinformatics 2006; 22(22): 2813-2818.
279
280							ADAM OVERVIEW
281
282							ADAM is an abbreviation database generated from the 2006
283							baseline of MEDLINE. It covers frequently used abbreviations
284							and their definitions (or long-forms), including both
285							acronyms and non-acronym abbreviations.
286							Reference
287
288							Zhou W, Torvik VI, Smalheiser NR. ADAM: Another Database
289							of Abbreviations in MEDLINE. Bioinformatics 2006; 22(22): 2813-2818.
290
291							ADAM Copyright
292
293							University of Illinois at Chicago, 2006.
294
295							ADAM License
296
297							By using this software, you expressly agree that your use will be
298							noncommercial, that you will not use this software to make money, and that
299							you will not distribute the software to anyone else or let anyone else use
300							it. Moreover, you will give credit to the University of Illinois and Dr.
301							Smalheiser as the author of the software. The software is provided "as is"
302							and without warranties of any kind, express or implied, including but not
303							limited to the implied warranties of merchantability and fitness, and
304							statutory warranties of noninfringement.
305
306							=head1 AUTHOR
307
308							Timur Shtatland, tshtatland at mgh dot harvard dot edu
309
310							Copyright (C) 2007 by The General Hospital Corporation.
311
312							This program is free software; you can redistribute it and/or modify
313							it under the same terms as Perl itself.
314
315							See http://www.perl.com/perl/misc/Artistic.html
316
317							=head1 SEE ALSO
318
319							RDB - a fast, portable, relational database management system without
320							arbitrary limits, (other than memory and processor speed) that runs
321							under, and interacts with, the UNIX Operating system, by Walter
322							V. Hobbs.
323
324							=head1 APPENDIX
325
326							The rest of the documentation details the methods.
327
328							=cut
329
330							# for printing diagnostic msgs when the module is loaded.
331							# use $obj->{verbose} for object related diagnostic msgs.
332							our $VERBOSE;
333							BEGIN {
334	4		50	4		185	$VERBOSE \|\|= $ENV{VERBOSE} \|\| $ENV{TEST_VERBOSE} \|\| 0;
			33
335							}
336
337	4			4		13368	use Peptide::DictionaryAbbreviations;
	4					9
	4					179
338	4			4		65719	use Peptide::DictionaryEnglish;
	4					13
	4					171
339	4			4		9878	use Peptide::DictionaryFrequent;
	4					11
	4					133
340	4			4		2507	use Peptide::DictionaryScientific;
	4					11
	4					109
341	4			4		21995	use Peptide::Gene;
	4					13
	4					146
342	4			4		29337	use Peptide::Sequence;
	4					13
	4					212
343	4			4		27601	use Peptide::NonSequence;
	4					14
	4					164
344	4			4		7349	use Data::Dumper;
	4					42794
	4					298
345	4			4		36	use Carp;
	4					8
	4					316
346	4			4		21	use warnings;
	4					9
	4					134
347	4			4		20	use strict;
	4					8
	4					196
348
349	4			4		22	use constant LOG_10_2 => log(2) / log(10); # empirically derived weight for WordPropProtein
	4					7
	4					43886
350							our $MAX_PEPTIDE_LENGTH = 20; # Somewhat arbitrary cutoff, consistent with IUPAC. Find the peptides sequences not longer than this cutoff.
351
352							# word / score assignment
353							#
354
355							# for WordScore()
356
357							my %Dict = map { $_ => 1 }
358							splitln(Peptide::DictionaryAbbreviations->words),
359							splitln(Peptide::DictionaryEnglish->words),
360							splitln(Peptide::DictionaryFrequent->words),
361							splitln(Peptide::DictionaryScientific->words);
362							my %Gene = map { $_ => 1 }
363							splitln(Peptide::Gene->words);
364
365							# for WordPropProtein()
366
367							my %log_prop_sequence = map /(\S+)\s+(\S+)\s*\n/g, Peptide::Sequence->log_prop_kmers;
368							my %log_prop_nonsequence = map /(\S+)\s+(\S+)\s*\n/g, Peptide::NonSequence->log_prop_kmers;
369
370							if ($VERBOSE > 3) {
371							foreach (sort keys %log_prop_sequence) {
372							print join("\t", $_, $log_prop_sequence{$_}, $log_prop_nonsequence{$_}), "\n";
373							}
374							}
375
376							{
377							my %args_default
378							= (
379							AbstScoreMin => 0.2,
380							WordScoreMin => 0.2,
381							WordAbstScoreMin => 0.2,
382							WordNumPrintMin => -1,
383							WordNumPrintMax => -1,
384							in_col => [ qw(PMID Author Journal Title Abstract Mesh Chemical) ],
385							kmer_length => 3,
386							print_col => [ qw(
387							AbstPMID AbstAuthor AbstJournal AbstTitle AbstAbstract AbstMesh AbstChemical
388
389							AbstNumAb AbstNumAllCap AbstNumBind AbstNumDigest AbstNumMHC AbstNumPeptide AbstNumPhage AbstNumProtease AbstNumWords AbstPropAllCap AbstScore AbstComment AbstMtext
390
391							WordAbstScore WordIsDNA WordIsDict WordIsGene WordSeqLen WordOrig WordPropDegen WordPropProtein WordScore WordSequence WordAaSymbols WordIdx
392
393							)],
394							print_colf => undef,
395							verbose => 1,
396							);
397
398							=head2 new
399
400							Args : %named_parameters:
401
402							AbstScoreMin : print abstract data if AbstScore >= this threshold.
403							WordScoreMin : print word data if WordScore >= this threshold.
404							WordAbstScoreMin : print word data if WordAbstScore >= this threshold.
405							Allowed values: real in [0,1] interval, or -1.
406							Set to -1 to undefine these, and print all that satisfy
407							PrintMin, PrintMax thresholds.
408
409							WordNumPrintMin : min number of words to print per abstract.
410							Allowed values: 1 and -1
411							WordNumPrintMax : max number of words to print per abstract.
412							Allowed values: non-negative integer, or -1.
413							Set to -1 to undefine these, and print all that satisfy
414							ScoreMin thresholds.
415
416							If WordNumPrintMin = 1, at least one line is printed
417							per each abstract for which AbstScore satisfies
418							AbstScoreMin threshold. That is, 1 line is printed
419							even if no sequences were found (many Word* vars will
420							be empty or 0 when this happens) or if no sequences
421							satisfy WordScore / WordScoreMin threshold. If there
422							are sequences found, at least 1 line is printed (which
423							may or may not satisfy WordScore / WordScoreMin
424							threshold), and the rest are printed only if they
425							satisfy it. If WordNumPrintMax = N, where N is a
426							positive integer, only the first N words are printed
427							(_not_ the best N words by score). A special case is
428							(WordNumPrintMin => 1, WordNumPrintMax => 1), which
429							prints exactly 1 line per abstract. Word* vars are not
430							computed, which make the code much faster. An even
431							more special case is (AbstScoreMin => -1, WordScoreMin
432							=> -1, WordAbstScoreMin => -1, WordNumPrintMin => 1,
433							WordNumPrintMax => 1), which prints all abstracts, 1
434							abstract per line, and computes Abst, but not Word
435							vars, thus allowing to separate computation into 2
436							steps: compute and select abstracts with good
437							AbstScore, then compute and select abstracts and words
438							with good WordAbstScore, WordScore.
439
440							in_col : columns for input
441							kmer_length : for computing WordPropProtein: => 3,
442
443							print_col : columns for printing
444							print_colf : column formats for rdb table, eg [qw(50S 10N)];
445							default: undef, to be determined automatically from col names.
446							verbose : verbosity level for diagnostic msgs. Allowed values: 0..4.
447							Example : my $parser = Peptide::Pubmed->new(verbose => $verbose) or
448							carp "not ok: new Peptide::Pubmed" and return;
449							Description : bare constructor. All work done by init().
450							Returns : TRUE if successful, FALSE otherwise.
451
452							=cut
453
454							sub new {
455	9			9	1	11553	my $class = shift;
456	9					54	my %args = @_;
457	9					55	foreach (sort keys %args) {
458	39	50	0			101	exists $args_default{$_} or carp "not ok: arg=$_ is not allowed in new()" and
459							return;
460							}
461	9		33			149	my $self = bless { %args_default, %args }, ref($class) \|\| $class;
462	9	50				41	$self->init or return;
463	9					37	return $self;
464							}
465							}
466
467							=head2 get_abst
468
469							Args : none
470							Example : $abst = $parser->get_abst;
471							Description : get the output data for Abst* variables.
472							Returns : a ref to hash with Abst* variables if successful,
473							ref to empty hash otherwise.
474
475							=cut
476
477							sub get_abst {
478	1			1	1	5	my ($self) = @_;
479	1					3	my $abst = {};
480	1					3	foreach my $key (grep /^Abst/, @{ $self->{abst_col} } ) {
	1					37
481	20	50				88	$abst->{$key} = $self->{abst}{$key} if exists $self->{abst}{$key};
482							}
483	1					6	return $abst;
484							}
485
486							=head2 get_words
487
488							Args : none
489							Example : # get data for words likely to contain peptide sequences:
490							@words = $parser->get_words;
491							Description : get the output data for Words* variables.
492							Returns : an array where each element is a ref to hash with
493							Word* variables output data for 1 word. Only words
494							that satisfy the threshold: combined word/abstract score
495							WordAbstScore greater than or equal to WordAbstScoreMin,
496							are included, otherwise an empty array is returned.
497							To return all words, set WordAbstScoreMin to -1.
498
499							=cut
500
501							sub get_words {
502	2			2	1	1082	my ($self) = @_;
503	2					4	my @words = ();
504	2					3	foreach my $rh_word ( @{ $self->{words} } ) {
	2					6
505	19	100				51	next unless $rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin};
506	2					5	my $rh_word_out = {};
507	2					3	foreach my $key (grep /^Word/, @{ $self->{words_col} } ) {
	2					22
508	24	50				91	$rh_word_out->{$key} = $rh_word->{$key} if exists $rh_word->{$key};
509							}
510	2					9	push @words, $rh_word_out;
511							}
512	2					15	return @words;
513							}
514
515
516							=head2 get_seqs
517
518							Args : none
519							Example : @seqs = $parser->get_seqs;
520							Description :
521							Returns : a list with sequences for all words in the abstract,
522							empty list if none found.
523
524							=cut
525
526							sub get_seqs {
527	3			3	1	12	my ($self) = @_;
528	3					6	my @seqs = ();
529	3					4	foreach my $rh_word ( @{ $self->{words} } ) {
	3					7
530	57	100				131	next unless $rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin};
531	23	100				52	push @seqs, $rh_word->{WordSequence} if $rh_word->{WordSequence};
532							}
533	3					15	return @seqs;
534							}
535
536
537							=head2 WordAbstScoreMin
538
539							Args : (optional) new value
540							Example : $parser->WordAbstScoreMin(0.4);
541							$parser->WordAbstScoreMin;
542							Description : assign to WordAbstScoreMin a new value, if called with an argument.
543							Returns : WordAbstScoreMin value (after assignment)
544
545							=cut
546
547							sub WordAbstScoreMin {
548	3			3	1	1994	my ($self, $val) = @_;
549	3	50				10	if (@_ == 2) {
550	3					7	$self->{WordAbstScoreMin} = $val;
551							}
552	3					7	return $self->{WordAbstScoreMin};
553							}
554
555							=head2 init
556
557							Args : none
558							Example : $parser->init or return;
559							Description : check to see if the entry is ok. Initialize several fields used for
560							printing, such as column definitions.
561							Returns : always TRUE
562
563							=cut
564
565							sub init {
566	9			9	1	20	my ($self) = @_;
567	9	50				44	unless (defined $self->{print_colf}) {
568							# assign to numeric or text
569	288	100				859	$self->{print_colf} = [
570							map {
571	9					24	/^(Abst\|Word\|AbstWord)(Idx\|Is\|Num\|Prop\|Score)([A-Z]\|$)/ ?
572							"10N" : "50S"
573							}
574	9					14	@{ $self->{print_col} }
575							]
576							}
577							# cols to be printed are either words cols, or abstract cols
578	9					30	$self->{abst_col} = [ grep { /^Abst/ } @{ $self->{print_col} } ];
	288					555
	9					23
579	9					16	$self->{words_col} = [ grep { /^Word/ } @{ $self->{print_col} } ];
	288					489
	9					20
580	9	50				35	if ($self->{verbose} > 2) {
581	0					0	foreach (sort keys %{ $self }) {
	0					0
582	0					0	warn "$_ => $self->{$_}";
583							}
584							}
585	9					31	return 1;
586							}
587
588							=head2 splitln
589
590							Arg[1] : string of items, one item per line.
591							Arg[2] : (optional) %named_parameters:
592							lc : convert to lowercase? 1 = yes, 0 = no. Default = 1.
593							Example : my %words = map { $_ => 1 } splitln("foo\nbar\n", lc => 0);
594							my %words = map { $_ => 1 } $this->splitln("foo\nbar\n");
595							Description : parse input string into list of items, remove leading and trailing
596							whitespace, remove empty lines, convert to lc. Can be used as a
597							method call or direct call.
598							Returns : list of items
599
600							=cut
601
602							sub splitln {
603	20	50		20	1	135	ref $_[0] and shift;
604	20					57	my $str = shift;
605	20					106	my %args = (lc => 1, @_);
606	303664					529571	my @lines =
607	303664					832295	grep { $_ ne '' }
608	20	50				75369	map { s/^\s+\|\s+$//g; $args{lc} ? lc $_ : $_ }
	303664					726980
609							split /\n/, $str;
610	20					96838	return @lines;
611							}
612
613							=head2 parse_file
614
615							Args : %named_parameters:
616							mandatory:
617							in_fname : input file name.
618							out_fname : output file name.
619							Example : $parser->parse_file(in_fname => $in_fname, out_fname => $out_fname);
620							Description : reads input and write output, both in rdb
621							format. Input data is 1 abstract per line. Prints data
622							for all sequences found, 1 sequence per line. Skips
623							abstracts and words based on rules described in
624							new(). Because it prints data for 1 word per line, the
625							same abstract data is printed for all sequences from
626							the same abstract.
627							Returns : TRUE if successful, FALSE otherwise.
628
629							=cut
630
631							sub parse_file {
632	2			2	1	1224	my ($self, %args) = @_;
633	2					7	foreach (qw(in_fname out_fname)) {
634	4					13	$self->{$_} = $args{$_};
635							}
636	2					6	my $in_fh;
637							my $out_fh;
638	2	50	0			81	open $in_fh, $self->{in_fname} or carp "not ok: open $self->{in_fname}: $!" and return;
639	2	50	0			4659	open $out_fh, ">$self->{out_fname}" or
640							carp "not ok: open $self->{out_fname}: $!" and return;
641	2					13	my ($comment, $ra_col, $ra_colf) = read_rdb_header($in_fh);
642	2					12	print_rdb_header($out_fh, $comment, $self->{print_col}, $self->{print_colf});
643	2					28	while (<$in_fh>) { # for 1 pubmed entry (=abstract)
644	3					6	chomp;
645	3	50	33			15	print STDERR "processing line $.\n" if ($self->{verbose} > 1 and not $. % 500);
646	3					7	my $rh_pm_in = {}; # input data for 1 abstract
647	3					39	@{$rh_pm_in}{ @$ra_col } = split "\t", $_;
	3					26
648	3	50	0			12	my $rh_pm_out = $self->parse_abstract($rh_pm_in) or
649							carp "not ok: parse_abstract" and return;
650	3	50				20	next unless $rh_pm_out->{abst}{AbstScore} >= $self->{AbstScoreMin};
651	3					5	my %seen;
652	3					5	my $num_words = 0;
653	3					7	foreach my $rh_word (@{ $rh_pm_out->{words} }) {
	3					7
654	593					808	$num_words++;
655	593	50	66			2479	last if $self->{WordNumPrintMax} != -1 and $num_words > $self->{WordNumPrintMax};
656	593	100	33			2007	if ($self->{WordNumPrintMin} == -1 \|\|
			66
657							($self->{WordNumPrintMin} == 1 && $num_words > $self->{WordNumPrintMin}) ) {
658	591	50	33			6625	next unless $rh_word->{WordScore} >= $self->{WordScoreMin} &&
659							$rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin};
660	591	100	100			2052	next if defined $rh_word->{WordSequence} and
661							$seen{ $rh_word->{WordSequence} }++;
662							}
663	36992	100				77993	my %line = map { tr/\t/ /; $_ } # tab is used as the delimiter
	36992					76408
	6936					22263
664							(
665	578					1437	map( { $_ => defined $rh_word->{$_} ? $rh_word->{$_} : '' }
666	11560	50				33892	@{ $self->{words_col} }),
667	578					1240	map( { $_ => defined $rh_pm_out->{abst}{$_} ? $rh_pm_out->{abst}{$_} : '' }
668	578					916	@{ $self->{abst_col} }),
669							);
670	578					7012	print $out_fh join("\t", @line{ @{ $self->{print_col} } }), "\n";
	578					260511
671							}
672							}
673	2	50	0			157	close $out_fh or carp "not ok: close $self->{out_fname}: $!" and return;
674	2	50	0			22	close $in_fh or carp "not ok: close $self->{in_fname}: $!" and return;
675	2					35	return 1;
676							}
677
678							=head2 read_rdb_header
679
680							Args : input rdb file handle
681							Example : my ($comment, $ra_col, $ra_colf) = read_rdb_header($in_fh);
682							Description : Reads header of an rdb table.
683							Returns : Returns comment (or empty string), ref to array of column names,
684							reference to array of column definitions.
685
686							=cut
687
688							sub read_rdb_header {
689	2			2	1	5	my ($in_fh) = @_;
690	2					5	my ($comment, $ra_col, $ra_colf);
691	2					64	while (<$in_fh>) {
692	2	50	0			10	$comment .= $_ and next if /^\#/; # table comments (optional)
693	2					7	chomp;
694	2					14	@$ra_col = split /\t/; # column names (eg, Hit, Expect)
695	2	50				11	last unless defined ($_ = <$in_fh> );
696	2					3	chomp;
697	2					11	@$ra_colf = split /\t/; # column formats (eg, 10N, 50S)
698	2					5	last;
699							}
700	2		50			20	return $comment \|\| '', $ra_col, $ra_colf;
701							}
702
703							=head2 print_rdb_header
704
705							Args : output rdb file handle, comment (or undef), ref to array of
706							column names, reference to array of column definitions.
707							Example : print_rdb_header($out_fh, $comment, $self->{print_col},
708							$self->{print_colf});
709							Description : Prints header of an rdb table. If comment is undefined,
710							it is not printed.
711							Returns : always TRUE.
712
713							=cut
714
715							sub print_rdb_header {
716	2			2	1	5	my ($out_fh, $comment, $ra_col, $ra_colf) = @_;
717	2	50				11	print $out_fh $comment if defined $comment;
718	2					31	print $out_fh join( "\t", @$ra_col ), "\n";
719	2	50				18	print $out_fh join( "\t", @$ra_colf ), "\n" if defined $ra_colf;
720	2					6	return 1;
721							}
722
723							=head2 parse_abstract
724
725							Args : ref to hash with input data for one pubmed abstract
726							Example : $parser->parse_abstract($rh_pm_in);
727							Description : does all parsing for 1 abstract by calling init_abstract, AbstVars.
728							Returns : ref to hash with parsed data if successful, FALSE if error.
729							If no sequences are found, the hash will have a single sequence
730							consisting of an empty string (this is not considered an error).
731
732							=cut
733
734							sub parse_abstract {
735	12			12	1	3571	my ($self, $rh_pm_in) = @_;
736	12					21	my $rh_pm_out = {};
737	12	50				35	$self->init_abstract($rh_pm_in) or return;
738	12	50				33	$self->AbstVars or return;
739	12					24	%$rh_pm_out = map { $_ => $self->{$_} } qw(words abst);
	24					91
740	12	50				42	carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self']))
741							if $self->{verbose} > 2;
742	12					45	return $rh_pm_out;
743							}
744
745							{
746							my %abst_default =
747							(
748							AbstComment => undef,
749							AbstMtext => undef,
750							AbstNumAb => 0,
751							AbstNumAllCap => 0,
752							AbstNumBind => 0,
753							AbstNumDigest => 0,
754							AbstNumMHC => 0,
755							AbstNumPeptide => 0,
756							AbstNumPhage => 0,
757							AbstNumProtease => 0,
758							AbstNumWords => 0,
759							AbstPropAllCap => 0,
760							AbstScore => 0,
761							);
762
763							=head2 init_abstract
764
765							Args : ref to hash with input data with field names 'Title', 'Abstract', etc
766							(optionally, with prefix 'Abst', eg 'AbstTitle', 'AbstAbstract').
767							See field names listed in in_col.
768							Example : $rh_pm_in = { Title => q[Some title.], Abstract => q[Some abstract.] };
769							$self->init_abstract($rh_pm_in) or return;
770							Description : Reads the input data and stores in 'Abst*' fields
771							('Abst' prefix is added if needed). Adds prefix 'Abst'
772							to field names if needed. Converts to lower case and
773							stores text in AbstLc* fields. Undefined fields in the
774							input are converted to empty strings and
775							stored. Currently, there are no mandatory fields:
776							undefined $rh_pm_in is not considered an error. In
777							such case, all fields will be empty strings. No
778							sequences will be found, and the default score (zero)
779							will be assigned.
780							Returns : TRUE if successful, FALSE otherwise.
781
782							=cut
783
784							sub init_abstract {
785	12			12	1	16	my ($self, $rh_pm_in) = @_;
786							# reset abst and words data from the prev parsed abstract, if any
787	12					127	$self->{abst} = { %abst_default };
788	12					89	$self->{words} = [];
789	12					93	my @in_col = @{ $self->{in_col} };
	12					46
790	12					16	foreach (@{ $self->{in_col} }) {
	12					34
791	84					294	$self->{abst}{"Abst$_"} = do {
792	84	100				213	if (defined $rh_pm_in->{$_}) {
		50
793	40					127	$rh_pm_in->{$_}
794							} elsif (defined $rh_pm_in->{"Abst$_"}) {
795	0					0	$rh_pm_in->{"Abst$_"}
796							} else {
797	44					151	''
798							}
799							};
800							}
801							# mandatory field:
802							#my @missing = grep { $self->{abst}{$_} eq '' } qw(AbstAbstract);
803							#@missing and carp "not ok: missing fields: @missing" and return;
804	12		50			190	$self->{abst}{AbstAllText} =
805							join(";; ",
806							map "$_: $self->{abst}{$_}",
807							qw(AbstTitle AbstAbstract AbstMesh AbstChemical)
808							) \|\| '';
809	12					84	$self->{abst}{AbstLcAllText} = lc $self->{abst}{AbstAllText};
810	12					55	$self->{abst}{AbstLcAbstract} = lc $self->{abst}{AbstAbstract};
811	12					47	return 1;
812							}
813							}
814
815							=head2 AbstVars
816
817							Args : none
818							Example : $parser->AbstVars or return;
819							Description : Calls all Abst* methods that compute the corresponding variables,
820							eg AbstNumAb. Calls Words which in turn calls Word* methods.
821							Returns : TRUE if successful, FALSE otherwise.
822
823							=cut
824
825							sub AbstVars {
826	12			12	1	18	my ($self) = @_;
827	12	50				66	$self->AbstNumAb or return;
828							# AbstNumAllCap is computed in AbstScore()
829	12	50				31	$self->AbstNumBind or return;
830	12	50				46	$self->AbstNumDigest or return;
831	12	50				36	$self->AbstNumMHC or return;
832	12	50				32	$self->AbstNumPeptide or return;
833	12	50				30	$self->AbstNumPhage or return;
834	12	50				30	$self->AbstNumProtease or return;
835	12	50				32	$self->AbstScore or return;
836	12	50				31	$self->Words or return;
837	12	50				45	$self->WordAbstScore or return;
838	12	50				52	$self->AbstMtext or return;
839	12					27	foreach (qw(AbstMaxWordScore AbstPropAllCap AbstScore)) {
840	36					98	$self->{abst}{$_} = formatScoreProp( $self->{abst}{$_} );
841							}
842	12					20	foreach my $rh_word ( @{ $self->{words} }) {
	12					26
843	1126					1430	foreach (qw(WordScore WordAbstScore WordPropDNA WordPropDegen)) {
844	4504					8734	$rh_word->{$_} = formatScoreProp( $rh_word->{$_} );
845							}
846							}
847	12	50				48	if ($self->{verbose} > 0) {
848	61					220	$self->{abst}{AbstComment} .= join '; ', map {
849	12					42	defined $self->{abst}{"ra_$_"} &&
850							ref $self->{abst}{"ra_$_"} eq 'ARRAY' ?
851	84	100	66			560	"$_='" . join(", ", @{ $self->{abst}{"ra_$_"} }) . "'" : "$_=''"
852							}
853							qw(
854							AbstNumAb AbstNumBind AbstNumDigest AbstNumMHC AbstNumPeptide
855							AbstNumPhage AbstNumProtease
856							);
857							}
858	12					50	return 1;
859							}
860
861							=head2 AbstNum*
862
863							All AbstNum* methods use the same general scheme, unless specified otherwise. Each method refers to a specific class of patterns, eg AbstNumAb() refers to antibody related patterns. Each method looks searches the AbstLcAllText field (concatenated abstract, mesh terms, chemical terms) for matches to the class of patterns. The matches are stored in an array ref (eg $self->{abst}{ra_AbstNumAb}). The number of matches is stored in a scalar ref (eg, $self->{abst}{AbstNumAb}).
864
865							Pattern matching is done in several steps: patterns that can match anywhere in the word, and patterns that must match the entire word (typically shorter patterns, like 'mab' or 'fab'). Often, there are additional steps. Step 1 pattern matches are mandatory - if there are no matches at step 1, then step 2 is skipped. If step 1 succeeds, then step 2 (optional) pattern matches are done. For example, for AbstNumPeptide(), step 1 patterns include 'peptid', step 2 - 'synthe'. This is because 'synthe' by itself often refers to non-peptides, but together with 'peptid' matches often refers to peptides.
866
867							=cut
868
869							=head2 AbstNumAb
870
871							Args : none
872							Example : $parser->AbstNumAb or return;
873							Description : See package DESCRIPTION above.
874							Returns : TRUE if successful, FALSE otherwise.
875
876							=cut
877
878							sub AbstNumAb {
879	12			12	1	20	my ($self) = @_;
880	12	50				84	return unless defined $self->{abst}{AbstLcAllText};
881	12					219	(my $var = (caller(0))[3]) =~ s/.*:://;
882	12					28	my @matches;
883	12					32	for ($self->{abst}{AbstLcAllText}) {
884	12					663	push @matches,
885							m[
886							adjuvant\|
887							antibod\|
888							antigen\|
889							antiser\|
890							epitope\|
891							freund\|
892							heavy\ chain\|
893							immun\|
894							light\ chain\|
895							mimotope\|
896							monoclonal\|
897							phagotope\|
898							polyclonal\|
899							recombinant\ fab\|
900							vaccin\|
901							variable\ region\|
902							variable\ regions
903							]gx,
904							m[
905							\b(
906							mab\|
907							mabs\|
908							rfab\|
909							sera\|
910							serum
911							)\b
912							]gx;
913	12					31	$self->{abst}{$var} = @matches;
914	12					53	$self->{abst}{"ra_$var"} = [ @matches ];
915							}
916	12					42	return 1;
917							}
918
919							=head2 AbstNumBind
920
921							Args : none
922							Example : $parser->AbstNumBind or return;
923							Description : See package DESCRIPTION above.
924							Returns : TRUE if successful, FALSE otherwise.
925
926							=cut
927
928							sub AbstNumBind {
929	12			12	1	15	my ($self) = @_;
930	12	50				38	return unless defined $self->{abst}{AbstLcAllText};
931	12					90	(my $var = (caller(0))[3]) =~ s/.*:://;
932	12					21	my @matches;
933	12					27	for ($self->{abst}{AbstLcAllText}) {
934	12					652	push @matches,
935							m[
936							abolish\|
937							activate\|
938							adher\|
939							adhes\|
940							affinit\|
941							agonise\|
942							agonist\|
943							agonize\|
944							associat\|
945							bind\|
946							block\|
947							bound\|
948							compet\|
949							disrupt\|
950							dissociat\|
951							inhibit\|
952							interact\|
953							interfere\|
954							ligand\|
955							prevent\|
956							recogn\|
957							regulat\|
958							suppress
959							]gx,
960							m[
961							\b(
962							ec50\|
963							ic50\|
964							kd\|
965							ki
966							)\b
967							]gx;
968	12					24	$self->{abst}{$var} = @matches;
969	12					51	$self->{abst}{"ra_$var"} = [ @matches ];
970							}
971	12					39	return 1;
972							}
973
974							{
975							# older papers are more likely to have data on protein digestion
976							# into peptides which are relevant only to the protein,
977							# rather than to the peptides.
978							# for these time intervals, multiply by this factor
979							my %AbstNumDigestScoreForYear;
980							for (1900..2100) {
981							if ($_ > 1993) {
982							$AbstNumDigestScoreForYear{$_} = 1;
983							} elsif ($_ > 1991) {
984							$AbstNumDigestScoreForYear{$_} = 1.5;
985							} elsif ($_ > 1990) {
986							$AbstNumDigestScoreForYear{$_} = 2;
987							} elsif ($_ > 1989) {
988							$AbstNumDigestScoreForYear{$_} = 3;
989							} else { # <= 1989
990							$AbstNumDigestScoreForYear{$_} = 4;
991							} ## stopped here
992							}
993
994							=head2 AbstNumDigest
995
996							Args : none
997							Example : $parser->AbstNumDigest or return;
998							Description : See package DESCRIPTION above.
999							Returns : TRUE if successful, FALSE otherwise.
1000
1001							=cut
1002
1003							sub AbstNumDigest {
1004	12			12	1	17	my ($self) = @_;
1005	12	50				43	return unless defined $self->{abst}{AbstLcAllText};
1006	12					92	(my $var = (caller(0))[3]) =~ s/.*:://;
1007	12					23	my @matches;
1008	12					28	for ($self->{abst}{AbstLcAllText}) {
1009							# do not shorten to 'trypt' to avoid confusion with tryptophan
1010	12					364	push @matches,
1011							m[
1012							borohydride\|
1013							complete\ amino\ acid\ sequence\|
1014							chromatogra\|
1015							cnbr\|
1016							cyanogen\ bromide\|
1017							dansyl\|
1018							digest\|
1019							edman\|
1020							hplc\|
1021							lsims\|
1022							maldi-tof\|
1023							mass\ spec\|
1024							matrix-assisted\ laser\ desorption\|
1025							nabh4\|
1026							peptide\ map\|
1027							protease\ digestion\|
1028							proteolytic\ digestion\|
1029							rplc\|
1030							spectrometry,\ mass\|
1031							spectrum analysis,\ mass\|
1032							thermolysin\|
1033							thermolytic\|
1034							tryptic\|
1035							trypsin\|
1036							v-8\ prote\|
1037							v8\ prote
1038							]gx,
1039							m[
1040							\b(
1041							ms
1042							)\b
1043							]gx;
1044	12					28	$self->{abst}{$var} = @matches;
1045	12					35	$self->{abst}{"ra_$var"} = [ @matches ];
1046	12					31	for ($self->{abst}{AbstJournal}) {
1047	12	100	66			145	$self->{abst}{$var} *= $AbstNumDigestScoreForYear{$1} if
			66
1048							defined $_ and /\b((19\|20)\d\d)\b/ and $AbstNumDigestScoreForYear{$1};
1049							}
1050	12					62	$self->{abst}{$var} = sprintf "%.0f", $self->{abst}{$var};
1051							}
1052	12					39	return 1;
1053							}
1054							}
1055
1056							=head2 AbstNumMHC
1057
1058							Args : none
1059							Example : $parser->AbstNumMHC or return;
1060							Description : See package DESCRIPTION above.
1061							Returns : TRUE if successful, FALSE otherwise.
1062
1063							=cut
1064
1065							sub AbstNumMHC {
1066	12			12	1	58	my ($self) = @_;
1067	12	50				36	return unless defined $self->{abst}{AbstLcAllText};
1068	12					89	(my $var = (caller(0))[3]) =~ s/.*:://;
1069	12					28	my @matches;
1070	12					29	for ($self->{abst}{AbstLcAllText}) {
1071	12					695	push @matches,
1072							m[
1073							alloge\|
1074							allograft\|
1075							antibod\|
1076							antigen\|
1077							b\ cell\|
1078							dendri\|
1079							epitope\|
1080							histocompatibility\|
1081							immun\|
1082							present\|
1083							restrict\|
1084							tolero\|
1085							t\ cell
1086							]gx,
1087							m[
1088							\b(
1089							cd\d{1,2}\|
1090							class\ i{1,2}\|
1091							hla\|
1092							mhc\|
1093							self\|
1094							tcr
1095							)\b
1096							]gx;
1097	12					29	$self->{abst}{$var} = @matches;
1098	12					50	$self->{abst}{"ra_$var"} = [ @matches ];
1099							}
1100	12					40	return 1;
1101							}
1102
1103							=head2 AbstNumPeptide
1104
1105							Args : none
1106							Example : $parser->AbstNumPeptide or return;
1107							Description : See package DESCRIPTION above.
1108							Returns : TRUE if successful, FALSE otherwise.
1109
1110							=cut
1111
1112							sub AbstNumPeptide {
1113	12			12	1	18	my ($self) = @_;
1114	12	50				32	return unless defined $self->{abst}{AbstLcAllText};
1115	12					94	(my $var = (caller(0))[3]) =~ s/.*:://;
1116	12					24	my @matches;
1117							my @matches_neg;
1118							##cc TODO: possibly add matches to /protein/ and /fragment/ in the same sentence.
1119	12					29	for ($self->{abst}{AbstLcAllText}) {
1120							# step 1, mandatory matches
1121	12					162	push @matches,
1122							m[
1123							benzyloxycarbonyl\|
1124							hormone\|
1125							peptid
1126							]gx;
1127	12	100				38	next unless @matches;
1128	6					78	push @matches_neg,
1129							m[
1130							peptidase\|
1131							polypeptide
1132							]gx;
1133	6	50				36	next unless (scalar(@matches) - scalar(@matches_neg)) > 0;
1134							# step 2, optional matches
1135	6					1411	push @matches,
1136							m[
1137							consensus\|
1138							constrain\|
1139							\bcyclic\|
1140							librar\|
1141							linear\|
1142							motif\|
1143							sequenc\|
1144							synthe
1145							]gx,
1146							m[
1147							\b(
1148							ac\|
1149							acetyl\|
1150							amide\|
1151							boc\|
1152							ch2nh\|
1153							conh\|
1154							cyclo\|
1155							dansyl\|
1156							h2n\|
1157							nh\|
1158							nh2\|
1159							ome\|
1160							tos
1161							)\b
1162							]gx;
1163	6					20	$self->{abst}{$var} = scalar(@matches) - scalar(@matches_neg);
1164	6					38	$self->{abst}{"ra_$var"} = [ @matches ];
1165							}
1166	12					40	return 1;
1167							}
1168
1169							=head2 AbstNumPhage
1170
1171							Args : none
1172							Example : $parser->AbstNumPhage or return;
1173							Description : See package DESCRIPTION above.
1174							Returns : TRUE if successful, FALSE otherwise.
1175
1176							=cut
1177
1178							sub AbstNumPhage {
1179	12			12	1	17	my ($self) = @_;
1180	12	50				36	return unless defined $self->{abst}{AbstLcAllText};
1181	12					85	(my $var = (caller(0))[3]) =~ s/.*:://;
1182	12					24	my @matches;
1183							my @matches_neg;
1184	12					31	for ($self->{abst}{AbstLcAllText}) {
1185							# step 1, mandatory matches
1186	12					123	push @matches,
1187							m[
1188							bio-pan\|
1189							biopan\|
1190							phagemid\|
1191							phage\b\|
1192							phages\b
1193							]gx,
1194							m[
1195							\b(
1196							panned\| # match panning related, but not spanning, pan-caspase, etc
1197							panning\|
1198							pannings\|
1199							pans
1200							)\b
1201							]gx;
1202	12	100				36	next unless @matches;
1203	5					25	push @matches_neg,
1204							m[
1205							macrophage
1206							]gx;
1207	5	50				18	next unless (scalar(@matches) - scalar(@matches_neg)) > 0;
1208							# step 2, optional matches
1209	5					162	push @matches,
1210							m[
1211							clone\|
1212							cloni\|
1213							display\|
1214							librar\|
1215							screen\|
1216							select
1217							]gx,
1218							m[
1219							\b(
1220							round\|
1221							rounds
1222							)\b
1223							]gx;
1224	5					14	$self->{abst}{$var} = scalar(@matches) - scalar(@matches_neg);
1225	5					30	$self->{abst}{"ra_$var"} = [ @matches ];
1226
1227							}
1228	12					38	return 1;
1229							}
1230
1231							=head2 AbstNumProtease
1232
1233							Args : none
1234							Example : $parser->AbstNumProtease or return;
1235							Description : See package DESCRIPTION above.
1236							Returns : TRUE if successful, FALSE otherwise.
1237
1238							=cut
1239
1240							sub AbstNumProtease {
1241	12			12	1	19	my ($self) = @_;
1242	12	50				39	return unless defined $self->{abst}{AbstLcAllText};
1243	12					90	(my $var = (caller(0))[3]) =~ s/.*:://;
1244	12					23	my @matches;
1245	12					33	for ($self->{abst}{AbstLcAllText}) {
1246	12					199	push @matches,
1247							m[
1248							aminobenzoyl\|
1249							caspase\|
1250							cathepsin\|
1251							convertase\|
1252							dinitrophenyl\|
1253							nitroanilide\|
1254							peptidase\|
1255							protease\|
1256							proteasom\|
1257							proteinase\|
1258							proteol
1259							]gx;
1260	12	100				41	next unless @matches;
1261							# match: MMP, MMPs MMP1, MMP-1, but not MMPFOO
1262							# match: Dnp-Met
1263	2					847	push @matches,
1264							m[
1265							(?:\b\|[^a-z])(
1266							abz\|
1267							amc
1268							dnp\|
1269							dpa\|
1270							eddnp\|
1271							fmk\|
1272							mca\|
1273							mmp\|
1274							mmps
1275							)(?:\b\|[^a-z])
1276							]gx;
1277	2					6	$self->{abst}{$var} = @matches;
1278	2					13	$self->{abst}{"ra_$var"} = [ @matches ];
1279							}
1280	12					36	return 1;
1281							}
1282
1283							=head2 AbstScore
1284
1285							Args : none
1286							Example : $parser->AbstScore or return;
1287							Description : Computes AbstScore based on the results of Abst* vars in the abstract.
1288							Returns : TRUE if successful, FALSE otherwise.
1289
1290							=cut
1291
1292							sub AbstScore {
1293	12			12	1	17	my ($self) = @_;
1294	12					19	my $rh_abst = $self->{abst};
1295							# Split with saving the original whitespace, which is then added back
1296							# during join after marking up is done, in AbstMtext().
1297							# This preserves the original character positions.
1298	12					596	@{ $rh_abst->{ra_WordOrig} } = split /(\s+)/, $rh_abst->{AbstAbstract};
	12					361
1299
1300	12					69	$rh_abst->{AbstNumWords} = grep /\S/, @{ $rh_abst->{ra_WordOrig} };
	12					572
1301	12	100				16	$rh_abst->{AbstNumAllCap} = @{[grep { /[A-Z]/ and not /[a-z]/ }
	2160					5015
	12					28
1302	12					15	@{ $rh_abst->{ra_WordOrig} } ]};
1303							# initialize:
1304	12					31	$rh_abst->{AbstScore} = 0;
1305							# linear terms:
1306	12					41	$rh_abst->{AbstScore} +=
1307							(-1) * $rh_abst->{AbstNumDigest} + $rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage};
1308	12	50				40	warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2;
1309
1310							# interaction terms (score increases more if both vars increase simultaneously)
1311	12					44	$rh_abst->{AbstScore} +=
1312							$rh_abst->{AbstNumBind} *
1313							($rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage}) +
1314							$rh_abst->{AbstNumProtease} *
1315							($rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage}) +
1316							$rh_abst->{AbstNumPeptide} * $rh_abst->{AbstNumPhage};
1317	12	50				35	warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2;
1318
1319							# abstracts with too many all cap words are more likely to have all cap
1320							# non-sequence words rather than true sequences.
1321	12		100			54	$rh_abst->{AbstPropAllCap} = $rh_abst->{AbstNumAllCap} / ($rh_abst->{AbstNumWords} \|\| 1);
1322	12	100	66			68	if ($rh_abst->{AbstPropAllCap} > 0.3 \|\| $rh_abst->{AbstNumAllCap} > 30) {
1323	2					6	$rh_abst->{AbstScore} *= 0.01; # strong penalty if above threshold
1324							} else {
1325	10					23	$rh_abst->{AbstScore} *= (1 - $rh_abst->{AbstPropAllCap});
1326							}
1327	12	50				42	warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2;
1328							# AbstScore < 0 can happen for hi AbstNumDigest:
1329	12	50				31	$rh_abst->{AbstScore} = 0 if $rh_abst->{AbstScore} < 0;
1330							# transform (0, +inf) to (0,1):
1331	12					30	$rh_abst->{AbstScore} = $rh_abst->{AbstScore} / (1 + $rh_abst->{AbstScore});
1332	12	50				29	warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2;
1333	12	50				29	carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self']))
1334							if $self->{verbose} > 2;
1335	12					32	return 1;
1336							}
1337
1338							{
1339
1340							my %word_default =
1341							(
1342							WordAbstScore => 0,
1343							WordAaSymbols => 0,
1344							WordIsDNA => 0,
1345							WordIsDNALen => 0,
1346							WordIsDict => 0,
1347							WordIsGene => 0,
1348							WordIsRomanLen => 0,
1349							WordNumDegen => 0,
1350							WordOrig => undef,
1351							WordPropDNA => 0,
1352							WordPropDegen => 0,
1353							WordPropProtein => 0,
1354							WordSeqLen => 0,
1355							WordScore => 0,
1356							WordSequence => undef,
1357							WordSubLenMax => 0,
1358							##cc provide defaults for all , to be used for 3 letter and full names of aa.
1359							);
1360
1361							=head2 Words
1362
1363							Args : none
1364							Example : $parser->Words or return;
1365							Description : Parses all peptide sequences from abstract text in AbstAbstract.
1366							Calls the necessary methods, and stores results in array ref
1367							$parser->{words}.
1368							Returns : TRUE if successful, FALSE otherwise.
1369
1370							=cut
1371
1372							sub Words {
1373	12			12	1	24	my ($self) = @_;
1374	12					16	my @words; # each el is a ref to hash with data
1375	12	100	66			48	if ($self->{WordNumPrintMin} == 1 and $self->{WordNumPrintMax} == 1) {
1376							# need exactly 1 empty word
1377	5					66	$self->{words} = [ { %word_default } ];
1378	5					24	return 1;
1379							}
1380	7					10	my $idx = 0;
1381	7					8	my @WordOrig = @{ $self->{abst}{ra_WordOrig} };
	7					154
1382	7					18	foreach (@WordOrig) {
1383	1121					10325	my $rh_word = { %word_default };
1384	1121					3265	$rh_word->{WordOrig} = $_;
1385	1121					2034	$rh_word->{WordLcOrig} = lc $_;
1386	1121					1591	$rh_word->{WordIdx} = $idx++;
1387	1121	100				3921	if (not /\S/) {
		100
		100
1388							# do nothing
1389							}
1390							elsif ( $_ = $self->aa3_to_aa1( $rh_word->{WordOrig} ) ) {
1391	6					13	$rh_word->{WordAaSymbols} = 3; # 3 letter aa symbols (and full names, too)
1392	6					12	$rh_word->{WordSequence} = $_;
1393	6	50				18	$self->WordVars($rh_word) or return;
1394							}
1395							elsif ( $_ = $self->to_aa1( $rh_word->{WordOrig} ) ) {
1396	72					105	$rh_word->{WordAaSymbols} = 1; # 1 letter aa symbols:
1397	72					111	$rh_word->{WordSequence} = $_;
1398	72	50				157	$self->WordVars($rh_word) or return;
1399							}
1400							# else: has non-whitespace chars, but is not a sequence: do nothing
1401	1121	50				2274	$self->WordScore($rh_word) or return;
1402	1121					2051	push @words, $rh_word;
1403							}
1404	7	50	33			37	if ($self->{WordNumPrintMin} == 1 and not @words) { # need at least 1 word
1405	0					0	push @words, { %word_default };
1406							}
1407	7					239	$self->{words} = [ @words ];
1408	7					148	return 1;
1409							}
1410							}
1411
1412							{
1413							# See more: Nomenclature and Symbolism for Amino Acids and Peptides
1414							# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html
1415							# Amino acids below are converted to X for blasting, no need to worry here.
1416
1417							# keep order for correct substitution below. Generally, longer patterns go first, so that less non-replaced chars remain, so that there is less to clean up later.
1418							my @aa3_to_aa1 =
1419							(
1420							# full names:
1421							# Omitted: B, U, Z
1422							alanine => 'A',
1423							cysteine => 'C',
1424							cystein => 'C', # alternative spelling
1425							aspartic => 'D',
1426							aspartate => 'D',
1427							glutamic => 'E',
1428							glutamate => 'E',
1429							phenylalanine => 'F',
1430							glycine => 'G',
1431							histidine => 'H',
1432							isoleucine => 'I',
1433							lysine => 'K',
1434							leucine => 'L',
1435							methionine => 'M',
1436							asparagine => 'N',
1437							proline => 'P',
1438							glutamine => 'Q',
1439							arginine => 'R',
1440							serine => 'S',
1441							threonine => 'T',
1442							selenocysteine => 'U',
1443							valine => 'V',
1444							tryptophane => 'W', # alternative spelling
1445							tryptophan => 'W',
1446							tyrosine => 'Y',
1447
1448							# full names, 'yl' endings:
1449							# matches:
1450							# glycyl-L-prolyl-L-glutamic acid;
1451							# arginyl glycyl aspartyl serine
1452							alanyl => 'A',
1453							cysteinyl => 'C',
1454							aspartyl => 'D',
1455							glutamyl => 'E',
1456							phenylalanyl => 'F',
1457							glycyl => 'G',
1458							histidyl => 'H',
1459							isoleucyl => 'I',
1460							lysyl => 'K',
1461							leucyl => 'L',
1462							methionyl => 'M',
1463							asparaginyl => 'N',
1464							prolyl => 'P',
1465							glutaminyl => 'Q',
1466							arginyl => 'R',
1467							seryl => 'S',
1468							threonyl => 'T',
1469							selenocysteinyl => 'U',
1470							valyl => 'V',
1471							tryptophyl => 'W',
1472							tyrosyl => 'Y',
1473
1474							# 3 letter aa symbols to 1 letter.
1475							Ala => 'A',
1476							Cys => 'C',
1477							Asp => 'D',
1478							Glu => 'E',
1479							Phe => 'F',
1480							Gly => 'G',
1481							His => 'H',
1482							Ile => 'I',
1483							Lys => 'K',
1484							Leu => 'L',
1485							Met => 'M',
1486							Asn => 'N',
1487							Pro => 'P',
1488							Gln => 'Q',
1489							Arg => 'R',
1490							Ser => 'S',
1491							Thr => 'T',
1492							Sec => 'U',
1493							Val => 'V',
1494							Trp => 'W',
1495							Tyr => 'Y',
1496
1497							# non-standard amino acids are all converted to X
1498							Aad => 'X',
1499							Abu => 'X',
1500							Aib => 'X',
1501							Ahx => 'X',
1502							Ape => 'X',
1503							Apm => 'X',
1504							Asx => 'X', # B in http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html
1505							Cit => 'X',
1506							Dab => 'X',
1507							Dap => 'X',
1508							Dpm => 'X',
1509							Dpr => 'X',
1510							Glx => 'X', # Z in http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html
1511							Hyl => 'X',
1512							Hyp => 'X',
1513							Nle => 'X',
1514							Nva => 'X',
1515							Orn => 'X',
1516							Pyr => 'X',
1517
1518							# unknown amino acids
1519							Xaa => 'X', # keep order as shown, so that these replacements are correct: 'Asn-x-Glu-x-x-(aromatic)-x-x-Gly',
1520							# unknown amino acids, not in IUPAC, but frequently used:
1521							Xxx => 'X', # such as Gly-Xxx-Gly, but some XXX or xxx may refer to X{3}, eg Gly-xxx-Gly
1522							Yaa => 'X',
1523							Zaa => 'X',
1524
1525							);
1526
1527							my %aa3_to_aa1 = @aa3_to_aa1;
1528							my @aa3_to_aa1_keys = grep exists $aa3_to_aa1{$_}, @aa3_to_aa1;
1529							# print to redo sort:
1530							# for (sort { $aa3_to_aa1{$a} cmp $aa3_to_aa1{$b} } keys %aa3_to_aa1) {
1531							# next unless length $_ == 3;
1532							# print "$_ => '$aa3_to_aa1{$_}',\n";
1533							# }
1534							my $aa3_to_aa1 = join "\|", @aa3_to_aa1_keys;
1535							my $aa3_to_aa1_re = qr!$aa3_to_aa1!;
1536
1537							# enable parsing lowercase 3 letter symbols, eg 'NAc-lys-gly-gln-OH'.
1538							my %aa3_to_aa1_lc = map { lc($_) => $aa3_to_aa1{$_} } keys %aa3_to_aa1;
1539							my @aa3_to_aa1_keys_lc = map { lc $_ } @aa3_to_aa1_keys;
1540							my $aa3_to_aa1_lc = join "\|", @aa3_to_aa1_keys_lc;
1541							my $aa3_to_aa1_lc_re = qr!$aa3_to_aa1_lc!;
1542
1543							=head2 aa3_to_aa1
1544
1545							Args : string with a peptide sequence
1546							Example : $str = $self->aa3_to_aa1($str)
1547							Description : finds the first string that looks like protein
1548							sequence in 3 letter amino acid symbols or full
1549							names. Cleans (remove non-informative parens, -, /,
1550							etc), and returns the cleaned sequence. Currently,
1551							full names are also handled here, but this may change.
1552							Returns : sequence (TRUE) if a sequence is found, FALSE otherwise.
1553
1554							=cut
1555
1556							sub aa3_to_aa1 {
1557	605			605	1	789	my ($self, $str) = @_;
1558							# first try more frequent, uppercase, then lowercase.
1559							# tag with _BeginTag_ and _EndTag_ the replaced amino acids,
1560							# then use the tag to remove the rest
1561							# of the chars, except for a set of allowed chars that did not
1562							# need 3 letter to 1 letter conversion and thus did not get tagged.
1563							# proceed if > 1 replacements , eg > 1 aa.
1564							return unless
1565							(
1566	605	100	66			13258	($_ = $str and s!($aa3_to_aa1_re)!_BeginTag_$aa3_to_aa1{$1}_EndTag_!g > 1) or
			66
			66
1567							($_ = $str and s!\b($aa3_to_aa1_lc_re)\b!_BeginTag_$aa3_to_aa1_lc{$1}_EndTag_!g > 1)
1568							);
1569							# warn "tag aa3_to_aa1: \$str=$str;" if $self->{verbose} > 2;
1570							# warn "tag aa3_to_aa1: \$_=$_;" if $self->{verbose} > 2;
1571							# handle x and X here, so that the method returns TRUE only if 3 letter
1572							# symbols occurred. x and X do not count.
1573	49					116	s!\bx\b!_BeginTag_X_EndTag_!g;
1574							# neg lookbehind: tag each X, except those already tagged
1575							# (eg, converted from Xaa to X) (= preceded by _), or
1576							# those where X means something other than amino acid
1577							# (eg, Met(OX) means Met, oxidized) (= preceded by non-X upper case letter).
1578	49					103	s!(?<\![_A-WYZ])X!_BeginTag_X_EndTag_!g;
1579							# warn "tag X: \$_=$_;" if $self->{verbose} > 2;
1580							# change 'X(3)' to 'XXX'
1581							# repeat only Xs, because X(n) usually refers to X repeated n times,
1582							# while for other aa, such as G,
1583							# G(n) refers to the position within the protein.
1584							# Some cases where X(n) refers to position n are eliminated later if n is too large,
1585							# eg X(20) results in sequence length > 20 and is assigned lower score.
1586	49					99	s!_BeginTag_X_EndTag_\W(\d)\W!'_BeginTag_X_EndTag_' x $1!eg;
	8					33
1587	49					530	$_ = join '', grep { defined $_ } m!([\-/()])\|_BeginTag_(.)_EndTag_!g;
	1078					1470
1588	49					150	return clean_seq($_);
1589							}
1590							}
1591
1592
1593							{
1594
1595							# p = phospho, eg pY = phosphotyrosine
1596							# x and X are both frequently used in motifs as any aa.
1597							# B, Z, U are not included because they do not occur very frequently in aa sequences in abstracts.
1598							my $aa1_allowed = '\-/()ACDEFGHIKLMNPQRSTVWYXpx';
1599							my $aa1_re = qr!
1600							# not immed preceded by other letters
1601							# optional N-terminus
1602							(?:N\-\|HN\-\|H2N\-\|H$2$N\-\|[\W\d]\|^)
1603							# sequence, >= 2 aa
1604							([$aa1_allowed]{2,})
1605							# not immed followed by other letters
1606							(?:[\W\d]\|\-.*\|$)
1607							!x;
1608
1609							=head2 to_aa1
1610
1611							Args : string with a peptide sequence
1612							Example : $str = $self->to_aa1($str)
1613							Description : finds the first string that looks like peptide sequence in uppercase
1614							1 letter amino acid symbols. Cleans (remove non-informative parens,
1615							-, /, etc), and returns the cleaned sequence.
1616							Returns : sequence (TRUE) if a sequence is found, FALSE otherwise.
1617
1618							=cut
1619
1620							sub to_aa1 {
1621	576			576	1	813	my ($self, $str) = @_;
1622	576					853	for ($str) {
1623	576	100				3732	if (m!$aa1_re!) {
1624	91					188	return clean_seq($1);
1625							}
1626							}
1627	485					1369	return;
1628							}
1629							}
1630
1631							=head2 clean_seq
1632
1633							Args : string with a peptide sequence in 1 letter aa symbols
1634							Example : $str = clean_seq($str);
1635							Description : for sequence in 1 letter aa symbols, remove extra parens, etc.
1636							Simple motifs are ok: A(B/C/D)E...
1637							Returns : cleaned string.
1638
1639							=cut
1640
1641							sub clean_seq {
1642	140			140	1	281	my ($str) = @_;
1643	140					206	for ($str) {
1644	140					210	s!\-NH[()\d]*$!!; # remove C/N terminal mark
1645	140					420	s!(^[CN]'\|[CN]'$)!!g;
1646	140					172	s!FMK\W*$!!; # remove aa modifications, eg fluorophores
1647	140					193	tr!\-!!d; # delete connecting '-', eg change 'G-E-T' to 'GET'
1648	140					160	tr!p!!d; # delete p=phospho, not P=proline. lowercase only.
1649	140					204	$_ = uc $_; # do this after resolving: p=phospho, not P=proline;
1650							# Xxx = X, not XXX = X{3}
1651	140					192	s!X\W(\d)\W!'X' x $1!eg;
	0					0
1652	140					226	$_ = clean_orphan_parens($_);
1653	140					177	s!^/+!!; # clean up beginning and end
1654	140					168	s!/+$!!; # clean up beginning and end
1655	140					188	s!/+!/!g; # clean up repeated separator chars
1656	140					234	$_ = parse_slashes($_);
1657	140					238	$_ = clean_orphan_parens($_);
1658	140					677	return $_;
1659							}
1660							}
1661
1662							=head2 clean_orphan_parens
1663
1664							Args : string with a sequence to clean up.
1665							Example : $_ = clean_orphan_parens($_);
1666							Description : removes orphan parens from a string.
1667							Returns : cleaned string.
1668
1669							=cut
1670
1671							sub clean_orphan_parens {
1672	280			280	1	363	my ($str) = @_;
1673	280					616	for (my $i = 0; $i < 100; $i++) { # magic number 100 to prevent inf loops
1674	1903					14306	my $num_changes = 0;
1675	1903					2051	$num_changes += s!^\)+!!g; # delete wrong sided parens
1676	1903					2858	$num_changes += s!\(+$!!g; # delete wrong sided parens
1677	1903					4853	$num_changes += s!\(+!\(!g; # change '((GET)' to '(GET)'
1678	1903					4674	$num_changes += s!\)+!\)!g; # change '(GET))' to '(GET)'
1679	1903					3710	$num_changes += s!$[^A-Z]*$!!g; # remove '(/)', '()'
1680	1903					2333	$num_changes += s!^$(.)$!$1!; # change '(G)ETRAPL' to 'GETRAPL'
1681							# change 'GETR(A)PL' to 'GETRAPL'
1682							# change 'GET(RAP)L' to 'GETRAPL'
1683	1903					3027	$num_changes += s!$([^()/]*)$!$1!g;
1684							# change '(GETR(A)PL)' to 'GETR(A)PL' - delete outer parens in 2 steps:
1685	1903					2741	$num_changes += s!\(([^()]+)\(!$1\(!g; # change '(GETR(A)PL)' to 'GETR(A)PL)'
1686	1903					2603	$num_changes += s!\)([^()]+)\)!\)$1!g; # change 'GETR(A)PL)' to 'GETR(A)PL'
1687	1903					2068	my $num_parens = tr!()!!;
1688	1903	100				3433	if ($num_parens == 1) {
1689	13					71	$num_changes += tr!()!!d; # change 'GE(TRAPL' to 'GETRAPL'
1690							}
1691	1903	100				4769	last unless $num_changes; # clean stubborn sequences, eg:
1692							# 'G(ET(RA)PL' => 'G(ETRAPL' (after iteration 1), 'GETRAPL' (after iteration 2).
1693							}
1694	280					537	return $_;
1695							}
1696
1697
1698							=head2 parse_slashes
1699
1700							Args : string with a sequence.
1701							Example : $_ = parse_slashes($_);
1702							Description : handles slashes. Changes A/B/C/etc to (A/B/C/etc)
1703							which means any of A, B, C, etc. Returns the resulting
1704							string. Exceptions are as follows, based on the the
1705							fact that the result would make no sense. If X occurs
1706							next to '/' or if the result would contain a repeated
1707							character. For exceptions, splits the string on
1708							slashes, and returns the first longest string.
1709							Returns : see above
1710
1711							=cut
1712
1713							sub parse_slashes {
1714	140			140	1	150	$_ = $_[0];
1715	140	100				601	return $_ unless m!/!;
1716	15	100	100			100	if (m!/X\|X/! or has_repeats_at_slashes($_) ) {
1717	3					5	my $str = '';
1718	3					16	for (split m!/+!) {
1719	6	50				22	$str = $_ if length($str) < length($_); # change PXXP/PXPXP to PXPXP
1720							}
1721	3					8	return $str;
1722							} else {
1723	12					100	s!(\w(/\w)+)!($1)!g; # change PXXP/GXPXP to PXX(P/G)XPXP
1724	12					29	return $_;
1725							}
1726							}
1727
1728
1729							=head2 has_repeats_at_slashes
1730
1731							Args : string with a sequence.
1732							Example : print 1 if has_repeats_at_slashes($_);
1733							Description : see below
1734							Returns : TRUE if the string has a repeated char next to a series of
1735							slashes and chars, FALSE otherwise.
1736
1737							=cut
1738
1739							sub has_repeats_at_slashes {
1740	14			14	1	32	for ($_[0]) {
1741	14					77	for (m!(\w(?:/\w)+)!g) {
1742	19					57	my %seen;
1743	19					78	for (/\w/g) {
1744	39	100				274	return 1 if $seen{$_}++;
1745							}
1746							}
1747							}
1748	12					54	return;
1749							}
1750
1751							=head2 WordVars
1752
1753							Args : $rh_word - ref to hash with data for the word
1754							Example : $parser->WordVars($rh_word) or return;
1755							Description : computes variables for WordScore, eg WordPropDegen, WordIsDNA, etc.
1756							Returns : TRUE if successful, FALSE otherwise.
1757
1758							=cut
1759
1760							sub WordVars {
1761	78			78	1	110	my ($self, $rh_word) = @_;
1762	78	50	33			388	return unless $rh_word and ref $rh_word eq 'HASH';
1763	78					154	$_ = $rh_word->{WordSequence};
1764							# keep allowed chars in kmers(): 'a'..'z' in sync with
1765							# WordPropProtein() args : lc tr/A-Z//cd;
1766	78					187	( $rh_word->{WordAlpha} = $_ ) =~ tr/A-Z//cd;
1767
1768	78	50				179	if ( $rh_word->{WordSequence} ) {
1769	78					119	$rh_word->{WordSeqLen} = length $rh_word->{WordAlpha};
1770	78					103	$rh_word->{WordNumDegen} = tr/X//;
1771	78					156	$rh_word->{WordNumNotDegen} = $rh_word->{WordSeqLen} - $rh_word->{WordNumDegen};
1772	78		50			274	$rh_word->{WordPropDegen} = $rh_word->{WordNumDegen} / ($rh_word->{WordSeqLen} \|\| 1);
1773	78					174	$rh_word->{WordLcSequence} = lc $rh_word->{WordSequence};
1774							}
1775
1776	78	100				181	if ($rh_word->{WordAaSymbols} == 1) {
1777							# Compute WordPropProtein only for 1 letter symbols, because
1778							# it is important for classification if 'GET' looks like an english word,
1779							# but not important if 'Gly-Glu-Thr' does.
1780	72					210	$rh_word->{WordPropProtein} = $self->WordPropProtein(lc $rh_word->{WordAlpha});
1781	72					141	$rh_word->{WordIsDNALen} = tr/ACTGN//;
1782	72	50				248	$rh_word->{WordPropDNA} = $rh_word->{WordIsDNALen} / $rh_word->{WordSeqLen}
1783							if $rh_word->{WordSeqLen};
1784							# if all or mostly dna, eg, '(A/C)TGG' or 'WCCGGGCGGCCCC'
1785	72	100				167	$rh_word->{WordIsDNA} = 1 if $rh_word->{WordPropDNA} >= 0.9;
1786							}
1787	78					211	return 1;
1788							}
1789
1790							{
1791							my $re;
1792
1793							=head2 WordPropProtein
1794
1795							Args : $word : string
1796							Example : $rh_word->{WordPropProtein} = $self->WordPropProtein($word);
1797							Description : See package DESCRIPTION above.
1798							Returns : sum of log10(proportion(k-mers))
1799
1800							=cut
1801
1802							sub WordPropProtein {
1803	72			72	1	106	my ($self, $str) = @_;
1804	72					86	my $WordPropProtein = 0;
1805	72	100				201	return $WordPropProtein if (length($str) < $self->{kmer_length});
1806	66		66			174	$re \|\|= qr/(?=(.{$self->{kmer_length}}))/;
1807	66					662	foreach ($str =~ /$re/g) {
1808	218	50				597	unless (defined $log_prop_sequence{$_}) {
1809	0					0	carp "not ok: undefined log_prop_sequence for $_";
1810	0					0	next;
1811							}
1812	218	50				509	unless (defined $log_prop_nonsequence{$_}) {
1813	0					0	carp "not ok: undefined log_prop_nonsequence for $_";
1814	0					0	next;
1815							}
1816	218					768	$WordPropProtein += ($log_prop_sequence{$_} - $log_prop_nonsequence{$_});
1817							}
1818	66					419	$WordPropProtein = sprintf("%.1f", $WordPropProtein);
1819	66					206	return $WordPropProtein;
1820							}
1821							}
1822
1823							{
1824							# WordScore factors for different values of vars, eg
1825							# for WordAaSymbols = 3, multiply WordScore by 1000.
1826
1827							# WordScoreForAaSymbols:
1828							# Keep the ratio of 3 letter or full name symbols relative to 1 letter symbols high,
1829							# because even for very short words they are more likely to be a sequence, eg:
1830							# 'Pro-Ala-Pro' relative to 'PAP'.
1831							my %WordScoreForAaSymbols =
1832							(
1833							0 => 0,
1834							1 => 1,
1835							3 => 1000,
1836							5 => 10000, # full names (not implemented)
1837							);
1838							#
1839							my %WordScoreForSeqLen =
1840							(
1841							0 => 0,
1842							1 => 0,
1843							2 => 0.0001,
1844							3 => 0.1,
1845							4 => 0.5,
1846							5 => 1,
1847							);
1848
1849							# as WordSeqLen increases between 5 and $MAX_PEPTIDE_LENGTH,
1850							# WordScore factor increases linearly from 1 to 5
1851							foreach (5..$MAX_PEPTIDE_LENGTH) {
1852							$WordScoreForSeqLen{$_} = 1 + 4 * ($_ - 5) / ($MAX_PEPTIDE_LENGTH - 5);
1853							}
1854							my $max_penalized_length = 6; # after this length, flat rate penalty applies:
1855							my $WordScoreForMatchLenMin = 3**(-$max_penalized_length);
1856							# up to $max_penalized_length, penalize by this factor (divide score by 3 for each char)
1857							my %WordScoreForMatchLen = map { ($_ => 3**(-$_)) } 0..$max_penalized_length;
1858
1859							=head2 WordScore
1860
1861							Args : $rh_word - ref to hash with word data.
1862							Example : $parser->WordScore($rh_word) or return;
1863							Description : See package DESCRIPTION above.
1864							Returns : TRUE if successful, FALSE otherwise.
1865
1866							=cut
1867
1868							sub WordScore {
1869	1121			1121	1	1475	my ($self, $rh_word) = @_;
1870	1121	50	0			2269	ref $rh_word eq 'HASH' or
1871							carp "not ok: expected WordScore arg[1] = hash ref, got $rh_word" and return;
1872	1121					1386	$rh_word->{WordScore} = 1; # initialize WordScore
1873	1121	100	66			2778	unless ($rh_word->{WordSequence} and $rh_word->{WordOrig}) {
1874	1043					1385	$rh_word->{WordScore} = 0;
1875							}
1876	1121	50				2140	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1877	1121	100				3400	return 1 unless $rh_word->{WordScore};
1878	78					185	$rh_word->{WordScore} *= $WordScoreForAaSymbols{ $rh_word->{WordAaSymbols} };
1879	78	50				167	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1880	78	50				207	if (exists $WordScoreForSeqLen{ $rh_word->{WordSeqLen} }) {
1881	78					165	$rh_word->{WordScore} *= $WordScoreForSeqLen{ $rh_word->{WordSeqLen} };
1882							} else { # if length > $MAX_PEPTIDE_LENGTH, discard regardless of anything else
1883	0					0	$rh_word->{WordScore} = 0;
1884							}
1885	78	50				160	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1886	78	100				154	return 1 unless $rh_word->{WordScore};
1887
1888	77	100	33			874	if (
		100	100
		50	33
			100
			33
1889							$rh_word->{WordPropDegen} > 0.95 \|\|
1890							$rh_word->{WordNumNotDegen} <= 1 \|\|
1891							($rh_word->{WordNumNotDegen} <= 2 && $rh_word->{WordSeqLen} >= 6) # <= 2 of 6 aa
1892							) {
1893	2					4	$rh_word->{WordScore} = 0;
1894							}
1895							elsif (
1896							$rh_word->{WordPropDegen} > 0.7 \|\|
1897							($rh_word->{WordNumNotDegen} <= 2 && $rh_word->{WordSeqLen} >= 5)
1898							) {
1899	1					3	$rh_word->{WordScore} *= 0.001;
1900							}
1901							elsif ($rh_word->{WordPropDegen} > 0.6) {
1902	0					0	$rh_word->{WordScore} *= 0.01;
1903							}
1904	77	50				170	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1905	77	100				152	return 1 unless $rh_word->{WordScore};
1906
1907	75	50				153	$self->WordScoreForMatch($rh_word) or return;
1908	75	50				185	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1909	75	50				197	return 1 unless $rh_word->{WordScore};
1910							# normalize for the ease of use, so that (after all transformations),
1911							# at WordScore = 0.5, approximately 50% of the words are
1912							# true positives by manual inspection
1913	75					125	$rh_word->{WordScore} = (1/6) * $rh_word->{WordScore};
1914	75	50				142	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1915							# transform (0, +inf) to (0,1):
1916	75					135	$rh_word->{WordScore} = $rh_word->{WordScore} / ( 1 + $rh_word->{WordScore} );
1917	75	50				144	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1918	75					175	return 1;
1919							}
1920
1921							=head2 WordScoreForMatch
1922
1923							Args : $rh_word - ref to hash with word data.
1924							Example : $parser->WordScoreForMatch($rh_word) or return;
1925							Description : Calls WordIsGeneOrDict to compute WordIsDict, WordIsGene.
1926							Changes WordScore based on these vars, as well as based on WordIsDNA,
1927							WordPropProtein, etc.
1928							Returns : TRUE if successful, FALSE otherwise.
1929
1930							=cut
1931
1932							sub WordScoreForMatch {
1933	75			75	1	92	my ($self, $rh_word) = @_;
1934	75	50	0			183	ref $rh_word eq 'HASH' or
1935							carp "not ok: expected WordScoreForMatch arg[1]: hash ref, got: $rh_word" and
1936							return;
1937							# no need to check this for 3 and full letter symbols:
1938	75	100				200	return 1 unless $rh_word->{WordAaSymbols} == 1;
1939	70	50				138	$self->WordIsGeneOrDict($rh_word) or return;
1940	70	50				170	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1941							# roman numeral < 50 that ends within 0 or 1 char of the end of word, eg
1942							# TFIII or TFIIA. WordIsRomanLen = length of the roman
1943							# numeral (minus 1 if there is 1 char between the numeral and the
1944							# end of the word).
1945	70					385	while ($rh_word->{WordOrig} =~ /(X{0,3})(I[XV]\|V?I{0,3})([A-Z])?\b/g) {
1946	261	100				912	my $next_roman_len = length($1) + length($2) - ($3 ? length($3) : 0 );
1947	261	100				1420	next unless $next_roman_len > 0; # skip 0 or -1 (not a roman numeral).
1948	14					23	$rh_word->{WordIsRomanLen} += $next_roman_len;
1949	14	100				83	$rh_word->{WordIsRoman} = 1 if $rh_word->{WordIsRomanLen} >= 2; # eg, HYNKIIIA
1950							}
1951							# for these vars with values: 0/1, penalize based on length:
1952	70					117	foreach (qw(WordIsDNA WordIsDict WordIsGene WordIsRoman)) {
1953	280	100				619	next unless $rh_word->{$_};
1954							# limit the penalty by the sequence length, eg
1955							# if 'SDS-PAGE' is entered into dictionary twice,
1956							# as both 'SDS-PAGE' and 'SDSPAGE'
1957	95	100				261	$rh_word->{"${_}Len"} = $rh_word->{WordSeqLen}
1958							if $rh_word->{"${_}Len"} > $rh_word->{WordSeqLen};
1959	95		66			336	$rh_word->{WordScore} *=
1960							( $WordScoreForMatchLen{ $rh_word->{"${_}Len"} } \|\|
1961							$WordScoreForMatchLenMin );
1962	95	50				226	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1963							}
1964							# if the word was not flagged by WordIsGene WordIsDict:
1965	70	100	100			296	unless ($rh_word->{WordIsGene} \|\| $rh_word->{WordIsDict}) {
1966							# penalize for similarity to a gene symbol, eg KLF4.
1967	21					121	foreach ($rh_word->{WordOrig} =~ m!([^A-Z]\|\b)[A-Z]{2,3}\d*\b!g) {
1968	5					10	$rh_word->{WordScore} *= 0.1;
1969	5	50				125	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1970							# penalize for similarity to abbreviations, or to gene/protein interactions, eg
1971							# 'GM-CSF', 'HPLC/FPLC', 'MAP/MAPKK'.
1972	5	100	100			28	$rh_word->{WordScore} *= 0.01
1973							if $rh_word->{WordSubLenMax} && # if the word could be split into subwords and
1974							$rh_word->{WordSubLenMax} <= 4; # all sub-words are short
1975	5	50				21	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1976							}
1977							}
1978							# WordPropProtein is a good predictor if abs(WordPropProtein) > 8.
1979							# WordPropProtein is very high for WordIsDNA, eg 'CGGTTAAAA', or
1980							# WordIsRoman, eg 'HYNKIIIA', thus not used.
1981	70	100	100			401	unless(
			66
			100
1982							$rh_word->{WordIsDNA} \|\| $rh_word->{WordIsDict} \|\|
1983							$rh_word->{WordIsGene} \|\| $rh_word->{WordIsRoman}
1984							) {
1985	13					88	$rh_word->{WordScore} *=
1986							(10 ** (LOG_10_2 * $rh_word->{WordPropProtein}) );
1987							}
1988	70	50				165	warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2;
1989	70	50				143	carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$rh_word], ['rh_word']))
1990							if $self->{verbose} > 2;
1991	70					170	return 1;
1992							}
1993							}
1994
1995							=head2 WordIsGeneOrDict
1996
1997							Args : $rh_word - ref to hash with word data.
1998							Example : $parser->WordIsGeneOrDict($rh_word) or return;
1999							Description : Computes WordIsDict, WordIsGene and related vars.
2000							Returns : TRUE if successful, FALSE otherwise.
2001
2002							=cut
2003
2004							sub WordIsGeneOrDict {
2005	70			70	1	85	my ($self, $rh_word) = @_;
2006	70	50	0			137	ref $rh_word eq 'HASH' or
2007							carp "not ok: expected WordIsGeneOrDict arg[1]: hash ref, got: $rh_word" and
2008							return;
2009							# select words for matching. Do not penalize more than once for the
2010							# same subword, unless it actually occurs more than once.
2011	70					323	my @subwords = split /[^A-Za-z]+/, lc $rh_word->{WordLcOrig};
2012	70					92	my @subwords_long;
2013	70	100				153	if (@subwords != 1) { # if could split; otherwise just handle $rh_word->{WordLcOrig} below.
2014							# select sub-words that are long in absolute terms and also long relative
2015							# to the original word
2016							# (eg, select 'RAS' in 'RAS/EGFR' , but not in 'RAS/GETRAPLGETRAPL'
2017	20					39	foreach (@subwords) {
2018	52					59	my $length = length $_;
2019	52	100	66			195	push @subwords_long, $_
2020							if $length > 2 and ( $length / $rh_word->{WordSeqLen} ) > 0.25;
2021							# WordSubLenMax = max sub-word length, if could split; otherwise 0.
2022	52	100				153	$rh_word->{WordSubLenMax} = $length if $rh_word->{WordSubLenMax} < $length;
2023							}
2024							}
2025	70	100				232	foreach my $subword ( $rh_word->{WordLcSequence},
2026							# do not penalize more than once for 1 occurrence:
2027							( $rh_word->{WordLcOrig} eq $rh_word->{WordLcSequence} ?
2028							() : $rh_word->{WordLcOrig} ),
2029							@subwords_long ) {
2030	131					191	foreach my $bad (qw(Dict Gene)) {
2031							# if ( eval { ${$bad}{$subword} } ) { # eval does not work - ??
2032	262					246	my $is_bad = do {
2033	262	100				566	if ($bad eq 'Dict') { $Dict{$subword} }
	131	50				303
2034	131					413	elsif ($bad eq 'Gene') { $Gene{$subword} }
2035							};
2036	262	100				681	if ($is_bad) {
2037	109					176	$rh_word->{"WordIs$bad"} = 1;
2038	109					341	$rh_word->{"WordIs${bad}Len"} += length($subword);
2039							}
2040							}
2041							}
2042	70					213	return 1;
2043							}
2044
2045							=head2 WordAbstScore
2046
2047							Args : none
2048							Example : $parser->WordAbstScore or return;
2049							Description : Computes WordAbstScore based on the results of
2050							AbstScore and WordScore vars for all words in the
2051							abstract. AbstMaxWordScore is high if either AbstScore
2052							is high or WordScore is high, and higher if both are
2053							high. Instead of using simply AbstMaxWordScore =
2054							AbstScore * WordScore, AbstMaxWordScore is also added
2055							to WordScore with a smaller weight. This is because
2056							probability that a given word is a sequence increases
2057							if another word in the same abstract is a
2058							sequence. The weight for AbstMaxWordScore is smaller
2059							than that for WordScore in order not to make all words
2060							have equal AbstMaxWordScore. WordAbstScore = 0 for
2061							WordScore = 0, to reduce obvious noise (otherwise, all
2062							words that are clearly not peptide sequences will get
2063							positive WordAbstScore when they occur in an abstract
2064							with at least one sequence). Skip transform to (0,1)
2065							because the scores below are all in (0,1), and the
2066							weights add to 1, so WordAbstScore is already in
2067							(0,1).
2068							Returns : TRUE if successful, FALSE otherwise.
2069
2070							=cut
2071
2072							sub WordAbstScore {
2073	12			12	1	16	my ($self) = @_;
2074	12					23	my $rh_abst = $self->{abst};
2075							# max WordScore for all words in this abstract:
2076	12					34	$rh_abst->{AbstMaxWordScore} = 0;
2077	12					18	foreach my $rh_word (@{ $self->{words} }) {
	12					29
2078	1126	100				2418	$rh_abst->{AbstMaxWordScore} = $rh_word->{WordScore}
2079							if $rh_abst->{AbstMaxWordScore} < $rh_word->{WordScore};
2080							}
2081
2082	12					21	foreach my $rh_word (@{ $self->{words} }) {
	12					27
2083	1126	100				2110	$rh_word->{WordAbstScore} = $rh_word->{WordScore} > 0 ?
2084							$rh_abst->{AbstScore} *
2085							(0.1 * $rh_abst->{AbstMaxWordScore} + 0.9 * $rh_word->{WordScore})
2086							: 0;
2087							}
2088	12	50				36	if ($self->{verbose} > 0) {
2089	12					113	$rh_abst->{AbstComment} .=
2090							sprintf("AbstMaxWordScore=%.4f; ", $rh_abst->{AbstMaxWordScore});
2091							}
2092	12	50				38	carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self']))
2093							if $self->{verbose} > 2;
2094	12					32	return 1;
2095							}
2096
2097							=head2 AbstMtext
2098
2099							Args : none
2100							Example : $parser->AbstMtext
2101							Description : Marks all words that are putative sequences with
2102							'...' tags. This is done only for words
2103							for which WordScore is at least WordScoreMin.
2104							Returns : always TRUE
2105
2106							=cut
2107
2108							sub AbstMtext {
2109	12			12	1	17	my ($self) = @_;
2110	12	50	33			80	if (
2111							$self->{words} and
2112							ref $self->{words} eq 'ARRAY'
2113							) {
2114							$self->{abst}{AbstMtext} =
2115							join '', map {
2116	1126	100	66			11811	if (! defined $_->{WordOrig} ) { () } # happens if WordNumPrintMin = 1
	5	100				16
	12					35
2117							elsif (
2118							$_->{WordScore} >= $self->{WordScoreMin} &&
2119							$_->{WordAbstScore} >= $self->{WordAbstScoreMin}
2120							)
2121	597					1873	{ "$_->{WordOrig}"}
2122	524					1109	else { $_->{WordOrig} }
2123							}
2124	12					19	@{ $self->{words} } ;
2125							}
2126							else {
2127	0		0			0	$self->{abst}{AbstMtext} = $self->{abst}{AbstAbstract} \|\| '';
2128							}
2129	12					120	return 1;
2130							}
2131
2132							=head2 formatScoreProp
2133
2134							Args[1] : $number - score or proportion.
2135							Example : $rh_word->{WordScore} = formatScoreProp($rh_word->{WordScore});
2136							Description : changes $number to be in (0,1) interval, formats $number
2137							for nice printing
2138							Returns : formatted score
2139
2140							=cut
2141
2142							sub formatScoreProp {
2143	4540			4540	1	4647	my ($number) = @_;
2144	4540	50				7768	return unless defined $number;
2145							# number should be between 0 and 1,
2146							# In methods in this package, the input $number is supposed to be always in (0,1) -
2147							# maybe issue warning otherwise?
2148	4540	50				10136	$number = 0 if $number < 0;
2149	4540	50				21907	$number = 1 if $number > 1;
2150	4540					19396	$number = sprintf "%.4f", $number;
2151							}
2152
2153							=head2 find
2154
2155							Args[1] : $key - field name
2156							Args[2] : $val - value
2157							Example : # print WordScore for the first GETRAPL sequence.
2158							# print $parser->find(WordSequence => 'GETRAPL')->{WordScore};
2159							Description : finds the first word for which the value of field $key
2160							is equal to $val (string eq, not numeric ==)
2161							Returns : returns this word (hash ref) if successful, FALSE otherwise.
2162
2163							=cut
2164
2165							sub find {
2166	40			40	1	104	my ($self, $key, $val) = @_;
2167	40					53	foreach ( @{ $self->{words} } ) {
	40					107
2168	16962	100	66			72372	return $_ if defined $_->{$key} and $_->{$key} eq $val;
2169							}
2170	0						return;
2171							}
2172
2173							1;