File Coverage

blib/lib/Biblio/Citation/Parser/Jiao/Utility.pm
Criterion Covered Total %
statement 107 122 87.7
branch 10 22 45.4
condition 0 3 0.0
subroutine 10 12 83.3
pod 0 9 0.0
total 127 168 75.6


line stmt bran cond sub pod time code
1             package Biblio::Citation::Parser::Jiao::Utility;
2              
3             ######################################################################
4             #
5             # ParaTools::Citation::Parser::Jiao::Utility;
6             #
7             ######################################################################
8             #
9             # This file is part of ParaCite Tools
10             # Based on Zhuoan Jiao's (zj@ecs.soton.ac.uk) citation parser (available
11             # at http://arabica.ecs.soton.ac.uk/code/doc/ReadMe.html)
12             #
13             # The code is relatively unchanged, except to bring into compliance
14             # with the ParaCite metadata style, and to allow interoperability with
15             # the other parsers.
16             #
17             # Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
18             #
19             # ParaTools is free software; you can redistribute it and/or modify
20             # it under the terms of the GNU General Public License as published by
21             # the Free Software Foundation; either version 2 of the License, or
22             # (at your option) any later version.
23             #
24             # ParaTools is distributed in the hope that it will be useful,
25             # but WITHOUT ANY WARRANTY; without even the implied warranty of
26             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27             # GNU General Public License for more details.
28             #
29             # You should have received a copy of the GNU General Public License
30             # along with ParaTools; if not, write to the Free Software
31             # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32             #
33             ######################################################################
34              
35 1     1   5 use strict;
  1         1  
  1         37  
36 1     1   5 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
  1         1  
  1         562  
37              
38             require Exporter;
39              
40             @ISA = qw(Exporter);
41             @EXPORT = qw(&normalisation &normalise_html &normalise_name
42             &normalise_date &normalise_journal &num_of_figures);
43              
44             $VERSION = '0.01';
45              
46             #
47             # Normalisation utilities
48             #
49             sub normalisation {
50 2     2 0 4 my($Text) = @_;
51             # replace embedded '\n' with ' '
52 2         6 $Text =~ s/^\s+//s;
53 2         10 $Text =~ s/\s+$//s;
54 2         24 $Text =~ s/\s+/ /g; # Use single space
55             # while ($Text =~ /.+?\n.+?/g) {
56             # $Text =~ s/(.+?)\n(.+?)/$1 $2/
57             # };
58 2         5 $Text =~ s/``(.*?)''/"$1"/sg; # Replace ``A Paper Title'' with "A Paper Title"
59              
60 2         17 $Text =~ s/\s*-\s*/-/g; # remove space around '-'
61 2         4 $Text =~ s/\s*'\s*/'/g; # remove space around '
62 2         15 $Text =~ s/\s*:\s*/:/g; # remove space around :
63 2         7 $Text =~ s/\(\s+/\(/g; # ( 1998) ==> (1998)
64 2         9 $Text =~ s/\s+\)/\)/g; # (1998 ) ==> (1998)
65             # while ($Text =~/--/g) {$Text =~ s/--/-/}; # use single '-'.
66 2         4 $Text =~ s/--+/-/g;
67 2         4 $Text =~ s/~//g; # remove '~' (e.g. C.~B.~Hanna)
68 2         13 $Text =~ s/[,;\s]+$//; # remove last ',;\s' on a line
69             # 'Nr.' caused error in 'find_jnl_name', i.e. it became as
70             # 'journal name' if not removed. (see: arXiv:quant-ph/9905016)
71 2         6 $Text =~ s/([,;])\s*Nr\.\s*(\d+[,;])/$1$2/i;
72             # "[12] For example, R. Machleidt, ...Phys. Rep. 149, 1 (1987)
73 2         5 $Text =~ s/^([^a-z]+)for example\W+/$1/i;
74             # '[18] *** G. Do Dang, ... (arXiv:nucl-th/9911081)
75 2         4 $Text =~ s/\*+//g;
76             # Phys. Rev. D56 => Phys. Rev. D. 56
77 2         6 $Text =~ s/phys.{1,6}rev.{1,6}([a-z])(\d+)/PHYS. REV. $1 $2/ig;
78             # Physica 34 D => Physic D 34
79 2         5 $Text =~ s/physica\s+(\d+)\s+([a-z])/PHYSICA $2 $1/ig;
80             # Nucl. Phys. B567 => Nucl. Phys. B
81 2         5 $Text =~ s/nuc.{1,6}phys.{1,6}\s+([a-z])(\d+)/NUCL. PHYS. $1 $2/ig;
82 2         6 return $Text;
83             };
84            
85             sub chr_valid {
86 0     0 0 0 my $c = shift;
87 0 0 0     0 if( $c < 128 || $c > 255 ) {
88 0         0 return chr($c);
89             } else {
90 0         0 return ' ';
91             }
92             }
93              
94             sub normalise_html {
95 1     1 0 3 my($Text) = @_;
96            
97 1     1   1047 use utf8;
  1         12  
  1         12  
98              
99             # remove
tag
100 1         3 $Text =~ s/
//ig;
101             # Convert HTML entities to Unicode
102 1         3 $Text =~ s/\&#x(\w+);/chr_valid(hex($1))/eg;
  0         0  
103 1         3 $Text =~ s/\&#(\d+);/chr_valid($1)/eg;
  0         0  
104 1         2 $Text =~ s/&(\w)acute[;,]/$1/g; # a ' on top of (\w)
105 1         4 $Text =~ s/&(\w)cedil[;,]/$1/g; # a 'tail' under (\w), e.g Francios
106 1         2 $Text =~ s/&(\w)grave[;,]/$1/g; # a ` on top of (\w)
107 1         2 $Text =~ s/&(\w)tilde[;,]/$1/g;
108 1         4 $Text =~ s/&(\w)uml[;,]/$1e/g; # a '..' on top of (\w)
109 1         3 $Text =~ s/&(\w)slash[;,]/$1/g;
110             #$Text =~ s/&#-88;(\w)/$1/g; # as &(\w)uml (see astro-ph/9811179)
111 1         3 $Text =~ s/&#-\d+;\s*(\w)/$1/g;
112 1         2 $Text =~ s/ß[;,]/ss/g;
113 1         3 $Text =~ s/&[;,]/ and /g;
114 1         3 $Text =~ s/\s*(\w)/$1/g; # a ~ on top of (\w)
115 1         2 $Text =~ s/\/?i>//g; # cogprints
116 1         3 $Text =~ s/&[a-z]+;//g; # otherwise ';' cause ref line break.
117             # $Text =~ s/&#\d+;//g; # e.g. '' in 'hep-th/0001001 [99]'.
118 1         3 $Text =~ s/\\"(\w)/$1e/g; # G\"unter => Gueter
119            
120 1         3 $Text =~ s/\^//g; # remove ^
121 1         3 $Text =~ s/([A-Z])\s*&\s*([A-Z])/$1 and $2/g; # replace '&' with 'and'
122 1         3 $Text =~ s/[, ]+& / and /;
123             # remove HTML markups ( etc.)
124 1         3 $Text =~ s/<[a-z\/]{1,3}>//ig;
125 1         3 return $Text
126             }
127            
128            
129             sub normalise_name {
130 1     1 0 2 my($Text) = @_;
131 1         3 my $Suffix = '';
132 1         3 $Text =~ s/~//; # remove typo
133             # Jr.
134 1 50       11 if ($Text =~ s/[, \.]+(Jr|Sr|Snr)\.?\s*$//i){
    50          
135 0         0 $Suffix = $1
136             }
137             elsif ($Text =~ s/([, \.]+)(Jr|Sr|Snr)[. ]/$1/i){
138 0         0 $Suffix = $2
139             };
140            
141             # van der Buren D => D van der Buren"
142 1 50       7 if ($Text =~ /^\s*(((van|von|de|den|der)\s+)+)(\S\S+)\s+(.+)/i) {
143 0         0 $Text = "$5 $1 $4"
144             };
145 1         5 $Text =~ s/\s+/ /g; # single space
146 1         3 $Text =~ s/^\W+//;
147 1         4 $Text =~ s/\s+$//;
148             # "A. Smith" => "A.Smith"
149 1         2 $Text =~ s/([a-z])s+\./$1\./ig;
150             # Ghisellini G. A. ==> G.A. Ghisellini
151             # Konenkov D. Yu. => D.Yu. Konenkov
152 1 50       6 if ($Text =~ /^([^\s.]{2,})\s+(([A-Z][a-zA-Z]?\W+)*)([A-Z][a-zA-Z]?)\W*$/) {
153 0         0 $Text = "$2$4 $1"
154             };
155            
156 1 50       3 $Text = "$Text $Suffix" if ($Suffix);
157 1         3 $Text = tdb_normalise_name($Text);
158 1         5 return $Text;
159             };
160              
161             # Based on Tim's Authors::splitauthors and Authors::_cmonauthor();
162             # This subroutine is called simply because we want the author names
163             # to be transformed to a same style used by Tim's programs. Otherwise
164             # a join on author names in Publication and Reference tables will
165             # miss a lot of targets.
166             sub tdb_normalise_name{
167 1     1 0 3 my $author = shift;
168            
169             # Strip any brackets
170 1         2 $author =~ s/\s*\([^\)]*\)\s*//g;
171             # Get rid of any dashes (except for dashes like Hu-Su)
172 1         3 $author =~ s/(\W)-/$1/g;
173             # Remove any "the"s, e.g. The OPAL Collaboration
174 1         845 $author =~ s/\bthe\s+//ig;
175             # Sort out Jr/Jr.
176 1         5 $author =~ s/,?\sJr\.?/_Jr/ig;
177            
178 1         3 $author =~ s/[\{\}]//g; # Remove any specialisations
179 1         2 $author =~ s/\\.//g; # Remove any escapes
180             # Convert Convert Hawking S. to S.Hawking (already done - zj)
181             # ($author =~ /(\w\w+)\s+([\.\w\s]+\.)$/) && ($author = $2.$1);
182 1         5 $author =~ s/\.\s/\./g; # Convert S.W. Hawking to S.W.Hawking
183 1         11 $author =~ s/([A-Z])\s/$1\./g; # Convert S W Hawking to S.W.Hawking
184 1 50       69 ($author = lc($author)) && ($author =~ s/\b(\w)/\U$1/g);
185             # Convert STEPHEN_HAWKING to Stephen_Hawking
186 1         3 $author =~ s/\s/_/g; # Convert Stephen W.Hawking to Stephen_W.Hawking
187 1         3 $author =~ s/\.\.+/\./g; # Remove double dots
188 1         4 return $author;
189             }
190              
191              
192             sub normalise_date {
193 1     1 0 2 my($Text) = @_;
194            
195             # 12-14 Dec.
196 1         12 $Text =~ s/[^\w\/][0-3][0-9]?[a-z]*?(\s*\-\s*[0-3][0-9]?[a-z]*?)?\s+
197             (Jan[\.\s]|January\b|Feb[\.\s]|February\b|
198             Mar[\.\s]|March\b|Apr[\.\s]|April\b|May|
199             Jun[\.\s]|June\b|Jul[\.\s]|July\b|Aug[\.\s]|August\b|
200             Sep[\.\s]|September\b|Oct[\.\s]|October\b|
201             Nov[\.\s]|November\b|Dec[\.\s]|December\b)//xig;
202            
203             # Dec 12-14
204 1         11 $Text =~ s/(Jan[\.\s]|January\b|Feb[\.\s]|February\b|
205             Mar[\.\s]|March\b|Apr[\.\s]|April\b|May|
206             Jun[\.\s]|June\b|Jul[\.\s]|July\b|Aug[\.\s]|August\b|
207             Sep[\.\s]|September\b|Oct[\.\s]|October\b|
208             Nov[\.\s]|November\b|Dec[\.\s]|December\b)
209             [^\w\/]*[0-3][0-9]?[a-z]*?(\s*\-\s*[0-3][0-9]?[a-z]*?)?\b//xig;
210            
211 1         3 return $Text;
212             };
213            
214            
215             sub normalise_journal {
216 1     1 0 5 my($Text) = @_;
217 1         3 $Text =~ s/^in\s+//i; # "in ..."
218 1         4 $Text =~ s/^(see )?also\s+//i; # "also ...";
219 1         3 $Text =~ s/\s*:\s*/:/g;
220 1         3 $Text =~ s/\s\s/ /g; # single space;
221 1         3 $Text =~ s/\.\s/\./g; # "J. Physics" => "J.Physics"
222 1         2 $Text =~ s/\.\(/\. \(/g;
223            
224 1         3 $Text =~ s/^\W+//;
225 1         5 $Text =~ s/[^\w.]+$//;
226            
227             # remove anything in brackets at the end.
228 1         2 $Text =~ s/\s*\([^)]+$//; # e.g. R. Ram, J. Phys. (Paris
229 1         3 $Text =~ s/^[^(]+\)\s*//; # e.g. a) R. Ram, J. Phys. 10, 120, 1998
230            
231             # unify cases
232             #($Text = lc($Text)) && ($Text =~ s/\b(\w)/\U$1/g);
233 1         2 $Text = uc($Text);
234 1         5 return $Text;
235             };
236              
237              
238             sub num_of_figures {
239 1     1 0 2 my($Text) = @_;
240 1         3 my($N, @Nlist);
241 1         2 $N = 0;
242 1         2 @Nlist =();
243            
244 1         3 $Text = normalisation($Text);
245             # e.g. "p. 24-26" regarded as one number.
246             # ignore 'N = 2' kind of equations, and
247             # ignore '25th' kinds (e.g. Proc. 25th ICRC).
248             # ignore 'protein Aquaporin-1 in ...'
249             # ignore 'hep-th/9901001'
250             # ignore ' ... 1.55, ...'
251 1         15 while ($Text =~ /(?:^|\b)[a-z]*(\d+)([a-z]*)
252             (?:-[a-z]*\d+[a-z]*)*
253             (?:\b|$)/gix) {
254 3 50       9 next if ($2 =~ /^th$/i);
255 3 50       11 next if ($' =~ /^\.\d+/);
256 3 50       9 next if ($` =~ /\d+\.$/);
257 3 50       11 next if ($` =~ /[=<>\/-]\s*$/);
258 3         36 push(@Nlist, $1);
259             };
260 1         2 $N = scalar(@Nlist);
261 1         5 return $N
262             }
263            
264            
265             sub remove_extra_spc {
266 0     0 0   my($Text) = @_;
267 0           $Text =~ s/^\s+//;
268 0           $Text =~ s/\s+$//;
269 0           $Text =~ s/\s\s+/ /g;
270 0           return $Text
271             };
272              
273             1;
274              
275             __END__