File Coverage

blib/lib/Biblio/Citation/Parser/Jiao/Utility.pm

Criterion	Covered	Total	%
statement	107	122	87.7
branch	10	22	45.4
condition	0	3	0.0
subroutine	10	12	83.3
pod	0	9	0.0
total	127	168	75.6

line	stmt	bran	cond	sub	pod	time	code
1							package Biblio::Citation::Parser::Jiao::Utility;
2
3							######################################################################
4							#
5							# ParaTools::Citation::Parser::Jiao::Utility;
6							#
7							######################################################################
8							#
9							# This file is part of ParaCite Tools
10							# Based on Zhuoan Jiao's (zj@ecs.soton.ac.uk) citation parser (available
11							# at http://arabica.ecs.soton.ac.uk/code/doc/ReadMe.html)
12							#
13							# The code is relatively unchanged, except to bring into compliance
14							# with the ParaCite metadata style, and to allow interoperability with
15							# the other parsers.
16							#
17							# Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
18							#
19							# ParaTools is free software; you can redistribute it and/or modify
20							# it under the terms of the GNU General Public License as published by
21							# the Free Software Foundation; either version 2 of the License, or
22							# (at your option) any later version.
23							#
24							# ParaTools is distributed in the hope that it will be useful,
25							# but WITHOUT ANY WARRANTY; without even the implied warranty of
26							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27							# GNU General Public License for more details.
28							#
29							# You should have received a copy of the GNU General Public License
30							# along with ParaTools; if not, write to the Free Software
31							# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32							#
33							######################################################################
34
35	1			1		5	use strict;
	1					1
	1					37
36	1			1		5	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
	1					1
	1					562
37
38							require Exporter;
39
40							@ISA = qw(Exporter);
41							@EXPORT = qw(&normalisation &normalise_html &normalise_name
42							&normalise_date &normalise_journal &num_of_figures);
43
44							$VERSION = '0.01';
45
46							#
47							# Normalisation utilities
48							#
49							sub normalisation {
50	2			2	0	4	my($Text) = @_;
51							# replace embedded '\n' with ' '
52	2					6	$Text =~ s/^\s+//s;
53	2					10	$Text =~ s/\s+$//s;
54	2					24	$Text =~ s/\s+/ /g; # Use single space
55							# while ($Text =~ /.+?\n.+?/g) {
56							# $Text =~ s/(.+?)\n(.+?)/$1 $2/
57							# };
58	2					5	$Text =~ s/``(.*?)''/"$1"/sg; # Replace ``A Paper Title'' with "A Paper Title"
59
60	2					17	$Text =~ s/\s-\s/-/g; # remove space around '-'
61	2					4	$Text =~ s/\s'\s/'/g; # remove space around '
62	2					15	$Text =~ s/\s:\s/:/g; # remove space around :
63	2					7	$Text =~ s/\(\s+/\(/g; # ( 1998) ==> (1998)
64	2					9	$Text =~ s/\s+\)/\)/g; # (1998 ) ==> (1998)
65							# while ($Text =~/--/g) {$Text =~ s/--/-/}; # use single '-'.
66	2					4	$Text =~ s/--+/-/g;
67	2					4	$Text =~ s/~//g; # remove '~' (e.g. C.~B.~Hanna)
68	2					13	$Text =~ s/[,;\s]+$//; # remove last ',;\s' on a line
69							# 'Nr.' caused error in 'find_jnl_name', i.e. it became as
70							# 'journal name' if not removed. (see: arXiv:quant-ph/9905016)
71	2					6	$Text =~ s/([,;])\sNr\.\s(\d+[,;])/$1$2/i;
72							# "[12] For example, R. Machleidt, ...Phys. Rep. 149, 1 (1987)
73	2					5	$Text =~ s/^([^a-z]+)for example\W+/$1/i;
74							# '[18] *** G. Do Dang, ... (arXiv:nucl-th/9911081)
75	2					4	$Text =~ s/\*+//g;
76							# Phys. Rev. D56 => Phys. Rev. D. 56
77	2					6	$Text =~ s/phys.{1,6}rev.{1,6}([a-z])(\d+)/PHYS. REV. $1 $2/ig;
78							# Physica 34 D => Physic D 34
79	2					5	$Text =~ s/physica\s+(\d+)\s+([a-z])/PHYSICA $2 $1/ig;
80							# Nucl. Phys. B567 => Nucl. Phys. B
81	2					5	$Text =~ s/nuc.{1,6}phys.{1,6}\s+([a-z])(\d+)/NUCL. PHYS. $1 $2/ig;
82	2					6	return $Text;
83							};
84
85							sub chr_valid {
86	0			0	0	0	my $c = shift;
87	0	0	0			0	if( $c < 128 \|\| $c > 255 ) {
88	0					0	return chr($c);
89							} else {
90	0					0	return ' ';
91							}
92							}
93
94							sub normalise_html {
95	1			1	0	3	my($Text) = @_;
96
97	1			1		1047	use utf8;
	1					12
	1					12
98
99							# remove tag
100	1					3	$Text =~ s/ //ig;
101							# Convert HTML entities to Unicode
102	1					3	$Text =~ s/\&#x(\w+);/chr_valid(hex($1))/eg;
	0					0
103	1					3	$Text =~ s/\&#(\d+);/chr_valid($1)/eg;
	0					0
104	1					2	$Text =~ s/&(\w)acute[;,]/$1/g; # a ' on top of (\w)
105	1					4	$Text =~ s/&(\w)cedil[;,]/$1/g; # a 'tail' under (\w), e.g Francios
106	1					2	$Text =~ s/&(\w)grave[;,]/$1/g; # a ` on top of (\w)
107	1					2	$Text =~ s/&(\w)tilde[;,]/$1/g;
108	1					4	$Text =~ s/&(\w)uml[;,]/$1e/g; # a '..' on top of (\w)
109	1					3	$Text =~ s/&(\w)slash[;,]/$1/g;
110							#$Text =~ s/&#-88;(\w)/$1/g; # as &(\w)uml (see astro-ph/9811179)
111	1					3	$Text =~ s/&#-\d+;\s*(\w)/$1/g;
112	1					2	$Text =~ s/ß[;,]/ss/g;
113	1					3	$Text =~ s/&[;,]/ and /g;
114	1					3	$Text =~ s/\s*(\w)/$1/g; # a ~ on top of (\w)
115	1					2	$Text =~ s/\/?i>//g; # cogprints
116	1					3	$Text =~ s/&[a-z]+;//g; # otherwise ';' cause ref line break.
117							# $Text =~ s/&#\d+;//g; # e.g. '' in 'hep-th/0001001 [99]'.
118	1					3	$Text =~ s/\\"(\w)/$1e/g; # G\"unter => Gueter
119
120	1					3	$Text =~ s/\^//g; # remove ^
121	1					3	$Text =~ s/([A-Z])\s&\s([A-Z])/$1 and $2/g; # replace '&' with 'and'
122	1					3	$Text =~ s/[, ]+& / and /;
123							# remove HTML markups ( etc.)
124	1					3	$Text =~ s/<[a-z\/]{1,3}>//ig;
125	1					3	return $Text
126							}
127
128
129							sub normalise_name {
130	1			1	0	2	my($Text) = @_;
131	1					3	my $Suffix = '';
132	1					3	$Text =~ s/~//; # remove typo
133							# Jr.
134	1	50				11	if ($Text =~ s/[, \.]+(Jr\|Sr\|Snr)\.?\s*$//i){
		50
135	0					0	$Suffix = $1
136							}
137							elsif ($Text =~ s/([, \.]+)(Jr\|Sr\|Snr)[. ]/$1/i){
138	0					0	$Suffix = $2
139							};
140
141							# van der Buren D => D van der Buren"
142	1	50				7	if ($Text =~ /^\s*(((van\|von\|de\|den\|der)\s+)+)(\S\S+)\s+(.+)/i) {
143	0					0	$Text = "$5 $1 $4"
144							};
145	1					5	$Text =~ s/\s+/ /g; # single space
146	1					3	$Text =~ s/^\W+//;
147	1					4	$Text =~ s/\s+$//;
148							# "A. Smith" => "A.Smith"
149	1					2	$Text =~ s/([a-z])s+\./$1\./ig;
150							# Ghisellini G. A. ==> G.A. Ghisellini
151							# Konenkov D. Yu. => D.Yu. Konenkov
152	1	50				6	if ($Text =~ /^([^\s.]{2,})\s+(([A-Z][a-zA-Z]?\W+))([A-Z][a-zA-Z]?)\W$/) {
153	0					0	$Text = "$2$4 $1"
154							};
155
156	1	50				3	$Text = "$Text $Suffix" if ($Suffix);
157	1					3	$Text = tdb_normalise_name($Text);
158	1					5	return $Text;
159							};
160
161							# Based on Tim's Authors::splitauthors and Authors::_cmonauthor();
162							# This subroutine is called simply because we want the author names
163							# to be transformed to a same style used by Tim's programs. Otherwise
164							# a join on author names in Publication and Reference tables will
165							# miss a lot of targets.
166							sub tdb_normalise_name{
167	1			1	0	3	my $author = shift;
168
169							# Strip any brackets
170	1					2	$author =~ s/\s$[^$]\)\s*//g;
171							# Get rid of any dashes (except for dashes like Hu-Su)
172	1					3	$author =~ s/(\W)-/$1/g;
173							# Remove any "the"s, e.g. The OPAL Collaboration
174	1					845	$author =~ s/\bthe\s+//ig;
175							# Sort out Jr/Jr.
176	1					5	$author =~ s/,?\sJr\.?/_Jr/ig;
177
178	1					3	$author =~ s/[\{\}]//g; # Remove any specialisations
179	1					2	$author =~ s/\\.//g; # Remove any escapes
180							# Convert Convert Hawking S. to S.Hawking (already done - zj)
181							# ($author =~ /(\w\w+)\s+([\.\w\s]+\.)$/) && ($author = $2.$1);
182	1					5	$author =~ s/\.\s/\./g; # Convert S.W. Hawking to S.W.Hawking
183	1					11	$author =~ s/([A-Z])\s/$1\./g; # Convert S W Hawking to S.W.Hawking
184	1	50				69	($author = lc($author)) && ($author =~ s/\b(\w)/\U$1/g);
185							# Convert STEPHEN_HAWKING to Stephen_Hawking
186	1					3	$author =~ s/\s/_/g; # Convert Stephen W.Hawking to Stephen_W.Hawking
187	1					3	$author =~ s/\.\.+/\./g; # Remove double dots
188	1					4	return $author;
189							}
190
191
192							sub normalise_date {
193	1			1	0	2	my($Text) = @_;
194
195							# 12-14 Dec.
196	1					12	$Text =~ s/[^\w\/][0-3][0-9]?[a-z]?(\s\-\s[0-3][0-9]?[a-z]?)?\s+
197							(Jan[\.\s]\|January\b\|Feb[\.\s]\|February\b\|
198							Mar[\.\s]\|March\b\|Apr[\.\s]\|April\b\|May\|
199							Jun[\.\s]\|June\b\|Jul[\.\s]\|July\b\|Aug[\.\s]\|August\b\|
200							Sep[\.\s]\|September\b\|Oct[\.\s]\|October\b\|
201							Nov[\.\s]\|November\b\|Dec[\.\s]\|December\b)//xig;
202
203							# Dec 12-14
204	1					11	$Text =~ s/(Jan[\.\s]\|January\b\|Feb[\.\s]\|February\b\|
205							Mar[\.\s]\|March\b\|Apr[\.\s]\|April\b\|May\|
206							Jun[\.\s]\|June\b\|Jul[\.\s]\|July\b\|Aug[\.\s]\|August\b\|
207							Sep[\.\s]\|September\b\|Oct[\.\s]\|October\b\|
208							Nov[\.\s]\|November\b\|Dec[\.\s]\|December\b)
209							[^\w\/][0-3][0-9]?[a-z]?(\s\-\s[0-3][0-9]?[a-z]*?)?\b//xig;
210
211	1					3	return $Text;
212							};
213
214
215							sub normalise_journal {
216	1			1	0	5	my($Text) = @_;
217	1					3	$Text =~ s/^in\s+//i; # "in ..."
218	1					4	$Text =~ s/^(see )?also\s+//i; # "also ...";
219	1					3	$Text =~ s/\s:\s/:/g;
220	1					3	$Text =~ s/\s\s/ /g; # single space;
221	1					3	$Text =~ s/\.\s/\./g; # "J. Physics" => "J.Physics"
222	1					2	$Text =~ s/\.\(/\. \(/g;
223
224	1					3	$Text =~ s/^\W+//;
225	1					5	$Text =~ s/[^\w.]+$//;
226
227							# remove anything in brackets at the end.
228	1					2	$Text =~ s/\s*\([^)]+$//; # e.g. R. Ram, J. Phys. (Paris
229	1					3	$Text =~ s/^[^(]+\)\s*//; # e.g. a) R. Ram, J. Phys. 10, 120, 1998
230
231							# unify cases
232							#($Text = lc($Text)) && ($Text =~ s/\b(\w)/\U$1/g);
233	1					2	$Text = uc($Text);
234	1					5	return $Text;
235							};
236
237
238							sub num_of_figures {
239	1			1	0	2	my($Text) = @_;
240	1					3	my($N, @Nlist);
241	1					2	$N = 0;
242	1					2	@Nlist =();
243
244	1					3	$Text = normalisation($Text);
245							# e.g. "p. 24-26" regarded as one number.
246							# ignore 'N = 2' kind of equations, and
247							# ignore '25th' kinds (e.g. Proc. 25th ICRC).
248							# ignore 'protein Aquaporin-1 in ...'
249							# ignore 'hep-th/9901001'
250							# ignore ' ... 1.55, ...'
251	1					15	while ($Text =~ /(?:^\|\b)[a-z](\d+)([a-z])
252							(?:-[a-z]\d+[a-z])*
253							(?:\b\|$)/gix) {
254	3	50				9	next if ($2 =~ /^th$/i);
255	3	50				11	next if ($' =~ /^\.\d+/);
256	3	50				9	next if ($` =~ /\d+\.$/);
257	3	50				11	next if ($` =~ /[=<>\/-]\s*$/);
258	3					36	push(@Nlist, $1);
259							};
260	1					2	$N = scalar(@Nlist);
261	1					5	return $N
262							}
263
264
265							sub remove_extra_spc {
266	0			0	0		my($Text) = @_;
267	0						$Text =~ s/^\s+//;
268	0						$Text =~ s/\s+$//;
269	0						$Text =~ s/\s\s+/ /g;
270	0						return $Text
271							};
272
273							1;
274
275							__END__