File Coverage

blib/lib/Biblio/bp/lib/bp-p-utils.pl

Criterion	Covered	Total	%
statement	35	248	14.1
branch	16	140	11.4
condition	0	33	0.0
subroutine	2	10	20.0
pod			n/a
total	53	431	12.3

line	stmt	bran	cond	sub	time	code
1						#
2						# bibliography package for Perl
3						#
4						# utility subroutines
5						#
6						# Dana Jacobsen (dana@acm.org)
7						# 11 January 1995
8						#
9
10						package bp_util;
11
12						######
13
14						$opt_complex = 1;
15
16						# The global key registry.
17						%glb_keyreg = ();
18
19						#
20						# mname_to_canon takes a name string and returns it back as a Canonical name.
21						#
22						# Example input:
23						#
24						# John von Jones, Jr., Ed Krol, Ludwig von Beethoven
25						#
26						# output:
27						#
28						# Jones,von,John,Jr./Krol,Ed,/Beethoven,von,Ludwig,
29						#
30						# (the actual seperators are $cs_sep for '/' and $cs_sep2 for ',')
31						#
32						# This is a total heuristic hack, and if you know where names are split,
33						# use multiple calls to name_to_canon instead. Use this routine if you
34						# expect the input to be some sort of free-form such that you can't
35						# easily seperate the names yourself.
36						#
37						# This routine assumes there can be multiple authors per line, seperated by
38						# "and" or commas, and it's going to try to guess how to break them up,
39						# given that it can get "name1, name2, jr, name3" as a 3 name string with
40						# "name2, jr" as the second name. This method precludes the ability to
41						# also correctly parse "last, first" format strings. If that is the format
42						# your string is in, call the function with a "1" as the second argument.
43						#
44						# Note that no-break-space ("tie", ~ in TeX, \0 in troff) is \240.
45						#
46						sub mname_to_canon {
47	0			0	0	local($allnames, $revauthor) = @_;
48	0				0	local($firstn, $vonn, $lastn, $jrn);
49	0				0	local(@names, $name, $oname, $nname, $rest);
50	0				0	local(@cnames) = ();
51
52						# Squeeze all spaces into one space.
53	0				0	$allnames =~ s/\s+/ /g;
54						# remove any beginning and trailing ands.
55	0				0	$allnames =~ s/^and //;
56	0				0	$allnames =~ s/ and$//;
57
58	0				0	@names = split(/,? and /, $allnames);
59	0				0	while (@names) {
60	0				0	$oname = $name = shift @names;
61	0				0	$firstn = $vonn = $lastn = $jrn = '';
62						# name has no spaces at beginning or end
63
64						# squeeze all spaces around commas. They aren't telling us anything that
65						# we can rely on, and it simplifies matching. Also combine them.
66	0				0	$name =~ s/,+/,/g;
67	0				0	$name =~ s/ ,/,/g;
68	0				0	$name =~ s/, /,/g;
69
70	0	0	0		0	if ( $revauthor && ($name =~ /,/) ) {
71	0	0			0	if ($name =~ s/[, ]+([sj]r\.?\|I+)$//i) {
72	0				0	$jrn = ",$1";
73						}
74	0				0	$name =~ s/^(.),(.)/$2 $1$jrn/g;
75						# name has no spaces at beg or end
76						}
77
78	0				0	$name =~ s/[ \240]+([sj]r\.?\|$?edi?t?o?r?s?\.?$?\|I+)(,\|$)/,$1/i;
79	0				0	($nname, $rest, $jrn) = split(/,([^\240])/, $name, 2);
80	0	0			0	$jrn = (defined $jrn) ? "$rest$jrn" : '';
81						#$jrn =~ s/,+$//;
82						# nname has no spaces at beg or end.
83						# jrn has no spaces at beg or end.
84	0	0			0	if ($jrn =~ / /) {
85	0				0	($jrn, $rest) = $jrn =~ /([sj]r\.?\|$?edi?t?o?r?s?\.?$?\|I+)?,?(.*)$/i;
86	0	0			0	unshift(@names, $rest) if defined $rest;
87	0	0			0	$jrn = '' unless defined $jrn;
88						}
89	0				0	($firstn) = $nname =~ /^((\S* )*)/;
90	0				0	$nname = substr($nname, length($firstn));
91						# nname has no spaces at beg or end.
92	0				0	$lastn = $nname;
93	0				0	$lastn =~ s/\240+/ /g;
94	0				0	$firstn =~ s/\240+/ /g;
95	0				0	$jrn =~ s/\240+/ /g;
96	0				0	while ($firstn =~ / ([a-z]+ )$/) {
97	0				0	$rest = $1;
98	0				0	substr($vonn, 0, 0) = $rest;
99						# XXXXX removed " - 1" from position argument
100	0				0	substr($firstn, length($firstn) - length($rest)) = '';
101						}
102	0				0	while ($lastn =~ /^([a-z]+ )/) {
103	0				0	$rest = $1;
104	0				0	$vonn .= $rest;
105	0				0	$lastn = substr($lastn, length($rest));
106						}
107	0				0	$vonn =~ s/\s+$//;
108	0				0	$firstn =~ s/\s+$//;
109						#print STDERR ":$vonn:$lastn:$firstn:$jrn:\n";
110
111	0	0			0	if ($jrn) {
112	0	0			0	if ($jrn =~ /^(et\.? ?al\.?)\|(others)$/i) {
113	0				0	$jrn = '';
114	0				0	unshift(@names, "et al.");
115						}
116	0	0			0	if ($jrn =~ /^inc[\.]?$/i) {
117	0				0	$lastn .= ", " . $jrn;
118	0				0	$jrn = '';
119						}
120						}
121	0	0			0	if ($lastn =~ /^(et ?al)\|(others)$/i) {
122	0				0	$lastn = "et al.";
123						}
124
125	0				0	push( @cnames, join($bib'cs_sep2, $lastn, $vonn, $firstn, $jrn) );
126						}
127
128	0				0	$name = join( $bib'cs_sep, @cnames );
129	0				0	$name =~ s/\s+$//;
130	0				0	$name =~ s/\s+/ /g;
131
132						# remove any spaces before and after parts of names.
133	0				0	1 while $name =~ s/ ${bib'cs_sep2}/${bib'cs_sep2}/go;
134	0				0	1 while $name =~ s/${bib'cs_sep2} /${bib'cs_sep2}/go;
135
136	0				0	$name;
137						}
138
139						#########
140
141						#
142						# name_to_canon takes a _single_ name and returns it back as a Canonical name.
143						#
144						# This will be faster than mname_to_canon. I also wrote it for bp, and
145						# mname_to_canon is full of weird TeX things from r2b.
146						#
147						# Note that there are a few differences between the two. Notably, that
148						# we only break out a von if it is space seperated -- a nbsp (tie) will
149						# prevent us from breaking it. Note that nbsp => \240.
150						#
151
152						sub name_to_canon {
153	0			0	0	local($name, $revauthor) = @_;
154	0				0	local($first, $last, $von, $jrn);
155
156	0	0			0	&bib'panic("name_to_canon called with no arguments") unless defined $name;
157
158	0				0	$name =~ s/\s+/ /g;
159	0				0	$name =~ s/ $//;
160	0				0	$von = ''; $jrn = '';
	0				0
161
162	0	0			0	if ($name =~ s/[, ]+([sj]r\.?\|I+)$//i) {
163	0				0	$jrn = $1;
164						}
165						# name has no space at end
166						# jrn has no space at beg or end
167	0	0	0		0	if ( $revauthor && ($name =~ /,/) ) {
168	0				0	$name =~ s/^(.)\s,\s(.)/$2 $1/g;
169						}
170						# strip off Jr., but leave "Hunt,\0Jr." alone.
171	0	0	0		0	if (($name =~ /,/) && ($name !~ /,\240/) ) {
172						# XXXXX Check the logic here
173	0	0			0	if (!$revauthor) {
174	0	0			0	if ($jrn) {
175						# possibly reversed?
176	0				0	local($newname) = &name_to_canon($name, 'reverse');
177	0	0			0	if (defined $newname) {
178	0				0	&bib'gotwarn("Names are in reverse order?");
179	0				0	return $newname;
180						} else {
181	0				0	&bib'goterror("name_to_canon already got jr!");
182						}
183						} else {
184	0	0			0	&bib'goterror("Names seem to be reversed!") if $jrn;
185						}
186						}
187	0				0	($name, $jrn) = split(/ ?, ?/, $name, 2);
188						}
189	0	0			0	if ($name =~ / /) {
190	0				0	($first, $last) = $name =~ /(.) (\S)$/;
191						} else {
192	0				0	$first = '';
193	0				0	$last = $name;
194						}
195	0	0			0	if ($first =~ / ([a-z].*)$/) {
196	0				0	$von = $1;
197	0				0	$von =~ s/\240/ /g;
198	0				0	substr($first, length($first)-length($von)-1) = '';
199						#$first =~ s/ $von//;
200						}
201	0				0	while ($last =~ /^([a-z]+)\240/) {
202	0				0	$von .= " $1";
203	0				0	substr($last, 0, length($1)+1) = '';
204						}
205	0				0	$von =~ s/^ //;
206	0				0	$last =~ s/\240/ /g;
207
208						#print STDERR ":$last:$von:$first:$jrn:\n";
209
210	0				0	$name = join( $bib'cs_sep2, $last, $von, $first, $jrn);
211
212	0				0	$name =~ s/\s+$//;
213	0				0	$name =~ s/\s+/ /g;
214						# remove spaces before and after seperators.
215	0				0	1 while $name =~ s/ ${bib'cs_sep2}/${bib'cs_sep2}/go;
216	0				0	1 while $name =~ s/${bib'cs_sep2} /${bib'cs_sep2}/go;
217
218	0	0			0	if ($opt_complex > 1) {
219	0				0	($last, $von, $first, $jrn) = split($bib'cs_sep2, $name);
220						# Look for corporations
221	0	0			0	if ($jrn =~ /^Inc\.$/i) {
222	0				0	$jrn = '';
223	0				0	$last = $last . ", Inc.";
224						}
225						# put it back together
226	0				0	$name = join( $bib'cs_sep2, $last, $von, $first, $jrn);
227						}
228
229	0				0	$name;
230						}
231
232						# This routine turns a name string (possibly containing multiple names) in
233						# canon format into a string suitable for output.
234						#
235						# The styles supported are:
236						#
237						# bibtex First von Last [or] von Last, First [or] von Last, Jr, First
238						#
239						# plain First von Last, Jr
240						#
241						# reverse von Last, First, Jr
242						#
243						# reverse2 Last, First von, Jr
244						#
245						# lname1 von Last, Jr, First [for first author]
246						# First von Last [for subsequesent authors]
247						#
248						# XXXXX
249						#
250						# What we should do instead is have a more general solution. We could specify
251						# names in the above sort of format, and have it parse that. But then how do
252						# we handle BibTeX, which will make decisions based on what fields exist? But
253						# for most of these, something like "FvL,J" or "vL,F,J" or "L,Fv,J" would work.
254						#
255						# Also, we really need a generic output form, that handles more subtle
256						# variations, like when to put "et al." in place of 150 names, and a different
257						# separator for the last name (", and " instead of ", "), initials, and so on.
258						#
259						# XXXXX Check out bibtex parsing. We look for a space, but we've tied all
260						# spaces already!
261
262						sub canon_to_name {
263	120			120	249	local($cname, $how) = @_;
264	120				188	local(@names);
265	120				127	local($name);
266	120				174	local($n, $von, $last, $jr, $first);
267	120				180	local($namenum) = 0;
268
269	120	50			232	&bib'panic("canon_to_name called with no arguments") unless defined $cname;
270	120	50			194	$how = 'bibtex' unless defined $how;
271
272	120				407	foreach $name ( split(/$bib'cs_sep/o, $cname) ) {
273	304				318	$namenum++;
274	304				878	($last, $von, $first, $jr) = split(/$bib'cs_sep2/o, $name, 4);
275	304				468	$last =~ s/ /\240/g;
276	304				333	$von =~ s/ /\240/g;
277	304	50			1005	if ($how =~ /^bibtex/) {
		50
		0
		0
		0
278						# Turn ties back into spaces.
279	0				0	$last =~ s/([^,])\240/$1 /g;
280	0				0	$von =~ s/\240([a-z])/ $1/g;
281						# Do the minimal amount of commas
282	0	0	0		0	if ($jr) {
		0
283	0				0	$n = $von . ' ' . $last . ', ' . $jr . ', ' . $first;
284						} elsif ( ($last =~ /\S\s+\S/) && ($last !~ /^{.*}$/) ) {
285	0				0	$n = $von . ' ' . $last . ', ' . $first;
286						} else {
287	0				0	$n = join(' ', $first, $von, $last);
288						}
289						} elsif ($how =~ /^plain/) {
290						# plain: "First von Last, Jr" for each name
291	304				464	$n = $first;
292	304	100			531	$n .= " $von " if $von;
293	304	100			689	$n .= " $last" if $last;
294	304	50			557	$n .= ", $jr" if $jr;
295						} elsif ($how =~ /^reverse2/) {
296						# This is "Last, First von, Jr." order.
297	0				0	$n = "$last";
298	0	0	0		0	$n .= "," if ($first \|\| $von \|\| $jr);
			0
299	0	0			0	$n .= " $first" if $first;
300	0	0			0	$n .= " $von" if $von;
301	0	0			0	$n .= ", $jr" if $jr;
302						} elsif ($how =~ /^reverse/) {
303						# This is "von Last, First, Jr." order.
304	0				0	$n = "$von $last";
305	0	0	0		0	$n .= ", $first" if ($first \|\| $jr);
306	0	0			0	$n .= ", $jr" if $jr;
307						} elsif ($how =~ /^lname1/) {
308						# lname1 : First author has last name first, the rest are in normal order.
309						# Personally I hate this style, but its common in ecology.
310	0	0			0	$last .= ", $jr" if $jr;
311	0	0			0	if ($namenum == 1) {
312	0	0			0	$last = join(' ', $von, $last) if ($von);
313	0	0			0	if ($first) {
314	0				0	$n = join(', ', $last, $first);
315						} else {
316	0				0	$n = $last;
317						}
318						} else {
319	0				0	$n = join(' ', $first, $von, $last);
320						}
321						# unknown name style
322						} else {
323	0				0	return &bib'goterror("canon_to_name doesn't know form: $how");
324						}
325	304				381	$n =~ s/ \240/ /g;
326	304				463	$n =~ s/^\s+//;
327	304				671	$n =~ s/\s+$//;
328	304				927	$n =~ s/\s+/ /g;
329	304				688	push(@names, $n);
330						}
331
332	120	50			267	if (wantarray) {
333	0				0	@names;
334						} else {
335						# They want the complete string accoring to the style they asked for.
336	120	50			572	if ($how =~ /lname1\|plain/) {
337	120	100			316	if (@names <= 2) {
338	64				142	$n = join(' and ', @names);
339						} else {
340	56				103	$lname = pop(@names);
341	56				206	$n = join(', ', @names) . ', and ' . $lname;
342						}
343						} else {
344	0				0	$n = join(' and ', @names);
345						}
346	120				710	$n;
347						}
348						}
349
350						# XXXXX Obsolete?
351
352						sub parsename {
353	0			0	0	local($name, $how) = @_;
354
355	0				0	&canon_to_name( &mname_to_canon($name), $how);
356						}
357
358
359						#########
360
361						#
362						# parsedate takes a date and returns a list of month, year.
363						#
364						# taken from r2b
365						#
366						# date looks like month dec year
367						# -------------------------------- ------------------- -- ---------------
368						# 1984 84 1984
369						# 1974-1975 74 1974-1975
370						# August 1984 aug 84 1984
371						# May 1984 May 1984 may 84 1984
372						# 1976 November nov 76 1976
373						# 1976 November 1976 nov 76 1976
374						# 21 August 1984 {21 August} 84 1984
375						# August 18-21, 1984 {August 18-21} 84 1984
376						# 18-21 August 1991 {18-21 August} 91 1991
377						# July 31-August 4, 1984 1984 {July 31-August 4} 84 1984
378						# July-August 1980 {July-August} 80 1980
379						# February 1984 (revised May 1991) feb 84 1984
380						# Winter 1990 {Winter} 90 1990
381						# 1988 (in press) 88 1988 (in press)
382						# to appear ?? to appear
383
384						sub parsedate {
385	0			0	0	local($date) = @_;
386	0				0	local($year) = undef;
387	0				0	local($month);
388	0				0	local($old_date) = $date;
389
390	0	0			0	return (undef, undef) unless defined $date;
391
392	0				0	$date =~ s/(\S+)\s+(\d+)\s+\1\s+\2/$1 $2/; # handle duplicate dates
393	0				0	$date =~ s/^\s*(\d\d\d+)\s+(\S+)/$2 $1/; # handle 1976 November
394	0				0	while ($date =~ /\s[(]?((\d\d\d\d[-\/])?\d\d\d\d)[).]?\s($.*$)?$/) {
395	0				0	$year = $1;
396	0				0	$date =~ s/,?\s[(]?(\d\d\d\d[-\/])?\d\d\d\d[).]?\s($.*$)?$//;
397						}
398
399	0				0	$month = &canon_month($date);
400
401	0	0	0		0	if ($month !~ /\S/) {
		0
402	0				0	undef $month;
403						} elsif ( (!defined $year) && ($month eq $date) ) {
404	0				0	$year = $old_date;
405	0				0	undef $month;
406						}
407	0				0	($month, $year);
408						}
409
410						%month_table = (
411						'apr', 'April',
412						'aug', 'August',
413						'dec', 'December',
414						'feb', 'February',
415						'jan', 'January',
416						'jul', 'July',
417						'jun', 'June',
418						'mar', 'March',
419						'may', 'May',
420						'nov', 'November',
421						'oct', 'October',
422						'sep', 'September',
423						);
424
425						sub canon_month {
426	52			52	113	local($month) = @_;
427
428	52	100			229	return $month if $month =~ /[\d\/\-]/;
429
430	30				83	local($canm) = substr($month, 0, 3);
431
432	30				62	$canm =~ tr/A-Z/a-z/;
433
434	30	50			109	return $month unless defined $month_table{$canm};
435
436	30				248	$canm;
437						}
438
439						sub output_month {
440	0			0		local($canm, $how) = @_;
441	0					local($outm) = $month_table{$canm};
442
443						# we don't know what they have
444	0	0				return $canm unless defined $outm;
445
446	0	0	0			if ( ($how eq 'short') && (length($outm) > 4) ) {
447	0					substr($outm, 3) = '.';
448						}
449
450						# 'long' format
451	0					$outm;
452						}
453
454						sub output_date {
455	0			0		local($mo, $yr, $how) = @_;
456	0					local($date);
457
458	0	0				$how = 'short' unless defined $how;
459
460	0	0				if (defined $mo) {
461	0					$mo = &bp_util'output_month($mo, $how);
462	0	0				if (defined $yr) {
463	0					$date = "$mo $yr";
464						} else {
465	0					$date = $mo;
466						}
467						} else {
468	0	0				$date = $yr if defined $yr;
469						}
470
471	0					$date;
472						}
473
474						#
475						# Generates a key for a canonical record.
476						#
477						# XXXXX This should take an option string and parse it to generate a key.
478						#
479
480						sub genkey {
481	0			0		local(%cent) = @_;
482	0					local($key, $keytype, $sy);
483
484						# first pick out the field we're going to use
485						GETKEY: {
486	0					defined $cent{'Authors'} && do
487	0	0				{ $keytype = 'author'; $key = $cent{'Authors'}; last GETKEY; };
	0
	0
	0
488						defined $cent{'CorpAuthor'} && do
489	0	0				{ $keytype = 'org'; $key = $cent{'CorpAuthor'}; last GETKEY; };
	0
	0
	0
490						defined $cent{'Editors'} && do
491	0	0				{ $keytype = 'author'; $key = $cent{'Editors'}; last GETKEY; };
	0
	0
	0
492						defined $cent{'Publisher'} && do
493	0	0				{ $keytype = 'org'; $key = $cent{'Publisher'}; last GETKEY; };
	0
	0
	0
494						defined $cent{'Organization'} && do
495	0	0				{ $keytype = 'org'; $key = $cent{'Organization'}; last GETKEY; };
	0
	0
	0
496						# nothing defined
497	0					$keytype = 'text'; $key = "Anonymous";
	0
498						}
499
500						# next we want to reduce the name to a reasonable key
501
502						#print STDERR "$key -> ";
503
504	0	0				if ($keytype eq 'author') {
		0
505						# # turn "Stephen van Rensselaer, Jr." into "vanRensselaerJr".
506						# #$key =~ s/^([^\/])\/([^\/])\/([^\/])\/([^\\|]).*/$2$1$4/;
507						# # turn "Stephen van Rensselaer, Jr." into "Rensselaer"
508						# #$key =~ s/^([^\/])\/./$1/;
509						# Remove everything past the first seperator
510	0					local($split_sep) = index($key, $bib'cs_sep2);
511	0	0				substr($key, $split_sep) = '' if $split_sep >= $[;
512						} elsif ($keytype eq 'org') {
513	0					$key =~ s/^(\S)./$1/;
514						} else {
515						# text
516						}
517						#print STDERR "$key -> ";
518	0					$key = &bib'nocharset($key);
519						#print STDERR "$key -> ";
520	0					$key =~ tr/A-Za-z0-9\/\-//cd;
521
522						# reduce it to fit normal lengths
523	0	0				substr($key, 14) = '' if length($key) > 14;
524
525						# Now find the year
526	0	0	0			if ( (defined $cent{'Year'}) && ($cent{'Year'} =~ /(\d\d\d\d)/) ) {
		0	0
527	0					$sy = $1;
528						} elsif ( (defined $cent{'Month'}) && ($cent{'Month'} =~ /(\d\d\d\d)/) ) {
529	0					$sy = $1;
530						} else {
531	0					$sy = "????";
532						}
533						# We lop off the century part
534	0					substr($sy, 0, 2) = '';
535
536						# and add on the shortyear to the end of our key
537	0					$key .= $sy;
538
539	0					$key;
540						}
541
542						#
543						# Register a key in our global key registry, returning the possibly changed
544						# key. All this does is maintain a registry of keys, and if there is already
545						# a key that matches, it adds letters from a -> z -> aa -> az -> ba -> bz -> ...
546						# to the end of the key. A format uses these routines with something like:
547						#
548						# $can{'CiteKey'} = &bp_util'genkey(%can) unless defined $can{'CiteKey'};
549						# $can{'CiteKey'} = &bp_util'regkey($can{'CiteKey'});
550						#
551						# in it's fromcanon routines. This generates a key if necessary, and then
552						# registers it. A format may wish to do its own key generation, or even
553						# throw out the citekey it was given and make a new one, so generation and
554						# registration are seperate routines.
555						#
556						# It is recommended that keys be registered here rather than in the format, as
557						# we would like one registry even for multiple formats.
558						#
559						# XXXXX is this necessary? This goes to an output routine after all. As long
560						# as they register them all, or none, do we care?
561						#
562
563						sub regkey {
564	0			0		local($key) = @_;
565	0					local($rkey, $nextkey, $rkeylen);
566
567	0					$rkey = $key;
568	0					$rkey =~ tr/A-Z/a-z/;
569	0					$rkeylen = length($rkey);
570
571	0	0				if (defined $glb_keyreg{$rkey}) {
572	0					$nextkey = $key . 'a';
573	0					while (defined $glb_keyreg{$nextkey}) {
574						# increment the characters after the key, 'z'+1 -> 'aa'.
575	0					substr($nextkey, $rkeylen)++;
576						}
577						# going to put ourselves in $nextkey
578	0					$glb_keyreg{$nextkey} = 1;
579						# key has changed, so update it for the output.
580	0					$key .= substr($nextkey, $rkeylen);
581						} else {
582	0					$glb_keyreg{$rkey} = 1;
583						# key is unchanged
584						}
585
586	0					$key;
587						}
588
589						#######################
590						# end of package
591						#######################
592
593						1;