File Coverage

blib/lib/Biblio/bp/lib/bp-p-utils.pl
Criterion Covered Total %
statement 35 248 14.1
branch 16 140 11.4
condition 0 33 0.0
subroutine 2 10 20.0
pod n/a
total 53 431 12.3


line stmt bran cond sub pod time code
1             #
2             # bibliography package for Perl
3             #
4             # utility subroutines
5             #
6             # Dana Jacobsen (dana@acm.org)
7             # 11 January 1995
8             #
9              
10             package bp_util;
11              
12             ######
13              
14             $opt_complex = 1;
15              
16             # The global key registry.
17             %glb_keyreg = ();
18              
19             #
20             # mname_to_canon takes a name string and returns it back as a Canonical name.
21             #
22             # Example input:
23             #
24             # John von Jones, Jr., Ed Krol, Ludwig von Beethoven
25             #
26             # output:
27             #
28             # Jones,von,John,Jr./Krol,Ed,/Beethoven,von,Ludwig,
29             #
30             # (the actual seperators are $cs_sep for '/' and $cs_sep2 for ',')
31             #
32             # This is a total heuristic hack, and if you know where names are split,
33             # use multiple calls to name_to_canon instead. Use this routine if you
34             # expect the input to be some sort of free-form such that you can't
35             # easily seperate the names yourself.
36             #
37             # This routine assumes there can be multiple authors per line, seperated by
38             # "and" or commas, and it's going to try to guess how to break them up,
39             # given that it can get "name1, name2, jr, name3" as a 3 name string with
40             # "name2, jr" as the second name. This method precludes the ability to
41             # also correctly parse "last, first" format strings. If that is the format
42             # your string is in, call the function with a "1" as the second argument.
43             #
44             # Note that no-break-space ("tie", ~ in TeX, \0 in troff) is \240.
45             #
46             sub mname_to_canon {
47 0     0   0 local($allnames, $revauthor) = @_;
48 0         0 local($firstn, $vonn, $lastn, $jrn);
49 0         0 local(@names, $name, $oname, $nname, $rest);
50 0         0 local(@cnames) = ();
51            
52             # Squeeze all spaces into one space.
53 0         0 $allnames =~ s/\s+/ /g;
54             # remove any beginning and trailing ands.
55 0         0 $allnames =~ s/^and //;
56 0         0 $allnames =~ s/ and$//;
57              
58 0         0 @names = split(/,? and /, $allnames);
59 0         0 while (@names) {
60 0         0 $oname = $name = shift @names;
61 0         0 $firstn = $vonn = $lastn = $jrn = '';
62             # name has no spaces at beginning or end
63              
64             # squeeze all spaces around commas. They aren't telling us anything that
65             # we can rely on, and it simplifies matching. Also combine them.
66 0         0 $name =~ s/,+/,/g;
67 0         0 $name =~ s/ ,/,/g;
68 0         0 $name =~ s/, /,/g;
69              
70 0 0 0     0 if ( $revauthor && ($name =~ /,/) ) {
71 0 0       0 if ($name =~ s/[, ]+([sj]r\.?|I+)$//i) {
72 0         0 $jrn = ",$1";
73             }
74 0         0 $name =~ s/^(.*),(.*)/$2 $1$jrn/g;
75             # name has no spaces at beg or end
76             }
77              
78 0         0 $name =~ s/[ \240]+([sj]r\.?|\(?edi?t?o?r?s?\.?\)?|I+)(,|$)/,$1/i;
79 0         0 ($nname, $rest, $jrn) = split(/,([^\240])/, $name, 2);
80 0 0       0 $jrn = (defined $jrn) ? "$rest$jrn" : '';
81             #$jrn =~ s/,+$//;
82             # nname has no spaces at beg or end.
83             # jrn has no spaces at beg or end.
84 0 0       0 if ($jrn =~ / /) {
85 0         0 ($jrn, $rest) = $jrn =~ /([sj]r\.?|\(?edi?t?o?r?s?\.?\)?|I+)?,?(.*)$/i;
86 0 0       0 unshift(@names, $rest) if defined $rest;
87 0 0       0 $jrn = '' unless defined $jrn;
88             }
89 0         0 ($firstn) = $nname =~ /^((\S* )*)/;
90 0         0 $nname = substr($nname, length($firstn));
91             # nname has no spaces at beg or end.
92 0         0 $lastn = $nname;
93 0         0 $lastn =~ s/\240+/ /g;
94 0         0 $firstn =~ s/\240+/ /g;
95 0         0 $jrn =~ s/\240+/ /g;
96 0         0 while ($firstn =~ / ([a-z]+ )$/) {
97 0         0 $rest = $1;
98 0         0 substr($vonn, 0, 0) = $rest;
99             # XXXXX removed " - 1" from position argument
100 0         0 substr($firstn, length($firstn) - length($rest)) = '';
101             }
102 0         0 while ($lastn =~ /^([a-z]+ )/) {
103 0         0 $rest = $1;
104 0         0 $vonn .= $rest;
105 0         0 $lastn = substr($lastn, length($rest));
106             }
107 0         0 $vonn =~ s/\s+$//;
108 0         0 $firstn =~ s/\s+$//;
109             #print STDERR ":$vonn:$lastn:$firstn:$jrn:\n";
110              
111 0 0       0 if ($jrn) {
112 0 0       0 if ($jrn =~ /^(et\.? ?al\.?)|(others)$/i) {
113 0         0 $jrn = '';
114 0         0 unshift(@names, "et al.");
115             }
116 0 0       0 if ($jrn =~ /^inc[\.]?$/i) {
117 0         0 $lastn .= ", " . $jrn;
118 0         0 $jrn = '';
119             }
120             }
121 0 0       0 if ($lastn =~ /^(et ?al)|(others)$/i) {
122 0         0 $lastn = "et al.";
123             }
124              
125 0         0 push( @cnames, join($bib'cs_sep2, $lastn, $vonn, $firstn, $jrn) );
126             }
127              
128 0         0 $name = join( $bib'cs_sep, @cnames );
129 0         0 $name =~ s/\s+$//;
130 0         0 $name =~ s/\s+/ /g;
131              
132             # remove any spaces before and after parts of names.
133 0         0 1 while $name =~ s/ ${bib'cs_sep2}/${bib'cs_sep2}/go;
134 0         0 1 while $name =~ s/${bib'cs_sep2} /${bib'cs_sep2}/go;
135              
136 0         0 $name;
137             }
138              
139             #########
140              
141             #
142             # name_to_canon takes a _single_ name and returns it back as a Canonical name.
143             #
144             # This will be faster than mname_to_canon. I also wrote it for bp, and
145             # mname_to_canon is full of weird TeX things from r2b.
146             #
147             # Note that there are a few differences between the two. Notably, that
148             # we only break out a von if it is space seperated -- a nbsp (tie) will
149             # prevent us from breaking it. Note that nbsp => \240.
150             #
151              
152             sub name_to_canon {
153 0     0   0 local($name, $revauthor) = @_;
154 0         0 local($first, $last, $von, $jrn);
155              
156 0 0       0 &bib'panic("name_to_canon called with no arguments") unless defined $name;
157              
158 0         0 $name =~ s/\s+/ /g;
159 0         0 $name =~ s/ $//;
160 0         0 $von = ''; $jrn = '';
  0         0  
161              
162 0 0       0 if ($name =~ s/[, ]+([sj]r\.?|I+)$//i) {
163 0         0 $jrn = $1;
164             }
165             # name has no space at end
166             # jrn has no space at beg or end
167 0 0 0     0 if ( $revauthor && ($name =~ /,/) ) {
168 0         0 $name =~ s/^(.*)\s*,\s*(.*)/$2 $1/g;
169             }
170             # strip off Jr., but leave "Hunt,\0Jr." alone.
171 0 0 0     0 if (($name =~ /,/) && ($name !~ /,\240/) ) {
172             # XXXXX Check the logic here
173 0 0       0 if (!$revauthor) {
174 0 0       0 if ($jrn) {
175             # possibly reversed?
176 0         0 local($newname) = &name_to_canon($name, 'reverse');
177 0 0       0 if (defined $newname) {
178 0         0 &bib'gotwarn("Names are in reverse order?");
179 0         0 return $newname;
180             } else {
181 0         0 &bib'goterror("name_to_canon already got jr!");
182             }
183             } else {
184 0 0       0 &bib'goterror("Names seem to be reversed!") if $jrn;
185             }
186             }
187 0         0 ($name, $jrn) = split(/ ?, ?/, $name, 2);
188             }
189 0 0       0 if ($name =~ / /) {
190 0         0 ($first, $last) = $name =~ /(.*) (\S*)$/;
191             } else {
192 0         0 $first = '';
193 0         0 $last = $name;
194             }
195 0 0       0 if ($first =~ / ([a-z].*)$/) {
196 0         0 $von = $1;
197 0         0 $von =~ s/\240/ /g;
198 0         0 substr($first, length($first)-length($von)-1) = '';
199             #$first =~ s/ $von//;
200             }
201 0         0 while ($last =~ /^([a-z]+)\240/) {
202 0         0 $von .= " $1";
203 0         0 substr($last, 0, length($1)+1) = '';
204             }
205 0         0 $von =~ s/^ //;
206 0         0 $last =~ s/\240/ /g;
207              
208             #print STDERR ":$last:$von:$first:$jrn:\n";
209              
210 0         0 $name = join( $bib'cs_sep2, $last, $von, $first, $jrn);
211              
212 0         0 $name =~ s/\s+$//;
213 0         0 $name =~ s/\s+/ /g;
214             # remove spaces before and after seperators.
215 0         0 1 while $name =~ s/ ${bib'cs_sep2}/${bib'cs_sep2}/go;
216 0         0 1 while $name =~ s/${bib'cs_sep2} /${bib'cs_sep2}/go;
217              
218 0 0       0 if ($opt_complex > 1) {
219 0         0 ($last, $von, $first, $jrn) = split($bib'cs_sep2, $name);
220             # Look for corporations
221 0 0       0 if ($jrn =~ /^Inc\.$/i) {
222 0         0 $jrn = '';
223 0         0 $last = $last . ", Inc.";
224             }
225             # put it back together
226 0         0 $name = join( $bib'cs_sep2, $last, $von, $first, $jrn);
227             }
228              
229 0         0 $name;
230             }
231              
232             # This routine turns a name string (possibly containing multiple names) in
233             # canon format into a string suitable for output.
234             #
235             # The styles supported are:
236             #
237             # bibtex First von Last [or] von Last, First [or] von Last, Jr, First
238             #
239             # plain First von Last, Jr
240             #
241             # reverse von Last, First, Jr
242             #
243             # reverse2 Last, First von, Jr
244             #
245             # lname1 von Last, Jr, First [for first author]
246             # First von Last [for subsequesent authors]
247             #
248             # XXXXX
249             #
250             # What we should do instead is have a more general solution. We could specify
251             # names in the above sort of format, and have it parse that. But then how do
252             # we handle BibTeX, which will make decisions based on what fields exist? But
253             # for most of these, something like "FvL,J" or "vL,F,J" or "L,Fv,J" would work.
254             #
255             # Also, we really need a generic output form, that handles more subtle
256             # variations, like when to put "et al." in place of 150 names, and a different
257             # separator for the last name (", and " instead of ", "), initials, and so on.
258             #
259             # XXXXX Check out bibtex parsing. We look for a space, but we've tied all
260             # spaces already!
261              
262             sub canon_to_name {
263 120     120   249 local($cname, $how) = @_;
264 120         188 local(@names);
265 120         127 local($name);
266 120         174 local($n, $von, $last, $jr, $first);
267 120         180 local($namenum) = 0;
268              
269 120 50       232 &bib'panic("canon_to_name called with no arguments") unless defined $cname;
270 120 50       194 $how = 'bibtex' unless defined $how;
271              
272 120         407 foreach $name ( split(/$bib'cs_sep/o, $cname) ) {
273 304         318 $namenum++;
274 304         878 ($last, $von, $first, $jr) = split(/$bib'cs_sep2/o, $name, 4);
275 304         468 $last =~ s/ /\240/g;
276 304         333 $von =~ s/ /\240/g;
277 304 50       1005 if ($how =~ /^bibtex/) {
    50          
    0          
    0          
    0          
278             # Turn ties back into spaces.
279 0         0 $last =~ s/([^,])\240/$1 /g;
280 0         0 $von =~ s/\240([a-z])/ $1/g;
281             # Do the minimal amount of commas
282 0 0 0     0 if ($jr) {
    0          
283 0         0 $n = $von . ' ' . $last . ', ' . $jr . ', ' . $first;
284             } elsif ( ($last =~ /\S\s+\S/) && ($last !~ /^{.*}$/) ) {
285 0         0 $n = $von . ' ' . $last . ', ' . $first;
286             } else {
287 0         0 $n = join(' ', $first, $von, $last);
288             }
289             } elsif ($how =~ /^plain/) {
290             # plain: "First von Last, Jr" for each name
291 304         464 $n = $first;
292 304 100       531 $n .= " $von " if $von;
293 304 100       689 $n .= " $last" if $last;
294 304 50       557 $n .= ", $jr" if $jr;
295             } elsif ($how =~ /^reverse2/) {
296             # This is "Last, First von, Jr." order.
297 0         0 $n = "$last";
298 0 0 0     0 $n .= "," if ($first || $von || $jr);
      0        
299 0 0       0 $n .= " $first" if $first;
300 0 0       0 $n .= " $von" if $von;
301 0 0       0 $n .= ", $jr" if $jr;
302             } elsif ($how =~ /^reverse/) {
303             # This is "von Last, First, Jr." order.
304 0         0 $n = "$von $last";
305 0 0 0     0 $n .= ", $first" if ($first || $jr);
306 0 0       0 $n .= ", $jr" if $jr;
307             } elsif ($how =~ /^lname1/) {
308             # lname1 : First author has last name first, the rest are in normal order.
309             # Personally I hate this style, but its common in ecology.
310 0 0       0 $last .= ", $jr" if $jr;
311 0 0       0 if ($namenum == 1) {
312 0 0       0 $last = join(' ', $von, $last) if ($von);
313 0 0       0 if ($first) {
314 0         0 $n = join(', ', $last, $first);
315             } else {
316 0         0 $n = $last;
317             }
318             } else {
319 0         0 $n = join(' ', $first, $von, $last);
320             }
321             # unknown name style
322             } else {
323 0         0 return &bib'goterror("canon_to_name doesn't know form: $how");
324             }
325 304         381 $n =~ s/ \240/ /g;
326 304         463 $n =~ s/^\s+//;
327 304         671 $n =~ s/\s+$//;
328 304         927 $n =~ s/\s+/ /g;
329 304         688 push(@names, $n);
330             }
331              
332 120 50       267 if (wantarray) {
333 0         0 @names;
334             } else {
335             # They want the complete string accoring to the style they asked for.
336 120 50       572 if ($how =~ /lname1|plain/) {
337 120 100       316 if (@names <= 2) {
338 64         142 $n = join(' and ', @names);
339             } else {
340 56         103 $lname = pop(@names);
341 56         206 $n = join(', ', @names) . ', and ' . $lname;
342             }
343             } else {
344 0         0 $n = join(' and ', @names);
345             }
346 120         710 $n;
347             }
348             }
349              
350             # XXXXX Obsolete?
351              
352             sub parsename {
353 0     0   0 local($name, $how) = @_;
354              
355 0         0 &canon_to_name( &mname_to_canon($name), $how);
356             }
357              
358              
359             #########
360              
361             #
362             # parsedate takes a date and returns a list of month, year.
363             #
364             # taken from r2b
365             #
366             # date looks like month dec year
367             # -------------------------------- ------------------- -- ---------------
368             # 1984 84 1984
369             # 1974-1975 74 1974-1975
370             # August 1984 aug 84 1984
371             # May 1984 May 1984 may 84 1984
372             # 1976 November nov 76 1976
373             # 1976 November 1976 nov 76 1976
374             # 21 August 1984 {21 August} 84 1984
375             # August 18-21, 1984 {August 18-21} 84 1984
376             # 18-21 August 1991 {18-21 August} 91 1991
377             # July 31-August 4, 1984 1984 {July 31-August 4} 84 1984
378             # July-August 1980 {July-August} 80 1980
379             # February 1984 (revised May 1991) feb 84 1984
380             # Winter 1990 {Winter} 90 1990
381             # 1988 (in press) 88 1988 (in press)
382             # to appear ?? to appear
383              
384             sub parsedate {
385 0     0   0 local($date) = @_;
386 0         0 local($year) = undef;
387 0         0 local($month);
388 0         0 local($old_date) = $date;
389              
390 0 0       0 return (undef, undef) unless defined $date;
391              
392 0         0 $date =~ s/(\S+)\s+(\d+)\s+\1\s+\2/$1 $2/; # handle duplicate dates
393 0         0 $date =~ s/^\s*(\d\d\d+)\s+(\S+)/$2 $1/; # handle 1976 November
394 0         0 while ($date =~ /\s*[(]?((\d\d\d\d[-\/])?\d\d\d\d)[).]?\s*(\(.*\))?$/) {
395 0         0 $year = $1;
396 0         0 $date =~ s/,?\s*[(]?(\d\d\d\d[-\/])?\d\d\d\d[).]?\s*(\(.*\))?$//;
397             }
398              
399 0         0 $month = &canon_month($date);
400              
401 0 0 0     0 if ($month !~ /\S/) {
    0          
402 0         0 undef $month;
403             } elsif ( (!defined $year) && ($month eq $date) ) {
404 0         0 $year = $old_date;
405 0         0 undef $month;
406             }
407 0         0 ($month, $year);
408             }
409              
410             %month_table = (
411             'apr', 'April',
412             'aug', 'August',
413             'dec', 'December',
414             'feb', 'February',
415             'jan', 'January',
416             'jul', 'July',
417             'jun', 'June',
418             'mar', 'March',
419             'may', 'May',
420             'nov', 'November',
421             'oct', 'October',
422             'sep', 'September',
423             );
424              
425             sub canon_month {
426 52     52   113 local($month) = @_;
427              
428 52 100       229 return $month if $month =~ /[\d\/\-]/;
429              
430 30         83 local($canm) = substr($month, 0, 3);
431              
432 30         62 $canm =~ tr/A-Z/a-z/;
433              
434 30 50       109 return $month unless defined $month_table{$canm};
435              
436 30         248 $canm;
437             }
438              
439             sub output_month {
440 0     0     local($canm, $how) = @_;
441 0           local($outm) = $month_table{$canm};
442              
443             # we don't know what they have
444 0 0         return $canm unless defined $outm;
445              
446 0 0 0       if ( ($how eq 'short') && (length($outm) > 4) ) {
447 0           substr($outm, 3) = '.';
448             }
449              
450             # 'long' format
451 0           $outm;
452             }
453              
454             sub output_date {
455 0     0     local($mo, $yr, $how) = @_;
456 0           local($date);
457              
458 0 0         $how = 'short' unless defined $how;
459              
460 0 0         if (defined $mo) {
461 0           $mo = &bp_util'output_month($mo, $how);
462 0 0         if (defined $yr) {
463 0           $date = "$mo $yr";
464             } else {
465 0           $date = $mo;
466             }
467             } else {
468 0 0         $date = $yr if defined $yr;
469             }
470              
471 0           $date;
472             }
473              
474             #
475             # Generates a key for a canonical record.
476             #
477             # XXXXX This should take an option string and parse it to generate a key.
478             #
479              
480             sub genkey {
481 0     0     local(%cent) = @_;
482 0           local($key, $keytype, $sy);
483              
484             # first pick out the field we're going to use
485             GETKEY: {
486 0           defined $cent{'Authors'} && do
487 0 0         { $keytype = 'author'; $key = $cent{'Authors'}; last GETKEY; };
  0            
  0            
  0            
488             defined $cent{'CorpAuthor'} && do
489 0 0         { $keytype = 'org'; $key = $cent{'CorpAuthor'}; last GETKEY; };
  0            
  0            
  0            
490             defined $cent{'Editors'} && do
491 0 0         { $keytype = 'author'; $key = $cent{'Editors'}; last GETKEY; };
  0            
  0            
  0            
492             defined $cent{'Publisher'} && do
493 0 0         { $keytype = 'org'; $key = $cent{'Publisher'}; last GETKEY; };
  0            
  0            
  0            
494             defined $cent{'Organization'} && do
495 0 0         { $keytype = 'org'; $key = $cent{'Organization'}; last GETKEY; };
  0            
  0            
  0            
496             # nothing defined
497 0           $keytype = 'text'; $key = "Anonymous";
  0            
498             }
499              
500             # next we want to reduce the name to a reasonable key
501              
502             #print STDERR "$key -> ";
503              
504 0 0         if ($keytype eq 'author') {
    0          
505             # # turn "Stephen van Rensselaer, Jr." into "vanRensselaerJr".
506             # #$key =~ s/^([^\/]*)\/([^\/]*)\/([^\/]*)\/([^\|]*).*/$2$1$4/;
507             # # turn "Stephen van Rensselaer, Jr." into "Rensselaer"
508             # #$key =~ s/^([^\/]*)\/.*/$1/;
509             # Remove everything past the first seperator
510 0           local($split_sep) = index($key, $bib'cs_sep2);
511 0 0         substr($key, $split_sep) = '' if $split_sep >= $[;
512             } elsif ($keytype eq 'org') {
513 0           $key =~ s/^(\S*).*/$1/;
514             } else {
515             # text
516             }
517             #print STDERR "$key -> ";
518 0           $key = &bib'nocharset($key);
519             #print STDERR "$key -> ";
520 0           $key =~ tr/A-Za-z0-9\/\-//cd;
521              
522             # reduce it to fit normal lengths
523 0 0         substr($key, 14) = '' if length($key) > 14;
524              
525             # Now find the year
526 0 0 0       if ( (defined $cent{'Year'}) && ($cent{'Year'} =~ /(\d\d\d\d)/) ) {
    0 0        
527 0           $sy = $1;
528             } elsif ( (defined $cent{'Month'}) && ($cent{'Month'} =~ /(\d\d\d\d)/) ) {
529 0           $sy = $1;
530             } else {
531 0           $sy = "????";
532             }
533             # We lop off the century part
534 0           substr($sy, 0, 2) = '';
535              
536             # and add on the shortyear to the end of our key
537 0           $key .= $sy;
538              
539 0           $key;
540             }
541              
542             #
543             # Register a key in our global key registry, returning the possibly changed
544             # key. All this does is maintain a registry of keys, and if there is already
545             # a key that matches, it adds letters from a -> z -> aa -> az -> ba -> bz -> ...
546             # to the end of the key. A format uses these routines with something like:
547             #
548             # $can{'CiteKey'} = &bp_util'genkey(%can) unless defined $can{'CiteKey'};
549             # $can{'CiteKey'} = &bp_util'regkey($can{'CiteKey'});
550             #
551             # in it's fromcanon routines. This generates a key if necessary, and then
552             # registers it. A format may wish to do its own key generation, or even
553             # throw out the citekey it was given and make a new one, so generation and
554             # registration are seperate routines.
555             #
556             # It is recommended that keys be registered here rather than in the format, as
557             # we would like one registry even for multiple formats.
558             #
559             # XXXXX is this necessary? This goes to an output routine after all. As long
560             # as they register them all, or none, do we care?
561             #
562              
563             sub regkey {
564 0     0     local($key) = @_;
565 0           local($rkey, $nextkey, $rkeylen);
566              
567 0           $rkey = $key;
568 0           $rkey =~ tr/A-Z/a-z/;
569 0           $rkeylen = length($rkey);
570              
571 0 0         if (defined $glb_keyreg{$rkey}) {
572 0           $nextkey = $key . 'a';
573 0           while (defined $glb_keyreg{$nextkey}) {
574             # increment the characters after the key, 'z'+1 -> 'aa'.
575 0           substr($nextkey, $rkeylen)++;
576             }
577             # going to put ourselves in $nextkey
578 0           $glb_keyreg{$nextkey} = 1;
579             # key has changed, so update it for the output.
580 0           $key .= substr($nextkey, $rkeylen);
581             } else {
582 0           $glb_keyreg{$rkey} = 1;
583             # key is unchanged
584             }
585              
586 0           $key;
587             }
588              
589             #######################
590             # end of package
591             #######################
592              
593             1;