File Coverage

blib/lib/Biblio/bp/lib/bp-bibtex.pl
Criterion Covered Total %
statement 188 440 42.7
branch 88 280 31.4
condition 12 42 28.5
subroutine 7 13 53.8
pod n/a
total 295 775 38.0


line stmt bran cond sub pod time code
1             #
2             # bibliography package for Perl
3             #
4             # BibTeX routines
5             #
6             # Dana Jacobsen (dana@acm.org)
7             # 8 July 1995 (last modified 30 November 1995)
8             #
9             # 30 Nov 95: Changed the way string parsing works. It used to do a search
10             # and replace over the entire string, but some bibliographies make
11             # a string named, "el" for instance, which will just trash the
12             # entire file. So we look for _1_ unquoted string on the same
13             # line as the field name. That's a lot safer, but also won't
14             # properly handle multiple strings on a line, or strings not in
15             # the first position. Those are fairly rare though.
16             #
17             # XXXXX Parsing BibTeX is extremely difficult to do correctly in Perl. The
18             # proper thing to do in my opinion is to write a yacc parser and compile
19             # that in. Not only will it be able to handle more bizarre cases
20             # properly, but it will probably be much faster.
21              
22             package bp_bibtex;
23              
24             $version = "bibtex (dj 18 dec 96)";
25              
26             ######
27              
28             &bib'reg_format(
29             'bibtex', # name
30             'btx', # short name
31             'bp_bibtex', # package name
32             'tex', # default character set
33             'suffix is bib',
34             # our functions
35             'open is standard',
36             'close is standard',
37             'write is standard',
38             'options',
39             'read',
40             'explode',
41             'implode',
42             'tocanon',
43             'fromcanon',
44             'clear',
45             );
46              
47             ######
48              
49             # Set to 0 for handling bibclean (faster, but makes a lot of assumptions)
50             # 1 for arbitrary bibtex
51             #
52             $opt_complex = 1;
53              
54             # Set this to 0 if we want to ignore crossref fields. Handling them may use
55             # a very large amount of memory (about the same as the size of the input file).
56             #
57             $opt_crossref = 1;
58              
59             # If an entry has a field called crossrefonly with a value of "1", and it
60             # is the target of a crossref field earlier in the file, then erase it from
61             # the output. It will be used for crossref purposes, but the user will never
62             # see it from a &read() call.
63             $opt_crossrefonly = 0;
64              
65             # XXXXX Implement crossrefonly
66              
67             ######
68              
69             sub options {
70 0     0   0 local($opt) = @_;
71              
72 0 0       0 &bib'panic("bibtex options called with no arguments!") unless defined $opt;
73 0         0 &bib'debugs("parsing bibtex option '$opt'", 64);
74 0 0       0 return undef unless $opt =~ /=/;
75 0         0 local($_, $val) = split(/\s*=\s*/, $opt, 2);
76 0         0 &bib'debugs("option split: $_ = $val", 8);
77 0 0       0 /^complex$/ && do { $opt_complex = &bib'parse_num_option($val);
  0         0  
78 0         0 return 1; };
79 0 0       0 /^crossref$/ && do { $opt_crossref = &bib'parse_num_option($val);
  0         0  
80 0         0 return 1; };
81 0         0 undef;
82             }
83              
84             ######
85              
86             # This is only used for complexity 0 now.
87             $glb_eval_repl = 0;
88              
89             # Initialize the macro list with the entries from plain.bst.
90             # XXXXX At some point we need to get these from a configuration file
91             %glb_replace = (
92             'jan', 'January',
93             'feb', 'February',
94             'mar', 'March',
95             'apr', 'April',
96             'may', 'May',
97             'jun', 'June',
98             'jul', 'July',
99             'aug', 'August',
100             'sep', 'September',
101             'oct', 'October',
102             'nov', 'November',
103             'dec', 'December',
104             'acmcs', 'ACM Computing Surveys',
105             'acta', 'Acta Informatica',
106             'cacm', 'Communications of the ACM',
107             'ibmjrd', 'IBM Journal of Research and Development',
108             'ibmsj', 'IBM Systems Journal',
109             'ieeese', 'IEEE Transactions on Software Engineering',
110             'ieeetc', 'IEEE Transactions on Computers',
111             'ieeetcad', 'IEEE Transactions on Computer-Aided Design of Integrated Circuits',
112             'ipl', 'Information Processing Letters',
113             'jacm', 'Journal of the ACM',
114             'jcss', 'Journal of Computer and System Sciences',
115             'scp', 'Science of Computer Programming',
116             'sicomp', 'SIAM Journal on Computing',
117             'tocs', 'ACM Transactions on Computer Systems',
118             'tods', 'ACM Transactions on Database Systems',
119             'tog', 'ACM Transactions on Graphics',
120             'toms', 'ACM Transactions on Mathematical Software',
121             'toois', 'ACM Transactions on Office Information Systems',
122             'toplas', 'ACM Transactions on Programming Languages and Systems',
123             'tcs', 'Theoretical Computer Science',
124             );
125             $glb_replace = '';
126              
127             $glb_noreadahead = 0;
128             @glb_readahead = ();
129             %glb_crossref_entries = ();
130             %glb_crossref_needed = ();
131              
132             $ent = '';
133              
134              
135             $protectB = "${bib'cs_meta}3100";
136             $protectE = "${bib'cs_meta}3110";
137             ######
138              
139             # XXXXX todo:
140             #
141             # don't just throw away preamble statements
142             # mismatched braces in an entry will make us read the whole file
143             # looking for the ending brace!!
144             #
145             # It ssems like reading BibTeX records are perfect for a program like
146             # yacc/lex or a similar state machine. The format is nice and regular,
147             # but it contains too many oddities for a nice perl implmentation. The
148             # best way I can describe the problem is that it is character based, rather
149             # than line based, like refer. We can even have the end of one record on
150             # the same line as the beginning of the next.
151             #
152             # The opt_complex variable can be set to 0 to remove most of the time-
153             # consuming regex stuff. This will only work if the file has been run
154             # through bibclean, or is otherwise "regular" (the @ of a start record
155             # must be flush left, only { and not scribe's ( are allowed to surround
156             # a record, and records end with a single } flush left by itself).
157             #
158             # This would be a perfect use for perl5's interface to a C program. A
159             # fairly simple lex/yacc parser could be written and be _much_ faster
160             # as well as less error prone.
161             #
162             # XXXXX Break this out into subroutines, esp for string parsing.
163             #
164             sub read {
165 134     134   252 local($file) = @_;
166 134         148 local($_);
167 134         152 local($type);
168              
169             # XXXXX A single readahead for all files. Needs to be split somehow.
170 134 50 33     332 if (@glb_readahead && (!$glb_noreadahead)) {
171 0         0 return shift @glb_readahead;
172             }
173              
174             BREAD: {
175 134 50       131 if ($opt_complex == 0) {
  134 50       340  
176 0         0 while (<$bib'glb_current_fh>) {
177 0 0       0 last if /^\@/;
178             }
179 0 0       0 return undef if eof;
180 0         0 $ent = $_;
181              
182 0         0 $bib'glb_vloc = sprintf("line %5d", $.);
183              
184 0         0 ($type) = /^\@(\w+){/;
185 0 0       0 return &bib'goterror("Unable to parse field $ent") unless defined $type;
186              
187             # skip comments
188 0 0       0 redo BREAD if $type =~ /^comment/i;
189             # we really ought to do something with this instead of tossing it
190 0 0       0 redo BREAD if $type =~ /^preamble/i;
191              
192 0 0       0 if ($type =~ /^string/i) {
193             # XXXXX should handle multi-line string statements.
194 0         0 local($name, $value);
195 0 0       0 if ( ($name,$value) = /^\@string{(\S+)\s*=\s*"([^"]*)"}$/i) {
196 0         0 $name =~ s/(\W)/\\$1/g; # quote special chars
197 0         0 $name =~ s/\\ / /g;
198 0         0 $value =~ s/(\W)/\\$1/g;
199 0         0 $value =~ s/\\ / /g;
200 0         0 $glb_replace .= "s/\\b$name\\b/$value/g;\n";
201 0         0 $glb_eval_repl = 1;
202 0         0 redo BREAD;
203             } else {
204 0         0 &bib'gotwarn("Could not parse field $ent");
205             }
206             }
207              
208 0         0 while (<$bib'glb_current_fh>) {
209 0         0 $ent .= $_;
210 0 0       0 last if /^\}\s*$/;
211             }
212             } elsif ($opt_complex == 1) {
213              
214             # Assumptions made about format:
215             #
216             # An entry must start on a line of it's own, so this is ok:
217             # @ string { jgg1 = "journal of gnats" }
218             # But this is not:
219             # @string{j1 = "journal1"} @proceedings{foo, author="joe"}
220             #
221             # There are no string expansions inside string definitions. OK:
222             # @string(jgg2 = "journal" # " of " # "gnats" }
223             # But this is not:
224             # @string(j2 = j1 # " of Imaging")
225              
226 134         215 local($braces) = 1;
227 134         133 local($delim);
228              
229 134         1324 while (<$bib'glb_current_fh>) {
230 264 100       1090 if (/^\s*\@/) {
231 132         177 $ent = $_;
232 132         197 last;
233             }
234             }
235 134 100       287 return undef if eof;
236              
237 132         602 $bib'glb_vloc = sprintf("line %5d", $.);
238              
239 132         623 ($type, $delim) = $ent =~ /^\s*\@\s*(\w+)\s*([{(])/;
240 132 50       307 return &bib'goterror("Unable to parse field $ent") unless defined $type;
241 132         192 $type =~ tr/A-Z/a-z/;
242              
243             # XXXXX We should do something with comment and preamble values
244              
245 132 50       269 if ($type eq 'comment') {
246 0         0 $ent = &read_until_match($ent, $delim, 0);
247 0         0 redo BREAD;
248             }
249              
250 132 50       257 if ($type eq 'preamble') {
251 0         0 $ent = &read_until_match($ent, $delim, 0);
252 0         0 redo BREAD;
253             }
254              
255 132 50       227 if ($type eq 'string') {
256 0         0 local($name, $value);
257 0         0 local($rdelim) = '}';
258 0 0       0 $rdelim = ')' if $delim eq '(';
259              
260 0         0 $ent = &read_until_match($ent, $delim, 0);
261              
262 0         0 $delim =~ s/(\W)/\\$1/g;
263 0         0 $rdelim =~ s/(\W)/\\$1/g;
264              
265 0         0 $ent =~ s/^\s*\@\s*string\s*$delim\s*//i;
266 0         0 eval "\$ent =~ s/\\s*$rdelim\[^$rdelim\]*\$//;";
267              
268 0 0       0 $ent = &do_concat($ent) if ($ent =~ /#/);
269              
270 0 0       0 if ( ($name,$value) = $ent =~ /^(\S+)\s*=\s*[{"(]((.|\n)*)[}")]$/ ) {
271 0 0       0 if ($name =~ /["#\%'(),={}]/) {
272 0         0 &bib'gotwarn("Illegal string name: $name");
273             } else {
274 0         0 $name =~ tr/A-Z/a-z/;
275 0 0       0 &bib'gotwarn("Redefinition of string: $name") if defined $glb_replace{$name};
276             #$value =~ s/(\W)/\\$1/g;
277 0         0 $glb_replace{$name} = $value;
278 0         0 $glb_eval_repl = 1;
279 0         0 &bib'debugs("new string $name = '$value'", 32);
280             }
281             } else {
282 0         0 &bib'gotwarn("Couldn't parse string entry");
283             }
284 0         0 redo BREAD;
285             }
286              
287             # All other types
288 132         283 $ent = &read_until_match($ent, $delim, 1);
289              
290 132 100       451 if ($ent =~ /#/) {
291 32         94 local($delim_pos) = index($ent, $delim);
292 32         40 $delim_pos++;
293 32         130 substr($ent, $delim_pos) = &do_concat( substr($ent, $delim_pos) );
294             }
295 132         749 return $ent;
296              
297             } else {
298 0         0 &bib'goterror("Unknown complexity level asked for");
299             }
300             } # end of BREAD
301              
302 0         0 $_ = $ent;
303 0 0 0     0 if ( ($opt_complex == 0) && $glb_eval_repl ) {
304 0         0 study;
305 0         0 eval $glb_replace;
306 0 0       0 $@ && return &bib'goterror("Error in string eval, $@");
307             }
308 0         0 $_;
309             }
310              
311              
312             sub read_until_match {
313 132     132   341 local($line, $lmatch, $do_string_matching) = @_;
314 132         207 local($braces) = 0;
315 132         167 local($macro, $macro_lower, $mfield);
316 132         165 local($_);
317              
318 132 50       250 if ($lmatch eq '{') {
    0          
    0          
319 132         225 $rmatch = '}';
320             } elsif ($lmatch eq '(') {
321 0         0 $rmatch = ')';
322             } elsif ($lmatch eq '"') {
323 0         0 $rmatch = '"';
324             } else {
325 0         0 &bib'gotwarn("Unknown left match character: $lmatch");
326 0         0 $rmatch = $lmatch;
327             }
328              
329 132         759 $lmatch =~ s/(\W)/\\$1/g;
330 132         478 $rmatch =~ s/(\W)/\\$1/g;
331              
332 132         559 while ($line =~ /$lmatch/g) { $braces++; }
  132         370  
333 132         351 while ($line =~ /$rmatch/g) { $braces--; }
  0         0  
334 132 50       247 if ($braces < 0) {
335 0         0 &bib'goterror("negative match level looking for $lmatch$rmatch");
336             }
337 132 50       236 return $line if ($braces <= 0);
338              
339 132         739 while (<$bib'glb_current_fh>) {
340 1878 50       3285 if ($do_string_matching) {
341             # XXXXX Check that this is right.
342             # This will match a left string. Concatenation will then only
343             # have to worry about right strings.
344 1878 100       7835 if (/^(\s*(\S+)\s*=\s*)([^"#%'(),={}\s]+)/) {
345 10         31 $mfield = $2;
346 10         27 $macro = $3;
347 10         19 $macro_lower = $macro;
348 10         20 $macro_lower =~ tr/A-Z/a-z/;
349 10         15 $mfield =~ tr/A-Z/a-z/;
350             #print STDERR "found string: $macro\n";
351 10 50       41 if (defined $glb_replace{$macro_lower}) {
352 10         361 s/^(\s*\S+\s*=\s*)$macro/$1"$glb_replace{$macro_lower}"/;
353             } else {
354 0 0 0     0 if ( ($macro !~ /^\d+$/) && (defined $i_order{$mfield}) ) {
355 0         0 &bib'gotwarn("Unknown string: $macro in $mfield field");
356             }
357             }
358             }
359             }
360 1878         2955 $line .= $_;
361 1878         4658 while (/$lmatch/g) { $braces++; }
  1784         3881  
362 1878         4652 while (/$rmatch/g) { $braces--; }
  1916         4015  
363 1878 100       8706 last if ($braces <= 0);
364             # XXXXX We should try to detect an overflow -- after reading too many lines
365             }
366 132 50 33     326 if (eof && ($braces > 0)) {
367 0         0 &bib'gotwarn("File ended while still reading record");
368             }
369 132 50       258 if ($braces < 0) {
370 0         0 &bib'goterror("negative match level looking for $lmatch$rmatch");
371             }
372 132         476 $line;
373             }
374              
375              
376             #
377             # This subroutine handles concatenating strings that are seperated with
378             # a pound sign ('#'). It also will do string substitution for defined
379             # string to the right of the concatenation symbol.
380             #
381             sub do_concat {
382 32     32   125 local($rest) = @_;
383              
384 32 50       172 return $rest unless $rest =~ /#/;
385              
386             # This is _very_ ugly.
387             # Regular expressions just aren't powerful enough to do this.
388              
389 32         58 local($left, $right);
390 32         59 local($bracelev) = 0;
391 32         42 local($quotes) = 0;
392 32         60 local($finished_string) = "";
393 32         45 local($macro, $macro_lower);
394 32         40 local($string_term);
395              
396 32         115 $rest =~ s/$bib'cs_escape/$bib'cs_char_escape/go;
397              
398 32         126 while ($rest =~ /#/) {
399 68         233 ($left, $right) = split(/#/, $rest, 2);
400 68         210 while ($left =~ /\{/g) { $bracelev++; }
  346         621  
401 68         183 while ($left =~ /\}/g) { $bracelev--; }
  314         560  
402 68         178 while ($left =~ /"/g) { $quotes++; }
  10         30  
403             #print STDERR ">>\n$left\n==== $bracelev/$quotes ====\n$right<<\n\n";
404 68 50 33     180 if ( ($bracelev <= 0) && ($quotes % 2 == 0) ) {
405             # The # occured outside of their text, so we concatenate
406              
407             # Remember: if $left is changed, $quotes and $bracelev must be updated
408             ## left side, checking for macro
409 0 0       0 if ($left =~ s/\}\s*$//) {
    0          
410             # case: {foo} # ...
411 0         0 $bracelev++;
412 0         0 $string_term = '}';
413             } elsif ($left =~ s/"\s*$//) {
414             # case: "foo" # ...
415 0         0 $quotes--;
416 0         0 $string_term = '"';
417             } else {
418             # case: macro # ...
419 0         0 $left =~ s/(\S+)\s*$/"$1/;
420 0         0 $quotes++;
421 0         0 $string_term = '"';
422 0 0       0 if ($left !~ /"\d+$/) {
423 0         0 &bib'gotwarn("left string encountered during concatenation");
424             }
425             }
426              
427             ## right side, checking for macro
428 0 0       0 if ($right =~ s/^\s*([{"])//) {
429             # case: ... # "foo"
430             # We need to check for the case of {foo} # "bar", and "foo" # {bar}
431 0 0 0     0 if ( ($string_term eq '}') && ($1 eq '"') ) {
432 0         0 $right =~ s/^([^"]*)"/$1\}/;
433             }
434 0 0 0     0 if ( ($string_term eq '"') && ($1 eq '{') ) {
435 0         0 $left =~ s/"([^"]*)$/\{$1/;
436 0         0 $quotes--; $bracelev++;
  0         0  
437             }
438             } else {
439             # case: ... # macro
440 0 0       0 if ($right =~ /^\s*([^"#%'(),={}\s]+)/) {
441 0         0 $macro = $1;
442 0         0 $macro_lower = $macro;
443 0         0 $macro_lower =~ tr/A-Z/a-z/;
444 0 0       0 if (defined $glb_replace{$macro_lower}) {
445 0         0 $right =~ s/^\s*$macro/$glb_replace{$macro_lower}$string_term/;
446             } else {
447 0 0       0 if ($macro !~ /^\d+$/) {
448 0         0 &bib'gotwarn("Unknown right string: $macro");
449             }
450 0         0 $right =~ s/^\s*$macro/$macro$string_term/;
451             }
452             } else {
453 0         0 &bib'gotwarn("Unknown text '$1' found.");
454             }
455             }
456             } else {
457             # It's inside their text, so leave it
458 68         142 $left .= "$bib'cs_temp";
459             }
460 68         109 $finished_string .= $left;
461 68         182 $rest = $right;
462             }
463 32         60 $finished_string .= $rest;
464              
465 32         197 $finished_string =~ s/$bib'cs_temp/#/go;
466 32         107 $finished_string =~ s/$bib'cs_char_escape/$bib'cs_escape/go;
467 32         155 $finished_string;
468             }
469              
470             ######
471              
472             sub explode {
473 132     132   253 local($rec) = @_;
474 132         263 local(%be_entry);
475 132         185 local(@e_values);
476 132         254 local($fld, $val);
477              
478 132         3478 @e_values = split(/,\s*(\w+)\s*=\s*/, $rec);
479 132         873 ($be_entry{'TYPE'}, $be_entry{'CITEKEY'}) =
480             ( shift(@e_values) =~ /^\s*\@\s*(\w+)\s*[{(]\s*(\S+)/ );
481 132 50       364 &bib'goterror("error exploding bibtex record") unless scalar(@e_values) > 1;
482             # XXXXX 17 Dec 96, Changed ,?\s+[ to ,?\s*[
483 132         1001 $e_values[$#e_values] =~ s/\s*,?\s*[})]\s*$//; # zap the final delimiter
484 132         302 while (@e_values) {
485 1732         2727 ($fld, $val) = splice(@e_values, 0, 2);
486 1732         2082 $fld =~ tr/A-Z/a-z/;
487 1732 100       11035 $val =~ s/^\s*\{((.|\n)*)\}\s*$/$1/
488             || $val =~ s/^\s*\"((.|\n)*)\"\s*$/$1/;
489             # XXXXX Check to see if squeezing spaces here is ok.
490 1732         6087 $val =~ s/\s+/ /g;
491             # XXXXX If there are multiple fields of the same kind we will end
492             # up throwing away all but the last value!
493 1732         5023 $be_entry{$fld} = $val;
494             }
495 132 50 33     689 if ($opt_crossref && defined $be_entry{'crossref'}) {
496 0         0 %be_entry = &crossref_fill(%be_entry);
497             }
498 132         2181 %be_entry;
499             }
500              
501             ######
502              
503             # This is the ordering r2b uses.
504             %i_order = (
505             'key', 10,
506             'author', 20,
507             'affiliation', 25,
508             'editor', 30,
509             'title', 40,
510             'booktitle', 50,
511             'location', 55,
512             'institution', 60,
513             'school', 70,
514             'journal', 80,
515             'type', 90,
516             'series', 100,
517             'volume', 110,
518             'number', 120,
519             'edition', 130,
520             'chapter', 140,
521             'pages', 150,
522             'publisher', 160,
523             'address', 170,
524             'month', 180,
525             'year', 190,
526             'price', 200,
527             'copyright', 210,
528             'keywords', 220,
529             'mrnumber', 230,
530             'language', 240,
531             'annote', 250,
532             'isbn', 260,
533             'ISBN', 261,
534             'issn', 270,
535             'ISSN', 271,
536             'subject', 275,
537             'abstract', 280,
538             'note', 290,
539             'contents', 300,
540             'url', 310,
541             );
542             sub bykey {
543             # undefined fields always go last
544 0 0   0   0 return 1 unless defined $i_order{$a};
545 0 0       0 return -1 unless defined $i_order{$b};
546 0         0 $i_order{$a} <=> $i_order{$b};
547             }
548              
549             sub implode {
550 0     0   0 local(%entry) = @_;
551 0         0 local($ent);
552              
553 0 0       0 return &bib'goterror("BibTeX: no TYPE field") unless defined $entry{'TYPE'};
554 0 0       0 return &bib'goterror("BibTeX: no CITEKEY field") unless defined $entry{'CITEKEY'};
555              
556 0         0 $ent = join("", '@', $entry{'TYPE'}, '{', $entry{'CITEKEY'}, ",\n");
557 0         0 delete $entry{'TYPE'};
558 0         0 delete $entry{'CITEKEY'};
559              
560             # I hope we're using the TeX character set, because if $entry{$field}
561             # contains a { without matching }'s, we're going to have hell to pay
562             # when we try to read it. We could check for it here, but what would
563             # we replace it with? $\lbrace$ is TeX-specific. It's also very slow.
564 0         0 foreach $field (sort bykey keys %entry) {
565 0         0 $ent .= " $field = \{$entry{$field}\},\n";
566             }
567              
568             # XXXXX This should be smarter
569 0         0 $ent =~ s/ month = \{(...)\},/ month = \L$1,/;
570              
571 0         0 substr($ent, -2, 1) = '';
572 0         0 $ent .= "\}\n";
573              
574             # We now might have some fields that still have separators left in them,
575             # notably the keywords field. Right now we change them to space.
576             # XXXXX Should this be a newline, ';', '/', ',', or space?
577 0         0 $ent =~ s/$bib'cs_sep/ /go;
578              
579 0         0 $ent;
580             }
581              
582             ######
583              
584             # XXXXX A type field in an inbook citation does not mean ReportType, but
585             # the type of section.
586              
587             %btx_to_can_fields =
588             ('CITEKEY', 'CiteKey',
589             'title', 'Title',
590             'booktitle', 'SuperTitle',
591             'affiliation', 'AuthorAddress',
592             'school', 'School',
593             'organization', 'Organization',
594             'journal', 'Journal',
595             'type', 'ReportType',
596             'series', 'Series',
597             'volume', 'Volume',
598             'edition', 'Edition',
599             'chapter', 'Chapter',
600             'pages', 'Pages',
601             'howpublished', 'HowPublished',
602             'institution', 'Organization',
603             'publisher', 'Publisher',
604             'address', 'PubAddress',
605             'month', 'Month',
606             'year', 'Year',
607             'price', 'Price',
608             'copyright', 'Copyright',
609             'keywords', 'Keywords',
610             'mrnumber', 'MRNumber',
611             'language', 'Language',
612             'annote', 'Annotation',
613             'isbn', 'ISBN',
614             'issn', 'ISSN',
615             'subject', 'Field',
616             'abstract', 'Abstract',
617             'note', 'Note',
618             'contents', 'Contents',
619             'key', 'Key',
620             'url', 'Source',
621             'location', 'Location',
622             );
623              
624             sub tocanon {
625 132     132   1008 local(%rec) = @_;
626 132         215 local(%can);
627 132         193 local($name, $btxf, $canf, $btxv);
628 132         234 local($_) = $rec{'TYPE'};
629 132         249 tr/A-Z/a-z/;
630             # NEW CANON TYPE <-- ORIGINAL BIBTEX
631 132 100       375 $can{'CiteType'} = 'article' if /^article/;
632 132 100       284 $can{'CiteType'} = 'book' if /^book/;
633 132 50       255 $can{'CiteType'} = 'book' if /^booklet/;
634 132 50       217 $can{'CiteType'} = 'book' if /^collection/;
635 132 50       227 $can{'CiteType'} = 'inproceedings' if /^conference/;
636 132 100       268 $can{'CiteType'} = 'inbook' if /^inbook/;
637 132 100       295 $can{'CiteType'} = 'inbook' if /^incollection/;
638 132 100       395 $can{'CiteType'} = 'inproceedings' if /^inproceedings/;
639 132 100       293 $can{'CiteType'} = 'manual' if /^manual/;
640 132 50       240 $can{'CiteType'} = 'thesis' if /^mastersthesis/;
641 132 50       229 $can{'CiteType'} = 'misc' if /^misc/;
642 132 100       263 $can{'CiteType'} = 'thesis' if /^phdthesis/;
643 132 100       261 $can{'CiteType'} = 'proceedings' if /^proceedings/;
644 132 100       251 $can{'CiteType'} = 'report' if /^techreport/;
645 132 50       240 $can{'CiteType'} = 'unpublished' if /^unpublished/;
646              
647 132 50       340 if (!defined $can{'CiteType'}) {
648 0         0 &bib'gotwarn("Improper entry type: $rec{'TYPE'}");
649 0         0 $can{'CiteType'} = 'misc';
650             }
651              
652 132 100       376 if (!defined $rec{'type'}) {
653 126 50       561 if ( $rec{'TYPE'} =~ /^phdthesis/i ) {
    50          
654 0         0 $rec{'type'} = 'Ph.D.';
655             } elsif ( $rec{'TYPE'} =~ /^mastersthesis/i ) {
656 0         0 $rec{'type'} = 'Masters';
657             }
658             }
659              
660              
661 132 100       298 if (defined $rec{'author'} ) {
662             # check for braces around the whole name, in which case we will
663             # assume it is a corporate author.
664 110 50 33     291 if ( ($rec{'author'} =~ /^\{/) && ($rec{'author'} =~ /\}$/) ) {
665 0         0 $can{'CorpAuthor'} = substr($rec{'author'}, $[+1, length($rec{'author'})-2);
666             } else {
667 110         227 $can{'Authors'} = &bibtex_name_to_canon( $rec{'author'} );
668             }
669 110         230 delete $rec{'author'};
670             }
671              
672 132 100       291 if (defined $rec{'editor'}) {
673 10         30 $can{'Editors'} = &bibtex_name_to_canon( $rec{'editor'} );
674             # XXXXX either we don't need this, or we need it for authors also.
675 10 50       56 delete $can{'Editors'} unless $can{'Editors'} =~ /\S/;
676 10         22 delete $rec{'editor'};
677             }
678              
679 132 50 66     358 if ( defined $rec{'organization'} && defined $rec{'school'} ) {
680 0         0 &bib'gotwarn("Both school and organization defined.");
681 0         0 delete $rec{'school'};
682             }
683              
684 132 50 66     523 if ( defined $rec{'publisher'} && defined $rec{'institution'} ) {
685 0         0 &bib'gotwarn("Both publisher and institution defined.");
686 0         0 delete $rec{'institution'};
687             }
688              
689 132 100       274 if (defined $rec{'number'}) {
690 34 100       173 if ($can{'CiteType'} =~ /report|thesis/) {
691 6         17 $can{'ReportNumber'} = $rec{'number'};
692             } else {
693 28         79 $can{'Number'} = $rec{'number'};
694             }
695 34         58 delete $rec{'number'};
696             }
697              
698 132 100       275 if (defined $rec{'month'}) {
699 52         217 $can{'Month'} = &bp_util'canon_month($rec{'month'});
700 52 50       168 delete $rec{'month'} if defined $can{'Month'};
701             }
702              
703             # done with massaging the fields
704 132         205 delete $rec{'TYPE'};
705              
706 132         521 while ( ($btxf, $btxv) = each %rec) {
707 1658 50       4693 next unless $btxv =~ /\S/;
708 1658 100       2871 if (defined $btx_to_can_fields{$btxf}) {
709 934         3136 $can{$btx_to_can_fields{$btxf}} = $btxv;
710             } else {
711             # Unknown, so enter literal. Perhaps a warning?
712 724         2164 $can{$btxf} = $btxv;
713             }
714             }
715              
716             # Handle title-like fields.
717 132         233 foreach $canf ('Title', 'SuperTitle', 'ReportType') {
718 396 100       907 next unless defined $can{$canf};
719 200         359 $can{$canf} =~ s/\{([^\s}]+)\}/${bib'cs_meta}3100$1${bib'cs_meta}3110/g;
720 200         680 $can{$canf} =~ s/\s\s+/ /g;
721             }
722              
723             # tell them who we are
724 132         273 $can{'OrigFormat'} = $version;
725              
726 132         3062 %can;
727             }
728              
729             ######
730             #
731             # This routine will convert a BibTeX name into it's canon form.
732             # It protects items in braces, such as {O'Rielly and Associates}, so that
733             # they are dealt with as one unit.
734             #
735              
736             sub bibtex_name_to_canon {
737 120     120   204 local($name) = @_;
738 120         150 local($n);
739 120         223 local($vonlast, $von, $last, $jr, $first, $part);
740 120         157 local(@savechars);
741 120         182 local($saveptr) = '00';
742 120         146 local($canon_name) = '';
743              
744 120         859 $name =~ s/\s+/ /g;
745              
746             # Move each item enclosed in braces to an atomic character.
747 120         367 while ($name =~ s/(\{[^\}]*\})/$bib'cs_temp$saveptr/) {
748 0         0 push(@savechars, $1);
749 0         0 $saveptr++;
750             }
751              
752 120         424 foreach $n ( split(/ and /, $name) ) {
753              
754 304 50       1062 if ( ($vonlast, $jr, $first) = $n =~ /^([^,]*),\s*([^,]*),\s*([^,]*)$/ ) {
    50          
755             # sep vonlast
756             } elsif ( ($vonlast, $first) = $n =~ /([^,]*),\s*([^,]*)/ ) {
757 0         0 $jr = '';
758             # sep vonlast
759             } else {
760 304         324 $first = '';
761 304         296 $jr = '';
762 304         353 $vonlast = '';
763 304         758 foreach $part (split(/ /, $n)) {
764 706 100 66     3026 if ($part =~ /^[^a-z]/ && ($vonlast eq '')) {
765 698         1387 $first .= " $part";
766             } else {
767 8         20 $vonlast .= " $part";
768             }
769             }
770             }
771 304         486 $vonlast =~ s/^\s+//;
772 304         321 $von = '';
773 304 100       498 if ($vonlast ne '') {
774 4 50       14 if ( $vonlast =~ /^[a-z]/ ) {
775 4         8 $last = '';
776 4         12 foreach $part (split(/ /, $vonlast)) {
777 8 50 33     48 if ($part =~ /^[a-z]/ && ($last eq '')) {
778 8         20 $von .= " $part";
779             } else {
780 0         0 $last .= " $part";
781             }
782             }
783 4         17 $von =~ s/^\s+//;
784 4         8 $last =~ s/^\s+//;
785             } else {
786 0         0 $last = $vonlast;
787             }
788             } else {
789 300         1544 ($first, $last) = ($first =~ /^(.*)\s+(\S+)$/);
790             }
791 304         828 $first =~ s/^\s+//;
792              
793 304         980 $canon_name .= $bib'cs_sep . join($bib'cs_sep2, $last, $von, $first, $jr);
794             }
795 120         425 $canon_name =~ s/^$bib'cs_sep//o;
796              
797 120 50       369 if (@savechars) {
798 0         0 local($oldchar, $oldcharmb);
799 0         0 $saveptr = '00';
800 0         0 while (@savechars) {
801 0         0 $oldchar = shift @savechars;
802 0         0 $oldcharmb = $oldchar;
803 0         0 $oldcharmb =~ s/^{(.*)}$/$1/;
804 0 0       0 $canon_name =~ s/(^|$bib'cs_sep|$bib'cs_sep2)$bib'cs_temp$saveptr($|$bib'cs_sep|$bib'cs_sep2)/$1$oldcharmb$2/ || $canon_name =~ s/$bib'cs_temp$saveptr/$oldchar/;
805 0         0 $saveptr++;
806             }
807             }
808              
809 120         438 $canon_name;
810             }
811              
812             ######
813              
814             # XXXXX We really ought to generate these at load time from the other list.
815             # XXXXX Format?
816              
817             %can_to_btx_fields =
818             ('CiteKey', 'CITEKEY',
819             'Title', 'title',
820             'SuperTitle', 'booktitle',
821             'AuthorAddress','affiliation',
822             'School', 'school',
823             'Organization', 'organization',
824             'Journal', 'journal',
825             'ReportType', 'type',
826             'Series', 'series',
827             'Volume', 'volume',
828             'Edition', 'edition',
829             'Chapter', 'chapter',
830             'Pages', 'pages',
831             'PagesWhole', 'pages',
832             'HowPublished', 'howpublished',
833             'Publisher', 'publisher',
834             'PubAddress', 'address',
835             'Month', 'month',
836             'Year', 'year',
837             'Price', 'price',
838             'Copyright', 'copyright',
839             'Keywords', 'keywords',
840             'MRNumber', 'mrnumber',
841             'Language', 'language',
842             'Annotation', 'annote',
843             'ISBN', 'isbn',
844             'ISSN', 'issn',
845             'Field', 'subject',
846             'Abstract', 'abstract',
847             'Note', 'note',
848             'Contents', 'contents',
849             'Key', 'key',
850             'Source', 'url',
851             'Location', 'location',
852             );
853              
854             sub fromcanon {
855 0     0   0 local(%reccan) = @_;
856 0         0 local(%record);
857 0         0 local($name, $btxf, $canf, $canv);
858              
859 0 0       0 if (!defined $reccan{'CiteType'}) {
860 0         0 &bib'gotwarn("BibTeX didn't find a CiteType field!");
861 0         0 $reccan{'CiteType'} = 'book';
862             }
863              
864             # XXXXX 22Mar96: I think we had some mixup with incollection vs. inbook.
865              
866 0         0 local($_) = $reccan{'CiteType'};
867 0 0       0 if (/^article/ ) { $record{'TYPE'} = 'article'; }
  0 0       0  
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
    0          
868 0         0 elsif (/^avmaterial/ ) { $record{'TYPE'} = 'misc'; }
869             elsif (/^book/ ) {
870 0 0       0 if (defined $reccan{'Publisher'}) { $record{'TYPE'} = 'book'; }
  0         0  
871 0         0 else { $record{'TYPE'} = 'booklet'; } }
872             elsif (/^inbook/ ) {
873 0 0       0 if (defined $reccan{'SuperTitle'}) { $record{'TYPE'} = 'incollection' }
  0         0  
874 0         0 else { $record{'TYPE'} = 'inbook'; } }
875 0         0 elsif (/^inproceedings/ ) { $record{'TYPE'} = 'inproceedings'; }
876 0         0 elsif (/^manual/ ) { $record{'TYPE'} = 'manual'; }
877 0         0 elsif (/^misc/ ) { $record{'TYPE'} = 'misc'; }
878             elsif (/^thesis/ ) {
879 0 0 0     0 if ( (defined $reccan{'ReportType'}) && ($reccan{'ReportType'} =~ /master/i)
880 0         0 ) { $record{'TYPE'} = 'mastersthesis' }
881 0         0 else { $record{'TYPE'} = 'phdthesis'; } }
882 0         0 elsif (/^proceedings/ ) { $record{'TYPE'} = 'proceedings'; }
883 0         0 elsif (/^report/ ) { $record{'TYPE'} = 'techreport'; }
884 0         0 elsif (/^unpublished/ ) { $record{'TYPE'} = 'unpublished'; }
885             else {
886 0         0 &bib'gotwarn("Improper entry type: $reccan{'CiteType'}");
887 0         0 $record{'TYPE'} = 'misc';
888             }
889              
890             # generate key if necessary, using the default method.
891 0 0       0 $reccan{'CiteKey'} = &bp_util'genkey(%reccan) unless defined $reccan{'CiteKey'};
892              
893             # register our citekey
894 0         0 $reccan{'CiteKey'} = &bp_util'regkey($reccan{'CiteKey'});
895              
896 0 0       0 if ( defined $reccan{'Authors'} ) {
897 0         0 $record{'author'} = &bp_util'canon_to_name($reccan{'Authors'}, 'bibtex');
898 0         0 delete $reccan{'Authors'};
899 0 0       0 if ($record{'author'} !~ / /) {
900 0 0       0 if ($record{'author'} =~ s/\240/ /g) {
901 0         0 $record{'author'} = $protectB . $record{'author'} . $protectE;
902             }
903             }
904             }
905 0 0       0 if ( defined $reccan{'CorpAuthor'} ) {
906             # no need for no-break spaces, as we're putting braces around it.
907 0         0 $reccan{'CorpAuthor'} =~ s/\240/ /g;
908 0 0       0 if (defined $record{'author'}) {
909 0 0       0 if (defined $reccan{'Organization'}) {
910 0         0 $record{'author'} .= ' and ' . $protectB . $reccan{'CorpAuthor'} . $protectE;
911             } else {
912 0         0 $record{'organization'} = $reccan{'CorpAuthor'};
913             }
914             } else {
915 0         0 $record{'author'} = $protectB . $reccan{'CorpAuthor'} . $protectE;
916             }
917 0         0 delete $reccan{'CorpAuthor'};
918             }
919              
920 0 0       0 if ( defined $reccan{'Editors'} ) {
921 0         0 $record{'editor'} = &bp_util'canon_to_name($reccan{'Editors'}, 'bibtex');
922 0         0 delete $reccan{'Editors'};
923             }
924              
925 0 0       0 if ( $reccan{'CiteType'} =~ /^(report|unpublished)/ ) {
926 0 0       0 if ( defined $reccan{'Publisher'} ) {
    0          
927 0         0 $record{'institution'} = $reccan{'Publisher'};
928 0         0 delete $reccan{'Publisher'};
929             } elsif ( defined $reccan{'Organization'} ) {
930 0         0 $record{'institution'} = $reccan{'Organization'};
931 0         0 delete $reccan{'Organization'};
932             }
933             }
934              
935             # if ( $reccan{'CiteType'} =~ /^thesis/ ) {
936             # if ( defined $reccan{'Organization'} ) {
937             # $record{'school'} = $reccan{'Organization'};
938             # delete $reccan{'Organization'};
939             # }
940             # }
941              
942 0 0       0 if (defined $reccan{'ReportNumber'}) {
    0          
943 0 0       0 if (defined $reccan{'Number'}) {
944 0         0 &bib'gotwarn("Both Number and ReportNumber.");
945 0         0 delete $reccan{'Number'};
946             }
947 0 0       0 if ($reccan{'CiteType'} !~ /report|thesis/) {
948 0         0 &bib'gotwarn("ReportNumber defined, but not in a report.");
949             }
950 0         0 $record{'number'} = $reccan{'ReportNumber'};
951 0         0 delete $reccan{'ReportNumber'};
952             } elsif (defined $reccan{'Number'}) {
953 0 0       0 if ($reccan{'CiteType'} =~ /report|thesis/) {
954 0         0 &bib'gotwarn("Number defined inside a report.");
955             }
956 0         0 $record{'number'} = $reccan{'Number'};
957 0         0 delete $reccan{'Number'};
958             }
959              
960 0 0       0 if (defined $reccan{'ReportType'}) {
961 0 0       0 if ($reccan{'ReportType'} !~ /($protectB|$protectE)/o) {
962 0         0 $reccan{'ReportType'} =~ s/Ph\.\s*D\./${protectB}Ph.D.${protectE}/o;
963             }
964             }
965              
966             # done with massaging the fields
967 0         0 delete $reccan{'CiteType'};
968             # We don't know any special information about any types
969 0         0 delete $reccan{'OrigFormat'};
970              
971 0         0 while ( ($canf, $canv) = each %reccan) {
972 0 0       0 if (defined $can_to_btx_fields{$canf}) {
973 0         0 $record{$can_to_btx_fields{$canf}} = $canv;
974             } else {
975             ### &bib'gotwarn("Unknown field: $canf"); ### this is really annoying! (pt/02/09/11)
976 0         0 $record{$canf} = $canv;
977             }
978             }
979              
980 0         0 %record;
981             }
982              
983             ######
984              
985             sub clear {
986 2     2   6 local($file) = @_;
987              
988             # XXXXX currently we have just one strings mapping for all files.
989              
990 2         44 %glb_replace = ();
991 2         7 $glb_eval_repl = 0;
992             }
993              
994             ######
995              
996             sub crossref_fill {
997 0     0     local(%bent) = @_;
998 0           local($id) = $bent{'crossref'};
999 0           local(%crossent);
1000 0           local($cfield, $cval);
1001              
1002 0           &bib'debugs("trying to crossref $id in $bent{'CITEKEY'}", 64);
1003 0 0         if (!defined $glb_crossref_entries{$id}) {
1004 0 0         if ( ! &get_record_ahead($id) ) {
1005 0           &bib'gotwarn("Could not find bibtex crossref: $id");
1006 0           return %bent;
1007             }
1008             }
1009             #print STDERR "using crossref $id. readahead has $#glb_readahead entries\n";
1010              
1011 0           %crossent = &explode( $glb_crossref_entries{$id} );
1012              
1013             # Merge the two records
1014             # We do this by simply adding any fields from the crossref entry that
1015             # don't exist in the original record.
1016 0           while ( ($cfield, $cval) = each %crossent) {
1017 0 0         next if defined $bent{$cfield};
1018 0           $bent{$cfield} = $cval;
1019             }
1020             # Now that we've successfully merged the entries, we can remove the
1021             # crossref entry
1022 0           delete $bent{'crossref'};
1023            
1024 0           %bent;
1025             }
1026              
1027             sub get_record_ahead {
1028 0     0     local($needed_id) = @_;
1029 0           local($id) = undef;
1030 0           local($next_record);
1031              
1032             #print STDERR "new crossref: $needed_id\n" unless defined $glb_crossref_needed{$needed_id};
1033 0           $glb_crossref_needed{$needed_id} = 1;
1034              
1035             # We can't have the read routine returning our own results to us!
1036 0           $glb_noreadahead = 1;
1037              
1038 0           while ($next_record = &read) {
1039             # We look in each record to see if there is another crossref field.
1040             # If there seems to be, we make a note of it, so we will store the
1041             # record right away.
1042 0 0         if ($next_record =~ /crossref\s*=\s*[{"]([^}"]+)/i) {
1043 0 0         if (!defined $glb_crossref_entries{$1}) {
1044 0           $glb_crossref_needed{$1} = 1;
1045             }
1046             }
1047              
1048 0           ($id) = ( $next_record =~ /^\s*\@\s*\w+\s*[{(]\s*([^,\s]+)/ );
1049              
1050 0 0         if (defined $glb_crossref_needed{$id}) {
1051 0           $glb_crossref_entries{$id} = $next_record;
1052 0           delete $glb_crossref_needed{$id};
1053             }
1054              
1055 0           push(@glb_readahead, $next_record);
1056 0 0         last if $id eq $needed_id;
1057             }
1058              
1059 0           &bib'debugs("crossref looking for " . join(" ", keys %glb_crossref_needed) . ".", 4);
1060 0           &bib'debugs("crossref has " . join(" ", keys %glb_crossref_entries) . ".", 4);
1061              
1062             # Now let the read routine use the readahead information
1063 0           $glb_noreadahead = 0;
1064              
1065 0           ($id eq $needed_id);
1066             }
1067              
1068              
1069             #######################
1070             # end of package
1071             #######################
1072              
1073             1;