File Coverage

blib/lib/Biblio/bp/lib/bp-bibtex.pl

Criterion	Covered	Total	%
statement	188	440	42.7
branch	88	280	31.4
condition	12	42	28.5
subroutine	7	13	53.8
pod			n/a
total	295	775	38.0

line	stmt	bran	cond	sub	time	code
1						#
2						# bibliography package for Perl
3						#
4						# BibTeX routines
5						#
6						# Dana Jacobsen (dana@acm.org)
7						# 8 July 1995 (last modified 30 November 1995)
8						#
9						# 30 Nov 95: Changed the way string parsing works. It used to do a search
10						# and replace over the entire string, but some bibliographies make
11						# a string named, "el" for instance, which will just trash the
12						# entire file. So we look for _1_ unquoted string on the same
13						# line as the field name. That's a lot safer, but also won't
14						# properly handle multiple strings on a line, or strings not in
15						# the first position. Those are fairly rare though.
16						#
17						# XXXXX Parsing BibTeX is extremely difficult to do correctly in Perl. The
18						# proper thing to do in my opinion is to write a yacc parser and compile
19						# that in. Not only will it be able to handle more bizarre cases
20						# properly, but it will probably be much faster.
21
22						package bp_bibtex;
23
24						$version = "bibtex (dj 18 dec 96)";
25
26						######
27
28						&bib'reg_format(
29						'bibtex', # name
30						'btx', # short name
31						'bp_bibtex', # package name
32						'tex', # default character set
33						'suffix is bib',
34						# our functions
35						'open is standard',
36						'close is standard',
37						'write is standard',
38						'options',
39						'read',
40						'explode',
41						'implode',
42						'tocanon',
43						'fromcanon',
44						'clear',
45						);
46
47						######
48
49						# Set to 0 for handling bibclean (faster, but makes a lot of assumptions)
50						# 1 for arbitrary bibtex
51						#
52						$opt_complex = 1;
53
54						# Set this to 0 if we want to ignore crossref fields. Handling them may use
55						# a very large amount of memory (about the same as the size of the input file).
56						#
57						$opt_crossref = 1;
58
59						# If an entry has a field called crossrefonly with a value of "1", and it
60						# is the target of a crossref field earlier in the file, then erase it from
61						# the output. It will be used for crossref purposes, but the user will never
62						# see it from a &read() call.
63						$opt_crossrefonly = 0;
64
65						# XXXXX Implement crossrefonly
66
67						######
68
69						sub options {
70	0			0	0	local($opt) = @_;
71
72	0	0			0	&bib'panic("bibtex options called with no arguments!") unless defined $opt;
73	0				0	&bib'debugs("parsing bibtex option '$opt'", 64);
74	0	0			0	return undef unless $opt =~ /=/;
75	0				0	local($_, $val) = split(/\s=\s/, $opt, 2);
76	0				0	&bib'debugs("option split: $_ = $val", 8);
77	0	0			0	/^complex$/ && do { $opt_complex = &bib'parse_num_option($val);
	0				0
78	0				0	return 1; };
79	0	0			0	/^crossref$/ && do { $opt_crossref = &bib'parse_num_option($val);
	0				0
80	0				0	return 1; };
81	0				0	undef;
82						}
83
84						######
85
86						# This is only used for complexity 0 now.
87						$glb_eval_repl = 0;
88
89						# Initialize the macro list with the entries from plain.bst.
90						# XXXXX At some point we need to get these from a configuration file
91						%glb_replace = (
92						'jan', 'January',
93						'feb', 'February',
94						'mar', 'March',
95						'apr', 'April',
96						'may', 'May',
97						'jun', 'June',
98						'jul', 'July',
99						'aug', 'August',
100						'sep', 'September',
101						'oct', 'October',
102						'nov', 'November',
103						'dec', 'December',
104						'acmcs', 'ACM Computing Surveys',
105						'acta', 'Acta Informatica',
106						'cacm', 'Communications of the ACM',
107						'ibmjrd', 'IBM Journal of Research and Development',
108						'ibmsj', 'IBM Systems Journal',
109						'ieeese', 'IEEE Transactions on Software Engineering',
110						'ieeetc', 'IEEE Transactions on Computers',
111						'ieeetcad', 'IEEE Transactions on Computer-Aided Design of Integrated Circuits',
112						'ipl', 'Information Processing Letters',
113						'jacm', 'Journal of the ACM',
114						'jcss', 'Journal of Computer and System Sciences',
115						'scp', 'Science of Computer Programming',
116						'sicomp', 'SIAM Journal on Computing',
117						'tocs', 'ACM Transactions on Computer Systems',
118						'tods', 'ACM Transactions on Database Systems',
119						'tog', 'ACM Transactions on Graphics',
120						'toms', 'ACM Transactions on Mathematical Software',
121						'toois', 'ACM Transactions on Office Information Systems',
122						'toplas', 'ACM Transactions on Programming Languages and Systems',
123						'tcs', 'Theoretical Computer Science',
124						);
125						$glb_replace = '';
126
127						$glb_noreadahead = 0;
128						@glb_readahead = ();
129						%glb_crossref_entries = ();
130						%glb_crossref_needed = ();
131
132						$ent = '';
133
134
135						$protectB = "${bib'cs_meta}3100";
136						$protectE = "${bib'cs_meta}3110";
137						######
138
139						# XXXXX todo:
140						#
141						# don't just throw away preamble statements
142						# mismatched braces in an entry will make us read the whole file
143						# looking for the ending brace!!
144						#
145						# It ssems like reading BibTeX records are perfect for a program like
146						# yacc/lex or a similar state machine. The format is nice and regular,
147						# but it contains too many oddities for a nice perl implmentation. The
148						# best way I can describe the problem is that it is character based, rather
149						# than line based, like refer. We can even have the end of one record on
150						# the same line as the beginning of the next.
151						#
152						# The opt_complex variable can be set to 0 to remove most of the time-
153						# consuming regex stuff. This will only work if the file has been run
154						# through bibclean, or is otherwise "regular" (the @ of a start record
155						# must be flush left, only { and not scribe's ( are allowed to surround
156						# a record, and records end with a single } flush left by itself).
157						#
158						# This would be a perfect use for perl5's interface to a C program. A
159						# fairly simple lex/yacc parser could be written and be _much_ faster
160						# as well as less error prone.
161						#
162						# XXXXX Break this out into subroutines, esp for string parsing.
163						#
164						sub read {
165	134			134	252	local($file) = @_;
166	134				148	local($_);
167	134				152	local($type);
168
169						# XXXXX A single readahead for all files. Needs to be split somehow.
170	134	50	33		332	if (@glb_readahead && (!$glb_noreadahead)) {
171	0				0	return shift @glb_readahead;
172						}
173
174						BREAD: {
175	134	50			131	if ($opt_complex == 0) {
	134	50			340
176	0				0	while (<$bib'glb_current_fh>) {
177	0	0			0	last if /^\@/;
178						}
179	0	0			0	return undef if eof;
180	0				0	$ent = $_;
181
182	0				0	$bib'glb_vloc = sprintf("line %5d", $.);
183
184	0				0	($type) = /^\@(\w+){/;
185	0	0			0	return &bib'goterror("Unable to parse field $ent") unless defined $type;
186
187						# skip comments
188	0	0			0	redo BREAD if $type =~ /^comment/i;
189						# we really ought to do something with this instead of tossing it
190	0	0			0	redo BREAD if $type =~ /^preamble/i;
191
192	0	0			0	if ($type =~ /^string/i) {
193						# XXXXX should handle multi-line string statements.
194	0				0	local($name, $value);
195	0	0			0	if ( ($name,$value) = /^\@string{(\S+)\s=\s"([^"]*)"}$/i) {
196	0				0	$name =~ s/(\W)/\\$1/g; # quote special chars
197	0				0	$name =~ s/\\ / /g;
198	0				0	$value =~ s/(\W)/\\$1/g;
199	0				0	$value =~ s/\\ / /g;
200	0				0	$glb_replace .= "s/\\b$name\\b/$value/g;\n";
201	0				0	$glb_eval_repl = 1;
202	0				0	redo BREAD;
203						} else {
204	0				0	&bib'gotwarn("Could not parse field $ent");
205						}
206						}
207
208	0				0	while (<$bib'glb_current_fh>) {
209	0				0	$ent .= $_;
210	0	0			0	last if /^\}\s*$/;
211						}
212						} elsif ($opt_complex == 1) {
213
214						# Assumptions made about format:
215						#
216						# An entry must start on a line of it's own, so this is ok:
217						# @ string { jgg1 = "journal of gnats" }
218						# But this is not:
219						# @string{j1 = "journal1"} @proceedings{foo, author="joe"}
220						#
221						# There are no string expansions inside string definitions. OK:
222						# @string(jgg2 = "journal" # " of " # "gnats" }
223						# But this is not:
224						# @string(j2 = j1 # " of Imaging")
225
226	134				215	local($braces) = 1;
227	134				133	local($delim);
228
229	134				1324	while (<$bib'glb_current_fh>) {
230	264	100			1090	if (/^\s*\@/) {
231	132				177	$ent = $_;
232	132				197	last;
233						}
234						}
235	134	100			287	return undef if eof;
236
237	132				602	$bib'glb_vloc = sprintf("line %5d", $.);
238
239	132				623	($type, $delim) = $ent =~ /^\s\@\s(\w+)\s*([{(])/;
240	132	50			307	return &bib'goterror("Unable to parse field $ent") unless defined $type;
241	132				192	$type =~ tr/A-Z/a-z/;
242
243						# XXXXX We should do something with comment and preamble values
244
245	132	50			269	if ($type eq 'comment') {
246	0				0	$ent = &read_until_match($ent, $delim, 0);
247	0				0	redo BREAD;
248						}
249
250	132	50			257	if ($type eq 'preamble') {
251	0				0	$ent = &read_until_match($ent, $delim, 0);
252	0				0	redo BREAD;
253						}
254
255	132	50			227	if ($type eq 'string') {
256	0				0	local($name, $value);
257	0				0	local($rdelim) = '}';
258	0	0			0	$rdelim = ')' if $delim eq '(';
259
260	0				0	$ent = &read_until_match($ent, $delim, 0);
261
262	0				0	$delim =~ s/(\W)/\\$1/g;
263	0				0	$rdelim =~ s/(\W)/\\$1/g;
264
265	0				0	$ent =~ s/^\s\@\sstring\s$delim\s//i;
266	0				0	eval "\$ent =~ s/\\s$rdelim\[^$rdelim\]\$//;";
267
268	0	0			0	$ent = &do_concat($ent) if ($ent =~ /#/);
269
270	0	0			0	if ( ($name,$value) = $ent =~ /^(\S+)\s=\s[{"(]((.\|\n)*)[}")]$/ ) {
271	0	0			0	if ($name =~ /["#\%'(),={}]/) {
272	0				0	&bib'gotwarn("Illegal string name: $name");
273						} else {
274	0				0	$name =~ tr/A-Z/a-z/;
275	0	0			0	&bib'gotwarn("Redefinition of string: $name") if defined $glb_replace{$name};
276						#$value =~ s/(\W)/\\$1/g;
277	0				0	$glb_replace{$name} = $value;
278	0				0	$glb_eval_repl = 1;
279	0				0	&bib'debugs("new string $name = '$value'", 32);
280						}
281						} else {
282	0				0	&bib'gotwarn("Couldn't parse string entry");
283						}
284	0				0	redo BREAD;
285						}
286
287						# All other types
288	132				283	$ent = &read_until_match($ent, $delim, 1);
289
290	132	100			451	if ($ent =~ /#/) {
291	32				94	local($delim_pos) = index($ent, $delim);
292	32				40	$delim_pos++;
293	32				130	substr($ent, $delim_pos) = &do_concat( substr($ent, $delim_pos) );
294						}
295	132				749	return $ent;
296
297						} else {
298	0				0	&bib'goterror("Unknown complexity level asked for");
299						}
300						} # end of BREAD
301
302	0				0	$_ = $ent;
303	0	0	0		0	if ( ($opt_complex == 0) && $glb_eval_repl ) {
304	0				0	study;
305	0				0	eval $glb_replace;
306	0	0			0	$@ && return &bib'goterror("Error in string eval, $@");
307						}
308	0				0	$_;
309						}
310
311
312						sub read_until_match {
313	132			132	341	local($line, $lmatch, $do_string_matching) = @_;
314	132				207	local($braces) = 0;
315	132				167	local($macro, $macro_lower, $mfield);
316	132				165	local($_);
317
318	132	50			250	if ($lmatch eq '{') {
		0
		0
319	132				225	$rmatch = '}';
320						} elsif ($lmatch eq '(') {
321	0				0	$rmatch = ')';
322						} elsif ($lmatch eq '"') {
323	0				0	$rmatch = '"';
324						} else {
325	0				0	&bib'gotwarn("Unknown left match character: $lmatch");
326	0				0	$rmatch = $lmatch;
327						}
328
329	132				759	$lmatch =~ s/(\W)/\\$1/g;
330	132				478	$rmatch =~ s/(\W)/\\$1/g;
331
332	132				559	while ($line =~ /$lmatch/g) { $braces++; }
	132				370
333	132				351	while ($line =~ /$rmatch/g) { $braces--; }
	0				0
334	132	50			247	if ($braces < 0) {
335	0				0	&bib'goterror("negative match level looking for $lmatch$rmatch");
336						}
337	132	50			236	return $line if ($braces <= 0);
338
339	132				739	while (<$bib'glb_current_fh>) {
340	1878	50			3285	if ($do_string_matching) {
341						# XXXXX Check that this is right.
342						# This will match a left string. Concatenation will then only
343						# have to worry about right strings.
344	1878	100			7835	if (/^(\s(\S+)\s=\s*)([^"#%'(),={}\s]+)/) {
345	10				31	$mfield = $2;
346	10				27	$macro = $3;
347	10				19	$macro_lower = $macro;
348	10				20	$macro_lower =~ tr/A-Z/a-z/;
349	10				15	$mfield =~ tr/A-Z/a-z/;
350						#print STDERR "found string: $macro\n";
351	10	50			41	if (defined $glb_replace{$macro_lower}) {
352	10				361	s/^(\s\S+\s=\s*)$macro/$1"$glb_replace{$macro_lower}"/;
353						} else {
354	0	0	0		0	if ( ($macro !~ /^\d+$/) && (defined $i_order{$mfield}) ) {
355	0				0	&bib'gotwarn("Unknown string: $macro in $mfield field");
356						}
357						}
358						}
359						}
360	1878				2955	$line .= $_;
361	1878				4658	while (/$lmatch/g) { $braces++; }
	1784				3881
362	1878				4652	while (/$rmatch/g) { $braces--; }
	1916				4015
363	1878	100			8706	last if ($braces <= 0);
364						# XXXXX We should try to detect an overflow -- after reading too many lines
365						}
366	132	50	33		326	if (eof && ($braces > 0)) {
367	0				0	&bib'gotwarn("File ended while still reading record");
368						}
369	132	50			258	if ($braces < 0) {
370	0				0	&bib'goterror("negative match level looking for $lmatch$rmatch");
371						}
372	132				476	$line;
373						}
374
375
376						#
377						# This subroutine handles concatenating strings that are seperated with
378						# a pound sign ('#'). It also will do string substitution for defined
379						# string to the right of the concatenation symbol.
380						#
381						sub do_concat {
382	32			32	125	local($rest) = @_;
383
384	32	50			172	return $rest unless $rest =~ /#/;
385
386						# This is _very_ ugly.
387						# Regular expressions just aren't powerful enough to do this.
388
389	32				58	local($left, $right);
390	32				59	local($bracelev) = 0;
391	32				42	local($quotes) = 0;
392	32				60	local($finished_string) = "";
393	32				45	local($macro, $macro_lower);
394	32				40	local($string_term);
395
396	32				115	$rest =~ s/$bib'cs_escape/$bib'cs_char_escape/go;
397
398	32				126	while ($rest =~ /#/) {
399	68				233	($left, $right) = split(/#/, $rest, 2);
400	68				210	while ($left =~ /\{/g) { $bracelev++; }
	346				621
401	68				183	while ($left =~ /\}/g) { $bracelev--; }
	314				560
402	68				178	while ($left =~ /"/g) { $quotes++; }
	10				30
403						#print STDERR ">>\n$left\n==== $bracelev/$quotes ====\n$right<<\n\n";
404	68	50	33		180	if ( ($bracelev <= 0) && ($quotes % 2 == 0) ) {
405						# The # occured outside of their text, so we concatenate
406
407						# Remember: if $left is changed, $quotes and $bracelev must be updated
408						## left side, checking for macro
409	0	0			0	if ($left =~ s/\}\s*$//) {
		0
410						# case: {foo} # ...
411	0				0	$bracelev++;
412	0				0	$string_term = '}';
413						} elsif ($left =~ s/"\s*$//) {
414						# case: "foo" # ...
415	0				0	$quotes--;
416	0				0	$string_term = '"';
417						} else {
418						# case: macro # ...
419	0				0	$left =~ s/(\S+)\s*$/"$1/;
420	0				0	$quotes++;
421	0				0	$string_term = '"';
422	0	0			0	if ($left !~ /"\d+$/) {
423	0				0	&bib'gotwarn("left string encountered during concatenation");
424						}
425						}
426
427						## right side, checking for macro
428	0	0			0	if ($right =~ s/^\s*([{"])//) {
429						# case: ... # "foo"
430						# We need to check for the case of {foo} # "bar", and "foo" # {bar}
431	0	0	0		0	if ( ($string_term eq '}') && ($1 eq '"') ) {
432	0				0	$right =~ s/^([^"]*)"/$1\}/;
433						}
434	0	0	0		0	if ( ($string_term eq '"') && ($1 eq '{') ) {
435	0				0	$left =~ s/"([^"]*)$/\{$1/;
436	0				0	$quotes--; $bracelev++;
	0				0
437						}
438						} else {
439						# case: ... # macro
440	0	0			0	if ($right =~ /^\s*([^"#%'(),={}\s]+)/) {
441	0				0	$macro = $1;
442	0				0	$macro_lower = $macro;
443	0				0	$macro_lower =~ tr/A-Z/a-z/;
444	0	0			0	if (defined $glb_replace{$macro_lower}) {
445	0				0	$right =~ s/^\s*$macro/$glb_replace{$macro_lower}$string_term/;
446						} else {
447	0	0			0	if ($macro !~ /^\d+$/) {
448	0				0	&bib'gotwarn("Unknown right string: $macro");
449						}
450	0				0	$right =~ s/^\s*$macro/$macro$string_term/;
451						}
452						} else {
453	0				0	&bib'gotwarn("Unknown text '$1' found.");
454						}
455						}
456						} else {
457						# It's inside their text, so leave it
458	68				142	$left .= "$bib'cs_temp";
459						}
460	68				109	$finished_string .= $left;
461	68				182	$rest = $right;
462						}
463	32				60	$finished_string .= $rest;
464
465	32				197	$finished_string =~ s/$bib'cs_temp/#/go;
466	32				107	$finished_string =~ s/$bib'cs_char_escape/$bib'cs_escape/go;
467	32				155	$finished_string;
468						}
469
470						######
471
472						sub explode {
473	132			132	253	local($rec) = @_;
474	132				263	local(%be_entry);
475	132				185	local(@e_values);
476	132				254	local($fld, $val);
477
478	132				3478	@e_values = split(/,\s(\w+)\s=\s*/, $rec);
479	132				873	($be_entry{'TYPE'}, $be_entry{'CITEKEY'}) =
480						( shift(@e_values) =~ /^\s\@\s(\w+)\s[{(]\s(\S+)/ );
481	132	50			364	&bib'goterror("error exploding bibtex record") unless scalar(@e_values) > 1;
482						# XXXXX 17 Dec 96, Changed ,?\s+[ to ,?\s*[
483	132				1001	$e_values[$#e_values] =~ s/\s,?\s[})]\s*$//; # zap the final delimiter
484	132				302	while (@e_values) {
485	1732				2727	($fld, $val) = splice(@e_values, 0, 2);
486	1732				2082	$fld =~ tr/A-Z/a-z/;
487	1732	100			11035	$val =~ s/^\s\{((.\|\n))\}\s*$/$1/
488						\|\| $val =~ s/^\s\"((.\|\n))\"\s*$/$1/;
489						# XXXXX Check to see if squeezing spaces here is ok.
490	1732				6087	$val =~ s/\s+/ /g;
491						# XXXXX If there are multiple fields of the same kind we will end
492						# up throwing away all but the last value!
493	1732				5023	$be_entry{$fld} = $val;
494						}
495	132	50	33		689	if ($opt_crossref && defined $be_entry{'crossref'}) {
496	0				0	%be_entry = &crossref_fill(%be_entry);
497						}
498	132				2181	%be_entry;
499						}
500
501						######
502
503						# This is the ordering r2b uses.
504						%i_order = (
505						'key', 10,
506						'author', 20,
507						'affiliation', 25,
508						'editor', 30,
509						'title', 40,
510						'booktitle', 50,
511						'location', 55,
512						'institution', 60,
513						'school', 70,
514						'journal', 80,
515						'type', 90,
516						'series', 100,
517						'volume', 110,
518						'number', 120,
519						'edition', 130,
520						'chapter', 140,
521						'pages', 150,
522						'publisher', 160,
523						'address', 170,
524						'month', 180,
525						'year', 190,
526						'price', 200,
527						'copyright', 210,
528						'keywords', 220,
529						'mrnumber', 230,
530						'language', 240,
531						'annote', 250,
532						'isbn', 260,
533						'ISBN', 261,
534						'issn', 270,
535						'ISSN', 271,
536						'subject', 275,
537						'abstract', 280,
538						'note', 290,
539						'contents', 300,
540						'url', 310,
541						);
542						sub bykey {
543						# undefined fields always go last
544	0	0		0	0	return 1 unless defined $i_order{$a};
545	0	0			0	return -1 unless defined $i_order{$b};
546	0				0	$i_order{$a} <=> $i_order{$b};
547						}
548
549						sub implode {
550	0			0	0	local(%entry) = @_;
551	0				0	local($ent);
552
553	0	0			0	return &bib'goterror("BibTeX: no TYPE field") unless defined $entry{'TYPE'};
554	0	0			0	return &bib'goterror("BibTeX: no CITEKEY field") unless defined $entry{'CITEKEY'};
555
556	0				0	$ent = join("", '@', $entry{'TYPE'}, '{', $entry{'CITEKEY'}, ",\n");
557	0				0	delete $entry{'TYPE'};
558	0				0	delete $entry{'CITEKEY'};
559
560						# I hope we're using the TeX character set, because if $entry{$field}
561						# contains a { without matching }'s, we're going to have hell to pay
562						# when we try to read it. We could check for it here, but what would
563						# we replace it with? $\lbrace$ is TeX-specific. It's also very slow.
564	0				0	foreach $field (sort bykey keys %entry) {
565	0				0	$ent .= " $field = \{$entry{$field}\},\n";
566						}
567
568						# XXXXX This should be smarter
569	0				0	$ent =~ s/ month = \{(...)\},/ month = \L$1,/;
570
571	0				0	substr($ent, -2, 1) = '';
572	0				0	$ent .= "\}\n";
573
574						# We now might have some fields that still have separators left in them,
575						# notably the keywords field. Right now we change them to space.
576						# XXXXX Should this be a newline, ';', '/', ',', or space?
577	0				0	$ent =~ s/$bib'cs_sep/ /go;
578
579	0				0	$ent;
580						}
581
582						######
583
584						# XXXXX A type field in an inbook citation does not mean ReportType, but
585						# the type of section.
586
587						%btx_to_can_fields =
588						('CITEKEY', 'CiteKey',
589						'title', 'Title',
590						'booktitle', 'SuperTitle',
591						'affiliation', 'AuthorAddress',
592						'school', 'School',
593						'organization', 'Organization',
594						'journal', 'Journal',
595						'type', 'ReportType',
596						'series', 'Series',
597						'volume', 'Volume',
598						'edition', 'Edition',
599						'chapter', 'Chapter',
600						'pages', 'Pages',
601						'howpublished', 'HowPublished',
602						'institution', 'Organization',
603						'publisher', 'Publisher',
604						'address', 'PubAddress',
605						'month', 'Month',
606						'year', 'Year',
607						'price', 'Price',
608						'copyright', 'Copyright',
609						'keywords', 'Keywords',
610						'mrnumber', 'MRNumber',
611						'language', 'Language',
612						'annote', 'Annotation',
613						'isbn', 'ISBN',
614						'issn', 'ISSN',
615						'subject', 'Field',
616						'abstract', 'Abstract',
617						'note', 'Note',
618						'contents', 'Contents',
619						'key', 'Key',
620						'url', 'Source',
621						'location', 'Location',
622						);
623
624						sub tocanon {
625	132			132	1008	local(%rec) = @_;
626	132				215	local(%can);
627	132				193	local($name, $btxf, $canf, $btxv);
628	132				234	local($_) = $rec{'TYPE'};
629	132				249	tr/A-Z/a-z/;
630						# NEW CANON TYPE <-- ORIGINAL BIBTEX
631	132	100			375	$can{'CiteType'} = 'article' if /^article/;
632	132	100			284	$can{'CiteType'} = 'book' if /^book/;
633	132	50			255	$can{'CiteType'} = 'book' if /^booklet/;
634	132	50			217	$can{'CiteType'} = 'book' if /^collection/;
635	132	50			227	$can{'CiteType'} = 'inproceedings' if /^conference/;
636	132	100			268	$can{'CiteType'} = 'inbook' if /^inbook/;
637	132	100			295	$can{'CiteType'} = 'inbook' if /^incollection/;
638	132	100			395	$can{'CiteType'} = 'inproceedings' if /^inproceedings/;
639	132	100			293	$can{'CiteType'} = 'manual' if /^manual/;
640	132	50			240	$can{'CiteType'} = 'thesis' if /^mastersthesis/;
641	132	50			229	$can{'CiteType'} = 'misc' if /^misc/;
642	132	100			263	$can{'CiteType'} = 'thesis' if /^phdthesis/;
643	132	100			261	$can{'CiteType'} = 'proceedings' if /^proceedings/;
644	132	100			251	$can{'CiteType'} = 'report' if /^techreport/;
645	132	50			240	$can{'CiteType'} = 'unpublished' if /^unpublished/;
646
647	132	50			340	if (!defined $can{'CiteType'}) {
648	0				0	&bib'gotwarn("Improper entry type: $rec{'TYPE'}");
649	0				0	$can{'CiteType'} = 'misc';
650						}
651
652	132	100			376	if (!defined $rec{'type'}) {
653	126	50			561	if ( $rec{'TYPE'} =~ /^phdthesis/i ) {
		50
654	0				0	$rec{'type'} = 'Ph.D.';
655						} elsif ( $rec{'TYPE'} =~ /^mastersthesis/i ) {
656	0				0	$rec{'type'} = 'Masters';
657						}
658						}
659
660
661	132	100			298	if (defined $rec{'author'} ) {
662						# check for braces around the whole name, in which case we will
663						# assume it is a corporate author.
664	110	50	33		291	if ( ($rec{'author'} =~ /^\{/) && ($rec{'author'} =~ /\}$/) ) {
665	0				0	$can{'CorpAuthor'} = substr($rec{'author'}, $[+1, length($rec{'author'})-2);
666						} else {
667	110				227	$can{'Authors'} = &bibtex_name_to_canon( $rec{'author'} );
668						}
669	110				230	delete $rec{'author'};
670						}
671
672	132	100			291	if (defined $rec{'editor'}) {
673	10				30	$can{'Editors'} = &bibtex_name_to_canon( $rec{'editor'} );
674						# XXXXX either we don't need this, or we need it for authors also.
675	10	50			56	delete $can{'Editors'} unless $can{'Editors'} =~ /\S/;
676	10				22	delete $rec{'editor'};
677						}
678
679	132	50	66		358	if ( defined $rec{'organization'} && defined $rec{'school'} ) {
680	0				0	&bib'gotwarn("Both school and organization defined.");
681	0				0	delete $rec{'school'};
682						}
683
684	132	50	66		523	if ( defined $rec{'publisher'} && defined $rec{'institution'} ) {
685	0				0	&bib'gotwarn("Both publisher and institution defined.");
686	0				0	delete $rec{'institution'};
687						}
688
689	132	100			274	if (defined $rec{'number'}) {
690	34	100			173	if ($can{'CiteType'} =~ /report\|thesis/) {
691	6				17	$can{'ReportNumber'} = $rec{'number'};
692						} else {
693	28				79	$can{'Number'} = $rec{'number'};
694						}
695	34				58	delete $rec{'number'};
696						}
697
698	132	100			275	if (defined $rec{'month'}) {
699	52				217	$can{'Month'} = &bp_util'canon_month($rec{'month'});
700	52	50			168	delete $rec{'month'} if defined $can{'Month'};
701						}
702
703						# done with massaging the fields
704	132				205	delete $rec{'TYPE'};
705
706	132				521	while ( ($btxf, $btxv) = each %rec) {
707	1658	50			4693	next unless $btxv =~ /\S/;
708	1658	100			2871	if (defined $btx_to_can_fields{$btxf}) {
709	934				3136	$can{$btx_to_can_fields{$btxf}} = $btxv;
710						} else {
711						# Unknown, so enter literal. Perhaps a warning?
712	724				2164	$can{$btxf} = $btxv;
713						}
714						}
715
716						# Handle title-like fields.
717	132				233	foreach $canf ('Title', 'SuperTitle', 'ReportType') {
718	396	100			907	next unless defined $can{$canf};
719	200				359	$can{$canf} =~ s/\{([^\s}]+)\}/${bib'cs_meta}3100$1${bib'cs_meta}3110/g;
720	200				680	$can{$canf} =~ s/\s\s+/ /g;
721						}
722
723						# tell them who we are
724	132				273	$can{'OrigFormat'} = $version;
725
726	132				3062	%can;
727						}
728
729						######
730						#
731						# This routine will convert a BibTeX name into it's canon form.
732						# It protects items in braces, such as {O'Rielly and Associates}, so that
733						# they are dealt with as one unit.
734						#
735
736						sub bibtex_name_to_canon {
737	120			120	204	local($name) = @_;
738	120				150	local($n);
739	120				223	local($vonlast, $von, $last, $jr, $first, $part);
740	120				157	local(@savechars);
741	120				182	local($saveptr) = '00';
742	120				146	local($canon_name) = '';
743
744	120				859	$name =~ s/\s+/ /g;
745
746						# Move each item enclosed in braces to an atomic character.
747	120				367	while ($name =~ s/(\{[^\}]*\})/$bib'cs_temp$saveptr/) {
748	0				0	push(@savechars, $1);
749	0				0	$saveptr++;
750						}
751
752	120				424	foreach $n ( split(/ and /, $name) ) {
753
754	304	50			1062	if ( ($vonlast, $jr, $first) = $n =~ /^([^,]),\s([^,]),\s([^,]*)$/ ) {
		50
755						# sep vonlast
756						} elsif ( ($vonlast, $first) = $n =~ /([^,]),\s([^,]*)/ ) {
757	0				0	$jr = '';
758						# sep vonlast
759						} else {
760	304				324	$first = '';
761	304				296	$jr = '';
762	304				353	$vonlast = '';
763	304				758	foreach $part (split(/ /, $n)) {
764	706	100	66		3026	if ($part =~ /^[^a-z]/ && ($vonlast eq '')) {
765	698				1387	$first .= " $part";
766						} else {
767	8				20	$vonlast .= " $part";
768						}
769						}
770						}
771	304				486	$vonlast =~ s/^\s+//;
772	304				321	$von = '';
773	304	100			498	if ($vonlast ne '') {
774	4	50			14	if ( $vonlast =~ /^[a-z]/ ) {
775	4				8	$last = '';
776	4				12	foreach $part (split(/ /, $vonlast)) {
777	8	50	33		48	if ($part =~ /^[a-z]/ && ($last eq '')) {
778	8				20	$von .= " $part";
779						} else {
780	0				0	$last .= " $part";
781						}
782						}
783	4				17	$von =~ s/^\s+//;
784	4				8	$last =~ s/^\s+//;
785						} else {
786	0				0	$last = $vonlast;
787						}
788						} else {
789	300				1544	($first, $last) = ($first =~ /^(.*)\s+(\S+)$/);
790						}
791	304				828	$first =~ s/^\s+//;
792
793	304				980	$canon_name .= $bib'cs_sep . join($bib'cs_sep2, $last, $von, $first, $jr);
794						}
795	120				425	$canon_name =~ s/^$bib'cs_sep//o;
796
797	120	50			369	if (@savechars) {
798	0				0	local($oldchar, $oldcharmb);
799	0				0	$saveptr = '00';
800	0				0	while (@savechars) {
801	0				0	$oldchar = shift @savechars;
802	0				0	$oldcharmb = $oldchar;
803	0				0	$oldcharmb =~ s/^{(.*)}$/$1/;
804	0	0			0	$canon_name =~ s/(^\|$bib'cs_sep\|$bib'cs_sep2)$bib'cs_temp$saveptr($\|$bib'cs_sep\|$bib'cs_sep2)/$1$oldcharmb$2/ \|\| $canon_name =~ s/$bib'cs_temp$saveptr/$oldchar/;
805	0				0	$saveptr++;
806						}
807						}
808
809	120				438	$canon_name;
810						}
811
812						######
813
814						# XXXXX We really ought to generate these at load time from the other list.
815						# XXXXX Format?
816
817						%can_to_btx_fields =
818						('CiteKey', 'CITEKEY',
819						'Title', 'title',
820						'SuperTitle', 'booktitle',
821						'AuthorAddress','affiliation',
822						'School', 'school',
823						'Organization', 'organization',
824						'Journal', 'journal',
825						'ReportType', 'type',
826						'Series', 'series',
827						'Volume', 'volume',
828						'Edition', 'edition',
829						'Chapter', 'chapter',
830						'Pages', 'pages',
831						'PagesWhole', 'pages',
832						'HowPublished', 'howpublished',
833						'Publisher', 'publisher',
834						'PubAddress', 'address',
835						'Month', 'month',
836						'Year', 'year',
837						'Price', 'price',
838						'Copyright', 'copyright',
839						'Keywords', 'keywords',
840						'MRNumber', 'mrnumber',
841						'Language', 'language',
842						'Annotation', 'annote',
843						'ISBN', 'isbn',
844						'ISSN', 'issn',
845						'Field', 'subject',
846						'Abstract', 'abstract',
847						'Note', 'note',
848						'Contents', 'contents',
849						'Key', 'key',
850						'Source', 'url',
851						'Location', 'location',
852						);
853
854						sub fromcanon {
855	0			0	0	local(%reccan) = @_;
856	0				0	local(%record);
857	0				0	local($name, $btxf, $canf, $canv);
858
859	0	0			0	if (!defined $reccan{'CiteType'}) {
860	0				0	&bib'gotwarn("BibTeX didn't find a CiteType field!");
861	0				0	$reccan{'CiteType'} = 'book';
862						}
863
864						# XXXXX 22Mar96: I think we had some mixup with incollection vs. inbook.
865
866	0				0	local($_) = $reccan{'CiteType'};
867	0	0			0	if (/^article/ ) { $record{'TYPE'} = 'article'; }
	0	0			0
		0
		0
		0
		0
		0
		0
		0
		0
		0
868	0				0	elsif (/^avmaterial/ ) { $record{'TYPE'} = 'misc'; }
869						elsif (/^book/ ) {
870	0	0			0	if (defined $reccan{'Publisher'}) { $record{'TYPE'} = 'book'; }
	0				0
871	0				0	else { $record{'TYPE'} = 'booklet'; } }
872						elsif (/^inbook/ ) {
873	0	0			0	if (defined $reccan{'SuperTitle'}) { $record{'TYPE'} = 'incollection' }
	0				0
874	0				0	else { $record{'TYPE'} = 'inbook'; } }
875	0				0	elsif (/^inproceedings/ ) { $record{'TYPE'} = 'inproceedings'; }
876	0				0	elsif (/^manual/ ) { $record{'TYPE'} = 'manual'; }
877	0				0	elsif (/^misc/ ) { $record{'TYPE'} = 'misc'; }
878						elsif (/^thesis/ ) {
879	0	0	0		0	if ( (defined $reccan{'ReportType'}) && ($reccan{'ReportType'} =~ /master/i)
880	0				0	) { $record{'TYPE'} = 'mastersthesis' }
881	0				0	else { $record{'TYPE'} = 'phdthesis'; } }
882	0				0	elsif (/^proceedings/ ) { $record{'TYPE'} = 'proceedings'; }
883	0				0	elsif (/^report/ ) { $record{'TYPE'} = 'techreport'; }
884	0				0	elsif (/^unpublished/ ) { $record{'TYPE'} = 'unpublished'; }
885						else {
886	0				0	&bib'gotwarn("Improper entry type: $reccan{'CiteType'}");
887	0				0	$record{'TYPE'} = 'misc';
888						}
889
890						# generate key if necessary, using the default method.
891	0	0			0	$reccan{'CiteKey'} = &bp_util'genkey(%reccan) unless defined $reccan{'CiteKey'};
892
893						# register our citekey
894	0				0	$reccan{'CiteKey'} = &bp_util'regkey($reccan{'CiteKey'});
895
896	0	0			0	if ( defined $reccan{'Authors'} ) {
897	0				0	$record{'author'} = &bp_util'canon_to_name($reccan{'Authors'}, 'bibtex');
898	0				0	delete $reccan{'Authors'};
899	0	0			0	if ($record{'author'} !~ / /) {
900	0	0			0	if ($record{'author'} =~ s/\240/ /g) {
901	0				0	$record{'author'} = $protectB . $record{'author'} . $protectE;
902						}
903						}
904						}
905	0	0			0	if ( defined $reccan{'CorpAuthor'} ) {
906						# no need for no-break spaces, as we're putting braces around it.
907	0				0	$reccan{'CorpAuthor'} =~ s/\240/ /g;
908	0	0			0	if (defined $record{'author'}) {
909	0	0			0	if (defined $reccan{'Organization'}) {
910	0				0	$record{'author'} .= ' and ' . $protectB . $reccan{'CorpAuthor'} . $protectE;
911						} else {
912	0				0	$record{'organization'} = $reccan{'CorpAuthor'};
913						}
914						} else {
915	0				0	$record{'author'} = $protectB . $reccan{'CorpAuthor'} . $protectE;
916						}
917	0				0	delete $reccan{'CorpAuthor'};
918						}
919
920	0	0			0	if ( defined $reccan{'Editors'} ) {
921	0				0	$record{'editor'} = &bp_util'canon_to_name($reccan{'Editors'}, 'bibtex');
922	0				0	delete $reccan{'Editors'};
923						}
924
925	0	0			0	if ( $reccan{'CiteType'} =~ /^(report\|unpublished)/ ) {
926	0	0			0	if ( defined $reccan{'Publisher'} ) {
		0
927	0				0	$record{'institution'} = $reccan{'Publisher'};
928	0				0	delete $reccan{'Publisher'};
929						} elsif ( defined $reccan{'Organization'} ) {
930	0				0	$record{'institution'} = $reccan{'Organization'};
931	0				0	delete $reccan{'Organization'};
932						}
933						}
934
935						# if ( $reccan{'CiteType'} =~ /^thesis/ ) {
936						# if ( defined $reccan{'Organization'} ) {
937						# $record{'school'} = $reccan{'Organization'};
938						# delete $reccan{'Organization'};
939						# }
940						# }
941
942	0	0			0	if (defined $reccan{'ReportNumber'}) {
		0
943	0	0			0	if (defined $reccan{'Number'}) {
944	0				0	&bib'gotwarn("Both Number and ReportNumber.");
945	0				0	delete $reccan{'Number'};
946						}
947	0	0			0	if ($reccan{'CiteType'} !~ /report\|thesis/) {
948	0				0	&bib'gotwarn("ReportNumber defined, but not in a report.");
949						}
950	0				0	$record{'number'} = $reccan{'ReportNumber'};
951	0				0	delete $reccan{'ReportNumber'};
952						} elsif (defined $reccan{'Number'}) {
953	0	0			0	if ($reccan{'CiteType'} =~ /report\|thesis/) {
954	0				0	&bib'gotwarn("Number defined inside a report.");
955						}
956	0				0	$record{'number'} = $reccan{'Number'};
957	0				0	delete $reccan{'Number'};
958						}
959
960	0	0			0	if (defined $reccan{'ReportType'}) {
961	0	0			0	if ($reccan{'ReportType'} !~ /($protectB\|$protectE)/o) {
962	0				0	$reccan{'ReportType'} =~ s/Ph\.\s*D\./${protectB}Ph.D.${protectE}/o;
963						}
964						}
965
966						# done with massaging the fields
967	0				0	delete $reccan{'CiteType'};
968						# We don't know any special information about any types
969	0				0	delete $reccan{'OrigFormat'};
970
971	0				0	while ( ($canf, $canv) = each %reccan) {
972	0	0			0	if (defined $can_to_btx_fields{$canf}) {
973	0				0	$record{$can_to_btx_fields{$canf}} = $canv;
974						} else {
975						### &bib'gotwarn("Unknown field: $canf"); ### this is really annoying! (pt/02/09/11)
976	0				0	$record{$canf} = $canv;
977						}
978						}
979
980	0				0	%record;
981						}
982
983						######
984
985						sub clear {
986	2			2	6	local($file) = @_;
987
988						# XXXXX currently we have just one strings mapping for all files.
989
990	2				44	%glb_replace = ();
991	2				7	$glb_eval_repl = 0;
992						}
993
994						######
995
996						sub crossref_fill {
997	0			0		local(%bent) = @_;
998	0					local($id) = $bent{'crossref'};
999	0					local(%crossent);
1000	0					local($cfield, $cval);
1001
1002	0					&bib'debugs("trying to crossref $id in $bent{'CITEKEY'}", 64);
1003	0	0				if (!defined $glb_crossref_entries{$id}) {
1004	0	0				if ( ! &get_record_ahead($id) ) {
1005	0					&bib'gotwarn("Could not find bibtex crossref: $id");
1006	0					return %bent;
1007						}
1008						}
1009						#print STDERR "using crossref $id. readahead has $#glb_readahead entries\n";
1010
1011	0					%crossent = &explode( $glb_crossref_entries{$id} );
1012
1013						# Merge the two records
1014						# We do this by simply adding any fields from the crossref entry that
1015						# don't exist in the original record.
1016	0					while ( ($cfield, $cval) = each %crossent) {
1017	0	0				next if defined $bent{$cfield};
1018	0					$bent{$cfield} = $cval;
1019						}
1020						# Now that we've successfully merged the entries, we can remove the
1021						# crossref entry
1022	0					delete $bent{'crossref'};
1023
1024	0					%bent;
1025						}
1026
1027						sub get_record_ahead {
1028	0			0		local($needed_id) = @_;
1029	0					local($id) = undef;
1030	0					local($next_record);
1031
1032						#print STDERR "new crossref: $needed_id\n" unless defined $glb_crossref_needed{$needed_id};
1033	0					$glb_crossref_needed{$needed_id} = 1;
1034
1035						# We can't have the read routine returning our own results to us!
1036	0					$glb_noreadahead = 1;
1037
1038	0					while ($next_record = &read) {
1039						# We look in each record to see if there is another crossref field.
1040						# If there seems to be, we make a note of it, so we will store the
1041						# record right away.
1042	0	0				if ($next_record =~ /crossref\s=\s[{"]([^}"]+)/i) {
1043	0	0				if (!defined $glb_crossref_entries{$1}) {
1044	0					$glb_crossref_needed{$1} = 1;
1045						}
1046						}
1047
1048	0					($id) = ( $next_record =~ /^\s\@\s\w+\s[{(]\s([^,\s]+)/ );
1049
1050	0	0				if (defined $glb_crossref_needed{$id}) {
1051	0					$glb_crossref_entries{$id} = $next_record;
1052	0					delete $glb_crossref_needed{$id};
1053						}
1054
1055	0					push(@glb_readahead, $next_record);
1056	0	0				last if $id eq $needed_id;
1057						}
1058
1059	0					&bib'debugs("crossref looking for " . join(" ", keys %glb_crossref_needed) . ".", 4);
1060	0					&bib'debugs("crossref has " . join(" ", keys %glb_crossref_entries) . ".", 4);
1061
1062						# Now let the read routine use the readahead information
1063	0					$glb_noreadahead = 0;
1064
1065	0					($id eq $needed_id);
1066						}
1067
1068
1069						#######################
1070						# end of package
1071						#######################
1072
1073						1;