File Coverage

blib/lib/Biblio/Document/Parser/Brody.pm

Criterion	Covered	Total	%
statement	27	166	16.2
branch	5	86	5.8
condition	2	36	5.5
subroutine	5	10	50.0
pod	2	7	28.5
total	41	305	13.4

line	stmt	bran	cond	sub	pod	time	code
1							package Biblio::Document::Parser::Brody;
2
3							######################################################################
4							#
5							# Biblio::Document::Parser::Brody;
6							#
7							######################################################################
8							#
9							# Reference Parser by Tim Brody
10							#
11							# This file is part of ParaCite Tools (http://paracite.eprints.org/developers/)
12							#
13							# Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
14							#
15							# ParaTools is free software; you can redistribute it and/or modify
16							# it under the terms of the GNU General Public License as published by
17							# the Free Software Foundation; either version 2 of the License, or
18							# (at your option) any later version.
19							#
20							# ParaTools is distributed in the hope that it will be useful,
21							# but WITHOUT ANY WARRANTY; without even the implied warranty of
22							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23							# GNU General Public License for more details.
24							#
25							# You should have received a copy of the GNU General Public License
26							# along with ParaTools; if not, write to the Free Software
27							# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28							#
29							######################################################################
30
31							=pod
32
33							=head1 NAME
34
35							Biblio::Document::Parser::Brody
36
37							=head1 DESCRIPTION
38
39							Module that parses reference strings from a document. Relies on a reference section starting with a title "References", "Bibliography", or "Cited". Seperates references by prefixed number (e.g. "[1]" or "1.") or by year (e.g. "Smith, J (1992)").
40
41							=head1 SYNOPSIS
42
43							use Biblio::Document::Parser::Brody;
44
45							my $parser = new Biblio::Document::Parser::Brody();
46
47							my @refs = $parser->parse(\*FILE_IO);
48							my @refs = $parser->parse($str);
49
50							=head1 METHODS
51
52							=cut
53
54	1			1		7028	use strict;
	1					3
	1					33
55
56	1			1		5	use Carp;
	1					2
	1					60
57	1			1		4	use vars qw($DEBUG $RE_BOR $RE_EOR $RE_NAME_CHARS $RE_NAME $RE_NAME_LIST_CHARS $MAX_SIZE);
	1					1
	1					2963
58
59							# Set up the input/output appropriately
60							#use open IN => ':encoding(latin1)', OUT => ':utf8';
61
62							$MAX_SIZE = 1024*2000; # 2MB
63
64							$RE_BOR = qr/^[^a-z](?:references(?:\s+cited)?)\|(?:bibliography)[^a-z]$/i;
65							$RE_EOR = qr/^\s(?:\d+\.?\s)(?:acknowledge?ment)\|(?:footnote)\|(?:appendix)\|(?:abbreviation)\|(?:glossary)\|(?:figure)[^\n]{0,10}\s$/i;
66							$RE_NAME_CHARS = qr/[a-zA-Z`'\-]/;
67							$RE_NAME_LIST_CHARS = qr/[a-zA-Z,\.;\-\s\&'`]/;
68							$RE_NAME = qr/(?:[a-zA-Z`'\-]{4,7}, (?:[a-zA-Z]\. )+)/;
69
70							=pod
71
72							=over 4
73
74							=item $p = Biblio::Document::Parser::Brody->new([-debug=>1])
75
76							Constructor method for class.
77
78							=cut
79
80							sub new {
81	1			1	1	17	my ($class,%args) = @_;
82	1					4	$DEBUG = $args{-debug};
83	1					5	return bless {}, $class;
84							}
85
86							=pod
87
88							=item @refs = $p->parse($str)
89
90							Parses a string $str and returns a list of unstructured reference strings.
91
92							=cut
93
94							sub parse {
95	1			1	1	616	my $self = shift @_;
96	1					2	my $arg = shift @_;
97	1					2	my $BIBL = '';
98
99							# UNIVERSAL::isa($arg,"IO::Handle") doesn't work?
100	1	50				6	if( ref($arg) ) {
101	1	50				48	read($arg,$BIBL,$MAX_SIZE) or croak "Error reading from file handle: $!\n";
102							} else {
103	0					0	$BIBL = join('',$arg,@_);
104							}
105
106	1	50				5	croak "No data to parse\n" unless length($BIBL);
107
108	1					3	$BIBL =~ s/\f/\n\n/sg;
109
110	1					2	my %HEADERS;
111
112	1					224	while( $BIBL =~ /(?:\n[\r[:blank:]]){2}([^\n]{0,40}\w+[^\n]{0,40})(?:\n[\r[:blank:]]){3}/osg ) {
113	0					0	$HEADERS{header_to_regexp($1)}++;
114							}
115
116	1	50				4	if( %HEADERS ) {
117	0					0	my @regexps = sort { $HEADERS{$b} <=> $HEADERS{$a} } keys %HEADERS;
	0					0
118	0					0	my $regexp = $regexps[0];
119	0	0				0	if( $HEADERS{$regexp} > 3 ) {
120	0					0	my $c = $BIBL =~ s/(?:\n[\r[:blank:]]){2}(?:$regexp)(?:\n[\r[:blank:]]){3}/\n\n/sg;
121	0	0				0	warn "Applying regexp: $regexp ($HEADERS{$regexp} original matches) Removed $c header/footers using ($HEADERS{$regexp} original matches): $regexp\n" if $DEBUG;
122							} else {
123	0	0				0	warn "Not enough matching header/footers were found\n" if $DEBUG;
124							}
125							} else {
126	1	50				4	warn "No header/footers were found\n" if $DEBUG;
127							}
128
129							# Kill any bad chars
130							# local *lat2uni = convertor( 'latin1', 'utf8' );
131							# lat2uni(\$BIBL);
132
133							# if( $BIBL =~ /$RE_BOR/mi ) {
134							# $BIBL = $';
135							# } else {
136							# croak "FATAL: Unable to find reference section\n";
137							# }
138
139
140	1					2	my @REFS;
141
142							# Attempt to find the reference section
143	1		33			125	while( !@REFS && ($BIBL =~ /$RE_BOR/mi) && ($BIBL = $') ) {
			33
144	0					0	my $c = 0;
145
146							# Count the number of occurences of [\d] over the next 2k of data or so
147	0					0	my $buffer = substr($BIBL, 0, 2048);
148	0					0	$c = 0;
149	0	0				0	while($buffer =~ m/^\s*\[\d+\]/mog) { last if ++$c == 5 }
	0					0
150	0	0				0	if( $c >= 5 ) {
151	0	0				0	warn "Style = numbered square ([1])\n" if $DEBUG;
152	0	0				0	last if (@REFS = &style_numbered_square($BIBL));
153							}
154
155							# How about 1. notation
156							# $buffer = substr($BIBL, 0, 2046);
157	0					0	$c = 0;
158	0	0				0	while($buffer =~ m/^\s*(\d+)\./mog) { last if ++$c == 5 }
	0					0
159	0	0				0	if( $c >= 5 ) {
160	0	0				0	warn "Style = numbered (1.)\n" if $DEBUG;
161							# $BIBL =~ s/^\s*(\d+)\./\[$1\]/mg;
162	0	0				0	last if (@REFS = &style_numbered($BIBL));
163							}
164
165							# Now we're getting desperate - hopefully its a name list followed by year
166							# $buffer = substr($BIBL, 0, 2048);
167	0					0	$c = 0;
168	0	0				0	while($buffer =~ m/^$RE_NAME_LIST_CHARS{10,40}[^\d\-]19\|20\d{2}[^\d\-]/mog) { last if ++$c == 5 }
	0					0
169	0	0				0	if( $c >= 5 ) {
170	0	0				0	warn "Style = years\n" if $DEBUG;
171	0	0				0	last if (@REFS = &style_years($BIBL));
172							}
173
174							# if( @REFS ) {
175							# last;
176							# } elsif( $BIBL =~ /$RE_BOR/mi ) {
177							# warn "Skipping section ...\n" if $DEBUG;
178							# $BIBL = $';
179							# } else {
180							# last;
181							# }
182							}
183
184	1					7	for( my $i = 0; $i < @REFS; $i++ ) {
185	0	0				0	my $ref = $REFS[$i] or next;
186							# $REFS[$i] = "[" . ($i+1) . "] " . unicode_string($ref);
187	0					0	$REFS[$i] = "[" . ($i+1) . "] " . $ref;
188							}
189
190	1	0				4	return grep { defined($_) && length($_) } @REFS;
	0
191							}
192
193							#my ($BIBL, $buffer);
194							#$BIBL = '';
195
196							#my $lc = 0;
197
198							#die "FATAL: Input has gone beyond $MAX_SIZE byte limit" if read(STDIN,$BIBL,$MAX_SIZE) == $MAX_SIZE;
199
200							#die "Empty input" unless length($BIBL);
201
202							#while( read(STDIN,$buffer,4096) ) {
203							# $BIBL .= $buffer;
204							# die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE;
205							#}
206
207
208							#while( <> ) {
209							# s/\f/\n\n/sg;
210							# $BIBL = $_ . $BIBL;
211							# die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE;
212							# if( $_ =~ /^(?:\n\s*){3}/ ) {
213							# # Regexp matches for the end of the string are really bad performance
214							# # Lines are in reverse order!
215							# if( $BIBL =~ /^(?:\n\s){3}([^\n]{0,40}\w+[^\n]{0,40})(?:\n\s){2}/os ) {
216							# $HEADERS{header_to_regexp($1)}++;
217							# }
218							# }
219							#}
220
221							# Put the lines back in-order
222							#my @lines = split(/\n/,$BIBL);
223							#$BIBL = '';
224							#for(@lines) {
225							# $BIBL = $_ . "\n" . $BIBL;
226							#}
227
228							# Read in the document
229							#while( read(STDIN,$buffer,4096) ) {
230							# if( length($BIBL) > $MAX_SIZE ) {
231							# die "FATAL: Input has gone beyond $MAX_SIZE Bytes limit\n";
232							# }
233							# $BIBL .= $buffer;
234							#}
235
236							#print "Ref section:\n", $BIBL;
237
238							# Change to utf8
239							#use utf8;
240
241							#### REMAINING FUNCTIONS ARE INTERNAL OR DEPRECATED ####
242
243							sub end_of_references {
244	0			0	0		my $ref = shift;
245	0	0	0				if( $$ref =~ /${RE_EOR}/im \|\|
246							$$ref =~ /^\s*acknowledgements:/im ) {
247	0						$$ref = $`;
248	0						return 1;
249							}
250	0	0					if( $$ref =~ /(?:\s*\n){3,}/s ) {
251	0						$$ref = $`;
252	0						return 1;
253							}
254	0	0					if( length($$ref) > 1024 ) {
255	0						return 1;
256							}
257	0						return 0;
258							}
259
260							sub style_numbered {
261	0			0	0		my @REFS = split(/^\s*(\d+\.)/m, shift);
262
263	0		0				shift @REFS while (@REFS && ($REFS[0] !~ /^\d+\./ \|\| substr($REFS[0],0,-1) != 1));
			0
264
265	0						my $i = 2;
266	0						while( $i < @REFS ) {
267	0	0					if( $REFS[$i] =~ /^\d+\./ ) {
268	0						my $val = substr($REFS[$i],0,-1);
269	0	0					if( $val != ($i/2)+1 ) {
270	0						$REFS[$i-1] .= splice(@REFS,$i,1);
271							} else {
272	0						$i+=2;
273							}
274							} else {
275	0						$REFS[$i-1] .= splice(@REFS,$i,1);
276							}
277	0	0					if( end_of_references(\$REFS[$i-1]) ) {
278	0						splice(@REFS,$i);
279							}
280							}
281
282	0						for( my $i = 0; $i < @REFS; $i++ ) {
283	0						$REFS[$i] .= splice(@REFS,$i+1,1);
284	0						$REFS[$i] =~ s/\s+/ /sg;
285	0						$REFS[$i] =~ s/^\s+//;
286	0						$REFS[$i] =~ s/\s+$//;
287							}
288
289	0						@REFS;
290							}
291
292							sub style_numbered_square {
293	0			0	0		my $BIBL = shift;
294
295							# Split the bibliography
296	0						$BIBL =~ /(?=\[\d+\])/;
297	0	0					my @REFS = split(/^\s*\[(\d+)\]/m, $') or return ();
298	0	0					shift @REFS unless $REFS[0];
299
300
301							# Make sure there is a "value" to go with a reference number
302							# for( my $i = 0; $i < @REFS; $i+=2 ) {
303							# if( $REFS[$i+1] =~ /\[\d+\]/ ) {
304							# splice(@REFS,$i+1,0,'');
305							# }
306							# }
307
308							# If there is a large reference its probably the end of the bibliography
309	0						for( my $i = 10; $i < @REFS; $i++ ) {
310	0	0					if( length($REFS[$i]) > 1024 ) {
311	0						splice(@REFS, $i+1);
312	0						$REFS[$i] = substr($REFS[$i],0,1024) . " RUNAWAY_REFERENCE_DETECTED ";
313							}
314							}
315
316							# Add any out-of-order chunks to the previous reference value
317	0						my $last = 0;
318	0						my $max = 0;
319	0						for( my $i = 0; $i < @REFS; $i+=2 ) {
320	0						my $n = $REFS[$i];
321							# $n =~ s/\D//g;
322	0	0					$max = $n if $n > $max;
323	0	0					if( $n == $last+1 ) {
324	0						$last++;
325	0						next;
326							} else {
327							# Join this out-of-order chunk onto the previous ref.
328	0						$REFS[$i-1] .= splice(@REFS,$i,2);
329							}
330							}
331
332							# Remove any trailing garbage
333	0						splice(@REFS, $last*2, -1);
334
335							# Presumably there is a gap between the last reference and any trailing junk
336	0						$REFS[$#REFS] =~ s/(\r?\n){2}.*//s;
337
338							# Prettify the references
339	0						for( my $i = 1; $i < @REFS; $i+=2 ) {
340	0						$REFS[$i] =~ s/[\r\n]+/ /sg;
341	0						$REFS[$i] =~ s/^\s+//sg;
342	0						$REFS[$i] =~ s/\s+$//sg;
343							}
344
345							# Get rid of the numbering
346	0						for( my $i = 0; $i < @REFS; $i++ ) {
347							# $REFS[$i] = $REFS[$i+1];
348	0						splice(@REFS,$i,2,$REFS[$i+1]);
349							}
350
351	0						return @REFS;
352							}
353
354							sub style_years {
355	0			0	0		my $BIBL = shift;
356
357	0						$BIBL =~ s/^\s+//sg;
358
359							# Convert very long lines of spaces into a return
360	0						$BIBL =~ s/ {70} */\n/sg;
361
362	0						my @REFS;
363
364							# Lets try splitting on a blank line
365	0						@REFS = split(/((?:\s*\n){2})/, $BIBL);
366
367	0		0				shift @REFS while (@REFS && $REFS[0] !~ /^$RE_NAME_LIST_CHARS+\d{4}\D/);
368
369							# That didn't work, lets split on left-aligned things (where the next line(s) are blank or indented)
370	0	0	0				if( !@REFS \|\| length($REFS[0]) > 300 ) {
371	0						@REFS = split(/\n[ ]{0,2}((?:(?:\S$RE_NAME_LIST_CHARS{10,})\|$RE_NAME[^\d\-])\d{4}[^\d\-][^\n]+)/, $BIBL);
372	0		0				shift @REFS while (@REFS && $REFS[0] !~ /^$RE_NAME_LIST_CHARS{10,}\d{4}\D/s);
373
374							#return @REFS;
375
376	0						for( my $i = 1; $i < @REFS; $i++ ) {
377	0	0					if( end_of_references(\$REFS[$i]) ) {
		0
378	0						splice(@REFS,$i+1);
379							# Indented
380							} elsif( $REFS[$i] =~ /^\s* {5}\|\t/m ) {
381	0						$REFS[$i-1] .= splice(@REFS,$i,1);
382							}
383							}
384							} else {
385	0						for( my $i = 1; $i < @REFS; $i++ ) {
386	0	0					if( end_of_references(\$REFS[$i]) ) {
387	0						splice(@REFS,$i+1);
388							}
389							}
390							}
391
392							# If we find what looks like the end of the reference section, discard the trailing rubbish
393							# for( my $i = 0; $i < @REFS; $i++ ) {
394							# if( end_of_references(\$REFS[$i]) ) {
395							# splice(@REFS,$i+1);
396							# } elsif( $BIBL =~ /(\r?\n){3}/s ) {
397							# $REFS[$i] = $`;
398							# splice(@REFS,$i+1);
399							# }
400							# }
401
402	0	0					unless( @REFS ) {
403	0						warn "Unable to split year-based references\n";
404	0						return ();
405							}
406
407							# Remove heavily indented lines following a blank line
408	0						for( my $i = 1; $i < @REFS; $i++ ) {
409	0	0	0				if( $REFS[$i-1] !~ /\S/ && $REFS[$i] =~ /^\s{40}/ ) {
410	0						splice(@REFS,$i,1);
411	0						$i--;
412							}
413							}
414
415							# Join refs with the previous reference if they are very short or are quite short and don't start with ...(year)
416	0						for( my $i = 1; $i < @REFS; $i++ ) {
417	0						my $l = $REFS[$i];
418	0						$l =~ s/\s+//sg;
419	0	0	0				if( (length($l) < 30) \|\|
			0
420							(length($l) < 50 && $REFS[$i] !~ /^$RE_NAME_LIST_CHARS{10,40}[^\d\-](\d{4})[^\d\-]/s) ) {
421	0						$REFS[$i-1] .= $REFS[$i];
422	0						splice(@REFS,$i,1);
423	0						$i--;
424							}
425							}
426
427							# If we find 3 sequential references without years near the beginning we probably have trailing garbage
428	0						my $lc = 0;
429	0						for( my $i = 10; $i < @REFS; $i++ ) {
430	0	0					if( $REFS[$i] =~ /^\D{10,50}19\|20\d{2}/s ) {
431	0						$lc = 0;
432							} else {
433	0						$lc++;
434							}
435	0	0					if( $lc == 3 ) {
436	0						splice(@REFS,$i-2);
437							}
438							}
439
440							# Remove lines without any numbers that are quite long (excluding spaces)
441	0						for( my $i = 0; $i < @REFS; $i++ ) {
442	0						my $l = $REFS[$i];
443	0						$l =~ s/\s+//sg;
444	0	0	0				if( length($l) > 100 && $REFS[$i] !~ /\d/ ) {
445	0						splice(@REFS,$i,1);
446							}
447							}
448
449							# Prettify
450	0						map { $_ =~ s/\s+/ /sg; $_ =~ s/^\s+//; $_ =~ s/\s+$//s; } @REFS;
	0
	0
	0
451
452							# This doesn't work - names are too icky
453							# Now go back in and split anything that looks like name, x (year)
454							# for( my $i = 0; $i < @REFS; $i++ ) {
455							# my @srefs = grep { $_ =~ /\S/ } split(/((?:[a-zA-Z\-\'\.]+\s,\s[a-zA-Z\.]+.{0,7})+\d{4}\b)/, $REFS[$i]);
456							# next unless @srefs > 2;
457							#print "Split reference:\n",
458							# (map { "PART: \"$_\"\n" } @srefs), "\n";
459							# }
460							#die;
461
462	0						return @REFS;
463							}
464
465							sub header_to_regexp {
466	0			0	0		my $header = shift;
467	0						$header =~ s/([\\\\|\[\]\.\*\+\?\{\}])/\\$1/g;
468	0						$header =~ s/\s+/\\s+/g;
469	0						$header =~ s/\d+/\\d+/g;
470	0						return $header;
471	0						return q/(?:\n\s){3}(/.$header.q/)(?:\n\s){2}/;
472							}
473
474							#sub unicode_string {
475							# $_ = shift();
476							# s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg;
477							# s/([\x80-\xff])/sprintf("&#x%04x;",ord($1))/seg;
478							# return $_;
479							#}
480
481							1;
482
483							__END__