File Coverage

blib/lib/Biblio/Document/Parser/Standard.pm

Criterion	Covered	Total	%
statement	160	300	53.3
branch	54	192	28.1
condition	16	77	20.7
subroutine	9	11	81.8
pod	2	2	100.0
total	241	582	41.4

line	stmt	bran	cond	sub	pod	time	code
1							package Biblio::Document::Parser::Standard;
2
3							######################################################################
4							#
5							# Biblio::Document::Parser::Standard;
6							#
7							######################################################################
8							#
9							# This file is part of ParaCite Tools (http://paracite.eprints.org/developers/)
10							#
11							# Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
12							#
13							# ParaTools is free software; you can redistribute it and/or modify
14							# it under the terms of the GNU General Public License as published by
15							# the Free Software Foundation; either version 2 of the License, or
16							# (at your option) any later version.
17							#
18							# ParaTools is distributed in the hope that it will be useful,
19							# but WITHOUT ANY WARRANTY; without even the implied warranty of
20							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21							# GNU General Public License for more details.
22							#
23							# You should have received a copy of the GNU General Public License
24							# along with ParaTools; if not, write to the Free Software
25							# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26							#
27							######################################################################
28
29							require Exporter;
30							@ISA = ("Exporter", "Biblio::Document::Parser");
31
32	1			1		7642	use Biblio::Document::Parser::Utils qw( normalise_multichars );
	1					4
	1					75
33
34	1			1		153	use 5.006;
	1					5
	1					40
35	1			1		5	use strict;
	1					2
	1					32
36	1			1		4	use warnings;
	1					2
	1					42
37	1			1		5	use vars qw($DEBUG);
	1					1
	1					5974
38
39							our @EXPORT_OK = ( 'parse', 'new' );
40
41							$DEBUG = 0;
42
43							=pod
44
45							=head1 NAME
46
47							B - document parsing functionality
48
49							=head1 SYNOPSIS
50
51							use Biblio::Document::Parser::Standard;
52							use Biblio::Document::Parser::Utils;
53							# First read a file into an array of lines.
54							my $content = Biblio::Document::Parser::Utils::get_content("http://www.foo.com/myfile.pdf");
55							my $doc_parser = new Biblio::Document::Parser::Standard();
56							my @references = $doc_parser->parse($content);
57							# Print a list of the extracted references.
58							foreach(@references) { print "-> $_\n"; }
59
60							=head1 DESCRIPTION
61
62							Biblio::Document::Parser::Standard provides a fairly simple implementation of
63							a system to extract references from documents.
64
65							Various styles of reference are supported, including numeric and indented,
66							and documents with two columns are converted into single-column documents
67							prior to parsing. This is a very experimental module, and still contains
68							a few hard-coded constants that can probably be improved upon.
69
70							=head1 METHODS
71
72							=over 4
73
74							=item $parser = Biblio::Document::Parser::Standard-Enew()
75
76							The new() method creates a new parser instance.
77
78							=cut
79
80							sub new
81							{
82	1			1	1	16	my($class) = @_;
83	1					10	my $self = {};
84	1					7	return bless($self, $class);
85							}
86
87							=pod
88
89							=item @references = $parser-Eparse($lines, [%options])
90
91							The parse() method takes a string as input (see the get_content()
92							function in Biblio::Document::Parser::Utils for a way to obtain this), and returns a list
93							of references in plain text suitable for passing to a CiteParser module.
94
95							=cut
96
97							sub parse
98							{
99	1			1	1	8	my($self, $lines, %options) = @_;
100	1					4	$lines = _addpagebreaks($lines);
101	1					11	my @lines = split("\n", $lines);
102	1					6	my($pivot, $avelen) = $self->_decolumnise(@lines);
103
104	1					2	my $in_refs = 0;
105	1					11	my @ref_table = ();
106	1					2	my $curr_ref = "";
107	1					2	my @newlines = ();
108	1					2	my $outcount = 0;
109	1					10	my @chopped_lines = @lines;
110							# First isolate the reference array. This ensures that we handle columns correctly.
111	1					3	foreach(@lines)
112							{
113	9					10	$outcount++;
114	9					10	chomp;
115	9	100				38	if (/(?:references)\|(?:bibliography)\|(?:\s+cited)/i)
		50
116							{
117	1					2	last;
118							}
119							elsif (/\f/)
120							{
121							# No sign of any references yet, so pop off up to here
122	0					0	for(my $i=0; $i<$outcount; $i++) { shift @chopped_lines; }
	0					0
123	0					0	$outcount = 0;
124							}
125							}
126	1					2	my @arr1 = ();
127	1					2	my @arr2 = ();
128	1					2	my @arrout = ();
129	1					2	my $indnt = "";
130	1	50				3	if ($pivot)
131							{
132	1					3	my ($pivotl,$pivotr) = ($pivot-5,$pivot+5);
133	1					4	foreach(@chopped_lines)
134							{
135	15					18	chomp;
136	15	50				26	if (/\f/)
137							{
138	0					0	push @arrout, @arr1;
139	0					0	push @arrout, @arr2;
140	0					0	@arr1 = ();
141	0					0	@arr2 = ();
142							}
143							else
144							{
145	15	50				89	if(/^(.{$pivotl,$pivotr})\s{3}(\s{3,})?(\S.+?)$/)
146							{
147							# push @arr1, $indnt.$1;
148	0	0				0	push @arr1, $1 if defined($1);
149	0	0	0			0	push @arr2, ($2\|\|'').$3 if defined($3);
150							}
151							else
152							{
153	15					39	push @arr1, $indnt.$_;
154							}
155							}
156							}
157	1					7	push @arrout, @arr1;
158	1					2	push @arrout, @arr2;
159	1					7	@chopped_lines = @arrout;
160							}
161	1					2	my $prevnew = 0;
162	1					3	foreach(@chopped_lines)
163							{
164	15					19	chomp;
165	15	50	66			120	if (/^\sreferences\s$/i \|\| /REFERENCES/ \|\| /Bibliography/i \|\| /References and Notes/)
			66
			33
166							{
167	1					2	$in_refs = 1;
168	1	50				6	push @newlines, $' if defined($'); # Capture bad input
169	1					3	next;
170							}
171	14	50	33			151	if (/^\s\bappendix\b/i \|\| /_{6}.{0,10}$/ \|\| /^\s\btable\b/i \|\| /wish to thank/i \|\| /\bfigure\s+\d/)
			33
			33
			33
172							{
173	0					0	$in_refs = 0;
174							}
175
176	14	100				45	if (/^\s*$/)
177							{
178	6	100				12	if ($prevnew) { next; }
	2					4
179	4					5	$prevnew = 1;
180							}
181							else
182							{
183	8					10	$prevnew = 0;
184							}
185
186	12	50				31	if (/^\s\d+\s$/) { next; } # Page number
	0					0
187
188	12	100				26	if ($in_refs)
189							{
190	5	50				13	my $spaces = /^(\s+)/ ? length($1) : 0;
191	5	50	33			35	if( @newlines && /^(\s+)[a-z]/ && _within(length($1),length($newlines[$#newlines]),5) ) {
			33
192	0					0	s/^\s+//s;
193	0					0	$newlines[$#newlines] .= $_;
194							} else {
195	5					13	push @newlines, $_;
196							}
197							}
198							}
199							# We failed to find the reference section, we'll do a last-ditch effect at finding numbered
200							# refs
201	1	50				4	unless($in_refs) {
202	0					0	my $first = 0;
203	0					0	my $lastnum = 0;
204	0					0	my $numwith = 0;
205	0					0	my $numwo = 0;
206	0					0	for(my $i = 0; $i < @chopped_lines; $i++) {
207	0					0	$_ = $chopped_lines[$i];
208	0	0	0			0	if( /^\s[\[$](\d+)[\]$]/ \|\| /^\s(\d+)(?:\.\|\s{5,})/ ) {
		0
		0
209	0	0				0	$first = $1 if $1 == 1;
210	0	0	0			0	if( $lastnum && $1 == $lastnum+1 ) {
211	0					0	$numwo = 0;
212	0					0	$numwith++;
213	0					0	$lastnum++;
214							} else {
215	0					0	$first = $i;
216	0					0	$lastnum = $1;
217							}
218							} elsif( $numwo++ == 5 ) { # Reset
219	0					0	$first = $lastnum = $numwith = $numwo = 0;
220							} elsif( $numwith == 5 ) {
221	0					0	last;
222							}
223							}
224	0	0	0			0	@newlines = splice(@chopped_lines,$first) if $first && $numwith == 5;
225							}
226							#warn "BEGIN REF SECTION\n", join("\n",@newlines), "\nEND REF SECTION\n";
227							# Work out what sort of separation is used
228	1					2	my $type = 0;
229	1					2	my $TYPE_NEWLINE = 0;
230	1					2	my $TYPE_INDENT = 1; # First line indented
231	1					2	my $TYPE_NUMBER = 2;
232	1					2	my $TYPE_NUMBERSQ = 3;
233	1					3	my $TYPE_LETTERSQ = 4;
234	1					1	my $TYPE_INDENT_OTHER = 5; # Other lines indented
235	1					2	my $numnew = 0;
236	1					1	my $numind = 0;
237	1					2	my $numnum = 0;
238	1					2	my $numsq = 0;
239	1					2	my $lettsq = 0;
240	1					1	my $indmin = 255;
241	1					2	my $indmax = 0;
242	1					7	my @indented;
243							# Handle numbered references joined together (e.g. bad to-text conversion)
244	1					5	my $ref_sect = join "\n", @newlines;
245	1					1	my $ref_b = 1; my $ref_e = 2;
	1					2
246	1					2	my @num_refs;
247	1					19	while( $ref_sect =~ s/(\[$ref_b\].+?)(?=\[$ref_e\])//sg ) {
248	0					0	$ref_b++; $ref_e++;
	0					0
249	0					0	push @num_refs, split("\n", $1);
250							}
251	1	50				5	if( $ref_b >= 5 ) {
252	0					0	@newlines = @num_refs;
253	0	0				0	push @newlines, $ref_sect if defined($ref_sect);
254							}
255							# Resume normal processing
256	1					2	foreach(@newlines)
257							{
258	6					16	$_ = normalise_multichars($_);
259	6	100				24	if (/^\s*$/)
260							{
261	3					6	$numnew++;
262							}
263	6	50				14	if (/^(\s+)\b/)
264							{
265	0	0				0	if (length $1 < $indmin) { $indmin = length $1; }
	0					0
266	0	0				0	if (length $1 > $indmax) { $indmax = length $1; }
	0					0
267	0	0	0			0	if( length($1) >= $indmax && /^\s+[A-Z]/ ) { $numind++ }
	0					0
268							}
269	6	50				14	if (/^\s*\d+\.?\s+[[:alnum:]]/)
270							{
271	0					0	$numnum++;
272							}
273	6	50				12	if (/^\s*[\[$]\d+[\]$]\s+[[:alnum:]]/)
274							{
275	0					0	$numsq++;
276							}
277	6	50				29	if (/^\s[\[$][A-Za-z]\w[\]$]\s/)
278							{
279	0					0	$lettsq++;
280							}
281							}
282
283							# if ($numnew < ($#newlines-5) && ($indmax > $indmin) && $indmax != 0 && $indmin != 255 && $indmax < 24) { $type = $TYPE_INDENT; }
284							# If references are seperated by blank lines, then we would expect to see around one blank line
285							# for each reference?
286							#warn "indmin=$indmin, indmax=$indmax\n";
287	1	0	33			7	if ($numnew < ($#newlines/2) && ($indmax >= $indmin) && $indmax != 0 && $indmin != 255 && $indmax < 24) { $type = $numind >= $#newlines/2 ? $TYPE_INDENT : $TYPE_INDENT_OTHER; }
	0	0	33			0
			0
			0
288	1	50				5	if ($numnum > 3) { $type = $TYPE_NUMBER; }
	0					0
289	1	50				3	if ($numsq > 3) { $type = $TYPE_NUMBERSQ; }
	0					0
290	1	50				4	if ($lettsq > 3) { $type = $TYPE_LETTERSQ; }
	0					0
291	1	50				4	if ($type == $TYPE_NEWLINE)
		0
		0
		0
		0
		0
292							{
293	1	50				3	warn "type = NEWLINE" if $DEBUG;
294	1	50				5	my $indmin = $indmin>5 ? $indmin + 3 : 5;
295	1					2	foreach(@newlines)
296							{
297	6	100				41	if (/^\s*$/)
		50
298							{
299	3	100				40	if ($curr_ref) { push @ref_table, $curr_ref; }
	2					2
300	3					6	$curr_ref = "";
301	3					4	next;
302							}
303							# Indented line amongst justified text, attach to the previous reference
304							elsif( /^\s{$indmin}/ ) {
305	0					0	s/^\s(.+)\s$/$1/;
306	0	0	0			0	if( !$curr_ref && @ref_table ) {
307	0					0	$ref_table[$#ref_table] .= " ".$_;
308	0					0	next;
309							}
310							}
311							# Trim off any whitespace surrounding chunk
312	3					13	s/^\s(.+)\s$/$1/;
313	3					13	s/^(.+)[\\-]+$/$1/;
314	3	50				6	if ($curr_ref =~ /http:\/\/\S+$/) {
315	0					0	$curr_ref = $curr_ref.$_;
316							} else {
317	3					11	$curr_ref .= " ".$_;
318							}
319							}
320	1	50				10	if ($curr_ref) { push @ref_table, $curr_ref; }
	1					2
321							}
322							elsif ($type == $TYPE_INDENT)
323							{
324	0	0				0	warn "type = INDENT" if $DEBUG;
325	0					0	foreach(@newlines)
326							{
327	0	0	0			0	if (/^(\s*)\b/ && length $1 == $indmin)
328							{
329	0	0				0	if ($curr_ref) { push @ref_table, $curr_ref; }
	0					0
330	0					0	$curr_ref = $_;
331							}
332							else
333							{
334							# Trim off any whitespace surrounding chunk
335	0					0	s/^\s(.+)\s$/$1/;
336	0	0				0	if ($curr_ref =~ /http:\/\/\S+$/) { $curr_ref = $curr_ref.$_;} else
	0					0
337							{
338	0					0	$curr_ref = $curr_ref." ".$_;
339							}
340
341							}
342							}
343	0	0				0	if ($curr_ref) { push @ref_table, $curr_ref; }
	0					0
344							}
345							elsif ($type == $TYPE_INDENT_OTHER)
346							{
347	0	0				0	warn "type = INDENT_OTHER" if $DEBUG;
348	0					0	foreach(@newlines)
349							{
350	0	0	0			0	if (!$curr_ref ) { $curr_ref = $_; }
	0	0				0
351							elsif (/^(\s*)\S/ && _within(length($1),$indmax,2))
352							{
353	0					0	s/^\s+//;
354	0	0				0	if( $curr_ref =~ s/(?<=\w)\-\s*$// ) {
355	0					0	$curr_ref .= $_;
356							} else {
357	0					0	$curr_ref .= " ".$_;
358							}
359							}
360							else
361							{
362							# Trim off any whitespace surrounding chunk
363	0	0				0	if ($curr_ref =~ /http:\/\/\S+$/)
364							{
365	0					0	s/^\s(.+)\s$/$1/;
366	0					0	$curr_ref .= $_;
367							}
368							else
369							{
370	0	0				0	if ($curr_ref) { push @ref_table, $curr_ref; }
	0					0
371	0					0	$curr_ref = $_;
372							}
373							}
374							}
375	0	0				0	if ($curr_ref) { push @ref_table, $curr_ref; }
	0					0
376							}
377							elsif ($type == $TYPE_NUMBER)
378							{
379	0	0				0	warn "type = NUMBER" if $DEBUG;
380	0					0	my $lastnum = 0;
381	0					0	foreach(@newlines)
382							{
383	0					0	s/^\s(.+)\s$/$1/;
384	0	0	0			0	if (/^(\d+)\.?(([\s_]{8}\s*[,a;])\|\s+[[:alnum:]_]).+$/ && $1 == $lastnum+1 )
385							{
386	0	0				0	if ($curr_ref) { push @ref_table, $curr_ref; }
	0					0
387	0					0	$curr_ref = $_;
388	0					0	$lastnum++;
389	0					0	next;
390							}
391							else
392							{
393	0	0				0	if ($curr_ref =~ /http:\/\/\S+$/) { $curr_ref = $curr_ref.$_;} else
	0					0
394							{
395	0					0	$curr_ref = $curr_ref." ".$_;
396							}
397
398							}
399							}
400	0	0				0	if ($curr_ref) { push @ref_table, $curr_ref; }
	0					0
401							}
402							elsif ($type == $TYPE_NUMBERSQ)
403							{
404	0	0				0	warn "type = NUMBERSQ" if $DEBUG;
405	0					0	my $lastnum = 0;
406	0					0	foreach(@newlines)
407							{
408	0					0	s/^\s(.+)\s$/$1/;
409							# () used in oai:arXiv.org:math-ph/9805026
410	0	0	0			0	if (/^\s*[$\[](\d+)[\]$]\s.+$/s && $1 == $lastnum+1 )
		0
		0
411							{
412	0	0				0	push @ref_table, $curr_ref if $curr_ref;
413	0					0	$curr_ref = $_;
414	0					0	$lastnum++;
415							}
416							elsif( /^\s*$/ ) # Blank line
417							{
418	0	0				0	if ($curr_ref) { push @ref_table, $curr_ref; }
	0					0
419	0					0	undef $curr_ref;
420							}
421							elsif($curr_ref)
422							{
423	0	0				0	if ($curr_ref =~ /http:\/\/\S+$/) {
424	0					0	$curr_ref .= $_;
425							} else {
426	0					0	$curr_ref .= " ".$_;
427							}
428
429							}
430							}
431	0	0				0	push @ref_table, $curr_ref if $curr_ref;
432							}
433							elsif( $type eq $TYPE_LETTERSQ )
434							{
435	0	0				0	warn "type = LETTERSQ" if $DEBUG;
436	0					0	foreach(@newlines)
437							{
438	0					0	s/^\s(.+)\s$/$1/;
439							# () used in oai:arXiv.org:math-ph/9805026
440	0	0				0	if (/^\s*[$\[](\w+)[\]$]\s.+$/s )
		0
		0
441							{
442	0	0				0	push @ref_table, $curr_ref if $curr_ref;
443	0					0	$curr_ref = $_;
444							}
445							elsif( /^\s*$/ ) # Blank line
446							{
447	0	0				0	if ($curr_ref) { push @ref_table, $curr_ref; }
	0					0
448	0					0	undef $curr_ref;
449							}
450							elsif($curr_ref)
451							{
452	0	0				0	if ($curr_ref =~ /http:\/\/\S+$/) {
453	0					0	$curr_ref .= $_;
454							} else {
455	0					0	$curr_ref .= " ".$_;
456							}
457
458							}
459							}
460	0	0				0	push @ref_table, $curr_ref if $curr_ref;
461							}
462
463	1					4	my @refs_out = ();
464							# A little cleaning up before returning
465	1					1	my $prev_author;
466	1					3	for (@ref_table)
467							{
468	3					5	s/([[:alpha:]])\-\s+/$1/g; # End of a line hyphen
469	3					7	s/^\s*[\[$]([^\]]+)[\]$](.+)$/($1) $2/s;
470							# Same author as previous citation
471	3	50				8	$prev_author && s/^((?:[$\[]\w+[$\]])\|(?:\d{1,3}\.))[\s_]{8,}/$1 $prev_author /;
472	3	50				10	if( /^(?:(?:[$\[]\w+[$\]])\|(?:\d{1,3}\.))\s*([^,]+?)(?:,\|and)/ ) {
473	0					0	$prev_author = $1;
474							} else {
475	3					4	undef $prev_author;
476							}
477	3					8	s/\s\s+/ /g;
478	3					11	s/^\s(.+)\s$/$1/;
479							# next if length $_ > 200;
480	3					7	push @refs_out, $_;
481							}
482	1					13	return @refs_out;
483							}
484
485							# Private method to determine if/where columns are present.
486
487							sub _decolumnise
488							{
489	1			1		6	my($self, @lines) = @_;
490	1					2	my @bitsout;
491	1					2	my @lens = (0); # Removes need to check $lens[0] is defined
492	1					4	foreach(@lines)
493							{
494							# Replaces tabs with 8 spaces
495	15					16	s/\t/ /g;
496							# Ignore lines that are >75% whitespace (probably diagrams/equations)
497	15	100	66			66	next if( length($_) == 0 \|\| (($_ =~ tr/ //)/length($_)) > .75 );
498							# Split into characters
499	9					34	my @bits = unpack "c*", $_;
500							# Count lines together that vary slightly in length (within 5 chars)
501	9					23	$lens[int(scalar @bits/5)*5+2]++;
502	9	100				12	my @newbits = map { $_ = ($_==32?1:0) } @bits;
	89					173
503	9					26	for(my $i=0; $i<$#newbits; $i++) { $bitsout[$i]+=$newbits[$i]; }
	80					171
504							}
505							# Calculate the average length based on the modal.
506							# 2003-05-14 Fixed by tdb
507	1					3	my $avelen = 0;
508	1					5	for(my $i = 0; $i < @lens; $i++ ) {
509	23	100				56	next unless defined $lens[$i];
510	5	100				33	$avelen = $i if $lens[$i] > $lens[$avelen];
511							}
512	1					2	my $maxpoint = 0;
513	1					2	my $max = 0;
514							# Determine which point has the most spaces
515	1	100				4	for(my $i=0; $i<$#bitsout; $i++) { if ($bitsout[$i] > $max) { $max = $bitsout[$i]; $maxpoint = $i; } }
	19					49
	2					3
	2					6
516	1					3	my $center = int($avelen/2);
517	1					3	my $output = 0;
518							# Only accept if the max point lies around the average center.
519	1	50	33			8	if ($center-6 <= $maxpoint && $center+6>= $maxpoint) { $output = $maxpoint; } else {$output = 0;}
	1					3
	0					0
520							#warn "Decol: avelen=$avelen, center=$center, maxpoint=$maxpoint (output=$output)\n";
521	1					6	return ($output, $avelen);
522							}
523
524							# Private function that replaces header/footers with form feeds
525
526							sub _addpagebreaks {
527	1			1		3	my $doc = shift;
528	1	50				5	return $doc if $doc =~ /\f/s;
529	1					2	my %HEADERS;
530
531	1					156	while( $doc =~ /(?:\n[\r[:blank:]]){2}([^\n]{0,40}\w+[^\n]{0,40})(?:\n[\r[:blank:]]){3}/osg ) {
532	0					0	$HEADERS{_header_to_regexp($1)}++;
533							}
534
535	1	50				3	if( %HEADERS ) {
536	0					0	my @regexps = sort { $HEADERS{$b} <=> $HEADERS{$a} } keys %HEADERS;
	0					0
537	0					0	my $regexp = $regexps[0];
538	0	0				0	if( $HEADERS{$regexp} > 3 ) {
539	0					0	my $c = $doc =~ s/(?:\n[\r[:blank:]]){2}(?:$regexp)(?:\n[\r[:blank:]]){3}/\f/sg;
540							# warn "Applying regexp: $regexp ($HEADERS{$regexp} original matches) Removed $c header/footers using ($HEADERS{$regexp} original matches): $regexp\n";
541							} else {
542	0					0	warn "Not enough matching header/footers were found ($HEADERS{$regexp} only)";
543							}
544							} else {
545	1					32	warn "Header/footers not found - flying blind if this is a multi-column document";
546							}
547
548	1					4	return $doc;
549							}
550
551							sub _header_to_regexp {
552	0			0			my $header = shift;
553	0						$header =~ s/([\\\\|\[\]\.\*\+\?\{\}])/\\$1/g;
554	0						$header =~ s/\s+/\\s+/g;
555	0						$header =~ s/\d+/\\d+/g;
556	0						return $header;
557							}
558
559							sub _within {
560	0			0			my ($l,$r,$p) = @_;
561							#warn "Is $l with $p of $r?\n";
562	0		0				return $r >= $l-$p && $r <= $l+$p;
563							}
564
565							1;
566
567							__END__