File Coverage

blib/lib/Biblio/Citation/Parser/Standard.pm
Criterion Covered Total %
statement 146 208 70.1
branch 23 58 39.6
condition 5 30 16.6
subroutine 9 10 90.0
pod 7 8 87.5
total 190 314 60.5


line stmt bran cond sub pod time code
1             package Biblio::Citation::Parser::Standard;
2              
3             ######################################################################
4             #
5             # Biblio::Citation::Parser::Standard;
6             #
7             ######################################################################
8             #
9             # This file is part of ParaCite Tools (http://paracite.eprints.org/developers/)
10             #
11             # Copyright (c) 2004 University of Southampton, UK. SO17 1BJ.
12             #
13             # ParaTools is free software; you can redistribute it and/or modify
14             # it under the terms of the GNU General Public License as published by
15             # the Free Software Foundation; either version 2 of the License, or
16             # (at your option) any later version.
17             #
18             # ParaTools is distributed in the hope that it will be useful,
19             # but WITHOUT ANY WARRANTY; without even the implied warranty of
20             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21             # GNU General Public License for more details.
22             #
23             # You should have received a copy of the GNU General Public License
24             # along with ParaTools; if not, write to the Free Software
25             # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26             #
27             ######################################################################
28              
29             require Exporter;
30             @ISA = ("Exporter", "Biblio::Citation::Parser");
31              
32 1     1   6352 use strict;
  1         3  
  1         95  
33              
34 1     1   1684 use Biblio::Citation::Parser::Templates;
  1         3  
  1         4395  
35             our @EXPORT_OK = ( 'parse', 'new' );
36              
37              
38             =pod
39              
40             =head1 NAME
41              
42             B - citation parsing functionality
43              
44             =head1 SYNOPSIS
45              
46             use Biblio::Citation::Parser::Standard;
47             # Parse a simple reference
48             $parser = new Biblio::Citation::Parser::Standard;
49             $metadata = $parser->parse("M. Jewell (2004) Citation Parsing for Beginners. Journal of Madeup References 4(3).");
50             print "The title of this article is ".$metadata->{atitle}."\n";
51              
52             =head1 DESCRIPTION
53              
54             Biblio::Citation::Parser::Standard uses a relatively simple template matching
55             technique to extract metadata from citations.
56              
57             The Templates.pm module currently provides almost 400 templates, with
58             more being added regularly, and the parser returns the metadata in a
59             form that is easily massaged into OpenURLs (see the Biblio::OpenURL
60             module for an even easier way).
61              
62             =cut
63              
64              
65             my %factors =
66             (
67             "_AUFIRST_" => 0.6,
68             "_AULAST_" => 0.6,
69             "_ISSN_" => 0.95,
70             "_AUTHORS_" => 0.65,
71             "_EDITOR_" => 0.6,
72             "_DATE_" => 0.95,
73             "_YEAR_" => 0.8,
74             "_SUBTITLE_" => 0.6,
75             "_TITLE_" => 0.6,
76             "_UCTITLE_" => 0.7,
77             "_CAPTITLE_" => 0.7,
78             "_PUBLICATION_" => 0.65,
79             "_PUBLISHER_" => 0.65,
80             "_PUBLOC_" => 0.65,
81             "_UCPUBLICATION_" => 0.74,
82             "_CAPPUBLICATION_" => 0.7,
83             "_CHAPTER_" => 0.8,
84             "_VOLUME_" => 0.8,
85             "_ISSUE_" => 0.8,
86             "_PAGES_" => 0.9,
87             "_ANY_" => 0.05,
88             "_ISBN_" => 0.95,
89             "_ISSN_" => 0.95,
90             "_SPAGE_" => 0.8,
91             "_EPAGE_" => 0.8,
92             "_URL_" => 0.9,
93             );
94              
95             =pod
96              
97             =head1 METHODS
98              
99             =over 4
100              
101             =item $parser = Biblio::Citation::Parser::Standard-Enew()
102              
103             The new() method creates a new parser.
104              
105             =cut
106              
107             sub new
108             {
109 1     1 1 124 my($class) = @_;
110 1         3 my $self = {};
111 1         4 return bless($self, $class);
112             }
113              
114             =pod
115              
116             =item $reliability = Biblio::Citation::Parser::Standard::get_reliability($template)
117              
118             The get_reliability method returns a value that acts as an indicator
119             of the likelihood of a template matching correctly. Fields such as
120             page ranges, URLs, etc, have high likelihoods (as they follow rigorous
121             patterns), whereas titles, publications, etc have lower likelihoods.
122              
123             The method takes a template as a parameter, but you shouldn't really
124             need to use this method much.
125              
126             =cut
127              
128             sub get_reliability
129             {
130 8     8 1 19 my( $template ) = @_;
131 8         18 my $reliability = 0;
132 8         77 foreach(keys %factors)
133             {
134 200 100       1786 if ($template =~ /$_/)
135             {
136 30         274 while($template =~ /$_/)
137             {
138 30         72 $reliability += $factors{$_};
139 30         363 $template =~ s/$_//;
140             }
141             }
142             }
143 8         38 return $reliability;
144             }
145              
146             =pod
147              
148             =item $concreteness = Biblio::Citation::Parser::Standard::get_concreteness($template)
149              
150             As with the get_reliability() method, get_concreteness() takes
151             a template as a parameter, and returns a numeric indicator. In
152             this case, it is the number of non-field characters in the template.
153             The more 'concrete' a template, the higher the probability that
154             it will match well. For example, '_PUBLICATION_ Vol. _VOLUME_' is
155             a better match than '_PUBLICATION_ _VOLUME_', as _PUBLICATION_ is
156             likely to subsume 'Vol.' in the second case.
157              
158             =cut
159              
160             sub get_concreteness
161             {
162 8     8 1 20 my( $template ) = @_;
163 8         14 my $concreteness = 0;
164 8         40 foreach(keys %factors)
165             {
166 200         1541 $template =~ s/$_//g;
167             }
168 8         38 return length($template);
169             }
170              
171             =pod
172              
173             =item $string = Biblio::Citation::Parser::Standard::strip_spaces(@strings)
174              
175             This is a helper function to remove spaces from all elements
176             of an array.
177              
178             =cut
179              
180             sub strip_spaces
181             {
182 1     1 1 3 my(@bits) = @_;
183 1         4 foreach(@bits) { s/^[[:space:]]*(.+)[[:space:]]*$/$1/;}
  2         12  
184 1         4 return @bits;
185             }
186              
187             =pod
188              
189             =item $templates = Biblio::Citation::Parser::Standard::get_templates()
190              
191             Returns the current template list from the Biblio::Citation::Parser::Templates
192             module. Useful for giving status lists.
193              
194             =cut
195              
196             sub get_templates
197             {
198 0     0 1 0 return $Biblio::Citation::Parser::Templates::templates;
199             }
200              
201             =pod
202              
203             =item @authors = Biblio::Citation::Parser::Standard::handle_authors($string)
204              
205             This (rather large) function handles the author fields of a reference.
206             It is not all-inclusive yet, but it is usably accurate. It can handle
207             author lists that are separated by semicolons, commas, and a few other
208             delimiters, as well as &, and, and 'et al'.
209              
210             The method takes an author string as a parameter, and returns an array
211             of extracted information in the format '{family => $family, given =>
212             $given}'.
213              
214             =cut
215              
216             sub handle_authors
217             {
218 1     1 1 4 my($authstr) = @_;
219            
220 1         3 my @authsout = ();
221 1         59 $authstr =~ s/\bet al\b//;
222             # Handle semicolon lists
223 1 50 0     20 if ($authstr =~ /;/)
    50          
    50          
    0          
    0          
    0          
224             {
225 0         0 my @auths = split /[[:space:]]*;[[:space:]]*/, $authstr;
226 0         0 foreach(@auths)
227             {
228 0         0 my @bits = split /[,[:space:]]+/;
229 0         0 @bits = strip_spaces(@bits);
230 0         0 push @authsout, {family => $bits[0], given => $bits[1]};
231             }
232             }
233             elsif ($authstr =~ /^[[:upper:]\.]+[[:space:]]+[[:alnum:]]/)
234             {
235 0         0 my @bits = split /[[:space:]]+/, $authstr;
236 0         0 @bits = strip_spaces(@bits);
237 0         0 my $fam = 0;
238 0         0 my($family, $given);
239 0         0 foreach(@bits)
240             {
241 0 0 0     0 next if ($_ eq "and" || $_ eq "&" || /^[[:space:]]*$/);
      0        
242 0         0 s/,//g;
243 0 0       0 if ($fam)
244             {
245 0         0 $family = $_;
246 0         0 push @authsout, {family => $family, given => $given};
247 0         0 $fam = 0;
248             }
249             else
250             {
251 0         0 $given = $_;
252 0         0 $fam = 1;
253             }
254             }
255             }
256             elsif ($authstr =~ /^.+[[:space:]]+[[:upper:]\.]+/)
257             {
258             # Foo AJ, Bar PJ
259 1         3 my $fam = 1;
260 1         3 my $family = "";
261 1         3 my $given = "";
262 1         9 my @bits = split /[[:space:]]+/, $authstr;
263 1         6 @bits = strip_spaces(@bits);
264 1         4 foreach(@bits)
265             {
266 2         8 s/[,;\.]//g;
267 2         4 s/\bet al\b//g;
268 2         4 s/\band\b//;
269 2         4 s/\b&\b//;
270 2 50       9 next if /^[[:space:]]*$/;
271 2 100       7 if ($fam == 1)
272             {
273 1         2 $family = $_;
274 1         3 $fam = 0;
275             }
276             else
277             {
278 1         3 $given = $_;
279 1         3 $fam = 1;
280 1         8 push @authsout, {family => $family, given => $given};
281            
282             }
283             }
284             }
285             elsif ($authstr =~ /^.+,[[:space:]]*.+/ || $authstr =~ /.+\band\b.+/)
286             {
287 0         0 my $fam = 1;
288 0         0 my $family = "";
289 0         0 my $given = "";
290 0         0 my @bits = split /[[:space:]]*,|\band\b|&[[:space:]]*/, $authstr;
291 0         0 @bits = strip_spaces(@bits);
292 0         0 foreach(@bits)
293             {
294 0 0       0 next if /^[[:space:]]*$/;
295 0 0       0 if ($fam)
296             {
297 0         0 $family = $_;
298 0         0 $fam = 0;
299             }
300             else
301             {
302 0         0 $given = $_;
303 0         0 push @authsout, {family => $family, given => $given};
304 0         0 $fam = 1;
305             }
306             }
307             }
308             elsif ($authstr =~ /^[[:alpha:][:space:]]+$/)
309             {
310 0         0 $authstr =~ /^([[:alpha:]]+)[[:space:]]*([[:alpha:]]*)$/;
311 0         0 my $given = "";
312 0         0 my $family = "";
313 0 0 0     0 if (defined $1 && defined $2)
314             {
315 0         0 $given = $1;
316 0         0 $family = $2;
317             }
318 0 0 0     0 if (!defined $2 || $2 eq "")
319             {
320 0         0 $family = $1;
321 0         0 $given = "";
322             }
323 0         0 push @authsout, {family => $family, given => $given};
324             }
325             elsif( $authstr =~ /[[:word:]]+[[:space:]]+[[:word:]]?[[:space:]]*[[:word:]]+/)
326             {
327 0         0 my @bits = split /[[:space:]]+/, $authstr;
328 0         0 my $rest = $authstr;
329 0         0 $rest =~ s/$bits[-1]//;
330 0         0 push @authsout, {family => $bits[-1], given => $rest};
331             }
332             else
333             {
334            
335             }
336 1         4 return @authsout;
337             }
338              
339             =pod
340              
341             =item %metadata = $parser-Extract_metadata($reference)
342              
343             This is the key method in the Standard module, although it is not actually
344             called directly by users (the 'parse' method provides a wrapper). It takes
345             a reference, and returns a hashtable representing extracted metadata.
346              
347             A regular expression map is present in this method to transform '_AUFIRST_',
348             '_ISSN_', etc, into expressions that should match them. The method then
349             finds the template which best matches the reference, picking the result that
350             has the highest concreteness and reliability (see above), and returns the
351             fields in the hashtable. It also creates the marked-up version, that is
352             useful for further formatting.
353              
354             =cut
355              
356             sub extract_metadata
357             {
358 1     1 0 21 my($self, $ref) = @_;
359             # Skip to the first Alpha char
360 1 50       6 if ($ref !~ /^[[:digit:]]-X\.]+$/) { $ref =~ s/^[^[:alpha:]]+//; }
  1         3  
361 1         7 $ref =~ s/[[:space:]\*]+$//;
362 1         5 $ref =~ s/[[:space:]]{2}[[:space:]]+/ /g;
363 1         6 $ref =~ s/^[[:space:]\*]*(.+)[[:space:]\*]*$/$1/;
364 1         2 my %metaout = ();
365 1         4 $metaout{ref} = $ref;
366              
367 1         3 $metaout{id} = [];
368             # Pull out doi addresses
369 1 50       4 if ($ref =~ s/doi:(.+)\b//)
370             {
371 0         0 push @{$metaout{id}}, "doi:$1";
  0         0  
372             }
373 1 50       5 if ($ref =~ s/((astro-ph|cond-mat|gr-qc|hep-ex|hep-lat|hep-ph|hep-th|math-th|nucl-ex|nucl-th|physics|quant-ph|math|nlin|cs)\/\d+\b)//)
374             {
375 0         0 push @{$metaout{id}}, "arxiv:$1";
  0         0  
376             }
377 1         3 my @specific_pubs =
378             (
379             # Put any specific publications in here
380             );
381            
382 1         1 my $spec_pubs = "";
383 1 50       4 if (scalar @specific_pubs > 0)
384             {
385 0         0 $spec_pubs = join("|", @specific_pubs);
386 0         0 $spec_pubs = "|".$spec_pubs;
387             }
388              
389 1         2 my $initial_match = "(?:\\b[[:alpha:]]\\.|\\b[[:alpha:]]\\b)";
390 1         2 my $name_match = "(?:(?:[[:alpha:],;&-]+)\\b)";
391 1         2 my $conjs = "(?:\\s+und\\s+|\\s+band\\s+|\\s|,|&|;)";
392              
393 1         25 my %matches =
394             (
395             "_AUFIRST_" => "([[:alpha:]\.]+)",
396             "_AULAST_" => "([[:alpha:]-]+)",
397             "_ISSN_" => "([[:digit:]-]+)",
398             "_AUTHORS_" => "((?:$initial_match|$name_match|$conjs)+?)",
399             "_DATE_" => "([[:digit:]]{2}/[[:digit:]]{2}/[[:digit]]{2})",
400             "_YEAR_" => "([[:digit:]]{4})",
401             "_TITLE_" => "(.+?[a-zA-Z]+.+?)",
402             "_SUBTITLE_" => "(.+)",
403             "_CHAPTER_" => "([[:digit:]]+)",
404             "_UCTITLE_" => "([^[:lower:]]+)",
405             "_CAPTITLE_" => "([[:upper:]][^[:upper:]]+)",
406             "_PUBLICATION_" => "([^0-9\(\);\"']{4,}$spec_pubs)",
407             "_PUBLISHER_" => "(.+)",
408             "_PUBLOC_" => "(.+)",
409             "_EDITOR_" => "([[:alpha:]\\.,;\\s&-]+)",
410             "_UCPUBLICATION_" => "([^[:lower:]]+)",
411             "_CAPPUBLICATION_" => "([[:upper:]][^[:upper:]]+)",
412             "_VOLUME_" => "([[:digit:]]+)",
413             "_ISSUE_" => "([[:digit:]]+)",
414             "_PAGES_" => "([[:digit:]]+-{1,2}[[:digit:]]+?)",
415             "_ANY_" => "(.+?)",
416             "_ISBN_" => "([[:digit:]X-]+)",
417             "_ISSN_" => "([[:digit:]X-]+)",
418             "_SPAGE_" => "([[:digit:]]+)",
419             "_EPAGE_" => "([[:digit:]]+)",
420             "_URL_" => "(((http(s?):\\/\\/(www\\.)?)|(\\bwww\\.)|(ftp:\\/\\/(ftp\\.)?))([-\\w\\.:\\/\\s]+)(\\/|\\.\\S+|#\\w+))",
421             );
422              
423              
424 1         3 my(@newtemplates) = ();
425 1         3 foreach my $template (@$Biblio::Citation::Parser::Templates::templates)
426             {
427 385         490 $_ = $template;
428 385         462 s/\\/\\\\/g;
429 385         1511 s/\(/\\\(/g;
430 385         906 s/\)/\\\)/g;
431 385         437 s/\[/\\\[/g;
432 385         381 s/\]/\\\]/g;
433 385         962 s/\./\\\./g;
434 385         1466 s/ /\[\[:space:\]\]+/g;
435 385         490 s/\?/\\\?/g;
436 385         1306 foreach my $key (keys %matches)
437             {
438 9625         67032 s/$key/$matches{$key}/g;
439             }
440 385         1029 $_ .= "[.]?";
441 385         686 push @newtemplates,$_;
442             }
443 1         3 my $index = 0;
444 1         4 my @vars = ();
445 1         3 my @matchedvars = ();
446              
447 1         2 my $curr_conc = 0;
448 1         2 my $curr_rel = 0;
449 1         3 my $max_conc = 0;
450 1         2 my $max_rel = 0;
451 1         3 my $best_match = "";
452 1         2 my $best_orig = "";
453 1         3 foreach my $currtemplate (@newtemplates)
454             {
455 385         865 my $original = $Biblio::Citation::Parser::Templates::templates->[$index];
456 385 100       184868 if ($ref =~ /^$currtemplate$/)
457             {
458 8         40 $curr_rel = get_reliability($original);
459 8         28 $curr_conc = get_concreteness($original);
460 8 100 66     54 if ($curr_rel > $max_rel)
    50          
461             {
462 3         8 $best_match = $currtemplate;
463 3         6 $best_orig = $original;
464 3         5 $max_conc = $curr_conc;
465 3         6 $max_rel = $curr_rel;
466             }
467             elsif ($curr_rel == $max_rel && $curr_conc > $max_conc)
468             {
469 0         0 $best_match = $currtemplate;
470 0         0 $best_orig = $original;
471 0         0 $max_conc = $curr_conc;
472 0         0 $max_rel = $curr_rel;
473             }
474             }
475 385         37215 $index++;
476             }
477              
478 1         7 $metaout{match} = $best_orig;
479 1         17 @vars = ($best_orig =~ /_([A-Z]+)_/g);
480 1         298 @matchedvars = ($ref =~ /^$best_match$/);
481              
482 1         74 $index = 0;
483 1 50       6 if (scalar @matchedvars > 0)
484             {
485 1         4 foreach(@vars)
486             {
487 6         32 $matchedvars[$index] =~ s/^\s*(.+)\s*$/$1/;
488 6         21 $metaout{lc $_} = $matchedvars[$index];
489 6         10 $index++;
490             }
491             }
492 1         9 foreach(keys %metaout)
493             {
494 9 50       20 if (/^uc/)
495             {
496 0         0 my $alt = $_;
497 0         0 $alt =~ s/^uc//;
498 0 0 0     0 if (!defined $metaout{$alt} || $metaout{$alt} eq "")
499             {
500 0         0 $metaout{$alt} = $metaout{$_};
501             }
502             }
503             }
504              
505             # Create a marked-up version
506 1         3 my $in_ref = $ref;
507 1         3 my $in_tmp = $best_orig;
508 1         2 my $in_tmp2 = $best_orig;
509 1         6 foreach(keys %metaout)
510             {
511 9 50 33     69 next if (!defined $metaout{$_} || $metaout{$_} eq "" || $_ eq "any");
      33        
512 9         23 my $toreplace = "_".(uc $_)."_";
513 9         97 $in_tmp =~ s/$toreplace/<$_>$metaout{$_}<\/$_>/g;
514 9         92 $in_tmp2 =~ s/$toreplace/$metaout{$_}/g;
515             }
516              
517             # Fix any _ANY_s
518 1         4 $in_tmp2 =~ s/\\/\\\\/g;
519 1         5 $in_tmp2 =~ s/\(/\\\(/g;
520 1         5 $in_tmp2 =~ s/\)/\\\)/g;
521 1         4 $in_tmp2 =~ s/\[/\\\[/g;
522 1         3 $in_tmp2 =~ s/\]/\\\]/g;
523 1         7 $in_tmp2 =~ s/\./\\\./g;
524 1         9 $in_tmp2 =~ s/ /\[\[:space:\]\]+/g;
525 1         3 $in_tmp2 =~ s/\?/\\\?/g;
526 1         3 $in_tmp2 =~ s/_ANY_/(.+)/g;
527 1         40 my(@anys) = ($in_ref =~ /$in_tmp2/g);
528            
529 1         4 foreach(@anys)
530             {
531 1         5 $in_tmp =~ s/_ANY_/$_<\/any>/;
532             }
533 1         4 $metaout{marked} = $in_tmp;
534             # Map to OpenURL
535 1 50       4 if (defined $metaout{authors})
536             {
537 1         8 $metaout{authors} = [handle_authors($metaout{authors})];
538 1         4 $metaout{aulast} = $metaout{authors}[0]->{family};
539 1         5 $metaout{aufirst} = $metaout{authors}[0]->{given};
540             }
541 1 50 33     42 if (defined $metaout{publisher} && !defined $metaout{publication})
542             {
543 0         0 $metaout{genre} = "book";
544             }
545 1         3 $metaout{atitle} = $metaout{title};
546 1         2 $metaout{title} = $metaout{publication};
547 1 50       4 if (defined $metaout{cappublication}) { $metaout{title} = $metaout{cappublication} };
  0         0  
548 1         3 $metaout{date} = $metaout{year};
549 1         107 return %metaout;
550              
551             }
552              
553             =pod
554              
555             =item $metadata = $parser-Eparse($reference);
556              
557             This method provides a wrapper to the extract_metadata
558             function. Simply pass a reference string, and a metadata
559             hash is returned.
560              
561             =cut
562              
563             sub parse
564             {
565 1     1 1 7 my($self, $ref) = @_;
566 1         6 my $hashout = {$self->extract_metadata($ref)};
567 1         7 return $hashout;
568             }
569              
570             1;
571              
572             __END__