File Coverage

blib/lib/Biblio/Citation/Compare.pm

Criterion	Covered	Total	%
statement	159	192	82.8
branch	75	118	63.5
condition	75	119	63.0
subroutine	19	22	86.3
pod	2	13	15.3
total	330	464	71.1

line	stmt	bran	cond	sub	pod	time	code
1							package Biblio::Citation::Compare;
2
3	3			3		72126	use 5.0;
	3					27
4	3			3		16	use strict;
	3					6
	3					80
5	3			3		19	use warnings;
	3					16
	3					97
6	3			3		1334	use Text::LevenshteinXS qw(distance);
	3					9084
	3					163
7	3			3		1546	use HTML::Entities;
	3					20933
	3					258
8	3			3		4654	use Text::Names qw/samePerson cleanName parseName parseName2/;
	3					140834
	3					326
9	3			3		1293	use Text::Roman qw/isroman roman2int/;
	3					3385
	3					180
10	3			3		25	use utf8;
	3					7
	3					18
11
12							require Exporter;
13
14							our @ISA = qw(Exporter);
15
16							our %EXPORT_TAGS = ( 'all' => [ qw(
17							sameWork sameAuthors toString extractEdition sameAuthorBits sameTitle sameAuthorsLoose
18							) ] );
19
20							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
21
22							our @EXPORT = qw( );
23
24							our $VERSION = '0.56';
25
26							# to correct bogus windows entities. unfixable ones are converted to spaces.
27							my %WIN2UTF = (
28							hex('80')=> hex('20AC'),# #EURO SIGN
29							hex('81')=> hex('0020'), #UNDEFINED
30							hex('82')=> hex('201A'),# #SINGLE LOW-9 QUOTATION MARK
31							hex('83')=> hex('0192'),# #LATIN SMALL LETTER F WITH HOOK
32							hex('84')=> hex('201E'),# #DOUBLE LOW-9 QUOTATION MARK
33							hex('85')=> hex('2026'),# #HORIZONTAL ELLIPSIS
34							hex('86')=> hex('2020'),# #DAGGER
35							hex('87')=> hex('2021'),# #DOUBLE DAGGER
36							hex('88')=> hex('02C6'),# #MODIFIER LETTER CIRCUMFLEX ACCENT
37							hex('89')=> hex('2030'),# #PER MILLE SIGN
38							hex('8A')=> hex('0160'),# #LATIN CAPITAL LETTER S WITH CARON
39							hex('8B')=> hex('2039'),# #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
40							hex('8C')=> hex('0152'),# #LATIN CAPITAL LIGATURE OE
41							hex('8D')=> hex('0020'),# #UNDEFINED
42							hex('8E')=> hex('017D'),# #LATIN CAPITAL LETTER Z WITH CARON
43							hex('8F')=> hex('0020'),# #UNDEFINED
44							hex('90')=> hex('0020'),# #UNDEFINED
45							hex('91')=> hex('2018'),# #LEFT SINGLE QUOTATION MARK
46							hex('92')=> hex('2019'),# #RIGHT SINGLE QUOTATION MARK
47							hex('93')=> hex('201C'),# #LEFT DOUBLE QUOTATION MARK
48							hex('94')=> hex('201D'),# #RIGHT DOUBLE QUOTATION MARK
49							hex('95')=> hex('2022'),# #BULLET
50							hex('96')=> hex('2013'),# #EN DASH
51							hex('97')=> hex('2014'),# #EM DASH
52							hex('98')=> hex('02DC'),# #SMALL TILDE
53							hex('99')=> hex('2122'),# #TRADE MARK SIGN
54							hex('9A')=> hex('0161'),# #LATIN SMALL LETTER S WITH CARON
55							hex('9B')=> hex('203A'),# #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
56							hex('9C')=> hex('0153'),# #LATIN SMALL LIGATURE OE
57							hex('9D')=> hex('0020'),# #UNDEFINED
58							hex('9E')=> hex('017E'),# #LATIN SMALL LETTER Z WITH CARON
59							hex('9F')=> hex('0178')# #LATIN CAPITAL LETTER Y WITH DIAERESIS
60							);
61							my $PARENS = '\s[\[$](.+?)[\]$]\s';
62							my $QUOTE = '"“”`¨´‘’‛“”‟„′″‴‵‶‷⁗❛❜❝❞';
63							my @ED_RES = (
64							'(first\|second\|third\|fourth\|fifth\|sixth\|seventh\|eighth\|ninth\|tenth)',
65							'([1-9])\s?\w{2,5}\s[ée]d',
66							'\bv\.?(?:ersion)?\s?([0-9IXV]+)',
67							'\s([IXV0-9]+)(?:$\|:)'
68							);
69
70							#die "no" unless "2nd edition" =~ /$EDITION/i;
71
72							#my $TITLE_SPLIT = '(?:\?\|\:\|\.\|!\|\"\|[$QUOTE]\b)';
73							my $TITLE_SPLIT = '(?:\?\|\:\|\.\|!)';
74
75							sub sameAuthors {
76	66			66	1	2961	my ($list1, $list2, %opts) = @_;
77	66	100	100			209	return 0 if $#$list1 != $#$list2 and $opts{strict};
78	60	100				151	if ($#$list2 > $#$list1) {
79	6					11	my $t = $list1;
80	6					8	$list1 = $list2;
81	6					10	$list2 = $t;
82							}
83	60					166	for (my $i = 0; $i <= $#$list2; $i++) {
84	51	100				2803	return 0 unless grep { samePerson($list2->[$i],$_, %opts) } @$list1;
	89					17357
85							}
86	56					29724	return 1;
87							}
88
89							sub firstAuthor {
90	0			0	0	0	my $e = shift;
91	0					0	my $a = $e->{authors};
92	0	0				0	if ($#$a > -1) {
93	0					0	return $a->[0];
94							} else {
95	0					0	return undef;
96							}
97							}
98
99							sub sameWork {
100
101	53			53	1	28887	my ($e, $c, $threshold,$loose,$nolinks,%opts) = @_;
102
103	53		50			235	my $debug = $opts{debug} \|\| 0;
104
105	53	50				127	$loose = 0 unless defined $loose;
106	53	50				108	$threshold = 0.15 unless $threshold;
107	53	50				117	$opts{loose} = 1 if $loose;
108
109	53	50				132	if ($debug) {
110	0					0	warn "sameWork 1: " . toString($e);
111	0					0	warn "sameWork 2: " . toString($c);
112							}
113
114	53	0	33			150	if (defined $e->{doi} and length $e->{doi} and defined $c->{doi} and length $c->{doi}) {
			33
			0
115	0	0				0	if ($e->{doi} eq $c->{doi}) {
116							# we don't use doi to say 1 because often we have dois that are for a whole issue
117							# however same doi lowers the threshold
118	0	0				0	$threshold /= 2 if $e->{doi} eq $c->{doi};
119	0					0	$loose = 1;
120	0					0	$opts{loose} = 1;
121							} else {
122	0					0	return 0;
123							}
124							}
125
126	53	50				114	return 0 if (!$c);
127
128							# normalize encoding of relevant fields
129	53					125	local $e->{title} = decodeHTMLEntities($e->{title});
130	53					105	local $c->{title} = decodeHTMLEntities($c->{title});
131
132							# first check if authors,date, and title are almost literally the same
133	53	100				182	my $tsame = (lc $e->{title} eq lc $c->{title}) ? 1 : 0;
134	53					143	my $asame = sameAuthors($e->{authors},$c->{authors},strict=>1);
135	53		100			507	my $asame_loose = $asame \|\| sameAuthors($e->{authors},$c->{authors},strict=>0); #asame_loose will be 1 while same is 0 when there are extra authors in one paper but all overlap authors match
136	53		66			417	my $asame_bits = $asame_loose \|\| sameAuthorBits($e->{authors},$c->{authors});
137	53	100	100			286	my $dsame = (defined $e->{date} and defined $c->{date} and $e->{date} eq $c->{date}) ? 1 : 0;
138
139	53	50				111	if ($debug) {
140	0					0	warn "tsame: $tsame";
141	0					0	warn "asame: $asame";
142	0					0	warn "asame_loose: $asame_loose";
143	0					0	warn "asame_bits: $asame_bits";
144	0					0	warn "dsame: $dsame";
145							}
146
147	53	100	100			152	return 1 if ($tsame and $asame and $dsame);
			66
148
149							# if authors quite different, not same
150	48	100				92	if (!$asame_bits) {
151	1	50				25	warn "authors too different" if $debug;
152	1					5	return 0;
153							}
154							# at this point the authors are plausibly the same
155
156							# check dates
157	47					73	my $date_wildcards = '^forthcoming\|in press\|manuscript\|unknown\|web$';
158	47		33			276	my $compat_dates = ($dsame or ($e->{date} && $e->{date} =~ /$date_wildcards/) or ($c->{date} && $c->{date} =~ /$date_wildcards/));
159	47	100	66			135	if (!$dsame and !$compat_dates) {
160
161							#disabled for most cases because we want to conflate editions and republications for now.
162	24	50	33			110	if ($e->{title} =~ /^Introduction.?$/ or $e->{title} =~ /^Preface.?$/) {
163							return 0 if ($e->{source} and $e->{source} ne $c->{source}) or
164	0	0	0			0	($e->{volume} and $e->{volume} ne $c->{volume});
			0
			0
165							}
166
167							# numeric dates
168	24	100	66			118	if ($e->{date} and $e->{date} =~ /^\d\d\d\d$/ and $c->{date} and $c->{date} =~ /^\d\d\d\d$/) {
			100
			66
169	7					19	my $date_diff = $e->{date} - $c->{date};
170							# quite often people misremember dates so we permit some slack
171							# we will consider the dates compat if they close in time
172							# if dates are far apart, we know they are not exactly the same publicatoins.
173							# but they might be reprints of the same thing, which we want to conflate.
174	7	100	66			27	if ($date_diff > 3 or $date_diff < -3) {
175	3	50				7	if ($asame_bits) {
176	3					8	$threshold /= 2;
177	3	50				8	warn "dates different, lowering similarity threshold" if $debug;
178							} else {
179	0	0				0	warn "dates+authors too different" if $debug;
180	0					0	return 0;
181							}
182
183							} else {
184							# nearby date
185	4					11	$threshold /= 2;
186							}
187
188							} else {
189							#messed up dates, assume the worst
190	17					47	$threshold /=2;
191							}
192
193							} else {
194	23	50	33			61	$loose = 1 if $asame_loose or $asame_bits;
195							}
196
197
198
199
200	47	50				84	warn "pre title length" if $debug;
201							# if title very different in lengths and do not contain ":" or brackets, not the same
202							return 0 if !$tsame and (
203							abs(length($e->{title}) - length($c->{title})) > 20
204							and
205							($e->{title} !~ /$TITLE_SPLIT/ and $c->{title} !~ /$TITLE_SPLIT/)
206							and
207	47	50	100			431	($e->{title} !~ /$PARENS/ and $c->{title} !~ /$PARENS/)
			100
			33
			66
			66
208							);
209
210							# Compare links
211							# if (!$nolinks) {
212							# foreach my $l (@{$e->{links}}) {
213							# print "Links e:\n" . join("\n",$e->getLinks);
214							# print "Links c:\n" . join("\n",$c->getLinks);
215							# return 1 if grep { $l eq $_} @{$c->{links}};
216							# }
217							# }
218
219	47	50				131	warn "pre loose mode: loose = $loose" if $debug;
220
221							#print "threshold $lname1,$lname2: $threshold\n";
222							# ok if distance short enough without doing anything
223							#print "distance: " . distance(lc $e->{title},lc $c->{title}) / (length($e->{title}) +1) . "\n";
224
225							# perform fuzzy matching
226							#my $str1 = "$e->{date}\|$e->{title}";
227	47					120	my $str1 = lc _strip_non_word($e->{title});
228	47					101	my $str2 = lc _strip_non_word($c->{title});
229
230							# check for edition strings
231	47					121	my $ed1 = extractEdition($str1);
232	47					659	my $ed2 = extractEdition($str2);
233	47	50				249	warn "ed1: $ed1" if $debug;
234	47	50				100	warn "ed2: $ed2" if $debug;
235	47	100	100			167	$loose =1 if $ed1 and $ed2 and $ed1 == $ed2 and !$dsame and $asame_loose;
			100
			100
			66
236
237	47	100	100			313	return 0 if ($ed1 and !$ed2) or ($ed2 and !$ed1) or ($ed1 && $ed1 != $ed2);
			100
			100
			100
			100
238	40	50				85	warn "not diff editions" if $debug;
239
240							# remove brackets
241	40					67	my ($parens1,$parens2);
242	40					233	$str1 =~ s/$PARENS//g;
243	40					107	$parens1 = $1;
244	40					203	$str2 =~ s/$PARENS//g;
245	40					93	$parens2 = $1;
246	40	100	66			119	return 0 if $parens1 && $parens2 && numdiff($parens1,$parens2);
			100
247
248	38	50				70	warn "the text comparison is: '$str1' vs '$str2'" if $debug;
249
250	38	50				75	warn "pre number check" if $debug;
251							# if titles differ by a number, not the same
252	38	100				76	return 0 if numdiff($str1,$str2);
253
254							# ultimate test
255							#dbg("$str1\n$str2\n");
256							#dbg(my_dist_text($str1,$str2));
257	33					79	my $score = (my_dist_text($str1,$str2) / (length($str1) +1));
258
259	33	50				72	warn "score: $score (threshold: $threshold)" if $debug;
260							#print $score . " \n";
261	33	100				198	return 1 if ( $score < $threshold);
262
263							# now if loose mode and only one of the titles has a ":" or other punctuation, compare the part before the punc with the other title instead
264	11	100				38	if ($loose) {
265
266	7	50				14	warn "loose: $str1 -- $str2" if $debug;
267
268	7	100				321	if ($e->{title} =~ /(.+?)\s$TITLE_SPLIT\s(.+)/) {
		50
269
270	5					14	my $str1 = _strip_non_word($1);
271	5	50				255	if ($c->{title} =~ /(.+?)\s$TITLE_SPLIT\s(.+)/) {
272	0					0	return 0;
273							} else {
274	5	50				15	if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) {
275	5					28	return 1;
276							}
277							}
278
279							} elsif ($c->{title} =~ /(.+?)\s$TITLE_SPLIT\s(.+)/) {
280
281	2					6	my $str2 = _strip_non_word($1);
282	2	100				6	if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) {
283	1					6	return 1;
284							}
285
286							} else {
287
288	0					0	return 0;
289
290							}
291							}
292
293	5					33	return 0;
294							}
295
296							sub sameAuthorsLoose {
297	0			0	0	0	my ($a, $b) = @_;
298	0					0	my $asame = sameAuthors($a,$b,strict=>1);
299	0		0			0	my $asame_loose = $asame \|\| sameAuthors($a,$b,strict=>0);
300	0		0			0	return $asame_loose \|\| sameAuthorBits($a,$b);
301							}
302
303							sub sameAuthorBits {
304	3			3	0	161	my ($a, $b) = @_;
305	3					7	my (@alist, @blist);
306	3					9	for (@$a) {
307	4					12	my $v = lc $_; # we copy so we don't modify the original
308	4					21	$v =~ s/[,\.]//g;
309							#$v =~ s/(\p{Ll})(\p{Lu})/$1 $2/g;
310	4					34	push @alist, split(/\s+/, $v);
311							}
312	3					7	for (@$b) {
313	3					8	my $v = lc $_;
314	3					13	$v =~ s/[,\.]//g;
315							#$v =~ s/(\p{Ll})(\p{Lu})/$1 $2/g;
316	3					17	push @blist, split(/\s+/, $v);
317							}
318							#use Data::Dumper;
319	3					13	@alist = sort @alist;
320	3					8	@blist = sort @blist;
321							#print Dumper(\@alist);
322							#print Dumper(\@blist);
323	3	50				23	return 0 if $#alist != $#blist;
324	3					11	for (my $i=0; $i<= $#alist; $i++) {
325	6	100				26	return 0 if lc $alist[$i] ne lc $blist[$i];
326							}
327	1					6	return 1;
328							}
329
330							#wip
331							#sub author_bits {
332							# my $list_ref = shift;
333							# my @new;
334							# for (@$list_ref) {
335							# my $v = $_; # we copy so we don't modify the original
336							# $v =~ s/,//;
337							# $v =~ s/(\p{Ll}\p
338							# push @alist, split(/\s+/, $v);
339							# }
340							#}
341
342							sub _strip_non_word {
343	101			101		161	my $str = shift;
344							#abbreviation "volume" v
345	101					335	$str =~ s/\bvolume\b/v/gi;
346	101					283	$str =~ s/\bvol\.?\b/v/gi;
347	101					260	$str =~ s/\bv\.\b/v/gi;
348
349	101					623	$str =~ s/[^[0-9a-zA-Z\)\]\(\[]+/ /g;
350	101					527	$str =~ s/\s+/ /g;
351	101					205	$str =~ s/^\s+//;
352	101					291	$str =~ s/\s+$//;
353	101					262	$str;
354							}
355
356							my %nums = (
357							first => 1,
358							second => 2,
359							third => 3,
360							fourth => 4,
361							fifth => 5,
362							sixth => 6,
363							seventh => 7,
364							eighth => 8,
365							ninth => 9,
366							tenth => 10,
367							);
368							sub extract_num {
369	28			28	0	98	my $s = shift;
370	28	100				155	if ($s =~ /\b(\d+)/) {
371	13					78	return $1;
372							}
373	15	100				61	if (isroman($s)) {
374	10					647	return roman2int($s);
375							}
376
377	5					132	for my $n (keys %nums) {
378	35	100				247	if ($s =~ /\b$n\b/i) {
379	5					31	return $nums{$n};
380							}
381
382							}
383	0					0	return $s;
384							}
385
386							sub extractEdition {
387	106			106	0	6220	my $s = shift;
388	106					221	for my $re (@ED_RES) {
389	391	100				11603	if ($s =~ /$re/i) {
390	28					103	return extract_num($1);
391							}
392							}
393	78					220	return undef;
394							}
395
396							sub numdiff {
397	48			48	0	107	my ($s1,$s2) = @_;
398	48					347	my @n1 = ($s1 =~ /\b([IXV0-9]{1,4}\|one\|two\|three\|four\|five\|six\|seven\|eight\|nine\|ten\|eleven\|twelve\|first\|second\|third\|fourth\|fifth\|sixth\|seventh\|eighth\|ninth\|tenth\|eleventh\|twelveth\|1st\|2nd\|3rd\|4th\|5th\|6th\|7th\|8th\|9th\|10th\|11th\|12th)\b/ig);
399	48					252	my @n2 = ($s2 =~ /\b([IXV0-9]{1,4}\|one\|two\|three\|four\|five\|six\|seven\|eight\|nine\|ten\|eleven\|twelve\|first\|second\|third\|fourth\|fifth\|sixth\|seventh\|eighth\|ninth\|tenth\|eleventh\|twelveth\|1st\|2nd\|3rd\|4th\|5th\|6th\|7th\|8th\|9th\|10th\|11th\|12th)\b/ig);
400							#print "In s1:" . join(",",@n1) . "\n";
401							#print "In s2:" . join(",",@n2) . "\n";
402	48	100				181	return 0 if $#n1 ne $#n2;
403	44					136	for (0..$#n1) {
404	13	100				126	return 1 if lc $n1[$_] ne lc $n2[$_];
405							}
406	37					102	return 0;
407							}
408
409
410							sub my_dist_text {
411	40			40	0	87	my $a = lc shift;
412	40					77	my $b = lc shift;
413	40					82	$a =~ s/_/ /g;
414	40					69	$b =~ s/_/ /g;
415	40					395	return distance($a, $b);
416
417							}
418							sub decodeHTMLEntities {
419	106			106	0	190	my $in = shift;
420	106					215	$in =~ s/&([\d\w\#]+);/&safe_decode($1)/gei;
	0					0
421	106					236	return $in;
422							}
423
424							sub safe_decode {
425	0			0	0	0	my $in = shift;
426	0	0				0	if (substr($in,0,1) eq '#') {
427	0	0				0	my $num = substr($in,1,1) eq 'x' ? hex(substr($in,1)) : substr($in,1);
428							# we check and fix cp1232 entities
429							return ($num < 127 or $num > 159) ?
430							HTML::Entities::decode_entities("&$in;") :
431	0	0	0			0	HTML::Entities::decode_entities("&#" . $WIN2UTF{$num} . ";");
432							} else {
433	0					0	HTML::Entities::decode_entities("&$in;")
434							}
435							}
436
437							sub toString {
438	72			72	0	181	my $h = shift;
439	72					103	return join("; ",@{$h->{authors}}) . " ($h->{date}) $h->{title}\n";
	72					413
440							}
441
442							sub sameTitle {
443	16			16	0	5252	my ($a, $b, $threshold,$loose,$nolinks,%opts) = @_;
444	16					80	return sameWork({ title => $a }, { title => $b }, $threshold,$loose,$nolinks,%opts);
445							}
446
447							1;
448							__END__