| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Biblio::Citation::Compare; |
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
69117
|
use 5.0; |
|
|
3
|
|
|
|
|
25
|
|
|
4
|
3
|
|
|
3
|
|
19
|
use strict; |
|
|
3
|
|
|
|
|
6
|
|
|
|
3
|
|
|
|
|
73
|
|
|
5
|
3
|
|
|
3
|
|
15
|
use warnings; |
|
|
3
|
|
|
|
|
18
|
|
|
|
3
|
|
|
|
|
103
|
|
|
6
|
3
|
|
|
3
|
|
1281
|
use Text::LevenshteinXS qw(distance); |
|
|
3
|
|
|
|
|
8744
|
|
|
|
3
|
|
|
|
|
166
|
|
|
7
|
3
|
|
|
3
|
|
1408
|
use HTML::Entities; |
|
|
3
|
|
|
|
|
20630
|
|
|
|
3
|
|
|
|
|
293
|
|
|
8
|
3
|
|
|
3
|
|
4145
|
use Text::Names qw/samePerson cleanName parseName parseName2/; |
|
|
3
|
|
|
|
|
139703
|
|
|
|
3
|
|
|
|
|
385
|
|
|
9
|
3
|
|
|
3
|
|
1390
|
use Text::Roman qw/isroman roman2int/; |
|
|
3
|
|
|
|
|
3704
|
|
|
|
3
|
|
|
|
|
184
|
|
|
10
|
3
|
|
|
3
|
|
24
|
use utf8; |
|
|
3
|
|
|
|
|
8
|
|
|
|
3
|
|
|
|
|
53
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
require Exporter; |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
|
17
|
|
|
|
|
|
|
sameWork sameAuthors toString extractEdition sameAuthorBits sameTitle sameAuthorsLoose |
|
18
|
|
|
|
|
|
|
) ] ); |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our @EXPORT = qw( ); |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our $VERSION = '0.57'; |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# to correct bogus windows entities. unfixable ones are converted to spaces. |
|
27
|
|
|
|
|
|
|
my %WIN2UTF = ( |
|
28
|
|
|
|
|
|
|
hex('80')=> hex('20AC'),# #EURO SIGN |
|
29
|
|
|
|
|
|
|
hex('81')=> hex('0020'), #UNDEFINED |
|
30
|
|
|
|
|
|
|
hex('82')=> hex('201A'),# #SINGLE LOW-9 QUOTATION MARK |
|
31
|
|
|
|
|
|
|
hex('83')=> hex('0192'),# #LATIN SMALL LETTER F WITH HOOK |
|
32
|
|
|
|
|
|
|
hex('84')=> hex('201E'),# #DOUBLE LOW-9 QUOTATION MARK |
|
33
|
|
|
|
|
|
|
hex('85')=> hex('2026'),# #HORIZONTAL ELLIPSIS |
|
34
|
|
|
|
|
|
|
hex('86')=> hex('2020'),# #DAGGER |
|
35
|
|
|
|
|
|
|
hex('87')=> hex('2021'),# #DOUBLE DAGGER |
|
36
|
|
|
|
|
|
|
hex('88')=> hex('02C6'),# #MODIFIER LETTER CIRCUMFLEX ACCENT |
|
37
|
|
|
|
|
|
|
hex('89')=> hex('2030'),# #PER MILLE SIGN |
|
38
|
|
|
|
|
|
|
hex('8A')=> hex('0160'),# #LATIN CAPITAL LETTER S WITH CARON |
|
39
|
|
|
|
|
|
|
hex('8B')=> hex('2039'),# #SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
|
40
|
|
|
|
|
|
|
hex('8C')=> hex('0152'),# #LATIN CAPITAL LIGATURE OE |
|
41
|
|
|
|
|
|
|
hex('8D')=> hex('0020'),# #UNDEFINED |
|
42
|
|
|
|
|
|
|
hex('8E')=> hex('017D'),# #LATIN CAPITAL LETTER Z WITH CARON |
|
43
|
|
|
|
|
|
|
hex('8F')=> hex('0020'),# #UNDEFINED |
|
44
|
|
|
|
|
|
|
hex('90')=> hex('0020'),# #UNDEFINED |
|
45
|
|
|
|
|
|
|
hex('91')=> hex('2018'),# #LEFT SINGLE QUOTATION MARK |
|
46
|
|
|
|
|
|
|
hex('92')=> hex('2019'),# #RIGHT SINGLE QUOTATION MARK |
|
47
|
|
|
|
|
|
|
hex('93')=> hex('201C'),# #LEFT DOUBLE QUOTATION MARK |
|
48
|
|
|
|
|
|
|
hex('94')=> hex('201D'),# #RIGHT DOUBLE QUOTATION MARK |
|
49
|
|
|
|
|
|
|
hex('95')=> hex('2022'),# #BULLET |
|
50
|
|
|
|
|
|
|
hex('96')=> hex('2013'),# #EN DASH |
|
51
|
|
|
|
|
|
|
hex('97')=> hex('2014'),# #EM DASH |
|
52
|
|
|
|
|
|
|
hex('98')=> hex('02DC'),# #SMALL TILDE |
|
53
|
|
|
|
|
|
|
hex('99')=> hex('2122'),# #TRADE MARK SIGN |
|
54
|
|
|
|
|
|
|
hex('9A')=> hex('0161'),# #LATIN SMALL LETTER S WITH CARON |
|
55
|
|
|
|
|
|
|
hex('9B')=> hex('203A'),# #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
|
56
|
|
|
|
|
|
|
hex('9C')=> hex('0153'),# #LATIN SMALL LIGATURE OE |
|
57
|
|
|
|
|
|
|
hex('9D')=> hex('0020'),# #UNDEFINED |
|
58
|
|
|
|
|
|
|
hex('9E')=> hex('017E'),# #LATIN SMALL LETTER Z WITH CARON |
|
59
|
|
|
|
|
|
|
hex('9F')=> hex('0178')# #LATIN CAPITAL LETTER Y WITH DIAERESIS |
|
60
|
|
|
|
|
|
|
); |
|
61
|
|
|
|
|
|
|
my $PARENS = '\s*[\[\(](.+?)[\]\)]\s*'; |
|
62
|
|
|
|
|
|
|
my $QUOTE = '"“”`¨´‘’‛“”‟„′″‴‵‶‷⁗❛❜❝❞'; |
|
63
|
|
|
|
|
|
|
my @ED_RES = ( |
|
64
|
|
|
|
|
|
|
'(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth)', |
|
65
|
|
|
|
|
|
|
'([1-9])\s?\w{2,5}\s[ée]d', |
|
66
|
|
|
|
|
|
|
'\bv\.?(?:ersion)?\s?([0-9IXV]+)', |
|
67
|
|
|
|
|
|
|
'\s([IXV0-9]+)(?:$|:)' |
|
68
|
|
|
|
|
|
|
); |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
#die "no" unless "2nd edition" =~ /$EDITION/i; |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
#my $TITLE_SPLIT = '(?:\?|\:|\.|!|\"|[$QUOTE]\b)'; |
|
73
|
|
|
|
|
|
|
my $TITLE_SPLIT = '(?:\?|\:|\.|!)'; |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub sameAuthors { |
|
76
|
66
|
|
|
66
|
1
|
3069
|
my ($list1, $list2, %opts) = @_; |
|
77
|
66
|
100
|
100
|
|
|
215
|
return 0 if $#$list1 != $#$list2 and $opts{strict}; |
|
78
|
60
|
100
|
|
|
|
158
|
if ($#$list2 > $#$list1) { |
|
79
|
6
|
|
|
|
|
10
|
my $t = $list1; |
|
80
|
6
|
|
|
|
|
10
|
$list1 = $list2; |
|
81
|
6
|
|
|
|
|
10
|
$list2 = $t; |
|
82
|
|
|
|
|
|
|
} |
|
83
|
60
|
|
|
|
|
152
|
for (my $i = 0; $i <= $#$list2; $i++) { |
|
84
|
51
|
100
|
|
|
|
2712
|
return 0 unless grep { samePerson($list2->[$i],$_, %opts) } @$list1; |
|
|
89
|
|
|
|
|
16995
|
|
|
85
|
|
|
|
|
|
|
} |
|
86
|
56
|
|
|
|
|
29130
|
return 1; |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub firstAuthor { |
|
90
|
0
|
|
|
0
|
0
|
0
|
my $e = shift; |
|
91
|
0
|
|
|
|
|
0
|
my $a = $e->{authors}; |
|
92
|
0
|
0
|
|
|
|
0
|
if ($#$a > -1) { |
|
93
|
0
|
|
|
|
|
0
|
return $a->[0]; |
|
94
|
|
|
|
|
|
|
} else { |
|
95
|
0
|
|
|
|
|
0
|
return undef; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
} |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub sameWork { |
|
100
|
|
|
|
|
|
|
|
|
101
|
53
|
|
|
53
|
1
|
28214
|
my ($e, $c, $threshold,$loose,$nolinks,%opts) = @_; |
|
102
|
|
|
|
|
|
|
|
|
103
|
53
|
|
50
|
|
|
246
|
my $debug = $opts{debug} || 0; |
|
104
|
|
|
|
|
|
|
|
|
105
|
53
|
50
|
|
|
|
167
|
$loose = 0 unless defined $loose; |
|
106
|
53
|
50
|
|
|
|
116
|
$threshold = 0.15 unless $threshold; |
|
107
|
53
|
50
|
|
|
|
108
|
$opts{loose} = 1 if $loose; |
|
108
|
|
|
|
|
|
|
|
|
109
|
53
|
50
|
|
|
|
105
|
if ($debug) { |
|
110
|
0
|
|
|
|
|
0
|
warn "sameWork 1: " . toString($e); |
|
111
|
0
|
|
|
|
|
0
|
warn "sameWork 2: " . toString($c); |
|
112
|
|
|
|
|
|
|
} |
|
113
|
|
|
|
|
|
|
|
|
114
|
53
|
0
|
33
|
|
|
137
|
if (defined $e->{doi} and length $e->{doi} and defined $c->{doi} and length $c->{doi}) { |
|
|
|
|
33
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
115
|
0
|
0
|
|
|
|
0
|
if ($e->{doi} eq $c->{doi}) { |
|
116
|
|
|
|
|
|
|
# we don't use doi to say 1 because often we have dois that are for a whole issue |
|
117
|
|
|
|
|
|
|
# however same doi lowers the threshold |
|
118
|
0
|
0
|
|
|
|
0
|
$threshold /= 2 if $e->{doi} eq $c->{doi}; |
|
119
|
0
|
|
|
|
|
0
|
$loose = 1; |
|
120
|
0
|
|
|
|
|
0
|
$opts{loose} = 1; |
|
121
|
|
|
|
|
|
|
} else { |
|
122
|
0
|
|
|
|
|
0
|
return 0; |
|
123
|
|
|
|
|
|
|
} |
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
53
|
50
|
|
|
|
117
|
return 0 if (!$c); |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# normalize encoding of relevant fields |
|
129
|
53
|
|
|
|
|
129
|
local $e->{title} = decodeHTMLEntities($e->{title}); |
|
130
|
53
|
|
|
|
|
107
|
local $c->{title} = decodeHTMLEntities($c->{title}); |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# first check if authors,date, and title are almost literally the same |
|
133
|
53
|
100
|
|
|
|
177
|
my $tsame = (lc $e->{title} eq lc $c->{title}) ? 1 : 0; |
|
134
|
53
|
|
|
|
|
134
|
my $asame = sameAuthors($e->{authors},$c->{authors},strict=>1); |
|
135
|
53
|
|
100
|
|
|
477
|
my $asame_loose = $asame || sameAuthors($e->{authors},$c->{authors},strict=>0); #asame_loose will be 1 while same is 0 when there are extra authors in one paper but all overlap authors match |
|
136
|
53
|
|
66
|
|
|
403
|
my $asame_bits = $asame_loose || sameAuthorBits($e->{authors},$c->{authors}); |
|
137
|
53
|
100
|
100
|
|
|
271
|
my $dsame = (defined $e->{date} and defined $c->{date} and $e->{date} eq $c->{date}) ? 1 : 0; |
|
138
|
|
|
|
|
|
|
|
|
139
|
53
|
50
|
|
|
|
115
|
if ($debug) { |
|
140
|
0
|
|
|
|
|
0
|
warn "tsame: $tsame"; |
|
141
|
0
|
|
|
|
|
0
|
warn "asame: $asame"; |
|
142
|
0
|
|
|
|
|
0
|
warn "asame_loose: $asame_loose"; |
|
143
|
0
|
|
|
|
|
0
|
warn "asame_bits: $asame_bits"; |
|
144
|
0
|
|
|
|
|
0
|
warn "dsame: $dsame"; |
|
145
|
|
|
|
|
|
|
} |
|
146
|
|
|
|
|
|
|
|
|
147
|
53
|
100
|
100
|
|
|
153
|
return 1 if ($tsame and $asame and $dsame); |
|
|
|
|
66
|
|
|
|
|
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# if authors quite different, not same |
|
150
|
48
|
100
|
|
|
|
106
|
if (!$asame_bits) { |
|
151
|
1
|
50
|
|
|
|
28
|
warn "authors too different" if $debug; |
|
152
|
1
|
|
|
|
|
5
|
return 0; |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
# at this point the authors are plausibly the same |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# check dates |
|
157
|
47
|
|
|
|
|
76
|
my $date_wildcards = '^forthcoming|in press|manuscript|unknown|web$'; |
|
158
|
47
|
|
33
|
|
|
274
|
my $compat_dates = ($dsame or ($e->{date} && $e->{date} =~ /$date_wildcards/) or ($c->{date} && $c->{date} =~ /$date_wildcards/)); |
|
159
|
47
|
100
|
66
|
|
|
138
|
if (!$dsame and !$compat_dates) { |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
#disabled for most cases because we want to conflate editions and republications for now. |
|
162
|
24
|
50
|
33
|
|
|
103
|
if ($e->{title} =~ /^Introduction.?$/ or $e->{title} =~ /^Preface.?$/) { |
|
163
|
|
|
|
|
|
|
return 0 if ($e->{source} and $e->{source} ne $c->{source}) or |
|
164
|
0
|
0
|
0
|
|
|
0
|
($e->{volume} and $e->{volume} ne $c->{volume}); |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# numeric dates |
|
168
|
24
|
100
|
66
|
|
|
117
|
if ($e->{date} and $e->{date} =~ /^\d\d\d\d$/ and $c->{date} and $c->{date} =~ /^\d\d\d\d$/) { |
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
169
|
7
|
|
|
|
|
15
|
my $date_diff = $e->{date} - $c->{date}; |
|
170
|
|
|
|
|
|
|
# quite often people misremember dates so we permit some slack |
|
171
|
|
|
|
|
|
|
# we will consider the dates compat if they close in time |
|
172
|
|
|
|
|
|
|
# if dates are far apart, we know they are not exactly the same publicatoins. |
|
173
|
|
|
|
|
|
|
# but they might be reprints of the same thing, which we want to conflate. |
|
174
|
7
|
100
|
66
|
|
|
28
|
if ($date_diff > 3 or $date_diff < -3) { |
|
175
|
3
|
50
|
|
|
|
7
|
if ($asame_bits) { |
|
176
|
3
|
|
|
|
|
8
|
$threshold /= 2; |
|
177
|
3
|
50
|
|
|
|
8
|
warn "dates different, lowering similarity threshold" if $debug; |
|
178
|
|
|
|
|
|
|
} else { |
|
179
|
0
|
0
|
|
|
|
0
|
warn "dates+authors too different" if $debug; |
|
180
|
0
|
|
|
|
|
0
|
return 0; |
|
181
|
|
|
|
|
|
|
} |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
} else { |
|
184
|
|
|
|
|
|
|
# nearby date |
|
185
|
4
|
|
|
|
|
9
|
$threshold /= 2; |
|
186
|
|
|
|
|
|
|
} |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
} else { |
|
189
|
|
|
|
|
|
|
#messed up dates, assume the worst |
|
190
|
17
|
|
|
|
|
41
|
$threshold /=2; |
|
191
|
|
|
|
|
|
|
} |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
} else { |
|
194
|
23
|
50
|
33
|
|
|
55
|
$loose = 1 if $asame_loose or $asame_bits; |
|
195
|
|
|
|
|
|
|
} |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
|
|
200
|
47
|
50
|
|
|
|
97
|
warn "pre title length" if $debug; |
|
201
|
|
|
|
|
|
|
# if title very different in lengths and do not contain ":" or brackets, not the same |
|
202
|
|
|
|
|
|
|
return 0 if !$tsame and ( |
|
203
|
|
|
|
|
|
|
abs(length($e->{title}) - length($c->{title})) > 20 |
|
204
|
|
|
|
|
|
|
and |
|
205
|
|
|
|
|
|
|
($e->{title} !~ /$TITLE_SPLIT/ and $c->{title} !~ /$TITLE_SPLIT/) |
|
206
|
|
|
|
|
|
|
and |
|
207
|
47
|
50
|
100
|
|
|
408
|
($e->{title} !~ /$PARENS/ and $c->{title} !~ /$PARENS/) |
|
|
|
|
100
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
208
|
|
|
|
|
|
|
); |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# Compare links |
|
211
|
|
|
|
|
|
|
# if (!$nolinks) { |
|
212
|
|
|
|
|
|
|
# foreach my $l (@{$e->{links}}) { |
|
213
|
|
|
|
|
|
|
# print "Links e:\n" . join("\n",$e->getLinks); |
|
214
|
|
|
|
|
|
|
# print "Links c:\n" . join("\n",$c->getLinks); |
|
215
|
|
|
|
|
|
|
# return 1 if grep { $l eq $_} @{$c->{links}}; |
|
216
|
|
|
|
|
|
|
# } |
|
217
|
|
|
|
|
|
|
# } |
|
218
|
|
|
|
|
|
|
|
|
219
|
47
|
50
|
|
|
|
98
|
warn "pre loose mode: loose = $loose" if $debug; |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
#print "threshold $lname1,$lname2: $threshold\n"; |
|
222
|
|
|
|
|
|
|
# ok if distance short enough without doing anything |
|
223
|
|
|
|
|
|
|
#print "distance: " . distance(lc $e->{title},lc $c->{title}) / (length($e->{title}) +1) . "\n"; |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# perform fuzzy matching |
|
226
|
|
|
|
|
|
|
#my $str1 = "$e->{date}|$e->{title}"; |
|
227
|
47
|
|
|
|
|
118
|
my $str1 = lc _strip_non_word($e->{title}); |
|
228
|
47
|
|
|
|
|
106
|
my $str2 = lc _strip_non_word($c->{title}); |
|
229
|
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# check for edition strings |
|
231
|
47
|
|
|
|
|
108
|
my $ed1 = extractEdition($str1); |
|
232
|
47
|
|
|
|
|
669
|
my $ed2 = extractEdition($str2); |
|
233
|
47
|
50
|
|
|
|
269
|
warn "ed1: $ed1" if $debug; |
|
234
|
47
|
50
|
|
|
|
101
|
warn "ed2: $ed2" if $debug; |
|
235
|
47
|
100
|
100
|
|
|
152
|
$loose =1 if $ed1 and $ed2 and $ed1 == $ed2 and !$dsame and $asame_loose; |
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
236
|
|
|
|
|
|
|
|
|
237
|
47
|
100
|
100
|
|
|
324
|
return 0 if ($ed1 and !$ed2) or ($ed2 and !$ed1) or ($ed1 && $ed1 != $ed2); |
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
238
|
40
|
50
|
|
|
|
81
|
warn "not diff editions" if $debug; |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# remove brackets |
|
241
|
40
|
|
|
|
|
69
|
my ($parens1,$parens2); |
|
242
|
40
|
|
|
|
|
216
|
$str1 =~ s/$PARENS//g; |
|
243
|
40
|
|
|
|
|
103
|
$parens1 = $1; |
|
244
|
40
|
|
|
|
|
178
|
$str2 =~ s/$PARENS//g; |
|
245
|
40
|
|
|
|
|
73
|
$parens2 = $1; |
|
246
|
40
|
100
|
66
|
|
|
119
|
return 0 if $parens1 && $parens2 && numdiff($parens1,$parens2); |
|
|
|
|
100
|
|
|
|
|
|
247
|
|
|
|
|
|
|
|
|
248
|
38
|
50
|
|
|
|
69
|
warn "the text comparison is: '$str1' vs '$str2'" if $debug; |
|
249
|
|
|
|
|
|
|
|
|
250
|
38
|
50
|
|
|
|
108
|
warn "pre number check" if $debug; |
|
251
|
|
|
|
|
|
|
# if titles differ by a number, not the same |
|
252
|
38
|
100
|
|
|
|
84
|
return 0 if numdiff($str1,$str2); |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# ultimate test |
|
255
|
|
|
|
|
|
|
#dbg("$str1\n$str2\n"); |
|
256
|
|
|
|
|
|
|
#dbg(my_dist_text($str1,$str2)); |
|
257
|
33
|
|
|
|
|
73
|
my $score = (my_dist_text($str1,$str2) / (length($str1) +1)); |
|
258
|
|
|
|
|
|
|
|
|
259
|
33
|
50
|
|
|
|
77
|
warn "score: $score (threshold: $threshold)" if $debug; |
|
260
|
|
|
|
|
|
|
#print $score . " \n"; |
|
261
|
33
|
100
|
|
|
|
204
|
return 1 if ( $score < $threshold); |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# now if loose mode and only one of the titles has a ":" or other punctuation, compare the part before the punc with the other title instead |
|
264
|
11
|
100
|
|
|
|
29
|
if ($loose) { |
|
265
|
|
|
|
|
|
|
|
|
266
|
7
|
50
|
|
|
|
15
|
warn "loose: $str1 -- $str2" if $debug; |
|
267
|
|
|
|
|
|
|
|
|
268
|
7
|
100
|
|
|
|
305
|
if ($e->{title} =~ /(.+?)\s*$TITLE_SPLIT\s*(.+)/) { |
|
|
|
50
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
|
|
270
|
5
|
|
|
|
|
14
|
my $str1 = _strip_non_word($1); |
|
271
|
5
|
50
|
|
|
|
244
|
if ($c->{title} =~ /(.+?)\s*$TITLE_SPLIT\s*(.+)/) { |
|
272
|
0
|
|
|
|
|
0
|
return 0; |
|
273
|
|
|
|
|
|
|
} else { |
|
274
|
5
|
50
|
|
|
|
17
|
if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) { |
|
275
|
5
|
|
|
|
|
28
|
return 1; |
|
276
|
|
|
|
|
|
|
} |
|
277
|
|
|
|
|
|
|
} |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
} elsif ($c->{title} =~ /(.+?)\s*$TITLE_SPLIT\s*(.+)/) { |
|
280
|
|
|
|
|
|
|
|
|
281
|
2
|
|
|
|
|
7
|
my $str2 = _strip_non_word($1); |
|
282
|
2
|
100
|
|
|
|
14
|
if (my_dist_text($str1,$str2) / (length($str1) +1)< $threshold) { |
|
283
|
1
|
|
|
|
|
7
|
return 1; |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
} else { |
|
287
|
|
|
|
|
|
|
|
|
288
|
0
|
|
|
|
|
0
|
return 0; |
|
289
|
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
} |
|
291
|
|
|
|
|
|
|
} |
|
292
|
|
|
|
|
|
|
|
|
293
|
5
|
|
|
|
|
33
|
return 0; |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
sub sameAuthorsLoose { |
|
297
|
0
|
|
|
0
|
0
|
0
|
my ($a, $b) = @_; |
|
298
|
0
|
|
|
|
|
0
|
my $asame = sameAuthors($a,$b,strict=>1); |
|
299
|
0
|
|
0
|
|
|
0
|
my $asame_loose = $asame || sameAuthors($a,$b,strict=>0); |
|
300
|
0
|
|
0
|
|
|
0
|
return $asame_loose || sameAuthorBits($a,$b); |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
sub sameAuthorBits { |
|
304
|
3
|
|
|
3
|
0
|
157
|
my ($a, $b) = @_; |
|
305
|
3
|
|
|
|
|
7
|
my (@alist, @blist); |
|
306
|
3
|
|
|
|
|
9
|
for (@$a) { |
|
307
|
4
|
|
|
|
|
11
|
my $v = lc $_; # we copy so we don't modify the original |
|
308
|
4
|
|
|
|
|
20
|
$v =~ s/[,\.]//g; |
|
309
|
|
|
|
|
|
|
#$v =~ s/(\p{Ll})(\p{Lu})/$1 $2/g; |
|
310
|
4
|
|
|
|
|
32
|
push @alist, split(/\s+/, $v); |
|
311
|
|
|
|
|
|
|
} |
|
312
|
3
|
|
|
|
|
9
|
for (@$b) { |
|
313
|
3
|
|
|
|
|
7
|
my $v = lc $_; |
|
314
|
3
|
|
|
|
|
13
|
$v =~ s/[,\.]//g; |
|
315
|
|
|
|
|
|
|
#$v =~ s/(\p{Ll})(\p{Lu})/$1 $2/g; |
|
316
|
3
|
|
|
|
|
16
|
push @blist, split(/\s+/, $v); |
|
317
|
|
|
|
|
|
|
} |
|
318
|
|
|
|
|
|
|
#use Data::Dumper; |
|
319
|
3
|
|
|
|
|
12
|
@alist = sort @alist; |
|
320
|
3
|
|
|
|
|
8
|
@blist = sort @blist; |
|
321
|
|
|
|
|
|
|
#print Dumper(\@alist); |
|
322
|
|
|
|
|
|
|
#print Dumper(\@blist); |
|
323
|
3
|
50
|
|
|
|
13
|
return 0 if $#alist != $#blist; |
|
324
|
3
|
|
|
|
|
10
|
for (my $i=0; $i<= $#alist; $i++) { |
|
325
|
6
|
100
|
|
|
|
30
|
return 0 if lc $alist[$i] ne lc $blist[$i]; |
|
326
|
|
|
|
|
|
|
} |
|
327
|
1
|
|
|
|
|
9
|
return 1; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
#wip |
|
331
|
|
|
|
|
|
|
#sub author_bits { |
|
332
|
|
|
|
|
|
|
# my $list_ref = shift; |
|
333
|
|
|
|
|
|
|
# my @new; |
|
334
|
|
|
|
|
|
|
# for (@$list_ref) { |
|
335
|
|
|
|
|
|
|
# my $v = $_; # we copy so we don't modify the original |
|
336
|
|
|
|
|
|
|
# $v =~ s/,//; |
|
337
|
|
|
|
|
|
|
# $v =~ s/(\p{Ll}\p |
|
338
|
|
|
|
|
|
|
# push @alist, split(/\s+/, $v); |
|
339
|
|
|
|
|
|
|
# } |
|
340
|
|
|
|
|
|
|
#} |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
sub _strip_non_word { |
|
343
|
101
|
|
|
101
|
|
165
|
my $str = shift; |
|
344
|
|
|
|
|
|
|
#abbreviation "volume" v |
|
345
|
101
|
|
|
|
|
329
|
$str =~ s/\bvolume\b/v/gi; |
|
346
|
101
|
|
|
|
|
307
|
$str =~ s/\bvol\.?\b/v/gi; |
|
347
|
101
|
|
|
|
|
274
|
$str =~ s/\bv\.\b/v/gi; |
|
348
|
|
|
|
|
|
|
|
|
349
|
101
|
|
|
|
|
619
|
$str =~ s/[^[0-9a-zA-Z\)\]\(\[]+/ /g; |
|
350
|
101
|
|
|
|
|
552
|
$str =~ s/\s+/ /g; |
|
351
|
101
|
|
|
|
|
201
|
$str =~ s/^\s+//; |
|
352
|
101
|
|
|
|
|
301
|
$str =~ s/\s+$//; |
|
353
|
101
|
|
|
|
|
241
|
$str; |
|
354
|
|
|
|
|
|
|
} |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
my %nums = ( |
|
357
|
|
|
|
|
|
|
first => 1, |
|
358
|
|
|
|
|
|
|
second => 2, |
|
359
|
|
|
|
|
|
|
third => 3, |
|
360
|
|
|
|
|
|
|
fourth => 4, |
|
361
|
|
|
|
|
|
|
fifth => 5, |
|
362
|
|
|
|
|
|
|
sixth => 6, |
|
363
|
|
|
|
|
|
|
seventh => 7, |
|
364
|
|
|
|
|
|
|
eighth => 8, |
|
365
|
|
|
|
|
|
|
ninth => 9, |
|
366
|
|
|
|
|
|
|
tenth => 10, |
|
367
|
|
|
|
|
|
|
); |
|
368
|
|
|
|
|
|
|
sub extract_num { |
|
369
|
28
|
|
|
28
|
0
|
81
|
my $s = shift; |
|
370
|
28
|
100
|
|
|
|
103
|
if ($s =~ /\b(\d+)/) { |
|
371
|
13
|
|
|
|
|
83
|
return $1; |
|
372
|
|
|
|
|
|
|
} |
|
373
|
15
|
100
|
|
|
|
64
|
if (isroman($s)) { |
|
374
|
10
|
|
|
|
|
656
|
return roman2int($s); |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
|
|
377
|
5
|
|
|
|
|
116
|
for my $n (keys %nums) { |
|
378
|
27
|
100
|
|
|
|
194
|
if ($s =~ /\b$n\b/i) { |
|
379
|
5
|
|
|
|
|
30
|
return $nums{$n}; |
|
380
|
|
|
|
|
|
|
} |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
} |
|
383
|
0
|
|
|
|
|
0
|
return $s; |
|
384
|
|
|
|
|
|
|
} |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
sub extractEdition { |
|
387
|
106
|
|
|
106
|
0
|
6390
|
my $s = shift; |
|
388
|
106
|
|
|
|
|
213
|
for my $re (@ED_RES) { |
|
389
|
391
|
100
|
|
|
|
11537
|
if ($s =~ /$re/i) { |
|
390
|
28
|
|
|
|
|
111
|
return extract_num($1); |
|
391
|
|
|
|
|
|
|
} |
|
392
|
|
|
|
|
|
|
} |
|
393
|
78
|
|
|
|
|
212
|
return undef; |
|
394
|
|
|
|
|
|
|
} |
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
sub numdiff { |
|
397
|
48
|
|
|
48
|
0
|
107
|
my ($s1,$s2) = @_; |
|
398
|
48
|
|
|
|
|
319
|
my @n1 = ($s1 =~ /\b([IXV0-9]{1,4}|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelveth|1st|2nd|3rd|4th|5th|6th|7th|8th|9th|10th|11th|12th)\b/ig); |
|
399
|
48
|
|
|
|
|
271
|
my @n2 = ($s2 =~ /\b([IXV0-9]{1,4}|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelveth|1st|2nd|3rd|4th|5th|6th|7th|8th|9th|10th|11th|12th)\b/ig); |
|
400
|
|
|
|
|
|
|
#print "In s1:" . join(",",@n1) . "\n"; |
|
401
|
|
|
|
|
|
|
#print "In s2:" . join(",",@n2) . "\n"; |
|
402
|
48
|
100
|
|
|
|
168
|
return 0 if $#n1 ne $#n2; |
|
403
|
44
|
|
|
|
|
133
|
for (0..$#n1) { |
|
404
|
13
|
100
|
|
|
|
89
|
return 1 if lc $n1[$_] ne lc $n2[$_]; |
|
405
|
|
|
|
|
|
|
} |
|
406
|
37
|
|
|
|
|
104
|
return 0; |
|
407
|
|
|
|
|
|
|
} |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
sub my_dist_text { |
|
411
|
40
|
|
|
40
|
0
|
89
|
my $a = lc shift; |
|
412
|
40
|
|
|
|
|
79
|
my $b = lc shift; |
|
413
|
40
|
|
|
|
|
81
|
$a =~ s/_/ /g; |
|
414
|
40
|
|
|
|
|
64
|
$b =~ s/_/ /g; |
|
415
|
40
|
|
|
|
|
407
|
return distance($a, $b); |
|
416
|
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
} |
|
418
|
|
|
|
|
|
|
sub decodeHTMLEntities { |
|
419
|
106
|
|
|
106
|
0
|
160
|
my $in = shift; |
|
420
|
106
|
|
|
|
|
213
|
$in =~ s/&([\d\w\#]+);/&safe_decode($1)/gei; |
|
|
0
|
|
|
|
|
0
|
|
|
421
|
106
|
|
|
|
|
247
|
return $in; |
|
422
|
|
|
|
|
|
|
} |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
sub safe_decode { |
|
425
|
0
|
|
|
0
|
0
|
0
|
my $in = shift; |
|
426
|
0
|
0
|
|
|
|
0
|
if (substr($in,0,1) eq '#') { |
|
427
|
0
|
0
|
|
|
|
0
|
my $num = substr($in,1,1) eq 'x' ? hex(substr($in,1)) : substr($in,1); |
|
428
|
|
|
|
|
|
|
# we check and fix cp1232 entities |
|
429
|
|
|
|
|
|
|
return ($num < 127 or $num > 159) ? |
|
430
|
|
|
|
|
|
|
HTML::Entities::decode_entities("&$in;") : |
|
431
|
0
|
0
|
0
|
|
|
0
|
HTML::Entities::decode_entities("" . $WIN2UTF{$num} . ";"); |
|
432
|
|
|
|
|
|
|
} else { |
|
433
|
0
|
|
|
|
|
0
|
HTML::Entities::decode_entities("&$in;") |
|
434
|
|
|
|
|
|
|
} |
|
435
|
|
|
|
|
|
|
} |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
sub toString { |
|
438
|
72
|
|
|
72
|
0
|
166
|
my $h = shift; |
|
439
|
72
|
|
|
|
|
103
|
return join("; ",@{$h->{authors}}) . " ($h->{date}) $h->{title}\n"; |
|
|
72
|
|
|
|
|
457
|
|
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
sub sameTitle { |
|
443
|
16
|
|
|
16
|
0
|
4953
|
my ($a, $b, $threshold,$loose,$nolinks,%opts) = @_; |
|
444
|
16
|
|
|
|
|
82
|
return sameWork({ title => $a }, { title => $b }, $threshold,$loose,$nolinks,%opts); |
|
445
|
|
|
|
|
|
|
} |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
1; |
|
448
|
|
|
|
|
|
|
__END__ |