File Coverage

blib/lib/Statistics/Krippendorff.pm

Criterion	Covered	Total	%
statement	151	151	100.0
branch	20	20	100.0
condition	6	6	100.0
subroutine	21	21	100.0
pod	11	11	100.0
total	209	209	100.0

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							Statistics::Krippendorff - Calculate Krippendorff's alpha
4
5							=head1 VERSION
6
7							Version 0.04
8
9							=cut
10
11							package Statistics::Krippendorff;
12
13	5			5		284120	use 5.026;
	5					21
14
15	5			5		3474	use Moo;
	5					57556
	5					44
16
17	5			5		13008	use experimental qw( signatures );
	5					25139
	5					34
18
19							our $VERSION = '0.04';
20
21	5			5		1376	use List::Util qw{ min sum };
	5					22
	5					859
22
23	5			5		3445	use namespace::clean;
	5					109895
	5					43
24
25							has units => (is => 'ro',
26							required => 1,
27							coerce => \&_units_array2hash);
28
29							has delta => (is => 'rw',
30							default => sub { \&delta_nominal },
31							trigger => sub ($self, $d) {
32							$self->delta($self->_deltas->{$d})
33							if exists $self->_deltas->{$d};
34							});
35
36							has coincidence => (is => 'lazy', init_arg => undef);
37
38							has _vals => (is => 'lazy',
39							init_arg => undef,
40							builder => '_build_vals');
41
42							has _frequency => (is => 'lazy',
43							init_arg => undef,
44							builder => '_build_frequency');
45
46							has _expected => (is => 'lazy',
47							init_arg => undef,
48							builder => '_build_expected');
49
50							has _deltas => (is => 'ro',
51							init_arg => undef,
52							default => sub { +{
53							nominal => \&delta_nominal,
54							interval => \&delta_interval,
55							ordinal => \&delta_ordinal,
56							ratio => \&delta_ratio,
57							jaccard => \&delta_jaccard,
58							masi => \&delta_masi
59							} });
60
61	11			11	1	129	sub alpha($self) {
	11					24
	11					22
62							my $d_o = sum(map {
63	11					42	my $v = $_;
	56					238
64							map {
65	56					148	$self->coincidence->{$v}{$_} * $self->delta->($self, $v, $_)
	298					10787
66							} $self->vals
67							} $self->vals);
68							my $d_e = sum(map {
69	11					72	my $v = $_;
	56					234
70							map {
71	56					153	$self->_expected->{$v}{$_} * $self->delta->($self, $v, $_)
	298					6689
72							} $self->vals
73							} $self->vals);
74	11					60	my $alpha = 1 - $d_o / $d_e;
75	11					169	return $alpha
76							}
77
78	190			190	1	345	sub vals($self) { @{ $self->_vals } }
	190					293
	190					305
	190					323
	190					4182
79
80	594			594	1	3341	sub frequency($self, $value) {
	594					930
	594					885
	594					832
81	594					11860	return $self->_frequency->{$value}
82							}
83
84	6			6	1	26	sub pairable_values($self) {
	6					13
	6					10
85	6					13	return sum(values %{ $self->_frequency })
	6					130
86							}
87
88	7			7	1	94	sub is_valid($self) {
	7					14
	7					11
89	7					14	for my $unit (@{ $self->units }) {
	7					33
90	14	100				64	return if 1 >= keys %$unit;
91	9	100				51	return if grep ! defined, values %$unit;
92							}
93	1					12	return 1
94							}
95
96	116	100		116	1	3237	sub delta_nominal($, $s1, $s2) { $s1 eq $s2 ? 0 : 1 }
	116					221
	116					206
	116					170
	116					570
97
98	82			82	1	2431	sub delta_interval($, $v0, $v1) { ($v0 - $v1) ** 2 }
	82					132
	82					136
	82					110
	82					342
99
100	50			50	1	1604	sub delta_ordinal($self, $v0, $v1) {
	50					97
	50					88
	50					85
	50					114
101	50					263	my ($from, $to) = sort { $a <=> $b } $v0, $v1;
	100					179
102	50					179	(sum(map $self->frequency($_), $from .. $to)
103							- ($self->frequency($from) + $self->frequency($to))/ 2) ** 2
104							}
105
106	50			50	1	1602	sub delta_ratio($, $v0, $v1) { (($v0 - $v1) / ($v0 + $v1)) ** 2}
	50					126
	50					106
	50					81
	50					249
107
108	270			270	1	6921	sub delta_jaccard($, $s1, $s2) {
	270					3455
	270					398
	270					386
109	270					540	my @s1 = split /,/, $s1;
110	270					520	my @s2 = split /,/, $s2;
111
112	270					381	my %union;
113	270					672	@union{ @s1, @s2 } = ();
114
115	270					391	my %intersection;
116	270					456	@intersection{@s1} = ();
117
118	270					1437	return 1 - (grep exists $intersection{$_}, @s2) / keys %union
119							}
120
121	57			57	1	2567	sub delta_masi($, $v0, $v1) {
	57					147
	57					97
	57					99
122	57					190	my @v0 = split /,/, $v0;
123	57					154	my @v1 = split /,/, $v1;
124	57					127	my %union;
125	57					171	@union{ @v0, @v1 } = ();
126	57					101	my $union = keys %union;
127
128	57					94	my %intersection;
129	57					108	@intersection{ @v0 } = ();
130	57					161	my $intersection = grep exists $intersection{$_}, @v1;
131
132							# Python's nltk uses 0.67 and 0.33 which gives a different result for
133							# precission 4.
134	57	100	100			324	my $m = (@v0 == @v1 && @v0 == $intersection) ? 1
		100
		100
135							: $intersection == min(scalar @v0, scalar @v1) ? 2 / 3
136							: $intersection > 0 ? 1 / 3
137							: 0;
138	57					424	return 1 - $intersection / $union * $m
139							}
140
141	15			15		224	sub _units_array2hash($units) {
	15					37
	15					35
142	15	100				88	if (ref [] eq ref $units->[0]) {
143							return [map {
144	9					32	my $unit = $_;
	42					77
145	42					604	+{map +($_ => $unit->[$_]),
146							grep defined $unit->[$_],
147							0 .. $#$unit}
148							} @$units]
149							}
150	6					169	return $units
151							}
152
153	6			6		58	sub _build_vals($self) {
	6					10
	6					22
154	6					12	my %subf;
155	6					13	@subf{ map values %$_, @{ $self->units } } = ();
	6					133
156	6					88	return [sort keys %subf]
157							}
158
159	6			6		58	sub _build_coincidence($self) {
	6					39
	6					11
160	6					16	my @vals = $self->vals;
161	6					55	my %coinc;
162	6					30	@{ $coinc{$_} }{@vals} = (0) x @vals for @vals;
	32					141
163
164	6					25	for my $unit (@{ $self->units }) {
	6					24
165	47					77	my %is_value;
166	47					172	@is_value{ values %$unit } = ();
167	47					100	my @values = keys %is_value;
168	47					110	my @keys = keys %$unit;
169
170	47					111	for my $v (@values) {
171	68					212	for my $v_ (@values) {
172	118					189	my $coinc_count = 0;
173	118					211	for my $key1 (@keys) {
174	319					534	for my $key2 (@keys) {
175	945	100				2008	next if $key1 eq $key2;
176
177							++$coinc_count
178							if $unit->{$key1} eq $v
179	626	100	100			1972	&& $unit->{$key2} eq $v_;
180							}
181							}
182	118					467	$coinc{$v}{$v_} += $coinc_count / (@keys - 1);
183							}
184							}
185							}
186	6					270	return \%coinc
187							}
188
189	6			6		53	sub _build_frequency($self) {
	6					14
	6					14
190	6					13	my %f;
191	6					17	@f{ $self->vals } = map sum(values %{ $self->coincidence->{$_} }),
	32					821
192							$self->vals;
193	6					119	return \%f
194							}
195
196	6			6		63	sub _build_expected($self) {
	6					29
	6					12
197	6					13	my %exp;
198	6					22	my $n = $self->pairable_values - 1;
199	6					19	for my $v ($self->vals) {
200	32					449	for my $v_ ($self->vals) {
201	182	100				1599	$exp{$v}{$v_} = ($v eq $v_
202							? $self->frequency($v) * ($self->frequency($v) - 1)
203							: $self->frequency($v) * $self->frequency($v_)
204							) / $n;
205							}
206							}
207	6					211	return \%exp
208							}
209
210							=head1 SYNOPSIS
211
212							use experimental qw( signatures );
213							use Statistics::Krippendorff ();
214
215							my @units = ({coder1 => 1, coder2 => 1},
216							{coder1 => 2, coder2 => 2, coder3 => 1},
217							{coder2 => 3, coder3 => 2});
218							my $sk = 'Statistics::Krippendorff'->new(units => \@units);
219							my $alpha1 = $sk->alpha;
220							$sk->delta('nominal'); # Same as default.
221							my $alpha2 = $sk->alpha;
222
223							my $ski = 'Statistics::Krippendorff'->new(
224							units => [[1, 1], [2,2,1], [undef,3,2]],
225							delta => sub ($, $v0, $v1) { ($v0 - $v1) ** 2 });
226							my $alpha_interval = $ski->alpha;
227
228							=head1 METHODS
229
230							=head2 new
231
232							my $sk = 'Statistics::Krippendorff'->new(
233							units => \@units,
234							delta => 'nominal');
235
236							The constructor. It accepts the following named arguments:
237
238							=head3 units
239
240							An array reference of units. All units of analysis must be of the same type,
241							but there are two possible types they all can have:
242
243							=over
244
245							=item 1.
246
247							Each unit is a hash reference of the form
248
249							{ coder1 => 'value1', coder3 => 'value2', ... }
250
251							=item 2.
252
253							Each unit is an array reference of the form
254
255							['value1', undef, 'value2']
256
257							where the coder is encoded by the position in the array, missing data are
258							indicated by an C.
259
260
261							=back
262
263							In both the cases, there must be at least two values in each unit. If you want
264							to validate this precondition, call C.
265
266							=head3 delta
267
268							An optional argument defaulting to delta_nominal. You can specify any function
269							C that compares the two values C<$v1> and C<$v2> and
270							returns their distance (a number between 0 and 1). Several common methods are
271							predefined, you can use a code reference like C<&Statistics::Krippendorff::delta_nominal> or just a string C:
272
273							=head4 delta_nominal
274
275							Used for nominal data, i.e. labels with no ordering.
276
277							=head4 delta_ordinal
278
279							Used for numeric values that are ordered, but can't be used in mathematical
280							operations, for example number of stars in a movie rating system (we don't say
281							that the distance from one star to two stars is the same as the distance from
282							three starts to four stars). See the implementation on why C<$self> is needed
283							as a parameter to delta.
284
285							=head4 delta_interval
286
287							Used for numeric values that can be used in mathematical operations.
288
289							=head4 delta_ratio
290
291							Used for non-negative numeric values (think degrees Kelvin).
292
293							=head4 delta_jaccard
294
295							This can be used when coders can specify more than one value. Join the values
296							with commas; Jaccard index then uses the formula C
297							union_size>. If you sort the values before joining them, the expected
298							coincidence matrix is smaller and the algorithm runs faster, but the resulting
299							coefficient should be the same.
300
301							=head4 delta_masi
302
303							The weighted metric for measuring agreement on set-valued items introduced by
304							R. Passonneau (2006). Use comma separated values as above in C.
305							Note that the Python implementation in L uses the
306							weights rounded with precision 2, so the resutls might be slightly different.
307
308							=head2 alpha
309
310							my $alpha = $sk->alpha;
311
312							Returns Krippendorff's alpha.
313
314							=head2 delta
315
316							$sk->delta(sub($self, $v1, $v2) {});
317							$sk->delta('jaccard');
318
319							The difference function used to calculate the alpha. You can specify it in the
320							constructor (see above), but you can later change it so something else, too.
321
322							=head2 is_valid
323
324							print "OK" if $sk->is_valid;
325
326							Check that each unit has at least two responses. If you use a hash
327							representation of a unit, the values must be always defined.
328
329							=head2 frequency
330
331							my $freq = $sk->frequency('val1');
332
333							Returns the frequency of the given value.
334
335							=head2 pairable_values
336
337							Returns the total number of all pairable values (i.e. the sum of all
338							frequencies).
339
340							=head2 vals
341
342							Returns a sorted list of all the possible values.
343
344							=head1 AUTHOR
345
346							E. Choroba, C<< >>
347
348							=head1 BUGS
349
350							Please report any bugs or feature requests to
351							L, via
352							e-mail to C, or through
353							the web interface at
354							L.
355							I will be notified, and then you'll automatically be notified of
356							progress on your bug as I make changes.
357
358							=head1 SUPPORT
359
360							You can find documentation for this module with the perldoc command.
361
362							perldoc Statistics::Krippendorff
363
364
365							You can also look for information at:
366
367							=over 4
368
369							=item * GitHub (report bugs here)
370
371							L
372
373							=item * Search CPAN
374
375							L
376
377							=item * RT: CPAN's request tracker (you can report bugs here, too)
378
379							L
380
381							=back
382
383
384							=head1 ACKNOWLEDGEMENTS
385
386							Implementation inspired by
387							L,
388							additional tests taken from
389							L.
390
391							=head1 LICENSE AND COPYRIGHT
392
393							This software is Copyright (c) 2025 by E. Choroba.
394
395							This is free software, licensed under:
396
397							The Artistic License 2.0 (GPL Compatible)
398
399
400							=cut
401
402							__PACKAGE__