File Coverage

blib/lib/Lingua/Orthon.pm
Criterion Covered Total %
statement 114 118 96.6
branch 25 34 73.5
condition 2 6 33.3
subroutine 24 25 96.0
pod 10 10 100.0
total 175 193 90.6


line stmt bran cond sub pod time code
1             package Lingua::Orthon;
2 7     7   395360 use 5.006;
  7         70  
3 7     7   44 use strict;
  7         16  
  7         129  
4 7     7   25 use warnings FATAL => 'all';
  7         8  
  7         195  
5 7     7   188 use Carp qw(croak);
  7         12  
  7         335  
6 7     7   2728 use List::AllUtils qw(any);
  7         79667  
  7         433  
7 7     7   2625 use Number::Misc qw(is_numeric);
  7         6833  
  7         334  
8 7     7   2580 use Statistics::Lite qw(mean);
  7         8263  
  7         377  
9 7     7   2915 use String::Util qw(hascontent nocontent);
  7         30037  
  7         419  
10 7     7   4097 use Unicode::Collate;
  7         46837  
  7         8462  
11            
12             $Lingua::Orthon::VERSION = '0.03';
13            
14             =pod
15            
16             =encoding CP-1252
17            
18             =head1 NAME
19            
20             Lingua-Orthon - Orthographic similarity of string to one or more others by Coltheart's N and related measures
21            
22             =head1 VERSION
23            
24             This is documentation for B of Lingua::Orthon.
25            
26             =head1 SYNOPSIS
27            
28             use Lingua::Orthon 0.03;
29             my $orthon = Lingua::Orthon->new();
30             my $bool = $orthon->are_orthons('BANG', 'BARN'); # 0
31             $bool = $orthon->are_orthons('BANG', 'BONG'); # 1
32             my $idx = $orthon->index_diff('BANK', 'BARK'); # 2
33             my $count = $orthon->index_identical('BANG', 'BARN'); # 2
34             my (@diff) = $orthon->char_diff('BANG', 'BONG'); # (qw/A O/)
35             $count = $orthon->onc(
36             test => 'BANG',
37             sample => [qw/BAND COCO BING RANG BONG SONG/]); # 4
38             my $aref = $orthon->list_orthons(
39             test => 'BANG',
40             sample => [qw/BAND COCO BING RANG BONG SONG/]); # BAND, BING, RANG, BONG
41             $count = $orthon->levenshtein('BANG', 'BARN'); # 2
42             my $float = $orthon->old(
43             test => 'BANG',
44             sample => [qw/BAND COCO BING RANG BONG SONG/]); # ~= 1.67
45            
46             =head1 DESCRIPTION
47            
48             Lingua-Orthon provides measures of similarity of character strings based on their orthographic identity, as relevant to psycholinguistic research. Case- and mark-sensitivity for determining character equality can be controlled. Wraps to Levenshtein Distance methods, extended to the OLD-20 metric, are provided for convenience of comparison. No methods are explicitly exported; all methods are called in the object-oriented way.
49            
50             =head1 SUBROUTINES/METHODS
51            
52             =head2 new
53            
54             my $ortho = Lingua::Orthon->new();
55            
56             Constructs/returns class object for accessing other methods.
57            
58             Optionally, set the argument B to an integer value ranging from 0 to 3 to control case- and mark-sensitivity. See L.
59            
60             =cut
61            
62             sub new {
63 9     9 1 1673 my ( $class, %args ) = @_;
64 9         23 my $self = {};
65 9         20 bless $self, $class;
66 9         38 $self->set_eq( match_level => $args{'match_level'} );
67 9         1115 return $self;
68             }
69            
70             =head2 are_orthons
71            
72             $bool = $orthon->are_orthons('String1', 'String2');
73            
74             Returns 0 or 1 (Coltheart's Boolean) if two given strings are orthographic neighbours by a 1-mismatch I: i.e., the strings are of the same size (are equal in character count) and there is only one discrepancy between them by a single substitution of a character in the same ordinal position (no additions, deletions or transpositions). So I and I are orthons by this measure (they differ only in the final letter), but I and I are not (the letter I is an I to I via I, or a I from I to I).
75            
76             I: If two identical letter strings are given (I, I), they are defined as I being orthons: the number of index identical characters must be at least one less than the length of the string(s).
77            
78             I: By default, identity is defined case-insensitively; e.g., I and I, and I and I are orthons. However, if B has been set (in L or L) to a higher level than 1 (or as undef or 0), then case is respected; e.g., I and I are orthons, but I and I are NOT orthons (they involve substituting both the Is and the second letters (I and I) ... but I and I, or I and I, are orthons. (This usefully applies to putting L|Lingua::Orthon/onc, coltheart_n> (the sum of single-substitution orthons a string has within a lexicon) to questions of the featural versus lexical basis of neighbourhood effects).
79            
80             See Coltheart et al. (1977) (in L). The measure is computationally simple and economical, relative to other measures, such as based on a wider array of edit-types (e.g., Levenshtein Distance), that, while having greater explanatory power (Yarkoni et al., 2008), can tax resources on the order of days to effectively compute for a single string relative to a humanly memorable corpus.
81            
82             =cut
83            
84             sub are_orthons {
85 13     13 1 2836 my ( $self, $w1, $w2 ) = @_;
86 13         32 return _are_orthons( $w1, $w2, $self->{'_EQ'} );
87             }
88            
89             =head2 index_identical
90            
91             $count = $orthon->index_identical('String1', 'String2');
92            
93             Returns a count: the number of letters that are identical and in the same serial position among two given letter-strings.
94            
95             For example, given I and I, 2 is returned for the two index-identical letters, I and I; I is in both strings, but it is ignored as it is the third letter in I but the fourth letter in I, and so not in the same serial position across the two words.
96            
97             =cut
98            
99             sub index_identical {
100 6     6 1 1418 my ( $self, $w1, $w2 ) = @_;
101 6         14 return _index_identical( $w1, $w2, $self->{'_EQ'} );
102             }
103            
104             =head2 index_diff
105            
106             $posint = $orthon->index_diff('String1', 'String2');
107            
108             Assuming the two strings are single-substitution orthons, returns the single index (anchored at zero) at which their letters differ. So if the two strings are "bring" and "being", the returned value is 1.
109            
110             =cut
111            
112             sub index_diff {
113 2     2 1 425 my ( $self, $w1, $w2 ) = @_;
114 2         9 my $idx = 0;
115 2         5 for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
116 7 100       701 if ( not $self->{'_EQ'}->( map { substr $_, $i, 1 } ( $w1, $w2 ) ) ) {
  14         24  
117 1         157 $idx = $i;
118 1         3 last;
119             }
120             }
121 2         128 return $idx;
122             }
123            
124             =head2 char_diff
125            
126             @ari = $orthon->char_diff('String1', 'String2');
127            
128             Returns a list of the first two characters (letters) that, reading from left to right, differ between two given strings. If the strings are single-substitution orthons, these are the characters that make them so. So if the two strings are "bring" and "being", the returned list is ('r', 'e') - the order of these characters in the returned list respecting the order of the given strings. The search across the strings terminates as soon there is a mismatch; otherwise, it continues only for as long as the length of the shortest string.
129            
130             The identity match (or mismatch) is sensitive to the setting of the equality function per case and marks; see L.
131            
132             =cut
133            
134             sub char_diff {
135 4     4 1 926 my ( $self, $w1, $w2 ) = @_;
136 4         8 my @ds = ();
137 4         8 for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
138 13         1492 my @tmp = map { substr $_, $i, 1 } ( $w1, $w2 );
  26         56  
139 13 100       23 if ( not $self->{'_EQ'}->(@tmp) ) {
140 2         264 @ds = @tmp;
141 2         4 last;
142             }
143             }
144 4         282 return @ds;
145             }
146            
147             =head2 onc, coltheart_n
148            
149             $int = $orthon->onc(test => CHARSTR, sample => AREF);
150            
151             Returns the I (ONC), a.k.a. Coltheart's I: the number of single-letter substitution orthons a particular string has with respect to a list of strings (or "lexicon") (Coltheart et al., 1977). So I has two orthons (I and I) in the list (I, I, I and I).
152            
153             =cut
154            
155             sub onc {
156 1     1 1 12 my ( $self, %args ) = @_;
157             my $test_str =
158             hascontent( $args{'test'} )
159 1 50       5 ? $args{'test'}
160             : croak 'Need a single character string to test for orthons';
161             my $sample_aref =
162             ref $args{'sample'}
163 1 50       20 ? $args{'sample'}
164             : croak
165             'Need a list (aref) of character-strings to sample for orthon listing';
166 1         2 my $count = 0;
167 1         2 for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
  1         4  
168 6 100       11 if ( _are_orthons( $test_str, $sample_aref->[$_], $self->{'_EQ'} ) ) {
169 4         7 $count++;
170             }
171             }
172 1         4 return $count;
173             }
174             *coltheart_n = \&index_indentical;
175            
176             =head2 list_orthons
177            
178             $aref = $orthon->list_orthons(test => CHARSTR, sample => AREF);
179            
180             Returns a reference to an array of single-substitution orthographic neighbours of a given B character-string that are among a given list of B character-strings. The referenced is to an empty array if no orthons are found. The order of items in the returned array follows that in which they appear in the B.
181            
182             =cut
183            
184             sub list_orthons {
185 1     1 1 463 my ( $self, %args ) = @_;
186             my $test_str =
187             hascontent( $args{'test'} )
188 1 50       4 ? $args{'test'}
189             : croak 'Need a single character string to test for orthons';
190             my $sample_aref =
191             ref $args{'sample'}
192 1 50       13 ? $args{'sample'}
193             : croak
194             'Need a list (aref) of character-strings to sample for orthon listing';
195 1         2 my @orthon_list = ();
196 1         2 for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
  1         3  
197 6 100       10 if ( _are_orthons( $test_str, $sample_aref->[$_], $self->{'_EQ'} ) ) {
198 4         8 push @orthon_list, $sample_aref->[$_];
199             }
200             }
201 1         3 return \@orthon_list;
202             }
203            
204             =head2 ldist, levenshtein
205            
206             $count = $orthon->ldist('String1', 'String2'); # minimal, strings will be lower-cased
207            
208             Returns the Levenshtein Distance between two given letter strings, wrapping to various Perl module's that more or less implement the Levenshtein algorithm for efficiency and case-sensitivity. Specifically, if the match level has been set at 1 (to ignore case and diacritics), the method uses L (which offers "ignoring diacritics"); otherwise, it uses L to ignore case but not marks (given present limitations of this module). The required case- and mark-sensitivity are set in the L or L methods. By default, the match is made case- and mark-Isensitively (by canned Perl L).
209            
210             =cut
211            
212             sub ldist {
213 135     135 1 2902 my ( $self, $w1, $w2 ) = @_;
214 135         150 my $ldist;
215 135 100       233 if ( $self->{'_MATCH_LEVEL'} == 1 ) {
216 91         1378 require Text::Levenshtein;
217 91         1639 $ldist =
218             Text::Levenshtein::distance( $w1, $w2, { ignore_diacritics => 1 } )
219             ; # also ignores case
220             }
221             else {
222 44         1133 require Text::Levenshtein::XS;
223 44 100       1085 if ( $self->{'_MATCH_LEVEL'} == 2 ) {
224 1         4 ( $w1, $w2 ) = map { lc } ( $w1, $w2 ); # ignore case but not marks
  2         6  
225             }
226 44         68 $ldist = Text::Levenshtein::XS::distance( $w1, $w2 )
227             ; # ignores nothing on its own
228             }
229 135         969881 return $ldist;
230             }
231            
232             =head2 old
233            
234             $mean = $orthon->old(test => CHARSTR, sample => AREF, lim => INT);
235            
236             Returns the mean orthographic Levenshtein distance (OLD) of the smallest B such edit distances for a given B string to all the strings in a B list. Based on Yarkoni et al. (2008), where, with the value of B is set to 20, the measure substantially contributes to prediction of performance in word recognition tasks. Here, if B is not defined, not numeric, or greater than the size of the B, then it is set by default to the size of the sample.
237            
238             Levenshtein distance is calculated per the method L, wrapping to external modules with respect to the conditions of string equality set in L or L. Different settings lead to different speed of calculation. The slowest calculation (by far) occurs if B => 1 so that case- and mark-insensitive matching occurs; this relies on the pure Perl implementation in Text::Levenshtein with its argument B => 1. The fastest calculation (the default) occurs by setting B => 3, when exact characters are matched, e.g., I in the test-string and I in a sample-string at the same index across them are taken as unequal and so will count as a substitution. This relies on the C-implementation in Text::Levenshtein::XS. Ignore case but not marks with B => 2.
239            
240             =cut
241            
242             sub old {
243 3     3 1 769 my ( $self, %args ) = @_;
244             my $test_str =
245             hascontent( $args{'test'} )
246 3 50       15 ? $args{'test'}
247             : croak 'Need a single character string to calculate OLD';
248             my $sample_aref =
249             ref $args{'sample'}
250 3 50       57 ? $args{'sample'}
251             : croak 'Need a list (aref) of character-strings to calculate OLD';
252 3         8 my @ldists = ();
253 3         4 for ( 0 .. ( scalar @{$sample_aref} - 1 ) ) {
  3         31  
254 123         246 push @ldists, $self->ldist( $test_str, $sample_aref->[$_] );
255             }
256             my $lim =
257             ( is_numeric( $args{'lim'} ) and $args{'lim'} <= scalar @ldists )
258 3 50 33     15 ? $args{'lim'}
259             : scalar @ldists;
260 3         85 return mean( ( sort { $a <=> $b } @ldists )[ 0 .. int $lim - 1 ] )
  380         359  
261             ; # mean of first/smallest $lim-th values
262             }
263            
264             =head2 set_eq
265            
266             $orthon->set_eq(match_level => INT); # undef, 0, 1, 2 or 3
267            
268             Sets the string-matching level used in the above methods. This is called implicitly in L when given a B, or with the default value of 0. This is adopted and slightly adapted from how L controls for case/diacritic-sensitive matching.
269            
270             =over 4
271            
272             =item match_level = undef, 0
273            
274             Match with respect to case and diacritics: same as B<3> but simply by Perl's eq. So, e.g., I<éclair> and I would be taken as non-identical, just as would I and I.
275            
276             This is the fastest option. The higher levels, as follow, use the C() function in L.
277            
278             =item match_level = 1
279            
280             Match ignoring case and diacritics: I to I involves 1 edit (from I to I only)
281            
282             =item match_level = 2
283            
284             Match ignoring case but respect diacritics: "ber" to "BéZ" involves 2 edits (the "er" to "éZ")
285            
286             =item match_level = 3
287            
288             Match with respect to case and diacritics: "ber" to "BéZ" involves 3 edits (of all its letters)
289            
290             =back
291            
292             So, for example, if the test string is "abbé", it could be picked up as having the single-substitution orthographic neighbour "able" if the match level is 1, but not if it is 0, 2 or 3.
293            
294             =cut
295            
296             sub set_eq {
297 12     12 1 846 my ( $self, %args ) = @_;
298 12         27 my $match_level_arg = $args{'match_level'};
299 12 50 33     63 if ( nocontent($match_level_arg) or $match_level_arg == 0 ) {
    50          
300 0         0 $self->{'_MATCH_LEVEL'} = 0;
301 0     0   0 $self->{'_EQ'} = sub { return $_[0] eq $_[1] };
  0         0  
302             }
303 22     22   383 elsif ( any { $match_level_arg == $_ } ( 1 .. 3 ) ) {
304 12         40 $self->{'_MATCH_LEVEL'} = $match_level_arg;
305 12         66 my $collator = Unicode::Collate->new(
306             normalization => undef,
307             level => $match_level_arg
308             );
309             $self->{'_EQ'} = sub {
310 134     134   260 return $collator->eq(@_);
311 12         440947 };
312             }
313             else {
314 0         0 croak "Invalid value '$match_level_arg' given as a match level";
315             }
316 12         64 return;
317             }
318            
319             # private methods
320            
321             sub _smallest_len {
322 35     35   57 my @strs = @_;
323 35         62 return ( sort { $a <=> $b } map { length } @strs )[0];
  35         117  
  70         155  
324             }
325            
326             sub _are_orthons {
327 25     25   39 my ( $w1, $w2, $eq_fn ) = @_;
328 25 100       48 return 0 if length $w1 != length $w2;
329 23         37 return _index_identical( $w1, $w2, $eq_fn ) == ( length $w1 ) - 1;
330             }
331            
332             sub _index_identical {
333 29     29   41 my ( $w1, $w2, $eq_fn ) = @_;
334 29         32 my $count = 0;
335 29         62 for my $i ( 0 .. _smallest_len( $w1, $w2 ) - 1 ) {
336 114 100       4851 if ( $eq_fn->( map { substr $_, $i, 1 } ( $w1, $w2 ) ) ) {
  228         410  
337 62         8479 $count++;
338             }
339             }
340 29         2011 return $count;
341             }
342            
343             =head1 DIAGNOSTICS
344            
345             =over 4
346            
347             =item Invalid value '...' given as a match level
348            
349             Argument B in new() or set_eq() needs to be an integer in range 0 .. 3, or undefined.
350            
351             =item Need a single character string to test for orthons
352            
353             Argument B for calculating ONC and OLD, and listing orthons, needs to be defined and not empty.
354            
355             =item Need a single character string to test for orthons
356            
357             Argument B should reference an array of character-strings when calculating ONC and OLD, and listing orthons.
358            
359             =back
360            
361             =head1 REFERENCES
362            
363             Coltheart, M., Davelaar, E., Jonasson, J. T., & Besner, D. (1977). Access to the internal lexicon. In S. Dornic (Ed.), I (Vol. 6, pp. 535-555). London, UK: Academic.
364            
365             Yarkoni, T., Balota, D. A., & Yap, M. (2008). Moving beyond Coltheart's I: A new measure of orthographic similarity. I, I<15>, 971-979. doi: L<10.3758/PBR.15.5.971|http://dx.doi.org/10.3758/PBR.15.5.971>.
366            
367             =head1 DEPENDENCIES
368            
369             L
370            
371             L
372            
373             L
374            
375             L
376            
377             L
378            
379             L
380            
381             L
382            
383             =head1 AUTHOR
384            
385             Roderick Garton, C<< >>
386            
387             =head1 SEE ALSO
388            
389             L
390            
391             L
392            
393             L
394            
395             =head1 BUGS AND LIMITATIONS
396            
397             Please report any bugs or feature requests to C, or through
398             the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
399            
400             =head1 SUPPORT
401            
402             You can find documentation for this module with the perldoc command.
403            
404             perldoc Lingua::Orthon
405            
406             You can also look for information at:
407            
408             =over 4
409            
410             =item * RT: CPAN's request tracker (report bugs here)
411            
412             L
413            
414             =item * AnnoCPAN: Annotated CPAN documentation
415            
416             L
417            
418             =item * CPAN Ratings
419            
420             L
421            
422             =item * Search CPAN
423            
424             L
425            
426             =back
427            
428             =head1 LICENSE AND COPYRIGHT
429            
430             Copyright 2011-2018 Roderick Garton.
431            
432             This program is free software; you can redistribute it and/or modify it
433             under the terms of either: the GNU General Public License as published
434             by the Free Software Foundation; or the Artistic License.
435            
436             See L for more information.
437            
438             =cut
439            
440             1; # End of Lingua::Orthon