File Coverage

blib/lib/Lingua/Norms/SUBTLEX.pm
Criterion Covered Total %
statement 257 296 86.8
branch 103 172 59.8
condition 11 15 73.3
subroutine 37 43 86.0
pod 24 24 100.0
total 432 550 78.5


line stmt bran cond sub pod time code
1             package Lingua::Norms::SUBTLEX;
2 8     8   216806 use 5.006;
  8         23  
  8         277  
3 8     8   36 use strict;
  8         10  
  8         274  
4 8     8   43 use warnings FATAL => 'all';
  8         18  
  8         329  
5 8     8   32 use Config;
  8         9  
  8         297  
6 8     8   32 use Carp qw(carp croak);
  8         13  
  8         387  
7 8     8   4010 use English '-no_match_vars';
  8         23463  
  8         35  
8 8     8   6311 use File::Slurp qw(read_dir);
  8         76935  
  8         474  
9 8     8   55 use File::Spec;
  8         9  
  8         137  
10 8     8   4026 use List::AllUtils qw(none);
  8         84164  
  8         661  
11 8     8   2663 use Statistics::Lite qw(max mean median stddev);
  8         6883  
  8         553  
12 8     8   4006 use String::Util qw(hascontent nocontent);
  8         31749  
  8         597  
13 8     8   3885 use Text::CSV::Separator qw(get_separator);
  8         12772  
  8         452  
14 8     8   3777 use Readonly;
  8         18589  
  8         26064  
15             Readonly my $YARKONI_MAX => 20;
16            
17             $Lingua::Norms::SUBTLEX::VERSION = '0.05';
18            
19             =head1 NAME
20            
21             Lingua::Norms::SUBTLEX - Retrieve frequency values and frequency-based lists for words from Subtitles Corpora
22            
23             =head1 VERSION
24            
25             Version 0.05
26            
27             =head1 SYNOPSIS
28            
29             use feature qw(say);
30             use Lingua::Norms::SUBTLEX;
31             my $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'US'); # or NL, UK, DE
32             my $bool = $subtlex->is_normed(string => 'fuip'); # isa_word ?
33             my $frq = $subtlex->frq_opm(string => 'frog'); # freq. per million, or get log/zipf
34             my $href = $subtlex->freqhash(words => [qw/frog fish ape/]); # freqs. for a list of words
35             say "$_ freq per mill = $href->{$_}" for keys %{$href};
36            
37             # stats, parts-of-speech, orthographic relations:
38             say "mean freq per mill = ", $subtlex->mean_frq(words => [qw/frog fish ape/]); # or median, std-dev.
39             say "frog part-of-speech = ", $subtlex->pos(string => 'frog');
40             my ($count, $orthons_aref) = $subtlex->on_count(string => 'frog'); # or scalar context for count only; or freq_max/mean
41             say "orthon of frog = $_" for @{$orthons_aref}; # e.g., from
42            
43             # retrieve (list of) words to certain specs:
44             my $aref = $subtlex->list_words(freq => [2, 400], onc => [1,], length => [4, 4], cv_pattern => 'CCVC', regex => '^f');
45             my $string = $subltex->random_word();
46            
47             =head1 DESCRIPTION
48            
49             The module facilitates access to raw data and descriptive statistics on word-frequency and parts-of-speech, as provided in the SUBTLEX-DE, SUBTLEX-NL, SUBTLEX-UK and SUBTLEX-US databases (see L). For example, the SUBTLEX-US database is based on a study of 74,286 letter-strings, with frequencies of occurrence within a corpus of some 30 million words from the subtitles of 8,388 film and television episodes. The frequency data obtained in this way have been shown to offer more psychologically predictive measures than those derived from books, newsgroup posts, and similar.
50            
51             There are three groups of retrievable stats and sampling rules: (1) frequency; (2)contextual diversity (number of films/episodes appeared in); and (3) parts-of-speech. Depending on the source language, frequency is given as a count (L), occurrences per million (L), logarithm of the opm (L), and/or 7-point scaled (L); contextual diversity is given as a count (L), a percentage (L), or a logarithm (L). For parts-of-speech, L returns a string giving the dominant part. Sampling is given by the same labels, with keys with min/max values (or a whitelist of acceptable parts-of-speech).
52            
53             A small sample from each of the databases is included in the installation distribution for testing purposes. The complete files need to be downloaded via the following URLs. The local directory location or actual pathname of these files can be given in class construction (by the arguments B and B); otherwise the default location--the directory "SUBTLEX" alongside the module itself in the locally configured Perl sitelib--will be used, and the correct file determined by inclusion of B value within its filename. The filenames of the original files downloaded from the following sites are supported in this way, and it does not matter if (as varies between the files) the fields are comma-separated or tab-delimited.
54            
55             The three databases (comprised of one file per language) do not provide values for all methods. All three provide values for only the methods frq_count, cd_count, cd_pct, and pos. Further details of unsupported methods per database/lang are given below.
56            
57             =over 4
58            
59             =item SUBTLEX-US
60            
61             For the B norms, install the file "SUBTLEXusExcel2007.csv" from L. All methods are supported by this database.
62            
63             =item SUBTLEX-UK
64            
65             For the B norms, install the file "SUBTLEX-UK.txt" from within the "SUBTLEX-UK.zip" archive via L. This database does not define values for occurrences per million (or log occurrences per million); the methods for these stats will return an empty string.
66            
67             =item SUBTLEX-NL
68            
69             For the B norms, install the file "SUBTLEX-NL.with-pos.txt" from within the archive "SUBTLEX-NL.with-pos.txt.zip" via L. This database does not define a value for Zipf frequency, so the "zipf" method will return an empty string if called with NL as the "lang".
70            
71             =item SUBTLEX-DE
72            
73             For the B norms, dowload the file "SUBTLEX-DE_cleaned_with_Google00.txt" via L. There is no CD, POS or Zipf data at this point, so only the "frq_" methods, and the "on_" methods (based on realtime calculation work with this language. The file contains other information, including Google-based frequencies, for which this module does not provide retrieval at this time.
74            
75             =back
76            
77             There are several other languages from this project which might be supported by this module in a later version (originally, only SUBTLEX-US was supported).
78            
79             =head1 SUBROUTINES/METHODS
80            
81             All methods are called via the class object, and with named (hash of) arguments, usually B, where relevant.
82            
83             =head2 new
84            
85             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'US'); # or 'UK', 'NL', 'DE' - looking in Perl sitelib
86             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'US', dir => 'file_location'); # where to look
87             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'US', path => 'actual_file');
88            
89             Returns a class object for accessing other methods. The parameter B should be set to specify the particular language database: DE (German), NL (Dutch), UK (British) or US (American); otherwise US (being the first published in the series) is the default. Optional arguments B or B can be given to specify the location or actual file (respectively) of the database. The default location is within the "Lingua/Norms/SUBTLEX" directory within the 'sitelib' configured for the local Perl installation (as per L). The method will C if the given B or default location cannot be found.
90            
91             =cut
92            
93             sub new {
94 9     9 1 3235 my ( $class, %args ) = @_;
95 9 50       38 my $self = bless {}, ref($class) ? ref($class) : $class;
96            
97             # determine database location:
98 9 100       31 my $lang = $args{'lang'} ? delete $args{'lang'} : 'US';
99 9         4588 my $mod_dir =
100             File::Spec->catdir( $Config{'sitelib'}, qw/Lingua Norms SUBTLEX/ );
101 9 50       16945 if ( $args{'path'} ) { # -by specific arg:
102 9         47 $self->{'path'} = $args{'path'};
103             }
104             else { # by dir and lang args:
105 0         0 my $dir;
106 0 0       0 if ( $args{'dir'} ) { # check it's a dir:
107 0 0       0 croak "Value for argument 'dir' ($args{'dir'}) is not a directory"
108             if !-d $args{'dir'};
109 0         0 $dir = delete $args{'dir'};
110             }
111             else { # use module's dir :
112 0         0 $dir = $mod_dir;
113             }
114 0         0 for ( read_dir($dir) ) {
115 0 0       0 if (/(?:SUBTLEX\-)?\Q$lang/imsx) {
116 0         0 $self->{'path'} = File::Spec->catfile( $dir, $_ );
117 0         0 last;
118             }
119             }
120             }
121 9 50       47 croak "Cannot find required database for language $lang"
122             if nocontent( $self->{'path'} ); # or !-T $self->{'path'};
123            
124             # determine delimiter:
125 9         151 $self->{'delim'} = get_separator( path => $self->{'path'}, lucky => 1 );
126            
127             # identify needed field indices within this file:
128             # - arg 'fieldpath' only used for testing, assumed not useful for user:
129 9         1694 my %fields = ( US => 1, UK => 2, NL => 3, DE => 4 );
130 9 50       35 croak 'Cannot determine fields for given language'
131             if nocontent( $fields{$lang} );
132 9 50       125 my $fieldpath =
133             $args{'fieldpath'}
134             ? $args{'fieldpath'}
135             : File::Spec->catfile( $mod_dir, 'fields.csv' );
136 9 50       338 open( my $fh, q{<}, $fieldpath ) or carp 'Cannot determine field indices';
137 9         90 while (<$fh>) {
138 117 100       850 next if $INPUT_LINE_NUMBER == 1;
139 108         90 chomp;
140 108         239 my @dat = split m/[,]/msx;
141 108 50       209 $self->{'IDX'}->{ $dat[0] } =
142             hascontent( $dat[ $fields{$lang} ] ) ? $dat[ $fields{$lang} ] : q{};
143             }
144 9 50       149 close $fh or croak $OS_ERROR;
145 9         53 return $self;
146             }
147            
148             =head2 Frequencies and POS for individual words or word-lists
149            
150             =head3 is_normed
151            
152             $bool = $subtlex->is_normed(string => $word);
153            
154             I: isa_word
155            
156             Returns a boolean value to specify whether or not the letter-string passed as B is represented in the SUBTLEX corpus. This might be thought of as a lexical decision ("is this string a word?") but note that some very low frequency letter-strings in the corpus would not be considered words in the average context (perhaps, in part, because of misspelt subtitles).
157            
158             =cut
159            
160             sub is_normed {
161 2     2 1 11 my ( $self, %args ) = @_;
162 2 50       8 croak 'No string to test; pass a string to the function'
163             if nocontent( $args{'string'} );
164 2         15 my $str = $args{'string'};
165 2         2 my $res = 0; # boolean to return from this sub
166 2 50       54 open my $fh, q{<}, $self->{'path'} or croak $OS_ERROR;
167 2         17 while (<$fh>) {
168 35 100       46 next if $INPUT_LINE_NUMBER == 1; # skip headings
169 33 50       108 /^([^\Q$self->{'delim'}\E]+)/msx
170             or next; # isolate first token ahead of delimiter as $1 in this file
171 33 100       96 if ( $str eq $1 ) { # first token equals given string?
172 1         2 $res = 1; # set result to return as true
173 1         2 last; # got it, so abort look-up
174             }
175             }
176 2 50       14 close $fh or croak $OS_ERROR;
177 2         14 return $res; # zero if string not found in file
178             }
179             *isa_word = \&is_normed;
180            
181             =head3 frq_count
182            
183             $val = $subtlex->frq_count(string => 'aword');
184            
185             Returns the raw number of occurrences in all the films/TV episodes for the word passed as B, or the empty-string if the string is not represented in the norms.
186            
187             =cut
188            
189             sub frq_count {
190 0     0 1 0 my ( $self, %args ) = @_;
191 0         0 return _get_fieldvalue( $self, $args{'string'},
192             $self->{'IDX'}->{'frq_count'} );
193             }
194             *freq = \&frq_opm;
195            
196             =head3 frq_opm
197            
198             $val = $subtlex->frq_opm(string => 'aword');
199            
200             I: opm
201            
202             Returns frequency per million for the word passed as B, or the empty-string if the string is not represented in the norms.
203            
204             =cut
205            
206             sub frq_opm {
207 2     2 1 351 my ( $self, %args ) = @_;
208 2         11 return _get_fieldvalue( $self, $args{'string'},
209             $self->{'IDX'}->{'frq_opm'} );
210             }
211             *freq = \&frq_opm; # legacy only
212             *opm = \&frq_opm;
213            
214             =head3 frq_log
215            
216             $val = $subtlex->frq_log(string => 'aword');
217            
218             Returns log frequency per million for the word passed as B, or the empty-string if the string is not represented in the norms.
219            
220             =cut
221            
222             sub frq_log {
223 2     2 1 7 my ( $self, %args ) = @_;
224 2         7 return _get_fieldvalue( $self, $args{'string'},
225             $self->{'IDX'}->{'frq_log'} );
226             }
227             *lfreq = \&frq_log; # legacy only
228            
229             =head3 frq_zipf
230            
231             $val = $subtlex->frq_zipf(string => 'aword');
232            
233             Returns zipf frequency for the word passed as B, or the empty-string if the string is not represented in the norms. The Zipf scale ranges from 1 to 7, with values of 1-3 representing low frequency words, and values of 4-7 representing high frequency words. See Van Heuven et al. (2014) and L.
234            
235             =cut
236            
237             sub frq_zipf {
238 2     2 1 7 my ( $self, %args ) = @_;
239 2         7 return _get_fieldvalue( $self, $args{'string'},
240             $self->{'IDX'}->{'frq_zipf'} );
241             }
242             *zipf = \&frq_zipf; # legacy only
243            
244             =head3 cd_count
245            
246             $cd = $subtlex->cd_count(string => 'aword');
247            
248             Corresponds to the column labelled "CDcount" in the datafile.
249            
250             =cut
251            
252             sub cd_count {
253 0     0 1 0 my ( $self, %args ) = @_;
254 0         0 return _get_fieldvalue( $self, $args{'string'},
255             $self->{'IDX'}->{'cd_count'} );
256             }
257            
258             =head3 cd_pct
259            
260             $cd = $subtlex->cd_pct(string => 'aword');
261            
262             Returns a percentage measure to two decimal places of the number of films/TV episodes in which the given string was included in its subtitles. This corresponds to the measure "SUBTLCD" described in Brysbaert and New (2009). Note: where "cd" stands for "contextual diversity."
263            
264             =cut
265            
266             sub cd_pct {
267 2     2 1 7 my ( $self, %args ) = @_;
268 2         9 return _get_fieldvalue( $self, $args{'string'},
269             $self->{'IDX'}->{'cd_pct'} );
270             }
271            
272             =head3 cd_log
273            
274             Returns log10(L + 1) for the given string, with 4-digit precision. This corresponds to the measure "Lg10CD" described in Brysbaert and New (2009), where it is stated that "this is the best value to use if one wants to match words on word frequency" (p. 988). Note: "cd" stands for "contextual diversity," which is based on the number of films and TV episodes in which the string was represented.
275            
276             =cut
277            
278             sub cd_log {
279 2     2 1 6 my ( $self, %args ) = @_;
280 2         9 return _get_fieldvalue( $self, $args{'string'},
281             $self->{'IDX'}->{'cd_log'} );
282             }
283            
284             =head3 frq_hash
285            
286             $href = $subtlex->frq_hash(strings => [qw/word1 word2/], scale => opm|log|zipf);
287            
288             Returns frequency as values within a reference to a hash keyed by the words passed as B. By default, the values in the hash are corpus frequency per million. If the optional argument B is defined, and it equals I, then the values are log-frequency; similarly, I yields zipf-frequency. Note, however, that some databases do not support all types of scales; in which case the returned value will be the empty string.
289            
290             =cut
291            
292             sub frq_hash {
293 8     8 1 2204 my ( $self, %args ) = @_;
294 8 50       23 croak
295             'No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array'
296             if !$args{'strings'};
297 8 50       20 my $strs =
298             ref $args{'strings'}
299             ? $args{'strings'}
300             : croak 'No reference to an array of letter-strings found';
301 8 100       26 my $col_i =
302             hascontent( $args{'scale'} )
303             ? $self->{'IDX'}->{ $args{'scale'} }
304             : $self->{'IDX'}->{'frq_opm'};
305 21         57 my %frq = map { lc($_) => [ undef, $_ ] }
  8         16  
306 8         63 @{$strs}; # keep lower-case to search associated with original case
307 8 50       282 open my $fh, q{<}, $self->{'path'} or croak "$OS_ERROR\n";
308 8         74 while (<$fh>) {
309 240 100       1417 next if $INPUT_LINE_NUMBER == 1;
310 232         185 chomp;
311 232         1089 my @line = split m/\Q$self->{'delim'}\E/msx;
312 232 100       599 if ( hascontent( $frq{ lc( $line[0] ) } ) ) {
313 21         209 $frq{ lc( $line[0] ) }->[0] = $line[$col_i];
314             }
315             }
316 8 50       153 close $fh or croak $OS_ERROR;
317 8         17 return { map { $_->[1] => _nummify($_->[0]) }
  21         34  
318             values %frq }; # assign values to original case strings
319             }
320             *freqhash = \&frq_hash;
321            
322             =head3 pos
323            
324             $pos_str = $subtlex->pos(string => 'aword');
325            
326             Returns part-of-speech string for a given word. The return value is undefined if the word is not found.
327            
328             =cut
329            
330             sub pos {
331 1     1 1 7 my ( $self, %args ) = @_;
332 1 50       3 croak 'No string to test' if !$args{'string'};
333 1         1 my $word = $args{'string'};
334 1         1 my $pos;
335 1 50       25 open my $fh, q{<}, $self->{'path'} or croak "$OS_ERROR\n";
336 1         9 while (<$fh>) {
337 3 100       7 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
338 2         29 /^([^\Q$self->{'delim'}\E]+)/msx;
339 2 100       8 if ( $word eq $1 ) {
340 1         5 chomp;
341 1         12 my @line = split m/\Q$self->{'delim'}\E/msx, $_;
342 1         5 $pos = $line[ $self->{'IDX'}->{'pos'} ];
343 1         2 last;
344             }
345             }
346 1 50       9 close $fh or croak $OS_ERROR;
347 1         10 return $pos;
348             }
349            
350             =head2 Descriptive frequency statistics for lists
351            
352             These methods return a descriptive statistic (mean, median or standard deviation) for a list of B. Like L, they take an optional argument B to specify if the returned values should be raw frequencies per million, log frequencies, or zip-frequencies.
353            
354             =head3 frq_mean
355            
356             $mean = $subtlex->frq_mean(strings => [qw/word1 word2/], scale => 'raw|log|zipf');
357            
358             Returns the arithmetic mean of the frequencies for the given B, or mean of the log frequencies if B => 1.
359            
360             =cut
361            
362             sub frq_mean {
363 3     3 1 665 my ( $self, %args ) = @_;
364 3         3 return mean( values %{ $self->freqhash(%args) } );
  3         9  
365             }
366             *mean_freq = \&frq_mean;
367            
368             =head3 frq_median
369            
370             $median = $subtlex->frq_median(words => [qw/word1 word2/], scale => 'raw|log|zipf');
371            
372             Returns the median of the frequencies for the given B, or median of the log frequencies if B => 1.
373            
374             =cut
375            
376             sub frq_median {
377 1     1 1 268 my ( $self, %args ) = @_;
378 1         2 return median( values %{ $self->freqhash(%args) } );
  1         4  
379             }
380             *median_freq = \*frq_median;
381            
382             =head3 frq_sd
383            
384             $sd = $subtlex->frq_sd(words => [qw/word1 word2/], scale => 'raw|log|zipf');
385            
386             Returns the standard deviation of the frequencies for the given B, or standard deviation of the log frequencies if B => 1.
387            
388             =cut
389            
390             sub frq_sd {
391 1     1 1 317 my ( $self, %args ) = @_;
392 1         1 return stddev( values %{ $self->freqhash(%args) } );
  1         4  
393             }
394             *sd_freq = \*frq_sd;
395            
396             =head2 Orthographic neighbourhood measures
397            
398             These methods return stats re the orthographic relatedness of a specified letter-B to words in the SUBTLEX corpus. Unless otherwise stated, an orthographic neighbour here means letter-strings that are identical except for a single-letter substitution while holding string-length constant, i.e., the Coltheart I of a letter-string, as defined in Coltheart et al. (1977). These measures are calculated in realtime; they are not listed in the datafile for look-up, so expect some extra-normal delay in getting a returned value.
399            
400             =head3 on_count
401            
402             $n = $subtlex->on_count(string => $letters);
403             ($n, $orthons_aref) = $subtlex->on_count(string => $letters);
404            
405             Returns orthographic neighbourhood count (Coltheart I) within the SUBTLEX corpus. Called in array context, also returns a reference to an array of the neighbours retrieved, if any.
406            
407             =cut
408            
409             sub on_count {
410 14     14 1 363 my ( $self, %args ) = @_;
411 14 50       26 croak 'No string to test' if !$args{'string'};
412 14         23 my $word = lc( $args{'string'} );
413 14         961 require Lingua::Orthon;
414 14         47255 my $ortho = Lingua::Orthon->new();
415 14         466 my ( $z, @orthons ) = (0);
416 14 50       489 open my $fh, q{<}, $self->{'path'} or die "$OS_ERROR\n";
417 14         137 while (<$fh>) {
418 420 100       2928 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
419 406 50       1292 /^([^\Q$self->{'delim'}\E]+)/msx or next;
420 406         470 my $test = lc($1);
421 406 100       612 if ( $ortho->are_orthons( $word, $test ) ) {
422 39         440 push @orthons, $test;
423 39         96 $z++;
424             }
425             }
426 14 50       223 close $fh or croak $OS_ERROR;
427 14 100       121 return wantarray ? ( $z, \@orthons ) : $z;
428             }
429            
430             =head3 on_frq_max
431            
432             $m = $subtlex->on_frq_max(string => $letters);
433            
434             Returns the maximum SUBTLEX frequency per million among the orthographic neighbours (per Coltheart I) of a particular letter-string. If (unusually) all the frequencies are the same, then that value is returned. If the string has no (Coltheart-type) neighbours, undef is returned.
435            
436             =cut
437            
438             sub on_frq_max {
439 1     1 1 2776 my ( $self, %args ) = @_;
440 1 50       4 croak 'No string to test' if !$args{'string'};
441 1         6 my $frq_aref = _get_orthon_f(
442             $args{'string'}, $self->{'path'},
443             $self->{'IDX'}->{'frq_opm'}, $self->{'delim'}
444             );
445 1 50       1 return scalar @{$frq_aref} ? max( @{$frq_aref} ) : undef;
  1         5  
  1         6  
446             }
447             *on_freq_max = \&on_frq_max;
448            
449             =head3 on_frq_opm_mean
450            
451             $m = $subtlex->on_frq_mean(string => $letters);
452            
453             Returns the mean SUBTLEX frequencies per million of the orthographic neighbours (per Coltheart I) of a particular letter-string. If the string has no (Coltheart-type) neighbours, undef is returned.
454            
455             =cut
456            
457             sub on_frq_opm_mean {
458 1     1 1 270 my ( $self, %args ) = @_;
459 1 50       4 croak 'No string to test' if !$args{'string'};
460 1         4 my $frq_aref = _get_orthon_f(
461             $args{'string'}, $self->{'path'},
462             $self->{'IDX'}->{'frq_opm'}, $self->{'delim'}
463             );
464 1 50       2 return scalar @{$frq_aref} ? mean( @{$frq_aref} ) : undef;
  1         3  
  1         5  
465             }
466             *on_freq_mean = \&on_frq_opm_mean;
467            
468             =head3 on_frq_log_mean
469            
470             $m = $subtlex->on_frq_log_mean(string => $letters);
471            
472             Returns the mean log of SUBTLEX frequencies of the orthographic neighbours (per Coltheart I) of a particular letter-string. If the string has no (Coltheart-type) neighbours, undef is returned.
473            
474             =cut
475            
476             sub on_frq_log_mean {
477 0     0 1 0 my ( $self, %args ) = @_;
478 0 0       0 croak 'No string to test' if !$args{'string'};
479 0         0 my $frq_aref = _get_orthon_f(
480             $args{'string'}, $self->{'path'},
481             $self->{'IDX'}->{'frq_log'}, $self->{'delim'}
482             );
483 0 0       0 return scalar @{$frq_aref} ? mean( @{$frq_aref} ) : undef;
  0         0  
  0         0  
484             }
485             *on_lfreq_mean = \&on_frq_log_mean;
486            
487             =head3 on_frq_zipf_mean
488            
489             $m = $subtlex->on_frq_zipf_mean(string => $letters);
490            
491             Returns the mean zipf of SUBTLEX frequencies of the orthographic neighbours (per Coltheart I) of a given letter-string. If the string has no (Coltheart-type) neighbours, undef is returned.
492            
493             =cut
494            
495             sub on_frq_zipf_mean {
496 1     1 1 262 my ( $self, %args ) = @_;
497 1 50       4 croak 'No string to test' if !$args{'string'};
498 1         5 my $frq_aref = _get_orthon_f(
499             $args{'string'}, $self->{'path'},
500             $self->{'IDX'}->{'frq_zipf'}, $self->{'delim'}
501             );
502 1 50       2 return scalar @{$frq_aref} ? mean( @{$frq_aref} ) : undef;
  1         4  
  1         3  
503             }
504             *on_zipf_mean = \&on_frq_zipf_mean;
505            
506             =head3 on_ldist
507            
508             $m = $subtlex->on_ldist(string => $letters, lim => 20);
509            
510             I: ldist
511            
512             Returns the mean L from a letter-string to its B closest orthographic neighbours. The default Bit is 20, as defined in Yarkoni et al. (2008). The module uses the matrix-based calculation of Levenshtein Distance as implemented in L module. No defined value is returned if no Levenshtein Distance is found (whereas zero would connote "identical to everything").
513            
514             =cut
515            
516             sub on_ldist {
517 2     2 1 474 my ( $self, %args ) = @_;
518 2 50       8 croak 'No string to test' if !$args{'string'};
519 2   33     18 $args{'lim'} ||= $YARKONI_MAX;
520 2         23 return _get_orthon_ldist( $args{'string'}, $self->{'path'}, $args{'lim'},
521             $self->{'delim'} );
522             }
523             *ldist = \&on_ldist;
524            
525             =head2 Retrieving letter-strings/words
526            
527             =head3 list_strings
528            
529             $aref = $subtlex->list_words(freq => [1, 20], onc => [0, 3], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
530             $aref = $subtlex->list_words(zipf => [0, 2], onc => [0, 3], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
531            
532             I: list_words
533            
534             Returns a list of words from the SUBTLEX corpus that satisfies certain criteria: minimum and/or maximum letter-length (specified by the named argument B), minimum and/or maximum frequency (argument B) or zip-frequency (argument B), minimum and/or maximum orthographic neighbourhood count (argument B), a consonant-vowel pattern (argument B), or a specific regular expression (argument B).
535            
536             For the minimum/maximum constrained criteria, the two limits are given as a referenced array where the first element is the minimum and the second element is the maximum. For example, [3, 7] would specify letter-strings of 3 to 7 letters in length; [4, 4] specifies letter-strings of only 4 letters in length. If only one of these is to be constrained, then the array would be given as, e.g., [3] to specify a minimum of 3 letters without constraining the maximum, or ['',7] for a maximum of 7 letters without constraining the minimum (checking if the element C as per String::Util).
537            
538             The consonant-vowel pattern is specified as a string by the usual convention, e.g., 'CCVCC' defines a 5-letter word starting and ending with pairs of consonants, the pairs separated by a vowel. 'Y' is defined here as a consonant.
539            
540             A finer selection of particular letters can be made by giving a regular expression as a string to the B argument. In the example above, only letter-strings starting with the letter 'f', followed by one of more other letters, are specified. Alternatively, for example, '[^aeiouy]$' specifies that the letter-strings must not end with a vowel (here including 'y'). The entire example for '^f', including the shown arguments for B, B, B and B, would return only two words: I and I from SUBTLEX-US.
541            
542             The selection procedure will be made particularly slow wherever B is specified (as this has to be calculated in real-time) and no arguments are given for B and B (which are tested ahead of any other criteria).
543            
544             Syllable-counts might be added in future; existing algorithms in the Lingua family are not sufficiently reliable for the purposes to which the present module might often be put; an alternative is being worked on.
545            
546             The value returned is always a reference to the list of words retrieved (or to an empty list if none was retrieved).
547            
548             =cut
549            
550             sub list_strings {
551 3     3 1 5714 my ( $self, %args ) = @_;
552 3         7 my %patterns = ();
553 3 100       14 if ( hascontent( $args{'regex'} ) ) {
554 1         25 $patterns{'regex'} = qr/$args{'regex'}/msx;
555             }
556 3 100       18 if ( hascontent( $args{'cv_pattern'} ) ) {
557 1         7 my $tmp = q{};
558 1         5 my @c = split m//msx, uc( $args{'cv_pattern'} );
559 1         3 foreach (@c) {
560 4 100       9 $tmp .= $_ eq 'C' ? '[BCDFGHJKLMNPQRSTVWXYZ]' : '[AEIOU]';
561             }
562 1         74 $patterns{'cv_pattern'} = qr/^$tmp$/imsx;
563             }
564            
565 3         13 my @list = ();
566 3 50       129 open my $fh, q{<}, $self->{'path'} or croak $OS_ERROR;
567             LINES:
568 3         41 while (<$fh>) {
569 90 100       160 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
570 87         90 chomp;
571 87         472 my @line = split m/\Q$self->{'delim'}\E/msx;
572 87 100       103 next if !_in_range( length( $line[0] ), @{ $args{'length'} } );
  87         137  
573 33         67 for ( keys %patterns ) {
574 7 100       36 next LINES if $line[0] !~ $patterns{$_};
575             }
576 32         55 for (qw/freq opm frq_opm frq_log log frq_zipf zipf cd_pct cd_log/) {
577 128 100       198 if ( ref $args{$_} ) {
578             next LINES
579 32         46 if !_in_range( _nummify($line[ $self->{'IDX'}->{$_} ]),
580 32 100       59 @{ $args{$_} } );
581             }
582             }
583 12 50       23 if ( ref $args{'pos'} ) {
584             next LINES
585 0     0   0 if none { $_ eq $line[ $self->{'IDX'}->{'pos'} ] }
586 0 0       0 @{ $args{'pos'} };
  0         0  
587             }
588             next LINES
589 12         34 if !_in_range( scalar( $self->on_count( string => $line[0] ) ),
590 12 100       29 @{ $args{'onc'} } );
591 10         62 push @list, $line[0];
592             }
593 3 50       22 close $fh or croak;
594            
595 3         21 return \@list;
596             }
597             *list_words = \&list_strings;
598            
599             =head3 all_strings
600            
601             $aref = $subtlex->all_strings();
602            
603             I: all_words
604            
605             Returns a reference to an array of all letter-strings in the corpus, in their given order.
606            
607             =cut
608            
609             sub all_strings {
610 0     0 1 0 my ( $self, %args ) = @_;
611 0         0 my @list = ();
612 0 0       0 open my $fh, q{<}, $self->{'path'} or croak $OS_ERROR;
613 0         0 while (<$fh>) {
614 0 0       0 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
615 0 0       0 /^([^\Q$self->{'delim'}\E]+)/msx or next;
616 0         0 push @list, $1;
617             }
618 0 0       0 close $fh or croak;
619 0         0 return \@list;
620             }
621             *all_words = \&all_strings;
622            
623             =head3 random_string
624            
625             $string = $subtlex->random_string();
626             @data = $subtlex->random_string();
627            
628             I: random_word
629            
630             Picks a random line from the corpus, using L (except the top header line). Returns the word in that line if called in scalar context; otherwise, the array of data for that line. (A future version might let specifying a match to specific criteria, self-aborting after trying X lines.)
631            
632             =cut
633            
634             sub random_string {
635 2     2 1 372 my ( $self, %args ) = @_;
636 2         3 eval { require File::RandomLine; };
  2         496  
637 2 50       4964 croak 'Need to install and access module File::RandomLine' if $EVAL_ERROR;
638 2         20 my $rl =
639             File::RandomLine->new( $self->{'path'}, { algorithm => 'uniform' } );
640 2         240 my @ari = ();
641 2   66     7 while ( not scalar @ari or $ari[0] eq 'Word' ) {
642 2         20 @ari = split m/\Q$self->{'delim'}\E/msx, $rl->next;
643             }
644 2 100       142 return wantarray ? @ari : $ari[0];
645             }
646             *random_word = \&random_string;
647            
648             =head2 Miscellaneous
649            
650             =head3 nlines
651            
652             $num = $subtlex->nlines();
653            
654             Returns the number of lines, less the column headings, in the installed language file. Expects/accepts no arguments.
655            
656             =cut
657            
658             sub nlines {
659 0     0 1 0 my $self = shift;
660 0         0 my $z = 0;
661 0 0       0 open( my $fh, q{<}, $self->{'path'} ) or croak "$OS_ERROR\n";
662 0         0 while (<$fh>) {
663 0 0       0 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
664 0         0 $z++;
665             }
666 0 0       0 close $fh or croak $OS_ERROR;
667 0         0 return $z;
668             }
669            
670             ### PRIVATMETHODEN:
671            
672             sub _get_orthon_f {
673 3     3   5 my ( $str, $path, $idx, $delim ) = @_;
674 3         5 my $word = lc($str);
675 3         12 require Lingua::Orthon;
676 3         12 my $ortho = Lingua::Orthon->new();
677 3         96 my @freqs = ();
678 3 50       102 open( my $fh, q{<}, $path ) or croak $OS_ERROR;
679 3         45 while (<$fh>) {
680 90 100       473 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
681 87 50       297 /^([^\Q$delim\E]+)/xsm or next; # capture first token
682 87         113 my $test = lc($1);
683 87 100       138 if ( $ortho->are_orthons( $word, $test ) ) { # Lingua::Orthon method
684 33         376 chomp;
685 33         146 my @line = split m/\Q$delim\E/xsm;
686 33         65 push @freqs, _nummify($line[$idx]);
687             }
688             }
689 3 50       25 close $fh or croak $OS_ERROR;
690 3         22 return \@freqs;
691             }
692            
693             sub _get_orthon_ldist {
694 2     2   4 my ( $str, $path, $lim, $delim ) = @_;
695 2         4 my $word = lc($str);
696 2         15 require Lingua::Orthon;
697 2         12 my $ortho = Lingua::Orthon->new();
698 2         73 my @ldists = ();
699 2 50       106 open( my $fh, q{<}, $path ) or croak $OS_ERROR;
700 2         4 my @tests = ();
701 2         45 while (<$fh>) {
702 86 100       270633 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
703 84 50       550 /^([^\Q$delim\E]+)/msx or next;
704 84         176 my $test = lc($1);
705 84 100       455 next if $word eq $test;
706 82         188 push @ldists, $ortho->ldist( $word, $test );
707             }
708 2 50       6365 close $fh or croak $OS_ERROR;
709 2         19 my @sorted = sort { $a <=> $b } @ldists;
  252         169  
710 2         21 return mean( @sorted[ 0 .. $lim - 1 ] );
711             }
712            
713             sub _get_fieldvalue {
714 10     10   14 my ( $self, $str, $col_i ) = @_;
715 10 50       24 croak
716             'No word to test; pass a letter-string named \'string\' to the function'
717             if nocontent($str);
718 10         102 $str = lc($str);
719 10         13 my $val = q{}; # default value returned is empty string
720 10 50       348 open( my $fh, q{<}, $self->{'path'} ) or croak $OS_ERROR;
721 10         104 while (<$fh>) {
722 135 100       220 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
723 125         439 /^([^\Q$self->{'delim'}\E]+)/msx;
724 125 100       453 if ( $str eq $1 ) {
725 10         14 chomp; # or zipf will return with "\n" appended
726 10         81 my @line = split m/\Q$self->{'delim'}\E/msx, $_;
727 10         34 $val = _nummify($line[$col_i]);
728 10         23 last;
729             }
730             }
731 10 50       90 close $fh or croak;
732 10         101 return $val;
733             }
734            
735             sub _in_range {
736 131     131   104 my ( $n, $min, $max ) = @_;
737 131         100 my $res = 1;
738 131 100 100     178 if ( hascontent($min) and $n < $min ) { # fails min
739 48         372 $res = 0;
740             }
741 131 100 100     768 if ( $res && ( hascontent($max) and $n > $max ) ) { # fails max and min
      66        
742 28         178 $res = 0;
743             }
744 131         679 return $res;
745             }
746            
747             sub _nummify {
748 96     96   92 my $val = shift;
749 96         100 $val =~ s/,([^,]+)$/.$1/; # replace ultimate , with .
750 96         284 return $val;
751             }
752            
753             =head1 DIAGNOSTICS
754            
755             =over 4
756            
757             =item Cannot determine field indices
758            
759             When constructing the class object with L, the module needs to read in the contents of a file named "fields.csv" which should be housed within the SUBTLEX directory where the module itself is located (alongside the downloaded SUBTLEX files). This is necessary because the field indices for the various stats vary from one language file to the next. This should have been done with installation of the module itself. Check that this file is indeed within the Perl/site/lib/Lingua/Norms/SUBTLEX directory. If it is not, download and install the file to that location via the L package of this module.
760            
761             =item Value given to argument 'dir' (VALUE) in new() is not a directory
762            
763             Croaked from L if called with a value for the argument B, and this value is not actually a directory/folder. This is the directory/folder in which the actual SUBTLEX datafiles should be located.
764            
765             =item Cannot find required database for language $lang
766            
767             Croaked from L if none of the given values to arguments B, B or B are valid, and even the default site/lib directory and US database are not accessible. Check that your have indeed a file with the given value of B (DE, NL, UK or US) within the Perl/site/lib/Lingua/Norms/SUBTLEX directory, or at least that the SUBTLEX-US file is located within it, and can be read via your script.
768            
769             =item Cannot determine fields for given language
770            
771             Croaked upon construction if no fields are recognized for the given language. The value given to B must be one of DE, NL, UK or US.
772            
773             =item No string to test; pass a string to the function
774            
775             Croaked by several methods that expect a value for the named argument B, and when no such value is given. These methods require the letter-string to be passed to it as a I => I pair, with the key B followed by the value of the string to test.
776            
777             =item No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array
778            
779             Same as above but specifically croaked by L which accepts more than one string in a single call.
780            
781             =item Need to install and have access to module File::RandomLine
782            
783             Croaked by method L if the module it depends on (File::RandomLine) is not installed or accessible. This should have been installed (if not already) upon installation of the present module. See L to download and install this module manually.
784            
785             =back
786            
787             =head1 DEPENDENCIES
788            
789             L : needed to work L.
790            
791             L : handy for directory reading when calling L.
792            
793             L : needed to calculate Levenshtein Distance, assessing orthographic neighbourhood.
794            
795             L : handy C function.
796            
797             L : needed for the various statistical methods.
798            
799             L : utilities for determining valid string values.
800            
801             L : depended upon to determine the delimiter (comma or tab) within the datafiles.
802            
803             =head1 REFERENCES
804            
805             B (2011). The word frequency effect: A review of recent developments and implications for the choice of frequency estimates in German. I, I<58>, 412-424. doi: L<10.1027/1618-3169/a000123|http://dx.doi.org/10.1027/1618-3169/a000123>
806            
807             B (2009). Moving beyond Kucera and Francis: A critical evaluation of current word frequency norms and the introduction of a new and improved word frequency measure for American English. I, I<41>, 977-990. doi: L<10.3758/BRM.41.4.977|http://dx.doi.org/10.3758/BRM.41.4.977>
808            
809             B (2012). Adding part-of-speech information to the SUBTLEX-US word frequencies. I, I<44>, 991-997. doi: L<10.3758/s13428-012-0190-4|http://dx.doi.org/10.3758/s13428-012-0190-4>
810            
811             B (1977). Access to the internal lexicon. In S. Dornic (Ed.), I (Vol. 6, pp. 535-555). London, UK: Academic.
812            
813             B (2010). SUBTLEX-NL: A new frequency measure for Dutch words based on film subtitles. I, I<42>, 643-650. doi: L<10.3758/BRM.42.3.643|http://dx.doi.org/10.3758/BRM.42.3.643>
814            
815             B (2014). SUBTLEX-UK: A new and improved word frequency database for British English. I, I<67>, 1176-1190. doi: L<10.1080/17470218.2013.850521|http://dx.doi.org/10.1080/17470218.2013.850521>
816            
817             B (2008). Moving beyond Coltheart's I: A new measure of orthographic similarity. I, I<15>, 971-979. doi: L<10.3758/PBR.15.5.971|http://dx.doi.org/10.3758/PBR.15.5.971>
818            
819             =head1 AUTHOR
820            
821             Roderick Garton, C<< >>
822            
823             =head1 BUGS AND LIMITATIONS
824            
825             Please report any bugs or feature requests to C, or through
826             the web interface at L. I will be notified, and then you'll
827             automatically be notified of progress on your bug as I make changes.
828            
829             =head1 SUPPORT
830            
831             You can find documentation for this module with the perldoc command.
832            
833             perldoc Lingua::Norms::SUBTLEX
834            
835            
836             You can also look for information at:
837            
838             =over 4
839            
840             =item * RT: CPAN's request tracker (report bugs here)
841            
842             L
843            
844             =item * AnnoCPAN: Annotated CPAN documentation
845            
846             L
847            
848             =item * CPAN Ratings
849            
850             L
851            
852             =item * Search CPAN
853            
854             L
855            
856             =back
857            
858             =head1 LICENSE AND COPYRIGHT
859            
860             Copyright 2014-2015 Roderick Garton.
861            
862             This program is free software; you can redistribute it and/or modify it
863             under the terms of either: the GNU General Public License as published
864             by the Free Software Foundation; or the Artistic License.
865            
866             See L for more information.
867            
868             To the maximum extent permitted by applicable law, the author of this module disclaims all warranties, either express or implied, including but not limited to implied warranties of merchantability and fitness for a particular purpose, with regard to the software and the accompanying documentation.
869            
870             =cut
871            
872             1; # End of Lingua::Norms::SUBTLEX
873