File Coverage

blib/lib/Lingua/Norms/SUBTLEX.pm
Criterion Covered Total %
statement 335 377 88.8
branch 103 174 59.2
condition 15 29 51.7
subroutine 60 68 88.2
pod 29 29 100.0
total 542 677 80.0


line stmt bran cond sub pod time code
1             package Lingua::Norms::SUBTLEX;
2 17     17   1177150 use 5.12.0;
  17         173  
3 17     17   80 use strict;
  17         39  
  17         377  
4 17     17   74 use warnings FATAL => 'all';
  17         35  
  17         584  
5 17     17   92 use base qw(Lingua::Orthon);
  17         30  
  17         7859  
6 17     17   523329 use Config;
  17         73  
  17         605  
7 17     17   76 use Carp qw(carp croak);
  17         28  
  17         754  
8            
9             #use Encode qw(encode decode);
10             #use Encode::Guess;
11 17     17   7060 use English '-no_match_vars';
  17         48711  
  17         82  
12 17     17   4734 use File::Spec;
  17         26  
  17         356  
13 17     17   71 use List::AllUtils qw(all any first firstidx none uniq);
  17         25  
  17         903  
14 17     17   85 use Number::Misc qw(is_numeric);
  17         29  
  17         596  
15 17     17   11120 use Path::Tiny;
  17         156810  
  17         794  
16 17     17   7287 use Readonly;
  17         52891  
  17         791  
17 17     17   104 use Statistics::Lite qw(count max mean median stddev sum);
  17         34  
  17         846  
18 17     17   6278 use String::Trim qw(trim);
  17         8220  
  17         878  
19 17     17   100 use String::Util qw(hascontent crunch fullchomp nocontent unquote);
  17         32  
  17         761  
20 17     17   6931 use Text::CSV::Hashify;
  17         1413843  
  17         992  
21 17     17   6956 use Text::CSV::Separator qw(get_separator);
  17         26596  
  17         889  
22 17     17   7223 use Text::Unidecode;
  17         25313  
  17         46933  
23            
24             #use open ':encoding(utf8)';
25            
26             $Lingua::Norms::SUBTLEX::VERSION = '0.06';
27            
28             =pod
29            
30             =encoding utf8
31            
32             =head1 NAME
33            
34             Lingua::Norms::SUBTLEX - Retrieve word frequencies and related values and lists from subtitles corpora
35            
36             =head1 VERSION
37            
38             This is documentation for B of Lingua::Norms::SUBTLEX.
39            
40             =head1 SYNOPSIS
41            
42             use Lingua::Norms::SUBTLEX 0.06;
43             my $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'UK');
44            
45             # Is the string 'frog' in the subtitles corpus?
46             my $bool = $subtlex->is_normed(string => 'frog');
47            
48             # Occurrences-per-million:
49             # - for a single string:
50             my $frq = $subtlex->frq_opm(string => 'frog'); # freq. per million; also count, log-f, Zipf
51            
52             # - for a list of strings:
53             my $href = $subtlex->frq_hash(strings => [qw/frog fish ape/]); # freqs. for a list of words
54             print "'$_' opm\t$href->{$_}\n" for keys %{$href};
55            
56             # stats:
57             printf "mean opm\t%f\n", $subtlex->frq_mean(strings => [qw/frog fish ape/]); # or median, std-dev.
58            
59             # parts-of-speech:
60             printf "'frog' part-of-speech = %s\n", $subtlex->pos_dom(string => 'frog');
61            
62             # retrieve (list of) words to certain specs, e.g., min/max range:
63             my $aref = $subtlex->select_words(freq => [2, 400], length => [4, 4], cv_pattern => 'CCVC', regex => '^f');
64             printf "Number of 4-letter CCVC strings with 2-400 opm starting with 'f' = %d\n", scalar @{$aref};
65            
66             printf "A randomly selected subtitles string is '%s'\n", $subtlex->random_string();
67            
68             =head1 DESCRIPTION
69            
70             This module facilitates access to corpus frequency and other lexical attributes of character strings (generally, words), as provided in the various SUBTLEX and related projects (see L) on the basis of the representation of these strings in film and television subtitles (see L). Word frequencies obtained in this way have been shown to be generally more predictive of performance in word recognition tasks than frequencies derived from books, newsgroup posts, and similar sources (but see Herdagdelen & Marelli, 2017).
71            
72             There are three main groups of measures that are potentially retrievable from the SUBTLEX datatables: (1) frequency; (2) contextual diversity (number of films/TV episodes appeared in); and (3) parts-of-speech. The module tries to uniformly offer, across the available files, frequency as a count (L), occurrences per million (L), logarithm of the opm or frequency count (L), and/or the 7-point scaled Zipf frequency (L). "Contextual diversity" is given as a count (L), a percentage (L), and/or a logarithm (L). For parts-of-speech, the module returns, via L, the dominant linguistic syntactical role of the word, as well as all defined parts-of-speech for a word (via L).
73            
74             However, not all these methods are available across all projects; e.g., SUBTLEX-NL does not define Zipf frequency, and SUBTLEX-DE does not define CD, POS or Zipf frequency. In these cases, the method in question will return an empty string.
75            
76             =head1 CORPORA SPECS and SOURCES
77            
78             The SUBTLEX files need to be downloaded via the URLs shown in the table below (only a small sample from each of each of the SUBTLEX corpora is included in the installation distribution for testing purposes). So, for example, for the I norms, install the file named "SUBTLEX-US frequency list with PoS and Zipf information.csv" via L.
79            
80             The local directory location or actual pathname of these files can be given in class construction (by the arguments B and B, respectively); or it will be sought from the default location--within the directory "SUBTLEX" alongside the module itself in the locally configured Perl sitelib--given the B argument to L, or to L. The filenames of the original files downloaded from the following sites should be found in this way, but it should uniquely include the "key" shown in the table. The module will attempt to identify the correct field separator for the file (which can be comma-separated or tab-delimited). Only the files specified in the table are likely to be reliably accessed at this time.
81            
82             =for html

  

83            
LanguageKeyURLFile
84            
DutchNL_allcrr.ugent.beSUBTLEX-NL.with-pos.txt
85            
 NL_mincrr.ugent.beSUBTLEX-NL.cd-above2.with-pos.txt
86            
English (American)USexpsy.ugent.be/subtlexusSUBTLEX-US frequency list with PoS and Zipf information.csv
87            
88            
English (British)UKpsychology.nottingham.ac.ukSUBTLEX-UK.txt
89            
FrenchFRlexique.orgLexique381.txt
90            
GermanDEcrr.ugent.beSUBTLEX-DE_cleaned_with_Google00.txt
91            
PortuguesePTp-pal.di.uminho.ptSUBTLEX-PT_Soares_et_al._QJEP.csv
92            

93            
94             Notes regarding these different corpora.
95            
96             =over 4
97            
98             =item * SUBTLEX-DE
99            
100             The file has separate entries for words starting with an uppercase and a lowercase letter (e.g., for when a letter-string is both a noun and an adjective).
101            
102             =item * Lexique (SUBTLEX-FR)
103            
104             If not giving the full path to this file, it should be renamed to include "FR" (e.g., "FR_Lexique.csv") and stored in the default directory. The file also includes frequencies from books.
105            
106             =item * SUBTLEX-PT
107            
108             The I subtitles data are available as an Excel file (directly from L). This file needs to be saved as a (csv) text file to be usable here.
109            
110             =item * SUBTLEX-UK
111            
112             Includes words that might be spelled with a dash both with a dash and without; so there are separate entries for I and I, and for I and I. It includes some strings with apostrophes (e.g., I, I); but common contractions like I, I and I do not appear; they are stripped of their apostrophes, listed, e.g., as I, I and I. All strings are in lower-case; so I is represented as I.
113            
114             =item * SUBTLEX-US
115            
116             There are no strings with capitalized onsets in this file, or with punctuation marks, including apostrophes and dashes (e.g., I and I are represented as I and I; I as I, and I as I).
117            
118             The earlier, original file "SUBTLEXusExcel2007.csv" presents strings as they were originally capitalised: there is, e.g., I and I--but neither I nor I. This file does not provide part-of-speech or Zipf frequencies.
119            
120             =back
121            
122             There are several other languages from this project which might be supported by this module in a later version (originally, only SUBTLEX-US was supported).
123            
124             See the new() method as to how this module handles case-sensitivity and diacritical marks. For files where strings are UTF-8 encoded, the strings being looked up should also be UTF-8 encoded (if they are diacritically marked, e.g. "embâcle")(see L).
125            
126             If using Miscrosoft Excel to save any of these files, even if in CSV format, Excel will turn the words "true" and "false" into the Boolean strings "TRUE" and "FALSE", as well as throw them aside from alphabetic sorting (right down to the bottom of an alphabetic sort). That will surely stuff up any neatly intended pattern-matching for these words.
127            
128             =head1 SUBROUTINES/METHODS
129            
130             All methods are called via the class object, and with named (hash of) arguments, usually B, where relevant.
131            
132             =head2 new
133            
134             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE'); # - looking in Perl sitelib
135             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE', dir => 'file_directory'); # folder in which file is located
136             $subtlex = Lingua::Norms::SUBTLEX->new(lang => 'DE', path => 'file/is/here.csv'); # complete path to file for given language
137            
138             Returns a class object for accessing other methods. The argument B is required, specifying the particular language datafile by a "key" as given in the above table. Optional arguments B or B can be given to specify the location or filepath of the database. The default location is the "Lingua/Norms/SUBTLEX" directory within the 'sitelib' configured for the local Perl installation (as per L). The method will C if the file cannot be found.
139            
140             The optional argument B specifies how string comparison, as when looking up a given word in the SUBTLEX corpus, should be conducted, with the function used to test string equality being derived from the C function in L (part of the standard Perl distribution). This matching level applies to the look-up of strings within all methods, including those specifically assessing orthographic equality. This argument can take one of three values: see L:
141            
142             =cut
143            
144             sub new {
145 23     23 1 12272 my ( $class, %args ) = @_;
146 23 50       97 my $self = bless {}, ref($class) ? ref($class) : $class;
147             $self->{'_MODULE_DIR'} =
148 23         1402 File::Spec->catdir( $Config{'sitelib'}, qw/Lingua Norms SUBTLEX/ );
149 23         134 $self->_set_spec_hash( $args{'fieldpath'} );
150 23         118 $self->set_lang(%args);
151 23         225 $self->set_eq( match_level => $args{'match_level'} );
152            
153             #_set_encoding($args{'decode'});
154 23         898 return $self;
155             }
156            
157             =head2 Frequencies and POS for individual words or word-lists
158            
159             =head3 is_normed
160            
161             $bool = $subtlex->is_normed(string => $word);
162            
163             I: isa_word
164            
165             Returns 1 or 0 as to whether or not the letter-string passed as B is represented in the subtitles file. For some files, this might be thought of as a lexical decision ("does this string spell a word?"); but others include misspelled words (e.g., "pyscho"), digit strings, abbreviations ...
166            
167             =cut
168            
169             sub is_normed {
170 10     10 1 115767 my ( $self, %args ) = @_;
171 10         37 my $str = _get_usr_str( $args{'string'} );
172 10         16 my $res = 0; # boolean to return from this sub
173 10 50       436 open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
174 10         283 while (<$fh>) {
175 227 100       61158 next if $INPUT_LINE_NUMBER == 1; # skip headings
176 217         549 my $file_str = _get_file_str( $_, $self->{'_DELIM'} )
177             ; # have to declare as can be empty (!)
178 217 50       5051 next if nocontent($file_str);
179 217 100       1939 if ( $self->{'_EQ'}->( $str, $file_str ) )
180             { # first token equals given string?
181 6         1762 $res = 1; # set result to return as true
182 6         13 last; # got it, so abort look-up
183             }
184             }
185 10 50       947 close $fh or croak $OS_ERROR;
186 10         79 return $res; # zero if string not found in file
187             }
188             *isa_word = \&is_normed;
189            
190             =head3 frq_count
191            
192             $int = $subtlex->frq_count(string => 'aword');
193            
194             Returns the raw number of occurrences in all the films/TV episodes for the word passed as B, or 0 if the string is not found in language file.
195            
196             =cut
197            
198             sub frq_count {
199 6     6 1 3041 my ( $self, %args ) = @_;
200             return _val_or_0(
201             _get_val_for_str(
202             _get_usr_str( $args{'string'} ),
203             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_count_idx' ),
204 6         22 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  18         126  
205             )
206             );
207             }
208            
209             =head3 frq_opm
210            
211             $val = $subtlex->frq_opm(string => 'aword');
212            
213             I: opm
214            
215             Returns frequency per million for the word passed as B, or 0 if the string is not found in language file.
216            
217             =cut
218            
219             sub frq_opm {
220 30     30 1 6661 my ( $self, %args ) = @_;
221             return _val_or_0(
222             _get_val_for_str(
223             _get_usr_str( $args{'string'} ),
224             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_opm_idx' ),
225 30         86 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  90         563  
226             )
227             );
228             }
229             *freq = \&frq_opm; # legacy only
230            
231             =head3 frq_log
232            
233             $val = $subtlex->frq_log(string => 'aword');
234            
235             Returns log frequency per million for the word passed as B, or the empty-string if the string is not represented in the norms.
236            
237             =cut
238            
239             sub frq_log {
240 5     5 1 1826 my ( $self, %args ) = @_;
241             return _get_val_for_str(
242             _get_usr_str( $args{'string'} ),
243             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_log_idx' ),
244 5         15 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  15         86  
245             );
246             }
247             *lfreq = \&frq_log; # legacy only
248            
249             =head3 frq_zipf
250            
251             $val = $subtlex->frq_zipf(string => 'aword');
252            
253             Returns Zipf frequency for the word passed as B, or the empty-string if the string is not represented in the language file. The Zipf scale ranges from about 1 to 7, with values of 1-3 generally representing low frequency words, and values of generally 4-7+ representing high frequency words, with respect to various recognition measures used in the study of word frequency effects. See Van Heuven et al. (2014) and L for more information.
254            
255             =cut
256            
257             sub frq_zipf {
258 6     6 1 1476 my ( $self, %args ) = @_;
259             return _get_val_for_str(
260             _get_usr_str( $args{'string'} ),
261             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_zipf_idx' ),
262 6         21 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  18         126  
263             );
264             }
265             *zipf = \&frq_zipf; # legacy only
266            
267             =head3 frq_zipf_calc
268            
269             $calc = $subtlex->frq_zipf_calc( string => 'favourite' );
270             $calc = $subtlex->frq_zipf_calc( string => 'favourite', corpus_size => POS_FLOAT_in_millions, n_wordtypes => POS_INT );
271            
272             Returns an estimate of Zipf frequency by calculating its value from the given or retrievable L or L, and the given or retrievable values of the corpus_size and n_wordtypes for the particular SUBTLEX project; i.e., the values of corpus_size and n_wordtypes can be provided as named arguments. As introduced by Van Heuven et al. (2014) (see also L):
273            
274             =for html

  Zipf = log10[ ( frq_count + 1 ) / ( corpus_size + n_wordtypes )/1000000 ] + 3

275            
276             How well the returned value satisfies the "border relations" desired of the index (e.g., that up to 1 opm corresponds to Zipf of E 3) depends on the reliability of the corpus size and wordtype counts, and any rounding of these values (where relevant) and (if required) of the opm. Examinations of the returned values show that, when using the canned and reported values (which is the default here), they align with these definitions, and with any canned Zipf values, within the margins of about the third or fourth decimal place.
277            
278             =cut
279            
280             sub frq_zipf_calc {
281 0     0 1 0 my ( $self, %args ) = @_;
282             my $corpus_size =
283             defined $args{'size_corpus'}
284             ? $args{'size_corpus'}
285 0 0       0 : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'size_corpus_mill' );
286             my $n_wordtypes =
287             defined $args{'n_wordtypes'}
288             ? $args{'n_wordtypes'}
289 0 0       0 : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'n_wordtypes' );
290            
291 0         0 $n_wordtypes /= 1_000_000;
292            
293             my $frq_count =
294             is_numeric( $args{'frq_count'} )
295             ? $args{'frq_count'}
296             : is_numeric( $args{'frq_opm'} ) ? sprintf "%.0f",
297 0 0       0 $args{'frq_opm'} * $corpus_size : eval { $self->frq_count(%args) };
  0 0       0  
298            
299 0 0 0     0 if ($EVAL_ERROR or not is_numeric($frq_count) ) {
300 0         0 my $frq_opm = eval { $self->frq_opm(%args) };
  0         0  
301 0 0 0     0 if (not $EVAL_ERROR and is_numeric($frq_opm) ) {
302 0         0 $frq_count = sprintf "%.0f", $frq_opm * $corpus_size;
303             }
304             }
305 0   0     0 $frq_count ||= 0;
306            
307 0         0 return _log10( ( 1 + $frq_count ) / ( $corpus_size + $n_wordtypes ) ) + 3;
308            
309             }
310            
311             =head3 frq_opm2count
312            
313             $int = $subtlex->frq_opm2count(string => STRING);
314            
315             Returns the raw number of occurrences of a string (the frq_count) based on the number of occurrences per million (frq_opm), and the corpus size in millions. Returns 0 if the string is not found in language file.
316            
317             The B can be given as a named argument, or it will be retrieved by the L respective method, where this is defined for a particular language file. The B (in millions) can also be given as a named argument, or it will be retrieved from the specifications file (specs.csv in the module's directory), where this value has been obtainable from published reports.
318            
319             =cut
320            
321             sub frq_opm2count {
322 5     5 1 2298 my ( $self, %args ) = @_;
323             my $frq_opm =
324 5 50       26 defined $args{'frq_opm'} ? $args{'frq_opm'} : $self->frq_opm(%args);
325             my $corpus_size =
326             defined $args{'size_corpus'}
327             ? $args{'size_corpus'}
328 5 50       107 : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'size_corpus_mill' );
329 5         85 return sprintf "%.0f", $frq_opm * $corpus_size;
330             }
331            
332             =head3 cd_count
333            
334             $cd = $subtlex->cd_count(string => STRING);
335            
336             Returns the number of samples (films/TV episodes) comprising the corpus in which the string occurred in its subtitles; so-called "contextual diversity". Returns 0 if the string is not found in language file.
337            
338             =cut
339            
340             sub cd_count {
341 5     5 1 977 my ( $self, %args ) = @_;
342             return _val_or_0(
343             _get_val_for_str(
344             _get_usr_str( $args{'string'} ),
345             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_count_idx' ),
346 5         15 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  15         87  
347             )
348             );
349             }
350            
351             =head3 cd_pct
352            
353             $cd = $subtlex->cd_pct(string => 'aword');
354            
355             Returns a percentage measure for the number of samples (films/TV episodes) comprising the corpus in which the B occurred in its subtitles; so-called "contextual diversity". Returns 0 if the string is not found in language file.
356            
357             =cut
358            
359             sub cd_pct {
360 5     5 1 534 my ( $self, %args ) = @_;
361             return _val_or_0(
362             _get_val_for_str(
363             _get_usr_str( $args{'string'} ),
364             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_pct_idx' ),
365 5         13 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  15         86  
366             )
367             );
368             }
369            
370             =head3 cd_log
371            
372             $cd = $subtlex->cd_log(string => 'aword');
373            
374             Returns log10(L + 1) for the given string, with 4-digit precision. Note: Brysbaert and New (2009) state that "this is the best value to use if one wants to match words on word frequency" (p. 988).
375            
376             =cut
377            
378             sub cd_log {
379 5     5 1 544 my ( $self, %args ) = @_;
380             return _get_val_for_str(
381             _get_usr_str( $args{'string'} ),
382             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'cd_log_idx' ),
383 5         13 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  15         86  
384             );
385             }
386            
387             =head3 pos_dom
388            
389             $pos_str = $subtlex->pos_dom(string => STRING, conform => BOOL);
390            
391             Returns the dominant part-of-speech for the given string. The return value is undefined if the string is not found. If the field in the original file (as in SUBTLEX-PT) is actually for all possible parts-of-speech, the first element in the returned string (once split by non-word characters), is returned (assuming, as in SUBTLEX-PT) that this is indeed the most frequent part-of-speech for the particular string.
392            
393             For interpretation of the POS codes: for NL, see L ("SPEC" is there defined as "often personal or geographical names" and so similar to "Name" in SUBTLEX-UK).
394            
395             To transliterate the various codes into a common two-letter code, then set B => 1 (default is not defined, returning the POS string as given in the original files). The two-letter codes are:
396            
397             NN noun (common)
398             NM name (proper)
399             PN pronoun
400             VB verb
401             AJ adjective
402             AV adverb
403             PP proposition
404             CJ conjunction
405             IJ interjection
406             DA determiner or article
407             NB number
408             OT other
409             UK unknown
410            
411             The "OT" code includes some rare POS values (e.g., "marker", "ONO"), anomalous values (e.g., "2"), and values not defined in the associated reports. The "UK" code ("unknown") is comprised of values specifically recorded as "unclassified" or similar, or where the POS field is empty.
412            
413             =cut
414            
415             sub pos_dom {
416 13     13 1 2327 my ( $self, %args ) = @_;
417             my $str = _get_val_for_str(
418             _get_usr_str( $args{'string'} ),
419             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'pos_dom_idx' ),
420 13         37 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  39         236  
421             );
422 13         56 my @ari = map { trim($_) } grep { hascontent($_) } split /[\W]/xsm, $str;
  15         155  
  15         50  
423             return $args{'conform'}
424 13 100       300 ? _pos_is( $ari[0], $self->{'_FIELDS'}, $self->{'_LANG'} )->[0]
425             : $ari[0];
426             }
427             *pos = \&pos_dom;
428            
429             =head3 pos_all
430            
431             $pos_aref = $subtlex->pos_all(string => STRING, conform => BOOL);
432            
433             Returns all parts-of-speech for the given string as a referenced array. The return value is an empty list if the string is not found. If the language file does not define this field, the returned value is simply the same as what would, if possible, be returned from L (i.e., if that value is defined), but now as a referenced array.
434            
435             =cut
436            
437             sub pos_all {
438 9     9 1 2173 my ( $self, %args ) = @_;
439             my $str = _get_val_for_str(
440             _get_usr_str( $args{'string'} ),
441             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'pos_all_idx' ),
442 9         28 map { $self->{$_} } (qw/_PATH _DELIM _EQ/)
  27         156  
443             );
444            
445             # grep to ensure no empty values as might come from a head/trailing delimiter:
446 9         57 my @ari = map { trim($_) } grep { hascontent($_) } split /[\W]/xsm, $str;
  26         300  
  28         142  
447             return $args{'conform'}
448 9 100       152 ? [ map { @{ _pos_is( $_, $self->{'_FIELDS'}, $self->{'_LANG'} ) } }
  8         9  
  8         15  
449             @ari ]
450             : \@ari;
451             }
452            
453             =head2 Multiple strings/values lists
454            
455             Array given as measures to the following methods might include one or more of the following:
456            
457             frq_count
458             frq_opm
459             frq_log
460             frq_zipf
461             cd_count
462             cd_pct
463             cd_log
464             pos_dom
465             pos_all
466            
467             =head3 values_list
468            
469             $aref = $subtlex->values_list(string => STRING, values => AREF);
470            
471             Returns values for a single letter-string as a referenced array.
472            
473             =cut
474            
475             sub values_list {
476 3     3 1 1235 my ( $self, %args ) = @_;
477 3         5 my @idx_ari;
478 3 50       8 if ( ref $args{'values'} ) {
479 3         5 for my $field ( @{ $args{'values'} } ) {
  3         6  
480             push @idx_ari,
481 6         68 $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $field . '_idx' );
482             }
483             }
484             return _get_val_for_strs( _get_usr_str( $args{'string'} ),
485 3         34 \@idx_ari, map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
  9         20  
486             }
487            
488             =head3 multi_list
489            
490             $hashref = $subtlex->multi_list(strings => AREF_of_char_strings, measures => AREF_of_FIELD_NAMES);
491            
492             $frq_hashref = $subtlex->multi_list(strings => [qw/ICH PEA CHOWDER ZEER AIME/], measures => [qw/frq_opm frq_zipf/]);
493             # $frq_hashref = {
494             # ICH => {
495             # frq_opm => 20000,
496             # frq_zipf => 7.01,
497             # },
498             # PEA => {
499             # frq_opm ...
500             # },
501             # ...
502             # }
503            
504             Returns multiple values for a list of strings as a hashref of hashrefs. This is perhaps the most efficient method here for retrieving several values for several words, but only for a small number of words; it could take a long time to return given large lists.
505            
506             So, given one or more words in the array ref B, and several measures/values to find for each of them (such as 'frq_opm', 'pos_dom' or any other values defined for the particular language file) in the the array B, the method looks line-by-line through the file to check if the line's string is equal to any of those in B. If so, it collates the relevant measures in a hash keyed by the string, whose values are themselves a hash of the measure-names keying each respective measure-value. The found string is then removed from the look-up list, and the next line is looked-up in the same way. The search stops as soon as there are no more strings in the look-up list (all have been found).
507            
508             In this way, there is only one pass through the file for the entire search; no line is looked-up more than once for all strings or their respective measure values. The method could be used for looking up a single string and/or a single value, but the other methods for doing this avoid the overhead of checking an array of strings, and splitting the line against the delimiter; this is only done here to facilitate caching multiple values whereas other methods avoid doing this as they only need to find one value after a known number of delimiters.
509            
510             =cut
511            
512             sub multi_list {
513 1     1 1 222 my ( $self, %args ) = @_;
514             croak 'Need a referenced list of strings to look up'
515 1 50       4 if !ref $args{'strings'};
516 1         3 my @strings = map { _get_usr_str($_) } @{ $args{'strings'} };
  2         6  
  1         3  
517 1         3 my %idx_hash = ();
518 1 50       3 if ( ref $args{'measures'} ) {
519 1         2 for my $field ( @{ $args{'measures'} } ) {
  1         3  
520             my $idx =
521 3         9 $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $field . '_idx' );
522 3 50       53 if ( nocontent($idx) ) {
523 0         0 next;
524            
525             # croak "The requested value '$field' is not defined for the current SUBTLEX file";
526             }
527 3         28 $idx_hash{$idx} = $field;
528             }
529             }
530             return _get_any_vals_for_string_list( [@strings], \%idx_hash,
531 1         5 map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
  3         8  
532             }
533            
534             =head2 Descriptive frequency statistics for lists
535            
536             These methods return a descriptive statistic (sum, mean, median or standard deviation) for a list of B. Like L, they take an optional argument B to specify if the returned values should be occurrences per million, log frequencies, or Zipf values. Providing this as an argument obviates the need to provide multiple methods for each different type of frequency measure, e.g., "mean_opm()", mean_log_opm()", ...
537            
538             Because not all types of frequency scales (count, opm, log, Zipf) are provided in all SUBTLEX corpora, these methods will C if there are no canned stats for the particular scale called for.
539            
540             It might be thought useful to allow any valid scale to be returned by, say, providing each method without a value for B; a hash-ref of frequency values, keyed by scale-type, might be returned. However, this seems to be unrecommended; it assumes that users are blind as to what measures they want (as well as to what they can get).
541            
542             =head3 frq_sum
543            
544             $sum = $subtlex->frq_sum(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
545            
546             Returns the sum of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
547            
548             =cut
549            
550             sub frq_sum {
551 0     0 1 0 my ( $self, %args ) = @_;
552 0         0 return sum( $self->_frq_vals(%args) );
553             }
554            
555             =head3 frq_mean
556            
557             $mean = $subtlex->frq_mean(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
558            
559             Returns the arithmetic average of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
560            
561             =cut
562            
563             sub frq_mean {
564 5     5 1 37732 my ( $self, %args ) = @_;
565 5         21 return mean( $self->_frq_vals(%args) );
566             }
567             *mean_freq = \&frq_mean;
568            
569             =head3 frq_median
570            
571             $median = $subtlex->frq_median(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
572            
573             Returns the median count, opm, log (usually opm) or Zipf frequency for the given B, depending on the value of B.
574            
575             =cut
576            
577             sub frq_median {
578 1     1 1 421 my ( $self, %args ) = @_;
579 1         4 return median( $self->_frq_vals(%args) );
580             }
581             *median_freq = \*frq_median;
582            
583             =head3 frq_sd
584            
585             $sd = $subtlex->frq_sd(strings => [qw/word1 word2/], scale => 'count|opm|log|zipf');
586            
587             Returns the standard deviation of the count, opm, log (usually opm) or Zipf frequency, depending on the value of B.
588            
589             =cut
590            
591             sub frq_sd {
592 1     1 1 365 my ( $self, %args ) = @_;
593 1         5 return stddev( $self->_frq_vals(%args) );
594             }
595             *sd_freq = \*frq_sd;
596            
597             sub _frq_vals {
598 7     7   18 my ( $self, %args ) = @_;
599             croak
600             'No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array'
601 7 50       20 if !$args{'strings'};
602             my $strs =
603             ref $args{'strings'}
604 7 50       22 ? $args{'strings'}
605             : croak 'No reference to an array of letter-strings found';
606             my $col_i =
607             hascontent( $args{'scale'} )
608             ? $self->{'_FIELDS'}
609             ->datum( $self->{'_LANG'}, 'frq_' . $args{'scale'} . '_idx' )
610 7 100       24 : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_opm_idx' );
611 7         169 my @vals = ();
612 7         9 for my $str ( @{$strs} ) {
  7         16  
613             push @vals,
614             _get_val_for_str( _get_usr_str($str), $col_i,
615 19         43 map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
  57         115  
616             }
617 7         46 return @vals;
618             }
619            
620             =head2 Retrieving letter-strings/words
621            
622             =head3 select_strings
623            
624             $aref = $subtlex->select_strings(frq_opm => [1, 20], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
625             $aref = $subtlex->select_strings(frq_zipf => [0, 2], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
626            
627             I: select_words
628            
629             Returns a list of strings (presumably words) from the SUBTLEX corpus that satisfies certain criteria, as per the following arguments:
630            
631             =over 2
632            
633             =item length
634            
635             minimum and/or maximum length of the string (or "letter-length")
636            
637             =item frq_opm, frq_log, cd_count, etc.
638            
639             minimum and/or maximum frequency (as given in whatever unit offered by the datafile for the set language)
640            
641             =item cv_pattern
642            
643             a consonant-vowel pattern, given as a string by the usual convention, e.g., 'CCVCC' defines a 5-letter word starting and ending with pairs of consonants, the pairs separated by a vowel. 'Y' is defined here as a consonant. The tested strings are stripped of marks and otherwise ASCII transliterated (using L) ahead of the check.
644            
645             =item regex
646            
647             a regular expression (L). In the examples above, only letter-strings starting with the letter 'f', followed by one of more other letters, are specified for retrieval. Alternatively, for example, the regex value '[^aeiouy]$' specifies that the letter-strings to be returned must not end with a vowel (or 'y'). The tested strings are stripped of marks and otherwise ASCII transliterated (using L) ahead of matching, so if the string in the file has, say, a 'u' with an Umlaut, it will match a 'u' in the regex.
648            
649             =back
650            
651             For the minimum/maximum constrained criteria, the two limits are given as a referenced array where the first element is the minimum and the second element is the maximum. For example, [3, 7] would specify letter-strings of 3 to 7 letters in length; [4, 4] specifies letter-strings of only 4 letters in length. If only one of these is to be constrained, then the array would be given as, e.g., [3] to specify a minimum of 3 letters without constraining the maximum, or ['',7] for a maximum of 7 letters without constraining the minimum (checking if the element C as per String::Util).
652            
653             The value returned is always a reference to the list of words retrieved (or to an empty list if none was retrieved).
654            
655             Calling this method as "list_strings" or "list_words" is deprecated; to avoid confusion with L, which also returns a list of strings. A deprecation warning and wrap to the method is in place as of version 0.06 if using this name; they will be removed in a subsequent version.
656            
657             =cut
658            
659             sub select_strings {
660 7     7 1 5726 my ( $self, %args ) = @_;
661 7         14 my %patterns = ();
662 7 100       25 if ( hascontent( $args{'regex'} ) ) {
663 1         20 $patterns{'regex'} = qr/$args{'regex'}/xms;
664             }
665 7 100       45 if ( hascontent( $args{'cv_pattern'} ) ) {
666 5         42 my $tmp = q{};
667 5         27 my @c = split m//ms, uc( $args{'cv_pattern'} );
668 5         14 foreach (@c) {
669 36 100       64 $tmp .= $_ eq 'C' ? '[BCDFGHJKLMNPQRSTVWXYZ]' : '[AEIOU]';
670             }
671 5         243 $patterns{'cv_pattern'} = qr/^$tmp$/ixms;
672             }
673            
674 7         25 my @list = ();
675 7 50       256 open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
676             LINES:
677 7         115 while (<$fh>) {
678 218 100       4892 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
679 211         867 my @line = split m/\Q$self->{'_DELIM'}\E/xms;
680 211 100       278 next if !_in_range( length( $line[0] ), @{ $args{'length'} } );
  211         362  
681 157         296 for ( keys %patterns ) {
682 131 100       292 next LINES if unidecode( $line[0] ) !~ $patterns{$_};
683             }
684 36         177 for (qw/frq_count frq_opm frq_log frq_zipf cd_count cd_pct cd_log/) {
685 152 100 66     389 if (
686             ref $args{$_}
687             and hascontent(
688             $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $_ . '_idx' )
689             )
690             )
691             {
692             next LINES
693             if !_in_range(
694             _clean_value(
695             $line[
696             $self->{'_FIELDS'}
697             ->datum( $self->{'_LANG'}, $_ . '_idx' )
698             ]
699             ),
700 32 100       605 @{ $args{$_} }
  32         727  
701             );
702             }
703             }
704 16 50       27 if ( ref $args{'pos'} ) {
705             next LINES
706             if none {
707             $_ eq $line[ $self->{'_FIELDS'}
708 0     0   0 ->datum( $self->{'_LANG'}, 'pos_dom_idx' ) ]
709             }
710 0 0       0 @{ $args{'pos'} };
  0         0  
711             }
712 16         64 push @list, $line[0];
713             }
714 7 50       154 close $fh or croak $OS_ERROR;
715            
716 7         53 return \@list;
717             }
718             *select_words = \&select_strings;
719            
720             =head3 all_strings
721            
722             $aref = $subtlex->all_strings();
723            
724             I: all_words
725            
726             Returns a reference to an array of all letter-strings in the corpus. These are culled of empty and duplicate strings, and then alphabetically sorted.
727            
728             =cut
729            
730             sub all_strings {
731 1     1 1 2 my ( $self, %args ) = @_;
732 1         2 my @list = ();
733 1 50       36 open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
734 1         13 while (<$fh>) {
735 7 100       130 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
736 6         9 push @list, _get_file_str( $_, $self->{'_DELIM'} );
737             }
738 1 50       37 close $fh or croak $OS_ERROR;
739 1         4 return [ sort { lc($a) cmp lc($b) } uniq( grep { hascontent($_) } @list ) ];
  8         24  
  6         32  
740             }
741             *all_words = \&all_strings;
742            
743             =head3 random_string
744            
745             $string = $subtlex->random_string();
746             @data = $subtlex->random_string();
747            
748             I: random_word
749            
750             Picks a random line from the corpus, using L (except the top header line). Returns the word in that line if called in scalar context; otherwise, the array of data for that line. (A future version might let specifying a match to specific criteria, self-aborting after trying X lines.)
751            
752             =cut
753            
754             sub random_string {
755 2     2 1 410 my ( $self, %args ) = @_;
756 2         13 eval { require File::RandomLine; };
  2         379  
757 2 50       2446 croak 'Need to install and access module File::RandomLine' if $EVAL_ERROR;
758             my $rl =
759 2         16 File::RandomLine->new( $self->{'_PATH'}, { algorithm => 'uniform' } );
760 2         312 my @ari = ();
761 2   66     8 while ( not scalar @ari or $ari[0] eq 'Word' ) {
762 2         17 @ari = split m/\Q$self->{'_DELIM'}\E/xms, $rl->next;
763             }
764 2 100       152 return wantarray ? @ari : $ari[0];
765             }
766             *random_word = \&random_string;
767            
768             =head2 Miscellaneous
769            
770             =head3 n_lines
771            
772             $num = $subtlex->n_lines();
773            
774             Returns the number of lines, less the column headings and any lines with no content, in the installed language file. Expects/accepts no arguments.
775            
776             =cut
777            
778             sub n_lines {
779 3     3 1 1216 my $self = shift;
780 3         8 my $z = 0;
781 3 50       108 open( my $fh, q{<}, $self->{'_PATH'} ) or croak $OS_ERROR;
782 3         42 while (<$fh>) {
783 94 100       153 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
784 91 50       129 next if nocontent($_);
785 91         710 $z++;
786             }
787 3 50       30 close $fh or croak $OS_ERROR;
788 3         16 return $z;
789             }
790             *nlines = \&n_lines; # legacy alias
791            
792             =head3 pct_alpha
793            
794             Returns the percentage of strings in the subtitles file that satisfy "look like words" relative to the number of lines (as per L). Specifically, after ASCII transliteration of the string (per L), does it match to /[\p{XPosixAlpha}\-']/ (per L, but including apostrophes and dashes)?
795            
796             =cut
797            
798             sub pct_alpha {
799 1     1 1 227 my ( $self, %args ) = @_;
800 1         2 my $all_strs_aref = $self->all_strings();
801 1         2 my $count_all = count( @{$all_strs_aref} );
  1         3  
802 1         11 my $pct = q{};
803 1 50       3 if ( $count_all > 0 ) {
804 17     17   129 my $count_alpha_strings = count( grep { m/[\p{XPosixAlpha}\-']/xsm }
  17         35  
  17         236  
  5         19  
805 1         2 map { unidecode($_) } @{$all_strs_aref} );
  5         47  
  1         1  
806 1         10 $pct = 100 * $count_alpha_strings / $count_all;
807             }
808 1         3 return $pct;
809             }
810            
811             =head3 set_lang
812            
813             $lang = $subtlex->set_lang(lang => STR); # DE, FR, NL_all, NL_min, PT, UK or US
814             $lang = $subtlex->set_lang(lang => STR, path => 'this/is/the/file.csv');
815             $lang = $subtlex->set_lang(lang => STR, dir => 'file/is/in/here');
816            
817             Set or guess location of datafile; see L. Naturally, the given value of B (required)--which is used as a database ID--should correspond with any given B to the SUBTLEX datafile (optional but recommended). If only a B value is given, the SUBTLEX datafile should be named so that it uniquely includes the specific value of B.
818            
819             =cut
820            
821             sub set_lang {
822 25     25 1 375 my ( $self, %args ) = @_;
823             ## firstly, establish the language to use, and the directory in which this module lives:
824 25 100       150 return if nocontent($args{'lang'});
825             #@ is the complete pathname actually given in args?
826 24 50       378 croak 'Need a valid attribute' if not ref $self->{'_FIELDS'}->record( $args{'lang'} );
827            
828 24         326 $self->{'_LANG'} = delete $args{'lang'};
829            
830 24 50       69 if ( hascontent( $args{'path'} ) ) {
831 24 50       588 if ( !-e $args{'path'} ) {
832 0         0 croak
833             "Path given for SUBTLEX corpus does not exist: '$args{'path'}'";
834             }
835             else {
836 24         88 $self->{'_PATH'} = delete $args{'path'};
837             }
838             }
839             else {
840 0         0 my ( $lang, $dir, $path ) = ( $self->{'_LANG'} );
841 0 0       0 if ( $args{'dir'} ) { # check it's a dir:
842             croak "Value for argument 'dir' ($args{'dir'}) is not a directory"
843 0 0       0 if !-d $args{'dir'};
844 0         0 $dir = delete $args{'dir'};
845             }
846             else { # use module's dir :
847 0         0 $dir = $self->{'_MODULE_DIR'};
848             }
849 0         0 for ( path($dir)->children ) {
850 0 0       0 if (/(?:SUBTLEX[\-_])?\Q$lang/imsx) {
851 0         0 $path = $_;
852 0         0 last;
853             }
854             }
855 0 0 0     0 if ( nocontent($path) or not -T $path )
856             { # only already defined if it exists
857 0         0 croak
858             "Cannot find required SUBTLEX datafile for language '$self->{'_LANG'}' within '$dir'.\nInstall the database (from the URL given in the POD) into either:\n\t(1) the Lingua/Norms/SUBTLEX directory within your Perl distribution (with the filename specified in the POD);\n\t(2) a directory you specify to new(dir => 'my/dir/to/lang/file') (again with the filename specified in the POD); or\n\t(3) a directory, specifying the complete path to that file in new(path => 'this/is/the/file.csv'), including its filename";
859             }
860             else {
861 0         0 $self->{'_PATH'} = $path;
862             }
863             }
864            
865             $self->{'_DELIM'} =
866 24         114 get_separator( path => $self->{'_PATH'}, lucky => 1 );
867            
868 24         5237 return $self->{'_LANG'};
869             }
870            
871             =head3 get_lang
872            
873             $str = $subtlex->get_lang();
874            
875             Returns the language code (e.g., 'UK', 'FR') currently set for the module (which determines the file being looked up, if not explicitly given). The empty string is returned if the language has not been set.
876            
877             =cut
878            
879             sub get_lang {
880 8     8 1 2525 my ( $self, %args ) = @_;
881 8 50       34 return hascontent( $self->{'_LANG'} ) ? $self->{'_LANG'} : q{};
882             }
883            
884             =head3 get_path2db
885            
886             $path = $subtlex->get_path2db();
887            
888             Returns the path (directory and filename) from which the module's methods are currently set to look-up strings, frequencies, etc.
889            
890             =cut
891            
892             sub get_path2db {
893 0     0 1 0 my ( $self, %args ) = @_;
894 0         0 return path( $self->{'_PATH'} )->stringify;
895             }
896            
897             =head3 get_index
898            
899             $int = $subtlex->get_index(measure => 'frq_opm');
900            
901             Returns the index within the currently looked-up file that contains the given B.
902            
903             =cut
904            
905             sub get_index {
906 5     5 1 2673 my ( $self, %args ) = @_;
907 5 50       20 my $var = delete $args{'measure'} or croak 'Need a named measure';
908 5         25 return $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $var . '_idx' )
909             ; #{$var};
910             }
911            
912             =head3 set_eq
913            
914             $subtlex->set_eq(match_level => INT); # undef, 0, 1, 2 or 3
915            
916             See L.
917            
918             =head3 url2datafile
919            
920             $url = $subtlex->url2datafile(lang => STRING);
921             %loc = $subtlex->url2datafile(lang => STRING);
922            
923             Returns the URL (complete path) where the SUBTLEX file for a given language is stored, and from which it should be downloadable. These are locations as specified (at the time of releasing this version of the module) at L and/or L, and so as listed in the L section. This could include an archive from within which the file needs to be retrieved. Called in list context, this method returns a hash with keys for 'www_dir', 'archive' (if the file is within an archive) and 'filename'. (This module does not fetch the file off the WWW itself; it should be installed and available on the local machine/network--see L).
924            
925             =cut
926            
927       0 1   sub url2datafile {
928            
929             #my ($self, %args) = @_;
930             #croak 'A value for the argument needs to be given for SUBTLEX url2datafile' if nocontent($args{'lang'});
931             #my $lang = delete $args{'lang'};
932             #croak "The value for the argument => $lang is not recognised" if none { $_ => $lang } (qw/UK US NL DE/);
933             # Hard-copy of WWW dirs, archives (where rel) and filenames for the SUBTLEX files:
934             ## some datafiles are within compressed archives, some not, so ...
935             #my %req_filespecs = %{$path_hash{$lang}};
936            
937             #return wantarray ? %req_filespecs : File::Spec->catfile($req_filespecs{$lang}->{'www_dir'}, $req_filespecs{$lang}->{'archive'}, $req_filespecs{$lang}->{'file'});
938             }
939            
940             ### PRIVATMETHODEN:
941            
942             sub _get_usr_str {
943 118     118   173 my $str = shift;
944 118 50       273 croak 'No string to test; pass a value for to the requested method'
945             if nocontent($str);
946 118         1397 return $str;
947            
948             #return decode( 'UTF-8', $str );#
949             }
950            
951             # Given a line from a SUBTLEX file, return all the characters from the start of the line up to the delimiter for that file, after stripping it of any quote characters - e.g., if the line starts: "abacus",20,30 ... and the delimiter is a comma, return: abacus
952             sub _get_file_str {
953 1680     1680   2812 my ( $line, $delim ) = @_;
954 1680         5538 $line =~ /^([^\Q$delim\E]+)/xms;
955 1680         2988 return trim( unquote($1) );
956            
957             #my $str = decode('UTF-8', trim(unquote($1)) );
958             #print STDERR "<$str>\n";
959             #return $str;
960            
961             #my $code = guess_encoding($str, qw/ascii utf8 utf16 iso-8859-1 cp1250 latin1 greek/);
962             #print STDERR "$str\t", $code->decode($str), "\n";
963             #return $code->decode($str);
964             }
965            
966             sub _get_val_for_str {
967 103     103   202 my ( $str, $col_i, $path, $delim, $eq_fn ) = @_;
968 103 50       166 croak
969             'No word to test; pass a letter-string named \'string\' to the function'
970             if nocontent($str);
971 103 50       851 croak "The requested value is not defined for the current SUBTLEX corpus"
972             if nocontent($col_i);
973            
974 103         785 my $val = q{}; # default value returned is empty string
975 103 50       3545 open( my $fh, q{<}, $path ) or croak $OS_ERROR;
976 103         1273 while (<$fh>) {
977 1522 100       69664 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
978 1419         1876 my $file_str =
979             _get_file_str( $_, $delim ); # have to declare as can be empty (!)
980 1419 50       30140 next if nocontent($file_str);
981 1419 100       11500 if ( $eq_fn->( $str, $file_str ) ) {
982 102         4827 $val = _get_val( $_, $delim, $col_i );
983 102         2376 last;
984             }
985             }
986 103 50       1488 close $fh or croak $OS_ERROR;
987 103         593 return $val;
988             }
989            
990             sub _get_val {
991 102     102   203 my ( $line, $delim, $col_i ) = @_;
992            
993             # if the line has quoted fields, and uses the delimiter within the quotes,
994             # as in SUBTLEX-PT, need to firstly clean the line up:
995             # this "fix" assumes the quotes are either double- or single quotes and nothing else,
996             # and there is no trailing delimiter.
997             # It strips the quotes, and replaces the comma with a vertical bar:
998 102         736 $line =~ s/["']([^"'\Q$delim\E]+)\Q$delim\E([^"'\Q$delim\E]+)["']/$1|$2/gxsm;
999            
1000 102         2184 $line =~ m/^(
1001             [^\Q$delim\E]* # any character from the start not including the delimiter (which might be \t)
1002             \Q$delim\E # now ending with the delimiter, perhaps as a quoted string
1003             )
1004             {$col_i,}? # as many times as necessary to get to the required field value
1005             ([^\Q$delim\E]*) # which should be here
1006             /msx;
1007 102         293 return _clean_value($2); # now format the number, strip space ...
1008             }
1009            
1010             sub _get_val_for_strs {
1011 3     3   7 my ( $str, $col_i_aref, $path, $delim, $eq_fn ) = @_;
1012            
1013             # Check we have a string, and valid filed indices:
1014 3 50       6 croak
1015             'No word to test; pass a letter-string named \'string\' to the function'
1016             if nocontent($str);
1017             croak "The requested value is not defined for the SUBTLEX corpus"
1018 3 50   6   28 if any { nocontent($_) } @{$col_i_aref};
  6         26  
  3         11  
1019            
1020 3         29 my $val = [];
1021            
1022             # Search for the string, and isolate the requested values:
1023 3 50       104 open( my $fh, q{<}, $path ) or croak $OS_ERROR;
1024 3         44 while (<$fh>) {
1025 37 100       154 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
1026 34         52 my $file_str =
1027             _get_file_str( $_, $delim ); # have to declare as can be empty (!)
1028 34 50       731 next if nocontent($file_str);
1029 34 100       318 if ( $eq_fn->( $str, $file_str ) ) {
1030 3         36 my @line = split m/\Q$delim\E/xms;
1031 3         5 for my $col_i ( @{$col_i_aref} ) {
  3         8  
1032 6         75 push @{$val}, _clean_value( $line[$col_i] );
  6         15  
1033             }
1034 3         61 last;
1035             }
1036             }
1037 3 50       43 close $fh or croak;
1038            
1039             # return the reference to array if there is more than 1 value, otherwise just the single value itself
1040             ## but if the string itself was not found, return the empty string for the number of requested fields:
1041 3         7 my $n_vals = scalar grep { hascontent($_) } @{$val};
  6         30  
  3         6  
1042             return
1043             $n_vals
1044             ? $n_vals > 1
1045             ? $val
1046             : $val->[0]
1047 3 50       35 : scalar @{$col_i_aref} > 1 ? [ q{} x scalar @{$col_i_aref} ]
  0 0       0  
  0 50       0  
1048             : q{};
1049             }
1050            
1051             sub _get_any_vals_for_string_list {
1052 1     1   3 my ( $str_aref, $col_i_href, $path, $delim, $eq_fn ) = @_;
1053 1         3 my %string_vals = ();
1054 1         2 my @usr_strings = sort { $a cmp $b } @{$str_aref};
  1         4  
  1         4  
1055            
1056             # Search for the string, and isolate the requested values:
1057 1 50       47 open( my $fh, q{<}, $path ) or croak $OS_ERROR;
1058 1         12 while (<$fh>) {
1059 5 100       26 next if $INPUT_LINE_NUMBER == 1; # skip column heading line
1060 4         21 my $file_str =
1061             _get_file_str( $_, $delim ); # have to declare as can be empty (!)
1062 4 50       136 next if nocontent($file_str);
1063 4 100   7   60 if ( my $found = first { $eq_fn->( $_, $file_str ) } @usr_strings ) {
  7         30  
1064 2         30 my @line = split m/\Q$delim\E/xms; # split the line
1065 2         3 for my $col_i ( keys %{$col_i_href} ) {
  2         6  
1066 6         94 $string_vals{$file_str}->{ $col_i_href->{$col_i} } =
1067             _clean_value( $line[$col_i] );
1068             }
1069 2 100       41 last if scalar keys %string_vals == scalar @{$str_aref};
  2         6  
1070 1     2   8 splice @usr_strings, ( firstidx { $_ eq $found } @usr_strings ), 1;
  2         10  
1071            
1072             #print STDERR "checking ",join(q{,}, @usr_strings),"\n";
1073             }
1074             }
1075 1 50       16 close $fh or croak;
1076 1         18 return \%string_vals;
1077            
1078             # return the reference to array if there is more than 1 value, otherwise just the single value itself
1079             ## but if the string itself was not found, return the empty string for the number of requested fields:
1080             #my $n_vals = scalar grep { hascontent($_) } @{$val};
1081             #return $n_vals ? $n_vals > 1 ? $val : $val->[0] : scalar @{$col_i_aref} > 1 ? [q{} x scalar @{$col_i_aref}] : q{};
1082             }
1083            
1084             # Loads a hash-ref of the "specs" for each language file, including the field indices in each file for the measures they contain:
1085             ## Called only by new() after setting the MODULE_DIR
1086             sub _set_spec_hash {
1087 23     23   70 my ( $self, $fieldpath ) = @_;
1088 23   66     81 $fieldpath ||= File::Spec->catfile( $self->{'_MODULE_DIR'}, 'specs.csv' );
1089 23         194 $self->{'_FIELDS'} = Text::CSV::Hashify->new(
1090             { file => $fieldpath, format => 'hoh', key => 'Lang_stub' } );
1091 23         34436 return;
1092             }
1093            
1094             sub _in_range {
1095 243     243   355 my ( $n, $min, $max ) = @_;
1096 243         261 my $res = 1;
1097 243 50       375 if ( !is_numeric($n) ) {
1098 0         0 $res = 0;
1099             }
1100             else {
1101 243 100 100     3170 if ( hascontent($min) and $n < $min ) { # fails min
1102 48         418 $res = 0;
1103             }
1104 243 100 100     1229 if ( $res && ( hascontent($max) and $n > $max ) ) { # fails max and min
      100        
1105 26         196 $res = 0;
1106             }
1107             }
1108 243         1103 return $res;
1109             }
1110            
1111             sub _clean_value {
1112 146     146   590 my $val = shift;
1113 146 50       339 return q{} if nocontent($val);
1114 146         1286 $val =~ s/,([^,]+)$/.$1/xsm; # replace ultimate , with .
1115 146         275 return trim( unquote($val) );
1116             }
1117            
1118             sub _pos_is {
1119 12     12   693 my ( $pos_aref, $fields, $lang ) = @_;
1120 12 50       32 $pos_aref = [$pos_aref] if !ref $pos_aref;
1121 12         15 my @test_str = map { split /[\W\.]+/xsm } @{$pos_aref};
  12         35  
  12         16  
1122 12 50       22 return [qw/UK/] if !scalar @test_str;
1123 12         15 my @pos_ari = ();
1124 12         19 for my $pos_str (@test_str) {
1125             push @pos_ari, first {
1126             hascontent( $fields->datum( $lang, 'pos_' . $_ ) )
1127 47 100   47   162 and first { $_ =~ m/^$pos_str$/xsm }(split /\|/, $fields->datum( $lang, 'pos_' . $_ ))
  46         1402  
1128             }
1129 12         54 qw/NN VB AJ AV CJ PN PP DA NM IJ NB OT UK/;
1130            
1131             }
1132 12         48 return \@pos_ari;
1133             }
1134            
1135             sub _log10 {
1136 0     0   0 return log(shift) / log(10);
1137             }
1138            
1139             sub _val_or_0 {
1140 46     46   77 my $val = shift;
1141 46 100       115 return ( is_numeric($val) ) ? $val : 0;
1142             }
1143            
1144             sub _croak_defunct {
1145 0     0     croak
1146             'That method is defunct. See the POD for an alternative, and the CHANGES file';
1147             }
1148             *freqhash = \&_croak_defunct;
1149             *ldist = \&_croak_defunct;
1150             *on_count = \&_croak_defunct;
1151             *on_ldist = \&_croak_defunct;
1152             *on_freq_max = \&_croak_defunct;
1153             *on_zipf_mean = \&_croak_defunct;
1154             *on_freq_mean = \&_croak_defunct;
1155             *on_lfreq_mean = \&_croak_defunct;
1156             *on_frq_opm_max = \&_croak_defunct;
1157             *on_frq_opm_max = \&_croak_defunct;
1158             *on_frq_zipf_mean = \&_croak_defunct;
1159            
1160             sub _carp_deprecated {
1161 0     0     my ( $self, %args ) = @_;
1162 0           carp
1163             'That method is deprecated. See the POD for an alternative, and the CHANGES file';
1164 0           return;
1165             }
1166             *list_words = \&_carp_deprecated;
1167             *list_strings = \&_carp_deprecated;
1168            
1169             =head1 DIAGNOSTICS
1170            
1171             =over 4
1172            
1173             =item * Need a valid attribute
1174            
1175             When constructing the class object with L, the B argument must have a valid value, as indicated in the table above. Also, the module needs to read in the contents of a file named "specs.csv" which should be located within the SUBTLEX directory where the module itself is located (alongside the downloaded SUBTLEX files). This file specifies the field indices for the various stats within each SUBTLEX datafile. Check that this file is indeed within the Perl/site/lib/Lingua/Norms/SUBTLEX directory. If it is not, download and install the file to that location via the L package of this module.
1176            
1177             =item * Value given to argument 'dir' (VALUE) in new() is not a directory
1178            
1179             Croaked from L if called with a value for the argument B, and this value is not actually a directory/folder. This is the directory/folder in which the actual SUBTLEX datafiles should be located.
1180            
1181             =item * Cannot find required database for language ...
1182            
1183             Croaked from L if none of the given values to arguments B, B or B are valid, and even the default site/lib directory and US database are not accessible. Check that your have indeed a file with the given value of B (DE, NL, UK or US) within the Perl/site/lib/Lingua/Norms/SUBTLEX directory, or at least that the SUBTLEX-US file is located within it, and can be read via your script.
1184            
1185             =item * Cannot determine fields for given language
1186            
1187             Croaked upon construction if no fields are recognized for the given language. The value given to B must be one of DE, NL, UK or US.
1188            
1189             =item * The requested value is not defined for the ... SUBTLEX corpus
1190            
1191             Croaked when calling for a value for a statistic that is not defined for a given language, e.g., when requesting a value for the Zipf frequency in the NL corpus.
1192            
1193             =item * No string to test; pass a value for to FUNCTION()
1194            
1195             Croaked by several methods that expect a value for the named argument B, and when no such value is given. These methods require the letter-string to be passed to it as a I => I pair, with the key B followed by the value of the string to test.
1196            
1197             =item * No string(s) to test; pass one or more letter-strings named \'strings\' as a referenced array
1198            
1199             Same as above but specifically croaked by L which accepts more than one string in a single call.
1200            
1201             =item * Need to install and have access to module File::RandomLine
1202            
1203             Croaked by method L if the module it depends on (File::RandomLine) is not installed or accessible. This should have been installed (if not already) upon installation of the present module. See L to download and install this module manually.
1204            
1205             =back
1206            
1207             =head1 DEPENDENCIES
1208            
1209             L : for L
1210            
1211             L : for C method
1212            
1213             L : C, C, C, C and other functions
1214            
1215             L : C
1216            
1217             L : for directory reading when calling L
1218            
1219             L : for various statistical methods
1220            
1221             L : C
1222            
1223             L : for determining valid string values
1224            
1225             L : reads in the specs file
1226            
1227             L : for determining the field delimiter within the datafiles
1228            
1229             L : for plain ASCII transliterations of Unicode text
1230            
1231             =head1 REFERENCES
1232            
1233             Brysbaert, M., Buchmeier, M., Conrad, M., Jacobs, A.M., Boelte, J., & Boehl, A. (2011). The word frequency effect: A review of recent developments and implications for the choice of frequency estimates in German. I, I<58>, 412-424. doi: L<10.1027/1618-3169/a000123|http://dx.doi.org/10.1027/1618-3169/a000123>
1234            
1235             Brysbaert, M., & New, B. (2009). Moving beyond Kucera and Francis: A critical evaluation of current word frequency norms and the introduction of a new and improved word frequency measure for American English. I, I<41>, 977-990. doi: L<10.3758/BRM.41.4.977|http://dx.doi.org/10.3758/BRM.41.4.977>
1236            
1237             Brysbaert, M., New, B., & Keuleers,E. (2012). Adding part-of-speech information to the SUBTLEX-US word frequencies. I, I<44>, 991-997. doi: L<10.3758/s13428-012-0190-4|http://dx.doi.org/10.3758/s13428-012-0190-4>
1238            
1239             Herdagdelen, A., & Marelli, M. (2017). Social media and language processing: How Facebook and Twitter provide the best frequency estimates for studying word recognition. I, I<41>, 976-995. doi:L<10.1111/cogs.12392|http://dx.doi.org/10.1111/cogs.12392>
1240            
1241             Keuleers, E., Brysbaert, M., & New, B. (2010). SUBTLEX-NL: A new frequency measure for Dutch words based on film subtitles. I, I<42>, 643-650. doi: L<10.3758/BRM.42.3.643|http://dx.doi.org/10.3758/BRM.42.3.643>
1242            
1243             New, B., Brysbaert, M., Veronis, J., & Pallier, C. (2007). The use of film subtitles to estimate word frequencies. I, I<28>, 661-677.
1244            
1245             Soares, A. P., Machado, J., Costa, A., Comesaña, M., & Perea, M. (in press). On the advantages of frequency measures extracted from subtitles: The case of Portuguese. I.
1246            
1247             Van Heuven, W. J. B., Mandera, P., Keuleers, E., & Brysbaert, M. (2014). SUBTLEX-UK: A new and improved word frequency database for British English. I, I<67>, 1176-1190. doi: L<10.1080/17470218.2013.850521|http://dx.doi.org/10.1080/17470218.2013.850521>
1248            
1249             =head1 AUTHOR
1250            
1251             Roderick Garton, C<< >>
1252            
1253             =head1 BUGS AND LIMITATIONS
1254            
1255             Please report any bugs or feature requests to C, or through
1256             the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
1257            
1258             =head1 SUPPORT
1259            
1260             You can find documentation for this module with the perldoc command.
1261            
1262             perldoc Lingua::Norms::SUBTLEX
1263            
1264             You can also look for information at:
1265            
1266             =over 4
1267            
1268             =item * RT: CPAN's request tracker (report bugs here)
1269            
1270             L
1271            
1272             =item * AnnoCPAN: Annotated CPAN documentation
1273            
1274             L
1275            
1276             =item * CPAN Ratings
1277            
1278             L
1279            
1280             =item * Search CPAN
1281            
1282             L
1283            
1284             =back
1285            
1286             =head1 LICENSE AND COPYRIGHT
1287            
1288             Copyright 2014-2018 Roderick Garton.
1289            
1290             This program is free software; you can redistribute it and/or modify it
1291             under the terms of either: the GNU General Public License as published
1292             by the Free Software Foundation; or the Artistic License.
1293            
1294             See L for more information.
1295            
1296             To the maximum extent permitted by applicable law, the author of this module disclaims all warranties, either express or implied, including but not limited to implied warranties of merchantability and fitness for a particular purpose, with regard to the software and the accompanying documentation.
1297            
1298             =cut
1299            
1300             1; # End of Lingua::Norms::SUBTLEX